diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,55195 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.9999365409754921,
+  "eval_steps": 500,
+  "global_step": 39394,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 5.076721960630021e-05,
+      "grad_norm": 0.15623666546780773,
+      "learning_rate": 2.538071065989848e-07,
+      "loss": 1.1099,
+      "step": 1
+    },
+    {
+      "epoch": 0.00025383609803150106,
+      "grad_norm": 0.1494363970160199,
+      "learning_rate": 1.2690355329949238e-06,
+      "loss": 1.1138,
+      "step": 5
+    },
+    {
+      "epoch": 0.0005076721960630021,
+      "grad_norm": 0.15321706073016964,
+      "learning_rate": 2.5380710659898476e-06,
+      "loss": 1.1368,
+      "step": 10
+    },
+    {
+      "epoch": 0.0007615082940945031,
+      "grad_norm": 0.1542643246754322,
+      "learning_rate": 3.807106598984772e-06,
+      "loss": 1.1091,
+      "step": 15
+    },
+    {
+      "epoch": 0.0010153443921260042,
+      "grad_norm": 0.13824138762283186,
+      "learning_rate": 5.076142131979695e-06,
+      "loss": 1.0911,
+      "step": 20
+    },
+    {
+      "epoch": 0.0012691804901575053,
+      "grad_norm": 0.1579877396947511,
+      "learning_rate": 6.345177664974619e-06,
+      "loss": 1.1267,
+      "step": 25
+    },
+    {
+      "epoch": 0.0015230165881890063,
+      "grad_norm": 0.1478480849215318,
+      "learning_rate": 7.614213197969544e-06,
+      "loss": 1.0848,
+      "step": 30
+    },
+    {
+      "epoch": 0.0017768526862205075,
+      "grad_norm": 0.11527528929066594,
+      "learning_rate": 8.883248730964468e-06,
+      "loss": 1.07,
+      "step": 35
+    },
+    {
+      "epoch": 0.0020306887842520085,
+      "grad_norm": 0.10942201134957022,
+      "learning_rate": 1.015228426395939e-05,
+      "loss": 1.0856,
+      "step": 40
+    },
+    {
+      "epoch": 0.0022845248822835097,
+      "grad_norm": 0.1003586160454669,
+      "learning_rate": 1.1421319796954315e-05,
+      "loss": 1.0483,
+      "step": 45
+    },
+    {
+      "epoch": 0.0025383609803150105,
+      "grad_norm": 0.09455563009354048,
+      "learning_rate": 1.2690355329949238e-05,
+      "loss": 1.0842,
+      "step": 50
+    },
+    {
+      "epoch": 0.0027921970783465117,
+      "grad_norm": 0.08578428766129725,
+      "learning_rate": 1.3959390862944163e-05,
+      "loss": 1.0387,
+      "step": 55
+    },
+    {
+      "epoch": 0.0030460331763780125,
+      "grad_norm": 0.08715199888221983,
+      "learning_rate": 1.5228426395939088e-05,
+      "loss": 1.0585,
+      "step": 60
+    },
+    {
+      "epoch": 0.0032998692744095138,
+      "grad_norm": 0.07478324580510415,
+      "learning_rate": 1.6497461928934012e-05,
+      "loss": 0.9773,
+      "step": 65
+    },
+    {
+      "epoch": 0.003553705372441015,
+      "grad_norm": 0.07368740079084418,
+      "learning_rate": 1.7766497461928935e-05,
+      "loss": 1.0023,
+      "step": 70
+    },
+    {
+      "epoch": 0.0038075414704725158,
+      "grad_norm": 0.07301153154637648,
+      "learning_rate": 1.9035532994923858e-05,
+      "loss": 0.9723,
+      "step": 75
+    },
+    {
+      "epoch": 0.004061377568504017,
+      "grad_norm": 0.07289038213742076,
+      "learning_rate": 2.030456852791878e-05,
+      "loss": 0.9927,
+      "step": 80
+    },
+    {
+      "epoch": 0.004315213666535518,
+      "grad_norm": 0.07465327431922439,
+      "learning_rate": 2.1573604060913704e-05,
+      "loss": 0.9821,
+      "step": 85
+    },
+    {
+      "epoch": 0.0045690497645670194,
+      "grad_norm": 0.06774510984286918,
+      "learning_rate": 2.284263959390863e-05,
+      "loss": 0.965,
+      "step": 90
+    },
+    {
+      "epoch": 0.00482288586259852,
+      "grad_norm": 0.06855996583318778,
+      "learning_rate": 2.4111675126903553e-05,
+      "loss": 0.9821,
+      "step": 95
+    },
+    {
+      "epoch": 0.005076721960630021,
+      "grad_norm": 0.07536106861979534,
+      "learning_rate": 2.5380710659898476e-05,
+      "loss": 0.9284,
+      "step": 100
+    },
+    {
+      "epoch": 0.005330558058661523,
+      "grad_norm": 0.06730944873958547,
+      "learning_rate": 2.6649746192893403e-05,
+      "loss": 0.9626,
+      "step": 105
+    },
+    {
+      "epoch": 0.0055843941566930235,
+      "grad_norm": 0.069457861068512,
+      "learning_rate": 2.7918781725888326e-05,
+      "loss": 0.956,
+      "step": 110
+    },
+    {
+      "epoch": 0.005838230254724524,
+      "grad_norm": 0.06693731117055934,
+      "learning_rate": 2.918781725888325e-05,
+      "loss": 0.9747,
+      "step": 115
+    },
+    {
+      "epoch": 0.006092066352756025,
+      "grad_norm": 0.06277015163357891,
+      "learning_rate": 3.0456852791878175e-05,
+      "loss": 0.9116,
+      "step": 120
+    },
+    {
+      "epoch": 0.006345902450787527,
+      "grad_norm": 0.06643101153314454,
+      "learning_rate": 3.17258883248731e-05,
+      "loss": 0.9232,
+      "step": 125
+    },
+    {
+      "epoch": 0.0065997385488190275,
+      "grad_norm": 0.06924486382591209,
+      "learning_rate": 3.2994923857868024e-05,
+      "loss": 0.9399,
+      "step": 130
+    },
+    {
+      "epoch": 0.006853574646850528,
+      "grad_norm": 0.07604177904943392,
+      "learning_rate": 3.4263959390862944e-05,
+      "loss": 0.8964,
+      "step": 135
+    },
+    {
+      "epoch": 0.00710741074488203,
+      "grad_norm": 0.06481060411660645,
+      "learning_rate": 3.553299492385787e-05,
+      "loss": 0.948,
+      "step": 140
+    },
+    {
+      "epoch": 0.007361246842913531,
+      "grad_norm": 0.06555462608634341,
+      "learning_rate": 3.680203045685279e-05,
+      "loss": 0.9325,
+      "step": 145
+    },
+    {
+      "epoch": 0.0076150829409450315,
+      "grad_norm": 0.07121267717524804,
+      "learning_rate": 3.8071065989847716e-05,
+      "loss": 0.9379,
+      "step": 150
+    },
+    {
+      "epoch": 0.007868919038976532,
+      "grad_norm": 0.07190008765656207,
+      "learning_rate": 3.934010152284264e-05,
+      "loss": 0.9104,
+      "step": 155
+    },
+    {
+      "epoch": 0.008122755137008034,
+      "grad_norm": 0.06497388555993808,
+      "learning_rate": 4.060913705583756e-05,
+      "loss": 0.9238,
+      "step": 160
+    },
+    {
+      "epoch": 0.008376591235039536,
+      "grad_norm": 0.06647076390288592,
+      "learning_rate": 4.187817258883249e-05,
+      "loss": 0.9081,
+      "step": 165
+    },
+    {
+      "epoch": 0.008630427333071036,
+      "grad_norm": 0.07078773209118674,
+      "learning_rate": 4.314720812182741e-05,
+      "loss": 0.8845,
+      "step": 170
+    },
+    {
+      "epoch": 0.008884263431102537,
+      "grad_norm": 0.060550162204052516,
+      "learning_rate": 4.4416243654822335e-05,
+      "loss": 0.8901,
+      "step": 175
+    },
+    {
+      "epoch": 0.009138099529134039,
+      "grad_norm": 0.06924081884858772,
+      "learning_rate": 4.568527918781726e-05,
+      "loss": 0.924,
+      "step": 180
+    },
+    {
+      "epoch": 0.009391935627165539,
+      "grad_norm": 0.06720539183021615,
+      "learning_rate": 4.695431472081219e-05,
+      "loss": 0.8938,
+      "step": 185
+    },
+    {
+      "epoch": 0.00964577172519704,
+      "grad_norm": 0.07177203010161029,
+      "learning_rate": 4.822335025380711e-05,
+      "loss": 0.9048,
+      "step": 190
+    },
+    {
+      "epoch": 0.009899607823228542,
+      "grad_norm": 0.07321268374977867,
+      "learning_rate": 4.949238578680203e-05,
+      "loss": 0.8666,
+      "step": 195
+    },
+    {
+      "epoch": 0.010153443921260042,
+      "grad_norm": 0.0704505269850039,
+      "learning_rate": 5.076142131979695e-05,
+      "loss": 0.9043,
+      "step": 200
+    },
+    {
+      "epoch": 0.010407280019291544,
+      "grad_norm": 0.06657597088837011,
+      "learning_rate": 5.2030456852791886e-05,
+      "loss": 0.8896,
+      "step": 205
+    },
+    {
+      "epoch": 0.010661116117323045,
+      "grad_norm": 0.06540442514049039,
+      "learning_rate": 5.3299492385786806e-05,
+      "loss": 0.8706,
+      "step": 210
+    },
+    {
+      "epoch": 0.010914952215354545,
+      "grad_norm": 0.07007986246869724,
+      "learning_rate": 5.4568527918781725e-05,
+      "loss": 0.8502,
+      "step": 215
+    },
+    {
+      "epoch": 0.011168788313386047,
+      "grad_norm": 0.07149965435491132,
+      "learning_rate": 5.583756345177665e-05,
+      "loss": 0.8837,
+      "step": 220
+    },
+    {
+      "epoch": 0.011422624411417547,
+      "grad_norm": 0.06719503318382268,
+      "learning_rate": 5.710659898477157e-05,
+      "loss": 0.8644,
+      "step": 225
+    },
+    {
+      "epoch": 0.011676460509449049,
+      "grad_norm": 0.06782034211539793,
+      "learning_rate": 5.83756345177665e-05,
+      "loss": 0.8966,
+      "step": 230
+    },
+    {
+      "epoch": 0.01193029660748055,
+      "grad_norm": 0.06913885877260682,
+      "learning_rate": 5.9644670050761424e-05,
+      "loss": 0.881,
+      "step": 235
+    },
+    {
+      "epoch": 0.01218413270551205,
+      "grad_norm": 0.06719486728451429,
+      "learning_rate": 6.091370558375635e-05,
+      "loss": 0.8766,
+      "step": 240
+    },
+    {
+      "epoch": 0.012437968803543552,
+      "grad_norm": 0.06653298354073509,
+      "learning_rate": 6.218274111675127e-05,
+      "loss": 0.8781,
+      "step": 245
+    },
+    {
+      "epoch": 0.012691804901575053,
+      "grad_norm": 0.06438730665451305,
+      "learning_rate": 6.34517766497462e-05,
+      "loss": 0.8918,
+      "step": 250
+    },
+    {
+      "epoch": 0.012945640999606553,
+      "grad_norm": 0.06665969371026668,
+      "learning_rate": 6.472081218274112e-05,
+      "loss": 0.8583,
+      "step": 255
+    },
+    {
+      "epoch": 0.013199477097638055,
+      "grad_norm": 0.06715127430825377,
+      "learning_rate": 6.598984771573605e-05,
+      "loss": 0.8656,
+      "step": 260
+    },
+    {
+      "epoch": 0.013453313195669557,
+      "grad_norm": 0.06926262661035425,
+      "learning_rate": 6.725888324873096e-05,
+      "loss": 0.8649,
+      "step": 265
+    },
+    {
+      "epoch": 0.013707149293701057,
+      "grad_norm": 0.06985857406011756,
+      "learning_rate": 6.852791878172589e-05,
+      "loss": 0.8729,
+      "step": 270
+    },
+    {
+      "epoch": 0.013960985391732558,
+      "grad_norm": 0.06860201087036598,
+      "learning_rate": 6.979695431472081e-05,
+      "loss": 0.8601,
+      "step": 275
+    },
+    {
+      "epoch": 0.01421482148976406,
+      "grad_norm": 0.06930283958418837,
+      "learning_rate": 7.106598984771574e-05,
+      "loss": 0.8531,
+      "step": 280
+    },
+    {
+      "epoch": 0.01446865758779556,
+      "grad_norm": 0.06690534995876123,
+      "learning_rate": 7.233502538071065e-05,
+      "loss": 0.8696,
+      "step": 285
+    },
+    {
+      "epoch": 0.014722493685827061,
+      "grad_norm": 0.07121164184727187,
+      "learning_rate": 7.360406091370558e-05,
+      "loss": 0.8475,
+      "step": 290
+    },
+    {
+      "epoch": 0.014976329783858563,
+      "grad_norm": 0.06510867515978803,
+      "learning_rate": 7.48730964467005e-05,
+      "loss": 0.8529,
+      "step": 295
+    },
+    {
+      "epoch": 0.015230165881890063,
+      "grad_norm": 0.07520321408132809,
+      "learning_rate": 7.614213197969543e-05,
+      "loss": 0.8385,
+      "step": 300
+    },
+    {
+      "epoch": 0.015484001979921565,
+      "grad_norm": 0.06551584845379264,
+      "learning_rate": 7.741116751269036e-05,
+      "loss": 0.8687,
+      "step": 305
+    },
+    {
+      "epoch": 0.015737838077953065,
+      "grad_norm": 0.07490825713450537,
+      "learning_rate": 7.868020304568529e-05,
+      "loss": 0.8349,
+      "step": 310
+    },
+    {
+      "epoch": 0.015991674175984568,
+      "grad_norm": 0.06896780239118258,
+      "learning_rate": 7.994923857868021e-05,
+      "loss": 0.8212,
+      "step": 315
+    },
+    {
+      "epoch": 0.016245510274016068,
+      "grad_norm": 0.07054318367146102,
+      "learning_rate": 8.121827411167512e-05,
+      "loss": 0.8385,
+      "step": 320
+    },
+    {
+      "epoch": 0.016499346372047568,
+      "grad_norm": 0.07148603385955352,
+      "learning_rate": 8.248730964467005e-05,
+      "loss": 0.821,
+      "step": 325
+    },
+    {
+      "epoch": 0.01675318247007907,
+      "grad_norm": 0.07322626942475637,
+      "learning_rate": 8.375634517766498e-05,
+      "loss": 0.8307,
+      "step": 330
+    },
+    {
+      "epoch": 0.01700701856811057,
+      "grad_norm": 0.0656810178548751,
+      "learning_rate": 8.50253807106599e-05,
+      "loss": 0.8172,
+      "step": 335
+    },
+    {
+      "epoch": 0.01726085466614207,
+      "grad_norm": 0.08377135110233941,
+      "learning_rate": 8.629441624365482e-05,
+      "loss": 0.8464,
+      "step": 340
+    },
+    {
+      "epoch": 0.017514690764173575,
+      "grad_norm": 0.075227118003797,
+      "learning_rate": 8.756345177664974e-05,
+      "loss": 0.8505,
+      "step": 345
+    },
+    {
+      "epoch": 0.017768526862205074,
+      "grad_norm": 0.06433840181173646,
+      "learning_rate": 8.883248730964467e-05,
+      "loss": 0.8511,
+      "step": 350
+    },
+    {
+      "epoch": 0.018022362960236574,
+      "grad_norm": 0.06922064842556071,
+      "learning_rate": 9.01015228426396e-05,
+      "loss": 0.8391,
+      "step": 355
+    },
+    {
+      "epoch": 0.018276199058268078,
+      "grad_norm": 0.0755697527347306,
+      "learning_rate": 9.137055837563452e-05,
+      "loss": 0.821,
+      "step": 360
+    },
+    {
+      "epoch": 0.018530035156299578,
+      "grad_norm": 0.07031238898852438,
+      "learning_rate": 9.263959390862945e-05,
+      "loss": 0.8231,
+      "step": 365
+    },
+    {
+      "epoch": 0.018783871254331078,
+      "grad_norm": 0.0736219488733804,
+      "learning_rate": 9.390862944162437e-05,
+      "loss": 0.822,
+      "step": 370
+    },
+    {
+      "epoch": 0.01903770735236258,
+      "grad_norm": 0.06790722048303544,
+      "learning_rate": 9.517766497461929e-05,
+      "loss": 0.8175,
+      "step": 375
+    },
+    {
+      "epoch": 0.01929154345039408,
+      "grad_norm": 0.06650805003812338,
+      "learning_rate": 9.644670050761421e-05,
+      "loss": 0.821,
+      "step": 380
+    },
+    {
+      "epoch": 0.01954537954842558,
+      "grad_norm": 0.0655885804187293,
+      "learning_rate": 9.771573604060914e-05,
+      "loss": 0.8174,
+      "step": 385
+    },
+    {
+      "epoch": 0.019799215646457084,
+      "grad_norm": 0.07059246576832452,
+      "learning_rate": 9.898477157360407e-05,
+      "loss": 0.8255,
+      "step": 390
+    },
+    {
+      "epoch": 0.020053051744488584,
+      "grad_norm": 0.08379327242669067,
+      "learning_rate": 0.00010025380710659898,
+      "loss": 0.8401,
+      "step": 395
+    },
+    {
+      "epoch": 0.020306887842520084,
+      "grad_norm": 0.06727444934015664,
+      "learning_rate": 0.0001015228426395939,
+      "loss": 0.8304,
+      "step": 400
+    },
+    {
+      "epoch": 0.020560723940551588,
+      "grad_norm": 0.0772208129389319,
+      "learning_rate": 0.00010279187817258883,
+      "loss": 0.8172,
+      "step": 405
+    },
+    {
+      "epoch": 0.020814560038583087,
+      "grad_norm": 0.07506045651461135,
+      "learning_rate": 0.00010406091370558377,
+      "loss": 0.8148,
+      "step": 410
+    },
+    {
+      "epoch": 0.021068396136614587,
+      "grad_norm": 0.07444802891412307,
+      "learning_rate": 0.00010532994923857868,
+      "loss": 0.8642,
+      "step": 415
+    },
+    {
+      "epoch": 0.02132223223464609,
+      "grad_norm": 0.06410369753897495,
+      "learning_rate": 0.00010659898477157361,
+      "loss": 0.8523,
+      "step": 420
+    },
+    {
+      "epoch": 0.02157606833267759,
+      "grad_norm": 0.06581099995077219,
+      "learning_rate": 0.00010786802030456854,
+      "loss": 0.8214,
+      "step": 425
+    },
+    {
+      "epoch": 0.02182990443070909,
+      "grad_norm": 0.06664549211252414,
+      "learning_rate": 0.00010913705583756345,
+      "loss": 0.8083,
+      "step": 430
+    },
+    {
+      "epoch": 0.02208374052874059,
+      "grad_norm": 0.06658304689511382,
+      "learning_rate": 0.00011040609137055838,
+      "loss": 0.8143,
+      "step": 435
+    },
+    {
+      "epoch": 0.022337576626772094,
+      "grad_norm": 0.0683685494048942,
+      "learning_rate": 0.0001116751269035533,
+      "loss": 0.8239,
+      "step": 440
+    },
+    {
+      "epoch": 0.022591412724803594,
+      "grad_norm": 0.06845204574468279,
+      "learning_rate": 0.00011294416243654823,
+      "loss": 0.8109,
+      "step": 445
+    },
+    {
+      "epoch": 0.022845248822835094,
+      "grad_norm": 0.0677741143702505,
+      "learning_rate": 0.00011421319796954314,
+      "loss": 0.8244,
+      "step": 450
+    },
+    {
+      "epoch": 0.023099084920866597,
+      "grad_norm": 0.06606869955263507,
+      "learning_rate": 0.00011548223350253807,
+      "loss": 0.8075,
+      "step": 455
+    },
+    {
+      "epoch": 0.023352921018898097,
+      "grad_norm": 0.07205738377017837,
+      "learning_rate": 0.000116751269035533,
+      "loss": 0.7934,
+      "step": 460
+    },
+    {
+      "epoch": 0.023606757116929597,
+      "grad_norm": 0.06872617393230794,
+      "learning_rate": 0.00011802030456852793,
+      "loss": 0.8011,
+      "step": 465
+    },
+    {
+      "epoch": 0.0238605932149611,
+      "grad_norm": 0.08044019183258687,
+      "learning_rate": 0.00011928934010152285,
+      "loss": 0.8127,
+      "step": 470
+    },
+    {
+      "epoch": 0.0241144293129926,
+      "grad_norm": 0.07828212067502605,
+      "learning_rate": 0.00012055837563451777,
+      "loss": 0.787,
+      "step": 475
+    },
+    {
+      "epoch": 0.0243682654110241,
+      "grad_norm": 0.07606104395777182,
+      "learning_rate": 0.0001218274111675127,
+      "loss": 0.8251,
+      "step": 480
+    },
+    {
+      "epoch": 0.024622101509055604,
+      "grad_norm": 0.06583732816278534,
+      "learning_rate": 0.0001230964467005076,
+      "loss": 0.8246,
+      "step": 485
+    },
+    {
+      "epoch": 0.024875937607087104,
+      "grad_norm": 0.06441670784951158,
+      "learning_rate": 0.00012436548223350254,
+      "loss": 0.7876,
+      "step": 490
+    },
+    {
+      "epoch": 0.025129773705118603,
+      "grad_norm": 0.07758246886806429,
+      "learning_rate": 0.00012563451776649747,
+      "loss": 0.8221,
+      "step": 495
+    },
+    {
+      "epoch": 0.025383609803150107,
+      "grad_norm": 0.06322158391207536,
+      "learning_rate": 0.0001269035532994924,
+      "loss": 0.7881,
+      "step": 500
+    },
+    {
+      "epoch": 0.025637445901181607,
+      "grad_norm": 0.07176275752432852,
+      "learning_rate": 0.00012817258883248732,
+      "loss": 0.8097,
+      "step": 505
+    },
+    {
+      "epoch": 0.025891281999213107,
+      "grad_norm": 0.06256825439016349,
+      "learning_rate": 0.00012944162436548224,
+      "loss": 0.7871,
+      "step": 510
+    },
+    {
+      "epoch": 0.02614511809724461,
+      "grad_norm": 0.06274483186816927,
+      "learning_rate": 0.00013071065989847717,
+      "loss": 0.8353,
+      "step": 515
+    },
+    {
+      "epoch": 0.02639895419527611,
+      "grad_norm": 0.06260104403606175,
+      "learning_rate": 0.0001319796954314721,
+      "loss": 0.8062,
+      "step": 520
+    },
+    {
+      "epoch": 0.02665279029330761,
+      "grad_norm": 0.0692807834839823,
+      "learning_rate": 0.00013324873096446702,
+      "loss": 0.8184,
+      "step": 525
+    },
+    {
+      "epoch": 0.026906626391339113,
+      "grad_norm": 0.06940858744959649,
+      "learning_rate": 0.00013451776649746192,
+      "loss": 0.7818,
+      "step": 530
+    },
+    {
+      "epoch": 0.027160462489370613,
+      "grad_norm": 0.06227095786073527,
+      "learning_rate": 0.00013578680203045685,
+      "loss": 0.7997,
+      "step": 535
+    },
+    {
+      "epoch": 0.027414298587402113,
+      "grad_norm": 0.06606458794872055,
+      "learning_rate": 0.00013705583756345178,
+      "loss": 0.791,
+      "step": 540
+    },
+    {
+      "epoch": 0.027668134685433617,
+      "grad_norm": 0.07921383531960545,
+      "learning_rate": 0.0001383248730964467,
+      "loss": 0.7865,
+      "step": 545
+    },
+    {
+      "epoch": 0.027921970783465117,
+      "grad_norm": 0.07202541973928622,
+      "learning_rate": 0.00013959390862944163,
+      "loss": 0.799,
+      "step": 550
+    },
+    {
+      "epoch": 0.028175806881496616,
+      "grad_norm": 0.07446687633156211,
+      "learning_rate": 0.00014086294416243656,
+      "loss": 0.8005,
+      "step": 555
+    },
+    {
+      "epoch": 0.02842964297952812,
+      "grad_norm": 0.06806792304243547,
+      "learning_rate": 0.00014213197969543148,
+      "loss": 0.7807,
+      "step": 560
+    },
+    {
+      "epoch": 0.02868347907755962,
+      "grad_norm": 0.06961466568049711,
+      "learning_rate": 0.0001434010152284264,
+      "loss": 0.798,
+      "step": 565
+    },
+    {
+      "epoch": 0.02893731517559112,
+      "grad_norm": 0.06993114177456627,
+      "learning_rate": 0.0001446700507614213,
+      "loss": 0.8141,
+      "step": 570
+    },
+    {
+      "epoch": 0.029191151273622623,
+      "grad_norm": 0.07132008217126029,
+      "learning_rate": 0.00014593908629441623,
+      "loss": 0.7732,
+      "step": 575
+    },
+    {
+      "epoch": 0.029444987371654123,
+      "grad_norm": 0.0634032782262794,
+      "learning_rate": 0.00014720812182741116,
+      "loss": 0.7833,
+      "step": 580
+    },
+    {
+      "epoch": 0.029698823469685623,
+      "grad_norm": 0.06291224826282306,
+      "learning_rate": 0.00014847715736040609,
+      "loss": 0.7713,
+      "step": 585
+    },
+    {
+      "epoch": 0.029952659567717126,
+      "grad_norm": 0.06955080092630633,
+      "learning_rate": 0.000149746192893401,
+      "loss": 0.812,
+      "step": 590
+    },
+    {
+      "epoch": 0.030206495665748626,
+      "grad_norm": 0.06642744415224454,
+      "learning_rate": 0.00015101522842639594,
+      "loss": 0.7816,
+      "step": 595
+    },
+    {
+      "epoch": 0.030460331763780126,
+      "grad_norm": 0.07915699035230568,
+      "learning_rate": 0.00015228426395939087,
+      "loss": 0.815,
+      "step": 600
+    },
+    {
+      "epoch": 0.03071416786181163,
+      "grad_norm": 0.07162405450532806,
+      "learning_rate": 0.0001535532994923858,
+      "loss": 0.7785,
+      "step": 605
+    },
+    {
+      "epoch": 0.03096800395984313,
+      "grad_norm": 0.06622725095139695,
+      "learning_rate": 0.00015482233502538072,
+      "loss": 0.7698,
+      "step": 610
+    },
+    {
+      "epoch": 0.03122184005787463,
+      "grad_norm": 0.06718329111593666,
+      "learning_rate": 0.00015609137055837564,
+      "loss": 0.7938,
+      "step": 615
+    },
+    {
+      "epoch": 0.03147567615590613,
+      "grad_norm": 0.0834368594300888,
+      "learning_rate": 0.00015736040609137057,
+      "loss": 0.8093,
+      "step": 620
+    },
+    {
+      "epoch": 0.03172951225393763,
+      "grad_norm": 0.09950838339388045,
+      "learning_rate": 0.0001586294416243655,
+      "loss": 0.7822,
+      "step": 625
+    },
+    {
+      "epoch": 0.031983348351969136,
+      "grad_norm": 0.08312710734927868,
+      "learning_rate": 0.00015989847715736042,
+      "loss": 0.7726,
+      "step": 630
+    },
+    {
+      "epoch": 0.032237184450000636,
+      "grad_norm": 0.06335807274711928,
+      "learning_rate": 0.00016116751269035535,
+      "loss": 0.7934,
+      "step": 635
+    },
+    {
+      "epoch": 0.032491020548032136,
+      "grad_norm": 0.07178135932549387,
+      "learning_rate": 0.00016243654822335025,
+      "loss": 0.7719,
+      "step": 640
+    },
+    {
+      "epoch": 0.032744856646063636,
+      "grad_norm": 0.07635177651816702,
+      "learning_rate": 0.00016370558375634518,
+      "loss": 0.7969,
+      "step": 645
+    },
+    {
+      "epoch": 0.032998692744095136,
+      "grad_norm": 0.17477231843517774,
+      "learning_rate": 0.0001649746192893401,
+      "loss": 0.7771,
+      "step": 650
+    },
+    {
+      "epoch": 0.033252528842126636,
+      "grad_norm": 0.06434201144677448,
+      "learning_rate": 0.00016624365482233503,
+      "loss": 0.7829,
+      "step": 655
+    },
+    {
+      "epoch": 0.03350636494015814,
+      "grad_norm": 0.0663036035191937,
+      "learning_rate": 0.00016751269035532995,
+      "loss": 0.7742,
+      "step": 660
+    },
+    {
+      "epoch": 0.03376020103818964,
+      "grad_norm": 0.061908820772137774,
+      "learning_rate": 0.00016878172588832488,
+      "loss": 0.7748,
+      "step": 665
+    },
+    {
+      "epoch": 0.03401403713622114,
+      "grad_norm": 0.07340604363772968,
+      "learning_rate": 0.0001700507614213198,
+      "loss": 0.7663,
+      "step": 670
+    },
+    {
+      "epoch": 0.03426787323425264,
+      "grad_norm": 0.06471552425628542,
+      "learning_rate": 0.0001713197969543147,
+      "loss": 0.738,
+      "step": 675
+    },
+    {
+      "epoch": 0.03452170933228414,
+      "grad_norm": 0.0628486951271683,
+      "learning_rate": 0.00017258883248730963,
+      "loss": 0.7693,
+      "step": 680
+    },
+    {
+      "epoch": 0.03477554543031564,
+      "grad_norm": 0.0821063953061165,
+      "learning_rate": 0.00017385786802030456,
+      "loss": 0.7937,
+      "step": 685
+    },
+    {
+      "epoch": 0.03502938152834715,
+      "grad_norm": 0.0677477739831011,
+      "learning_rate": 0.00017512690355329949,
+      "loss": 0.7516,
+      "step": 690
+    },
+    {
+      "epoch": 0.03528321762637865,
+      "grad_norm": 0.06601728867845341,
+      "learning_rate": 0.0001763959390862944,
+      "loss": 0.7711,
+      "step": 695
+    },
+    {
+      "epoch": 0.03553705372441015,
+      "grad_norm": 0.06337314268961115,
+      "learning_rate": 0.00017766497461928934,
+      "loss": 0.7725,
+      "step": 700
+    },
+    {
+      "epoch": 0.03579088982244165,
+      "grad_norm": 0.06700277491271579,
+      "learning_rate": 0.00017893401015228426,
+      "loss": 0.7779,
+      "step": 705
+    },
+    {
+      "epoch": 0.03604472592047315,
+      "grad_norm": 0.06439978678064547,
+      "learning_rate": 0.0001802030456852792,
+      "loss": 0.7377,
+      "step": 710
+    },
+    {
+      "epoch": 0.03629856201850465,
+      "grad_norm": 0.08022019987059843,
+      "learning_rate": 0.00018147208121827412,
+      "loss": 0.7891,
+      "step": 715
+    },
+    {
+      "epoch": 0.036552398116536156,
+      "grad_norm": 0.06618773295124729,
+      "learning_rate": 0.00018274111675126904,
+      "loss": 0.7847,
+      "step": 720
+    },
+    {
+      "epoch": 0.036806234214567655,
+      "grad_norm": 0.06785165350325073,
+      "learning_rate": 0.00018401015228426397,
+      "loss": 0.7709,
+      "step": 725
+    },
+    {
+      "epoch": 0.037060070312599155,
+      "grad_norm": 0.06446068323928258,
+      "learning_rate": 0.0001852791878172589,
+      "loss": 0.7466,
+      "step": 730
+    },
+    {
+      "epoch": 0.037313906410630655,
+      "grad_norm": 0.0743985429884066,
+      "learning_rate": 0.00018654822335025382,
+      "loss": 0.7495,
+      "step": 735
+    },
+    {
+      "epoch": 0.037567742508662155,
+      "grad_norm": 0.06381394090876102,
+      "learning_rate": 0.00018781725888324875,
+      "loss": 0.7724,
+      "step": 740
+    },
+    {
+      "epoch": 0.037821578606693655,
+      "grad_norm": 0.07609937482268822,
+      "learning_rate": 0.00018908629441624368,
+      "loss": 0.8044,
+      "step": 745
+    },
+    {
+      "epoch": 0.03807541470472516,
+      "grad_norm": 0.07080823262148744,
+      "learning_rate": 0.00019035532994923857,
+      "loss": 0.7392,
+      "step": 750
+    },
+    {
+      "epoch": 0.03832925080275666,
+      "grad_norm": 0.08986296323961589,
+      "learning_rate": 0.0001916243654822335,
+      "loss": 0.7506,
+      "step": 755
+    },
+    {
+      "epoch": 0.03858308690078816,
+      "grad_norm": 0.0633546160075049,
+      "learning_rate": 0.00019289340101522843,
+      "loss": 0.775,
+      "step": 760
+    },
+    {
+      "epoch": 0.03883692299881966,
+      "grad_norm": 0.06672741053597235,
+      "learning_rate": 0.00019416243654822335,
+      "loss": 0.7918,
+      "step": 765
+    },
+    {
+      "epoch": 0.03909075909685116,
+      "grad_norm": 0.0667100508132584,
+      "learning_rate": 0.00019543147208121828,
+      "loss": 0.7857,
+      "step": 770
+    },
+    {
+      "epoch": 0.03934459519488266,
+      "grad_norm": 0.07489251478473465,
+      "learning_rate": 0.0001967005076142132,
+      "loss": 0.7601,
+      "step": 775
+    },
+    {
+      "epoch": 0.03959843129291417,
+      "grad_norm": 0.07514829685101908,
+      "learning_rate": 0.00019796954314720813,
+      "loss": 0.7734,
+      "step": 780
+    },
+    {
+      "epoch": 0.03985226739094567,
+      "grad_norm": 0.06072516621359633,
+      "learning_rate": 0.00019923857868020303,
+      "loss": 0.7716,
+      "step": 785
+    },
+    {
+      "epoch": 0.04010610348897717,
+      "grad_norm": 0.10545789364607022,
+      "learning_rate": 0.00020050761421319796,
+      "loss": 0.6972,
+      "step": 790
+    },
+    {
+      "epoch": 0.04035993958700867,
+      "grad_norm": 0.06621028315139817,
+      "learning_rate": 0.00020177664974619288,
+      "loss": 0.7619,
+      "step": 795
+    },
+    {
+      "epoch": 0.04061377568504017,
+      "grad_norm": 0.07339900729710998,
+      "learning_rate": 0.0002030456852791878,
+      "loss": 0.7748,
+      "step": 800
+    },
+    {
+      "epoch": 0.04086761178307167,
+      "grad_norm": 0.07861308330289217,
+      "learning_rate": 0.00020431472081218274,
+      "loss": 0.7492,
+      "step": 805
+    },
+    {
+      "epoch": 0.041121447881103175,
+      "grad_norm": 0.06891500592846019,
+      "learning_rate": 0.00020558375634517766,
+      "loss": 0.7389,
+      "step": 810
+    },
+    {
+      "epoch": 0.041375283979134675,
+      "grad_norm": 0.061487916642653274,
+      "learning_rate": 0.0002068527918781726,
+      "loss": 0.7533,
+      "step": 815
+    },
+    {
+      "epoch": 0.041629120077166175,
+      "grad_norm": 0.06175106319498476,
+      "learning_rate": 0.00020812182741116754,
+      "loss": 0.775,
+      "step": 820
+    },
+    {
+      "epoch": 0.041882956175197675,
+      "grad_norm": 0.07347500552851026,
+      "learning_rate": 0.00020939086294416244,
+      "loss": 0.7565,
+      "step": 825
+    },
+    {
+      "epoch": 0.042136792273229175,
+      "grad_norm": 0.06765223170162757,
+      "learning_rate": 0.00021065989847715737,
+      "loss": 0.7441,
+      "step": 830
+    },
+    {
+      "epoch": 0.042390628371260675,
+      "grad_norm": 0.07847030528657203,
+      "learning_rate": 0.0002119289340101523,
+      "loss": 0.7446,
+      "step": 835
+    },
+    {
+      "epoch": 0.04264446446929218,
+      "grad_norm": 0.07339106925074805,
+      "learning_rate": 0.00021319796954314722,
+      "loss": 0.7719,
+      "step": 840
+    },
+    {
+      "epoch": 0.04289830056732368,
+      "grad_norm": 0.06531782875804885,
+      "learning_rate": 0.00021446700507614215,
+      "loss": 0.7569,
+      "step": 845
+    },
+    {
+      "epoch": 0.04315213666535518,
+      "grad_norm": 0.06579891597056135,
+      "learning_rate": 0.00021573604060913707,
+      "loss": 0.7495,
+      "step": 850
+    },
+    {
+      "epoch": 0.04340597276338668,
+      "grad_norm": 0.07284604345072491,
+      "learning_rate": 0.000217005076142132,
+      "loss": 0.7566,
+      "step": 855
+    },
+    {
+      "epoch": 0.04365980886141818,
+      "grad_norm": 0.06726864956360354,
+      "learning_rate": 0.0002182741116751269,
+      "loss": 0.7742,
+      "step": 860
+    },
+    {
+      "epoch": 0.04391364495944968,
+      "grad_norm": 0.06299243656591876,
+      "learning_rate": 0.00021954314720812183,
+      "loss": 0.7676,
+      "step": 865
+    },
+    {
+      "epoch": 0.04416748105748118,
+      "grad_norm": 0.07127446574398555,
+      "learning_rate": 0.00022081218274111675,
+      "loss": 0.7328,
+      "step": 870
+    },
+    {
+      "epoch": 0.04442131715551269,
+      "grad_norm": 0.059333625669680895,
+      "learning_rate": 0.00022208121827411168,
+      "loss": 0.7528,
+      "step": 875
+    },
+    {
+      "epoch": 0.04467515325354419,
+      "grad_norm": 0.06769961067024773,
+      "learning_rate": 0.0002233502538071066,
+      "loss": 0.7702,
+      "step": 880
+    },
+    {
+      "epoch": 0.04492898935157569,
+      "grad_norm": 0.05797151521946424,
+      "learning_rate": 0.00022461928934010153,
+      "loss": 0.7565,
+      "step": 885
+    },
+    {
+      "epoch": 0.04518282544960719,
+      "grad_norm": 0.06349124685160816,
+      "learning_rate": 0.00022588832487309646,
+      "loss": 0.7386,
+      "step": 890
+    },
+    {
+      "epoch": 0.04543666154763869,
+      "grad_norm": 0.07021759323373769,
+      "learning_rate": 0.00022715736040609136,
+      "loss": 0.7306,
+      "step": 895
+    },
+    {
+      "epoch": 0.04569049764567019,
+      "grad_norm": 0.0637596966680832,
+      "learning_rate": 0.00022842639593908628,
+      "loss": 0.7699,
+      "step": 900
+    },
+    {
+      "epoch": 0.045944333743701694,
+      "grad_norm": 0.06500213255962216,
+      "learning_rate": 0.0002296954314720812,
+      "loss": 0.7816,
+      "step": 905
+    },
+    {
+      "epoch": 0.046198169841733194,
+      "grad_norm": 0.06309605572576463,
+      "learning_rate": 0.00023096446700507614,
+      "loss": 0.749,
+      "step": 910
+    },
+    {
+      "epoch": 0.046452005939764694,
+      "grad_norm": 0.05963186976020712,
+      "learning_rate": 0.00023223350253807106,
+      "loss": 0.7287,
+      "step": 915
+    },
+    {
+      "epoch": 0.046705842037796194,
+      "grad_norm": 0.06576264741938838,
+      "learning_rate": 0.000233502538071066,
+      "loss": 0.7624,
+      "step": 920
+    },
+    {
+      "epoch": 0.046959678135827694,
+      "grad_norm": 0.0660867978726128,
+      "learning_rate": 0.00023477157360406092,
+      "loss": 0.7249,
+      "step": 925
+    },
+    {
+      "epoch": 0.047213514233859194,
+      "grad_norm": 0.08980020895136409,
+      "learning_rate": 0.00023604060913705587,
+      "loss": 0.7694,
+      "step": 930
+    },
+    {
+      "epoch": 0.0474673503318907,
+      "grad_norm": 0.06355755979289256,
+      "learning_rate": 0.00023730964467005077,
+      "loss": 0.7519,
+      "step": 935
+    },
+    {
+      "epoch": 0.0477211864299222,
+      "grad_norm": 0.0726778191121942,
+      "learning_rate": 0.0002385786802030457,
+      "loss": 0.7576,
+      "step": 940
+    },
+    {
+      "epoch": 0.0479750225279537,
+      "grad_norm": 0.058859928139284715,
+      "learning_rate": 0.00023984771573604062,
+      "loss": 0.7244,
+      "step": 945
+    },
+    {
+      "epoch": 0.0482288586259852,
+      "grad_norm": 0.06081900024027338,
+      "learning_rate": 0.00024111675126903555,
+      "loss": 0.7373,
+      "step": 950
+    },
+    {
+      "epoch": 0.0484826947240167,
+      "grad_norm": 0.061248646181951284,
+      "learning_rate": 0.00024238578680203047,
+      "loss": 0.7822,
+      "step": 955
+    },
+    {
+      "epoch": 0.0487365308220482,
+      "grad_norm": 0.06672583029591972,
+      "learning_rate": 0.0002436548223350254,
+      "loss": 0.7488,
+      "step": 960
+    },
+    {
+      "epoch": 0.04899036692007971,
+      "grad_norm": 0.0572603692641915,
+      "learning_rate": 0.0002449238578680203,
+      "loss": 0.7198,
+      "step": 965
+    },
+    {
+      "epoch": 0.04924420301811121,
+      "grad_norm": 0.06681173443942526,
+      "learning_rate": 0.0002461928934010152,
+      "loss": 0.7525,
+      "step": 970
+    },
+    {
+      "epoch": 0.04949803911614271,
+      "grad_norm": 0.07257418993579445,
+      "learning_rate": 0.00024746192893401015,
+      "loss": 0.7395,
+      "step": 975
+    },
+    {
+      "epoch": 0.04975187521417421,
+      "grad_norm": 0.07302937635349391,
+      "learning_rate": 0.0002487309644670051,
+      "loss": 0.7372,
+      "step": 980
+    },
+    {
+      "epoch": 0.05000571131220571,
+      "grad_norm": 0.06996794132833503,
+      "learning_rate": 0.00025,
+      "loss": 0.729,
+      "step": 985
+    },
+    {
+      "epoch": 0.05025954741023721,
+      "grad_norm": 0.05788422679267036,
+      "learning_rate": 0.00025126903553299493,
+      "loss": 0.7263,
+      "step": 990
+    },
+    {
+      "epoch": 0.050513383508268714,
+      "grad_norm": 0.06234697260625598,
+      "learning_rate": 0.00025253807106598986,
+      "loss": 0.7245,
+      "step": 995
+    },
+    {
+      "epoch": 0.050767219606300214,
+      "grad_norm": 0.058460398878268476,
+      "learning_rate": 0.0002538071065989848,
+      "loss": 0.7382,
+      "step": 1000
+    },
+    {
+      "epoch": 0.051021055704331714,
+      "grad_norm": 0.05925618041244326,
+      "learning_rate": 0.0002550761421319797,
+      "loss": 0.7431,
+      "step": 1005
+    },
+    {
+      "epoch": 0.051274891802363214,
+      "grad_norm": 0.059944269822413306,
+      "learning_rate": 0.00025634517766497464,
+      "loss": 0.7556,
+      "step": 1010
+    },
+    {
+      "epoch": 0.051528727900394714,
+      "grad_norm": 0.05405315094033796,
+      "learning_rate": 0.00025761421319796956,
+      "loss": 0.6909,
+      "step": 1015
+    },
+    {
+      "epoch": 0.05178256399842621,
+      "grad_norm": 0.05752428711722885,
+      "learning_rate": 0.0002588832487309645,
+      "loss": 0.7232,
+      "step": 1020
+    },
+    {
+      "epoch": 0.05203640009645772,
+      "grad_norm": 0.06068982302190885,
+      "learning_rate": 0.00026015228426395936,
+      "loss": 0.7159,
+      "step": 1025
+    },
+    {
+      "epoch": 0.05229023619448922,
+      "grad_norm": 0.06926054723257352,
+      "learning_rate": 0.00026142131979695434,
+      "loss": 0.7097,
+      "step": 1030
+    },
+    {
+      "epoch": 0.05254407229252072,
+      "grad_norm": 0.05944640426895496,
+      "learning_rate": 0.0002626903553299492,
+      "loss": 0.7266,
+      "step": 1035
+    },
+    {
+      "epoch": 0.05279790839055222,
+      "grad_norm": 0.05911056311965479,
+      "learning_rate": 0.0002639593908629442,
+      "loss": 0.7034,
+      "step": 1040
+    },
+    {
+      "epoch": 0.05305174448858372,
+      "grad_norm": 0.05404273390108656,
+      "learning_rate": 0.00026522842639593907,
+      "loss": 0.7263,
+      "step": 1045
+    },
+    {
+      "epoch": 0.05330558058661522,
+      "grad_norm": 0.0681569692981572,
+      "learning_rate": 0.00026649746192893405,
+      "loss": 0.709,
+      "step": 1050
+    },
+    {
+      "epoch": 0.05355941668464673,
+      "grad_norm": 0.061027222712780306,
+      "learning_rate": 0.0002677664974619289,
+      "loss": 0.745,
+      "step": 1055
+    },
+    {
+      "epoch": 0.05381325278267823,
+      "grad_norm": 0.060423144178792326,
+      "learning_rate": 0.00026903553299492385,
+      "loss": 0.7525,
+      "step": 1060
+    },
+    {
+      "epoch": 0.05406708888070973,
+      "grad_norm": 0.05967597970626861,
+      "learning_rate": 0.00027030456852791877,
+      "loss": 0.739,
+      "step": 1065
+    },
+    {
+      "epoch": 0.05432092497874123,
+      "grad_norm": 0.06674697326887237,
+      "learning_rate": 0.0002715736040609137,
+      "loss": 0.754,
+      "step": 1070
+    },
+    {
+      "epoch": 0.054574761076772726,
+      "grad_norm": 0.06984166791857392,
+      "learning_rate": 0.0002728426395939086,
+      "loss": 0.7347,
+      "step": 1075
+    },
+    {
+      "epoch": 0.054828597174804226,
+      "grad_norm": 0.05660547722828721,
+      "learning_rate": 0.00027411167512690355,
+      "loss": 0.7372,
+      "step": 1080
+    },
+    {
+      "epoch": 0.05508243327283573,
+      "grad_norm": 0.061731160855571954,
+      "learning_rate": 0.0002753807106598985,
+      "loss": 0.7273,
+      "step": 1085
+    },
+    {
+      "epoch": 0.05533626937086723,
+      "grad_norm": 0.05843567186487286,
+      "learning_rate": 0.0002766497461928934,
+      "loss": 0.7405,
+      "step": 1090
+    },
+    {
+      "epoch": 0.05559010546889873,
+      "grad_norm": 0.06160351540468081,
+      "learning_rate": 0.0002779187817258883,
+      "loss": 0.7392,
+      "step": 1095
+    },
+    {
+      "epoch": 0.05584394156693023,
+      "grad_norm": 0.06302649918893416,
+      "learning_rate": 0.00027918781725888326,
+      "loss": 0.7169,
+      "step": 1100
+    },
+    {
+      "epoch": 0.05609777766496173,
+      "grad_norm": 0.26748522478789255,
+      "learning_rate": 0.0002804568527918782,
+      "loss": 0.7258,
+      "step": 1105
+    },
+    {
+      "epoch": 0.05635161376299323,
+      "grad_norm": 0.060283031147792536,
+      "learning_rate": 0.0002817258883248731,
+      "loss": 0.7216,
+      "step": 1110
+    },
+    {
+      "epoch": 0.05660544986102474,
+      "grad_norm": 0.06726444207819834,
+      "learning_rate": 0.00028299492385786804,
+      "loss": 0.751,
+      "step": 1115
+    },
+    {
+      "epoch": 0.05685928595905624,
+      "grad_norm": 0.09905598633925246,
+      "learning_rate": 0.00028426395939086296,
+      "loss": 0.7714,
+      "step": 1120
+    },
+    {
+      "epoch": 0.05711312205708774,
+      "grad_norm": 0.1277329704703035,
+      "learning_rate": 0.0002855329949238579,
+      "loss": 0.7602,
+      "step": 1125
+    },
+    {
+      "epoch": 0.05736695815511924,
+      "grad_norm": 0.20723659499952354,
+      "learning_rate": 0.0002868020304568528,
+      "loss": 0.7063,
+      "step": 1130
+    },
+    {
+      "epoch": 0.05762079425315074,
+      "grad_norm": 0.08585174275787383,
+      "learning_rate": 0.00028807106598984774,
+      "loss": 0.7199,
+      "step": 1135
+    },
+    {
+      "epoch": 0.05787463035118224,
+      "grad_norm": 0.0660384074730241,
+      "learning_rate": 0.0002893401015228426,
+      "loss": 0.7308,
+      "step": 1140
+    },
+    {
+      "epoch": 0.05812846644921374,
+      "grad_norm": 0.19383133097044608,
+      "learning_rate": 0.0002906091370558376,
+      "loss": 0.9052,
+      "step": 1145
+    },
+    {
+      "epoch": 0.058382302547245246,
+      "grad_norm": 0.24673064263656,
+      "learning_rate": 0.00029187817258883247,
+      "loss": 0.7624,
+      "step": 1150
+    },
+    {
+      "epoch": 0.058636138645276746,
+      "grad_norm": 32.757348375145966,
+      "learning_rate": 0.00029314720812182745,
+      "loss": 0.7527,
+      "step": 1155
+    },
+    {
+      "epoch": 0.058889974743308246,
+      "grad_norm": 0.09916235027198479,
+      "learning_rate": 0.0002944162436548223,
+      "loss": 0.7613,
+      "step": 1160
+    },
+    {
+      "epoch": 0.059143810841339746,
+      "grad_norm": 0.12917795113084668,
+      "learning_rate": 0.0002956852791878173,
+      "loss": 0.7631,
+      "step": 1165
+    },
+    {
+      "epoch": 0.059397646939371246,
+      "grad_norm": 0.08883151007488581,
+      "learning_rate": 0.00029695431472081217,
+      "loss": 0.7412,
+      "step": 1170
+    },
+    {
+      "epoch": 0.059651483037402746,
+      "grad_norm": 0.08449886407082698,
+      "learning_rate": 0.0002982233502538071,
+      "loss": 0.7789,
+      "step": 1175
+    },
+    {
+      "epoch": 0.05990531913543425,
+      "grad_norm": 0.12057734525050375,
+      "learning_rate": 0.000299492385786802,
+      "loss": 0.7513,
+      "step": 1180
+    },
+    {
+      "epoch": 0.06015915523346575,
+      "grad_norm": 0.16122305869580125,
+      "learning_rate": 0.00030076142131979695,
+      "loss": 0.8202,
+      "step": 1185
+    },
+    {
+      "epoch": 0.06041299133149725,
+      "grad_norm": 0.09478283020933216,
+      "learning_rate": 0.0003020304568527919,
+      "loss": 0.7556,
+      "step": 1190
+    },
+    {
+      "epoch": 0.06066682742952875,
+      "grad_norm": 1.1482190144535993,
+      "learning_rate": 0.0003032994923857868,
+      "loss": 0.742,
+      "step": 1195
+    },
+    {
+      "epoch": 0.06092066352756025,
+      "grad_norm": 0.08948971775561998,
+      "learning_rate": 0.00030456852791878173,
+      "loss": 0.7897,
+      "step": 1200
+    },
+    {
+      "epoch": 0.06117449962559175,
+      "grad_norm": 0.11543924190628982,
+      "learning_rate": 0.00030583756345177666,
+      "loss": 0.7372,
+      "step": 1205
+    },
+    {
+      "epoch": 0.06142833572362326,
+      "grad_norm": 0.06522079820810416,
+      "learning_rate": 0.0003071065989847716,
+      "loss": 0.7409,
+      "step": 1210
+    },
+    {
+      "epoch": 0.06168217182165476,
+      "grad_norm": 0.05653686696839828,
+      "learning_rate": 0.0003083756345177665,
+      "loss": 0.7432,
+      "step": 1215
+    },
+    {
+      "epoch": 0.06193600791968626,
+      "grad_norm": 0.06485576389961441,
+      "learning_rate": 0.00030964467005076144,
+      "loss": 0.7637,
+      "step": 1220
+    },
+    {
+      "epoch": 0.06218984401771776,
+      "grad_norm": 0.06287754156459976,
+      "learning_rate": 0.00031091370558375636,
+      "loss": 0.7496,
+      "step": 1225
+    },
+    {
+      "epoch": 0.06244368011574926,
+      "grad_norm": 0.06275169467007742,
+      "learning_rate": 0.0003121827411167513,
+      "loss": 0.7356,
+      "step": 1230
+    },
+    {
+      "epoch": 0.06269751621378077,
+      "grad_norm": 0.1431156053310293,
+      "learning_rate": 0.0003134517766497462,
+      "loss": 0.7021,
+      "step": 1235
+    },
+    {
+      "epoch": 0.06295135231181226,
+      "grad_norm": 0.0698152906671079,
+      "learning_rate": 0.00031472081218274114,
+      "loss": 0.7369,
+      "step": 1240
+    },
+    {
+      "epoch": 0.06320518840984377,
+      "grad_norm": 0.06474169533744434,
+      "learning_rate": 0.000315989847715736,
+      "loss": 0.7431,
+      "step": 1245
+    },
+    {
+      "epoch": 0.06345902450787526,
+      "grad_norm": 0.05656964302149857,
+      "learning_rate": 0.000317258883248731,
+      "loss": 0.7279,
+      "step": 1250
+    },
+    {
+      "epoch": 0.06371286060590677,
+      "grad_norm": 0.06008444247403088,
+      "learning_rate": 0.00031852791878172587,
+      "loss": 0.7255,
+      "step": 1255
+    },
+    {
+      "epoch": 0.06396669670393827,
+      "grad_norm": 0.06258676825297353,
+      "learning_rate": 0.00031979695431472085,
+      "loss": 0.7294,
+      "step": 1260
+    },
+    {
+      "epoch": 0.06422053280196977,
+      "grad_norm": 0.06101644314132169,
+      "learning_rate": 0.0003210659898477157,
+      "loss": 0.7495,
+      "step": 1265
+    },
+    {
+      "epoch": 0.06447436890000127,
+      "grad_norm": 0.05784297759132409,
+      "learning_rate": 0.0003223350253807107,
+      "loss": 0.7089,
+      "step": 1270
+    },
+    {
+      "epoch": 0.06472820499803277,
+      "grad_norm": 0.06608430212446814,
+      "learning_rate": 0.00032360406091370557,
+      "loss": 0.729,
+      "step": 1275
+    },
+    {
+      "epoch": 0.06498204109606427,
+      "grad_norm": 0.06682999306491608,
+      "learning_rate": 0.0003248730964467005,
+      "loss": 0.7659,
+      "step": 1280
+    },
+    {
+      "epoch": 0.06523587719409578,
+      "grad_norm": 0.05567632533610063,
+      "learning_rate": 0.0003261421319796954,
+      "loss": 0.7043,
+      "step": 1285
+    },
+    {
+      "epoch": 0.06548971329212727,
+      "grad_norm": 0.08049433253072921,
+      "learning_rate": 0.00032741116751269035,
+      "loss": 0.7045,
+      "step": 1290
+    },
+    {
+      "epoch": 0.06574354939015878,
+      "grad_norm": 0.06943993107179286,
+      "learning_rate": 0.0003286802030456853,
+      "loss": 0.7442,
+      "step": 1295
+    },
+    {
+      "epoch": 0.06599738548819027,
+      "grad_norm": 0.3124576265680848,
+      "learning_rate": 0.0003299492385786802,
+      "loss": 0.7156,
+      "step": 1300
+    },
+    {
+      "epoch": 0.06625122158622178,
+      "grad_norm": 0.05993979750837367,
+      "learning_rate": 0.00033121827411167513,
+      "loss": 0.7378,
+      "step": 1305
+    },
+    {
+      "epoch": 0.06650505768425327,
+      "grad_norm": 0.05955364139261034,
+      "learning_rate": 0.00033248730964467006,
+      "loss": 0.7258,
+      "step": 1310
+    },
+    {
+      "epoch": 0.06675889378228478,
+      "grad_norm": 0.08613170534764741,
+      "learning_rate": 0.00033375634517766493,
+      "loss": 0.7505,
+      "step": 1315
+    },
+    {
+      "epoch": 0.06701272988031629,
+      "grad_norm": 0.06998830116145732,
+      "learning_rate": 0.0003350253807106599,
+      "loss": 0.7261,
+      "step": 1320
+    },
+    {
+      "epoch": 0.06726656597834778,
+      "grad_norm": 0.05795740324311744,
+      "learning_rate": 0.00033629441624365484,
+      "loss": 0.7025,
+      "step": 1325
+    },
+    {
+      "epoch": 0.06752040207637929,
+      "grad_norm": 0.055676641145626066,
+      "learning_rate": 0.00033756345177664976,
+      "loss": 0.72,
+      "step": 1330
+    },
+    {
+      "epoch": 0.06777423817441078,
+      "grad_norm": 0.05604862800727641,
+      "learning_rate": 0.0003388324873096447,
+      "loss": 0.7285,
+      "step": 1335
+    },
+    {
+      "epoch": 0.06802807427244228,
+      "grad_norm": 0.05356518336455629,
+      "learning_rate": 0.0003401015228426396,
+      "loss": 0.7386,
+      "step": 1340
+    },
+    {
+      "epoch": 0.06828191037047379,
+      "grad_norm": 0.09605226497693169,
+      "learning_rate": 0.00034137055837563454,
+      "loss": 0.7118,
+      "step": 1345
+    },
+    {
+      "epoch": 0.06853574646850528,
+      "grad_norm": 0.06982897061697936,
+      "learning_rate": 0.0003426395939086294,
+      "loss": 0.6877,
+      "step": 1350
+    },
+    {
+      "epoch": 0.06878958256653679,
+      "grad_norm": 0.07303904652751834,
+      "learning_rate": 0.0003439086294416244,
+      "loss": 0.7174,
+      "step": 1355
+    },
+    {
+      "epoch": 0.06904341866456828,
+      "grad_norm": 0.07291571688385211,
+      "learning_rate": 0.00034517766497461927,
+      "loss": 0.7344,
+      "step": 1360
+    },
+    {
+      "epoch": 0.06929725476259979,
+      "grad_norm": 0.07543740254911013,
+      "learning_rate": 0.00034644670050761425,
+      "loss": 0.7053,
+      "step": 1365
+    },
+    {
+      "epoch": 0.06955109086063128,
+      "grad_norm": 0.06625493235058802,
+      "learning_rate": 0.0003477157360406091,
+      "loss": 0.7,
+      "step": 1370
+    },
+    {
+      "epoch": 0.06980492695866279,
+      "grad_norm": 0.05544221962273842,
+      "learning_rate": 0.0003489847715736041,
+      "loss": 0.7211,
+      "step": 1375
+    },
+    {
+      "epoch": 0.0700587630566943,
+      "grad_norm": 0.07915130432819355,
+      "learning_rate": 0.00035025380710659897,
+      "loss": 0.7113,
+      "step": 1380
+    },
+    {
+      "epoch": 0.07031259915472579,
+      "grad_norm": 0.37754027559534814,
+      "learning_rate": 0.00035152284263959395,
+      "loss": 0.7025,
+      "step": 1385
+    },
+    {
+      "epoch": 0.0705664352527573,
+      "grad_norm": 0.12584003699209365,
+      "learning_rate": 0.0003527918781725888,
+      "loss": 0.7465,
+      "step": 1390
+    },
+    {
+      "epoch": 0.07082027135078879,
+      "grad_norm": 0.06582449891223112,
+      "learning_rate": 0.00035406091370558375,
+      "loss": 0.7178,
+      "step": 1395
+    },
+    {
+      "epoch": 0.0710741074488203,
+      "grad_norm": 0.06342116685715943,
+      "learning_rate": 0.0003553299492385787,
+      "loss": 0.7553,
+      "step": 1400
+    },
+    {
+      "epoch": 0.0713279435468518,
+      "grad_norm": 0.062008656515146095,
+      "learning_rate": 0.0003565989847715736,
+      "loss": 0.7584,
+      "step": 1405
+    },
+    {
+      "epoch": 0.0715817796448833,
+      "grad_norm": 0.15575295778070705,
+      "learning_rate": 0.00035786802030456853,
+      "loss": 0.7721,
+      "step": 1410
+    },
+    {
+      "epoch": 0.0718356157429148,
+      "grad_norm": 0.07024393682133559,
+      "learning_rate": 0.00035913705583756346,
+      "loss": 0.716,
+      "step": 1415
+    },
+    {
+      "epoch": 0.0720894518409463,
+      "grad_norm": 0.06141309488998922,
+      "learning_rate": 0.0003604060913705584,
+      "loss": 0.7537,
+      "step": 1420
+    },
+    {
+      "epoch": 0.0723432879389778,
+      "grad_norm": 0.05848534147436462,
+      "learning_rate": 0.0003616751269035533,
+      "loss": 0.7358,
+      "step": 1425
+    },
+    {
+      "epoch": 0.0725971240370093,
+      "grad_norm": 0.05896313633341948,
+      "learning_rate": 0.00036294416243654823,
+      "loss": 0.7163,
+      "step": 1430
+    },
+    {
+      "epoch": 0.0728509601350408,
+      "grad_norm": 0.0612049129333866,
+      "learning_rate": 0.00036421319796954316,
+      "loss": 0.7278,
+      "step": 1435
+    },
+    {
+      "epoch": 0.07310479623307231,
+      "grad_norm": 0.20510721585245476,
+      "learning_rate": 0.0003654822335025381,
+      "loss": 0.7204,
+      "step": 1440
+    },
+    {
+      "epoch": 0.0733586323311038,
+      "grad_norm": 0.06353888649819851,
+      "learning_rate": 0.000366751269035533,
+      "loss": 0.703,
+      "step": 1445
+    },
+    {
+      "epoch": 0.07361246842913531,
+      "grad_norm": 0.11598595193927975,
+      "learning_rate": 0.00036802030456852794,
+      "loss": 0.7353,
+      "step": 1450
+    },
+    {
+      "epoch": 0.0738663045271668,
+      "grad_norm": 0.06735218740803854,
+      "learning_rate": 0.00036928934010152287,
+      "loss": 0.7387,
+      "step": 1455
+    },
+    {
+      "epoch": 0.07412014062519831,
+      "grad_norm": 0.056456597607688834,
+      "learning_rate": 0.0003705583756345178,
+      "loss": 0.7244,
+      "step": 1460
+    },
+    {
+      "epoch": 0.0743739767232298,
+      "grad_norm": 0.06695445879742773,
+      "learning_rate": 0.00037182741116751266,
+      "loss": 0.733,
+      "step": 1465
+    },
+    {
+      "epoch": 0.07462781282126131,
+      "grad_norm": 0.062469347353580236,
+      "learning_rate": 0.00037309644670050765,
+      "loss": 0.7283,
+      "step": 1470
+    },
+    {
+      "epoch": 0.07488164891929282,
+      "grad_norm": 0.10743187268032847,
+      "learning_rate": 0.0003743654822335025,
+      "loss": 0.7627,
+      "step": 1475
+    },
+    {
+      "epoch": 0.07513548501732431,
+      "grad_norm": 0.06985614418900853,
+      "learning_rate": 0.0003756345177664975,
+      "loss": 0.7398,
+      "step": 1480
+    },
+    {
+      "epoch": 0.07538932111535582,
+      "grad_norm": 4.112242925569732,
+      "learning_rate": 0.00037690355329949237,
+      "loss": 0.7503,
+      "step": 1485
+    },
+    {
+      "epoch": 0.07564315721338731,
+      "grad_norm": 3.056249205800858,
+      "learning_rate": 0.00037817258883248735,
+      "loss": 0.7365,
+      "step": 1490
+    },
+    {
+      "epoch": 0.07589699331141882,
+      "grad_norm": 0.08367423229399179,
+      "learning_rate": 0.0003794416243654822,
+      "loss": 0.751,
+      "step": 1495
+    },
+    {
+      "epoch": 0.07615082940945032,
+      "grad_norm": 0.17368019423571526,
+      "learning_rate": 0.00038071065989847715,
+      "loss": 0.7447,
+      "step": 1500
+    },
+    {
+      "epoch": 0.07640466550748182,
+      "grad_norm": 0.06787177882481868,
+      "learning_rate": 0.0003819796954314721,
+      "loss": 0.7489,
+      "step": 1505
+    },
+    {
+      "epoch": 0.07665850160551332,
+      "grad_norm": 0.07169631398568231,
+      "learning_rate": 0.000383248730964467,
+      "loss": 0.7393,
+      "step": 1510
+    },
+    {
+      "epoch": 0.07691233770354482,
+      "grad_norm": 0.05749839111352872,
+      "learning_rate": 0.00038451776649746193,
+      "loss": 0.721,
+      "step": 1515
+    },
+    {
+      "epoch": 0.07716617380157632,
+      "grad_norm": 0.08575887204955374,
+      "learning_rate": 0.00038578680203045685,
+      "loss": 0.7128,
+      "step": 1520
+    },
+    {
+      "epoch": 0.07742000989960782,
+      "grad_norm": 0.05613491181829679,
+      "learning_rate": 0.0003870558375634518,
+      "loss": 0.7338,
+      "step": 1525
+    },
+    {
+      "epoch": 0.07767384599763932,
+      "grad_norm": 0.06300181911559392,
+      "learning_rate": 0.0003883248730964467,
+      "loss": 0.7337,
+      "step": 1530
+    },
+    {
+      "epoch": 0.07792768209567083,
+      "grad_norm": 0.07134915135851151,
+      "learning_rate": 0.00038959390862944163,
+      "loss": 0.7629,
+      "step": 1535
+    },
+    {
+      "epoch": 0.07818151819370232,
+      "grad_norm": 0.05162935081471609,
+      "learning_rate": 0.00039086294416243656,
+      "loss": 0.6955,
+      "step": 1540
+    },
+    {
+      "epoch": 0.07843535429173383,
+      "grad_norm": 0.06414129871881698,
+      "learning_rate": 0.0003921319796954315,
+      "loss": 0.7301,
+      "step": 1545
+    },
+    {
+      "epoch": 0.07868919038976532,
+      "grad_norm": 0.05373549484924304,
+      "learning_rate": 0.0003934010152284264,
+      "loss": 0.6976,
+      "step": 1550
+    },
+    {
+      "epoch": 0.07894302648779683,
+      "grad_norm": 0.06837620727230255,
+      "learning_rate": 0.00039467005076142134,
+      "loss": 0.6985,
+      "step": 1555
+    },
+    {
+      "epoch": 0.07919686258582834,
+      "grad_norm": 0.07846375652980406,
+      "learning_rate": 0.00039593908629441627,
+      "loss": 0.7338,
+      "step": 1560
+    },
+    {
+      "epoch": 0.07945069868385983,
+      "grad_norm": 0.06174362803606399,
+      "learning_rate": 0.0003972081218274112,
+      "loss": 0.7168,
+      "step": 1565
+    },
+    {
+      "epoch": 0.07970453478189134,
+      "grad_norm": 0.05882865445136937,
+      "learning_rate": 0.00039847715736040606,
+      "loss": 0.7184,
+      "step": 1570
+    },
+    {
+      "epoch": 0.07995837087992283,
+      "grad_norm": 0.0493701696839989,
+      "learning_rate": 0.00039974619289340104,
+      "loss": 0.7053,
+      "step": 1575
+    },
+    {
+      "epoch": 0.08021220697795434,
+      "grad_norm": 0.054577428336826876,
+      "learning_rate": 0.0004010152284263959,
+      "loss": 0.7315,
+      "step": 1580
+    },
+    {
+      "epoch": 0.08046604307598583,
+      "grad_norm": 0.05891870223720398,
+      "learning_rate": 0.0004022842639593909,
+      "loss": 0.708,
+      "step": 1585
+    },
+    {
+      "epoch": 0.08071987917401734,
+      "grad_norm": 0.09122730379802985,
+      "learning_rate": 0.00040355329949238577,
+      "loss": 0.6886,
+      "step": 1590
+    },
+    {
+      "epoch": 0.08097371527204884,
+      "grad_norm": 0.05434428539871062,
+      "learning_rate": 0.00040482233502538075,
+      "loss": 0.7235,
+      "step": 1595
+    },
+    {
+      "epoch": 0.08122755137008034,
+      "grad_norm": 0.057406938749746325,
+      "learning_rate": 0.0004060913705583756,
+      "loss": 0.6998,
+      "step": 1600
+    },
+    {
+      "epoch": 0.08148138746811184,
+      "grad_norm": 0.05624064887105565,
+      "learning_rate": 0.0004073604060913706,
+      "loss": 0.7167,
+      "step": 1605
+    },
+    {
+      "epoch": 0.08173522356614334,
+      "grad_norm": 0.08475156677013762,
+      "learning_rate": 0.0004086294416243655,
+      "loss": 0.717,
+      "step": 1610
+    },
+    {
+      "epoch": 0.08198905966417484,
+      "grad_norm": 0.061648445637681494,
+      "learning_rate": 0.0004098984771573604,
+      "loss": 0.7219,
+      "step": 1615
+    },
+    {
+      "epoch": 0.08224289576220635,
+      "grad_norm": 0.05747063059303642,
+      "learning_rate": 0.00041116751269035533,
+      "loss": 0.698,
+      "step": 1620
+    },
+    {
+      "epoch": 0.08249673186023784,
+      "grad_norm": 0.0531564390368294,
+      "learning_rate": 0.00041243654822335025,
+      "loss": 0.6933,
+      "step": 1625
+    },
+    {
+      "epoch": 0.08275056795826935,
+      "grad_norm": 0.0531546385182133,
+      "learning_rate": 0.0004137055837563452,
+      "loss": 0.7269,
+      "step": 1630
+    },
+    {
+      "epoch": 0.08300440405630084,
+      "grad_norm": 0.05287338545106486,
+      "learning_rate": 0.0004149746192893401,
+      "loss": 0.7225,
+      "step": 1635
+    },
+    {
+      "epoch": 0.08325824015433235,
+      "grad_norm": 0.05148433899438861,
+      "learning_rate": 0.0004162436548223351,
+      "loss": 0.7105,
+      "step": 1640
+    },
+    {
+      "epoch": 0.08351207625236384,
+      "grad_norm": 0.05687600911185101,
+      "learning_rate": 0.00041751269035532996,
+      "loss": 0.7124,
+      "step": 1645
+    },
+    {
+      "epoch": 0.08376591235039535,
+      "grad_norm": 0.05524705398314237,
+      "learning_rate": 0.0004187817258883249,
+      "loss": 0.7175,
+      "step": 1650
+    },
+    {
+      "epoch": 0.08401974844842686,
+      "grad_norm": 0.05772786142305999,
+      "learning_rate": 0.0004200507614213198,
+      "loss": 0.72,
+      "step": 1655
+    },
+    {
+      "epoch": 0.08427358454645835,
+      "grad_norm": 0.06460258179361435,
+      "learning_rate": 0.00042131979695431474,
+      "loss": 0.7155,
+      "step": 1660
+    },
+    {
+      "epoch": 0.08452742064448986,
+      "grad_norm": 0.08154444806905058,
+      "learning_rate": 0.00042258883248730967,
+      "loss": 0.7249,
+      "step": 1665
+    },
+    {
+      "epoch": 0.08478125674252135,
+      "grad_norm": 0.07036154319504197,
+      "learning_rate": 0.0004238578680203046,
+      "loss": 0.7018,
+      "step": 1670
+    },
+    {
+      "epoch": 0.08503509284055286,
+      "grad_norm": 0.07974859335868449,
+      "learning_rate": 0.0004251269035532995,
+      "loss": 0.6848,
+      "step": 1675
+    },
+    {
+      "epoch": 0.08528892893858436,
+      "grad_norm": 0.07106622592720147,
+      "learning_rate": 0.00042639593908629444,
+      "loss": 0.7306,
+      "step": 1680
+    },
+    {
+      "epoch": 0.08554276503661586,
+      "grad_norm": 0.07520683130587769,
+      "learning_rate": 0.0004276649746192893,
+      "loss": 0.6777,
+      "step": 1685
+    },
+    {
+      "epoch": 0.08579660113464736,
+      "grad_norm": 0.060430901146499016,
+      "learning_rate": 0.0004289340101522843,
+      "loss": 0.7051,
+      "step": 1690
+    },
+    {
+      "epoch": 0.08605043723267886,
+      "grad_norm": 0.0635450282792058,
+      "learning_rate": 0.00043020304568527917,
+      "loss": 0.6913,
+      "step": 1695
+    },
+    {
+      "epoch": 0.08630427333071036,
+      "grad_norm": 0.04972958060129512,
+      "learning_rate": 0.00043147208121827415,
+      "loss": 0.7135,
+      "step": 1700
+    },
+    {
+      "epoch": 0.08655810942874186,
+      "grad_norm": 0.04944823830530392,
+      "learning_rate": 0.000432741116751269,
+      "loss": 0.7038,
+      "step": 1705
+    },
+    {
+      "epoch": 0.08681194552677336,
+      "grad_norm": 0.0567985625555269,
+      "learning_rate": 0.000434010152284264,
+      "loss": 0.7287,
+      "step": 1710
+    },
+    {
+      "epoch": 0.08706578162480487,
+      "grad_norm": 0.051682155754569976,
+      "learning_rate": 0.0004352791878172589,
+      "loss": 0.6989,
+      "step": 1715
+    },
+    {
+      "epoch": 0.08731961772283636,
+      "grad_norm": 0.057888169540962604,
+      "learning_rate": 0.0004365482233502538,
+      "loss": 0.6968,
+      "step": 1720
+    },
+    {
+      "epoch": 0.08757345382086787,
+      "grad_norm": 0.04580235344300138,
+      "learning_rate": 0.00043781725888324873,
+      "loss": 0.7062,
+      "step": 1725
+    },
+    {
+      "epoch": 0.08782728991889936,
+      "grad_norm": 0.05599349020439277,
+      "learning_rate": 0.00043908629441624365,
+      "loss": 0.7211,
+      "step": 1730
+    },
+    {
+      "epoch": 0.08808112601693087,
+      "grad_norm": 0.06293126880283036,
+      "learning_rate": 0.0004403553299492386,
+      "loss": 0.7035,
+      "step": 1735
+    },
+    {
+      "epoch": 0.08833496211496236,
+      "grad_norm": 0.05082245406055062,
+      "learning_rate": 0.0004416243654822335,
+      "loss": 0.674,
+      "step": 1740
+    },
+    {
+      "epoch": 0.08858879821299387,
+      "grad_norm": 0.05354641526756831,
+      "learning_rate": 0.00044289340101522843,
+      "loss": 0.6947,
+      "step": 1745
+    },
+    {
+      "epoch": 0.08884263431102538,
+      "grad_norm": 0.05188231038829293,
+      "learning_rate": 0.00044416243654822336,
+      "loss": 0.6985,
+      "step": 1750
+    },
+    {
+      "epoch": 0.08909647040905687,
+      "grad_norm": 0.28571205654683896,
+      "learning_rate": 0.0004454314720812183,
+      "loss": 0.7366,
+      "step": 1755
+    },
+    {
+      "epoch": 0.08935030650708838,
+      "grad_norm": 0.05009396851866736,
+      "learning_rate": 0.0004467005076142132,
+      "loss": 0.7037,
+      "step": 1760
+    },
+    {
+      "epoch": 0.08960414260511987,
+      "grad_norm": 0.057008248968552507,
+      "learning_rate": 0.00044796954314720814,
+      "loss": 0.7004,
+      "step": 1765
+    },
+    {
+      "epoch": 0.08985797870315138,
+      "grad_norm": 0.08570192402076927,
+      "learning_rate": 0.00044923857868020306,
+      "loss": 0.6962,
+      "step": 1770
+    },
+    {
+      "epoch": 0.09011181480118288,
+      "grad_norm": 0.07466469422515871,
+      "learning_rate": 0.000450507614213198,
+      "loss": 0.6782,
+      "step": 1775
+    },
+    {
+      "epoch": 0.09036565089921438,
+      "grad_norm": 0.050896499358629874,
+      "learning_rate": 0.0004517766497461929,
+      "loss": 0.6832,
+      "step": 1780
+    },
+    {
+      "epoch": 0.09061948699724588,
+      "grad_norm": 0.05086157843514099,
+      "learning_rate": 0.00045304568527918784,
+      "loss": 0.7398,
+      "step": 1785
+    },
+    {
+      "epoch": 0.09087332309527738,
+      "grad_norm": 0.048846685380405284,
+      "learning_rate": 0.0004543147208121827,
+      "loss": 0.7125,
+      "step": 1790
+    },
+    {
+      "epoch": 0.09112715919330888,
+      "grad_norm": 0.06253755883118842,
+      "learning_rate": 0.0004555837563451777,
+      "loss": 0.7122,
+      "step": 1795
+    },
+    {
+      "epoch": 0.09138099529134038,
+      "grad_norm": 0.0511560475400958,
+      "learning_rate": 0.00045685279187817257,
+      "loss": 0.7291,
+      "step": 1800
+    },
+    {
+      "epoch": 0.09163483138937188,
+      "grad_norm": 0.052121572152721875,
+      "learning_rate": 0.00045812182741116755,
+      "loss": 0.6912,
+      "step": 1805
+    },
+    {
+      "epoch": 0.09188866748740339,
+      "grad_norm": 0.049287056641165186,
+      "learning_rate": 0.0004593908629441624,
+      "loss": 0.6776,
+      "step": 1810
+    },
+    {
+      "epoch": 0.09214250358543488,
+      "grad_norm": 0.04890540112646688,
+      "learning_rate": 0.0004606598984771574,
+      "loss": 0.6833,
+      "step": 1815
+    },
+    {
+      "epoch": 0.09239633968346639,
+      "grad_norm": 0.0640968907329142,
+      "learning_rate": 0.0004619289340101523,
+      "loss": 0.7165,
+      "step": 1820
+    },
+    {
+      "epoch": 0.09265017578149788,
+      "grad_norm": 0.09373858909563126,
+      "learning_rate": 0.0004631979695431472,
+      "loss": 0.6946,
+      "step": 1825
+    },
+    {
+      "epoch": 0.09290401187952939,
+      "grad_norm": 0.0502898696426856,
+      "learning_rate": 0.0004644670050761421,
+      "loss": 0.7373,
+      "step": 1830
+    },
+    {
+      "epoch": 0.0931578479775609,
+      "grad_norm": 0.052116651580975205,
+      "learning_rate": 0.00046573604060913705,
+      "loss": 0.6888,
+      "step": 1835
+    },
+    {
+      "epoch": 0.09341168407559239,
+      "grad_norm": 0.06418059125424298,
+      "learning_rate": 0.000467005076142132,
+      "loss": 0.6905,
+      "step": 1840
+    },
+    {
+      "epoch": 0.0936655201736239,
+      "grad_norm": 0.06311317514924268,
+      "learning_rate": 0.0004682741116751269,
+      "loss": 0.6741,
+      "step": 1845
+    },
+    {
+      "epoch": 0.09391935627165539,
+      "grad_norm": 0.04939022002771504,
+      "learning_rate": 0.00046954314720812183,
+      "loss": 0.7203,
+      "step": 1850
+    },
+    {
+      "epoch": 0.0941731923696869,
+      "grad_norm": 0.05910410775267137,
+      "learning_rate": 0.00047081218274111676,
+      "loss": 0.7099,
+      "step": 1855
+    },
+    {
+      "epoch": 0.09442702846771839,
+      "grad_norm": 0.0814151828326972,
+      "learning_rate": 0.00047208121827411174,
+      "loss": 0.7248,
+      "step": 1860
+    },
+    {
+      "epoch": 0.0946808645657499,
+      "grad_norm": 0.8076656010487256,
+      "learning_rate": 0.0004733502538071066,
+      "loss": 0.7253,
+      "step": 1865
+    },
+    {
+      "epoch": 0.0949347006637814,
+      "grad_norm": 0.06765805196660898,
+      "learning_rate": 0.00047461928934010154,
+      "loss": 0.6885,
+      "step": 1870
+    },
+    {
+      "epoch": 0.0951885367618129,
+      "grad_norm": 0.05841136910440899,
+      "learning_rate": 0.00047588832487309646,
+      "loss": 0.7059,
+      "step": 1875
+    },
+    {
+      "epoch": 0.0954423728598444,
+      "grad_norm": 0.06300415339926192,
+      "learning_rate": 0.0004771573604060914,
+      "loss": 0.7087,
+      "step": 1880
+    },
+    {
+      "epoch": 0.0956962089578759,
+      "grad_norm": 0.0740423426972598,
+      "learning_rate": 0.0004784263959390863,
+      "loss": 0.6992,
+      "step": 1885
+    },
+    {
+      "epoch": 0.0959500450559074,
+      "grad_norm": 0.06612814847151591,
+      "learning_rate": 0.00047969543147208124,
+      "loss": 0.713,
+      "step": 1890
+    },
+    {
+      "epoch": 0.09620388115393891,
+      "grad_norm": 0.05216005232585783,
+      "learning_rate": 0.00048096446700507617,
+      "loss": 0.707,
+      "step": 1895
+    },
+    {
+      "epoch": 0.0964577172519704,
+      "grad_norm": 0.05326644260558058,
+      "learning_rate": 0.0004822335025380711,
+      "loss": 0.7025,
+      "step": 1900
+    },
+    {
+      "epoch": 0.09671155335000191,
+      "grad_norm": 0.06471751560857093,
+      "learning_rate": 0.00048350253807106597,
+      "loss": 0.7059,
+      "step": 1905
+    },
+    {
+      "epoch": 0.0969653894480334,
+      "grad_norm": 0.06373553800707545,
+      "learning_rate": 0.00048477157360406095,
+      "loss": 0.7124,
+      "step": 1910
+    },
+    {
+      "epoch": 0.09721922554606491,
+      "grad_norm": 0.07107687339224335,
+      "learning_rate": 0.0004860406091370558,
+      "loss": 0.7199,
+      "step": 1915
+    },
+    {
+      "epoch": 0.0974730616440964,
+      "grad_norm": 0.07062703466342107,
+      "learning_rate": 0.0004873096446700508,
+      "loss": 0.7306,
+      "step": 1920
+    },
+    {
+      "epoch": 0.09772689774212791,
+      "grad_norm": 0.05938067294182585,
+      "learning_rate": 0.0004885786802030457,
+      "loss": 0.683,
+      "step": 1925
+    },
+    {
+      "epoch": 0.09798073384015941,
+      "grad_norm": 0.07174216980411427,
+      "learning_rate": 0.0004898477157360406,
+      "loss": 0.7168,
+      "step": 1930
+    },
+    {
+      "epoch": 0.09823456993819091,
+      "grad_norm": 0.06009740027025641,
+      "learning_rate": 0.0004911167512690356,
+      "loss": 0.6961,
+      "step": 1935
+    },
+    {
+      "epoch": 0.09848840603622241,
+      "grad_norm": 0.05206994171495008,
+      "learning_rate": 0.0004923857868020305,
+      "loss": 0.6947,
+      "step": 1940
+    },
+    {
+      "epoch": 0.09874224213425391,
+      "grad_norm": 0.07987693756335262,
+      "learning_rate": 0.0004936548223350254,
+      "loss": 0.6895,
+      "step": 1945
+    },
+    {
+      "epoch": 0.09899607823228541,
+      "grad_norm": 0.05144943467180292,
+      "learning_rate": 0.0004949238578680203,
+      "loss": 0.7062,
+      "step": 1950
+    },
+    {
+      "epoch": 0.09924991433031692,
+      "grad_norm": 0.05082445787484508,
+      "learning_rate": 0.0004961928934010153,
+      "loss": 0.6943,
+      "step": 1955
+    },
+    {
+      "epoch": 0.09950375042834841,
+      "grad_norm": 0.047141885648804095,
+      "learning_rate": 0.0004974619289340102,
+      "loss": 0.6917,
+      "step": 1960
+    },
+    {
+      "epoch": 0.09975758652637992,
+      "grad_norm": 0.047197680147561615,
+      "learning_rate": 0.0004987309644670051,
+      "loss": 0.6746,
+      "step": 1965
+    },
+    {
+      "epoch": 0.10001142262441141,
+      "grad_norm": 0.07137820513966749,
+      "learning_rate": 0.0005,
+      "loss": 0.6841,
+      "step": 1970
+    },
+    {
+      "epoch": 0.10026525872244292,
+      "grad_norm": 0.052618795283310475,
+      "learning_rate": 0.000501269035532995,
+      "loss": 0.6897,
+      "step": 1975
+    },
+    {
+      "epoch": 0.10051909482047441,
+      "grad_norm": 0.05796055424874039,
+      "learning_rate": 0.0005025380710659899,
+      "loss": 0.6861,
+      "step": 1980
+    },
+    {
+      "epoch": 0.10077293091850592,
+      "grad_norm": 0.050447551050472175,
+      "learning_rate": 0.0005038071065989847,
+      "loss": 0.6834,
+      "step": 1985
+    },
+    {
+      "epoch": 0.10102676701653743,
+      "grad_norm": 0.048726174649120095,
+      "learning_rate": 0.0005050761421319797,
+      "loss": 0.6813,
+      "step": 1990
+    },
+    {
+      "epoch": 0.10128060311456892,
+      "grad_norm": 0.07440137618882016,
+      "learning_rate": 0.0005063451776649747,
+      "loss": 0.6647,
+      "step": 1995
+    },
+    {
+      "epoch": 0.10153443921260043,
+      "grad_norm": 0.05076461271762827,
+      "learning_rate": 0.0005076142131979696,
+      "loss": 0.7137,
+      "step": 2000
+    },
+    {
+      "epoch": 0.10178827531063192,
+      "grad_norm": 0.0471884569121775,
+      "learning_rate": 0.0005088832487309644,
+      "loss": 0.6887,
+      "step": 2005
+    },
+    {
+      "epoch": 0.10204211140866343,
+      "grad_norm": 0.11880359913639196,
+      "learning_rate": 0.0005101522842639594,
+      "loss": 0.6919,
+      "step": 2010
+    },
+    {
+      "epoch": 0.10229594750669492,
+      "grad_norm": 0.20996719664722965,
+      "learning_rate": 0.0005114213197969543,
+      "loss": 0.6802,
+      "step": 2015
+    },
+    {
+      "epoch": 0.10254978360472643,
+      "grad_norm": 0.047803725376976,
+      "learning_rate": 0.0005126903553299493,
+      "loss": 0.7059,
+      "step": 2020
+    },
+    {
+      "epoch": 0.10280361970275793,
+      "grad_norm": 0.0449286937065325,
+      "learning_rate": 0.0005139593908629441,
+      "loss": 0.7061,
+      "step": 2025
+    },
+    {
+      "epoch": 0.10305745580078943,
+      "grad_norm": 0.052746260634763203,
+      "learning_rate": 0.0005152284263959391,
+      "loss": 0.6967,
+      "step": 2030
+    },
+    {
+      "epoch": 0.10331129189882093,
+      "grad_norm": 0.04943616988366681,
+      "learning_rate": 0.000516497461928934,
+      "loss": 0.717,
+      "step": 2035
+    },
+    {
+      "epoch": 0.10356512799685243,
+      "grad_norm": 0.049716392637897996,
+      "learning_rate": 0.000517766497461929,
+      "loss": 0.6805,
+      "step": 2040
+    },
+    {
+      "epoch": 0.10381896409488393,
+      "grad_norm": 0.04637995606001784,
+      "learning_rate": 0.0005190355329949239,
+      "loss": 0.6779,
+      "step": 2045
+    },
+    {
+      "epoch": 0.10407280019291544,
+      "grad_norm": 0.04774106805011772,
+      "learning_rate": 0.0005203045685279187,
+      "loss": 0.7253,
+      "step": 2050
+    },
+    {
+      "epoch": 0.10432663629094693,
+      "grad_norm": 0.051793990117296816,
+      "learning_rate": 0.0005215736040609137,
+      "loss": 0.6626,
+      "step": 2055
+    },
+    {
+      "epoch": 0.10458047238897844,
+      "grad_norm": 0.045546210431367966,
+      "learning_rate": 0.0005228426395939087,
+      "loss": 0.6842,
+      "step": 2060
+    },
+    {
+      "epoch": 0.10483430848700993,
+      "grad_norm": 0.048518075416691675,
+      "learning_rate": 0.0005241116751269036,
+      "loss": 0.7328,
+      "step": 2065
+    },
+    {
+      "epoch": 0.10508814458504144,
+      "grad_norm": 0.05679454235982254,
+      "learning_rate": 0.0005253807106598984,
+      "loss": 0.6876,
+      "step": 2070
+    },
+    {
+      "epoch": 0.10534198068307293,
+      "grad_norm": 0.05098370288454924,
+      "learning_rate": 0.0005266497461928934,
+      "loss": 0.6929,
+      "step": 2075
+    },
+    {
+      "epoch": 0.10559581678110444,
+      "grad_norm": 0.04168186472739525,
+      "learning_rate": 0.0005279187817258884,
+      "loss": 0.6769,
+      "step": 2080
+    },
+    {
+      "epoch": 0.10584965287913595,
+      "grad_norm": 0.04581891098414401,
+      "learning_rate": 0.0005291878172588833,
+      "loss": 0.6754,
+      "step": 2085
+    },
+    {
+      "epoch": 0.10610348897716744,
+      "grad_norm": 0.043468753842354504,
+      "learning_rate": 0.0005304568527918781,
+      "loss": 0.7335,
+      "step": 2090
+    },
+    {
+      "epoch": 0.10635732507519895,
+      "grad_norm": 0.05308604744136886,
+      "learning_rate": 0.0005317258883248731,
+      "loss": 0.6921,
+      "step": 2095
+    },
+    {
+      "epoch": 0.10661116117323044,
+      "grad_norm": 0.07051381632192111,
+      "learning_rate": 0.0005329949238578681,
+      "loss": 0.6755,
+      "step": 2100
+    },
+    {
+      "epoch": 0.10686499727126195,
+      "grad_norm": 0.046426623183754255,
+      "learning_rate": 0.000534263959390863,
+      "loss": 0.7157,
+      "step": 2105
+    },
+    {
+      "epoch": 0.10711883336929345,
+      "grad_norm": 0.052422754447883815,
+      "learning_rate": 0.0005355329949238578,
+      "loss": 0.7047,
+      "step": 2110
+    },
+    {
+      "epoch": 0.10737266946732495,
+      "grad_norm": 0.07613534981689268,
+      "learning_rate": 0.0005368020304568528,
+      "loss": 0.7077,
+      "step": 2115
+    },
+    {
+      "epoch": 0.10762650556535645,
+      "grad_norm": 0.053514202239991294,
+      "learning_rate": 0.0005380710659898477,
+      "loss": 0.6843,
+      "step": 2120
+    },
+    {
+      "epoch": 0.10788034166338795,
+      "grad_norm": 0.04913041286768531,
+      "learning_rate": 0.0005393401015228427,
+      "loss": 0.6961,
+      "step": 2125
+    },
+    {
+      "epoch": 0.10813417776141945,
+      "grad_norm": 0.0568300226701408,
+      "learning_rate": 0.0005406091370558375,
+      "loss": 0.6692,
+      "step": 2130
+    },
+    {
+      "epoch": 0.10838801385945095,
+      "grad_norm": 0.04636934678676007,
+      "learning_rate": 0.0005418781725888325,
+      "loss": 0.6763,
+      "step": 2135
+    },
+    {
+      "epoch": 0.10864184995748245,
+      "grad_norm": 0.05136400672323533,
+      "learning_rate": 0.0005431472081218274,
+      "loss": 0.6894,
+      "step": 2140
+    },
+    {
+      "epoch": 0.10889568605551396,
+      "grad_norm": 0.04344668075028007,
+      "learning_rate": 0.0005444162436548224,
+      "loss": 0.6631,
+      "step": 2145
+    },
+    {
+      "epoch": 0.10914952215354545,
+      "grad_norm": 0.0555972048428014,
+      "learning_rate": 0.0005456852791878173,
+      "loss": 0.6784,
+      "step": 2150
+    },
+    {
+      "epoch": 0.10940335825157696,
+      "grad_norm": 0.06620172522346869,
+      "learning_rate": 0.0005469543147208121,
+      "loss": 0.6911,
+      "step": 2155
+    },
+    {
+      "epoch": 0.10965719434960845,
+      "grad_norm": 0.0601809692146959,
+      "learning_rate": 0.0005482233502538071,
+      "loss": 0.7259,
+      "step": 2160
+    },
+    {
+      "epoch": 0.10991103044763996,
+      "grad_norm": 0.044989749038749825,
+      "learning_rate": 0.0005494923857868021,
+      "loss": 0.7114,
+      "step": 2165
+    },
+    {
+      "epoch": 0.11016486654567147,
+      "grad_norm": 0.04741683660493615,
+      "learning_rate": 0.000550761421319797,
+      "loss": 0.6949,
+      "step": 2170
+    },
+    {
+      "epoch": 0.11041870264370296,
+      "grad_norm": 0.054064091770256034,
+      "learning_rate": 0.0005520304568527918,
+      "loss": 0.6774,
+      "step": 2175
+    },
+    {
+      "epoch": 0.11067253874173447,
+      "grad_norm": 0.050772197507611055,
+      "learning_rate": 0.0005532994923857868,
+      "loss": 0.6747,
+      "step": 2180
+    },
+    {
+      "epoch": 0.11092637483976596,
+      "grad_norm": 0.07122688113002712,
+      "learning_rate": 0.0005545685279187818,
+      "loss": 0.6751,
+      "step": 2185
+    },
+    {
+      "epoch": 0.11118021093779747,
+      "grad_norm": 0.056605934899527,
+      "learning_rate": 0.0005558375634517766,
+      "loss": 0.7048,
+      "step": 2190
+    },
+    {
+      "epoch": 0.11143404703582896,
+      "grad_norm": 0.049631694115174936,
+      "learning_rate": 0.0005571065989847715,
+      "loss": 0.6795,
+      "step": 2195
+    },
+    {
+      "epoch": 0.11168788313386047,
+      "grad_norm": 0.05830993033392446,
+      "learning_rate": 0.0005583756345177665,
+      "loss": 0.6846,
+      "step": 2200
+    },
+    {
+      "epoch": 0.11194171923189197,
+      "grad_norm": 0.044352746447960285,
+      "learning_rate": 0.0005596446700507615,
+      "loss": 0.6956,
+      "step": 2205
+    },
+    {
+      "epoch": 0.11219555532992347,
+      "grad_norm": 0.04064168684790446,
+      "learning_rate": 0.0005609137055837564,
+      "loss": 0.6584,
+      "step": 2210
+    },
+    {
+      "epoch": 0.11244939142795497,
+      "grad_norm": 0.05355535018905834,
+      "learning_rate": 0.0005621827411167512,
+      "loss": 0.6551,
+      "step": 2215
+    },
+    {
+      "epoch": 0.11270322752598647,
+      "grad_norm": 0.04823668883816903,
+      "learning_rate": 0.0005634517766497462,
+      "loss": 0.7101,
+      "step": 2220
+    },
+    {
+      "epoch": 0.11295706362401797,
+      "grad_norm": 0.04507099918690335,
+      "learning_rate": 0.0005647208121827412,
+      "loss": 0.6928,
+      "step": 2225
+    },
+    {
+      "epoch": 0.11321089972204948,
+      "grad_norm": 0.04388054794584642,
+      "learning_rate": 0.0005659898477157361,
+      "loss": 0.6937,
+      "step": 2230
+    },
+    {
+      "epoch": 0.11346473582008097,
+      "grad_norm": 0.04682924867200663,
+      "learning_rate": 0.0005672588832487309,
+      "loss": 0.6785,
+      "step": 2235
+    },
+    {
+      "epoch": 0.11371857191811248,
+      "grad_norm": 0.06604610799465334,
+      "learning_rate": 0.0005685279187817259,
+      "loss": 0.6779,
+      "step": 2240
+    },
+    {
+      "epoch": 0.11397240801614397,
+      "grad_norm": 0.06120863018511214,
+      "learning_rate": 0.0005697969543147208,
+      "loss": 0.6533,
+      "step": 2245
+    },
+    {
+      "epoch": 0.11422624411417548,
+      "grad_norm": 0.05270150831774229,
+      "learning_rate": 0.0005710659898477158,
+      "loss": 0.6622,
+      "step": 2250
+    },
+    {
+      "epoch": 0.11448008021220697,
+      "grad_norm": 0.054970737100826304,
+      "learning_rate": 0.0005723350253807107,
+      "loss": 0.682,
+      "step": 2255
+    },
+    {
+      "epoch": 0.11473391631023848,
+      "grad_norm": 0.04567202978046955,
+      "learning_rate": 0.0005736040609137056,
+      "loss": 0.6642,
+      "step": 2260
+    },
+    {
+      "epoch": 0.11498775240826999,
+      "grad_norm": 0.06810657291430826,
+      "learning_rate": 0.0005748730964467005,
+      "loss": 0.7241,
+      "step": 2265
+    },
+    {
+      "epoch": 0.11524158850630148,
+      "grad_norm": 0.051106730998495775,
+      "learning_rate": 0.0005761421319796955,
+      "loss": 0.6964,
+      "step": 2270
+    },
+    {
+      "epoch": 0.11549542460433299,
+      "grad_norm": 0.05289405047092881,
+      "learning_rate": 0.0005774111675126904,
+      "loss": 0.6951,
+      "step": 2275
+    },
+    {
+      "epoch": 0.11574926070236448,
+      "grad_norm": 0.04803879297706406,
+      "learning_rate": 0.0005786802030456852,
+      "loss": 0.668,
+      "step": 2280
+    },
+    {
+      "epoch": 0.11600309680039599,
+      "grad_norm": 0.05408216249103949,
+      "learning_rate": 0.0005799492385786802,
+      "loss": 0.6976,
+      "step": 2285
+    },
+    {
+      "epoch": 0.11625693289842748,
+      "grad_norm": 0.046893742742678574,
+      "learning_rate": 0.0005812182741116752,
+      "loss": 0.6492,
+      "step": 2290
+    },
+    {
+      "epoch": 0.11651076899645899,
+      "grad_norm": 0.05546942162432547,
+      "learning_rate": 0.0005824873096446702,
+      "loss": 0.6905,
+      "step": 2295
+    },
+    {
+      "epoch": 0.11676460509449049,
+      "grad_norm": 0.044480364722733014,
+      "learning_rate": 0.0005837563451776649,
+      "loss": 0.6774,
+      "step": 2300
+    },
+    {
+      "epoch": 0.11701844119252199,
+      "grad_norm": 0.058930815501948106,
+      "learning_rate": 0.0005850253807106599,
+      "loss": 0.7125,
+      "step": 2305
+    },
+    {
+      "epoch": 0.11727227729055349,
+      "grad_norm": 0.06993707438691858,
+      "learning_rate": 0.0005862944162436549,
+      "loss": 0.6672,
+      "step": 2310
+    },
+    {
+      "epoch": 0.11752611338858499,
+      "grad_norm": 0.07094737512117218,
+      "learning_rate": 0.0005875634517766498,
+      "loss": 0.6874,
+      "step": 2315
+    },
+    {
+      "epoch": 0.11777994948661649,
+      "grad_norm": 0.04776737380254815,
+      "learning_rate": 0.0005888324873096446,
+      "loss": 0.6743,
+      "step": 2320
+    },
+    {
+      "epoch": 0.118033785584648,
+      "grad_norm": 0.04754319568857845,
+      "learning_rate": 0.0005901015228426396,
+      "loss": 0.6955,
+      "step": 2325
+    },
+    {
+      "epoch": 0.11828762168267949,
+      "grad_norm": 0.0530012975882747,
+      "learning_rate": 0.0005913705583756346,
+      "loss": 0.6741,
+      "step": 2330
+    },
+    {
+      "epoch": 0.118541457780711,
+      "grad_norm": 0.047906343503061846,
+      "learning_rate": 0.0005926395939086295,
+      "loss": 0.6932,
+      "step": 2335
+    },
+    {
+      "epoch": 0.11879529387874249,
+      "grad_norm": 0.04482089427430776,
+      "learning_rate": 0.0005939086294416243,
+      "loss": 0.6872,
+      "step": 2340
+    },
+    {
+      "epoch": 0.119049129976774,
+      "grad_norm": 0.04529373048320046,
+      "learning_rate": 0.0005951776649746193,
+      "loss": 0.6626,
+      "step": 2345
+    },
+    {
+      "epoch": 0.11930296607480549,
+      "grad_norm": 0.0476388588473774,
+      "learning_rate": 0.0005964467005076142,
+      "loss": 0.6791,
+      "step": 2350
+    },
+    {
+      "epoch": 0.119556802172837,
+      "grad_norm": 0.04934267321682797,
+      "learning_rate": 0.0005977157360406092,
+      "loss": 0.686,
+      "step": 2355
+    },
+    {
+      "epoch": 0.1198106382708685,
+      "grad_norm": 0.048244390716089255,
+      "learning_rate": 0.000598984771573604,
+      "loss": 0.6926,
+      "step": 2360
+    },
+    {
+      "epoch": 0.1200644743689,
+      "grad_norm": 0.062491881852921594,
+      "learning_rate": 0.000600253807106599,
+      "loss": 0.7189,
+      "step": 2365
+    },
+    {
+      "epoch": 0.1203183104669315,
+      "grad_norm": 0.06931903303967604,
+      "learning_rate": 0.0006015228426395939,
+      "loss": 0.712,
+      "step": 2370
+    },
+    {
+      "epoch": 0.120572146564963,
+      "grad_norm": 0.05423697704542445,
+      "learning_rate": 0.0006027918781725889,
+      "loss": 0.6846,
+      "step": 2375
+    },
+    {
+      "epoch": 0.1208259826629945,
+      "grad_norm": 0.05778931196460261,
+      "learning_rate": 0.0006040609137055838,
+      "loss": 0.693,
+      "step": 2380
+    },
+    {
+      "epoch": 0.12107981876102601,
+      "grad_norm": 0.06091151475086323,
+      "learning_rate": 0.0006053299492385786,
+      "loss": 0.6676,
+      "step": 2385
+    },
+    {
+      "epoch": 0.1213336548590575,
+      "grad_norm": 0.055216368348407444,
+      "learning_rate": 0.0006065989847715736,
+      "loss": 0.6724,
+      "step": 2390
+    },
+    {
+      "epoch": 0.12158749095708901,
+      "grad_norm": 0.04670707090251174,
+      "learning_rate": 0.0006078680203045686,
+      "loss": 0.6651,
+      "step": 2395
+    },
+    {
+      "epoch": 0.1218413270551205,
+      "grad_norm": 0.05746162859057901,
+      "learning_rate": 0.0006091370558375635,
+      "loss": 0.6942,
+      "step": 2400
+    },
+    {
+      "epoch": 0.12209516315315201,
+      "grad_norm": 0.056859124253096104,
+      "learning_rate": 0.0006104060913705583,
+      "loss": 0.6698,
+      "step": 2405
+    },
+    {
+      "epoch": 0.1223489992511835,
+      "grad_norm": 0.05339650908828867,
+      "learning_rate": 0.0006116751269035533,
+      "loss": 0.6844,
+      "step": 2410
+    },
+    {
+      "epoch": 0.12260283534921501,
+      "grad_norm": 0.04544174695117297,
+      "learning_rate": 0.0006129441624365483,
+      "loss": 0.681,
+      "step": 2415
+    },
+    {
+      "epoch": 0.12285667144724652,
+      "grad_norm": 0.047853644230645295,
+      "learning_rate": 0.0006142131979695432,
+      "loss": 0.6736,
+      "step": 2420
+    },
+    {
+      "epoch": 0.12311050754527801,
+      "grad_norm": 0.044716395502977895,
+      "learning_rate": 0.000615482233502538,
+      "loss": 0.6692,
+      "step": 2425
+    },
+    {
+      "epoch": 0.12336434364330952,
+      "grad_norm": 0.0440973721411255,
+      "learning_rate": 0.000616751269035533,
+      "loss": 0.6751,
+      "step": 2430
+    },
+    {
+      "epoch": 0.12361817974134101,
+      "grad_norm": 0.04429030828005958,
+      "learning_rate": 0.000618020304568528,
+      "loss": 0.6906,
+      "step": 2435
+    },
+    {
+      "epoch": 0.12387201583937252,
+      "grad_norm": 0.04787659061724324,
+      "learning_rate": 0.0006192893401015229,
+      "loss": 0.6818,
+      "step": 2440
+    },
+    {
+      "epoch": 0.12412585193740402,
+      "grad_norm": 0.04128004321338866,
+      "learning_rate": 0.0006205583756345177,
+      "loss": 0.6588,
+      "step": 2445
+    },
+    {
+      "epoch": 0.12437968803543552,
+      "grad_norm": 0.061281310808498836,
+      "learning_rate": 0.0006218274111675127,
+      "loss": 0.6794,
+      "step": 2450
+    },
+    {
+      "epoch": 0.12463352413346702,
+      "grad_norm": 0.0452210479730751,
+      "learning_rate": 0.0006230964467005076,
+      "loss": 0.6981,
+      "step": 2455
+    },
+    {
+      "epoch": 0.12488736023149852,
+      "grad_norm": 0.07279557551578562,
+      "learning_rate": 0.0006243654822335026,
+      "loss": 0.6425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.12514119632953002,
+      "grad_norm": 0.050867733799510144,
+      "learning_rate": 0.0006256345177664974,
+      "loss": 0.6822,
+      "step": 2465
+    },
+    {
+      "epoch": 0.12539503242756153,
+      "grad_norm": 0.03997161618281022,
+      "learning_rate": 0.0006269035532994924,
+      "loss": 0.6798,
+      "step": 2470
+    },
+    {
+      "epoch": 0.12564886852559304,
+      "grad_norm": 0.04666231998293777,
+      "learning_rate": 0.0006281725888324873,
+      "loss": 0.6669,
+      "step": 2475
+    },
+    {
+      "epoch": 0.12590270462362452,
+      "grad_norm": 0.06993861990367087,
+      "learning_rate": 0.0006294416243654823,
+      "loss": 0.6769,
+      "step": 2480
+    },
+    {
+      "epoch": 0.12615654072165602,
+      "grad_norm": 0.04460301577499877,
+      "learning_rate": 0.0006307106598984772,
+      "loss": 0.6702,
+      "step": 2485
+    },
+    {
+      "epoch": 0.12641037681968753,
+      "grad_norm": 0.046300009587602504,
+      "learning_rate": 0.000631979695431472,
+      "loss": 0.6485,
+      "step": 2490
+    },
+    {
+      "epoch": 0.12666421291771904,
+      "grad_norm": 0.04525513100008759,
+      "learning_rate": 0.000633248730964467,
+      "loss": 0.7022,
+      "step": 2495
+    },
+    {
+      "epoch": 0.12691804901575052,
+      "grad_norm": 0.046659707052599364,
+      "learning_rate": 0.000634517766497462,
+      "loss": 0.6636,
+      "step": 2500
+    },
+    {
+      "epoch": 0.12717188511378202,
+      "grad_norm": 0.04824843023874754,
+      "learning_rate": 0.0006357868020304569,
+      "loss": 0.6485,
+      "step": 2505
+    },
+    {
+      "epoch": 0.12742572121181353,
+      "grad_norm": 0.04746300692436404,
+      "learning_rate": 0.0006370558375634517,
+      "loss": 0.7173,
+      "step": 2510
+    },
+    {
+      "epoch": 0.12767955730984504,
+      "grad_norm": 0.04431626679908001,
+      "learning_rate": 0.0006383248730964467,
+      "loss": 0.6767,
+      "step": 2515
+    },
+    {
+      "epoch": 0.12793339340787654,
+      "grad_norm": 0.04212599191521632,
+      "learning_rate": 0.0006395939086294417,
+      "loss": 0.6486,
+      "step": 2520
+    },
+    {
+      "epoch": 0.12818722950590802,
+      "grad_norm": 0.044688664389786185,
+      "learning_rate": 0.0006408629441624366,
+      "loss": 0.6343,
+      "step": 2525
+    },
+    {
+      "epoch": 0.12844106560393953,
+      "grad_norm": 0.04418877402589161,
+      "learning_rate": 0.0006421319796954314,
+      "loss": 0.6908,
+      "step": 2530
+    },
+    {
+      "epoch": 0.12869490170197104,
+      "grad_norm": 0.04434768157491682,
+      "learning_rate": 0.0006434010152284264,
+      "loss": 0.6997,
+      "step": 2535
+    },
+    {
+      "epoch": 0.12894873780000254,
+      "grad_norm": 0.04818401670766656,
+      "learning_rate": 0.0006446700507614214,
+      "loss": 0.6832,
+      "step": 2540
+    },
+    {
+      "epoch": 0.12920257389803405,
+      "grad_norm": 0.04591714580956639,
+      "learning_rate": 0.0006459390862944163,
+      "loss": 0.662,
+      "step": 2545
+    },
+    {
+      "epoch": 0.12945640999606553,
+      "grad_norm": 0.050750089447246204,
+      "learning_rate": 0.0006472081218274111,
+      "loss": 0.6859,
+      "step": 2550
+    },
+    {
+      "epoch": 0.12971024609409704,
+      "grad_norm": 0.049535561340610365,
+      "learning_rate": 0.0006484771573604061,
+      "loss": 0.6929,
+      "step": 2555
+    },
+    {
+      "epoch": 0.12996408219212854,
+      "grad_norm": 0.04625672713487381,
+      "learning_rate": 0.000649746192893401,
+      "loss": 0.6712,
+      "step": 2560
+    },
+    {
+      "epoch": 0.13021791829016005,
+      "grad_norm": 0.0443189289054786,
+      "learning_rate": 0.000651015228426396,
+      "loss": 0.7355,
+      "step": 2565
+    },
+    {
+      "epoch": 0.13047175438819156,
+      "grad_norm": 0.05280014157523752,
+      "learning_rate": 0.0006522842639593908,
+      "loss": 0.701,
+      "step": 2570
+    },
+    {
+      "epoch": 0.13072559048622304,
+      "grad_norm": 0.04705146570879352,
+      "learning_rate": 0.0006535532994923858,
+      "loss": 0.6695,
+      "step": 2575
+    },
+    {
+      "epoch": 0.13097942658425454,
+      "grad_norm": 0.0460576955250553,
+      "learning_rate": 0.0006548223350253807,
+      "loss": 0.692,
+      "step": 2580
+    },
+    {
+      "epoch": 0.13123326268228605,
+      "grad_norm": 0.03813344619291145,
+      "learning_rate": 0.0006560913705583757,
+      "loss": 0.66,
+      "step": 2585
+    },
+    {
+      "epoch": 0.13148709878031756,
+      "grad_norm": 0.04969973984569192,
+      "learning_rate": 0.0006573604060913706,
+      "loss": 0.6934,
+      "step": 2590
+    },
+    {
+      "epoch": 0.13174093487834904,
+      "grad_norm": 0.042656040318584894,
+      "learning_rate": 0.0006586294416243654,
+      "loss": 0.6615,
+      "step": 2595
+    },
+    {
+      "epoch": 0.13199477097638054,
+      "grad_norm": 0.04426457599994935,
+      "learning_rate": 0.0006598984771573604,
+      "loss": 0.7075,
+      "step": 2600
+    },
+    {
+      "epoch": 0.13224860707441205,
+      "grad_norm": 0.04911188070281771,
+      "learning_rate": 0.0006611675126903554,
+      "loss": 0.663,
+      "step": 2605
+    },
+    {
+      "epoch": 0.13250244317244356,
+      "grad_norm": 0.045086612880435376,
+      "learning_rate": 0.0006624365482233503,
+      "loss": 0.6769,
+      "step": 2610
+    },
+    {
+      "epoch": 0.13275627927047506,
+      "grad_norm": 0.0806263949064106,
+      "learning_rate": 0.0006637055837563451,
+      "loss": 0.6701,
+      "step": 2615
+    },
+    {
+      "epoch": 0.13301011536850654,
+      "grad_norm": 0.09236793730312937,
+      "learning_rate": 0.0006649746192893401,
+      "loss": 0.648,
+      "step": 2620
+    },
+    {
+      "epoch": 0.13326395146653805,
+      "grad_norm": 0.045322993172678835,
+      "learning_rate": 0.0006662436548223351,
+      "loss": 0.6718,
+      "step": 2625
+    },
+    {
+      "epoch": 0.13351778756456956,
+      "grad_norm": 0.04182199738451879,
+      "learning_rate": 0.0006675126903553299,
+      "loss": 0.6724,
+      "step": 2630
+    },
+    {
+      "epoch": 0.13377162366260106,
+      "grad_norm": 0.042921119598924244,
+      "learning_rate": 0.0006687817258883248,
+      "loss": 0.6813,
+      "step": 2635
+    },
+    {
+      "epoch": 0.13402545976063257,
+      "grad_norm": 0.10282609717664869,
+      "learning_rate": 0.0006700507614213198,
+      "loss": 0.6714,
+      "step": 2640
+    },
+    {
+      "epoch": 0.13427929585866405,
+      "grad_norm": 0.05297792239316466,
+      "learning_rate": 0.0006713197969543148,
+      "loss": 0.6872,
+      "step": 2645
+    },
+    {
+      "epoch": 0.13453313195669556,
+      "grad_norm": 0.10242225950893688,
+      "learning_rate": 0.0006725888324873097,
+      "loss": 0.6771,
+      "step": 2650
+    },
+    {
+      "epoch": 0.13478696805472706,
+      "grad_norm": 0.0603931866469296,
+      "learning_rate": 0.0006738578680203045,
+      "loss": 0.6974,
+      "step": 2655
+    },
+    {
+      "epoch": 0.13504080415275857,
+      "grad_norm": 0.04679504191127658,
+      "learning_rate": 0.0006751269035532995,
+      "loss": 0.7012,
+      "step": 2660
+    },
+    {
+      "epoch": 0.13529464025079008,
+      "grad_norm": 0.04709498367472285,
+      "learning_rate": 0.0006763959390862944,
+      "loss": 0.6706,
+      "step": 2665
+    },
+    {
+      "epoch": 0.13554847634882156,
+      "grad_norm": 0.0564473112589115,
+      "learning_rate": 0.0006776649746192894,
+      "loss": 0.6901,
+      "step": 2670
+    },
+    {
+      "epoch": 0.13580231244685306,
+      "grad_norm": 0.06027753857126253,
+      "learning_rate": 0.0006789340101522842,
+      "loss": 0.6819,
+      "step": 2675
+    },
+    {
+      "epoch": 0.13605614854488457,
+      "grad_norm": 0.06314337231356607,
+      "learning_rate": 0.0006802030456852792,
+      "loss": 0.6699,
+      "step": 2680
+    },
+    {
+      "epoch": 0.13630998464291608,
+      "grad_norm": 0.057435222380299925,
+      "learning_rate": 0.0006814720812182741,
+      "loss": 0.6497,
+      "step": 2685
+    },
+    {
+      "epoch": 0.13656382074094758,
+      "grad_norm": 0.04260996003530418,
+      "learning_rate": 0.0006827411167512691,
+      "loss": 0.6658,
+      "step": 2690
+    },
+    {
+      "epoch": 0.13681765683897906,
+      "grad_norm": 0.04258191163717926,
+      "learning_rate": 0.000684010152284264,
+      "loss": 0.6924,
+      "step": 2695
+    },
+    {
+      "epoch": 0.13707149293701057,
+      "grad_norm": 0.05691730259771199,
+      "learning_rate": 0.0006852791878172588,
+      "loss": 0.6983,
+      "step": 2700
+    },
+    {
+      "epoch": 0.13732532903504208,
+      "grad_norm": 0.043909736858620214,
+      "learning_rate": 0.0006865482233502538,
+      "loss": 0.6979,
+      "step": 2705
+    },
+    {
+      "epoch": 0.13757916513307358,
+      "grad_norm": 0.047090946914990356,
+      "learning_rate": 0.0006878172588832488,
+      "loss": 0.6779,
+      "step": 2710
+    },
+    {
+      "epoch": 0.13783300123110506,
+      "grad_norm": 0.047548729011293284,
+      "learning_rate": 0.0006890862944162437,
+      "loss": 0.6499,
+      "step": 2715
+    },
+    {
+      "epoch": 0.13808683732913657,
+      "grad_norm": 0.06616744593038089,
+      "learning_rate": 0.0006903553299492385,
+      "loss": 0.6969,
+      "step": 2720
+    },
+    {
+      "epoch": 0.13834067342716808,
+      "grad_norm": 0.0703509579934864,
+      "learning_rate": 0.0006916243654822335,
+      "loss": 0.652,
+      "step": 2725
+    },
+    {
+      "epoch": 0.13859450952519958,
+      "grad_norm": 0.04246140238019912,
+      "learning_rate": 0.0006928934010152285,
+      "loss": 0.6682,
+      "step": 2730
+    },
+    {
+      "epoch": 0.1388483456232311,
+      "grad_norm": 0.04135755013487853,
+      "learning_rate": 0.0006941624365482235,
+      "loss": 0.6556,
+      "step": 2735
+    },
+    {
+      "epoch": 0.13910218172126257,
+      "grad_norm": 0.04205045142396158,
+      "learning_rate": 0.0006954314720812182,
+      "loss": 0.7063,
+      "step": 2740
+    },
+    {
+      "epoch": 0.13935601781929408,
+      "grad_norm": 0.05001019855676361,
+      "learning_rate": 0.0006967005076142132,
+      "loss": 0.7266,
+      "step": 2745
+    },
+    {
+      "epoch": 0.13960985391732558,
+      "grad_norm": 0.05281826891896312,
+      "learning_rate": 0.0006979695431472082,
+      "loss": 0.685,
+      "step": 2750
+    },
+    {
+      "epoch": 0.1398636900153571,
+      "grad_norm": 0.08165529805729099,
+      "learning_rate": 0.0006992385786802031,
+      "loss": 0.6946,
+      "step": 2755
+    },
+    {
+      "epoch": 0.1401175261133886,
+      "grad_norm": 0.06359142712647757,
+      "learning_rate": 0.0007005076142131979,
+      "loss": 0.691,
+      "step": 2760
+    },
+    {
+      "epoch": 0.14037136221142008,
+      "grad_norm": 0.04783534944187642,
+      "learning_rate": 0.0007017766497461929,
+      "loss": 0.6572,
+      "step": 2765
+    },
+    {
+      "epoch": 0.14062519830945158,
+      "grad_norm": 0.04836474183739573,
+      "learning_rate": 0.0007030456852791879,
+      "loss": 0.6499,
+      "step": 2770
+    },
+    {
+      "epoch": 0.1408790344074831,
+      "grad_norm": 0.04622813948916543,
+      "learning_rate": 0.0007043147208121828,
+      "loss": 0.6722,
+      "step": 2775
+    },
+    {
+      "epoch": 0.1411328705055146,
+      "grad_norm": 0.044196925779096605,
+      "learning_rate": 0.0007055837563451776,
+      "loss": 0.6927,
+      "step": 2780
+    },
+    {
+      "epoch": 0.1413867066035461,
+      "grad_norm": 0.0563954116918076,
+      "learning_rate": 0.0007068527918781726,
+      "loss": 0.6773,
+      "step": 2785
+    },
+    {
+      "epoch": 0.14164054270157758,
+      "grad_norm": 0.05630324142592108,
+      "learning_rate": 0.0007081218274111675,
+      "loss": 0.6948,
+      "step": 2790
+    },
+    {
+      "epoch": 0.1418943787996091,
+      "grad_norm": 0.04380530703415174,
+      "learning_rate": 0.0007093908629441625,
+      "loss": 0.6608,
+      "step": 2795
+    },
+    {
+      "epoch": 0.1421482148976406,
+      "grad_norm": 0.09238879549470941,
+      "learning_rate": 0.0007106598984771574,
+      "loss": 0.7379,
+      "step": 2800
+    },
+    {
+      "epoch": 0.1424020509956721,
+      "grad_norm": 0.08771073436704688,
+      "learning_rate": 0.0007119289340101523,
+      "loss": 0.7167,
+      "step": 2805
+    },
+    {
+      "epoch": 0.1426558870937036,
+      "grad_norm": 0.06359201377032256,
+      "learning_rate": 0.0007131979695431472,
+      "loss": 0.7088,
+      "step": 2810
+    },
+    {
+      "epoch": 0.1429097231917351,
+      "grad_norm": 0.1200886744339994,
+      "learning_rate": 0.0007144670050761422,
+      "loss": 0.6847,
+      "step": 2815
+    },
+    {
+      "epoch": 0.1431635592897666,
+      "grad_norm": 0.138281486633561,
+      "learning_rate": 0.0007157360406091371,
+      "loss": 0.6848,
+      "step": 2820
+    },
+    {
+      "epoch": 0.1434173953877981,
+      "grad_norm": 0.04480938074303019,
+      "learning_rate": 0.0007170050761421319,
+      "loss": 0.6904,
+      "step": 2825
+    },
+    {
+      "epoch": 0.1436712314858296,
+      "grad_norm": 21.175586578852744,
+      "learning_rate": 0.0007182741116751269,
+      "loss": 0.7594,
+      "step": 2830
+    },
+    {
+      "epoch": 0.1439250675838611,
+      "grad_norm": 0.06295691464093359,
+      "learning_rate": 0.0007195431472081219,
+      "loss": 0.7236,
+      "step": 2835
+    },
+    {
+      "epoch": 0.1441789036818926,
+      "grad_norm": 0.10004928301513774,
+      "learning_rate": 0.0007208121827411168,
+      "loss": 0.7397,
+      "step": 2840
+    },
+    {
+      "epoch": 0.1444327397799241,
+      "grad_norm": 0.46538792031189885,
+      "learning_rate": 0.0007220812182741116,
+      "loss": 0.8646,
+      "step": 2845
+    },
+    {
+      "epoch": 0.1446865758779556,
+      "grad_norm": 0.4358423651644742,
+      "learning_rate": 0.0007233502538071066,
+      "loss": 0.8406,
+      "step": 2850
+    },
+    {
+      "epoch": 0.14494041197598712,
+      "grad_norm": 0.10845670895421451,
+      "learning_rate": 0.0007246192893401016,
+      "loss": 0.785,
+      "step": 2855
+    },
+    {
+      "epoch": 0.1451942480740186,
+      "grad_norm": 0.06788040013815881,
+      "learning_rate": 0.0007258883248730965,
+      "loss": 0.7328,
+      "step": 2860
+    },
+    {
+      "epoch": 0.1454480841720501,
+      "grad_norm": 0.07189189427731303,
+      "learning_rate": 0.0007271573604060913,
+      "loss": 0.7376,
+      "step": 2865
+    },
+    {
+      "epoch": 0.1457019202700816,
+      "grad_norm": 0.04909596292317751,
+      "learning_rate": 0.0007284263959390863,
+      "loss": 0.7061,
+      "step": 2870
+    },
+    {
+      "epoch": 0.14595575636811312,
+      "grad_norm": 0.05537834807483057,
+      "learning_rate": 0.0007296954314720813,
+      "loss": 0.7313,
+      "step": 2875
+    },
+    {
+      "epoch": 0.14620959246614462,
+      "grad_norm": 0.2299897910231885,
+      "learning_rate": 0.0007309644670050762,
+      "loss": 1.4098,
+      "step": 2880
+    },
+    {
+      "epoch": 0.1464634285641761,
+      "grad_norm": 0.23582707917647705,
+      "learning_rate": 0.000732233502538071,
+      "loss": 0.8452,
+      "step": 2885
+    },
+    {
+      "epoch": 0.1467172646622076,
+      "grad_norm": 0.11644453166636645,
+      "learning_rate": 0.000733502538071066,
+      "loss": 0.7672,
+      "step": 2890
+    },
+    {
+      "epoch": 0.14697110076023912,
+      "grad_norm": 0.09392008019229685,
+      "learning_rate": 0.0007347715736040609,
+      "loss": 0.744,
+      "step": 2895
+    },
+    {
+      "epoch": 0.14722493685827062,
+      "grad_norm": 0.09298026761511366,
+      "learning_rate": 0.0007360406091370559,
+      "loss": 0.7396,
+      "step": 2900
+    },
+    {
+      "epoch": 0.14747877295630213,
+      "grad_norm": 0.058376952983496454,
+      "learning_rate": 0.0007373096446700508,
+      "loss": 0.7174,
+      "step": 2905
+    },
+    {
+      "epoch": 0.1477326090543336,
+      "grad_norm": 0.05509134155546014,
+      "learning_rate": 0.0007385786802030457,
+      "loss": 0.7056,
+      "step": 2910
+    },
+    {
+      "epoch": 0.14798644515236511,
+      "grad_norm": 0.04295937409763232,
+      "learning_rate": 0.0007398477157360406,
+      "loss": 0.7003,
+      "step": 2915
+    },
+    {
+      "epoch": 0.14824028125039662,
+      "grad_norm": 0.050074822465611714,
+      "learning_rate": 0.0007411167512690356,
+      "loss": 0.6936,
+      "step": 2920
+    },
+    {
+      "epoch": 0.14849411734842813,
+      "grad_norm": 0.06561188123909512,
+      "learning_rate": 0.0007423857868020305,
+      "loss": 0.69,
+      "step": 2925
+    },
+    {
+      "epoch": 0.1487479534464596,
+      "grad_norm": 0.05110790900431651,
+      "learning_rate": 0.0007436548223350253,
+      "loss": 0.7239,
+      "step": 2930
+    },
+    {
+      "epoch": 0.14900178954449111,
+      "grad_norm": 0.053991352186369024,
+      "learning_rate": 0.0007449238578680203,
+      "loss": 0.7211,
+      "step": 2935
+    },
+    {
+      "epoch": 0.14925562564252262,
+      "grad_norm": 0.07363083960749695,
+      "learning_rate": 0.0007461928934010153,
+      "loss": 0.6989,
+      "step": 2940
+    },
+    {
+      "epoch": 0.14950946174055413,
+      "grad_norm": 0.06727590148650675,
+      "learning_rate": 0.0007474619289340102,
+      "loss": 0.7114,
+      "step": 2945
+    },
+    {
+      "epoch": 0.14976329783858564,
+      "grad_norm": 0.057979540617668884,
+      "learning_rate": 0.000748730964467005,
+      "loss": 0.7061,
+      "step": 2950
+    },
+    {
+      "epoch": 0.15001713393661711,
+      "grad_norm": 0.059049940642629514,
+      "learning_rate": 0.00075,
+      "loss": 0.7095,
+      "step": 2955
+    },
+    {
+      "epoch": 0.15027097003464862,
+      "grad_norm": 0.0653578877080519,
+      "learning_rate": 0.000751269035532995,
+      "loss": 0.7319,
+      "step": 2960
+    },
+    {
+      "epoch": 0.15052480613268013,
+      "grad_norm": 0.06333871135346043,
+      "learning_rate": 0.0007525380710659899,
+      "loss": 0.7417,
+      "step": 2965
+    },
+    {
+      "epoch": 0.15077864223071163,
+      "grad_norm": 0.0728096512372998,
+      "learning_rate": 0.0007538071065989847,
+      "loss": 0.7424,
+      "step": 2970
+    },
+    {
+      "epoch": 0.15103247832874314,
+      "grad_norm": 0.041806345943965685,
+      "learning_rate": 0.0007550761421319797,
+      "loss": 0.6842,
+      "step": 2975
+    },
+    {
+      "epoch": 0.15128631442677462,
+      "grad_norm": 0.048509525473583434,
+      "learning_rate": 0.0007563451776649747,
+      "loss": 0.7402,
+      "step": 2980
+    },
+    {
+      "epoch": 0.15154015052480613,
+      "grad_norm": 0.06583937758938846,
+      "learning_rate": 0.0007576142131979696,
+      "loss": 0.7292,
+      "step": 2985
+    },
+    {
+      "epoch": 0.15179398662283763,
+      "grad_norm": 0.04468189699925895,
+      "learning_rate": 0.0007588832487309644,
+      "loss": 0.7317,
+      "step": 2990
+    },
+    {
+      "epoch": 0.15204782272086914,
+      "grad_norm": 0.047287822361576956,
+      "learning_rate": 0.0007601522842639594,
+      "loss": 0.6964,
+      "step": 2995
+    },
+    {
+      "epoch": 0.15230165881890065,
+      "grad_norm": 0.03938327148057227,
+      "learning_rate": 0.0007614213197969543,
+      "loss": 0.7265,
+      "step": 3000
+    },
+    {
+      "epoch": 0.15255549491693213,
+      "grad_norm": 0.03972449267379906,
+      "learning_rate": 0.0007626903553299493,
+      "loss": 0.6827,
+      "step": 3005
+    },
+    {
+      "epoch": 0.15280933101496363,
+      "grad_norm": 0.04711089033081518,
+      "learning_rate": 0.0007639593908629442,
+      "loss": 0.7007,
+      "step": 3010
+    },
+    {
+      "epoch": 0.15306316711299514,
+      "grad_norm": 0.06336031295625702,
+      "learning_rate": 0.0007652284263959391,
+      "loss": 0.6908,
+      "step": 3015
+    },
+    {
+      "epoch": 0.15331700321102665,
+      "grad_norm": 0.03736772924042978,
+      "learning_rate": 0.000766497461928934,
+      "loss": 0.689,
+      "step": 3020
+    },
+    {
+      "epoch": 0.15357083930905815,
+      "grad_norm": 0.041934899038226205,
+      "learning_rate": 0.000767766497461929,
+      "loss": 0.7015,
+      "step": 3025
+    },
+    {
+      "epoch": 0.15382467540708963,
+      "grad_norm": 0.04211578318800713,
+      "learning_rate": 0.0007690355329949239,
+      "loss": 0.6677,
+      "step": 3030
+    },
+    {
+      "epoch": 0.15407851150512114,
+      "grad_norm": 0.04215189545395842,
+      "learning_rate": 0.0007703045685279187,
+      "loss": 0.6836,
+      "step": 3035
+    },
+    {
+      "epoch": 0.15433234760315265,
+      "grad_norm": 0.05201347380185946,
+      "learning_rate": 0.0007715736040609137,
+      "loss": 0.7107,
+      "step": 3040
+    },
+    {
+      "epoch": 0.15458618370118415,
+      "grad_norm": 0.04761766831274283,
+      "learning_rate": 0.0007728426395939087,
+      "loss": 0.7119,
+      "step": 3045
+    },
+    {
+      "epoch": 0.15484001979921563,
+      "grad_norm": 0.03933638281586551,
+      "learning_rate": 0.0007741116751269036,
+      "loss": 0.6738,
+      "step": 3050
+    },
+    {
+      "epoch": 0.15509385589724714,
+      "grad_norm": 0.11996962799432084,
+      "learning_rate": 0.0007753807106598984,
+      "loss": 0.6781,
+      "step": 3055
+    },
+    {
+      "epoch": 0.15534769199527865,
+      "grad_norm": 0.05837163303706869,
+      "learning_rate": 0.0007766497461928934,
+      "loss": 0.6902,
+      "step": 3060
+    },
+    {
+      "epoch": 0.15560152809331015,
+      "grad_norm": 0.06028633251581502,
+      "learning_rate": 0.0007779187817258884,
+      "loss": 0.6698,
+      "step": 3065
+    },
+    {
+      "epoch": 0.15585536419134166,
+      "grad_norm": 0.07092481098060303,
+      "learning_rate": 0.0007791878172588833,
+      "loss": 0.6567,
+      "step": 3070
+    },
+    {
+      "epoch": 0.15610920028937314,
+      "grad_norm": 0.04707521754835134,
+      "learning_rate": 0.0007804568527918781,
+      "loss": 0.6966,
+      "step": 3075
+    },
+    {
+      "epoch": 0.15636303638740465,
+      "grad_norm": 0.047329839412153664,
+      "learning_rate": 0.0007817258883248731,
+      "loss": 0.7177,
+      "step": 3080
+    },
+    {
+      "epoch": 0.15661687248543615,
+      "grad_norm": 0.04088170174383998,
+      "learning_rate": 0.0007829949238578681,
+      "loss": 0.6803,
+      "step": 3085
+    },
+    {
+      "epoch": 0.15687070858346766,
+      "grad_norm": 0.038345774566105946,
+      "learning_rate": 0.000784263959390863,
+      "loss": 0.6656,
+      "step": 3090
+    },
+    {
+      "epoch": 0.15712454468149917,
+      "grad_norm": 0.04514482638739989,
+      "learning_rate": 0.0007855329949238578,
+      "loss": 0.7314,
+      "step": 3095
+    },
+    {
+      "epoch": 0.15737838077953065,
+      "grad_norm": 0.04006321580374188,
+      "learning_rate": 0.0007868020304568528,
+      "loss": 0.6747,
+      "step": 3100
+    },
+    {
+      "epoch": 0.15763221687756215,
+      "grad_norm": 0.05067763436233774,
+      "learning_rate": 0.0007880710659898477,
+      "loss": 0.7088,
+      "step": 3105
+    },
+    {
+      "epoch": 0.15788605297559366,
+      "grad_norm": 0.03882503010997677,
+      "learning_rate": 0.0007893401015228427,
+      "loss": 0.6857,
+      "step": 3110
+    },
+    {
+      "epoch": 0.15813988907362517,
+      "grad_norm": 0.041915161141761616,
+      "learning_rate": 0.0007906091370558376,
+      "loss": 0.6804,
+      "step": 3115
+    },
+    {
+      "epoch": 0.15839372517165667,
+      "grad_norm": 0.055655601622924857,
+      "learning_rate": 0.0007918781725888325,
+      "loss": 0.7128,
+      "step": 3120
+    },
+    {
+      "epoch": 0.15864756126968815,
+      "grad_norm": 0.09341393989640125,
+      "learning_rate": 0.0007931472081218274,
+      "loss": 0.6803,
+      "step": 3125
+    },
+    {
+      "epoch": 0.15890139736771966,
+      "grad_norm": 0.06199349132436108,
+      "learning_rate": 0.0007944162436548224,
+      "loss": 0.7121,
+      "step": 3130
+    },
+    {
+      "epoch": 0.15915523346575117,
+      "grad_norm": 0.08342488615098623,
+      "learning_rate": 0.0007956852791878173,
+      "loss": 0.631,
+      "step": 3135
+    },
+    {
+      "epoch": 0.15940906956378267,
+      "grad_norm": 0.06702222456990227,
+      "learning_rate": 0.0007969543147208121,
+      "loss": 0.681,
+      "step": 3140
+    },
+    {
+      "epoch": 0.15966290566181415,
+      "grad_norm": 0.051322791325766115,
+      "learning_rate": 0.0007982233502538071,
+      "loss": 0.6961,
+      "step": 3145
+    },
+    {
+      "epoch": 0.15991674175984566,
+      "grad_norm": 0.05093510864847829,
+      "learning_rate": 0.0007994923857868021,
+      "loss": 0.6924,
+      "step": 3150
+    },
+    {
+      "epoch": 0.16017057785787717,
+      "grad_norm": 0.05191372708283371,
+      "learning_rate": 0.000800761421319797,
+      "loss": 0.6651,
+      "step": 3155
+    },
+    {
+      "epoch": 0.16042441395590867,
+      "grad_norm": 0.065309480406257,
+      "learning_rate": 0.0008020304568527918,
+      "loss": 0.6873,
+      "step": 3160
+    },
+    {
+      "epoch": 0.16067825005394018,
+      "grad_norm": 0.04035360012723001,
+      "learning_rate": 0.0008032994923857868,
+      "loss": 0.7014,
+      "step": 3165
+    },
+    {
+      "epoch": 0.16093208615197166,
+      "grad_norm": 0.03787636570491385,
+      "learning_rate": 0.0008045685279187818,
+      "loss": 0.7184,
+      "step": 3170
+    },
+    {
+      "epoch": 0.16118592225000317,
+      "grad_norm": 0.04892199114261499,
+      "learning_rate": 0.0008058375634517766,
+      "loss": 0.7074,
+      "step": 3175
+    },
+    {
+      "epoch": 0.16143975834803467,
+      "grad_norm": 0.044454591373735045,
+      "learning_rate": 0.0008071065989847715,
+      "loss": 0.6681,
+      "step": 3180
+    },
+    {
+      "epoch": 0.16169359444606618,
+      "grad_norm": 0.05477810689691977,
+      "learning_rate": 0.0008083756345177665,
+      "loss": 0.7073,
+      "step": 3185
+    },
+    {
+      "epoch": 0.1619474305440977,
+      "grad_norm": 0.1561533627636135,
+      "learning_rate": 0.0008096446700507615,
+      "loss": 0.7062,
+      "step": 3190
+    },
+    {
+      "epoch": 0.16220126664212917,
+      "grad_norm": 0.07695082260270414,
+      "learning_rate": 0.0008109137055837564,
+      "loss": 0.6695,
+      "step": 3195
+    },
+    {
+      "epoch": 0.16245510274016067,
+      "grad_norm": 0.0675592941811925,
+      "learning_rate": 0.0008121827411167512,
+      "loss": 0.6852,
+      "step": 3200
+    },
+    {
+      "epoch": 0.16270893883819218,
+      "grad_norm": 0.052732094884202066,
+      "learning_rate": 0.0008134517766497462,
+      "loss": 0.7046,
+      "step": 3205
+    },
+    {
+      "epoch": 0.1629627749362237,
+      "grad_norm": 0.055496608441699284,
+      "learning_rate": 0.0008147208121827412,
+      "loss": 0.6916,
+      "step": 3210
+    },
+    {
+      "epoch": 0.1632166110342552,
+      "grad_norm": 0.043427396820308814,
+      "learning_rate": 0.0008159898477157361,
+      "loss": 0.7338,
+      "step": 3215
+    },
+    {
+      "epoch": 0.16347044713228667,
+      "grad_norm": 0.04788846665606786,
+      "learning_rate": 0.000817258883248731,
+      "loss": 0.7164,
+      "step": 3220
+    },
+    {
+      "epoch": 0.16372428323031818,
+      "grad_norm": 0.05499792725011934,
+      "learning_rate": 0.0008185279187817259,
+      "loss": 0.6441,
+      "step": 3225
+    },
+    {
+      "epoch": 0.1639781193283497,
+      "grad_norm": 0.07060079550345069,
+      "learning_rate": 0.0008197969543147208,
+      "loss": 0.6704,
+      "step": 3230
+    },
+    {
+      "epoch": 0.1642319554263812,
+      "grad_norm": 0.04697902634882019,
+      "learning_rate": 0.0008210659898477158,
+      "loss": 0.6896,
+      "step": 3235
+    },
+    {
+      "epoch": 0.1644857915244127,
+      "grad_norm": 0.03693026821420421,
+      "learning_rate": 0.0008223350253807107,
+      "loss": 0.6979,
+      "step": 3240
+    },
+    {
+      "epoch": 0.16473962762244418,
+      "grad_norm": 0.044236930949549086,
+      "learning_rate": 0.0008236040609137056,
+      "loss": 0.6727,
+      "step": 3245
+    },
+    {
+      "epoch": 0.16499346372047569,
+      "grad_norm": 0.04297691167933406,
+      "learning_rate": 0.0008248730964467005,
+      "loss": 0.7183,
+      "step": 3250
+    },
+    {
+      "epoch": 0.1652472998185072,
+      "grad_norm": 0.03926872438952914,
+      "learning_rate": 0.0008261421319796955,
+      "loss": 0.6678,
+      "step": 3255
+    },
+    {
+      "epoch": 0.1655011359165387,
+      "grad_norm": 0.058748589368983826,
+      "learning_rate": 0.0008274111675126904,
+      "loss": 0.6859,
+      "step": 3260
+    },
+    {
+      "epoch": 0.16575497201457018,
+      "grad_norm": 0.043071867704478226,
+      "learning_rate": 0.0008286802030456852,
+      "loss": 0.6584,
+      "step": 3265
+    },
+    {
+      "epoch": 0.16600880811260169,
+      "grad_norm": 0.036727442528139344,
+      "learning_rate": 0.0008299492385786802,
+      "loss": 0.6644,
+      "step": 3270
+    },
+    {
+      "epoch": 0.1662626442106332,
+      "grad_norm": 0.045406805927585635,
+      "learning_rate": 0.0008312182741116752,
+      "loss": 0.6902,
+      "step": 3275
+    },
+    {
+      "epoch": 0.1665164803086647,
+      "grad_norm": 0.03886588985080634,
+      "learning_rate": 0.0008324873096446702,
+      "loss": 0.6912,
+      "step": 3280
+    },
+    {
+      "epoch": 0.1667703164066962,
+      "grad_norm": 0.03860450090032993,
+      "learning_rate": 0.0008337563451776649,
+      "loss": 0.6918,
+      "step": 3285
+    },
+    {
+      "epoch": 0.16702415250472769,
+      "grad_norm": 0.03919405708529676,
+      "learning_rate": 0.0008350253807106599,
+      "loss": 0.7097,
+      "step": 3290
+    },
+    {
+      "epoch": 0.1672779886027592,
+      "grad_norm": 0.06384676391238123,
+      "learning_rate": 0.0008362944162436549,
+      "loss": 0.6769,
+      "step": 3295
+    },
+    {
+      "epoch": 0.1675318247007907,
+      "grad_norm": 0.04387951589525683,
+      "learning_rate": 0.0008375634517766498,
+      "loss": 0.7001,
+      "step": 3300
+    },
+    {
+      "epoch": 0.1677856607988222,
+      "grad_norm": 0.04020760640485976,
+      "learning_rate": 0.0008388324873096446,
+      "loss": 0.7,
+      "step": 3305
+    },
+    {
+      "epoch": 0.1680394968968537,
+      "grad_norm": 0.04343991292299841,
+      "learning_rate": 0.0008401015228426396,
+      "loss": 0.6653,
+      "step": 3310
+    },
+    {
+      "epoch": 0.1682933329948852,
+      "grad_norm": 0.04877171128678498,
+      "learning_rate": 0.0008413705583756346,
+      "loss": 0.6491,
+      "step": 3315
+    },
+    {
+      "epoch": 0.1685471690929167,
+      "grad_norm": 0.04489329625384213,
+      "learning_rate": 0.0008426395939086295,
+      "loss": 0.6944,
+      "step": 3320
+    },
+    {
+      "epoch": 0.1688010051909482,
+      "grad_norm": 0.03924346550703404,
+      "learning_rate": 0.0008439086294416243,
+      "loss": 0.7004,
+      "step": 3325
+    },
+    {
+      "epoch": 0.1690548412889797,
+      "grad_norm": 0.0351682478604759,
+      "learning_rate": 0.0008451776649746193,
+      "loss": 0.6289,
+      "step": 3330
+    },
+    {
+      "epoch": 0.16930867738701122,
+      "grad_norm": 0.05804265280485358,
+      "learning_rate": 0.0008464467005076142,
+      "loss": 0.6819,
+      "step": 3335
+    },
+    {
+      "epoch": 0.1695625134850427,
+      "grad_norm": 0.05558949651884164,
+      "learning_rate": 0.0008477157360406092,
+      "loss": 0.6995,
+      "step": 3340
+    },
+    {
+      "epoch": 0.1698163495830742,
+      "grad_norm": 0.07239860999828403,
+      "learning_rate": 0.0008489847715736041,
+      "loss": 0.698,
+      "step": 3345
+    },
+    {
+      "epoch": 0.1700701856811057,
+      "grad_norm": 0.04268065537159936,
+      "learning_rate": 0.000850253807106599,
+      "loss": 0.6724,
+      "step": 3350
+    },
+    {
+      "epoch": 0.17032402177913722,
+      "grad_norm": 0.04596502747992512,
+      "learning_rate": 0.0008515228426395939,
+      "loss": 0.715,
+      "step": 3355
+    },
+    {
+      "epoch": 0.17057785787716873,
+      "grad_norm": 0.07978824559788683,
+      "learning_rate": 0.0008527918781725889,
+      "loss": 0.6687,
+      "step": 3360
+    },
+    {
+      "epoch": 0.1708316939752002,
+      "grad_norm": 0.03941742907400371,
+      "learning_rate": 0.0008540609137055838,
+      "loss": 0.6799,
+      "step": 3365
+    },
+    {
+      "epoch": 0.1710855300732317,
+      "grad_norm": 0.0527239971063264,
+      "learning_rate": 0.0008553299492385786,
+      "loss": 0.7063,
+      "step": 3370
+    },
+    {
+      "epoch": 0.17133936617126322,
+      "grad_norm": 3.822834197350998,
+      "learning_rate": 0.0008565989847715736,
+      "loss": 0.696,
+      "step": 3375
+    },
+    {
+      "epoch": 0.17159320226929473,
+      "grad_norm": 0.083121560754106,
+      "learning_rate": 0.0008578680203045686,
+      "loss": 0.6914,
+      "step": 3380
+    },
+    {
+      "epoch": 0.1718470383673262,
+      "grad_norm": 0.06454785794212209,
+      "learning_rate": 0.0008591370558375635,
+      "loss": 0.7085,
+      "step": 3385
+    },
+    {
+      "epoch": 0.1721008744653577,
+      "grad_norm": 0.03945210916226416,
+      "learning_rate": 0.0008604060913705583,
+      "loss": 0.682,
+      "step": 3390
+    },
+    {
+      "epoch": 0.17235471056338922,
+      "grad_norm": 0.04193981751233244,
+      "learning_rate": 0.0008616751269035533,
+      "loss": 0.666,
+      "step": 3395
+    },
+    {
+      "epoch": 0.17260854666142073,
+      "grad_norm": 0.03967018442585797,
+      "learning_rate": 0.0008629441624365483,
+      "loss": 0.6608,
+      "step": 3400
+    },
+    {
+      "epoch": 0.17286238275945223,
+      "grad_norm": 0.04644004597796404,
+      "learning_rate": 0.0008642131979695432,
+      "loss": 0.6535,
+      "step": 3405
+    },
+    {
+      "epoch": 0.1731162188574837,
+      "grad_norm": 0.03873588952866688,
+      "learning_rate": 0.000865482233502538,
+      "loss": 0.7228,
+      "step": 3410
+    },
+    {
+      "epoch": 0.17337005495551522,
+      "grad_norm": 0.31833461711624433,
+      "learning_rate": 0.000866751269035533,
+      "loss": 0.7115,
+      "step": 3415
+    },
+    {
+      "epoch": 0.17362389105354673,
+      "grad_norm": 0.35935016596831215,
+      "learning_rate": 0.000868020304568528,
+      "loss": 0.6879,
+      "step": 3420
+    },
+    {
+      "epoch": 0.17387772715157823,
+      "grad_norm": 0.04780743742889909,
+      "learning_rate": 0.0008692893401015229,
+      "loss": 0.6907,
+      "step": 3425
+    },
+    {
+      "epoch": 0.17413156324960974,
+      "grad_norm": 0.04110231970405351,
+      "learning_rate": 0.0008705583756345177,
+      "loss": 0.7238,
+      "step": 3430
+    },
+    {
+      "epoch": 0.17438539934764122,
+      "grad_norm": 0.046525580251483005,
+      "learning_rate": 0.0008718274111675127,
+      "loss": 0.6778,
+      "step": 3435
+    },
+    {
+      "epoch": 0.17463923544567272,
+      "grad_norm": 0.041718923819856486,
+      "learning_rate": 0.0008730964467005076,
+      "loss": 0.6486,
+      "step": 3440
+    },
+    {
+      "epoch": 0.17489307154370423,
+      "grad_norm": 0.04055383524511006,
+      "learning_rate": 0.0008743654822335026,
+      "loss": 0.6968,
+      "step": 3445
+    },
+    {
+      "epoch": 0.17514690764173574,
+      "grad_norm": 0.045736208992562415,
+      "learning_rate": 0.0008756345177664975,
+      "loss": 0.6675,
+      "step": 3450
+    },
+    {
+      "epoch": 0.17540074373976725,
+      "grad_norm": 0.06303424703405444,
+      "learning_rate": 0.0008769035532994924,
+      "loss": 0.7083,
+      "step": 3455
+    },
+    {
+      "epoch": 0.17565457983779872,
+      "grad_norm": 0.03565589845558804,
+      "learning_rate": 0.0008781725888324873,
+      "loss": 0.6513,
+      "step": 3460
+    },
+    {
+      "epoch": 0.17590841593583023,
+      "grad_norm": 0.039028072883575,
+      "learning_rate": 0.0008794416243654823,
+      "loss": 0.6768,
+      "step": 3465
+    },
+    {
+      "epoch": 0.17616225203386174,
+      "grad_norm": 0.03826237652273635,
+      "learning_rate": 0.0008807106598984772,
+      "loss": 0.6931,
+      "step": 3470
+    },
+    {
+      "epoch": 0.17641608813189325,
+      "grad_norm": 0.03786297862349843,
+      "learning_rate": 0.000881979695431472,
+      "loss": 0.6665,
+      "step": 3475
+    },
+    {
+      "epoch": 0.17666992422992472,
+      "grad_norm": 0.035346860951597725,
+      "learning_rate": 0.000883248730964467,
+      "loss": 0.6739,
+      "step": 3480
+    },
+    {
+      "epoch": 0.17692376032795623,
+      "grad_norm": 0.038526863874646516,
+      "learning_rate": 0.000884517766497462,
+      "loss": 0.665,
+      "step": 3485
+    },
+    {
+      "epoch": 0.17717759642598774,
+      "grad_norm": 0.06026897162610092,
+      "learning_rate": 0.0008857868020304569,
+      "loss": 0.68,
+      "step": 3490
+    },
+    {
+      "epoch": 0.17743143252401924,
+      "grad_norm": 0.05069239821159444,
+      "learning_rate": 0.0008870558375634517,
+      "loss": 0.7229,
+      "step": 3495
+    },
+    {
+      "epoch": 0.17768526862205075,
+      "grad_norm": 0.09985009714735808,
+      "learning_rate": 0.0008883248730964467,
+      "loss": 0.6917,
+      "step": 3500
+    },
+    {
+      "epoch": 0.17793910472008223,
+      "grad_norm": 0.10170892988817608,
+      "learning_rate": 0.0008895939086294417,
+      "loss": 0.7233,
+      "step": 3505
+    },
+    {
+      "epoch": 0.17819294081811374,
+      "grad_norm": 0.07075066250119805,
+      "learning_rate": 0.0008908629441624366,
+      "loss": 0.669,
+      "step": 3510
+    },
+    {
+      "epoch": 0.17844677691614524,
+      "grad_norm": 0.08757273613751611,
+      "learning_rate": 0.0008921319796954314,
+      "loss": 0.6676,
+      "step": 3515
+    },
+    {
+      "epoch": 0.17870061301417675,
+      "grad_norm": 0.038714672890372746,
+      "learning_rate": 0.0008934010152284264,
+      "loss": 0.7115,
+      "step": 3520
+    },
+    {
+      "epoch": 0.17895444911220826,
+      "grad_norm": 0.04036233500547512,
+      "learning_rate": 0.0008946700507614214,
+      "loss": 0.6783,
+      "step": 3525
+    },
+    {
+      "epoch": 0.17920828521023974,
+      "grad_norm": 0.06230842789641548,
+      "learning_rate": 0.0008959390862944163,
+      "loss": 0.669,
+      "step": 3530
+    },
+    {
+      "epoch": 0.17946212130827124,
+      "grad_norm": 0.07454575938873274,
+      "learning_rate": 0.0008972081218274111,
+      "loss": 0.7506,
+      "step": 3535
+    },
+    {
+      "epoch": 0.17971595740630275,
+      "grad_norm": 0.06240895100177482,
+      "learning_rate": 0.0008984771573604061,
+      "loss": 0.7194,
+      "step": 3540
+    },
+    {
+      "epoch": 0.17996979350433426,
+      "grad_norm": 0.04630554537296931,
+      "learning_rate": 0.000899746192893401,
+      "loss": 0.7211,
+      "step": 3545
+    },
+    {
+      "epoch": 0.18022362960236576,
+      "grad_norm": 0.056718557931670986,
+      "learning_rate": 0.000901015228426396,
+      "loss": 0.7443,
+      "step": 3550
+    },
+    {
+      "epoch": 0.18047746570039724,
+      "grad_norm": 0.053855068524236355,
+      "learning_rate": 0.0009022842639593909,
+      "loss": 0.6938,
+      "step": 3555
+    },
+    {
+      "epoch": 0.18073130179842875,
+      "grad_norm": 0.04098910289114666,
+      "learning_rate": 0.0009035532994923858,
+      "loss": 0.6882,
+      "step": 3560
+    },
+    {
+      "epoch": 0.18098513789646026,
+      "grad_norm": 0.043572400017178894,
+      "learning_rate": 0.0009048223350253807,
+      "loss": 0.7237,
+      "step": 3565
+    },
+    {
+      "epoch": 0.18123897399449176,
+      "grad_norm": 0.05122515570696231,
+      "learning_rate": 0.0009060913705583757,
+      "loss": 0.6965,
+      "step": 3570
+    },
+    {
+      "epoch": 0.18149281009252327,
+      "grad_norm": 0.0486904734197597,
+      "learning_rate": 0.0009073604060913706,
+      "loss": 0.6821,
+      "step": 3575
+    },
+    {
+      "epoch": 0.18174664619055475,
+      "grad_norm": 0.14211715338289158,
+      "learning_rate": 0.0009086294416243654,
+      "loss": 0.6776,
+      "step": 3580
+    },
+    {
+      "epoch": 0.18200048228858626,
+      "grad_norm": 0.04140861099483773,
+      "learning_rate": 0.0009098984771573604,
+      "loss": 0.716,
+      "step": 3585
+    },
+    {
+      "epoch": 0.18225431838661776,
+      "grad_norm": 0.05317734506655789,
+      "learning_rate": 0.0009111675126903554,
+      "loss": 0.7328,
+      "step": 3590
+    },
+    {
+      "epoch": 0.18250815448464927,
+      "grad_norm": 0.04412671048588959,
+      "learning_rate": 0.0009124365482233503,
+      "loss": 0.7013,
+      "step": 3595
+    },
+    {
+      "epoch": 0.18276199058268075,
+      "grad_norm": 0.043226638147343656,
+      "learning_rate": 0.0009137055837563451,
+      "loss": 0.7068,
+      "step": 3600
+    },
+    {
+      "epoch": 0.18301582668071226,
+      "grad_norm": 0.03626076158298662,
+      "learning_rate": 0.0009149746192893401,
+      "loss": 0.6853,
+      "step": 3605
+    },
+    {
+      "epoch": 0.18326966277874376,
+      "grad_norm": 0.047674224438480246,
+      "learning_rate": 0.0009162436548223351,
+      "loss": 0.6745,
+      "step": 3610
+    },
+    {
+      "epoch": 0.18352349887677527,
+      "grad_norm": 0.04231046030159459,
+      "learning_rate": 0.0009175126903553299,
+      "loss": 0.6849,
+      "step": 3615
+    },
+    {
+      "epoch": 0.18377733497480678,
+      "grad_norm": 0.04195288389214527,
+      "learning_rate": 0.0009187817258883248,
+      "loss": 0.7041,
+      "step": 3620
+    },
+    {
+      "epoch": 0.18403117107283826,
+      "grad_norm": 0.039251889309433935,
+      "learning_rate": 0.0009200507614213198,
+      "loss": 0.6504,
+      "step": 3625
+    },
+    {
+      "epoch": 0.18428500717086976,
+      "grad_norm": 0.03738914770413547,
+      "learning_rate": 0.0009213197969543148,
+      "loss": 0.7048,
+      "step": 3630
+    },
+    {
+      "epoch": 0.18453884326890127,
+      "grad_norm": 0.0436948152095552,
+      "learning_rate": 0.0009225888324873097,
+      "loss": 0.6771,
+      "step": 3635
+    },
+    {
+      "epoch": 0.18479267936693278,
+      "grad_norm": 0.03767490570058011,
+      "learning_rate": 0.0009238578680203045,
+      "loss": 0.6494,
+      "step": 3640
+    },
+    {
+      "epoch": 0.18504651546496428,
+      "grad_norm": 0.037090867993691726,
+      "learning_rate": 0.0009251269035532995,
+      "loss": 0.6545,
+      "step": 3645
+    },
+    {
+      "epoch": 0.18530035156299576,
+      "grad_norm": 0.042846186747705906,
+      "learning_rate": 0.0009263959390862944,
+      "loss": 0.6573,
+      "step": 3650
+    },
+    {
+      "epoch": 0.18555418766102727,
+      "grad_norm": 0.046132833928787344,
+      "learning_rate": 0.0009276649746192894,
+      "loss": 0.6716,
+      "step": 3655
+    },
+    {
+      "epoch": 0.18580802375905878,
+      "grad_norm": 0.04810476990314317,
+      "learning_rate": 0.0009289340101522843,
+      "loss": 0.6552,
+      "step": 3660
+    },
+    {
+      "epoch": 0.18606185985709028,
+      "grad_norm": 0.05725224131003678,
+      "learning_rate": 0.0009302030456852792,
+      "loss": 0.7188,
+      "step": 3665
+    },
+    {
+      "epoch": 0.1863156959551218,
+      "grad_norm": 0.03760096816148779,
+      "learning_rate": 0.0009314720812182741,
+      "loss": 0.679,
+      "step": 3670
+    },
+    {
+      "epoch": 0.18656953205315327,
+      "grad_norm": 0.0414336045399388,
+      "learning_rate": 0.0009327411167512691,
+      "loss": 0.703,
+      "step": 3675
+    },
+    {
+      "epoch": 0.18682336815118478,
+      "grad_norm": 0.04888803816351589,
+      "learning_rate": 0.000934010152284264,
+      "loss": 0.6984,
+      "step": 3680
+    },
+    {
+      "epoch": 0.18707720424921628,
+      "grad_norm": 0.03932910243932595,
+      "learning_rate": 0.0009352791878172588,
+      "loss": 0.6935,
+      "step": 3685
+    },
+    {
+      "epoch": 0.1873310403472478,
+      "grad_norm": 0.053329295837259566,
+      "learning_rate": 0.0009365482233502538,
+      "loss": 0.6654,
+      "step": 3690
+    },
+    {
+      "epoch": 0.18758487644527927,
+      "grad_norm": 0.04033686402339195,
+      "learning_rate": 0.0009378172588832488,
+      "loss": 0.6656,
+      "step": 3695
+    },
+    {
+      "epoch": 0.18783871254331078,
+      "grad_norm": 0.05078207384763503,
+      "learning_rate": 0.0009390862944162437,
+      "loss": 0.6616,
+      "step": 3700
+    },
+    {
+      "epoch": 0.18809254864134228,
+      "grad_norm": 0.03612814331688169,
+      "learning_rate": 0.0009403553299492385,
+      "loss": 0.6856,
+      "step": 3705
+    },
+    {
+      "epoch": 0.1883463847393738,
+      "grad_norm": 0.04152827694147588,
+      "learning_rate": 0.0009416243654822335,
+      "loss": 0.707,
+      "step": 3710
+    },
+    {
+      "epoch": 0.1886002208374053,
+      "grad_norm": 0.03476616159624936,
+      "learning_rate": 0.0009428934010152285,
+      "loss": 0.6786,
+      "step": 3715
+    },
+    {
+      "epoch": 0.18885405693543678,
+      "grad_norm": 0.03981634185742135,
+      "learning_rate": 0.0009441624365482235,
+      "loss": 0.7042,
+      "step": 3720
+    },
+    {
+      "epoch": 0.18910789303346828,
+      "grad_norm": 0.04204868547098972,
+      "learning_rate": 0.0009454314720812182,
+      "loss": 0.6841,
+      "step": 3725
+    },
+    {
+      "epoch": 0.1893617291314998,
+      "grad_norm": 0.039850666477108665,
+      "learning_rate": 0.0009467005076142132,
+      "loss": 0.6583,
+      "step": 3730
+    },
+    {
+      "epoch": 0.1896155652295313,
+      "grad_norm": 0.03866838212345305,
+      "learning_rate": 0.0009479695431472082,
+      "loss": 0.7246,
+      "step": 3735
+    },
+    {
+      "epoch": 0.1898694013275628,
+      "grad_norm": 0.040841413496425324,
+      "learning_rate": 0.0009492385786802031,
+      "loss": 0.6598,
+      "step": 3740
+    },
+    {
+      "epoch": 0.19012323742559428,
+      "grad_norm": 0.04010141554319208,
+      "learning_rate": 0.000950507614213198,
+      "loss": 0.6813,
+      "step": 3745
+    },
+    {
+      "epoch": 0.1903770735236258,
+      "grad_norm": 0.06530308653395914,
+      "learning_rate": 0.0009517766497461929,
+      "loss": 0.6685,
+      "step": 3750
+    },
+    {
+      "epoch": 0.1906309096216573,
+      "grad_norm": 0.07450856843136024,
+      "learning_rate": 0.0009530456852791879,
+      "loss": 0.6874,
+      "step": 3755
+    },
+    {
+      "epoch": 0.1908847457196888,
+      "grad_norm": 0.035499974530579376,
+      "learning_rate": 0.0009543147208121828,
+      "loss": 0.681,
+      "step": 3760
+    },
+    {
+      "epoch": 0.1911385818177203,
+      "grad_norm": 0.0437247777174162,
+      "learning_rate": 0.0009555837563451777,
+      "loss": 0.6848,
+      "step": 3765
+    },
+    {
+      "epoch": 0.1913924179157518,
+      "grad_norm": 0.035676219852568955,
+      "learning_rate": 0.0009568527918781726,
+      "loss": 0.627,
+      "step": 3770
+    },
+    {
+      "epoch": 0.1916462540137833,
+      "grad_norm": 0.03768653840820619,
+      "learning_rate": 0.0009581218274111675,
+      "loss": 0.6689,
+      "step": 3775
+    },
+    {
+      "epoch": 0.1919000901118148,
+      "grad_norm": 0.03823207024695021,
+      "learning_rate": 0.0009593908629441625,
+      "loss": 0.6335,
+      "step": 3780
+    },
+    {
+      "epoch": 0.1921539262098463,
+      "grad_norm": 0.04684915109347205,
+      "learning_rate": 0.0009606598984771574,
+      "loss": 0.6975,
+      "step": 3785
+    },
+    {
+      "epoch": 0.19240776230787782,
+      "grad_norm": 0.04745389321415872,
+      "learning_rate": 0.0009619289340101523,
+      "loss": 0.7062,
+      "step": 3790
+    },
+    {
+      "epoch": 0.1926615984059093,
+      "grad_norm": 0.03760091854535719,
+      "learning_rate": 0.0009631979695431472,
+      "loss": 0.6646,
+      "step": 3795
+    },
+    {
+      "epoch": 0.1929154345039408,
+      "grad_norm": 0.03932186553338342,
+      "learning_rate": 0.0009644670050761422,
+      "loss": 0.6743,
+      "step": 3800
+    },
+    {
+      "epoch": 0.1931692706019723,
+      "grad_norm": 0.03951929870880764,
+      "learning_rate": 0.0009657360406091371,
+      "loss": 0.6831,
+      "step": 3805
+    },
+    {
+      "epoch": 0.19342310670000382,
+      "grad_norm": 0.03553090190878169,
+      "learning_rate": 0.0009670050761421319,
+      "loss": 0.6571,
+      "step": 3810
+    },
+    {
+      "epoch": 0.1936769427980353,
+      "grad_norm": 0.06138620851402293,
+      "learning_rate": 0.0009682741116751269,
+      "loss": 0.6986,
+      "step": 3815
+    },
+    {
+      "epoch": 0.1939307788960668,
+      "grad_norm": 0.24733244873927648,
+      "learning_rate": 0.0009695431472081219,
+      "loss": 0.6976,
+      "step": 3820
+    },
+    {
+      "epoch": 0.1941846149940983,
+      "grad_norm": 0.0666364936631057,
+      "learning_rate": 0.0009708121827411168,
+      "loss": 0.7391,
+      "step": 3825
+    },
+    {
+      "epoch": 0.19443845109212982,
+      "grad_norm": 0.07967106187155334,
+      "learning_rate": 0.0009720812182741116,
+      "loss": 0.6993,
+      "step": 3830
+    },
+    {
+      "epoch": 0.19469228719016132,
+      "grad_norm": 0.051138778909318464,
+      "learning_rate": 0.0009733502538071066,
+      "loss": 0.7191,
+      "step": 3835
+    },
+    {
+      "epoch": 0.1949461232881928,
+      "grad_norm": 0.5413258576711231,
+      "learning_rate": 0.0009746192893401016,
+      "loss": 0.7592,
+      "step": 3840
+    },
+    {
+      "epoch": 0.1951999593862243,
+      "grad_norm": 0.1471254894588324,
+      "learning_rate": 0.0009758883248730965,
+      "loss": 0.7572,
+      "step": 3845
+    },
+    {
+      "epoch": 0.19545379548425582,
+      "grad_norm": 0.2706851022164675,
+      "learning_rate": 0.0009771573604060915,
+      "loss": 0.6907,
+      "step": 3850
+    },
+    {
+      "epoch": 0.19570763158228732,
+      "grad_norm": 0.14415862874984905,
+      "learning_rate": 0.0009784263959390863,
+      "loss": 0.7062,
+      "step": 3855
+    },
+    {
+      "epoch": 0.19596146768031883,
+      "grad_norm": 0.14791455727385014,
+      "learning_rate": 0.0009796954314720812,
+      "loss": 0.7451,
+      "step": 3860
+    },
+    {
+      "epoch": 0.1962153037783503,
+      "grad_norm": 0.12586017082802645,
+      "learning_rate": 0.000980964467005076,
+      "loss": 0.6971,
+      "step": 3865
+    },
+    {
+      "epoch": 0.19646913987638182,
+      "grad_norm": 0.0826384222127971,
+      "learning_rate": 0.0009822335025380712,
+      "loss": 0.7288,
+      "step": 3870
+    },
+    {
+      "epoch": 0.19672297597441332,
+      "grad_norm": 0.4096552727669512,
+      "learning_rate": 0.000983502538071066,
+      "loss": 0.6983,
+      "step": 3875
+    },
+    {
+      "epoch": 0.19697681207244483,
+      "grad_norm": 0.126131262033545,
+      "learning_rate": 0.000984771573604061,
+      "loss": 0.7836,
+      "step": 3880
+    },
+    {
+      "epoch": 0.19723064817047634,
+      "grad_norm": 0.1721626969232291,
+      "learning_rate": 0.0009860406091370558,
+      "loss": 0.7095,
+      "step": 3885
+    },
+    {
+      "epoch": 0.19748448426850782,
+      "grad_norm": 0.1375632163342092,
+      "learning_rate": 0.0009873096446700509,
+      "loss": 0.7249,
+      "step": 3890
+    },
+    {
+      "epoch": 0.19773832036653932,
+      "grad_norm": 0.16481287346009105,
+      "learning_rate": 0.0009885786802030457,
+      "loss": 0.7144,
+      "step": 3895
+    },
+    {
+      "epoch": 0.19799215646457083,
+      "grad_norm": 0.08436713045812702,
+      "learning_rate": 0.0009898477157360406,
+      "loss": 0.7119,
+      "step": 3900
+    },
+    {
+      "epoch": 0.19824599256260234,
+      "grad_norm": 0.05626919966189561,
+      "learning_rate": 0.0009911167512690355,
+      "loss": 0.7135,
+      "step": 3905
+    },
+    {
+      "epoch": 0.19849982866063384,
+      "grad_norm": 0.04456845230313172,
+      "learning_rate": 0.0009923857868020306,
+      "loss": 0.6619,
+      "step": 3910
+    },
+    {
+      "epoch": 0.19875366475866532,
+      "grad_norm": 0.07101056929004967,
+      "learning_rate": 0.0009936548223350254,
+      "loss": 0.7022,
+      "step": 3915
+    },
+    {
+      "epoch": 0.19900750085669683,
+      "grad_norm": 0.04108441619780391,
+      "learning_rate": 0.0009949238578680203,
+      "loss": 0.7024,
+      "step": 3920
+    },
+    {
+      "epoch": 0.19926133695472834,
+      "grad_norm": 0.07150668017794924,
+      "learning_rate": 0.0009961928934010152,
+      "loss": 0.7146,
+      "step": 3925
+    },
+    {
+      "epoch": 0.19951517305275984,
+      "grad_norm": 0.058887395407361695,
+      "learning_rate": 0.0009974619289340103,
+      "loss": 0.7013,
+      "step": 3930
+    },
+    {
+      "epoch": 0.19976900915079132,
+      "grad_norm": 0.08301973441191342,
+      "learning_rate": 0.0009987309644670051,
+      "loss": 0.7245,
+      "step": 3935
+    },
+    {
+      "epoch": 0.20002284524882283,
+      "grad_norm": 0.13425731803767796,
+      "learning_rate": 0.001,
+      "loss": 0.7384,
+      "step": 3940
+    },
+    {
+      "epoch": 0.20027668134685433,
+      "grad_norm": 0.0954240553881326,
+      "learning_rate": 0.0009999999509262467,
+      "loss": 0.7082,
+      "step": 3945
+    },
+    {
+      "epoch": 0.20053051744488584,
+      "grad_norm": 0.09001615927204533,
+      "learning_rate": 0.0009999998037049968,
+      "loss": 0.7479,
+      "step": 3950
+    },
+    {
+      "epoch": 0.20078435354291735,
+      "grad_norm": 0.055536250218606135,
+      "learning_rate": 0.0009999995583362786,
+      "loss": 0.7217,
+      "step": 3955
+    },
+    {
+      "epoch": 0.20103818964094883,
+      "grad_norm": 0.039988384096081575,
+      "learning_rate": 0.0009999992148201407,
+      "loss": 0.7023,
+      "step": 3960
+    },
+    {
+      "epoch": 0.20129202573898033,
+      "grad_norm": 0.07631095658372448,
+      "learning_rate": 0.0009999987731566505,
+      "loss": 0.6593,
+      "step": 3965
+    },
+    {
+      "epoch": 0.20154586183701184,
+      "grad_norm": 0.055019840260049975,
+      "learning_rate": 0.0009999982333458942,
+      "loss": 0.7141,
+      "step": 3970
+    },
+    {
+      "epoch": 0.20179969793504335,
+      "grad_norm": 0.13389800181440178,
+      "learning_rate": 0.0009999975953879788,
+      "loss": 0.8059,
+      "step": 3975
+    },
+    {
+      "epoch": 0.20205353403307486,
+      "grad_norm": 0.10002422022594674,
+      "learning_rate": 0.0009999968592830286,
+      "loss": 0.7418,
+      "step": 3980
+    },
+    {
+      "epoch": 0.20230737013110633,
+      "grad_norm": 0.065235305911112,
+      "learning_rate": 0.0009999960250311885,
+      "loss": 0.7388,
+      "step": 3985
+    },
+    {
+      "epoch": 0.20256120622913784,
+      "grad_norm": 0.07700805891945464,
+      "learning_rate": 0.0009999950926326221,
+      "loss": 0.7619,
+      "step": 3990
+    },
+    {
+      "epoch": 0.20281504232716935,
+      "grad_norm": 0.05585759735632213,
+      "learning_rate": 0.0009999940620875124,
+      "loss": 0.7262,
+      "step": 3995
+    },
+    {
+      "epoch": 0.20306887842520085,
+      "grad_norm": 0.04210144533533203,
+      "learning_rate": 0.0009999929333960617,
+      "loss": 0.7003,
+      "step": 4000
+    },
+    {
+      "epoch": 0.20332271452323236,
+      "grad_norm": 0.04939954030213548,
+      "learning_rate": 0.0009999917065584918,
+      "loss": 0.7223,
+      "step": 4005
+    },
+    {
+      "epoch": 0.20357655062126384,
+      "grad_norm": 0.05682863013701724,
+      "learning_rate": 0.0009999903815750436,
+      "loss": 0.7366,
+      "step": 4010
+    },
+    {
+      "epoch": 0.20383038671929535,
+      "grad_norm": 0.07201666259364202,
+      "learning_rate": 0.0009999889584459765,
+      "loss": 0.7237,
+      "step": 4015
+    },
+    {
+      "epoch": 0.20408422281732685,
+      "grad_norm": 0.05415233755750322,
+      "learning_rate": 0.0009999874371715706,
+      "loss": 0.7099,
+      "step": 4020
+    },
+    {
+      "epoch": 0.20433805891535836,
+      "grad_norm": 0.09491299760351525,
+      "learning_rate": 0.0009999858177521242,
+      "loss": 0.9184,
+      "step": 4025
+    },
+    {
+      "epoch": 0.20459189501338984,
+      "grad_norm": 0.09564328359072853,
+      "learning_rate": 0.0009999841001879551,
+      "loss": 0.799,
+      "step": 4030
+    },
+    {
+      "epoch": 0.20484573111142135,
+      "grad_norm": 2.399109555715659,
+      "learning_rate": 0.0009999822844794005,
+      "loss": 1.0065,
+      "step": 4035
+    },
+    {
+      "epoch": 0.20509956720945285,
+      "grad_norm": 0.5227791347356591,
+      "learning_rate": 0.000999980370626817,
+      "loss": 0.8278,
+      "step": 4040
+    },
+    {
+      "epoch": 0.20535340330748436,
+      "grad_norm": 0.16651871729788892,
+      "learning_rate": 0.00099997835863058,
+      "loss": 0.8614,
+      "step": 4045
+    },
+    {
+      "epoch": 0.20560723940551587,
+      "grad_norm": 0.18221807327257578,
+      "learning_rate": 0.0009999762484910846,
+      "loss": 0.8153,
+      "step": 4050
+    },
+    {
+      "epoch": 0.20586107550354735,
+      "grad_norm": 0.10310087152591199,
+      "learning_rate": 0.0009999740402087452,
+      "loss": 0.8029,
+      "step": 4055
+    },
+    {
+      "epoch": 0.20611491160157885,
+      "grad_norm": 0.0703395551608927,
+      "learning_rate": 0.0009999717337839948,
+      "loss": 0.7577,
+      "step": 4060
+    },
+    {
+      "epoch": 0.20636874769961036,
+      "grad_norm": 0.08012644608333368,
+      "learning_rate": 0.0009999693292172865,
+      "loss": 0.7503,
+      "step": 4065
+    },
+    {
+      "epoch": 0.20662258379764187,
+      "grad_norm": 0.06287680300777991,
+      "learning_rate": 0.0009999668265090924,
+      "loss": 0.7382,
+      "step": 4070
+    },
+    {
+      "epoch": 0.20687641989567337,
+      "grad_norm": 0.05354965471224598,
+      "learning_rate": 0.0009999642256599034,
+      "loss": 0.7085,
+      "step": 4075
+    },
+    {
+      "epoch": 0.20713025599370485,
+      "grad_norm": 0.0482492491840279,
+      "learning_rate": 0.0009999615266702302,
+      "loss": 0.7315,
+      "step": 4080
+    },
+    {
+      "epoch": 0.20738409209173636,
+      "grad_norm": 0.05181020090220497,
+      "learning_rate": 0.0009999587295406026,
+      "loss": 0.7227,
+      "step": 4085
+    },
+    {
+      "epoch": 0.20763792818976787,
+      "grad_norm": 0.040021587065134735,
+      "learning_rate": 0.00099995583427157,
+      "loss": 0.7407,
+      "step": 4090
+    },
+    {
+      "epoch": 0.20789176428779937,
+      "grad_norm": 0.04000165166533322,
+      "learning_rate": 0.0009999528408637,
+      "loss": 0.7042,
+      "step": 4095
+    },
+    {
+      "epoch": 0.20814560038583088,
+      "grad_norm": 0.03943135892336847,
+      "learning_rate": 0.0009999497493175808,
+      "loss": 0.7267,
+      "step": 4100
+    },
+    {
+      "epoch": 0.20839943648386236,
+      "grad_norm": 0.0634943990185323,
+      "learning_rate": 0.0009999465596338191,
+      "loss": 0.6844,
+      "step": 4105
+    },
+    {
+      "epoch": 0.20865327258189387,
+      "grad_norm": 0.053343219962136215,
+      "learning_rate": 0.000999943271813041,
+      "loss": 0.6986,
+      "step": 4110
+    },
+    {
+      "epoch": 0.20890710867992537,
+      "grad_norm": 0.051181086485765775,
+      "learning_rate": 0.0009999398858558917,
+      "loss": 0.7183,
+      "step": 4115
+    },
+    {
+      "epoch": 0.20916094477795688,
+      "grad_norm": 0.057765287845386676,
+      "learning_rate": 0.0009999364017630361,
+      "loss": 0.7451,
+      "step": 4120
+    },
+    {
+      "epoch": 0.2094147808759884,
+      "grad_norm": 0.564887138188438,
+      "learning_rate": 0.0009999328195351579,
+      "loss": 0.7099,
+      "step": 4125
+    },
+    {
+      "epoch": 0.20966861697401987,
+      "grad_norm": 0.04854697031445982,
+      "learning_rate": 0.0009999291391729606,
+      "loss": 0.7045,
+      "step": 4130
+    },
+    {
+      "epoch": 0.20992245307205137,
+      "grad_norm": 0.06786359137742107,
+      "learning_rate": 0.0009999253606771661,
+      "loss": 0.7074,
+      "step": 4135
+    },
+    {
+      "epoch": 0.21017628917008288,
+      "grad_norm": 0.07218662791060168,
+      "learning_rate": 0.0009999214840485167,
+      "loss": 0.6902,
+      "step": 4140
+    },
+    {
+      "epoch": 0.2104301252681144,
+      "grad_norm": 0.1208003703070213,
+      "learning_rate": 0.000999917509287773,
+      "loss": 0.7087,
+      "step": 4145
+    },
+    {
+      "epoch": 0.21068396136614587,
+      "grad_norm": 0.05878145753992489,
+      "learning_rate": 0.0009999134363957152,
+      "loss": 0.674,
+      "step": 4150
+    },
+    {
+      "epoch": 0.21093779746417737,
+      "grad_norm": 0.046846061065139784,
+      "learning_rate": 0.0009999092653731432,
+      "loss": 0.7174,
+      "step": 4155
+    },
+    {
+      "epoch": 0.21119163356220888,
+      "grad_norm": 0.06346911438232657,
+      "learning_rate": 0.0009999049962208751,
+      "loss": 0.708,
+      "step": 4160
+    },
+    {
+      "epoch": 0.2114454696602404,
+      "grad_norm": 0.05215116911139721,
+      "learning_rate": 0.0009999006289397494,
+      "loss": 0.7164,
+      "step": 4165
+    },
+    {
+      "epoch": 0.2116993057582719,
+      "grad_norm": 0.04940387903306439,
+      "learning_rate": 0.0009998961635306234,
+      "loss": 0.7235,
+      "step": 4170
+    },
+    {
+      "epoch": 0.21195314185630337,
+      "grad_norm": 0.05729960864593158,
+      "learning_rate": 0.0009998915999943733,
+      "loss": 0.655,
+      "step": 4175
+    },
+    {
+      "epoch": 0.21220697795433488,
+      "grad_norm": 0.03918310878918639,
+      "learning_rate": 0.0009998869383318952,
+      "loss": 0.6842,
+      "step": 4180
+    },
+    {
+      "epoch": 0.2124608140523664,
+      "grad_norm": 0.06071981733933351,
+      "learning_rate": 0.0009998821785441039,
+      "loss": 0.7281,
+      "step": 4185
+    },
+    {
+      "epoch": 0.2127146501503979,
+      "grad_norm": 0.04728543724507953,
+      "learning_rate": 0.000999877320631934,
+      "loss": 0.684,
+      "step": 4190
+    },
+    {
+      "epoch": 0.2129684862484294,
+      "grad_norm": 0.0404988884730302,
+      "learning_rate": 0.0009998723645963388,
+      "loss": 0.7254,
+      "step": 4195
+    },
+    {
+      "epoch": 0.21322232234646088,
+      "grad_norm": 0.047811829648568324,
+      "learning_rate": 0.0009998673104382912,
+      "loss": 0.6826,
+      "step": 4200
+    },
+    {
+      "epoch": 0.2134761584444924,
+      "grad_norm": 0.049104308173568804,
+      "learning_rate": 0.0009998621581587836,
+      "loss": 0.7314,
+      "step": 4205
+    },
+    {
+      "epoch": 0.2137299945425239,
+      "grad_norm": 0.03823024902121797,
+      "learning_rate": 0.000999856907758827,
+      "loss": 0.72,
+      "step": 4210
+    },
+    {
+      "epoch": 0.2139838306405554,
+      "grad_norm": 0.057874634046649116,
+      "learning_rate": 0.0009998515592394524,
+      "loss": 0.713,
+      "step": 4215
+    },
+    {
+      "epoch": 0.2142376667385869,
+      "grad_norm": 0.07434713454091414,
+      "learning_rate": 0.0009998461126017094,
+      "loss": 0.7125,
+      "step": 4220
+    },
+    {
+      "epoch": 0.2144915028366184,
+      "grad_norm": 0.06272529032155742,
+      "learning_rate": 0.0009998405678466671,
+      "loss": 0.6889,
+      "step": 4225
+    },
+    {
+      "epoch": 0.2147453389346499,
+      "grad_norm": 0.033595008050920305,
+      "learning_rate": 0.0009998349249754142,
+      "loss": 0.6796,
+      "step": 4230
+    },
+    {
+      "epoch": 0.2149991750326814,
+      "grad_norm": 0.04029569391079619,
+      "learning_rate": 0.0009998291839890582,
+      "loss": 0.7028,
+      "step": 4235
+    },
+    {
+      "epoch": 0.2152530111307129,
+      "grad_norm": 0.056686671703328986,
+      "learning_rate": 0.000999823344888726,
+      "loss": 0.7324,
+      "step": 4240
+    },
+    {
+      "epoch": 0.21550684722874439,
+      "grad_norm": 0.06609484871358738,
+      "learning_rate": 0.0009998174076755637,
+      "loss": 0.6927,
+      "step": 4245
+    },
+    {
+      "epoch": 0.2157606833267759,
+      "grad_norm": 0.03952252087967003,
+      "learning_rate": 0.000999811372350737,
+      "loss": 0.6872,
+      "step": 4250
+    },
+    {
+      "epoch": 0.2160145194248074,
+      "grad_norm": 0.0421816140235618,
+      "learning_rate": 0.0009998052389154303,
+      "loss": 0.6629,
+      "step": 4255
+    },
+    {
+      "epoch": 0.2162683555228389,
+      "grad_norm": 0.0392009074437296,
+      "learning_rate": 0.0009997990073708479,
+      "loss": 0.6661,
+      "step": 4260
+    },
+    {
+      "epoch": 0.2165221916208704,
+      "grad_norm": 0.04466501671098514,
+      "learning_rate": 0.0009997926777182127,
+      "loss": 0.6814,
+      "step": 4265
+    },
+    {
+      "epoch": 0.2167760277189019,
+      "grad_norm": 0.04350807791667501,
+      "learning_rate": 0.0009997862499587673,
+      "loss": 0.7324,
+      "step": 4270
+    },
+    {
+      "epoch": 0.2170298638169334,
+      "grad_norm": 0.04149859498803727,
+      "learning_rate": 0.0009997797240937736,
+      "loss": 0.6829,
+      "step": 4275
+    },
+    {
+      "epoch": 0.2172836999149649,
+      "grad_norm": 0.04436311666473786,
+      "learning_rate": 0.0009997731001245124,
+      "loss": 0.6919,
+      "step": 4280
+    },
+    {
+      "epoch": 0.2175375360129964,
+      "grad_norm": 0.08118673796209477,
+      "learning_rate": 0.0009997663780522842,
+      "loss": 0.6884,
+      "step": 4285
+    },
+    {
+      "epoch": 0.21779137211102792,
+      "grad_norm": 1.4346902161122619,
+      "learning_rate": 0.000999759557878408,
+      "loss": 0.9,
+      "step": 4290
+    },
+    {
+      "epoch": 0.2180452082090594,
+      "grad_norm": 0.2203188373312744,
+      "learning_rate": 0.0009997526396042231,
+      "loss": 0.7955,
+      "step": 4295
+    },
+    {
+      "epoch": 0.2182990443070909,
+      "grad_norm": 0.10935101729406492,
+      "learning_rate": 0.000999745623231087,
+      "loss": 0.799,
+      "step": 4300
+    },
+    {
+      "epoch": 0.2185528804051224,
+      "grad_norm": 0.08157477584763652,
+      "learning_rate": 0.0009997385087603776,
+      "loss": 0.7428,
+      "step": 4305
+    },
+    {
+      "epoch": 0.21880671650315392,
+      "grad_norm": 0.33104745046572154,
+      "learning_rate": 0.0009997312961934912,
+      "loss": 0.7342,
+      "step": 4310
+    },
+    {
+      "epoch": 0.21906055260118543,
+      "grad_norm": 0.11235803791274827,
+      "learning_rate": 0.000999723985531843,
+      "loss": 0.7372,
+      "step": 4315
+    },
+    {
+      "epoch": 0.2193143886992169,
+      "grad_norm": 0.5110586261632338,
+      "learning_rate": 0.0009997165767768692,
+      "loss": 0.7387,
+      "step": 4320
+    },
+    {
+      "epoch": 0.2195682247972484,
+      "grad_norm": 0.12206962291351009,
+      "learning_rate": 0.000999709069930023,
+      "loss": 0.7563,
+      "step": 4325
+    },
+    {
+      "epoch": 0.21982206089527992,
+      "grad_norm": 0.08087926540403238,
+      "learning_rate": 0.0009997014649927786,
+      "loss": 0.729,
+      "step": 4330
+    },
+    {
+      "epoch": 0.22007589699331143,
+      "grad_norm": 0.14194129116961565,
+      "learning_rate": 0.0009996937619666287,
+      "loss": 0.763,
+      "step": 4335
+    },
+    {
+      "epoch": 0.22032973309134293,
+      "grad_norm": 0.05593580754563041,
+      "learning_rate": 0.0009996859608530852,
+      "loss": 0.738,
+      "step": 4340
+    },
+    {
+      "epoch": 0.2205835691893744,
+      "grad_norm": 0.05744534028157504,
+      "learning_rate": 0.0009996780616536795,
+      "loss": 0.7737,
+      "step": 4345
+    },
+    {
+      "epoch": 0.22083740528740592,
+      "grad_norm": 0.1748737919710415,
+      "learning_rate": 0.0009996700643699623,
+      "loss": 0.8663,
+      "step": 4350
+    },
+    {
+      "epoch": 0.22109124138543743,
+      "grad_norm": 0.21064202815185434,
+      "learning_rate": 0.0009996619690035033,
+      "loss": 0.7204,
+      "step": 4355
+    },
+    {
+      "epoch": 0.22134507748346893,
+      "grad_norm": 0.10370603131130617,
+      "learning_rate": 0.0009996537755558915,
+      "loss": 0.7197,
+      "step": 4360
+    },
+    {
+      "epoch": 0.2215989135815004,
+      "grad_norm": 0.06885717236979047,
+      "learning_rate": 0.0009996454840287355,
+      "loss": 0.744,
+      "step": 4365
+    },
+    {
+      "epoch": 0.22185274967953192,
+      "grad_norm": 0.07087821700739863,
+      "learning_rate": 0.0009996370944236625,
+      "loss": 0.7341,
+      "step": 4370
+    },
+    {
+      "epoch": 0.22210658577756343,
+      "grad_norm": 0.08283414484063312,
+      "learning_rate": 0.0009996286067423196,
+      "loss": 0.7271,
+      "step": 4375
+    },
+    {
+      "epoch": 0.22236042187559493,
+      "grad_norm": 0.03861660942385861,
+      "learning_rate": 0.000999620020986373,
+      "loss": 0.6911,
+      "step": 4380
+    },
+    {
+      "epoch": 0.22261425797362644,
+      "grad_norm": 0.06635541968079052,
+      "learning_rate": 0.0009996113371575075,
+      "loss": 0.7413,
+      "step": 4385
+    },
+    {
+      "epoch": 0.22286809407165792,
+      "grad_norm": 0.064436083193186,
+      "learning_rate": 0.0009996025552574284,
+      "loss": 0.687,
+      "step": 4390
+    },
+    {
+      "epoch": 0.22312193016968943,
+      "grad_norm": 0.07447130960554259,
+      "learning_rate": 0.000999593675287859,
+      "loss": 0.7174,
+      "step": 4395
+    },
+    {
+      "epoch": 0.22337576626772093,
+      "grad_norm": 0.04227245688144544,
+      "learning_rate": 0.0009995846972505429,
+      "loss": 0.7318,
+      "step": 4400
+    },
+    {
+      "epoch": 0.22362960236575244,
+      "grad_norm": 0.04336627684359626,
+      "learning_rate": 0.000999575621147242,
+      "loss": 0.758,
+      "step": 4405
+    },
+    {
+      "epoch": 0.22388343846378395,
+      "grad_norm": 0.052749435841111,
+      "learning_rate": 0.000999566446979738,
+      "loss": 0.7302,
+      "step": 4410
+    },
+    {
+      "epoch": 0.22413727456181542,
+      "grad_norm": 0.046319696344097425,
+      "learning_rate": 0.0009995571747498319,
+      "loss": 0.6572,
+      "step": 4415
+    },
+    {
+      "epoch": 0.22439111065984693,
+      "grad_norm": 0.036417375306608095,
+      "learning_rate": 0.0009995478044593435,
+      "loss": 0.7134,
+      "step": 4420
+    },
+    {
+      "epoch": 0.22464494675787844,
+      "grad_norm": 0.03453022900900243,
+      "learning_rate": 0.0009995383361101125,
+      "loss": 0.7012,
+      "step": 4425
+    },
+    {
+      "epoch": 0.22489878285590995,
+      "grad_norm": 0.07427033738679807,
+      "learning_rate": 0.0009995287697039973,
+      "loss": 0.7013,
+      "step": 4430
+    },
+    {
+      "epoch": 0.22515261895394145,
+      "grad_norm": 0.054569394891019696,
+      "learning_rate": 0.0009995191052428758,
+      "loss": 0.7198,
+      "step": 4435
+    },
+    {
+      "epoch": 0.22540645505197293,
+      "grad_norm": 0.0393817069157149,
+      "learning_rate": 0.0009995093427286447,
+      "loss": 0.6906,
+      "step": 4440
+    },
+    {
+      "epoch": 0.22566029115000444,
+      "grad_norm": 0.061316789682121224,
+      "learning_rate": 0.000999499482163221,
+      "loss": 0.7074,
+      "step": 4445
+    },
+    {
+      "epoch": 0.22591412724803595,
+      "grad_norm": 0.05219263283094663,
+      "learning_rate": 0.00099948952354854,
+      "loss": 0.7237,
+      "step": 4450
+    },
+    {
+      "epoch": 0.22616796334606745,
+      "grad_norm": 0.041749572353213034,
+      "learning_rate": 0.0009994794668865563,
+      "loss": 0.7215,
+      "step": 4455
+    },
+    {
+      "epoch": 0.22642179944409896,
+      "grad_norm": 0.06621048957460272,
+      "learning_rate": 0.0009994693121792443,
+      "loss": 0.7196,
+      "step": 4460
+    },
+    {
+      "epoch": 0.22667563554213044,
+      "grad_norm": 0.04745304839755999,
+      "learning_rate": 0.000999459059428597,
+      "loss": 0.7145,
+      "step": 4465
+    },
+    {
+      "epoch": 0.22692947164016194,
+      "grad_norm": 0.03774244254610854,
+      "learning_rate": 0.0009994487086366272,
+      "loss": 0.684,
+      "step": 4470
+    },
+    {
+      "epoch": 0.22718330773819345,
+      "grad_norm": 0.05065779440911714,
+      "learning_rate": 0.0009994382598053665,
+      "loss": 0.676,
+      "step": 4475
+    },
+    {
+      "epoch": 0.22743714383622496,
+      "grad_norm": 0.04660423097011398,
+      "learning_rate": 0.0009994277129368664,
+      "loss": 0.6876,
+      "step": 4480
+    },
+    {
+      "epoch": 0.22769097993425644,
+      "grad_norm": 0.039430766510009256,
+      "learning_rate": 0.0009994170680331968,
+      "loss": 0.6723,
+      "step": 4485
+    },
+    {
+      "epoch": 0.22794481603228794,
+      "grad_norm": 0.043461044788944345,
+      "learning_rate": 0.0009994063250964472,
+      "loss": 0.6599,
+      "step": 4490
+    },
+    {
+      "epoch": 0.22819865213031945,
+      "grad_norm": 0.05440998730009002,
+      "learning_rate": 0.0009993954841287266,
+      "loss": 0.6705,
+      "step": 4495
+    },
+    {
+      "epoch": 0.22845248822835096,
+      "grad_norm": 0.058972665330176754,
+      "learning_rate": 0.000999384545132163,
+      "loss": 0.6872,
+      "step": 4500
+    },
+    {
+      "epoch": 0.22870632432638247,
+      "grad_norm": 0.07456158646652374,
+      "learning_rate": 0.0009993735081089035,
+      "loss": 0.6873,
+      "step": 4505
+    },
+    {
+      "epoch": 0.22896016042441394,
+      "grad_norm": 0.05068153364562075,
+      "learning_rate": 0.0009993623730611147,
+      "loss": 0.6833,
+      "step": 4510
+    },
+    {
+      "epoch": 0.22921399652244545,
+      "grad_norm": 0.06571910537543511,
+      "learning_rate": 0.0009993511399909825,
+      "loss": 0.6663,
+      "step": 4515
+    },
+    {
+      "epoch": 0.22946783262047696,
+      "grad_norm": 0.1595236692921901,
+      "learning_rate": 0.0009993398089007117,
+      "loss": 0.6632,
+      "step": 4520
+    },
+    {
+      "epoch": 0.22972166871850846,
+      "grad_norm": 0.057680648886969166,
+      "learning_rate": 0.0009993283797925267,
+      "loss": 0.6815,
+      "step": 4525
+    },
+    {
+      "epoch": 0.22997550481653997,
+      "grad_norm": 0.05224803456061352,
+      "learning_rate": 0.0009993168526686708,
+      "loss": 0.7055,
+      "step": 4530
+    },
+    {
+      "epoch": 0.23022934091457145,
+      "grad_norm": 0.03339058561257075,
+      "learning_rate": 0.000999305227531407,
+      "loss": 0.6897,
+      "step": 4535
+    },
+    {
+      "epoch": 0.23048317701260296,
+      "grad_norm": 0.03485057296061167,
+      "learning_rate": 0.000999293504383017,
+      "loss": 0.6382,
+      "step": 4540
+    },
+    {
+      "epoch": 0.23073701311063446,
+      "grad_norm": 0.06126014494022501,
+      "learning_rate": 0.000999281683225802,
+      "loss": 0.7143,
+      "step": 4545
+    },
+    {
+      "epoch": 0.23099084920866597,
+      "grad_norm": 0.045158627169637415,
+      "learning_rate": 0.0009992697640620824,
+      "loss": 0.7128,
+      "step": 4550
+    },
+    {
+      "epoch": 0.23124468530669748,
+      "grad_norm": 0.0453807537141564,
+      "learning_rate": 0.000999257746894198,
+      "loss": 0.7073,
+      "step": 4555
+    },
+    {
+      "epoch": 0.23149852140472896,
+      "grad_norm": 0.038686364819667377,
+      "learning_rate": 0.0009992456317245077,
+      "loss": 0.7112,
+      "step": 4560
+    },
+    {
+      "epoch": 0.23175235750276046,
+      "grad_norm": 0.04309918961411153,
+      "learning_rate": 0.0009992334185553898,
+      "loss": 0.7091,
+      "step": 4565
+    },
+    {
+      "epoch": 0.23200619360079197,
+      "grad_norm": 0.03768367614783312,
+      "learning_rate": 0.0009992211073892414,
+      "loss": 0.6774,
+      "step": 4570
+    },
+    {
+      "epoch": 0.23226002969882348,
+      "grad_norm": 0.05953571982775251,
+      "learning_rate": 0.000999208698228479,
+      "loss": 0.6822,
+      "step": 4575
+    },
+    {
+      "epoch": 0.23251386579685496,
+      "grad_norm": 0.033857197586949536,
+      "learning_rate": 0.0009991961910755392,
+      "loss": 0.663,
+      "step": 4580
+    },
+    {
+      "epoch": 0.23276770189488646,
+      "grad_norm": 0.039971596166750424,
+      "learning_rate": 0.0009991835859328763,
+      "loss": 0.6908,
+      "step": 4585
+    },
+    {
+      "epoch": 0.23302153799291797,
+      "grad_norm": 0.050452966613271054,
+      "learning_rate": 0.0009991708828029648,
+      "loss": 0.642,
+      "step": 4590
+    },
+    {
+      "epoch": 0.23327537409094948,
+      "grad_norm": 0.0579358422967732,
+      "learning_rate": 0.0009991580816882983,
+      "loss": 0.6956,
+      "step": 4595
+    },
+    {
+      "epoch": 0.23352921018898098,
+      "grad_norm": 0.034608348704914735,
+      "learning_rate": 0.00099914518259139,
+      "loss": 0.7036,
+      "step": 4600
+    },
+    {
+      "epoch": 0.23378304628701246,
+      "grad_norm": 0.04500468096828464,
+      "learning_rate": 0.0009991321855147713,
+      "loss": 0.7096,
+      "step": 4605
+    },
+    {
+      "epoch": 0.23403688238504397,
+      "grad_norm": 0.06192464258502063,
+      "learning_rate": 0.0009991190904609939,
+      "loss": 0.6963,
+      "step": 4610
+    },
+    {
+      "epoch": 0.23429071848307548,
+      "grad_norm": 0.036824417923052896,
+      "learning_rate": 0.0009991058974326281,
+      "loss": 0.6719,
+      "step": 4615
+    },
+    {
+      "epoch": 0.23454455458110698,
+      "grad_norm": 0.034966580229344214,
+      "learning_rate": 0.0009990926064322636,
+      "loss": 0.6867,
+      "step": 4620
+    },
+    {
+      "epoch": 0.2347983906791385,
+      "grad_norm": 0.03553588744447904,
+      "learning_rate": 0.0009990792174625095,
+      "loss": 0.6936,
+      "step": 4625
+    },
+    {
+      "epoch": 0.23505222677716997,
+      "grad_norm": 0.03744697297486917,
+      "learning_rate": 0.000999065730525994,
+      "loss": 0.725,
+      "step": 4630
+    },
+    {
+      "epoch": 0.23530606287520148,
+      "grad_norm": 0.03140251768780669,
+      "learning_rate": 0.0009990521456253643,
+      "loss": 0.6593,
+      "step": 4635
+    },
+    {
+      "epoch": 0.23555989897323298,
+      "grad_norm": 0.039136334308997274,
+      "learning_rate": 0.0009990384627632872,
+      "loss": 0.6846,
+      "step": 4640
+    },
+    {
+      "epoch": 0.2358137350712645,
+      "grad_norm": 0.03616571384042556,
+      "learning_rate": 0.0009990246819424487,
+      "loss": 0.6877,
+      "step": 4645
+    },
+    {
+      "epoch": 0.236067571169296,
+      "grad_norm": 0.038208828921227095,
+      "learning_rate": 0.0009990108031655536,
+      "loss": 0.6841,
+      "step": 4650
+    },
+    {
+      "epoch": 0.23632140726732748,
+      "grad_norm": 0.03751589202735807,
+      "learning_rate": 0.0009989968264353265,
+      "loss": 0.6737,
+      "step": 4655
+    },
+    {
+      "epoch": 0.23657524336535898,
+      "grad_norm": 0.06015453571891426,
+      "learning_rate": 0.0009989827517545107,
+      "loss": 0.6753,
+      "step": 4660
+    },
+    {
+      "epoch": 0.2368290794633905,
+      "grad_norm": 0.03457380624142321,
+      "learning_rate": 0.0009989685791258693,
+      "loss": 0.6835,
+      "step": 4665
+    },
+    {
+      "epoch": 0.237082915561422,
+      "grad_norm": 0.033252457228367796,
+      "learning_rate": 0.0009989543085521843,
+      "loss": 0.676,
+      "step": 4670
+    },
+    {
+      "epoch": 0.2373367516594535,
+      "grad_norm": 0.05093290561964852,
+      "learning_rate": 0.0009989399400362566,
+      "loss": 0.7078,
+      "step": 4675
+    },
+    {
+      "epoch": 0.23759058775748498,
+      "grad_norm": 0.030107770564768616,
+      "learning_rate": 0.0009989254735809068,
+      "loss": 0.6547,
+      "step": 4680
+    },
+    {
+      "epoch": 0.2378444238555165,
+      "grad_norm": 0.03606716641926515,
+      "learning_rate": 0.000998910909188975,
+      "loss": 0.7156,
+      "step": 4685
+    },
+    {
+      "epoch": 0.238098259953548,
+      "grad_norm": 0.030847705095771165,
+      "learning_rate": 0.0009988962468633195,
+      "loss": 0.6556,
+      "step": 4690
+    },
+    {
+      "epoch": 0.2383520960515795,
+      "grad_norm": 0.03107834681283107,
+      "learning_rate": 0.000998881486606819,
+      "loss": 0.6654,
+      "step": 4695
+    },
+    {
+      "epoch": 0.23860593214961098,
+      "grad_norm": 0.03269129778574898,
+      "learning_rate": 0.0009988666284223703,
+      "loss": 0.646,
+      "step": 4700
+    },
+    {
+      "epoch": 0.2388597682476425,
+      "grad_norm": 0.03326143191194153,
+      "learning_rate": 0.0009988516723128905,
+      "loss": 0.6804,
+      "step": 4705
+    },
+    {
+      "epoch": 0.239113604345674,
+      "grad_norm": 0.032004462939776754,
+      "learning_rate": 0.0009988366182813152,
+      "loss": 0.6756,
+      "step": 4710
+    },
+    {
+      "epoch": 0.2393674404437055,
+      "grad_norm": 0.03226471228597121,
+      "learning_rate": 0.0009988214663305991,
+      "loss": 0.6736,
+      "step": 4715
+    },
+    {
+      "epoch": 0.239621276541737,
+      "grad_norm": 0.03050888275116761,
+      "learning_rate": 0.000998806216463717,
+      "loss": 0.6593,
+      "step": 4720
+    },
+    {
+      "epoch": 0.2398751126397685,
+      "grad_norm": 0.06624380514348276,
+      "learning_rate": 0.0009987908686836622,
+      "loss": 0.666,
+      "step": 4725
+    },
+    {
+      "epoch": 0.2401289487378,
+      "grad_norm": 0.04639500533331041,
+      "learning_rate": 0.0009987754229934473,
+      "loss": 0.6432,
+      "step": 4730
+    },
+    {
+      "epoch": 0.2403827848358315,
+      "grad_norm": 0.03651599837288676,
+      "learning_rate": 0.0009987598793961044,
+      "loss": 0.6984,
+      "step": 4735
+    },
+    {
+      "epoch": 0.240636620933863,
+      "grad_norm": 0.05906840836987052,
+      "learning_rate": 0.0009987442378946842,
+      "loss": 0.6734,
+      "step": 4740
+    },
+    {
+      "epoch": 0.24089045703189452,
+      "grad_norm": 0.0448969230153068,
+      "learning_rate": 0.0009987284984922576,
+      "loss": 0.6632,
+      "step": 4745
+    },
+    {
+      "epoch": 0.241144293129926,
+      "grad_norm": 0.0369703497767851,
+      "learning_rate": 0.0009987126611919136,
+      "loss": 0.6797,
+      "step": 4750
+    },
+    {
+      "epoch": 0.2413981292279575,
+      "grad_norm": 0.03187137783090064,
+      "learning_rate": 0.0009986967259967617,
+      "loss": 0.6988,
+      "step": 4755
+    },
+    {
+      "epoch": 0.241651965325989,
+      "grad_norm": 0.046567965068659574,
+      "learning_rate": 0.0009986806929099291,
+      "loss": 0.6878,
+      "step": 4760
+    },
+    {
+      "epoch": 0.24190580142402052,
+      "grad_norm": 0.03459321103042977,
+      "learning_rate": 0.0009986645619345636,
+      "loss": 0.678,
+      "step": 4765
+    },
+    {
+      "epoch": 0.24215963752205202,
+      "grad_norm": 0.03218614265241276,
+      "learning_rate": 0.0009986483330738313,
+      "loss": 0.6708,
+      "step": 4770
+    },
+    {
+      "epoch": 0.2424134736200835,
+      "grad_norm": 0.032098655982760134,
+      "learning_rate": 0.0009986320063309182,
+      "loss": 0.6975,
+      "step": 4775
+    },
+    {
+      "epoch": 0.242667309718115,
+      "grad_norm": 0.03147966656169076,
+      "learning_rate": 0.0009986155817090288,
+      "loss": 0.6792,
+      "step": 4780
+    },
+    {
+      "epoch": 0.24292114581614652,
+      "grad_norm": 0.036497678142445616,
+      "learning_rate": 0.0009985990592113873,
+      "loss": 0.6736,
+      "step": 4785
+    },
+    {
+      "epoch": 0.24317498191417802,
+      "grad_norm": 0.037003131869676374,
+      "learning_rate": 0.000998582438841237,
+      "loss": 0.7012,
+      "step": 4790
+    },
+    {
+      "epoch": 0.2434288180122095,
+      "grad_norm": 0.02977697312157573,
+      "learning_rate": 0.0009985657206018404,
+      "loss": 0.6579,
+      "step": 4795
+    },
+    {
+      "epoch": 0.243682654110241,
+      "grad_norm": 0.033364670469226705,
+      "learning_rate": 0.0009985489044964792,
+      "loss": 0.6868,
+      "step": 4800
+    },
+    {
+      "epoch": 0.24393649020827252,
+      "grad_norm": 0.03611311028340616,
+      "learning_rate": 0.0009985319905284542,
+      "loss": 0.7035,
+      "step": 4805
+    },
+    {
+      "epoch": 0.24419032630630402,
+      "grad_norm": 0.03271327971441799,
+      "learning_rate": 0.0009985149787010857,
+      "loss": 0.682,
+      "step": 4810
+    },
+    {
+      "epoch": 0.24444416240433553,
+      "grad_norm": 0.045957037298533635,
+      "learning_rate": 0.000998497869017713,
+      "loss": 0.6568,
+      "step": 4815
+    },
+    {
+      "epoch": 0.244697998502367,
+      "grad_norm": 0.058416697755835315,
+      "learning_rate": 0.0009984806614816944,
+      "loss": 0.6353,
+      "step": 4820
+    },
+    {
+      "epoch": 0.24495183460039852,
+      "grad_norm": 0.04141568212799508,
+      "learning_rate": 0.000998463356096408,
+      "loss": 0.6997,
+      "step": 4825
+    },
+    {
+      "epoch": 0.24520567069843002,
+      "grad_norm": 0.03429415259338343,
+      "learning_rate": 0.0009984459528652508,
+      "loss": 0.6693,
+      "step": 4830
+    },
+    {
+      "epoch": 0.24545950679646153,
+      "grad_norm": 0.04246659526708136,
+      "learning_rate": 0.0009984284517916386,
+      "loss": 0.6749,
+      "step": 4835
+    },
+    {
+      "epoch": 0.24571334289449304,
+      "grad_norm": 0.044954596862590616,
+      "learning_rate": 0.000998410852879007,
+      "loss": 0.6954,
+      "step": 4840
+    },
+    {
+      "epoch": 0.24596717899252452,
+      "grad_norm": 0.04365653720088704,
+      "learning_rate": 0.0009983931561308105,
+      "loss": 0.6583,
+      "step": 4845
+    },
+    {
+      "epoch": 0.24622101509055602,
+      "grad_norm": 0.04095497127619676,
+      "learning_rate": 0.0009983753615505232,
+      "loss": 0.6648,
+      "step": 4850
+    },
+    {
+      "epoch": 0.24647485118858753,
+      "grad_norm": 0.037206051604634686,
+      "learning_rate": 0.0009983574691416377,
+      "loss": 0.7103,
+      "step": 4855
+    },
+    {
+      "epoch": 0.24672868728661904,
+      "grad_norm": 0.033774627357303716,
+      "learning_rate": 0.0009983394789076663,
+      "loss": 0.6761,
+      "step": 4860
+    },
+    {
+      "epoch": 0.24698252338465054,
+      "grad_norm": 0.04836628251413319,
+      "learning_rate": 0.0009983213908521403,
+      "loss": 0.6753,
+      "step": 4865
+    },
+    {
+      "epoch": 0.24723635948268202,
+      "grad_norm": 0.05466352076606845,
+      "learning_rate": 0.0009983032049786106,
+      "loss": 0.6939,
+      "step": 4870
+    },
+    {
+      "epoch": 0.24749019558071353,
+      "grad_norm": 0.03600417172852513,
+      "learning_rate": 0.0009982849212906465,
+      "loss": 0.6242,
+      "step": 4875
+    },
+    {
+      "epoch": 0.24774403167874504,
+      "grad_norm": 0.03575528198191203,
+      "learning_rate": 0.0009982665397918376,
+      "loss": 0.6428,
+      "step": 4880
+    },
+    {
+      "epoch": 0.24799786777677654,
+      "grad_norm": 0.0444096053237933,
+      "learning_rate": 0.0009982480604857915,
+      "loss": 0.6974,
+      "step": 4885
+    },
+    {
+      "epoch": 0.24825170387480805,
+      "grad_norm": 0.0467175869465778,
+      "learning_rate": 0.000998229483376136,
+      "loss": 0.6542,
+      "step": 4890
+    },
+    {
+      "epoch": 0.24850553997283953,
+      "grad_norm": 0.03386552164563125,
+      "learning_rate": 0.0009982108084665177,
+      "loss": 0.6855,
+      "step": 4895
+    },
+    {
+      "epoch": 0.24875937607087104,
+      "grad_norm": 0.04919740684938901,
+      "learning_rate": 0.0009981920357606023,
+      "loss": 0.6631,
+      "step": 4900
+    },
+    {
+      "epoch": 0.24901321216890254,
+      "grad_norm": 0.057577585211750874,
+      "learning_rate": 0.0009981731652620746,
+      "loss": 0.6562,
+      "step": 4905
+    },
+    {
+      "epoch": 0.24926704826693405,
+      "grad_norm": 0.036406073723948565,
+      "learning_rate": 0.0009981541969746389,
+      "loss": 0.647,
+      "step": 4910
+    },
+    {
+      "epoch": 0.24952088436496553,
+      "grad_norm": 0.05465235293761601,
+      "learning_rate": 0.0009981351309020189,
+      "loss": 0.6631,
+      "step": 4915
+    },
+    {
+      "epoch": 0.24977472046299704,
+      "grad_norm": 0.048518701182604415,
+      "learning_rate": 0.0009981159670479566,
+      "loss": 0.6637,
+      "step": 4920
+    },
+    {
+      "epoch": 0.25002855656102857,
+      "grad_norm": 0.0440335328009626,
+      "learning_rate": 0.0009980967054162141,
+      "loss": 0.6347,
+      "step": 4925
+    },
+    {
+      "epoch": 0.25028239265906005,
+      "grad_norm": 0.035049011739473654,
+      "learning_rate": 0.0009980773460105726,
+      "loss": 0.6469,
+      "step": 4930
+    },
+    {
+      "epoch": 0.25053622875709153,
+      "grad_norm": 0.030844174743583502,
+      "learning_rate": 0.0009980578888348318,
+      "loss": 0.6618,
+      "step": 4935
+    },
+    {
+      "epoch": 0.25079006485512306,
+      "grad_norm": 0.04942299263880296,
+      "learning_rate": 0.000998038333892811,
+      "loss": 0.659,
+      "step": 4940
+    },
+    {
+      "epoch": 0.25104390095315454,
+      "grad_norm": 0.03920588146136878,
+      "learning_rate": 0.0009980186811883495,
+      "loss": 0.6439,
+      "step": 4945
+    },
+    {
+      "epoch": 0.2512977370511861,
+      "grad_norm": 0.037389944860020057,
+      "learning_rate": 0.000997998930725304,
+      "loss": 0.6883,
+      "step": 4950
+    },
+    {
+      "epoch": 0.25155157314921756,
+      "grad_norm": 0.035236188832557136,
+      "learning_rate": 0.0009979790825075522,
+      "loss": 0.6553,
+      "step": 4955
+    },
+    {
+      "epoch": 0.25180540924724903,
+      "grad_norm": 0.040826549993116246,
+      "learning_rate": 0.0009979591365389898,
+      "loss": 0.6896,
+      "step": 4960
+    },
+    {
+      "epoch": 0.25205924534528057,
+      "grad_norm": 0.08332873012050707,
+      "learning_rate": 0.0009979390928235323,
+      "loss": 0.6845,
+      "step": 4965
+    },
+    {
+      "epoch": 0.25231308144331205,
+      "grad_norm": 0.033440948748920125,
+      "learning_rate": 0.000997918951365114,
+      "loss": 0.639,
+      "step": 4970
+    },
+    {
+      "epoch": 0.2525669175413435,
+      "grad_norm": 0.03031028775821842,
+      "learning_rate": 0.0009978987121676889,
+      "loss": 0.6361,
+      "step": 4975
+    },
+    {
+      "epoch": 0.25282075363937506,
+      "grad_norm": 0.031208187381832196,
+      "learning_rate": 0.0009978783752352294,
+      "loss": 0.652,
+      "step": 4980
+    },
+    {
+      "epoch": 0.25307458973740654,
+      "grad_norm": 0.02872166512981619,
+      "learning_rate": 0.0009978579405717277,
+      "loss": 0.6724,
+      "step": 4985
+    },
+    {
+      "epoch": 0.2533284258354381,
+      "grad_norm": 0.03216058478323663,
+      "learning_rate": 0.0009978374081811951,
+      "loss": 0.6371,
+      "step": 4990
+    },
+    {
+      "epoch": 0.25358226193346955,
+      "grad_norm": 0.044909528815065505,
+      "learning_rate": 0.000997816778067662,
+      "loss": 0.6411,
+      "step": 4995
+    },
+    {
+      "epoch": 0.25383609803150103,
+      "grad_norm": 0.05288790528971391,
+      "learning_rate": 0.0009977960502351782,
+      "loss": 0.649,
+      "step": 5000
+    },
+    {
+      "epoch": 0.25408993412953257,
+      "grad_norm": 0.053954926716690006,
+      "learning_rate": 0.000997775224687812,
+      "loss": 0.6916,
+      "step": 5005
+    },
+    {
+      "epoch": 0.25434377022756405,
+      "grad_norm": 0.03358524756795457,
+      "learning_rate": 0.0009977543014296516,
+      "loss": 0.6414,
+      "step": 5010
+    },
+    {
+      "epoch": 0.2545976063255956,
+      "grad_norm": 0.04116556703883807,
+      "learning_rate": 0.0009977332804648044,
+      "loss": 0.629,
+      "step": 5015
+    },
+    {
+      "epoch": 0.25485144242362706,
+      "grad_norm": 0.0607964434321599,
+      "learning_rate": 0.000997712161797396,
+      "loss": 0.6169,
+      "step": 5020
+    },
+    {
+      "epoch": 0.25510527852165854,
+      "grad_norm": 0.03948191550117113,
+      "learning_rate": 0.0009976909454315727,
+      "loss": 0.6516,
+      "step": 5025
+    },
+    {
+      "epoch": 0.2553591146196901,
+      "grad_norm": 0.03816107077659965,
+      "learning_rate": 0.0009976696313714986,
+      "loss": 0.6208,
+      "step": 5030
+    },
+    {
+      "epoch": 0.25561295071772155,
+      "grad_norm": 0.03580194239027954,
+      "learning_rate": 0.0009976482196213578,
+      "loss": 0.6565,
+      "step": 5035
+    },
+    {
+      "epoch": 0.2558667868157531,
+      "grad_norm": 0.03359792481796965,
+      "learning_rate": 0.0009976267101853534,
+      "loss": 0.6612,
+      "step": 5040
+    },
+    {
+      "epoch": 0.25612062291378457,
+      "grad_norm": 0.07818021673605252,
+      "learning_rate": 0.000997605103067707,
+      "loss": 0.6803,
+      "step": 5045
+    },
+    {
+      "epoch": 0.25637445901181605,
+      "grad_norm": 0.05189617302318715,
+      "learning_rate": 0.000997583398272661,
+      "loss": 0.6446,
+      "step": 5050
+    },
+    {
+      "epoch": 0.2566282951098476,
+      "grad_norm": 0.040877169883040965,
+      "learning_rate": 0.000997561595804475,
+      "loss": 0.6738,
+      "step": 5055
+    },
+    {
+      "epoch": 0.25688213120787906,
+      "grad_norm": 0.03223264560242032,
+      "learning_rate": 0.0009975396956674292,
+      "loss": 0.6671,
+      "step": 5060
+    },
+    {
+      "epoch": 0.2571359673059106,
+      "grad_norm": 0.0293074841987577,
+      "learning_rate": 0.0009975176978658223,
+      "loss": 0.6393,
+      "step": 5065
+    },
+    {
+      "epoch": 0.2573898034039421,
+      "grad_norm": 0.029454580799978237,
+      "learning_rate": 0.0009974956024039723,
+      "loss": 0.668,
+      "step": 5070
+    },
+    {
+      "epoch": 0.25764363950197355,
+      "grad_norm": 0.036264462292175975,
+      "learning_rate": 0.0009974734092862167,
+      "loss": 0.6323,
+      "step": 5075
+    },
+    {
+      "epoch": 0.2578974756000051,
+      "grad_norm": 0.03401180337813072,
+      "learning_rate": 0.0009974511185169119,
+      "loss": 0.6179,
+      "step": 5080
+    },
+    {
+      "epoch": 0.25815131169803657,
+      "grad_norm": 0.035116697500188865,
+      "learning_rate": 0.0009974287301004333,
+      "loss": 0.6568,
+      "step": 5085
+    },
+    {
+      "epoch": 0.2584051477960681,
+      "grad_norm": 0.032907495214095854,
+      "learning_rate": 0.0009974062440411754,
+      "loss": 0.6608,
+      "step": 5090
+    },
+    {
+      "epoch": 0.2586589838940996,
+      "grad_norm": 0.03230228212232674,
+      "learning_rate": 0.0009973836603435525,
+      "loss": 0.6692,
+      "step": 5095
+    },
+    {
+      "epoch": 0.25891281999213106,
+      "grad_norm": 0.036175243285705705,
+      "learning_rate": 0.0009973609790119974,
+      "loss": 0.6115,
+      "step": 5100
+    },
+    {
+      "epoch": 0.2591666560901626,
+      "grad_norm": 0.04087747325856648,
+      "learning_rate": 0.0009973382000509627,
+      "loss": 0.6158,
+      "step": 5105
+    },
+    {
+      "epoch": 0.2594204921881941,
+      "grad_norm": 0.05131461463304514,
+      "learning_rate": 0.0009973153234649195,
+      "loss": 0.6733,
+      "step": 5110
+    },
+    {
+      "epoch": 0.2596743282862256,
+      "grad_norm": 0.042101949951047324,
+      "learning_rate": 0.0009972923492583582,
+      "loss": 0.6649,
+      "step": 5115
+    },
+    {
+      "epoch": 0.2599281643842571,
+      "grad_norm": 0.032414502454393444,
+      "learning_rate": 0.0009972692774357888,
+      "loss": 0.6502,
+      "step": 5120
+    },
+    {
+      "epoch": 0.26018200048228857,
+      "grad_norm": 0.031594756495697617,
+      "learning_rate": 0.0009972461080017404,
+      "loss": 0.6269,
+      "step": 5125
+    },
+    {
+      "epoch": 0.2604358365803201,
+      "grad_norm": 0.029578356553082198,
+      "learning_rate": 0.0009972228409607605,
+      "loss": 0.658,
+      "step": 5130
+    },
+    {
+      "epoch": 0.2606896726783516,
+      "grad_norm": 0.04180221452673936,
+      "learning_rate": 0.0009971994763174165,
+      "loss": 0.6404,
+      "step": 5135
+    },
+    {
+      "epoch": 0.2609435087763831,
+      "grad_norm": 0.034100422332999716,
+      "learning_rate": 0.0009971760140762948,
+      "loss": 0.6475,
+      "step": 5140
+    },
+    {
+      "epoch": 0.2611973448744146,
+      "grad_norm": 0.04081751959411303,
+      "learning_rate": 0.0009971524542420013,
+      "loss": 0.64,
+      "step": 5145
+    },
+    {
+      "epoch": 0.2614511809724461,
+      "grad_norm": 0.028872945512903075,
+      "learning_rate": 0.00099712879681916,
+      "loss": 0.6393,
+      "step": 5150
+    },
+    {
+      "epoch": 0.2617050170704776,
+      "grad_norm": 0.035240956930609156,
+      "learning_rate": 0.0009971050418124152,
+      "loss": 0.6333,
+      "step": 5155
+    },
+    {
+      "epoch": 0.2619588531685091,
+      "grad_norm": 0.044136885746740954,
+      "learning_rate": 0.0009970811892264298,
+      "loss": 0.635,
+      "step": 5160
+    },
+    {
+      "epoch": 0.2622126892665406,
+      "grad_norm": 0.0459739244555897,
+      "learning_rate": 0.0009970572390658858,
+      "loss": 0.6544,
+      "step": 5165
+    },
+    {
+      "epoch": 0.2624665253645721,
+      "grad_norm": 0.04065361653486883,
+      "learning_rate": 0.0009970331913354846,
+      "loss": 0.6466,
+      "step": 5170
+    },
+    {
+      "epoch": 0.2627203614626036,
+      "grad_norm": 0.06797018866692801,
+      "learning_rate": 0.0009970090460399467,
+      "loss": 0.635,
+      "step": 5175
+    },
+    {
+      "epoch": 0.2629741975606351,
+      "grad_norm": 0.037934420949089415,
+      "learning_rate": 0.0009969848031840117,
+      "loss": 0.6785,
+      "step": 5180
+    },
+    {
+      "epoch": 0.2632280336586666,
+      "grad_norm": 0.041773499604102336,
+      "learning_rate": 0.000996960462772438,
+      "loss": 0.6269,
+      "step": 5185
+    },
+    {
+      "epoch": 0.2634818697566981,
+      "grad_norm": 0.03264370334223641,
+      "learning_rate": 0.000996936024810004,
+      "loss": 0.6399,
+      "step": 5190
+    },
+    {
+      "epoch": 0.2637357058547296,
+      "grad_norm": 0.05100248961195995,
+      "learning_rate": 0.0009969114893015065,
+      "loss": 0.6499,
+      "step": 5195
+    },
+    {
+      "epoch": 0.2639895419527611,
+      "grad_norm": 0.052788735735977844,
+      "learning_rate": 0.000996886856251762,
+      "loss": 0.6809,
+      "step": 5200
+    },
+    {
+      "epoch": 0.2642433780507926,
+      "grad_norm": 0.04842406245587388,
+      "learning_rate": 0.0009968621256656051,
+      "loss": 0.6552,
+      "step": 5205
+    },
+    {
+      "epoch": 0.2644972141488241,
+      "grad_norm": 0.03292511247465923,
+      "learning_rate": 0.0009968372975478913,
+      "loss": 0.6661,
+      "step": 5210
+    },
+    {
+      "epoch": 0.2647510502468556,
+      "grad_norm": 0.058627129552400285,
+      "learning_rate": 0.0009968123719034934,
+      "loss": 0.6759,
+      "step": 5215
+    },
+    {
+      "epoch": 0.2650048863448871,
+      "grad_norm": 0.04585833544931062,
+      "learning_rate": 0.0009967873487373045,
+      "loss": 0.6838,
+      "step": 5220
+    },
+    {
+      "epoch": 0.2652587224429186,
+      "grad_norm": 0.06213801760761045,
+      "learning_rate": 0.0009967622280542365,
+      "loss": 0.686,
+      "step": 5225
+    },
+    {
+      "epoch": 0.2655125585409501,
+      "grad_norm": 0.0465036803469171,
+      "learning_rate": 0.0009967370098592206,
+      "loss": 0.6789,
+      "step": 5230
+    },
+    {
+      "epoch": 0.2657663946389816,
+      "grad_norm": 0.036623256236247154,
+      "learning_rate": 0.000996711694157207,
+      "loss": 0.6537,
+      "step": 5235
+    },
+    {
+      "epoch": 0.2660202307370131,
+      "grad_norm": 0.036813613833440686,
+      "learning_rate": 0.0009966862809531647,
+      "loss": 0.6605,
+      "step": 5240
+    },
+    {
+      "epoch": 0.2662740668350446,
+      "grad_norm": 0.033104325193404505,
+      "learning_rate": 0.0009966607702520825,
+      "loss": 0.6667,
+      "step": 5245
+    },
+    {
+      "epoch": 0.2665279029330761,
+      "grad_norm": 0.03616645613143295,
+      "learning_rate": 0.0009966351620589679,
+      "loss": 0.6442,
+      "step": 5250
+    },
+    {
+      "epoch": 0.26678173903110763,
+      "grad_norm": 0.1824434683613809,
+      "learning_rate": 0.0009966094563788478,
+      "loss": 0.7154,
+      "step": 5255
+    },
+    {
+      "epoch": 0.2670355751291391,
+      "grad_norm": 0.08584590008440728,
+      "learning_rate": 0.0009965836532167679,
+      "loss": 0.7169,
+      "step": 5260
+    },
+    {
+      "epoch": 0.2672894112271706,
+      "grad_norm": 0.08992314008060993,
+      "learning_rate": 0.0009965577525777934,
+      "loss": 0.7042,
+      "step": 5265
+    },
+    {
+      "epoch": 0.2675432473252021,
+      "grad_norm": 0.05488924202012461,
+      "learning_rate": 0.0009965317544670083,
+      "loss": 0.6752,
+      "step": 5270
+    },
+    {
+      "epoch": 0.2677970834232336,
+      "grad_norm": 0.045608215425613115,
+      "learning_rate": 0.000996505658889516,
+      "loss": 0.6711,
+      "step": 5275
+    },
+    {
+      "epoch": 0.26805091952126514,
+      "grad_norm": 0.04073278787103573,
+      "learning_rate": 0.000996479465850439,
+      "loss": 0.6915,
+      "step": 5280
+    },
+    {
+      "epoch": 0.2683047556192966,
+      "grad_norm": 0.04291579060984302,
+      "learning_rate": 0.000996453175354919,
+      "loss": 0.6813,
+      "step": 5285
+    },
+    {
+      "epoch": 0.2685585917173281,
+      "grad_norm": 0.037038830744694835,
+      "learning_rate": 0.000996426787408116,
+      "loss": 0.7183,
+      "step": 5290
+    },
+    {
+      "epoch": 0.26881242781535963,
+      "grad_norm": 0.03993617745746424,
+      "learning_rate": 0.0009964003020152107,
+      "loss": 0.7023,
+      "step": 5295
+    },
+    {
+      "epoch": 0.2690662639133911,
+      "grad_norm": 0.043203555024935315,
+      "learning_rate": 0.0009963737191814015,
+      "loss": 0.6355,
+      "step": 5300
+    },
+    {
+      "epoch": 0.26932010001142265,
+      "grad_norm": 0.03774583913684898,
+      "learning_rate": 0.0009963470389119068,
+      "loss": 0.6524,
+      "step": 5305
+    },
+    {
+      "epoch": 0.2695739361094541,
+      "grad_norm": 0.03395726169338956,
+      "learning_rate": 0.0009963202612119635,
+      "loss": 0.6512,
+      "step": 5310
+    },
+    {
+      "epoch": 0.2698277722074856,
+      "grad_norm": 0.043530029342712495,
+      "learning_rate": 0.000996293386086828,
+      "loss": 0.6618,
+      "step": 5315
+    },
+    {
+      "epoch": 0.27008160830551714,
+      "grad_norm": 0.03403180797982201,
+      "learning_rate": 0.0009962664135417761,
+      "loss": 0.6811,
+      "step": 5320
+    },
+    {
+      "epoch": 0.2703354444035486,
+      "grad_norm": 0.04666110465558761,
+      "learning_rate": 0.0009962393435821017,
+      "loss": 0.6615,
+      "step": 5325
+    },
+    {
+      "epoch": 0.27058928050158015,
+      "grad_norm": 0.032556938435121995,
+      "learning_rate": 0.0009962121762131192,
+      "loss": 0.6395,
+      "step": 5330
+    },
+    {
+      "epoch": 0.27084311659961163,
+      "grad_norm": 0.04419252183394335,
+      "learning_rate": 0.0009961849114401612,
+      "loss": 0.6119,
+      "step": 5335
+    },
+    {
+      "epoch": 0.2710969526976431,
+      "grad_norm": 0.035595422315825305,
+      "learning_rate": 0.0009961575492685793,
+      "loss": 0.6484,
+      "step": 5340
+    },
+    {
+      "epoch": 0.27135078879567465,
+      "grad_norm": 0.02926242788618824,
+      "learning_rate": 0.0009961300897037449,
+      "loss": 0.6407,
+      "step": 5345
+    },
+    {
+      "epoch": 0.2716046248937061,
+      "grad_norm": 0.03602952233417284,
+      "learning_rate": 0.000996102532751048,
+      "loss": 0.6641,
+      "step": 5350
+    },
+    {
+      "epoch": 0.27185846099173766,
+      "grad_norm": 0.045258498714948955,
+      "learning_rate": 0.000996074878415898,
+      "loss": 0.6636,
+      "step": 5355
+    },
+    {
+      "epoch": 0.27211229708976914,
+      "grad_norm": 0.055406038005293826,
+      "learning_rate": 0.0009960471267037234,
+      "loss": 0.6462,
+      "step": 5360
+    },
+    {
+      "epoch": 0.2723661331878006,
+      "grad_norm": 0.05518321218015624,
+      "learning_rate": 0.0009960192776199716,
+      "loss": 0.6265,
+      "step": 5365
+    },
+    {
+      "epoch": 0.27261996928583215,
+      "grad_norm": 0.03701471191581659,
+      "learning_rate": 0.0009959913311701092,
+      "loss": 0.6311,
+      "step": 5370
+    },
+    {
+      "epoch": 0.27287380538386363,
+      "grad_norm": 0.04247484585566076,
+      "learning_rate": 0.000995963287359622,
+      "loss": 0.663,
+      "step": 5375
+    },
+    {
+      "epoch": 0.27312764148189517,
+      "grad_norm": 0.03805986357905345,
+      "learning_rate": 0.0009959351461940149,
+      "loss": 0.6367,
+      "step": 5380
+    },
+    {
+      "epoch": 0.27338147757992665,
+      "grad_norm": 0.041801835611640255,
+      "learning_rate": 0.0009959069076788118,
+      "loss": 0.6948,
+      "step": 5385
+    },
+    {
+      "epoch": 0.2736353136779581,
+      "grad_norm": 0.034657777527560114,
+      "learning_rate": 0.0009958785718195559,
+      "loss": 0.6829,
+      "step": 5390
+    },
+    {
+      "epoch": 0.27388914977598966,
+      "grad_norm": 0.18006607610520042,
+      "learning_rate": 0.000995850138621809,
+      "loss": 0.6177,
+      "step": 5395
+    },
+    {
+      "epoch": 0.27414298587402114,
+      "grad_norm": 0.10543857125691686,
+      "learning_rate": 0.0009958216080911528,
+      "loss": 0.6223,
+      "step": 5400
+    },
+    {
+      "epoch": 0.2743968219720526,
+      "grad_norm": 0.0900614057284698,
+      "learning_rate": 0.0009957929802331877,
+      "loss": 0.6623,
+      "step": 5405
+    },
+    {
+      "epoch": 0.27465065807008415,
+      "grad_norm": 0.04819979788344021,
+      "learning_rate": 0.000995764255053533,
+      "loss": 0.7131,
+      "step": 5410
+    },
+    {
+      "epoch": 0.27490449416811563,
+      "grad_norm": 0.03076997581248242,
+      "learning_rate": 0.0009957354325578276,
+      "loss": 0.6477,
+      "step": 5415
+    },
+    {
+      "epoch": 0.27515833026614717,
+      "grad_norm": 0.041385162650395144,
+      "learning_rate": 0.000995706512751729,
+      "loss": 0.6565,
+      "step": 5420
+    },
+    {
+      "epoch": 0.27541216636417865,
+      "grad_norm": 0.03721380433063758,
+      "learning_rate": 0.0009956774956409139,
+      "loss": 0.6774,
+      "step": 5425
+    },
+    {
+      "epoch": 0.2756660024622101,
+      "grad_norm": 0.0354007938770567,
+      "learning_rate": 0.0009956483812310782,
+      "loss": 0.6286,
+      "step": 5430
+    },
+    {
+      "epoch": 0.27591983856024166,
+      "grad_norm": 0.04494906800664398,
+      "learning_rate": 0.0009956191695279374,
+      "loss": 0.6499,
+      "step": 5435
+    },
+    {
+      "epoch": 0.27617367465827314,
+      "grad_norm": 0.06723158886677959,
+      "learning_rate": 0.0009955898605372249,
+      "loss": 0.6385,
+      "step": 5440
+    },
+    {
+      "epoch": 0.2764275107563047,
+      "grad_norm": 0.04401273742367783,
+      "learning_rate": 0.0009955604542646946,
+      "loss": 0.7135,
+      "step": 5445
+    },
+    {
+      "epoch": 0.27668134685433615,
+      "grad_norm": 0.05565588713522223,
+      "learning_rate": 0.0009955309507161184,
+      "loss": 0.6492,
+      "step": 5450
+    },
+    {
+      "epoch": 0.27693518295236763,
+      "grad_norm": 0.0719042454362043,
+      "learning_rate": 0.0009955013498972876,
+      "loss": 0.6936,
+      "step": 5455
+    },
+    {
+      "epoch": 0.27718901905039917,
+      "grad_norm": 0.03740121789307142,
+      "learning_rate": 0.000995471651814013,
+      "loss": 0.6424,
+      "step": 5460
+    },
+    {
+      "epoch": 0.27744285514843064,
+      "grad_norm": 0.03850759086931981,
+      "learning_rate": 0.0009954418564721242,
+      "loss": 0.6759,
+      "step": 5465
+    },
+    {
+      "epoch": 0.2776966912464622,
+      "grad_norm": 0.036923309271708,
+      "learning_rate": 0.0009954119638774695,
+      "loss": 0.6807,
+      "step": 5470
+    },
+    {
+      "epoch": 0.27795052734449366,
+      "grad_norm": 0.042899936320904095,
+      "learning_rate": 0.000995381974035917,
+      "loss": 0.6874,
+      "step": 5475
+    },
+    {
+      "epoch": 0.27820436344252514,
+      "grad_norm": 0.0784299633425732,
+      "learning_rate": 0.0009953518869533536,
+      "loss": 0.6906,
+      "step": 5480
+    },
+    {
+      "epoch": 0.2784581995405567,
+      "grad_norm": 0.05844919432805366,
+      "learning_rate": 0.0009953217026356848,
+      "loss": 0.6793,
+      "step": 5485
+    },
+    {
+      "epoch": 0.27871203563858815,
+      "grad_norm": 0.040898154688727616,
+      "learning_rate": 0.0009952914210888363,
+      "loss": 0.6796,
+      "step": 5490
+    },
+    {
+      "epoch": 0.2789658717366197,
+      "grad_norm": 0.056560854563487195,
+      "learning_rate": 0.0009952610423187517,
+      "loss": 0.6589,
+      "step": 5495
+    },
+    {
+      "epoch": 0.27921970783465117,
+      "grad_norm": 0.07111930095820192,
+      "learning_rate": 0.0009952305663313943,
+      "loss": 0.665,
+      "step": 5500
+    },
+    {
+      "epoch": 0.27947354393268264,
+      "grad_norm": 0.06949742084914043,
+      "learning_rate": 0.0009951999931327464,
+      "loss": 0.6705,
+      "step": 5505
+    },
+    {
+      "epoch": 0.2797273800307142,
+      "grad_norm": 0.05060283874594893,
+      "learning_rate": 0.0009951693227288096,
+      "loss": 0.7088,
+      "step": 5510
+    },
+    {
+      "epoch": 0.27998121612874566,
+      "grad_norm": 0.03589194724274505,
+      "learning_rate": 0.0009951385551256041,
+      "loss": 0.6602,
+      "step": 5515
+    },
+    {
+      "epoch": 0.2802350522267772,
+      "grad_norm": 0.0327414416620521,
+      "learning_rate": 0.0009951076903291693,
+      "loss": 0.6571,
+      "step": 5520
+    },
+    {
+      "epoch": 0.28048888832480867,
+      "grad_norm": 0.052789013483981816,
+      "learning_rate": 0.000995076728345564,
+      "loss": 0.6725,
+      "step": 5525
+    },
+    {
+      "epoch": 0.28074272442284015,
+      "grad_norm": 0.05677596579461743,
+      "learning_rate": 0.000995045669180866,
+      "loss": 0.6413,
+      "step": 5530
+    },
+    {
+      "epoch": 0.2809965605208717,
+      "grad_norm": 0.03745942908058603,
+      "learning_rate": 0.000995014512841172,
+      "loss": 0.6909,
+      "step": 5535
+    },
+    {
+      "epoch": 0.28125039661890316,
+      "grad_norm": 0.14627372454354676,
+      "learning_rate": 0.0009949832593325978,
+      "loss": 0.6459,
+      "step": 5540
+    },
+    {
+      "epoch": 0.2815042327169347,
+      "grad_norm": 0.04133886420786463,
+      "learning_rate": 0.000994951908661278,
+      "loss": 0.6771,
+      "step": 5545
+    },
+    {
+      "epoch": 0.2817580688149662,
+      "grad_norm": 0.038383767135896224,
+      "learning_rate": 0.0009949204608333672,
+      "loss": 0.6659,
+      "step": 5550
+    },
+    {
+      "epoch": 0.28201190491299766,
+      "grad_norm": 0.03870804386622496,
+      "learning_rate": 0.0009948889158550376,
+      "loss": 0.6628,
+      "step": 5555
+    },
+    {
+      "epoch": 0.2822657410110292,
+      "grad_norm": 0.03666141726099845,
+      "learning_rate": 0.0009948572737324822,
+      "loss": 0.6783,
+      "step": 5560
+    },
+    {
+      "epoch": 0.28251957710906067,
+      "grad_norm": 0.13133641642519978,
+      "learning_rate": 0.0009948255344719118,
+      "loss": 0.675,
+      "step": 5565
+    },
+    {
+      "epoch": 0.2827734132070922,
+      "grad_norm": 0.12174050657136312,
+      "learning_rate": 0.0009947936980795565,
+      "loss": 0.7195,
+      "step": 5570
+    },
+    {
+      "epoch": 0.2830272493051237,
+      "grad_norm": 0.058601197176472185,
+      "learning_rate": 0.000994761764561666,
+      "loss": 0.6784,
+      "step": 5575
+    },
+    {
+      "epoch": 0.28328108540315516,
+      "grad_norm": 0.16508393923966153,
+      "learning_rate": 0.0009947297339245084,
+      "loss": 0.7202,
+      "step": 5580
+    },
+    {
+      "epoch": 0.2835349215011867,
+      "grad_norm": 0.0766947212292105,
+      "learning_rate": 0.0009946976061743712,
+      "loss": 0.6596,
+      "step": 5585
+    },
+    {
+      "epoch": 0.2837887575992182,
+      "grad_norm": 0.03340983687623817,
+      "learning_rate": 0.000994665381317561,
+      "loss": 0.6599,
+      "step": 5590
+    },
+    {
+      "epoch": 0.2840425936972497,
+      "grad_norm": 0.037573302023749436,
+      "learning_rate": 0.0009946330593604033,
+      "loss": 0.6594,
+      "step": 5595
+    },
+    {
+      "epoch": 0.2842964297952812,
+      "grad_norm": 0.03926723139173781,
+      "learning_rate": 0.000994600640309243,
+      "loss": 0.6436,
+      "step": 5600
+    },
+    {
+      "epoch": 0.28455026589331267,
+      "grad_norm": 0.033758520131084185,
+      "learning_rate": 0.0009945681241704434,
+      "loss": 0.6706,
+      "step": 5605
+    },
+    {
+      "epoch": 0.2848041019913442,
+      "grad_norm": 0.035833106887649714,
+      "learning_rate": 0.0009945355109503872,
+      "loss": 0.6757,
+      "step": 5610
+    },
+    {
+      "epoch": 0.2850579380893757,
+      "grad_norm": 0.03439574893245049,
+      "learning_rate": 0.0009945028006554768,
+      "loss": 0.674,
+      "step": 5615
+    },
+    {
+      "epoch": 0.2853117741874072,
+      "grad_norm": 0.04115472629656225,
+      "learning_rate": 0.0009944699932921326,
+      "loss": 0.6901,
+      "step": 5620
+    },
+    {
+      "epoch": 0.2855656102854387,
+      "grad_norm": 0.042426909272189464,
+      "learning_rate": 0.0009944370888667947,
+      "loss": 0.6709,
+      "step": 5625
+    },
+    {
+      "epoch": 0.2858194463834702,
+      "grad_norm": 0.05644132704585222,
+      "learning_rate": 0.0009944040873859218,
+      "loss": 0.6765,
+      "step": 5630
+    },
+    {
+      "epoch": 0.2860732824815017,
+      "grad_norm": 0.050269965909692245,
+      "learning_rate": 0.0009943709888559922,
+      "loss": 0.6463,
+      "step": 5635
+    },
+    {
+      "epoch": 0.2863271185795332,
+      "grad_norm": 0.03990106543413605,
+      "learning_rate": 0.000994337793283503,
+      "loss": 0.6788,
+      "step": 5640
+    },
+    {
+      "epoch": 0.28658095467756467,
+      "grad_norm": 0.039426229153061605,
+      "learning_rate": 0.0009943045006749703,
+      "loss": 0.6477,
+      "step": 5645
+    },
+    {
+      "epoch": 0.2868347907755962,
+      "grad_norm": 0.037701616134469464,
+      "learning_rate": 0.0009942711110369291,
+      "loss": 0.6609,
+      "step": 5650
+    },
+    {
+      "epoch": 0.2870886268736277,
+      "grad_norm": 0.036088987975116615,
+      "learning_rate": 0.0009942376243759336,
+      "loss": 0.6237,
+      "step": 5655
+    },
+    {
+      "epoch": 0.2873424629716592,
+      "grad_norm": 0.039086804987042645,
+      "learning_rate": 0.0009942040406985574,
+      "loss": 0.6313,
+      "step": 5660
+    },
+    {
+      "epoch": 0.2875962990696907,
+      "grad_norm": 0.05039912347585908,
+      "learning_rate": 0.0009941703600113926,
+      "loss": 0.6786,
+      "step": 5665
+    },
+    {
+      "epoch": 0.2878501351677222,
+      "grad_norm": 0.05153135663276811,
+      "learning_rate": 0.0009941365823210506,
+      "loss": 0.6796,
+      "step": 5670
+    },
+    {
+      "epoch": 0.2881039712657537,
+      "grad_norm": 0.048537339426450846,
+      "learning_rate": 0.0009941027076341615,
+      "loss": 0.6335,
+      "step": 5675
+    },
+    {
+      "epoch": 0.2883578073637852,
+      "grad_norm": 0.05231418629671102,
+      "learning_rate": 0.0009940687359573752,
+      "loss": 0.689,
+      "step": 5680
+    },
+    {
+      "epoch": 0.2886116434618167,
+      "grad_norm": 0.03179751321808205,
+      "learning_rate": 0.00099403466729736,
+      "loss": 0.6398,
+      "step": 5685
+    },
+    {
+      "epoch": 0.2888654795598482,
+      "grad_norm": 0.056582858417932216,
+      "learning_rate": 0.000994000501660803,
+      "loss": 0.6721,
+      "step": 5690
+    },
+    {
+      "epoch": 0.2891193156578797,
+      "grad_norm": 0.02974455078712651,
+      "learning_rate": 0.0009939662390544115,
+      "loss": 0.6327,
+      "step": 5695
+    },
+    {
+      "epoch": 0.2893731517559112,
+      "grad_norm": 0.05164849531976483,
+      "learning_rate": 0.0009939318794849104,
+      "loss": 0.6492,
+      "step": 5700
+    },
+    {
+      "epoch": 0.2896269878539427,
+      "grad_norm": 0.03957457040909659,
+      "learning_rate": 0.0009938974229590446,
+      "loss": 0.6436,
+      "step": 5705
+    },
+    {
+      "epoch": 0.28988082395197423,
+      "grad_norm": 0.04242341712591207,
+      "learning_rate": 0.000993862869483578,
+      "loss": 0.6644,
+      "step": 5710
+    },
+    {
+      "epoch": 0.2901346600500057,
+      "grad_norm": 0.1372690604293506,
+      "learning_rate": 0.0009938282190652928,
+      "loss": 0.681,
+      "step": 5715
+    },
+    {
+      "epoch": 0.2903884961480372,
+      "grad_norm": 0.06712065575391647,
+      "learning_rate": 0.0009937934717109912,
+      "loss": 0.6436,
+      "step": 5720
+    },
+    {
+      "epoch": 0.2906423322460687,
+      "grad_norm": 0.0657319128974874,
+      "learning_rate": 0.0009937586274274932,
+      "loss": 0.646,
+      "step": 5725
+    },
+    {
+      "epoch": 0.2908961683441002,
+      "grad_norm": 0.032380117007307335,
+      "learning_rate": 0.0009937236862216391,
+      "loss": 0.6722,
+      "step": 5730
+    },
+    {
+      "epoch": 0.29115000444213174,
+      "grad_norm": 0.051361069870512616,
+      "learning_rate": 0.0009936886481002878,
+      "loss": 0.6756,
+      "step": 5735
+    },
+    {
+      "epoch": 0.2914038405401632,
+      "grad_norm": 0.03396960893164104,
+      "learning_rate": 0.0009936535130703169,
+      "loss": 0.6785,
+      "step": 5740
+    },
+    {
+      "epoch": 0.2916576766381947,
+      "grad_norm": 0.03936833237945123,
+      "learning_rate": 0.0009936182811386232,
+      "loss": 0.6588,
+      "step": 5745
+    },
+    {
+      "epoch": 0.29191151273622623,
+      "grad_norm": 0.05198925092916733,
+      "learning_rate": 0.0009935829523121224,
+      "loss": 0.6675,
+      "step": 5750
+    },
+    {
+      "epoch": 0.2921653488342577,
+      "grad_norm": 0.06085607860536531,
+      "learning_rate": 0.0009935475265977498,
+      "loss": 0.6286,
+      "step": 5755
+    },
+    {
+      "epoch": 0.29241918493228924,
+      "grad_norm": 0.059359860916179744,
+      "learning_rate": 0.0009935120040024587,
+      "loss": 0.6369,
+      "step": 5760
+    },
+    {
+      "epoch": 0.2926730210303207,
+      "grad_norm": 0.039118870920788806,
+      "learning_rate": 0.0009934763845332228,
+      "loss": 0.6544,
+      "step": 5765
+    },
+    {
+      "epoch": 0.2929268571283522,
+      "grad_norm": 0.04229042882872178,
+      "learning_rate": 0.0009934406681970332,
+      "loss": 0.6304,
+      "step": 5770
+    },
+    {
+      "epoch": 0.29318069322638374,
+      "grad_norm": 0.026802917169532255,
+      "learning_rate": 0.0009934048550009015,
+      "loss": 0.6143,
+      "step": 5775
+    },
+    {
+      "epoch": 0.2934345293244152,
+      "grad_norm": 0.03201916028903979,
+      "learning_rate": 0.0009933689449518573,
+      "loss": 0.6586,
+      "step": 5780
+    },
+    {
+      "epoch": 0.29368836542244675,
+      "grad_norm": 0.03780618180915201,
+      "learning_rate": 0.0009933329380569494,
+      "loss": 0.6688,
+      "step": 5785
+    },
+    {
+      "epoch": 0.29394220152047823,
+      "grad_norm": 0.04436936096005898,
+      "learning_rate": 0.0009932968343232462,
+      "loss": 0.6441,
+      "step": 5790
+    },
+    {
+      "epoch": 0.2941960376185097,
+      "grad_norm": 0.05967133531857773,
+      "learning_rate": 0.0009932606337578346,
+      "loss": 0.6466,
+      "step": 5795
+    },
+    {
+      "epoch": 0.29444987371654124,
+      "grad_norm": 0.05591640331892786,
+      "learning_rate": 0.0009932243363678203,
+      "loss": 0.6458,
+      "step": 5800
+    },
+    {
+      "epoch": 0.2947037098145727,
+      "grad_norm": 0.0350539399039161,
+      "learning_rate": 0.0009931879421603285,
+      "loss": 0.6694,
+      "step": 5805
+    },
+    {
+      "epoch": 0.29495754591260426,
+      "grad_norm": 0.047049178898087726,
+      "learning_rate": 0.0009931514511425032,
+      "loss": 0.628,
+      "step": 5810
+    },
+    {
+      "epoch": 0.29521138201063574,
+      "grad_norm": 0.06870970965553135,
+      "learning_rate": 0.0009931148633215074,
+      "loss": 0.6383,
+      "step": 5815
+    },
+    {
+      "epoch": 0.2954652181086672,
+      "grad_norm": 0.04607237701564132,
+      "learning_rate": 0.000993078178704523,
+      "loss": 0.6395,
+      "step": 5820
+    },
+    {
+      "epoch": 0.29571905420669875,
+      "grad_norm": 0.04293642734855891,
+      "learning_rate": 0.0009930413972987513,
+      "loss": 0.6509,
+      "step": 5825
+    },
+    {
+      "epoch": 0.29597289030473023,
+      "grad_norm": 0.04186066507422814,
+      "learning_rate": 0.000993004519111412,
+      "loss": 0.689,
+      "step": 5830
+    },
+    {
+      "epoch": 0.29622672640276176,
+      "grad_norm": 0.04758901402636238,
+      "learning_rate": 0.0009929675441497441,
+      "loss": 0.6446,
+      "step": 5835
+    },
+    {
+      "epoch": 0.29648056250079324,
+      "grad_norm": 0.02860243925691624,
+      "learning_rate": 0.000992930472421006,
+      "loss": 0.6412,
+      "step": 5840
+    },
+    {
+      "epoch": 0.2967343985988247,
+      "grad_norm": 0.14274718702473935,
+      "learning_rate": 0.0009928933039324741,
+      "loss": 0.6036,
+      "step": 5845
+    },
+    {
+      "epoch": 0.29698823469685626,
+      "grad_norm": 0.048711259308299386,
+      "learning_rate": 0.0009928560386914447,
+      "loss": 0.6508,
+      "step": 5850
+    },
+    {
+      "epoch": 0.29724207079488774,
+      "grad_norm": 0.05737639269415528,
+      "learning_rate": 0.000992818676705233,
+      "loss": 0.6338,
+      "step": 5855
+    },
+    {
+      "epoch": 0.2974959068929192,
+      "grad_norm": 0.08608374496462033,
+      "learning_rate": 0.0009927812179811727,
+      "loss": 0.7236,
+      "step": 5860
+    },
+    {
+      "epoch": 0.29774974299095075,
+      "grad_norm": 0.0849381389952358,
+      "learning_rate": 0.0009927436625266166,
+      "loss": 0.6686,
+      "step": 5865
+    },
+    {
+      "epoch": 0.29800357908898223,
+      "grad_norm": 1.0620336080619193,
+      "learning_rate": 0.0009927060103489369,
+      "loss": 0.6706,
+      "step": 5870
+    },
+    {
+      "epoch": 0.29825741518701376,
+      "grad_norm": 13.801221772105595,
+      "learning_rate": 0.0009926682614555247,
+      "loss": 2.1038,
+      "step": 5875
+    },
+    {
+      "epoch": 0.29851125128504524,
+      "grad_norm": 0.529463438355754,
+      "learning_rate": 0.0009926304158537895,
+      "loss": 1.9905,
+      "step": 5880
+    },
+    {
+      "epoch": 0.2987650873830767,
+      "grad_norm": 0.5693873876849637,
+      "learning_rate": 0.0009925924735511603,
+      "loss": 0.7879,
+      "step": 5885
+    },
+    {
+      "epoch": 0.29901892348110826,
+      "grad_norm": 0.15207649725469286,
+      "learning_rate": 0.0009925544345550854,
+      "loss": 0.7494,
+      "step": 5890
+    },
+    {
+      "epoch": 0.29927275957913974,
+      "grad_norm": 0.053777419398167665,
+      "learning_rate": 0.0009925162988730313,
+      "loss": 0.7054,
+      "step": 5895
+    },
+    {
+      "epoch": 0.29952659567717127,
+      "grad_norm": 0.11506938387205977,
+      "learning_rate": 0.0009924780665124839,
+      "loss": 0.6893,
+      "step": 5900
+    },
+    {
+      "epoch": 0.29978043177520275,
+      "grad_norm": 0.07887804598963974,
+      "learning_rate": 0.000992439737480948,
+      "loss": 0.687,
+      "step": 5905
+    },
+    {
+      "epoch": 0.30003426787323423,
+      "grad_norm": 0.06054971527019416,
+      "learning_rate": 0.0009924013117859475,
+      "loss": 0.7238,
+      "step": 5910
+    },
+    {
+      "epoch": 0.30028810397126576,
+      "grad_norm": 0.04536799993236666,
+      "learning_rate": 0.0009923627894350248,
+      "loss": 0.7173,
+      "step": 5915
+    },
+    {
+      "epoch": 0.30054194006929724,
+      "grad_norm": 0.05390447300870516,
+      "learning_rate": 0.0009923241704357423,
+      "loss": 0.7083,
+      "step": 5920
+    },
+    {
+      "epoch": 0.3007957761673288,
+      "grad_norm": 0.05672780707635087,
+      "learning_rate": 0.0009922854547956802,
+      "loss": 0.6606,
+      "step": 5925
+    },
+    {
+      "epoch": 0.30104961226536026,
+      "grad_norm": 0.03885747550342492,
+      "learning_rate": 0.0009922466425224383,
+      "loss": 0.6993,
+      "step": 5930
+    },
+    {
+      "epoch": 0.30130344836339173,
+      "grad_norm": 0.07191125789862535,
+      "learning_rate": 0.0009922077336236353,
+      "loss": 0.6375,
+      "step": 5935
+    },
+    {
+      "epoch": 0.30155728446142327,
+      "grad_norm": 0.10081257590820915,
+      "learning_rate": 0.000992168728106909,
+      "loss": 0.6541,
+      "step": 5940
+    },
+    {
+      "epoch": 0.30181112055945475,
+      "grad_norm": 0.03547952948787697,
+      "learning_rate": 0.0009921296259799155,
+      "loss": 0.6846,
+      "step": 5945
+    },
+    {
+      "epoch": 0.3020649566574863,
+      "grad_norm": 0.04217686067301573,
+      "learning_rate": 0.000992090427250331,
+      "loss": 0.6567,
+      "step": 5950
+    },
+    {
+      "epoch": 0.30231879275551776,
+      "grad_norm": 0.05110904105032828,
+      "learning_rate": 0.0009920511319258495,
+      "loss": 0.6976,
+      "step": 5955
+    },
+    {
+      "epoch": 0.30257262885354924,
+      "grad_norm": 0.03701759809254075,
+      "learning_rate": 0.0009920117400141848,
+      "loss": 0.67,
+      "step": 5960
+    },
+    {
+      "epoch": 0.3028264649515808,
+      "grad_norm": 0.7757113792101427,
+      "learning_rate": 0.0009919722515230691,
+      "loss": 0.6825,
+      "step": 5965
+    },
+    {
+      "epoch": 0.30308030104961226,
+      "grad_norm": 0.10831424664913561,
+      "learning_rate": 0.0009919326664602538,
+      "loss": 0.6896,
+      "step": 5970
+    },
+    {
+      "epoch": 0.3033341371476438,
+      "grad_norm": 0.09797456779700615,
+      "learning_rate": 0.0009918929848335095,
+      "loss": 0.7677,
+      "step": 5975
+    },
+    {
+      "epoch": 0.30358797324567527,
+      "grad_norm": 0.05159857874021976,
+      "learning_rate": 0.0009918532066506252,
+      "loss": 0.7122,
+      "step": 5980
+    },
+    {
+      "epoch": 0.30384180934370675,
+      "grad_norm": 0.05813781521982605,
+      "learning_rate": 0.0009918133319194093,
+      "loss": 0.6823,
+      "step": 5985
+    },
+    {
+      "epoch": 0.3040956454417383,
+      "grad_norm": 0.03671398334295547,
+      "learning_rate": 0.000991773360647689,
+      "loss": 0.6991,
+      "step": 5990
+    },
+    {
+      "epoch": 0.30434948153976976,
+      "grad_norm": 0.07402367959365846,
+      "learning_rate": 0.0009917332928433106,
+      "loss": 0.6946,
+      "step": 5995
+    },
+    {
+      "epoch": 0.3046033176378013,
+      "grad_norm": 0.0653670605879878,
+      "learning_rate": 0.000991693128514139,
+      "loss": 0.7273,
+      "step": 6000
+    },
+    {
+      "epoch": 0.3048571537358328,
+      "grad_norm": 0.03813871957965758,
+      "learning_rate": 0.0009916528676680585,
+      "loss": 0.6652,
+      "step": 6005
+    },
+    {
+      "epoch": 0.30511098983386425,
+      "grad_norm": 0.04376402030900822,
+      "learning_rate": 0.0009916125103129718,
+      "loss": 0.6455,
+      "step": 6010
+    },
+    {
+      "epoch": 0.3053648259318958,
+      "grad_norm": 0.04963286911499542,
+      "learning_rate": 0.000991572056456801,
+      "loss": 0.7049,
+      "step": 6015
+    },
+    {
+      "epoch": 0.30561866202992727,
+      "grad_norm": 0.030884815020089395,
+      "learning_rate": 0.000991531506107487,
+      "loss": 0.6823,
+      "step": 6020
+    },
+    {
+      "epoch": 0.3058724981279588,
+      "grad_norm": 0.03715644515895416,
+      "learning_rate": 0.0009914908592729896,
+      "loss": 0.7246,
+      "step": 6025
+    },
+    {
+      "epoch": 0.3061263342259903,
+      "grad_norm": 0.05308010746588163,
+      "learning_rate": 0.0009914501159612877,
+      "loss": 0.6453,
+      "step": 6030
+    },
+    {
+      "epoch": 0.30638017032402176,
+      "grad_norm": 0.042657404727221966,
+      "learning_rate": 0.0009914092761803789,
+      "loss": 0.6524,
+      "step": 6035
+    },
+    {
+      "epoch": 0.3066340064220533,
+      "grad_norm": 0.06017465873203829,
+      "learning_rate": 0.0009913683399382796,
+      "loss": 0.7004,
+      "step": 6040
+    },
+    {
+      "epoch": 0.3068878425200848,
+      "grad_norm": 0.038922670583128045,
+      "learning_rate": 0.0009913273072430257,
+      "loss": 0.6804,
+      "step": 6045
+    },
+    {
+      "epoch": 0.3071416786181163,
+      "grad_norm": 0.0648567542128592,
+      "learning_rate": 0.0009912861781026718,
+      "loss": 0.6558,
+      "step": 6050
+    },
+    {
+      "epoch": 0.3073955147161478,
+      "grad_norm": 0.03494754440461928,
+      "learning_rate": 0.0009912449525252911,
+      "loss": 0.666,
+      "step": 6055
+    },
+    {
+      "epoch": 0.30764935081417927,
+      "grad_norm": 0.03092884139080322,
+      "learning_rate": 0.000991203630518976,
+      "loss": 0.6688,
+      "step": 6060
+    },
+    {
+      "epoch": 0.3079031869122108,
+      "grad_norm": 0.05872150090284288,
+      "learning_rate": 0.0009911622120918379,
+      "loss": 0.6385,
+      "step": 6065
+    },
+    {
+      "epoch": 0.3081570230102423,
+      "grad_norm": 0.08520802622712559,
+      "learning_rate": 0.0009911206972520068,
+      "loss": 0.6635,
+      "step": 6070
+    },
+    {
+      "epoch": 0.30841085910827376,
+      "grad_norm": 0.06879085230717281,
+      "learning_rate": 0.0009910790860076324,
+      "loss": 0.7711,
+      "step": 6075
+    },
+    {
+      "epoch": 0.3086646952063053,
+      "grad_norm": 0.06496604434818472,
+      "learning_rate": 0.0009910373783668823,
+      "loss": 0.6546,
+      "step": 6080
+    },
+    {
+      "epoch": 0.3089185313043368,
+      "grad_norm": 0.03788866020152313,
+      "learning_rate": 0.0009909955743379435,
+      "loss": 0.6574,
+      "step": 6085
+    },
+    {
+      "epoch": 0.3091723674023683,
+      "grad_norm": 0.036029343391252786,
+      "learning_rate": 0.0009909536739290221,
+      "loss": 0.6614,
+      "step": 6090
+    },
+    {
+      "epoch": 0.3094262035003998,
+      "grad_norm": 0.0357204003060438,
+      "learning_rate": 0.0009909116771483427,
+      "loss": 0.6273,
+      "step": 6095
+    },
+    {
+      "epoch": 0.30968003959843127,
+      "grad_norm": 0.036306841185922825,
+      "learning_rate": 0.0009908695840041496,
+      "loss": 0.6725,
+      "step": 6100
+    },
+    {
+      "epoch": 0.3099338756964628,
+      "grad_norm": 0.03660198406181924,
+      "learning_rate": 0.000990827394504705,
+      "loss": 0.647,
+      "step": 6105
+    },
+    {
+      "epoch": 0.3101877117944943,
+      "grad_norm": 0.04095269101364938,
+      "learning_rate": 0.0009907851086582906,
+      "loss": 0.6755,
+      "step": 6110
+    },
+    {
+      "epoch": 0.3104415478925258,
+      "grad_norm": 0.03247697430503874,
+      "learning_rate": 0.0009907427264732069,
+      "loss": 0.6813,
+      "step": 6115
+    },
+    {
+      "epoch": 0.3106953839905573,
+      "grad_norm": 0.03162996739245832,
+      "learning_rate": 0.0009907002479577734,
+      "loss": 0.6643,
+      "step": 6120
+    },
+    {
+      "epoch": 0.3109492200885888,
+      "grad_norm": 0.5455514035154928,
+      "learning_rate": 0.0009906576731203282,
+      "loss": 0.6433,
+      "step": 6125
+    },
+    {
+      "epoch": 0.3112030561866203,
+      "grad_norm": 0.044476465750440224,
+      "learning_rate": 0.0009906150019692288,
+      "loss": 0.6653,
+      "step": 6130
+    },
+    {
+      "epoch": 0.3114568922846518,
+      "grad_norm": 0.05927549810439251,
+      "learning_rate": 0.000990572234512851,
+      "loss": 0.6589,
+      "step": 6135
+    },
+    {
+      "epoch": 0.3117107283826833,
+      "grad_norm": 0.056665905206855084,
+      "learning_rate": 0.0009905293707595903,
+      "loss": 0.649,
+      "step": 6140
+    },
+    {
+      "epoch": 0.3119645644807148,
+      "grad_norm": 0.03331713795791355,
+      "learning_rate": 0.0009904864107178602,
+      "loss": 0.64,
+      "step": 6145
+    },
+    {
+      "epoch": 0.3122184005787463,
+      "grad_norm": 0.04406210853179684,
+      "learning_rate": 0.000990443354396094,
+      "loss": 0.6702,
+      "step": 6150
+    },
+    {
+      "epoch": 0.3124722366767778,
+      "grad_norm": 0.057693446080025364,
+      "learning_rate": 0.000990400201802743,
+      "loss": 0.7062,
+      "step": 6155
+    },
+    {
+      "epoch": 0.3127260727748093,
+      "grad_norm": 0.035021125314635915,
+      "learning_rate": 0.0009903569529462778,
+      "loss": 0.697,
+      "step": 6160
+    },
+    {
+      "epoch": 0.31297990887284083,
+      "grad_norm": 0.055650402580445305,
+      "learning_rate": 0.0009903136078351885,
+      "loss": 0.648,
+      "step": 6165
+    },
+    {
+      "epoch": 0.3132337449708723,
+      "grad_norm": 0.06196313978890203,
+      "learning_rate": 0.0009902701664779828,
+      "loss": 0.6454,
+      "step": 6170
+    },
+    {
+      "epoch": 0.3134875810689038,
+      "grad_norm": 0.038941666671227525,
+      "learning_rate": 0.0009902266288831887,
+      "loss": 0.676,
+      "step": 6175
+    },
+    {
+      "epoch": 0.3137414171669353,
+      "grad_norm": 0.0415527424909999,
+      "learning_rate": 0.000990182995059352,
+      "loss": 0.6999,
+      "step": 6180
+    },
+    {
+      "epoch": 0.3139952532649668,
+      "grad_norm": 0.032478914330314065,
+      "learning_rate": 0.0009901392650150378,
+      "loss": 0.6384,
+      "step": 6185
+    },
+    {
+      "epoch": 0.31424908936299834,
+      "grad_norm": 0.03685841261439452,
+      "learning_rate": 0.0009900954387588303,
+      "loss": 0.6149,
+      "step": 6190
+    },
+    {
+      "epoch": 0.3145029254610298,
+      "grad_norm": 0.0576119132892154,
+      "learning_rate": 0.0009900515162993325,
+      "loss": 0.6754,
+      "step": 6195
+    },
+    {
+      "epoch": 0.3147567615590613,
+      "grad_norm": 0.03227069972941047,
+      "learning_rate": 0.0009900074976451655,
+      "loss": 0.6563,
+      "step": 6200
+    },
+    {
+      "epoch": 0.31501059765709283,
+      "grad_norm": 0.03374111959001576,
+      "learning_rate": 0.0009899633828049706,
+      "loss": 0.6457,
+      "step": 6205
+    },
+    {
+      "epoch": 0.3152644337551243,
+      "grad_norm": 0.036155101851612435,
+      "learning_rate": 0.0009899191717874071,
+      "loss": 0.646,
+      "step": 6210
+    },
+    {
+      "epoch": 0.31551826985315584,
+      "grad_norm": 0.031824458816045474,
+      "learning_rate": 0.0009898748646011534,
+      "loss": 0.6559,
+      "step": 6215
+    },
+    {
+      "epoch": 0.3157721059511873,
+      "grad_norm": 0.03276354958142416,
+      "learning_rate": 0.0009898304612549068,
+      "loss": 0.6378,
+      "step": 6220
+    },
+    {
+      "epoch": 0.3160259420492188,
+      "grad_norm": 0.029188354303673416,
+      "learning_rate": 0.0009897859617573833,
+      "loss": 0.6496,
+      "step": 6225
+    },
+    {
+      "epoch": 0.31627977814725033,
+      "grad_norm": 0.03357230960317099,
+      "learning_rate": 0.0009897413661173182,
+      "loss": 0.6655,
+      "step": 6230
+    },
+    {
+      "epoch": 0.3165336142452818,
+      "grad_norm": 0.03530965710230692,
+      "learning_rate": 0.0009896966743434654,
+      "loss": 0.6709,
+      "step": 6235
+    },
+    {
+      "epoch": 0.31678745034331335,
+      "grad_norm": 0.03547112168077008,
+      "learning_rate": 0.0009896518864445974,
+      "loss": 0.6459,
+      "step": 6240
+    },
+    {
+      "epoch": 0.3170412864413448,
+      "grad_norm": 0.05701195512800427,
+      "learning_rate": 0.0009896070024295058,
+      "loss": 0.6933,
+      "step": 6245
+    },
+    {
+      "epoch": 0.3172951225393763,
+      "grad_norm": 0.034050322178401336,
+      "learning_rate": 0.0009895620223070013,
+      "loss": 0.6286,
+      "step": 6250
+    },
+    {
+      "epoch": 0.31754895863740784,
+      "grad_norm": 0.10844739490541815,
+      "learning_rate": 0.0009895169460859136,
+      "loss": 0.6691,
+      "step": 6255
+    },
+    {
+      "epoch": 0.3178027947354393,
+      "grad_norm": 0.04811906636847503,
+      "learning_rate": 0.0009894717737750905,
+      "loss": 0.6406,
+      "step": 6260
+    },
+    {
+      "epoch": 0.31805663083347085,
+      "grad_norm": 0.03651016020761419,
+      "learning_rate": 0.000989426505383399,
+      "loss": 0.6404,
+      "step": 6265
+    },
+    {
+      "epoch": 0.31831046693150233,
+      "grad_norm": 0.03909160423336249,
+      "learning_rate": 0.0009893811409197254,
+      "loss": 0.6718,
+      "step": 6270
+    },
+    {
+      "epoch": 0.3185643030295338,
+      "grad_norm": 0.03433471167278222,
+      "learning_rate": 0.0009893356803929742,
+      "loss": 0.7035,
+      "step": 6275
+    },
+    {
+      "epoch": 0.31881813912756535,
+      "grad_norm": 0.0315707692611077,
+      "learning_rate": 0.0009892901238120694,
+      "loss": 0.6278,
+      "step": 6280
+    },
+    {
+      "epoch": 0.3190719752255968,
+      "grad_norm": 0.03181406843213265,
+      "learning_rate": 0.0009892444711859536,
+      "loss": 0.6195,
+      "step": 6285
+    },
+    {
+      "epoch": 0.3193258113236283,
+      "grad_norm": 0.03252733678185362,
+      "learning_rate": 0.0009891987225235876,
+      "loss": 0.6591,
+      "step": 6290
+    },
+    {
+      "epoch": 0.31957964742165984,
+      "grad_norm": 0.028275724026086066,
+      "learning_rate": 0.0009891528778339523,
+      "loss": 0.632,
+      "step": 6295
+    },
+    {
+      "epoch": 0.3198334835196913,
+      "grad_norm": 0.0332639388947232,
+      "learning_rate": 0.0009891069371260463,
+      "loss": 0.6706,
+      "step": 6300
+    },
+    {
+      "epoch": 0.32008731961772285,
+      "grad_norm": 0.04023616497430709,
+      "learning_rate": 0.0009890609004088878,
+      "loss": 0.6397,
+      "step": 6305
+    },
+    {
+      "epoch": 0.32034115571575433,
+      "grad_norm": 0.04189562916683901,
+      "learning_rate": 0.0009890147676915133,
+      "loss": 0.6589,
+      "step": 6310
+    },
+    {
+      "epoch": 0.3205949918137858,
+      "grad_norm": 0.04615410615909303,
+      "learning_rate": 0.0009889685389829787,
+      "loss": 0.654,
+      "step": 6315
+    },
+    {
+      "epoch": 0.32084882791181735,
+      "grad_norm": 0.0876708805444767,
+      "learning_rate": 0.0009889222142923585,
+      "loss": 0.6873,
+      "step": 6320
+    },
+    {
+      "epoch": 0.3211026640098488,
+      "grad_norm": 0.0731800370414708,
+      "learning_rate": 0.0009888757936287458,
+      "loss": 0.6661,
+      "step": 6325
+    },
+    {
+      "epoch": 0.32135650010788036,
+      "grad_norm": 0.04489055013050584,
+      "learning_rate": 0.0009888292770012528,
+      "loss": 0.6437,
+      "step": 6330
+    },
+    {
+      "epoch": 0.32161033620591184,
+      "grad_norm": 0.05950573079091027,
+      "learning_rate": 0.0009887826644190106,
+      "loss": 0.6107,
+      "step": 6335
+    },
+    {
+      "epoch": 0.3218641723039433,
+      "grad_norm": 0.03311301201808374,
+      "learning_rate": 0.0009887359558911689,
+      "loss": 0.6396,
+      "step": 6340
+    },
+    {
+      "epoch": 0.32211800840197485,
+      "grad_norm": 0.04257944673946886,
+      "learning_rate": 0.0009886891514268963,
+      "loss": 0.6497,
+      "step": 6345
+    },
+    {
+      "epoch": 0.32237184450000633,
+      "grad_norm": 0.03870234251780305,
+      "learning_rate": 0.0009886422510353805,
+      "loss": 0.657,
+      "step": 6350
+    },
+    {
+      "epoch": 0.32262568059803787,
+      "grad_norm": 0.03016914715973346,
+      "learning_rate": 0.0009885952547258278,
+      "loss": 0.6503,
+      "step": 6355
+    },
+    {
+      "epoch": 0.32287951669606935,
+      "grad_norm": 0.039652301246605876,
+      "learning_rate": 0.000988548162507463,
+      "loss": 0.7024,
+      "step": 6360
+    },
+    {
+      "epoch": 0.3231333527941008,
+      "grad_norm": 0.032827930330652684,
+      "learning_rate": 0.0009885009743895302,
+      "loss": 0.6377,
+      "step": 6365
+    },
+    {
+      "epoch": 0.32338718889213236,
+      "grad_norm": 0.04068567499221553,
+      "learning_rate": 0.0009884536903812923,
+      "loss": 0.6727,
+      "step": 6370
+    },
+    {
+      "epoch": 0.32364102499016384,
+      "grad_norm": 0.0345449057811016,
+      "learning_rate": 0.000988406310492031,
+      "loss": 0.6875,
+      "step": 6375
+    },
+    {
+      "epoch": 0.3238948610881954,
+      "grad_norm": 0.04902874284849529,
+      "learning_rate": 0.0009883588347310466,
+      "loss": 0.6455,
+      "step": 6380
+    },
+    {
+      "epoch": 0.32414869718622685,
+      "grad_norm": 0.03422848521805073,
+      "learning_rate": 0.0009883112631076585,
+      "loss": 0.6338,
+      "step": 6385
+    },
+    {
+      "epoch": 0.32440253328425833,
+      "grad_norm": 0.03165939758398714,
+      "learning_rate": 0.0009882635956312046,
+      "loss": 0.6472,
+      "step": 6390
+    },
+    {
+      "epoch": 0.32465636938228987,
+      "grad_norm": 0.027078597446684824,
+      "learning_rate": 0.0009882158323110417,
+      "loss": 0.6291,
+      "step": 6395
+    },
+    {
+      "epoch": 0.32491020548032135,
+      "grad_norm": 0.03356428796023044,
+      "learning_rate": 0.0009881679731565457,
+      "loss": 0.649,
+      "step": 6400
+    },
+    {
+      "epoch": 0.3251640415783529,
+      "grad_norm": 0.0644450340702738,
+      "learning_rate": 0.000988120018177111,
+      "loss": 0.6616,
+      "step": 6405
+    },
+    {
+      "epoch": 0.32541787767638436,
+      "grad_norm": 0.049328556792104926,
+      "learning_rate": 0.0009880719673821513,
+      "loss": 0.6334,
+      "step": 6410
+    },
+    {
+      "epoch": 0.32567171377441584,
+      "grad_norm": 0.03340548046894897,
+      "learning_rate": 0.000988023820781098,
+      "loss": 0.6107,
+      "step": 6415
+    },
+    {
+      "epoch": 0.3259255498724474,
+      "grad_norm": 0.031492806873797255,
+      "learning_rate": 0.000987975578383403,
+      "loss": 0.6654,
+      "step": 6420
+    },
+    {
+      "epoch": 0.32617938597047885,
+      "grad_norm": 0.03786542334456132,
+      "learning_rate": 0.0009879272401985349,
+      "loss": 0.6533,
+      "step": 6425
+    },
+    {
+      "epoch": 0.3264332220685104,
+      "grad_norm": 0.03333849837492178,
+      "learning_rate": 0.0009878788062359831,
+      "loss": 0.6664,
+      "step": 6430
+    },
+    {
+      "epoch": 0.32668705816654187,
+      "grad_norm": 0.03610249374246746,
+      "learning_rate": 0.0009878302765052548,
+      "loss": 0.633,
+      "step": 6435
+    },
+    {
+      "epoch": 0.32694089426457335,
+      "grad_norm": 0.031209587596926817,
+      "learning_rate": 0.0009877816510158756,
+      "loss": 0.6613,
+      "step": 6440
+    },
+    {
+      "epoch": 0.3271947303626049,
+      "grad_norm": 0.03483245394564131,
+      "learning_rate": 0.0009877329297773914,
+      "loss": 0.6727,
+      "step": 6445
+    },
+    {
+      "epoch": 0.32744856646063636,
+      "grad_norm": 0.034854234123111616,
+      "learning_rate": 0.000987684112799365,
+      "loss": 0.6448,
+      "step": 6450
+    },
+    {
+      "epoch": 0.3277024025586679,
+      "grad_norm": 0.03027463862874961,
+      "learning_rate": 0.0009876352000913796,
+      "loss": 0.6376,
+      "step": 6455
+    },
+    {
+      "epoch": 0.3279562386566994,
+      "grad_norm": 0.029904982015815273,
+      "learning_rate": 0.000987586191663036,
+      "loss": 0.6271,
+      "step": 6460
+    },
+    {
+      "epoch": 0.32821007475473085,
+      "grad_norm": 0.04606450117610927,
+      "learning_rate": 0.0009875370875239548,
+      "loss": 0.6324,
+      "step": 6465
+    },
+    {
+      "epoch": 0.3284639108527624,
+      "grad_norm": 0.047345984810625526,
+      "learning_rate": 0.0009874878876837746,
+      "loss": 0.6559,
+      "step": 6470
+    },
+    {
+      "epoch": 0.32871774695079387,
+      "grad_norm": 0.027399081554610526,
+      "learning_rate": 0.0009874385921521533,
+      "loss": 0.6782,
+      "step": 6475
+    },
+    {
+      "epoch": 0.3289715830488254,
+      "grad_norm": 0.03402265966097058,
+      "learning_rate": 0.000987389200938767,
+      "loss": 0.6826,
+      "step": 6480
+    },
+    {
+      "epoch": 0.3292254191468569,
+      "grad_norm": 0.07145987939489644,
+      "learning_rate": 0.0009873397140533111,
+      "loss": 0.648,
+      "step": 6485
+    },
+    {
+      "epoch": 0.32947925524488836,
+      "grad_norm": 0.030640240717414162,
+      "learning_rate": 0.0009872901315054999,
+      "loss": 0.6225,
+      "step": 6490
+    },
+    {
+      "epoch": 0.3297330913429199,
+      "grad_norm": 0.029273085435207902,
+      "learning_rate": 0.000987240453305066,
+      "loss": 0.668,
+      "step": 6495
+    },
+    {
+      "epoch": 0.32998692744095137,
+      "grad_norm": 0.027915653420681048,
+      "learning_rate": 0.0009871906794617607,
+      "loss": 0.675,
+      "step": 6500
+    },
+    {
+      "epoch": 0.33024076353898285,
+      "grad_norm": 0.028085088656119427,
+      "learning_rate": 0.0009871408099853547,
+      "loss": 0.6468,
+      "step": 6505
+    },
+    {
+      "epoch": 0.3304945996370144,
+      "grad_norm": 0.03469790217987671,
+      "learning_rate": 0.0009870908448856373,
+      "loss": 0.6291,
+      "step": 6510
+    },
+    {
+      "epoch": 0.33074843573504586,
+      "grad_norm": 0.03940795336268566,
+      "learning_rate": 0.000987040784172416,
+      "loss": 0.6471,
+      "step": 6515
+    },
+    {
+      "epoch": 0.3310022718330774,
+      "grad_norm": 0.0372777835424837,
+      "learning_rate": 0.0009869906278555177,
+      "loss": 0.6434,
+      "step": 6520
+    },
+    {
+      "epoch": 0.3312561079311089,
+      "grad_norm": 0.05751236855268058,
+      "learning_rate": 0.0009869403759447876,
+      "loss": 0.6963,
+      "step": 6525
+    },
+    {
+      "epoch": 0.33150994402914036,
+      "grad_norm": 0.028415506939899773,
+      "learning_rate": 0.0009868900284500904,
+      "loss": 0.6116,
+      "step": 6530
+    },
+    {
+      "epoch": 0.3317637801271719,
+      "grad_norm": 0.06720606915783799,
+      "learning_rate": 0.0009868395853813085,
+      "loss": 0.665,
+      "step": 6535
+    },
+    {
+      "epoch": 0.33201761622520337,
+      "grad_norm": 0.037437450250349114,
+      "learning_rate": 0.000986789046748344,
+      "loss": 0.6683,
+      "step": 6540
+    },
+    {
+      "epoch": 0.3322714523232349,
+      "grad_norm": 0.03050364304716209,
+      "learning_rate": 0.000986738412561117,
+      "loss": 0.6796,
+      "step": 6545
+    },
+    {
+      "epoch": 0.3325252884212664,
+      "grad_norm": 0.03574775300175517,
+      "learning_rate": 0.0009866876828295672,
+      "loss": 0.6177,
+      "step": 6550
+    },
+    {
+      "epoch": 0.33277912451929786,
+      "grad_norm": 0.02999643436973219,
+      "learning_rate": 0.0009866368575636522,
+      "loss": 0.6632,
+      "step": 6555
+    },
+    {
+      "epoch": 0.3330329606173294,
+      "grad_norm": 0.034667486256933826,
+      "learning_rate": 0.0009865859367733489,
+      "loss": 0.6685,
+      "step": 6560
+    },
+    {
+      "epoch": 0.3332867967153609,
+      "grad_norm": 0.035749048566894245,
+      "learning_rate": 0.0009865349204686532,
+      "loss": 0.624,
+      "step": 6565
+    },
+    {
+      "epoch": 0.3335406328133924,
+      "grad_norm": 0.02721431745155214,
+      "learning_rate": 0.0009864838086595783,
+      "loss": 0.6438,
+      "step": 6570
+    },
+    {
+      "epoch": 0.3337944689114239,
+      "grad_norm": 0.030303232949427904,
+      "learning_rate": 0.0009864326013561584,
+      "loss": 0.6504,
+      "step": 6575
+    },
+    {
+      "epoch": 0.33404830500945537,
+      "grad_norm": 0.033151379828375675,
+      "learning_rate": 0.0009863812985684446,
+      "loss": 0.6616,
+      "step": 6580
+    },
+    {
+      "epoch": 0.3343021411074869,
+      "grad_norm": 0.03492354659989374,
+      "learning_rate": 0.0009863299003065073,
+      "loss": 0.6649,
+      "step": 6585
+    },
+    {
+      "epoch": 0.3345559772055184,
+      "grad_norm": 0.029289412856450096,
+      "learning_rate": 0.000986278406580436,
+      "loss": 0.671,
+      "step": 6590
+    },
+    {
+      "epoch": 0.3348098133035499,
+      "grad_norm": 0.029758417773996532,
+      "learning_rate": 0.0009862268174003386,
+      "loss": 0.6431,
+      "step": 6595
+    },
+    {
+      "epoch": 0.3350636494015814,
+      "grad_norm": 0.02917170623260094,
+      "learning_rate": 0.0009861751327763415,
+      "loss": 0.6586,
+      "step": 6600
+    },
+    {
+      "epoch": 0.3353174854996129,
+      "grad_norm": 0.039238413473097804,
+      "learning_rate": 0.0009861233527185907,
+      "loss": 0.6523,
+      "step": 6605
+    },
+    {
+      "epoch": 0.3355713215976444,
+      "grad_norm": 0.05769974377090198,
+      "learning_rate": 0.00098607147723725,
+      "loss": 0.6534,
+      "step": 6610
+    },
+    {
+      "epoch": 0.3358251576956759,
+      "grad_norm": 0.05207362585582225,
+      "learning_rate": 0.000986019506342502,
+      "loss": 0.628,
+      "step": 6615
+    },
+    {
+      "epoch": 0.3360789937937074,
+      "grad_norm": 0.042481555414880016,
+      "learning_rate": 0.0009859674400445491,
+      "loss": 0.6352,
+      "step": 6620
+    },
+    {
+      "epoch": 0.3363328298917389,
+      "grad_norm": 0.038592274715997285,
+      "learning_rate": 0.0009859152783536112,
+      "loss": 0.655,
+      "step": 6625
+    },
+    {
+      "epoch": 0.3365866659897704,
+      "grad_norm": 0.03810235969474428,
+      "learning_rate": 0.0009858630212799273,
+      "loss": 0.6078,
+      "step": 6630
+    },
+    {
+      "epoch": 0.3368405020878019,
+      "grad_norm": 0.04141260814892809,
+      "learning_rate": 0.0009858106688337552,
+      "loss": 0.6309,
+      "step": 6635
+    },
+    {
+      "epoch": 0.3370943381858334,
+      "grad_norm": 0.050224588657700506,
+      "learning_rate": 0.0009857582210253718,
+      "loss": 0.6598,
+      "step": 6640
+    },
+    {
+      "epoch": 0.33734817428386493,
+      "grad_norm": 0.044838404082403416,
+      "learning_rate": 0.000985705677865072,
+      "loss": 0.643,
+      "step": 6645
+    },
+    {
+      "epoch": 0.3376020103818964,
+      "grad_norm": 0.039078603300489616,
+      "learning_rate": 0.0009856530393631698,
+      "loss": 0.6293,
+      "step": 6650
+    },
+    {
+      "epoch": 0.3378558464799279,
+      "grad_norm": 0.049346205757841856,
+      "learning_rate": 0.0009856003055299979,
+      "loss": 0.67,
+      "step": 6655
+    },
+    {
+      "epoch": 0.3381096825779594,
+      "grad_norm": 0.0467859227211263,
+      "learning_rate": 0.0009855474763759075,
+      "loss": 0.6425,
+      "step": 6660
+    },
+    {
+      "epoch": 0.3383635186759909,
+      "grad_norm": 0.03718081935719444,
+      "learning_rate": 0.0009854945519112692,
+      "loss": 0.65,
+      "step": 6665
+    },
+    {
+      "epoch": 0.33861735477402244,
+      "grad_norm": 0.04145755563324682,
+      "learning_rate": 0.0009854415321464715,
+      "loss": 0.6424,
+      "step": 6670
+    },
+    {
+      "epoch": 0.3388711908720539,
+      "grad_norm": 0.03672619994306692,
+      "learning_rate": 0.0009853884170919218,
+      "loss": 0.66,
+      "step": 6675
+    },
+    {
+      "epoch": 0.3391250269700854,
+      "grad_norm": 0.03752204942770816,
+      "learning_rate": 0.0009853352067580466,
+      "loss": 0.6683,
+      "step": 6680
+    },
+    {
+      "epoch": 0.33937886306811693,
+      "grad_norm": 0.03222882623511126,
+      "learning_rate": 0.0009852819011552908,
+      "loss": 0.6438,
+      "step": 6685
+    },
+    {
+      "epoch": 0.3396326991661484,
+      "grad_norm": 0.02965590085193827,
+      "learning_rate": 0.0009852285002941174,
+      "loss": 0.6642,
+      "step": 6690
+    },
+    {
+      "epoch": 0.33988653526417995,
+      "grad_norm": 0.03879854510223968,
+      "learning_rate": 0.0009851750041850098,
+      "loss": 0.6477,
+      "step": 6695
+    },
+    {
+      "epoch": 0.3401403713622114,
+      "grad_norm": 0.14912016060061054,
+      "learning_rate": 0.000985121412838468,
+      "loss": 0.6301,
+      "step": 6700
+    },
+    {
+      "epoch": 0.3403942074602429,
+      "grad_norm": 0.031220805777869454,
+      "learning_rate": 0.0009850677262650124,
+      "loss": 0.6785,
+      "step": 6705
+    },
+    {
+      "epoch": 0.34064804355827444,
+      "grad_norm": 0.047945073583954875,
+      "learning_rate": 0.000985013944475181,
+      "loss": 0.6329,
+      "step": 6710
+    },
+    {
+      "epoch": 0.3409018796563059,
+      "grad_norm": 0.03125211749366002,
+      "learning_rate": 0.0009849600674795313,
+      "loss": 0.6694,
+      "step": 6715
+    },
+    {
+      "epoch": 0.34115571575433745,
+      "grad_norm": 0.030016984119769507,
+      "learning_rate": 0.0009849060952886385,
+      "loss": 0.6489,
+      "step": 6720
+    },
+    {
+      "epoch": 0.34140955185236893,
+      "grad_norm": 0.0321881850801395,
+      "learning_rate": 0.0009848520279130979,
+      "loss": 0.6557,
+      "step": 6725
+    },
+    {
+      "epoch": 0.3416633879504004,
+      "grad_norm": 0.027489966034472072,
+      "learning_rate": 0.0009847978653635219,
+      "loss": 0.6161,
+      "step": 6730
+    },
+    {
+      "epoch": 0.34191722404843194,
+      "grad_norm": 0.03167397292683289,
+      "learning_rate": 0.0009847436076505425,
+      "loss": 0.6661,
+      "step": 6735
+    },
+    {
+      "epoch": 0.3421710601464634,
+      "grad_norm": 0.027164902072622888,
+      "learning_rate": 0.0009846892547848106,
+      "loss": 0.6416,
+      "step": 6740
+    },
+    {
+      "epoch": 0.3424248962444949,
+      "grad_norm": 0.032186132345030184,
+      "learning_rate": 0.000984634806776995,
+      "loss": 0.6011,
+      "step": 6745
+    },
+    {
+      "epoch": 0.34267873234252644,
+      "grad_norm": 0.05422054145392441,
+      "learning_rate": 0.0009845802636377834,
+      "loss": 0.619,
+      "step": 6750
+    },
+    {
+      "epoch": 0.3429325684405579,
+      "grad_norm": 0.03126640652473011,
+      "learning_rate": 0.000984525625377883,
+      "loss": 0.663,
+      "step": 6755
+    },
+    {
+      "epoch": 0.34318640453858945,
+      "grad_norm": 0.04178980876855967,
+      "learning_rate": 0.0009844708920080185,
+      "loss": 0.6764,
+      "step": 6760
+    },
+    {
+      "epoch": 0.34344024063662093,
+      "grad_norm": 0.0479771178058185,
+      "learning_rate": 0.000984416063538934,
+      "loss": 0.636,
+      "step": 6765
+    },
+    {
+      "epoch": 0.3436940767346524,
+      "grad_norm": 0.048210015731811606,
+      "learning_rate": 0.0009843611399813921,
+      "loss": 0.6826,
+      "step": 6770
+    },
+    {
+      "epoch": 0.34394791283268394,
+      "grad_norm": 0.040035034533658136,
+      "learning_rate": 0.0009843061213461739,
+      "loss": 0.6551,
+      "step": 6775
+    },
+    {
+      "epoch": 0.3442017489307154,
+      "grad_norm": 0.04450679708387797,
+      "learning_rate": 0.0009842510076440792,
+      "loss": 0.6789,
+      "step": 6780
+    },
+    {
+      "epoch": 0.34445558502874696,
+      "grad_norm": 0.03469053565219521,
+      "learning_rate": 0.0009841957988859268,
+      "loss": 0.636,
+      "step": 6785
+    },
+    {
+      "epoch": 0.34470942112677844,
+      "grad_norm": 0.03854111003127534,
+      "learning_rate": 0.0009841404950825536,
+      "loss": 0.6504,
+      "step": 6790
+    },
+    {
+      "epoch": 0.3449632572248099,
+      "grad_norm": 0.0422532957102432,
+      "learning_rate": 0.0009840850962448157,
+      "loss": 0.7046,
+      "step": 6795
+    },
+    {
+      "epoch": 0.34521709332284145,
+      "grad_norm": 0.042849883678541305,
+      "learning_rate": 0.0009840296023835877,
+      "loss": 0.6982,
+      "step": 6800
+    },
+    {
+      "epoch": 0.34547092942087293,
+      "grad_norm": 0.11970895618204253,
+      "learning_rate": 0.0009839740135097624,
+      "loss": 0.7033,
+      "step": 6805
+    },
+    {
+      "epoch": 0.34572476551890446,
+      "grad_norm": 0.28594499591001454,
+      "learning_rate": 0.0009839183296342518,
+      "loss": 0.6618,
+      "step": 6810
+    },
+    {
+      "epoch": 0.34597860161693594,
+      "grad_norm": 0.09723775531984895,
+      "learning_rate": 0.0009838625507679866,
+      "loss": 0.7034,
+      "step": 6815
+    },
+    {
+      "epoch": 0.3462324377149674,
+      "grad_norm": 0.12532852308987766,
+      "learning_rate": 0.0009838066769219155,
+      "loss": 0.7629,
+      "step": 6820
+    },
+    {
+      "epoch": 0.34648627381299896,
+      "grad_norm": 0.0753799546279816,
+      "learning_rate": 0.0009837507081070064,
+      "loss": 0.7012,
+      "step": 6825
+    },
+    {
+      "epoch": 0.34674010991103044,
+      "grad_norm": 0.23690674591283684,
+      "learning_rate": 0.000983694644334246,
+      "loss": 0.7184,
+      "step": 6830
+    },
+    {
+      "epoch": 0.34699394600906197,
+      "grad_norm": 0.06495848284102379,
+      "learning_rate": 0.000983638485614639,
+      "loss": 0.6394,
+      "step": 6835
+    },
+    {
+      "epoch": 0.34724778210709345,
+      "grad_norm": 0.10113306645676379,
+      "learning_rate": 0.0009835822319592092,
+      "loss": 0.7055,
+      "step": 6840
+    },
+    {
+      "epoch": 0.34750161820512493,
+      "grad_norm": 13.507304609333943,
+      "learning_rate": 0.0009835258833789987,
+      "loss": 0.7508,
+      "step": 6845
+    },
+    {
+      "epoch": 0.34775545430315646,
+      "grad_norm": 0.08682835501016006,
+      "learning_rate": 0.0009834694398850687,
+      "loss": 0.6614,
+      "step": 6850
+    },
+    {
+      "epoch": 0.34800929040118794,
+      "grad_norm": 0.0777945624030565,
+      "learning_rate": 0.000983412901488499,
+      "loss": 0.6523,
+      "step": 6855
+    },
+    {
+      "epoch": 0.3482631264992195,
+      "grad_norm": 0.04009386117281962,
+      "learning_rate": 0.0009833562682003871,
+      "loss": 0.6425,
+      "step": 6860
+    },
+    {
+      "epoch": 0.34851696259725096,
+      "grad_norm": 0.07938149186312117,
+      "learning_rate": 0.0009832995400318506,
+      "loss": 0.6724,
+      "step": 6865
+    },
+    {
+      "epoch": 0.34877079869528244,
+      "grad_norm": 0.0464733613481905,
+      "learning_rate": 0.0009832427169940243,
+      "loss": 0.6723,
+      "step": 6870
+    },
+    {
+      "epoch": 0.34902463479331397,
+      "grad_norm": 0.030541854449360313,
+      "learning_rate": 0.0009831857990980628,
+      "loss": 0.6328,
+      "step": 6875
+    },
+    {
+      "epoch": 0.34927847089134545,
+      "grad_norm": 0.08684468777696315,
+      "learning_rate": 0.0009831287863551386,
+      "loss": 0.6537,
+      "step": 6880
+    },
+    {
+      "epoch": 0.349532306989377,
+      "grad_norm": 0.032667710885275855,
+      "learning_rate": 0.000983071678776443,
+      "loss": 0.6233,
+      "step": 6885
+    },
+    {
+      "epoch": 0.34978614308740846,
+      "grad_norm": 0.06215345815334673,
+      "learning_rate": 0.0009830144763731856,
+      "loss": 0.6523,
+      "step": 6890
+    },
+    {
+      "epoch": 0.35003997918543994,
+      "grad_norm": 0.050425409257167854,
+      "learning_rate": 0.0009829571791565956,
+      "loss": 0.6414,
+      "step": 6895
+    },
+    {
+      "epoch": 0.3502938152834715,
+      "grad_norm": 0.039909942468084936,
+      "learning_rate": 0.0009828997871379197,
+      "loss": 0.6715,
+      "step": 6900
+    },
+    {
+      "epoch": 0.35054765138150296,
+      "grad_norm": 0.03305269203844265,
+      "learning_rate": 0.0009828423003284239,
+      "loss": 0.68,
+      "step": 6905
+    },
+    {
+      "epoch": 0.3508014874795345,
+      "grad_norm": 0.06428144126234532,
+      "learning_rate": 0.0009827847187393924,
+      "loss": 0.6522,
+      "step": 6910
+    },
+    {
+      "epoch": 0.35105532357756597,
+      "grad_norm": 0.040717882474944,
+      "learning_rate": 0.0009827270423821283,
+      "loss": 0.6798,
+      "step": 6915
+    },
+    {
+      "epoch": 0.35130915967559745,
+      "grad_norm": 0.03774641806829989,
+      "learning_rate": 0.000982669271267953,
+      "loss": 0.6249,
+      "step": 6920
+    },
+    {
+      "epoch": 0.351562995773629,
+      "grad_norm": 0.22254368278616823,
+      "learning_rate": 0.000982611405408207,
+      "loss": 0.6375,
+      "step": 6925
+    },
+    {
+      "epoch": 0.35181683187166046,
+      "grad_norm": 0.0492079469548121,
+      "learning_rate": 0.0009825534448142487,
+      "loss": 0.6596,
+      "step": 6930
+    },
+    {
+      "epoch": 0.352070667969692,
+      "grad_norm": 0.030183273462930794,
+      "learning_rate": 0.0009824953894974559,
+      "loss": 0.6492,
+      "step": 6935
+    },
+    {
+      "epoch": 0.3523245040677235,
+      "grad_norm": 0.03485104672445065,
+      "learning_rate": 0.0009824372394692242,
+      "loss": 0.6405,
+      "step": 6940
+    },
+    {
+      "epoch": 0.35257834016575496,
+      "grad_norm": 0.04394880912256667,
+      "learning_rate": 0.0009823789947409685,
+      "loss": 0.6408,
+      "step": 6945
+    },
+    {
+      "epoch": 0.3528321762637865,
+      "grad_norm": 0.062030496326437375,
+      "learning_rate": 0.0009823206553241214,
+      "loss": 0.6799,
+      "step": 6950
+    },
+    {
+      "epoch": 0.35308601236181797,
+      "grad_norm": 0.034400042065388416,
+      "learning_rate": 0.0009822622212301354,
+      "loss": 0.684,
+      "step": 6955
+    },
+    {
+      "epoch": 0.35333984845984945,
+      "grad_norm": 0.04472477328842319,
+      "learning_rate": 0.0009822036924704803,
+      "loss": 0.6524,
+      "step": 6960
+    },
+    {
+      "epoch": 0.353593684557881,
+      "grad_norm": 0.0724030385807204,
+      "learning_rate": 0.000982145069056645,
+      "loss": 0.668,
+      "step": 6965
+    },
+    {
+      "epoch": 0.35384752065591246,
+      "grad_norm": 0.19364536723665782,
+      "learning_rate": 0.000982086351000137,
+      "loss": 0.6629,
+      "step": 6970
+    },
+    {
+      "epoch": 0.354101356753944,
+      "grad_norm": 0.04733731040455226,
+      "learning_rate": 0.0009820275383124826,
+      "loss": 0.6729,
+      "step": 6975
+    },
+    {
+      "epoch": 0.3543551928519755,
+      "grad_norm": 0.041046331221462486,
+      "learning_rate": 0.0009819686310052263,
+      "loss": 0.6494,
+      "step": 6980
+    },
+    {
+      "epoch": 0.35460902895000695,
+      "grad_norm": 0.030904028187576038,
+      "learning_rate": 0.0009819096290899312,
+      "loss": 0.6761,
+      "step": 6985
+    },
+    {
+      "epoch": 0.3548628650480385,
+      "grad_norm": 0.06764767017877209,
+      "learning_rate": 0.0009818505325781793,
+      "loss": 0.6716,
+      "step": 6990
+    },
+    {
+      "epoch": 0.35511670114606997,
+      "grad_norm": 0.028029672458931103,
+      "learning_rate": 0.000981791341481571,
+      "loss": 0.6701,
+      "step": 6995
+    },
+    {
+      "epoch": 0.3553705372441015,
+      "grad_norm": 0.05118116772844429,
+      "learning_rate": 0.0009817320558117247,
+      "loss": 0.663,
+      "step": 7000
+    },
+    {
+      "epoch": 0.355624373342133,
+      "grad_norm": 0.07241085260585647,
+      "learning_rate": 0.0009816726755802784,
+      "loss": 0.6521,
+      "step": 7005
+    },
+    {
+      "epoch": 0.35587820944016446,
+      "grad_norm": 0.041960569266684886,
+      "learning_rate": 0.000981613200798888,
+      "loss": 0.6904,
+      "step": 7010
+    },
+    {
+      "epoch": 0.356132045538196,
+      "grad_norm": 0.052753495247033405,
+      "learning_rate": 0.000981553631479228,
+      "loss": 0.6356,
+      "step": 7015
+    },
+    {
+      "epoch": 0.3563858816362275,
+      "grad_norm": 0.032282399891573395,
+      "learning_rate": 0.0009814939676329917,
+      "loss": 0.6411,
+      "step": 7020
+    },
+    {
+      "epoch": 0.356639717734259,
+      "grad_norm": 0.03581374152863383,
+      "learning_rate": 0.0009814342092718908,
+      "loss": 0.6605,
+      "step": 7025
+    },
+    {
+      "epoch": 0.3568935538322905,
+      "grad_norm": 0.03356418157006434,
+      "learning_rate": 0.0009813743564076557,
+      "loss": 0.6459,
+      "step": 7030
+    },
+    {
+      "epoch": 0.35714738993032197,
+      "grad_norm": 0.027321655354040797,
+      "learning_rate": 0.0009813144090520347,
+      "loss": 0.6455,
+      "step": 7035
+    },
+    {
+      "epoch": 0.3574012260283535,
+      "grad_norm": 0.036750526110344274,
+      "learning_rate": 0.0009812543672167958,
+      "loss": 0.6281,
+      "step": 7040
+    },
+    {
+      "epoch": 0.357655062126385,
+      "grad_norm": 0.03514246272392273,
+      "learning_rate": 0.0009811942309137242,
+      "loss": 0.676,
+      "step": 7045
+    },
+    {
+      "epoch": 0.3579088982244165,
+      "grad_norm": 0.061078619376600155,
+      "learning_rate": 0.0009811340001546253,
+      "loss": 0.66,
+      "step": 7050
+    },
+    {
+      "epoch": 0.358162734322448,
+      "grad_norm": 0.03588660012791982,
+      "learning_rate": 0.0009810736749513212,
+      "loss": 0.6456,
+      "step": 7055
+    },
+    {
+      "epoch": 0.3584165704204795,
+      "grad_norm": 0.032510848869803814,
+      "learning_rate": 0.000981013255315654,
+      "loss": 0.6658,
+      "step": 7060
+    },
+    {
+      "epoch": 0.358670406518511,
+      "grad_norm": 0.03332754665414955,
+      "learning_rate": 0.0009809527412594837,
+      "loss": 0.6503,
+      "step": 7065
+    },
+    {
+      "epoch": 0.3589242426165425,
+      "grad_norm": 0.03149052779731926,
+      "learning_rate": 0.0009808921327946886,
+      "loss": 0.6563,
+      "step": 7070
+    },
+    {
+      "epoch": 0.359178078714574,
+      "grad_norm": 0.04408836704626032,
+      "learning_rate": 0.000980831429933166,
+      "loss": 0.6587,
+      "step": 7075
+    },
+    {
+      "epoch": 0.3594319148126055,
+      "grad_norm": 0.029092555752435204,
+      "learning_rate": 0.0009807706326868317,
+      "loss": 0.6508,
+      "step": 7080
+    },
+    {
+      "epoch": 0.359685750910637,
+      "grad_norm": 0.0310181753692322,
+      "learning_rate": 0.00098070974106762,
+      "loss": 0.6438,
+      "step": 7085
+    },
+    {
+      "epoch": 0.3599395870086685,
+      "grad_norm": 0.04993998965712036,
+      "learning_rate": 0.0009806487550874832,
+      "loss": 0.635,
+      "step": 7090
+    },
+    {
+      "epoch": 0.3601934231067,
+      "grad_norm": 0.04501979297683468,
+      "learning_rate": 0.0009805876747583928,
+      "loss": 0.6353,
+      "step": 7095
+    },
+    {
+      "epoch": 0.36044725920473153,
+      "grad_norm": 0.04933885261852858,
+      "learning_rate": 0.0009805265000923384,
+      "loss": 0.6906,
+      "step": 7100
+    },
+    {
+      "epoch": 0.360701095302763,
+      "grad_norm": 0.025031721543288105,
+      "learning_rate": 0.0009804652311013286,
+      "loss": 0.62,
+      "step": 7105
+    },
+    {
+      "epoch": 0.3609549314007945,
+      "grad_norm": 0.03681142313849968,
+      "learning_rate": 0.00098040386779739,
+      "loss": 0.6725,
+      "step": 7110
+    },
+    {
+      "epoch": 0.361208767498826,
+      "grad_norm": 0.02848773898790337,
+      "learning_rate": 0.0009803424101925678,
+      "loss": 0.6265,
+      "step": 7115
+    },
+    {
+      "epoch": 0.3614626035968575,
+      "grad_norm": 0.0629595006387732,
+      "learning_rate": 0.000980280858298926,
+      "loss": 0.6621,
+      "step": 7120
+    },
+    {
+      "epoch": 0.36171643969488904,
+      "grad_norm": 0.031817556084941155,
+      "learning_rate": 0.000980219212128547,
+      "loss": 0.6269,
+      "step": 7125
+    },
+    {
+      "epoch": 0.3619702757929205,
+      "grad_norm": 0.03482414094775657,
+      "learning_rate": 0.0009801574716935314,
+      "loss": 0.6279,
+      "step": 7130
+    },
+    {
+      "epoch": 0.362224111890952,
+      "grad_norm": 0.02962568942750712,
+      "learning_rate": 0.0009800956370059986,
+      "loss": 0.6858,
+      "step": 7135
+    },
+    {
+      "epoch": 0.36247794798898353,
+      "grad_norm": 0.0303541560939534,
+      "learning_rate": 0.0009800337080780866,
+      "loss": 0.6156,
+      "step": 7140
+    },
+    {
+      "epoch": 0.362731784087015,
+      "grad_norm": 0.033517980729787206,
+      "learning_rate": 0.0009799716849219515,
+      "loss": 0.6849,
+      "step": 7145
+    },
+    {
+      "epoch": 0.36298562018504654,
+      "grad_norm": 0.03930435599887139,
+      "learning_rate": 0.0009799095675497684,
+      "loss": 0.6508,
+      "step": 7150
+    },
+    {
+      "epoch": 0.363239456283078,
+      "grad_norm": 0.03507509033957748,
+      "learning_rate": 0.0009798473559737304,
+      "loss": 0.6221,
+      "step": 7155
+    },
+    {
+      "epoch": 0.3634932923811095,
+      "grad_norm": 0.0345726788751751,
+      "learning_rate": 0.0009797850502060495,
+      "loss": 0.6244,
+      "step": 7160
+    },
+    {
+      "epoch": 0.36374712847914104,
+      "grad_norm": 0.031654784953979505,
+      "learning_rate": 0.0009797226502589558,
+      "loss": 0.6224,
+      "step": 7165
+    },
+    {
+      "epoch": 0.3640009645771725,
+      "grad_norm": 0.03475111940127676,
+      "learning_rate": 0.0009796601561446983,
+      "loss": 0.604,
+      "step": 7170
+    },
+    {
+      "epoch": 0.364254800675204,
+      "grad_norm": 0.03356801085531049,
+      "learning_rate": 0.0009795975678755441,
+      "loss": 0.6165,
+      "step": 7175
+    },
+    {
+      "epoch": 0.36450863677323553,
+      "grad_norm": 0.030202402409465263,
+      "learning_rate": 0.0009795348854637793,
+      "loss": 0.6357,
+      "step": 7180
+    },
+    {
+      "epoch": 0.364762472871267,
+      "grad_norm": 0.042891378351094925,
+      "learning_rate": 0.0009794721089217077,
+      "loss": 0.6639,
+      "step": 7185
+    },
+    {
+      "epoch": 0.36501630896929854,
+      "grad_norm": 0.026781555972239687,
+      "learning_rate": 0.0009794092382616525,
+      "loss": 0.6193,
+      "step": 7190
+    },
+    {
+      "epoch": 0.36527014506733,
+      "grad_norm": 0.058555352910862106,
+      "learning_rate": 0.0009793462734959545,
+      "loss": 0.6684,
+      "step": 7195
+    },
+    {
+      "epoch": 0.3655239811653615,
+      "grad_norm": 0.04710681738488992,
+      "learning_rate": 0.0009792832146369734,
+      "loss": 0.6433,
+      "step": 7200
+    },
+    {
+      "epoch": 0.36577781726339303,
+      "grad_norm": 0.05005107749394292,
+      "learning_rate": 0.0009792200616970876,
+      "loss": 0.7054,
+      "step": 7205
+    },
+    {
+      "epoch": 0.3660316533614245,
+      "grad_norm": 0.03323686626122536,
+      "learning_rate": 0.0009791568146886936,
+      "loss": 0.6797,
+      "step": 7210
+    },
+    {
+      "epoch": 0.36628548945945605,
+      "grad_norm": 0.03185618160464647,
+      "learning_rate": 0.0009790934736242064,
+      "loss": 0.6476,
+      "step": 7215
+    },
+    {
+      "epoch": 0.3665393255574875,
+      "grad_norm": 0.03692950551425256,
+      "learning_rate": 0.0009790300385160594,
+      "loss": 0.6653,
+      "step": 7220
+    },
+    {
+      "epoch": 0.366793161655519,
+      "grad_norm": 0.03377555225120197,
+      "learning_rate": 0.0009789665093767048,
+      "loss": 0.6571,
+      "step": 7225
+    },
+    {
+      "epoch": 0.36704699775355054,
+      "grad_norm": 0.029454638958564077,
+      "learning_rate": 0.000978902886218613,
+      "loss": 0.6743,
+      "step": 7230
+    },
+    {
+      "epoch": 0.367300833851582,
+      "grad_norm": 0.05946350290356885,
+      "learning_rate": 0.000978839169054273,
+      "loss": 0.6702,
+      "step": 7235
+    },
+    {
+      "epoch": 0.36755466994961355,
+      "grad_norm": 0.05244370262839264,
+      "learning_rate": 0.0009787753578961922,
+      "loss": 0.6083,
+      "step": 7240
+    },
+    {
+      "epoch": 0.36780850604764503,
+      "grad_norm": 0.03264974043837502,
+      "learning_rate": 0.0009787114527568962,
+      "loss": 0.6166,
+      "step": 7245
+    },
+    {
+      "epoch": 0.3680623421456765,
+      "grad_norm": 0.03513830130194355,
+      "learning_rate": 0.0009786474536489292,
+      "loss": 0.6634,
+      "step": 7250
+    },
+    {
+      "epoch": 0.36831617824370805,
+      "grad_norm": 0.029479205631305596,
+      "learning_rate": 0.0009785833605848542,
+      "loss": 0.636,
+      "step": 7255
+    },
+    {
+      "epoch": 0.3685700143417395,
+      "grad_norm": 0.032292807860328114,
+      "learning_rate": 0.0009785191735772521,
+      "loss": 0.6858,
+      "step": 7260
+    },
+    {
+      "epoch": 0.36882385043977106,
+      "grad_norm": 0.04406918705243669,
+      "learning_rate": 0.0009784548926387226,
+      "loss": 0.6267,
+      "step": 7265
+    },
+    {
+      "epoch": 0.36907768653780254,
+      "grad_norm": 0.03484394221767657,
+      "learning_rate": 0.000978390517781884,
+      "loss": 0.6394,
+      "step": 7270
+    },
+    {
+      "epoch": 0.369331522635834,
+      "grad_norm": 0.03455567149269723,
+      "learning_rate": 0.0009783260490193722,
+      "loss": 0.657,
+      "step": 7275
+    },
+    {
+      "epoch": 0.36958535873386555,
+      "grad_norm": 0.0553757152489157,
+      "learning_rate": 0.0009782614863638424,
+      "loss": 0.6519,
+      "step": 7280
+    },
+    {
+      "epoch": 0.36983919483189703,
+      "grad_norm": 0.03087746692822628,
+      "learning_rate": 0.000978196829827968,
+      "loss": 0.6106,
+      "step": 7285
+    },
+    {
+      "epoch": 0.37009303092992857,
+      "grad_norm": 0.028942872631375818,
+      "learning_rate": 0.0009781320794244408,
+      "loss": 0.689,
+      "step": 7290
+    },
+    {
+      "epoch": 0.37034686702796005,
+      "grad_norm": 0.05560542668705997,
+      "learning_rate": 0.0009780672351659707,
+      "loss": 0.6009,
+      "step": 7295
+    },
+    {
+      "epoch": 0.3706007031259915,
+      "grad_norm": 0.03597874219905048,
+      "learning_rate": 0.0009780022970652864,
+      "loss": 0.675,
+      "step": 7300
+    },
+    {
+      "epoch": 0.37085453922402306,
+      "grad_norm": 0.041750636143700906,
+      "learning_rate": 0.000977937265135135,
+      "loss": 0.6347,
+      "step": 7305
+    },
+    {
+      "epoch": 0.37110837532205454,
+      "grad_norm": 0.03053212587846297,
+      "learning_rate": 0.000977872139388282,
+      "loss": 0.6547,
+      "step": 7310
+    },
+    {
+      "epoch": 0.3713622114200861,
+      "grad_norm": 0.04623962660097538,
+      "learning_rate": 0.0009778069198375112,
+      "loss": 0.6296,
+      "step": 7315
+    },
+    {
+      "epoch": 0.37161604751811755,
+      "grad_norm": 0.053173942447247664,
+      "learning_rate": 0.0009777416064956248,
+      "loss": 0.6557,
+      "step": 7320
+    },
+    {
+      "epoch": 0.37186988361614903,
+      "grad_norm": 0.04322723325958535,
+      "learning_rate": 0.0009776761993754435,
+      "loss": 0.6506,
+      "step": 7325
+    },
+    {
+      "epoch": 0.37212371971418057,
+      "grad_norm": 0.054474373793746526,
+      "learning_rate": 0.0009776106984898066,
+      "loss": 0.6337,
+      "step": 7330
+    },
+    {
+      "epoch": 0.37237755581221205,
+      "grad_norm": 0.07566751344663698,
+      "learning_rate": 0.0009775451038515712,
+      "loss": 0.5778,
+      "step": 7335
+    },
+    {
+      "epoch": 0.3726313919102436,
+      "grad_norm": 0.05812108518347267,
+      "learning_rate": 0.0009774794154736135,
+      "loss": 0.6431,
+      "step": 7340
+    },
+    {
+      "epoch": 0.37288522800827506,
+      "grad_norm": 0.0569408117979321,
+      "learning_rate": 0.0009774136333688278,
+      "loss": 0.6382,
+      "step": 7345
+    },
+    {
+      "epoch": 0.37313906410630654,
+      "grad_norm": 0.04610218065404068,
+      "learning_rate": 0.0009773477575501265,
+      "loss": 0.6282,
+      "step": 7350
+    },
+    {
+      "epoch": 0.3733929002043381,
+      "grad_norm": 0.02735821778929375,
+      "learning_rate": 0.0009772817880304412,
+      "loss": 0.639,
+      "step": 7355
+    },
+    {
+      "epoch": 0.37364673630236955,
+      "grad_norm": 0.02996563286462766,
+      "learning_rate": 0.0009772157248227212,
+      "loss": 0.643,
+      "step": 7360
+    },
+    {
+      "epoch": 0.3739005724004011,
+      "grad_norm": 0.03578755871655436,
+      "learning_rate": 0.000977149567939934,
+      "loss": 0.6244,
+      "step": 7365
+    },
+    {
+      "epoch": 0.37415440849843257,
+      "grad_norm": 0.03305618805759151,
+      "learning_rate": 0.0009770833173950663,
+      "loss": 0.6467,
+      "step": 7370
+    },
+    {
+      "epoch": 0.37440824459646405,
+      "grad_norm": 0.030303389029491416,
+      "learning_rate": 0.0009770169732011224,
+      "loss": 0.6439,
+      "step": 7375
+    },
+    {
+      "epoch": 0.3746620806944956,
+      "grad_norm": 0.028992948847689715,
+      "learning_rate": 0.000976950535371126,
+      "loss": 0.6306,
+      "step": 7380
+    },
+    {
+      "epoch": 0.37491591679252706,
+      "grad_norm": 0.03888889527619476,
+      "learning_rate": 0.0009768840039181177,
+      "loss": 0.6581,
+      "step": 7385
+    },
+    {
+      "epoch": 0.37516975289055854,
+      "grad_norm": 0.031228572216811263,
+      "learning_rate": 0.0009768173788551576,
+      "loss": 0.6081,
+      "step": 7390
+    },
+    {
+      "epoch": 0.3754235889885901,
+      "grad_norm": 0.03181486445785271,
+      "learning_rate": 0.000976750660195324,
+      "loss": 0.6339,
+      "step": 7395
+    },
+    {
+      "epoch": 0.37567742508662155,
+      "grad_norm": 0.031669584990155854,
+      "learning_rate": 0.0009766838479517133,
+      "loss": 0.626,
+      "step": 7400
+    },
+    {
+      "epoch": 0.3759312611846531,
+      "grad_norm": 0.03222078035458589,
+      "learning_rate": 0.0009766169421374406,
+      "loss": 0.6635,
+      "step": 7405
+    },
+    {
+      "epoch": 0.37618509728268457,
+      "grad_norm": 0.02957011725240359,
+      "learning_rate": 0.000976549942765639,
+      "loss": 0.6494,
+      "step": 7410
+    },
+    {
+      "epoch": 0.37643893338071605,
+      "grad_norm": 0.03590678349168216,
+      "learning_rate": 0.0009764828498494602,
+      "loss": 0.6354,
+      "step": 7415
+    },
+    {
+      "epoch": 0.3766927694787476,
+      "grad_norm": 0.03220030735871836,
+      "learning_rate": 0.0009764156634020742,
+      "loss": 0.6464,
+      "step": 7420
+    },
+    {
+      "epoch": 0.37694660557677906,
+      "grad_norm": 0.03607797476983635,
+      "learning_rate": 0.0009763483834366693,
+      "loss": 0.6277,
+      "step": 7425
+    },
+    {
+      "epoch": 0.3772004416748106,
+      "grad_norm": 0.11467843343234524,
+      "learning_rate": 0.0009762810099664523,
+      "loss": 0.6575,
+      "step": 7430
+    },
+    {
+      "epoch": 0.3774542777728421,
+      "grad_norm": 0.03779371870962273,
+      "learning_rate": 0.0009762135430046483,
+      "loss": 0.6564,
+      "step": 7435
+    },
+    {
+      "epoch": 0.37770811387087355,
+      "grad_norm": 0.03333634417865055,
+      "learning_rate": 0.0009761459825645006,
+      "loss": 0.6516,
+      "step": 7440
+    },
+    {
+      "epoch": 0.3779619499689051,
+      "grad_norm": 0.03352151545779738,
+      "learning_rate": 0.0009760783286592711,
+      "loss": 0.628,
+      "step": 7445
+    },
+    {
+      "epoch": 0.37821578606693657,
+      "grad_norm": 0.031390578882334645,
+      "learning_rate": 0.0009760105813022399,
+      "loss": 0.6631,
+      "step": 7450
+    },
+    {
+      "epoch": 0.3784696221649681,
+      "grad_norm": 0.03126775917347245,
+      "learning_rate": 0.0009759427405067054,
+      "loss": 0.6296,
+      "step": 7455
+    },
+    {
+      "epoch": 0.3787234582629996,
+      "grad_norm": 0.03189164977628853,
+      "learning_rate": 0.0009758748062859844,
+      "loss": 0.614,
+      "step": 7460
+    },
+    {
+      "epoch": 0.37897729436103106,
+      "grad_norm": 0.03924473622722979,
+      "learning_rate": 0.0009758067786534123,
+      "loss": 0.648,
+      "step": 7465
+    },
+    {
+      "epoch": 0.3792311304590626,
+      "grad_norm": 0.030124031962455097,
+      "learning_rate": 0.0009757386576223423,
+      "loss": 0.6831,
+      "step": 7470
+    },
+    {
+      "epoch": 0.37948496655709407,
+      "grad_norm": 0.026069578510561218,
+      "learning_rate": 0.0009756704432061463,
+      "loss": 0.6449,
+      "step": 7475
+    },
+    {
+      "epoch": 0.3797388026551256,
+      "grad_norm": 0.04496395917283666,
+      "learning_rate": 0.0009756021354182145,
+      "loss": 0.6236,
+      "step": 7480
+    },
+    {
+      "epoch": 0.3799926387531571,
+      "grad_norm": 0.03492290304715105,
+      "learning_rate": 0.0009755337342719552,
+      "loss": 0.6452,
+      "step": 7485
+    },
+    {
+      "epoch": 0.38024647485118857,
+      "grad_norm": 0.03119933363654312,
+      "learning_rate": 0.0009754652397807955,
+      "loss": 0.697,
+      "step": 7490
+    },
+    {
+      "epoch": 0.3805003109492201,
+      "grad_norm": 0.025025130639123577,
+      "learning_rate": 0.0009753966519581803,
+      "loss": 0.5946,
+      "step": 7495
+    },
+    {
+      "epoch": 0.3807541470472516,
+      "grad_norm": 0.028756961098509868,
+      "learning_rate": 0.0009753279708175731,
+      "loss": 0.6274,
+      "step": 7500
+    },
+    {
+      "epoch": 0.3810079831452831,
+      "grad_norm": 0.04267829613214311,
+      "learning_rate": 0.0009752591963724558,
+      "loss": 0.6514,
+      "step": 7505
+    },
+    {
+      "epoch": 0.3812618192433146,
+      "grad_norm": 0.027804227185939587,
+      "learning_rate": 0.0009751903286363283,
+      "loss": 0.6459,
+      "step": 7510
+    },
+    {
+      "epoch": 0.38151565534134607,
+      "grad_norm": 0.0368399353284197,
+      "learning_rate": 0.0009751213676227091,
+      "loss": 0.6143,
+      "step": 7515
+    },
+    {
+      "epoch": 0.3817694914393776,
+      "grad_norm": 0.03566837281270354,
+      "learning_rate": 0.0009750523133451348,
+      "loss": 0.6591,
+      "step": 7520
+    },
+    {
+      "epoch": 0.3820233275374091,
+      "grad_norm": 0.04104675754438289,
+      "learning_rate": 0.0009749831658171605,
+      "loss": 0.6285,
+      "step": 7525
+    },
+    {
+      "epoch": 0.3822771636354406,
+      "grad_norm": 0.03463924815280617,
+      "learning_rate": 0.0009749139250523596,
+      "loss": 0.6361,
+      "step": 7530
+    },
+    {
+      "epoch": 0.3825309997334721,
+      "grad_norm": 0.0371233837488764,
+      "learning_rate": 0.0009748445910643233,
+      "loss": 0.6529,
+      "step": 7535
+    },
+    {
+      "epoch": 0.3827848358315036,
+      "grad_norm": 0.030735573426834636,
+      "learning_rate": 0.000974775163866662,
+      "loss": 0.6463,
+      "step": 7540
+    },
+    {
+      "epoch": 0.3830386719295351,
+      "grad_norm": 0.028788082474888697,
+      "learning_rate": 0.0009747056434730037,
+      "loss": 0.7047,
+      "step": 7545
+    },
+    {
+      "epoch": 0.3832925080275666,
+      "grad_norm": 0.34285998838897536,
+      "learning_rate": 0.0009746360298969951,
+      "loss": 0.6438,
+      "step": 7550
+    },
+    {
+      "epoch": 0.3835463441255981,
+      "grad_norm": 0.04150800275275346,
+      "learning_rate": 0.0009745663231523008,
+      "loss": 0.6424,
+      "step": 7555
+    },
+    {
+      "epoch": 0.3838001802236296,
+      "grad_norm": 0.08071417524310523,
+      "learning_rate": 0.0009744965232526037,
+      "loss": 0.6581,
+      "step": 7560
+    },
+    {
+      "epoch": 0.3840540163216611,
+      "grad_norm": 0.03292538278471775,
+      "learning_rate": 0.0009744266302116056,
+      "loss": 0.6429,
+      "step": 7565
+    },
+    {
+      "epoch": 0.3843078524196926,
+      "grad_norm": 0.0553091431056109,
+      "learning_rate": 0.0009743566440430258,
+      "loss": 0.6247,
+      "step": 7570
+    },
+    {
+      "epoch": 0.3845616885177241,
+      "grad_norm": 0.0576869905147013,
+      "learning_rate": 0.0009742865647606025,
+      "loss": 0.6362,
+      "step": 7575
+    },
+    {
+      "epoch": 0.38481552461575563,
+      "grad_norm": 0.054333812412449035,
+      "learning_rate": 0.0009742163923780918,
+      "loss": 0.6644,
+      "step": 7580
+    },
+    {
+      "epoch": 0.3850693607137871,
+      "grad_norm": 0.0458876312027763,
+      "learning_rate": 0.0009741461269092682,
+      "loss": 0.6174,
+      "step": 7585
+    },
+    {
+      "epoch": 0.3853231968118186,
+      "grad_norm": 0.026346604004534237,
+      "learning_rate": 0.0009740757683679244,
+      "loss": 0.6396,
+      "step": 7590
+    },
+    {
+      "epoch": 0.3855770329098501,
+      "grad_norm": 0.026408511013485066,
+      "learning_rate": 0.0009740053167678715,
+      "loss": 0.641,
+      "step": 7595
+    },
+    {
+      "epoch": 0.3858308690078816,
+      "grad_norm": 0.027408625330890064,
+      "learning_rate": 0.0009739347721229388,
+      "loss": 0.6739,
+      "step": 7600
+    },
+    {
+      "epoch": 0.38608470510591314,
+      "grad_norm": 0.0261075942022835,
+      "learning_rate": 0.0009738641344469737,
+      "loss": 0.6311,
+      "step": 7605
+    },
+    {
+      "epoch": 0.3863385412039446,
+      "grad_norm": 0.035196730638145504,
+      "learning_rate": 0.0009737934037538422,
+      "loss": 0.6283,
+      "step": 7610
+    },
+    {
+      "epoch": 0.3865923773019761,
+      "grad_norm": 0.029985199254207513,
+      "learning_rate": 0.0009737225800574285,
+      "loss": 0.5949,
+      "step": 7615
+    },
+    {
+      "epoch": 0.38684621340000763,
+      "grad_norm": 0.0314143972278088,
+      "learning_rate": 0.0009736516633716348,
+      "loss": 0.6309,
+      "step": 7620
+    },
+    {
+      "epoch": 0.3871000494980391,
+      "grad_norm": 0.040375384651748156,
+      "learning_rate": 0.0009735806537103815,
+      "loss": 0.6346,
+      "step": 7625
+    },
+    {
+      "epoch": 0.3873538855960706,
+      "grad_norm": 0.030875166168935275,
+      "learning_rate": 0.0009735095510876077,
+      "loss": 0.6188,
+      "step": 7630
+    },
+    {
+      "epoch": 0.3876077216941021,
+      "grad_norm": 0.035657526104679045,
+      "learning_rate": 0.0009734383555172705,
+      "loss": 0.6334,
+      "step": 7635
+    },
+    {
+      "epoch": 0.3878615577921336,
+      "grad_norm": 0.03726651149611401,
+      "learning_rate": 0.000973367067013345,
+      "loss": 0.6194,
+      "step": 7640
+    },
+    {
+      "epoch": 0.38811539389016514,
+      "grad_norm": 0.0549306418678483,
+      "learning_rate": 0.000973295685589825,
+      "loss": 0.6289,
+      "step": 7645
+    },
+    {
+      "epoch": 0.3883692299881966,
+      "grad_norm": 0.029283307487707237,
+      "learning_rate": 0.0009732242112607222,
+      "loss": 0.6263,
+      "step": 7650
+    },
+    {
+      "epoch": 0.3886230660862281,
+      "grad_norm": 0.03836427004065547,
+      "learning_rate": 0.0009731526440400667,
+      "loss": 0.671,
+      "step": 7655
+    },
+    {
+      "epoch": 0.38887690218425963,
+      "grad_norm": 0.03320166265686439,
+      "learning_rate": 0.0009730809839419069,
+      "loss": 0.679,
+      "step": 7660
+    },
+    {
+      "epoch": 0.3891307382822911,
+      "grad_norm": 0.03903157209354159,
+      "learning_rate": 0.0009730092309803091,
+      "loss": 0.6721,
+      "step": 7665
+    },
+    {
+      "epoch": 0.38938457438032265,
+      "grad_norm": 0.02937314090643266,
+      "learning_rate": 0.0009729373851693581,
+      "loss": 0.619,
+      "step": 7670
+    },
+    {
+      "epoch": 0.3896384104783541,
+      "grad_norm": 0.0567425226530842,
+      "learning_rate": 0.000972865446523157,
+      "loss": 0.6287,
+      "step": 7675
+    },
+    {
+      "epoch": 0.3898922465763856,
+      "grad_norm": 0.03706531779273381,
+      "learning_rate": 0.000972793415055827,
+      "loss": 0.6683,
+      "step": 7680
+    },
+    {
+      "epoch": 0.39014608267441714,
+      "grad_norm": 0.034876406305835016,
+      "learning_rate": 0.0009727212907815072,
+      "loss": 0.6199,
+      "step": 7685
+    },
+    {
+      "epoch": 0.3903999187724486,
+      "grad_norm": 0.04177776494427368,
+      "learning_rate": 0.0009726490737143557,
+      "loss": 0.6593,
+      "step": 7690
+    },
+    {
+      "epoch": 0.39065375487048015,
+      "grad_norm": 0.029493444763321025,
+      "learning_rate": 0.0009725767638685481,
+      "loss": 0.6818,
+      "step": 7695
+    },
+    {
+      "epoch": 0.39090759096851163,
+      "grad_norm": 0.05797901330208343,
+      "learning_rate": 0.0009725043612582785,
+      "loss": 0.6558,
+      "step": 7700
+    },
+    {
+      "epoch": 0.3911614270665431,
+      "grad_norm": 0.03060632068082371,
+      "learning_rate": 0.0009724318658977591,
+      "loss": 0.6483,
+      "step": 7705
+    },
+    {
+      "epoch": 0.39141526316457464,
+      "grad_norm": 0.029533842342163073,
+      "learning_rate": 0.0009723592778012205,
+      "loss": 0.5882,
+      "step": 7710
+    },
+    {
+      "epoch": 0.3916690992626061,
+      "grad_norm": 0.04016752549164152,
+      "learning_rate": 0.0009722865969829111,
+      "loss": 0.6289,
+      "step": 7715
+    },
+    {
+      "epoch": 0.39192293536063766,
+      "grad_norm": 0.02740885040789266,
+      "learning_rate": 0.0009722138234570983,
+      "loss": 0.6119,
+      "step": 7720
+    },
+    {
+      "epoch": 0.39217677145866914,
+      "grad_norm": 0.027711495610382927,
+      "learning_rate": 0.0009721409572380666,
+      "loss": 0.6439,
+      "step": 7725
+    },
+    {
+      "epoch": 0.3924306075567006,
+      "grad_norm": 0.03504746630633313,
+      "learning_rate": 0.0009720679983401197,
+      "loss": 0.6287,
+      "step": 7730
+    },
+    {
+      "epoch": 0.39268444365473215,
+      "grad_norm": 0.030714782507982694,
+      "learning_rate": 0.0009719949467775791,
+      "loss": 0.6305,
+      "step": 7735
+    },
+    {
+      "epoch": 0.39293827975276363,
+      "grad_norm": 0.05519161310942375,
+      "learning_rate": 0.000971921802564784,
+      "loss": 0.6733,
+      "step": 7740
+    },
+    {
+      "epoch": 0.39319211585079517,
+      "grad_norm": 0.035905400393446016,
+      "learning_rate": 0.0009718485657160927,
+      "loss": 0.6309,
+      "step": 7745
+    },
+    {
+      "epoch": 0.39344595194882664,
+      "grad_norm": 0.05482351027401504,
+      "learning_rate": 0.000971775236245881,
+      "loss": 0.6257,
+      "step": 7750
+    },
+    {
+      "epoch": 0.3936997880468581,
+      "grad_norm": 0.033754877354996,
+      "learning_rate": 0.0009717018141685432,
+      "loss": 0.6465,
+      "step": 7755
+    },
+    {
+      "epoch": 0.39395362414488966,
+      "grad_norm": 0.03394880260163253,
+      "learning_rate": 0.0009716282994984915,
+      "loss": 0.6321,
+      "step": 7760
+    },
+    {
+      "epoch": 0.39420746024292114,
+      "grad_norm": 0.06103387015592127,
+      "learning_rate": 0.0009715546922501568,
+      "loss": 0.5975,
+      "step": 7765
+    },
+    {
+      "epoch": 0.39446129634095267,
+      "grad_norm": 0.05594462999211319,
+      "learning_rate": 0.0009714809924379875,
+      "loss": 0.6332,
+      "step": 7770
+    },
+    {
+      "epoch": 0.39471513243898415,
+      "grad_norm": 0.028505003978790574,
+      "learning_rate": 0.0009714072000764508,
+      "loss": 0.6577,
+      "step": 7775
+    },
+    {
+      "epoch": 0.39496896853701563,
+      "grad_norm": 0.028286990335348833,
+      "learning_rate": 0.0009713333151800315,
+      "loss": 0.6279,
+      "step": 7780
+    },
+    {
+      "epoch": 0.39522280463504716,
+      "grad_norm": 0.030106626846128356,
+      "learning_rate": 0.0009712593377632331,
+      "loss": 0.6379,
+      "step": 7785
+    },
+    {
+      "epoch": 0.39547664073307864,
+      "grad_norm": 0.03861700452164129,
+      "learning_rate": 0.0009711852678405768,
+      "loss": 0.6545,
+      "step": 7790
+    },
+    {
+      "epoch": 0.3957304768311102,
+      "grad_norm": 0.02838597618173486,
+      "learning_rate": 0.0009711111054266022,
+      "loss": 0.6671,
+      "step": 7795
+    },
+    {
+      "epoch": 0.39598431292914166,
+      "grad_norm": 0.034442366907157136,
+      "learning_rate": 0.000971036850535867,
+      "loss": 0.7508,
+      "step": 7800
+    },
+    {
+      "epoch": 0.39623814902717314,
+      "grad_norm": 0.11425516985484442,
+      "learning_rate": 0.0009709625031829473,
+      "loss": 0.66,
+      "step": 7805
+    },
+    {
+      "epoch": 0.39649198512520467,
+      "grad_norm": 0.04883647009890089,
+      "learning_rate": 0.0009708880633824366,
+      "loss": 0.644,
+      "step": 7810
+    },
+    {
+      "epoch": 0.39674582122323615,
+      "grad_norm": 0.039289664485130486,
+      "learning_rate": 0.0009708135311489475,
+      "loss": 0.6813,
+      "step": 7815
+    },
+    {
+      "epoch": 0.3969996573212677,
+      "grad_norm": 0.0375152643384516,
+      "learning_rate": 0.0009707389064971102,
+      "loss": 0.6693,
+      "step": 7820
+    },
+    {
+      "epoch": 0.39725349341929916,
+      "grad_norm": 0.03153109087788052,
+      "learning_rate": 0.0009706641894415731,
+      "loss": 0.6824,
+      "step": 7825
+    },
+    {
+      "epoch": 0.39750732951733064,
+      "grad_norm": 0.031082795917225698,
+      "learning_rate": 0.0009705893799970029,
+      "loss": 0.6108,
+      "step": 7830
+    },
+    {
+      "epoch": 0.3977611656153622,
+      "grad_norm": 0.04178387644138036,
+      "learning_rate": 0.0009705144781780842,
+      "loss": 0.6963,
+      "step": 7835
+    },
+    {
+      "epoch": 0.39801500171339366,
+      "grad_norm": 0.06903963849411496,
+      "learning_rate": 0.0009704394839995198,
+      "loss": 0.6825,
+      "step": 7840
+    },
+    {
+      "epoch": 0.39826883781142514,
+      "grad_norm": 0.03484218684833334,
+      "learning_rate": 0.0009703643974760307,
+      "loss": 0.6589,
+      "step": 7845
+    },
+    {
+      "epoch": 0.39852267390945667,
+      "grad_norm": 0.045063431620059595,
+      "learning_rate": 0.0009702892186223564,
+      "loss": 0.5665,
+      "step": 7850
+    },
+    {
+      "epoch": 0.39877651000748815,
+      "grad_norm": 0.029449955491035615,
+      "learning_rate": 0.0009702139474532536,
+      "loss": 0.6465,
+      "step": 7855
+    },
+    {
+      "epoch": 0.3990303461055197,
+      "grad_norm": 0.03916288270704511,
+      "learning_rate": 0.0009701385839834979,
+      "loss": 0.6582,
+      "step": 7860
+    },
+    {
+      "epoch": 0.39928418220355116,
+      "grad_norm": 0.04166533905758244,
+      "learning_rate": 0.0009700631282278827,
+      "loss": 0.6625,
+      "step": 7865
+    },
+    {
+      "epoch": 0.39953801830158264,
+      "grad_norm": 0.024806646516510142,
+      "learning_rate": 0.0009699875802012197,
+      "loss": 0.6456,
+      "step": 7870
+    },
+    {
+      "epoch": 0.3997918543996142,
+      "grad_norm": 0.0349710848699053,
+      "learning_rate": 0.0009699119399183385,
+      "loss": 0.6463,
+      "step": 7875
+    },
+    {
+      "epoch": 0.40004569049764566,
+      "grad_norm": 0.031334987561882265,
+      "learning_rate": 0.0009698362073940869,
+      "loss": 0.6219,
+      "step": 7880
+    },
+    {
+      "epoch": 0.4002995265956772,
+      "grad_norm": 0.031472418909322404,
+      "learning_rate": 0.0009697603826433308,
+      "loss": 0.6228,
+      "step": 7885
+    },
+    {
+      "epoch": 0.40055336269370867,
+      "grad_norm": 0.037854618230179916,
+      "learning_rate": 0.0009696844656809545,
+      "loss": 0.6558,
+      "step": 7890
+    },
+    {
+      "epoch": 0.40080719879174015,
+      "grad_norm": 0.11265157750996117,
+      "learning_rate": 0.0009696084565218597,
+      "loss": 0.6325,
+      "step": 7895
+    },
+    {
+      "epoch": 0.4010610348897717,
+      "grad_norm": 0.031122545622557275,
+      "learning_rate": 0.0009695323551809669,
+      "loss": 0.6229,
+      "step": 7900
+    },
+    {
+      "epoch": 0.40131487098780316,
+      "grad_norm": 0.04038263274235042,
+      "learning_rate": 0.0009694561616732143,
+      "loss": 0.6507,
+      "step": 7905
+    },
+    {
+      "epoch": 0.4015687070858347,
+      "grad_norm": 0.029612453037692527,
+      "learning_rate": 0.0009693798760135584,
+      "loss": 0.6059,
+      "step": 7910
+    },
+    {
+      "epoch": 0.4018225431838662,
+      "grad_norm": 0.032308274222676875,
+      "learning_rate": 0.0009693034982169735,
+      "loss": 0.6363,
+      "step": 7915
+    },
+    {
+      "epoch": 0.40207637928189766,
+      "grad_norm": 0.05317177280278271,
+      "learning_rate": 0.0009692270282984525,
+      "loss": 0.6503,
+      "step": 7920
+    },
+    {
+      "epoch": 0.4023302153799292,
+      "grad_norm": 0.04291373669655286,
+      "learning_rate": 0.0009691504662730058,
+      "loss": 0.6311,
+      "step": 7925
+    },
+    {
+      "epoch": 0.40258405147796067,
+      "grad_norm": 0.03662642484263608,
+      "learning_rate": 0.0009690738121556621,
+      "loss": 0.6753,
+      "step": 7930
+    },
+    {
+      "epoch": 0.4028378875759922,
+      "grad_norm": 0.035393766402028314,
+      "learning_rate": 0.0009689970659614684,
+      "loss": 0.6479,
+      "step": 7935
+    },
+    {
+      "epoch": 0.4030917236740237,
+      "grad_norm": 0.03128196520109021,
+      "learning_rate": 0.0009689202277054896,
+      "loss": 0.671,
+      "step": 7940
+    },
+    {
+      "epoch": 0.40334555977205516,
+      "grad_norm": 0.03677958165657406,
+      "learning_rate": 0.0009688432974028085,
+      "loss": 0.6932,
+      "step": 7945
+    },
+    {
+      "epoch": 0.4035993958700867,
+      "grad_norm": 0.02727568027543021,
+      "learning_rate": 0.0009687662750685265,
+      "loss": 0.6504,
+      "step": 7950
+    },
+    {
+      "epoch": 0.4038532319681182,
+      "grad_norm": 0.028131562831269996,
+      "learning_rate": 0.0009686891607177621,
+      "loss": 0.6585,
+      "step": 7955
+    },
+    {
+      "epoch": 0.4041070680661497,
+      "grad_norm": 0.027514481928846146,
+      "learning_rate": 0.0009686119543656531,
+      "loss": 0.6182,
+      "step": 7960
+    },
+    {
+      "epoch": 0.4043609041641812,
+      "grad_norm": 0.07056505685894615,
+      "learning_rate": 0.0009685346560273542,
+      "loss": 0.636,
+      "step": 7965
+    },
+    {
+      "epoch": 0.40461474026221267,
+      "grad_norm": 0.04403662030554125,
+      "learning_rate": 0.000968457265718039,
+      "loss": 0.6695,
+      "step": 7970
+    },
+    {
+      "epoch": 0.4048685763602442,
+      "grad_norm": 0.027718390866363754,
+      "learning_rate": 0.0009683797834528987,
+      "loss": 0.6283,
+      "step": 7975
+    },
+    {
+      "epoch": 0.4051224124582757,
+      "grad_norm": 0.040775149461867354,
+      "learning_rate": 0.0009683022092471427,
+      "loss": 0.6609,
+      "step": 7980
+    },
+    {
+      "epoch": 0.4053762485563072,
+      "grad_norm": 0.03695097130504616,
+      "learning_rate": 0.0009682245431159984,
+      "loss": 0.6104,
+      "step": 7985
+    },
+    {
+      "epoch": 0.4056300846543387,
+      "grad_norm": 0.046398150204463785,
+      "learning_rate": 0.0009681467850747114,
+      "loss": 0.6298,
+      "step": 7990
+    },
+    {
+      "epoch": 0.4058839207523702,
+      "grad_norm": 0.04641307748919207,
+      "learning_rate": 0.0009680689351385453,
+      "loss": 0.6244,
+      "step": 7995
+    },
+    {
+      "epoch": 0.4061377568504017,
+      "grad_norm": 0.03193976715382944,
+      "learning_rate": 0.0009679909933227811,
+      "loss": 0.6378,
+      "step": 8000
+    },
+    {
+      "epoch": 0.4063915929484332,
+      "grad_norm": 0.028276413703941978,
+      "learning_rate": 0.0009679129596427189,
+      "loss": 0.6194,
+      "step": 8005
+    },
+    {
+      "epoch": 0.4066454290464647,
+      "grad_norm": 0.02733137371225709,
+      "learning_rate": 0.0009678348341136764,
+      "loss": 0.6586,
+      "step": 8010
+    },
+    {
+      "epoch": 0.4068992651444962,
+      "grad_norm": 0.029814450966989852,
+      "learning_rate": 0.000967756616750989,
+      "loss": 0.6222,
+      "step": 8015
+    },
+    {
+      "epoch": 0.4071531012425277,
+      "grad_norm": 0.03102914660521326,
+      "learning_rate": 0.0009676783075700103,
+      "loss": 0.5845,
+      "step": 8020
+    },
+    {
+      "epoch": 0.4074069373405592,
+      "grad_norm": 0.031168821244497057,
+      "learning_rate": 0.0009675999065861121,
+      "loss": 0.6355,
+      "step": 8025
+    },
+    {
+      "epoch": 0.4076607734385907,
+      "grad_norm": 0.05486797052015813,
+      "learning_rate": 0.0009675214138146844,
+      "loss": 0.6322,
+      "step": 8030
+    },
+    {
+      "epoch": 0.40791460953662223,
+      "grad_norm": 0.04707616172534586,
+      "learning_rate": 0.0009674428292711346,
+      "loss": 0.6169,
+      "step": 8035
+    },
+    {
+      "epoch": 0.4081684456346537,
+      "grad_norm": 0.02787363667264948,
+      "learning_rate": 0.0009673641529708884,
+      "loss": 0.6481,
+      "step": 8040
+    },
+    {
+      "epoch": 0.4084222817326852,
+      "grad_norm": 0.027658440071354547,
+      "learning_rate": 0.0009672853849293899,
+      "loss": 0.6315,
+      "step": 8045
+    },
+    {
+      "epoch": 0.4086761178307167,
+      "grad_norm": 0.04557845624399485,
+      "learning_rate": 0.0009672065251621005,
+      "loss": 0.6724,
+      "step": 8050
+    },
+    {
+      "epoch": 0.4089299539287482,
+      "grad_norm": 0.028701244900625106,
+      "learning_rate": 0.0009671275736845002,
+      "loss": 0.6497,
+      "step": 8055
+    },
+    {
+      "epoch": 0.4091837900267797,
+      "grad_norm": 0.027420431774163943,
+      "learning_rate": 0.0009670485305120868,
+      "loss": 0.617,
+      "step": 8060
+    },
+    {
+      "epoch": 0.4094376261248112,
+      "grad_norm": 0.04387568253300268,
+      "learning_rate": 0.0009669693956603761,
+      "loss": 0.6354,
+      "step": 8065
+    },
+    {
+      "epoch": 0.4096914622228427,
+      "grad_norm": 0.0356837346671153,
+      "learning_rate": 0.0009668901691449017,
+      "loss": 0.6622,
+      "step": 8070
+    },
+    {
+      "epoch": 0.40994529832087423,
+      "grad_norm": 0.0663989720981148,
+      "learning_rate": 0.0009668108509812155,
+      "loss": 0.6492,
+      "step": 8075
+    },
+    {
+      "epoch": 0.4101991344189057,
+      "grad_norm": 0.06052011549737725,
+      "learning_rate": 0.0009667314411848873,
+      "loss": 0.5987,
+      "step": 8080
+    },
+    {
+      "epoch": 0.4104529705169372,
+      "grad_norm": 0.036195040168482975,
+      "learning_rate": 0.0009666519397715048,
+      "loss": 0.654,
+      "step": 8085
+    },
+    {
+      "epoch": 0.4107068066149687,
+      "grad_norm": 0.040727444865344034,
+      "learning_rate": 0.0009665723467566736,
+      "loss": 0.6036,
+      "step": 8090
+    },
+    {
+      "epoch": 0.4109606427130002,
+      "grad_norm": 0.1897729919545745,
+      "learning_rate": 0.0009664926621560175,
+      "loss": 0.6395,
+      "step": 8095
+    },
+    {
+      "epoch": 0.41121447881103174,
+      "grad_norm": 0.0569334892002028,
+      "learning_rate": 0.0009664128859851784,
+      "loss": 0.6519,
+      "step": 8100
+    },
+    {
+      "epoch": 0.4114683149090632,
+      "grad_norm": 0.06785163170844886,
+      "learning_rate": 0.0009663330182598155,
+      "loss": 0.6455,
+      "step": 8105
+    },
+    {
+      "epoch": 0.4117221510070947,
+      "grad_norm": 0.040654887306683485,
+      "learning_rate": 0.0009662530589956069,
+      "loss": 0.6278,
+      "step": 8110
+    },
+    {
+      "epoch": 0.41197598710512623,
+      "grad_norm": 0.03066395095140888,
+      "learning_rate": 0.0009661730082082481,
+      "loss": 0.5972,
+      "step": 8115
+    },
+    {
+      "epoch": 0.4122298232031577,
+      "grad_norm": 0.0319600496844395,
+      "learning_rate": 0.0009660928659134525,
+      "loss": 0.6551,
+      "step": 8120
+    },
+    {
+      "epoch": 0.41248365930118924,
+      "grad_norm": 0.03366629328090797,
+      "learning_rate": 0.0009660126321269516,
+      "loss": 0.6494,
+      "step": 8125
+    },
+    {
+      "epoch": 0.4127374953992207,
+      "grad_norm": 0.045751657644504846,
+      "learning_rate": 0.0009659323068644952,
+      "loss": 0.6166,
+      "step": 8130
+    },
+    {
+      "epoch": 0.4129913314972522,
+      "grad_norm": 0.03206675763665257,
+      "learning_rate": 0.0009658518901418505,
+      "loss": 0.6429,
+      "step": 8135
+    },
+    {
+      "epoch": 0.41324516759528374,
+      "grad_norm": 0.026550019766652787,
+      "learning_rate": 0.0009657713819748028,
+      "loss": 0.6498,
+      "step": 8140
+    },
+    {
+      "epoch": 0.4134990036933152,
+      "grad_norm": 0.045396032327730144,
+      "learning_rate": 0.0009656907823791559,
+      "loss": 0.6428,
+      "step": 8145
+    },
+    {
+      "epoch": 0.41375283979134675,
+      "grad_norm": 0.036710167826551975,
+      "learning_rate": 0.0009656100913707306,
+      "loss": 0.6255,
+      "step": 8150
+    },
+    {
+      "epoch": 0.41400667588937823,
+      "grad_norm": 0.031135118412712215,
+      "learning_rate": 0.0009655293089653665,
+      "loss": 0.6103,
+      "step": 8155
+    },
+    {
+      "epoch": 0.4142605119874097,
+      "grad_norm": 0.03347596053431247,
+      "learning_rate": 0.0009654484351789206,
+      "loss": 0.627,
+      "step": 8160
+    },
+    {
+      "epoch": 0.41451434808544124,
+      "grad_norm": 0.030646522747677672,
+      "learning_rate": 0.000965367470027268,
+      "loss": 0.6277,
+      "step": 8165
+    },
+    {
+      "epoch": 0.4147681841834727,
+      "grad_norm": 0.05286996241213806,
+      "learning_rate": 0.0009652864135263018,
+      "loss": 0.633,
+      "step": 8170
+    },
+    {
+      "epoch": 0.41502202028150426,
+      "grad_norm": 0.028068076907754813,
+      "learning_rate": 0.0009652052656919331,
+      "loss": 0.6432,
+      "step": 8175
+    },
+    {
+      "epoch": 0.41527585637953573,
+      "grad_norm": 0.03619239399249029,
+      "learning_rate": 0.0009651240265400907,
+      "loss": 0.6117,
+      "step": 8180
+    },
+    {
+      "epoch": 0.4155296924775672,
+      "grad_norm": 0.03319322210171537,
+      "learning_rate": 0.0009650426960867215,
+      "loss": 0.6499,
+      "step": 8185
+    },
+    {
+      "epoch": 0.41578352857559875,
+      "grad_norm": 0.04854675850168715,
+      "learning_rate": 0.00096496127434779,
+      "loss": 0.6564,
+      "step": 8190
+    },
+    {
+      "epoch": 0.41603736467363023,
+      "grad_norm": 0.029353149898544276,
+      "learning_rate": 0.0009648797613392794,
+      "loss": 0.6103,
+      "step": 8195
+    },
+    {
+      "epoch": 0.41629120077166176,
+      "grad_norm": 0.027116416725451065,
+      "learning_rate": 0.0009647981570771898,
+      "loss": 0.6166,
+      "step": 8200
+    },
+    {
+      "epoch": 0.41654503686969324,
+      "grad_norm": 0.05618152750846531,
+      "learning_rate": 0.00096471646157754,
+      "loss": 0.6162,
+      "step": 8205
+    },
+    {
+      "epoch": 0.4167988729677247,
+      "grad_norm": 0.02729960033208377,
+      "learning_rate": 0.0009646346748563663,
+      "loss": 0.6004,
+      "step": 8210
+    },
+    {
+      "epoch": 0.41705270906575626,
+      "grad_norm": 0.08612473698169335,
+      "learning_rate": 0.0009645527969297231,
+      "loss": 0.6483,
+      "step": 8215
+    },
+    {
+      "epoch": 0.41730654516378773,
+      "grad_norm": 0.029936015816733892,
+      "learning_rate": 0.0009644708278136826,
+      "loss": 0.6317,
+      "step": 8220
+    },
+    {
+      "epoch": 0.41756038126181927,
+      "grad_norm": 0.0726201142485528,
+      "learning_rate": 0.0009643887675243348,
+      "loss": 0.6244,
+      "step": 8225
+    },
+    {
+      "epoch": 0.41781421735985075,
+      "grad_norm": 0.057917723581260475,
+      "learning_rate": 0.0009643066160777879,
+      "loss": 0.647,
+      "step": 8230
+    },
+    {
+      "epoch": 0.4180680534578822,
+      "grad_norm": 0.13072452439749868,
+      "learning_rate": 0.0009642243734901678,
+      "loss": 0.6838,
+      "step": 8235
+    },
+    {
+      "epoch": 0.41832188955591376,
+      "grad_norm": 0.05469152409284166,
+      "learning_rate": 0.0009641420397776181,
+      "loss": 0.6624,
+      "step": 8240
+    },
+    {
+      "epoch": 0.41857572565394524,
+      "grad_norm": 0.033211019095030915,
+      "learning_rate": 0.0009640596149563008,
+      "loss": 0.6251,
+      "step": 8245
+    },
+    {
+      "epoch": 0.4188295617519768,
+      "grad_norm": 0.039537117087901134,
+      "learning_rate": 0.0009639770990423954,
+      "loss": 0.6764,
+      "step": 8250
+    },
+    {
+      "epoch": 0.41908339785000825,
+      "grad_norm": 0.043759885870117206,
+      "learning_rate": 0.0009638944920520992,
+      "loss": 0.6291,
+      "step": 8255
+    },
+    {
+      "epoch": 0.41933723394803973,
+      "grad_norm": 0.06431426896668371,
+      "learning_rate": 0.0009638117940016278,
+      "loss": 0.6548,
+      "step": 8260
+    },
+    {
+      "epoch": 0.41959107004607127,
+      "grad_norm": 0.027159981833348182,
+      "learning_rate": 0.000963729004907214,
+      "loss": 0.6055,
+      "step": 8265
+    },
+    {
+      "epoch": 0.41984490614410275,
+      "grad_norm": 0.10032739292221377,
+      "learning_rate": 0.0009636461247851094,
+      "loss": 0.6578,
+      "step": 8270
+    },
+    {
+      "epoch": 0.4200987422421342,
+      "grad_norm": 0.06844856797298811,
+      "learning_rate": 0.0009635631536515825,
+      "loss": 0.6316,
+      "step": 8275
+    },
+    {
+      "epoch": 0.42035257834016576,
+      "grad_norm": 0.03539995161128657,
+      "learning_rate": 0.0009634800915229205,
+      "loss": 0.6597,
+      "step": 8280
+    },
+    {
+      "epoch": 0.42060641443819724,
+      "grad_norm": 0.033253904264618245,
+      "learning_rate": 0.0009633969384154279,
+      "loss": 0.6066,
+      "step": 8285
+    },
+    {
+      "epoch": 0.4208602505362288,
+      "grad_norm": 0.042401610577462875,
+      "learning_rate": 0.0009633136943454271,
+      "loss": 0.6235,
+      "step": 8290
+    },
+    {
+      "epoch": 0.42111408663426025,
+      "grad_norm": 0.06416678518804314,
+      "learning_rate": 0.0009632303593292589,
+      "loss": 0.6112,
+      "step": 8295
+    },
+    {
+      "epoch": 0.42136792273229173,
+      "grad_norm": 0.055313941008177936,
+      "learning_rate": 0.0009631469333832809,
+      "loss": 0.6632,
+      "step": 8300
+    },
+    {
+      "epoch": 0.42162175883032327,
+      "grad_norm": 0.11795660265352328,
+      "learning_rate": 0.0009630634165238699,
+      "loss": 0.6342,
+      "step": 8305
+    },
+    {
+      "epoch": 0.42187559492835475,
+      "grad_norm": 0.0657921582096661,
+      "learning_rate": 0.0009629798087674194,
+      "loss": 0.6253,
+      "step": 8310
+    },
+    {
+      "epoch": 0.4221294310263863,
+      "grad_norm": 0.03717901771126588,
+      "learning_rate": 0.0009628961101303412,
+      "loss": 0.6518,
+      "step": 8315
+    },
+    {
+      "epoch": 0.42238326712441776,
+      "grad_norm": 0.04495023963968202,
+      "learning_rate": 0.0009628123206290654,
+      "loss": 0.6451,
+      "step": 8320
+    },
+    {
+      "epoch": 0.42263710322244924,
+      "grad_norm": 0.0418969458155804,
+      "learning_rate": 0.0009627284402800388,
+      "loss": 0.6534,
+      "step": 8325
+    },
+    {
+      "epoch": 0.4228909393204808,
+      "grad_norm": 0.030384984685908507,
+      "learning_rate": 0.0009626444690997272,
+      "loss": 0.5976,
+      "step": 8330
+    },
+    {
+      "epoch": 0.42314477541851225,
+      "grad_norm": 0.04470751844633285,
+      "learning_rate": 0.0009625604071046133,
+      "loss": 0.6242,
+      "step": 8335
+    },
+    {
+      "epoch": 0.4233986115165438,
+      "grad_norm": 0.025004516406755696,
+      "learning_rate": 0.0009624762543111985,
+      "loss": 0.6245,
+      "step": 8340
+    },
+    {
+      "epoch": 0.42365244761457527,
+      "grad_norm": 0.04181782200662666,
+      "learning_rate": 0.0009623920107360011,
+      "loss": 0.6792,
+      "step": 8345
+    },
+    {
+      "epoch": 0.42390628371260675,
+      "grad_norm": 0.03102456315861995,
+      "learning_rate": 0.0009623076763955581,
+      "loss": 0.6725,
+      "step": 8350
+    },
+    {
+      "epoch": 0.4241601198106383,
+      "grad_norm": 0.057488518244734055,
+      "learning_rate": 0.0009622232513064237,
+      "loss": 0.6335,
+      "step": 8355
+    },
+    {
+      "epoch": 0.42441395590866976,
+      "grad_norm": 0.05907000801077856,
+      "learning_rate": 0.00096213873548517,
+      "loss": 0.6626,
+      "step": 8360
+    },
+    {
+      "epoch": 0.4246677920067013,
+      "grad_norm": 0.07423416923574946,
+      "learning_rate": 0.0009620541289483875,
+      "loss": 0.6819,
+      "step": 8365
+    },
+    {
+      "epoch": 0.4249216281047328,
+      "grad_norm": 0.04354747716701456,
+      "learning_rate": 0.0009619694317126837,
+      "loss": 0.6638,
+      "step": 8370
+    },
+    {
+      "epoch": 0.42517546420276425,
+      "grad_norm": 0.03257416270726558,
+      "learning_rate": 0.0009618846437946842,
+      "loss": 0.6575,
+      "step": 8375
+    },
+    {
+      "epoch": 0.4254293003007958,
+      "grad_norm": 0.1385958109330031,
+      "learning_rate": 0.0009617997652110326,
+      "loss": 0.6893,
+      "step": 8380
+    },
+    {
+      "epoch": 0.42568313639882727,
+      "grad_norm": 0.11178651527717962,
+      "learning_rate": 0.00096171479597839,
+      "loss": 0.7221,
+      "step": 8385
+    },
+    {
+      "epoch": 0.4259369724968588,
+      "grad_norm": 0.07142680791086048,
+      "learning_rate": 0.0009616297361134355,
+      "loss": 0.6673,
+      "step": 8390
+    },
+    {
+      "epoch": 0.4261908085948903,
+      "grad_norm": 0.07599589867371198,
+      "learning_rate": 0.000961544585632866,
+      "loss": 0.6966,
+      "step": 8395
+    },
+    {
+      "epoch": 0.42644464469292176,
+      "grad_norm": 0.0324535534315102,
+      "learning_rate": 0.0009614593445533961,
+      "loss": 0.6887,
+      "step": 8400
+    },
+    {
+      "epoch": 0.4266984807909533,
+      "grad_norm": 0.03664906295093519,
+      "learning_rate": 0.0009613740128917581,
+      "loss": 0.6472,
+      "step": 8405
+    },
+    {
+      "epoch": 0.4269523168889848,
+      "grad_norm": 0.06479920979013709,
+      "learning_rate": 0.0009612885906647023,
+      "loss": 0.6738,
+      "step": 8410
+    },
+    {
+      "epoch": 0.4272061529870163,
+      "grad_norm": 0.04275229522059579,
+      "learning_rate": 0.0009612030778889966,
+      "loss": 0.6488,
+      "step": 8415
+    },
+    {
+      "epoch": 0.4274599890850478,
+      "grad_norm": 0.029470064994178823,
+      "learning_rate": 0.0009611174745814266,
+      "loss": 0.6667,
+      "step": 8420
+    },
+    {
+      "epoch": 0.42771382518307927,
+      "grad_norm": 0.03351287696319307,
+      "learning_rate": 0.000961031780758796,
+      "loss": 0.6559,
+      "step": 8425
+    },
+    {
+      "epoch": 0.4279676612811108,
+      "grad_norm": 0.05384940363959001,
+      "learning_rate": 0.000960945996437926,
+      "loss": 0.6732,
+      "step": 8430
+    },
+    {
+      "epoch": 0.4282214973791423,
+      "grad_norm": 0.0320326929365238,
+      "learning_rate": 0.0009608601216356557,
+      "loss": 0.7019,
+      "step": 8435
+    },
+    {
+      "epoch": 0.4284753334771738,
+      "grad_norm": 0.02719381758672677,
+      "learning_rate": 0.0009607741563688417,
+      "loss": 0.6465,
+      "step": 8440
+    },
+    {
+      "epoch": 0.4287291695752053,
+      "grad_norm": 0.03282220675318019,
+      "learning_rate": 0.0009606881006543589,
+      "loss": 0.6366,
+      "step": 8445
+    },
+    {
+      "epoch": 0.4289830056732368,
+      "grad_norm": 0.03248468141086955,
+      "learning_rate": 0.0009606019545090992,
+      "loss": 0.6066,
+      "step": 8450
+    },
+    {
+      "epoch": 0.4292368417712683,
+      "grad_norm": 0.03936893710205544,
+      "learning_rate": 0.0009605157179499728,
+      "loss": 0.6604,
+      "step": 8455
+    },
+    {
+      "epoch": 0.4294906778692998,
+      "grad_norm": 0.03206017279382604,
+      "learning_rate": 0.0009604293909939077,
+      "loss": 0.6396,
+      "step": 8460
+    },
+    {
+      "epoch": 0.4297445139673313,
+      "grad_norm": 0.056617557142407925,
+      "learning_rate": 0.0009603429736578493,
+      "loss": 0.6345,
+      "step": 8465
+    },
+    {
+      "epoch": 0.4299983500653628,
+      "grad_norm": 0.04111342531938289,
+      "learning_rate": 0.0009602564659587608,
+      "loss": 0.6756,
+      "step": 8470
+    },
+    {
+      "epoch": 0.4302521861633943,
+      "grad_norm": 0.03727263305415869,
+      "learning_rate": 0.0009601698679136233,
+      "loss": 0.6321,
+      "step": 8475
+    },
+    {
+      "epoch": 0.4305060222614258,
+      "grad_norm": 0.03320811173723645,
+      "learning_rate": 0.0009600831795394358,
+      "loss": 0.6278,
+      "step": 8480
+    },
+    {
+      "epoch": 0.4307598583594573,
+      "grad_norm": 0.04118865787603138,
+      "learning_rate": 0.0009599964008532144,
+      "loss": 0.6521,
+      "step": 8485
+    },
+    {
+      "epoch": 0.43101369445748877,
+      "grad_norm": 0.027675504471304815,
+      "learning_rate": 0.0009599095318719935,
+      "loss": 0.6282,
+      "step": 8490
+    },
+    {
+      "epoch": 0.4312675305555203,
+      "grad_norm": 0.04646301540154575,
+      "learning_rate": 0.0009598225726128251,
+      "loss": 0.6408,
+      "step": 8495
+    },
+    {
+      "epoch": 0.4315213666535518,
+      "grad_norm": 0.030455384549113017,
+      "learning_rate": 0.0009597355230927789,
+      "loss": 0.6444,
+      "step": 8500
+    },
+    {
+      "epoch": 0.4317752027515833,
+      "grad_norm": 0.031905572716127935,
+      "learning_rate": 0.0009596483833289422,
+      "loss": 0.632,
+      "step": 8505
+    },
+    {
+      "epoch": 0.4320290388496148,
+      "grad_norm": 0.03178070686854921,
+      "learning_rate": 0.0009595611533384201,
+      "loss": 0.6341,
+      "step": 8510
+    },
+    {
+      "epoch": 0.4322828749476463,
+      "grad_norm": 0.037639770941935886,
+      "learning_rate": 0.0009594738331383355,
+      "loss": 0.6236,
+      "step": 8515
+    },
+    {
+      "epoch": 0.4325367110456778,
+      "grad_norm": 0.06449433125052263,
+      "learning_rate": 0.0009593864227458287,
+      "loss": 0.6534,
+      "step": 8520
+    },
+    {
+      "epoch": 0.4327905471437093,
+      "grad_norm": 0.028739688486159997,
+      "learning_rate": 0.0009592989221780581,
+      "loss": 0.6303,
+      "step": 8525
+    },
+    {
+      "epoch": 0.4330443832417408,
+      "grad_norm": 0.07881302169372612,
+      "learning_rate": 0.0009592113314521996,
+      "loss": 0.6427,
+      "step": 8530
+    },
+    {
+      "epoch": 0.4332982193397723,
+      "grad_norm": 0.06386066146903421,
+      "learning_rate": 0.0009591236505854468,
+      "loss": 0.6411,
+      "step": 8535
+    },
+    {
+      "epoch": 0.4335520554378038,
+      "grad_norm": 0.08823347046423324,
+      "learning_rate": 0.0009590358795950112,
+      "loss": 0.6628,
+      "step": 8540
+    },
+    {
+      "epoch": 0.4338058915358353,
+      "grad_norm": 0.04402973321402076,
+      "learning_rate": 0.0009589480184981214,
+      "loss": 0.6312,
+      "step": 8545
+    },
+    {
+      "epoch": 0.4340597276338668,
+      "grad_norm": 0.06702143934000367,
+      "learning_rate": 0.0009588600673120245,
+      "loss": 0.6386,
+      "step": 8550
+    },
+    {
+      "epoch": 0.43431356373189833,
+      "grad_norm": 0.0303696868506802,
+      "learning_rate": 0.0009587720260539847,
+      "loss": 0.6389,
+      "step": 8555
+    },
+    {
+      "epoch": 0.4345673998299298,
+      "grad_norm": 0.028798244571663335,
+      "learning_rate": 0.000958683894741284,
+      "loss": 0.6517,
+      "step": 8560
+    },
+    {
+      "epoch": 0.4348212359279613,
+      "grad_norm": 0.026950156282351694,
+      "learning_rate": 0.0009585956733912224,
+      "loss": 0.6254,
+      "step": 8565
+    },
+    {
+      "epoch": 0.4350750720259928,
+      "grad_norm": 0.0355083939976014,
+      "learning_rate": 0.0009585073620211169,
+      "loss": 0.6603,
+      "step": 8570
+    },
+    {
+      "epoch": 0.4353289081240243,
+      "grad_norm": 0.03066199950726959,
+      "learning_rate": 0.0009584189606483029,
+      "loss": 0.6305,
+      "step": 8575
+    },
+    {
+      "epoch": 0.43558274422205584,
+      "grad_norm": 0.030869429351402574,
+      "learning_rate": 0.0009583304692901331,
+      "loss": 0.6479,
+      "step": 8580
+    },
+    {
+      "epoch": 0.4358365803200873,
+      "grad_norm": 0.03223477546507825,
+      "learning_rate": 0.0009582418879639778,
+      "loss": 0.6709,
+      "step": 8585
+    },
+    {
+      "epoch": 0.4360904164181188,
+      "grad_norm": 0.0480603195178494,
+      "learning_rate": 0.0009581532166872252,
+      "loss": 0.6198,
+      "step": 8590
+    },
+    {
+      "epoch": 0.43634425251615033,
+      "grad_norm": 0.02653848497460224,
+      "learning_rate": 0.0009580644554772809,
+      "loss": 0.6247,
+      "step": 8595
+    },
+    {
+      "epoch": 0.4365980886141818,
+      "grad_norm": 0.03447545219420309,
+      "learning_rate": 0.0009579756043515684,
+      "loss": 0.6677,
+      "step": 8600
+    },
+    {
+      "epoch": 0.43685192471221335,
+      "grad_norm": 0.0340747508415548,
+      "learning_rate": 0.0009578866633275287,
+      "loss": 0.695,
+      "step": 8605
+    },
+    {
+      "epoch": 0.4371057608102448,
+      "grad_norm": 0.034686685618524345,
+      "learning_rate": 0.0009577976324226205,
+      "loss": 0.6303,
+      "step": 8610
+    },
+    {
+      "epoch": 0.4373595969082763,
+      "grad_norm": 0.04739152588761622,
+      "learning_rate": 0.0009577085116543201,
+      "loss": 0.6718,
+      "step": 8615
+    },
+    {
+      "epoch": 0.43761343300630784,
+      "grad_norm": 0.06752904402187106,
+      "learning_rate": 0.0009576193010401213,
+      "loss": 0.6227,
+      "step": 8620
+    },
+    {
+      "epoch": 0.4378672691043393,
+      "grad_norm": 0.03265339538465412,
+      "learning_rate": 0.0009575300005975361,
+      "loss": 0.6396,
+      "step": 8625
+    },
+    {
+      "epoch": 0.43812110520237085,
+      "grad_norm": 0.04449318891895552,
+      "learning_rate": 0.0009574406103440931,
+      "loss": 0.6716,
+      "step": 8630
+    },
+    {
+      "epoch": 0.43837494130040233,
+      "grad_norm": 0.04632596776822006,
+      "learning_rate": 0.0009573511302973399,
+      "loss": 0.6233,
+      "step": 8635
+    },
+    {
+      "epoch": 0.4386287773984338,
+      "grad_norm": 0.03465904010797608,
+      "learning_rate": 0.0009572615604748405,
+      "loss": 0.6524,
+      "step": 8640
+    },
+    {
+      "epoch": 0.43888261349646535,
+      "grad_norm": 0.0870434657255882,
+      "learning_rate": 0.000957171900894177,
+      "loss": 0.6214,
+      "step": 8645
+    },
+    {
+      "epoch": 0.4391364495944968,
+      "grad_norm": 0.04534670041292767,
+      "learning_rate": 0.0009570821515729496,
+      "loss": 0.6523,
+      "step": 8650
+    },
+    {
+      "epoch": 0.43939028569252836,
+      "grad_norm": 0.028417736922154647,
+      "learning_rate": 0.0009569923125287749,
+      "loss": 0.6387,
+      "step": 8655
+    },
+    {
+      "epoch": 0.43964412179055984,
+      "grad_norm": 0.03085472383605309,
+      "learning_rate": 0.0009569023837792885,
+      "loss": 0.6569,
+      "step": 8660
+    },
+    {
+      "epoch": 0.4398979578885913,
+      "grad_norm": 0.029597971880738125,
+      "learning_rate": 0.0009568123653421427,
+      "loss": 0.6517,
+      "step": 8665
+    },
+    {
+      "epoch": 0.44015179398662285,
+      "grad_norm": 0.031124321107765782,
+      "learning_rate": 0.0009567222572350078,
+      "loss": 0.6481,
+      "step": 8670
+    },
+    {
+      "epoch": 0.44040563008465433,
+      "grad_norm": 0.04250290000689362,
+      "learning_rate": 0.0009566320594755713,
+      "loss": 0.6301,
+      "step": 8675
+    },
+    {
+      "epoch": 0.44065946618268587,
+      "grad_norm": 0.03803602491108363,
+      "learning_rate": 0.0009565417720815389,
+      "loss": 0.6506,
+      "step": 8680
+    },
+    {
+      "epoch": 0.44091330228071735,
+      "grad_norm": 0.10059999444471293,
+      "learning_rate": 0.0009564513950706333,
+      "loss": 0.7053,
+      "step": 8685
+    },
+    {
+      "epoch": 0.4411671383787488,
+      "grad_norm": 0.028992308416219137,
+      "learning_rate": 0.0009563609284605951,
+      "loss": 0.6764,
+      "step": 8690
+    },
+    {
+      "epoch": 0.44142097447678036,
+      "grad_norm": 0.028072857577109308,
+      "learning_rate": 0.0009562703722691828,
+      "loss": 0.6359,
+      "step": 8695
+    },
+    {
+      "epoch": 0.44167481057481184,
+      "grad_norm": 0.04504031556014271,
+      "learning_rate": 0.0009561797265141717,
+      "loss": 0.6518,
+      "step": 8700
+    },
+    {
+      "epoch": 0.4419286466728434,
+      "grad_norm": 0.028108033251197,
+      "learning_rate": 0.0009560889912133552,
+      "loss": 0.6291,
+      "step": 8705
+    },
+    {
+      "epoch": 0.44218248277087485,
+      "grad_norm": 0.03109459111595203,
+      "learning_rate": 0.0009559981663845443,
+      "loss": 0.6163,
+      "step": 8710
+    },
+    {
+      "epoch": 0.44243631886890633,
+      "grad_norm": 0.04169194643254887,
+      "learning_rate": 0.0009559072520455672,
+      "loss": 0.6475,
+      "step": 8715
+    },
+    {
+      "epoch": 0.44269015496693787,
+      "grad_norm": 0.20480559958951622,
+      "learning_rate": 0.0009558162482142703,
+      "loss": 0.5948,
+      "step": 8720
+    },
+    {
+      "epoch": 0.44294399106496934,
+      "grad_norm": 0.056634271165419775,
+      "learning_rate": 0.000955725154908517,
+      "loss": 0.6407,
+      "step": 8725
+    },
+    {
+      "epoch": 0.4431978271630008,
+      "grad_norm": 0.04431366313723485,
+      "learning_rate": 0.0009556339721461885,
+      "loss": 0.6483,
+      "step": 8730
+    },
+    {
+      "epoch": 0.44345166326103236,
+      "grad_norm": 0.030702014731428928,
+      "learning_rate": 0.0009555426999451835,
+      "loss": 0.6127,
+      "step": 8735
+    },
+    {
+      "epoch": 0.44370549935906384,
+      "grad_norm": 0.04349749497078023,
+      "learning_rate": 0.0009554513383234184,
+      "loss": 0.6401,
+      "step": 8740
+    },
+    {
+      "epoch": 0.44395933545709537,
+      "grad_norm": 0.028774456041381447,
+      "learning_rate": 0.0009553598872988268,
+      "loss": 0.6411,
+      "step": 8745
+    },
+    {
+      "epoch": 0.44421317155512685,
+      "grad_norm": 0.04729637621631858,
+      "learning_rate": 0.0009552683468893601,
+      "loss": 0.6229,
+      "step": 8750
+    },
+    {
+      "epoch": 0.44446700765315833,
+      "grad_norm": 0.03315699465561875,
+      "learning_rate": 0.0009551767171129874,
+      "loss": 0.6576,
+      "step": 8755
+    },
+    {
+      "epoch": 0.44472084375118986,
+      "grad_norm": 0.07162055236767001,
+      "learning_rate": 0.0009550849979876952,
+      "loss": 0.6478,
+      "step": 8760
+    },
+    {
+      "epoch": 0.44497467984922134,
+      "grad_norm": 0.03618826906366848,
+      "learning_rate": 0.0009549931895314874,
+      "loss": 0.6262,
+      "step": 8765
+    },
+    {
+      "epoch": 0.4452285159472529,
+      "grad_norm": 0.05135770096791484,
+      "learning_rate": 0.0009549012917623854,
+      "loss": 0.6681,
+      "step": 8770
+    },
+    {
+      "epoch": 0.44548235204528436,
+      "grad_norm": 0.02358653749589321,
+      "learning_rate": 0.0009548093046984285,
+      "loss": 0.5986,
+      "step": 8775
+    },
+    {
+      "epoch": 0.44573618814331584,
+      "grad_norm": 0.04897232885513862,
+      "learning_rate": 0.0009547172283576733,
+      "loss": 0.6277,
+      "step": 8780
+    },
+    {
+      "epoch": 0.44599002424134737,
+      "grad_norm": 0.029813286658092467,
+      "learning_rate": 0.0009546250627581936,
+      "loss": 0.6221,
+      "step": 8785
+    },
+    {
+      "epoch": 0.44624386033937885,
+      "grad_norm": 0.04724784641629323,
+      "learning_rate": 0.0009545328079180815,
+      "loss": 0.644,
+      "step": 8790
+    },
+    {
+      "epoch": 0.4464976964374104,
+      "grad_norm": 0.04105666493239327,
+      "learning_rate": 0.0009544404638554459,
+      "loss": 0.6266,
+      "step": 8795
+    },
+    {
+      "epoch": 0.44675153253544186,
+      "grad_norm": 0.036873167174765825,
+      "learning_rate": 0.0009543480305884136,
+      "loss": 0.6148,
+      "step": 8800
+    },
+    {
+      "epoch": 0.44700536863347334,
+      "grad_norm": 0.024048534299153178,
+      "learning_rate": 0.0009542555081351286,
+      "loss": 0.6191,
+      "step": 8805
+    },
+    {
+      "epoch": 0.4472592047315049,
+      "grad_norm": 0.02625656937587616,
+      "learning_rate": 0.0009541628965137528,
+      "loss": 0.6347,
+      "step": 8810
+    },
+    {
+      "epoch": 0.44751304082953636,
+      "grad_norm": 0.060240180235133486,
+      "learning_rate": 0.0009540701957424653,
+      "loss": 0.5963,
+      "step": 8815
+    },
+    {
+      "epoch": 0.4477668769275679,
+      "grad_norm": 0.05265523726241905,
+      "learning_rate": 0.0009539774058394628,
+      "loss": 0.6619,
+      "step": 8820
+    },
+    {
+      "epoch": 0.44802071302559937,
+      "grad_norm": 0.06653130957206357,
+      "learning_rate": 0.0009538845268229596,
+      "loss": 0.643,
+      "step": 8825
+    },
+    {
+      "epoch": 0.44827454912363085,
+      "grad_norm": 0.027089718279573065,
+      "learning_rate": 0.0009537915587111872,
+      "loss": 0.6249,
+      "step": 8830
+    },
+    {
+      "epoch": 0.4485283852216624,
+      "grad_norm": 0.041501163456008426,
+      "learning_rate": 0.0009536985015223949,
+      "loss": 0.669,
+      "step": 8835
+    },
+    {
+      "epoch": 0.44878222131969386,
+      "grad_norm": 0.04886005734739699,
+      "learning_rate": 0.0009536053552748494,
+      "loss": 0.5982,
+      "step": 8840
+    },
+    {
+      "epoch": 0.4490360574177254,
+      "grad_norm": 0.07228127391608218,
+      "learning_rate": 0.0009535121199868348,
+      "loss": 0.6427,
+      "step": 8845
+    },
+    {
+      "epoch": 0.4492898935157569,
+      "grad_norm": 0.0560495813187393,
+      "learning_rate": 0.0009534187956766526,
+      "loss": 0.6376,
+      "step": 8850
+    },
+    {
+      "epoch": 0.44954372961378836,
+      "grad_norm": 0.024625434128465788,
+      "learning_rate": 0.000953325382362622,
+      "loss": 0.6456,
+      "step": 8855
+    },
+    {
+      "epoch": 0.4497975657118199,
+      "grad_norm": 0.02767775635007945,
+      "learning_rate": 0.0009532318800630797,
+      "loss": 0.6126,
+      "step": 8860
+    },
+    {
+      "epoch": 0.45005140180985137,
+      "grad_norm": 0.03132593059311349,
+      "learning_rate": 0.0009531382887963796,
+      "loss": 0.626,
+      "step": 8865
+    },
+    {
+      "epoch": 0.4503052379078829,
+      "grad_norm": 0.03409858571498195,
+      "learning_rate": 0.0009530446085808932,
+      "loss": 0.6232,
+      "step": 8870
+    },
+    {
+      "epoch": 0.4505590740059144,
+      "grad_norm": 0.03060723735055533,
+      "learning_rate": 0.0009529508394350093,
+      "loss": 0.6515,
+      "step": 8875
+    },
+    {
+      "epoch": 0.45081291010394586,
+      "grad_norm": 0.02791574567598346,
+      "learning_rate": 0.0009528569813771346,
+      "loss": 0.643,
+      "step": 8880
+    },
+    {
+      "epoch": 0.4510667462019774,
+      "grad_norm": 0.025463994979441774,
+      "learning_rate": 0.0009527630344256929,
+      "loss": 0.6072,
+      "step": 8885
+    },
+    {
+      "epoch": 0.4513205823000089,
+      "grad_norm": 0.03198977951851349,
+      "learning_rate": 0.0009526689985991255,
+      "loss": 0.6013,
+      "step": 8890
+    },
+    {
+      "epoch": 0.4515744183980404,
+      "grad_norm": 0.03573354091801804,
+      "learning_rate": 0.000952574873915891,
+      "loss": 0.6139,
+      "step": 8895
+    },
+    {
+      "epoch": 0.4518282544960719,
+      "grad_norm": 0.037344124835746804,
+      "learning_rate": 0.0009524806603944658,
+      "loss": 0.6095,
+      "step": 8900
+    },
+    {
+      "epoch": 0.45208209059410337,
+      "grad_norm": 0.0338181770188993,
+      "learning_rate": 0.0009523863580533434,
+      "loss": 0.63,
+      "step": 8905
+    },
+    {
+      "epoch": 0.4523359266921349,
+      "grad_norm": 0.047741999628023585,
+      "learning_rate": 0.000952291966911035,
+      "loss": 0.6191,
+      "step": 8910
+    },
+    {
+      "epoch": 0.4525897627901664,
+      "grad_norm": 0.027889977098360996,
+      "learning_rate": 0.0009521974869860691,
+      "loss": 0.6289,
+      "step": 8915
+    },
+    {
+      "epoch": 0.4528435988881979,
+      "grad_norm": 0.04542033140089097,
+      "learning_rate": 0.0009521029182969915,
+      "loss": 0.6292,
+      "step": 8920
+    },
+    {
+      "epoch": 0.4530974349862294,
+      "grad_norm": 0.060630645721840855,
+      "learning_rate": 0.000952008260862366,
+      "loss": 0.6073,
+      "step": 8925
+    },
+    {
+      "epoch": 0.4533512710842609,
+      "grad_norm": 0.042297603142427635,
+      "learning_rate": 0.0009519135147007726,
+      "loss": 0.6255,
+      "step": 8930
+    },
+    {
+      "epoch": 0.4536051071822924,
+      "grad_norm": 0.03012511251936056,
+      "learning_rate": 0.0009518186798308104,
+      "loss": 0.6177,
+      "step": 8935
+    },
+    {
+      "epoch": 0.4538589432803239,
+      "grad_norm": 0.030825815994073906,
+      "learning_rate": 0.0009517237562710943,
+      "loss": 0.6154,
+      "step": 8940
+    },
+    {
+      "epoch": 0.45411277937835537,
+      "grad_norm": 0.027418304719546968,
+      "learning_rate": 0.0009516287440402576,
+      "loss": 0.6451,
+      "step": 8945
+    },
+    {
+      "epoch": 0.4543666154763869,
+      "grad_norm": 0.03437809223733027,
+      "learning_rate": 0.0009515336431569508,
+      "loss": 0.6372,
+      "step": 8950
+    },
+    {
+      "epoch": 0.4546204515744184,
+      "grad_norm": 0.02706173332906627,
+      "learning_rate": 0.0009514384536398416,
+      "loss": 0.5825,
+      "step": 8955
+    },
+    {
+      "epoch": 0.4548742876724499,
+      "grad_norm": 0.026630978850147032,
+      "learning_rate": 0.0009513431755076152,
+      "loss": 0.6007,
+      "step": 8960
+    },
+    {
+      "epoch": 0.4551281237704814,
+      "grad_norm": 0.027776987645599445,
+      "learning_rate": 0.0009512478087789745,
+      "loss": 0.6777,
+      "step": 8965
+    },
+    {
+      "epoch": 0.4553819598685129,
+      "grad_norm": 0.03127848050806932,
+      "learning_rate": 0.0009511523534726391,
+      "loss": 0.6137,
+      "step": 8970
+    },
+    {
+      "epoch": 0.4556357959665444,
+      "grad_norm": 0.0316814909688181,
+      "learning_rate": 0.0009510568096073466,
+      "loss": 0.6259,
+      "step": 8975
+    },
+    {
+      "epoch": 0.4558896320645759,
+      "grad_norm": 0.03288592896635044,
+      "learning_rate": 0.0009509611772018519,
+      "loss": 0.594,
+      "step": 8980
+    },
+    {
+      "epoch": 0.4561434681626074,
+      "grad_norm": 0.030504149235616482,
+      "learning_rate": 0.0009508654562749271,
+      "loss": 0.6559,
+      "step": 8985
+    },
+    {
+      "epoch": 0.4563973042606389,
+      "grad_norm": 0.02918212945710069,
+      "learning_rate": 0.0009507696468453615,
+      "loss": 0.6376,
+      "step": 8990
+    },
+    {
+      "epoch": 0.4566511403586704,
+      "grad_norm": 0.02867657062417966,
+      "learning_rate": 0.0009506737489319623,
+      "loss": 0.6153,
+      "step": 8995
+    },
+    {
+      "epoch": 0.4569049764567019,
+      "grad_norm": 0.04312848043233246,
+      "learning_rate": 0.0009505777625535538,
+      "loss": 0.6249,
+      "step": 9000
+    },
+    {
+      "epoch": 0.4571588125547334,
+      "grad_norm": 0.025378226871406697,
+      "learning_rate": 0.0009504816877289775,
+      "loss": 0.625,
+      "step": 9005
+    },
+    {
+      "epoch": 0.45741264865276493,
+      "grad_norm": 0.034824462280793105,
+      "learning_rate": 0.0009503855244770923,
+      "loss": 0.5977,
+      "step": 9010
+    },
+    {
+      "epoch": 0.4576664847507964,
+      "grad_norm": 0.06002542817312333,
+      "learning_rate": 0.0009502892728167749,
+      "loss": 0.6148,
+      "step": 9015
+    },
+    {
+      "epoch": 0.4579203208488279,
+      "grad_norm": 0.04708946523260842,
+      "learning_rate": 0.0009501929327669188,
+      "loss": 0.6292,
+      "step": 9020
+    },
+    {
+      "epoch": 0.4581741569468594,
+      "grad_norm": 0.030847513835845684,
+      "learning_rate": 0.0009500965043464349,
+      "loss": 0.5828,
+      "step": 9025
+    },
+    {
+      "epoch": 0.4584279930448909,
+      "grad_norm": 0.030701249013652274,
+      "learning_rate": 0.000949999987574252,
+      "loss": 0.6038,
+      "step": 9030
+    },
+    {
+      "epoch": 0.45868182914292244,
+      "grad_norm": 0.027439149183266075,
+      "learning_rate": 0.0009499033824693158,
+      "loss": 0.6028,
+      "step": 9035
+    },
+    {
+      "epoch": 0.4589356652409539,
+      "grad_norm": 0.050799419978763395,
+      "learning_rate": 0.000949806689050589,
+      "loss": 0.633,
+      "step": 9040
+    },
+    {
+      "epoch": 0.4591895013389854,
+      "grad_norm": 0.03760838688025009,
+      "learning_rate": 0.0009497099073370526,
+      "loss": 0.5974,
+      "step": 9045
+    },
+    {
+      "epoch": 0.45944333743701693,
+      "grad_norm": 0.03301722262990158,
+      "learning_rate": 0.0009496130373477039,
+      "loss": 0.614,
+      "step": 9050
+    },
+    {
+      "epoch": 0.4596971735350484,
+      "grad_norm": 0.03076280298803993,
+      "learning_rate": 0.0009495160791015583,
+      "loss": 0.6149,
+      "step": 9055
+    },
+    {
+      "epoch": 0.45995100963307994,
+      "grad_norm": 0.02435240521738851,
+      "learning_rate": 0.0009494190326176479,
+      "loss": 0.5875,
+      "step": 9060
+    },
+    {
+      "epoch": 0.4602048457311114,
+      "grad_norm": 0.036608354215028616,
+      "learning_rate": 0.0009493218979150229,
+      "loss": 0.6492,
+      "step": 9065
+    },
+    {
+      "epoch": 0.4604586818291429,
+      "grad_norm": 0.031052494072487117,
+      "learning_rate": 0.00094922467501275,
+      "loss": 0.6289,
+      "step": 9070
+    },
+    {
+      "epoch": 0.46071251792717444,
+      "grad_norm": 0.05646399870150115,
+      "learning_rate": 0.0009491273639299136,
+      "loss": 0.6317,
+      "step": 9075
+    },
+    {
+      "epoch": 0.4609663540252059,
+      "grad_norm": 0.026516267771592012,
+      "learning_rate": 0.0009490299646856156,
+      "loss": 0.6345,
+      "step": 9080
+    },
+    {
+      "epoch": 0.46122019012323745,
+      "grad_norm": 0.026314766475744072,
+      "learning_rate": 0.0009489324772989747,
+      "loss": 0.5714,
+      "step": 9085
+    },
+    {
+      "epoch": 0.46147402622126893,
+      "grad_norm": 0.04954065910053027,
+      "learning_rate": 0.0009488349017891275,
+      "loss": 0.5893,
+      "step": 9090
+    },
+    {
+      "epoch": 0.4617278623193004,
+      "grad_norm": 0.030999818544982013,
+      "learning_rate": 0.0009487372381752273,
+      "loss": 0.6209,
+      "step": 9095
+    },
+    {
+      "epoch": 0.46198169841733194,
+      "grad_norm": 0.05472893804971776,
+      "learning_rate": 0.0009486394864764452,
+      "loss": 0.6358,
+      "step": 9100
+    },
+    {
+      "epoch": 0.4622355345153634,
+      "grad_norm": 0.03583598972027129,
+      "learning_rate": 0.000948541646711969,
+      "loss": 0.6119,
+      "step": 9105
+    },
+    {
+      "epoch": 0.46248937061339496,
+      "grad_norm": 0.03444527417330176,
+      "learning_rate": 0.0009484437189010047,
+      "loss": 0.6398,
+      "step": 9110
+    },
+    {
+      "epoch": 0.46274320671142644,
+      "grad_norm": 0.03009633415385528,
+      "learning_rate": 0.0009483457030627746,
+      "loss": 0.6811,
+      "step": 9115
+    },
+    {
+      "epoch": 0.4629970428094579,
+      "grad_norm": 0.02932226956272446,
+      "learning_rate": 0.000948247599216519,
+      "loss": 0.65,
+      "step": 9120
+    },
+    {
+      "epoch": 0.46325087890748945,
+      "grad_norm": 0.040643168448799956,
+      "learning_rate": 0.0009481494073814951,
+      "loss": 0.6312,
+      "step": 9125
+    },
+    {
+      "epoch": 0.46350471500552093,
+      "grad_norm": 0.035491703470452655,
+      "learning_rate": 0.0009480511275769773,
+      "loss": 0.6314,
+      "step": 9130
+    },
+    {
+      "epoch": 0.46375855110355246,
+      "grad_norm": 0.030658119230129795,
+      "learning_rate": 0.0009479527598222577,
+      "loss": 0.6384,
+      "step": 9135
+    },
+    {
+      "epoch": 0.46401238720158394,
+      "grad_norm": 0.031569879525829354,
+      "learning_rate": 0.0009478543041366452,
+      "loss": 0.6376,
+      "step": 9140
+    },
+    {
+      "epoch": 0.4642662232996154,
+      "grad_norm": 0.03790647935672875,
+      "learning_rate": 0.0009477557605394664,
+      "loss": 0.6153,
+      "step": 9145
+    },
+    {
+      "epoch": 0.46452005939764696,
+      "grad_norm": 0.030688459561337894,
+      "learning_rate": 0.0009476571290500647,
+      "loss": 0.6538,
+      "step": 9150
+    },
+    {
+      "epoch": 0.46477389549567844,
+      "grad_norm": 0.03266933170402601,
+      "learning_rate": 0.000947558409687801,
+      "loss": 0.642,
+      "step": 9155
+    },
+    {
+      "epoch": 0.4650277315937099,
+      "grad_norm": 0.02719276820532039,
+      "learning_rate": 0.0009474596024720534,
+      "loss": 0.6071,
+      "step": 9160
+    },
+    {
+      "epoch": 0.46528156769174145,
+      "grad_norm": 0.03990484440945978,
+      "learning_rate": 0.0009473607074222172,
+      "loss": 0.6301,
+      "step": 9165
+    },
+    {
+      "epoch": 0.46553540378977293,
+      "grad_norm": 0.030262431248659433,
+      "learning_rate": 0.0009472617245577053,
+      "loss": 0.6482,
+      "step": 9170
+    },
+    {
+      "epoch": 0.46578923988780446,
+      "grad_norm": 0.02733900220464855,
+      "learning_rate": 0.0009471626538979474,
+      "loss": 0.6199,
+      "step": 9175
+    },
+    {
+      "epoch": 0.46604307598583594,
+      "grad_norm": 0.04504748091932876,
+      "learning_rate": 0.0009470634954623905,
+      "loss": 0.596,
+      "step": 9180
+    },
+    {
+      "epoch": 0.4662969120838674,
+      "grad_norm": 0.027370854127821743,
+      "learning_rate": 0.0009469642492704989,
+      "loss": 0.6347,
+      "step": 9185
+    },
+    {
+      "epoch": 0.46655074818189896,
+      "grad_norm": 0.05378088594277385,
+      "learning_rate": 0.0009468649153417542,
+      "loss": 0.6133,
+      "step": 9190
+    },
+    {
+      "epoch": 0.46680458427993043,
+      "grad_norm": 0.03391320932942569,
+      "learning_rate": 0.000946765493695655,
+      "loss": 0.6205,
+      "step": 9195
+    },
+    {
+      "epoch": 0.46705842037796197,
+      "grad_norm": 0.04140473437665591,
+      "learning_rate": 0.0009466659843517176,
+      "loss": 0.6276,
+      "step": 9200
+    },
+    {
+      "epoch": 0.46731225647599345,
+      "grad_norm": 0.02569082334485342,
+      "learning_rate": 0.0009465663873294747,
+      "loss": 0.6059,
+      "step": 9205
+    },
+    {
+      "epoch": 0.4675660925740249,
+      "grad_norm": 0.02578745353589086,
+      "learning_rate": 0.0009464667026484774,
+      "loss": 0.6109,
+      "step": 9210
+    },
+    {
+      "epoch": 0.46781992867205646,
+      "grad_norm": 0.04347854360585143,
+      "learning_rate": 0.0009463669303282927,
+      "loss": 0.6338,
+      "step": 9215
+    },
+    {
+      "epoch": 0.46807376477008794,
+      "grad_norm": 0.03554446657138465,
+      "learning_rate": 0.0009462670703885054,
+      "loss": 0.615,
+      "step": 9220
+    },
+    {
+      "epoch": 0.4683276008681195,
+      "grad_norm": 0.035194592885897366,
+      "learning_rate": 0.0009461671228487181,
+      "loss": 0.6292,
+      "step": 9225
+    },
+    {
+      "epoch": 0.46858143696615095,
+      "grad_norm": 0.048381105428334986,
+      "learning_rate": 0.0009460670877285493,
+      "loss": 0.6208,
+      "step": 9230
+    },
+    {
+      "epoch": 0.46883527306418243,
+      "grad_norm": 0.029440014096268563,
+      "learning_rate": 0.0009459669650476359,
+      "loss": 0.6429,
+      "step": 9235
+    },
+    {
+      "epoch": 0.46908910916221397,
+      "grad_norm": 0.032081756608336384,
+      "learning_rate": 0.0009458667548256312,
+      "loss": 0.6007,
+      "step": 9240
+    },
+    {
+      "epoch": 0.46934294526024545,
+      "grad_norm": 0.029846212941765773,
+      "learning_rate": 0.0009457664570822061,
+      "loss": 0.6394,
+      "step": 9245
+    },
+    {
+      "epoch": 0.469596781358277,
+      "grad_norm": 0.029316590912338028,
+      "learning_rate": 0.0009456660718370484,
+      "loss": 0.6067,
+      "step": 9250
+    },
+    {
+      "epoch": 0.46985061745630846,
+      "grad_norm": 0.0239561828979098,
+      "learning_rate": 0.0009455655991098635,
+      "loss": 0.6099,
+      "step": 9255
+    },
+    {
+      "epoch": 0.47010445355433994,
+      "grad_norm": 0.031644371771562194,
+      "learning_rate": 0.0009454650389203735,
+      "loss": 0.6135,
+      "step": 9260
+    },
+    {
+      "epoch": 0.4703582896523715,
+      "grad_norm": 0.02960336906370349,
+      "learning_rate": 0.0009453643912883179,
+      "loss": 0.5989,
+      "step": 9265
+    },
+    {
+      "epoch": 0.47061212575040295,
+      "grad_norm": 0.026335689130472083,
+      "learning_rate": 0.0009452636562334532,
+      "loss": 0.6412,
+      "step": 9270
+    },
+    {
+      "epoch": 0.4708659618484345,
+      "grad_norm": 0.02648170596691074,
+      "learning_rate": 0.0009451628337755533,
+      "loss": 0.5987,
+      "step": 9275
+    },
+    {
+      "epoch": 0.47111979794646597,
+      "grad_norm": 0.09832497965370343,
+      "learning_rate": 0.0009450619239344094,
+      "loss": 0.6353,
+      "step": 9280
+    },
+    {
+      "epoch": 0.47137363404449745,
+      "grad_norm": 0.04865339323397357,
+      "learning_rate": 0.0009449609267298292,
+      "loss": 0.6184,
+      "step": 9285
+    },
+    {
+      "epoch": 0.471627470142529,
+      "grad_norm": 0.0675079877924424,
+      "learning_rate": 0.000944859842181638,
+      "loss": 0.6434,
+      "step": 9290
+    },
+    {
+      "epoch": 0.47188130624056046,
+      "grad_norm": 0.049608513841766726,
+      "learning_rate": 0.0009447586703096784,
+      "loss": 0.6152,
+      "step": 9295
+    },
+    {
+      "epoch": 0.472135142338592,
+      "grad_norm": 0.027169536333829045,
+      "learning_rate": 0.0009446574111338097,
+      "loss": 0.6311,
+      "step": 9300
+    },
+    {
+      "epoch": 0.4723889784366235,
+      "grad_norm": 0.03714479241574352,
+      "learning_rate": 0.0009445560646739088,
+      "loss": 0.6124,
+      "step": 9305
+    },
+    {
+      "epoch": 0.47264281453465495,
+      "grad_norm": 0.02908329857098156,
+      "learning_rate": 0.0009444546309498693,
+      "loss": 0.5914,
+      "step": 9310
+    },
+    {
+      "epoch": 0.4728966506326865,
+      "grad_norm": 0.040933687722797846,
+      "learning_rate": 0.0009443531099816025,
+      "loss": 0.6233,
+      "step": 9315
+    },
+    {
+      "epoch": 0.47315048673071797,
+      "grad_norm": 0.039379070667730956,
+      "learning_rate": 0.0009442515017890361,
+      "loss": 0.6072,
+      "step": 9320
+    },
+    {
+      "epoch": 0.4734043228287495,
+      "grad_norm": 0.029435204711081575,
+      "learning_rate": 0.0009441498063921152,
+      "loss": 0.6187,
+      "step": 9325
+    },
+    {
+      "epoch": 0.473658158926781,
+      "grad_norm": 0.03446056203368723,
+      "learning_rate": 0.0009440480238108025,
+      "loss": 0.6397,
+      "step": 9330
+    },
+    {
+      "epoch": 0.47391199502481246,
+      "grad_norm": 0.033665244711908196,
+      "learning_rate": 0.000943946154065077,
+      "loss": 0.6008,
+      "step": 9335
+    },
+    {
+      "epoch": 0.474165831122844,
+      "grad_norm": 0.027286057953673223,
+      "learning_rate": 0.0009438441971749354,
+      "loss": 0.6321,
+      "step": 9340
+    },
+    {
+      "epoch": 0.4744196672208755,
+      "grad_norm": 0.027252383518020805,
+      "learning_rate": 0.0009437421531603916,
+      "loss": 0.6052,
+      "step": 9345
+    },
+    {
+      "epoch": 0.474673503318907,
+      "grad_norm": 0.0323130206125324,
+      "learning_rate": 0.0009436400220414758,
+      "loss": 0.5934,
+      "step": 9350
+    },
+    {
+      "epoch": 0.4749273394169385,
+      "grad_norm": 0.05540094012390889,
+      "learning_rate": 0.0009435378038382363,
+      "loss": 0.6281,
+      "step": 9355
+    },
+    {
+      "epoch": 0.47518117551496997,
+      "grad_norm": 0.026971796001700972,
+      "learning_rate": 0.0009434354985707376,
+      "loss": 0.5914,
+      "step": 9360
+    },
+    {
+      "epoch": 0.4754350116130015,
+      "grad_norm": 0.028590512194596497,
+      "learning_rate": 0.0009433331062590621,
+      "loss": 0.5702,
+      "step": 9365
+    },
+    {
+      "epoch": 0.475688847711033,
+      "grad_norm": 0.03025843533876914,
+      "learning_rate": 0.0009432306269233087,
+      "loss": 0.6067,
+      "step": 9370
+    },
+    {
+      "epoch": 0.47594268380906446,
+      "grad_norm": 0.038050502244553336,
+      "learning_rate": 0.0009431280605835937,
+      "loss": 0.5976,
+      "step": 9375
+    },
+    {
+      "epoch": 0.476196519907096,
+      "grad_norm": 0.02875268521706395,
+      "learning_rate": 0.0009430254072600501,
+      "loss": 0.6181,
+      "step": 9380
+    },
+    {
+      "epoch": 0.4764503560051275,
+      "grad_norm": 0.03480214322443527,
+      "learning_rate": 0.0009429226669728285,
+      "loss": 0.5914,
+      "step": 9385
+    },
+    {
+      "epoch": 0.476704192103159,
+      "grad_norm": 0.02601422654788008,
+      "learning_rate": 0.0009428198397420964,
+      "loss": 0.5903,
+      "step": 9390
+    },
+    {
+      "epoch": 0.4769580282011905,
+      "grad_norm": 0.025876539373789892,
+      "learning_rate": 0.0009427169255880379,
+      "loss": 0.6328,
+      "step": 9395
+    },
+    {
+      "epoch": 0.47721186429922197,
+      "grad_norm": 0.04738818931522865,
+      "learning_rate": 0.0009426139245308548,
+      "loss": 0.5819,
+      "step": 9400
+    },
+    {
+      "epoch": 0.4774657003972535,
+      "grad_norm": 0.035112867568037956,
+      "learning_rate": 0.0009425108365907658,
+      "loss": 0.6039,
+      "step": 9405
+    },
+    {
+      "epoch": 0.477719536495285,
+      "grad_norm": 0.03811522461096082,
+      "learning_rate": 0.0009424076617880059,
+      "loss": 0.5912,
+      "step": 9410
+    },
+    {
+      "epoch": 0.4779733725933165,
+      "grad_norm": 0.03389803328245173,
+      "learning_rate": 0.0009423044001428287,
+      "loss": 0.5831,
+      "step": 9415
+    },
+    {
+      "epoch": 0.478227208691348,
+      "grad_norm": 0.03133776562544537,
+      "learning_rate": 0.0009422010516755034,
+      "loss": 0.6577,
+      "step": 9420
+    },
+    {
+      "epoch": 0.4784810447893795,
+      "grad_norm": 0.0269014540427202,
+      "learning_rate": 0.0009420976164063169,
+      "loss": 0.6213,
+      "step": 9425
+    },
+    {
+      "epoch": 0.478734880887411,
+      "grad_norm": 0.02669607689703728,
+      "learning_rate": 0.0009419940943555731,
+      "loss": 0.6164,
+      "step": 9430
+    },
+    {
+      "epoch": 0.4789887169854425,
+      "grad_norm": 0.026824612081548814,
+      "learning_rate": 0.0009418904855435927,
+      "loss": 0.6229,
+      "step": 9435
+    },
+    {
+      "epoch": 0.479242553083474,
+      "grad_norm": 0.046154821680137924,
+      "learning_rate": 0.0009417867899907138,
+      "loss": 0.5931,
+      "step": 9440
+    },
+    {
+      "epoch": 0.4794963891815055,
+      "grad_norm": 0.036033799620862,
+      "learning_rate": 0.0009416830077172911,
+      "loss": 0.6269,
+      "step": 9445
+    },
+    {
+      "epoch": 0.479750225279537,
+      "grad_norm": 0.06210323273721612,
+      "learning_rate": 0.0009415791387436968,
+      "loss": 0.6021,
+      "step": 9450
+    },
+    {
+      "epoch": 0.4800040613775685,
+      "grad_norm": 0.027549653199184822,
+      "learning_rate": 0.0009414751830903195,
+      "loss": 0.6554,
+      "step": 9455
+    },
+    {
+      "epoch": 0.4802578974756,
+      "grad_norm": 0.03550040518296244,
+      "learning_rate": 0.0009413711407775655,
+      "loss": 0.6116,
+      "step": 9460
+    },
+    {
+      "epoch": 0.4805117335736315,
+      "grad_norm": 0.025622868890069674,
+      "learning_rate": 0.0009412670118258578,
+      "loss": 0.6054,
+      "step": 9465
+    },
+    {
+      "epoch": 0.480765569671663,
+      "grad_norm": 0.04189228398723917,
+      "learning_rate": 0.0009411627962556359,
+      "loss": 0.6122,
+      "step": 9470
+    },
+    {
+      "epoch": 0.4810194057696945,
+      "grad_norm": 0.02570476986142007,
+      "learning_rate": 0.0009410584940873574,
+      "loss": 0.6176,
+      "step": 9475
+    },
+    {
+      "epoch": 0.481273241867726,
+      "grad_norm": 0.07031584927871945,
+      "learning_rate": 0.0009409541053414963,
+      "loss": 0.5885,
+      "step": 9480
+    },
+    {
+      "epoch": 0.4815270779657575,
+      "grad_norm": 0.03149142845263447,
+      "learning_rate": 0.000940849630038543,
+      "loss": 0.6325,
+      "step": 9485
+    },
+    {
+      "epoch": 0.48178091406378903,
+      "grad_norm": 0.049358378222708374,
+      "learning_rate": 0.0009407450681990061,
+      "loss": 0.6283,
+      "step": 9490
+    },
+    {
+      "epoch": 0.4820347501618205,
+      "grad_norm": 0.025008692305935783,
+      "learning_rate": 0.0009406404198434102,
+      "loss": 0.6001,
+      "step": 9495
+    },
+    {
+      "epoch": 0.482288586259852,
+      "grad_norm": 0.030799405008239217,
+      "learning_rate": 0.0009405356849922972,
+      "loss": 0.638,
+      "step": 9500
+    },
+    {
+      "epoch": 0.4825424223578835,
+      "grad_norm": 0.024246543364429107,
+      "learning_rate": 0.0009404308636662264,
+      "loss": 0.6356,
+      "step": 9505
+    },
+    {
+      "epoch": 0.482796258455915,
+      "grad_norm": 0.03302243412784637,
+      "learning_rate": 0.0009403259558857734,
+      "loss": 0.6112,
+      "step": 9510
+    },
+    {
+      "epoch": 0.48305009455394654,
+      "grad_norm": 0.023532175082063685,
+      "learning_rate": 0.0009402209616715311,
+      "loss": 0.585,
+      "step": 9515
+    },
+    {
+      "epoch": 0.483303930651978,
+      "grad_norm": 0.04255227744929531,
+      "learning_rate": 0.0009401158810441095,
+      "loss": 0.6327,
+      "step": 9520
+    },
+    {
+      "epoch": 0.4835577667500095,
+      "grad_norm": 0.024840322869465154,
+      "learning_rate": 0.0009400107140241354,
+      "loss": 0.6208,
+      "step": 9525
+    },
+    {
+      "epoch": 0.48381160284804103,
+      "grad_norm": 0.024921227435088424,
+      "learning_rate": 0.0009399054606322524,
+      "loss": 0.6054,
+      "step": 9530
+    },
+    {
+      "epoch": 0.4840654389460725,
+      "grad_norm": 0.031344853057544725,
+      "learning_rate": 0.0009398001208891212,
+      "loss": 0.5989,
+      "step": 9535
+    },
+    {
+      "epoch": 0.48431927504410405,
+      "grad_norm": 0.14707366575213146,
+      "learning_rate": 0.0009396946948154194,
+      "loss": 0.6113,
+      "step": 9540
+    },
+    {
+      "epoch": 0.4845731111421355,
+      "grad_norm": 0.02572297244175927,
+      "learning_rate": 0.0009395891824318421,
+      "loss": 0.6063,
+      "step": 9545
+    },
+    {
+      "epoch": 0.484826947240167,
+      "grad_norm": 0.024708458838109962,
+      "learning_rate": 0.0009394835837591004,
+      "loss": 0.6199,
+      "step": 9550
+    },
+    {
+      "epoch": 0.48508078333819854,
+      "grad_norm": 0.035794765917293574,
+      "learning_rate": 0.0009393778988179229,
+      "loss": 0.635,
+      "step": 9555
+    },
+    {
+      "epoch": 0.48533461943623,
+      "grad_norm": 0.0554244840891106,
+      "learning_rate": 0.0009392721276290549,
+      "loss": 0.6169,
+      "step": 9560
+    },
+    {
+      "epoch": 0.48558845553426155,
+      "grad_norm": 0.029279116423008678,
+      "learning_rate": 0.0009391662702132591,
+      "loss": 0.637,
+      "step": 9565
+    },
+    {
+      "epoch": 0.48584229163229303,
+      "grad_norm": 0.028687460122576846,
+      "learning_rate": 0.0009390603265913145,
+      "loss": 0.6328,
+      "step": 9570
+    },
+    {
+      "epoch": 0.4860961277303245,
+      "grad_norm": 0.03505406613441969,
+      "learning_rate": 0.0009389542967840173,
+      "loss": 0.5973,
+      "step": 9575
+    },
+    {
+      "epoch": 0.48634996382835605,
+      "grad_norm": 0.041803466393629654,
+      "learning_rate": 0.0009388481808121807,
+      "loss": 0.599,
+      "step": 9580
+    },
+    {
+      "epoch": 0.4866037999263875,
+      "grad_norm": 0.023902502916385335,
+      "learning_rate": 0.0009387419786966348,
+      "loss": 0.5804,
+      "step": 9585
+    },
+    {
+      "epoch": 0.486857636024419,
+      "grad_norm": 0.04180433344415987,
+      "learning_rate": 0.0009386356904582265,
+      "loss": 0.6429,
+      "step": 9590
+    },
+    {
+      "epoch": 0.48711147212245054,
+      "grad_norm": 0.037702560242762466,
+      "learning_rate": 0.0009385293161178197,
+      "loss": 0.6352,
+      "step": 9595
+    },
+    {
+      "epoch": 0.487365308220482,
+      "grad_norm": 0.040454587767540365,
+      "learning_rate": 0.0009384228556962949,
+      "loss": 0.617,
+      "step": 9600
+    },
+    {
+      "epoch": 0.48761914431851355,
+      "grad_norm": 0.051660525518630485,
+      "learning_rate": 0.0009383163092145501,
+      "loss": 0.6255,
+      "step": 9605
+    },
+    {
+      "epoch": 0.48787298041654503,
+      "grad_norm": 0.03034861191132385,
+      "learning_rate": 0.0009382096766934996,
+      "loss": 0.6528,
+      "step": 9610
+    },
+    {
+      "epoch": 0.4881268165145765,
+      "grad_norm": 0.03757127293588768,
+      "learning_rate": 0.000938102958154075,
+      "loss": 0.59,
+      "step": 9615
+    },
+    {
+      "epoch": 0.48838065261260805,
+      "grad_norm": 0.07147380234071418,
+      "learning_rate": 0.0009379961536172244,
+      "loss": 0.6392,
+      "step": 9620
+    },
+    {
+      "epoch": 0.4886344887106395,
+      "grad_norm": 0.026977337654578798,
+      "learning_rate": 0.0009378892631039132,
+      "loss": 0.6504,
+      "step": 9625
+    },
+    {
+      "epoch": 0.48888832480867106,
+      "grad_norm": 0.04626336090648292,
+      "learning_rate": 0.0009377822866351235,
+      "loss": 0.651,
+      "step": 9630
+    },
+    {
+      "epoch": 0.48914216090670254,
+      "grad_norm": 0.06597352026364534,
+      "learning_rate": 0.000937675224231854,
+      "loss": 0.6051,
+      "step": 9635
+    },
+    {
+      "epoch": 0.489395997004734,
+      "grad_norm": 0.04524993145754299,
+      "learning_rate": 0.0009375680759151206,
+      "loss": 0.6247,
+      "step": 9640
+    },
+    {
+      "epoch": 0.48964983310276555,
+      "grad_norm": 0.034280234478550185,
+      "learning_rate": 0.0009374608417059562,
+      "loss": 0.612,
+      "step": 9645
+    },
+    {
+      "epoch": 0.48990366920079703,
+      "grad_norm": 0.02631497433619995,
+      "learning_rate": 0.0009373535216254101,
+      "loss": 0.612,
+      "step": 9650
+    },
+    {
+      "epoch": 0.49015750529882857,
+      "grad_norm": 0.03832792024810497,
+      "learning_rate": 0.0009372461156945489,
+      "loss": 0.6249,
+      "step": 9655
+    },
+    {
+      "epoch": 0.49041134139686005,
+      "grad_norm": 0.033364201876492666,
+      "learning_rate": 0.0009371386239344557,
+      "loss": 0.637,
+      "step": 9660
+    },
+    {
+      "epoch": 0.4906651774948915,
+      "grad_norm": 0.03219096028559519,
+      "learning_rate": 0.0009370310463662306,
+      "loss": 0.6425,
+      "step": 9665
+    },
+    {
+      "epoch": 0.49091901359292306,
+      "grad_norm": 0.024519457484542897,
+      "learning_rate": 0.0009369233830109905,
+      "loss": 0.5807,
+      "step": 9670
+    },
+    {
+      "epoch": 0.49117284969095454,
+      "grad_norm": 0.0341270179716214,
+      "learning_rate": 0.0009368156338898694,
+      "loss": 0.6251,
+      "step": 9675
+    },
+    {
+      "epoch": 0.4914266857889861,
+      "grad_norm": 0.024071390232175503,
+      "learning_rate": 0.0009367077990240176,
+      "loss": 0.5962,
+      "step": 9680
+    },
+    {
+      "epoch": 0.49168052188701755,
+      "grad_norm": 0.02743756336479237,
+      "learning_rate": 0.0009365998784346028,
+      "loss": 0.6005,
+      "step": 9685
+    },
+    {
+      "epoch": 0.49193435798504903,
+      "grad_norm": 0.04122709532915945,
+      "learning_rate": 0.0009364918721428093,
+      "loss": 0.5867,
+      "step": 9690
+    },
+    {
+      "epoch": 0.49218819408308057,
+      "grad_norm": 0.02721674730288296,
+      "learning_rate": 0.0009363837801698379,
+      "loss": 0.62,
+      "step": 9695
+    },
+    {
+      "epoch": 0.49244203018111204,
+      "grad_norm": 0.04100687725132822,
+      "learning_rate": 0.0009362756025369067,
+      "loss": 0.6184,
+      "step": 9700
+    },
+    {
+      "epoch": 0.4926958662791436,
+      "grad_norm": 0.023552301564775294,
+      "learning_rate": 0.0009361673392652505,
+      "loss": 0.5772,
+      "step": 9705
+    },
+    {
+      "epoch": 0.49294970237717506,
+      "grad_norm": 0.043765850357777386,
+      "learning_rate": 0.0009360589903761208,
+      "loss": 0.5763,
+      "step": 9710
+    },
+    {
+      "epoch": 0.49320353847520654,
+      "grad_norm": 0.0377192420219829,
+      "learning_rate": 0.0009359505558907857,
+      "loss": 0.5907,
+      "step": 9715
+    },
+    {
+      "epoch": 0.4934573745732381,
+      "grad_norm": 0.04173445599728607,
+      "learning_rate": 0.0009358420358305307,
+      "loss": 0.6209,
+      "step": 9720
+    },
+    {
+      "epoch": 0.49371121067126955,
+      "grad_norm": 0.02795345786794999,
+      "learning_rate": 0.0009357334302166577,
+      "loss": 0.6139,
+      "step": 9725
+    },
+    {
+      "epoch": 0.4939650467693011,
+      "grad_norm": 0.030534586657452248,
+      "learning_rate": 0.0009356247390704853,
+      "loss": 0.6334,
+      "step": 9730
+    },
+    {
+      "epoch": 0.49421888286733257,
+      "grad_norm": 0.024191524057541644,
+      "learning_rate": 0.0009355159624133489,
+      "loss": 0.5692,
+      "step": 9735
+    },
+    {
+      "epoch": 0.49447271896536404,
+      "grad_norm": 0.03323609638954307,
+      "learning_rate": 0.0009354071002666011,
+      "loss": 0.6106,
+      "step": 9740
+    },
+    {
+      "epoch": 0.4947265550633956,
+      "grad_norm": 0.024949282216645566,
+      "learning_rate": 0.000935298152651611,
+      "loss": 0.5978,
+      "step": 9745
+    },
+    {
+      "epoch": 0.49498039116142706,
+      "grad_norm": 0.06663669858744471,
+      "learning_rate": 0.0009351891195897644,
+      "loss": 0.623,
+      "step": 9750
+    },
+    {
+      "epoch": 0.4952342272594586,
+      "grad_norm": 0.032099026906883724,
+      "learning_rate": 0.0009350800011024636,
+      "loss": 0.6189,
+      "step": 9755
+    },
+    {
+      "epoch": 0.49548806335749007,
+      "grad_norm": 0.0663218764773123,
+      "learning_rate": 0.0009349707972111285,
+      "loss": 0.6074,
+      "step": 9760
+    },
+    {
+      "epoch": 0.49574189945552155,
+      "grad_norm": 0.025197965203499328,
+      "learning_rate": 0.0009348615079371952,
+      "loss": 0.5815,
+      "step": 9765
+    },
+    {
+      "epoch": 0.4959957355535531,
+      "grad_norm": 0.04923718941099546,
+      "learning_rate": 0.0009347521333021165,
+      "loss": 0.6104,
+      "step": 9770
+    },
+    {
+      "epoch": 0.49624957165158456,
+      "grad_norm": 0.024233669045321673,
+      "learning_rate": 0.000934642673327362,
+      "loss": 0.5966,
+      "step": 9775
+    },
+    {
+      "epoch": 0.4965034077496161,
+      "grad_norm": 0.02630064833029989,
+      "learning_rate": 0.0009345331280344184,
+      "loss": 0.6308,
+      "step": 9780
+    },
+    {
+      "epoch": 0.4967572438476476,
+      "grad_norm": 0.03733420897200894,
+      "learning_rate": 0.0009344234974447888,
+      "loss": 0.5984,
+      "step": 9785
+    },
+    {
+      "epoch": 0.49701107994567906,
+      "grad_norm": 0.03288604787671546,
+      "learning_rate": 0.0009343137815799931,
+      "loss": 0.6278,
+      "step": 9790
+    },
+    {
+      "epoch": 0.4972649160437106,
+      "grad_norm": 0.02870697067749721,
+      "learning_rate": 0.000934203980461568,
+      "loss": 0.5907,
+      "step": 9795
+    },
+    {
+      "epoch": 0.49751875214174207,
+      "grad_norm": 0.029210706904460447,
+      "learning_rate": 0.0009340940941110669,
+      "loss": 0.623,
+      "step": 9800
+    },
+    {
+      "epoch": 0.4977725882397736,
+      "grad_norm": 0.022650840666329053,
+      "learning_rate": 0.00093398412255006,
+      "loss": 0.585,
+      "step": 9805
+    },
+    {
+      "epoch": 0.4980264243378051,
+      "grad_norm": 0.033968761190204845,
+      "learning_rate": 0.000933874065800134,
+      "loss": 0.6272,
+      "step": 9810
+    },
+    {
+      "epoch": 0.49828026043583656,
+      "grad_norm": 0.03225151989127355,
+      "learning_rate": 0.0009337639238828927,
+      "loss": 0.5957,
+      "step": 9815
+    },
+    {
+      "epoch": 0.4985340965338681,
+      "grad_norm": 0.03766789630190195,
+      "learning_rate": 0.0009336536968199562,
+      "loss": 0.5961,
+      "step": 9820
+    },
+    {
+      "epoch": 0.4987879326318996,
+      "grad_norm": 0.024291519990984625,
+      "learning_rate": 0.0009335433846329618,
+      "loss": 0.5848,
+      "step": 9825
+    },
+    {
+      "epoch": 0.49904176872993106,
+      "grad_norm": 1.1081797676644811,
+      "learning_rate": 0.000933432987343563,
+      "loss": 0.6314,
+      "step": 9830
+    },
+    {
+      "epoch": 0.4992956048279626,
+      "grad_norm": 0.06216658843899505,
+      "learning_rate": 0.0009333225049734303,
+      "loss": 0.6137,
+      "step": 9835
+    },
+    {
+      "epoch": 0.49954944092599407,
+      "grad_norm": 0.06842641946547982,
+      "learning_rate": 0.0009332119375442509,
+      "loss": 0.6494,
+      "step": 9840
+    },
+    {
+      "epoch": 0.4998032770240256,
+      "grad_norm": 0.07771247073389582,
+      "learning_rate": 0.0009331012850777286,
+      "loss": 0.6461,
+      "step": 9845
+    },
+    {
+      "epoch": 0.5000571131220571,
+      "grad_norm": 0.09586623316765225,
+      "learning_rate": 0.0009329905475955838,
+      "loss": 0.6127,
+      "step": 9850
+    },
+    {
+      "epoch": 0.5003109492200886,
+      "grad_norm": 0.0549907807273626,
+      "learning_rate": 0.0009328797251195539,
+      "loss": 0.6336,
+      "step": 9855
+    },
+    {
+      "epoch": 0.5005647853181201,
+      "grad_norm": 0.03707374666368335,
+      "learning_rate": 0.0009327688176713927,
+      "loss": 0.6643,
+      "step": 9860
+    },
+    {
+      "epoch": 0.5008186214161516,
+      "grad_norm": 0.06451159664108401,
+      "learning_rate": 0.0009326578252728708,
+      "loss": 0.6221,
+      "step": 9865
+    },
+    {
+      "epoch": 0.5010724575141831,
+      "grad_norm": 0.07176924497990926,
+      "learning_rate": 0.0009325467479457754,
+      "loss": 0.6478,
+      "step": 9870
+    },
+    {
+      "epoch": 0.5013262936122146,
+      "grad_norm": 0.05337950431785672,
+      "learning_rate": 0.0009324355857119106,
+      "loss": 0.6161,
+      "step": 9875
+    },
+    {
+      "epoch": 0.5015801297102461,
+      "grad_norm": 0.040267440416152635,
+      "learning_rate": 0.0009323243385930968,
+      "loss": 0.6131,
+      "step": 9880
+    },
+    {
+      "epoch": 0.5018339658082775,
+      "grad_norm": 0.03271936843258226,
+      "learning_rate": 0.0009322130066111713,
+      "loss": 0.662,
+      "step": 9885
+    },
+    {
+      "epoch": 0.5020878019063091,
+      "grad_norm": 0.029561308985711136,
+      "learning_rate": 0.0009321015897879883,
+      "loss": 0.6276,
+      "step": 9890
+    },
+    {
+      "epoch": 0.5023416380043406,
+      "grad_norm": 0.053789724310795484,
+      "learning_rate": 0.0009319900881454179,
+      "loss": 0.6598,
+      "step": 9895
+    },
+    {
+      "epoch": 0.5025954741023722,
+      "grad_norm": 0.040676726302224304,
+      "learning_rate": 0.0009318785017053475,
+      "loss": 0.6075,
+      "step": 9900
+    },
+    {
+      "epoch": 0.5028493102004036,
+      "grad_norm": 0.04707911068377355,
+      "learning_rate": 0.0009317668304896811,
+      "loss": 0.6384,
+      "step": 9905
+    },
+    {
+      "epoch": 0.5031031462984351,
+      "grad_norm": 0.02729704500716682,
+      "learning_rate": 0.000931655074520339,
+      "loss": 0.6042,
+      "step": 9910
+    },
+    {
+      "epoch": 0.5033569823964666,
+      "grad_norm": 0.041817519847404235,
+      "learning_rate": 0.0009315432338192584,
+      "loss": 0.6407,
+      "step": 9915
+    },
+    {
+      "epoch": 0.5036108184944981,
+      "grad_norm": 0.06693004074647073,
+      "learning_rate": 0.0009314313084083933,
+      "loss": 0.6267,
+      "step": 9920
+    },
+    {
+      "epoch": 0.5038646545925296,
+      "grad_norm": 0.037734517481965976,
+      "learning_rate": 0.0009313192983097137,
+      "loss": 0.6235,
+      "step": 9925
+    },
+    {
+      "epoch": 0.5041184906905611,
+      "grad_norm": 0.04441041670706489,
+      "learning_rate": 0.0009312072035452069,
+      "loss": 0.6722,
+      "step": 9930
+    },
+    {
+      "epoch": 0.5043723267885926,
+      "grad_norm": 0.06966465373395124,
+      "learning_rate": 0.0009310950241368765,
+      "loss": 0.6524,
+      "step": 9935
+    },
+    {
+      "epoch": 0.5046261628866241,
+      "grad_norm": 0.11851335463883052,
+      "learning_rate": 0.0009309827601067428,
+      "loss": 0.6386,
+      "step": 9940
+    },
+    {
+      "epoch": 0.5048799989846556,
+      "grad_norm": 0.044597706923391586,
+      "learning_rate": 0.0009308704114768425,
+      "loss": 0.6495,
+      "step": 9945
+    },
+    {
+      "epoch": 0.505133835082687,
+      "grad_norm": 0.048673205080514974,
+      "learning_rate": 0.0009307579782692291,
+      "loss": 0.6183,
+      "step": 9950
+    },
+    {
+      "epoch": 0.5053876711807186,
+      "grad_norm": 0.03281635254658095,
+      "learning_rate": 0.0009306454605059729,
+      "loss": 0.6426,
+      "step": 9955
+    },
+    {
+      "epoch": 0.5056415072787501,
+      "grad_norm": 0.046125277826759015,
+      "learning_rate": 0.0009305328582091603,
+      "loss": 0.6343,
+      "step": 9960
+    },
+    {
+      "epoch": 0.5058953433767817,
+      "grad_norm": 0.03821293529358294,
+      "learning_rate": 0.0009304201714008948,
+      "loss": 0.6326,
+      "step": 9965
+    },
+    {
+      "epoch": 0.5061491794748131,
+      "grad_norm": 0.03994101386720053,
+      "learning_rate": 0.0009303074001032961,
+      "loss": 0.6243,
+      "step": 9970
+    },
+    {
+      "epoch": 0.5064030155728446,
+      "grad_norm": 0.04022586000406283,
+      "learning_rate": 0.0009301945443385007,
+      "loss": 0.6491,
+      "step": 9975
+    },
+    {
+      "epoch": 0.5066568516708762,
+      "grad_norm": 0.045502011916775865,
+      "learning_rate": 0.0009300816041286617,
+      "loss": 0.6329,
+      "step": 9980
+    },
+    {
+      "epoch": 0.5069106877689076,
+      "grad_norm": 0.031196383196199537,
+      "learning_rate": 0.0009299685794959485,
+      "loss": 0.6071,
+      "step": 9985
+    },
+    {
+      "epoch": 0.5071645238669391,
+      "grad_norm": 0.05198326476515164,
+      "learning_rate": 0.0009298554704625474,
+      "loss": 0.6269,
+      "step": 9990
+    },
+    {
+      "epoch": 0.5074183599649706,
+      "grad_norm": 0.044150229220093255,
+      "learning_rate": 0.0009297422770506613,
+      "loss": 0.6137,
+      "step": 9995
+    },
+    {
+      "epoch": 0.5076721960630021,
+      "grad_norm": 0.03554285265944353,
+      "learning_rate": 0.0009296289992825091,
+      "loss": 0.6603,
+      "step": 10000
+    },
+    {
+      "epoch": 0.5079260321610336,
+      "grad_norm": 0.02972651584300317,
+      "learning_rate": 0.0009295156371803271,
+      "loss": 0.6034,
+      "step": 10005
+    },
+    {
+      "epoch": 0.5081798682590651,
+      "grad_norm": 0.04646869581333915,
+      "learning_rate": 0.0009294021907663674,
+      "loss": 0.6213,
+      "step": 10010
+    },
+    {
+      "epoch": 0.5084337043570967,
+      "grad_norm": 0.04166905651002559,
+      "learning_rate": 0.0009292886600628991,
+      "loss": 0.6103,
+      "step": 10015
+    },
+    {
+      "epoch": 0.5086875404551281,
+      "grad_norm": 0.05922446812891187,
+      "learning_rate": 0.0009291750450922078,
+      "loss": 0.6225,
+      "step": 10020
+    },
+    {
+      "epoch": 0.5089413765531596,
+      "grad_norm": 0.024191658630679954,
+      "learning_rate": 0.0009290613458765953,
+      "loss": 0.6064,
+      "step": 10025
+    },
+    {
+      "epoch": 0.5091952126511912,
+      "grad_norm": 0.06362952277601344,
+      "learning_rate": 0.0009289475624383804,
+      "loss": 0.6077,
+      "step": 10030
+    },
+    {
+      "epoch": 0.5094490487492226,
+      "grad_norm": 0.040791426024977054,
+      "learning_rate": 0.0009288336947998981,
+      "loss": 0.6121,
+      "step": 10035
+    },
+    {
+      "epoch": 0.5097028848472541,
+      "grad_norm": 0.07317831667654075,
+      "learning_rate": 0.0009287197429835002,
+      "loss": 0.6198,
+      "step": 10040
+    },
+    {
+      "epoch": 0.5099567209452857,
+      "grad_norm": 0.031549492144105,
+      "learning_rate": 0.0009286057070115545,
+      "loss": 0.6359,
+      "step": 10045
+    },
+    {
+      "epoch": 0.5102105570433171,
+      "grad_norm": 0.047125852190035775,
+      "learning_rate": 0.0009284915869064463,
+      "loss": 0.6185,
+      "step": 10050
+    },
+    {
+      "epoch": 0.5104643931413486,
+      "grad_norm": 0.03542009861223921,
+      "learning_rate": 0.0009283773826905764,
+      "loss": 0.6187,
+      "step": 10055
+    },
+    {
+      "epoch": 0.5107182292393802,
+      "grad_norm": 0.026666894145488004,
+      "learning_rate": 0.0009282630943863625,
+      "loss": 0.6106,
+      "step": 10060
+    },
+    {
+      "epoch": 0.5109720653374117,
+      "grad_norm": 0.05545034501844204,
+      "learning_rate": 0.0009281487220162388,
+      "loss": 0.6283,
+      "step": 10065
+    },
+    {
+      "epoch": 0.5112259014354431,
+      "grad_norm": 0.02963889637887912,
+      "learning_rate": 0.0009280342656026564,
+      "loss": 0.6136,
+      "step": 10070
+    },
+    {
+      "epoch": 0.5114797375334746,
+      "grad_norm": 0.05901358817696739,
+      "learning_rate": 0.0009279197251680822,
+      "loss": 0.6145,
+      "step": 10075
+    },
+    {
+      "epoch": 0.5117335736315062,
+      "grad_norm": 0.040264054246851005,
+      "learning_rate": 0.000927805100735,
+      "loss": 0.6421,
+      "step": 10080
+    },
+    {
+      "epoch": 0.5119874097295376,
+      "grad_norm": 0.0649111774735333,
+      "learning_rate": 0.0009276903923259099,
+      "loss": 0.6036,
+      "step": 10085
+    },
+    {
+      "epoch": 0.5122412458275691,
+      "grad_norm": 0.037817792224739064,
+      "learning_rate": 0.0009275755999633286,
+      "loss": 0.6248,
+      "step": 10090
+    },
+    {
+      "epoch": 0.5124950819256007,
+      "grad_norm": 0.03397505282281072,
+      "learning_rate": 0.0009274607236697895,
+      "loss": 0.612,
+      "step": 10095
+    },
+    {
+      "epoch": 0.5127489180236321,
+      "grad_norm": 0.05208015537649028,
+      "learning_rate": 0.000927345763467842,
+      "loss": 0.6096,
+      "step": 10100
+    },
+    {
+      "epoch": 0.5130027541216636,
+      "grad_norm": 0.08839766979888053,
+      "learning_rate": 0.0009272307193800524,
+      "loss": 0.6074,
+      "step": 10105
+    },
+    {
+      "epoch": 0.5132565902196952,
+      "grad_norm": 0.03841799364281357,
+      "learning_rate": 0.000927115591429003,
+      "loss": 0.6152,
+      "step": 10110
+    },
+    {
+      "epoch": 0.5135104263177267,
+      "grad_norm": 0.03045633762227496,
+      "learning_rate": 0.0009270003796372933,
+      "loss": 0.6044,
+      "step": 10115
+    },
+    {
+      "epoch": 0.5137642624157581,
+      "grad_norm": 0.02569055323506306,
+      "learning_rate": 0.0009268850840275382,
+      "loss": 0.6355,
+      "step": 10120
+    },
+    {
+      "epoch": 0.5140180985137897,
+      "grad_norm": 0.026408037229659734,
+      "learning_rate": 0.0009267697046223702,
+      "loss": 0.6462,
+      "step": 10125
+    },
+    {
+      "epoch": 0.5142719346118212,
+      "grad_norm": 0.025909580463186872,
+      "learning_rate": 0.0009266542414444374,
+      "loss": 0.5864,
+      "step": 10130
+    },
+    {
+      "epoch": 0.5145257707098526,
+      "grad_norm": 0.03722734324648191,
+      "learning_rate": 0.0009265386945164049,
+      "loss": 0.6161,
+      "step": 10135
+    },
+    {
+      "epoch": 0.5147796068078841,
+      "grad_norm": 0.03566709272968282,
+      "learning_rate": 0.0009264230638609535,
+      "loss": 0.6085,
+      "step": 10140
+    },
+    {
+      "epoch": 0.5150334429059157,
+      "grad_norm": 0.06277602947724216,
+      "learning_rate": 0.0009263073495007814,
+      "loss": 0.6338,
+      "step": 10145
+    },
+    {
+      "epoch": 0.5152872790039471,
+      "grad_norm": 0.058049067673817435,
+      "learning_rate": 0.0009261915514586026,
+      "loss": 0.6555,
+      "step": 10150
+    },
+    {
+      "epoch": 0.5155411151019786,
+      "grad_norm": 0.03795674743934634,
+      "learning_rate": 0.0009260756697571477,
+      "loss": 0.6693,
+      "step": 10155
+    },
+    {
+      "epoch": 0.5157949512000102,
+      "grad_norm": 0.04777864042916011,
+      "learning_rate": 0.0009259597044191636,
+      "loss": 0.6382,
+      "step": 10160
+    },
+    {
+      "epoch": 0.5160487872980416,
+      "grad_norm": 0.03739573996244197,
+      "learning_rate": 0.0009258436554674137,
+      "loss": 0.638,
+      "step": 10165
+    },
+    {
+      "epoch": 0.5163026233960731,
+      "grad_norm": 0.028436191862654784,
+      "learning_rate": 0.000925727522924678,
+      "loss": 0.6003,
+      "step": 10170
+    },
+    {
+      "epoch": 0.5165564594941047,
+      "grad_norm": 0.03204307790523904,
+      "learning_rate": 0.0009256113068137526,
+      "loss": 0.6642,
+      "step": 10175
+    },
+    {
+      "epoch": 0.5168102955921362,
+      "grad_norm": 0.055050382059251654,
+      "learning_rate": 0.0009254950071574502,
+      "loss": 0.6275,
+      "step": 10180
+    },
+    {
+      "epoch": 0.5170641316901676,
+      "grad_norm": 0.026441208032810616,
+      "learning_rate": 0.0009253786239785999,
+      "loss": 0.6071,
+      "step": 10185
+    },
+    {
+      "epoch": 0.5173179677881992,
+      "grad_norm": 0.035768658322057015,
+      "learning_rate": 0.0009252621573000472,
+      "loss": 0.6499,
+      "step": 10190
+    },
+    {
+      "epoch": 0.5175718038862307,
+      "grad_norm": 0.024019701385016478,
+      "learning_rate": 0.0009251456071446536,
+      "loss": 0.5909,
+      "step": 10195
+    },
+    {
+      "epoch": 0.5178256399842621,
+      "grad_norm": 0.0261249278412097,
+      "learning_rate": 0.0009250289735352975,
+      "loss": 0.6388,
+      "step": 10200
+    },
+    {
+      "epoch": 0.5180794760822937,
+      "grad_norm": 0.05688851509224552,
+      "learning_rate": 0.0009249122564948736,
+      "loss": 0.6392,
+      "step": 10205
+    },
+    {
+      "epoch": 0.5183333121803252,
+      "grad_norm": 0.02773345047952887,
+      "learning_rate": 0.0009247954560462928,
+      "loss": 0.6311,
+      "step": 10210
+    },
+    {
+      "epoch": 0.5185871482783566,
+      "grad_norm": 0.0270974270992603,
+      "learning_rate": 0.0009246785722124823,
+      "loss": 0.6285,
+      "step": 10215
+    },
+    {
+      "epoch": 0.5188409843763881,
+      "grad_norm": 0.03094525704725652,
+      "learning_rate": 0.0009245616050163861,
+      "loss": 0.6084,
+      "step": 10220
+    },
+    {
+      "epoch": 0.5190948204744197,
+      "grad_norm": 0.0302889231805603,
+      "learning_rate": 0.000924444554480964,
+      "loss": 0.6384,
+      "step": 10225
+    },
+    {
+      "epoch": 0.5193486565724512,
+      "grad_norm": 0.053230977077894856,
+      "learning_rate": 0.0009243274206291926,
+      "loss": 0.5947,
+      "step": 10230
+    },
+    {
+      "epoch": 0.5196024926704826,
+      "grad_norm": 0.06540135430875998,
+      "learning_rate": 0.0009242102034840647,
+      "loss": 0.6397,
+      "step": 10235
+    },
+    {
+      "epoch": 0.5198563287685142,
+      "grad_norm": 0.026623481542029606,
+      "learning_rate": 0.0009240929030685893,
+      "loss": 0.6642,
+      "step": 10240
+    },
+    {
+      "epoch": 0.5201101648665457,
+      "grad_norm": 0.034876098813892115,
+      "learning_rate": 0.0009239755194057921,
+      "loss": 0.6457,
+      "step": 10245
+    },
+    {
+      "epoch": 0.5203640009645771,
+      "grad_norm": 0.03378616222313645,
+      "learning_rate": 0.0009238580525187146,
+      "loss": 0.5793,
+      "step": 10250
+    },
+    {
+      "epoch": 0.5206178370626087,
+      "grad_norm": 0.1052756526973527,
+      "learning_rate": 0.0009237405024304153,
+      "loss": 0.6458,
+      "step": 10255
+    },
+    {
+      "epoch": 0.5208716731606402,
+      "grad_norm": 0.03839663563882219,
+      "learning_rate": 0.0009236228691639686,
+      "loss": 0.665,
+      "step": 10260
+    },
+    {
+      "epoch": 0.5211255092586716,
+      "grad_norm": 0.033097916077724686,
+      "learning_rate": 0.0009235051527424652,
+      "loss": 0.6032,
+      "step": 10265
+    },
+    {
+      "epoch": 0.5213793453567032,
+      "grad_norm": 0.04114159903818375,
+      "learning_rate": 0.0009233873531890123,
+      "loss": 0.6592,
+      "step": 10270
+    },
+    {
+      "epoch": 0.5216331814547347,
+      "grad_norm": 0.059483637355810894,
+      "learning_rate": 0.0009232694705267335,
+      "loss": 0.6168,
+      "step": 10275
+    },
+    {
+      "epoch": 0.5218870175527662,
+      "grad_norm": 0.03912192748754646,
+      "learning_rate": 0.0009231515047787686,
+      "loss": 0.6632,
+      "step": 10280
+    },
+    {
+      "epoch": 0.5221408536507977,
+      "grad_norm": 0.03455355947863813,
+      "learning_rate": 0.0009230334559682734,
+      "loss": 0.6174,
+      "step": 10285
+    },
+    {
+      "epoch": 0.5223946897488292,
+      "grad_norm": 0.06201615106125605,
+      "learning_rate": 0.0009229153241184204,
+      "loss": 0.6291,
+      "step": 10290
+    },
+    {
+      "epoch": 0.5226485258468607,
+      "grad_norm": 0.03866401919091889,
+      "learning_rate": 0.0009227971092523983,
+      "loss": 0.6441,
+      "step": 10295
+    },
+    {
+      "epoch": 0.5229023619448921,
+      "grad_norm": 0.04274992026640168,
+      "learning_rate": 0.0009226788113934123,
+      "loss": 0.6226,
+      "step": 10300
+    },
+    {
+      "epoch": 0.5231561980429237,
+      "grad_norm": 0.050672917058466604,
+      "learning_rate": 0.0009225604305646835,
+      "loss": 0.6336,
+      "step": 10305
+    },
+    {
+      "epoch": 0.5234100341409552,
+      "grad_norm": 0.03842833678462949,
+      "learning_rate": 0.0009224419667894495,
+      "loss": 0.6085,
+      "step": 10310
+    },
+    {
+      "epoch": 0.5236638702389866,
+      "grad_norm": 0.038015665579822645,
+      "learning_rate": 0.000922323420090964,
+      "loss": 0.6214,
+      "step": 10315
+    },
+    {
+      "epoch": 0.5239177063370182,
+      "grad_norm": 0.03004579581769112,
+      "learning_rate": 0.0009222047904924975,
+      "loss": 0.5919,
+      "step": 10320
+    },
+    {
+      "epoch": 0.5241715424350497,
+      "grad_norm": 0.03190679818928855,
+      "learning_rate": 0.000922086078017336,
+      "loss": 0.6159,
+      "step": 10325
+    },
+    {
+      "epoch": 0.5244253785330812,
+      "grad_norm": 0.029075752083056672,
+      "learning_rate": 0.0009219672826887824,
+      "loss": 0.5941,
+      "step": 10330
+    },
+    {
+      "epoch": 0.5246792146311127,
+      "grad_norm": 0.034715107108026645,
+      "learning_rate": 0.0009218484045301554,
+      "loss": 0.6209,
+      "step": 10335
+    },
+    {
+      "epoch": 0.5249330507291442,
+      "grad_norm": 0.02962263191628107,
+      "learning_rate": 0.0009217294435647905,
+      "loss": 0.6439,
+      "step": 10340
+    },
+    {
+      "epoch": 0.5251868868271757,
+      "grad_norm": 0.0357418067321709,
+      "learning_rate": 0.0009216103998160389,
+      "loss": 0.6227,
+      "step": 10345
+    },
+    {
+      "epoch": 0.5254407229252072,
+      "grad_norm": 0.05508757844826437,
+      "learning_rate": 0.0009214912733072685,
+      "loss": 0.618,
+      "step": 10350
+    },
+    {
+      "epoch": 0.5256945590232387,
+      "grad_norm": 0.04548692629351668,
+      "learning_rate": 0.0009213720640618631,
+      "loss": 0.6661,
+      "step": 10355
+    },
+    {
+      "epoch": 0.5259483951212702,
+      "grad_norm": 0.03260375865114965,
+      "learning_rate": 0.0009212527721032226,
+      "loss": 0.6416,
+      "step": 10360
+    },
+    {
+      "epoch": 0.5262022312193017,
+      "grad_norm": 0.029374475221709984,
+      "learning_rate": 0.000921133397454764,
+      "loss": 0.5965,
+      "step": 10365
+    },
+    {
+      "epoch": 0.5264560673173332,
+      "grad_norm": 0.025766929716711365,
+      "learning_rate": 0.0009210139401399197,
+      "loss": 0.64,
+      "step": 10370
+    },
+    {
+      "epoch": 0.5267099034153647,
+      "grad_norm": 0.028038442454314576,
+      "learning_rate": 0.0009208944001821384,
+      "loss": 0.6088,
+      "step": 10375
+    },
+    {
+      "epoch": 0.5269637395133961,
+      "grad_norm": 0.052489144282257295,
+      "learning_rate": 0.0009207747776048855,
+      "loss": 0.6408,
+      "step": 10380
+    },
+    {
+      "epoch": 0.5272175756114277,
+      "grad_norm": 0.03444955545077113,
+      "learning_rate": 0.000920655072431642,
+      "loss": 0.5995,
+      "step": 10385
+    },
+    {
+      "epoch": 0.5274714117094592,
+      "grad_norm": 0.03088222308172369,
+      "learning_rate": 0.0009205352846859056,
+      "loss": 0.6194,
+      "step": 10390
+    },
+    {
+      "epoch": 0.5277252478074907,
+      "grad_norm": 0.027998757741213697,
+      "learning_rate": 0.0009204154143911903,
+      "loss": 0.63,
+      "step": 10395
+    },
+    {
+      "epoch": 0.5279790839055222,
+      "grad_norm": 0.04158454624606828,
+      "learning_rate": 0.0009202954615710256,
+      "loss": 0.5983,
+      "step": 10400
+    },
+    {
+      "epoch": 0.5282329200035537,
+      "grad_norm": 0.0453153646691045,
+      "learning_rate": 0.0009201754262489575,
+      "loss": 0.6328,
+      "step": 10405
+    },
+    {
+      "epoch": 0.5284867561015852,
+      "grad_norm": 0.04473664568912869,
+      "learning_rate": 0.0009200553084485491,
+      "loss": 0.6301,
+      "step": 10410
+    },
+    {
+      "epoch": 0.5287405921996167,
+      "grad_norm": 0.0382431324349534,
+      "learning_rate": 0.0009199351081933781,
+      "loss": 0.6,
+      "step": 10415
+    },
+    {
+      "epoch": 0.5289944282976482,
+      "grad_norm": 0.039893358245096724,
+      "learning_rate": 0.0009198148255070398,
+      "loss": 0.5886,
+      "step": 10420
+    },
+    {
+      "epoch": 0.5292482643956797,
+      "grad_norm": 0.058211501320811744,
+      "learning_rate": 0.0009196944604131448,
+      "loss": 0.5985,
+      "step": 10425
+    },
+    {
+      "epoch": 0.5295021004937112,
+      "grad_norm": 0.03851407689786308,
+      "learning_rate": 0.0009195740129353202,
+      "loss": 0.5908,
+      "step": 10430
+    },
+    {
+      "epoch": 0.5297559365917427,
+      "grad_norm": 0.038851954860420396,
+      "learning_rate": 0.0009194534830972092,
+      "loss": 0.6391,
+      "step": 10435
+    },
+    {
+      "epoch": 0.5300097726897742,
+      "grad_norm": 0.026937948392526747,
+      "learning_rate": 0.0009193328709224714,
+      "loss": 0.6123,
+      "step": 10440
+    },
+    {
+      "epoch": 0.5302636087878058,
+      "grad_norm": 0.036109446414125626,
+      "learning_rate": 0.0009192121764347822,
+      "loss": 0.5954,
+      "step": 10445
+    },
+    {
+      "epoch": 0.5305174448858372,
+      "grad_norm": 0.03316804876731099,
+      "learning_rate": 0.0009190913996578334,
+      "loss": 0.6145,
+      "step": 10450
+    },
+    {
+      "epoch": 0.5307712809838687,
+      "grad_norm": 0.06849212460462101,
+      "learning_rate": 0.000918970540615333,
+      "loss": 0.613,
+      "step": 10455
+    },
+    {
+      "epoch": 0.5310251170819003,
+      "grad_norm": 0.03323906997049166,
+      "learning_rate": 0.0009188495993310046,
+      "loss": 0.6341,
+      "step": 10460
+    },
+    {
+      "epoch": 0.5312789531799317,
+      "grad_norm": 0.03941007227631917,
+      "learning_rate": 0.0009187285758285889,
+      "loss": 0.5978,
+      "step": 10465
+    },
+    {
+      "epoch": 0.5315327892779632,
+      "grad_norm": 0.02635949534355945,
+      "learning_rate": 0.0009186074701318419,
+      "loss": 0.5906,
+      "step": 10470
+    },
+    {
+      "epoch": 0.5317866253759947,
+      "grad_norm": 0.04078016260737411,
+      "learning_rate": 0.0009184862822645359,
+      "loss": 0.6263,
+      "step": 10475
+    },
+    {
+      "epoch": 0.5320404614740262,
+      "grad_norm": 0.057867925606058655,
+      "learning_rate": 0.0009183650122504598,
+      "loss": 0.6272,
+      "step": 10480
+    },
+    {
+      "epoch": 0.5322942975720577,
+      "grad_norm": 0.037243395752825356,
+      "learning_rate": 0.0009182436601134184,
+      "loss": 0.6025,
+      "step": 10485
+    },
+    {
+      "epoch": 0.5325481336700892,
+      "grad_norm": 0.04821834594916435,
+      "learning_rate": 0.0009181222258772319,
+      "loss": 0.6018,
+      "step": 10490
+    },
+    {
+      "epoch": 0.5328019697681208,
+      "grad_norm": 0.028085738317345087,
+      "learning_rate": 0.0009180007095657379,
+      "loss": 0.5925,
+      "step": 10495
+    },
+    {
+      "epoch": 0.5330558058661522,
+      "grad_norm": 0.036071988511367975,
+      "learning_rate": 0.0009178791112027891,
+      "loss": 0.6027,
+      "step": 10500
+    },
+    {
+      "epoch": 0.5333096419641837,
+      "grad_norm": 0.02739881003703629,
+      "learning_rate": 0.0009177574308122547,
+      "loss": 0.6641,
+      "step": 10505
+    },
+    {
+      "epoch": 0.5335634780622153,
+      "grad_norm": 0.02822235924487654,
+      "learning_rate": 0.00091763566841802,
+      "loss": 0.6287,
+      "step": 10510
+    },
+    {
+      "epoch": 0.5338173141602467,
+      "grad_norm": 0.024121670310468826,
+      "learning_rate": 0.0009175138240439864,
+      "loss": 0.5854,
+      "step": 10515
+    },
+    {
+      "epoch": 0.5340711502582782,
+      "grad_norm": 0.025342117126993798,
+      "learning_rate": 0.0009173918977140713,
+      "loss": 0.5713,
+      "step": 10520
+    },
+    {
+      "epoch": 0.5343249863563098,
+      "grad_norm": 0.027321368031964118,
+      "learning_rate": 0.0009172698894522082,
+      "loss": 0.6106,
+      "step": 10525
+    },
+    {
+      "epoch": 0.5345788224543412,
+      "grad_norm": 0.03343234624185458,
+      "learning_rate": 0.0009171477992823467,
+      "loss": 0.6268,
+      "step": 10530
+    },
+    {
+      "epoch": 0.5348326585523727,
+      "grad_norm": 0.04732716404477403,
+      "learning_rate": 0.0009170256272284525,
+      "loss": 0.5807,
+      "step": 10535
+    },
+    {
+      "epoch": 0.5350864946504043,
+      "grad_norm": 0.02840525748237608,
+      "learning_rate": 0.0009169033733145074,
+      "loss": 0.6045,
+      "step": 10540
+    },
+    {
+      "epoch": 0.5353403307484358,
+      "grad_norm": 0.027475221513180086,
+      "learning_rate": 0.0009167810375645091,
+      "loss": 0.6323,
+      "step": 10545
+    },
+    {
+      "epoch": 0.5355941668464672,
+      "grad_norm": 0.025750551058873007,
+      "learning_rate": 0.0009166586200024717,
+      "loss": 0.5838,
+      "step": 10550
+    },
+    {
+      "epoch": 0.5358480029444987,
+      "grad_norm": 0.03765393114821948,
+      "learning_rate": 0.000916536120652425,
+      "loss": 0.6147,
+      "step": 10555
+    },
+    {
+      "epoch": 0.5361018390425303,
+      "grad_norm": 0.03715730591665516,
+      "learning_rate": 0.0009164135395384151,
+      "loss": 0.6005,
+      "step": 10560
+    },
+    {
+      "epoch": 0.5363556751405617,
+      "grad_norm": 0.03645041602808113,
+      "learning_rate": 0.0009162908766845041,
+      "loss": 0.5842,
+      "step": 10565
+    },
+    {
+      "epoch": 0.5366095112385932,
+      "grad_norm": 0.03100263105448387,
+      "learning_rate": 0.00091616813211477,
+      "loss": 0.5594,
+      "step": 10570
+    },
+    {
+      "epoch": 0.5368633473366248,
+      "grad_norm": 0.04226102686913694,
+      "learning_rate": 0.0009160453058533071,
+      "loss": 0.6151,
+      "step": 10575
+    },
+    {
+      "epoch": 0.5371171834346562,
+      "grad_norm": 0.02589182283128981,
+      "learning_rate": 0.0009159223979242253,
+      "loss": 0.614,
+      "step": 10580
+    },
+    {
+      "epoch": 0.5373710195326877,
+      "grad_norm": 0.028467877248238536,
+      "learning_rate": 0.0009157994083516511,
+      "loss": 0.6492,
+      "step": 10585
+    },
+    {
+      "epoch": 0.5376248556307193,
+      "grad_norm": 0.03135996868398204,
+      "learning_rate": 0.0009156763371597266,
+      "loss": 0.6065,
+      "step": 10590
+    },
+    {
+      "epoch": 0.5378786917287507,
+      "grad_norm": 0.040987082480228504,
+      "learning_rate": 0.0009155531843726101,
+      "loss": 0.6084,
+      "step": 10595
+    },
+    {
+      "epoch": 0.5381325278267822,
+      "grad_norm": 0.029712117528617986,
+      "learning_rate": 0.0009154299500144758,
+      "loss": 0.6086,
+      "step": 10600
+    },
+    {
+      "epoch": 0.5383863639248138,
+      "grad_norm": 0.05504579201351947,
+      "learning_rate": 0.0009153066341095142,
+      "loss": 0.6395,
+      "step": 10605
+    },
+    {
+      "epoch": 0.5386402000228453,
+      "grad_norm": 0.030518737256477527,
+      "learning_rate": 0.0009151832366819314,
+      "loss": 0.6192,
+      "step": 10610
+    },
+    {
+      "epoch": 0.5388940361208767,
+      "grad_norm": 0.029879557984338494,
+      "learning_rate": 0.0009150597577559496,
+      "loss": 0.668,
+      "step": 10615
+    },
+    {
+      "epoch": 0.5391478722189083,
+      "grad_norm": 0.03153018920722057,
+      "learning_rate": 0.0009149361973558075,
+      "loss": 0.5801,
+      "step": 10620
+    },
+    {
+      "epoch": 0.5394017083169398,
+      "grad_norm": 0.030512110696911215,
+      "learning_rate": 0.000914812555505759,
+      "loss": 0.5942,
+      "step": 10625
+    },
+    {
+      "epoch": 0.5396555444149712,
+      "grad_norm": 0.041125863080099585,
+      "learning_rate": 0.0009146888322300745,
+      "loss": 0.6002,
+      "step": 10630
+    },
+    {
+      "epoch": 0.5399093805130027,
+      "grad_norm": 0.054165606681991184,
+      "learning_rate": 0.0009145650275530404,
+      "loss": 0.5951,
+      "step": 10635
+    },
+    {
+      "epoch": 0.5401632166110343,
+      "grad_norm": 0.029366439611697406,
+      "learning_rate": 0.0009144411414989587,
+      "loss": 0.5834,
+      "step": 10640
+    },
+    {
+      "epoch": 0.5404170527090657,
+      "grad_norm": 0.04321665819488211,
+      "learning_rate": 0.0009143171740921479,
+      "loss": 0.61,
+      "step": 10645
+    },
+    {
+      "epoch": 0.5406708888070972,
+      "grad_norm": 0.02466718128736207,
+      "learning_rate": 0.0009141931253569418,
+      "loss": 0.6155,
+      "step": 10650
+    },
+    {
+      "epoch": 0.5409247249051288,
+      "grad_norm": 0.025734399782198193,
+      "learning_rate": 0.000914068995317691,
+      "loss": 0.6249,
+      "step": 10655
+    },
+    {
+      "epoch": 0.5411785610031603,
+      "grad_norm": 0.030956416519900224,
+      "learning_rate": 0.0009139447839987613,
+      "loss": 0.588,
+      "step": 10660
+    },
+    {
+      "epoch": 0.5414323971011917,
+      "grad_norm": 0.03328165539133208,
+      "learning_rate": 0.0009138204914245347,
+      "loss": 0.5812,
+      "step": 10665
+    },
+    {
+      "epoch": 0.5416862331992233,
+      "grad_norm": 0.03418976866258275,
+      "learning_rate": 0.0009136961176194094,
+      "loss": 0.6126,
+      "step": 10670
+    },
+    {
+      "epoch": 0.5419400692972548,
+      "grad_norm": 0.060922163184233144,
+      "learning_rate": 0.0009135716626077994,
+      "loss": 0.5858,
+      "step": 10675
+    },
+    {
+      "epoch": 0.5421939053952862,
+      "grad_norm": 0.0316654793096513,
+      "learning_rate": 0.0009134471264141345,
+      "loss": 0.6172,
+      "step": 10680
+    },
+    {
+      "epoch": 0.5424477414933178,
+      "grad_norm": 0.033052227554561796,
+      "learning_rate": 0.0009133225090628605,
+      "loss": 0.6176,
+      "step": 10685
+    },
+    {
+      "epoch": 0.5427015775913493,
+      "grad_norm": 0.034867711839285526,
+      "learning_rate": 0.0009131978105784394,
+      "loss": 0.6441,
+      "step": 10690
+    },
+    {
+      "epoch": 0.5429554136893807,
+      "grad_norm": 0.03242440613542117,
+      "learning_rate": 0.0009130730309853483,
+      "loss": 0.5871,
+      "step": 10695
+    },
+    {
+      "epoch": 0.5432092497874123,
+      "grad_norm": 0.02846877585722155,
+      "learning_rate": 0.0009129481703080816,
+      "loss": 0.636,
+      "step": 10700
+    },
+    {
+      "epoch": 0.5434630858854438,
+      "grad_norm": 0.03968673666286646,
+      "learning_rate": 0.0009128232285711482,
+      "loss": 0.6109,
+      "step": 10705
+    },
+    {
+      "epoch": 0.5437169219834753,
+      "grad_norm": 0.025092489100120463,
+      "learning_rate": 0.0009126982057990738,
+      "loss": 0.6156,
+      "step": 10710
+    },
+    {
+      "epoch": 0.5439707580815067,
+      "grad_norm": 0.03529191550732015,
+      "learning_rate": 0.0009125731020163998,
+      "loss": 0.5723,
+      "step": 10715
+    },
+    {
+      "epoch": 0.5442245941795383,
+      "grad_norm": 0.027050926910876526,
+      "learning_rate": 0.0009124479172476833,
+      "loss": 0.6045,
+      "step": 10720
+    },
+    {
+      "epoch": 0.5444784302775698,
+      "grad_norm": 0.07622157764619372,
+      "learning_rate": 0.0009123226515174976,
+      "loss": 0.5689,
+      "step": 10725
+    },
+    {
+      "epoch": 0.5447322663756012,
+      "grad_norm": 0.02751158017548396,
+      "learning_rate": 0.0009121973048504316,
+      "loss": 0.6213,
+      "step": 10730
+    },
+    {
+      "epoch": 0.5449861024736328,
+      "grad_norm": 0.049460936685531046,
+      "learning_rate": 0.0009120718772710903,
+      "loss": 0.6061,
+      "step": 10735
+    },
+    {
+      "epoch": 0.5452399385716643,
+      "grad_norm": 0.03544395471579363,
+      "learning_rate": 0.0009119463688040945,
+      "loss": 0.5933,
+      "step": 10740
+    },
+    {
+      "epoch": 0.5454937746696957,
+      "grad_norm": 0.03400989240869563,
+      "learning_rate": 0.0009118207794740809,
+      "loss": 0.5727,
+      "step": 10745
+    },
+    {
+      "epoch": 0.5457476107677273,
+      "grad_norm": 0.03151062837134726,
+      "learning_rate": 0.000911695109305702,
+      "loss": 0.5955,
+      "step": 10750
+    },
+    {
+      "epoch": 0.5460014468657588,
+      "grad_norm": 0.02463583148102344,
+      "learning_rate": 0.0009115693583236263,
+      "loss": 0.6051,
+      "step": 10755
+    },
+    {
+      "epoch": 0.5462552829637903,
+      "grad_norm": 0.033094296247287235,
+      "learning_rate": 0.0009114435265525381,
+      "loss": 0.5763,
+      "step": 10760
+    },
+    {
+      "epoch": 0.5465091190618218,
+      "grad_norm": 0.026074781871230777,
+      "learning_rate": 0.0009113176140171373,
+      "loss": 0.6216,
+      "step": 10765
+    },
+    {
+      "epoch": 0.5467629551598533,
+      "grad_norm": 0.03141881910628044,
+      "learning_rate": 0.0009111916207421402,
+      "loss": 0.606,
+      "step": 10770
+    },
+    {
+      "epoch": 0.5470167912578848,
+      "grad_norm": 0.029052049913770146,
+      "learning_rate": 0.0009110655467522786,
+      "loss": 0.6244,
+      "step": 10775
+    },
+    {
+      "epoch": 0.5472706273559163,
+      "grad_norm": 0.0251807877680953,
+      "learning_rate": 0.0009109393920723001,
+      "loss": 0.5854,
+      "step": 10780
+    },
+    {
+      "epoch": 0.5475244634539478,
+      "grad_norm": 0.03612511248761835,
+      "learning_rate": 0.0009108131567269684,
+      "loss": 0.601,
+      "step": 10785
+    },
+    {
+      "epoch": 0.5477782995519793,
+      "grad_norm": 0.05249632040568085,
+      "learning_rate": 0.0009106868407410627,
+      "loss": 0.6207,
+      "step": 10790
+    },
+    {
+      "epoch": 0.5480321356500107,
+      "grad_norm": 0.030858769777672986,
+      "learning_rate": 0.0009105604441393782,
+      "loss": 0.624,
+      "step": 10795
+    },
+    {
+      "epoch": 0.5482859717480423,
+      "grad_norm": 0.029487600818791027,
+      "learning_rate": 0.0009104339669467261,
+      "loss": 0.5926,
+      "step": 10800
+    },
+    {
+      "epoch": 0.5485398078460738,
+      "grad_norm": 0.03067140315629251,
+      "learning_rate": 0.0009103074091879331,
+      "loss": 0.623,
+      "step": 10805
+    },
+    {
+      "epoch": 0.5487936439441052,
+      "grad_norm": 0.02640097178132819,
+      "learning_rate": 0.0009101807708878418,
+      "loss": 0.6049,
+      "step": 10810
+    },
+    {
+      "epoch": 0.5490474800421368,
+      "grad_norm": 0.049525814735870075,
+      "learning_rate": 0.0009100540520713108,
+      "loss": 0.6012,
+      "step": 10815
+    },
+    {
+      "epoch": 0.5493013161401683,
+      "grad_norm": 0.04668719700489464,
+      "learning_rate": 0.0009099272527632142,
+      "loss": 0.6075,
+      "step": 10820
+    },
+    {
+      "epoch": 0.5495551522381998,
+      "grad_norm": 0.04879877503113339,
+      "learning_rate": 0.0009098003729884423,
+      "loss": 0.6007,
+      "step": 10825
+    },
+    {
+      "epoch": 0.5498089883362313,
+      "grad_norm": 0.035478008135277204,
+      "learning_rate": 0.0009096734127719007,
+      "loss": 0.6009,
+      "step": 10830
+    },
+    {
+      "epoch": 0.5500628244342628,
+      "grad_norm": 0.06071680497508226,
+      "learning_rate": 0.0009095463721385113,
+      "loss": 0.6103,
+      "step": 10835
+    },
+    {
+      "epoch": 0.5503166605322943,
+      "grad_norm": 0.06356999878241335,
+      "learning_rate": 0.0009094192511132116,
+      "loss": 0.6008,
+      "step": 10840
+    },
+    {
+      "epoch": 0.5505704966303258,
+      "grad_norm": 0.07522895375448929,
+      "learning_rate": 0.0009092920497209545,
+      "loss": 0.59,
+      "step": 10845
+    },
+    {
+      "epoch": 0.5508243327283573,
+      "grad_norm": 0.06152876042967086,
+      "learning_rate": 0.0009091647679867092,
+      "loss": 0.6016,
+      "step": 10850
+    },
+    {
+      "epoch": 0.5510781688263888,
+      "grad_norm": 1.435812648824866,
+      "learning_rate": 0.0009090374059354605,
+      "loss": 0.9246,
+      "step": 10855
+    },
+    {
+      "epoch": 0.5513320049244202,
+      "grad_norm": 0.16453277042657283,
+      "learning_rate": 0.0009089099635922089,
+      "loss": 0.7979,
+      "step": 10860
+    },
+    {
+      "epoch": 0.5515858410224518,
+      "grad_norm": 0.09400053941343282,
+      "learning_rate": 0.0009087824409819706,
+      "loss": 0.7172,
+      "step": 10865
+    },
+    {
+      "epoch": 0.5518396771204833,
+      "grad_norm": 0.037709382524332286,
+      "learning_rate": 0.0009086548381297778,
+      "loss": 0.7196,
+      "step": 10870
+    },
+    {
+      "epoch": 0.5520935132185149,
+      "grad_norm": 0.058763207110847336,
+      "learning_rate": 0.0009085271550606782,
+      "loss": 0.6644,
+      "step": 10875
+    },
+    {
+      "epoch": 0.5523473493165463,
+      "grad_norm": 0.03877532647328554,
+      "learning_rate": 0.0009083993917997354,
+      "loss": 0.6212,
+      "step": 10880
+    },
+    {
+      "epoch": 0.5526011854145778,
+      "grad_norm": 0.032910769522172735,
+      "learning_rate": 0.0009082715483720287,
+      "loss": 0.6176,
+      "step": 10885
+    },
+    {
+      "epoch": 0.5528550215126093,
+      "grad_norm": 0.06379896202591075,
+      "learning_rate": 0.000908143624802653,
+      "loss": 0.6868,
+      "step": 10890
+    },
+    {
+      "epoch": 0.5531088576106408,
+      "grad_norm": 0.03960368139013127,
+      "learning_rate": 0.0009080156211167192,
+      "loss": 0.595,
+      "step": 10895
+    },
+    {
+      "epoch": 0.5533626937086723,
+      "grad_norm": 0.03265424433724443,
+      "learning_rate": 0.0009078875373393538,
+      "loss": 0.649,
+      "step": 10900
+    },
+    {
+      "epoch": 0.5536165298067038,
+      "grad_norm": 0.03415971908723185,
+      "learning_rate": 0.0009077593734956988,
+      "loss": 0.6206,
+      "step": 10905
+    },
+    {
+      "epoch": 0.5538703659047353,
+      "grad_norm": 0.036488130956054673,
+      "learning_rate": 0.0009076311296109125,
+      "loss": 0.67,
+      "step": 10910
+    },
+    {
+      "epoch": 0.5541242020027668,
+      "grad_norm": 0.035320885747738756,
+      "learning_rate": 0.0009075028057101682,
+      "loss": 0.6229,
+      "step": 10915
+    },
+    {
+      "epoch": 0.5543780381007983,
+      "grad_norm": 0.027348385875276045,
+      "learning_rate": 0.0009073744018186554,
+      "loss": 0.6252,
+      "step": 10920
+    },
+    {
+      "epoch": 0.5546318741988299,
+      "grad_norm": 0.05990522832658699,
+      "learning_rate": 0.0009072459179615789,
+      "loss": 0.6792,
+      "step": 10925
+    },
+    {
+      "epoch": 0.5548857102968613,
+      "grad_norm": 0.025462128094323643,
+      "learning_rate": 0.0009071173541641598,
+      "loss": 0.5896,
+      "step": 10930
+    },
+    {
+      "epoch": 0.5551395463948928,
+      "grad_norm": 0.02807969122992142,
+      "learning_rate": 0.0009069887104516344,
+      "loss": 0.6309,
+      "step": 10935
+    },
+    {
+      "epoch": 0.5553933824929244,
+      "grad_norm": 0.036603755788089144,
+      "learning_rate": 0.0009068599868492549,
+      "loss": 0.6168,
+      "step": 10940
+    },
+    {
+      "epoch": 0.5556472185909558,
+      "grad_norm": 0.027661489675158593,
+      "learning_rate": 0.0009067311833822887,
+      "loss": 0.6306,
+      "step": 10945
+    },
+    {
+      "epoch": 0.5559010546889873,
+      "grad_norm": 0.03077694227440884,
+      "learning_rate": 0.0009066023000760198,
+      "loss": 0.6059,
+      "step": 10950
+    },
+    {
+      "epoch": 0.5561548907870189,
+      "grad_norm": 0.03144955523682315,
+      "learning_rate": 0.0009064733369557469,
+      "loss": 0.5914,
+      "step": 10955
+    },
+    {
+      "epoch": 0.5564087268850503,
+      "grad_norm": 0.02757118827722829,
+      "learning_rate": 0.0009063442940467852,
+      "loss": 0.6138,
+      "step": 10960
+    },
+    {
+      "epoch": 0.5566625629830818,
+      "grad_norm": 0.028124044394521205,
+      "learning_rate": 0.0009062151713744649,
+      "loss": 0.5849,
+      "step": 10965
+    },
+    {
+      "epoch": 0.5569163990811133,
+      "grad_norm": 0.025072368259489837,
+      "learning_rate": 0.0009060859689641323,
+      "loss": 0.6349,
+      "step": 10970
+    },
+    {
+      "epoch": 0.5571702351791449,
+      "grad_norm": 0.031444917396144835,
+      "learning_rate": 0.0009059566868411492,
+      "loss": 0.6198,
+      "step": 10975
+    },
+    {
+      "epoch": 0.5574240712771763,
+      "grad_norm": 0.04501045513342346,
+      "learning_rate": 0.0009058273250308929,
+      "loss": 0.6399,
+      "step": 10980
+    },
+    {
+      "epoch": 0.5576779073752078,
+      "grad_norm": 0.03025025685706769,
+      "learning_rate": 0.0009056978835587566,
+      "loss": 0.6388,
+      "step": 10985
+    },
+    {
+      "epoch": 0.5579317434732394,
+      "grad_norm": 0.03855276480190827,
+      "learning_rate": 0.0009055683624501489,
+      "loss": 0.6172,
+      "step": 10990
+    },
+    {
+      "epoch": 0.5581855795712708,
+      "grad_norm": 0.04172515052455696,
+      "learning_rate": 0.0009054387617304945,
+      "loss": 0.6024,
+      "step": 10995
+    },
+    {
+      "epoch": 0.5584394156693023,
+      "grad_norm": 0.02429228351814789,
+      "learning_rate": 0.0009053090814252327,
+      "loss": 0.5964,
+      "step": 11000
+    },
+    {
+      "epoch": 0.5586932517673339,
+      "grad_norm": 0.04151154242191302,
+      "learning_rate": 0.0009051793215598197,
+      "loss": 0.6044,
+      "step": 11005
+    },
+    {
+      "epoch": 0.5589470878653653,
+      "grad_norm": 0.024858734225047054,
+      "learning_rate": 0.0009050494821597264,
+      "loss": 0.6301,
+      "step": 11010
+    },
+    {
+      "epoch": 0.5592009239633968,
+      "grad_norm": 0.028072967963923535,
+      "learning_rate": 0.0009049195632504399,
+      "loss": 0.626,
+      "step": 11015
+    },
+    {
+      "epoch": 0.5594547600614284,
+      "grad_norm": 0.03193777274190929,
+      "learning_rate": 0.0009047895648574623,
+      "loss": 0.6255,
+      "step": 11020
+    },
+    {
+      "epoch": 0.5597085961594598,
+      "grad_norm": 0.026480125682836954,
+      "learning_rate": 0.0009046594870063118,
+      "loss": 0.6033,
+      "step": 11025
+    },
+    {
+      "epoch": 0.5599624322574913,
+      "grad_norm": 0.026097311141357767,
+      "learning_rate": 0.0009045293297225221,
+      "loss": 0.5809,
+      "step": 11030
+    },
+    {
+      "epoch": 0.5602162683555229,
+      "grad_norm": 0.0308704453992081,
+      "learning_rate": 0.0009043990930316424,
+      "loss": 0.6089,
+      "step": 11035
+    },
+    {
+      "epoch": 0.5604701044535544,
+      "grad_norm": 0.025288526434383552,
+      "learning_rate": 0.0009042687769592375,
+      "loss": 0.6248,
+      "step": 11040
+    },
+    {
+      "epoch": 0.5607239405515858,
+      "grad_norm": 0.024553453597092455,
+      "learning_rate": 0.0009041383815308877,
+      "loss": 0.598,
+      "step": 11045
+    },
+    {
+      "epoch": 0.5609777766496173,
+      "grad_norm": 0.030367718805119067,
+      "learning_rate": 0.0009040079067721889,
+      "loss": 0.5852,
+      "step": 11050
+    },
+    {
+      "epoch": 0.5612316127476489,
+      "grad_norm": 0.03317092988088747,
+      "learning_rate": 0.0009038773527087529,
+      "loss": 0.5776,
+      "step": 11055
+    },
+    {
+      "epoch": 0.5614854488456803,
+      "grad_norm": 0.05184744896110897,
+      "learning_rate": 0.0009037467193662068,
+      "loss": 0.6059,
+      "step": 11060
+    },
+    {
+      "epoch": 0.5617392849437118,
+      "grad_norm": 0.025577618854582287,
+      "learning_rate": 0.0009036160067701931,
+      "loss": 0.5846,
+      "step": 11065
+    },
+    {
+      "epoch": 0.5619931210417434,
+      "grad_norm": 0.03938538897439106,
+      "learning_rate": 0.00090348521494637,
+      "loss": 0.6368,
+      "step": 11070
+    },
+    {
+      "epoch": 0.5622469571397748,
+      "grad_norm": 0.038820715798665424,
+      "learning_rate": 0.0009033543439204114,
+      "loss": 0.609,
+      "step": 11075
+    },
+    {
+      "epoch": 0.5625007932378063,
+      "grad_norm": 0.029533488100105525,
+      "learning_rate": 0.0009032233937180067,
+      "loss": 0.5961,
+      "step": 11080
+    },
+    {
+      "epoch": 0.5627546293358379,
+      "grad_norm": 0.02379671510332352,
+      "learning_rate": 0.0009030923643648607,
+      "loss": 0.5998,
+      "step": 11085
+    },
+    {
+      "epoch": 0.5630084654338694,
+      "grad_norm": 0.023448978175139953,
+      "learning_rate": 0.0009029612558866938,
+      "loss": 0.5834,
+      "step": 11090
+    },
+    {
+      "epoch": 0.5632623015319008,
+      "grad_norm": 0.024265093856533426,
+      "learning_rate": 0.0009028300683092418,
+      "loss": 0.5921,
+      "step": 11095
+    },
+    {
+      "epoch": 0.5635161376299324,
+      "grad_norm": 0.024632811713908073,
+      "learning_rate": 0.0009026988016582564,
+      "loss": 0.6196,
+      "step": 11100
+    },
+    {
+      "epoch": 0.5637699737279639,
+      "grad_norm": 0.022302125548208863,
+      "learning_rate": 0.0009025674559595045,
+      "loss": 0.6236,
+      "step": 11105
+    },
+    {
+      "epoch": 0.5640238098259953,
+      "grad_norm": 0.036192999100420786,
+      "learning_rate": 0.0009024360312387687,
+      "loss": 0.609,
+      "step": 11110
+    },
+    {
+      "epoch": 0.5642776459240268,
+      "grad_norm": 0.0306562971571317,
+      "learning_rate": 0.0009023045275218467,
+      "loss": 0.5926,
+      "step": 11115
+    },
+    {
+      "epoch": 0.5645314820220584,
+      "grad_norm": 0.032739588825804515,
+      "learning_rate": 0.0009021729448345524,
+      "loss": 0.6067,
+      "step": 11120
+    },
+    {
+      "epoch": 0.5647853181200898,
+      "grad_norm": 0.02788797841269012,
+      "learning_rate": 0.0009020412832027146,
+      "loss": 0.6026,
+      "step": 11125
+    },
+    {
+      "epoch": 0.5650391542181213,
+      "grad_norm": 0.025558068319595086,
+      "learning_rate": 0.0009019095426521779,
+      "loss": 0.6021,
+      "step": 11130
+    },
+    {
+      "epoch": 0.5652929903161529,
+      "grad_norm": 0.025476402210346454,
+      "learning_rate": 0.0009017777232088023,
+      "loss": 0.6236,
+      "step": 11135
+    },
+    {
+      "epoch": 0.5655468264141844,
+      "grad_norm": 0.024392907820231798,
+      "learning_rate": 0.0009016458248984632,
+      "loss": 0.6497,
+      "step": 11140
+    },
+    {
+      "epoch": 0.5658006625122158,
+      "grad_norm": 0.02799976269471266,
+      "learning_rate": 0.0009015138477470516,
+      "loss": 0.601,
+      "step": 11145
+    },
+    {
+      "epoch": 0.5660544986102474,
+      "grad_norm": 0.027785695512840384,
+      "learning_rate": 0.0009013817917804743,
+      "loss": 0.6225,
+      "step": 11150
+    },
+    {
+      "epoch": 0.5663083347082789,
+      "grad_norm": 0.031205524445284426,
+      "learning_rate": 0.0009012496570246529,
+      "loss": 0.6179,
+      "step": 11155
+    },
+    {
+      "epoch": 0.5665621708063103,
+      "grad_norm": 0.025209600845023012,
+      "learning_rate": 0.0009011174435055247,
+      "loss": 0.5718,
+      "step": 11160
+    },
+    {
+      "epoch": 0.5668160069043419,
+      "grad_norm": 0.03060213292836761,
+      "learning_rate": 0.0009009851512490428,
+      "loss": 0.5973,
+      "step": 11165
+    },
+    {
+      "epoch": 0.5670698430023734,
+      "grad_norm": 0.03631452482840258,
+      "learning_rate": 0.0009008527802811754,
+      "loss": 0.5809,
+      "step": 11170
+    },
+    {
+      "epoch": 0.5673236791004048,
+      "grad_norm": 0.03908183938816148,
+      "learning_rate": 0.0009007203306279064,
+      "loss": 0.6072,
+      "step": 11175
+    },
+    {
+      "epoch": 0.5675775151984364,
+      "grad_norm": 0.035964383599072204,
+      "learning_rate": 0.0009005878023152348,
+      "loss": 0.6082,
+      "step": 11180
+    },
+    {
+      "epoch": 0.5678313512964679,
+      "grad_norm": 0.0306440737992479,
+      "learning_rate": 0.0009004551953691754,
+      "loss": 0.6095,
+      "step": 11185
+    },
+    {
+      "epoch": 0.5680851873944994,
+      "grad_norm": 0.0513744832338725,
+      "learning_rate": 0.000900322509815758,
+      "loss": 0.6013,
+      "step": 11190
+    },
+    {
+      "epoch": 0.5683390234925308,
+      "grad_norm": 0.025114635513695662,
+      "learning_rate": 0.0009001897456810286,
+      "loss": 0.6058,
+      "step": 11195
+    },
+    {
+      "epoch": 0.5685928595905624,
+      "grad_norm": 0.04276993258659403,
+      "learning_rate": 0.0009000569029910477,
+      "loss": 0.6343,
+      "step": 11200
+    },
+    {
+      "epoch": 0.5688466956885939,
+      "grad_norm": 0.03646375136050398,
+      "learning_rate": 0.0008999239817718918,
+      "loss": 0.6169,
+      "step": 11205
+    },
+    {
+      "epoch": 0.5691005317866253,
+      "grad_norm": 0.024107862227893833,
+      "learning_rate": 0.0008997909820496528,
+      "loss": 0.605,
+      "step": 11210
+    },
+    {
+      "epoch": 0.5693543678846569,
+      "grad_norm": 0.02312217074080101,
+      "learning_rate": 0.0008996579038504376,
+      "loss": 0.6016,
+      "step": 11215
+    },
+    {
+      "epoch": 0.5696082039826884,
+      "grad_norm": 0.026924470535448695,
+      "learning_rate": 0.0008995247472003691,
+      "loss": 0.6151,
+      "step": 11220
+    },
+    {
+      "epoch": 0.5698620400807198,
+      "grad_norm": 0.025057655726728318,
+      "learning_rate": 0.0008993915121255852,
+      "loss": 0.6265,
+      "step": 11225
+    },
+    {
+      "epoch": 0.5701158761787514,
+      "grad_norm": 0.025445367684530214,
+      "learning_rate": 0.0008992581986522392,
+      "loss": 0.6009,
+      "step": 11230
+    },
+    {
+      "epoch": 0.5703697122767829,
+      "grad_norm": 0.0382438609341841,
+      "learning_rate": 0.0008991248068064999,
+      "loss": 0.5777,
+      "step": 11235
+    },
+    {
+      "epoch": 0.5706235483748144,
+      "grad_norm": 0.03626958625239848,
+      "learning_rate": 0.0008989913366145515,
+      "loss": 0.5829,
+      "step": 11240
+    },
+    {
+      "epoch": 0.5708773844728459,
+      "grad_norm": 0.024880449607419988,
+      "learning_rate": 0.0008988577881025935,
+      "loss": 0.5972,
+      "step": 11245
+    },
+    {
+      "epoch": 0.5711312205708774,
+      "grad_norm": 0.02888281590301525,
+      "learning_rate": 0.0008987241612968406,
+      "loss": 0.615,
+      "step": 11250
+    },
+    {
+      "epoch": 0.5713850566689089,
+      "grad_norm": 0.03757091988973743,
+      "learning_rate": 0.0008985904562235234,
+      "loss": 0.5978,
+      "step": 11255
+    },
+    {
+      "epoch": 0.5716388927669404,
+      "grad_norm": 0.03374674197558035,
+      "learning_rate": 0.0008984566729088874,
+      "loss": 0.581,
+      "step": 11260
+    },
+    {
+      "epoch": 0.5718927288649719,
+      "grad_norm": 0.031246163529604482,
+      "learning_rate": 0.0008983228113791937,
+      "loss": 0.6174,
+      "step": 11265
+    },
+    {
+      "epoch": 0.5721465649630034,
+      "grad_norm": 0.03523115249240424,
+      "learning_rate": 0.0008981888716607184,
+      "loss": 0.5909,
+      "step": 11270
+    },
+    {
+      "epoch": 0.5724004010610348,
+      "grad_norm": 0.058253114080335025,
+      "learning_rate": 0.0008980548537797535,
+      "loss": 0.5974,
+      "step": 11275
+    },
+    {
+      "epoch": 0.5726542371590664,
+      "grad_norm": 0.03323643646387071,
+      "learning_rate": 0.0008979207577626058,
+      "loss": 0.6457,
+      "step": 11280
+    },
+    {
+      "epoch": 0.5729080732570979,
+      "grad_norm": 0.02636690022689109,
+      "learning_rate": 0.0008977865836355979,
+      "loss": 0.5843,
+      "step": 11285
+    },
+    {
+      "epoch": 0.5731619093551293,
+      "grad_norm": 0.02518146447116509,
+      "learning_rate": 0.0008976523314250672,
+      "loss": 0.5996,
+      "step": 11290
+    },
+    {
+      "epoch": 0.5734157454531609,
+      "grad_norm": 0.03063657652619607,
+      "learning_rate": 0.0008975180011573669,
+      "loss": 0.5909,
+      "step": 11295
+    },
+    {
+      "epoch": 0.5736695815511924,
+      "grad_norm": 0.033555485623473526,
+      "learning_rate": 0.0008973835928588656,
+      "loss": 0.5984,
+      "step": 11300
+    },
+    {
+      "epoch": 0.5739234176492239,
+      "grad_norm": 0.026858437506102287,
+      "learning_rate": 0.0008972491065559467,
+      "loss": 0.5745,
+      "step": 11305
+    },
+    {
+      "epoch": 0.5741772537472554,
+      "grad_norm": 0.041900500650273065,
+      "learning_rate": 0.0008971145422750094,
+      "loss": 0.5525,
+      "step": 11310
+    },
+    {
+      "epoch": 0.5744310898452869,
+      "grad_norm": 0.023692636987519583,
+      "learning_rate": 0.0008969799000424676,
+      "loss": 0.6412,
+      "step": 11315
+    },
+    {
+      "epoch": 0.5746849259433184,
+      "grad_norm": 0.03520820653367125,
+      "learning_rate": 0.0008968451798847513,
+      "loss": 0.5895,
+      "step": 11320
+    },
+    {
+      "epoch": 0.5749387620413499,
+      "grad_norm": 0.045950374099341315,
+      "learning_rate": 0.0008967103818283051,
+      "loss": 0.6144,
+      "step": 11325
+    },
+    {
+      "epoch": 0.5751925981393814,
+      "grad_norm": 0.03013110218086917,
+      "learning_rate": 0.0008965755058995896,
+      "loss": 0.6167,
+      "step": 11330
+    },
+    {
+      "epoch": 0.5754464342374129,
+      "grad_norm": 0.028677449123263064,
+      "learning_rate": 0.0008964405521250798,
+      "loss": 0.5907,
+      "step": 11335
+    },
+    {
+      "epoch": 0.5757002703354444,
+      "grad_norm": 0.06897665640762995,
+      "learning_rate": 0.0008963055205312667,
+      "loss": 0.6008,
+      "step": 11340
+    },
+    {
+      "epoch": 0.5759541064334759,
+      "grad_norm": 0.039108751736186075,
+      "learning_rate": 0.0008961704111446564,
+      "loss": 0.5963,
+      "step": 11345
+    },
+    {
+      "epoch": 0.5762079425315074,
+      "grad_norm": 0.02497304268795272,
+      "learning_rate": 0.00089603522399177,
+      "loss": 0.5648,
+      "step": 11350
+    },
+    {
+      "epoch": 0.576461778629539,
+      "grad_norm": 0.02409915928784757,
+      "learning_rate": 0.0008958999590991441,
+      "loss": 0.6111,
+      "step": 11355
+    },
+    {
+      "epoch": 0.5767156147275704,
+      "grad_norm": 0.023423833712250722,
+      "learning_rate": 0.0008957646164933307,
+      "loss": 0.6034,
+      "step": 11360
+    },
+    {
+      "epoch": 0.5769694508256019,
+      "grad_norm": 0.02939021201285094,
+      "learning_rate": 0.0008956291962008967,
+      "loss": 0.601,
+      "step": 11365
+    },
+    {
+      "epoch": 0.5772232869236334,
+      "grad_norm": 0.025113894696930674,
+      "learning_rate": 0.0008954936982484245,
+      "loss": 0.5741,
+      "step": 11370
+    },
+    {
+      "epoch": 0.5774771230216649,
+      "grad_norm": 0.03523771168659597,
+      "learning_rate": 0.0008953581226625116,
+      "loss": 0.5955,
+      "step": 11375
+    },
+    {
+      "epoch": 0.5777309591196964,
+      "grad_norm": 0.033018941463141716,
+      "learning_rate": 0.000895222469469771,
+      "loss": 0.6561,
+      "step": 11380
+    },
+    {
+      "epoch": 0.5779847952177279,
+      "grad_norm": 0.03870222645684432,
+      "learning_rate": 0.0008950867386968305,
+      "loss": 0.5742,
+      "step": 11385
+    },
+    {
+      "epoch": 0.5782386313157594,
+      "grad_norm": 0.033035287220069445,
+      "learning_rate": 0.0008949509303703336,
+      "loss": 0.6268,
+      "step": 11390
+    },
+    {
+      "epoch": 0.5784924674137909,
+      "grad_norm": 0.02535187944282503,
+      "learning_rate": 0.0008948150445169386,
+      "loss": 0.6145,
+      "step": 11395
+    },
+    {
+      "epoch": 0.5787463035118224,
+      "grad_norm": 0.03673161990221083,
+      "learning_rate": 0.0008946790811633193,
+      "loss": 0.5821,
+      "step": 11400
+    },
+    {
+      "epoch": 0.579000139609854,
+      "grad_norm": 0.027797183940164883,
+      "learning_rate": 0.0008945430403361647,
+      "loss": 0.6329,
+      "step": 11405
+    },
+    {
+      "epoch": 0.5792539757078854,
+      "grad_norm": 0.024400895985974023,
+      "learning_rate": 0.0008944069220621788,
+      "loss": 0.5537,
+      "step": 11410
+    },
+    {
+      "epoch": 0.5795078118059169,
+      "grad_norm": 0.029540130582463044,
+      "learning_rate": 0.000894270726368081,
+      "loss": 0.6305,
+      "step": 11415
+    },
+    {
+      "epoch": 0.5797616479039485,
+      "grad_norm": 0.03403617081620501,
+      "learning_rate": 0.0008941344532806057,
+      "loss": 0.6018,
+      "step": 11420
+    },
+    {
+      "epoch": 0.5800154840019799,
+      "grad_norm": 0.036771472648533886,
+      "learning_rate": 0.000893998102826503,
+      "loss": 0.6164,
+      "step": 11425
+    },
+    {
+      "epoch": 0.5802693201000114,
+      "grad_norm": 0.050072623532946885,
+      "learning_rate": 0.0008938616750325375,
+      "loss": 0.5806,
+      "step": 11430
+    },
+    {
+      "epoch": 0.580523156198043,
+      "grad_norm": 0.031054510529734854,
+      "learning_rate": 0.0008937251699254893,
+      "loss": 0.6024,
+      "step": 11435
+    },
+    {
+      "epoch": 0.5807769922960744,
+      "grad_norm": 0.02607242206947156,
+      "learning_rate": 0.0008935885875321539,
+      "loss": 0.5976,
+      "step": 11440
+    },
+    {
+      "epoch": 0.5810308283941059,
+      "grad_norm": 0.025700401652453922,
+      "learning_rate": 0.0008934519278793416,
+      "loss": 0.6159,
+      "step": 11445
+    },
+    {
+      "epoch": 0.5812846644921374,
+      "grad_norm": 0.03334981900638615,
+      "learning_rate": 0.0008933151909938778,
+      "loss": 0.5664,
+      "step": 11450
+    },
+    {
+      "epoch": 0.581538500590169,
+      "grad_norm": 0.03470397493711404,
+      "learning_rate": 0.0008931783769026036,
+      "loss": 0.6065,
+      "step": 11455
+    },
+    {
+      "epoch": 0.5817923366882004,
+      "grad_norm": 0.040190824236687406,
+      "learning_rate": 0.0008930414856323747,
+      "loss": 0.5918,
+      "step": 11460
+    },
+    {
+      "epoch": 0.5820461727862319,
+      "grad_norm": 0.02671898679248395,
+      "learning_rate": 0.0008929045172100624,
+      "loss": 0.5995,
+      "step": 11465
+    },
+    {
+      "epoch": 0.5823000088842635,
+      "grad_norm": 0.06746663315212366,
+      "learning_rate": 0.0008927674716625527,
+      "loss": 0.6212,
+      "step": 11470
+    },
+    {
+      "epoch": 0.5825538449822949,
+      "grad_norm": 0.03231819490055282,
+      "learning_rate": 0.0008926303490167471,
+      "loss": 0.6091,
+      "step": 11475
+    },
+    {
+      "epoch": 0.5828076810803264,
+      "grad_norm": 0.028477552470271787,
+      "learning_rate": 0.0008924931492995619,
+      "loss": 0.6107,
+      "step": 11480
+    },
+    {
+      "epoch": 0.583061517178358,
+      "grad_norm": 0.030545895434276424,
+      "learning_rate": 0.000892355872537929,
+      "loss": 0.6016,
+      "step": 11485
+    },
+    {
+      "epoch": 0.5833153532763894,
+      "grad_norm": 0.02764360721101484,
+      "learning_rate": 0.0008922185187587949,
+      "loss": 0.6014,
+      "step": 11490
+    },
+    {
+      "epoch": 0.5835691893744209,
+      "grad_norm": 0.051972230411543044,
+      "learning_rate": 0.0008920810879891217,
+      "loss": 0.6184,
+      "step": 11495
+    },
+    {
+      "epoch": 0.5838230254724525,
+      "grad_norm": 0.049601905584870654,
+      "learning_rate": 0.0008919435802558862,
+      "loss": 0.5848,
+      "step": 11500
+    },
+    {
+      "epoch": 0.5840768615704839,
+      "grad_norm": 0.5732510853690319,
+      "learning_rate": 0.0008918059955860803,
+      "loss": 0.6587,
+      "step": 11505
+    },
+    {
+      "epoch": 0.5843306976685154,
+      "grad_norm": 0.042150081750240605,
+      "learning_rate": 0.0008916683340067116,
+      "loss": 0.625,
+      "step": 11510
+    },
+    {
+      "epoch": 0.584584533766547,
+      "grad_norm": 0.05774925217438834,
+      "learning_rate": 0.0008915305955448021,
+      "loss": 0.6199,
+      "step": 11515
+    },
+    {
+      "epoch": 0.5848383698645785,
+      "grad_norm": 0.04160871680886203,
+      "learning_rate": 0.0008913927802273894,
+      "loss": 0.6081,
+      "step": 11520
+    },
+    {
+      "epoch": 0.5850922059626099,
+      "grad_norm": 0.03323206015318204,
+      "learning_rate": 0.0008912548880815256,
+      "loss": 0.6115,
+      "step": 11525
+    },
+    {
+      "epoch": 0.5853460420606414,
+      "grad_norm": 0.04576230402616482,
+      "learning_rate": 0.0008911169191342785,
+      "loss": 0.6077,
+      "step": 11530
+    },
+    {
+      "epoch": 0.585599878158673,
+      "grad_norm": 0.09158330505750234,
+      "learning_rate": 0.0008909788734127307,
+      "loss": 0.6539,
+      "step": 11535
+    },
+    {
+      "epoch": 0.5858537142567044,
+      "grad_norm": 0.02861387432236888,
+      "learning_rate": 0.00089084075094398,
+      "loss": 0.6305,
+      "step": 11540
+    },
+    {
+      "epoch": 0.5861075503547359,
+      "grad_norm": 0.037516416748120646,
+      "learning_rate": 0.0008907025517551388,
+      "loss": 0.6258,
+      "step": 11545
+    },
+    {
+      "epoch": 0.5863613864527675,
+      "grad_norm": 0.04086135649565531,
+      "learning_rate": 0.0008905642758733352,
+      "loss": 0.599,
+      "step": 11550
+    },
+    {
+      "epoch": 0.5866152225507989,
+      "grad_norm": 0.03266087385037055,
+      "learning_rate": 0.000890425923325712,
+      "loss": 0.6342,
+      "step": 11555
+    },
+    {
+      "epoch": 0.5868690586488304,
+      "grad_norm": 0.04351600531336264,
+      "learning_rate": 0.0008902874941394271,
+      "loss": 0.623,
+      "step": 11560
+    },
+    {
+      "epoch": 0.587122894746862,
+      "grad_norm": 0.04031763253939243,
+      "learning_rate": 0.0008901489883416535,
+      "loss": 0.6061,
+      "step": 11565
+    },
+    {
+      "epoch": 0.5873767308448935,
+      "grad_norm": 0.03024842628718133,
+      "learning_rate": 0.0008900104059595791,
+      "loss": 0.5995,
+      "step": 11570
+    },
+    {
+      "epoch": 0.5876305669429249,
+      "grad_norm": 0.03277584675848276,
+      "learning_rate": 0.000889871747020407,
+      "loss": 0.5808,
+      "step": 11575
+    },
+    {
+      "epoch": 0.5878844030409565,
+      "grad_norm": 0.025589433458913628,
+      "learning_rate": 0.0008897330115513553,
+      "loss": 0.6386,
+      "step": 11580
+    },
+    {
+      "epoch": 0.588138239138988,
+      "grad_norm": 0.04730997143858022,
+      "learning_rate": 0.0008895941995796569,
+      "loss": 0.6175,
+      "step": 11585
+    },
+    {
+      "epoch": 0.5883920752370194,
+      "grad_norm": 0.02716519389485637,
+      "learning_rate": 0.0008894553111325601,
+      "loss": 0.6319,
+      "step": 11590
+    },
+    {
+      "epoch": 0.588645911335051,
+      "grad_norm": 0.029548377642727466,
+      "learning_rate": 0.0008893163462373279,
+      "loss": 0.6307,
+      "step": 11595
+    },
+    {
+      "epoch": 0.5888997474330825,
+      "grad_norm": 0.029441752118062915,
+      "learning_rate": 0.0008891773049212387,
+      "loss": 0.604,
+      "step": 11600
+    },
+    {
+      "epoch": 0.5891535835311139,
+      "grad_norm": 0.03547180982249062,
+      "learning_rate": 0.000889038187211585,
+      "loss": 0.615,
+      "step": 11605
+    },
+    {
+      "epoch": 0.5894074196291454,
+      "grad_norm": 0.025520246542906348,
+      "learning_rate": 0.0008888989931356754,
+      "loss": 0.5925,
+      "step": 11610
+    },
+    {
+      "epoch": 0.589661255727177,
+      "grad_norm": 0.032232116965678355,
+      "learning_rate": 0.0008887597227208331,
+      "loss": 0.618,
+      "step": 11615
+    },
+    {
+      "epoch": 0.5899150918252085,
+      "grad_norm": 0.026107740044775674,
+      "learning_rate": 0.0008886203759943957,
+      "loss": 0.6311,
+      "step": 11620
+    },
+    {
+      "epoch": 0.5901689279232399,
+      "grad_norm": 0.028672705224940687,
+      "learning_rate": 0.0008884809529837167,
+      "loss": 0.5692,
+      "step": 11625
+    },
+    {
+      "epoch": 0.5904227640212715,
+      "grad_norm": 0.0258655607946437,
+      "learning_rate": 0.0008883414537161638,
+      "loss": 0.6251,
+      "step": 11630
+    },
+    {
+      "epoch": 0.590676600119303,
+      "grad_norm": 0.028780253182444794,
+      "learning_rate": 0.0008882018782191204,
+      "loss": 0.6016,
+      "step": 11635
+    },
+    {
+      "epoch": 0.5909304362173344,
+      "grad_norm": 0.03331921506050773,
+      "learning_rate": 0.0008880622265199841,
+      "loss": 0.6073,
+      "step": 11640
+    },
+    {
+      "epoch": 0.591184272315366,
+      "grad_norm": 0.4723036335382741,
+      "learning_rate": 0.0008879224986461681,
+      "loss": 0.668,
+      "step": 11645
+    },
+    {
+      "epoch": 0.5914381084133975,
+      "grad_norm": 0.06636323581751623,
+      "learning_rate": 0.0008877826946251002,
+      "loss": 0.6493,
+      "step": 11650
+    },
+    {
+      "epoch": 0.5916919445114289,
+      "grad_norm": 0.06678436419997426,
+      "learning_rate": 0.0008876428144842231,
+      "loss": 0.5844,
+      "step": 11655
+    },
+    {
+      "epoch": 0.5919457806094605,
+      "grad_norm": 0.048242208081622634,
+      "learning_rate": 0.0008875028582509948,
+      "loss": 0.606,
+      "step": 11660
+    },
+    {
+      "epoch": 0.592199616707492,
+      "grad_norm": 0.06276218448082216,
+      "learning_rate": 0.0008873628259528878,
+      "loss": 0.6185,
+      "step": 11665
+    },
+    {
+      "epoch": 0.5924534528055235,
+      "grad_norm": 0.02996611743977883,
+      "learning_rate": 0.0008872227176173899,
+      "loss": 0.6132,
+      "step": 11670
+    },
+    {
+      "epoch": 0.592707288903555,
+      "grad_norm": 0.025393102232066908,
+      "learning_rate": 0.0008870825332720036,
+      "loss": 0.6025,
+      "step": 11675
+    },
+    {
+      "epoch": 0.5929611250015865,
+      "grad_norm": 0.04284571854613707,
+      "learning_rate": 0.0008869422729442465,
+      "loss": 0.6017,
+      "step": 11680
+    },
+    {
+      "epoch": 0.593214961099618,
+      "grad_norm": 0.03802645828166949,
+      "learning_rate": 0.0008868019366616508,
+      "loss": 0.6635,
+      "step": 11685
+    },
+    {
+      "epoch": 0.5934687971976494,
+      "grad_norm": 0.02382312826786996,
+      "learning_rate": 0.0008866615244517639,
+      "loss": 0.569,
+      "step": 11690
+    },
+    {
+      "epoch": 0.593722633295681,
+      "grad_norm": 0.027194057612111737,
+      "learning_rate": 0.000886521036342148,
+      "loss": 0.6323,
+      "step": 11695
+    },
+    {
+      "epoch": 0.5939764693937125,
+      "grad_norm": 0.030713511992266334,
+      "learning_rate": 0.0008863804723603803,
+      "loss": 0.5997,
+      "step": 11700
+    },
+    {
+      "epoch": 0.5942303054917439,
+      "grad_norm": 0.027258233011688147,
+      "learning_rate": 0.0008862398325340526,
+      "loss": 0.6317,
+      "step": 11705
+    },
+    {
+      "epoch": 0.5944841415897755,
+      "grad_norm": 0.03926742498370449,
+      "learning_rate": 0.0008860991168907721,
+      "loss": 0.6199,
+      "step": 11710
+    },
+    {
+      "epoch": 0.594737977687807,
+      "grad_norm": 0.043600047694201435,
+      "learning_rate": 0.0008859583254581605,
+      "loss": 0.6552,
+      "step": 11715
+    },
+    {
+      "epoch": 0.5949918137858384,
+      "grad_norm": 0.030006140812709456,
+      "learning_rate": 0.0008858174582638543,
+      "loss": 0.608,
+      "step": 11720
+    },
+    {
+      "epoch": 0.59524564988387,
+      "grad_norm": 0.022938563538545016,
+      "learning_rate": 0.0008856765153355051,
+      "loss": 0.6107,
+      "step": 11725
+    },
+    {
+      "epoch": 0.5954994859819015,
+      "grad_norm": 0.029591425378169715,
+      "learning_rate": 0.0008855354967007793,
+      "loss": 0.5701,
+      "step": 11730
+    },
+    {
+      "epoch": 0.595753322079933,
+      "grad_norm": 0.037546799230365954,
+      "learning_rate": 0.0008853944023873581,
+      "loss": 0.6099,
+      "step": 11735
+    },
+    {
+      "epoch": 0.5960071581779645,
+      "grad_norm": 0.030940988076782326,
+      "learning_rate": 0.0008852532324229379,
+      "loss": 0.6363,
+      "step": 11740
+    },
+    {
+      "epoch": 0.596260994275996,
+      "grad_norm": 0.02823182925476383,
+      "learning_rate": 0.0008851119868352292,
+      "loss": 0.6331,
+      "step": 11745
+    },
+    {
+      "epoch": 0.5965148303740275,
+      "grad_norm": 0.03156718715990183,
+      "learning_rate": 0.000884970665651958,
+      "loss": 0.6566,
+      "step": 11750
+    },
+    {
+      "epoch": 0.596768666472059,
+      "grad_norm": 0.041051536536051314,
+      "learning_rate": 0.0008848292689008653,
+      "loss": 0.644,
+      "step": 11755
+    },
+    {
+      "epoch": 0.5970225025700905,
+      "grad_norm": 0.023005223765102525,
+      "learning_rate": 0.0008846877966097059,
+      "loss": 0.592,
+      "step": 11760
+    },
+    {
+      "epoch": 0.597276338668122,
+      "grad_norm": 0.027109360668611315,
+      "learning_rate": 0.0008845462488062506,
+      "loss": 0.5873,
+      "step": 11765
+    },
+    {
+      "epoch": 0.5975301747661534,
+      "grad_norm": 0.027879918337945408,
+      "learning_rate": 0.0008844046255182844,
+      "loss": 0.6062,
+      "step": 11770
+    },
+    {
+      "epoch": 0.597784010864185,
+      "grad_norm": 0.03759374089636192,
+      "learning_rate": 0.0008842629267736072,
+      "loss": 0.5949,
+      "step": 11775
+    },
+    {
+      "epoch": 0.5980378469622165,
+      "grad_norm": 0.02369923860074637,
+      "learning_rate": 0.0008841211526000339,
+      "loss": 0.6322,
+      "step": 11780
+    },
+    {
+      "epoch": 0.598291683060248,
+      "grad_norm": 0.028051536267109798,
+      "learning_rate": 0.0008839793030253937,
+      "loss": 0.6232,
+      "step": 11785
+    },
+    {
+      "epoch": 0.5985455191582795,
+      "grad_norm": 0.056018315371266905,
+      "learning_rate": 0.0008838373780775315,
+      "loss": 0.6123,
+      "step": 11790
+    },
+    {
+      "epoch": 0.598799355256311,
+      "grad_norm": 0.03552958483190999,
+      "learning_rate": 0.000883695377784306,
+      "loss": 0.5976,
+      "step": 11795
+    },
+    {
+      "epoch": 0.5990531913543425,
+      "grad_norm": 0.032759803712457444,
+      "learning_rate": 0.0008835533021735914,
+      "loss": 0.5939,
+      "step": 11800
+    },
+    {
+      "epoch": 0.599307027452374,
+      "grad_norm": 0.049806073645126195,
+      "learning_rate": 0.0008834111512732763,
+      "loss": 0.6214,
+      "step": 11805
+    },
+    {
+      "epoch": 0.5995608635504055,
+      "grad_norm": 0.02802494107222027,
+      "learning_rate": 0.0008832689251112645,
+      "loss": 0.6295,
+      "step": 11810
+    },
+    {
+      "epoch": 0.599814699648437,
+      "grad_norm": 0.030855803033759243,
+      "learning_rate": 0.0008831266237154738,
+      "loss": 0.6435,
+      "step": 11815
+    },
+    {
+      "epoch": 0.6000685357464685,
+      "grad_norm": 0.024861923925470025,
+      "learning_rate": 0.0008829842471138376,
+      "loss": 0.6032,
+      "step": 11820
+    },
+    {
+      "epoch": 0.6003223718445,
+      "grad_norm": 0.02762869776919164,
+      "learning_rate": 0.0008828417953343035,
+      "loss": 0.5834,
+      "step": 11825
+    },
+    {
+      "epoch": 0.6005762079425315,
+      "grad_norm": 0.029202677438373507,
+      "learning_rate": 0.0008826992684048344,
+      "loss": 0.5823,
+      "step": 11830
+    },
+    {
+      "epoch": 0.6008300440405631,
+      "grad_norm": 0.04246057047785471,
+      "learning_rate": 0.0008825566663534074,
+      "loss": 0.5808,
+      "step": 11835
+    },
+    {
+      "epoch": 0.6010838801385945,
+      "grad_norm": 0.02677918683034048,
+      "learning_rate": 0.0008824139892080145,
+      "loss": 0.589,
+      "step": 11840
+    },
+    {
+      "epoch": 0.601337716236626,
+      "grad_norm": 0.022623912880773735,
+      "learning_rate": 0.0008822712369966628,
+      "loss": 0.5985,
+      "step": 11845
+    },
+    {
+      "epoch": 0.6015915523346576,
+      "grad_norm": 0.03863218744236265,
+      "learning_rate": 0.0008821284097473734,
+      "loss": 0.5964,
+      "step": 11850
+    },
+    {
+      "epoch": 0.601845388432689,
+      "grad_norm": 0.026457485001171663,
+      "learning_rate": 0.000881985507488183,
+      "loss": 0.592,
+      "step": 11855
+    },
+    {
+      "epoch": 0.6020992245307205,
+      "grad_norm": 0.03158510312459021,
+      "learning_rate": 0.0008818425302471424,
+      "loss": 0.5976,
+      "step": 11860
+    },
+    {
+      "epoch": 0.602353060628752,
+      "grad_norm": 0.044049060218755355,
+      "learning_rate": 0.0008816994780523175,
+      "loss": 0.5742,
+      "step": 11865
+    },
+    {
+      "epoch": 0.6026068967267835,
+      "grad_norm": 0.02599413568481521,
+      "learning_rate": 0.0008815563509317883,
+      "loss": 0.5854,
+      "step": 11870
+    },
+    {
+      "epoch": 0.602860732824815,
+      "grad_norm": 0.03864798470702778,
+      "learning_rate": 0.0008814131489136506,
+      "loss": 0.5732,
+      "step": 11875
+    },
+    {
+      "epoch": 0.6031145689228465,
+      "grad_norm": 0.032289876389169754,
+      "learning_rate": 0.0008812698720260135,
+      "loss": 0.621,
+      "step": 11880
+    },
+    {
+      "epoch": 0.6033684050208781,
+      "grad_norm": 0.05318024428744845,
+      "learning_rate": 0.000881126520297002,
+      "loss": 0.5706,
+      "step": 11885
+    },
+    {
+      "epoch": 0.6036222411189095,
+      "grad_norm": 0.03086598470372497,
+      "learning_rate": 0.0008809830937547554,
+      "loss": 0.601,
+      "step": 11890
+    },
+    {
+      "epoch": 0.603876077216941,
+      "grad_norm": 0.025731078946828065,
+      "learning_rate": 0.0008808395924274274,
+      "loss": 0.5996,
+      "step": 11895
+    },
+    {
+      "epoch": 0.6041299133149726,
+      "grad_norm": 0.025339932657364943,
+      "learning_rate": 0.0008806960163431866,
+      "loss": 0.601,
+      "step": 11900
+    },
+    {
+      "epoch": 0.604383749413004,
+      "grad_norm": 0.037850633222621166,
+      "learning_rate": 0.0008805523655302164,
+      "loss": 0.5848,
+      "step": 11905
+    },
+    {
+      "epoch": 0.6046375855110355,
+      "grad_norm": 0.026694596427904598,
+      "learning_rate": 0.0008804086400167146,
+      "loss": 0.6027,
+      "step": 11910
+    },
+    {
+      "epoch": 0.6048914216090671,
+      "grad_norm": 0.028977129692472393,
+      "learning_rate": 0.0008802648398308939,
+      "loss": 0.5947,
+      "step": 11915
+    },
+    {
+      "epoch": 0.6051452577070985,
+      "grad_norm": 0.02467551449885606,
+      "learning_rate": 0.0008801209650009813,
+      "loss": 0.6287,
+      "step": 11920
+    },
+    {
+      "epoch": 0.60539909380513,
+      "grad_norm": 0.024282792825939505,
+      "learning_rate": 0.0008799770155552192,
+      "loss": 0.5401,
+      "step": 11925
+    },
+    {
+      "epoch": 0.6056529299031616,
+      "grad_norm": 0.04118279824961832,
+      "learning_rate": 0.0008798329915218638,
+      "loss": 0.6076,
+      "step": 11930
+    },
+    {
+      "epoch": 0.605906766001193,
+      "grad_norm": 0.025423020492523492,
+      "learning_rate": 0.0008796888929291864,
+      "loss": 0.5958,
+      "step": 11935
+    },
+    {
+      "epoch": 0.6061606020992245,
+      "grad_norm": 0.02499168872123397,
+      "learning_rate": 0.0008795447198054729,
+      "loss": 0.6043,
+      "step": 11940
+    },
+    {
+      "epoch": 0.606414438197256,
+      "grad_norm": 0.042162291687962206,
+      "learning_rate": 0.0008794004721790235,
+      "loss": 0.589,
+      "step": 11945
+    },
+    {
+      "epoch": 0.6066682742952876,
+      "grad_norm": 0.03291178818373513,
+      "learning_rate": 0.0008792561500781535,
+      "loss": 0.5671,
+      "step": 11950
+    },
+    {
+      "epoch": 0.606922110393319,
+      "grad_norm": 0.0376953507530463,
+      "learning_rate": 0.0008791117535311928,
+      "loss": 0.6097,
+      "step": 11955
+    },
+    {
+      "epoch": 0.6071759464913505,
+      "grad_norm": 0.025349852157537867,
+      "learning_rate": 0.0008789672825664854,
+      "loss": 0.5582,
+      "step": 11960
+    },
+    {
+      "epoch": 0.6074297825893821,
+      "grad_norm": 0.02634738699215234,
+      "learning_rate": 0.0008788227372123902,
+      "loss": 0.6388,
+      "step": 11965
+    },
+    {
+      "epoch": 0.6076836186874135,
+      "grad_norm": 0.030461346907874626,
+      "learning_rate": 0.0008786781174972811,
+      "loss": 0.6106,
+      "step": 11970
+    },
+    {
+      "epoch": 0.607937454785445,
+      "grad_norm": 0.033633731377443056,
+      "learning_rate": 0.0008785334234495459,
+      "loss": 0.5998,
+      "step": 11975
+    },
+    {
+      "epoch": 0.6081912908834766,
+      "grad_norm": 0.022923564848087382,
+      "learning_rate": 0.0008783886550975872,
+      "loss": 0.547,
+      "step": 11980
+    },
+    {
+      "epoch": 0.608445126981508,
+      "grad_norm": 0.029198962706889067,
+      "learning_rate": 0.0008782438124698229,
+      "loss": 0.618,
+      "step": 11985
+    },
+    {
+      "epoch": 0.6086989630795395,
+      "grad_norm": 0.026047548738704816,
+      "learning_rate": 0.0008780988955946843,
+      "loss": 0.5686,
+      "step": 11990
+    },
+    {
+      "epoch": 0.6089527991775711,
+      "grad_norm": 0.02660176396313372,
+      "learning_rate": 0.0008779539045006182,
+      "loss": 0.5884,
+      "step": 11995
+    },
+    {
+      "epoch": 0.6092066352756026,
+      "grad_norm": 0.030589674203168663,
+      "learning_rate": 0.0008778088392160853,
+      "loss": 0.6039,
+      "step": 12000
+    },
+    {
+      "epoch": 0.609460471373634,
+      "grad_norm": 0.050693411471426746,
+      "learning_rate": 0.0008776636997695615,
+      "loss": 0.6013,
+      "step": 12005
+    },
+    {
+      "epoch": 0.6097143074716656,
+      "grad_norm": 0.026446497681540608,
+      "learning_rate": 0.0008775184861895369,
+      "loss": 0.5416,
+      "step": 12010
+    },
+    {
+      "epoch": 0.6099681435696971,
+      "grad_norm": 0.1932080655888927,
+      "learning_rate": 0.0008773731985045162,
+      "loss": 0.5818,
+      "step": 12015
+    },
+    {
+      "epoch": 0.6102219796677285,
+      "grad_norm": 0.027856228756609126,
+      "learning_rate": 0.0008772278367430185,
+      "loss": 0.5952,
+      "step": 12020
+    },
+    {
+      "epoch": 0.61047581576576,
+      "grad_norm": 0.04117569443549469,
+      "learning_rate": 0.0008770824009335775,
+      "loss": 0.5738,
+      "step": 12025
+    },
+    {
+      "epoch": 0.6107296518637916,
+      "grad_norm": 0.025174168899307972,
+      "learning_rate": 0.000876936891104742,
+      "loss": 0.5825,
+      "step": 12030
+    },
+    {
+      "epoch": 0.610983487961823,
+      "grad_norm": 0.024070740808714138,
+      "learning_rate": 0.0008767913072850743,
+      "loss": 0.5766,
+      "step": 12035
+    },
+    {
+      "epoch": 0.6112373240598545,
+      "grad_norm": 0.022925340728893576,
+      "learning_rate": 0.0008766456495031521,
+      "loss": 0.6117,
+      "step": 12040
+    },
+    {
+      "epoch": 0.6114911601578861,
+      "grad_norm": 0.02650268728322704,
+      "learning_rate": 0.0008764999177875673,
+      "loss": 0.594,
+      "step": 12045
+    },
+    {
+      "epoch": 0.6117449962559176,
+      "grad_norm": 0.04585848245773159,
+      "learning_rate": 0.0008763541121669263,
+      "loss": 0.5841,
+      "step": 12050
+    },
+    {
+      "epoch": 0.611998832353949,
+      "grad_norm": 0.028232685184612846,
+      "learning_rate": 0.0008762082326698498,
+      "loss": 0.5985,
+      "step": 12055
+    },
+    {
+      "epoch": 0.6122526684519806,
+      "grad_norm": 0.02453819199214054,
+      "learning_rate": 0.0008760622793249735,
+      "loss": 0.6275,
+      "step": 12060
+    },
+    {
+      "epoch": 0.6125065045500121,
+      "grad_norm": 0.03043290217761071,
+      "learning_rate": 0.0008759162521609472,
+      "loss": 0.5899,
+      "step": 12065
+    },
+    {
+      "epoch": 0.6127603406480435,
+      "grad_norm": 0.035932514886264776,
+      "learning_rate": 0.0008757701512064351,
+      "loss": 0.5917,
+      "step": 12070
+    },
+    {
+      "epoch": 0.6130141767460751,
+      "grad_norm": 0.024922837264232393,
+      "learning_rate": 0.0008756239764901165,
+      "loss": 0.5963,
+      "step": 12075
+    },
+    {
+      "epoch": 0.6132680128441066,
+      "grad_norm": 0.021912894422967358,
+      "learning_rate": 0.0008754777280406845,
+      "loss": 0.5423,
+      "step": 12080
+    },
+    {
+      "epoch": 0.613521848942138,
+      "grad_norm": 0.03153730727693658,
+      "learning_rate": 0.0008753314058868469,
+      "loss": 0.6127,
+      "step": 12085
+    },
+    {
+      "epoch": 0.6137756850401695,
+      "grad_norm": 0.026200893072720284,
+      "learning_rate": 0.0008751850100573262,
+      "loss": 0.5789,
+      "step": 12090
+    },
+    {
+      "epoch": 0.6140295211382011,
+      "grad_norm": 0.03394058282414533,
+      "learning_rate": 0.000875038540580859,
+      "loss": 0.5715,
+      "step": 12095
+    },
+    {
+      "epoch": 0.6142833572362326,
+      "grad_norm": 0.027581562059680105,
+      "learning_rate": 0.0008748919974861967,
+      "loss": 0.5693,
+      "step": 12100
+    },
+    {
+      "epoch": 0.614537193334264,
+      "grad_norm": 0.029272850241735242,
+      "learning_rate": 0.0008747453808021047,
+      "loss": 0.6127,
+      "step": 12105
+    },
+    {
+      "epoch": 0.6147910294322956,
+      "grad_norm": 0.02771381768749453,
+      "learning_rate": 0.0008745986905573634,
+      "loss": 0.5814,
+      "step": 12110
+    },
+    {
+      "epoch": 0.6150448655303271,
+      "grad_norm": 0.02582305425809761,
+      "learning_rate": 0.0008744519267807673,
+      "loss": 0.5818,
+      "step": 12115
+    },
+    {
+      "epoch": 0.6152987016283585,
+      "grad_norm": 0.03650039105604249,
+      "learning_rate": 0.0008743050895011253,
+      "loss": 0.5948,
+      "step": 12120
+    },
+    {
+      "epoch": 0.6155525377263901,
+      "grad_norm": 0.025855707462884535,
+      "learning_rate": 0.000874158178747261,
+      "loss": 0.5788,
+      "step": 12125
+    },
+    {
+      "epoch": 0.6158063738244216,
+      "grad_norm": 0.02817299125660346,
+      "learning_rate": 0.000874011194548012,
+      "loss": 0.5825,
+      "step": 12130
+    },
+    {
+      "epoch": 0.616060209922453,
+      "grad_norm": 0.05520364073190341,
+      "learning_rate": 0.0008738641369322308,
+      "loss": 0.5901,
+      "step": 12135
+    },
+    {
+      "epoch": 0.6163140460204846,
+      "grad_norm": 0.04862377362129934,
+      "learning_rate": 0.0008737170059287838,
+      "loss": 0.5905,
+      "step": 12140
+    },
+    {
+      "epoch": 0.6165678821185161,
+      "grad_norm": 0.043586867841226354,
+      "learning_rate": 0.0008735698015665525,
+      "loss": 0.6009,
+      "step": 12145
+    },
+    {
+      "epoch": 0.6168217182165475,
+      "grad_norm": 0.02855284608566154,
+      "learning_rate": 0.000873422523874432,
+      "loss": 0.5848,
+      "step": 12150
+    },
+    {
+      "epoch": 0.6170755543145791,
+      "grad_norm": 0.02574982692090748,
+      "learning_rate": 0.0008732751728813324,
+      "loss": 0.6078,
+      "step": 12155
+    },
+    {
+      "epoch": 0.6173293904126106,
+      "grad_norm": 0.028740949383117314,
+      "learning_rate": 0.0008731277486161777,
+      "loss": 0.5622,
+      "step": 12160
+    },
+    {
+      "epoch": 0.6175832265106421,
+      "grad_norm": 0.028127163339311326,
+      "learning_rate": 0.000872980251107907,
+      "loss": 0.5664,
+      "step": 12165
+    },
+    {
+      "epoch": 0.6178370626086735,
+      "grad_norm": 0.024587054800826992,
+      "learning_rate": 0.0008728326803854728,
+      "loss": 0.6062,
+      "step": 12170
+    },
+    {
+      "epoch": 0.6180908987067051,
+      "grad_norm": 0.03673435237469503,
+      "learning_rate": 0.0008726850364778429,
+      "loss": 0.6079,
+      "step": 12175
+    },
+    {
+      "epoch": 0.6183447348047366,
+      "grad_norm": 0.022463713399365034,
+      "learning_rate": 0.000872537319413999,
+      "loss": 0.5885,
+      "step": 12180
+    },
+    {
+      "epoch": 0.618598570902768,
+      "grad_norm": 0.04006474303404941,
+      "learning_rate": 0.000872389529222937,
+      "loss": 0.5838,
+      "step": 12185
+    },
+    {
+      "epoch": 0.6188524070007996,
+      "grad_norm": 0.03323874643198722,
+      "learning_rate": 0.0008722416659336676,
+      "loss": 0.5907,
+      "step": 12190
+    },
+    {
+      "epoch": 0.6191062430988311,
+      "grad_norm": 0.0325269083515918,
+      "learning_rate": 0.0008720937295752153,
+      "loss": 0.5848,
+      "step": 12195
+    },
+    {
+      "epoch": 0.6193600791968625,
+      "grad_norm": 0.023584804219643197,
+      "learning_rate": 0.0008719457201766199,
+      "loss": 0.5965,
+      "step": 12200
+    },
+    {
+      "epoch": 0.6196139152948941,
+      "grad_norm": 0.02567919434786954,
+      "learning_rate": 0.0008717976377669343,
+      "loss": 0.5644,
+      "step": 12205
+    },
+    {
+      "epoch": 0.6198677513929256,
+      "grad_norm": 0.023457406627368636,
+      "learning_rate": 0.0008716494823752265,
+      "loss": 0.5846,
+      "step": 12210
+    },
+    {
+      "epoch": 0.6201215874909571,
+      "grad_norm": 0.10110762567310781,
+      "learning_rate": 0.0008715012540305789,
+      "loss": 0.5958,
+      "step": 12215
+    },
+    {
+      "epoch": 0.6203754235889886,
+      "grad_norm": 0.027499055613954763,
+      "learning_rate": 0.0008713529527620876,
+      "loss": 0.6264,
+      "step": 12220
+    },
+    {
+      "epoch": 0.6206292596870201,
+      "grad_norm": 0.04044664204628187,
+      "learning_rate": 0.0008712045785988638,
+      "loss": 0.5648,
+      "step": 12225
+    },
+    {
+      "epoch": 0.6208830957850516,
+      "grad_norm": 0.026521946530749428,
+      "learning_rate": 0.0008710561315700323,
+      "loss": 0.6316,
+      "step": 12230
+    },
+    {
+      "epoch": 0.621136931883083,
+      "grad_norm": 0.024722241475712593,
+      "learning_rate": 0.0008709076117047326,
+      "loss": 0.5668,
+      "step": 12235
+    },
+    {
+      "epoch": 0.6213907679811146,
+      "grad_norm": 0.03259920434042201,
+      "learning_rate": 0.0008707590190321186,
+      "loss": 0.5987,
+      "step": 12240
+    },
+    {
+      "epoch": 0.6216446040791461,
+      "grad_norm": 3.80266301123272,
+      "learning_rate": 0.000870610353581358,
+      "loss": 0.6304,
+      "step": 12245
+    },
+    {
+      "epoch": 0.6218984401771775,
+      "grad_norm": 0.08694604714084496,
+      "learning_rate": 0.0008704616153816332,
+      "loss": 0.649,
+      "step": 12250
+    },
+    {
+      "epoch": 0.6221522762752091,
+      "grad_norm": 0.05774492909835282,
+      "learning_rate": 0.0008703128044621409,
+      "loss": 0.6147,
+      "step": 12255
+    },
+    {
+      "epoch": 0.6224061123732406,
+      "grad_norm": 0.04804931614993646,
+      "learning_rate": 0.0008701639208520917,
+      "loss": 0.6145,
+      "step": 12260
+    },
+    {
+      "epoch": 0.6226599484712722,
+      "grad_norm": 0.08384982616634495,
+      "learning_rate": 0.000870014964580711,
+      "loss": 0.6363,
+      "step": 12265
+    },
+    {
+      "epoch": 0.6229137845693036,
+      "grad_norm": 0.03400226647956488,
+      "learning_rate": 0.000869865935677238,
+      "loss": 0.5997,
+      "step": 12270
+    },
+    {
+      "epoch": 0.6231676206673351,
+      "grad_norm": 0.026650165047009896,
+      "learning_rate": 0.0008697168341709263,
+      "loss": 0.6205,
+      "step": 12275
+    },
+    {
+      "epoch": 0.6234214567653666,
+      "grad_norm": 0.03254784519557384,
+      "learning_rate": 0.0008695676600910437,
+      "loss": 0.6218,
+      "step": 12280
+    },
+    {
+      "epoch": 0.6236752928633981,
+      "grad_norm": 0.025852394090405486,
+      "learning_rate": 0.0008694184134668726,
+      "loss": 0.6043,
+      "step": 12285
+    },
+    {
+      "epoch": 0.6239291289614296,
+      "grad_norm": 0.03673450921472964,
+      "learning_rate": 0.0008692690943277092,
+      "loss": 0.6277,
+      "step": 12290
+    },
+    {
+      "epoch": 0.6241829650594611,
+      "grad_norm": 0.026035740947944315,
+      "learning_rate": 0.0008691197027028641,
+      "loss": 0.6135,
+      "step": 12295
+    },
+    {
+      "epoch": 0.6244368011574926,
+      "grad_norm": 0.035313678241679185,
+      "learning_rate": 0.0008689702386216622,
+      "loss": 0.584,
+      "step": 12300
+    },
+    {
+      "epoch": 0.6246906372555241,
+      "grad_norm": 0.03306444173549,
+      "learning_rate": 0.0008688207021134424,
+      "loss": 0.6576,
+      "step": 12305
+    },
+    {
+      "epoch": 0.6249444733535556,
+      "grad_norm": 0.03609474653140698,
+      "learning_rate": 0.0008686710932075582,
+      "loss": 0.5882,
+      "step": 12310
+    },
+    {
+      "epoch": 0.6251983094515872,
+      "grad_norm": 0.02644393403575191,
+      "learning_rate": 0.000868521411933377,
+      "loss": 0.5805,
+      "step": 12315
+    },
+    {
+      "epoch": 0.6254521455496186,
+      "grad_norm": 0.03558589982698361,
+      "learning_rate": 0.0008683716583202803,
+      "loss": 0.597,
+      "step": 12320
+    },
+    {
+      "epoch": 0.6257059816476501,
+      "grad_norm": 0.027207972337934594,
+      "learning_rate": 0.0008682218323976643,
+      "loss": 0.6125,
+      "step": 12325
+    },
+    {
+      "epoch": 0.6259598177456817,
+      "grad_norm": 0.030538062894411674,
+      "learning_rate": 0.0008680719341949388,
+      "loss": 0.6047,
+      "step": 12330
+    },
+    {
+      "epoch": 0.6262136538437131,
+      "grad_norm": 0.02839935399711346,
+      "learning_rate": 0.0008679219637415281,
+      "loss": 0.5826,
+      "step": 12335
+    },
+    {
+      "epoch": 0.6264674899417446,
+      "grad_norm": 0.03696443623177008,
+      "learning_rate": 0.0008677719210668708,
+      "loss": 0.6189,
+      "step": 12340
+    },
+    {
+      "epoch": 0.6267213260397761,
+      "grad_norm": 0.037688218408350786,
+      "learning_rate": 0.0008676218062004196,
+      "loss": 0.6191,
+      "step": 12345
+    },
+    {
+      "epoch": 0.6269751621378076,
+      "grad_norm": 0.030327644773068856,
+      "learning_rate": 0.0008674716191716412,
+      "loss": 0.5768,
+      "step": 12350
+    },
+    {
+      "epoch": 0.6272289982358391,
+      "grad_norm": 0.026282274839364978,
+      "learning_rate": 0.0008673213600100165,
+      "loss": 0.589,
+      "step": 12355
+    },
+    {
+      "epoch": 0.6274828343338706,
+      "grad_norm": 0.023814603743172128,
+      "learning_rate": 0.0008671710287450406,
+      "loss": 0.5615,
+      "step": 12360
+    },
+    {
+      "epoch": 0.6277366704319021,
+      "grad_norm": 0.027581917581802243,
+      "learning_rate": 0.0008670206254062227,
+      "loss": 0.6108,
+      "step": 12365
+    },
+    {
+      "epoch": 0.6279905065299336,
+      "grad_norm": 0.026215277115865347,
+      "learning_rate": 0.0008668701500230865,
+      "loss": 0.6145,
+      "step": 12370
+    },
+    {
+      "epoch": 0.6282443426279651,
+      "grad_norm": 0.025711494856042162,
+      "learning_rate": 0.0008667196026251694,
+      "loss": 0.6259,
+      "step": 12375
+    },
+    {
+      "epoch": 0.6284981787259967,
+      "grad_norm": 0.02851492204909293,
+      "learning_rate": 0.0008665689832420231,
+      "loss": 0.6542,
+      "step": 12380
+    },
+    {
+      "epoch": 0.6287520148240281,
+      "grad_norm": 0.0445871823188339,
+      "learning_rate": 0.0008664182919032135,
+      "loss": 0.5829,
+      "step": 12385
+    },
+    {
+      "epoch": 0.6290058509220596,
+      "grad_norm": 0.03817207065267871,
+      "learning_rate": 0.0008662675286383206,
+      "loss": 0.5721,
+      "step": 12390
+    },
+    {
+      "epoch": 0.6292596870200912,
+      "grad_norm": 0.025461696058893753,
+      "learning_rate": 0.0008661166934769384,
+      "loss": 0.6207,
+      "step": 12395
+    },
+    {
+      "epoch": 0.6295135231181226,
+      "grad_norm": 0.03291254373796645,
+      "learning_rate": 0.000865965786448675,
+      "loss": 0.6254,
+      "step": 12400
+    },
+    {
+      "epoch": 0.6297673592161541,
+      "grad_norm": 0.02456833220774285,
+      "learning_rate": 0.0008658148075831529,
+      "loss": 0.6136,
+      "step": 12405
+    },
+    {
+      "epoch": 0.6300211953141857,
+      "grad_norm": 0.03660349536181771,
+      "learning_rate": 0.0008656637569100083,
+      "loss": 0.6312,
+      "step": 12410
+    },
+    {
+      "epoch": 0.6302750314122171,
+      "grad_norm": 0.028629048846541345,
+      "learning_rate": 0.0008655126344588917,
+      "loss": 0.6128,
+      "step": 12415
+    },
+    {
+      "epoch": 0.6305288675102486,
+      "grad_norm": 0.027152018036820328,
+      "learning_rate": 0.0008653614402594679,
+      "loss": 0.5857,
+      "step": 12420
+    },
+    {
+      "epoch": 0.6307827036082801,
+      "grad_norm": 0.02680519867482763,
+      "learning_rate": 0.0008652101743414154,
+      "loss": 0.6,
+      "step": 12425
+    },
+    {
+      "epoch": 0.6310365397063117,
+      "grad_norm": 0.038568306378071626,
+      "learning_rate": 0.000865058836734427,
+      "loss": 0.5764,
+      "step": 12430
+    },
+    {
+      "epoch": 0.6312903758043431,
+      "grad_norm": 0.022332649415363656,
+      "learning_rate": 0.0008649074274682094,
+      "loss": 0.5723,
+      "step": 12435
+    },
+    {
+      "epoch": 0.6315442119023746,
+      "grad_norm": 0.03745046556172155,
+      "learning_rate": 0.0008647559465724837,
+      "loss": 0.6402,
+      "step": 12440
+    },
+    {
+      "epoch": 0.6317980480004062,
+      "grad_norm": 0.043267157791484344,
+      "learning_rate": 0.0008646043940769846,
+      "loss": 0.614,
+      "step": 12445
+    },
+    {
+      "epoch": 0.6320518840984376,
+      "grad_norm": 0.026176498937215194,
+      "learning_rate": 0.0008644527700114613,
+      "loss": 0.6109,
+      "step": 12450
+    },
+    {
+      "epoch": 0.6323057201964691,
+      "grad_norm": 0.026927373965425307,
+      "learning_rate": 0.0008643010744056768,
+      "loss": 0.5921,
+      "step": 12455
+    },
+    {
+      "epoch": 0.6325595562945007,
+      "grad_norm": 0.030985586351536417,
+      "learning_rate": 0.0008641493072894081,
+      "loss": 0.6037,
+      "step": 12460
+    },
+    {
+      "epoch": 0.6328133923925321,
+      "grad_norm": 0.034088985294575574,
+      "learning_rate": 0.0008639974686924463,
+      "loss": 0.5987,
+      "step": 12465
+    },
+    {
+      "epoch": 0.6330672284905636,
+      "grad_norm": 0.04071949159100916,
+      "learning_rate": 0.0008638455586445967,
+      "loss": 0.598,
+      "step": 12470
+    },
+    {
+      "epoch": 0.6333210645885952,
+      "grad_norm": 0.04119075917419428,
+      "learning_rate": 0.0008636935771756787,
+      "loss": 0.6045,
+      "step": 12475
+    },
+    {
+      "epoch": 0.6335749006866267,
+      "grad_norm": 0.024601400128880478,
+      "learning_rate": 0.000863541524315525,
+      "loss": 0.5895,
+      "step": 12480
+    },
+    {
+      "epoch": 0.6338287367846581,
+      "grad_norm": 0.025669915239589995,
+      "learning_rate": 0.000863389400093983,
+      "loss": 0.6138,
+      "step": 12485
+    },
+    {
+      "epoch": 0.6340825728826897,
+      "grad_norm": 0.07547210741704416,
+      "learning_rate": 0.0008632372045409141,
+      "loss": 0.5917,
+      "step": 12490
+    },
+    {
+      "epoch": 0.6343364089807212,
+      "grad_norm": 0.08126800386669199,
+      "learning_rate": 0.0008630849376861933,
+      "loss": 0.5978,
+      "step": 12495
+    },
+    {
+      "epoch": 0.6345902450787526,
+      "grad_norm": 0.039597508909703365,
+      "learning_rate": 0.0008629325995597101,
+      "loss": 0.6147,
+      "step": 12500
+    },
+    {
+      "epoch": 0.6348440811767841,
+      "grad_norm": 0.03847450277611692,
+      "learning_rate": 0.0008627801901913675,
+      "loss": 0.6161,
+      "step": 12505
+    },
+    {
+      "epoch": 0.6350979172748157,
+      "grad_norm": 0.031096240484264088,
+      "learning_rate": 0.0008626277096110826,
+      "loss": 0.6256,
+      "step": 12510
+    },
+    {
+      "epoch": 0.6353517533728471,
+      "grad_norm": 0.0351603079410334,
+      "learning_rate": 0.0008624751578487868,
+      "loss": 0.5906,
+      "step": 12515
+    },
+    {
+      "epoch": 0.6356055894708786,
+      "grad_norm": 0.05304450618895726,
+      "learning_rate": 0.0008623225349344252,
+      "loss": 0.5935,
+      "step": 12520
+    },
+    {
+      "epoch": 0.6358594255689102,
+      "grad_norm": 0.042723117144634414,
+      "learning_rate": 0.000862169840897957,
+      "loss": 0.6222,
+      "step": 12525
+    },
+    {
+      "epoch": 0.6361132616669417,
+      "grad_norm": 0.06453506596782073,
+      "learning_rate": 0.0008620170757693551,
+      "loss": 0.5989,
+      "step": 12530
+    },
+    {
+      "epoch": 0.6363670977649731,
+      "grad_norm": 0.03504784561459624,
+      "learning_rate": 0.0008618642395786065,
+      "loss": 0.5673,
+      "step": 12535
+    },
+    {
+      "epoch": 0.6366209338630047,
+      "grad_norm": 0.0351102465977865,
+      "learning_rate": 0.0008617113323557124,
+      "loss": 0.6076,
+      "step": 12540
+    },
+    {
+      "epoch": 0.6368747699610362,
+      "grad_norm": 0.030573876028541808,
+      "learning_rate": 0.0008615583541306875,
+      "loss": 0.5974,
+      "step": 12545
+    },
+    {
+      "epoch": 0.6371286060590676,
+      "grad_norm": 0.03746793184981241,
+      "learning_rate": 0.0008614053049335608,
+      "loss": 0.5956,
+      "step": 12550
+    },
+    {
+      "epoch": 0.6373824421570992,
+      "grad_norm": 0.03858770491598553,
+      "learning_rate": 0.0008612521847943751,
+      "loss": 0.5991,
+      "step": 12555
+    },
+    {
+      "epoch": 0.6376362782551307,
+      "grad_norm": 0.026289258814351405,
+      "learning_rate": 0.0008610989937431872,
+      "loss": 0.5822,
+      "step": 12560
+    },
+    {
+      "epoch": 0.6378901143531621,
+      "grad_norm": 0.03593649422727849,
+      "learning_rate": 0.0008609457318100674,
+      "loss": 0.6193,
+      "step": 12565
+    },
+    {
+      "epoch": 0.6381439504511937,
+      "grad_norm": 0.02527123338083977,
+      "learning_rate": 0.0008607923990251005,
+      "loss": 0.5848,
+      "step": 12570
+    },
+    {
+      "epoch": 0.6383977865492252,
+      "grad_norm": 0.03742296667921461,
+      "learning_rate": 0.0008606389954183851,
+      "loss": 0.625,
+      "step": 12575
+    },
+    {
+      "epoch": 0.6386516226472566,
+      "grad_norm": 0.025365542502550003,
+      "learning_rate": 0.0008604855210200333,
+      "loss": 0.5827,
+      "step": 12580
+    },
+    {
+      "epoch": 0.6389054587452881,
+      "grad_norm": 0.0405102841258219,
+      "learning_rate": 0.0008603319758601715,
+      "loss": 0.6005,
+      "step": 12585
+    },
+    {
+      "epoch": 0.6391592948433197,
+      "grad_norm": 0.034192188660412175,
+      "learning_rate": 0.0008601783599689399,
+      "loss": 0.5751,
+      "step": 12590
+    },
+    {
+      "epoch": 0.6394131309413512,
+      "grad_norm": 0.027215532759491887,
+      "learning_rate": 0.0008600246733764923,
+      "loss": 0.5862,
+      "step": 12595
+    },
+    {
+      "epoch": 0.6396669670393826,
+      "grad_norm": 0.03061750468523527,
+      "learning_rate": 0.0008598709161129969,
+      "loss": 0.5811,
+      "step": 12600
+    },
+    {
+      "epoch": 0.6399208031374142,
+      "grad_norm": 0.028928109299113807,
+      "learning_rate": 0.0008597170882086351,
+      "loss": 0.5806,
+      "step": 12605
+    },
+    {
+      "epoch": 0.6401746392354457,
+      "grad_norm": 0.027021955515260244,
+      "learning_rate": 0.000859563189693603,
+      "loss": 0.6105,
+      "step": 12610
+    },
+    {
+      "epoch": 0.6404284753334771,
+      "grad_norm": 0.027332261017979507,
+      "learning_rate": 0.0008594092205981099,
+      "loss": 0.5754,
+      "step": 12615
+    },
+    {
+      "epoch": 0.6406823114315087,
+      "grad_norm": 0.02712378032650469,
+      "learning_rate": 0.0008592551809523791,
+      "loss": 0.6216,
+      "step": 12620
+    },
+    {
+      "epoch": 0.6409361475295402,
+      "grad_norm": 0.02824436218143356,
+      "learning_rate": 0.0008591010707866478,
+      "loss": 0.6344,
+      "step": 12625
+    },
+    {
+      "epoch": 0.6411899836275716,
+      "grad_norm": 0.023321393807142935,
+      "learning_rate": 0.0008589468901311672,
+      "loss": 0.6034,
+      "step": 12630
+    },
+    {
+      "epoch": 0.6414438197256032,
+      "grad_norm": 0.03151905452662985,
+      "learning_rate": 0.0008587926390162022,
+      "loss": 0.587,
+      "step": 12635
+    },
+    {
+      "epoch": 0.6416976558236347,
+      "grad_norm": 0.030624964870169982,
+      "learning_rate": 0.0008586383174720315,
+      "loss": 0.6196,
+      "step": 12640
+    },
+    {
+      "epoch": 0.6419514919216662,
+      "grad_norm": 0.025288630292415442,
+      "learning_rate": 0.0008584839255289475,
+      "loss": 0.6114,
+      "step": 12645
+    },
+    {
+      "epoch": 0.6422053280196977,
+      "grad_norm": 0.028822411703530795,
+      "learning_rate": 0.0008583294632172567,
+      "loss": 0.598,
+      "step": 12650
+    },
+    {
+      "epoch": 0.6424591641177292,
+      "grad_norm": 0.026442735674073018,
+      "learning_rate": 0.0008581749305672792,
+      "loss": 0.5951,
+      "step": 12655
+    },
+    {
+      "epoch": 0.6427130002157607,
+      "grad_norm": 0.026576792989007662,
+      "learning_rate": 0.0008580203276093492,
+      "loss": 0.5872,
+      "step": 12660
+    },
+    {
+      "epoch": 0.6429668363137921,
+      "grad_norm": 0.029042645916427186,
+      "learning_rate": 0.0008578656543738141,
+      "loss": 0.5867,
+      "step": 12665
+    },
+    {
+      "epoch": 0.6432206724118237,
+      "grad_norm": 0.04209495966527208,
+      "learning_rate": 0.0008577109108910359,
+      "loss": 0.5837,
+      "step": 12670
+    },
+    {
+      "epoch": 0.6434745085098552,
+      "grad_norm": 0.043317431826600195,
+      "learning_rate": 0.0008575560971913898,
+      "loss": 0.5904,
+      "step": 12675
+    },
+    {
+      "epoch": 0.6437283446078866,
+      "grad_norm": 0.038439775882813054,
+      "learning_rate": 0.0008574012133052649,
+      "loss": 0.5627,
+      "step": 12680
+    },
+    {
+      "epoch": 0.6439821807059182,
+      "grad_norm": 0.48049526028798634,
+      "learning_rate": 0.0008572462592630641,
+      "loss": 0.5769,
+      "step": 12685
+    },
+    {
+      "epoch": 0.6442360168039497,
+      "grad_norm": 0.033053890270663924,
+      "learning_rate": 0.0008570912350952044,
+      "loss": 0.5878,
+      "step": 12690
+    },
+    {
+      "epoch": 0.6444898529019812,
+      "grad_norm": 0.07863611845071104,
+      "learning_rate": 0.0008569361408321159,
+      "loss": 0.6133,
+      "step": 12695
+    },
+    {
+      "epoch": 0.6447436890000127,
+      "grad_norm": 0.029398204077650845,
+      "learning_rate": 0.000856780976504243,
+      "loss": 0.5799,
+      "step": 12700
+    },
+    {
+      "epoch": 0.6449975250980442,
+      "grad_norm": 0.02869897907225985,
+      "learning_rate": 0.0008566257421420439,
+      "loss": 0.6008,
+      "step": 12705
+    },
+    {
+      "epoch": 0.6452513611960757,
+      "grad_norm": 0.026097172563309384,
+      "learning_rate": 0.0008564704377759897,
+      "loss": 0.5939,
+      "step": 12710
+    },
+    {
+      "epoch": 0.6455051972941072,
+      "grad_norm": 0.02502019281825702,
+      "learning_rate": 0.0008563150634365666,
+      "loss": 0.598,
+      "step": 12715
+    },
+    {
+      "epoch": 0.6457590333921387,
+      "grad_norm": 0.02483685696837895,
+      "learning_rate": 0.0008561596191542733,
+      "loss": 0.5801,
+      "step": 12720
+    },
+    {
+      "epoch": 0.6460128694901702,
+      "grad_norm": 0.03535549265065644,
+      "learning_rate": 0.000856004104959623,
+      "loss": 0.5946,
+      "step": 12725
+    },
+    {
+      "epoch": 0.6462667055882017,
+      "grad_norm": 0.027879010371972223,
+      "learning_rate": 0.0008558485208831424,
+      "loss": 0.612,
+      "step": 12730
+    },
+    {
+      "epoch": 0.6465205416862332,
+      "grad_norm": 0.04735218553478145,
+      "learning_rate": 0.0008556928669553717,
+      "loss": 0.5938,
+      "step": 12735
+    },
+    {
+      "epoch": 0.6467743777842647,
+      "grad_norm": 0.0279629249002624,
+      "learning_rate": 0.000855537143206865,
+      "loss": 0.5837,
+      "step": 12740
+    },
+    {
+      "epoch": 0.6470282138822963,
+      "grad_norm": 0.02533240032359082,
+      "learning_rate": 0.00085538134966819,
+      "loss": 0.6252,
+      "step": 12745
+    },
+    {
+      "epoch": 0.6472820499803277,
+      "grad_norm": 0.025603639669077066,
+      "learning_rate": 0.0008552254863699286,
+      "loss": 0.5819,
+      "step": 12750
+    },
+    {
+      "epoch": 0.6475358860783592,
+      "grad_norm": 0.04145899859904642,
+      "learning_rate": 0.0008550695533426756,
+      "loss": 0.597,
+      "step": 12755
+    },
+    {
+      "epoch": 0.6477897221763907,
+      "grad_norm": 0.029237034218711128,
+      "learning_rate": 0.00085491355061704,
+      "loss": 0.6067,
+      "step": 12760
+    },
+    {
+      "epoch": 0.6480435582744222,
+      "grad_norm": 0.030474815495890747,
+      "learning_rate": 0.0008547574782236444,
+      "loss": 0.5969,
+      "step": 12765
+    },
+    {
+      "epoch": 0.6482973943724537,
+      "grad_norm": 0.033389985535876285,
+      "learning_rate": 0.0008546013361931251,
+      "loss": 0.5902,
+      "step": 12770
+    },
+    {
+      "epoch": 0.6485512304704852,
+      "grad_norm": 0.03244401680509467,
+      "learning_rate": 0.0008544451245561318,
+      "loss": 0.5714,
+      "step": 12775
+    },
+    {
+      "epoch": 0.6488050665685167,
+      "grad_norm": 0.047562688287312124,
+      "learning_rate": 0.0008542888433433283,
+      "loss": 0.5706,
+      "step": 12780
+    },
+    {
+      "epoch": 0.6490589026665482,
+      "grad_norm": 0.028307045275590848,
+      "learning_rate": 0.0008541324925853915,
+      "loss": 0.5689,
+      "step": 12785
+    },
+    {
+      "epoch": 0.6493127387645797,
+      "grad_norm": 0.04704027854528433,
+      "learning_rate": 0.0008539760723130125,
+      "loss": 0.5661,
+      "step": 12790
+    },
+    {
+      "epoch": 0.6495665748626112,
+      "grad_norm": 0.04648606005162825,
+      "learning_rate": 0.0008538195825568958,
+      "loss": 0.6028,
+      "step": 12795
+    },
+    {
+      "epoch": 0.6498204109606427,
+      "grad_norm": 0.025943070142767706,
+      "learning_rate": 0.0008536630233477594,
+      "loss": 0.5877,
+      "step": 12800
+    },
+    {
+      "epoch": 0.6500742470586742,
+      "grad_norm": 0.04012620623020576,
+      "learning_rate": 0.0008535063947163355,
+      "loss": 0.5976,
+      "step": 12805
+    },
+    {
+      "epoch": 0.6503280831567058,
+      "grad_norm": 0.03544811219416931,
+      "learning_rate": 0.0008533496966933691,
+      "loss": 0.5855,
+      "step": 12810
+    },
+    {
+      "epoch": 0.6505819192547372,
+      "grad_norm": 0.0281669036894832,
+      "learning_rate": 0.0008531929293096194,
+      "loss": 0.6111,
+      "step": 12815
+    },
+    {
+      "epoch": 0.6508357553527687,
+      "grad_norm": 0.023763951065239725,
+      "learning_rate": 0.0008530360925958591,
+      "loss": 0.5776,
+      "step": 12820
+    },
+    {
+      "epoch": 0.6510895914508003,
+      "grad_norm": 0.24418838214511798,
+      "learning_rate": 0.0008528791865828742,
+      "loss": 0.6009,
+      "step": 12825
+    },
+    {
+      "epoch": 0.6513434275488317,
+      "grad_norm": 0.029087961946777385,
+      "learning_rate": 0.000852722211301465,
+      "loss": 0.6235,
+      "step": 12830
+    },
+    {
+      "epoch": 0.6515972636468632,
+      "grad_norm": 0.034137672479786794,
+      "learning_rate": 0.0008525651667824447,
+      "loss": 0.5799,
+      "step": 12835
+    },
+    {
+      "epoch": 0.6518510997448947,
+      "grad_norm": 0.028312942222624603,
+      "learning_rate": 0.0008524080530566405,
+      "loss": 0.6404,
+      "step": 12840
+    },
+    {
+      "epoch": 0.6521049358429262,
+      "grad_norm": 0.02843554231766899,
+      "learning_rate": 0.0008522508701548927,
+      "loss": 0.5799,
+      "step": 12845
+    },
+    {
+      "epoch": 0.6523587719409577,
+      "grad_norm": 0.027322928571429136,
+      "learning_rate": 0.0008520936181080561,
+      "loss": 0.5999,
+      "step": 12850
+    },
+    {
+      "epoch": 0.6526126080389892,
+      "grad_norm": 0.0303740372452228,
+      "learning_rate": 0.0008519362969469979,
+      "loss": 0.5929,
+      "step": 12855
+    },
+    {
+      "epoch": 0.6528664441370208,
+      "grad_norm": 0.027100993271736346,
+      "learning_rate": 0.0008517789067025997,
+      "loss": 0.6328,
+      "step": 12860
+    },
+    {
+      "epoch": 0.6531202802350522,
+      "grad_norm": 0.028849366128399865,
+      "learning_rate": 0.0008516214474057565,
+      "loss": 0.5698,
+      "step": 12865
+    },
+    {
+      "epoch": 0.6533741163330837,
+      "grad_norm": 0.029545255854450884,
+      "learning_rate": 0.0008514639190873767,
+      "loss": 0.5783,
+      "step": 12870
+    },
+    {
+      "epoch": 0.6536279524311153,
+      "grad_norm": 0.024473560866559003,
+      "learning_rate": 0.0008513063217783824,
+      "loss": 0.6031,
+      "step": 12875
+    },
+    {
+      "epoch": 0.6538817885291467,
+      "grad_norm": 0.02666332020016646,
+      "learning_rate": 0.000851148655509709,
+      "loss": 0.6249,
+      "step": 12880
+    },
+    {
+      "epoch": 0.6541356246271782,
+      "grad_norm": 0.03803144464344789,
+      "learning_rate": 0.0008509909203123057,
+      "loss": 0.6052,
+      "step": 12885
+    },
+    {
+      "epoch": 0.6543894607252098,
+      "grad_norm": 0.029430501206934002,
+      "learning_rate": 0.0008508331162171353,
+      "loss": 0.6082,
+      "step": 12890
+    },
+    {
+      "epoch": 0.6546432968232412,
+      "grad_norm": 0.038636878659898226,
+      "learning_rate": 0.0008506752432551736,
+      "loss": 0.5922,
+      "step": 12895
+    },
+    {
+      "epoch": 0.6548971329212727,
+      "grad_norm": 0.03645227270336851,
+      "learning_rate": 0.0008505173014574104,
+      "loss": 0.5961,
+      "step": 12900
+    },
+    {
+      "epoch": 0.6551509690193043,
+      "grad_norm": 0.0339469915664989,
+      "learning_rate": 0.0008503592908548492,
+      "loss": 0.6087,
+      "step": 12905
+    },
+    {
+      "epoch": 0.6554048051173358,
+      "grad_norm": 0.04584178211309174,
+      "learning_rate": 0.0008502012114785062,
+      "loss": 0.6057,
+      "step": 12910
+    },
+    {
+      "epoch": 0.6556586412153672,
+      "grad_norm": 0.027847814793399383,
+      "learning_rate": 0.0008500430633594121,
+      "loss": 0.5957,
+      "step": 12915
+    },
+    {
+      "epoch": 0.6559124773133987,
+      "grad_norm": 0.04088969160713702,
+      "learning_rate": 0.0008498848465286101,
+      "loss": 0.6299,
+      "step": 12920
+    },
+    {
+      "epoch": 0.6561663134114303,
+      "grad_norm": 0.024361703944890165,
+      "learning_rate": 0.0008497265610171576,
+      "loss": 0.5643,
+      "step": 12925
+    },
+    {
+      "epoch": 0.6564201495094617,
+      "grad_norm": 0.08374406284028113,
+      "learning_rate": 0.0008495682068561254,
+      "loss": 0.5758,
+      "step": 12930
+    },
+    {
+      "epoch": 0.6566739856074932,
+      "grad_norm": 0.023934163458897068,
+      "learning_rate": 0.0008494097840765975,
+      "loss": 0.5817,
+      "step": 12935
+    },
+    {
+      "epoch": 0.6569278217055248,
+      "grad_norm": 0.027116947138030544,
+      "learning_rate": 0.0008492512927096714,
+      "loss": 0.6075,
+      "step": 12940
+    },
+    {
+      "epoch": 0.6571816578035562,
+      "grad_norm": 0.027883742219997405,
+      "learning_rate": 0.0008490927327864581,
+      "loss": 0.587,
+      "step": 12945
+    },
+    {
+      "epoch": 0.6574354939015877,
+      "grad_norm": 0.02716053289665079,
+      "learning_rate": 0.0008489341043380825,
+      "loss": 0.657,
+      "step": 12950
+    },
+    {
+      "epoch": 0.6576893299996193,
+      "grad_norm": 0.03758581951547631,
+      "learning_rate": 0.0008487754073956823,
+      "loss": 0.5958,
+      "step": 12955
+    },
+    {
+      "epoch": 0.6579431660976508,
+      "grad_norm": 0.028060630011589125,
+      "learning_rate": 0.0008486166419904089,
+      "loss": 0.5604,
+      "step": 12960
+    },
+    {
+      "epoch": 0.6581970021956822,
+      "grad_norm": 0.026078088164496267,
+      "learning_rate": 0.0008484578081534274,
+      "loss": 0.6172,
+      "step": 12965
+    },
+    {
+      "epoch": 0.6584508382937138,
+      "grad_norm": 0.028071334962187697,
+      "learning_rate": 0.0008482989059159158,
+      "loss": 0.5903,
+      "step": 12970
+    },
+    {
+      "epoch": 0.6587046743917453,
+      "grad_norm": 0.03378636851534933,
+      "learning_rate": 0.0008481399353090659,
+      "loss": 0.6051,
+      "step": 12975
+    },
+    {
+      "epoch": 0.6589585104897767,
+      "grad_norm": 0.026105241851777608,
+      "learning_rate": 0.0008479808963640828,
+      "loss": 0.6411,
+      "step": 12980
+    },
+    {
+      "epoch": 0.6592123465878083,
+      "grad_norm": 0.030533873522670462,
+      "learning_rate": 0.0008478217891121853,
+      "loss": 0.5837,
+      "step": 12985
+    },
+    {
+      "epoch": 0.6594661826858398,
+      "grad_norm": 0.03300525899669726,
+      "learning_rate": 0.0008476626135846051,
+      "loss": 0.5938,
+      "step": 12990
+    },
+    {
+      "epoch": 0.6597200187838712,
+      "grad_norm": 0.028329077912125737,
+      "learning_rate": 0.0008475033698125876,
+      "loss": 0.6348,
+      "step": 12995
+    },
+    {
+      "epoch": 0.6599738548819027,
+      "grad_norm": 0.04110076364540404,
+      "learning_rate": 0.0008473440578273916,
+      "loss": 0.5867,
+      "step": 13000
+    },
+    {
+      "epoch": 0.6602276909799343,
+      "grad_norm": 0.05757425924457839,
+      "learning_rate": 0.0008471846776602894,
+      "loss": 0.5384,
+      "step": 13005
+    },
+    {
+      "epoch": 0.6604815270779657,
+      "grad_norm": 0.05844590144036168,
+      "learning_rate": 0.0008470252293425662,
+      "loss": 0.5848,
+      "step": 13010
+    },
+    {
+      "epoch": 0.6607353631759972,
+      "grad_norm": 0.06415338224333281,
+      "learning_rate": 0.0008468657129055213,
+      "loss": 0.5522,
+      "step": 13015
+    },
+    {
+      "epoch": 0.6609891992740288,
+      "grad_norm": 0.030473738178567217,
+      "learning_rate": 0.0008467061283804665,
+      "loss": 0.5945,
+      "step": 13020
+    },
+    {
+      "epoch": 0.6612430353720603,
+      "grad_norm": 0.02536912248622989,
+      "learning_rate": 0.000846546475798728,
+      "loss": 0.5816,
+      "step": 13025
+    },
+    {
+      "epoch": 0.6614968714700917,
+      "grad_norm": 0.0461113719066347,
+      "learning_rate": 0.0008463867551916443,
+      "loss": 0.6512,
+      "step": 13030
+    },
+    {
+      "epoch": 0.6617507075681233,
+      "grad_norm": 0.024517762602526454,
+      "learning_rate": 0.0008462269665905682,
+      "loss": 0.597,
+      "step": 13035
+    },
+    {
+      "epoch": 0.6620045436661548,
+      "grad_norm": 1.033031229487397,
+      "learning_rate": 0.0008460671100268649,
+      "loss": 0.6271,
+      "step": 13040
+    },
+    {
+      "epoch": 0.6622583797641862,
+      "grad_norm": 0.0693341169836654,
+      "learning_rate": 0.0008459071855319141,
+      "loss": 0.6248,
+      "step": 13045
+    },
+    {
+      "epoch": 0.6625122158622178,
+      "grad_norm": 0.03717279346920806,
+      "learning_rate": 0.0008457471931371074,
+      "loss": 0.573,
+      "step": 13050
+    },
+    {
+      "epoch": 0.6627660519602493,
+      "grad_norm": 0.03262397839797902,
+      "learning_rate": 0.0008455871328738512,
+      "loss": 0.5841,
+      "step": 13055
+    },
+    {
+      "epoch": 0.6630198880582807,
+      "grad_norm": 0.05355080175870677,
+      "learning_rate": 0.0008454270047735643,
+      "loss": 0.5727,
+      "step": 13060
+    },
+    {
+      "epoch": 0.6632737241563122,
+      "grad_norm": 0.03628283883616998,
+      "learning_rate": 0.0008452668088676789,
+      "loss": 0.6012,
+      "step": 13065
+    },
+    {
+      "epoch": 0.6635275602543438,
+      "grad_norm": 0.06228849912821036,
+      "learning_rate": 0.0008451065451876408,
+      "loss": 0.5836,
+      "step": 13070
+    },
+    {
+      "epoch": 0.6637813963523753,
+      "grad_norm": 0.03209553307639597,
+      "learning_rate": 0.0008449462137649087,
+      "loss": 0.5907,
+      "step": 13075
+    },
+    {
+      "epoch": 0.6640352324504067,
+      "grad_norm": 0.029984865563620306,
+      "learning_rate": 0.0008447858146309554,
+      "loss": 0.5891,
+      "step": 13080
+    },
+    {
+      "epoch": 0.6642890685484383,
+      "grad_norm": 0.03128245374162066,
+      "learning_rate": 0.000844625347817266,
+      "loss": 0.5829,
+      "step": 13085
+    },
+    {
+      "epoch": 0.6645429046464698,
+      "grad_norm": 0.03488425051563897,
+      "learning_rate": 0.0008444648133553394,
+      "loss": 0.6055,
+      "step": 13090
+    },
+    {
+      "epoch": 0.6647967407445012,
+      "grad_norm": 0.026243668933171885,
+      "learning_rate": 0.0008443042112766879,
+      "loss": 0.5931,
+      "step": 13095
+    },
+    {
+      "epoch": 0.6650505768425328,
+      "grad_norm": 0.0776128900108717,
+      "learning_rate": 0.0008441435416128367,
+      "loss": 0.7144,
+      "step": 13100
+    },
+    {
+      "epoch": 0.6653044129405643,
+      "grad_norm": 0.06729227309686386,
+      "learning_rate": 0.0008439828043953246,
+      "loss": 0.6272,
+      "step": 13105
+    },
+    {
+      "epoch": 0.6655582490385957,
+      "grad_norm": 0.06770135653654577,
+      "learning_rate": 0.0008438219996557033,
+      "loss": 0.6059,
+      "step": 13110
+    },
+    {
+      "epoch": 0.6658120851366273,
+      "grad_norm": 0.059522461982278785,
+      "learning_rate": 0.0008436611274255382,
+      "loss": 0.5964,
+      "step": 13115
+    },
+    {
+      "epoch": 0.6660659212346588,
+      "grad_norm": 0.038971286412593786,
+      "learning_rate": 0.0008435001877364076,
+      "loss": 0.6201,
+      "step": 13120
+    },
+    {
+      "epoch": 0.6663197573326903,
+      "grad_norm": 0.03964531176980359,
+      "learning_rate": 0.0008433391806199033,
+      "loss": 0.6378,
+      "step": 13125
+    },
+    {
+      "epoch": 0.6665735934307218,
+      "grad_norm": 0.03537229559330627,
+      "learning_rate": 0.0008431781061076298,
+      "loss": 0.6107,
+      "step": 13130
+    },
+    {
+      "epoch": 0.6668274295287533,
+      "grad_norm": 0.03125614020555413,
+      "learning_rate": 0.0008430169642312058,
+      "loss": 0.6444,
+      "step": 13135
+    },
+    {
+      "epoch": 0.6670812656267848,
+      "grad_norm": 0.02696637588263313,
+      "learning_rate": 0.0008428557550222622,
+      "loss": 0.6245,
+      "step": 13140
+    },
+    {
+      "epoch": 0.6673351017248162,
+      "grad_norm": 0.03589221798809491,
+      "learning_rate": 0.0008426944785124437,
+      "loss": 0.5984,
+      "step": 13145
+    },
+    {
+      "epoch": 0.6675889378228478,
+      "grad_norm": 0.05479122023801771,
+      "learning_rate": 0.000842533134733408,
+      "loss": 0.5568,
+      "step": 13150
+    },
+    {
+      "epoch": 0.6678427739208793,
+      "grad_norm": 0.02850027631836941,
+      "learning_rate": 0.0008423717237168263,
+      "loss": 0.5844,
+      "step": 13155
+    },
+    {
+      "epoch": 0.6680966100189107,
+      "grad_norm": 0.043303911644560475,
+      "learning_rate": 0.0008422102454943827,
+      "loss": 0.6056,
+      "step": 13160
+    },
+    {
+      "epoch": 0.6683504461169423,
+      "grad_norm": 0.0485597197702503,
+      "learning_rate": 0.0008420487000977743,
+      "loss": 0.5952,
+      "step": 13165
+    },
+    {
+      "epoch": 0.6686042822149738,
+      "grad_norm": 0.04377645965991001,
+      "learning_rate": 0.0008418870875587121,
+      "loss": 0.6004,
+      "step": 13170
+    },
+    {
+      "epoch": 0.6688581183130053,
+      "grad_norm": 0.04194534237527523,
+      "learning_rate": 0.0008417254079089194,
+      "loss": 0.6054,
+      "step": 13175
+    },
+    {
+      "epoch": 0.6691119544110368,
+      "grad_norm": 0.03712390306106914,
+      "learning_rate": 0.0008415636611801334,
+      "loss": 0.6166,
+      "step": 13180
+    },
+    {
+      "epoch": 0.6693657905090683,
+      "grad_norm": 0.03756186192265224,
+      "learning_rate": 0.0008414018474041041,
+      "loss": 0.5976,
+      "step": 13185
+    },
+    {
+      "epoch": 0.6696196266070998,
+      "grad_norm": 0.039758177882621364,
+      "learning_rate": 0.0008412399666125945,
+      "loss": 0.5797,
+      "step": 13190
+    },
+    {
+      "epoch": 0.6698734627051313,
+      "grad_norm": 0.034160759225817725,
+      "learning_rate": 0.0008410780188373814,
+      "loss": 0.603,
+      "step": 13195
+    },
+    {
+      "epoch": 0.6701272988031628,
+      "grad_norm": 0.045688219761594664,
+      "learning_rate": 0.0008409160041102543,
+      "loss": 0.5717,
+      "step": 13200
+    },
+    {
+      "epoch": 0.6703811349011943,
+      "grad_norm": 0.025679114000337882,
+      "learning_rate": 0.0008407539224630157,
+      "loss": 0.5667,
+      "step": 13205
+    },
+    {
+      "epoch": 0.6706349709992258,
+      "grad_norm": 0.036873394000505495,
+      "learning_rate": 0.0008405917739274813,
+      "loss": 0.6186,
+      "step": 13210
+    },
+    {
+      "epoch": 0.6708888070972573,
+      "grad_norm": 0.04153117906728298,
+      "learning_rate": 0.0008404295585354802,
+      "loss": 0.5834,
+      "step": 13215
+    },
+    {
+      "epoch": 0.6711426431952888,
+      "grad_norm": 0.02339181698231598,
+      "learning_rate": 0.0008402672763188545,
+      "loss": 0.5755,
+      "step": 13220
+    },
+    {
+      "epoch": 0.6713964792933204,
+      "grad_norm": 0.030301352786187497,
+      "learning_rate": 0.0008401049273094594,
+      "loss": 0.5768,
+      "step": 13225
+    },
+    {
+      "epoch": 0.6716503153913518,
+      "grad_norm": 0.030870240027105245,
+      "learning_rate": 0.0008399425115391632,
+      "loss": 0.5899,
+      "step": 13230
+    },
+    {
+      "epoch": 0.6719041514893833,
+      "grad_norm": 0.04099103986806237,
+      "learning_rate": 0.0008397800290398473,
+      "loss": 0.5572,
+      "step": 13235
+    },
+    {
+      "epoch": 0.6721579875874149,
+      "grad_norm": 0.025526012832099508,
+      "learning_rate": 0.0008396174798434062,
+      "loss": 0.5796,
+      "step": 13240
+    },
+    {
+      "epoch": 0.6724118236854463,
+      "grad_norm": 0.02582782722174359,
+      "learning_rate": 0.0008394548639817474,
+      "loss": 0.5748,
+      "step": 13245
+    },
+    {
+      "epoch": 0.6726656597834778,
+      "grad_norm": 0.044686293141161536,
+      "learning_rate": 0.0008392921814867916,
+      "loss": 0.6161,
+      "step": 13250
+    },
+    {
+      "epoch": 0.6729194958815093,
+      "grad_norm": 0.03157479001428999,
+      "learning_rate": 0.0008391294323904726,
+      "loss": 0.579,
+      "step": 13255
+    },
+    {
+      "epoch": 0.6731733319795408,
+      "grad_norm": 0.03230925419893887,
+      "learning_rate": 0.0008389666167247374,
+      "loss": 0.6105,
+      "step": 13260
+    },
+    {
+      "epoch": 0.6734271680775723,
+      "grad_norm": 0.0227012216152469,
+      "learning_rate": 0.0008388037345215457,
+      "loss": 0.5985,
+      "step": 13265
+    },
+    {
+      "epoch": 0.6736810041756038,
+      "grad_norm": 0.027786730311156502,
+      "learning_rate": 0.0008386407858128706,
+      "loss": 0.609,
+      "step": 13270
+    },
+    {
+      "epoch": 0.6739348402736353,
+      "grad_norm": 0.024074341825161803,
+      "learning_rate": 0.0008384777706306979,
+      "loss": 0.5953,
+      "step": 13275
+    },
+    {
+      "epoch": 0.6741886763716668,
+      "grad_norm": 0.024751468872537288,
+      "learning_rate": 0.0008383146890070269,
+      "loss": 0.5925,
+      "step": 13280
+    },
+    {
+      "epoch": 0.6744425124696983,
+      "grad_norm": 0.028042950086599638,
+      "learning_rate": 0.0008381515409738696,
+      "loss": 0.5829,
+      "step": 13285
+    },
+    {
+      "epoch": 0.6746963485677299,
+      "grad_norm": 0.026500810254070112,
+      "learning_rate": 0.0008379883265632512,
+      "loss": 0.6014,
+      "step": 13290
+    },
+    {
+      "epoch": 0.6749501846657613,
+      "grad_norm": 0.031677519675672304,
+      "learning_rate": 0.0008378250458072099,
+      "loss": 0.5688,
+      "step": 13295
+    },
+    {
+      "epoch": 0.6752040207637928,
+      "grad_norm": 0.03226776792346643,
+      "learning_rate": 0.0008376616987377968,
+      "loss": 0.637,
+      "step": 13300
+    },
+    {
+      "epoch": 0.6754578568618244,
+      "grad_norm": 0.03935591032487197,
+      "learning_rate": 0.0008374982853870761,
+      "loss": 0.6372,
+      "step": 13305
+    },
+    {
+      "epoch": 0.6757116929598558,
+      "grad_norm": 0.04985731682272338,
+      "learning_rate": 0.000837334805787125,
+      "loss": 0.5976,
+      "step": 13310
+    },
+    {
+      "epoch": 0.6759655290578873,
+      "grad_norm": 0.03831220440900208,
+      "learning_rate": 0.0008371712599700338,
+      "loss": 0.6055,
+      "step": 13315
+    },
+    {
+      "epoch": 0.6762193651559189,
+      "grad_norm": 0.02840577329842378,
+      "learning_rate": 0.0008370076479679059,
+      "loss": 0.5921,
+      "step": 13320
+    },
+    {
+      "epoch": 0.6764732012539503,
+      "grad_norm": 0.02670171769669835,
+      "learning_rate": 0.0008368439698128574,
+      "loss": 0.5863,
+      "step": 13325
+    },
+    {
+      "epoch": 0.6767270373519818,
+      "grad_norm": 0.027414460086001645,
+      "learning_rate": 0.0008366802255370174,
+      "loss": 0.5851,
+      "step": 13330
+    },
+    {
+      "epoch": 0.6769808734500133,
+      "grad_norm": 0.028303785071122464,
+      "learning_rate": 0.000836516415172528,
+      "loss": 0.5752,
+      "step": 13335
+    },
+    {
+      "epoch": 0.6772347095480449,
+      "grad_norm": 0.025538256488275953,
+      "learning_rate": 0.0008363525387515446,
+      "loss": 0.5939,
+      "step": 13340
+    },
+    {
+      "epoch": 0.6774885456460763,
+      "grad_norm": 0.03068030408097493,
+      "learning_rate": 0.0008361885963062353,
+      "loss": 0.5596,
+      "step": 13345
+    },
+    {
+      "epoch": 0.6777423817441078,
+      "grad_norm": 0.02823000001525365,
+      "learning_rate": 0.000836024587868781,
+      "loss": 0.6162,
+      "step": 13350
+    },
+    {
+      "epoch": 0.6779962178421394,
+      "grad_norm": 0.025476619812650033,
+      "learning_rate": 0.0008358605134713759,
+      "loss": 0.5924,
+      "step": 13355
+    },
+    {
+      "epoch": 0.6782500539401708,
+      "grad_norm": 0.04780163681995968,
+      "learning_rate": 0.0008356963731462271,
+      "loss": 0.5633,
+      "step": 13360
+    },
+    {
+      "epoch": 0.6785038900382023,
+      "grad_norm": 0.02825831061440577,
+      "learning_rate": 0.0008355321669255542,
+      "loss": 0.5918,
+      "step": 13365
+    },
+    {
+      "epoch": 0.6787577261362339,
+      "grad_norm": 0.026437801730974143,
+      "learning_rate": 0.0008353678948415901,
+      "loss": 0.5642,
+      "step": 13370
+    },
+    {
+      "epoch": 0.6790115622342653,
+      "grad_norm": 0.025454718983034218,
+      "learning_rate": 0.0008352035569265809,
+      "loss": 0.5691,
+      "step": 13375
+    },
+    {
+      "epoch": 0.6792653983322968,
+      "grad_norm": 0.023242855883304124,
+      "learning_rate": 0.0008350391532127851,
+      "loss": 0.6205,
+      "step": 13380
+    },
+    {
+      "epoch": 0.6795192344303284,
+      "grad_norm": 0.03623503812266734,
+      "learning_rate": 0.0008348746837324743,
+      "loss": 0.5955,
+      "step": 13385
+    },
+    {
+      "epoch": 0.6797730705283599,
+      "grad_norm": 0.029114104299875758,
+      "learning_rate": 0.0008347101485179332,
+      "loss": 0.5573,
+      "step": 13390
+    },
+    {
+      "epoch": 0.6800269066263913,
+      "grad_norm": 0.024202571333690956,
+      "learning_rate": 0.0008345455476014592,
+      "loss": 0.5487,
+      "step": 13395
+    },
+    {
+      "epoch": 0.6802807427244228,
+      "grad_norm": 0.02802981312295006,
+      "learning_rate": 0.0008343808810153624,
+      "loss": 0.5798,
+      "step": 13400
+    },
+    {
+      "epoch": 0.6805345788224544,
+      "grad_norm": 0.024805503868118448,
+      "learning_rate": 0.0008342161487919664,
+      "loss": 0.5874,
+      "step": 13405
+    },
+    {
+      "epoch": 0.6807884149204858,
+      "grad_norm": 0.023290389364338096,
+      "learning_rate": 0.000834051350963607,
+      "loss": 0.5926,
+      "step": 13410
+    },
+    {
+      "epoch": 0.6810422510185173,
+      "grad_norm": 0.028882137299632987,
+      "learning_rate": 0.0008338864875626333,
+      "loss": 0.5975,
+      "step": 13415
+    },
+    {
+      "epoch": 0.6812960871165489,
+      "grad_norm": 0.0288092686879103,
+      "learning_rate": 0.0008337215586214073,
+      "loss": 0.6053,
+      "step": 13420
+    },
+    {
+      "epoch": 0.6815499232145803,
+      "grad_norm": 0.025146916551842938,
+      "learning_rate": 0.0008335565641723035,
+      "loss": 0.5884,
+      "step": 13425
+    },
+    {
+      "epoch": 0.6818037593126118,
+      "grad_norm": 0.024739281591221643,
+      "learning_rate": 0.0008333915042477096,
+      "loss": 0.5652,
+      "step": 13430
+    },
+    {
+      "epoch": 0.6820575954106434,
+      "grad_norm": 0.024056449982588327,
+      "learning_rate": 0.000833226378880026,
+      "loss": 0.6189,
+      "step": 13435
+    },
+    {
+      "epoch": 0.6823114315086749,
+      "grad_norm": 0.026316533874474936,
+      "learning_rate": 0.000833061188101666,
+      "loss": 0.6197,
+      "step": 13440
+    },
+    {
+      "epoch": 0.6825652676067063,
+      "grad_norm": 0.022435368988669158,
+      "learning_rate": 0.000832895931945056,
+      "loss": 0.5676,
+      "step": 13445
+    },
+    {
+      "epoch": 0.6828191037047379,
+      "grad_norm": 0.03713138580934235,
+      "learning_rate": 0.0008327306104426345,
+      "loss": 0.587,
+      "step": 13450
+    },
+    {
+      "epoch": 0.6830729398027694,
+      "grad_norm": 0.028633714641747837,
+      "learning_rate": 0.0008325652236268536,
+      "loss": 0.599,
+      "step": 13455
+    },
+    {
+      "epoch": 0.6833267759008008,
+      "grad_norm": 0.040045692037592666,
+      "learning_rate": 0.0008323997715301777,
+      "loss": 0.5729,
+      "step": 13460
+    },
+    {
+      "epoch": 0.6835806119988324,
+      "grad_norm": 0.03628423966164721,
+      "learning_rate": 0.0008322342541850844,
+      "loss": 0.5689,
+      "step": 13465
+    },
+    {
+      "epoch": 0.6838344480968639,
+      "grad_norm": 0.030711278133780332,
+      "learning_rate": 0.0008320686716240637,
+      "loss": 0.5646,
+      "step": 13470
+    },
+    {
+      "epoch": 0.6840882841948953,
+      "grad_norm": 0.025362649640482234,
+      "learning_rate": 0.000831903023879619,
+      "loss": 0.5774,
+      "step": 13475
+    },
+    {
+      "epoch": 0.6843421202929268,
+      "grad_norm": 0.03407568648626278,
+      "learning_rate": 0.0008317373109842658,
+      "loss": 0.573,
+      "step": 13480
+    },
+    {
+      "epoch": 0.6845959563909584,
+      "grad_norm": 0.02491716289782007,
+      "learning_rate": 0.0008315715329705329,
+      "loss": 0.5727,
+      "step": 13485
+    },
+    {
+      "epoch": 0.6848497924889898,
+      "grad_norm": 0.0317932126461997,
+      "learning_rate": 0.0008314056898709615,
+      "loss": 0.6018,
+      "step": 13490
+    },
+    {
+      "epoch": 0.6851036285870213,
+      "grad_norm": 0.024107080698907192,
+      "learning_rate": 0.0008312397817181059,
+      "loss": 0.6016,
+      "step": 13495
+    },
+    {
+      "epoch": 0.6853574646850529,
+      "grad_norm": 0.03632719720582319,
+      "learning_rate": 0.0008310738085445332,
+      "loss": 0.5991,
+      "step": 13500
+    },
+    {
+      "epoch": 0.6856113007830844,
+      "grad_norm": 0.05715841618806044,
+      "learning_rate": 0.0008309077703828228,
+      "loss": 0.6122,
+      "step": 13505
+    },
+    {
+      "epoch": 0.6858651368811158,
+      "grad_norm": 0.03704079824369565,
+      "learning_rate": 0.0008307416672655674,
+      "loss": 0.6023,
+      "step": 13510
+    },
+    {
+      "epoch": 0.6861189729791474,
+      "grad_norm": 0.24267969330580477,
+      "learning_rate": 0.000830575499225372,
+      "loss": 0.5998,
+      "step": 13515
+    },
+    {
+      "epoch": 0.6863728090771789,
+      "grad_norm": 0.048764967617313686,
+      "learning_rate": 0.0008304092662948548,
+      "loss": 0.608,
+      "step": 13520
+    },
+    {
+      "epoch": 0.6866266451752103,
+      "grad_norm": 0.0516560962912755,
+      "learning_rate": 0.0008302429685066462,
+      "loss": 0.5713,
+      "step": 13525
+    },
+    {
+      "epoch": 0.6868804812732419,
+      "grad_norm": 0.03554128227511413,
+      "learning_rate": 0.0008300766058933899,
+      "loss": 0.5681,
+      "step": 13530
+    },
+    {
+      "epoch": 0.6871343173712734,
+      "grad_norm": 0.028215204615185188,
+      "learning_rate": 0.0008299101784877421,
+      "loss": 0.5954,
+      "step": 13535
+    },
+    {
+      "epoch": 0.6873881534693048,
+      "grad_norm": 0.0321511775141223,
+      "learning_rate": 0.0008297436863223715,
+      "loss": 0.5873,
+      "step": 13540
+    },
+    {
+      "epoch": 0.6876419895673364,
+      "grad_norm": 0.029157356552754574,
+      "learning_rate": 0.0008295771294299596,
+      "loss": 0.5775,
+      "step": 13545
+    },
+    {
+      "epoch": 0.6878958256653679,
+      "grad_norm": 0.028937031419344138,
+      "learning_rate": 0.0008294105078432007,
+      "loss": 0.5808,
+      "step": 13550
+    },
+    {
+      "epoch": 0.6881496617633994,
+      "grad_norm": 0.037942050160010676,
+      "learning_rate": 0.000829243821594802,
+      "loss": 0.6233,
+      "step": 13555
+    },
+    {
+      "epoch": 0.6884034978614308,
+      "grad_norm": 0.031543194063541696,
+      "learning_rate": 0.0008290770707174831,
+      "loss": 0.5977,
+      "step": 13560
+    },
+    {
+      "epoch": 0.6886573339594624,
+      "grad_norm": 0.02238872742391851,
+      "learning_rate": 0.0008289102552439762,
+      "loss": 0.5518,
+      "step": 13565
+    },
+    {
+      "epoch": 0.6889111700574939,
+      "grad_norm": 0.027982976773836193,
+      "learning_rate": 0.0008287433752070265,
+      "loss": 0.5747,
+      "step": 13570
+    },
+    {
+      "epoch": 0.6891650061555253,
+      "grad_norm": 0.03491184560447035,
+      "learning_rate": 0.0008285764306393917,
+      "loss": 0.5675,
+      "step": 13575
+    },
+    {
+      "epoch": 0.6894188422535569,
+      "grad_norm": 0.05242850032688674,
+      "learning_rate": 0.0008284094215738422,
+      "loss": 0.5764,
+      "step": 13580
+    },
+    {
+      "epoch": 0.6896726783515884,
+      "grad_norm": 0.05106024303010987,
+      "learning_rate": 0.000828242348043161,
+      "loss": 0.6068,
+      "step": 13585
+    },
+    {
+      "epoch": 0.6899265144496198,
+      "grad_norm": 0.030655123821702948,
+      "learning_rate": 0.0008280752100801439,
+      "loss": 0.5834,
+      "step": 13590
+    },
+    {
+      "epoch": 0.6901803505476514,
+      "grad_norm": 0.030080411514861318,
+      "learning_rate": 0.0008279080077175992,
+      "loss": 0.589,
+      "step": 13595
+    },
+    {
+      "epoch": 0.6904341866456829,
+      "grad_norm": 0.07602111036419369,
+      "learning_rate": 0.0008277407409883476,
+      "loss": 0.5973,
+      "step": 13600
+    },
+    {
+      "epoch": 0.6906880227437144,
+      "grad_norm": 0.026230560906281026,
+      "learning_rate": 0.0008275734099252233,
+      "loss": 0.5867,
+      "step": 13605
+    },
+    {
+      "epoch": 0.6909418588417459,
+      "grad_norm": 0.03576027679760639,
+      "learning_rate": 0.0008274060145610719,
+      "loss": 0.5555,
+      "step": 13610
+    },
+    {
+      "epoch": 0.6911956949397774,
+      "grad_norm": 0.03286724650115783,
+      "learning_rate": 0.0008272385549287529,
+      "loss": 0.6358,
+      "step": 13615
+    },
+    {
+      "epoch": 0.6914495310378089,
+      "grad_norm": 0.023355654899445015,
+      "learning_rate": 0.0008270710310611374,
+      "loss": 0.5994,
+      "step": 13620
+    },
+    {
+      "epoch": 0.6917033671358404,
+      "grad_norm": 0.026604280616454975,
+      "learning_rate": 0.0008269034429911095,
+      "loss": 0.6027,
+      "step": 13625
+    },
+    {
+      "epoch": 0.6919572032338719,
+      "grad_norm": 0.025639930478733225,
+      "learning_rate": 0.0008267357907515661,
+      "loss": 0.5732,
+      "step": 13630
+    },
+    {
+      "epoch": 0.6922110393319034,
+      "grad_norm": 0.0332874158069147,
+      "learning_rate": 0.0008265680743754165,
+      "loss": 0.6007,
+      "step": 13635
+    },
+    {
+      "epoch": 0.6924648754299348,
+      "grad_norm": 0.029848135095345608,
+      "learning_rate": 0.0008264002938955823,
+      "loss": 0.568,
+      "step": 13640
+    },
+    {
+      "epoch": 0.6927187115279664,
+      "grad_norm": 0.02669194226166377,
+      "learning_rate": 0.0008262324493449982,
+      "loss": 0.5983,
+      "step": 13645
+    },
+    {
+      "epoch": 0.6929725476259979,
+      "grad_norm": 0.027289337864795267,
+      "learning_rate": 0.0008260645407566114,
+      "loss": 0.6212,
+      "step": 13650
+    },
+    {
+      "epoch": 0.6932263837240294,
+      "grad_norm": 0.028021835675969425,
+      "learning_rate": 0.0008258965681633813,
+      "loss": 0.5927,
+      "step": 13655
+    },
+    {
+      "epoch": 0.6934802198220609,
+      "grad_norm": 0.025781289683893413,
+      "learning_rate": 0.0008257285315982799,
+      "loss": 0.5623,
+      "step": 13660
+    },
+    {
+      "epoch": 0.6937340559200924,
+      "grad_norm": 0.03639235414298802,
+      "learning_rate": 0.0008255604310942922,
+      "loss": 0.5608,
+      "step": 13665
+    },
+    {
+      "epoch": 0.6939878920181239,
+      "grad_norm": 0.025081206584032647,
+      "learning_rate": 0.0008253922666844155,
+      "loss": 0.5641,
+      "step": 13670
+    },
+    {
+      "epoch": 0.6942417281161554,
+      "grad_norm": 0.02503169555093532,
+      "learning_rate": 0.0008252240384016596,
+      "loss": 0.5815,
+      "step": 13675
+    },
+    {
+      "epoch": 0.6944955642141869,
+      "grad_norm": 0.025522493920881646,
+      "learning_rate": 0.0008250557462790469,
+      "loss": 0.6103,
+      "step": 13680
+    },
+    {
+      "epoch": 0.6947494003122184,
+      "grad_norm": 0.029829670493091157,
+      "learning_rate": 0.0008248873903496123,
+      "loss": 0.561,
+      "step": 13685
+    },
+    {
+      "epoch": 0.6950032364102499,
+      "grad_norm": 0.03274242306959272,
+      "learning_rate": 0.000824718970646403,
+      "loss": 0.5965,
+      "step": 13690
+    },
+    {
+      "epoch": 0.6952570725082814,
+      "grad_norm": 0.053602824065729755,
+      "learning_rate": 0.0008245504872024793,
+      "loss": 0.5778,
+      "step": 13695
+    },
+    {
+      "epoch": 0.6955109086063129,
+      "grad_norm": 0.02930268258189034,
+      "learning_rate": 0.0008243819400509133,
+      "loss": 0.5291,
+      "step": 13700
+    },
+    {
+      "epoch": 0.6957647447043444,
+      "grad_norm": 0.024987980789305964,
+      "learning_rate": 0.0008242133292247902,
+      "loss": 0.5828,
+      "step": 13705
+    },
+    {
+      "epoch": 0.6960185808023759,
+      "grad_norm": 0.02829975573628627,
+      "learning_rate": 0.0008240446547572076,
+      "loss": 0.5881,
+      "step": 13710
+    },
+    {
+      "epoch": 0.6962724169004074,
+      "grad_norm": 0.03443116573593433,
+      "learning_rate": 0.0008238759166812751,
+      "loss": 0.5919,
+      "step": 13715
+    },
+    {
+      "epoch": 0.696526252998439,
+      "grad_norm": 0.031587204673142105,
+      "learning_rate": 0.0008237071150301154,
+      "loss": 0.6065,
+      "step": 13720
+    },
+    {
+      "epoch": 0.6967800890964704,
+      "grad_norm": 0.023508246283948264,
+      "learning_rate": 0.0008235382498368634,
+      "loss": 0.5946,
+      "step": 13725
+    },
+    {
+      "epoch": 0.6970339251945019,
+      "grad_norm": 0.025009199129867415,
+      "learning_rate": 0.0008233693211346663,
+      "loss": 0.5563,
+      "step": 13730
+    },
+    {
+      "epoch": 0.6972877612925334,
+      "grad_norm": 0.03326099984467122,
+      "learning_rate": 0.0008232003289566843,
+      "loss": 0.5873,
+      "step": 13735
+    },
+    {
+      "epoch": 0.6975415973905649,
+      "grad_norm": 0.034975275494994254,
+      "learning_rate": 0.0008230312733360894,
+      "loss": 0.5658,
+      "step": 13740
+    },
+    {
+      "epoch": 0.6977954334885964,
+      "grad_norm": 0.025872022613802486,
+      "learning_rate": 0.0008228621543060665,
+      "loss": 0.5572,
+      "step": 13745
+    },
+    {
+      "epoch": 0.6980492695866279,
+      "grad_norm": 0.02546372161263482,
+      "learning_rate": 0.0008226929718998129,
+      "loss": 0.5905,
+      "step": 13750
+    },
+    {
+      "epoch": 0.6983031056846594,
+      "grad_norm": 0.03215322261937108,
+      "learning_rate": 0.0008225237261505381,
+      "loss": 0.5581,
+      "step": 13755
+    },
+    {
+      "epoch": 0.6985569417826909,
+      "grad_norm": 0.027702755257204188,
+      "learning_rate": 0.0008223544170914641,
+      "loss": 0.6156,
+      "step": 13760
+    },
+    {
+      "epoch": 0.6988107778807224,
+      "grad_norm": 0.03195843106084198,
+      "learning_rate": 0.0008221850447558259,
+      "loss": 0.6007,
+      "step": 13765
+    },
+    {
+      "epoch": 0.699064613978754,
+      "grad_norm": 0.037527909108500845,
+      "learning_rate": 0.00082201560917687,
+      "loss": 0.5738,
+      "step": 13770
+    },
+    {
+      "epoch": 0.6993184500767854,
+      "grad_norm": 0.05309666088358755,
+      "learning_rate": 0.000821846110387856,
+      "loss": 0.585,
+      "step": 13775
+    },
+    {
+      "epoch": 0.6995722861748169,
+      "grad_norm": 0.03185639071908041,
+      "learning_rate": 0.0008216765484220554,
+      "loss": 0.594,
+      "step": 13780
+    },
+    {
+      "epoch": 0.6998261222728485,
+      "grad_norm": 0.029682191254135553,
+      "learning_rate": 0.0008215069233127528,
+      "loss": 0.5832,
+      "step": 13785
+    },
+    {
+      "epoch": 0.7000799583708799,
+      "grad_norm": 0.030629890653038103,
+      "learning_rate": 0.0008213372350932444,
+      "loss": 0.5727,
+      "step": 13790
+    },
+    {
+      "epoch": 0.7003337944689114,
+      "grad_norm": 0.025415360193154723,
+      "learning_rate": 0.0008211674837968391,
+      "loss": 0.5891,
+      "step": 13795
+    },
+    {
+      "epoch": 0.700587630566943,
+      "grad_norm": 0.04954711837375036,
+      "learning_rate": 0.0008209976694568586,
+      "loss": 0.58,
+      "step": 13800
+    },
+    {
+      "epoch": 0.7008414666649744,
+      "grad_norm": 0.047662707757456846,
+      "learning_rate": 0.0008208277921066362,
+      "loss": 0.5835,
+      "step": 13805
+    },
+    {
+      "epoch": 0.7010953027630059,
+      "grad_norm": 0.049928278726034496,
+      "learning_rate": 0.0008206578517795185,
+      "loss": 0.584,
+      "step": 13810
+    },
+    {
+      "epoch": 0.7013491388610374,
+      "grad_norm": 0.06832687556620186,
+      "learning_rate": 0.0008204878485088634,
+      "loss": 0.6406,
+      "step": 13815
+    },
+    {
+      "epoch": 0.701602974959069,
+      "grad_norm": 0.08083335473139422,
+      "learning_rate": 0.0008203177823280419,
+      "loss": 0.6479,
+      "step": 13820
+    },
+    {
+      "epoch": 0.7018568110571004,
+      "grad_norm": 0.07135721218869323,
+      "learning_rate": 0.000820147653270437,
+      "loss": 0.6405,
+      "step": 13825
+    },
+    {
+      "epoch": 0.7021106471551319,
+      "grad_norm": 0.04104888957953317,
+      "learning_rate": 0.0008199774613694447,
+      "loss": 0.5936,
+      "step": 13830
+    },
+    {
+      "epoch": 0.7023644832531635,
+      "grad_norm": 0.0875218883592723,
+      "learning_rate": 0.0008198072066584721,
+      "loss": 0.6216,
+      "step": 13835
+    },
+    {
+      "epoch": 0.7026183193511949,
+      "grad_norm": 0.07942963393083619,
+      "learning_rate": 0.0008196368891709399,
+      "loss": 0.6315,
+      "step": 13840
+    },
+    {
+      "epoch": 0.7028721554492264,
+      "grad_norm": 0.06110437267505552,
+      "learning_rate": 0.0008194665089402804,
+      "loss": 0.5965,
+      "step": 13845
+    },
+    {
+      "epoch": 0.703125991547258,
+      "grad_norm": 0.08891102687332698,
+      "learning_rate": 0.0008192960659999383,
+      "loss": 0.6391,
+      "step": 13850
+    },
+    {
+      "epoch": 0.7033798276452894,
+      "grad_norm": 0.06448648520699571,
+      "learning_rate": 0.0008191255603833708,
+      "loss": 0.5794,
+      "step": 13855
+    },
+    {
+      "epoch": 0.7036336637433209,
+      "grad_norm": 0.030570919417378353,
+      "learning_rate": 0.0008189549921240472,
+      "loss": 0.5855,
+      "step": 13860
+    },
+    {
+      "epoch": 0.7038874998413525,
+      "grad_norm": 0.03473828499726488,
+      "learning_rate": 0.0008187843612554493,
+      "loss": 0.6168,
+      "step": 13865
+    },
+    {
+      "epoch": 0.704141335939384,
+      "grad_norm": 0.050512426090771084,
+      "learning_rate": 0.0008186136678110711,
+      "loss": 0.6157,
+      "step": 13870
+    },
+    {
+      "epoch": 0.7043951720374154,
+      "grad_norm": 0.03331461557177089,
+      "learning_rate": 0.000818442911824419,
+      "loss": 0.6325,
+      "step": 13875
+    },
+    {
+      "epoch": 0.704649008135447,
+      "grad_norm": 0.03887589726537461,
+      "learning_rate": 0.0008182720933290111,
+      "loss": 0.619,
+      "step": 13880
+    },
+    {
+      "epoch": 0.7049028442334785,
+      "grad_norm": 0.034491361473236455,
+      "learning_rate": 0.0008181012123583786,
+      "loss": 0.6265,
+      "step": 13885
+    },
+    {
+      "epoch": 0.7051566803315099,
+      "grad_norm": 0.04939733683559655,
+      "learning_rate": 0.0008179302689460646,
+      "loss": 0.6209,
+      "step": 13890
+    },
+    {
+      "epoch": 0.7054105164295414,
+      "grad_norm": 0.036958385336953684,
+      "learning_rate": 0.0008177592631256241,
+      "loss": 0.5847,
+      "step": 13895
+    },
+    {
+      "epoch": 0.705664352527573,
+      "grad_norm": 0.0338882190476505,
+      "learning_rate": 0.0008175881949306252,
+      "loss": 0.6022,
+      "step": 13900
+    },
+    {
+      "epoch": 0.7059181886256044,
+      "grad_norm": 0.03164427898151885,
+      "learning_rate": 0.0008174170643946472,
+      "loss": 0.6194,
+      "step": 13905
+    },
+    {
+      "epoch": 0.7061720247236359,
+      "grad_norm": 0.02788808622745641,
+      "learning_rate": 0.0008172458715512825,
+      "loss": 0.5735,
+      "step": 13910
+    },
+    {
+      "epoch": 0.7064258608216675,
+      "grad_norm": 0.05228385785798862,
+      "learning_rate": 0.0008170746164341352,
+      "loss": 0.6066,
+      "step": 13915
+    },
+    {
+      "epoch": 0.7066796969196989,
+      "grad_norm": 0.026836897802726748,
+      "learning_rate": 0.0008169032990768221,
+      "loss": 0.6333,
+      "step": 13920
+    },
+    {
+      "epoch": 0.7069335330177304,
+      "grad_norm": 0.028759260350827046,
+      "learning_rate": 0.0008167319195129717,
+      "loss": 0.6147,
+      "step": 13925
+    },
+    {
+      "epoch": 0.707187369115762,
+      "grad_norm": 0.045590276542868145,
+      "learning_rate": 0.0008165604777762251,
+      "loss": 0.6096,
+      "step": 13930
+    },
+    {
+      "epoch": 0.7074412052137935,
+      "grad_norm": 0.025644349892156408,
+      "learning_rate": 0.0008163889739002354,
+      "loss": 0.571,
+      "step": 13935
+    },
+    {
+      "epoch": 0.7076950413118249,
+      "grad_norm": 0.03515540024760426,
+      "learning_rate": 0.000816217407918668,
+      "loss": 0.57,
+      "step": 13940
+    },
+    {
+      "epoch": 0.7079488774098565,
+      "grad_norm": 0.030510481245704536,
+      "learning_rate": 0.0008160457798652002,
+      "loss": 0.5802,
+      "step": 13945
+    },
+    {
+      "epoch": 0.708202713507888,
+      "grad_norm": 0.026068842912907026,
+      "learning_rate": 0.0008158740897735221,
+      "loss": 0.59,
+      "step": 13950
+    },
+    {
+      "epoch": 0.7084565496059194,
+      "grad_norm": 0.02799394637765536,
+      "learning_rate": 0.0008157023376773354,
+      "loss": 0.6148,
+      "step": 13955
+    },
+    {
+      "epoch": 0.708710385703951,
+      "grad_norm": 0.039004735079580716,
+      "learning_rate": 0.0008155305236103543,
+      "loss": 0.6115,
+      "step": 13960
+    },
+    {
+      "epoch": 0.7089642218019825,
+      "grad_norm": 0.028262356505209356,
+      "learning_rate": 0.0008153586476063048,
+      "loss": 0.5613,
+      "step": 13965
+    },
+    {
+      "epoch": 0.7092180579000139,
+      "grad_norm": 0.02565901929071906,
+      "learning_rate": 0.0008151867096989256,
+      "loss": 0.5753,
+      "step": 13970
+    },
+    {
+      "epoch": 0.7094718939980454,
+      "grad_norm": 0.03604279567945798,
+      "learning_rate": 0.0008150147099219669,
+      "loss": 0.6221,
+      "step": 13975
+    },
+    {
+      "epoch": 0.709725730096077,
+      "grad_norm": 0.02599493314932738,
+      "learning_rate": 0.0008148426483091919,
+      "loss": 0.6047,
+      "step": 13980
+    },
+    {
+      "epoch": 0.7099795661941085,
+      "grad_norm": 0.03222079110872602,
+      "learning_rate": 0.000814670524894375,
+      "loss": 0.6341,
+      "step": 13985
+    },
+    {
+      "epoch": 0.7102334022921399,
+      "grad_norm": 0.027685751102907306,
+      "learning_rate": 0.0008144983397113032,
+      "loss": 0.6027,
+      "step": 13990
+    },
+    {
+      "epoch": 0.7104872383901715,
+      "grad_norm": 0.023842700636128793,
+      "learning_rate": 0.000814326092793776,
+      "loss": 0.5593,
+      "step": 13995
+    },
+    {
+      "epoch": 0.710741074488203,
+      "grad_norm": 0.04054047646149784,
+      "learning_rate": 0.0008141537841756043,
+      "loss": 0.563,
+      "step": 14000
+    },
+    {
+      "epoch": 0.7109949105862344,
+      "grad_norm": 0.028942162080538656,
+      "learning_rate": 0.0008139814138906112,
+      "loss": 0.5957,
+      "step": 14005
+    },
+    {
+      "epoch": 0.711248746684266,
+      "grad_norm": 0.03050796562803737,
+      "learning_rate": 0.0008138089819726326,
+      "loss": 0.5865,
+      "step": 14010
+    },
+    {
+      "epoch": 0.7115025827822975,
+      "grad_norm": 0.02451193708223736,
+      "learning_rate": 0.0008136364884555158,
+      "loss": 0.5557,
+      "step": 14015
+    },
+    {
+      "epoch": 0.7117564188803289,
+      "grad_norm": 0.04559764332264477,
+      "learning_rate": 0.0008134639333731202,
+      "loss": 0.5906,
+      "step": 14020
+    },
+    {
+      "epoch": 0.7120102549783605,
+      "grad_norm": 0.03944520888736035,
+      "learning_rate": 0.0008132913167593179,
+      "loss": 0.5703,
+      "step": 14025
+    },
+    {
+      "epoch": 0.712264091076392,
+      "grad_norm": 0.02938776092497612,
+      "learning_rate": 0.0008131186386479925,
+      "loss": 0.5766,
+      "step": 14030
+    },
+    {
+      "epoch": 0.7125179271744235,
+      "grad_norm": 0.04170810908228807,
+      "learning_rate": 0.0008129458990730398,
+      "loss": 0.5721,
+      "step": 14035
+    },
+    {
+      "epoch": 0.712771763272455,
+      "grad_norm": 0.04306584561555878,
+      "learning_rate": 0.0008127730980683677,
+      "loss": 0.6244,
+      "step": 14040
+    },
+    {
+      "epoch": 0.7130255993704865,
+      "grad_norm": 0.05080947630039173,
+      "learning_rate": 0.0008126002356678965,
+      "loss": 0.6219,
+      "step": 14045
+    },
+    {
+      "epoch": 0.713279435468518,
+      "grad_norm": 0.04665163730281675,
+      "learning_rate": 0.0008124273119055577,
+      "loss": 0.6287,
+      "step": 14050
+    },
+    {
+      "epoch": 0.7135332715665494,
+      "grad_norm": 0.023905364788607067,
+      "learning_rate": 0.0008122543268152957,
+      "loss": 0.6114,
+      "step": 14055
+    },
+    {
+      "epoch": 0.713787107664581,
+      "grad_norm": 0.031161542177222803,
+      "learning_rate": 0.0008120812804310667,
+      "loss": 0.5888,
+      "step": 14060
+    },
+    {
+      "epoch": 0.7140409437626125,
+      "grad_norm": 0.03335446756811596,
+      "learning_rate": 0.0008119081727868386,
+      "loss": 0.6239,
+      "step": 14065
+    },
+    {
+      "epoch": 0.7142947798606439,
+      "grad_norm": 0.12183613795209156,
+      "learning_rate": 0.0008117350039165916,
+      "loss": 0.5827,
+      "step": 14070
+    },
+    {
+      "epoch": 0.7145486159586755,
+      "grad_norm": 0.031453942182186376,
+      "learning_rate": 0.0008115617738543182,
+      "loss": 0.6154,
+      "step": 14075
+    },
+    {
+      "epoch": 0.714802452056707,
+      "grad_norm": 0.03179438032903415,
+      "learning_rate": 0.0008113884826340221,
+      "loss": 0.6307,
+      "step": 14080
+    },
+    {
+      "epoch": 0.7150562881547385,
+      "grad_norm": 0.03411034419877854,
+      "learning_rate": 0.0008112151302897198,
+      "loss": 0.6119,
+      "step": 14085
+    },
+    {
+      "epoch": 0.71531012425277,
+      "grad_norm": 0.03217809599591784,
+      "learning_rate": 0.0008110417168554396,
+      "loss": 0.5976,
+      "step": 14090
+    },
+    {
+      "epoch": 0.7155639603508015,
+      "grad_norm": 0.039786783196638205,
+      "learning_rate": 0.0008108682423652213,
+      "loss": 0.5819,
+      "step": 14095
+    },
+    {
+      "epoch": 0.715817796448833,
+      "grad_norm": 0.027681937005003987,
+      "learning_rate": 0.0008106947068531174,
+      "loss": 0.5556,
+      "step": 14100
+    },
+    {
+      "epoch": 0.7160716325468645,
+      "grad_norm": 0.046793772542064,
+      "learning_rate": 0.000810521110353192,
+      "loss": 0.6361,
+      "step": 14105
+    },
+    {
+      "epoch": 0.716325468644896,
+      "grad_norm": 0.02471149262461818,
+      "learning_rate": 0.0008103474528995213,
+      "loss": 0.5904,
+      "step": 14110
+    },
+    {
+      "epoch": 0.7165793047429275,
+      "grad_norm": 0.034604562342302796,
+      "learning_rate": 0.0008101737345261932,
+      "loss": 0.5659,
+      "step": 14115
+    },
+    {
+      "epoch": 0.716833140840959,
+      "grad_norm": 0.03117446691261137,
+      "learning_rate": 0.0008099999552673079,
+      "loss": 0.6229,
+      "step": 14120
+    },
+    {
+      "epoch": 0.7170869769389905,
+      "grad_norm": 0.0247974767087348,
+      "learning_rate": 0.0008098261151569772,
+      "loss": 0.5935,
+      "step": 14125
+    },
+    {
+      "epoch": 0.717340813037022,
+      "grad_norm": 0.03509041523808686,
+      "learning_rate": 0.0008096522142293255,
+      "loss": 0.6033,
+      "step": 14130
+    },
+    {
+      "epoch": 0.7175946491350534,
+      "grad_norm": 0.025370999477552076,
+      "learning_rate": 0.0008094782525184881,
+      "loss": 0.6065,
+      "step": 14135
+    },
+    {
+      "epoch": 0.717848485233085,
+      "grad_norm": 0.03293482244748489,
+      "learning_rate": 0.0008093042300586132,
+      "loss": 0.5631,
+      "step": 14140
+    },
+    {
+      "epoch": 0.7181023213311165,
+      "grad_norm": 0.02484308148359049,
+      "learning_rate": 0.0008091301468838604,
+      "loss": 0.6092,
+      "step": 14145
+    },
+    {
+      "epoch": 0.718356157429148,
+      "grad_norm": 0.024837575281261598,
+      "learning_rate": 0.0008089560030284014,
+      "loss": 0.5881,
+      "step": 14150
+    },
+    {
+      "epoch": 0.7186099935271795,
+      "grad_norm": 0.03353516289786398,
+      "learning_rate": 0.0008087817985264197,
+      "loss": 0.5782,
+      "step": 14155
+    },
+    {
+      "epoch": 0.718863829625211,
+      "grad_norm": 0.03056718458106469,
+      "learning_rate": 0.0008086075334121111,
+      "loss": 0.5962,
+      "step": 14160
+    },
+    {
+      "epoch": 0.7191176657232425,
+      "grad_norm": 0.05151128438996498,
+      "learning_rate": 0.0008084332077196824,
+      "loss": 0.5671,
+      "step": 14165
+    },
+    {
+      "epoch": 0.719371501821274,
+      "grad_norm": 0.029141462162182278,
+      "learning_rate": 0.0008082588214833534,
+      "loss": 0.6444,
+      "step": 14170
+    },
+    {
+      "epoch": 0.7196253379193055,
+      "grad_norm": 0.036675084804481534,
+      "learning_rate": 0.000808084374737355,
+      "loss": 0.5763,
+      "step": 14175
+    },
+    {
+      "epoch": 0.719879174017337,
+      "grad_norm": 0.03544199673026975,
+      "learning_rate": 0.0008079098675159302,
+      "loss": 0.5726,
+      "step": 14180
+    },
+    {
+      "epoch": 0.7201330101153685,
+      "grad_norm": 0.026411392839355837,
+      "learning_rate": 0.0008077352998533339,
+      "loss": 0.5642,
+      "step": 14185
+    },
+    {
+      "epoch": 0.7203868462134,
+      "grad_norm": 0.06222643500106606,
+      "learning_rate": 0.0008075606717838329,
+      "loss": 0.579,
+      "step": 14190
+    },
+    {
+      "epoch": 0.7206406823114315,
+      "grad_norm": 0.04757491182106772,
+      "learning_rate": 0.0008073859833417059,
+      "loss": 0.5845,
+      "step": 14195
+    },
+    {
+      "epoch": 0.7208945184094631,
+      "grad_norm": 0.03794874490419018,
+      "learning_rate": 0.0008072112345612433,
+      "loss": 0.6024,
+      "step": 14200
+    },
+    {
+      "epoch": 0.7211483545074945,
+      "grad_norm": 0.02909593167125118,
+      "learning_rate": 0.0008070364254767475,
+      "loss": 0.5807,
+      "step": 14205
+    },
+    {
+      "epoch": 0.721402190605526,
+      "grad_norm": 0.04995516503580864,
+      "learning_rate": 0.0008068615561225324,
+      "loss": 0.6365,
+      "step": 14210
+    },
+    {
+      "epoch": 0.7216560267035576,
+      "grad_norm": 0.029511579035933468,
+      "learning_rate": 0.0008066866265329242,
+      "loss": 0.5855,
+      "step": 14215
+    },
+    {
+      "epoch": 0.721909862801589,
+      "grad_norm": 0.04837520573816359,
+      "learning_rate": 0.0008065116367422607,
+      "loss": 0.5877,
+      "step": 14220
+    },
+    {
+      "epoch": 0.7221636988996205,
+      "grad_norm": 0.029346824274389435,
+      "learning_rate": 0.0008063365867848916,
+      "loss": 0.6129,
+      "step": 14225
+    },
+    {
+      "epoch": 0.722417534997652,
+      "grad_norm": 0.03801220854976456,
+      "learning_rate": 0.0008061614766951779,
+      "loss": 0.6121,
+      "step": 14230
+    },
+    {
+      "epoch": 0.7226713710956835,
+      "grad_norm": 0.035405249909125754,
+      "learning_rate": 0.0008059863065074934,
+      "loss": 0.5757,
+      "step": 14235
+    },
+    {
+      "epoch": 0.722925207193715,
+      "grad_norm": 0.03770160969115643,
+      "learning_rate": 0.0008058110762562227,
+      "loss": 0.6527,
+      "step": 14240
+    },
+    {
+      "epoch": 0.7231790432917465,
+      "grad_norm": 0.031787448281367404,
+      "learning_rate": 0.0008056357859757631,
+      "loss": 0.6437,
+      "step": 14245
+    },
+    {
+      "epoch": 0.7234328793897781,
+      "grad_norm": 0.3335810711614509,
+      "learning_rate": 0.0008054604357005227,
+      "loss": 0.5659,
+      "step": 14250
+    },
+    {
+      "epoch": 0.7236867154878095,
+      "grad_norm": 0.04424485372113206,
+      "learning_rate": 0.000805285025464922,
+      "loss": 0.5756,
+      "step": 14255
+    },
+    {
+      "epoch": 0.723940551585841,
+      "grad_norm": 0.03463932389672562,
+      "learning_rate": 0.0008051095553033935,
+      "loss": 0.5662,
+      "step": 14260
+    },
+    {
+      "epoch": 0.7241943876838726,
+      "grad_norm": 0.059238021172196606,
+      "learning_rate": 0.0008049340252503808,
+      "loss": 0.6028,
+      "step": 14265
+    },
+    {
+      "epoch": 0.724448223781904,
+      "grad_norm": 0.037906766126114544,
+      "learning_rate": 0.0008047584353403396,
+      "loss": 0.6386,
+      "step": 14270
+    },
+    {
+      "epoch": 0.7247020598799355,
+      "grad_norm": 0.03614272842990135,
+      "learning_rate": 0.0008045827856077373,
+      "loss": 0.5869,
+      "step": 14275
+    },
+    {
+      "epoch": 0.7249558959779671,
+      "grad_norm": 0.025607815005150466,
+      "learning_rate": 0.0008044070760870533,
+      "loss": 0.5728,
+      "step": 14280
+    },
+    {
+      "epoch": 0.7252097320759985,
+      "grad_norm": 0.027627783001377804,
+      "learning_rate": 0.0008042313068127781,
+      "loss": 0.639,
+      "step": 14285
+    },
+    {
+      "epoch": 0.72546356817403,
+      "grad_norm": 0.043412145360747785,
+      "learning_rate": 0.0008040554778194148,
+      "loss": 0.5497,
+      "step": 14290
+    },
+    {
+      "epoch": 0.7257174042720616,
+      "grad_norm": 0.02701727522162977,
+      "learning_rate": 0.0008038795891414774,
+      "loss": 0.6095,
+      "step": 14295
+    },
+    {
+      "epoch": 0.7259712403700931,
+      "grad_norm": 0.029754029318448776,
+      "learning_rate": 0.0008037036408134921,
+      "loss": 0.6163,
+      "step": 14300
+    },
+    {
+      "epoch": 0.7262250764681245,
+      "grad_norm": 0.04031215844232491,
+      "learning_rate": 0.0008035276328699967,
+      "loss": 0.6099,
+      "step": 14305
+    },
+    {
+      "epoch": 0.726478912566156,
+      "grad_norm": 0.029676037361304357,
+      "learning_rate": 0.0008033515653455408,
+      "loss": 0.5771,
+      "step": 14310
+    },
+    {
+      "epoch": 0.7267327486641876,
+      "grad_norm": 0.059697029416254835,
+      "learning_rate": 0.0008031754382746854,
+      "loss": 0.5749,
+      "step": 14315
+    },
+    {
+      "epoch": 0.726986584762219,
+      "grad_norm": 0.025871394003377013,
+      "learning_rate": 0.0008029992516920033,
+      "loss": 0.5962,
+      "step": 14320
+    },
+    {
+      "epoch": 0.7272404208602505,
+      "grad_norm": 0.02569159141087064,
+      "learning_rate": 0.0008028230056320791,
+      "loss": 0.575,
+      "step": 14325
+    },
+    {
+      "epoch": 0.7274942569582821,
+      "grad_norm": 0.02738571226532053,
+      "learning_rate": 0.0008026467001295092,
+      "loss": 0.5774,
+      "step": 14330
+    },
+    {
+      "epoch": 0.7277480930563135,
+      "grad_norm": 0.036913640747463713,
+      "learning_rate": 0.0008024703352189011,
+      "loss": 0.6074,
+      "step": 14335
+    },
+    {
+      "epoch": 0.728001929154345,
+      "grad_norm": 0.02570980610387182,
+      "learning_rate": 0.0008022939109348749,
+      "loss": 0.5959,
+      "step": 14340
+    },
+    {
+      "epoch": 0.7282557652523766,
+      "grad_norm": 0.04358525580235644,
+      "learning_rate": 0.0008021174273120615,
+      "loss": 0.5795,
+      "step": 14345
+    },
+    {
+      "epoch": 0.728509601350408,
+      "grad_norm": 0.07617898724384416,
+      "learning_rate": 0.0008019408843851037,
+      "loss": 0.7202,
+      "step": 14350
+    },
+    {
+      "epoch": 0.7287634374484395,
+      "grad_norm": 0.07466034802353338,
+      "learning_rate": 0.0008017642821886562,
+      "loss": 0.6215,
+      "step": 14355
+    },
+    {
+      "epoch": 0.7290172735464711,
+      "grad_norm": 0.06325253244834098,
+      "learning_rate": 0.0008015876207573848,
+      "loss": 0.6182,
+      "step": 14360
+    },
+    {
+      "epoch": 0.7292711096445026,
+      "grad_norm": 0.033647850896565544,
+      "learning_rate": 0.0008014109001259675,
+      "loss": 0.62,
+      "step": 14365
+    },
+    {
+      "epoch": 0.729524945742534,
+      "grad_norm": 0.0311901569505441,
+      "learning_rate": 0.0008012341203290936,
+      "loss": 0.5985,
+      "step": 14370
+    },
+    {
+      "epoch": 0.7297787818405655,
+      "grad_norm": 0.030231478505345966,
+      "learning_rate": 0.0008010572814014643,
+      "loss": 0.6101,
+      "step": 14375
+    },
+    {
+      "epoch": 0.7300326179385971,
+      "grad_norm": 0.03148649478982455,
+      "learning_rate": 0.0008008803833777919,
+      "loss": 0.5824,
+      "step": 14380
+    },
+    {
+      "epoch": 0.7302864540366285,
+      "grad_norm": 0.030003355989516,
+      "learning_rate": 0.0008007034262928008,
+      "loss": 0.5957,
+      "step": 14385
+    },
+    {
+      "epoch": 0.73054029013466,
+      "grad_norm": 0.02917680627459806,
+      "learning_rate": 0.0008005264101812267,
+      "loss": 0.5986,
+      "step": 14390
+    },
+    {
+      "epoch": 0.7307941262326916,
+      "grad_norm": 0.02495332231474039,
+      "learning_rate": 0.000800349335077817,
+      "loss": 0.5705,
+      "step": 14395
+    },
+    {
+      "epoch": 0.731047962330723,
+      "grad_norm": 0.024227094406430567,
+      "learning_rate": 0.0008001722010173306,
+      "loss": 0.606,
+      "step": 14400
+    },
+    {
+      "epoch": 0.7313017984287545,
+      "grad_norm": 0.02497321166367223,
+      "learning_rate": 0.0007999950080345382,
+      "loss": 0.598,
+      "step": 14405
+    },
+    {
+      "epoch": 0.7315556345267861,
+      "grad_norm": 0.03244596407041108,
+      "learning_rate": 0.0007998177561642218,
+      "loss": 0.6059,
+      "step": 14410
+    },
+    {
+      "epoch": 0.7318094706248176,
+      "grad_norm": 0.031337458561177915,
+      "learning_rate": 0.000799640445441175,
+      "loss": 0.6066,
+      "step": 14415
+    },
+    {
+      "epoch": 0.732063306722849,
+      "grad_norm": 0.027382503542512458,
+      "learning_rate": 0.000799463075900203,
+      "loss": 0.6036,
+      "step": 14420
+    },
+    {
+      "epoch": 0.7323171428208806,
+      "grad_norm": 0.11807874331082033,
+      "learning_rate": 0.0007992856475761228,
+      "loss": 0.5847,
+      "step": 14425
+    },
+    {
+      "epoch": 0.7325709789189121,
+      "grad_norm": 0.05405316379164612,
+      "learning_rate": 0.0007991081605037624,
+      "loss": 0.5619,
+      "step": 14430
+    },
+    {
+      "epoch": 0.7328248150169435,
+      "grad_norm": 0.03327425512133745,
+      "learning_rate": 0.0007989306147179618,
+      "loss": 0.5953,
+      "step": 14435
+    },
+    {
+      "epoch": 0.733078651114975,
+      "grad_norm": 0.03368726791670784,
+      "learning_rate": 0.0007987530102535723,
+      "loss": 0.5914,
+      "step": 14440
+    },
+    {
+      "epoch": 0.7333324872130066,
+      "grad_norm": 0.03358231187464119,
+      "learning_rate": 0.0007985753471454566,
+      "loss": 0.5838,
+      "step": 14445
+    },
+    {
+      "epoch": 0.733586323311038,
+      "grad_norm": 0.03261890078454357,
+      "learning_rate": 0.0007983976254284894,
+      "loss": 0.5523,
+      "step": 14450
+    },
+    {
+      "epoch": 0.7338401594090695,
+      "grad_norm": 0.05662683445430045,
+      "learning_rate": 0.0007982198451375564,
+      "loss": 0.6053,
+      "step": 14455
+    },
+    {
+      "epoch": 0.7340939955071011,
+      "grad_norm": 0.030310060759633013,
+      "learning_rate": 0.0007980420063075551,
+      "loss": 0.6454,
+      "step": 14460
+    },
+    {
+      "epoch": 0.7343478316051326,
+      "grad_norm": 0.026572897492377096,
+      "learning_rate": 0.0007978641089733941,
+      "loss": 0.5835,
+      "step": 14465
+    },
+    {
+      "epoch": 0.734601667703164,
+      "grad_norm": 0.03738980925312766,
+      "learning_rate": 0.0007976861531699942,
+      "loss": 0.5832,
+      "step": 14470
+    },
+    {
+      "epoch": 0.7348555038011956,
+      "grad_norm": 0.04967817488154915,
+      "learning_rate": 0.0007975081389322868,
+      "loss": 0.6216,
+      "step": 14475
+    },
+    {
+      "epoch": 0.7351093398992271,
+      "grad_norm": 0.021984750342595057,
+      "learning_rate": 0.0007973300662952155,
+      "loss": 0.5908,
+      "step": 14480
+    },
+    {
+      "epoch": 0.7353631759972585,
+      "grad_norm": 0.030283172609555046,
+      "learning_rate": 0.0007971519352937349,
+      "loss": 0.5622,
+      "step": 14485
+    },
+    {
+      "epoch": 0.7356170120952901,
+      "grad_norm": 0.025554536052177994,
+      "learning_rate": 0.0007969737459628112,
+      "loss": 0.5918,
+      "step": 14490
+    },
+    {
+      "epoch": 0.7358708481933216,
+      "grad_norm": 0.04131262873095309,
+      "learning_rate": 0.0007967954983374224,
+      "loss": 0.6124,
+      "step": 14495
+    },
+    {
+      "epoch": 0.736124684291353,
+      "grad_norm": 0.028079868427916613,
+      "learning_rate": 0.0007966171924525573,
+      "loss": 0.5802,
+      "step": 14500
+    },
+    {
+      "epoch": 0.7363785203893846,
+      "grad_norm": 0.03973060406341919,
+      "learning_rate": 0.0007964388283432165,
+      "loss": 0.5826,
+      "step": 14505
+    },
+    {
+      "epoch": 0.7366323564874161,
+      "grad_norm": 0.02430548491118239,
+      "learning_rate": 0.0007962604060444121,
+      "loss": 0.5866,
+      "step": 14510
+    },
+    {
+      "epoch": 0.7368861925854476,
+      "grad_norm": 0.02552835377781297,
+      "learning_rate": 0.0007960819255911673,
+      "loss": 0.5807,
+      "step": 14515
+    },
+    {
+      "epoch": 0.737140028683479,
+      "grad_norm": 0.030612793762513597,
+      "learning_rate": 0.0007959033870185173,
+      "loss": 0.5847,
+      "step": 14520
+    },
+    {
+      "epoch": 0.7373938647815106,
+      "grad_norm": 0.02107127051595995,
+      "learning_rate": 0.0007957247903615079,
+      "loss": 0.5667,
+      "step": 14525
+    },
+    {
+      "epoch": 0.7376477008795421,
+      "grad_norm": 0.022070086673001976,
+      "learning_rate": 0.0007955461356551971,
+      "loss": 0.5777,
+      "step": 14530
+    },
+    {
+      "epoch": 0.7379015369775735,
+      "grad_norm": 0.02546159830842789,
+      "learning_rate": 0.0007953674229346537,
+      "loss": 0.5899,
+      "step": 14535
+    },
+    {
+      "epoch": 0.7381553730756051,
+      "grad_norm": 0.039042598157433286,
+      "learning_rate": 0.000795188652234958,
+      "loss": 0.5747,
+      "step": 14540
+    },
+    {
+      "epoch": 0.7384092091736366,
+      "grad_norm": 0.2906439126670049,
+      "learning_rate": 0.0007950098235912021,
+      "loss": 0.6219,
+      "step": 14545
+    },
+    {
+      "epoch": 0.738663045271668,
+      "grad_norm": 0.07072338732370455,
+      "learning_rate": 0.0007948309370384891,
+      "loss": 0.5922,
+      "step": 14550
+    },
+    {
+      "epoch": 0.7389168813696996,
+      "grad_norm": 0.47514585488788236,
+      "learning_rate": 0.0007946519926119335,
+      "loss": 0.615,
+      "step": 14555
+    },
+    {
+      "epoch": 0.7391707174677311,
+      "grad_norm": 0.0648487542350592,
+      "learning_rate": 0.000794472990346661,
+      "loss": 0.5799,
+      "step": 14560
+    },
+    {
+      "epoch": 0.7394245535657625,
+      "grad_norm": 0.03408321166656662,
+      "learning_rate": 0.0007942939302778092,
+      "loss": 0.5847,
+      "step": 14565
+    },
+    {
+      "epoch": 0.7396783896637941,
+      "grad_norm": 0.04411672773665418,
+      "learning_rate": 0.0007941148124405264,
+      "loss": 0.6344,
+      "step": 14570
+    },
+    {
+      "epoch": 0.7399322257618256,
+      "grad_norm": 0.029167074032466867,
+      "learning_rate": 0.0007939356368699727,
+      "loss": 0.6158,
+      "step": 14575
+    },
+    {
+      "epoch": 0.7401860618598571,
+      "grad_norm": 0.030168988293000874,
+      "learning_rate": 0.0007937564036013194,
+      "loss": 0.5652,
+      "step": 14580
+    },
+    {
+      "epoch": 0.7404398979578886,
+      "grad_norm": 0.035815596751334416,
+      "learning_rate": 0.000793577112669749,
+      "loss": 0.5966,
+      "step": 14585
+    },
+    {
+      "epoch": 0.7406937340559201,
+      "grad_norm": 0.028868811480219827,
+      "learning_rate": 0.0007933977641104555,
+      "loss": 0.6047,
+      "step": 14590
+    },
+    {
+      "epoch": 0.7409475701539516,
+      "grad_norm": 0.031907618454550465,
+      "learning_rate": 0.000793218357958644,
+      "loss": 0.5645,
+      "step": 14595
+    },
+    {
+      "epoch": 0.741201406251983,
+      "grad_norm": 0.030941483646788018,
+      "learning_rate": 0.0007930388942495312,
+      "loss": 0.6008,
+      "step": 14600
+    },
+    {
+      "epoch": 0.7414552423500146,
+      "grad_norm": 0.026271292559618437,
+      "learning_rate": 0.0007928593730183447,
+      "loss": 0.5566,
+      "step": 14605
+    },
+    {
+      "epoch": 0.7417090784480461,
+      "grad_norm": 0.030445120700402395,
+      "learning_rate": 0.0007926797943003239,
+      "loss": 0.5926,
+      "step": 14610
+    },
+    {
+      "epoch": 0.7419629145460775,
+      "grad_norm": 0.02990094964372429,
+      "learning_rate": 0.0007925001581307189,
+      "loss": 0.5997,
+      "step": 14615
+    },
+    {
+      "epoch": 0.7422167506441091,
+      "grad_norm": 0.02511691268212505,
+      "learning_rate": 0.0007923204645447916,
+      "loss": 0.5804,
+      "step": 14620
+    },
+    {
+      "epoch": 0.7424705867421406,
+      "grad_norm": 0.041391672490102865,
+      "learning_rate": 0.0007921407135778151,
+      "loss": 0.5659,
+      "step": 14625
+    },
+    {
+      "epoch": 0.7427244228401721,
+      "grad_norm": 0.02995128050105831,
+      "learning_rate": 0.0007919609052650734,
+      "loss": 0.6081,
+      "step": 14630
+    },
+    {
+      "epoch": 0.7429782589382036,
+      "grad_norm": 0.03127151350046966,
+      "learning_rate": 0.0007917810396418618,
+      "loss": 0.5946,
+      "step": 14635
+    },
+    {
+      "epoch": 0.7432320950362351,
+      "grad_norm": 0.03606392489536054,
+      "learning_rate": 0.0007916011167434873,
+      "loss": 0.5931,
+      "step": 14640
+    },
+    {
+      "epoch": 0.7434859311342666,
+      "grad_norm": 0.037073684283635416,
+      "learning_rate": 0.000791421136605268,
+      "loss": 0.5968,
+      "step": 14645
+    },
+    {
+      "epoch": 0.7437397672322981,
+      "grad_norm": 0.04600230974128539,
+      "learning_rate": 0.0007912410992625326,
+      "loss": 0.6161,
+      "step": 14650
+    },
+    {
+      "epoch": 0.7439936033303296,
+      "grad_norm": 0.030237177490696085,
+      "learning_rate": 0.0007910610047506219,
+      "loss": 0.5662,
+      "step": 14655
+    },
+    {
+      "epoch": 0.7442474394283611,
+      "grad_norm": 0.04480838241032686,
+      "learning_rate": 0.0007908808531048876,
+      "loss": 0.6055,
+      "step": 14660
+    },
+    {
+      "epoch": 0.7445012755263926,
+      "grad_norm": 0.038523733334650756,
+      "learning_rate": 0.0007907006443606924,
+      "loss": 0.6029,
+      "step": 14665
+    },
+    {
+      "epoch": 0.7447551116244241,
+      "grad_norm": 0.03737018496192438,
+      "learning_rate": 0.0007905203785534104,
+      "loss": 0.6037,
+      "step": 14670
+    },
+    {
+      "epoch": 0.7450089477224556,
+      "grad_norm": 0.09121012906159448,
+      "learning_rate": 0.000790340055718427,
+      "loss": 0.5908,
+      "step": 14675
+    },
+    {
+      "epoch": 0.7452627838204872,
+      "grad_norm": 0.024868732926211504,
+      "learning_rate": 0.0007901596758911384,
+      "loss": 0.5974,
+      "step": 14680
+    },
+    {
+      "epoch": 0.7455166199185186,
+      "grad_norm": 0.029371473458118812,
+      "learning_rate": 0.0007899792391069527,
+      "loss": 0.5968,
+      "step": 14685
+    },
+    {
+      "epoch": 0.7457704560165501,
+      "grad_norm": 0.030092147058257405,
+      "learning_rate": 0.0007897987454012885,
+      "loss": 0.5867,
+      "step": 14690
+    },
+    {
+      "epoch": 0.7460242921145817,
+      "grad_norm": 0.06169934776691142,
+      "learning_rate": 0.0007896181948095755,
+      "loss": 0.6084,
+      "step": 14695
+    },
+    {
+      "epoch": 0.7462781282126131,
+      "grad_norm": 0.03499538883906122,
+      "learning_rate": 0.0007894375873672555,
+      "loss": 0.5695,
+      "step": 14700
+    },
+    {
+      "epoch": 0.7465319643106446,
+      "grad_norm": 0.03834546473926501,
+      "learning_rate": 0.0007892569231097804,
+      "loss": 0.6104,
+      "step": 14705
+    },
+    {
+      "epoch": 0.7467858004086761,
+      "grad_norm": 0.054730939898757484,
+      "learning_rate": 0.0007890762020726136,
+      "loss": 0.5776,
+      "step": 14710
+    },
+    {
+      "epoch": 0.7470396365067076,
+      "grad_norm": 0.024843870205877142,
+      "learning_rate": 0.0007888954242912303,
+      "loss": 0.5656,
+      "step": 14715
+    },
+    {
+      "epoch": 0.7472934726047391,
+      "grad_norm": 0.03349184814575679,
+      "learning_rate": 0.0007887145898011158,
+      "loss": 0.5901,
+      "step": 14720
+    },
+    {
+      "epoch": 0.7475473087027706,
+      "grad_norm": 0.049461573887305814,
+      "learning_rate": 0.0007885336986377671,
+      "loss": 0.5508,
+      "step": 14725
+    },
+    {
+      "epoch": 0.7478011448008022,
+      "grad_norm": 0.03552849919599442,
+      "learning_rate": 0.0007883527508366923,
+      "loss": 0.5951,
+      "step": 14730
+    },
+    {
+      "epoch": 0.7480549808988336,
+      "grad_norm": 0.028690443617235013,
+      "learning_rate": 0.0007881717464334104,
+      "loss": 0.5891,
+      "step": 14735
+    },
+    {
+      "epoch": 0.7483088169968651,
+      "grad_norm": 0.04167848811425972,
+      "learning_rate": 0.000787990685463452,
+      "loss": 0.562,
+      "step": 14740
+    },
+    {
+      "epoch": 0.7485626530948967,
+      "grad_norm": 0.05775833745744054,
+      "learning_rate": 0.000787809567962358,
+      "loss": 0.5688,
+      "step": 14745
+    },
+    {
+      "epoch": 0.7488164891929281,
+      "grad_norm": 0.026872351737841308,
+      "learning_rate": 0.0007876283939656814,
+      "loss": 0.5846,
+      "step": 14750
+    },
+    {
+      "epoch": 0.7490703252909596,
+      "grad_norm": 0.03924993939110344,
+      "learning_rate": 0.0007874471635089853,
+      "loss": 0.5622,
+      "step": 14755
+    },
+    {
+      "epoch": 0.7493241613889912,
+      "grad_norm": 0.04221861220551446,
+      "learning_rate": 0.0007872658766278444,
+      "loss": 0.5605,
+      "step": 14760
+    },
+    {
+      "epoch": 0.7495779974870226,
+      "grad_norm": 0.026884242563953744,
+      "learning_rate": 0.0007870845333578447,
+      "loss": 0.5434,
+      "step": 14765
+    },
+    {
+      "epoch": 0.7498318335850541,
+      "grad_norm": 0.05659339772085351,
+      "learning_rate": 0.0007869031337345828,
+      "loss": 0.5545,
+      "step": 14770
+    },
+    {
+      "epoch": 0.7500856696830857,
+      "grad_norm": 0.04676635774385785,
+      "learning_rate": 0.0007867216777936665,
+      "loss": 0.6038,
+      "step": 14775
+    },
+    {
+      "epoch": 0.7503395057811171,
+      "grad_norm": 0.03555192654015796,
+      "learning_rate": 0.0007865401655707148,
+      "loss": 0.628,
+      "step": 14780
+    },
+    {
+      "epoch": 0.7505933418791486,
+      "grad_norm": 0.02697674684520458,
+      "learning_rate": 0.0007863585971013574,
+      "loss": 0.6192,
+      "step": 14785
+    },
+    {
+      "epoch": 0.7508471779771801,
+      "grad_norm": 0.032422868044355145,
+      "learning_rate": 0.0007861769724212353,
+      "loss": 0.5789,
+      "step": 14790
+    },
+    {
+      "epoch": 0.7511010140752117,
+      "grad_norm": 0.0308503473942792,
+      "learning_rate": 0.0007859952915660009,
+      "loss": 0.6233,
+      "step": 14795
+    },
+    {
+      "epoch": 0.7513548501732431,
+      "grad_norm": 0.024092818055271237,
+      "learning_rate": 0.000785813554571317,
+      "loss": 0.5842,
+      "step": 14800
+    },
+    {
+      "epoch": 0.7516086862712746,
+      "grad_norm": 0.03344296842370167,
+      "learning_rate": 0.0007856317614728578,
+      "loss": 0.6261,
+      "step": 14805
+    },
+    {
+      "epoch": 0.7518625223693062,
+      "grad_norm": 0.037965967888487176,
+      "learning_rate": 0.0007854499123063081,
+      "loss": 0.5733,
+      "step": 14810
+    },
+    {
+      "epoch": 0.7521163584673376,
+      "grad_norm": 0.04191928838742156,
+      "learning_rate": 0.0007852680071073644,
+      "loss": 0.6117,
+      "step": 14815
+    },
+    {
+      "epoch": 0.7523701945653691,
+      "grad_norm": 0.04383694642505963,
+      "learning_rate": 0.0007850860459117332,
+      "loss": 0.5831,
+      "step": 14820
+    },
+    {
+      "epoch": 0.7526240306634007,
+      "grad_norm": 0.037614692321617,
+      "learning_rate": 0.0007849040287551332,
+      "loss": 0.5871,
+      "step": 14825
+    },
+    {
+      "epoch": 0.7528778667614321,
+      "grad_norm": 0.02516403676981507,
+      "learning_rate": 0.0007847219556732929,
+      "loss": 0.5927,
+      "step": 14830
+    },
+    {
+      "epoch": 0.7531317028594636,
+      "grad_norm": 0.041692616391361986,
+      "learning_rate": 0.0007845398267019528,
+      "loss": 0.6591,
+      "step": 14835
+    },
+    {
+      "epoch": 0.7533855389574952,
+      "grad_norm": 0.025964287691236976,
+      "learning_rate": 0.0007843576418768637,
+      "loss": 0.6005,
+      "step": 14840
+    },
+    {
+      "epoch": 0.7536393750555267,
+      "grad_norm": 0.03353359440550333,
+      "learning_rate": 0.0007841754012337876,
+      "loss": 0.579,
+      "step": 14845
+    },
+    {
+      "epoch": 0.7538932111535581,
+      "grad_norm": 0.055412308005210985,
+      "learning_rate": 0.0007839931048084971,
+      "loss": 0.6126,
+      "step": 14850
+    },
+    {
+      "epoch": 0.7541470472515897,
+      "grad_norm": 0.04895600693106303,
+      "learning_rate": 0.0007838107526367768,
+      "loss": 0.62,
+      "step": 14855
+    },
+    {
+      "epoch": 0.7544008833496212,
+      "grad_norm": 0.03477352614622541,
+      "learning_rate": 0.0007836283447544211,
+      "loss": 0.585,
+      "step": 14860
+    },
+    {
+      "epoch": 0.7546547194476526,
+      "grad_norm": 0.03588526651863759,
+      "learning_rate": 0.0007834458811972356,
+      "loss": 0.5851,
+      "step": 14865
+    },
+    {
+      "epoch": 0.7549085555456841,
+      "grad_norm": 0.05624481095210902,
+      "learning_rate": 0.0007832633620010372,
+      "loss": 0.6079,
+      "step": 14870
+    },
+    {
+      "epoch": 0.7551623916437157,
+      "grad_norm": 0.04357936006581149,
+      "learning_rate": 0.0007830807872016536,
+      "loss": 0.6187,
+      "step": 14875
+    },
+    {
+      "epoch": 0.7554162277417471,
+      "grad_norm": 0.040083961110080835,
+      "learning_rate": 0.000782898156834923,
+      "loss": 0.6431,
+      "step": 14880
+    },
+    {
+      "epoch": 0.7556700638397786,
+      "grad_norm": 0.07764594003089959,
+      "learning_rate": 0.000782715470936695,
+      "loss": 0.5915,
+      "step": 14885
+    },
+    {
+      "epoch": 0.7559238999378102,
+      "grad_norm": 0.030985982015469513,
+      "learning_rate": 0.0007825327295428302,
+      "loss": 0.5931,
+      "step": 14890
+    },
+    {
+      "epoch": 0.7561777360358417,
+      "grad_norm": 0.029912753853148843,
+      "learning_rate": 0.0007823499326891994,
+      "loss": 0.6124,
+      "step": 14895
+    },
+    {
+      "epoch": 0.7564315721338731,
+      "grad_norm": 0.028550336668593645,
+      "learning_rate": 0.000782167080411685,
+      "loss": 0.5834,
+      "step": 14900
+    },
+    {
+      "epoch": 0.7566854082319047,
+      "grad_norm": 0.028003935514170488,
+      "learning_rate": 0.0007819841727461798,
+      "loss": 0.6129,
+      "step": 14905
+    },
+    {
+      "epoch": 0.7569392443299362,
+      "grad_norm": 0.030636633612124004,
+      "learning_rate": 0.0007818012097285876,
+      "loss": 0.6299,
+      "step": 14910
+    },
+    {
+      "epoch": 0.7571930804279676,
+      "grad_norm": 0.05006422454029914,
+      "learning_rate": 0.0007816181913948235,
+      "loss": 0.6086,
+      "step": 14915
+    },
+    {
+      "epoch": 0.7574469165259992,
+      "grad_norm": 0.026143934888707424,
+      "learning_rate": 0.0007814351177808128,
+      "loss": 0.6139,
+      "step": 14920
+    },
+    {
+      "epoch": 0.7577007526240307,
+      "grad_norm": 1.2578760148102972,
+      "learning_rate": 0.000781251988922492,
+      "loss": 0.8194,
+      "step": 14925
+    },
+    {
+      "epoch": 0.7579545887220621,
+      "grad_norm": 0.10182912331451577,
+      "learning_rate": 0.0007810688048558083,
+      "loss": 0.6552,
+      "step": 14930
+    },
+    {
+      "epoch": 0.7582084248200937,
+      "grad_norm": 0.1212458030523509,
+      "learning_rate": 0.00078088556561672,
+      "loss": 0.6837,
+      "step": 14935
+    },
+    {
+      "epoch": 0.7584622609181252,
+      "grad_norm": 0.10362361037228977,
+      "learning_rate": 0.0007807022712411957,
+      "loss": 0.6756,
+      "step": 14940
+    },
+    {
+      "epoch": 0.7587160970161567,
+      "grad_norm": 0.04440285687666505,
+      "learning_rate": 0.0007805189217652158,
+      "loss": 0.6408,
+      "step": 14945
+    },
+    {
+      "epoch": 0.7589699331141881,
+      "grad_norm": 0.05277231543250202,
+      "learning_rate": 0.0007803355172247702,
+      "loss": 0.6197,
+      "step": 14950
+    },
+    {
+      "epoch": 0.7592237692122197,
+      "grad_norm": 0.031231568656463315,
+      "learning_rate": 0.0007801520576558608,
+      "loss": 0.6298,
+      "step": 14955
+    },
+    {
+      "epoch": 0.7594776053102512,
+      "grad_norm": 0.0345196918831133,
+      "learning_rate": 0.0007799685430944995,
+      "loss": 0.5926,
+      "step": 14960
+    },
+    {
+      "epoch": 0.7597314414082826,
+      "grad_norm": 0.03315835092152511,
+      "learning_rate": 0.0007797849735767094,
+      "loss": 0.5859,
+      "step": 14965
+    },
+    {
+      "epoch": 0.7599852775063142,
+      "grad_norm": 0.05487099815278874,
+      "learning_rate": 0.0007796013491385243,
+      "loss": 0.6047,
+      "step": 14970
+    },
+    {
+      "epoch": 0.7602391136043457,
+      "grad_norm": 0.0305550620003498,
+      "learning_rate": 0.0007794176698159887,
+      "loss": 0.5898,
+      "step": 14975
+    },
+    {
+      "epoch": 0.7604929497023771,
+      "grad_norm": 0.03492544688949025,
+      "learning_rate": 0.000779233935645158,
+      "loss": 0.6285,
+      "step": 14980
+    },
+    {
+      "epoch": 0.7607467858004087,
+      "grad_norm": 0.03623180099708256,
+      "learning_rate": 0.0007790501466620983,
+      "loss": 0.6035,
+      "step": 14985
+    },
+    {
+      "epoch": 0.7610006218984402,
+      "grad_norm": 0.04180008581977664,
+      "learning_rate": 0.0007788663029028863,
+      "loss": 0.5536,
+      "step": 14990
+    },
+    {
+      "epoch": 0.7612544579964716,
+      "grad_norm": 0.030734490730055696,
+      "learning_rate": 0.0007786824044036098,
+      "loss": 0.5731,
+      "step": 14995
+    },
+    {
+      "epoch": 0.7615082940945032,
+      "grad_norm": 0.055420433682433455,
+      "learning_rate": 0.0007784984512003671,
+      "loss": 0.6263,
+      "step": 15000
+    },
+    {
+      "epoch": 0.7617621301925347,
+      "grad_norm": 0.02941885467547894,
+      "learning_rate": 0.0007783144433292673,
+      "loss": 0.6284,
+      "step": 15005
+    },
+    {
+      "epoch": 0.7620159662905662,
+      "grad_norm": 0.033206110423357985,
+      "learning_rate": 0.0007781303808264303,
+      "loss": 0.5358,
+      "step": 15010
+    },
+    {
+      "epoch": 0.7622698023885977,
+      "grad_norm": 0.02625796801377728,
+      "learning_rate": 0.0007779462637279865,
+      "loss": 0.5969,
+      "step": 15015
+    },
+    {
+      "epoch": 0.7625236384866292,
+      "grad_norm": 0.29298031514322703,
+      "learning_rate": 0.0007777620920700773,
+      "loss": 0.5957,
+      "step": 15020
+    },
+    {
+      "epoch": 0.7627774745846607,
+      "grad_norm": 0.04086600855995801,
+      "learning_rate": 0.0007775778658888546,
+      "loss": 0.6322,
+      "step": 15025
+    },
+    {
+      "epoch": 0.7630313106826921,
+      "grad_norm": 0.039401332495816975,
+      "learning_rate": 0.000777393585220481,
+      "loss": 0.5687,
+      "step": 15030
+    },
+    {
+      "epoch": 0.7632851467807237,
+      "grad_norm": 0.026702482139940326,
+      "learning_rate": 0.0007772092501011301,
+      "loss": 0.6218,
+      "step": 15035
+    },
+    {
+      "epoch": 0.7635389828787552,
+      "grad_norm": 0.050236563570187805,
+      "learning_rate": 0.0007770248605669858,
+      "loss": 0.5861,
+      "step": 15040
+    },
+    {
+      "epoch": 0.7637928189767866,
+      "grad_norm": 0.02874533178510391,
+      "learning_rate": 0.0007768404166542431,
+      "loss": 0.5967,
+      "step": 15045
+    },
+    {
+      "epoch": 0.7640466550748182,
+      "grad_norm": 0.02753978513413307,
+      "learning_rate": 0.000776655918399107,
+      "loss": 0.5802,
+      "step": 15050
+    },
+    {
+      "epoch": 0.7643004911728497,
+      "grad_norm": 0.037996461348599846,
+      "learning_rate": 0.0007764713658377938,
+      "loss": 0.5915,
+      "step": 15055
+    },
+    {
+      "epoch": 0.7645543272708812,
+      "grad_norm": 0.06312266025775402,
+      "learning_rate": 0.0007762867590065302,
+      "loss": 0.5816,
+      "step": 15060
+    },
+    {
+      "epoch": 0.7648081633689127,
+      "grad_norm": 0.04647511323225117,
+      "learning_rate": 0.0007761020979415537,
+      "loss": 0.6081,
+      "step": 15065
+    },
+    {
+      "epoch": 0.7650619994669442,
+      "grad_norm": 0.0350344173940654,
+      "learning_rate": 0.0007759173826791123,
+      "loss": 0.5988,
+      "step": 15070
+    },
+    {
+      "epoch": 0.7653158355649757,
+      "grad_norm": 0.026503015783973544,
+      "learning_rate": 0.0007757326132554648,
+      "loss": 0.5924,
+      "step": 15075
+    },
+    {
+      "epoch": 0.7655696716630072,
+      "grad_norm": 0.03832259438303605,
+      "learning_rate": 0.0007755477897068803,
+      "loss": 0.6175,
+      "step": 15080
+    },
+    {
+      "epoch": 0.7658235077610387,
+      "grad_norm": 0.09757514516462057,
+      "learning_rate": 0.0007753629120696388,
+      "loss": 0.6075,
+      "step": 15085
+    },
+    {
+      "epoch": 0.7660773438590702,
+      "grad_norm": 0.05636880292552427,
+      "learning_rate": 0.000775177980380031,
+      "loss": 0.5902,
+      "step": 15090
+    },
+    {
+      "epoch": 0.7663311799571016,
+      "grad_norm": 0.044508398900761414,
+      "learning_rate": 0.0007749929946743578,
+      "loss": 0.6134,
+      "step": 15095
+    },
+    {
+      "epoch": 0.7665850160551332,
+      "grad_norm": 0.02889076972000296,
+      "learning_rate": 0.0007748079549889312,
+      "loss": 0.5796,
+      "step": 15100
+    },
+    {
+      "epoch": 0.7668388521531647,
+      "grad_norm": 0.038063937210235835,
+      "learning_rate": 0.0007746228613600735,
+      "loss": 0.5813,
+      "step": 15105
+    },
+    {
+      "epoch": 0.7670926882511963,
+      "grad_norm": 0.09708033315517485,
+      "learning_rate": 0.0007744377138241177,
+      "loss": 0.5844,
+      "step": 15110
+    },
+    {
+      "epoch": 0.7673465243492277,
+      "grad_norm": 0.06930206252274214,
+      "learning_rate": 0.0007742525124174073,
+      "loss": 0.6186,
+      "step": 15115
+    },
+    {
+      "epoch": 0.7676003604472592,
+      "grad_norm": 0.03248594459557167,
+      "learning_rate": 0.0007740672571762963,
+      "loss": 0.6108,
+      "step": 15120
+    },
+    {
+      "epoch": 0.7678541965452907,
+      "grad_norm": 0.0358203234252936,
+      "learning_rate": 0.0007738819481371495,
+      "loss": 0.5629,
+      "step": 15125
+    },
+    {
+      "epoch": 0.7681080326433222,
+      "grad_norm": 0.0446450825233111,
+      "learning_rate": 0.0007736965853363423,
+      "loss": 0.5974,
+      "step": 15130
+    },
+    {
+      "epoch": 0.7683618687413537,
+      "grad_norm": 0.03164679167642612,
+      "learning_rate": 0.0007735111688102602,
+      "loss": 0.6547,
+      "step": 15135
+    },
+    {
+      "epoch": 0.7686157048393852,
+      "grad_norm": 0.03064186972892073,
+      "learning_rate": 0.0007733256985952997,
+      "loss": 0.6022,
+      "step": 15140
+    },
+    {
+      "epoch": 0.7688695409374167,
+      "grad_norm": 0.34573913673428947,
+      "learning_rate": 0.0007731401747278676,
+      "loss": 0.5827,
+      "step": 15145
+    },
+    {
+      "epoch": 0.7691233770354482,
+      "grad_norm": 0.13966486620739513,
+      "learning_rate": 0.0007729545972443812,
+      "loss": 0.6077,
+      "step": 15150
+    },
+    {
+      "epoch": 0.7693772131334797,
+      "grad_norm": 0.029192487039169105,
+      "learning_rate": 0.000772768966181269,
+      "loss": 0.5599,
+      "step": 15155
+    },
+    {
+      "epoch": 0.7696310492315113,
+      "grad_norm": 0.03124263927843463,
+      "learning_rate": 0.0007725832815749686,
+      "loss": 0.6285,
+      "step": 15160
+    },
+    {
+      "epoch": 0.7698848853295427,
+      "grad_norm": 0.024326930348444097,
+      "learning_rate": 0.0007723975434619296,
+      "loss": 0.5531,
+      "step": 15165
+    },
+    {
+      "epoch": 0.7701387214275742,
+      "grad_norm": 0.02512413491231233,
+      "learning_rate": 0.0007722117518786112,
+      "loss": 0.5537,
+      "step": 15170
+    },
+    {
+      "epoch": 0.7703925575256058,
+      "grad_norm": 0.031772939647128885,
+      "learning_rate": 0.0007720259068614836,
+      "loss": 0.6025,
+      "step": 15175
+    },
+    {
+      "epoch": 0.7706463936236372,
+      "grad_norm": 0.043947294185195836,
+      "learning_rate": 0.0007718400084470267,
+      "loss": 0.6034,
+      "step": 15180
+    },
+    {
+      "epoch": 0.7709002297216687,
+      "grad_norm": 0.03222419275981874,
+      "learning_rate": 0.0007716540566717321,
+      "loss": 0.5981,
+      "step": 15185
+    },
+    {
+      "epoch": 0.7711540658197003,
+      "grad_norm": 0.03819007335282729,
+      "learning_rate": 0.0007714680515721008,
+      "loss": 0.5963,
+      "step": 15190
+    },
+    {
+      "epoch": 0.7714079019177317,
+      "grad_norm": 0.04093973435288178,
+      "learning_rate": 0.0007712819931846448,
+      "loss": 0.613,
+      "step": 15195
+    },
+    {
+      "epoch": 0.7716617380157632,
+      "grad_norm": 0.03695759978176583,
+      "learning_rate": 0.0007710958815458866,
+      "loss": 0.5745,
+      "step": 15200
+    },
+    {
+      "epoch": 0.7719155741137947,
+      "grad_norm": 0.027960391790393533,
+      "learning_rate": 0.0007709097166923586,
+      "loss": 0.5462,
+      "step": 15205
+    },
+    {
+      "epoch": 0.7721694102118263,
+      "grad_norm": 0.05757494108734914,
+      "learning_rate": 0.0007707234986606043,
+      "loss": 0.615,
+      "step": 15210
+    },
+    {
+      "epoch": 0.7724232463098577,
+      "grad_norm": 0.04085819775177726,
+      "learning_rate": 0.0007705372274871774,
+      "loss": 0.61,
+      "step": 15215
+    },
+    {
+      "epoch": 0.7726770824078892,
+      "grad_norm": 0.04422316744214125,
+      "learning_rate": 0.0007703509032086417,
+      "loss": 0.5625,
+      "step": 15220
+    },
+    {
+      "epoch": 0.7729309185059208,
+      "grad_norm": 0.029609165102383688,
+      "learning_rate": 0.0007701645258615721,
+      "loss": 0.5884,
+      "step": 15225
+    },
+    {
+      "epoch": 0.7731847546039522,
+      "grad_norm": 0.025139781930555237,
+      "learning_rate": 0.0007699780954825534,
+      "loss": 0.599,
+      "step": 15230
+    },
+    {
+      "epoch": 0.7734385907019837,
+      "grad_norm": 0.02759229289936248,
+      "learning_rate": 0.0007697916121081809,
+      "loss": 0.5581,
+      "step": 15235
+    },
+    {
+      "epoch": 0.7736924268000153,
+      "grad_norm": 0.022703912785005373,
+      "learning_rate": 0.0007696050757750603,
+      "loss": 0.5604,
+      "step": 15240
+    },
+    {
+      "epoch": 0.7739462628980467,
+      "grad_norm": 0.026623783300295627,
+      "learning_rate": 0.000769418486519808,
+      "loss": 0.5961,
+      "step": 15245
+    },
+    {
+      "epoch": 0.7742000989960782,
+      "grad_norm": 0.026127263841029254,
+      "learning_rate": 0.0007692318443790503,
+      "loss": 0.5585,
+      "step": 15250
+    },
+    {
+      "epoch": 0.7744539350941098,
+      "grad_norm": 0.02569593152061675,
+      "learning_rate": 0.0007690451493894241,
+      "loss": 0.6041,
+      "step": 15255
+    },
+    {
+      "epoch": 0.7747077711921412,
+      "grad_norm": 0.0256631736374656,
+      "learning_rate": 0.0007688584015875769,
+      "loss": 0.6003,
+      "step": 15260
+    },
+    {
+      "epoch": 0.7749616072901727,
+      "grad_norm": 0.03236587398767528,
+      "learning_rate": 0.0007686716010101663,
+      "loss": 0.642,
+      "step": 15265
+    },
+    {
+      "epoch": 0.7752154433882043,
+      "grad_norm": 0.03721845153105975,
+      "learning_rate": 0.0007684847476938601,
+      "loss": 0.5819,
+      "step": 15270
+    },
+    {
+      "epoch": 0.7754692794862358,
+      "grad_norm": 0.028027879120731618,
+      "learning_rate": 0.0007682978416753371,
+      "loss": 0.5795,
+      "step": 15275
+    },
+    {
+      "epoch": 0.7757231155842672,
+      "grad_norm": 0.02737803847928579,
+      "learning_rate": 0.0007681108829912857,
+      "loss": 0.5746,
+      "step": 15280
+    },
+    {
+      "epoch": 0.7759769516822987,
+      "grad_norm": 0.027303438264093486,
+      "learning_rate": 0.0007679238716784049,
+      "loss": 0.8898,
+      "step": 15285
+    },
+    {
+      "epoch": 0.7762307877803303,
+      "grad_norm": 0.036685326663591804,
+      "learning_rate": 0.0007677368077734045,
+      "loss": 0.5854,
+      "step": 15290
+    },
+    {
+      "epoch": 0.7764846238783617,
+      "grad_norm": 0.1602053406884727,
+      "learning_rate": 0.0007675496913130038,
+      "loss": 0.6604,
+      "step": 15295
+    },
+    {
+      "epoch": 0.7767384599763932,
+      "grad_norm": 0.04847230882404775,
+      "learning_rate": 0.0007673625223339329,
+      "loss": 0.623,
+      "step": 15300
+    },
+    {
+      "epoch": 0.7769922960744248,
+      "grad_norm": 0.05089648219518116,
+      "learning_rate": 0.0007671753008729323,
+      "loss": 0.6436,
+      "step": 15305
+    },
+    {
+      "epoch": 0.7772461321724562,
+      "grad_norm": 0.07145564787304685,
+      "learning_rate": 0.0007669880269667524,
+      "loss": 0.5677,
+      "step": 15310
+    },
+    {
+      "epoch": 0.7774999682704877,
+      "grad_norm": 0.058442914245472245,
+      "learning_rate": 0.0007668007006521544,
+      "loss": 0.6102,
+      "step": 15315
+    },
+    {
+      "epoch": 0.7777538043685193,
+      "grad_norm": 0.04062643001361585,
+      "learning_rate": 0.0007666133219659094,
+      "loss": 0.5994,
+      "step": 15320
+    },
+    {
+      "epoch": 0.7780076404665508,
+      "grad_norm": 0.04022904195831844,
+      "learning_rate": 0.0007664258909447989,
+      "loss": 0.6411,
+      "step": 15325
+    },
+    {
+      "epoch": 0.7782614765645822,
+      "grad_norm": 0.02659845024976935,
+      "learning_rate": 0.0007662384076256146,
+      "loss": 0.6142,
+      "step": 15330
+    },
+    {
+      "epoch": 0.7785153126626138,
+      "grad_norm": 0.04785803634447859,
+      "learning_rate": 0.0007660508720451585,
+      "loss": 0.6396,
+      "step": 15335
+    },
+    {
+      "epoch": 0.7787691487606453,
+      "grad_norm": 0.061095944021831515,
+      "learning_rate": 0.0007658632842402432,
+      "loss": 0.5819,
+      "step": 15340
+    },
+    {
+      "epoch": 0.7790229848586767,
+      "grad_norm": 0.030130348283759538,
+      "learning_rate": 0.0007656756442476911,
+      "loss": 0.5868,
+      "step": 15345
+    },
+    {
+      "epoch": 0.7792768209567082,
+      "grad_norm": 0.028351823717474526,
+      "learning_rate": 0.0007654879521043347,
+      "loss": 0.5797,
+      "step": 15350
+    },
+    {
+      "epoch": 0.7795306570547398,
+      "grad_norm": 0.050575288848887096,
+      "learning_rate": 0.0007653002078470175,
+      "loss": 0.6096,
+      "step": 15355
+    },
+    {
+      "epoch": 0.7797844931527712,
+      "grad_norm": 0.05292190850284731,
+      "learning_rate": 0.0007651124115125924,
+      "loss": 0.5891,
+      "step": 15360
+    },
+    {
+      "epoch": 0.7800383292508027,
+      "grad_norm": 0.03779146555526197,
+      "learning_rate": 0.0007649245631379232,
+      "loss": 0.5974,
+      "step": 15365
+    },
+    {
+      "epoch": 0.7802921653488343,
+      "grad_norm": 0.03100086787490601,
+      "learning_rate": 0.0007647366627598835,
+      "loss": 0.566,
+      "step": 15370
+    },
+    {
+      "epoch": 0.7805460014468658,
+      "grad_norm": 0.030255527750173147,
+      "learning_rate": 0.0007645487104153568,
+      "loss": 0.6128,
+      "step": 15375
+    },
+    {
+      "epoch": 0.7807998375448972,
+      "grad_norm": 0.027165593937075252,
+      "learning_rate": 0.0007643607061412379,
+      "loss": 0.5686,
+      "step": 15380
+    },
+    {
+      "epoch": 0.7810536736429288,
+      "grad_norm": 0.02824615720081996,
+      "learning_rate": 0.0007641726499744306,
+      "loss": 0.582,
+      "step": 15385
+    },
+    {
+      "epoch": 0.7813075097409603,
+      "grad_norm": 0.04126298838740098,
+      "learning_rate": 0.0007639845419518494,
+      "loss": 0.6027,
+      "step": 15390
+    },
+    {
+      "epoch": 0.7815613458389917,
+      "grad_norm": 0.032036059387028706,
+      "learning_rate": 0.0007637963821104192,
+      "loss": 0.5775,
+      "step": 15395
+    },
+    {
+      "epoch": 0.7818151819370233,
+      "grad_norm": 0.028486079119276198,
+      "learning_rate": 0.0007636081704870749,
+      "loss": 0.5682,
+      "step": 15400
+    },
+    {
+      "epoch": 0.7820690180350548,
+      "grad_norm": 0.024905654752730703,
+      "learning_rate": 0.0007634199071187613,
+      "loss": 0.5981,
+      "step": 15405
+    },
+    {
+      "epoch": 0.7823228541330862,
+      "grad_norm": 0.17057789658969538,
+      "learning_rate": 0.0007632315920424335,
+      "loss": 0.5801,
+      "step": 15410
+    },
+    {
+      "epoch": 0.7825766902311178,
+      "grad_norm": 0.03659353032866976,
+      "learning_rate": 0.000763043225295057,
+      "loss": 0.5597,
+      "step": 15415
+    },
+    {
+      "epoch": 0.7828305263291493,
+      "grad_norm": 0.026725388304567464,
+      "learning_rate": 0.0007628548069136071,
+      "loss": 0.5931,
+      "step": 15420
+    },
+    {
+      "epoch": 0.7830843624271808,
+      "grad_norm": 0.037625318866698565,
+      "learning_rate": 0.0007626663369350695,
+      "loss": 0.538,
+      "step": 15425
+    },
+    {
+      "epoch": 0.7833381985252122,
+      "grad_norm": 0.03956094460634958,
+      "learning_rate": 0.0007624778153964398,
+      "loss": 0.5758,
+      "step": 15430
+    },
+    {
+      "epoch": 0.7835920346232438,
+      "grad_norm": 0.02719027610400642,
+      "learning_rate": 0.0007622892423347241,
+      "loss": 0.6001,
+      "step": 15435
+    },
+    {
+      "epoch": 0.7838458707212753,
+      "grad_norm": 0.03100928874647277,
+      "learning_rate": 0.000762100617786938,
+      "loss": 0.6138,
+      "step": 15440
+    },
+    {
+      "epoch": 0.7840997068193067,
+      "grad_norm": 0.027836970066281757,
+      "learning_rate": 0.0007619119417901077,
+      "loss": 0.6017,
+      "step": 15445
+    },
+    {
+      "epoch": 0.7843535429173383,
+      "grad_norm": 0.03469260516526115,
+      "learning_rate": 0.0007617232143812693,
+      "loss": 0.5756,
+      "step": 15450
+    },
+    {
+      "epoch": 0.7846073790153698,
+      "grad_norm": 0.03935777037891353,
+      "learning_rate": 0.0007615344355974694,
+      "loss": 0.595,
+      "step": 15455
+    },
+    {
+      "epoch": 0.7848612151134012,
+      "grad_norm": 0.02501991833734147,
+      "learning_rate": 0.0007613456054757639,
+      "loss": 0.6073,
+      "step": 15460
+    },
+    {
+      "epoch": 0.7851150512114328,
+      "grad_norm": 0.02333012899999052,
+      "learning_rate": 0.0007611567240532193,
+      "loss": 0.5868,
+      "step": 15465
+    },
+    {
+      "epoch": 0.7853688873094643,
+      "grad_norm": 0.03818225366150291,
+      "learning_rate": 0.0007609677913669124,
+      "loss": 0.599,
+      "step": 15470
+    },
+    {
+      "epoch": 0.7856227234074957,
+      "grad_norm": 0.028704075914218637,
+      "learning_rate": 0.0007607788074539293,
+      "loss": 0.5973,
+      "step": 15475
+    },
+    {
+      "epoch": 0.7858765595055273,
+      "grad_norm": 0.021928136458700793,
+      "learning_rate": 0.0007605897723513669,
+      "loss": 0.593,
+      "step": 15480
+    },
+    {
+      "epoch": 0.7861303956035588,
+      "grad_norm": 0.025069154514387224,
+      "learning_rate": 0.0007604006860963315,
+      "loss": 0.5762,
+      "step": 15485
+    },
+    {
+      "epoch": 0.7863842317015903,
+      "grad_norm": 0.029790848879588383,
+      "learning_rate": 0.0007602115487259403,
+      "loss": 0.5952,
+      "step": 15490
+    },
+    {
+      "epoch": 0.7866380677996218,
+      "grad_norm": 0.030994892822406125,
+      "learning_rate": 0.0007600223602773198,
+      "loss": 0.6024,
+      "step": 15495
+    },
+    {
+      "epoch": 0.7868919038976533,
+      "grad_norm": 0.028391213579639876,
+      "learning_rate": 0.0007598331207876066,
+      "loss": 0.58,
+      "step": 15500
+    },
+    {
+      "epoch": 0.7871457399956848,
+      "grad_norm": 0.04047000777595839,
+      "learning_rate": 0.0007596438302939475,
+      "loss": 0.5813,
+      "step": 15505
+    },
+    {
+      "epoch": 0.7873995760937162,
+      "grad_norm": 0.032259318761633765,
+      "learning_rate": 0.0007594544888334994,
+      "loss": 0.583,
+      "step": 15510
+    },
+    {
+      "epoch": 0.7876534121917478,
+      "grad_norm": 0.036383271252755846,
+      "learning_rate": 0.0007592650964434292,
+      "loss": 0.6082,
+      "step": 15515
+    },
+    {
+      "epoch": 0.7879072482897793,
+      "grad_norm": 0.046019554411827555,
+      "learning_rate": 0.0007590756531609133,
+      "loss": 0.6063,
+      "step": 15520
+    },
+    {
+      "epoch": 0.7881610843878107,
+      "grad_norm": 0.027578051274795613,
+      "learning_rate": 0.0007588861590231388,
+      "loss": 0.5804,
+      "step": 15525
+    },
+    {
+      "epoch": 0.7884149204858423,
+      "grad_norm": 0.030164791483657458,
+      "learning_rate": 0.0007586966140673024,
+      "loss": 0.5828,
+      "step": 15530
+    },
+    {
+      "epoch": 0.7886687565838738,
+      "grad_norm": 0.029268754849734627,
+      "learning_rate": 0.0007585070183306106,
+      "loss": 0.5624,
+      "step": 15535
+    },
+    {
+      "epoch": 0.7889225926819053,
+      "grad_norm": 0.026293524029202597,
+      "learning_rate": 0.0007583173718502803,
+      "loss": 0.5694,
+      "step": 15540
+    },
+    {
+      "epoch": 0.7891764287799368,
+      "grad_norm": 0.03225939541940473,
+      "learning_rate": 0.0007581276746635383,
+      "loss": 0.5923,
+      "step": 15545
+    },
+    {
+      "epoch": 0.7894302648779683,
+      "grad_norm": 0.02421244006898213,
+      "learning_rate": 0.000757937926807621,
+      "loss": 0.5453,
+      "step": 15550
+    },
+    {
+      "epoch": 0.7896841009759998,
+      "grad_norm": 0.03888331896268403,
+      "learning_rate": 0.0007577481283197749,
+      "loss": 0.6002,
+      "step": 15555
+    },
+    {
+      "epoch": 0.7899379370740313,
+      "grad_norm": 0.031178413757377548,
+      "learning_rate": 0.0007575582792372567,
+      "loss": 0.5885,
+      "step": 15560
+    },
+    {
+      "epoch": 0.7901917731720628,
+      "grad_norm": 0.02453845415254218,
+      "learning_rate": 0.0007573683795973328,
+      "loss": 0.5623,
+      "step": 15565
+    },
+    {
+      "epoch": 0.7904456092700943,
+      "grad_norm": 0.03542656617397271,
+      "learning_rate": 0.0007571784294372792,
+      "loss": 0.6026,
+      "step": 15570
+    },
+    {
+      "epoch": 0.7906994453681258,
+      "grad_norm": 0.024593970240234127,
+      "learning_rate": 0.0007569884287943826,
+      "loss": 0.5946,
+      "step": 15575
+    },
+    {
+      "epoch": 0.7909532814661573,
+      "grad_norm": 0.034196088349742554,
+      "learning_rate": 0.000756798377705939,
+      "loss": 0.6394,
+      "step": 15580
+    },
+    {
+      "epoch": 0.7912071175641888,
+      "grad_norm": 0.03593770916178315,
+      "learning_rate": 0.0007566082762092546,
+      "loss": 0.6134,
+      "step": 15585
+    },
+    {
+      "epoch": 0.7914609536622204,
+      "grad_norm": 0.02744220052589826,
+      "learning_rate": 0.0007564181243416453,
+      "loss": 0.574,
+      "step": 15590
+    },
+    {
+      "epoch": 0.7917147897602518,
+      "grad_norm": 0.04256423578944382,
+      "learning_rate": 0.0007562279221404368,
+      "loss": 0.5861,
+      "step": 15595
+    },
+    {
+      "epoch": 0.7919686258582833,
+      "grad_norm": 0.036644059764111316,
+      "learning_rate": 0.0007560376696429651,
+      "loss": 0.5489,
+      "step": 15600
+    },
+    {
+      "epoch": 0.7922224619563148,
+      "grad_norm": 0.03260114588100312,
+      "learning_rate": 0.0007558473668865755,
+      "loss": 0.5637,
+      "step": 15605
+    },
+    {
+      "epoch": 0.7924762980543463,
+      "grad_norm": 0.024808409878328188,
+      "learning_rate": 0.0007556570139086239,
+      "loss": 0.593,
+      "step": 15610
+    },
+    {
+      "epoch": 0.7927301341523778,
+      "grad_norm": 0.04023189432619606,
+      "learning_rate": 0.0007554666107464754,
+      "loss": 0.5664,
+      "step": 15615
+    },
+    {
+      "epoch": 0.7929839702504093,
+      "grad_norm": 0.02342950434065168,
+      "learning_rate": 0.0007552761574375052,
+      "loss": 0.5895,
+      "step": 15620
+    },
+    {
+      "epoch": 0.7932378063484408,
+      "grad_norm": 0.03876234729699636,
+      "learning_rate": 0.0007550856540190985,
+      "loss": 0.5723,
+      "step": 15625
+    },
+    {
+      "epoch": 0.7934916424464723,
+      "grad_norm": 0.023809875697390476,
+      "learning_rate": 0.0007548951005286498,
+      "loss": 0.5897,
+      "step": 15630
+    },
+    {
+      "epoch": 0.7937454785445038,
+      "grad_norm": 0.025531257953473226,
+      "learning_rate": 0.0007547044970035641,
+      "loss": 0.5728,
+      "step": 15635
+    },
+    {
+      "epoch": 0.7939993146425354,
+      "grad_norm": 0.023549442711572447,
+      "learning_rate": 0.0007545138434812559,
+      "loss": 0.5479,
+      "step": 15640
+    },
+    {
+      "epoch": 0.7942531507405668,
+      "grad_norm": 0.03393140892826125,
+      "learning_rate": 0.0007543231399991495,
+      "loss": 0.591,
+      "step": 15645
+    },
+    {
+      "epoch": 0.7945069868385983,
+      "grad_norm": 0.026850249760398237,
+      "learning_rate": 0.0007541323865946789,
+      "loss": 0.5756,
+      "step": 15650
+    },
+    {
+      "epoch": 0.7947608229366299,
+      "grad_norm": 0.02644298213479883,
+      "learning_rate": 0.0007539415833052882,
+      "loss": 0.6027,
+      "step": 15655
+    },
+    {
+      "epoch": 0.7950146590346613,
+      "grad_norm": 1.7634760340274238,
+      "learning_rate": 0.0007537507301684312,
+      "loss": 0.6009,
+      "step": 15660
+    },
+    {
+      "epoch": 0.7952684951326928,
+      "grad_norm": 0.05116113095501194,
+      "learning_rate": 0.0007535598272215712,
+      "loss": 0.6035,
+      "step": 15665
+    },
+    {
+      "epoch": 0.7955223312307244,
+      "grad_norm": 0.09123746164702429,
+      "learning_rate": 0.0007533688745021817,
+      "loss": 0.5869,
+      "step": 15670
+    },
+    {
+      "epoch": 0.7957761673287558,
+      "grad_norm": 0.04008705946671749,
+      "learning_rate": 0.0007531778720477457,
+      "loss": 0.6197,
+      "step": 15675
+    },
+    {
+      "epoch": 0.7960300034267873,
+      "grad_norm": 0.2577476743899855,
+      "learning_rate": 0.000752986819895756,
+      "loss": 0.6145,
+      "step": 15680
+    },
+    {
+      "epoch": 0.7962838395248188,
+      "grad_norm": 0.04408392445254025,
+      "learning_rate": 0.0007527957180837152,
+      "loss": 0.6326,
+      "step": 15685
+    },
+    {
+      "epoch": 0.7965376756228503,
+      "grad_norm": 0.03340637676201848,
+      "learning_rate": 0.0007526045666491355,
+      "loss": 0.6089,
+      "step": 15690
+    },
+    {
+      "epoch": 0.7967915117208818,
+      "grad_norm": 0.03220967045008419,
+      "learning_rate": 0.0007524133656295392,
+      "loss": 0.5869,
+      "step": 15695
+    },
+    {
+      "epoch": 0.7970453478189133,
+      "grad_norm": 0.03674143282021481,
+      "learning_rate": 0.0007522221150624579,
+      "loss": 0.6527,
+      "step": 15700
+    },
+    {
+      "epoch": 0.7972991839169449,
+      "grad_norm": 0.02344499359362633,
+      "learning_rate": 0.0007520308149854336,
+      "loss": 0.5767,
+      "step": 15705
+    },
+    {
+      "epoch": 0.7975530200149763,
+      "grad_norm": 0.030359982821470545,
+      "learning_rate": 0.0007518394654360169,
+      "loss": 0.5843,
+      "step": 15710
+    },
+    {
+      "epoch": 0.7978068561130078,
+      "grad_norm": 0.03557247505749754,
+      "learning_rate": 0.000751648066451769,
+      "loss": 0.608,
+      "step": 15715
+    },
+    {
+      "epoch": 0.7980606922110394,
+      "grad_norm": 0.027894981110671378,
+      "learning_rate": 0.0007514566180702609,
+      "loss": 0.5726,
+      "step": 15720
+    },
+    {
+      "epoch": 0.7983145283090708,
+      "grad_norm": 0.03575658449386519,
+      "learning_rate": 0.0007512651203290723,
+      "loss": 0.616,
+      "step": 15725
+    },
+    {
+      "epoch": 0.7985683644071023,
+      "grad_norm": 0.031795758337900826,
+      "learning_rate": 0.000751073573265794,
+      "loss": 0.5772,
+      "step": 15730
+    },
+    {
+      "epoch": 0.7988222005051339,
+      "grad_norm": 0.03329640143386616,
+      "learning_rate": 0.0007508819769180252,
+      "loss": 0.5722,
+      "step": 15735
+    },
+    {
+      "epoch": 0.7990760366031653,
+      "grad_norm": 0.028561079497327932,
+      "learning_rate": 0.0007506903313233755,
+      "loss": 0.5843,
+      "step": 15740
+    },
+    {
+      "epoch": 0.7993298727011968,
+      "grad_norm": 0.03499905510867631,
+      "learning_rate": 0.0007504986365194639,
+      "loss": 0.5441,
+      "step": 15745
+    },
+    {
+      "epoch": 0.7995837087992284,
+      "grad_norm": 0.02584998094099768,
+      "learning_rate": 0.0007503068925439194,
+      "loss": 0.5588,
+      "step": 15750
+    },
+    {
+      "epoch": 0.7998375448972599,
+      "grad_norm": 0.02484740511899366,
+      "learning_rate": 0.00075011509943438,
+      "loss": 0.5439,
+      "step": 15755
+    },
+    {
+      "epoch": 0.8000913809952913,
+      "grad_norm": 0.05001793472695475,
+      "learning_rate": 0.0007499232572284938,
+      "loss": 0.5912,
+      "step": 15760
+    },
+    {
+      "epoch": 0.8003452170933228,
+      "grad_norm": 0.02822375304428738,
+      "learning_rate": 0.0007497313659639188,
+      "loss": 0.5858,
+      "step": 15765
+    },
+    {
+      "epoch": 0.8005990531913544,
+      "grad_norm": 0.025776424090071148,
+      "learning_rate": 0.0007495394256783219,
+      "loss": 0.5725,
+      "step": 15770
+    },
+    {
+      "epoch": 0.8008528892893858,
+      "grad_norm": 0.03319633421805169,
+      "learning_rate": 0.0007493474364093803,
+      "loss": 0.5897,
+      "step": 15775
+    },
+    {
+      "epoch": 0.8011067253874173,
+      "grad_norm": 0.04258494801205897,
+      "learning_rate": 0.0007491553981947804,
+      "loss": 0.5431,
+      "step": 15780
+    },
+    {
+      "epoch": 0.8013605614854489,
+      "grad_norm": 0.028145407612735995,
+      "learning_rate": 0.0007489633110722183,
+      "loss": 0.5549,
+      "step": 15785
+    },
+    {
+      "epoch": 0.8016143975834803,
+      "grad_norm": 0.026623484939927906,
+      "learning_rate": 0.0007487711750793998,
+      "loss": 0.595,
+      "step": 15790
+    },
+    {
+      "epoch": 0.8018682336815118,
+      "grad_norm": 0.022934168464128556,
+      "learning_rate": 0.0007485789902540403,
+      "loss": 0.5527,
+      "step": 15795
+    },
+    {
+      "epoch": 0.8021220697795434,
+      "grad_norm": 0.025252927285032714,
+      "learning_rate": 0.0007483867566338647,
+      "loss": 0.5876,
+      "step": 15800
+    },
+    {
+      "epoch": 0.8023759058775749,
+      "grad_norm": 0.028849244701322482,
+      "learning_rate": 0.0007481944742566076,
+      "loss": 0.5715,
+      "step": 15805
+    },
+    {
+      "epoch": 0.8026297419756063,
+      "grad_norm": 0.026220259487405768,
+      "learning_rate": 0.0007480021431600128,
+      "loss": 0.6162,
+      "step": 15810
+    },
+    {
+      "epoch": 0.8028835780736379,
+      "grad_norm": 0.021106176901375888,
+      "learning_rate": 0.000747809763381834,
+      "loss": 0.5713,
+      "step": 15815
+    },
+    {
+      "epoch": 0.8031374141716694,
+      "grad_norm": 0.02762824758071424,
+      "learning_rate": 0.0007476173349598345,
+      "loss": 0.5849,
+      "step": 15820
+    },
+    {
+      "epoch": 0.8033912502697008,
+      "grad_norm": 0.05745838787319821,
+      "learning_rate": 0.000747424857931787,
+      "loss": 0.5462,
+      "step": 15825
+    },
+    {
+      "epoch": 0.8036450863677324,
+      "grad_norm": 0.040680994894634726,
+      "learning_rate": 0.0007472323323354739,
+      "loss": 0.5626,
+      "step": 15830
+    },
+    {
+      "epoch": 0.8038989224657639,
+      "grad_norm": 0.029285643222152852,
+      "learning_rate": 0.0007470397582086869,
+      "loss": 0.598,
+      "step": 15835
+    },
+    {
+      "epoch": 0.8041527585637953,
+      "grad_norm": 0.03200481493108892,
+      "learning_rate": 0.0007468471355892275,
+      "loss": 0.5663,
+      "step": 15840
+    },
+    {
+      "epoch": 0.8044065946618268,
+      "grad_norm": 0.02381141335581086,
+      "learning_rate": 0.0007466544645149061,
+      "loss": 0.5944,
+      "step": 15845
+    },
+    {
+      "epoch": 0.8046604307598584,
+      "grad_norm": 0.02766719856403892,
+      "learning_rate": 0.0007464617450235434,
+      "loss": 0.5815,
+      "step": 15850
+    },
+    {
+      "epoch": 0.8049142668578899,
+      "grad_norm": 0.04488529656718956,
+      "learning_rate": 0.0007462689771529695,
+      "loss": 0.5526,
+      "step": 15855
+    },
+    {
+      "epoch": 0.8051681029559213,
+      "grad_norm": 0.034731993566618206,
+      "learning_rate": 0.0007460761609410233,
+      "loss": 0.6007,
+      "step": 15860
+    },
+    {
+      "epoch": 0.8054219390539529,
+      "grad_norm": 0.025791063644137125,
+      "learning_rate": 0.000745883296425554,
+      "loss": 0.5695,
+      "step": 15865
+    },
+    {
+      "epoch": 0.8056757751519844,
+      "grad_norm": 0.028080587090770482,
+      "learning_rate": 0.00074569038364442,
+      "loss": 0.556,
+      "step": 15870
+    },
+    {
+      "epoch": 0.8059296112500158,
+      "grad_norm": 0.035411194412573295,
+      "learning_rate": 0.0007454974226354887,
+      "loss": 0.5774,
+      "step": 15875
+    },
+    {
+      "epoch": 0.8061834473480474,
+      "grad_norm": 0.03763595723144201,
+      "learning_rate": 0.0007453044134366377,
+      "loss": 0.5604,
+      "step": 15880
+    },
+    {
+      "epoch": 0.8064372834460789,
+      "grad_norm": 0.026026178191929684,
+      "learning_rate": 0.0007451113560857537,
+      "loss": 0.5668,
+      "step": 15885
+    },
+    {
+      "epoch": 0.8066911195441103,
+      "grad_norm": 0.023998437380902855,
+      "learning_rate": 0.0007449182506207328,
+      "loss": 0.5542,
+      "step": 15890
+    },
+    {
+      "epoch": 0.8069449556421419,
+      "grad_norm": 0.02465158291279762,
+      "learning_rate": 0.0007447250970794807,
+      "loss": 0.585,
+      "step": 15895
+    },
+    {
+      "epoch": 0.8071987917401734,
+      "grad_norm": 0.03811256192460854,
+      "learning_rate": 0.0007445318954999126,
+      "loss": 0.5816,
+      "step": 15900
+    },
+    {
+      "epoch": 0.8074526278382048,
+      "grad_norm": 0.03160589407825792,
+      "learning_rate": 0.0007443386459199528,
+      "loss": 0.577,
+      "step": 15905
+    },
+    {
+      "epoch": 0.8077064639362364,
+      "grad_norm": 0.028906762540522022,
+      "learning_rate": 0.0007441453483775354,
+      "loss": 0.6078,
+      "step": 15910
+    },
+    {
+      "epoch": 0.8079603000342679,
+      "grad_norm": 0.036054008668968705,
+      "learning_rate": 0.0007439520029106035,
+      "loss": 0.5942,
+      "step": 15915
+    },
+    {
+      "epoch": 0.8082141361322994,
+      "grad_norm": 0.026999292449829813,
+      "learning_rate": 0.0007437586095571102,
+      "loss": 0.5836,
+      "step": 15920
+    },
+    {
+      "epoch": 0.8084679722303308,
+      "grad_norm": 0.027971628557511584,
+      "learning_rate": 0.0007435651683550173,
+      "loss": 0.5629,
+      "step": 15925
+    },
+    {
+      "epoch": 0.8087218083283624,
+      "grad_norm": 0.036526997758639046,
+      "learning_rate": 0.0007433716793422967,
+      "loss": 0.5892,
+      "step": 15930
+    },
+    {
+      "epoch": 0.8089756444263939,
+      "grad_norm": 0.03567724658137178,
+      "learning_rate": 0.0007431781425569289,
+      "loss": 0.5557,
+      "step": 15935
+    },
+    {
+      "epoch": 0.8092294805244253,
+      "grad_norm": 0.024802333956108272,
+      "learning_rate": 0.0007429845580369046,
+      "loss": 0.5618,
+      "step": 15940
+    },
+    {
+      "epoch": 0.8094833166224569,
+      "grad_norm": 0.022538774195908166,
+      "learning_rate": 0.0007427909258202232,
+      "loss": 0.583,
+      "step": 15945
+    },
+    {
+      "epoch": 0.8097371527204884,
+      "grad_norm": 0.025056735200451364,
+      "learning_rate": 0.0007425972459448941,
+      "loss": 0.5647,
+      "step": 15950
+    },
+    {
+      "epoch": 0.8099909888185198,
+      "grad_norm": 0.03851297901292128,
+      "learning_rate": 0.0007424035184489352,
+      "loss": 0.5914,
+      "step": 15955
+    },
+    {
+      "epoch": 0.8102448249165514,
+      "grad_norm": 0.04099783815757058,
+      "learning_rate": 0.0007422097433703748,
+      "loss": 0.5802,
+      "step": 15960
+    },
+    {
+      "epoch": 0.8104986610145829,
+      "grad_norm": 0.03222527568690099,
+      "learning_rate": 0.0007420159207472494,
+      "loss": 0.5839,
+      "step": 15965
+    },
+    {
+      "epoch": 0.8107524971126144,
+      "grad_norm": 0.024865699732427023,
+      "learning_rate": 0.0007418220506176058,
+      "loss": 0.5913,
+      "step": 15970
+    },
+    {
+      "epoch": 0.8110063332106459,
+      "grad_norm": 0.035177466344471775,
+      "learning_rate": 0.0007416281330194996,
+      "loss": 0.5812,
+      "step": 15975
+    },
+    {
+      "epoch": 0.8112601693086774,
+      "grad_norm": 0.024089520192005706,
+      "learning_rate": 0.0007414341679909958,
+      "loss": 0.609,
+      "step": 15980
+    },
+    {
+      "epoch": 0.8115140054067089,
+      "grad_norm": 0.023966198926588644,
+      "learning_rate": 0.0007412401555701689,
+      "loss": 0.5816,
+      "step": 15985
+    },
+    {
+      "epoch": 0.8117678415047404,
+      "grad_norm": 0.09247687498068556,
+      "learning_rate": 0.0007410460957951026,
+      "loss": 0.5439,
+      "step": 15990
+    },
+    {
+      "epoch": 0.8120216776027719,
+      "grad_norm": 0.026101986363411846,
+      "learning_rate": 0.0007408519887038898,
+      "loss": 0.5656,
+      "step": 15995
+    },
+    {
+      "epoch": 0.8122755137008034,
+      "grad_norm": 0.024600475072470263,
+      "learning_rate": 0.0007406578343346327,
+      "loss": 0.5966,
+      "step": 16000
+    },
+    {
+      "epoch": 0.8125293497988348,
+      "grad_norm": 0.02581395178412151,
+      "learning_rate": 0.0007404636327254428,
+      "loss": 0.5841,
+      "step": 16005
+    },
+    {
+      "epoch": 0.8127831858968664,
+      "grad_norm": 0.06338067688891946,
+      "learning_rate": 0.000740269383914441,
+      "loss": 0.5751,
+      "step": 16010
+    },
+    {
+      "epoch": 0.8130370219948979,
+      "grad_norm": 0.02366983052130053,
+      "learning_rate": 0.0007400750879397576,
+      "loss": 0.5536,
+      "step": 16015
+    },
+    {
+      "epoch": 0.8132908580929294,
+      "grad_norm": 0.0363662136727588,
+      "learning_rate": 0.0007398807448395314,
+      "loss": 0.5709,
+      "step": 16020
+    },
+    {
+      "epoch": 0.8135446941909609,
+      "grad_norm": 0.04335279844371627,
+      "learning_rate": 0.0007396863546519113,
+      "loss": 0.5775,
+      "step": 16025
+    },
+    {
+      "epoch": 0.8137985302889924,
+      "grad_norm": 0.028807809521302862,
+      "learning_rate": 0.0007394919174150552,
+      "loss": 0.587,
+      "step": 16030
+    },
+    {
+      "epoch": 0.8140523663870239,
+      "grad_norm": 0.03090810771638599,
+      "learning_rate": 0.0007392974331671301,
+      "loss": 0.5813,
+      "step": 16035
+    },
+    {
+      "epoch": 0.8143062024850554,
+      "grad_norm": 0.03332179615924521,
+      "learning_rate": 0.0007391029019463121,
+      "loss": 0.5748,
+      "step": 16040
+    },
+    {
+      "epoch": 0.8145600385830869,
+      "grad_norm": 0.02260131038125526,
+      "learning_rate": 0.0007389083237907869,
+      "loss": 0.5357,
+      "step": 16045
+    },
+    {
+      "epoch": 0.8148138746811184,
+      "grad_norm": 0.03144023339848837,
+      "learning_rate": 0.0007387136987387493,
+      "loss": 0.5479,
+      "step": 16050
+    },
+    {
+      "epoch": 0.8150677107791499,
+      "grad_norm": 0.04852227270793531,
+      "learning_rate": 0.0007385190268284028,
+      "loss": 0.562,
+      "step": 16055
+    },
+    {
+      "epoch": 0.8153215468771814,
+      "grad_norm": 0.0238226778020187,
+      "learning_rate": 0.000738324308097961,
+      "loss": 0.5748,
+      "step": 16060
+    },
+    {
+      "epoch": 0.8155753829752129,
+      "grad_norm": 0.024025342394617455,
+      "learning_rate": 0.0007381295425856461,
+      "loss": 0.5779,
+      "step": 16065
+    },
+    {
+      "epoch": 0.8158292190732445,
+      "grad_norm": 0.025164599314322966,
+      "learning_rate": 0.0007379347303296895,
+      "loss": 0.5699,
+      "step": 16070
+    },
+    {
+      "epoch": 0.8160830551712759,
+      "grad_norm": 0.024454600763767285,
+      "learning_rate": 0.0007377398713683319,
+      "loss": 0.5647,
+      "step": 16075
+    },
+    {
+      "epoch": 0.8163368912693074,
+      "grad_norm": 0.0465643154498082,
+      "learning_rate": 0.0007375449657398232,
+      "loss": 0.6121,
+      "step": 16080
+    },
+    {
+      "epoch": 0.816590727367339,
+      "grad_norm": 0.05708836406856427,
+      "learning_rate": 0.0007373500134824224,
+      "loss": 0.552,
+      "step": 16085
+    },
+    {
+      "epoch": 0.8168445634653704,
+      "grad_norm": 0.03307721737401848,
+      "learning_rate": 0.0007371550146343976,
+      "loss": 0.5815,
+      "step": 16090
+    },
+    {
+      "epoch": 0.8170983995634019,
+      "grad_norm": 0.022676746585353742,
+      "learning_rate": 0.0007369599692340261,
+      "loss": 0.5196,
+      "step": 16095
+    },
+    {
+      "epoch": 0.8173522356614334,
+      "grad_norm": 0.022835698320630423,
+      "learning_rate": 0.0007367648773195942,
+      "loss": 0.5688,
+      "step": 16100
+    },
+    {
+      "epoch": 0.8176060717594649,
+      "grad_norm": 0.024353558374283157,
+      "learning_rate": 0.000736569738929398,
+      "loss": 0.5533,
+      "step": 16105
+    },
+    {
+      "epoch": 0.8178599078574964,
+      "grad_norm": 0.035753016297181556,
+      "learning_rate": 0.0007363745541017415,
+      "loss": 0.5759,
+      "step": 16110
+    },
+    {
+      "epoch": 0.8181137439555279,
+      "grad_norm": 0.04602579365083806,
+      "learning_rate": 0.0007361793228749387,
+      "loss": 0.6057,
+      "step": 16115
+    },
+    {
+      "epoch": 0.8183675800535594,
+      "grad_norm": 0.02654832147555816,
+      "learning_rate": 0.0007359840452873129,
+      "loss": 0.5732,
+      "step": 16120
+    },
+    {
+      "epoch": 0.8186214161515909,
+      "grad_norm": 0.029689830503051053,
+      "learning_rate": 0.0007357887213771958,
+      "loss": 0.5735,
+      "step": 16125
+    },
+    {
+      "epoch": 0.8188752522496224,
+      "grad_norm": 0.02922774898175317,
+      "learning_rate": 0.0007355933511829286,
+      "loss": 0.5886,
+      "step": 16130
+    },
+    {
+      "epoch": 0.819129088347654,
+      "grad_norm": 0.02346950210966065,
+      "learning_rate": 0.0007353979347428614,
+      "loss": 0.5436,
+      "step": 16135
+    },
+    {
+      "epoch": 0.8193829244456854,
+      "grad_norm": 0.4292564062685949,
+      "learning_rate": 0.0007352024720953536,
+      "loss": 0.5475,
+      "step": 16140
+    },
+    {
+      "epoch": 0.8196367605437169,
+      "grad_norm": 0.054772914446169205,
+      "learning_rate": 0.0007350069632787734,
+      "loss": 0.5315,
+      "step": 16145
+    },
+    {
+      "epoch": 0.8198905966417485,
+      "grad_norm": 0.05912804272652365,
+      "learning_rate": 0.0007348114083314984,
+      "loss": 0.5795,
+      "step": 16150
+    },
+    {
+      "epoch": 0.8201444327397799,
+      "grad_norm": 0.030284659576007726,
+      "learning_rate": 0.0007346158072919149,
+      "loss": 0.5843,
+      "step": 16155
+    },
+    {
+      "epoch": 0.8203982688378114,
+      "grad_norm": 0.023151058600271775,
+      "learning_rate": 0.0007344201601984185,
+      "loss": 0.5864,
+      "step": 16160
+    },
+    {
+      "epoch": 0.820652104935843,
+      "grad_norm": 0.03376567799773459,
+      "learning_rate": 0.0007342244670894136,
+      "loss": 0.5571,
+      "step": 16165
+    },
+    {
+      "epoch": 0.8209059410338744,
+      "grad_norm": 0.03959840237334942,
+      "learning_rate": 0.000734028728003314,
+      "loss": 0.5696,
+      "step": 16170
+    },
+    {
+      "epoch": 0.8211597771319059,
+      "grad_norm": 0.02935275476865624,
+      "learning_rate": 0.000733832942978542,
+      "loss": 0.5647,
+      "step": 16175
+    },
+    {
+      "epoch": 0.8214136132299374,
+      "grad_norm": 0.03823963239401788,
+      "learning_rate": 0.0007336371120535295,
+      "loss": 0.5742,
+      "step": 16180
+    },
+    {
+      "epoch": 0.821667449327969,
+      "grad_norm": 0.024101714729295634,
+      "learning_rate": 0.0007334412352667173,
+      "loss": 0.6284,
+      "step": 16185
+    },
+    {
+      "epoch": 0.8219212854260004,
+      "grad_norm": 2.693520215640975,
+      "learning_rate": 0.0007332453126565545,
+      "loss": 0.6209,
+      "step": 16190
+    },
+    {
+      "epoch": 0.8221751215240319,
+      "grad_norm": 0.03279854521465908,
+      "learning_rate": 0.0007330493442615,
+      "loss": 0.5537,
+      "step": 16195
+    },
+    {
+      "epoch": 0.8224289576220635,
+      "grad_norm": 0.044838327703993694,
+      "learning_rate": 0.0007328533301200216,
+      "loss": 0.557,
+      "step": 16200
+    },
+    {
+      "epoch": 0.8226827937200949,
+      "grad_norm": 0.02638836436847605,
+      "learning_rate": 0.0007326572702705958,
+      "loss": 0.5701,
+      "step": 16205
+    },
+    {
+      "epoch": 0.8229366298181264,
+      "grad_norm": 0.04922931478557384,
+      "learning_rate": 0.0007324611647517078,
+      "loss": 0.56,
+      "step": 16210
+    },
+    {
+      "epoch": 0.823190465916158,
+      "grad_norm": 0.03891712292801701,
+      "learning_rate": 0.0007322650136018527,
+      "loss": 0.5869,
+      "step": 16215
+    },
+    {
+      "epoch": 0.8234443020141894,
+      "grad_norm": 0.04138342114685851,
+      "learning_rate": 0.0007320688168595338,
+      "loss": 0.6059,
+      "step": 16220
+    },
+    {
+      "epoch": 0.8236981381122209,
+      "grad_norm": 0.02918011994976219,
+      "learning_rate": 0.0007318725745632632,
+      "loss": 0.5551,
+      "step": 16225
+    },
+    {
+      "epoch": 0.8239519742102525,
+      "grad_norm": 0.0419742382923944,
+      "learning_rate": 0.0007316762867515627,
+      "loss": 0.5673,
+      "step": 16230
+    },
+    {
+      "epoch": 0.824205810308284,
+      "grad_norm": 0.030211128758018017,
+      "learning_rate": 0.0007314799534629625,
+      "loss": 0.6303,
+      "step": 16235
+    },
+    {
+      "epoch": 0.8244596464063154,
+      "grad_norm": 0.02340354696643915,
+      "learning_rate": 0.0007312835747360018,
+      "loss": 0.5603,
+      "step": 16240
+    },
+    {
+      "epoch": 0.824713482504347,
+      "grad_norm": 0.042394083569737895,
+      "learning_rate": 0.0007310871506092287,
+      "loss": 0.5427,
+      "step": 16245
+    },
+    {
+      "epoch": 0.8249673186023785,
+      "grad_norm": 0.04912858556363834,
+      "learning_rate": 0.0007308906811212004,
+      "loss": 0.5917,
+      "step": 16250
+    },
+    {
+      "epoch": 0.8252211547004099,
+      "grad_norm": 0.03971810340456524,
+      "learning_rate": 0.000730694166310483,
+      "loss": 0.562,
+      "step": 16255
+    },
+    {
+      "epoch": 0.8254749907984414,
+      "grad_norm": 0.02785763337910464,
+      "learning_rate": 0.0007304976062156512,
+      "loss": 0.5795,
+      "step": 16260
+    },
+    {
+      "epoch": 0.825728826896473,
+      "grad_norm": 0.033158827041856395,
+      "learning_rate": 0.0007303010008752886,
+      "loss": 0.5688,
+      "step": 16265
+    },
+    {
+      "epoch": 0.8259826629945044,
+      "grad_norm": 0.030706479795822323,
+      "learning_rate": 0.0007301043503279881,
+      "loss": 0.5976,
+      "step": 16270
+    },
+    {
+      "epoch": 0.8262364990925359,
+      "grad_norm": 0.030873986022704285,
+      "learning_rate": 0.0007299076546123512,
+      "loss": 0.5715,
+      "step": 16275
+    },
+    {
+      "epoch": 0.8264903351905675,
+      "grad_norm": 0.0474915666866346,
+      "learning_rate": 0.0007297109137669882,
+      "loss": 0.563,
+      "step": 16280
+    },
+    {
+      "epoch": 0.826744171288599,
+      "grad_norm": 0.03893683462709932,
+      "learning_rate": 0.0007295141278305185,
+      "loss": 0.5592,
+      "step": 16285
+    },
+    {
+      "epoch": 0.8269980073866304,
+      "grad_norm": 0.03111456293981649,
+      "learning_rate": 0.0007293172968415701,
+      "loss": 0.5792,
+      "step": 16290
+    },
+    {
+      "epoch": 0.827251843484662,
+      "grad_norm": 0.024853102754921644,
+      "learning_rate": 0.0007291204208387798,
+      "loss": 0.58,
+      "step": 16295
+    },
+    {
+      "epoch": 0.8275056795826935,
+      "grad_norm": 0.0251707952164745,
+      "learning_rate": 0.0007289234998607935,
+      "loss": 0.5397,
+      "step": 16300
+    },
+    {
+      "epoch": 0.8277595156807249,
+      "grad_norm": 0.03637341065833897,
+      "learning_rate": 0.000728726533946266,
+      "loss": 0.5889,
+      "step": 16305
+    },
+    {
+      "epoch": 0.8280133517787565,
+      "grad_norm": 0.025249035570284573,
+      "learning_rate": 0.0007285295231338605,
+      "loss": 0.5608,
+      "step": 16310
+    },
+    {
+      "epoch": 0.828267187876788,
+      "grad_norm": 0.03480589568385215,
+      "learning_rate": 0.0007283324674622491,
+      "loss": 0.5958,
+      "step": 16315
+    },
+    {
+      "epoch": 0.8285210239748194,
+      "grad_norm": 0.06002094158028185,
+      "learning_rate": 0.0007281353669701131,
+      "loss": 0.5949,
+      "step": 16320
+    },
+    {
+      "epoch": 0.828774860072851,
+      "grad_norm": 0.02645055337203729,
+      "learning_rate": 0.0007279382216961426,
+      "loss": 0.5696,
+      "step": 16325
+    },
+    {
+      "epoch": 0.8290286961708825,
+      "grad_norm": 0.026182888420755522,
+      "learning_rate": 0.0007277410316790355,
+      "loss": 0.5815,
+      "step": 16330
+    },
+    {
+      "epoch": 0.8292825322689139,
+      "grad_norm": 0.026085049179655987,
+      "learning_rate": 0.0007275437969574999,
+      "loss": 0.5716,
+      "step": 16335
+    },
+    {
+      "epoch": 0.8295363683669454,
+      "grad_norm": 0.03239294409527313,
+      "learning_rate": 0.0007273465175702515,
+      "loss": 0.5524,
+      "step": 16340
+    },
+    {
+      "epoch": 0.829790204464977,
+      "grad_norm": 0.027393441053260734,
+      "learning_rate": 0.0007271491935560155,
+      "loss": 0.6076,
+      "step": 16345
+    },
+    {
+      "epoch": 0.8300440405630085,
+      "grad_norm": 0.029966666901580178,
+      "learning_rate": 0.0007269518249535256,
+      "loss": 0.5385,
+      "step": 16350
+    },
+    {
+      "epoch": 0.8302978766610399,
+      "grad_norm": 0.02610780272688068,
+      "learning_rate": 0.0007267544118015243,
+      "loss": 0.5672,
+      "step": 16355
+    },
+    {
+      "epoch": 0.8305517127590715,
+      "grad_norm": 0.03032975964214376,
+      "learning_rate": 0.0007265569541387628,
+      "loss": 0.5809,
+      "step": 16360
+    },
+    {
+      "epoch": 0.830805548857103,
+      "grad_norm": 0.025317306734505777,
+      "learning_rate": 0.0007263594520040011,
+      "loss": 0.5645,
+      "step": 16365
+    },
+    {
+      "epoch": 0.8310593849551344,
+      "grad_norm": 0.02593763038542392,
+      "learning_rate": 0.0007261619054360078,
+      "loss": 0.5595,
+      "step": 16370
+    },
+    {
+      "epoch": 0.831313221053166,
+      "grad_norm": 0.0439291131573673,
+      "learning_rate": 0.0007259643144735603,
+      "loss": 0.568,
+      "step": 16375
+    },
+    {
+      "epoch": 0.8315670571511975,
+      "grad_norm": 0.026930260112695524,
+      "learning_rate": 0.0007257666791554447,
+      "loss": 0.5841,
+      "step": 16380
+    },
+    {
+      "epoch": 0.8318208932492289,
+      "grad_norm": 0.031340436270697436,
+      "learning_rate": 0.0007255689995204559,
+      "loss": 0.5776,
+      "step": 16385
+    },
+    {
+      "epoch": 0.8320747293472605,
+      "grad_norm": 0.035038658960907056,
+      "learning_rate": 0.0007253712756073973,
+      "loss": 0.5945,
+      "step": 16390
+    },
+    {
+      "epoch": 0.832328565445292,
+      "grad_norm": 0.02459703987491012,
+      "learning_rate": 0.0007251735074550815,
+      "loss": 0.5377,
+      "step": 16395
+    },
+    {
+      "epoch": 0.8325824015433235,
+      "grad_norm": 0.04858837100594378,
+      "learning_rate": 0.000724975695102329,
+      "loss": 0.5757,
+      "step": 16400
+    },
+    {
+      "epoch": 0.832836237641355,
+      "grad_norm": 0.025374293791486993,
+      "learning_rate": 0.0007247778385879695,
+      "loss": 0.5681,
+      "step": 16405
+    },
+    {
+      "epoch": 0.8330900737393865,
+      "grad_norm": 0.024727570721873534,
+      "learning_rate": 0.0007245799379508412,
+      "loss": 0.5556,
+      "step": 16410
+    },
+    {
+      "epoch": 0.833343909837418,
+      "grad_norm": 0.04461548506528957,
+      "learning_rate": 0.000724381993229791,
+      "loss": 0.5403,
+      "step": 16415
+    },
+    {
+      "epoch": 0.8335977459354494,
+      "grad_norm": 0.02730610710183291,
+      "learning_rate": 0.0007241840044636747,
+      "loss": 0.5994,
+      "step": 16420
+    },
+    {
+      "epoch": 0.833851582033481,
+      "grad_norm": 0.0335198165281854,
+      "learning_rate": 0.0007239859716913562,
+      "loss": 0.5605,
+      "step": 16425
+    },
+    {
+      "epoch": 0.8341054181315125,
+      "grad_norm": 0.04217088658047506,
+      "learning_rate": 0.0007237878949517085,
+      "loss": 0.5743,
+      "step": 16430
+    },
+    {
+      "epoch": 0.8343592542295439,
+      "grad_norm": 0.024585253772533063,
+      "learning_rate": 0.0007235897742836131,
+      "loss": 0.5929,
+      "step": 16435
+    },
+    {
+      "epoch": 0.8346130903275755,
+      "grad_norm": 0.025207671295824716,
+      "learning_rate": 0.00072339160972596,
+      "loss": 0.5635,
+      "step": 16440
+    },
+    {
+      "epoch": 0.834866926425607,
+      "grad_norm": 0.027558993795189373,
+      "learning_rate": 0.000723193401317648,
+      "loss": 0.592,
+      "step": 16445
+    },
+    {
+      "epoch": 0.8351207625236385,
+      "grad_norm": 0.036827202153997284,
+      "learning_rate": 0.0007229951490975844,
+      "loss": 0.5541,
+      "step": 16450
+    },
+    {
+      "epoch": 0.83537459862167,
+      "grad_norm": 0.026861426162605315,
+      "learning_rate": 0.000722796853104685,
+      "loss": 0.552,
+      "step": 16455
+    },
+    {
+      "epoch": 0.8356284347197015,
+      "grad_norm": 0.04528560408360573,
+      "learning_rate": 0.0007225985133778745,
+      "loss": 0.5598,
+      "step": 16460
+    },
+    {
+      "epoch": 0.835882270817733,
+      "grad_norm": 0.026142557018130994,
+      "learning_rate": 0.0007224001299560859,
+      "loss": 0.5774,
+      "step": 16465
+    },
+    {
+      "epoch": 0.8361361069157645,
+      "grad_norm": 0.03854871080553361,
+      "learning_rate": 0.000722201702878261,
+      "loss": 0.5299,
+      "step": 16470
+    },
+    {
+      "epoch": 0.836389943013796,
+      "grad_norm": 0.03451324677710476,
+      "learning_rate": 0.0007220032321833498,
+      "loss": 0.5753,
+      "step": 16475
+    },
+    {
+      "epoch": 0.8366437791118275,
+      "grad_norm": 0.026185635664542194,
+      "learning_rate": 0.0007218047179103112,
+      "loss": 0.5643,
+      "step": 16480
+    },
+    {
+      "epoch": 0.836897615209859,
+      "grad_norm": 0.026415165159016925,
+      "learning_rate": 0.0007216061600981128,
+      "loss": 0.5673,
+      "step": 16485
+    },
+    {
+      "epoch": 0.8371514513078905,
+      "grad_norm": 0.022148025561463187,
+      "learning_rate": 0.0007214075587857302,
+      "loss": 0.5244,
+      "step": 16490
+    },
+    {
+      "epoch": 0.837405287405922,
+      "grad_norm": 0.0809295820317378,
+      "learning_rate": 0.0007212089140121481,
+      "loss": 0.5359,
+      "step": 16495
+    },
+    {
+      "epoch": 0.8376591235039536,
+      "grad_norm": 0.03396241665558352,
+      "learning_rate": 0.0007210102258163592,
+      "loss": 0.5489,
+      "step": 16500
+    },
+    {
+      "epoch": 0.837912959601985,
+      "grad_norm": 0.04736874657246381,
+      "learning_rate": 0.0007208114942373651,
+      "loss": 0.5593,
+      "step": 16505
+    },
+    {
+      "epoch": 0.8381667957000165,
+      "grad_norm": 0.02415642764938736,
+      "learning_rate": 0.0007206127193141761,
+      "loss": 0.5368,
+      "step": 16510
+    },
+    {
+      "epoch": 0.838420631798048,
+      "grad_norm": 0.025190772006756453,
+      "learning_rate": 0.0007204139010858103,
+      "loss": 0.5872,
+      "step": 16515
+    },
+    {
+      "epoch": 0.8386744678960795,
+      "grad_norm": 0.023720884374368048,
+      "learning_rate": 0.0007202150395912949,
+      "loss": 0.5686,
+      "step": 16520
+    },
+    {
+      "epoch": 0.838928303994111,
+      "grad_norm": 0.02328821938311436,
+      "learning_rate": 0.0007200161348696655,
+      "loss": 0.5578,
+      "step": 16525
+    },
+    {
+      "epoch": 0.8391821400921425,
+      "grad_norm": 0.05346546124165315,
+      "learning_rate": 0.0007198171869599662,
+      "loss": 0.5899,
+      "step": 16530
+    },
+    {
+      "epoch": 0.839435976190174,
+      "grad_norm": 0.04804743967944873,
+      "learning_rate": 0.0007196181959012491,
+      "loss": 0.5919,
+      "step": 16535
+    },
+    {
+      "epoch": 0.8396898122882055,
+      "grad_norm": 0.07197115729997203,
+      "learning_rate": 0.0007194191617325755,
+      "loss": 0.5914,
+      "step": 16540
+    },
+    {
+      "epoch": 0.839943648386237,
+      "grad_norm": 0.09143766754452975,
+      "learning_rate": 0.0007192200844930147,
+      "loss": 0.5068,
+      "step": 16545
+    },
+    {
+      "epoch": 0.8401974844842685,
+      "grad_norm": 0.03931986444195336,
+      "learning_rate": 0.0007190209642216445,
+      "loss": 0.5951,
+      "step": 16550
+    },
+    {
+      "epoch": 0.8404513205823,
+      "grad_norm": 0.05372740389831407,
+      "learning_rate": 0.0007188218009575514,
+      "loss": 0.5746,
+      "step": 16555
+    },
+    {
+      "epoch": 0.8407051566803315,
+      "grad_norm": 0.03388013411522178,
+      "learning_rate": 0.0007186225947398298,
+      "loss": 0.5847,
+      "step": 16560
+    },
+    {
+      "epoch": 0.8409589927783631,
+      "grad_norm": 0.2323506802544606,
+      "learning_rate": 0.0007184233456075833,
+      "loss": 1.0376,
+      "step": 16565
+    },
+    {
+      "epoch": 0.8412128288763945,
+      "grad_norm": 0.1835106341020443,
+      "learning_rate": 0.0007182240535999232,
+      "loss": 0.5778,
+      "step": 16570
+    },
+    {
+      "epoch": 0.841466664974426,
+      "grad_norm": 0.11156210838067974,
+      "learning_rate": 0.0007180247187559697,
+      "loss": 0.5769,
+      "step": 16575
+    },
+    {
+      "epoch": 0.8417205010724575,
+      "grad_norm": 0.07718512327967042,
+      "learning_rate": 0.0007178253411148513,
+      "loss": 0.6366,
+      "step": 16580
+    },
+    {
+      "epoch": 0.841974337170489,
+      "grad_norm": 0.043680931999653776,
+      "learning_rate": 0.0007176259207157048,
+      "loss": 0.5654,
+      "step": 16585
+    },
+    {
+      "epoch": 0.8422281732685205,
+      "grad_norm": 0.05565560445937635,
+      "learning_rate": 0.0007174264575976752,
+      "loss": 0.5576,
+      "step": 16590
+    },
+    {
+      "epoch": 0.842482009366552,
+      "grad_norm": 0.04784086578137076,
+      "learning_rate": 0.0007172269517999163,
+      "loss": 0.5756,
+      "step": 16595
+    },
+    {
+      "epoch": 0.8427358454645835,
+      "grad_norm": 0.029385540515945264,
+      "learning_rate": 0.00071702740336159,
+      "loss": 0.5723,
+      "step": 16600
+    },
+    {
+      "epoch": 0.842989681562615,
+      "grad_norm": 0.028863407001418123,
+      "learning_rate": 0.0007168278123218667,
+      "loss": 0.5663,
+      "step": 16605
+    },
+    {
+      "epoch": 0.8432435176606465,
+      "grad_norm": 0.036227912872782514,
+      "learning_rate": 0.0007166281787199253,
+      "loss": 0.5634,
+      "step": 16610
+    },
+    {
+      "epoch": 0.8434973537586781,
+      "grad_norm": 0.05143219348236945,
+      "learning_rate": 0.0007164285025949528,
+      "loss": 0.6063,
+      "step": 16615
+    },
+    {
+      "epoch": 0.8437511898567095,
+      "grad_norm": 0.028270375163286395,
+      "learning_rate": 0.0007162287839861445,
+      "loss": 0.5714,
+      "step": 16620
+    },
+    {
+      "epoch": 0.844005025954741,
+      "grad_norm": 0.045237364055904485,
+      "learning_rate": 0.0007160290229327042,
+      "loss": 0.5614,
+      "step": 16625
+    },
+    {
+      "epoch": 0.8442588620527726,
+      "grad_norm": 0.04742378757483653,
+      "learning_rate": 0.000715829219473844,
+      "loss": 0.5856,
+      "step": 16630
+    },
+    {
+      "epoch": 0.844512698150804,
+      "grad_norm": 0.028756290264305383,
+      "learning_rate": 0.0007156293736487844,
+      "loss": 0.5441,
+      "step": 16635
+    },
+    {
+      "epoch": 0.8447665342488355,
+      "grad_norm": 0.03771401096316666,
+      "learning_rate": 0.0007154294854967541,
+      "loss": 0.5825,
+      "step": 16640
+    },
+    {
+      "epoch": 0.8450203703468671,
+      "grad_norm": 0.03402827219689952,
+      "learning_rate": 0.0007152295550569902,
+      "loss": 0.5564,
+      "step": 16645
+    },
+    {
+      "epoch": 0.8452742064448985,
+      "grad_norm": 0.03963421220660269,
+      "learning_rate": 0.0007150295823687379,
+      "loss": 0.5801,
+      "step": 16650
+    },
+    {
+      "epoch": 0.84552804254293,
+      "grad_norm": 0.03217463407046853,
+      "learning_rate": 0.000714829567471251,
+      "loss": 0.5673,
+      "step": 16655
+    },
+    {
+      "epoch": 0.8457818786409615,
+      "grad_norm": 0.025969270367900015,
+      "learning_rate": 0.0007146295104037914,
+      "loss": 0.5762,
+      "step": 16660
+    },
+    {
+      "epoch": 0.8460357147389931,
+      "grad_norm": 0.0372684784203102,
+      "learning_rate": 0.0007144294112056292,
+      "loss": 0.5858,
+      "step": 16665
+    },
+    {
+      "epoch": 0.8462895508370245,
+      "grad_norm": 0.03206359149474644,
+      "learning_rate": 0.000714229269916043,
+      "loss": 0.5754,
+      "step": 16670
+    },
+    {
+      "epoch": 0.846543386935056,
+      "grad_norm": 0.08771530469398538,
+      "learning_rate": 0.0007140290865743194,
+      "loss": 0.5861,
+      "step": 16675
+    },
+    {
+      "epoch": 0.8467972230330876,
+      "grad_norm": 0.04534708545965994,
+      "learning_rate": 0.0007138288612197534,
+      "loss": 0.5762,
+      "step": 16680
+    },
+    {
+      "epoch": 0.847051059131119,
+      "grad_norm": 0.03943245508252062,
+      "learning_rate": 0.0007136285938916484,
+      "loss": 0.5512,
+      "step": 16685
+    },
+    {
+      "epoch": 0.8473048952291505,
+      "grad_norm": 0.05354631204112542,
+      "learning_rate": 0.0007134282846293157,
+      "loss": 0.5659,
+      "step": 16690
+    },
+    {
+      "epoch": 0.8475587313271821,
+      "grad_norm": 0.10850672830641753,
+      "learning_rate": 0.0007132279334720751,
+      "loss": 0.5914,
+      "step": 16695
+    },
+    {
+      "epoch": 0.8478125674252135,
+      "grad_norm": 0.0344090734999104,
+      "learning_rate": 0.0007130275404592547,
+      "loss": 0.583,
+      "step": 16700
+    },
+    {
+      "epoch": 0.848066403523245,
+      "grad_norm": 0.026564143833072755,
+      "learning_rate": 0.0007128271056301902,
+      "loss": 0.5518,
+      "step": 16705
+    },
+    {
+      "epoch": 0.8483202396212766,
+      "grad_norm": 0.02413909355518316,
+      "learning_rate": 0.0007126266290242264,
+      "loss": 0.5541,
+      "step": 16710
+    },
+    {
+      "epoch": 0.8485740757193081,
+      "grad_norm": 0.02253771359833544,
+      "learning_rate": 0.0007124261106807158,
+      "loss": 0.5439,
+      "step": 16715
+    },
+    {
+      "epoch": 0.8488279118173395,
+      "grad_norm": 0.025639805853241732,
+      "learning_rate": 0.0007122255506390188,
+      "loss": 0.5553,
+      "step": 16720
+    },
+    {
+      "epoch": 0.849081747915371,
+      "grad_norm": 0.025441727802141368,
+      "learning_rate": 0.0007120249489385048,
+      "loss": 0.5888,
+      "step": 16725
+    },
+    {
+      "epoch": 0.8493355840134026,
+      "grad_norm": 0.20833156756734772,
+      "learning_rate": 0.0007118243056185505,
+      "loss": 0.584,
+      "step": 16730
+    },
+    {
+      "epoch": 0.849589420111434,
+      "grad_norm": 0.02582708018087541,
+      "learning_rate": 0.0007116236207185414,
+      "loss": 0.5758,
+      "step": 16735
+    },
+    {
+      "epoch": 0.8498432562094655,
+      "grad_norm": 0.02244099996933645,
+      "learning_rate": 0.0007114228942778711,
+      "loss": 0.5464,
+      "step": 16740
+    },
+    {
+      "epoch": 0.8500970923074971,
+      "grad_norm": 0.030980292003979164,
+      "learning_rate": 0.0007112221263359408,
+      "loss": 0.5484,
+      "step": 16745
+    },
+    {
+      "epoch": 0.8503509284055285,
+      "grad_norm": 0.03890776257527718,
+      "learning_rate": 0.0007110213169321606,
+      "loss": 0.6233,
+      "step": 16750
+    },
+    {
+      "epoch": 0.85060476450356,
+      "grad_norm": 0.04528676589062287,
+      "learning_rate": 0.0007108204661059482,
+      "loss": 0.5863,
+      "step": 16755
+    },
+    {
+      "epoch": 0.8508586006015916,
+      "grad_norm": 0.028278146245680834,
+      "learning_rate": 0.0007106195738967296,
+      "loss": 0.5689,
+      "step": 16760
+    },
+    {
+      "epoch": 0.851112436699623,
+      "grad_norm": 0.02656249132824659,
+      "learning_rate": 0.0007104186403439391,
+      "loss": 0.5627,
+      "step": 16765
+    },
+    {
+      "epoch": 0.8513662727976545,
+      "grad_norm": 0.032513287659724305,
+      "learning_rate": 0.0007102176654870189,
+      "loss": 0.5654,
+      "step": 16770
+    },
+    {
+      "epoch": 0.8516201088956861,
+      "grad_norm": 0.02776149485485987,
+      "learning_rate": 0.0007100166493654192,
+      "loss": 0.5786,
+      "step": 16775
+    },
+    {
+      "epoch": 0.8518739449937176,
+      "grad_norm": 0.031105339929159002,
+      "learning_rate": 0.0007098155920185987,
+      "loss": 0.562,
+      "step": 16780
+    },
+    {
+      "epoch": 0.852127781091749,
+      "grad_norm": 0.02446590396405293,
+      "learning_rate": 0.0007096144934860237,
+      "loss": 0.5624,
+      "step": 16785
+    },
+    {
+      "epoch": 0.8523816171897806,
+      "grad_norm": 0.03981011051439899,
+      "learning_rate": 0.0007094133538071691,
+      "loss": 0.539,
+      "step": 16790
+    },
+    {
+      "epoch": 0.8526354532878121,
+      "grad_norm": 0.033938471991999015,
+      "learning_rate": 0.0007092121730215174,
+      "loss": 0.6041,
+      "step": 16795
+    },
+    {
+      "epoch": 0.8528892893858435,
+      "grad_norm": 0.028696702111982848,
+      "learning_rate": 0.0007090109511685595,
+      "loss": 0.5812,
+      "step": 16800
+    },
+    {
+      "epoch": 0.853143125483875,
+      "grad_norm": 0.031170929377724053,
+      "learning_rate": 0.0007088096882877942,
+      "loss": 0.6141,
+      "step": 16805
+    },
+    {
+      "epoch": 0.8533969615819066,
+      "grad_norm": 0.0229475127058901,
+      "learning_rate": 0.0007086083844187284,
+      "loss": 0.5381,
+      "step": 16810
+    },
+    {
+      "epoch": 0.853650797679938,
+      "grad_norm": 0.04104758489251979,
+      "learning_rate": 0.0007084070396008771,
+      "loss": 0.5961,
+      "step": 16815
+    },
+    {
+      "epoch": 0.8539046337779695,
+      "grad_norm": 0.02451932040436503,
+      "learning_rate": 0.0007082056538737633,
+      "loss": 0.5366,
+      "step": 16820
+    },
+    {
+      "epoch": 0.8541584698760011,
+      "grad_norm": 0.04468035129714473,
+      "learning_rate": 0.0007080042272769179,
+      "loss": 0.5693,
+      "step": 16825
+    },
+    {
+      "epoch": 0.8544123059740326,
+      "grad_norm": 0.025369526471293296,
+      "learning_rate": 0.0007078027598498801,
+      "loss": 0.5659,
+      "step": 16830
+    },
+    {
+      "epoch": 0.854666142072064,
+      "grad_norm": 0.023020797662670914,
+      "learning_rate": 0.0007076012516321968,
+      "loss": 0.5979,
+      "step": 16835
+    },
+    {
+      "epoch": 0.8549199781700956,
+      "grad_norm": 0.02317984351608055,
+      "learning_rate": 0.0007073997026634229,
+      "loss": 0.5815,
+      "step": 16840
+    },
+    {
+      "epoch": 0.8551738142681271,
+      "grad_norm": 0.024195045783076848,
+      "learning_rate": 0.000707198112983122,
+      "loss": 0.5277,
+      "step": 16845
+    },
+    {
+      "epoch": 0.8554276503661585,
+      "grad_norm": 0.03085842460268176,
+      "learning_rate": 0.0007069964826308646,
+      "loss": 0.5641,
+      "step": 16850
+    },
+    {
+      "epoch": 0.8556814864641901,
+      "grad_norm": 0.023758504611377088,
+      "learning_rate": 0.00070679481164623,
+      "loss": 0.5577,
+      "step": 16855
+    },
+    {
+      "epoch": 0.8559353225622216,
+      "grad_norm": 0.03533884780505401,
+      "learning_rate": 0.0007065931000688053,
+      "loss": 0.5214,
+      "step": 16860
+    },
+    {
+      "epoch": 0.856189158660253,
+      "grad_norm": 0.028013725394752647,
+      "learning_rate": 0.0007063913479381851,
+      "loss": 0.5775,
+      "step": 16865
+    },
+    {
+      "epoch": 0.8564429947582846,
+      "grad_norm": 0.03922126124071241,
+      "learning_rate": 0.0007061895552939727,
+      "loss": 0.5676,
+      "step": 16870
+    },
+    {
+      "epoch": 0.8566968308563161,
+      "grad_norm": 0.02828532099990536,
+      "learning_rate": 0.0007059877221757789,
+      "loss": 0.5652,
+      "step": 16875
+    },
+    {
+      "epoch": 0.8569506669543476,
+      "grad_norm": 0.036809316995682864,
+      "learning_rate": 0.0007057858486232224,
+      "loss": 0.536,
+      "step": 16880
+    },
+    {
+      "epoch": 0.857204503052379,
+      "grad_norm": 0.021695340919237437,
+      "learning_rate": 0.00070558393467593,
+      "loss": 0.529,
+      "step": 16885
+    },
+    {
+      "epoch": 0.8574583391504106,
+      "grad_norm": 0.02740340955824536,
+      "learning_rate": 0.0007053819803735367,
+      "loss": 0.5352,
+      "step": 16890
+    },
+    {
+      "epoch": 0.8577121752484421,
+      "grad_norm": 0.03336334546505141,
+      "learning_rate": 0.0007051799857556848,
+      "loss": 0.5372,
+      "step": 16895
+    },
+    {
+      "epoch": 0.8579660113464735,
+      "grad_norm": 0.028637615728519715,
+      "learning_rate": 0.0007049779508620248,
+      "loss": 0.5726,
+      "step": 16900
+    },
+    {
+      "epoch": 0.8582198474445051,
+      "grad_norm": 0.02540896995489235,
+      "learning_rate": 0.0007047758757322155,
+      "loss": 0.5588,
+      "step": 16905
+    },
+    {
+      "epoch": 0.8584736835425366,
+      "grad_norm": 0.0243774664015233,
+      "learning_rate": 0.0007045737604059228,
+      "loss": 0.5855,
+      "step": 16910
+    },
+    {
+      "epoch": 0.858727519640568,
+      "grad_norm": 0.025712991028124108,
+      "learning_rate": 0.0007043716049228212,
+      "loss": 0.5725,
+      "step": 16915
+    },
+    {
+      "epoch": 0.8589813557385996,
+      "grad_norm": 0.01963408655252367,
+      "learning_rate": 0.0007041694093225929,
+      "loss": 0.5553,
+      "step": 16920
+    },
+    {
+      "epoch": 0.8592351918366311,
+      "grad_norm": 0.024595690577265465,
+      "learning_rate": 0.0007039671736449275,
+      "loss": 0.5464,
+      "step": 16925
+    },
+    {
+      "epoch": 0.8594890279346626,
+      "grad_norm": 0.022243385284112183,
+      "learning_rate": 0.0007037648979295232,
+      "loss": 0.5316,
+      "step": 16930
+    },
+    {
+      "epoch": 0.8597428640326941,
+      "grad_norm": 0.02440380975141963,
+      "learning_rate": 0.0007035625822160856,
+      "loss": 0.5465,
+      "step": 16935
+    },
+    {
+      "epoch": 0.8599967001307256,
+      "grad_norm": 0.03482315185034089,
+      "learning_rate": 0.0007033602265443284,
+      "loss": 0.5636,
+      "step": 16940
+    },
+    {
+      "epoch": 0.8602505362287571,
+      "grad_norm": 0.024590501396839586,
+      "learning_rate": 0.0007031578309539728,
+      "loss": 0.5877,
+      "step": 16945
+    },
+    {
+      "epoch": 0.8605043723267886,
+      "grad_norm": 0.03922772861007277,
+      "learning_rate": 0.000702955395484748,
+      "loss": 0.5928,
+      "step": 16950
+    },
+    {
+      "epoch": 0.8607582084248201,
+      "grad_norm": 0.026668312848002942,
+      "learning_rate": 0.0007027529201763913,
+      "loss": 0.5848,
+      "step": 16955
+    },
+    {
+      "epoch": 0.8610120445228516,
+      "grad_norm": 0.022676881579317083,
+      "learning_rate": 0.0007025504050686475,
+      "loss": 0.5759,
+      "step": 16960
+    },
+    {
+      "epoch": 0.861265880620883,
+      "grad_norm": 0.023945888246282236,
+      "learning_rate": 0.0007023478502012694,
+      "loss": 0.5796,
+      "step": 16965
+    },
+    {
+      "epoch": 0.8615197167189146,
+      "grad_norm": 0.030829524909033925,
+      "learning_rate": 0.0007021452556140173,
+      "loss": 0.5673,
+      "step": 16970
+    },
+    {
+      "epoch": 0.8617735528169461,
+      "grad_norm": 0.023288491755525775,
+      "learning_rate": 0.0007019426213466597,
+      "loss": 0.5592,
+      "step": 16975
+    },
+    {
+      "epoch": 0.8620273889149775,
+      "grad_norm": 0.034313257493583556,
+      "learning_rate": 0.0007017399474389725,
+      "loss": 0.5415,
+      "step": 16980
+    },
+    {
+      "epoch": 0.8622812250130091,
+      "grad_norm": 0.030676916955254648,
+      "learning_rate": 0.0007015372339307398,
+      "loss": 0.5374,
+      "step": 16985
+    },
+    {
+      "epoch": 0.8625350611110406,
+      "grad_norm": 0.9270526277643246,
+      "learning_rate": 0.000701334480861753,
+      "loss": 0.5703,
+      "step": 16990
+    },
+    {
+      "epoch": 0.8627888972090721,
+      "grad_norm": 0.03881534525111206,
+      "learning_rate": 0.0007011316882718119,
+      "loss": 0.5436,
+      "step": 16995
+    },
+    {
+      "epoch": 0.8630427333071036,
+      "grad_norm": 0.028288958072038958,
+      "learning_rate": 0.0007009288562007232,
+      "loss": 0.5424,
+      "step": 17000
+    },
+    {
+      "epoch": 0.8632965694051351,
+      "grad_norm": 0.03263081861380027,
+      "learning_rate": 0.0007007259846883022,
+      "loss": 0.5635,
+      "step": 17005
+    },
+    {
+      "epoch": 0.8635504055031666,
+      "grad_norm": 0.02801944612812956,
+      "learning_rate": 0.0007005230737743714,
+      "loss": 0.5607,
+      "step": 17010
+    },
+    {
+      "epoch": 0.8638042416011981,
+      "grad_norm": 0.0354537026800304,
+      "learning_rate": 0.0007003201234987612,
+      "loss": 0.5787,
+      "step": 17015
+    },
+    {
+      "epoch": 0.8640580776992296,
+      "grad_norm": 0.04224026943667041,
+      "learning_rate": 0.0007001171339013097,
+      "loss": 0.5469,
+      "step": 17020
+    },
+    {
+      "epoch": 0.8643119137972611,
+      "grad_norm": 0.027519369512133282,
+      "learning_rate": 0.0006999141050218628,
+      "loss": 0.5826,
+      "step": 17025
+    },
+    {
+      "epoch": 0.8645657498952926,
+      "grad_norm": 0.026240030212773185,
+      "learning_rate": 0.0006997110369002742,
+      "loss": 0.5602,
+      "step": 17030
+    },
+    {
+      "epoch": 0.8648195859933241,
+      "grad_norm": 0.02599836695677239,
+      "learning_rate": 0.0006995079295764048,
+      "loss": 0.5593,
+      "step": 17035
+    },
+    {
+      "epoch": 0.8650734220913556,
+      "grad_norm": 0.03230402905795593,
+      "learning_rate": 0.000699304783090124,
+      "loss": 0.5685,
+      "step": 17040
+    },
+    {
+      "epoch": 0.8653272581893872,
+      "grad_norm": 0.028811832838592445,
+      "learning_rate": 0.0006991015974813081,
+      "loss": 0.5859,
+      "step": 17045
+    },
+    {
+      "epoch": 0.8655810942874186,
+      "grad_norm": 0.034518109548492715,
+      "learning_rate": 0.0006988983727898414,
+      "loss": 0.5502,
+      "step": 17050
+    },
+    {
+      "epoch": 0.8658349303854501,
+      "grad_norm": 0.03883667058154074,
+      "learning_rate": 0.0006986951090556161,
+      "loss": 0.5473,
+      "step": 17055
+    },
+    {
+      "epoch": 0.8660887664834817,
+      "grad_norm": 0.025695035028394198,
+      "learning_rate": 0.0006984918063185319,
+      "loss": 0.5815,
+      "step": 17060
+    },
+    {
+      "epoch": 0.8663426025815131,
+      "grad_norm": 0.034846551058862335,
+      "learning_rate": 0.0006982884646184959,
+      "loss": 0.543,
+      "step": 17065
+    },
+    {
+      "epoch": 0.8665964386795446,
+      "grad_norm": 0.03999095610243723,
+      "learning_rate": 0.0006980850839954232,
+      "loss": 0.5682,
+      "step": 17070
+    },
+    {
+      "epoch": 0.8668502747775761,
+      "grad_norm": 0.029998948571798918,
+      "learning_rate": 0.0006978816644892364,
+      "loss": 0.5651,
+      "step": 17075
+    },
+    {
+      "epoch": 0.8671041108756076,
+      "grad_norm": 0.03870411499824972,
+      "learning_rate": 0.0006976782061398657,
+      "loss": 0.5865,
+      "step": 17080
+    },
+    {
+      "epoch": 0.8673579469736391,
+      "grad_norm": 0.02581985137998219,
+      "learning_rate": 0.0006974747089872488,
+      "loss": 0.5883,
+      "step": 17085
+    },
+    {
+      "epoch": 0.8676117830716706,
+      "grad_norm": 0.03341194341814896,
+      "learning_rate": 0.0006972711730713315,
+      "loss": 0.573,
+      "step": 17090
+    },
+    {
+      "epoch": 0.8678656191697022,
+      "grad_norm": 0.036010523707030416,
+      "learning_rate": 0.0006970675984320667,
+      "loss": 0.5535,
+      "step": 17095
+    },
+    {
+      "epoch": 0.8681194552677336,
+      "grad_norm": 0.027649651226152285,
+      "learning_rate": 0.000696863985109415,
+      "loss": 0.5654,
+      "step": 17100
+    },
+    {
+      "epoch": 0.8683732913657651,
+      "grad_norm": 0.03162081059227697,
+      "learning_rate": 0.0006966603331433447,
+      "loss": 0.6022,
+      "step": 17105
+    },
+    {
+      "epoch": 0.8686271274637967,
+      "grad_norm": 0.032299427855624986,
+      "learning_rate": 0.0006964566425738321,
+      "loss": 0.5425,
+      "step": 17110
+    },
+    {
+      "epoch": 0.8688809635618281,
+      "grad_norm": 0.021742217941501947,
+      "learning_rate": 0.0006962529134408599,
+      "loss": 0.5897,
+      "step": 17115
+    },
+    {
+      "epoch": 0.8691347996598596,
+      "grad_norm": 0.0346527041544656,
+      "learning_rate": 0.0006960491457844198,
+      "loss": 0.5532,
+      "step": 17120
+    },
+    {
+      "epoch": 0.8693886357578912,
+      "grad_norm": 0.022954764473995297,
+      "learning_rate": 0.00069584533964451,
+      "loss": 0.5665,
+      "step": 17125
+    },
+    {
+      "epoch": 0.8696424718559226,
+      "grad_norm": 0.02307425717340246,
+      "learning_rate": 0.0006956414950611366,
+      "loss": 0.5429,
+      "step": 17130
+    },
+    {
+      "epoch": 0.8698963079539541,
+      "grad_norm": 0.023508308014224755,
+      "learning_rate": 0.0006954376120743136,
+      "loss": 0.5923,
+      "step": 17135
+    },
+    {
+      "epoch": 0.8701501440519857,
+      "grad_norm": 0.025046949091913972,
+      "learning_rate": 0.0006952336907240616,
+      "loss": 0.5354,
+      "step": 17140
+    },
+    {
+      "epoch": 0.8704039801500172,
+      "grad_norm": 0.02807435788273573,
+      "learning_rate": 0.00069502973105041,
+      "loss": 0.5499,
+      "step": 17145
+    },
+    {
+      "epoch": 0.8706578162480486,
+      "grad_norm": 0.023394478814900033,
+      "learning_rate": 0.0006948257330933948,
+      "loss": 0.5804,
+      "step": 17150
+    },
+    {
+      "epoch": 0.8709116523460801,
+      "grad_norm": 0.02484051188427158,
+      "learning_rate": 0.0006946216968930598,
+      "loss": 0.5795,
+      "step": 17155
+    },
+    {
+      "epoch": 0.8711654884441117,
+      "grad_norm": 0.02775584758767845,
+      "learning_rate": 0.0006944176224894563,
+      "loss": 0.5821,
+      "step": 17160
+    },
+    {
+      "epoch": 0.8714193245421431,
+      "grad_norm": 0.03912060374962221,
+      "learning_rate": 0.000694213509922643,
+      "loss": 0.583,
+      "step": 17165
+    },
+    {
+      "epoch": 0.8716731606401746,
+      "grad_norm": 0.0362056815020368,
+      "learning_rate": 0.0006940093592326861,
+      "loss": 0.571,
+      "step": 17170
+    },
+    {
+      "epoch": 0.8719269967382062,
+      "grad_norm": 0.028946737163921484,
+      "learning_rate": 0.0006938051704596598,
+      "loss": 0.5959,
+      "step": 17175
+    },
+    {
+      "epoch": 0.8721808328362376,
+      "grad_norm": 0.02620608291978458,
+      "learning_rate": 0.0006936009436436448,
+      "loss": 0.5515,
+      "step": 17180
+    },
+    {
+      "epoch": 0.8724346689342691,
+      "grad_norm": 0.02801972461379925,
+      "learning_rate": 0.0006933966788247302,
+      "loss": 0.565,
+      "step": 17185
+    },
+    {
+      "epoch": 0.8726885050323007,
+      "grad_norm": 0.042778133049176235,
+      "learning_rate": 0.000693192376043012,
+      "loss": 0.5829,
+      "step": 17190
+    },
+    {
+      "epoch": 0.8729423411303322,
+      "grad_norm": 0.03858970271483881,
+      "learning_rate": 0.0006929880353385938,
+      "loss": 0.5507,
+      "step": 17195
+    },
+    {
+      "epoch": 0.8731961772283636,
+      "grad_norm": 0.031470436939788314,
+      "learning_rate": 0.0006927836567515866,
+      "loss": 0.5978,
+      "step": 17200
+    },
+    {
+      "epoch": 0.8734500133263952,
+      "grad_norm": 0.07394640988697473,
+      "learning_rate": 0.0006925792403221091,
+      "loss": 0.5749,
+      "step": 17205
+    },
+    {
+      "epoch": 0.8737038494244267,
+      "grad_norm": 0.03603952523691002,
+      "learning_rate": 0.0006923747860902871,
+      "loss": 0.5866,
+      "step": 17210
+    },
+    {
+      "epoch": 0.8739576855224581,
+      "grad_norm": 0.043584509249294856,
+      "learning_rate": 0.000692170294096254,
+      "loss": 0.5899,
+      "step": 17215
+    },
+    {
+      "epoch": 0.8742115216204897,
+      "grad_norm": 0.024373073949978077,
+      "learning_rate": 0.0006919657643801504,
+      "loss": 0.506,
+      "step": 17220
+    },
+    {
+      "epoch": 0.8744653577185212,
+      "grad_norm": 0.029580565745340112,
+      "learning_rate": 0.0006917611969821248,
+      "loss": 0.5343,
+      "step": 17225
+    },
+    {
+      "epoch": 0.8747191938165526,
+      "grad_norm": 0.0257140032071113,
+      "learning_rate": 0.0006915565919423324,
+      "loss": 0.5875,
+      "step": 17230
+    },
+    {
+      "epoch": 0.8749730299145841,
+      "grad_norm": 0.03390362964198948,
+      "learning_rate": 0.0006913519493009363,
+      "loss": 0.5813,
+      "step": 17235
+    },
+    {
+      "epoch": 0.8752268660126157,
+      "grad_norm": 0.05042303052722009,
+      "learning_rate": 0.0006911472690981069,
+      "loss": 0.606,
+      "step": 17240
+    },
+    {
+      "epoch": 0.8754807021106471,
+      "grad_norm": 0.05272668324721124,
+      "learning_rate": 0.0006909425513740217,
+      "loss": 0.5965,
+      "step": 17245
+    },
+    {
+      "epoch": 0.8757345382086786,
+      "grad_norm": 0.0488542916140239,
+      "learning_rate": 0.000690737796168866,
+      "loss": 0.5871,
+      "step": 17250
+    },
+    {
+      "epoch": 0.8759883743067102,
+      "grad_norm": 0.030433512897338318,
+      "learning_rate": 0.0006905330035228321,
+      "loss": 0.5384,
+      "step": 17255
+    },
+    {
+      "epoch": 0.8762422104047417,
+      "grad_norm": 0.026478083460412936,
+      "learning_rate": 0.0006903281734761197,
+      "loss": 0.5365,
+      "step": 17260
+    },
+    {
+      "epoch": 0.8764960465027731,
+      "grad_norm": 0.029126871537591283,
+      "learning_rate": 0.000690123306068936,
+      "loss": 0.5554,
+      "step": 17265
+    },
+    {
+      "epoch": 0.8767498826008047,
+      "grad_norm": 0.032584410814189266,
+      "learning_rate": 0.0006899184013414955,
+      "loss": 0.5551,
+      "step": 17270
+    },
+    {
+      "epoch": 0.8770037186988362,
+      "grad_norm": 0.03105939707552323,
+      "learning_rate": 0.00068971345933402,
+      "loss": 0.5244,
+      "step": 17275
+    },
+    {
+      "epoch": 0.8772575547968676,
+      "grad_norm": 0.04428702286739715,
+      "learning_rate": 0.0006895084800867386,
+      "loss": 0.5524,
+      "step": 17280
+    },
+    {
+      "epoch": 0.8775113908948992,
+      "grad_norm": 0.026501029034102612,
+      "learning_rate": 0.0006893034636398875,
+      "loss": 0.5697,
+      "step": 17285
+    },
+    {
+      "epoch": 0.8777652269929307,
+      "grad_norm": 0.02789104304834869,
+      "learning_rate": 0.0006890984100337105,
+      "loss": 0.5451,
+      "step": 17290
+    },
+    {
+      "epoch": 0.8780190630909621,
+      "grad_norm": 0.030381112646398638,
+      "learning_rate": 0.0006888933193084588,
+      "loss": 0.5728,
+      "step": 17295
+    },
+    {
+      "epoch": 0.8782728991889936,
+      "grad_norm": 0.03320203500452083,
+      "learning_rate": 0.0006886881915043905,
+      "loss": 0.5478,
+      "step": 17300
+    },
+    {
+      "epoch": 0.8785267352870252,
+      "grad_norm": 0.04308556659587886,
+      "learning_rate": 0.0006884830266617711,
+      "loss": 0.5476,
+      "step": 17305
+    },
+    {
+      "epoch": 0.8787805713850567,
+      "grad_norm": 0.04744726865498166,
+      "learning_rate": 0.0006882778248208737,
+      "loss": 0.5283,
+      "step": 17310
+    },
+    {
+      "epoch": 0.8790344074830881,
+      "grad_norm": 0.04730302242145846,
+      "learning_rate": 0.000688072586021978,
+      "loss": 0.5921,
+      "step": 17315
+    },
+    {
+      "epoch": 0.8792882435811197,
+      "grad_norm": 0.02844248850997349,
+      "learning_rate": 0.0006878673103053717,
+      "loss": 0.5359,
+      "step": 17320
+    },
+    {
+      "epoch": 0.8795420796791512,
+      "grad_norm": 0.02665626078133169,
+      "learning_rate": 0.0006876619977113492,
+      "loss": 0.5729,
+      "step": 17325
+    },
+    {
+      "epoch": 0.8797959157771826,
+      "grad_norm": 0.05159803146684941,
+      "learning_rate": 0.0006874566482802125,
+      "loss": 0.5683,
+      "step": 17330
+    },
+    {
+      "epoch": 0.8800497518752142,
+      "grad_norm": 0.028811741517915785,
+      "learning_rate": 0.0006872512620522707,
+      "loss": 0.577,
+      "step": 17335
+    },
+    {
+      "epoch": 0.8803035879732457,
+      "grad_norm": 0.032289986764814115,
+      "learning_rate": 0.0006870458390678397,
+      "loss": 0.5473,
+      "step": 17340
+    },
+    {
+      "epoch": 0.8805574240712771,
+      "grad_norm": 0.026018555624320636,
+      "learning_rate": 0.0006868403793672435,
+      "loss": 0.5738,
+      "step": 17345
+    },
+    {
+      "epoch": 0.8808112601693087,
+      "grad_norm": 0.02320427721745887,
+      "learning_rate": 0.0006866348829908125,
+      "loss": 0.5572,
+      "step": 17350
+    },
+    {
+      "epoch": 0.8810650962673402,
+      "grad_norm": 0.028063747667248035,
+      "learning_rate": 0.0006864293499788849,
+      "loss": 0.5831,
+      "step": 17355
+    },
+    {
+      "epoch": 0.8813189323653717,
+      "grad_norm": 0.026346028747411987,
+      "learning_rate": 0.0006862237803718054,
+      "loss": 0.5708,
+      "step": 17360
+    },
+    {
+      "epoch": 0.8815727684634032,
+      "grad_norm": 0.023583638229848918,
+      "learning_rate": 0.0006860181742099266,
+      "loss": 0.5457,
+      "step": 17365
+    },
+    {
+      "epoch": 0.8818266045614347,
+      "grad_norm": 0.03017396130009387,
+      "learning_rate": 0.0006858125315336079,
+      "loss": 0.5551,
+      "step": 17370
+    },
+    {
+      "epoch": 0.8820804406594662,
+      "grad_norm": 0.04360521104064648,
+      "learning_rate": 0.0006856068523832158,
+      "loss": 0.5653,
+      "step": 17375
+    },
+    {
+      "epoch": 0.8823342767574976,
+      "grad_norm": 0.02249120515299714,
+      "learning_rate": 0.0006854011367991243,
+      "loss": 0.521,
+      "step": 17380
+    },
+    {
+      "epoch": 0.8825881128555292,
+      "grad_norm": 0.03244681552967337,
+      "learning_rate": 0.0006851953848217142,
+      "loss": 0.5652,
+      "step": 17385
+    },
+    {
+      "epoch": 0.8828419489535607,
+      "grad_norm": 0.026484829488799315,
+      "learning_rate": 0.0006849895964913737,
+      "loss": 0.5456,
+      "step": 17390
+    },
+    {
+      "epoch": 0.8830957850515921,
+      "grad_norm": 0.027157693421820065,
+      "learning_rate": 0.0006847837718484977,
+      "loss": 0.5214,
+      "step": 17395
+    },
+    {
+      "epoch": 0.8833496211496237,
+      "grad_norm": 0.07609582956202243,
+      "learning_rate": 0.0006845779109334891,
+      "loss": 0.545,
+      "step": 17400
+    },
+    {
+      "epoch": 0.8836034572476552,
+      "grad_norm": 0.03239068638126852,
+      "learning_rate": 0.0006843720137867569,
+      "loss": 0.5713,
+      "step": 17405
+    },
+    {
+      "epoch": 0.8838572933456867,
+      "grad_norm": 0.025149606981124856,
+      "learning_rate": 0.0006841660804487179,
+      "loss": 0.5704,
+      "step": 17410
+    },
+    {
+      "epoch": 0.8841111294437182,
+      "grad_norm": 0.023833536556687185,
+      "learning_rate": 0.0006839601109597957,
+      "loss": 0.5377,
+      "step": 17415
+    },
+    {
+      "epoch": 0.8843649655417497,
+      "grad_norm": 0.028450371295118047,
+      "learning_rate": 0.0006837541053604213,
+      "loss": 0.5916,
+      "step": 17420
+    },
+    {
+      "epoch": 0.8846188016397812,
+      "grad_norm": 0.033176344085271356,
+      "learning_rate": 0.0006835480636910321,
+      "loss": 0.5335,
+      "step": 17425
+    },
+    {
+      "epoch": 0.8848726377378127,
+      "grad_norm": 0.02880029477938549,
+      "learning_rate": 0.0006833419859920736,
+      "loss": 0.5554,
+      "step": 17430
+    },
+    {
+      "epoch": 0.8851264738358442,
+      "grad_norm": 0.029522099650815575,
+      "learning_rate": 0.0006831358723039976,
+      "loss": 0.5704,
+      "step": 17435
+    },
+    {
+      "epoch": 0.8853803099338757,
+      "grad_norm": 0.04615263077662782,
+      "learning_rate": 0.000682929722667263,
+      "loss": 0.5362,
+      "step": 17440
+    },
+    {
+      "epoch": 0.8856341460319072,
+      "grad_norm": 0.48427634793861496,
+      "learning_rate": 0.0006827235371223362,
+      "loss": 0.5819,
+      "step": 17445
+    },
+    {
+      "epoch": 0.8858879821299387,
+      "grad_norm": 0.030739394979893288,
+      "learning_rate": 0.0006825173157096903,
+      "loss": 0.5927,
+      "step": 17450
+    },
+    {
+      "epoch": 0.8861418182279702,
+      "grad_norm": 0.03176037900151184,
+      "learning_rate": 0.0006823110584698055,
+      "loss": 0.5528,
+      "step": 17455
+    },
+    {
+      "epoch": 0.8863956543260016,
+      "grad_norm": 0.026725514935062814,
+      "learning_rate": 0.0006821047654431691,
+      "loss": 0.5695,
+      "step": 17460
+    },
+    {
+      "epoch": 0.8866494904240332,
+      "grad_norm": 0.02741254348251184,
+      "learning_rate": 0.0006818984366702754,
+      "loss": 0.5361,
+      "step": 17465
+    },
+    {
+      "epoch": 0.8869033265220647,
+      "grad_norm": 0.06467389710456385,
+      "learning_rate": 0.0006816920721916259,
+      "loss": 0.5637,
+      "step": 17470
+    },
+    {
+      "epoch": 0.8871571626200963,
+      "grad_norm": 0.05652090655931264,
+      "learning_rate": 0.0006814856720477285,
+      "loss": 0.5662,
+      "step": 17475
+    },
+    {
+      "epoch": 0.8874109987181277,
+      "grad_norm": 0.025971100514922223,
+      "learning_rate": 0.0006812792362790987,
+      "loss": 0.5662,
+      "step": 17480
+    },
+    {
+      "epoch": 0.8876648348161592,
+      "grad_norm": 0.037312238893479364,
+      "learning_rate": 0.0006810727649262591,
+      "loss": 0.5216,
+      "step": 17485
+    },
+    {
+      "epoch": 0.8879186709141907,
+      "grad_norm": 0.03529026683625101,
+      "learning_rate": 0.0006808662580297385,
+      "loss": 0.5754,
+      "step": 17490
+    },
+    {
+      "epoch": 0.8881725070122222,
+      "grad_norm": 0.052177166996155586,
+      "learning_rate": 0.0006806597156300736,
+      "loss": 0.5862,
+      "step": 17495
+    },
+    {
+      "epoch": 0.8884263431102537,
+      "grad_norm": 0.034917141903712645,
+      "learning_rate": 0.0006804531377678074,
+      "loss": 0.5504,
+      "step": 17500
+    },
+    {
+      "epoch": 0.8886801792082852,
+      "grad_norm": 0.035789909497893145,
+      "learning_rate": 0.0006802465244834901,
+      "loss": 0.523,
+      "step": 17505
+    },
+    {
+      "epoch": 0.8889340153063167,
+      "grad_norm": 0.02633496325094234,
+      "learning_rate": 0.000680039875817679,
+      "loss": 0.5773,
+      "step": 17510
+    },
+    {
+      "epoch": 0.8891878514043482,
+      "grad_norm": 0.1027046920453322,
+      "learning_rate": 0.0006798331918109381,
+      "loss": 0.5328,
+      "step": 17515
+    },
+    {
+      "epoch": 0.8894416875023797,
+      "grad_norm": 0.03431649346909258,
+      "learning_rate": 0.0006796264725038387,
+      "loss": 0.584,
+      "step": 17520
+    },
+    {
+      "epoch": 0.8896955236004113,
+      "grad_norm": 0.02609942274925054,
+      "learning_rate": 0.0006794197179369584,
+      "loss": 0.5586,
+      "step": 17525
+    },
+    {
+      "epoch": 0.8899493596984427,
+      "grad_norm": 0.03430169189866086,
+      "learning_rate": 0.0006792129281508821,
+      "loss": 0.546,
+      "step": 17530
+    },
+    {
+      "epoch": 0.8902031957964742,
+      "grad_norm": 0.024928043771357686,
+      "learning_rate": 0.0006790061031862018,
+      "loss": 0.5435,
+      "step": 17535
+    },
+    {
+      "epoch": 0.8904570318945058,
+      "grad_norm": 0.028688456933202128,
+      "learning_rate": 0.0006787992430835161,
+      "loss": 0.5732,
+      "step": 17540
+    },
+    {
+      "epoch": 0.8907108679925372,
+      "grad_norm": 0.042830556579080276,
+      "learning_rate": 0.0006785923478834308,
+      "loss": 0.5484,
+      "step": 17545
+    },
+    {
+      "epoch": 0.8909647040905687,
+      "grad_norm": 0.035751869994615854,
+      "learning_rate": 0.0006783854176265582,
+      "loss": 0.5575,
+      "step": 17550
+    },
+    {
+      "epoch": 0.8912185401886003,
+      "grad_norm": 0.028210300266072946,
+      "learning_rate": 0.0006781784523535177,
+      "loss": 0.5915,
+      "step": 17555
+    },
+    {
+      "epoch": 0.8914723762866317,
+      "grad_norm": 0.03715659125320336,
+      "learning_rate": 0.0006779714521049356,
+      "loss": 0.5359,
+      "step": 17560
+    },
+    {
+      "epoch": 0.8917262123846632,
+      "grad_norm": 0.03583841060325269,
+      "learning_rate": 0.000677764416921445,
+      "loss": 0.5624,
+      "step": 17565
+    },
+    {
+      "epoch": 0.8919800484826947,
+      "grad_norm": 0.03003577944086306,
+      "learning_rate": 0.000677557346843686,
+      "loss": 0.5508,
+      "step": 17570
+    },
+    {
+      "epoch": 0.8922338845807263,
+      "grad_norm": 0.03580088688841066,
+      "learning_rate": 0.0006773502419123051,
+      "loss": 0.5862,
+      "step": 17575
+    },
+    {
+      "epoch": 0.8924877206787577,
+      "grad_norm": 0.03560540588373299,
+      "learning_rate": 0.0006771431021679561,
+      "loss": 0.5623,
+      "step": 17580
+    },
+    {
+      "epoch": 0.8927415567767892,
+      "grad_norm": 0.029606401551905418,
+      "learning_rate": 0.0006769359276512998,
+      "loss": 0.5346,
+      "step": 17585
+    },
+    {
+      "epoch": 0.8929953928748208,
+      "grad_norm": 0.034532543727339556,
+      "learning_rate": 0.0006767287184030031,
+      "loss": 0.5903,
+      "step": 17590
+    },
+    {
+      "epoch": 0.8932492289728522,
+      "grad_norm": 0.029377320096539063,
+      "learning_rate": 0.0006765214744637402,
+      "loss": 0.5723,
+      "step": 17595
+    },
+    {
+      "epoch": 0.8935030650708837,
+      "grad_norm": 0.030743555347065143,
+      "learning_rate": 0.0006763141958741924,
+      "loss": 0.5641,
+      "step": 17600
+    },
+    {
+      "epoch": 0.8937569011689153,
+      "grad_norm": 0.0409722689803858,
+      "learning_rate": 0.0006761068826750472,
+      "loss": 0.5476,
+      "step": 17605
+    },
+    {
+      "epoch": 0.8940107372669467,
+      "grad_norm": 0.029101547286446497,
+      "learning_rate": 0.0006758995349069992,
+      "loss": 0.5678,
+      "step": 17610
+    },
+    {
+      "epoch": 0.8942645733649782,
+      "grad_norm": 0.026075517265298196,
+      "learning_rate": 0.0006756921526107495,
+      "loss": 0.5715,
+      "step": 17615
+    },
+    {
+      "epoch": 0.8945184094630098,
+      "grad_norm": 0.04138837555118283,
+      "learning_rate": 0.0006754847358270066,
+      "loss": 0.5776,
+      "step": 17620
+    },
+    {
+      "epoch": 0.8947722455610413,
+      "grad_norm": 0.02499298797537694,
+      "learning_rate": 0.0006752772845964852,
+      "loss": 0.5813,
+      "step": 17625
+    },
+    {
+      "epoch": 0.8950260816590727,
+      "grad_norm": 0.02920872117024721,
+      "learning_rate": 0.0006750697989599068,
+      "loss": 0.567,
+      "step": 17630
+    },
+    {
+      "epoch": 0.8952799177571042,
+      "grad_norm": 0.024311481377263307,
+      "learning_rate": 0.0006748622789580001,
+      "loss": 0.5472,
+      "step": 17635
+    },
+    {
+      "epoch": 0.8955337538551358,
+      "grad_norm": 0.022553193192463565,
+      "learning_rate": 0.0006746547246315,
+      "loss": 0.5839,
+      "step": 17640
+    },
+    {
+      "epoch": 0.8957875899531672,
+      "grad_norm": 0.025203410801119673,
+      "learning_rate": 0.0006744471360211484,
+      "loss": 0.5593,
+      "step": 17645
+    },
+    {
+      "epoch": 0.8960414260511987,
+      "grad_norm": 0.06256849888992963,
+      "learning_rate": 0.0006742395131676942,
+      "loss": 0.5406,
+      "step": 17650
+    },
+    {
+      "epoch": 0.8962952621492303,
+      "grad_norm": 0.02521288112975126,
+      "learning_rate": 0.0006740318561118922,
+      "loss": 0.5682,
+      "step": 17655
+    },
+    {
+      "epoch": 0.8965490982472617,
+      "grad_norm": 0.3098613025846971,
+      "learning_rate": 0.0006738241648945049,
+      "loss": 0.5492,
+      "step": 17660
+    },
+    {
+      "epoch": 0.8968029343452932,
+      "grad_norm": 0.037730912321007094,
+      "learning_rate": 0.0006736164395563009,
+      "loss": 0.59,
+      "step": 17665
+    },
+    {
+      "epoch": 0.8970567704433248,
+      "grad_norm": 0.034261701601160224,
+      "learning_rate": 0.0006734086801380556,
+      "loss": 0.549,
+      "step": 17670
+    },
+    {
+      "epoch": 0.8973106065413562,
+      "grad_norm": 0.023592660379858885,
+      "learning_rate": 0.0006732008866805512,
+      "loss": 0.5566,
+      "step": 17675
+    },
+    {
+      "epoch": 0.8975644426393877,
+      "grad_norm": 0.03082538341771991,
+      "learning_rate": 0.0006729930592245764,
+      "loss": 0.5699,
+      "step": 17680
+    },
+    {
+      "epoch": 0.8978182787374193,
+      "grad_norm": 0.028133490847564162,
+      "learning_rate": 0.000672785197810927,
+      "loss": 0.5445,
+      "step": 17685
+    },
+    {
+      "epoch": 0.8980721148354508,
+      "grad_norm": 0.03384650095501219,
+      "learning_rate": 0.0006725773024804047,
+      "loss": 0.572,
+      "step": 17690
+    },
+    {
+      "epoch": 0.8983259509334822,
+      "grad_norm": 0.029532460122143937,
+      "learning_rate": 0.0006723693732738188,
+      "loss": 0.5905,
+      "step": 17695
+    },
+    {
+      "epoch": 0.8985797870315138,
+      "grad_norm": 0.04184269802969518,
+      "learning_rate": 0.0006721614102319845,
+      "loss": 0.595,
+      "step": 17700
+    },
+    {
+      "epoch": 0.8988336231295453,
+      "grad_norm": 0.024257335369379907,
+      "learning_rate": 0.0006719534133957237,
+      "loss": 0.5904,
+      "step": 17705
+    },
+    {
+      "epoch": 0.8990874592275767,
+      "grad_norm": 0.8407754555100807,
+      "learning_rate": 0.0006717453828058655,
+      "loss": 0.5799,
+      "step": 17710
+    },
+    {
+      "epoch": 0.8993412953256082,
+      "grad_norm": 0.03889746059406645,
+      "learning_rate": 0.0006715373185032452,
+      "loss": 0.5636,
+      "step": 17715
+    },
+    {
+      "epoch": 0.8995951314236398,
+      "grad_norm": 0.03812093829559977,
+      "learning_rate": 0.0006713292205287047,
+      "loss": 0.5268,
+      "step": 17720
+    },
+    {
+      "epoch": 0.8998489675216712,
+      "grad_norm": 0.031502538038959534,
+      "learning_rate": 0.0006711210889230926,
+      "loss": 0.5409,
+      "step": 17725
+    },
+    {
+      "epoch": 0.9001028036197027,
+      "grad_norm": 0.04019619925278292,
+      "learning_rate": 0.0006709129237272642,
+      "loss": 0.5921,
+      "step": 17730
+    },
+    {
+      "epoch": 0.9003566397177343,
+      "grad_norm": 0.029122633192066108,
+      "learning_rate": 0.0006707047249820813,
+      "loss": 0.5613,
+      "step": 17735
+    },
+    {
+      "epoch": 0.9006104758157658,
+      "grad_norm": 0.04486417536535739,
+      "learning_rate": 0.0006704964927284119,
+      "loss": 0.5838,
+      "step": 17740
+    },
+    {
+      "epoch": 0.9008643119137972,
+      "grad_norm": 0.035238015488419025,
+      "learning_rate": 0.0006702882270071313,
+      "loss": 0.56,
+      "step": 17745
+    },
+    {
+      "epoch": 0.9011181480118288,
+      "grad_norm": 0.047574854581445294,
+      "learning_rate": 0.0006700799278591212,
+      "loss": 0.5946,
+      "step": 17750
+    },
+    {
+      "epoch": 0.9013719841098603,
+      "grad_norm": 0.029209760126982294,
+      "learning_rate": 0.0006698715953252693,
+      "loss": 0.5785,
+      "step": 17755
+    },
+    {
+      "epoch": 0.9016258202078917,
+      "grad_norm": 0.04635207472974123,
+      "learning_rate": 0.0006696632294464704,
+      "loss": 0.6096,
+      "step": 17760
+    },
+    {
+      "epoch": 0.9018796563059233,
+      "grad_norm": 0.04669566058976855,
+      "learning_rate": 0.0006694548302636256,
+      "loss": 0.5996,
+      "step": 17765
+    },
+    {
+      "epoch": 0.9021334924039548,
+      "grad_norm": 0.047565163368266106,
+      "learning_rate": 0.0006692463978176428,
+      "loss": 0.5887,
+      "step": 17770
+    },
+    {
+      "epoch": 0.9023873285019862,
+      "grad_norm": 0.04629943424684873,
+      "learning_rate": 0.0006690379321494361,
+      "loss": 0.5573,
+      "step": 17775
+    },
+    {
+      "epoch": 0.9026411646000178,
+      "grad_norm": 0.02436241562205948,
+      "learning_rate": 0.0006688294332999263,
+      "loss": 0.5669,
+      "step": 17780
+    },
+    {
+      "epoch": 0.9028950006980493,
+      "grad_norm": 0.04739393205079893,
+      "learning_rate": 0.0006686209013100407,
+      "loss": 0.5763,
+      "step": 17785
+    },
+    {
+      "epoch": 0.9031488367960808,
+      "grad_norm": 0.06024300726679382,
+      "learning_rate": 0.0006684123362207131,
+      "loss": 0.5468,
+      "step": 17790
+    },
+    {
+      "epoch": 0.9034026728941122,
+      "grad_norm": 0.024550757216584715,
+      "learning_rate": 0.0006682037380728839,
+      "loss": 0.5404,
+      "step": 17795
+    },
+    {
+      "epoch": 0.9036565089921438,
+      "grad_norm": 0.029738050538604267,
+      "learning_rate": 0.0006679951069074995,
+      "loss": 0.5511,
+      "step": 17800
+    },
+    {
+      "epoch": 0.9039103450901753,
+      "grad_norm": 0.031231955912328697,
+      "learning_rate": 0.0006677864427655135,
+      "loss": 0.5778,
+      "step": 17805
+    },
+    {
+      "epoch": 0.9041641811882067,
+      "grad_norm": 0.12238924641223849,
+      "learning_rate": 0.0006675777456878855,
+      "loss": 0.5493,
+      "step": 17810
+    },
+    {
+      "epoch": 0.9044180172862383,
+      "grad_norm": 0.04489807461665631,
+      "learning_rate": 0.0006673690157155818,
+      "loss": 0.5639,
+      "step": 17815
+    },
+    {
+      "epoch": 0.9046718533842698,
+      "grad_norm": 0.040984958114977495,
+      "learning_rate": 0.000667160252889575,
+      "loss": 0.5693,
+      "step": 17820
+    },
+    {
+      "epoch": 0.9049256894823012,
+      "grad_norm": 0.0299007750606263,
+      "learning_rate": 0.0006669514572508441,
+      "loss": 0.5359,
+      "step": 17825
+    },
+    {
+      "epoch": 0.9051795255803328,
+      "grad_norm": 0.02649079224122146,
+      "learning_rate": 0.0006667426288403749,
+      "loss": 0.5571,
+      "step": 17830
+    },
+    {
+      "epoch": 0.9054333616783643,
+      "grad_norm": 0.0491849427073363,
+      "learning_rate": 0.000666533767699159,
+      "loss": 0.5594,
+      "step": 17835
+    },
+    {
+      "epoch": 0.9056871977763958,
+      "grad_norm": 0.028577497869547455,
+      "learning_rate": 0.0006663248738681951,
+      "loss": 0.5776,
+      "step": 17840
+    },
+    {
+      "epoch": 0.9059410338744273,
+      "grad_norm": 0.029544273344263332,
+      "learning_rate": 0.0006661159473884879,
+      "loss": 0.5156,
+      "step": 17845
+    },
+    {
+      "epoch": 0.9061948699724588,
+      "grad_norm": 0.07785921945262643,
+      "learning_rate": 0.0006659069883010487,
+      "loss": 0.6146,
+      "step": 17850
+    },
+    {
+      "epoch": 0.9064487060704903,
+      "grad_norm": 0.04916615480146852,
+      "learning_rate": 0.0006656979966468949,
+      "loss": 0.621,
+      "step": 17855
+    },
+    {
+      "epoch": 0.9067025421685218,
+      "grad_norm": 0.04266402533079821,
+      "learning_rate": 0.0006654889724670509,
+      "loss": 0.5852,
+      "step": 17860
+    },
+    {
+      "epoch": 0.9069563782665533,
+      "grad_norm": 0.03478646050112459,
+      "learning_rate": 0.0006652799158025466,
+      "loss": 0.5586,
+      "step": 17865
+    },
+    {
+      "epoch": 0.9072102143645848,
+      "grad_norm": 0.03269211204236239,
+      "learning_rate": 0.0006650708266944194,
+      "loss": 0.5417,
+      "step": 17870
+    },
+    {
+      "epoch": 0.9074640504626162,
+      "grad_norm": 0.05926828968618208,
+      "learning_rate": 0.000664861705183712,
+      "loss": 0.5573,
+      "step": 17875
+    },
+    {
+      "epoch": 0.9077178865606478,
+      "grad_norm": 1.137998555724827,
+      "learning_rate": 0.0006646525513114741,
+      "loss": 0.7911,
+      "step": 17880
+    },
+    {
+      "epoch": 0.9079717226586793,
+      "grad_norm": 0.15186991344137019,
+      "learning_rate": 0.0006644433651187613,
+      "loss": 0.6537,
+      "step": 17885
+    },
+    {
+      "epoch": 0.9082255587567107,
+      "grad_norm": 0.10837397788584424,
+      "learning_rate": 0.0006642341466466363,
+      "loss": 0.5914,
+      "step": 17890
+    },
+    {
+      "epoch": 0.9084793948547423,
+      "grad_norm": 0.04567223182586979,
+      "learning_rate": 0.0006640248959361671,
+      "loss": 0.5919,
+      "step": 17895
+    },
+    {
+      "epoch": 0.9087332309527738,
+      "grad_norm": 0.0424493699036947,
+      "learning_rate": 0.000663815613028429,
+      "loss": 0.5887,
+      "step": 17900
+    },
+    {
+      "epoch": 0.9089870670508053,
+      "grad_norm": 0.02914392670442529,
+      "learning_rate": 0.0006636062979645029,
+      "loss": 0.6148,
+      "step": 17905
+    },
+    {
+      "epoch": 0.9092409031488368,
+      "grad_norm": 0.03378335007634043,
+      "learning_rate": 0.0006633969507854764,
+      "loss": 0.5855,
+      "step": 17910
+    },
+    {
+      "epoch": 0.9094947392468683,
+      "grad_norm": 0.045172687759939864,
+      "learning_rate": 0.0006631875715324433,
+      "loss": 0.5607,
+      "step": 17915
+    },
+    {
+      "epoch": 0.9097485753448998,
+      "grad_norm": 0.03534790871616725,
+      "learning_rate": 0.0006629781602465039,
+      "loss": 0.5642,
+      "step": 17920
+    },
+    {
+      "epoch": 0.9100024114429313,
+      "grad_norm": 0.031709338585968895,
+      "learning_rate": 0.0006627687169687643,
+      "loss": 0.5685,
+      "step": 17925
+    },
+    {
+      "epoch": 0.9102562475409628,
+      "grad_norm": 0.02405831853389006,
+      "learning_rate": 0.0006625592417403372,
+      "loss": 0.5837,
+      "step": 17930
+    },
+    {
+      "epoch": 0.9105100836389943,
+      "grad_norm": 0.02887059553974142,
+      "learning_rate": 0.0006623497346023419,
+      "loss": 0.6114,
+      "step": 17935
+    },
+    {
+      "epoch": 0.9107639197370258,
+      "grad_norm": 0.026405583798025718,
+      "learning_rate": 0.0006621401955959029,
+      "loss": 0.5811,
+      "step": 17940
+    },
+    {
+      "epoch": 0.9110177558350573,
+      "grad_norm": 0.030545139437636752,
+      "learning_rate": 0.0006619306247621525,
+      "loss": 0.5621,
+      "step": 17945
+    },
+    {
+      "epoch": 0.9112715919330888,
+      "grad_norm": 0.03844677188791095,
+      "learning_rate": 0.0006617210221422278,
+      "loss": 0.5567,
+      "step": 17950
+    },
+    {
+      "epoch": 0.9115254280311204,
+      "grad_norm": 0.025348216932646248,
+      "learning_rate": 0.0006615113877772729,
+      "loss": 0.5636,
+      "step": 17955
+    },
+    {
+      "epoch": 0.9117792641291518,
+      "grad_norm": 0.02747141569387044,
+      "learning_rate": 0.0006613017217084382,
+      "loss": 0.58,
+      "step": 17960
+    },
+    {
+      "epoch": 0.9120331002271833,
+      "grad_norm": 0.02571958303044386,
+      "learning_rate": 0.00066109202397688,
+      "loss": 0.569,
+      "step": 17965
+    },
+    {
+      "epoch": 0.9122869363252148,
+      "grad_norm": 0.02567934522052075,
+      "learning_rate": 0.0006608822946237607,
+      "loss": 0.5563,
+      "step": 17970
+    },
+    {
+      "epoch": 0.9125407724232463,
+      "grad_norm": 0.02982421448410097,
+      "learning_rate": 0.0006606725336902493,
+      "loss": 0.5524,
+      "step": 17975
+    },
+    {
+      "epoch": 0.9127946085212778,
+      "grad_norm": 0.02603891319299408,
+      "learning_rate": 0.0006604627412175209,
+      "loss": 0.5962,
+      "step": 17980
+    },
+    {
+      "epoch": 0.9130484446193093,
+      "grad_norm": 0.030676874285685652,
+      "learning_rate": 0.0006602529172467564,
+      "loss": 0.5556,
+      "step": 17985
+    },
+    {
+      "epoch": 0.9133022807173408,
+      "grad_norm": 0.028691123229749194,
+      "learning_rate": 0.0006600430618191436,
+      "loss": 0.559,
+      "step": 17990
+    },
+    {
+      "epoch": 0.9135561168153723,
+      "grad_norm": 0.022843921568002558,
+      "learning_rate": 0.0006598331749758759,
+      "loss": 0.5935,
+      "step": 17995
+    },
+    {
+      "epoch": 0.9138099529134038,
+      "grad_norm": 0.026576251107490675,
+      "learning_rate": 0.0006596232567581531,
+      "loss": 0.5514,
+      "step": 18000
+    },
+    {
+      "epoch": 0.9140637890114354,
+      "grad_norm": 0.029295378037257274,
+      "learning_rate": 0.0006594133072071809,
+      "loss": 0.5623,
+      "step": 18005
+    },
+    {
+      "epoch": 0.9143176251094668,
+      "grad_norm": 0.026342637202697045,
+      "learning_rate": 0.0006592033263641715,
+      "loss": 0.5377,
+      "step": 18010
+    },
+    {
+      "epoch": 0.9145714612074983,
+      "grad_norm": 0.0268948168872539,
+      "learning_rate": 0.000658993314270343,
+      "loss": 0.5734,
+      "step": 18015
+    },
+    {
+      "epoch": 0.9148252973055299,
+      "grad_norm": 0.03228314048721056,
+      "learning_rate": 0.00065878327096692,
+      "loss": 0.5695,
+      "step": 18020
+    },
+    {
+      "epoch": 0.9150791334035613,
+      "grad_norm": 0.03776376655159827,
+      "learning_rate": 0.0006585731964951327,
+      "loss": 0.5676,
+      "step": 18025
+    },
+    {
+      "epoch": 0.9153329695015928,
+      "grad_norm": 0.02713281422594516,
+      "learning_rate": 0.0006583630908962178,
+      "loss": 0.6121,
+      "step": 18030
+    },
+    {
+      "epoch": 0.9155868055996244,
+      "grad_norm": 0.0313761277274463,
+      "learning_rate": 0.0006581529542114178,
+      "loss": 0.5656,
+      "step": 18035
+    },
+    {
+      "epoch": 0.9158406416976558,
+      "grad_norm": 0.023551377190705478,
+      "learning_rate": 0.0006579427864819817,
+      "loss": 0.5859,
+      "step": 18040
+    },
+    {
+      "epoch": 0.9160944777956873,
+      "grad_norm": 0.03213179076154911,
+      "learning_rate": 0.0006577325877491641,
+      "loss": 0.5385,
+      "step": 18045
+    },
+    {
+      "epoch": 0.9163483138937188,
+      "grad_norm": 0.03309679333832961,
+      "learning_rate": 0.0006575223580542263,
+      "loss": 0.5686,
+      "step": 18050
+    },
+    {
+      "epoch": 0.9166021499917504,
+      "grad_norm": 0.02292030032279284,
+      "learning_rate": 0.0006573120974384351,
+      "loss": 0.569,
+      "step": 18055
+    },
+    {
+      "epoch": 0.9168559860897818,
+      "grad_norm": 0.02196534616948093,
+      "learning_rate": 0.0006571018059430638,
+      "loss": 0.5583,
+      "step": 18060
+    },
+    {
+      "epoch": 0.9171098221878133,
+      "grad_norm": 0.03751186679215072,
+      "learning_rate": 0.0006568914836093913,
+      "loss": 0.5632,
+      "step": 18065
+    },
+    {
+      "epoch": 0.9173636582858449,
+      "grad_norm": 0.039073154648674925,
+      "learning_rate": 0.000656681130478703,
+      "loss": 0.5875,
+      "step": 18070
+    },
+    {
+      "epoch": 0.9176174943838763,
+      "grad_norm": 0.03016782826811619,
+      "learning_rate": 0.0006564707465922901,
+      "loss": 0.5562,
+      "step": 18075
+    },
+    {
+      "epoch": 0.9178713304819078,
+      "grad_norm": 0.44620351828974875,
+      "learning_rate": 0.0006562603319914502,
+      "loss": 0.5836,
+      "step": 18080
+    },
+    {
+      "epoch": 0.9181251665799394,
+      "grad_norm": 0.03733778783819997,
+      "learning_rate": 0.0006560498867174862,
+      "loss": 0.5544,
+      "step": 18085
+    },
+    {
+      "epoch": 0.9183790026779708,
+      "grad_norm": 0.04185835413154692,
+      "learning_rate": 0.0006558394108117078,
+      "loss": 0.5843,
+      "step": 18090
+    },
+    {
+      "epoch": 0.9186328387760023,
+      "grad_norm": 0.02584084106316554,
+      "learning_rate": 0.00065562890431543,
+      "loss": 0.545,
+      "step": 18095
+    },
+    {
+      "epoch": 0.9188866748740339,
+      "grad_norm": 0.026170167620165836,
+      "learning_rate": 0.0006554183672699747,
+      "loss": 0.5684,
+      "step": 18100
+    },
+    {
+      "epoch": 0.9191405109720653,
+      "grad_norm": 0.028617521912761606,
+      "learning_rate": 0.0006552077997166686,
+      "loss": 0.5778,
+      "step": 18105
+    },
+    {
+      "epoch": 0.9193943470700968,
+      "grad_norm": 0.024764375605303438,
+      "learning_rate": 0.0006549972016968457,
+      "loss": 0.5668,
+      "step": 18110
+    },
+    {
+      "epoch": 0.9196481831681284,
+      "grad_norm": 0.04224569793867106,
+      "learning_rate": 0.0006547865732518451,
+      "loss": 0.5563,
+      "step": 18115
+    },
+    {
+      "epoch": 0.9199020192661599,
+      "grad_norm": 0.024423393777903805,
+      "learning_rate": 0.0006545759144230122,
+      "loss": 0.5272,
+      "step": 18120
+    },
+    {
+      "epoch": 0.9201558553641913,
+      "grad_norm": 0.03562323884403274,
+      "learning_rate": 0.0006543652252516978,
+      "loss": 0.5497,
+      "step": 18125
+    },
+    {
+      "epoch": 0.9204096914622228,
+      "grad_norm": 0.03532107692857155,
+      "learning_rate": 0.0006541545057792597,
+      "loss": 0.562,
+      "step": 18130
+    },
+    {
+      "epoch": 0.9206635275602544,
+      "grad_norm": 0.02233461039237326,
+      "learning_rate": 0.0006539437560470609,
+      "loss": 0.5778,
+      "step": 18135
+    },
+    {
+      "epoch": 0.9209173636582858,
+      "grad_norm": 0.036379888044198186,
+      "learning_rate": 0.0006537329760964705,
+      "loss": 0.5385,
+      "step": 18140
+    },
+    {
+      "epoch": 0.9211711997563173,
+      "grad_norm": 0.031306097214398455,
+      "learning_rate": 0.0006535221659688636,
+      "loss": 0.5766,
+      "step": 18145
+    },
+    {
+      "epoch": 0.9214250358543489,
+      "grad_norm": 0.032940875660410714,
+      "learning_rate": 0.0006533113257056212,
+      "loss": 0.5745,
+      "step": 18150
+    },
+    {
+      "epoch": 0.9216788719523803,
+      "grad_norm": 0.028269439007305754,
+      "learning_rate": 0.0006531004553481299,
+      "loss": 0.5593,
+      "step": 18155
+    },
+    {
+      "epoch": 0.9219327080504118,
+      "grad_norm": 0.025325938543204793,
+      "learning_rate": 0.0006528895549377829,
+      "loss": 0.5545,
+      "step": 18160
+    },
+    {
+      "epoch": 0.9221865441484434,
+      "grad_norm": 0.025608019654732625,
+      "learning_rate": 0.0006526786245159785,
+      "loss": 0.5645,
+      "step": 18165
+    },
+    {
+      "epoch": 0.9224403802464749,
+      "grad_norm": 0.03073301426221954,
+      "learning_rate": 0.0006524676641241216,
+      "loss": 0.5729,
+      "step": 18170
+    },
+    {
+      "epoch": 0.9226942163445063,
+      "grad_norm": 0.02655926848711976,
+      "learning_rate": 0.0006522566738036227,
+      "loss": 0.5605,
+      "step": 18175
+    },
+    {
+      "epoch": 0.9229480524425379,
+      "grad_norm": 0.030923379907562828,
+      "learning_rate": 0.0006520456535958981,
+      "loss": 0.5438,
+      "step": 18180
+    },
+    {
+      "epoch": 0.9232018885405694,
+      "grad_norm": 0.02498118812708288,
+      "learning_rate": 0.0006518346035423697,
+      "loss": 0.5618,
+      "step": 18185
+    },
+    {
+      "epoch": 0.9234557246386008,
+      "grad_norm": 0.03195096400109142,
+      "learning_rate": 0.0006516235236844661,
+      "loss": 0.5771,
+      "step": 18190
+    },
+    {
+      "epoch": 0.9237095607366324,
+      "grad_norm": 0.025673518361398005,
+      "learning_rate": 0.0006514124140636206,
+      "loss": 0.5714,
+      "step": 18195
+    },
+    {
+      "epoch": 0.9239633968346639,
+      "grad_norm": 0.023420267058336542,
+      "learning_rate": 0.0006512012747212736,
+      "loss": 0.5543,
+      "step": 18200
+    },
+    {
+      "epoch": 0.9242172329326953,
+      "grad_norm": 0.035974375956175315,
+      "learning_rate": 0.0006509901056988703,
+      "loss": 0.5399,
+      "step": 18205
+    },
+    {
+      "epoch": 0.9244710690307268,
+      "grad_norm": 0.026974281528685886,
+      "learning_rate": 0.0006507789070378623,
+      "loss": 0.5803,
+      "step": 18210
+    },
+    {
+      "epoch": 0.9247249051287584,
+      "grad_norm": 0.03386779705440503,
+      "learning_rate": 0.0006505676787797068,
+      "loss": 0.5573,
+      "step": 18215
+    },
+    {
+      "epoch": 0.9249787412267899,
+      "grad_norm": 0.02994069343757094,
+      "learning_rate": 0.0006503564209658668,
+      "loss": 0.5631,
+      "step": 18220
+    },
+    {
+      "epoch": 0.9252325773248213,
+      "grad_norm": 0.025204798762410694,
+      "learning_rate": 0.0006501451336378111,
+      "loss": 0.5778,
+      "step": 18225
+    },
+    {
+      "epoch": 0.9254864134228529,
+      "grad_norm": 0.02844201792241884,
+      "learning_rate": 0.0006499338168370145,
+      "loss": 0.5494,
+      "step": 18230
+    },
+    {
+      "epoch": 0.9257402495208844,
+      "grad_norm": 0.027272927584755857,
+      "learning_rate": 0.0006497224706049574,
+      "loss": 0.5513,
+      "step": 18235
+    },
+    {
+      "epoch": 0.9259940856189158,
+      "grad_norm": 0.022625222413103843,
+      "learning_rate": 0.000649511094983126,
+      "loss": 0.5454,
+      "step": 18240
+    },
+    {
+      "epoch": 0.9262479217169474,
+      "grad_norm": 0.025965109273752868,
+      "learning_rate": 0.0006492996900130122,
+      "loss": 0.5521,
+      "step": 18245
+    },
+    {
+      "epoch": 0.9265017578149789,
+      "grad_norm": 0.023734960990954648,
+      "learning_rate": 0.0006490882557361138,
+      "loss": 0.569,
+      "step": 18250
+    },
+    {
+      "epoch": 0.9267555939130103,
+      "grad_norm": 0.029694319744587402,
+      "learning_rate": 0.0006488767921939344,
+      "loss": 0.5544,
+      "step": 18255
+    },
+    {
+      "epoch": 0.9270094300110419,
+      "grad_norm": 0.027522631414110266,
+      "learning_rate": 0.0006486652994279832,
+      "loss": 0.5191,
+      "step": 18260
+    },
+    {
+      "epoch": 0.9272632661090734,
+      "grad_norm": 0.0213560260774621,
+      "learning_rate": 0.000648453777479775,
+      "loss": 0.5602,
+      "step": 18265
+    },
+    {
+      "epoch": 0.9275171022071049,
+      "grad_norm": 0.027648563843928633,
+      "learning_rate": 0.0006482422263908305,
+      "loss": 0.5757,
+      "step": 18270
+    },
+    {
+      "epoch": 0.9277709383051364,
+      "grad_norm": 0.024346877510486684,
+      "learning_rate": 0.0006480306462026765,
+      "loss": 0.5502,
+      "step": 18275
+    },
+    {
+      "epoch": 0.9280247744031679,
+      "grad_norm": 0.025987941627693693,
+      "learning_rate": 0.0006478190369568447,
+      "loss": 0.572,
+      "step": 18280
+    },
+    {
+      "epoch": 0.9282786105011994,
+      "grad_norm": 0.03607494520553649,
+      "learning_rate": 0.0006476073986948731,
+      "loss": 0.5474,
+      "step": 18285
+    },
+    {
+      "epoch": 0.9285324465992308,
+      "grad_norm": 0.04133198513293862,
+      "learning_rate": 0.0006473957314583053,
+      "loss": 0.5646,
+      "step": 18290
+    },
+    {
+      "epoch": 0.9287862826972624,
+      "grad_norm": 0.02356878697506822,
+      "learning_rate": 0.0006471840352886906,
+      "loss": 0.5555,
+      "step": 18295
+    },
+    {
+      "epoch": 0.9290401187952939,
+      "grad_norm": 0.026719448119602498,
+      "learning_rate": 0.0006469723102275835,
+      "loss": 0.5478,
+      "step": 18300
+    },
+    {
+      "epoch": 0.9292939548933253,
+      "grad_norm": 0.023824789207588595,
+      "learning_rate": 0.000646760556316545,
+      "loss": 0.5686,
+      "step": 18305
+    },
+    {
+      "epoch": 0.9295477909913569,
+      "grad_norm": 0.02504122878294649,
+      "learning_rate": 0.0006465487735971414,
+      "loss": 0.5822,
+      "step": 18310
+    },
+    {
+      "epoch": 0.9298016270893884,
+      "grad_norm": 0.025465279303564035,
+      "learning_rate": 0.000646336962110944,
+      "loss": 0.5391,
+      "step": 18315
+    },
+    {
+      "epoch": 0.9300554631874198,
+      "grad_norm": 0.024053916899900416,
+      "learning_rate": 0.0006461251218995309,
+      "loss": 0.5812,
+      "step": 18320
+    },
+    {
+      "epoch": 0.9303092992854514,
+      "grad_norm": 0.023988031306641153,
+      "learning_rate": 0.0006459132530044851,
+      "loss": 0.5653,
+      "step": 18325
+    },
+    {
+      "epoch": 0.9305631353834829,
+      "grad_norm": 0.033028824504018735,
+      "learning_rate": 0.0006457013554673954,
+      "loss": 0.529,
+      "step": 18330
+    },
+    {
+      "epoch": 0.9308169714815144,
+      "grad_norm": 0.031328475615483485,
+      "learning_rate": 0.0006454894293298563,
+      "loss": 0.6023,
+      "step": 18335
+    },
+    {
+      "epoch": 0.9310708075795459,
+      "grad_norm": 0.025458288041452327,
+      "learning_rate": 0.0006452774746334677,
+      "loss": 0.5946,
+      "step": 18340
+    },
+    {
+      "epoch": 0.9313246436775774,
+      "grad_norm": 0.029766324738899293,
+      "learning_rate": 0.0006450654914198354,
+      "loss": 0.5466,
+      "step": 18345
+    },
+    {
+      "epoch": 0.9315784797756089,
+      "grad_norm": 0.02517080042746789,
+      "learning_rate": 0.0006448534797305704,
+      "loss": 0.5717,
+      "step": 18350
+    },
+    {
+      "epoch": 0.9318323158736403,
+      "grad_norm": 0.02380537403364461,
+      "learning_rate": 0.0006446414396072899,
+      "loss": 0.5402,
+      "step": 18355
+    },
+    {
+      "epoch": 0.9320861519716719,
+      "grad_norm": 0.03236454209338435,
+      "learning_rate": 0.0006444293710916161,
+      "loss": 0.5468,
+      "step": 18360
+    },
+    {
+      "epoch": 0.9323399880697034,
+      "grad_norm": 0.07009893831871468,
+      "learning_rate": 0.000644217274225177,
+      "loss": 0.5848,
+      "step": 18365
+    },
+    {
+      "epoch": 0.9325938241677348,
+      "grad_norm": 0.03355070794386776,
+      "learning_rate": 0.000644005149049606,
+      "loss": 0.5628,
+      "step": 18370
+    },
+    {
+      "epoch": 0.9328476602657664,
+      "grad_norm": 0.04417035285010824,
+      "learning_rate": 0.0006437929956065426,
+      "loss": 0.5699,
+      "step": 18375
+    },
+    {
+      "epoch": 0.9331014963637979,
+      "grad_norm": 0.025686169691077572,
+      "learning_rate": 0.0006435808139376313,
+      "loss": 0.5644,
+      "step": 18380
+    },
+    {
+      "epoch": 0.9333553324618294,
+      "grad_norm": 0.02456795859417348,
+      "learning_rate": 0.0006433686040845222,
+      "loss": 0.5688,
+      "step": 18385
+    },
+    {
+      "epoch": 0.9336091685598609,
+      "grad_norm": 0.027858399120178644,
+      "learning_rate": 0.0006431563660888711,
+      "loss": 0.5418,
+      "step": 18390
+    },
+    {
+      "epoch": 0.9338630046578924,
+      "grad_norm": 0.031235035810057876,
+      "learning_rate": 0.0006429440999923392,
+      "loss": 0.5544,
+      "step": 18395
+    },
+    {
+      "epoch": 0.9341168407559239,
+      "grad_norm": 0.03109567390073356,
+      "learning_rate": 0.0006427318058365934,
+      "loss": 0.5845,
+      "step": 18400
+    },
+    {
+      "epoch": 0.9343706768539554,
+      "grad_norm": 0.023673410214207512,
+      "learning_rate": 0.0006425194836633058,
+      "loss": 0.586,
+      "step": 18405
+    },
+    {
+      "epoch": 0.9346245129519869,
+      "grad_norm": 0.025651225561305648,
+      "learning_rate": 0.0006423071335141543,
+      "loss": 0.5602,
+      "step": 18410
+    },
+    {
+      "epoch": 0.9348783490500184,
+      "grad_norm": 0.024917392040912964,
+      "learning_rate": 0.0006420947554308223,
+      "loss": 0.5642,
+      "step": 18415
+    },
+    {
+      "epoch": 0.9351321851480499,
+      "grad_norm": 0.027718757843249613,
+      "learning_rate": 0.0006418823494549983,
+      "loss": 0.5735,
+      "step": 18420
+    },
+    {
+      "epoch": 0.9353860212460814,
+      "grad_norm": 0.026445735170235473,
+      "learning_rate": 0.0006416699156283768,
+      "loss": 0.5166,
+      "step": 18425
+    },
+    {
+      "epoch": 0.9356398573441129,
+      "grad_norm": 0.024653454471051918,
+      "learning_rate": 0.0006414574539926574,
+      "loss": 0.5693,
+      "step": 18430
+    },
+    {
+      "epoch": 0.9358936934421445,
+      "grad_norm": 0.035138012803531134,
+      "learning_rate": 0.0006412449645895452,
+      "loss": 0.5398,
+      "step": 18435
+    },
+    {
+      "epoch": 0.9361475295401759,
+      "grad_norm": 0.03579404088252254,
+      "learning_rate": 0.0006410324474607507,
+      "loss": 0.5867,
+      "step": 18440
+    },
+    {
+      "epoch": 0.9364013656382074,
+      "grad_norm": 0.05203695866136957,
+      "learning_rate": 0.0006408199026479901,
+      "loss": 0.5798,
+      "step": 18445
+    },
+    {
+      "epoch": 0.936655201736239,
+      "grad_norm": 0.03418141205359679,
+      "learning_rate": 0.000640607330192985,
+      "loss": 0.589,
+      "step": 18450
+    },
+    {
+      "epoch": 0.9369090378342704,
+      "grad_norm": 0.029127603885871836,
+      "learning_rate": 0.0006403947301374622,
+      "loss": 0.5181,
+      "step": 18455
+    },
+    {
+      "epoch": 0.9371628739323019,
+      "grad_norm": 0.025330901763078596,
+      "learning_rate": 0.000640182102523154,
+      "loss": 0.5358,
+      "step": 18460
+    },
+    {
+      "epoch": 0.9374167100303334,
+      "grad_norm": 0.02939613956496011,
+      "learning_rate": 0.0006399694473917981,
+      "loss": 0.5448,
+      "step": 18465
+    },
+    {
+      "epoch": 0.9376705461283649,
+      "grad_norm": 0.05802483149330994,
+      "learning_rate": 0.0006397567647851377,
+      "loss": 0.6088,
+      "step": 18470
+    },
+    {
+      "epoch": 0.9379243822263964,
+      "grad_norm": 0.05969020572466202,
+      "learning_rate": 0.0006395440547449214,
+      "loss": 0.5606,
+      "step": 18475
+    },
+    {
+      "epoch": 0.9381782183244279,
+      "grad_norm": 0.03090071852022251,
+      "learning_rate": 0.000639331317312903,
+      "loss": 0.5658,
+      "step": 18480
+    },
+    {
+      "epoch": 0.9384320544224595,
+      "grad_norm": 0.026204359425882364,
+      "learning_rate": 0.0006391185525308419,
+      "loss": 0.5685,
+      "step": 18485
+    },
+    {
+      "epoch": 0.9386858905204909,
+      "grad_norm": 0.03498061994333065,
+      "learning_rate": 0.0006389057604405027,
+      "loss": 0.5711,
+      "step": 18490
+    },
+    {
+      "epoch": 0.9389397266185224,
+      "grad_norm": 0.02717442339063079,
+      "learning_rate": 0.0006386929410836555,
+      "loss": 0.5629,
+      "step": 18495
+    },
+    {
+      "epoch": 0.939193562716554,
+      "grad_norm": 0.02682126084664184,
+      "learning_rate": 0.0006384800945020755,
+      "loss": 0.5369,
+      "step": 18500
+    },
+    {
+      "epoch": 0.9394473988145854,
+      "grad_norm": 0.021417798921295972,
+      "learning_rate": 0.0006382672207375438,
+      "loss": 0.516,
+      "step": 18505
+    },
+    {
+      "epoch": 0.9397012349126169,
+      "grad_norm": 0.0335394781975359,
+      "learning_rate": 0.000638054319831846,
+      "loss": 0.5536,
+      "step": 18510
+    },
+    {
+      "epoch": 0.9399550710106485,
+      "grad_norm": 0.033542368843596015,
+      "learning_rate": 0.0006378413918267737,
+      "loss": 0.5476,
+      "step": 18515
+    },
+    {
+      "epoch": 0.9402089071086799,
+      "grad_norm": 0.03314207140461258,
+      "learning_rate": 0.0006376284367641237,
+      "loss": 0.5405,
+      "step": 18520
+    },
+    {
+      "epoch": 0.9404627432067114,
+      "grad_norm": 0.025257880645441717,
+      "learning_rate": 0.0006374154546856978,
+      "loss": 0.5727,
+      "step": 18525
+    },
+    {
+      "epoch": 0.940716579304743,
+      "grad_norm": 0.03153823246454771,
+      "learning_rate": 0.0006372024456333034,
+      "loss": 0.576,
+      "step": 18530
+    },
+    {
+      "epoch": 0.9409704154027744,
+      "grad_norm": 0.02414021611137436,
+      "learning_rate": 0.0006369894096487533,
+      "loss": 0.5608,
+      "step": 18535
+    },
+    {
+      "epoch": 0.9412242515008059,
+      "grad_norm": 0.05820198130239536,
+      "learning_rate": 0.0006367763467738652,
+      "loss": 0.5498,
+      "step": 18540
+    },
+    {
+      "epoch": 0.9414780875988374,
+      "grad_norm": 0.024189243271371962,
+      "learning_rate": 0.0006365632570504622,
+      "loss": 0.5532,
+      "step": 18545
+    },
+    {
+      "epoch": 0.941731923696869,
+      "grad_norm": 0.025398202281715243,
+      "learning_rate": 0.000636350140520373,
+      "loss": 0.5623,
+      "step": 18550
+    },
+    {
+      "epoch": 0.9419857597949004,
+      "grad_norm": 0.03514525492016054,
+      "learning_rate": 0.0006361369972254313,
+      "loss": 0.5561,
+      "step": 18555
+    },
+    {
+      "epoch": 0.9422395958929319,
+      "grad_norm": 0.027198150257000973,
+      "learning_rate": 0.0006359238272074757,
+      "loss": 0.5701,
+      "step": 18560
+    },
+    {
+      "epoch": 0.9424934319909635,
+      "grad_norm": 0.03242882219511253,
+      "learning_rate": 0.0006357106305083509,
+      "loss": 0.5913,
+      "step": 18565
+    },
+    {
+      "epoch": 0.9427472680889949,
+      "grad_norm": 0.030898225159171074,
+      "learning_rate": 0.000635497407169906,
+      "loss": 0.5652,
+      "step": 18570
+    },
+    {
+      "epoch": 0.9430011041870264,
+      "grad_norm": 0.023609900849047016,
+      "learning_rate": 0.0006352841572339957,
+      "loss": 0.5138,
+      "step": 18575
+    },
+    {
+      "epoch": 0.943254940285058,
+      "grad_norm": 0.022870274202999735,
+      "learning_rate": 0.0006350708807424803,
+      "loss": 0.5139,
+      "step": 18580
+    },
+    {
+      "epoch": 0.9435087763830894,
+      "grad_norm": 0.029704150434133162,
+      "learning_rate": 0.0006348575777372244,
+      "loss": 0.5581,
+      "step": 18585
+    },
+    {
+      "epoch": 0.9437626124811209,
+      "grad_norm": 0.023261609889708712,
+      "learning_rate": 0.0006346442482600986,
+      "loss": 0.5341,
+      "step": 18590
+    },
+    {
+      "epoch": 0.9440164485791525,
+      "grad_norm": 0.03251302144724121,
+      "learning_rate": 0.0006344308923529784,
+      "loss": 0.5377,
+      "step": 18595
+    },
+    {
+      "epoch": 0.944270284677184,
+      "grad_norm": 0.04594564704845458,
+      "learning_rate": 0.0006342175100577443,
+      "loss": 0.5423,
+      "step": 18600
+    },
+    {
+      "epoch": 0.9445241207752154,
+      "grad_norm": 0.03186429074273563,
+      "learning_rate": 0.0006340041014162822,
+      "loss": 0.5286,
+      "step": 18605
+    },
+    {
+      "epoch": 0.944777956873247,
+      "grad_norm": 0.025609007700680392,
+      "learning_rate": 0.0006337906664704836,
+      "loss": 0.5456,
+      "step": 18610
+    },
+    {
+      "epoch": 0.9450317929712785,
+      "grad_norm": 0.024349448360170327,
+      "learning_rate": 0.0006335772052622441,
+      "loss": 0.5522,
+      "step": 18615
+    },
+    {
+      "epoch": 0.9452856290693099,
+      "grad_norm": 0.03254833741297758,
+      "learning_rate": 0.0006333637178334655,
+      "loss": 0.5441,
+      "step": 18620
+    },
+    {
+      "epoch": 0.9455394651673414,
+      "grad_norm": 0.05909130904483525,
+      "learning_rate": 0.0006331502042260541,
+      "loss": 0.5259,
+      "step": 18625
+    },
+    {
+      "epoch": 0.945793301265373,
+      "grad_norm": 0.05134549162232977,
+      "learning_rate": 0.0006329366644819217,
+      "loss": 0.5649,
+      "step": 18630
+    },
+    {
+      "epoch": 0.9460471373634044,
+      "grad_norm": 0.037053648813884996,
+      "learning_rate": 0.0006327230986429849,
+      "loss": 0.5742,
+      "step": 18635
+    },
+    {
+      "epoch": 0.9463009734614359,
+      "grad_norm": 0.03129953084498823,
+      "learning_rate": 0.0006325095067511658,
+      "loss": 0.5378,
+      "step": 18640
+    },
+    {
+      "epoch": 0.9465548095594675,
+      "grad_norm": 0.03654618370634232,
+      "learning_rate": 0.0006322958888483914,
+      "loss": 0.5617,
+      "step": 18645
+    },
+    {
+      "epoch": 0.946808645657499,
+      "grad_norm": 0.02422795139105715,
+      "learning_rate": 0.0006320822449765937,
+      "loss": 0.5494,
+      "step": 18650
+    },
+    {
+      "epoch": 0.9470624817555304,
+      "grad_norm": 0.02329083288512449,
+      "learning_rate": 0.00063186857517771,
+      "loss": 0.5506,
+      "step": 18655
+    },
+    {
+      "epoch": 0.947316317853562,
+      "grad_norm": 0.03354540198025808,
+      "learning_rate": 0.0006316548794936827,
+      "loss": 0.5735,
+      "step": 18660
+    },
+    {
+      "epoch": 0.9475701539515935,
+      "grad_norm": 0.027348158991933078,
+      "learning_rate": 0.0006314411579664591,
+      "loss": 0.5586,
+      "step": 18665
+    },
+    {
+      "epoch": 0.9478239900496249,
+      "grad_norm": 0.023645544124025912,
+      "learning_rate": 0.0006312274106379916,
+      "loss": 0.5575,
+      "step": 18670
+    },
+    {
+      "epoch": 0.9480778261476565,
+      "grad_norm": 0.04599714324805517,
+      "learning_rate": 0.0006310136375502379,
+      "loss": 0.5205,
+      "step": 18675
+    },
+    {
+      "epoch": 0.948331662245688,
+      "grad_norm": 0.02201034707176053,
+      "learning_rate": 0.0006307998387451604,
+      "loss": 0.5386,
+      "step": 18680
+    },
+    {
+      "epoch": 0.9485854983437194,
+      "grad_norm": 0.02399542893990445,
+      "learning_rate": 0.0006305860142647269,
+      "loss": 0.5139,
+      "step": 18685
+    },
+    {
+      "epoch": 0.948839334441751,
+      "grad_norm": 0.04397570627326147,
+      "learning_rate": 0.0006303721641509101,
+      "loss": 0.5376,
+      "step": 18690
+    },
+    {
+      "epoch": 0.9490931705397825,
+      "grad_norm": 0.025883089049502404,
+      "learning_rate": 0.0006301582884456877,
+      "loss": 0.5458,
+      "step": 18695
+    },
+    {
+      "epoch": 0.949347006637814,
+      "grad_norm": 0.026996067831657474,
+      "learning_rate": 0.0006299443871910423,
+      "loss": 0.5854,
+      "step": 18700
+    },
+    {
+      "epoch": 0.9496008427358454,
+      "grad_norm": 0.02942829816892566,
+      "learning_rate": 0.0006297304604289618,
+      "loss": 0.5797,
+      "step": 18705
+    },
+    {
+      "epoch": 0.949854678833877,
+      "grad_norm": 0.024736740711198107,
+      "learning_rate": 0.0006295165082014387,
+      "loss": 0.5299,
+      "step": 18710
+    },
+    {
+      "epoch": 0.9501085149319085,
+      "grad_norm": 0.02630491688826239,
+      "learning_rate": 0.0006293025305504712,
+      "loss": 0.5528,
+      "step": 18715
+    },
+    {
+      "epoch": 0.9503623510299399,
+      "grad_norm": 0.028244133702288142,
+      "learning_rate": 0.0006290885275180615,
+      "loss": 0.5188,
+      "step": 18720
+    },
+    {
+      "epoch": 0.9506161871279715,
+      "grad_norm": 0.02421009162431851,
+      "learning_rate": 0.0006288744991462177,
+      "loss": 0.5593,
+      "step": 18725
+    },
+    {
+      "epoch": 0.950870023226003,
+      "grad_norm": 0.023551000998892246,
+      "learning_rate": 0.0006286604454769526,
+      "loss": 0.5521,
+      "step": 18730
+    },
+    {
+      "epoch": 0.9511238593240344,
+      "grad_norm": 0.024350962137287317,
+      "learning_rate": 0.0006284463665522835,
+      "loss": 0.59,
+      "step": 18735
+    },
+    {
+      "epoch": 0.951377695422066,
+      "grad_norm": 0.08344993588030145,
+      "learning_rate": 0.0006282322624142332,
+      "loss": 0.571,
+      "step": 18740
+    },
+    {
+      "epoch": 0.9516315315200975,
+      "grad_norm": 0.031834745293134896,
+      "learning_rate": 0.0006280181331048293,
+      "loss": 0.5855,
+      "step": 18745
+    },
+    {
+      "epoch": 0.9518853676181289,
+      "grad_norm": 0.025834530453184186,
+      "learning_rate": 0.0006278039786661042,
+      "loss": 0.5814,
+      "step": 18750
+    },
+    {
+      "epoch": 0.9521392037161605,
+      "grad_norm": 0.027296230433010688,
+      "learning_rate": 0.0006275897991400956,
+      "loss": 0.5759,
+      "step": 18755
+    },
+    {
+      "epoch": 0.952393039814192,
+      "grad_norm": 0.06183118735204768,
+      "learning_rate": 0.0006273755945688458,
+      "loss": 0.5715,
+      "step": 18760
+    },
+    {
+      "epoch": 0.9526468759122235,
+      "grad_norm": 0.026071457227966246,
+      "learning_rate": 0.0006271613649944019,
+      "loss": 0.5506,
+      "step": 18765
+    },
+    {
+      "epoch": 0.952900712010255,
+      "grad_norm": 0.05151300719134269,
+      "learning_rate": 0.000626947110458816,
+      "loss": 0.5903,
+      "step": 18770
+    },
+    {
+      "epoch": 0.9531545481082865,
+      "grad_norm": 0.029346199524142243,
+      "learning_rate": 0.0006267328310041457,
+      "loss": 0.5632,
+      "step": 18775
+    },
+    {
+      "epoch": 0.953408384206318,
+      "grad_norm": 0.026410770850243227,
+      "learning_rate": 0.0006265185266724526,
+      "loss": 0.5699,
+      "step": 18780
+    },
+    {
+      "epoch": 0.9536622203043494,
+      "grad_norm": 0.049041169261654256,
+      "learning_rate": 0.0006263041975058035,
+      "loss": 0.5605,
+      "step": 18785
+    },
+    {
+      "epoch": 0.953916056402381,
+      "grad_norm": 0.0363002561476258,
+      "learning_rate": 0.0006260898435462705,
+      "loss": 0.5506,
+      "step": 18790
+    },
+    {
+      "epoch": 0.9541698925004125,
+      "grad_norm": 0.022941890322864124,
+      "learning_rate": 0.0006258754648359301,
+      "loss": 0.5471,
+      "step": 18795
+    },
+    {
+      "epoch": 0.9544237285984439,
+      "grad_norm": 0.025645292089913767,
+      "learning_rate": 0.0006256610614168634,
+      "loss": 0.5604,
+      "step": 18800
+    },
+    {
+      "epoch": 0.9546775646964755,
+      "grad_norm": 0.022494488082290532,
+      "learning_rate": 0.0006254466333311573,
+      "loss": 0.5778,
+      "step": 18805
+    },
+    {
+      "epoch": 0.954931400794507,
+      "grad_norm": 0.02606808639822887,
+      "learning_rate": 0.0006252321806209024,
+      "loss": 0.6058,
+      "step": 18810
+    },
+    {
+      "epoch": 0.9551852368925385,
+      "grad_norm": 0.025745633241197222,
+      "learning_rate": 0.0006250177033281952,
+      "loss": 0.5883,
+      "step": 18815
+    },
+    {
+      "epoch": 0.95543907299057,
+      "grad_norm": 0.033669145164535265,
+      "learning_rate": 0.0006248032014951363,
+      "loss": 0.5292,
+      "step": 18820
+    },
+    {
+      "epoch": 0.9556929090886015,
+      "grad_norm": 0.03772741877989379,
+      "learning_rate": 0.0006245886751638312,
+      "loss": 0.5183,
+      "step": 18825
+    },
+    {
+      "epoch": 0.955946745186633,
+      "grad_norm": 0.03475199734255405,
+      "learning_rate": 0.0006243741243763906,
+      "loss": 0.5497,
+      "step": 18830
+    },
+    {
+      "epoch": 0.9562005812846645,
+      "grad_norm": 0.028007500754168836,
+      "learning_rate": 0.0006241595491749297,
+      "loss": 0.5348,
+      "step": 18835
+    },
+    {
+      "epoch": 0.956454417382696,
+      "grad_norm": 0.03964976547186352,
+      "learning_rate": 0.0006239449496015684,
+      "loss": 0.5696,
+      "step": 18840
+    },
+    {
+      "epoch": 0.9567082534807275,
+      "grad_norm": 0.029844026908865874,
+      "learning_rate": 0.0006237303256984315,
+      "loss": 0.5383,
+      "step": 18845
+    },
+    {
+      "epoch": 0.956962089578759,
+      "grad_norm": 0.03293271336871704,
+      "learning_rate": 0.0006235156775076488,
+      "loss": 0.5653,
+      "step": 18850
+    },
+    {
+      "epoch": 0.9572159256767905,
+      "grad_norm": 0.023764813247282635,
+      "learning_rate": 0.0006233010050713546,
+      "loss": 0.5474,
+      "step": 18855
+    },
+    {
+      "epoch": 0.957469761774822,
+      "grad_norm": 0.02729058413216523,
+      "learning_rate": 0.0006230863084316879,
+      "loss": 0.546,
+      "step": 18860
+    },
+    {
+      "epoch": 0.9577235978728535,
+      "grad_norm": 0.03048111721240167,
+      "learning_rate": 0.0006228715876307928,
+      "loss": 0.5302,
+      "step": 18865
+    },
+    {
+      "epoch": 0.957977433970885,
+      "grad_norm": 0.024821623443136005,
+      "learning_rate": 0.0006226568427108177,
+      "loss": 0.5267,
+      "step": 18870
+    },
+    {
+      "epoch": 0.9582312700689165,
+      "grad_norm": 0.02171005251341012,
+      "learning_rate": 0.0006224420737139161,
+      "loss": 0.5686,
+      "step": 18875
+    },
+    {
+      "epoch": 0.958485106166948,
+      "grad_norm": 0.02438019118654564,
+      "learning_rate": 0.0006222272806822463,
+      "loss": 0.5566,
+      "step": 18880
+    },
+    {
+      "epoch": 0.9587389422649795,
+      "grad_norm": 0.0223542750232972,
+      "learning_rate": 0.0006220124636579704,
+      "loss": 0.5438,
+      "step": 18885
+    },
+    {
+      "epoch": 0.958992778363011,
+      "grad_norm": 0.03015303343884345,
+      "learning_rate": 0.0006217976226832565,
+      "loss": 0.5771,
+      "step": 18890
+    },
+    {
+      "epoch": 0.9592466144610425,
+      "grad_norm": 0.025825991972978846,
+      "learning_rate": 0.0006215827578002768,
+      "loss": 0.5591,
+      "step": 18895
+    },
+    {
+      "epoch": 0.959500450559074,
+      "grad_norm": 0.03547031836750829,
+      "learning_rate": 0.0006213678690512081,
+      "loss": 0.5608,
+      "step": 18900
+    },
+    {
+      "epoch": 0.9597542866571055,
+      "grad_norm": 0.020995458669697365,
+      "learning_rate": 0.0006211529564782319,
+      "loss": 0.5428,
+      "step": 18905
+    },
+    {
+      "epoch": 0.960008122755137,
+      "grad_norm": 0.02476217559421539,
+      "learning_rate": 0.0006209380201235345,
+      "loss": 0.5592,
+      "step": 18910
+    },
+    {
+      "epoch": 0.9602619588531686,
+      "grad_norm": 0.023830491201295642,
+      "learning_rate": 0.000620723060029307,
+      "loss": 0.5601,
+      "step": 18915
+    },
+    {
+      "epoch": 0.9605157949512,
+      "grad_norm": 0.024232475162824736,
+      "learning_rate": 0.0006205080762377446,
+      "loss": 0.5588,
+      "step": 18920
+    },
+    {
+      "epoch": 0.9607696310492315,
+      "grad_norm": 0.02838429347591899,
+      "learning_rate": 0.000620293068791048,
+      "loss": 0.5365,
+      "step": 18925
+    },
+    {
+      "epoch": 0.961023467147263,
+      "grad_norm": 0.022820427870362616,
+      "learning_rate": 0.0006200780377314219,
+      "loss": 0.5594,
+      "step": 18930
+    },
+    {
+      "epoch": 0.9612773032452945,
+      "grad_norm": 0.03755731589801296,
+      "learning_rate": 0.0006198629831010758,
+      "loss": 0.5745,
+      "step": 18935
+    },
+    {
+      "epoch": 0.961531139343326,
+      "grad_norm": 0.024014657854385738,
+      "learning_rate": 0.0006196479049422239,
+      "loss": 0.5418,
+      "step": 18940
+    },
+    {
+      "epoch": 0.9617849754413575,
+      "grad_norm": 0.027051758292076112,
+      "learning_rate": 0.0006194328032970848,
+      "loss": 0.5438,
+      "step": 18945
+    },
+    {
+      "epoch": 0.962038811539389,
+      "grad_norm": 0.023230927228254993,
+      "learning_rate": 0.0006192176782078822,
+      "loss": 0.5285,
+      "step": 18950
+    },
+    {
+      "epoch": 0.9622926476374205,
+      "grad_norm": 0.02453226053942355,
+      "learning_rate": 0.0006190025297168437,
+      "loss": 0.5408,
+      "step": 18955
+    },
+    {
+      "epoch": 0.962546483735452,
+      "grad_norm": 0.28109523504650163,
+      "learning_rate": 0.0006187873578662024,
+      "loss": 0.5413,
+      "step": 18960
+    },
+    {
+      "epoch": 0.9628003198334835,
+      "grad_norm": 0.03123756702410563,
+      "learning_rate": 0.0006185721626981949,
+      "loss": 0.5555,
+      "step": 18965
+    },
+    {
+      "epoch": 0.963054155931515,
+      "grad_norm": 0.031041861048312636,
+      "learning_rate": 0.0006183569442550633,
+      "loss": 0.6098,
+      "step": 18970
+    },
+    {
+      "epoch": 0.9633079920295465,
+      "grad_norm": 0.031099393750686002,
+      "learning_rate": 0.0006181417025790536,
+      "loss": 0.5361,
+      "step": 18975
+    },
+    {
+      "epoch": 0.9635618281275781,
+      "grad_norm": 0.02595781815514382,
+      "learning_rate": 0.000617926437712417,
+      "loss": 0.5539,
+      "step": 18980
+    },
+    {
+      "epoch": 0.9638156642256095,
+      "grad_norm": 0.03639463677804149,
+      "learning_rate": 0.0006177111496974087,
+      "loss": 0.5472,
+      "step": 18985
+    },
+    {
+      "epoch": 0.964069500323641,
+      "grad_norm": 0.02606363135734158,
+      "learning_rate": 0.0006174958385762888,
+      "loss": 0.5701,
+      "step": 18990
+    },
+    {
+      "epoch": 0.9643233364216726,
+      "grad_norm": 0.04038231790604605,
+      "learning_rate": 0.0006172805043913218,
+      "loss": 0.5584,
+      "step": 18995
+    },
+    {
+      "epoch": 0.964577172519704,
+      "grad_norm": 0.03184750465530715,
+      "learning_rate": 0.0006170651471847766,
+      "loss": 0.5571,
+      "step": 19000
+    },
+    {
+      "epoch": 0.9648310086177355,
+      "grad_norm": 0.03705308485544799,
+      "learning_rate": 0.0006168497669989268,
+      "loss": 0.5572,
+      "step": 19005
+    },
+    {
+      "epoch": 0.965084844715767,
+      "grad_norm": 0.03454894666757974,
+      "learning_rate": 0.0006166343638760504,
+      "loss": 0.6049,
+      "step": 19010
+    },
+    {
+      "epoch": 0.9653386808137985,
+      "grad_norm": 0.03088369982680049,
+      "learning_rate": 0.0006164189378584301,
+      "loss": 0.5598,
+      "step": 19015
+    },
+    {
+      "epoch": 0.96559251691183,
+      "grad_norm": 0.028758495609164273,
+      "learning_rate": 0.0006162034889883529,
+      "loss": 0.5474,
+      "step": 19020
+    },
+    {
+      "epoch": 0.9658463530098615,
+      "grad_norm": 0.02750682584809894,
+      "learning_rate": 0.0006159880173081103,
+      "loss": 0.542,
+      "step": 19025
+    },
+    {
+      "epoch": 0.9661001891078931,
+      "grad_norm": 0.03970699514356098,
+      "learning_rate": 0.0006157725228599982,
+      "loss": 0.5545,
+      "step": 19030
+    },
+    {
+      "epoch": 0.9663540252059245,
+      "grad_norm": 0.022540600879443772,
+      "learning_rate": 0.0006155570056863175,
+      "loss": 0.5211,
+      "step": 19035
+    },
+    {
+      "epoch": 0.966607861303956,
+      "grad_norm": 0.03517528202934328,
+      "learning_rate": 0.0006153414658293725,
+      "loss": 0.5706,
+      "step": 19040
+    },
+    {
+      "epoch": 0.9668616974019876,
+      "grad_norm": 0.025523416998452415,
+      "learning_rate": 0.0006151259033314733,
+      "loss": 0.5416,
+      "step": 19045
+    },
+    {
+      "epoch": 0.967115533500019,
+      "grad_norm": 0.024970335802673438,
+      "learning_rate": 0.0006149103182349333,
+      "loss": 0.5711,
+      "step": 19050
+    },
+    {
+      "epoch": 0.9673693695980505,
+      "grad_norm": 0.04734193158993932,
+      "learning_rate": 0.0006146947105820709,
+      "loss": 0.5787,
+      "step": 19055
+    },
+    {
+      "epoch": 0.9676232056960821,
+      "grad_norm": 0.025774746375266132,
+      "learning_rate": 0.0006144790804152088,
+      "loss": 0.5752,
+      "step": 19060
+    },
+    {
+      "epoch": 0.9678770417941135,
+      "grad_norm": 0.035809770586720045,
+      "learning_rate": 0.0006142634277766741,
+      "loss": 0.5544,
+      "step": 19065
+    },
+    {
+      "epoch": 0.968130877892145,
+      "grad_norm": 0.022096408564194946,
+      "learning_rate": 0.0006140477527087983,
+      "loss": 0.5594,
+      "step": 19070
+    },
+    {
+      "epoch": 0.9683847139901766,
+      "grad_norm": 0.02285174641983169,
+      "learning_rate": 0.0006138320552539175,
+      "loss": 0.5824,
+      "step": 19075
+    },
+    {
+      "epoch": 0.9686385500882081,
+      "grad_norm": 0.022639632770351623,
+      "learning_rate": 0.000613616335454372,
+      "loss": 0.5582,
+      "step": 19080
+    },
+    {
+      "epoch": 0.9688923861862395,
+      "grad_norm": 0.02407222130765984,
+      "learning_rate": 0.0006134005933525062,
+      "loss": 0.5672,
+      "step": 19085
+    },
+    {
+      "epoch": 0.969146222284271,
+      "grad_norm": 0.02688787209576168,
+      "learning_rate": 0.0006131848289906696,
+      "loss": 0.5335,
+      "step": 19090
+    },
+    {
+      "epoch": 0.9694000583823026,
+      "grad_norm": 0.02409004068147683,
+      "learning_rate": 0.0006129690424112156,
+      "loss": 0.5962,
+      "step": 19095
+    },
+    {
+      "epoch": 0.969653894480334,
+      "grad_norm": 0.024141956964642438,
+      "learning_rate": 0.0006127532336565018,
+      "loss": 0.5666,
+      "step": 19100
+    },
+    {
+      "epoch": 0.9699077305783655,
+      "grad_norm": 0.022675312599314844,
+      "learning_rate": 0.0006125374027688905,
+      "loss": 0.551,
+      "step": 19105
+    },
+    {
+      "epoch": 0.9701615666763971,
+      "grad_norm": 0.037996729698158885,
+      "learning_rate": 0.0006123215497907484,
+      "loss": 0.5652,
+      "step": 19110
+    },
+    {
+      "epoch": 0.9704154027744285,
+      "grad_norm": 0.0272987646077441,
+      "learning_rate": 0.0006121056747644461,
+      "loss": 0.565,
+      "step": 19115
+    },
+    {
+      "epoch": 0.97066923887246,
+      "grad_norm": 0.04267144316368038,
+      "learning_rate": 0.000611889777732359,
+      "loss": 0.5429,
+      "step": 19120
+    },
+    {
+      "epoch": 0.9709230749704916,
+      "grad_norm": 0.02925397740944498,
+      "learning_rate": 0.0006116738587368665,
+      "loss": 0.5578,
+      "step": 19125
+    },
+    {
+      "epoch": 0.9711769110685231,
+      "grad_norm": 0.023290731955223867,
+      "learning_rate": 0.0006114579178203524,
+      "loss": 0.5429,
+      "step": 19130
+    },
+    {
+      "epoch": 0.9714307471665545,
+      "grad_norm": 0.03164983053501458,
+      "learning_rate": 0.000611241955025205,
+      "loss": 0.5405,
+      "step": 19135
+    },
+    {
+      "epoch": 0.9716845832645861,
+      "grad_norm": 0.037520719870187604,
+      "learning_rate": 0.0006110259703938165,
+      "loss": 0.5724,
+      "step": 19140
+    },
+    {
+      "epoch": 0.9719384193626176,
+      "grad_norm": 0.034312355122149724,
+      "learning_rate": 0.0006108099639685837,
+      "loss": 0.5474,
+      "step": 19145
+    },
+    {
+      "epoch": 0.972192255460649,
+      "grad_norm": 0.02562062543260294,
+      "learning_rate": 0.0006105939357919076,
+      "loss": 0.5461,
+      "step": 19150
+    },
+    {
+      "epoch": 0.9724460915586806,
+      "grad_norm": 0.02354699369047809,
+      "learning_rate": 0.0006103778859061935,
+      "loss": 0.545,
+      "step": 19155
+    },
+    {
+      "epoch": 0.9726999276567121,
+      "grad_norm": 0.021930383911286492,
+      "learning_rate": 0.0006101618143538508,
+      "loss": 0.5573,
+      "step": 19160
+    },
+    {
+      "epoch": 0.9729537637547435,
+      "grad_norm": 0.02583749368715139,
+      "learning_rate": 0.0006099457211772933,
+      "loss": 0.5544,
+      "step": 19165
+    },
+    {
+      "epoch": 0.973207599852775,
+      "grad_norm": 0.022556631905887248,
+      "learning_rate": 0.0006097296064189391,
+      "loss": 0.5559,
+      "step": 19170
+    },
+    {
+      "epoch": 0.9734614359508066,
+      "grad_norm": 0.03226579384353296,
+      "learning_rate": 0.0006095134701212102,
+      "loss": 0.5902,
+      "step": 19175
+    },
+    {
+      "epoch": 0.973715272048838,
+      "grad_norm": 0.023654661735220267,
+      "learning_rate": 0.0006092973123265334,
+      "loss": 0.5331,
+      "step": 19180
+    },
+    {
+      "epoch": 0.9739691081468695,
+      "grad_norm": 0.021164960310687785,
+      "learning_rate": 0.0006090811330773392,
+      "loss": 0.5349,
+      "step": 19185
+    },
+    {
+      "epoch": 0.9742229442449011,
+      "grad_norm": 0.026505686211264395,
+      "learning_rate": 0.0006088649324160626,
+      "loss": 0.4964,
+      "step": 19190
+    },
+    {
+      "epoch": 0.9744767803429326,
+      "grad_norm": 0.15964885508324672,
+      "learning_rate": 0.0006086487103851426,
+      "loss": 0.5849,
+      "step": 19195
+    },
+    {
+      "epoch": 0.974730616440964,
+      "grad_norm": 0.031195162048445326,
+      "learning_rate": 0.0006084324670270227,
+      "loss": 0.5735,
+      "step": 19200
+    },
+    {
+      "epoch": 0.9749844525389956,
+      "grad_norm": 0.024371095692914808,
+      "learning_rate": 0.0006082162023841502,
+      "loss": 0.5566,
+      "step": 19205
+    },
+    {
+      "epoch": 0.9752382886370271,
+      "grad_norm": 0.027331834493339167,
+      "learning_rate": 0.0006079999164989769,
+      "loss": 0.551,
+      "step": 19210
+    },
+    {
+      "epoch": 0.9754921247350585,
+      "grad_norm": 0.027798770270491248,
+      "learning_rate": 0.0006077836094139586,
+      "loss": 0.5576,
+      "step": 19215
+    },
+    {
+      "epoch": 0.9757459608330901,
+      "grad_norm": 0.025968021890026095,
+      "learning_rate": 0.0006075672811715553,
+      "loss": 0.5549,
+      "step": 19220
+    },
+    {
+      "epoch": 0.9759997969311216,
+      "grad_norm": 0.02489623198625103,
+      "learning_rate": 0.0006073509318142308,
+      "loss": 0.5526,
+      "step": 19225
+    },
+    {
+      "epoch": 0.976253633029153,
+      "grad_norm": 0.02608892933733737,
+      "learning_rate": 0.0006071345613844541,
+      "loss": 0.5696,
+      "step": 19230
+    },
+    {
+      "epoch": 0.9765074691271846,
+      "grad_norm": 0.023901773570461967,
+      "learning_rate": 0.0006069181699246973,
+      "loss": 0.5763,
+      "step": 19235
+    },
+    {
+      "epoch": 0.9767613052252161,
+      "grad_norm": 0.04188933752859617,
+      "learning_rate": 0.0006067017574774369,
+      "loss": 0.5685,
+      "step": 19240
+    },
+    {
+      "epoch": 0.9770151413232476,
+      "grad_norm": 0.04061459638720143,
+      "learning_rate": 0.0006064853240851536,
+      "loss": 0.5394,
+      "step": 19245
+    },
+    {
+      "epoch": 0.977268977421279,
+      "grad_norm": 0.037889258081175194,
+      "learning_rate": 0.0006062688697903322,
+      "loss": 0.5385,
+      "step": 19250
+    },
+    {
+      "epoch": 0.9775228135193106,
+      "grad_norm": 0.03360689695404566,
+      "learning_rate": 0.0006060523946354615,
+      "loss": 0.5318,
+      "step": 19255
+    },
+    {
+      "epoch": 0.9777766496173421,
+      "grad_norm": 0.03162668844088351,
+      "learning_rate": 0.0006058358986630347,
+      "loss": 0.5254,
+      "step": 19260
+    },
+    {
+      "epoch": 0.9780304857153735,
+      "grad_norm": 0.028559717544297737,
+      "learning_rate": 0.0006056193819155488,
+      "loss": 0.5786,
+      "step": 19265
+    },
+    {
+      "epoch": 0.9782843218134051,
+      "grad_norm": 0.02644336830634299,
+      "learning_rate": 0.0006054028444355051,
+      "loss": 0.5469,
+      "step": 19270
+    },
+    {
+      "epoch": 0.9785381579114366,
+      "grad_norm": 0.03308577275301962,
+      "learning_rate": 0.0006051862862654085,
+      "loss": 0.5332,
+      "step": 19275
+    },
+    {
+      "epoch": 0.978791994009468,
+      "grad_norm": 0.022000448921150856,
+      "learning_rate": 0.0006049697074477686,
+      "loss": 0.5501,
+      "step": 19280
+    },
+    {
+      "epoch": 0.9790458301074996,
+      "grad_norm": 0.0247774682959936,
+      "learning_rate": 0.0006047531080250985,
+      "loss": 0.5404,
+      "step": 19285
+    },
+    {
+      "epoch": 0.9792996662055311,
+      "grad_norm": 0.022885746957311163,
+      "learning_rate": 0.0006045364880399158,
+      "loss": 0.5675,
+      "step": 19290
+    },
+    {
+      "epoch": 0.9795535023035626,
+      "grad_norm": 0.024643646796613586,
+      "learning_rate": 0.0006043198475347418,
+      "loss": 0.5897,
+      "step": 19295
+    },
+    {
+      "epoch": 0.9798073384015941,
+      "grad_norm": 0.03123615100865316,
+      "learning_rate": 0.0006041031865521019,
+      "loss": 0.5386,
+      "step": 19300
+    },
+    {
+      "epoch": 0.9800611744996256,
+      "grad_norm": 0.023897535561443067,
+      "learning_rate": 0.0006038865051345257,
+      "loss": 0.5787,
+      "step": 19305
+    },
+    {
+      "epoch": 0.9803150105976571,
+      "grad_norm": 0.0238186011366476,
+      "learning_rate": 0.0006036698033245466,
+      "loss": 0.5415,
+      "step": 19310
+    },
+    {
+      "epoch": 0.9805688466956886,
+      "grad_norm": 0.024067063625727737,
+      "learning_rate": 0.000603453081164702,
+      "loss": 0.5225,
+      "step": 19315
+    },
+    {
+      "epoch": 0.9808226827937201,
+      "grad_norm": 0.02230127040642843,
+      "learning_rate": 0.0006032363386975337,
+      "loss": 0.5599,
+      "step": 19320
+    },
+    {
+      "epoch": 0.9810765188917516,
+      "grad_norm": 0.023928492267361032,
+      "learning_rate": 0.0006030195759655867,
+      "loss": 0.5454,
+      "step": 19325
+    },
+    {
+      "epoch": 0.981330354989783,
+      "grad_norm": 0.022074494679136896,
+      "learning_rate": 0.0006028027930114109,
+      "loss": 0.545,
+      "step": 19330
+    },
+    {
+      "epoch": 0.9815841910878146,
+      "grad_norm": 0.036818146764293826,
+      "learning_rate": 0.0006025859898775596,
+      "loss": 0.5431,
+      "step": 19335
+    },
+    {
+      "epoch": 0.9818380271858461,
+      "grad_norm": 0.021267108908214902,
+      "learning_rate": 0.0006023691666065899,
+      "loss": 0.5694,
+      "step": 19340
+    },
+    {
+      "epoch": 0.9820918632838777,
+      "grad_norm": 0.18195885280117177,
+      "learning_rate": 0.0006021523232410633,
+      "loss": 0.53,
+      "step": 19345
+    },
+    {
+      "epoch": 0.9823456993819091,
+      "grad_norm": 0.02933469861631167,
+      "learning_rate": 0.0006019354598235451,
+      "loss": 0.5383,
+      "step": 19350
+    },
+    {
+      "epoch": 0.9825995354799406,
+      "grad_norm": 0.02677555173548487,
+      "learning_rate": 0.0006017185763966044,
+      "loss": 0.5268,
+      "step": 19355
+    },
+    {
+      "epoch": 0.9828533715779721,
+      "grad_norm": 0.03238477638416001,
+      "learning_rate": 0.0006015016730028147,
+      "loss": 0.5301,
+      "step": 19360
+    },
+    {
+      "epoch": 0.9831072076760036,
+      "grad_norm": 0.026772761845334205,
+      "learning_rate": 0.0006012847496847525,
+      "loss": 0.5573,
+      "step": 19365
+    },
+    {
+      "epoch": 0.9833610437740351,
+      "grad_norm": 0.033966700490202034,
+      "learning_rate": 0.0006010678064849993,
+      "loss": 0.5473,
+      "step": 19370
+    },
+    {
+      "epoch": 0.9836148798720666,
+      "grad_norm": 0.023419696424668214,
+      "learning_rate": 0.0006008508434461394,
+      "loss": 0.5467,
+      "step": 19375
+    },
+    {
+      "epoch": 0.9838687159700981,
+      "grad_norm": 0.023940801308642033,
+      "learning_rate": 0.0006006338606107621,
+      "loss": 0.5717,
+      "step": 19380
+    },
+    {
+      "epoch": 0.9841225520681296,
+      "grad_norm": 0.025782454053669163,
+      "learning_rate": 0.0006004168580214598,
+      "loss": 0.52,
+      "step": 19385
+    },
+    {
+      "epoch": 0.9843763881661611,
+      "grad_norm": 0.0347230672413288,
+      "learning_rate": 0.000600199835720829,
+      "loss": 0.5136,
+      "step": 19390
+    },
+    {
+      "epoch": 0.9846302242641927,
+      "grad_norm": 0.03694921277593407,
+      "learning_rate": 0.0005999827937514701,
+      "loss": 0.557,
+      "step": 19395
+    },
+    {
+      "epoch": 0.9848840603622241,
+      "grad_norm": 0.042215904453139365,
+      "learning_rate": 0.0005997657321559875,
+      "loss": 0.5597,
+      "step": 19400
+    },
+    {
+      "epoch": 0.9851378964602556,
+      "grad_norm": 0.029186970807708833,
+      "learning_rate": 0.0005995486509769892,
+      "loss": 0.543,
+      "step": 19405
+    },
+    {
+      "epoch": 0.9853917325582872,
+      "grad_norm": 0.030825680890777987,
+      "learning_rate": 0.0005993315502570871,
+      "loss": 0.5283,
+      "step": 19410
+    },
+    {
+      "epoch": 0.9856455686563186,
+      "grad_norm": 0.03084685303258577,
+      "learning_rate": 0.000599114430038897,
+      "loss": 0.5378,
+      "step": 19415
+    },
+    {
+      "epoch": 0.9858994047543501,
+      "grad_norm": 0.03624341566980718,
+      "learning_rate": 0.0005988972903650388,
+      "loss": 0.5581,
+      "step": 19420
+    },
+    {
+      "epoch": 0.9861532408523817,
+      "grad_norm": 0.03238937387426097,
+      "learning_rate": 0.0005986801312781356,
+      "loss": 0.5855,
+      "step": 19425
+    },
+    {
+      "epoch": 0.9864070769504131,
+      "grad_norm": 0.029275031406698588,
+      "learning_rate": 0.0005984629528208147,
+      "loss": 0.5407,
+      "step": 19430
+    },
+    {
+      "epoch": 0.9866609130484446,
+      "grad_norm": 0.025894922300801874,
+      "learning_rate": 0.000598245755035707,
+      "loss": 0.5429,
+      "step": 19435
+    },
+    {
+      "epoch": 0.9869147491464761,
+      "grad_norm": 0.03331858196787023,
+      "learning_rate": 0.0005980285379654478,
+      "loss": 0.5862,
+      "step": 19440
+    },
+    {
+      "epoch": 0.9871685852445076,
+      "grad_norm": 0.02236486818578095,
+      "learning_rate": 0.0005978113016526753,
+      "loss": 0.5557,
+      "step": 19445
+    },
+    {
+      "epoch": 0.9874224213425391,
+      "grad_norm": 0.028767154063198953,
+      "learning_rate": 0.0005975940461400322,
+      "loss": 0.5677,
+      "step": 19450
+    },
+    {
+      "epoch": 0.9876762574405706,
+      "grad_norm": 0.02765546851820107,
+      "learning_rate": 0.0005973767714701646,
+      "loss": 0.5399,
+      "step": 19455
+    },
+    {
+      "epoch": 0.9879300935386022,
+      "grad_norm": 0.025484293553854787,
+      "learning_rate": 0.0005971594776857224,
+      "loss": 0.5593,
+      "step": 19460
+    },
+    {
+      "epoch": 0.9881839296366336,
+      "grad_norm": 0.031096244994143988,
+      "learning_rate": 0.000596942164829359,
+      "loss": 0.5332,
+      "step": 19465
+    },
+    {
+      "epoch": 0.9884377657346651,
+      "grad_norm": 0.044264139567922633,
+      "learning_rate": 0.0005967248329437322,
+      "loss": 0.5601,
+      "step": 19470
+    },
+    {
+      "epoch": 0.9886916018326967,
+      "grad_norm": 0.038014080072740736,
+      "learning_rate": 0.0005965074820715031,
+      "loss": 0.5252,
+      "step": 19475
+    },
+    {
+      "epoch": 0.9889454379307281,
+      "grad_norm": 0.02925907334151629,
+      "learning_rate": 0.0005962901122553366,
+      "loss": 0.5219,
+      "step": 19480
+    },
+    {
+      "epoch": 0.9891992740287596,
+      "grad_norm": 0.03246914140061126,
+      "learning_rate": 0.000596072723537901,
+      "loss": 0.5414,
+      "step": 19485
+    },
+    {
+      "epoch": 0.9894531101267912,
+      "grad_norm": 0.08010941576386262,
+      "learning_rate": 0.0005958553159618693,
+      "loss": 0.5208,
+      "step": 19490
+    },
+    {
+      "epoch": 0.9897069462248226,
+      "grad_norm": 0.0337186100394352,
+      "learning_rate": 0.0005956378895699169,
+      "loss": 0.5167,
+      "step": 19495
+    },
+    {
+      "epoch": 0.9899607823228541,
+      "grad_norm": 0.02503702827550537,
+      "learning_rate": 0.0005954204444047237,
+      "loss": 0.577,
+      "step": 19500
+    },
+    {
+      "epoch": 0.9902146184208857,
+      "grad_norm": 0.027330468479088564,
+      "learning_rate": 0.000595202980508973,
+      "loss": 0.5403,
+      "step": 19505
+    },
+    {
+      "epoch": 0.9904684545189172,
+      "grad_norm": 0.024841582567018985,
+      "learning_rate": 0.0005949854979253521,
+      "loss": 0.5139,
+      "step": 19510
+    },
+    {
+      "epoch": 0.9907222906169486,
+      "grad_norm": 0.02841062368445544,
+      "learning_rate": 0.0005947679966965517,
+      "loss": 0.5688,
+      "step": 19515
+    },
+    {
+      "epoch": 0.9909761267149801,
+      "grad_norm": 0.023717779713039483,
+      "learning_rate": 0.0005945504768652664,
+      "loss": 0.5672,
+      "step": 19520
+    },
+    {
+      "epoch": 0.9912299628130117,
+      "grad_norm": 0.02633602181222713,
+      "learning_rate": 0.0005943329384741937,
+      "loss": 0.5766,
+      "step": 19525
+    },
+    {
+      "epoch": 0.9914837989110431,
+      "grad_norm": 0.04290875603513479,
+      "learning_rate": 0.0005941153815660357,
+      "loss": 0.5472,
+      "step": 19530
+    },
+    {
+      "epoch": 0.9917376350090746,
+      "grad_norm": 0.024582207250698523,
+      "learning_rate": 0.0005938978061834977,
+      "loss": 0.5855,
+      "step": 19535
+    },
+    {
+      "epoch": 0.9919914711071062,
+      "grad_norm": 0.023721707310345872,
+      "learning_rate": 0.0005936802123692885,
+      "loss": 0.5748,
+      "step": 19540
+    },
+    {
+      "epoch": 0.9922453072051376,
+      "grad_norm": 0.026918040386621857,
+      "learning_rate": 0.0005934626001661209,
+      "loss": 0.5301,
+      "step": 19545
+    },
+    {
+      "epoch": 0.9924991433031691,
+      "grad_norm": 0.021708866914701308,
+      "learning_rate": 0.000593244969616711,
+      "loss": 0.5495,
+      "step": 19550
+    },
+    {
+      "epoch": 0.9927529794012007,
+      "grad_norm": 0.026495248356829797,
+      "learning_rate": 0.0005930273207637783,
+      "loss": 0.5222,
+      "step": 19555
+    },
+    {
+      "epoch": 0.9930068154992322,
+      "grad_norm": 0.02555187322005583,
+      "learning_rate": 0.0005928096536500467,
+      "loss": 0.54,
+      "step": 19560
+    },
+    {
+      "epoch": 0.9932606515972636,
+      "grad_norm": 0.02168086852198851,
+      "learning_rate": 0.0005925919683182429,
+      "loss": 0.5276,
+      "step": 19565
+    },
+    {
+      "epoch": 0.9935144876952952,
+      "grad_norm": 0.02499980089901408,
+      "learning_rate": 0.0005923742648110974,
+      "loss": 0.5073,
+      "step": 19570
+    },
+    {
+      "epoch": 0.9937683237933267,
+      "grad_norm": 0.02692412234182429,
+      "learning_rate": 0.0005921565431713445,
+      "loss": 0.5752,
+      "step": 19575
+    },
+    {
+      "epoch": 0.9940221598913581,
+      "grad_norm": 0.024546898354117743,
+      "learning_rate": 0.0005919388034417218,
+      "loss": 0.5328,
+      "step": 19580
+    },
+    {
+      "epoch": 0.9942759959893896,
+      "grad_norm": 0.024371485803514188,
+      "learning_rate": 0.0005917210456649703,
+      "loss": 0.5451,
+      "step": 19585
+    },
+    {
+      "epoch": 0.9945298320874212,
+      "grad_norm": 0.021869432790112914,
+      "learning_rate": 0.0005915032698838351,
+      "loss": 0.5394,
+      "step": 19590
+    },
+    {
+      "epoch": 0.9947836681854526,
+      "grad_norm": 0.03548043935373576,
+      "learning_rate": 0.0005912854761410642,
+      "loss": 0.5672,
+      "step": 19595
+    },
+    {
+      "epoch": 0.9950375042834841,
+      "grad_norm": 0.03662482896138081,
+      "learning_rate": 0.0005910676644794098,
+      "loss": 0.5641,
+      "step": 19600
+    },
+    {
+      "epoch": 0.9952913403815157,
+      "grad_norm": 0.0378468309188474,
+      "learning_rate": 0.0005908498349416269,
+      "loss": 0.5613,
+      "step": 19605
+    },
+    {
+      "epoch": 0.9955451764795472,
+      "grad_norm": 0.03627088854294031,
+      "learning_rate": 0.0005906319875704744,
+      "loss": 0.5532,
+      "step": 19610
+    },
+    {
+      "epoch": 0.9957990125775786,
+      "grad_norm": 0.03649215551829472,
+      "learning_rate": 0.0005904141224087147,
+      "loss": 0.4949,
+      "step": 19615
+    },
+    {
+      "epoch": 0.9960528486756102,
+      "grad_norm": 0.03763331065484329,
+      "learning_rate": 0.0005901962394991139,
+      "loss": 0.5862,
+      "step": 19620
+    },
+    {
+      "epoch": 0.9963066847736417,
+      "grad_norm": 0.0383387603823414,
+      "learning_rate": 0.0005899783388844408,
+      "loss": 0.5556,
+      "step": 19625
+    },
+    {
+      "epoch": 0.9965605208716731,
+      "grad_norm": 0.023484212925512286,
+      "learning_rate": 0.0005897604206074687,
+      "loss": 0.5708,
+      "step": 19630
+    },
+    {
+      "epoch": 0.9968143569697047,
+      "grad_norm": 0.03442449480611108,
+      "learning_rate": 0.0005895424847109736,
+      "loss": 0.5424,
+      "step": 19635
+    },
+    {
+      "epoch": 0.9970681930677362,
+      "grad_norm": 0.025468803257999176,
+      "learning_rate": 0.0005893245312377353,
+      "loss": 0.5184,
+      "step": 19640
+    },
+    {
+      "epoch": 0.9973220291657676,
+      "grad_norm": 0.023711176900071695,
+      "learning_rate": 0.0005891065602305369,
+      "loss": 0.5628,
+      "step": 19645
+    },
+    {
+      "epoch": 0.9975758652637992,
+      "grad_norm": 0.02757286076818459,
+      "learning_rate": 0.0005888885717321653,
+      "loss": 0.5549,
+      "step": 19650
+    },
+    {
+      "epoch": 0.9978297013618307,
+      "grad_norm": 0.022946177140901372,
+      "learning_rate": 0.0005886705657854101,
+      "loss": 0.531,
+      "step": 19655
+    },
+    {
+      "epoch": 0.9980835374598621,
+      "grad_norm": 0.03239722642028069,
+      "learning_rate": 0.0005884525424330652,
+      "loss": 0.547,
+      "step": 19660
+    },
+    {
+      "epoch": 0.9983373735578936,
+      "grad_norm": 0.02284766198581876,
+      "learning_rate": 0.0005882345017179274,
+      "loss": 0.5415,
+      "step": 19665
+    },
+    {
+      "epoch": 0.9985912096559252,
+      "grad_norm": 0.024423645649003435,
+      "learning_rate": 0.0005880164436827968,
+      "loss": 0.5319,
+      "step": 19670
+    },
+    {
+      "epoch": 0.9988450457539567,
+      "grad_norm": 0.023330277892660952,
+      "learning_rate": 0.0005877983683704772,
+      "loss": 0.5493,
+      "step": 19675
+    },
+    {
+      "epoch": 0.9990988818519881,
+      "grad_norm": 0.023557950454847,
+      "learning_rate": 0.0005875802758237758,
+      "loss": 0.5102,
+      "step": 19680
+    },
+    {
+      "epoch": 0.9993527179500197,
+      "grad_norm": 0.023129582403367616,
+      "learning_rate": 0.0005873621660855031,
+      "loss": 0.5691,
+      "step": 19685
+    },
+    {
+      "epoch": 0.9996065540480512,
+      "grad_norm": 0.03468112252443195,
+      "learning_rate": 0.0005871440391984729,
+      "loss": 0.5328,
+      "step": 19690
+    },
+    {
+      "epoch": 0.9998603901460826,
+      "grad_norm": 0.027312657946593693,
+      "learning_rate": 0.0005869258952055023,
+      "loss": 0.5544,
+      "step": 19695
+    },
+    {
+      "epoch": 1.0001269180490158,
+      "grad_norm": 0.026262955674306445,
+      "learning_rate": 0.000586707734149412,
+      "loss": 0.5426,
+      "step": 19700
+    },
+    {
+      "epoch": 1.0003807541470473,
+      "grad_norm": 0.022474094556517613,
+      "learning_rate": 0.0005864895560730257,
+      "loss": 0.4976,
+      "step": 19705
+    },
+    {
+      "epoch": 1.0006345902450788,
+      "grad_norm": 0.030681716691258087,
+      "learning_rate": 0.000586271361019171,
+      "loss": 0.484,
+      "step": 19710
+    },
+    {
+      "epoch": 1.0008884263431101,
+      "grad_norm": 0.06523945944943467,
+      "learning_rate": 0.0005860531490306784,
+      "loss": 0.5035,
+      "step": 19715
+    },
+    {
+      "epoch": 1.0011422624411417,
+      "grad_norm": 0.02136637932462246,
+      "learning_rate": 0.0005858349201503819,
+      "loss": 0.4933,
+      "step": 19720
+    },
+    {
+      "epoch": 1.0013960985391732,
+      "grad_norm": 0.023171987689390824,
+      "learning_rate": 0.0005856166744211185,
+      "loss": 0.5241,
+      "step": 19725
+    },
+    {
+      "epoch": 1.0016499346372048,
+      "grad_norm": 0.024915720736357203,
+      "learning_rate": 0.000585398411885729,
+      "loss": 0.5251,
+      "step": 19730
+    },
+    {
+      "epoch": 1.0019037707352363,
+      "grad_norm": 0.026668874934590553,
+      "learning_rate": 0.0005851801325870569,
+      "loss": 0.5185,
+      "step": 19735
+    },
+    {
+      "epoch": 1.0021576068332678,
+      "grad_norm": 0.03835627675960685,
+      "learning_rate": 0.0005849618365679497,
+      "loss": 0.5184,
+      "step": 19740
+    },
+    {
+      "epoch": 1.0024114429312994,
+      "grad_norm": 0.034939337784261996,
+      "learning_rate": 0.0005847435238712578,
+      "loss": 0.492,
+      "step": 19745
+    },
+    {
+      "epoch": 1.0026652790293307,
+      "grad_norm": 0.02797155118461246,
+      "learning_rate": 0.0005845251945398347,
+      "loss": 0.5061,
+      "step": 19750
+    },
+    {
+      "epoch": 1.0029191151273622,
+      "grad_norm": 0.02625721377041572,
+      "learning_rate": 0.0005843068486165374,
+      "loss": 0.5162,
+      "step": 19755
+    },
+    {
+      "epoch": 1.0031729512253937,
+      "grad_norm": 0.033958806367861846,
+      "learning_rate": 0.0005840884861442262,
+      "loss": 0.5117,
+      "step": 19760
+    },
+    {
+      "epoch": 1.0034267873234253,
+      "grad_norm": 0.03202782443206157,
+      "learning_rate": 0.0005838701071657643,
+      "loss": 0.5007,
+      "step": 19765
+    },
+    {
+      "epoch": 1.0036806234214568,
+      "grad_norm": 0.05036650421537131,
+      "learning_rate": 0.0005836517117240188,
+      "loss": 0.5151,
+      "step": 19770
+    },
+    {
+      "epoch": 1.0039344595194883,
+      "grad_norm": 0.04653612155409561,
+      "learning_rate": 0.0005834332998618596,
+      "loss": 0.4854,
+      "step": 19775
+    },
+    {
+      "epoch": 1.0041882956175199,
+      "grad_norm": 0.02804671100615471,
+      "learning_rate": 0.0005832148716221595,
+      "loss": 0.4987,
+      "step": 19780
+    },
+    {
+      "epoch": 1.0044421317155512,
+      "grad_norm": 0.022529443828552497,
+      "learning_rate": 0.0005829964270477953,
+      "loss": 0.5166,
+      "step": 19785
+    },
+    {
+      "epoch": 1.0046959678135827,
+      "grad_norm": 0.026684379922705167,
+      "learning_rate": 0.0005827779661816461,
+      "loss": 0.4885,
+      "step": 19790
+    },
+    {
+      "epoch": 1.0049498039116143,
+      "grad_norm": 0.060963234819393114,
+      "learning_rate": 0.000582559489066595,
+      "loss": 0.4791,
+      "step": 19795
+    },
+    {
+      "epoch": 1.0052036400096458,
+      "grad_norm": 0.03024468169105826,
+      "learning_rate": 0.0005823409957455281,
+      "loss": 0.5277,
+      "step": 19800
+    },
+    {
+      "epoch": 1.0054574761076773,
+      "grad_norm": 0.026438898745094152,
+      "learning_rate": 0.0005821224862613343,
+      "loss": 0.5012,
+      "step": 19805
+    },
+    {
+      "epoch": 1.0057113122057089,
+      "grad_norm": 0.03800242164729915,
+      "learning_rate": 0.000581903960656906,
+      "loss": 0.5191,
+      "step": 19810
+    },
+    {
+      "epoch": 1.0059651483037402,
+      "grad_norm": 0.02349650729472903,
+      "learning_rate": 0.0005816854189751386,
+      "loss": 0.5025,
+      "step": 19815
+    },
+    {
+      "epoch": 1.0062189844017717,
+      "grad_norm": 0.027214316740470396,
+      "learning_rate": 0.0005814668612589309,
+      "loss": 0.5251,
+      "step": 19820
+    },
+    {
+      "epoch": 1.0064728204998032,
+      "grad_norm": 0.025006007655423122,
+      "learning_rate": 0.0005812482875511845,
+      "loss": 0.5188,
+      "step": 19825
+    },
+    {
+      "epoch": 1.0067266565978348,
+      "grad_norm": 0.023683757750602448,
+      "learning_rate": 0.0005810296978948045,
+      "loss": 0.5285,
+      "step": 19830
+    },
+    {
+      "epoch": 1.0069804926958663,
+      "grad_norm": 0.02813093510207219,
+      "learning_rate": 0.0005808110923326989,
+      "loss": 0.5355,
+      "step": 19835
+    },
+    {
+      "epoch": 1.0072343287938978,
+      "grad_norm": 0.036465732354810876,
+      "learning_rate": 0.000580592470907779,
+      "loss": 0.5106,
+      "step": 19840
+    },
+    {
+      "epoch": 1.0074881648919294,
+      "grad_norm": 0.027979504250805996,
+      "learning_rate": 0.0005803738336629588,
+      "loss": 0.5217,
+      "step": 19845
+    },
+    {
+      "epoch": 1.0077420009899607,
+      "grad_norm": 0.02444979840392901,
+      "learning_rate": 0.0005801551806411561,
+      "loss": 0.5038,
+      "step": 19850
+    },
+    {
+      "epoch": 1.0079958370879922,
+      "grad_norm": 0.02488255109410723,
+      "learning_rate": 0.000579936511885291,
+      "loss": 0.5093,
+      "step": 19855
+    },
+    {
+      "epoch": 1.0082496731860238,
+      "grad_norm": 0.02550049459565845,
+      "learning_rate": 0.0005797178274382873,
+      "loss": 0.5269,
+      "step": 19860
+    },
+    {
+      "epoch": 1.0085035092840553,
+      "grad_norm": 0.02148889517361041,
+      "learning_rate": 0.0005794991273430716,
+      "loss": 0.5236,
+      "step": 19865
+    },
+    {
+      "epoch": 1.0087573453820868,
+      "grad_norm": 0.029497009648340886,
+      "learning_rate": 0.0005792804116425736,
+      "loss": 0.5042,
+      "step": 19870
+    },
+    {
+      "epoch": 1.0090111814801184,
+      "grad_norm": 0.02775516992605051,
+      "learning_rate": 0.0005790616803797263,
+      "loss": 0.5132,
+      "step": 19875
+    },
+    {
+      "epoch": 1.0092650175781497,
+      "grad_norm": 0.021211655705574328,
+      "learning_rate": 0.0005788429335974653,
+      "loss": 0.4801,
+      "step": 19880
+    },
+    {
+      "epoch": 1.0095188536761812,
+      "grad_norm": 0.02517030398746431,
+      "learning_rate": 0.0005786241713387297,
+      "loss": 0.5146,
+      "step": 19885
+    },
+    {
+      "epoch": 1.0097726897742128,
+      "grad_norm": 0.09462270152737755,
+      "learning_rate": 0.0005784053936464613,
+      "loss": 0.5127,
+      "step": 19890
+    },
+    {
+      "epoch": 1.0100265258722443,
+      "grad_norm": 0.026408205454532502,
+      "learning_rate": 0.0005781866005636052,
+      "loss": 0.5132,
+      "step": 19895
+    },
+    {
+      "epoch": 1.0102803619702758,
+      "grad_norm": 0.048718416202262686,
+      "learning_rate": 0.0005779677921331093,
+      "loss": 0.5499,
+      "step": 19900
+    },
+    {
+      "epoch": 1.0105341980683074,
+      "grad_norm": 0.05158533517321516,
+      "learning_rate": 0.0005777489683979247,
+      "loss": 0.4807,
+      "step": 19905
+    },
+    {
+      "epoch": 1.010788034166339,
+      "grad_norm": 0.042414622078804524,
+      "learning_rate": 0.0005775301294010052,
+      "loss": 0.5528,
+      "step": 19910
+    },
+    {
+      "epoch": 1.0110418702643702,
+      "grad_norm": 0.04213150411449631,
+      "learning_rate": 0.000577311275185308,
+      "loss": 0.5266,
+      "step": 19915
+    },
+    {
+      "epoch": 1.0112957063624017,
+      "grad_norm": 0.03233858503499294,
+      "learning_rate": 0.000577092405793793,
+      "loss": 0.4932,
+      "step": 19920
+    },
+    {
+      "epoch": 1.0115495424604333,
+      "grad_norm": 0.039425677298710396,
+      "learning_rate": 0.0005768735212694232,
+      "loss": 0.5242,
+      "step": 19925
+    },
+    {
+      "epoch": 1.0118033785584648,
+      "grad_norm": 0.04737514302447274,
+      "learning_rate": 0.0005766546216551646,
+      "loss": 0.4966,
+      "step": 19930
+    },
+    {
+      "epoch": 1.0120572146564963,
+      "grad_norm": 0.03637605851463593,
+      "learning_rate": 0.0005764357069939861,
+      "loss": 0.4907,
+      "step": 19935
+    },
+    {
+      "epoch": 1.0123110507545279,
+      "grad_norm": 0.03694503903527369,
+      "learning_rate": 0.0005762167773288594,
+      "loss": 0.5324,
+      "step": 19940
+    },
+    {
+      "epoch": 1.0125648868525594,
+      "grad_norm": 0.02639160420478131,
+      "learning_rate": 0.0005759978327027594,
+      "loss": 0.5153,
+      "step": 19945
+    },
+    {
+      "epoch": 1.0128187229505907,
+      "grad_norm": 0.028504997239272276,
+      "learning_rate": 0.000575778873158664,
+      "loss": 0.5113,
+      "step": 19950
+    },
+    {
+      "epoch": 1.0130725590486223,
+      "grad_norm": 0.024034023043991923,
+      "learning_rate": 0.0005755598987395535,
+      "loss": 0.5544,
+      "step": 19955
+    },
+    {
+      "epoch": 1.0133263951466538,
+      "grad_norm": 0.024483302437720976,
+      "learning_rate": 0.0005753409094884118,
+      "loss": 0.512,
+      "step": 19960
+    },
+    {
+      "epoch": 1.0135802312446853,
+      "grad_norm": 0.03202531364702948,
+      "learning_rate": 0.0005751219054482252,
+      "loss": 0.5313,
+      "step": 19965
+    },
+    {
+      "epoch": 1.0138340673427169,
+      "grad_norm": 0.04629752856614847,
+      "learning_rate": 0.0005749028866619833,
+      "loss": 0.5335,
+      "step": 19970
+    },
+    {
+      "epoch": 1.0140879034407484,
+      "grad_norm": 0.03603069481589699,
+      "learning_rate": 0.0005746838531726783,
+      "loss": 0.4915,
+      "step": 19975
+    },
+    {
+      "epoch": 1.0143417395387797,
+      "grad_norm": 0.036047586573516865,
+      "learning_rate": 0.0005744648050233053,
+      "loss": 0.5061,
+      "step": 19980
+    },
+    {
+      "epoch": 1.0145955756368112,
+      "grad_norm": 0.02500217530038146,
+      "learning_rate": 0.0005742457422568626,
+      "loss": 0.5021,
+      "step": 19985
+    },
+    {
+      "epoch": 1.0148494117348428,
+      "grad_norm": 0.030208741553390497,
+      "learning_rate": 0.0005740266649163507,
+      "loss": 0.5167,
+      "step": 19990
+    },
+    {
+      "epoch": 1.0151032478328743,
+      "grad_norm": 0.027545548141998097,
+      "learning_rate": 0.0005738075730447738,
+      "loss": 0.523,
+      "step": 19995
+    },
+    {
+      "epoch": 1.0153570839309058,
+      "grad_norm": 0.08854306261500218,
+      "learning_rate": 0.0005735884666851383,
+      "loss": 0.5208,
+      "step": 20000
+    },
+    {
+      "epoch": 1.0156109200289374,
+      "grad_norm": 0.030820917059042843,
+      "learning_rate": 0.0005733693458804537,
+      "loss": 0.5101,
+      "step": 20005
+    },
+    {
+      "epoch": 1.015864756126969,
+      "grad_norm": 0.034348864701267225,
+      "learning_rate": 0.0005731502106737326,
+      "loss": 0.5029,
+      "step": 20010
+    },
+    {
+      "epoch": 1.0161185922250002,
+      "grad_norm": 0.05872350374420824,
+      "learning_rate": 0.0005729310611079899,
+      "loss": 0.522,
+      "step": 20015
+    },
+    {
+      "epoch": 1.0163724283230318,
+      "grad_norm": 0.156969240858159,
+      "learning_rate": 0.0005727118972262437,
+      "loss": 0.5268,
+      "step": 20020
+    },
+    {
+      "epoch": 1.0166262644210633,
+      "grad_norm": 0.028641596405852787,
+      "learning_rate": 0.0005724927190715144,
+      "loss": 0.5322,
+      "step": 20025
+    },
+    {
+      "epoch": 1.0168801005190948,
+      "grad_norm": 0.03521162865341286,
+      "learning_rate": 0.0005722735266868261,
+      "loss": 0.4916,
+      "step": 20030
+    },
+    {
+      "epoch": 1.0171339366171264,
+      "grad_norm": 0.10045881918602356,
+      "learning_rate": 0.0005720543201152048,
+      "loss": 0.5197,
+      "step": 20035
+    },
+    {
+      "epoch": 1.017387772715158,
+      "grad_norm": 0.029867425574080526,
+      "learning_rate": 0.0005718350993996798,
+      "loss": 0.517,
+      "step": 20040
+    },
+    {
+      "epoch": 1.0176416088131894,
+      "grad_norm": 0.025642792051341005,
+      "learning_rate": 0.0005716158645832831,
+      "loss": 0.5213,
+      "step": 20045
+    },
+    {
+      "epoch": 1.0178954449112207,
+      "grad_norm": 0.02164896016678895,
+      "learning_rate": 0.0005713966157090493,
+      "loss": 0.5081,
+      "step": 20050
+    },
+    {
+      "epoch": 1.0181492810092523,
+      "grad_norm": 0.029307046081771176,
+      "learning_rate": 0.000571177352820016,
+      "loss": 0.56,
+      "step": 20055
+    },
+    {
+      "epoch": 1.0184031171072838,
+      "grad_norm": 0.03348711512912175,
+      "learning_rate": 0.0005709580759592232,
+      "loss": 0.4986,
+      "step": 20060
+    },
+    {
+      "epoch": 1.0186569532053154,
+      "grad_norm": 0.022750518622437432,
+      "learning_rate": 0.000570738785169714,
+      "loss": 0.5318,
+      "step": 20065
+    },
+    {
+      "epoch": 1.0189107893033469,
+      "grad_norm": 0.03273089769737566,
+      "learning_rate": 0.0005705194804945339,
+      "loss": 0.5205,
+      "step": 20070
+    },
+    {
+      "epoch": 1.0191646254013784,
+      "grad_norm": 0.03101997693290262,
+      "learning_rate": 0.0005703001619767317,
+      "loss": 0.5585,
+      "step": 20075
+    },
+    {
+      "epoch": 1.0194184614994097,
+      "grad_norm": 0.48293463443735013,
+      "learning_rate": 0.0005700808296593581,
+      "loss": 0.5232,
+      "step": 20080
+    },
+    {
+      "epoch": 1.0196722975974413,
+      "grad_norm": 0.026869468991904833,
+      "learning_rate": 0.0005698614835854672,
+      "loss": 0.5329,
+      "step": 20085
+    },
+    {
+      "epoch": 1.0199261336954728,
+      "grad_norm": 0.03175236206705197,
+      "learning_rate": 0.0005696421237981155,
+      "loss": 0.5293,
+      "step": 20090
+    },
+    {
+      "epoch": 1.0201799697935043,
+      "grad_norm": 0.02839497971854906,
+      "learning_rate": 0.0005694227503403623,
+      "loss": 0.5012,
+      "step": 20095
+    },
+    {
+      "epoch": 1.0204338058915359,
+      "grad_norm": 0.03500525520582677,
+      "learning_rate": 0.0005692033632552691,
+      "loss": 0.5219,
+      "step": 20100
+    },
+    {
+      "epoch": 1.0206876419895674,
+      "grad_norm": 0.034416581373221365,
+      "learning_rate": 0.000568983962585901,
+      "loss": 0.4977,
+      "step": 20105
+    },
+    {
+      "epoch": 1.020941478087599,
+      "grad_norm": 0.025912971110602936,
+      "learning_rate": 0.0005687645483753252,
+      "loss": 0.5208,
+      "step": 20110
+    },
+    {
+      "epoch": 1.0211953141856303,
+      "grad_norm": 0.033142018106894794,
+      "learning_rate": 0.0005685451206666113,
+      "loss": 0.4926,
+      "step": 20115
+    },
+    {
+      "epoch": 1.0214491502836618,
+      "grad_norm": 0.03866721854730293,
+      "learning_rate": 0.0005683256795028321,
+      "loss": 0.4984,
+      "step": 20120
+    },
+    {
+      "epoch": 1.0217029863816933,
+      "grad_norm": 0.027988000692318403,
+      "learning_rate": 0.0005681062249270627,
+      "loss": 0.4899,
+      "step": 20125
+    },
+    {
+      "epoch": 1.0219568224797249,
+      "grad_norm": 0.022355333164415802,
+      "learning_rate": 0.000567886756982381,
+      "loss": 0.5059,
+      "step": 20130
+    },
+    {
+      "epoch": 1.0222106585777564,
+      "grad_norm": 0.0440650062378424,
+      "learning_rate": 0.0005676672757118675,
+      "loss": 0.5015,
+      "step": 20135
+    },
+    {
+      "epoch": 1.022464494675788,
+      "grad_norm": 0.023375514087650193,
+      "learning_rate": 0.0005674477811586053,
+      "loss": 0.4984,
+      "step": 20140
+    },
+    {
+      "epoch": 1.0227183307738192,
+      "grad_norm": 0.03157563540299216,
+      "learning_rate": 0.0005672282733656799,
+      "loss": 0.5046,
+      "step": 20145
+    },
+    {
+      "epoch": 1.0229721668718508,
+      "grad_norm": 0.02116231894172818,
+      "learning_rate": 0.0005670087523761797,
+      "loss": 0.5165,
+      "step": 20150
+    },
+    {
+      "epoch": 1.0232260029698823,
+      "grad_norm": 0.02699273830998635,
+      "learning_rate": 0.0005667892182331958,
+      "loss": 0.5343,
+      "step": 20155
+    },
+    {
+      "epoch": 1.0234798390679138,
+      "grad_norm": 0.025520103162852272,
+      "learning_rate": 0.0005665696709798211,
+      "loss": 0.5015,
+      "step": 20160
+    },
+    {
+      "epoch": 1.0237336751659454,
+      "grad_norm": 0.02320429503242182,
+      "learning_rate": 0.0005663501106591522,
+      "loss": 0.5069,
+      "step": 20165
+    },
+    {
+      "epoch": 1.023987511263977,
+      "grad_norm": 0.03088412096065196,
+      "learning_rate": 0.0005661305373142874,
+      "loss": 0.5269,
+      "step": 20170
+    },
+    {
+      "epoch": 1.0242413473620084,
+      "grad_norm": 0.028956706789680344,
+      "learning_rate": 0.0005659109509883279,
+      "loss": 0.5156,
+      "step": 20175
+    },
+    {
+      "epoch": 1.0244951834600398,
+      "grad_norm": 0.028907980726983182,
+      "learning_rate": 0.0005656913517243775,
+      "loss": 0.5126,
+      "step": 20180
+    },
+    {
+      "epoch": 1.0247490195580713,
+      "grad_norm": 0.031564123214603855,
+      "learning_rate": 0.0005654717395655423,
+      "loss": 0.5212,
+      "step": 20185
+    },
+    {
+      "epoch": 1.0250028556561028,
+      "grad_norm": 0.023659437611885405,
+      "learning_rate": 0.0005652521145549312,
+      "loss": 0.5182,
+      "step": 20190
+    },
+    {
+      "epoch": 1.0252566917541344,
+      "grad_norm": 0.024467789569289182,
+      "learning_rate": 0.0005650324767356553,
+      "loss": 0.5317,
+      "step": 20195
+    },
+    {
+      "epoch": 1.025510527852166,
+      "grad_norm": 0.0259776185535513,
+      "learning_rate": 0.0005648128261508287,
+      "loss": 0.5111,
+      "step": 20200
+    },
+    {
+      "epoch": 1.0257643639501974,
+      "grad_norm": 0.037490439119855785,
+      "learning_rate": 0.0005645931628435674,
+      "loss": 0.4861,
+      "step": 20205
+    },
+    {
+      "epoch": 1.026018200048229,
+      "grad_norm": 0.033783262227514106,
+      "learning_rate": 0.0005643734868569904,
+      "loss": 0.5077,
+      "step": 20210
+    },
+    {
+      "epoch": 1.0262720361462603,
+      "grad_norm": 0.02114752691351358,
+      "learning_rate": 0.0005641537982342189,
+      "loss": 0.5203,
+      "step": 20215
+    },
+    {
+      "epoch": 1.0265258722442918,
+      "grad_norm": 0.0438818125159392,
+      "learning_rate": 0.0005639340970183767,
+      "loss": 0.5161,
+      "step": 20220
+    },
+    {
+      "epoch": 1.0267797083423233,
+      "grad_norm": 0.025324346639667195,
+      "learning_rate": 0.0005637143832525902,
+      "loss": 0.5301,
+      "step": 20225
+    },
+    {
+      "epoch": 1.0270335444403549,
+      "grad_norm": 0.02505419634462526,
+      "learning_rate": 0.000563494656979988,
+      "loss": 0.5142,
+      "step": 20230
+    },
+    {
+      "epoch": 1.0272873805383864,
+      "grad_norm": 0.0260932413609203,
+      "learning_rate": 0.0005632749182437013,
+      "loss": 0.476,
+      "step": 20235
+    },
+    {
+      "epoch": 1.027541216636418,
+      "grad_norm": 0.02657813325380469,
+      "learning_rate": 0.0005630551670868638,
+      "loss": 0.5121,
+      "step": 20240
+    },
+    {
+      "epoch": 1.0277950527344493,
+      "grad_norm": 0.02517945560861088,
+      "learning_rate": 0.0005628354035526113,
+      "loss": 0.5014,
+      "step": 20245
+    },
+    {
+      "epoch": 1.0280488888324808,
+      "grad_norm": 0.02457715419344653,
+      "learning_rate": 0.0005626156276840824,
+      "loss": 0.5445,
+      "step": 20250
+    },
+    {
+      "epoch": 1.0283027249305123,
+      "grad_norm": 0.023965938720396072,
+      "learning_rate": 0.0005623958395244182,
+      "loss": 0.5173,
+      "step": 20255
+    },
+    {
+      "epoch": 1.0285565610285439,
+      "grad_norm": 0.025808791859439455,
+      "learning_rate": 0.0005621760391167618,
+      "loss": 0.512,
+      "step": 20260
+    },
+    {
+      "epoch": 1.0288103971265754,
+      "grad_norm": 0.02616645418239067,
+      "learning_rate": 0.0005619562265042589,
+      "loss": 0.5024,
+      "step": 20265
+    },
+    {
+      "epoch": 1.029064233224607,
+      "grad_norm": 0.026011495515384515,
+      "learning_rate": 0.0005617364017300579,
+      "loss": 0.5688,
+      "step": 20270
+    },
+    {
+      "epoch": 1.0293180693226385,
+      "grad_norm": 0.023645680045192928,
+      "learning_rate": 0.0005615165648373091,
+      "loss": 0.5107,
+      "step": 20275
+    },
+    {
+      "epoch": 1.0295719054206698,
+      "grad_norm": 0.030625762897844983,
+      "learning_rate": 0.0005612967158691652,
+      "loss": 0.5425,
+      "step": 20280
+    },
+    {
+      "epoch": 1.0298257415187013,
+      "grad_norm": 0.02374291305295429,
+      "learning_rate": 0.0005610768548687818,
+      "loss": 0.4875,
+      "step": 20285
+    },
+    {
+      "epoch": 1.0300795776167329,
+      "grad_norm": 0.030056754991982286,
+      "learning_rate": 0.0005608569818793163,
+      "loss": 0.5138,
+      "step": 20290
+    },
+    {
+      "epoch": 1.0303334137147644,
+      "grad_norm": 0.021415600184617786,
+      "learning_rate": 0.0005606370969439288,
+      "loss": 0.5262,
+      "step": 20295
+    },
+    {
+      "epoch": 1.030587249812796,
+      "grad_norm": 0.035674997734029885,
+      "learning_rate": 0.0005604172001057817,
+      "loss": 0.4913,
+      "step": 20300
+    },
+    {
+      "epoch": 1.0308410859108275,
+      "grad_norm": 0.02322983861959061,
+      "learning_rate": 0.0005601972914080394,
+      "loss": 0.5258,
+      "step": 20305
+    },
+    {
+      "epoch": 1.0310949220088588,
+      "grad_norm": 0.02417219163699135,
+      "learning_rate": 0.000559977370893869,
+      "loss": 0.523,
+      "step": 20310
+    },
+    {
+      "epoch": 1.0313487581068903,
+      "grad_norm": 0.028159975076131768,
+      "learning_rate": 0.0005597574386064398,
+      "loss": 0.4937,
+      "step": 20315
+    },
+    {
+      "epoch": 1.0316025942049218,
+      "grad_norm": 0.025480520142062726,
+      "learning_rate": 0.0005595374945889235,
+      "loss": 0.5297,
+      "step": 20320
+    },
+    {
+      "epoch": 1.0318564303029534,
+      "grad_norm": 0.02648031129017469,
+      "learning_rate": 0.0005593175388844939,
+      "loss": 0.5191,
+      "step": 20325
+    },
+    {
+      "epoch": 1.032110266400985,
+      "grad_norm": 0.027996109985128536,
+      "learning_rate": 0.0005590975715363271,
+      "loss": 0.5182,
+      "step": 20330
+    },
+    {
+      "epoch": 1.0323641024990164,
+      "grad_norm": 0.02208647698714924,
+      "learning_rate": 0.0005588775925876019,
+      "loss": 0.4733,
+      "step": 20335
+    },
+    {
+      "epoch": 1.032617938597048,
+      "grad_norm": 0.03617341314263012,
+      "learning_rate": 0.0005586576020814986,
+      "loss": 0.5569,
+      "step": 20340
+    },
+    {
+      "epoch": 1.0328717746950793,
+      "grad_norm": 0.02362640361558171,
+      "learning_rate": 0.0005584376000612008,
+      "loss": 0.5051,
+      "step": 20345
+    },
+    {
+      "epoch": 1.0331256107931108,
+      "grad_norm": 0.024439837400368546,
+      "learning_rate": 0.0005582175865698935,
+      "loss": 0.5007,
+      "step": 20350
+    },
+    {
+      "epoch": 1.0333794468911424,
+      "grad_norm": 0.02551816319500186,
+      "learning_rate": 0.0005579975616507642,
+      "loss": 0.5351,
+      "step": 20355
+    },
+    {
+      "epoch": 1.033633282989174,
+      "grad_norm": 0.03547530832085492,
+      "learning_rate": 0.0005577775253470028,
+      "loss": 0.5283,
+      "step": 20360
+    },
+    {
+      "epoch": 1.0338871190872054,
+      "grad_norm": 0.024034616664719018,
+      "learning_rate": 0.0005575574777018014,
+      "loss": 0.5191,
+      "step": 20365
+    },
+    {
+      "epoch": 1.034140955185237,
+      "grad_norm": 0.023421751953444182,
+      "learning_rate": 0.000557337418758354,
+      "loss": 0.5522,
+      "step": 20370
+    },
+    {
+      "epoch": 1.0343947912832685,
+      "grad_norm": 0.02493069566308207,
+      "learning_rate": 0.0005571173485598575,
+      "loss": 0.5506,
+      "step": 20375
+    },
+    {
+      "epoch": 1.0346486273812998,
+      "grad_norm": 0.02674149513349264,
+      "learning_rate": 0.0005568972671495102,
+      "loss": 0.5057,
+      "step": 20380
+    },
+    {
+      "epoch": 1.0349024634793313,
+      "grad_norm": 0.025106714581499427,
+      "learning_rate": 0.000556677174570513,
+      "loss": 0.5033,
+      "step": 20385
+    },
+    {
+      "epoch": 1.0351562995773629,
+      "grad_norm": 0.025943628794495237,
+      "learning_rate": 0.0005564570708660692,
+      "loss": 0.4955,
+      "step": 20390
+    },
+    {
+      "epoch": 1.0354101356753944,
+      "grad_norm": 0.026309101778550982,
+      "learning_rate": 0.000556236956079384,
+      "loss": 0.4825,
+      "step": 20395
+    },
+    {
+      "epoch": 1.035663971773426,
+      "grad_norm": 0.026006362089191562,
+      "learning_rate": 0.0005560168302536645,
+      "loss": 0.5453,
+      "step": 20400
+    },
+    {
+      "epoch": 1.0359178078714575,
+      "grad_norm": 0.026998637894890546,
+      "learning_rate": 0.0005557966934321208,
+      "loss": 0.5181,
+      "step": 20405
+    },
+    {
+      "epoch": 1.0361716439694888,
+      "grad_norm": 0.023293334910978493,
+      "learning_rate": 0.0005555765456579645,
+      "loss": 0.4928,
+      "step": 20410
+    },
+    {
+      "epoch": 1.0364254800675203,
+      "grad_norm": 0.02235857148830006,
+      "learning_rate": 0.0005553563869744092,
+      "loss": 0.5101,
+      "step": 20415
+    },
+    {
+      "epoch": 1.0366793161655519,
+      "grad_norm": 0.031120430714844608,
+      "learning_rate": 0.0005551362174246714,
+      "loss": 0.4839,
+      "step": 20420
+    },
+    {
+      "epoch": 1.0369331522635834,
+      "grad_norm": 0.03754523909785098,
+      "learning_rate": 0.000554916037051969,
+      "loss": 0.5065,
+      "step": 20425
+    },
+    {
+      "epoch": 1.037186988361615,
+      "grad_norm": 0.02326842333764262,
+      "learning_rate": 0.0005546958458995225,
+      "loss": 0.5045,
+      "step": 20430
+    },
+    {
+      "epoch": 1.0374408244596465,
+      "grad_norm": 0.02252685117622399,
+      "learning_rate": 0.0005544756440105541,
+      "loss": 0.4895,
+      "step": 20435
+    },
+    {
+      "epoch": 1.037694660557678,
+      "grad_norm": 0.022048155029177167,
+      "learning_rate": 0.0005542554314282885,
+      "loss": 0.4994,
+      "step": 20440
+    },
+    {
+      "epoch": 1.0379484966557093,
+      "grad_norm": 0.034805475396144946,
+      "learning_rate": 0.0005540352081959524,
+      "loss": 0.5204,
+      "step": 20445
+    },
+    {
+      "epoch": 1.0382023327537409,
+      "grad_norm": 0.03443533236482359,
+      "learning_rate": 0.0005538149743567742,
+      "loss": 0.4981,
+      "step": 20450
+    },
+    {
+      "epoch": 1.0384561688517724,
+      "grad_norm": 0.02382977310407126,
+      "learning_rate": 0.000553594729953985,
+      "loss": 0.51,
+      "step": 20455
+    },
+    {
+      "epoch": 1.038710004949804,
+      "grad_norm": 0.025391666920891502,
+      "learning_rate": 0.0005533744750308173,
+      "loss": 0.5266,
+      "step": 20460
+    },
+    {
+      "epoch": 1.0389638410478355,
+      "grad_norm": 0.03609117313209483,
+      "learning_rate": 0.0005531542096305067,
+      "loss": 0.4907,
+      "step": 20465
+    },
+    {
+      "epoch": 1.039217677145867,
+      "grad_norm": 0.023507381495373715,
+      "learning_rate": 0.0005529339337962898,
+      "loss": 0.4975,
+      "step": 20470
+    },
+    {
+      "epoch": 1.0394715132438983,
+      "grad_norm": 0.07606685865235592,
+      "learning_rate": 0.0005527136475714055,
+      "loss": 0.4953,
+      "step": 20475
+    },
+    {
+      "epoch": 1.0397253493419298,
+      "grad_norm": 0.025963486371282662,
+      "learning_rate": 0.0005524933509990953,
+      "loss": 0.5241,
+      "step": 20480
+    },
+    {
+      "epoch": 1.0399791854399614,
+      "grad_norm": 0.03372063969736153,
+      "learning_rate": 0.0005522730441226019,
+      "loss": 0.4818,
+      "step": 20485
+    },
+    {
+      "epoch": 1.040233021537993,
+      "grad_norm": 0.024045885674214548,
+      "learning_rate": 0.0005520527269851707,
+      "loss": 0.5318,
+      "step": 20490
+    },
+    {
+      "epoch": 1.0404868576360244,
+      "grad_norm": 0.025422330712362343,
+      "learning_rate": 0.0005518323996300486,
+      "loss": 0.4953,
+      "step": 20495
+    },
+    {
+      "epoch": 1.040740693734056,
+      "grad_norm": 0.033795313139793635,
+      "learning_rate": 0.0005516120621004852,
+      "loss": 0.5,
+      "step": 20500
+    },
+    {
+      "epoch": 1.0409945298320875,
+      "grad_norm": 0.023641788988281797,
+      "learning_rate": 0.0005513917144397313,
+      "loss": 0.5393,
+      "step": 20505
+    },
+    {
+      "epoch": 1.0412483659301188,
+      "grad_norm": 0.029211469214469557,
+      "learning_rate": 0.0005511713566910401,
+      "loss": 0.5019,
+      "step": 20510
+    },
+    {
+      "epoch": 1.0415022020281504,
+      "grad_norm": 0.03256182907446245,
+      "learning_rate": 0.0005509509888976668,
+      "loss": 0.5106,
+      "step": 20515
+    },
+    {
+      "epoch": 1.041756038126182,
+      "grad_norm": 0.03264231316359048,
+      "learning_rate": 0.0005507306111028683,
+      "loss": 0.5352,
+      "step": 20520
+    },
+    {
+      "epoch": 1.0420098742242134,
+      "grad_norm": 0.04894371601933567,
+      "learning_rate": 0.000550510223349904,
+      "loss": 0.5144,
+      "step": 20525
+    },
+    {
+      "epoch": 1.042263710322245,
+      "grad_norm": 0.0439869944668601,
+      "learning_rate": 0.0005502898256820349,
+      "loss": 0.5165,
+      "step": 20530
+    },
+    {
+      "epoch": 1.0425175464202765,
+      "grad_norm": 0.03490709301718278,
+      "learning_rate": 0.0005500694181425237,
+      "loss": 0.5228,
+      "step": 20535
+    },
+    {
+      "epoch": 1.042771382518308,
+      "grad_norm": 0.02637599113409724,
+      "learning_rate": 0.0005498490007746354,
+      "loss": 0.4868,
+      "step": 20540
+    },
+    {
+      "epoch": 1.0430252186163393,
+      "grad_norm": 0.030905442645324806,
+      "learning_rate": 0.0005496285736216369,
+      "loss": 0.5007,
+      "step": 20545
+    },
+    {
+      "epoch": 1.0432790547143709,
+      "grad_norm": 0.022912410310244463,
+      "learning_rate": 0.0005494081367267968,
+      "loss": 0.531,
+      "step": 20550
+    },
+    {
+      "epoch": 1.0435328908124024,
+      "grad_norm": 0.03822182605381993,
+      "learning_rate": 0.0005491876901333859,
+      "loss": 0.5184,
+      "step": 20555
+    },
+    {
+      "epoch": 1.043786726910434,
+      "grad_norm": 0.03854079290096544,
+      "learning_rate": 0.0005489672338846767,
+      "loss": 0.5451,
+      "step": 20560
+    },
+    {
+      "epoch": 1.0440405630084655,
+      "grad_norm": 0.023025560697022475,
+      "learning_rate": 0.0005487467680239437,
+      "loss": 0.486,
+      "step": 20565
+    },
+    {
+      "epoch": 1.044294399106497,
+      "grad_norm": 0.02820566558878494,
+      "learning_rate": 0.0005485262925944633,
+      "loss": 0.5208,
+      "step": 20570
+    },
+    {
+      "epoch": 1.0445482352045283,
+      "grad_norm": 0.023098213177233765,
+      "learning_rate": 0.0005483058076395136,
+      "loss": 0.5211,
+      "step": 20575
+    },
+    {
+      "epoch": 1.0448020713025599,
+      "grad_norm": 0.032862139607394404,
+      "learning_rate": 0.0005480853132023746,
+      "loss": 0.4847,
+      "step": 20580
+    },
+    {
+      "epoch": 1.0450559074005914,
+      "grad_norm": 0.02296236963012799,
+      "learning_rate": 0.0005478648093263286,
+      "loss": 0.4886,
+      "step": 20585
+    },
+    {
+      "epoch": 1.045309743498623,
+      "grad_norm": 0.022572602535311893,
+      "learning_rate": 0.0005476442960546592,
+      "loss": 0.5119,
+      "step": 20590
+    },
+    {
+      "epoch": 1.0455635795966545,
+      "grad_norm": 0.030771735483987164,
+      "learning_rate": 0.0005474237734306522,
+      "loss": 0.4762,
+      "step": 20595
+    },
+    {
+      "epoch": 1.045817415694686,
+      "grad_norm": 0.03547547036096252,
+      "learning_rate": 0.0005472032414975949,
+      "loss": 0.5127,
+      "step": 20600
+    },
+    {
+      "epoch": 1.0460712517927175,
+      "grad_norm": 0.0254360744170134,
+      "learning_rate": 0.0005469827002987767,
+      "loss": 0.4929,
+      "step": 20605
+    },
+    {
+      "epoch": 1.0463250878907489,
+      "grad_norm": 0.025518311798158718,
+      "learning_rate": 0.0005467621498774886,
+      "loss": 0.5245,
+      "step": 20610
+    },
+    {
+      "epoch": 1.0465789239887804,
+      "grad_norm": 0.026490108041312813,
+      "learning_rate": 0.0005465415902770238,
+      "loss": 0.4944,
+      "step": 20615
+    },
+    {
+      "epoch": 1.046832760086812,
+      "grad_norm": 0.02526869174684408,
+      "learning_rate": 0.0005463210215406769,
+      "loss": 0.5045,
+      "step": 20620
+    },
+    {
+      "epoch": 1.0470865961848435,
+      "grad_norm": 0.02074411912466121,
+      "learning_rate": 0.0005461004437117445,
+      "loss": 0.51,
+      "step": 20625
+    },
+    {
+      "epoch": 1.047340432282875,
+      "grad_norm": 0.027093812806925615,
+      "learning_rate": 0.0005458798568335249,
+      "loss": 0.5166,
+      "step": 20630
+    },
+    {
+      "epoch": 1.0475942683809065,
+      "grad_norm": 0.02436102653069909,
+      "learning_rate": 0.0005456592609493182,
+      "loss": 0.5132,
+      "step": 20635
+    },
+    {
+      "epoch": 1.047848104478938,
+      "grad_norm": 0.023097199436276732,
+      "learning_rate": 0.0005454386561024263,
+      "loss": 0.4994,
+      "step": 20640
+    },
+    {
+      "epoch": 1.0481019405769694,
+      "grad_norm": 0.026331858968549688,
+      "learning_rate": 0.0005452180423361528,
+      "loss": 0.5153,
+      "step": 20645
+    },
+    {
+      "epoch": 1.048355776675001,
+      "grad_norm": 0.02645976286908623,
+      "learning_rate": 0.0005449974196938031,
+      "loss": 0.5188,
+      "step": 20650
+    },
+    {
+      "epoch": 1.0486096127730324,
+      "grad_norm": 0.025310276627568677,
+      "learning_rate": 0.0005447767882186844,
+      "loss": 0.53,
+      "step": 20655
+    },
+    {
+      "epoch": 1.048863448871064,
+      "grad_norm": 0.0261930070884005,
+      "learning_rate": 0.0005445561479541053,
+      "loss": 0.4882,
+      "step": 20660
+    },
+    {
+      "epoch": 1.0491172849690955,
+      "grad_norm": 0.025129244942486483,
+      "learning_rate": 0.0005443354989433766,
+      "loss": 0.4951,
+      "step": 20665
+    },
+    {
+      "epoch": 1.049371121067127,
+      "grad_norm": 0.020547190474118154,
+      "learning_rate": 0.0005441148412298106,
+      "loss": 0.4623,
+      "step": 20670
+    },
+    {
+      "epoch": 1.0496249571651584,
+      "grad_norm": 0.02180036590225857,
+      "learning_rate": 0.0005438941748567212,
+      "loss": 0.5372,
+      "step": 20675
+    },
+    {
+      "epoch": 1.04987879326319,
+      "grad_norm": 0.026634325082656105,
+      "learning_rate": 0.0005436734998674242,
+      "loss": 0.526,
+      "step": 20680
+    },
+    {
+      "epoch": 1.0501326293612214,
+      "grad_norm": 0.022973149324837107,
+      "learning_rate": 0.0005434528163052371,
+      "loss": 0.4898,
+      "step": 20685
+    },
+    {
+      "epoch": 1.050386465459253,
+      "grad_norm": 0.024359590869779388,
+      "learning_rate": 0.0005432321242134787,
+      "loss": 0.504,
+      "step": 20690
+    },
+    {
+      "epoch": 1.0506403015572845,
+      "grad_norm": 0.024548501089468676,
+      "learning_rate": 0.0005430114236354701,
+      "loss": 0.5368,
+      "step": 20695
+    },
+    {
+      "epoch": 1.050894137655316,
+      "grad_norm": 0.021875608126201377,
+      "learning_rate": 0.0005427907146145333,
+      "loss": 0.5023,
+      "step": 20700
+    },
+    {
+      "epoch": 1.0511479737533476,
+      "grad_norm": 0.03438836236481197,
+      "learning_rate": 0.0005425699971939927,
+      "loss": 0.5339,
+      "step": 20705
+    },
+    {
+      "epoch": 1.0514018098513789,
+      "grad_norm": 0.023250977742150812,
+      "learning_rate": 0.000542349271417174,
+      "loss": 0.5067,
+      "step": 20710
+    },
+    {
+      "epoch": 1.0516556459494104,
+      "grad_norm": 0.030063734343917937,
+      "learning_rate": 0.0005421285373274045,
+      "loss": 0.4875,
+      "step": 20715
+    },
+    {
+      "epoch": 1.051909482047442,
+      "grad_norm": 0.028285057645613915,
+      "learning_rate": 0.0005419077949680132,
+      "loss": 0.5006,
+      "step": 20720
+    },
+    {
+      "epoch": 1.0521633181454735,
+      "grad_norm": 0.023333102184669698,
+      "learning_rate": 0.0005416870443823308,
+      "loss": 0.4778,
+      "step": 20725
+    },
+    {
+      "epoch": 1.052417154243505,
+      "grad_norm": 0.02557383529258429,
+      "learning_rate": 0.0005414662856136894,
+      "loss": 0.53,
+      "step": 20730
+    },
+    {
+      "epoch": 1.0526709903415365,
+      "grad_norm": 0.023805131180192746,
+      "learning_rate": 0.0005412455187054229,
+      "loss": 0.5292,
+      "step": 20735
+    },
+    {
+      "epoch": 1.0529248264395679,
+      "grad_norm": 0.049484796752778205,
+      "learning_rate": 0.0005410247437008668,
+      "loss": 0.5113,
+      "step": 20740
+    },
+    {
+      "epoch": 1.0531786625375994,
+      "grad_norm": 0.04103309197810282,
+      "learning_rate": 0.0005408039606433582,
+      "loss": 0.5546,
+      "step": 20745
+    },
+    {
+      "epoch": 1.053432498635631,
+      "grad_norm": 0.03609048454627881,
+      "learning_rate": 0.0005405831695762355,
+      "loss": 0.5268,
+      "step": 20750
+    },
+    {
+      "epoch": 1.0536863347336625,
+      "grad_norm": 0.02838028673259011,
+      "learning_rate": 0.0005403623705428391,
+      "loss": 0.5169,
+      "step": 20755
+    },
+    {
+      "epoch": 1.053940170831694,
+      "grad_norm": 0.02412838334841983,
+      "learning_rate": 0.0005401415635865106,
+      "loss": 0.4983,
+      "step": 20760
+    },
+    {
+      "epoch": 1.0541940069297255,
+      "grad_norm": 0.023534830774553976,
+      "learning_rate": 0.0005399207487505934,
+      "loss": 0.4908,
+      "step": 20765
+    },
+    {
+      "epoch": 1.054447843027757,
+      "grad_norm": 0.08793074530330729,
+      "learning_rate": 0.0005396999260784323,
+      "loss": 0.4971,
+      "step": 20770
+    },
+    {
+      "epoch": 1.0547016791257884,
+      "grad_norm": 0.02146817209464667,
+      "learning_rate": 0.0005394790956133736,
+      "loss": 0.487,
+      "step": 20775
+    },
+    {
+      "epoch": 1.05495551522382,
+      "grad_norm": 0.02956735657834776,
+      "learning_rate": 0.0005392582573987654,
+      "loss": 0.4974,
+      "step": 20780
+    },
+    {
+      "epoch": 1.0552093513218515,
+      "grad_norm": 0.03392596129392696,
+      "learning_rate": 0.0005390374114779571,
+      "loss": 0.5112,
+      "step": 20785
+    },
+    {
+      "epoch": 1.055463187419883,
+      "grad_norm": 0.024893466438836578,
+      "learning_rate": 0.0005388165578942993,
+      "loss": 0.4961,
+      "step": 20790
+    },
+    {
+      "epoch": 1.0557170235179145,
+      "grad_norm": 0.02466824162848842,
+      "learning_rate": 0.0005385956966911451,
+      "loss": 0.5279,
+      "step": 20795
+    },
+    {
+      "epoch": 1.055970859615946,
+      "grad_norm": 0.021932982882764882,
+      "learning_rate": 0.000538374827911848,
+      "loss": 0.5209,
+      "step": 20800
+    },
+    {
+      "epoch": 1.0562246957139776,
+      "grad_norm": 0.031454975273976886,
+      "learning_rate": 0.0005381539515997636,
+      "loss": 0.5206,
+      "step": 20805
+    },
+    {
+      "epoch": 1.056478531812009,
+      "grad_norm": 0.02097302093594751,
+      "learning_rate": 0.0005379330677982487,
+      "loss": 0.4813,
+      "step": 20810
+    },
+    {
+      "epoch": 1.0567323679100404,
+      "grad_norm": 0.027013738993206558,
+      "learning_rate": 0.0005377121765506619,
+      "loss": 0.5156,
+      "step": 20815
+    },
+    {
+      "epoch": 1.056986204008072,
+      "grad_norm": 0.035362483004186006,
+      "learning_rate": 0.0005374912779003626,
+      "loss": 0.5105,
+      "step": 20820
+    },
+    {
+      "epoch": 1.0572400401061035,
+      "grad_norm": 0.028929030452595348,
+      "learning_rate": 0.0005372703718907127,
+      "loss": 0.5047,
+      "step": 20825
+    },
+    {
+      "epoch": 1.057493876204135,
+      "grad_norm": 0.024817950996213613,
+      "learning_rate": 0.0005370494585650746,
+      "loss": 0.4983,
+      "step": 20830
+    },
+    {
+      "epoch": 1.0577477123021666,
+      "grad_norm": 0.022644591412287526,
+      "learning_rate": 0.0005368285379668125,
+      "loss": 0.5294,
+      "step": 20835
+    },
+    {
+      "epoch": 1.0580015484001979,
+      "grad_norm": 0.02520849939725291,
+      "learning_rate": 0.0005366076101392922,
+      "loss": 0.4923,
+      "step": 20840
+    },
+    {
+      "epoch": 1.0582553844982294,
+      "grad_norm": 0.023884449834670873,
+      "learning_rate": 0.0005363866751258805,
+      "loss": 0.5227,
+      "step": 20845
+    },
+    {
+      "epoch": 1.058509220596261,
+      "grad_norm": 0.0246942336074402,
+      "learning_rate": 0.0005361657329699457,
+      "loss": 0.5014,
+      "step": 20850
+    },
+    {
+      "epoch": 1.0587630566942925,
+      "grad_norm": 0.024562310682568606,
+      "learning_rate": 0.0005359447837148582,
+      "loss": 0.5134,
+      "step": 20855
+    },
+    {
+      "epoch": 1.059016892792324,
+      "grad_norm": 0.023069233139811948,
+      "learning_rate": 0.0005357238274039888,
+      "loss": 0.5302,
+      "step": 20860
+    },
+    {
+      "epoch": 1.0592707288903556,
+      "grad_norm": 0.03147995452519133,
+      "learning_rate": 0.0005355028640807103,
+      "loss": 0.4765,
+      "step": 20865
+    },
+    {
+      "epoch": 1.059524564988387,
+      "grad_norm": 0.023156418652504413,
+      "learning_rate": 0.0005352818937883966,
+      "loss": 0.5326,
+      "step": 20870
+    },
+    {
+      "epoch": 1.0597784010864184,
+      "grad_norm": 0.023516340548810546,
+      "learning_rate": 0.0005350609165704231,
+      "loss": 0.5014,
+      "step": 20875
+    },
+    {
+      "epoch": 1.06003223718445,
+      "grad_norm": 0.02173782389595552,
+      "learning_rate": 0.0005348399324701665,
+      "loss": 0.4892,
+      "step": 20880
+    },
+    {
+      "epoch": 1.0602860732824815,
+      "grad_norm": 0.02497605503933453,
+      "learning_rate": 0.0005346189415310049,
+      "loss": 0.5238,
+      "step": 20885
+    },
+    {
+      "epoch": 1.060539909380513,
+      "grad_norm": 0.026640052657665187,
+      "learning_rate": 0.0005343979437963178,
+      "loss": 0.5475,
+      "step": 20890
+    },
+    {
+      "epoch": 1.0607937454785445,
+      "grad_norm": 0.02611282353111445,
+      "learning_rate": 0.0005341769393094857,
+      "loss": 0.5454,
+      "step": 20895
+    },
+    {
+      "epoch": 1.061047581576576,
+      "grad_norm": 0.05663032774630538,
+      "learning_rate": 0.000533955928113891,
+      "loss": 0.5268,
+      "step": 20900
+    },
+    {
+      "epoch": 1.0613014176746076,
+      "grad_norm": 0.027192717640165086,
+      "learning_rate": 0.000533734910252917,
+      "loss": 0.5328,
+      "step": 20905
+    },
+    {
+      "epoch": 1.061555253772639,
+      "grad_norm": 0.0479761926852491,
+      "learning_rate": 0.0005335138857699482,
+      "loss": 0.5321,
+      "step": 20910
+    },
+    {
+      "epoch": 1.0618090898706705,
+      "grad_norm": 0.03449047232824588,
+      "learning_rate": 0.0005332928547083707,
+      "loss": 0.5135,
+      "step": 20915
+    },
+    {
+      "epoch": 1.062062925968702,
+      "grad_norm": 0.03882056462374773,
+      "learning_rate": 0.0005330718171115721,
+      "loss": 0.5159,
+      "step": 20920
+    },
+    {
+      "epoch": 1.0623167620667335,
+      "grad_norm": 0.029049562658363927,
+      "learning_rate": 0.0005328507730229407,
+      "loss": 0.5109,
+      "step": 20925
+    },
+    {
+      "epoch": 1.062570598164765,
+      "grad_norm": 0.03658465415118424,
+      "learning_rate": 0.0005326297224858661,
+      "loss": 0.5055,
+      "step": 20930
+    },
+    {
+      "epoch": 1.0628244342627966,
+      "grad_norm": 0.026304675203290016,
+      "learning_rate": 0.00053240866554374,
+      "loss": 0.4788,
+      "step": 20935
+    },
+    {
+      "epoch": 1.063078270360828,
+      "grad_norm": 0.02473561068343538,
+      "learning_rate": 0.0005321876022399542,
+      "loss": 0.4786,
+      "step": 20940
+    },
+    {
+      "epoch": 1.0633321064588594,
+      "grad_norm": 0.027156090763984255,
+      "learning_rate": 0.0005319665326179028,
+      "loss": 0.5033,
+      "step": 20945
+    },
+    {
+      "epoch": 1.063585942556891,
+      "grad_norm": 0.02473336770832404,
+      "learning_rate": 0.0005317454567209804,
+      "loss": 0.4906,
+      "step": 20950
+    },
+    {
+      "epoch": 1.0638397786549225,
+      "grad_norm": 0.03156233005770823,
+      "learning_rate": 0.0005315243745925833,
+      "loss": 0.5012,
+      "step": 20955
+    },
+    {
+      "epoch": 1.064093614752954,
+      "grad_norm": 0.02170352188784017,
+      "learning_rate": 0.0005313032862761085,
+      "loss": 0.5181,
+      "step": 20960
+    },
+    {
+      "epoch": 1.0643474508509856,
+      "grad_norm": 0.022630967619430002,
+      "learning_rate": 0.0005310821918149548,
+      "loss": 0.4965,
+      "step": 20965
+    },
+    {
+      "epoch": 1.0646012869490171,
+      "grad_norm": 0.021957567917221844,
+      "learning_rate": 0.0005308610912525218,
+      "loss": 0.5313,
+      "step": 20970
+    },
+    {
+      "epoch": 1.0648551230470484,
+      "grad_norm": 0.0830362281234276,
+      "learning_rate": 0.0005306399846322106,
+      "loss": 0.5142,
+      "step": 20975
+    },
+    {
+      "epoch": 1.06510895914508,
+      "grad_norm": 0.021880705238877092,
+      "learning_rate": 0.000530418871997423,
+      "loss": 0.4935,
+      "step": 20980
+    },
+    {
+      "epoch": 1.0653627952431115,
+      "grad_norm": 0.025755954372723704,
+      "learning_rate": 0.0005301977533915627,
+      "loss": 0.5006,
+      "step": 20985
+    },
+    {
+      "epoch": 1.065616631341143,
+      "grad_norm": 0.02290313209246396,
+      "learning_rate": 0.000529976628858034,
+      "loss": 0.4983,
+      "step": 20990
+    },
+    {
+      "epoch": 1.0658704674391746,
+      "grad_norm": 0.02118947492338632,
+      "learning_rate": 0.0005297554984402426,
+      "loss": 0.5367,
+      "step": 20995
+    },
+    {
+      "epoch": 1.066124303537206,
+      "grad_norm": 0.025500946021106714,
+      "learning_rate": 0.0005295343621815952,
+      "loss": 0.5239,
+      "step": 21000
+    },
+    {
+      "epoch": 1.0663781396352374,
+      "grad_norm": 0.02849283742078081,
+      "learning_rate": 0.0005293132201254996,
+      "loss": 0.5033,
+      "step": 21005
+    },
+    {
+      "epoch": 1.066631975733269,
+      "grad_norm": 0.023519558376559214,
+      "learning_rate": 0.0005290920723153653,
+      "loss": 0.5283,
+      "step": 21010
+    },
+    {
+      "epoch": 1.0668858118313005,
+      "grad_norm": 0.023929624757609843,
+      "learning_rate": 0.0005288709187946022,
+      "loss": 0.5176,
+      "step": 21015
+    },
+    {
+      "epoch": 1.067139647929332,
+      "grad_norm": 0.024787372939198683,
+      "learning_rate": 0.0005286497596066218,
+      "loss": 0.5717,
+      "step": 21020
+    },
+    {
+      "epoch": 1.0673934840273636,
+      "grad_norm": 0.02105047392838496,
+      "learning_rate": 0.0005284285947948364,
+      "loss": 0.4807,
+      "step": 21025
+    },
+    {
+      "epoch": 1.067647320125395,
+      "grad_norm": 0.021809780590215072,
+      "learning_rate": 0.0005282074244026597,
+      "loss": 0.4876,
+      "step": 21030
+    },
+    {
+      "epoch": 1.0679011562234266,
+      "grad_norm": 0.02979096066481348,
+      "learning_rate": 0.0005279862484735059,
+      "loss": 0.4964,
+      "step": 21035
+    },
+    {
+      "epoch": 1.068154992321458,
+      "grad_norm": 0.02698075454085126,
+      "learning_rate": 0.0005277650670507915,
+      "loss": 0.5425,
+      "step": 21040
+    },
+    {
+      "epoch": 1.0684088284194895,
+      "grad_norm": 0.02305290180830383,
+      "learning_rate": 0.0005275438801779327,
+      "loss": 0.4809,
+      "step": 21045
+    },
+    {
+      "epoch": 1.068662664517521,
+      "grad_norm": 0.024541759228573837,
+      "learning_rate": 0.0005273226878983476,
+      "loss": 0.5089,
+      "step": 21050
+    },
+    {
+      "epoch": 1.0689165006155525,
+      "grad_norm": 0.022014145757905085,
+      "learning_rate": 0.0005271014902554552,
+      "loss": 0.4971,
+      "step": 21055
+    },
+    {
+      "epoch": 1.069170336713584,
+      "grad_norm": 0.024792727012130904,
+      "learning_rate": 0.0005268802872926755,
+      "loss": 0.5472,
+      "step": 21060
+    },
+    {
+      "epoch": 1.0694241728116156,
+      "grad_norm": 0.02829315368981998,
+      "learning_rate": 0.0005266590790534292,
+      "loss": 0.5336,
+      "step": 21065
+    },
+    {
+      "epoch": 1.069678008909647,
+      "grad_norm": 0.022298962580040806,
+      "learning_rate": 0.0005264378655811388,
+      "loss": 0.4895,
+      "step": 21070
+    },
+    {
+      "epoch": 1.0699318450076785,
+      "grad_norm": 0.029443771995413214,
+      "learning_rate": 0.0005262166469192273,
+      "loss": 0.5068,
+      "step": 21075
+    },
+    {
+      "epoch": 1.07018568110571,
+      "grad_norm": 0.035247934270422096,
+      "learning_rate": 0.0005259954231111186,
+      "loss": 0.5181,
+      "step": 21080
+    },
+    {
+      "epoch": 1.0704395172037415,
+      "grad_norm": 0.02766821768801607,
+      "learning_rate": 0.000525774194200238,
+      "loss": 0.5312,
+      "step": 21085
+    },
+    {
+      "epoch": 1.070693353301773,
+      "grad_norm": 0.034962832983413614,
+      "learning_rate": 0.0005255529602300118,
+      "loss": 0.5087,
+      "step": 21090
+    },
+    {
+      "epoch": 1.0709471893998046,
+      "grad_norm": 0.024917411843302463,
+      "learning_rate": 0.0005253317212438668,
+      "loss": 0.5247,
+      "step": 21095
+    },
+    {
+      "epoch": 1.0712010254978361,
+      "grad_norm": 0.022976680538935538,
+      "learning_rate": 0.0005251104772852312,
+      "loss": 0.527,
+      "step": 21100
+    },
+    {
+      "epoch": 1.0714548615958674,
+      "grad_norm": 0.023641409157934393,
+      "learning_rate": 0.0005248892283975341,
+      "loss": 0.5004,
+      "step": 21105
+    },
+    {
+      "epoch": 1.071708697693899,
+      "grad_norm": 0.026029533102603888,
+      "learning_rate": 0.0005246679746242058,
+      "loss": 0.5154,
+      "step": 21110
+    },
+    {
+      "epoch": 1.0719625337919305,
+      "grad_norm": 0.03545812118043111,
+      "learning_rate": 0.000524446716008677,
+      "loss": 0.5032,
+      "step": 21115
+    },
+    {
+      "epoch": 1.072216369889962,
+      "grad_norm": 0.02325775970323228,
+      "learning_rate": 0.0005242254525943799,
+      "loss": 0.5166,
+      "step": 21120
+    },
+    {
+      "epoch": 1.0724702059879936,
+      "grad_norm": 0.027063279281270988,
+      "learning_rate": 0.000524004184424747,
+      "loss": 0.5139,
+      "step": 21125
+    },
+    {
+      "epoch": 1.0727240420860251,
+      "grad_norm": 0.022406878965903397,
+      "learning_rate": 0.0005237829115432124,
+      "loss": 0.5163,
+      "step": 21130
+    },
+    {
+      "epoch": 1.0729778781840567,
+      "grad_norm": 0.020476946022531705,
+      "learning_rate": 0.000523561633993211,
+      "loss": 0.4938,
+      "step": 21135
+    },
+    {
+      "epoch": 1.073231714282088,
+      "grad_norm": 0.02946149615897215,
+      "learning_rate": 0.0005233403518181784,
+      "loss": 0.4777,
+      "step": 21140
+    },
+    {
+      "epoch": 1.0734855503801195,
+      "grad_norm": 0.022659371359572644,
+      "learning_rate": 0.000523119065061551,
+      "loss": 0.52,
+      "step": 21145
+    },
+    {
+      "epoch": 1.073739386478151,
+      "grad_norm": 0.024333860488001735,
+      "learning_rate": 0.0005228977737667665,
+      "loss": 0.5307,
+      "step": 21150
+    },
+    {
+      "epoch": 1.0739932225761826,
+      "grad_norm": 0.024349542450303005,
+      "learning_rate": 0.0005226764779772632,
+      "loss": 0.5321,
+      "step": 21155
+    },
+    {
+      "epoch": 1.074247058674214,
+      "grad_norm": 0.0349413047957909,
+      "learning_rate": 0.0005224551777364803,
+      "loss": 0.5386,
+      "step": 21160
+    },
+    {
+      "epoch": 1.0745008947722456,
+      "grad_norm": 0.024434904354044745,
+      "learning_rate": 0.0005222338730878581,
+      "loss": 0.5224,
+      "step": 21165
+    },
+    {
+      "epoch": 1.0747547308702772,
+      "grad_norm": 0.022587237289784082,
+      "learning_rate": 0.0005220125640748375,
+      "loss": 0.5014,
+      "step": 21170
+    },
+    {
+      "epoch": 1.0750085669683085,
+      "grad_norm": 0.028695622755737175,
+      "learning_rate": 0.0005217912507408602,
+      "loss": 0.5129,
+      "step": 21175
+    },
+    {
+      "epoch": 1.07526240306634,
+      "grad_norm": 0.04548078606712318,
+      "learning_rate": 0.0005215699331293692,
+      "loss": 0.4981,
+      "step": 21180
+    },
+    {
+      "epoch": 1.0755162391643716,
+      "grad_norm": 0.03372321027470343,
+      "learning_rate": 0.0005213486112838076,
+      "loss": 0.4986,
+      "step": 21185
+    },
+    {
+      "epoch": 1.075770075262403,
+      "grad_norm": 0.03485560152221166,
+      "learning_rate": 0.0005211272852476204,
+      "loss": 0.5012,
+      "step": 21190
+    },
+    {
+      "epoch": 1.0760239113604346,
+      "grad_norm": 0.025328513759871395,
+      "learning_rate": 0.0005209059550642523,
+      "loss": 0.536,
+      "step": 21195
+    },
+    {
+      "epoch": 1.0762777474584662,
+      "grad_norm": 0.0382721168074841,
+      "learning_rate": 0.0005206846207771496,
+      "loss": 0.5569,
+      "step": 21200
+    },
+    {
+      "epoch": 1.0765315835564975,
+      "grad_norm": 0.02520714240837497,
+      "learning_rate": 0.0005204632824297589,
+      "loss": 0.5249,
+      "step": 21205
+    },
+    {
+      "epoch": 1.076785419654529,
+      "grad_norm": 0.028982430714109,
+      "learning_rate": 0.0005202419400655281,
+      "loss": 0.5233,
+      "step": 21210
+    },
+    {
+      "epoch": 1.0770392557525605,
+      "grad_norm": 0.02574048454288774,
+      "learning_rate": 0.0005200205937279052,
+      "loss": 0.4889,
+      "step": 21215
+    },
+    {
+      "epoch": 1.077293091850592,
+      "grad_norm": 0.03233762725205312,
+      "learning_rate": 0.0005197992434603397,
+      "loss": 0.5201,
+      "step": 21220
+    },
+    {
+      "epoch": 1.0775469279486236,
+      "grad_norm": 0.026051642045672115,
+      "learning_rate": 0.0005195778893062814,
+      "loss": 0.5596,
+      "step": 21225
+    },
+    {
+      "epoch": 1.0778007640466551,
+      "grad_norm": 0.034440397145426654,
+      "learning_rate": 0.000519356531309181,
+      "loss": 0.508,
+      "step": 21230
+    },
+    {
+      "epoch": 1.0780546001446867,
+      "grad_norm": 0.024779579773170867,
+      "learning_rate": 0.0005191351695124902,
+      "loss": 0.511,
+      "step": 21235
+    },
+    {
+      "epoch": 1.078308436242718,
+      "grad_norm": 0.027199827628791112,
+      "learning_rate": 0.000518913803959661,
+      "loss": 0.5391,
+      "step": 21240
+    },
+    {
+      "epoch": 1.0785622723407495,
+      "grad_norm": 0.039295388227057915,
+      "learning_rate": 0.0005186924346941463,
+      "loss": 0.5159,
+      "step": 21245
+    },
+    {
+      "epoch": 1.078816108438781,
+      "grad_norm": 0.03140488277705228,
+      "learning_rate": 0.0005184710617593998,
+      "loss": 0.5333,
+      "step": 21250
+    },
+    {
+      "epoch": 1.0790699445368126,
+      "grad_norm": 0.037347453599965476,
+      "learning_rate": 0.0005182496851988763,
+      "loss": 0.4879,
+      "step": 21255
+    },
+    {
+      "epoch": 1.0793237806348441,
+      "grad_norm": 0.030428369817123643,
+      "learning_rate": 0.0005180283050560304,
+      "loss": 0.5088,
+      "step": 21260
+    },
+    {
+      "epoch": 1.0795776167328757,
+      "grad_norm": 0.02612433843324173,
+      "learning_rate": 0.0005178069213743182,
+      "loss": 0.4955,
+      "step": 21265
+    },
+    {
+      "epoch": 1.079831452830907,
+      "grad_norm": 0.028085153442456857,
+      "learning_rate": 0.0005175855341971961,
+      "loss": 0.5297,
+      "step": 21270
+    },
+    {
+      "epoch": 1.0800852889289385,
+      "grad_norm": 0.030147412276925384,
+      "learning_rate": 0.0005173641435681212,
+      "loss": 0.4905,
+      "step": 21275
+    },
+    {
+      "epoch": 1.08033912502697,
+      "grad_norm": 0.029195226257636587,
+      "learning_rate": 0.0005171427495305517,
+      "loss": 0.5101,
+      "step": 21280
+    },
+    {
+      "epoch": 1.0805929611250016,
+      "grad_norm": 0.02573350843933712,
+      "learning_rate": 0.000516921352127946,
+      "loss": 0.5075,
+      "step": 21285
+    },
+    {
+      "epoch": 1.0808467972230331,
+      "grad_norm": 0.03039462615229446,
+      "learning_rate": 0.0005166999514037631,
+      "loss": 0.5506,
+      "step": 21290
+    },
+    {
+      "epoch": 1.0811006333210647,
+      "grad_norm": 0.03422447727787612,
+      "learning_rate": 0.0005164785474014631,
+      "loss": 0.4838,
+      "step": 21295
+    },
+    {
+      "epoch": 1.0813544694190962,
+      "grad_norm": 0.031835929504930464,
+      "learning_rate": 0.0005162571401645065,
+      "loss": 0.5303,
+      "step": 21300
+    },
+    {
+      "epoch": 1.0816083055171275,
+      "grad_norm": 0.024796482258768755,
+      "learning_rate": 0.0005160357297363541,
+      "loss": 0.5407,
+      "step": 21305
+    },
+    {
+      "epoch": 1.081862141615159,
+      "grad_norm": 0.024946672122012212,
+      "learning_rate": 0.0005158143161604682,
+      "loss": 0.5406,
+      "step": 21310
+    },
+    {
+      "epoch": 1.0821159777131906,
+      "grad_norm": 0.028919849891573573,
+      "learning_rate": 0.0005155928994803108,
+      "loss": 0.5164,
+      "step": 21315
+    },
+    {
+      "epoch": 1.082369813811222,
+      "grad_norm": 0.02634430322655897,
+      "learning_rate": 0.0005153714797393451,
+      "loss": 0.5272,
+      "step": 21320
+    },
+    {
+      "epoch": 1.0826236499092536,
+      "grad_norm": 0.034896141494776906,
+      "learning_rate": 0.0005151500569810345,
+      "loss": 0.5127,
+      "step": 21325
+    },
+    {
+      "epoch": 1.0828774860072852,
+      "grad_norm": 0.022623036190050343,
+      "learning_rate": 0.0005149286312488432,
+      "loss": 0.5492,
+      "step": 21330
+    },
+    {
+      "epoch": 1.0831313221053165,
+      "grad_norm": 0.02338043777628199,
+      "learning_rate": 0.0005147072025862362,
+      "loss": 0.4818,
+      "step": 21335
+    },
+    {
+      "epoch": 1.083385158203348,
+      "grad_norm": 0.02630639246261155,
+      "learning_rate": 0.0005144857710366785,
+      "loss": 0.5142,
+      "step": 21340
+    },
+    {
+      "epoch": 1.0836389943013796,
+      "grad_norm": 0.02480597381903302,
+      "learning_rate": 0.0005142643366436362,
+      "loss": 0.5441,
+      "step": 21345
+    },
+    {
+      "epoch": 1.083892830399411,
+      "grad_norm": 0.02501822087153717,
+      "learning_rate": 0.0005140428994505759,
+      "loss": 0.5228,
+      "step": 21350
+    },
+    {
+      "epoch": 1.0841466664974426,
+      "grad_norm": 0.027223628649433902,
+      "learning_rate": 0.0005138214595009643,
+      "loss": 0.5148,
+      "step": 21355
+    },
+    {
+      "epoch": 1.0844005025954742,
+      "grad_norm": 0.022696406307624702,
+      "learning_rate": 0.0005136000168382693,
+      "loss": 0.5075,
+      "step": 21360
+    },
+    {
+      "epoch": 1.0846543386935057,
+      "grad_norm": 0.032940782729276794,
+      "learning_rate": 0.0005133785715059586,
+      "loss": 0.5322,
+      "step": 21365
+    },
+    {
+      "epoch": 1.084908174791537,
+      "grad_norm": 0.022402759329344542,
+      "learning_rate": 0.0005131571235475012,
+      "loss": 0.5077,
+      "step": 21370
+    },
+    {
+      "epoch": 1.0851620108895685,
+      "grad_norm": 0.038649939505366236,
+      "learning_rate": 0.000512935673006366,
+      "loss": 0.5236,
+      "step": 21375
+    },
+    {
+      "epoch": 1.0854158469876,
+      "grad_norm": 0.022969477729405774,
+      "learning_rate": 0.0005127142199260228,
+      "loss": 0.4826,
+      "step": 21380
+    },
+    {
+      "epoch": 1.0856696830856316,
+      "grad_norm": 0.02759754028356563,
+      "learning_rate": 0.0005124927643499415,
+      "loss": 0.5351,
+      "step": 21385
+    },
+    {
+      "epoch": 1.0859235191836631,
+      "grad_norm": 0.026556927936903868,
+      "learning_rate": 0.000512271306321593,
+      "loss": 0.5354,
+      "step": 21390
+    },
+    {
+      "epoch": 1.0861773552816947,
+      "grad_norm": 0.024696980583765446,
+      "learning_rate": 0.000512049845884448,
+      "loss": 0.5162,
+      "step": 21395
+    },
+    {
+      "epoch": 1.0864311913797262,
+      "grad_norm": 0.026456834753896295,
+      "learning_rate": 0.0005118283830819786,
+      "loss": 0.4936,
+      "step": 21400
+    },
+    {
+      "epoch": 1.0866850274777575,
+      "grad_norm": 0.023930744190106705,
+      "learning_rate": 0.0005116069179576565,
+      "loss": 0.4917,
+      "step": 21405
+    },
+    {
+      "epoch": 1.086938863575789,
+      "grad_norm": 0.02349838967686964,
+      "learning_rate": 0.0005113854505549543,
+      "loss": 0.5321,
+      "step": 21410
+    },
+    {
+      "epoch": 1.0871926996738206,
+      "grad_norm": 0.032693372410154034,
+      "learning_rate": 0.000511163980917345,
+      "loss": 0.471,
+      "step": 21415
+    },
+    {
+      "epoch": 1.0874465357718521,
+      "grad_norm": 0.03223543121863213,
+      "learning_rate": 0.0005109425090883019,
+      "loss": 0.5017,
+      "step": 21420
+    },
+    {
+      "epoch": 1.0877003718698837,
+      "grad_norm": 0.027354722210189156,
+      "learning_rate": 0.0005107210351112986,
+      "loss": 0.5257,
+      "step": 21425
+    },
+    {
+      "epoch": 1.0879542079679152,
+      "grad_norm": 0.020761117911752558,
+      "learning_rate": 0.0005104995590298098,
+      "loss": 0.5032,
+      "step": 21430
+    },
+    {
+      "epoch": 1.0882080440659467,
+      "grad_norm": 0.025848466019290892,
+      "learning_rate": 0.0005102780808873098,
+      "loss": 0.4959,
+      "step": 21435
+    },
+    {
+      "epoch": 1.088461880163978,
+      "grad_norm": 0.026496761790380605,
+      "learning_rate": 0.000510056600727274,
+      "loss": 0.4579,
+      "step": 21440
+    },
+    {
+      "epoch": 1.0887157162620096,
+      "grad_norm": 0.024422392773643196,
+      "learning_rate": 0.0005098351185931775,
+      "loss": 0.503,
+      "step": 21445
+    },
+    {
+      "epoch": 1.0889695523600411,
+      "grad_norm": 0.04356332488795266,
+      "learning_rate": 0.0005096136345284963,
+      "loss": 0.5012,
+      "step": 21450
+    },
+    {
+      "epoch": 1.0892233884580726,
+      "grad_norm": 0.025205409841538344,
+      "learning_rate": 0.0005093921485767066,
+      "loss": 0.5125,
+      "step": 21455
+    },
+    {
+      "epoch": 1.0894772245561042,
+      "grad_norm": 0.03256381052307056,
+      "learning_rate": 0.0005091706607812848,
+      "loss": 0.5021,
+      "step": 21460
+    },
+    {
+      "epoch": 1.0897310606541357,
+      "grad_norm": 0.022789084607815834,
+      "learning_rate": 0.0005089491711857083,
+      "loss": 0.5027,
+      "step": 21465
+    },
+    {
+      "epoch": 1.089984896752167,
+      "grad_norm": 0.022777957966430507,
+      "learning_rate": 0.0005087276798334539,
+      "loss": 0.4724,
+      "step": 21470
+    },
+    {
+      "epoch": 1.0902387328501986,
+      "grad_norm": 0.024179892808939247,
+      "learning_rate": 0.0005085061867679995,
+      "loss": 0.5076,
+      "step": 21475
+    },
+    {
+      "epoch": 1.09049256894823,
+      "grad_norm": 0.023288334670357742,
+      "learning_rate": 0.0005082846920328232,
+      "loss": 0.5211,
+      "step": 21480
+    },
+    {
+      "epoch": 1.0907464050462616,
+      "grad_norm": 0.024409120224983153,
+      "learning_rate": 0.0005080631956714029,
+      "loss": 0.5234,
+      "step": 21485
+    },
+    {
+      "epoch": 1.0910002411442932,
+      "grad_norm": 0.02599437175288467,
+      "learning_rate": 0.0005078416977272178,
+      "loss": 0.5029,
+      "step": 21490
+    },
+    {
+      "epoch": 1.0912540772423247,
+      "grad_norm": 0.022222357794203767,
+      "learning_rate": 0.0005076201982437464,
+      "loss": 0.5228,
+      "step": 21495
+    },
+    {
+      "epoch": 1.0915079133403562,
+      "grad_norm": 0.02880385866020692,
+      "learning_rate": 0.0005073986972644681,
+      "loss": 0.5447,
+      "step": 21500
+    },
+    {
+      "epoch": 1.0917617494383876,
+      "grad_norm": 0.020822683076055126,
+      "learning_rate": 0.0005071771948328624,
+      "loss": 0.4931,
+      "step": 21505
+    },
+    {
+      "epoch": 1.092015585536419,
+      "grad_norm": 0.02398468484355743,
+      "learning_rate": 0.0005069556909924092,
+      "loss": 0.482,
+      "step": 21510
+    },
+    {
+      "epoch": 1.0922694216344506,
+      "grad_norm": 0.02807227467217408,
+      "learning_rate": 0.0005067341857865885,
+      "loss": 0.5237,
+      "step": 21515
+    },
+    {
+      "epoch": 1.0925232577324822,
+      "grad_norm": 0.02290810606351165,
+      "learning_rate": 0.0005065126792588807,
+      "loss": 0.5166,
+      "step": 21520
+    },
+    {
+      "epoch": 1.0927770938305137,
+      "grad_norm": 0.028239546173554175,
+      "learning_rate": 0.0005062911714527664,
+      "loss": 0.5152,
+      "step": 21525
+    },
+    {
+      "epoch": 1.0930309299285452,
+      "grad_norm": 0.026605118843009996,
+      "learning_rate": 0.0005060696624117266,
+      "loss": 0.4891,
+      "step": 21530
+    },
+    {
+      "epoch": 1.0932847660265765,
+      "grad_norm": 0.03451731867184281,
+      "learning_rate": 0.0005058481521792424,
+      "loss": 0.5054,
+      "step": 21535
+    },
+    {
+      "epoch": 1.093538602124608,
+      "grad_norm": 0.022870995542147783,
+      "learning_rate": 0.000505626640798795,
+      "loss": 0.5094,
+      "step": 21540
+    },
+    {
+      "epoch": 1.0937924382226396,
+      "grad_norm": 0.023384178570215613,
+      "learning_rate": 0.000505405128313866,
+      "loss": 0.4954,
+      "step": 21545
+    },
+    {
+      "epoch": 1.0940462743206711,
+      "grad_norm": 0.021332217474057745,
+      "learning_rate": 0.0005051836147679374,
+      "loss": 0.4802,
+      "step": 21550
+    },
+    {
+      "epoch": 1.0943001104187027,
+      "grad_norm": 0.025123000830193358,
+      "learning_rate": 0.000504962100204491,
+      "loss": 0.4802,
+      "step": 21555
+    },
+    {
+      "epoch": 1.0945539465167342,
+      "grad_norm": 0.025473977289128598,
+      "learning_rate": 0.0005047405846670091,
+      "loss": 0.4966,
+      "step": 21560
+    },
+    {
+      "epoch": 1.0948077826147657,
+      "grad_norm": 0.026371420615893433,
+      "learning_rate": 0.0005045190681989742,
+      "loss": 0.4988,
+      "step": 21565
+    },
+    {
+      "epoch": 1.095061618712797,
+      "grad_norm": 0.022460113412766883,
+      "learning_rate": 0.0005042975508438687,
+      "loss": 0.4925,
+      "step": 21570
+    },
+    {
+      "epoch": 1.0953154548108286,
+      "grad_norm": 0.02307578944045605,
+      "learning_rate": 0.0005040760326451752,
+      "loss": 0.4906,
+      "step": 21575
+    },
+    {
+      "epoch": 1.0955692909088601,
+      "grad_norm": 0.03059351913144231,
+      "learning_rate": 0.000503854513646377,
+      "loss": 0.4782,
+      "step": 21580
+    },
+    {
+      "epoch": 1.0958231270068917,
+      "grad_norm": 0.024544268108570917,
+      "learning_rate": 0.000503632993890957,
+      "loss": 0.4977,
+      "step": 21585
+    },
+    {
+      "epoch": 1.0960769631049232,
+      "grad_norm": 0.021615409644089055,
+      "learning_rate": 0.0005034114734223983,
+      "loss": 0.5147,
+      "step": 21590
+    },
+    {
+      "epoch": 1.0963307992029547,
+      "grad_norm": 0.025081101527570617,
+      "learning_rate": 0.0005031899522841845,
+      "loss": 0.5216,
+      "step": 21595
+    },
+    {
+      "epoch": 1.096584635300986,
+      "grad_norm": 0.02738150628716518,
+      "learning_rate": 0.0005029684305197989,
+      "loss": 0.4893,
+      "step": 21600
+    },
+    {
+      "epoch": 1.0968384713990176,
+      "grad_norm": 0.030632362820872255,
+      "learning_rate": 0.000502746908172725,
+      "loss": 0.512,
+      "step": 21605
+    },
+    {
+      "epoch": 1.0970923074970491,
+      "grad_norm": 0.03751097192664053,
+      "learning_rate": 0.000502525385286447,
+      "loss": 0.4916,
+      "step": 21610
+    },
+    {
+      "epoch": 1.0973461435950806,
+      "grad_norm": 0.03045939530777982,
+      "learning_rate": 0.0005023038619044485,
+      "loss": 0.4898,
+      "step": 21615
+    },
+    {
+      "epoch": 1.0975999796931122,
+      "grad_norm": 0.025961661067633108,
+      "learning_rate": 0.0005020823380702133,
+      "loss": 0.5124,
+      "step": 21620
+    },
+    {
+      "epoch": 1.0978538157911437,
+      "grad_norm": 0.026346768663115368,
+      "learning_rate": 0.0005018608138272255,
+      "loss": 0.4997,
+      "step": 21625
+    },
+    {
+      "epoch": 1.0981076518891753,
+      "grad_norm": 0.035779285163878446,
+      "learning_rate": 0.0005016392892189692,
+      "loss": 0.5007,
+      "step": 21630
+    },
+    {
+      "epoch": 1.0983614879872066,
+      "grad_norm": 0.022963332458905212,
+      "learning_rate": 0.0005014177642889286,
+      "loss": 0.5089,
+      "step": 21635
+    },
+    {
+      "epoch": 1.098615324085238,
+      "grad_norm": 0.027026555042418753,
+      "learning_rate": 0.000501196239080588,
+      "loss": 0.4791,
+      "step": 21640
+    },
+    {
+      "epoch": 1.0988691601832696,
+      "grad_norm": 0.026351077220473095,
+      "learning_rate": 0.0005009747136374317,
+      "loss": 0.5035,
+      "step": 21645
+    },
+    {
+      "epoch": 1.0991229962813012,
+      "grad_norm": 0.02173089121604654,
+      "learning_rate": 0.0005007531880029438,
+      "loss": 0.4921,
+      "step": 21650
+    },
+    {
+      "epoch": 1.0993768323793327,
+      "grad_norm": 0.02864481532801961,
+      "learning_rate": 0.000500531662220609,
+      "loss": 0.5183,
+      "step": 21655
+    },
+    {
+      "epoch": 1.0996306684773642,
+      "grad_norm": 0.02446699653140137,
+      "learning_rate": 0.0005003101363339114,
+      "loss": 0.5084,
+      "step": 21660
+    },
+    {
+      "epoch": 1.0998845045753955,
+      "grad_norm": 0.023667829604516112,
+      "learning_rate": 0.0005000886103863355,
+      "loss": 0.4801,
+      "step": 21665
+    },
+    {
+      "epoch": 1.100138340673427,
+      "grad_norm": 0.023439234864149394,
+      "learning_rate": 0.0004998670844213661,
+      "loss": 0.5036,
+      "step": 21670
+    },
+    {
+      "epoch": 1.1003921767714586,
+      "grad_norm": 0.027040266970354027,
+      "learning_rate": 0.0004996455584824873,
+      "loss": 0.5186,
+      "step": 21675
+    },
+    {
+      "epoch": 1.1006460128694902,
+      "grad_norm": 0.0317481536525307,
+      "learning_rate": 0.0004994240326131837,
+      "loss": 0.5298,
+      "step": 21680
+    },
+    {
+      "epoch": 1.1008998489675217,
+      "grad_norm": 0.023747077663502984,
+      "learning_rate": 0.0004992025068569395,
+      "loss": 0.5139,
+      "step": 21685
+    },
+    {
+      "epoch": 1.1011536850655532,
+      "grad_norm": 0.035914229145339024,
+      "learning_rate": 0.0004989809812572392,
+      "loss": 0.553,
+      "step": 21690
+    },
+    {
+      "epoch": 1.1014075211635848,
+      "grad_norm": 0.022286343442528934,
+      "learning_rate": 0.0004987594558575673,
+      "loss": 0.511,
+      "step": 21695
+    },
+    {
+      "epoch": 1.1016613572616163,
+      "grad_norm": 0.032836479339666325,
+      "learning_rate": 0.0004985379307014079,
+      "loss": 0.5277,
+      "step": 21700
+    },
+    {
+      "epoch": 1.1019151933596476,
+      "grad_norm": 0.021909939697584042,
+      "learning_rate": 0.0004983164058322455,
+      "loss": 0.5027,
+      "step": 21705
+    },
+    {
+      "epoch": 1.1021690294576791,
+      "grad_norm": 0.036886073050320635,
+      "learning_rate": 0.000498094881293564,
+      "loss": 0.5319,
+      "step": 21710
+    },
+    {
+      "epoch": 1.1024228655557107,
+      "grad_norm": 0.04010799722360534,
+      "learning_rate": 0.000497873357128848,
+      "loss": 0.493,
+      "step": 21715
+    },
+    {
+      "epoch": 1.1026767016537422,
+      "grad_norm": 0.03364367343902673,
+      "learning_rate": 0.0004976518333815814,
+      "loss": 0.5138,
+      "step": 21720
+    },
+    {
+      "epoch": 1.1029305377517737,
+      "grad_norm": 0.03065112641730614,
+      "learning_rate": 0.0004974303100952483,
+      "loss": 0.4981,
+      "step": 21725
+    },
+    {
+      "epoch": 1.1031843738498053,
+      "grad_norm": 0.02896054754547121,
+      "learning_rate": 0.0004972087873133323,
+      "loss": 0.5215,
+      "step": 21730
+    },
+    {
+      "epoch": 1.1034382099478366,
+      "grad_norm": 0.03710991307969362,
+      "learning_rate": 0.0004969872650793176,
+      "loss": 0.4901,
+      "step": 21735
+    },
+    {
+      "epoch": 1.1036920460458681,
+      "grad_norm": 0.022960913289772747,
+      "learning_rate": 0.0004967657434366877,
+      "loss": 0.4889,
+      "step": 21740
+    },
+    {
+      "epoch": 1.1039458821438997,
+      "grad_norm": 0.025188988881946872,
+      "learning_rate": 0.0004965442224289262,
+      "loss": 0.4756,
+      "step": 21745
+    },
+    {
+      "epoch": 1.1041997182419312,
+      "grad_norm": 0.022061989690903828,
+      "learning_rate": 0.0004963227020995167,
+      "loss": 0.5295,
+      "step": 21750
+    },
+    {
+      "epoch": 1.1044535543399627,
+      "grad_norm": 0.03873982984670393,
+      "learning_rate": 0.0004961011824919422,
+      "loss": 0.5255,
+      "step": 21755
+    },
+    {
+      "epoch": 1.1047073904379943,
+      "grad_norm": 0.024110509410144455,
+      "learning_rate": 0.0004958796636496864,
+      "loss": 0.4762,
+      "step": 21760
+    },
+    {
+      "epoch": 1.1049612265360258,
+      "grad_norm": 0.024542539175151357,
+      "learning_rate": 0.0004956581456162319,
+      "loss": 0.4792,
+      "step": 21765
+    },
+    {
+      "epoch": 1.105215062634057,
+      "grad_norm": 0.022363840834245956,
+      "learning_rate": 0.0004954366284350617,
+      "loss": 0.5118,
+      "step": 21770
+    },
+    {
+      "epoch": 1.1054688987320886,
+      "grad_norm": 0.025982163556048574,
+      "learning_rate": 0.0004952151121496587,
+      "loss": 0.5072,
+      "step": 21775
+    },
+    {
+      "epoch": 1.1057227348301202,
+      "grad_norm": 0.02270591702590939,
+      "learning_rate": 0.0004949935968035054,
+      "loss": 0.504,
+      "step": 21780
+    },
+    {
+      "epoch": 1.1059765709281517,
+      "grad_norm": 0.022406489019998924,
+      "learning_rate": 0.000494772082440084,
+      "loss": 0.4982,
+      "step": 21785
+    },
+    {
+      "epoch": 1.1062304070261832,
+      "grad_norm": 0.025696915164668793,
+      "learning_rate": 0.0004945505691028769,
+      "loss": 0.5271,
+      "step": 21790
+    },
+    {
+      "epoch": 1.1064842431242148,
+      "grad_norm": 0.02202726327814844,
+      "learning_rate": 0.0004943290568353657,
+      "loss": 0.4841,
+      "step": 21795
+    },
+    {
+      "epoch": 1.106738079222246,
+      "grad_norm": 0.027349694310620433,
+      "learning_rate": 0.0004941075456810324,
+      "loss": 0.4977,
+      "step": 21800
+    },
+    {
+      "epoch": 1.1069919153202776,
+      "grad_norm": 0.021841600344543598,
+      "learning_rate": 0.0004938860356833585,
+      "loss": 0.4939,
+      "step": 21805
+    },
+    {
+      "epoch": 1.1072457514183092,
+      "grad_norm": 0.024165588262214474,
+      "learning_rate": 0.0004936645268858253,
+      "loss": 0.4974,
+      "step": 21810
+    },
+    {
+      "epoch": 1.1074995875163407,
+      "grad_norm": 0.020070193682354385,
+      "learning_rate": 0.000493443019331914,
+      "loss": 0.4816,
+      "step": 21815
+    },
+    {
+      "epoch": 1.1077534236143722,
+      "grad_norm": 0.024862032516753653,
+      "learning_rate": 0.0004932215130651052,
+      "loss": 0.5147,
+      "step": 21820
+    },
+    {
+      "epoch": 1.1080072597124038,
+      "grad_norm": 0.025675685849875715,
+      "learning_rate": 0.0004930000081288797,
+      "loss": 0.4855,
+      "step": 21825
+    },
+    {
+      "epoch": 1.1082610958104353,
+      "grad_norm": 0.02461817640839028,
+      "learning_rate": 0.0004927785045667173,
+      "loss": 0.4864,
+      "step": 21830
+    },
+    {
+      "epoch": 1.1085149319084666,
+      "grad_norm": 0.023549777823439446,
+      "learning_rate": 0.0004925570024220987,
+      "loss": 0.5247,
+      "step": 21835
+    },
+    {
+      "epoch": 1.1087687680064982,
+      "grad_norm": 0.022915635775272027,
+      "learning_rate": 0.0004923355017385035,
+      "loss": 0.4903,
+      "step": 21840
+    },
+    {
+      "epoch": 1.1090226041045297,
+      "grad_norm": 0.028502210315459423,
+      "learning_rate": 0.000492114002559411,
+      "loss": 0.4815,
+      "step": 21845
+    },
+    {
+      "epoch": 1.1092764402025612,
+      "grad_norm": 0.03661671337568482,
+      "learning_rate": 0.0004918925049283005,
+      "loss": 0.5259,
+      "step": 21850
+    },
+    {
+      "epoch": 1.1095302763005928,
+      "grad_norm": 0.034776988326917536,
+      "learning_rate": 0.0004916710088886508,
+      "loss": 0.5042,
+      "step": 21855
+    },
+    {
+      "epoch": 1.1097841123986243,
+      "grad_norm": 0.022641065605251254,
+      "learning_rate": 0.0004914495144839406,
+      "loss": 0.4759,
+      "step": 21860
+    },
+    {
+      "epoch": 1.1100379484966556,
+      "grad_norm": 0.02470161392259303,
+      "learning_rate": 0.0004912280217576481,
+      "loss": 0.484,
+      "step": 21865
+    },
+    {
+      "epoch": 1.1102917845946871,
+      "grad_norm": 0.02216524988218757,
+      "learning_rate": 0.0004910065307532511,
+      "loss": 0.514,
+      "step": 21870
+    },
+    {
+      "epoch": 1.1105456206927187,
+      "grad_norm": 0.022165983415213607,
+      "learning_rate": 0.0004907850415142273,
+      "loss": 0.4873,
+      "step": 21875
+    },
+    {
+      "epoch": 1.1107994567907502,
+      "grad_norm": 0.02581011869369852,
+      "learning_rate": 0.0004905635540840539,
+      "loss": 0.4913,
+      "step": 21880
+    },
+    {
+      "epoch": 1.1110532928887817,
+      "grad_norm": 0.02854559216993533,
+      "learning_rate": 0.0004903420685062077,
+      "loss": 0.4985,
+      "step": 21885
+    },
+    {
+      "epoch": 1.1113071289868133,
+      "grad_norm": 0.031286700118217534,
+      "learning_rate": 0.0004901205848241654,
+      "loss": 0.5251,
+      "step": 21890
+    },
+    {
+      "epoch": 1.1115609650848448,
+      "grad_norm": 0.027160788459885017,
+      "learning_rate": 0.0004898991030814028,
+      "loss": 0.5148,
+      "step": 21895
+    },
+    {
+      "epoch": 1.1118148011828761,
+      "grad_norm": 0.024176529505431532,
+      "learning_rate": 0.000489677623321396,
+      "loss": 0.4936,
+      "step": 21900
+    },
+    {
+      "epoch": 1.1120686372809077,
+      "grad_norm": 0.039337612553737564,
+      "learning_rate": 0.0004894561455876204,
+      "loss": 0.5253,
+      "step": 21905
+    },
+    {
+      "epoch": 1.1123224733789392,
+      "grad_norm": 0.022635785746774068,
+      "learning_rate": 0.0004892346699235507,
+      "loss": 0.5176,
+      "step": 21910
+    },
+    {
+      "epoch": 1.1125763094769707,
+      "grad_norm": 0.02559706784297847,
+      "learning_rate": 0.0004890131963726617,
+      "loss": 0.5059,
+      "step": 21915
+    },
+    {
+      "epoch": 1.1128301455750023,
+      "grad_norm": 0.021313731704569797,
+      "learning_rate": 0.0004887917249784275,
+      "loss": 0.4962,
+      "step": 21920
+    },
+    {
+      "epoch": 1.1130839816730338,
+      "grad_norm": 0.03607627354015021,
+      "learning_rate": 0.0004885702557843217,
+      "loss": 0.4859,
+      "step": 21925
+    },
+    {
+      "epoch": 1.113337817771065,
+      "grad_norm": 0.02559178946483111,
+      "learning_rate": 0.0004883487888338177,
+      "loss": 0.4888,
+      "step": 21930
+    },
+    {
+      "epoch": 1.1135916538690966,
+      "grad_norm": 0.023591231457929233,
+      "learning_rate": 0.0004881273241703884,
+      "loss": 0.5408,
+      "step": 21935
+    },
+    {
+      "epoch": 1.1138454899671282,
+      "grad_norm": 0.032232207206550165,
+      "learning_rate": 0.00048790586183750605,
+      "loss": 0.4845,
+      "step": 21940
+    },
+    {
+      "epoch": 1.1140993260651597,
+      "grad_norm": 0.023369247501599504,
+      "learning_rate": 0.0004876844018786428,
+      "loss": 0.5135,
+      "step": 21945
+    },
+    {
+      "epoch": 1.1143531621631912,
+      "grad_norm": 0.021929243467260426,
+      "learning_rate": 0.00048746294433727003,
+      "loss": 0.5041,
+      "step": 21950
+    },
+    {
+      "epoch": 1.1146069982612228,
+      "grad_norm": 0.024271700794895327,
+      "learning_rate": 0.0004872414892568585,
+      "loss": 0.5053,
+      "step": 21955
+    },
+    {
+      "epoch": 1.1148608343592543,
+      "grad_norm": 0.02341400981114356,
+      "learning_rate": 0.00048702003668087926,
+      "loss": 0.5172,
+      "step": 21960
+    },
+    {
+      "epoch": 1.1151146704572856,
+      "grad_norm": 0.024390827336790148,
+      "learning_rate": 0.00048679858665280206,
+      "loss": 0.4964,
+      "step": 21965
+    },
+    {
+      "epoch": 1.1153685065553172,
+      "grad_norm": 0.023466277161900626,
+      "learning_rate": 0.00048657713921609647,
+      "loss": 0.5098,
+      "step": 21970
+    },
+    {
+      "epoch": 1.1156223426533487,
+      "grad_norm": 0.03522806209015746,
+      "learning_rate": 0.0004863556944142316,
+      "loss": 0.5102,
+      "step": 21975
+    },
+    {
+      "epoch": 1.1158761787513802,
+      "grad_norm": 0.023062324780772376,
+      "learning_rate": 0.00048613425229067575,
+      "loss": 0.4558,
+      "step": 21980
+    },
+    {
+      "epoch": 1.1161300148494118,
+      "grad_norm": 0.029175018799990165,
+      "learning_rate": 0.0004859128128888971,
+      "loss": 0.5498,
+      "step": 21985
+    },
+    {
+      "epoch": 1.1163838509474433,
+      "grad_norm": 0.023244545510022245,
+      "learning_rate": 0.000485691376252363,
+      "loss": 0.5382,
+      "step": 21990
+    },
+    {
+      "epoch": 1.1166376870454748,
+      "grad_norm": 0.02540046969404091,
+      "learning_rate": 0.0004854699424245404,
+      "loss": 0.5167,
+      "step": 21995
+    },
+    {
+      "epoch": 1.1168915231435061,
+      "grad_norm": 0.023122619586228372,
+      "learning_rate": 0.00048524851144889563,
+      "loss": 0.4975,
+      "step": 22000
+    },
+    {
+      "epoch": 1.1171453592415377,
+      "grad_norm": 0.028102268868484732,
+      "learning_rate": 0.0004850270833688945,
+      "loss": 0.5306,
+      "step": 22005
+    },
+    {
+      "epoch": 1.1173991953395692,
+      "grad_norm": 0.028295243268877415,
+      "learning_rate": 0.0004848056582280022,
+      "loss": 0.485,
+      "step": 22010
+    },
+    {
+      "epoch": 1.1176530314376008,
+      "grad_norm": 0.023662648309147206,
+      "learning_rate": 0.00048458423606968337,
+      "loss": 0.5316,
+      "step": 22015
+    },
+    {
+      "epoch": 1.1179068675356323,
+      "grad_norm": 0.023956513141186277,
+      "learning_rate": 0.0004843628169374022,
+      "loss": 0.4989,
+      "step": 22020
+    },
+    {
+      "epoch": 1.1181607036336638,
+      "grad_norm": 0.03533645288614336,
+      "learning_rate": 0.0004841414008746221,
+      "loss": 0.4866,
+      "step": 22025
+    },
+    {
+      "epoch": 1.1184145397316954,
+      "grad_norm": 0.02362689256504165,
+      "learning_rate": 0.0004839199879248059,
+      "loss": 0.5038,
+      "step": 22030
+    },
+    {
+      "epoch": 1.1186683758297267,
+      "grad_norm": 0.022341925551004328,
+      "learning_rate": 0.00048369857813141586,
+      "loss": 0.4776,
+      "step": 22035
+    },
+    {
+      "epoch": 1.1189222119277582,
+      "grad_norm": 0.02777412452937675,
+      "learning_rate": 0.00048347717153791365,
+      "loss": 0.4765,
+      "step": 22040
+    },
+    {
+      "epoch": 1.1191760480257897,
+      "grad_norm": 0.03839285711516218,
+      "learning_rate": 0.0004832557681877603,
+      "loss": 0.4991,
+      "step": 22045
+    },
+    {
+      "epoch": 1.1194298841238213,
+      "grad_norm": 0.02786618691587262,
+      "learning_rate": 0.0004830343681244161,
+      "loss": 0.5314,
+      "step": 22050
+    },
+    {
+      "epoch": 1.1196837202218528,
+      "grad_norm": 0.02177076732255751,
+      "learning_rate": 0.0004828129713913409,
+      "loss": 0.5031,
+      "step": 22055
+    },
+    {
+      "epoch": 1.1199375563198843,
+      "grad_norm": 0.021374169537151152,
+      "learning_rate": 0.0004825915780319937,
+      "loss": 0.4887,
+      "step": 22060
+    },
+    {
+      "epoch": 1.1201913924179157,
+      "grad_norm": 0.021242873714302467,
+      "learning_rate": 0.00048237018808983286,
+      "loss": 0.5098,
+      "step": 22065
+    },
+    {
+      "epoch": 1.1204452285159472,
+      "grad_norm": 0.03862878033938459,
+      "learning_rate": 0.0004821488016083162,
+      "loss": 0.4902,
+      "step": 22070
+    },
+    {
+      "epoch": 1.1206990646139787,
+      "grad_norm": 0.022914131678326958,
+      "learning_rate": 0.0004819274186309005,
+      "loss": 0.5181,
+      "step": 22075
+    },
+    {
+      "epoch": 1.1209529007120103,
+      "grad_norm": 0.02444901284526625,
+      "learning_rate": 0.0004817060392010427,
+      "loss": 0.519,
+      "step": 22080
+    },
+    {
+      "epoch": 1.1212067368100418,
+      "grad_norm": 0.027511001579787645,
+      "learning_rate": 0.0004814846633621981,
+      "loss": 0.5124,
+      "step": 22085
+    },
+    {
+      "epoch": 1.1214605729080733,
+      "grad_norm": 0.02170627891570555,
+      "learning_rate": 0.0004812632911578218,
+      "loss": 0.4825,
+      "step": 22090
+    },
+    {
+      "epoch": 1.1217144090061049,
+      "grad_norm": 0.023975365604404518,
+      "learning_rate": 0.000481041922631368,
+      "loss": 0.5226,
+      "step": 22095
+    },
+    {
+      "epoch": 1.1219682451041362,
+      "grad_norm": 0.023405836814547467,
+      "learning_rate": 0.00048082055782629017,
+      "loss": 0.5045,
+      "step": 22100
+    },
+    {
+      "epoch": 1.1222220812021677,
+      "grad_norm": 0.023988948357005786,
+      "learning_rate": 0.00048059919678604125,
+      "loss": 0.5046,
+      "step": 22105
+    },
+    {
+      "epoch": 1.1224759173001992,
+      "grad_norm": 0.019265243232751396,
+      "learning_rate": 0.0004803778395540733,
+      "loss": 0.4932,
+      "step": 22110
+    },
+    {
+      "epoch": 1.1227297533982308,
+      "grad_norm": 0.020015777068000878,
+      "learning_rate": 0.0004801564861738375,
+      "loss": 0.4692,
+      "step": 22115
+    },
+    {
+      "epoch": 1.1229835894962623,
+      "grad_norm": 0.02271364249987384,
+      "learning_rate": 0.00047993513668878455,
+      "loss": 0.5071,
+      "step": 22120
+    },
+    {
+      "epoch": 1.1232374255942938,
+      "grad_norm": 0.0283767253917993,
+      "learning_rate": 0.0004797137911423642,
+      "loss": 0.49,
+      "step": 22125
+    },
+    {
+      "epoch": 1.1234912616923252,
+      "grad_norm": 0.03057189959396885,
+      "learning_rate": 0.00047949244957802545,
+      "loss": 0.5132,
+      "step": 22130
+    },
+    {
+      "epoch": 1.1237450977903567,
+      "grad_norm": 0.02307975684077437,
+      "learning_rate": 0.0004792711120392165,
+      "loss": 0.5052,
+      "step": 22135
+    },
+    {
+      "epoch": 1.1239989338883882,
+      "grad_norm": 0.022609530542469808,
+      "learning_rate": 0.00047904977856938496,
+      "loss": 0.4825,
+      "step": 22140
+    },
+    {
+      "epoch": 1.1242527699864198,
+      "grad_norm": 0.02667533885777748,
+      "learning_rate": 0.0004788284492119775,
+      "loss": 0.4987,
+      "step": 22145
+    },
+    {
+      "epoch": 1.1245066060844513,
+      "grad_norm": 0.03401959306286431,
+      "learning_rate": 0.00047860712401043976,
+      "loss": 0.4834,
+      "step": 22150
+    },
+    {
+      "epoch": 1.1247604421824828,
+      "grad_norm": 0.03470866794755956,
+      "learning_rate": 0.00047838580300821695,
+      "loss": 0.4963,
+      "step": 22155
+    },
+    {
+      "epoch": 1.1250142782805144,
+      "grad_norm": 0.023970283863950843,
+      "learning_rate": 0.0004781644862487532,
+      "loss": 0.4791,
+      "step": 22160
+    },
+    {
+      "epoch": 1.1252681143785457,
+      "grad_norm": 0.02312660422996864,
+      "learning_rate": 0.000477943173775492,
+      "loss": 0.5058,
+      "step": 22165
+    },
+    {
+      "epoch": 1.1255219504765772,
+      "grad_norm": 0.02435017898171767,
+      "learning_rate": 0.00047772186563187566,
+      "loss": 0.4919,
+      "step": 22170
+    },
+    {
+      "epoch": 1.1257757865746088,
+      "grad_norm": 0.023843442848662726,
+      "learning_rate": 0.00047750056186134603,
+      "loss": 0.5119,
+      "step": 22175
+    },
+    {
+      "epoch": 1.1260296226726403,
+      "grad_norm": 0.027676021986730217,
+      "learning_rate": 0.00047727926250734393,
+      "loss": 0.5182,
+      "step": 22180
+    },
+    {
+      "epoch": 1.1262834587706718,
+      "grad_norm": 0.02668274381778642,
+      "learning_rate": 0.00047705796761330927,
+      "loss": 0.5088,
+      "step": 22185
+    },
+    {
+      "epoch": 1.1265372948687034,
+      "grad_norm": 0.04197766800019646,
+      "learning_rate": 0.00047683667722268116,
+      "loss": 0.5212,
+      "step": 22190
+    },
+    {
+      "epoch": 1.1267911309667347,
+      "grad_norm": 0.04081434517250474,
+      "learning_rate": 0.0004766153913788976,
+      "loss": 0.4979,
+      "step": 22195
+    },
+    {
+      "epoch": 1.1270449670647662,
+      "grad_norm": 0.03853608127797696,
+      "learning_rate": 0.00047639411012539626,
+      "loss": 0.5104,
+      "step": 22200
+    },
+    {
+      "epoch": 1.1272988031627977,
+      "grad_norm": 0.02809744226799178,
+      "learning_rate": 0.0004761728335056134,
+      "loss": 0.4911,
+      "step": 22205
+    },
+    {
+      "epoch": 1.1275526392608293,
+      "grad_norm": 0.027375956086551304,
+      "learning_rate": 0.00047595156156298455,
+      "loss": 0.4629,
+      "step": 22210
+    },
+    {
+      "epoch": 1.1278064753588608,
+      "grad_norm": 0.026405679471089256,
+      "learning_rate": 0.0004757302943409442,
+      "loss": 0.5249,
+      "step": 22215
+    },
+    {
+      "epoch": 1.1280603114568923,
+      "grad_norm": 0.02422688158994748,
+      "learning_rate": 0.000475509031882926,
+      "loss": 0.4936,
+      "step": 22220
+    },
+    {
+      "epoch": 1.1283141475549239,
+      "grad_norm": 0.02310122351518651,
+      "learning_rate": 0.00047528777423236276,
+      "loss": 0.4977,
+      "step": 22225
+    },
+    {
+      "epoch": 1.1285679836529554,
+      "grad_norm": 0.027179185579231067,
+      "learning_rate": 0.00047506652143268615,
+      "loss": 0.527,
+      "step": 22230
+    },
+    {
+      "epoch": 1.1288218197509867,
+      "grad_norm": 0.02555209948452092,
+      "learning_rate": 0.0004748452735273271,
+      "loss": 0.4867,
+      "step": 22235
+    },
+    {
+      "epoch": 1.1290756558490183,
+      "grad_norm": 0.024286447301691477,
+      "learning_rate": 0.0004746240305597154,
+      "loss": 0.4958,
+      "step": 22240
+    },
+    {
+      "epoch": 1.1293294919470498,
+      "grad_norm": 0.028126452190403196,
+      "learning_rate": 0.0004744027925732799,
+      "loss": 0.5077,
+      "step": 22245
+    },
+    {
+      "epoch": 1.1295833280450813,
+      "grad_norm": 0.031834226850280145,
+      "learning_rate": 0.0004741815596114486,
+      "loss": 0.4713,
+      "step": 22250
+    },
+    {
+      "epoch": 1.1298371641431129,
+      "grad_norm": 0.025659978434002053,
+      "learning_rate": 0.00047396033171764825,
+      "loss": 0.4642,
+      "step": 22255
+    },
+    {
+      "epoch": 1.1300910002411442,
+      "grad_norm": 0.029700890203732126,
+      "learning_rate": 0.00047373910893530504,
+      "loss": 0.5396,
+      "step": 22260
+    },
+    {
+      "epoch": 1.1303448363391757,
+      "grad_norm": 0.022579372378051864,
+      "learning_rate": 0.00047351789130784384,
+      "loss": 0.4994,
+      "step": 22265
+    },
+    {
+      "epoch": 1.1305986724372072,
+      "grad_norm": 0.029980851132857454,
+      "learning_rate": 0.00047329667887868846,
+      "loss": 0.4837,
+      "step": 22270
+    },
+    {
+      "epoch": 1.1308525085352388,
+      "grad_norm": 0.038482466915576954,
+      "learning_rate": 0.00047307547169126183,
+      "loss": 0.4999,
+      "step": 22275
+    },
+    {
+      "epoch": 1.1311063446332703,
+      "grad_norm": 0.028758075959962347,
+      "learning_rate": 0.0004728542697889859,
+      "loss": 0.4943,
+      "step": 22280
+    },
+    {
+      "epoch": 1.1313601807313018,
+      "grad_norm": 0.02302067520972011,
+      "learning_rate": 0.00047263307321528136,
+      "loss": 0.5058,
+      "step": 22285
+    },
+    {
+      "epoch": 1.1316140168293334,
+      "grad_norm": 0.02338759212969084,
+      "learning_rate": 0.0004724118820135681,
+      "loss": 0.5417,
+      "step": 22290
+    },
+    {
+      "epoch": 1.131867852927365,
+      "grad_norm": 0.024240682457677416,
+      "learning_rate": 0.00047219069622726485,
+      "loss": 0.5429,
+      "step": 22295
+    },
+    {
+      "epoch": 1.1321216890253962,
+      "grad_norm": 0.03240119123904312,
+      "learning_rate": 0.0004719695158997892,
+      "loss": 0.4872,
+      "step": 22300
+    },
+    {
+      "epoch": 1.1323755251234278,
+      "grad_norm": 0.027587688170413006,
+      "learning_rate": 0.00047174834107455784,
+      "loss": 0.5008,
+      "step": 22305
+    },
+    {
+      "epoch": 1.1326293612214593,
+      "grad_norm": 0.026389922450704612,
+      "learning_rate": 0.00047152717179498624,
+      "loss": 0.5294,
+      "step": 22310
+    },
+    {
+      "epoch": 1.1328831973194908,
+      "grad_norm": 0.031230602354335214,
+      "learning_rate": 0.00047130600810448855,
+      "loss": 0.4712,
+      "step": 22315
+    },
+    {
+      "epoch": 1.1331370334175224,
+      "grad_norm": 0.02765524085794462,
+      "learning_rate": 0.0004710848500464786,
+      "loss": 0.526,
+      "step": 22320
+    },
+    {
+      "epoch": 1.133390869515554,
+      "grad_norm": 0.023377797728668624,
+      "learning_rate": 0.0004708636976643684,
+      "loss": 0.4667,
+      "step": 22325
+    },
+    {
+      "epoch": 1.1336447056135852,
+      "grad_norm": 0.024350148694776152,
+      "learning_rate": 0.00047064255100156904,
+      "loss": 0.4631,
+      "step": 22330
+    },
+    {
+      "epoch": 1.1338985417116167,
+      "grad_norm": 0.023281128309858613,
+      "learning_rate": 0.00047042141010149053,
+      "loss": 0.5129,
+      "step": 22335
+    },
+    {
+      "epoch": 1.1341523778096483,
+      "grad_norm": 0.030603931112854497,
+      "learning_rate": 0.0004702002750075417,
+      "loss": 0.517,
+      "step": 22340
+    },
+    {
+      "epoch": 1.1344062139076798,
+      "grad_norm": 0.024716062309770172,
+      "learning_rate": 0.0004699791457631303,
+      "loss": 0.4932,
+      "step": 22345
+    },
+    {
+      "epoch": 1.1346600500057114,
+      "grad_norm": 0.10774252187329189,
+      "learning_rate": 0.00046975802241166283,
+      "loss": 0.5326,
+      "step": 22350
+    },
+    {
+      "epoch": 1.1349138861037429,
+      "grad_norm": 0.029932485775150907,
+      "learning_rate": 0.00046953690499654477,
+      "loss": 0.5271,
+      "step": 22355
+    },
+    {
+      "epoch": 1.1351677222017744,
+      "grad_norm": 0.02306648216314524,
+      "learning_rate": 0.0004693157935611803,
+      "loss": 0.4831,
+      "step": 22360
+    },
+    {
+      "epoch": 1.1354215582998057,
+      "grad_norm": 0.024003978824830057,
+      "learning_rate": 0.0004690946881489726,
+      "loss": 0.4986,
+      "step": 22365
+    },
+    {
+      "epoch": 1.1356753943978373,
+      "grad_norm": 0.021905055146049835,
+      "learning_rate": 0.00046887358880332345,
+      "loss": 0.5525,
+      "step": 22370
+    },
+    {
+      "epoch": 1.1359292304958688,
+      "grad_norm": 0.022848413649515294,
+      "learning_rate": 0.00046865249556763344,
+      "loss": 0.529,
+      "step": 22375
+    },
+    {
+      "epoch": 1.1361830665939003,
+      "grad_norm": 0.03605177144637833,
+      "learning_rate": 0.0004684314084853024,
+      "loss": 0.4814,
+      "step": 22380
+    },
+    {
+      "epoch": 1.1364369026919319,
+      "grad_norm": 0.056644899234727326,
+      "learning_rate": 0.0004682103275997284,
+      "loss": 0.546,
+      "step": 22385
+    },
+    {
+      "epoch": 1.1366907387899634,
+      "grad_norm": 0.032741353664718975,
+      "learning_rate": 0.00046798925295430863,
+      "loss": 0.492,
+      "step": 22390
+    },
+    {
+      "epoch": 1.1369445748879947,
+      "grad_norm": 0.024068036984486157,
+      "learning_rate": 0.00046776818459243874,
+      "loss": 0.4783,
+      "step": 22395
+    },
+    {
+      "epoch": 1.1371984109860263,
+      "grad_norm": 0.026300065257889048,
+      "learning_rate": 0.0004675471225575136,
+      "loss": 0.5042,
+      "step": 22400
+    },
+    {
+      "epoch": 1.1374522470840578,
+      "grad_norm": 0.026588922808316026,
+      "learning_rate": 0.00046732606689292637,
+      "loss": 0.4825,
+      "step": 22405
+    },
+    {
+      "epoch": 1.1377060831820893,
+      "grad_norm": 0.03082021206138233,
+      "learning_rate": 0.00046710501764206933,
+      "loss": 0.4886,
+      "step": 22410
+    },
+    {
+      "epoch": 1.1379599192801209,
+      "grad_norm": 0.02702663439746158,
+      "learning_rate": 0.0004668839748483332,
+      "loss": 0.4956,
+      "step": 22415
+    },
+    {
+      "epoch": 1.1382137553781524,
+      "grad_norm": 0.02770867084856874,
+      "learning_rate": 0.0004666629385551078,
+      "loss": 0.5162,
+      "step": 22420
+    },
+    {
+      "epoch": 1.138467591476184,
+      "grad_norm": 0.02593191401749543,
+      "learning_rate": 0.0004664419088057812,
+      "loss": 0.5222,
+      "step": 22425
+    },
+    {
+      "epoch": 1.1387214275742152,
+      "grad_norm": 0.025067133006483912,
+      "learning_rate": 0.0004662208856437405,
+      "loss": 0.5045,
+      "step": 22430
+    },
+    {
+      "epoch": 1.1389752636722468,
+      "grad_norm": 0.02386909656610933,
+      "learning_rate": 0.00046599986911237135,
+      "loss": 0.4937,
+      "step": 22435
+    },
+    {
+      "epoch": 1.1392290997702783,
+      "grad_norm": 0.02308227237038335,
+      "learning_rate": 0.00046577885925505857,
+      "loss": 0.5298,
+      "step": 22440
+    },
+    {
+      "epoch": 1.1394829358683098,
+      "grad_norm": 0.022608086605042318,
+      "learning_rate": 0.00046555785611518505,
+      "loss": 0.5098,
+      "step": 22445
+    },
+    {
+      "epoch": 1.1397367719663414,
+      "grad_norm": 0.022743886474204174,
+      "learning_rate": 0.0004653368597361326,
+      "loss": 0.5065,
+      "step": 22450
+    },
+    {
+      "epoch": 1.139990608064373,
+      "grad_norm": 0.021721050273651527,
+      "learning_rate": 0.00046511587016128173,
+      "loss": 0.5221,
+      "step": 22455
+    },
+    {
+      "epoch": 1.1402444441624042,
+      "grad_norm": 0.028647441808002567,
+      "learning_rate": 0.0004648948874340115,
+      "loss": 0.5102,
+      "step": 22460
+    },
+    {
+      "epoch": 1.1404982802604358,
+      "grad_norm": 0.023629836872766076,
+      "learning_rate": 0.0004646739115976999,
+      "loss": 0.4838,
+      "step": 22465
+    },
+    {
+      "epoch": 1.1407521163584673,
+      "grad_norm": 0.022987645202448132,
+      "learning_rate": 0.00046445294269572326,
+      "loss": 0.5101,
+      "step": 22470
+    },
+    {
+      "epoch": 1.1410059524564988,
+      "grad_norm": 0.028359397619188362,
+      "learning_rate": 0.0004642319807714567,
+      "loss": 0.49,
+      "step": 22475
+    },
+    {
+      "epoch": 1.1412597885545304,
+      "grad_norm": 0.024714621127131544,
+      "learning_rate": 0.0004640110258682739,
+      "loss": 0.5394,
+      "step": 22480
+    },
+    {
+      "epoch": 1.141513624652562,
+      "grad_norm": 0.023929726037479372,
+      "learning_rate": 0.0004637900780295472,
+      "loss": 0.4957,
+      "step": 22485
+    },
+    {
+      "epoch": 1.1417674607505934,
+      "grad_norm": 0.02145690677992983,
+      "learning_rate": 0.0004635691372986477,
+      "loss": 0.455,
+      "step": 22490
+    },
+    {
+      "epoch": 1.142021296848625,
+      "grad_norm": 0.020947420613049612,
+      "learning_rate": 0.0004633482037189447,
+      "loss": 0.4911,
+      "step": 22495
+    },
+    {
+      "epoch": 1.1422751329466563,
+      "grad_norm": 0.02298839539671349,
+      "learning_rate": 0.00046312727733380666,
+      "loss": 0.5216,
+      "step": 22500
+    },
+    {
+      "epoch": 1.1425289690446878,
+      "grad_norm": 0.021966101109555183,
+      "learning_rate": 0.0004629063581866002,
+      "loss": 0.5083,
+      "step": 22505
+    },
+    {
+      "epoch": 1.1427828051427193,
+      "grad_norm": 0.03686984297075965,
+      "learning_rate": 0.00046268544632069064,
+      "loss": 0.5224,
+      "step": 22510
+    },
+    {
+      "epoch": 1.1430366412407509,
+      "grad_norm": 0.030756815032274516,
+      "learning_rate": 0.00046246454177944194,
+      "loss": 0.508,
+      "step": 22515
+    },
+    {
+      "epoch": 1.1432904773387824,
+      "grad_norm": 0.05181483793136802,
+      "learning_rate": 0.0004622436446062164,
+      "loss": 0.489,
+      "step": 22520
+    },
+    {
+      "epoch": 1.1435443134368137,
+      "grad_norm": 0.03958359517287309,
+      "learning_rate": 0.0004620227548443752,
+      "loss": 0.5243,
+      "step": 22525
+    },
+    {
+      "epoch": 1.1437981495348453,
+      "grad_norm": 0.03427906955795205,
+      "learning_rate": 0.0004618018725372778,
+      "loss": 0.4834,
+      "step": 22530
+    },
+    {
+      "epoch": 1.1440519856328768,
+      "grad_norm": 0.023909407473577712,
+      "learning_rate": 0.0004615809977282823,
+      "loss": 0.4969,
+      "step": 22535
+    },
+    {
+      "epoch": 1.1443058217309083,
+      "grad_norm": 0.026640110354467325,
+      "learning_rate": 0.0004613601304607454,
+      "loss": 0.5131,
+      "step": 22540
+    },
+    {
+      "epoch": 1.1445596578289399,
+      "grad_norm": 0.0237982378487044,
+      "learning_rate": 0.0004611392707780222,
+      "loss": 0.4965,
+      "step": 22545
+    },
+    {
+      "epoch": 1.1448134939269714,
+      "grad_norm": 0.03386310576456612,
+      "learning_rate": 0.00046091841872346627,
+      "loss": 0.5167,
+      "step": 22550
+    },
+    {
+      "epoch": 1.145067330025003,
+      "grad_norm": 0.02459001107447409,
+      "learning_rate": 0.00046069757434042975,
+      "loss": 0.5098,
+      "step": 22555
+    },
+    {
+      "epoch": 1.1453211661230345,
+      "grad_norm": 0.02750293868217048,
+      "learning_rate": 0.0004604767376722635,
+      "loss": 0.5183,
+      "step": 22560
+    },
+    {
+      "epoch": 1.1455750022210658,
+      "grad_norm": 0.02465764415596952,
+      "learning_rate": 0.0004602559087623166,
+      "loss": 0.5346,
+      "step": 22565
+    },
+    {
+      "epoch": 1.1458288383190973,
+      "grad_norm": 0.021840825134174797,
+      "learning_rate": 0.0004600350876539366,
+      "loss": 0.4734,
+      "step": 22570
+    },
+    {
+      "epoch": 1.1460826744171289,
+      "grad_norm": 0.022144355795008893,
+      "learning_rate": 0.00045981427439046956,
+      "loss": 0.4902,
+      "step": 22575
+    },
+    {
+      "epoch": 1.1463365105151604,
+      "grad_norm": 0.02333513320332411,
+      "learning_rate": 0.00045959346901526006,
+      "loss": 0.4993,
+      "step": 22580
+    },
+    {
+      "epoch": 1.146590346613192,
+      "grad_norm": 0.02599869001595599,
+      "learning_rate": 0.0004593726715716511,
+      "loss": 0.5261,
+      "step": 22585
+    },
+    {
+      "epoch": 1.1468441827112235,
+      "grad_norm": 0.03686029455414443,
+      "learning_rate": 0.00045915188210298406,
+      "loss": 0.4647,
+      "step": 22590
+    },
+    {
+      "epoch": 1.1470980188092548,
+      "grad_norm": 0.027385432815099645,
+      "learning_rate": 0.00045893110065259893,
+      "loss": 0.5352,
+      "step": 22595
+    },
+    {
+      "epoch": 1.1473518549072863,
+      "grad_norm": 0.02888582352407677,
+      "learning_rate": 0.0004587103272638339,
+      "loss": 0.4924,
+      "step": 22600
+    },
+    {
+      "epoch": 1.1476056910053178,
+      "grad_norm": 0.023625257936494698,
+      "learning_rate": 0.0004584895619800257,
+      "loss": 0.505,
+      "step": 22605
+    },
+    {
+      "epoch": 1.1478595271033494,
+      "grad_norm": 0.0277603791896047,
+      "learning_rate": 0.00045826880484450946,
+      "loss": 0.5021,
+      "step": 22610
+    },
+    {
+      "epoch": 1.148113363201381,
+      "grad_norm": 0.028467365276233746,
+      "learning_rate": 0.0004580480559006186,
+      "loss": 0.497,
+      "step": 22615
+    },
+    {
+      "epoch": 1.1483671992994124,
+      "grad_norm": 0.03687524663675464,
+      "learning_rate": 0.0004578273151916853,
+      "loss": 0.4686,
+      "step": 22620
+    },
+    {
+      "epoch": 1.148621035397444,
+      "grad_norm": 0.033659979229860255,
+      "learning_rate": 0.0004576065827610397,
+      "loss": 0.489,
+      "step": 22625
+    },
+    {
+      "epoch": 1.1488748714954753,
+      "grad_norm": 0.025235412390316816,
+      "learning_rate": 0.0004573858586520105,
+      "loss": 0.5098,
+      "step": 22630
+    },
+    {
+      "epoch": 1.1491287075935068,
+      "grad_norm": 0.03243044981273508,
+      "learning_rate": 0.0004571651429079247,
+      "loss": 0.4934,
+      "step": 22635
+    },
+    {
+      "epoch": 1.1493825436915384,
+      "grad_norm": 0.022833415951073927,
+      "learning_rate": 0.00045694443557210777,
+      "loss": 0.5142,
+      "step": 22640
+    },
+    {
+      "epoch": 1.14963637978957,
+      "grad_norm": 0.03068941635429747,
+      "learning_rate": 0.00045672373668788336,
+      "loss": 0.466,
+      "step": 22645
+    },
+    {
+      "epoch": 1.1498902158876014,
+      "grad_norm": 0.0246266009056399,
+      "learning_rate": 0.0004565030462985737,
+      "loss": 0.5178,
+      "step": 22650
+    },
+    {
+      "epoch": 1.150144051985633,
+      "grad_norm": 0.030534870475012552,
+      "learning_rate": 0.00045628236444749905,
+      "loss": 0.4653,
+      "step": 22655
+    },
+    {
+      "epoch": 1.1503978880836643,
+      "grad_norm": 0.024906564205494433,
+      "learning_rate": 0.0004560616911779783,
+      "loss": 0.5041,
+      "step": 22660
+    },
+    {
+      "epoch": 1.1506517241816958,
+      "grad_norm": 0.03444194262921391,
+      "learning_rate": 0.00045584102653332845,
+      "loss": 0.4884,
+      "step": 22665
+    },
+    {
+      "epoch": 1.1509055602797273,
+      "grad_norm": 0.036461759923969016,
+      "learning_rate": 0.0004556203705568648,
+      "loss": 0.5029,
+      "step": 22670
+    },
+    {
+      "epoch": 1.1511593963777589,
+      "grad_norm": 0.04594830004230979,
+      "learning_rate": 0.0004553997232919009,
+      "loss": 0.5145,
+      "step": 22675
+    },
+    {
+      "epoch": 1.1514132324757904,
+      "grad_norm": 0.03261588655559848,
+      "learning_rate": 0.00045517908478174917,
+      "loss": 0.5217,
+      "step": 22680
+    },
+    {
+      "epoch": 1.151667068573822,
+      "grad_norm": 0.0242254240936562,
+      "learning_rate": 0.0004549584550697196,
+      "loss": 0.4833,
+      "step": 22685
+    },
+    {
+      "epoch": 1.1519209046718535,
+      "grad_norm": 0.023948731317067408,
+      "learning_rate": 0.00045473783419912057,
+      "loss": 0.473,
+      "step": 22690
+    },
+    {
+      "epoch": 1.1521747407698848,
+      "grad_norm": 0.02369769333893661,
+      "learning_rate": 0.000454517222213259,
+      "loss": 0.4996,
+      "step": 22695
+    },
+    {
+      "epoch": 1.1524285768679163,
+      "grad_norm": 0.0235533696148796,
+      "learning_rate": 0.00045429661915543995,
+      "loss": 0.5215,
+      "step": 22700
+    },
+    {
+      "epoch": 1.1526824129659479,
+      "grad_norm": 0.029693427304754598,
+      "learning_rate": 0.0004540760250689666,
+      "loss": 0.5202,
+      "step": 22705
+    },
+    {
+      "epoch": 1.1529362490639794,
+      "grad_norm": 0.026223364679563748,
+      "learning_rate": 0.0004538554399971406,
+      "loss": 0.499,
+      "step": 22710
+    },
+    {
+      "epoch": 1.153190085162011,
+      "grad_norm": 0.022238545679295847,
+      "learning_rate": 0.00045363486398326147,
+      "loss": 0.4906,
+      "step": 22715
+    },
+    {
+      "epoch": 1.1534439212600425,
+      "grad_norm": 0.02977765503263147,
+      "learning_rate": 0.0004534142970706274,
+      "loss": 0.529,
+      "step": 22720
+    },
+    {
+      "epoch": 1.1536977573580738,
+      "grad_norm": 0.022706195045235984,
+      "learning_rate": 0.0004531937393025344,
+      "loss": 0.5039,
+      "step": 22725
+    },
+    {
+      "epoch": 1.1539515934561053,
+      "grad_norm": 0.02434660390615717,
+      "learning_rate": 0.000452973190722277,
+      "loss": 0.5186,
+      "step": 22730
+    },
+    {
+      "epoch": 1.1542054295541369,
+      "grad_norm": 0.0213880961035262,
+      "learning_rate": 0.00045275265137314754,
+      "loss": 0.5202,
+      "step": 22735
+    },
+    {
+      "epoch": 1.1544592656521684,
+      "grad_norm": 0.024719044730757197,
+      "learning_rate": 0.0004525321212984372,
+      "loss": 0.5122,
+      "step": 22740
+    },
+    {
+      "epoch": 1.1547131017502,
+      "grad_norm": 0.024745026751628207,
+      "learning_rate": 0.00045231160054143467,
+      "loss": 0.5046,
+      "step": 22745
+    },
+    {
+      "epoch": 1.1549669378482315,
+      "grad_norm": 0.025339355379831775,
+      "learning_rate": 0.00045209108914542716,
+      "loss": 0.5085,
+      "step": 22750
+    },
+    {
+      "epoch": 1.155220773946263,
+      "grad_norm": 0.03885787120756748,
+      "learning_rate": 0.0004518705871537,
+      "loss": 0.5161,
+      "step": 22755
+    },
+    {
+      "epoch": 1.1554746100442943,
+      "grad_norm": 0.022406997994527122,
+      "learning_rate": 0.0004516500946095365,
+      "loss": 0.4808,
+      "step": 22760
+    },
+    {
+      "epoch": 1.1557284461423258,
+      "grad_norm": 0.026005826539297415,
+      "learning_rate": 0.0004514296115562183,
+      "loss": 0.5135,
+      "step": 22765
+    },
+    {
+      "epoch": 1.1559822822403574,
+      "grad_norm": 0.03539270632977643,
+      "learning_rate": 0.0004512091380370251,
+      "loss": 0.4971,
+      "step": 22770
+    },
+    {
+      "epoch": 1.156236118338389,
+      "grad_norm": 0.029564361875585962,
+      "learning_rate": 0.00045098867409523486,
+      "loss": 0.4924,
+      "step": 22775
+    },
+    {
+      "epoch": 1.1564899544364204,
+      "grad_norm": 0.02358096736186789,
+      "learning_rate": 0.0004507682197741235,
+      "loss": 0.4856,
+      "step": 22780
+    },
+    {
+      "epoch": 1.156743790534452,
+      "grad_norm": 0.028230367180303174,
+      "learning_rate": 0.000450547775116965,
+      "loss": 0.4659,
+      "step": 22785
+    },
+    {
+      "epoch": 1.1569976266324833,
+      "grad_norm": 0.02700559394625303,
+      "learning_rate": 0.00045032734016703163,
+      "loss": 0.4845,
+      "step": 22790
+    },
+    {
+      "epoch": 1.1572514627305148,
+      "grad_norm": 0.026220476689667822,
+      "learning_rate": 0.0004501069149675937,
+      "loss": 0.5081,
+      "step": 22795
+    },
+    {
+      "epoch": 1.1575052988285464,
+      "grad_norm": 0.023325253274810016,
+      "learning_rate": 0.00044988649956191943,
+      "loss": 0.5124,
+      "step": 22800
+    },
+    {
+      "epoch": 1.157759134926578,
+      "grad_norm": 0.02964466344942591,
+      "learning_rate": 0.00044966609399327544,
+      "loss": 0.5034,
+      "step": 22805
+    },
+    {
+      "epoch": 1.1580129710246094,
+      "grad_norm": 0.025629931037714163,
+      "learning_rate": 0.0004494456983049263,
+      "loss": 0.5159,
+      "step": 22810
+    },
+    {
+      "epoch": 1.158266807122641,
+      "grad_norm": 0.02818068589305215,
+      "learning_rate": 0.0004492253125401344,
+      "loss": 0.517,
+      "step": 22815
+    },
+    {
+      "epoch": 1.1585206432206725,
+      "grad_norm": 0.025937834093816534,
+      "learning_rate": 0.00044900493674216043,
+      "loss": 0.535,
+      "step": 22820
+    },
+    {
+      "epoch": 1.158774479318704,
+      "grad_norm": 0.021906620341182587,
+      "learning_rate": 0.00044878457095426307,
+      "loss": 0.4881,
+      "step": 22825
+    },
+    {
+      "epoch": 1.1590283154167353,
+      "grad_norm": 0.023326406428330577,
+      "learning_rate": 0.000448564215219699,
+      "loss": 0.4898,
+      "step": 22830
+    },
+    {
+      "epoch": 1.1592821515147669,
+      "grad_norm": 0.025849041107935036,
+      "learning_rate": 0.00044834386958172295,
+      "loss": 0.5198,
+      "step": 22835
+    },
+    {
+      "epoch": 1.1595359876127984,
+      "grad_norm": 0.02594641672975575,
+      "learning_rate": 0.00044812353408358777,
+      "loss": 0.4972,
+      "step": 22840
+    },
+    {
+      "epoch": 1.15978982371083,
+      "grad_norm": 0.0330275583093027,
+      "learning_rate": 0.0004479032087685441,
+      "loss": 0.5119,
+      "step": 22845
+    },
+    {
+      "epoch": 1.1600436598088615,
+      "grad_norm": 0.023604960673479395,
+      "learning_rate": 0.00044768289367984077,
+      "loss": 0.5103,
+      "step": 22850
+    },
+    {
+      "epoch": 1.1602974959068928,
+      "grad_norm": 0.04128228371869332,
+      "learning_rate": 0.0004474625888607245,
+      "loss": 0.5197,
+      "step": 22855
+    },
+    {
+      "epoch": 1.1605513320049243,
+      "grad_norm": 0.020903884654111818,
+      "learning_rate": 0.00044724229435443973,
+      "loss": 0.5284,
+      "step": 22860
+    },
+    {
+      "epoch": 1.1608051681029559,
+      "grad_norm": 0.031129567663155357,
+      "learning_rate": 0.0004470220102042298,
+      "loss": 0.5068,
+      "step": 22865
+    },
+    {
+      "epoch": 1.1610590042009874,
+      "grad_norm": 0.020499848777966484,
+      "learning_rate": 0.00044680173645333504,
+      "loss": 0.5014,
+      "step": 22870
+    },
+    {
+      "epoch": 1.161312840299019,
+      "grad_norm": 0.024298947458801685,
+      "learning_rate": 0.0004465814731449941,
+      "loss": 0.4989,
+      "step": 22875
+    },
+    {
+      "epoch": 1.1615666763970505,
+      "grad_norm": 0.02238268280787287,
+      "learning_rate": 0.0004463612203224436,
+      "loss": 0.5114,
+      "step": 22880
+    },
+    {
+      "epoch": 1.161820512495082,
+      "grad_norm": 0.030230746480821854,
+      "learning_rate": 0.0004461409780289181,
+      "loss": 0.4876,
+      "step": 22885
+    },
+    {
+      "epoch": 1.1620743485931135,
+      "grad_norm": 0.024028109056989183,
+      "learning_rate": 0.0004459207463076499,
+      "loss": 0.4815,
+      "step": 22890
+    },
+    {
+      "epoch": 1.1623281846911449,
+      "grad_norm": 0.021016127843322022,
+      "learning_rate": 0.00044570052520186956,
+      "loss": 0.5067,
+      "step": 22895
+    },
+    {
+      "epoch": 1.1625820207891764,
+      "grad_norm": 0.029561997978537945,
+      "learning_rate": 0.00044548031475480533,
+      "loss": 0.5074,
+      "step": 22900
+    },
+    {
+      "epoch": 1.162835856887208,
+      "grad_norm": 0.022169964271249176,
+      "learning_rate": 0.0004452601150096834,
+      "loss": 0.522,
+      "step": 22905
+    },
+    {
+      "epoch": 1.1630896929852395,
+      "grad_norm": 0.024800692528682368,
+      "learning_rate": 0.000445039926009728,
+      "loss": 0.5108,
+      "step": 22910
+    },
+    {
+      "epoch": 1.163343529083271,
+      "grad_norm": 0.02591690066399758,
+      "learning_rate": 0.00044481974779816096,
+      "loss": 0.5044,
+      "step": 22915
+    },
+    {
+      "epoch": 1.1635973651813025,
+      "grad_norm": 0.025486400367960977,
+      "learning_rate": 0.00044459958041820217,
+      "loss": 0.5242,
+      "step": 22920
+    },
+    {
+      "epoch": 1.1638512012793338,
+      "grad_norm": 2.4946821079686567,
+      "learning_rate": 0.0004443794239130696,
+      "loss": 0.7984,
+      "step": 22925
+    },
+    {
+      "epoch": 1.1641050373773654,
+      "grad_norm": 0.06100607110479418,
+      "learning_rate": 0.00044415927832597865,
+      "loss": 0.4905,
+      "step": 22930
+    },
+    {
+      "epoch": 1.164358873475397,
+      "grad_norm": 0.025761932513384978,
+      "learning_rate": 0.00044393914370014295,
+      "loss": 0.4968,
+      "step": 22935
+    },
+    {
+      "epoch": 1.1646127095734284,
+      "grad_norm": 0.02625663236193525,
+      "learning_rate": 0.00044371902007877374,
+      "loss": 0.4951,
+      "step": 22940
+    },
+    {
+      "epoch": 1.16486654567146,
+      "grad_norm": 0.02683158448012171,
+      "learning_rate": 0.0004434989075050802,
+      "loss": 0.5164,
+      "step": 22945
+    },
+    {
+      "epoch": 1.1651203817694915,
+      "grad_norm": 0.03126770701983838,
+      "learning_rate": 0.0004432788060222694,
+      "loss": 0.4774,
+      "step": 22950
+    },
+    {
+      "epoch": 1.165374217867523,
+      "grad_norm": 0.02778926944239494,
+      "learning_rate": 0.00044305871567354606,
+      "loss": 0.5266,
+      "step": 22955
+    },
+    {
+      "epoch": 1.1656280539655544,
+      "grad_norm": 0.02199391640791797,
+      "learning_rate": 0.0004428386365021129,
+      "loss": 0.5053,
+      "step": 22960
+    },
+    {
+      "epoch": 1.165881890063586,
+      "grad_norm": 0.02185784711754856,
+      "learning_rate": 0.0004426185685511703,
+      "loss": 0.5215,
+      "step": 22965
+    },
+    {
+      "epoch": 1.1661357261616174,
+      "grad_norm": 0.03374512683832721,
+      "learning_rate": 0.00044239851186391653,
+      "loss": 0.4848,
+      "step": 22970
+    },
+    {
+      "epoch": 1.166389562259649,
+      "grad_norm": 0.029868509213157268,
+      "learning_rate": 0.00044217846648354764,
+      "loss": 0.4479,
+      "step": 22975
+    },
+    {
+      "epoch": 1.1666433983576805,
+      "grad_norm": 0.04777615390254651,
+      "learning_rate": 0.00044195843245325723,
+      "loss": 0.4859,
+      "step": 22980
+    },
+    {
+      "epoch": 1.166897234455712,
+      "grad_norm": 0.023512758989838855,
+      "learning_rate": 0.0004417384098162373,
+      "loss": 0.5342,
+      "step": 22985
+    },
+    {
+      "epoch": 1.1671510705537433,
+      "grad_norm": 0.02229369139136557,
+      "learning_rate": 0.00044151839861567694,
+      "loss": 0.5134,
+      "step": 22990
+    },
+    {
+      "epoch": 1.1674049066517749,
+      "grad_norm": 0.026894096234455038,
+      "learning_rate": 0.0004412983988947633,
+      "loss": 0.4969,
+      "step": 22995
+    },
+    {
+      "epoch": 1.1676587427498064,
+      "grad_norm": 0.02589303820825699,
+      "learning_rate": 0.0004410784106966812,
+      "loss": 0.4886,
+      "step": 23000
+    },
+    {
+      "epoch": 1.167912578847838,
+      "grad_norm": 0.02124381698964111,
+      "learning_rate": 0.0004408584340646132,
+      "loss": 0.523,
+      "step": 23005
+    },
+    {
+      "epoch": 1.1681664149458695,
+      "grad_norm": 0.03791510606931018,
+      "learning_rate": 0.0004406384690417397,
+      "loss": 0.5113,
+      "step": 23010
+    },
+    {
+      "epoch": 1.168420251043901,
+      "grad_norm": 0.02862399836284424,
+      "learning_rate": 0.0004404185156712387,
+      "loss": 0.5179,
+      "step": 23015
+    },
+    {
+      "epoch": 1.1686740871419325,
+      "grad_norm": 0.030684358273063143,
+      "learning_rate": 0.00044019857399628593,
+      "loss": 0.4732,
+      "step": 23020
+    },
+    {
+      "epoch": 1.1689279232399639,
+      "grad_norm": 0.023354500795241345,
+      "learning_rate": 0.0004399786440600549,
+      "loss": 0.5008,
+      "step": 23025
+    },
+    {
+      "epoch": 1.1691817593379954,
+      "grad_norm": 0.024691152727817047,
+      "learning_rate": 0.0004397587259057166,
+      "loss": 0.4704,
+      "step": 23030
+    },
+    {
+      "epoch": 1.169435595436027,
+      "grad_norm": 0.023111558411901535,
+      "learning_rate": 0.0004395388195764401,
+      "loss": 0.4923,
+      "step": 23035
+    },
+    {
+      "epoch": 1.1696894315340585,
+      "grad_norm": 0.023376429033597424,
+      "learning_rate": 0.00043931892511539164,
+      "loss": 0.4986,
+      "step": 23040
+    },
+    {
+      "epoch": 1.16994326763209,
+      "grad_norm": 0.021932488535556502,
+      "learning_rate": 0.0004390990425657357,
+      "loss": 0.5079,
+      "step": 23045
+    },
+    {
+      "epoch": 1.1701971037301215,
+      "grad_norm": 0.02295870523021244,
+      "learning_rate": 0.00043887917197063395,
+      "loss": 0.4881,
+      "step": 23050
+    },
+    {
+      "epoch": 1.1704509398281528,
+      "grad_norm": 0.025540059947805397,
+      "learning_rate": 0.00043865931337324596,
+      "loss": 0.4948,
+      "step": 23055
+    },
+    {
+      "epoch": 1.1707047759261844,
+      "grad_norm": 0.022156403341379288,
+      "learning_rate": 0.0004384394668167288,
+      "loss": 0.4972,
+      "step": 23060
+    },
+    {
+      "epoch": 1.170958612024216,
+      "grad_norm": 0.02786034852874224,
+      "learning_rate": 0.00043821963234423736,
+      "loss": 0.5261,
+      "step": 23065
+    },
+    {
+      "epoch": 1.1712124481222475,
+      "grad_norm": 0.04119583380859592,
+      "learning_rate": 0.00043799980999892395,
+      "loss": 0.5146,
+      "step": 23070
+    },
+    {
+      "epoch": 1.171466284220279,
+      "grad_norm": 0.023409857934518236,
+      "learning_rate": 0.00043777999982393866,
+      "loss": 0.5204,
+      "step": 23075
+    },
+    {
+      "epoch": 1.1717201203183105,
+      "grad_norm": 0.02598832215798176,
+      "learning_rate": 0.00043756020186242915,
+      "loss": 0.4982,
+      "step": 23080
+    },
+    {
+      "epoch": 1.171973956416342,
+      "grad_norm": 0.03531536654582589,
+      "learning_rate": 0.0004373404161575406,
+      "loss": 0.4819,
+      "step": 23085
+    },
+    {
+      "epoch": 1.1722277925143736,
+      "grad_norm": 0.022966689799931606,
+      "learning_rate": 0.00043712064275241584,
+      "loss": 0.5127,
+      "step": 23090
+    },
+    {
+      "epoch": 1.172481628612405,
+      "grad_norm": 0.022269854028272786,
+      "learning_rate": 0.00043690088169019535,
+      "loss": 0.4744,
+      "step": 23095
+    },
+    {
+      "epoch": 1.1727354647104364,
+      "grad_norm": 0.025940748821848634,
+      "learning_rate": 0.0004366811330140169,
+      "loss": 0.4777,
+      "step": 23100
+    },
+    {
+      "epoch": 1.172989300808468,
+      "grad_norm": 0.022624731878350762,
+      "learning_rate": 0.0004364613967670165,
+      "loss": 0.4943,
+      "step": 23105
+    },
+    {
+      "epoch": 1.1732431369064995,
+      "grad_norm": 0.023056585507083997,
+      "learning_rate": 0.0004362416729923271,
+      "loss": 0.495,
+      "step": 23110
+    },
+    {
+      "epoch": 1.173496973004531,
+      "grad_norm": 0.03515992882856805,
+      "learning_rate": 0.0004360219617330792,
+      "loss": 0.5044,
+      "step": 23115
+    },
+    {
+      "epoch": 1.1737508091025624,
+      "grad_norm": 0.02197279853949111,
+      "learning_rate": 0.00043580226303240125,
+      "loss": 0.5008,
+      "step": 23120
+    },
+    {
+      "epoch": 1.1740046452005939,
+      "grad_norm": 0.026489459382601607,
+      "learning_rate": 0.0004355825769334189,
+      "loss": 0.5203,
+      "step": 23125
+    },
+    {
+      "epoch": 1.1742584812986254,
+      "grad_norm": 0.028795879529102696,
+      "learning_rate": 0.00043536290347925545,
+      "loss": 0.4819,
+      "step": 23130
+    },
+    {
+      "epoch": 1.174512317396657,
+      "grad_norm": 0.023530562218747813,
+      "learning_rate": 0.0004351432427130316,
+      "loss": 0.5285,
+      "step": 23135
+    },
+    {
+      "epoch": 1.1747661534946885,
+      "grad_norm": 0.026020705705209802,
+      "learning_rate": 0.0004349235946778659,
+      "loss": 0.5238,
+      "step": 23140
+    },
+    {
+      "epoch": 1.17501998959272,
+      "grad_norm": 0.0214632854435188,
+      "learning_rate": 0.000434703959416874,
+      "loss": 0.4631,
+      "step": 23145
+    },
+    {
+      "epoch": 1.1752738256907516,
+      "grad_norm": 0.025379100473350413,
+      "learning_rate": 0.0004344843369731692,
+      "loss": 0.479,
+      "step": 23150
+    },
+    {
+      "epoch": 1.175527661788783,
+      "grad_norm": 0.02437051797462676,
+      "learning_rate": 0.00043426472738986233,
+      "loss": 0.512,
+      "step": 23155
+    },
+    {
+      "epoch": 1.1757814978868144,
+      "grad_norm": 0.02897066685669499,
+      "learning_rate": 0.00043404513071006157,
+      "loss": 0.5283,
+      "step": 23160
+    },
+    {
+      "epoch": 1.176035333984846,
+      "grad_norm": 0.02546901345942979,
+      "learning_rate": 0.0004338255469768728,
+      "loss": 0.4924,
+      "step": 23165
+    },
+    {
+      "epoch": 1.1762891700828775,
+      "grad_norm": 0.02368301627174018,
+      "learning_rate": 0.0004336059762333992,
+      "loss": 0.5317,
+      "step": 23170
+    },
+    {
+      "epoch": 1.176543006180909,
+      "grad_norm": 0.022864082081725436,
+      "learning_rate": 0.0004333864185227413,
+      "loss": 0.5158,
+      "step": 23175
+    },
+    {
+      "epoch": 1.1767968422789405,
+      "grad_norm": 0.02715490428058028,
+      "learning_rate": 0.0004331668738879973,
+      "loss": 0.5156,
+      "step": 23180
+    },
+    {
+      "epoch": 1.177050678376972,
+      "grad_norm": 0.021324658368549614,
+      "learning_rate": 0.00043294734237226263,
+      "loss": 0.5044,
+      "step": 23185
+    },
+    {
+      "epoch": 1.1773045144750034,
+      "grad_norm": 0.024522981714582275,
+      "learning_rate": 0.0004327278240186303,
+      "loss": 0.5064,
+      "step": 23190
+    },
+    {
+      "epoch": 1.177558350573035,
+      "grad_norm": 0.03465334835103066,
+      "learning_rate": 0.0004325083188701906,
+      "loss": 0.4927,
+      "step": 23195
+    },
+    {
+      "epoch": 1.1778121866710665,
+      "grad_norm": 0.02331481482855284,
+      "learning_rate": 0.0004322888269700313,
+      "loss": 0.5376,
+      "step": 23200
+    },
+    {
+      "epoch": 1.178066022769098,
+      "grad_norm": 0.020402232394778678,
+      "learning_rate": 0.00043206934836123763,
+      "loss": 0.4963,
+      "step": 23205
+    },
+    {
+      "epoch": 1.1783198588671295,
+      "grad_norm": 0.029288417733953877,
+      "learning_rate": 0.0004318498830868921,
+      "loss": 0.4816,
+      "step": 23210
+    },
+    {
+      "epoch": 1.178573694965161,
+      "grad_norm": 0.026993170435865486,
+      "learning_rate": 0.0004316304311900746,
+      "loss": 0.4947,
+      "step": 23215
+    },
+    {
+      "epoch": 1.1788275310631926,
+      "grad_norm": 0.029826322623069505,
+      "learning_rate": 0.00043141099271386236,
+      "loss": 0.5017,
+      "step": 23220
+    },
+    {
+      "epoch": 1.179081367161224,
+      "grad_norm": 0.02839469569776015,
+      "learning_rate": 0.0004311915677013304,
+      "loss": 0.5048,
+      "step": 23225
+    },
+    {
+      "epoch": 1.1793352032592554,
+      "grad_norm": 0.03104613930338224,
+      "learning_rate": 0.00043097215619555053,
+      "loss": 0.4949,
+      "step": 23230
+    },
+    {
+      "epoch": 1.179589039357287,
+      "grad_norm": 0.023927607690992232,
+      "learning_rate": 0.00043075275823959217,
+      "loss": 0.4748,
+      "step": 23235
+    },
+    {
+      "epoch": 1.1798428754553185,
+      "grad_norm": 0.027181678664027754,
+      "learning_rate": 0.000430533373876522,
+      "loss": 0.5051,
+      "step": 23240
+    },
+    {
+      "epoch": 1.18009671155335,
+      "grad_norm": 0.025312092368077646,
+      "learning_rate": 0.0004303140031494042,
+      "loss": 0.5043,
+      "step": 23245
+    },
+    {
+      "epoch": 1.1803505476513816,
+      "grad_norm": 0.024109005993253833,
+      "learning_rate": 0.0004300946461012999,
+      "loss": 0.4829,
+      "step": 23250
+    },
+    {
+      "epoch": 1.180604383749413,
+      "grad_norm": 0.031076290791344705,
+      "learning_rate": 0.0004298753027752681,
+      "loss": 0.5013,
+      "step": 23255
+    },
+    {
+      "epoch": 1.1808582198474444,
+      "grad_norm": 0.02301069082386406,
+      "learning_rate": 0.00042965597321436454,
+      "loss": 0.496,
+      "step": 23260
+    },
+    {
+      "epoch": 1.181112055945476,
+      "grad_norm": 0.02834869538619908,
+      "learning_rate": 0.00042943665746164274,
+      "loss": 0.4945,
+      "step": 23265
+    },
+    {
+      "epoch": 1.1813658920435075,
+      "grad_norm": 0.040647134876522086,
+      "learning_rate": 0.0004292173555601531,
+      "loss": 0.5038,
+      "step": 23270
+    },
+    {
+      "epoch": 1.181619728141539,
+      "grad_norm": 0.047047098373209854,
+      "learning_rate": 0.00042899806755294364,
+      "loss": 0.5186,
+      "step": 23275
+    },
+    {
+      "epoch": 1.1818735642395706,
+      "grad_norm": 0.023835906470185797,
+      "learning_rate": 0.00042877879348305925,
+      "loss": 0.4914,
+      "step": 23280
+    },
+    {
+      "epoch": 1.182127400337602,
+      "grad_norm": 0.3198338312283238,
+      "learning_rate": 0.0004285595333935427,
+      "loss": 0.4655,
+      "step": 23285
+    },
+    {
+      "epoch": 1.1823812364356334,
+      "grad_norm": 0.08480024872059884,
+      "learning_rate": 0.0004283402873274334,
+      "loss": 0.5015,
+      "step": 23290
+    },
+    {
+      "epoch": 1.182635072533665,
+      "grad_norm": 0.044275755637726996,
+      "learning_rate": 0.0004281210553277684,
+      "loss": 0.5064,
+      "step": 23295
+    },
+    {
+      "epoch": 1.1828889086316965,
+      "grad_norm": 0.03453100961430329,
+      "learning_rate": 0.0004279018374375817,
+      "loss": 0.518,
+      "step": 23300
+    },
+    {
+      "epoch": 1.183142744729728,
+      "grad_norm": 0.02331481023981138,
+      "learning_rate": 0.00042768263369990486,
+      "loss": 0.5057,
+      "step": 23305
+    },
+    {
+      "epoch": 1.1833965808277596,
+      "grad_norm": 0.02423433591461226,
+      "learning_rate": 0.00042746344415776634,
+      "loss": 0.5355,
+      "step": 23310
+    },
+    {
+      "epoch": 1.183650416925791,
+      "grad_norm": 0.020158746070223478,
+      "learning_rate": 0.00042724426885419197,
+      "loss": 0.5056,
+      "step": 23315
+    },
+    {
+      "epoch": 1.1839042530238224,
+      "grad_norm": 0.02346254177392359,
+      "learning_rate": 0.0004270251078322048,
+      "loss": 0.4555,
+      "step": 23320
+    },
+    {
+      "epoch": 1.184158089121854,
+      "grad_norm": 0.02215680364485662,
+      "learning_rate": 0.000426805961134825,
+      "loss": 0.4967,
+      "step": 23325
+    },
+    {
+      "epoch": 1.1844119252198855,
+      "grad_norm": 0.025983453386552117,
+      "learning_rate": 0.00042658682880507005,
+      "loss": 0.488,
+      "step": 23330
+    },
+    {
+      "epoch": 1.184665761317917,
+      "grad_norm": 0.02558287853502543,
+      "learning_rate": 0.0004263677108859545,
+      "loss": 0.5222,
+      "step": 23335
+    },
+    {
+      "epoch": 1.1849195974159485,
+      "grad_norm": 0.029862575005858382,
+      "learning_rate": 0.0004261486074204899,
+      "loss": 0.5155,
+      "step": 23340
+    },
+    {
+      "epoch": 1.18517343351398,
+      "grad_norm": 0.03365037560913944,
+      "learning_rate": 0.0004259295184516855,
+      "loss": 0.5005,
+      "step": 23345
+    },
+    {
+      "epoch": 1.1854272696120116,
+      "grad_norm": 0.02324079187444754,
+      "learning_rate": 0.00042571044402254734,
+      "loss": 0.5356,
+      "step": 23350
+    },
+    {
+      "epoch": 1.1856811057100431,
+      "grad_norm": 0.023311446376579104,
+      "learning_rate": 0.00042549138417607855,
+      "loss": 0.5226,
+      "step": 23355
+    },
+    {
+      "epoch": 1.1859349418080745,
+      "grad_norm": 0.02264754449055334,
+      "learning_rate": 0.0004252723389552794,
+      "loss": 0.4857,
+      "step": 23360
+    },
+    {
+      "epoch": 1.186188777906106,
+      "grad_norm": 0.024792949347000763,
+      "learning_rate": 0.0004250533084031474,
+      "loss": 0.4847,
+      "step": 23365
+    },
+    {
+      "epoch": 1.1864426140041375,
+      "grad_norm": 0.02365423995133714,
+      "learning_rate": 0.0004248342925626773,
+      "loss": 0.5237,
+      "step": 23370
+    },
+    {
+      "epoch": 1.186696450102169,
+      "grad_norm": 0.025122984233723788,
+      "learning_rate": 0.0004246152914768607,
+      "loss": 0.4653,
+      "step": 23375
+    },
+    {
+      "epoch": 1.1869502862002006,
+      "grad_norm": 0.037495988695979386,
+      "learning_rate": 0.00042439630518868645,
+      "loss": 0.4927,
+      "step": 23380
+    },
+    {
+      "epoch": 1.187204122298232,
+      "grad_norm": 0.02744207164178393,
+      "learning_rate": 0.00042417733374114044,
+      "loss": 0.4939,
+      "step": 23385
+    },
+    {
+      "epoch": 1.1874579583962634,
+      "grad_norm": 0.023497982340652368,
+      "learning_rate": 0.00042395837717720564,
+      "loss": 0.517,
+      "step": 23390
+    },
+    {
+      "epoch": 1.187711794494295,
+      "grad_norm": 0.022057752647203964,
+      "learning_rate": 0.0004237394355398622,
+      "loss": 0.4763,
+      "step": 23395
+    },
+    {
+      "epoch": 1.1879656305923265,
+      "grad_norm": 0.022675614809165457,
+      "learning_rate": 0.0004235205088720872,
+      "loss": 0.4862,
+      "step": 23400
+    },
+    {
+      "epoch": 1.188219466690358,
+      "grad_norm": 0.027088398303537622,
+      "learning_rate": 0.000423301597216855,
+      "loss": 0.4862,
+      "step": 23405
+    },
+    {
+      "epoch": 1.1884733027883896,
+      "grad_norm": 0.021433012116146354,
+      "learning_rate": 0.0004230827006171367,
+      "loss": 0.4712,
+      "step": 23410
+    },
+    {
+      "epoch": 1.1887271388864211,
+      "grad_norm": 0.026295761373166058,
+      "learning_rate": 0.00042286381911590075,
+      "loss": 0.5153,
+      "step": 23415
+    },
+    {
+      "epoch": 1.1889809749844527,
+      "grad_norm": 0.0245055560686366,
+      "learning_rate": 0.0004226449527561124,
+      "loss": 0.4915,
+      "step": 23420
+    },
+    {
+      "epoch": 1.189234811082484,
+      "grad_norm": 0.023358170637321662,
+      "learning_rate": 0.0004224261015807341,
+      "loss": 0.5051,
+      "step": 23425
+    },
+    {
+      "epoch": 1.1894886471805155,
+      "grad_norm": 0.02494520282712566,
+      "learning_rate": 0.00042220726563272514,
+      "loss": 0.5168,
+      "step": 23430
+    },
+    {
+      "epoch": 1.189742483278547,
+      "grad_norm": 0.026188639690865594,
+      "learning_rate": 0.0004219884449550421,
+      "loss": 0.5007,
+      "step": 23435
+    },
+    {
+      "epoch": 1.1899963193765786,
+      "grad_norm": 0.02181777095566158,
+      "learning_rate": 0.0004217696395906381,
+      "loss": 0.4987,
+      "step": 23440
+    },
+    {
+      "epoch": 1.19025015547461,
+      "grad_norm": 0.022429924845303997,
+      "learning_rate": 0.00042155084958246387,
+      "loss": 0.5191,
+      "step": 23445
+    },
+    {
+      "epoch": 1.1905039915726416,
+      "grad_norm": 0.022466417610871727,
+      "learning_rate": 0.0004213320749734665,
+      "loss": 0.507,
+      "step": 23450
+    },
+    {
+      "epoch": 1.190757827670673,
+      "grad_norm": 0.03174964407796664,
+      "learning_rate": 0.0004211133158065906,
+      "loss": 0.487,
+      "step": 23455
+    },
+    {
+      "epoch": 1.1910116637687045,
+      "grad_norm": 0.02782479444320585,
+      "learning_rate": 0.0004208945721247772,
+      "loss": 0.4833,
+      "step": 23460
+    },
+    {
+      "epoch": 1.191265499866736,
+      "grad_norm": 0.023164893847604912,
+      "learning_rate": 0.0004206758439709649,
+      "loss": 0.5069,
+      "step": 23465
+    },
+    {
+      "epoch": 1.1915193359647676,
+      "grad_norm": 0.02156991148966579,
+      "learning_rate": 0.00042045713138808894,
+      "loss": 0.5061,
+      "step": 23470
+    },
+    {
+      "epoch": 1.191773172062799,
+      "grad_norm": 0.03206441910648273,
+      "learning_rate": 0.0004202384344190814,
+      "loss": 0.4991,
+      "step": 23475
+    },
+    {
+      "epoch": 1.1920270081608306,
+      "grad_norm": 0.033785505523490716,
+      "learning_rate": 0.00042001975310687134,
+      "loss": 0.5094,
+      "step": 23480
+    },
+    {
+      "epoch": 1.1922808442588622,
+      "grad_norm": 0.029304316918651336,
+      "learning_rate": 0.0004198010874943849,
+      "loss": 0.4911,
+      "step": 23485
+    },
+    {
+      "epoch": 1.1925346803568935,
+      "grad_norm": 0.024414079665053167,
+      "learning_rate": 0.0004195824376245451,
+      "loss": 0.5014,
+      "step": 23490
+    },
+    {
+      "epoch": 1.192788516454925,
+      "grad_norm": 0.02755089300926629,
+      "learning_rate": 0.0004193638035402717,
+      "loss": 0.4836,
+      "step": 23495
+    },
+    {
+      "epoch": 1.1930423525529565,
+      "grad_norm": 0.04448377045587385,
+      "learning_rate": 0.0004191451852844816,
+      "loss": 0.5055,
+      "step": 23500
+    },
+    {
+      "epoch": 1.193296188650988,
+      "grad_norm": 0.02563765603314885,
+      "learning_rate": 0.00041892658290008835,
+      "loss": 0.4757,
+      "step": 23505
+    },
+    {
+      "epoch": 1.1935500247490196,
+      "grad_norm": 0.029729997745895555,
+      "learning_rate": 0.00041870799643000257,
+      "loss": 0.5031,
+      "step": 23510
+    },
+    {
+      "epoch": 1.1938038608470511,
+      "grad_norm": 0.022197332496704934,
+      "learning_rate": 0.00041848942591713167,
+      "loss": 0.493,
+      "step": 23515
+    },
+    {
+      "epoch": 1.1940576969450825,
+      "grad_norm": 0.02508559588342066,
+      "learning_rate": 0.0004182708714043799,
+      "loss": 0.493,
+      "step": 23520
+    },
+    {
+      "epoch": 1.194311533043114,
+      "grad_norm": 0.02801379075039443,
+      "learning_rate": 0.0004180523329346486,
+      "loss": 0.4848,
+      "step": 23525
+    },
+    {
+      "epoch": 1.1945653691411455,
+      "grad_norm": 0.02487825457446648,
+      "learning_rate": 0.00041783381055083565,
+      "loss": 0.5065,
+      "step": 23530
+    },
+    {
+      "epoch": 1.194819205239177,
+      "grad_norm": 0.024301005036306705,
+      "learning_rate": 0.0004176153042958359,
+      "loss": 0.4799,
+      "step": 23535
+    },
+    {
+      "epoch": 1.1950730413372086,
+      "grad_norm": 0.028592227759142063,
+      "learning_rate": 0.0004173968142125411,
+      "loss": 0.4706,
+      "step": 23540
+    },
+    {
+      "epoch": 1.1953268774352401,
+      "grad_norm": 0.020208642081283146,
+      "learning_rate": 0.00041717834034383974,
+      "loss": 0.4838,
+      "step": 23545
+    },
+    {
+      "epoch": 1.1955807135332717,
+      "grad_norm": 0.025013814826272913,
+      "learning_rate": 0.0004169598827326171,
+      "loss": 0.5043,
+      "step": 23550
+    },
+    {
+      "epoch": 1.195834549631303,
+      "grad_norm": 0.02413307493483816,
+      "learning_rate": 0.0004167414414217554,
+      "loss": 0.5207,
+      "step": 23555
+    },
+    {
+      "epoch": 1.1960883857293345,
+      "grad_norm": 0.02524654414343768,
+      "learning_rate": 0.0004165230164541335,
+      "loss": 0.465,
+      "step": 23560
+    },
+    {
+      "epoch": 1.196342221827366,
+      "grad_norm": 0.03571503975031509,
+      "learning_rate": 0.00041630460787262717,
+      "loss": 0.4715,
+      "step": 23565
+    },
+    {
+      "epoch": 1.1965960579253976,
+      "grad_norm": 0.05081085862061478,
+      "learning_rate": 0.00041608621572010896,
+      "loss": 0.5229,
+      "step": 23570
+    },
+    {
+      "epoch": 1.1968498940234291,
+      "grad_norm": 0.04184691240326009,
+      "learning_rate": 0.0004158678400394481,
+      "loss": 0.5048,
+      "step": 23575
+    },
+    {
+      "epoch": 1.1971037301214607,
+      "grad_norm": 0.026686408386705083,
+      "learning_rate": 0.00041564948087351053,
+      "loss": 0.5156,
+      "step": 23580
+    },
+    {
+      "epoch": 1.197357566219492,
+      "grad_norm": 0.03417816047008303,
+      "learning_rate": 0.0004154311382651593,
+      "loss": 0.4824,
+      "step": 23585
+    },
+    {
+      "epoch": 1.1976114023175235,
+      "grad_norm": 0.026200514928376224,
+      "learning_rate": 0.000415212812257254,
+      "loss": 0.4909,
+      "step": 23590
+    },
+    {
+      "epoch": 1.197865238415555,
+      "grad_norm": 0.033241782938311475,
+      "learning_rate": 0.0004149945028926507,
+      "loss": 0.4919,
+      "step": 23595
+    },
+    {
+      "epoch": 1.1981190745135866,
+      "grad_norm": 0.023753551533967514,
+      "learning_rate": 0.0004147762102142027,
+      "loss": 0.4899,
+      "step": 23600
+    },
+    {
+      "epoch": 1.198372910611618,
+      "grad_norm": 0.026945495744205454,
+      "learning_rate": 0.0004145579342647595,
+      "loss": 0.4666,
+      "step": 23605
+    },
+    {
+      "epoch": 1.1986267467096496,
+      "grad_norm": 0.02038326341118833,
+      "learning_rate": 0.0004143396750871678,
+      "loss": 0.4999,
+      "step": 23610
+    },
+    {
+      "epoch": 1.1988805828076812,
+      "grad_norm": 0.022956204878543746,
+      "learning_rate": 0.0004141214327242707,
+      "loss": 0.491,
+      "step": 23615
+    },
+    {
+      "epoch": 1.1991344189057125,
+      "grad_norm": 0.0219273779464646,
+      "learning_rate": 0.000413903207218908,
+      "loss": 0.4867,
+      "step": 23620
+    },
+    {
+      "epoch": 1.199388255003744,
+      "grad_norm": 0.021082208350651496,
+      "learning_rate": 0.0004136849986139164,
+      "loss": 0.4782,
+      "step": 23625
+    },
+    {
+      "epoch": 1.1996420911017756,
+      "grad_norm": 0.021461829412799976,
+      "learning_rate": 0.0004134668069521291,
+      "loss": 0.5114,
+      "step": 23630
+    },
+    {
+      "epoch": 1.199895927199807,
+      "grad_norm": 0.027611384568723554,
+      "learning_rate": 0.00041324863227637607,
+      "loss": 0.5122,
+      "step": 23635
+    },
+    {
+      "epoch": 1.2001497632978386,
+      "grad_norm": 0.0313518897326471,
+      "learning_rate": 0.0004130304746294839,
+      "loss": 0.5,
+      "step": 23640
+    },
+    {
+      "epoch": 1.2004035993958702,
+      "grad_norm": 0.021179468678818012,
+      "learning_rate": 0.0004128123340542757,
+      "loss": 0.4911,
+      "step": 23645
+    },
+    {
+      "epoch": 1.2006574354939015,
+      "grad_norm": 0.021479315942391494,
+      "learning_rate": 0.0004125942105935717,
+      "loss": 0.5172,
+      "step": 23650
+    },
+    {
+      "epoch": 1.200911271591933,
+      "grad_norm": 0.030395801642170088,
+      "learning_rate": 0.00041237610429018824,
+      "loss": 0.4812,
+      "step": 23655
+    },
+    {
+      "epoch": 1.2011651076899645,
+      "grad_norm": 0.026149647662465924,
+      "learning_rate": 0.0004121580151869385,
+      "loss": 0.5146,
+      "step": 23660
+    },
+    {
+      "epoch": 1.201418943787996,
+      "grad_norm": 0.025141949645227077,
+      "learning_rate": 0.0004119399433266323,
+      "loss": 0.5049,
+      "step": 23665
+    },
+    {
+      "epoch": 1.2016727798860276,
+      "grad_norm": 0.022368500516788497,
+      "learning_rate": 0.0004117218887520761,
+      "loss": 0.4685,
+      "step": 23670
+    },
+    {
+      "epoch": 1.2019266159840591,
+      "grad_norm": 0.028886687584022533,
+      "learning_rate": 0.00041150385150607287,
+      "loss": 0.5113,
+      "step": 23675
+    },
+    {
+      "epoch": 1.2021804520820907,
+      "grad_norm": 0.026390424071676334,
+      "learning_rate": 0.0004112858316314223,
+      "loss": 0.486,
+      "step": 23680
+    },
+    {
+      "epoch": 1.2024342881801222,
+      "grad_norm": 0.027944997664867154,
+      "learning_rate": 0.00041106782917092055,
+      "loss": 0.5285,
+      "step": 23685
+    },
+    {
+      "epoch": 1.2026881242781535,
+      "grad_norm": 0.024740823096402697,
+      "learning_rate": 0.00041084984416736044,
+      "loss": 0.5107,
+      "step": 23690
+    },
+    {
+      "epoch": 1.202941960376185,
+      "grad_norm": 0.02106534018159003,
+      "learning_rate": 0.0004106318766635313,
+      "loss": 0.4754,
+      "step": 23695
+    },
+    {
+      "epoch": 1.2031957964742166,
+      "grad_norm": 0.0238222430975329,
+      "learning_rate": 0.00041041392670221913,
+      "loss": 0.4942,
+      "step": 23700
+    },
+    {
+      "epoch": 1.2034496325722481,
+      "grad_norm": 0.0237081263610808,
+      "learning_rate": 0.00041019599432620614,
+      "loss": 0.4864,
+      "step": 23705
+    },
+    {
+      "epoch": 1.2037034686702797,
+      "grad_norm": 0.02344590182148663,
+      "learning_rate": 0.00040997807957827184,
+      "loss": 0.4607,
+      "step": 23710
+    },
+    {
+      "epoch": 1.2039573047683112,
+      "grad_norm": 0.022849121360239078,
+      "learning_rate": 0.0004097601825011916,
+      "loss": 0.4918,
+      "step": 23715
+    },
+    {
+      "epoch": 1.2042111408663425,
+      "grad_norm": 0.02180468331845955,
+      "learning_rate": 0.00040954230313773745,
+      "loss": 0.4798,
+      "step": 23720
+    },
+    {
+      "epoch": 1.204464976964374,
+      "grad_norm": 0.0243468829191348,
+      "learning_rate": 0.0004093244415306781,
+      "loss": 0.5206,
+      "step": 23725
+    },
+    {
+      "epoch": 1.2047188130624056,
+      "grad_norm": 0.0376925453970005,
+      "learning_rate": 0.00040910659772277867,
+      "loss": 0.4733,
+      "step": 23730
+    },
+    {
+      "epoch": 1.2049726491604371,
+      "grad_norm": 0.02194738892506034,
+      "learning_rate": 0.0004088887717568009,
+      "loss": 0.4802,
+      "step": 23735
+    },
+    {
+      "epoch": 1.2052264852584686,
+      "grad_norm": 0.02392526501857356,
+      "learning_rate": 0.0004086709636755029,
+      "loss": 0.5137,
+      "step": 23740
+    },
+    {
+      "epoch": 1.2054803213565002,
+      "grad_norm": 0.023675767647900945,
+      "learning_rate": 0.0004084531735216392,
+      "loss": 0.4837,
+      "step": 23745
+    },
+    {
+      "epoch": 1.2057341574545317,
+      "grad_norm": 0.029907048109683313,
+      "learning_rate": 0.000408235401337961,
+      "loss": 0.4991,
+      "step": 23750
+    },
+    {
+      "epoch": 1.205987993552563,
+      "grad_norm": 0.023611085556861953,
+      "learning_rate": 0.00040801764716721586,
+      "loss": 0.5074,
+      "step": 23755
+    },
+    {
+      "epoch": 1.2062418296505946,
+      "grad_norm": 0.02342929941369781,
+      "learning_rate": 0.00040779991105214787,
+      "loss": 0.4753,
+      "step": 23760
+    },
+    {
+      "epoch": 1.206495665748626,
+      "grad_norm": 0.024184197724051767,
+      "learning_rate": 0.00040758219303549734,
+      "loss": 0.5053,
+      "step": 23765
+    },
+    {
+      "epoch": 1.2067495018466576,
+      "grad_norm": 0.022748064850076327,
+      "learning_rate": 0.00040736449316000156,
+      "loss": 0.5263,
+      "step": 23770
+    },
+    {
+      "epoch": 1.2070033379446892,
+      "grad_norm": 0.04285865957318062,
+      "learning_rate": 0.00040714681146839394,
+      "loss": 0.4945,
+      "step": 23775
+    },
+    {
+      "epoch": 1.2072571740427207,
+      "grad_norm": 0.027550877084688605,
+      "learning_rate": 0.00040692914800340407,
+      "loss": 0.519,
+      "step": 23780
+    },
+    {
+      "epoch": 1.207511010140752,
+      "grad_norm": 0.028024709655043955,
+      "learning_rate": 0.00040671150280775835,
+      "loss": 0.5254,
+      "step": 23785
+    },
+    {
+      "epoch": 1.2077648462387836,
+      "grad_norm": 0.027347216039832773,
+      "learning_rate": 0.0004064938759241794,
+      "loss": 0.5175,
+      "step": 23790
+    },
+    {
+      "epoch": 1.208018682336815,
+      "grad_norm": 0.03246702500036394,
+      "learning_rate": 0.0004062762673953863,
+      "loss": 0.4976,
+      "step": 23795
+    },
+    {
+      "epoch": 1.2082725184348466,
+      "grad_norm": 0.024018195972550192,
+      "learning_rate": 0.00040605867726409446,
+      "loss": 0.5024,
+      "step": 23800
+    },
+    {
+      "epoch": 1.2085263545328782,
+      "grad_norm": 0.036249081128423837,
+      "learning_rate": 0.00040584110557301576,
+      "loss": 0.5149,
+      "step": 23805
+    },
+    {
+      "epoch": 1.2087801906309097,
+      "grad_norm": 0.025423232304757903,
+      "learning_rate": 0.0004056235523648586,
+      "loss": 0.4963,
+      "step": 23810
+    },
+    {
+      "epoch": 1.2090340267289412,
+      "grad_norm": 0.02314566298487677,
+      "learning_rate": 0.0004054060176823273,
+      "loss": 0.4801,
+      "step": 23815
+    },
+    {
+      "epoch": 1.2092878628269725,
+      "grad_norm": 0.031246175124657095,
+      "learning_rate": 0.00040518850156812315,
+      "loss": 0.5157,
+      "step": 23820
+    },
+    {
+      "epoch": 1.209541698925004,
+      "grad_norm": 0.025447196021980784,
+      "learning_rate": 0.0004049710040649431,
+      "loss": 0.4978,
+      "step": 23825
+    },
+    {
+      "epoch": 1.2097955350230356,
+      "grad_norm": 0.19408857583730024,
+      "learning_rate": 0.0004047535252154812,
+      "loss": 0.4927,
+      "step": 23830
+    },
+    {
+      "epoch": 1.2100493711210671,
+      "grad_norm": 0.037859047454809586,
+      "learning_rate": 0.0004045360650624272,
+      "loss": 0.492,
+      "step": 23835
+    },
+    {
+      "epoch": 1.2103032072190987,
+      "grad_norm": 0.02407784644881004,
+      "learning_rate": 0.0004043186236484677,
+      "loss": 0.5306,
+      "step": 23840
+    },
+    {
+      "epoch": 1.2105570433171302,
+      "grad_norm": 0.021170736417903336,
+      "learning_rate": 0.0004041012010162852,
+      "loss": 0.4789,
+      "step": 23845
+    },
+    {
+      "epoch": 1.2108108794151615,
+      "grad_norm": 0.02333015002979291,
+      "learning_rate": 0.0004038837972085586,
+      "loss": 0.5165,
+      "step": 23850
+    },
+    {
+      "epoch": 1.211064715513193,
+      "grad_norm": 0.022900385191847467,
+      "learning_rate": 0.0004036664122679633,
+      "loss": 0.529,
+      "step": 23855
+    },
+    {
+      "epoch": 1.2113185516112246,
+      "grad_norm": 0.02900664488594827,
+      "learning_rate": 0.00040344904623717094,
+      "loss": 0.5186,
+      "step": 23860
+    },
+    {
+      "epoch": 1.2115723877092561,
+      "grad_norm": 0.02318704654191729,
+      "learning_rate": 0.00040323169915884924,
+      "loss": 0.5114,
+      "step": 23865
+    },
+    {
+      "epoch": 1.2118262238072877,
+      "grad_norm": 0.02241334253331378,
+      "learning_rate": 0.0004030143710756624,
+      "loss": 0.5244,
+      "step": 23870
+    },
+    {
+      "epoch": 1.2120800599053192,
+      "grad_norm": 0.02019981834426705,
+      "learning_rate": 0.0004027970620302709,
+      "loss": 0.5171,
+      "step": 23875
+    },
+    {
+      "epoch": 1.2123338960033507,
+      "grad_norm": 0.021530806359524663,
+      "learning_rate": 0.0004025797720653313,
+      "loss": 0.5028,
+      "step": 23880
+    },
+    {
+      "epoch": 1.212587732101382,
+      "grad_norm": 0.023565880923201048,
+      "learning_rate": 0.00040236250122349643,
+      "loss": 0.5048,
+      "step": 23885
+    },
+    {
+      "epoch": 1.2128415681994136,
+      "grad_norm": 0.02799458782575493,
+      "learning_rate": 0.0004021452495474159,
+      "loss": 0.5116,
+      "step": 23890
+    },
+    {
+      "epoch": 1.2130954042974451,
+      "grad_norm": 0.026146402918077717,
+      "learning_rate": 0.0004019280170797349,
+      "loss": 0.5023,
+      "step": 23895
+    },
+    {
+      "epoch": 1.2133492403954766,
+      "grad_norm": 0.04633882230495636,
+      "learning_rate": 0.000401710803863095,
+      "loss": 0.499,
+      "step": 23900
+    },
+    {
+      "epoch": 1.2136030764935082,
+      "grad_norm": 0.03278896093851392,
+      "learning_rate": 0.0004014936099401341,
+      "loss": 0.484,
+      "step": 23905
+    },
+    {
+      "epoch": 1.2138569125915397,
+      "grad_norm": 0.023279990625039652,
+      "learning_rate": 0.0004012764353534864,
+      "loss": 0.5017,
+      "step": 23910
+    },
+    {
+      "epoch": 1.214110748689571,
+      "grad_norm": 0.025580791250068846,
+      "learning_rate": 0.00040105928014578206,
+      "loss": 0.4841,
+      "step": 23915
+    },
+    {
+      "epoch": 1.2143645847876026,
+      "grad_norm": 0.0231176720102259,
+      "learning_rate": 0.00040084214435964766,
+      "loss": 0.4992,
+      "step": 23920
+    },
+    {
+      "epoch": 1.214618420885634,
+      "grad_norm": 0.02248737002851678,
+      "learning_rate": 0.0004006250280377058,
+      "loss": 0.5068,
+      "step": 23925
+    },
+    {
+      "epoch": 1.2148722569836656,
+      "grad_norm": 0.022927316960127005,
+      "learning_rate": 0.0004004079312225754,
+      "loss": 0.492,
+      "step": 23930
+    },
+    {
+      "epoch": 1.2151260930816972,
+      "grad_norm": 0.02313720027070532,
+      "learning_rate": 0.00040019085395687134,
+      "loss": 0.4949,
+      "step": 23935
+    },
+    {
+      "epoch": 1.2153799291797287,
+      "grad_norm": 0.030106027729234503,
+      "learning_rate": 0.00039997379628320493,
+      "loss": 0.5044,
+      "step": 23940
+    },
+    {
+      "epoch": 1.2156337652777602,
+      "grad_norm": 0.023137200469528597,
+      "learning_rate": 0.0003997567582441834,
+      "loss": 0.5162,
+      "step": 23945
+    },
+    {
+      "epoch": 1.2158876013757918,
+      "grad_norm": 0.02729799923558458,
+      "learning_rate": 0.00039953973988241035,
+      "loss": 0.5144,
+      "step": 23950
+    },
+    {
+      "epoch": 1.216141437473823,
+      "grad_norm": 0.02110940033197545,
+      "learning_rate": 0.00039932274124048546,
+      "loss": 0.4959,
+      "step": 23955
+    },
+    {
+      "epoch": 1.2163952735718546,
+      "grad_norm": 0.023832077215208247,
+      "learning_rate": 0.00039910576236100437,
+      "loss": 0.5058,
+      "step": 23960
+    },
+    {
+      "epoch": 1.2166491096698862,
+      "grad_norm": 0.021490387702712672,
+      "learning_rate": 0.000398888803286559,
+      "loss": 0.5041,
+      "step": 23965
+    },
+    {
+      "epoch": 1.2169029457679177,
+      "grad_norm": 0.022133880712670376,
+      "learning_rate": 0.0003986718640597372,
+      "loss": 0.5059,
+      "step": 23970
+    },
+    {
+      "epoch": 1.2171567818659492,
+      "grad_norm": 0.021425175563689818,
+      "learning_rate": 0.0003984549447231232,
+      "loss": 0.4609,
+      "step": 23975
+    },
+    {
+      "epoch": 1.2174106179639805,
+      "grad_norm": 0.019327536501010368,
+      "learning_rate": 0.0003982380453192972,
+      "loss": 0.4943,
+      "step": 23980
+    },
+    {
+      "epoch": 1.217664454062012,
+      "grad_norm": 0.023998177431719198,
+      "learning_rate": 0.0003980211658908354,
+      "loss": 0.4973,
+      "step": 23985
+    },
+    {
+      "epoch": 1.2179182901600436,
+      "grad_norm": 0.033915972076828955,
+      "learning_rate": 0.0003978043064803101,
+      "loss": 0.5149,
+      "step": 23990
+    },
+    {
+      "epoch": 1.2181721262580751,
+      "grad_norm": 0.0237613621554742,
+      "learning_rate": 0.0003975874671302899,
+      "loss": 0.5135,
+      "step": 23995
+    },
+    {
+      "epoch": 1.2184259623561067,
+      "grad_norm": 0.026750732728818505,
+      "learning_rate": 0.00039737064788333907,
+      "loss": 0.503,
+      "step": 24000
+    },
+    {
+      "epoch": 1.2186797984541382,
+      "grad_norm": 0.025308308981860574,
+      "learning_rate": 0.0003971538487820181,
+      "loss": 0.5002,
+      "step": 24005
+    },
+    {
+      "epoch": 1.2189336345521697,
+      "grad_norm": 0.02222643637338913,
+      "learning_rate": 0.0003969370698688839,
+      "loss": 0.5231,
+      "step": 24010
+    },
+    {
+      "epoch": 1.2191874706502013,
+      "grad_norm": 0.031118899097702987,
+      "learning_rate": 0.0003967203111864889,
+      "loss": 0.5099,
+      "step": 24015
+    },
+    {
+      "epoch": 1.2194413067482326,
+      "grad_norm": 0.022092906382930166,
+      "learning_rate": 0.0003965035727773818,
+      "loss": 0.4864,
+      "step": 24020
+    },
+    {
+      "epoch": 1.2196951428462641,
+      "grad_norm": 0.028023218544810078,
+      "learning_rate": 0.0003962868546841072,
+      "loss": 0.4551,
+      "step": 24025
+    },
+    {
+      "epoch": 1.2199489789442957,
+      "grad_norm": 0.023384820051804464,
+      "learning_rate": 0.0003960701569492058,
+      "loss": 0.4975,
+      "step": 24030
+    },
+    {
+      "epoch": 1.2202028150423272,
+      "grad_norm": 0.022821055406039938,
+      "learning_rate": 0.00039585347961521434,
+      "loss": 0.5172,
+      "step": 24035
+    },
+    {
+      "epoch": 1.2204566511403587,
+      "grad_norm": 0.030261713548462982,
+      "learning_rate": 0.0003956368227246654,
+      "loss": 0.5088,
+      "step": 24040
+    },
+    {
+      "epoch": 1.2207104872383903,
+      "grad_norm": 0.023111376142268682,
+      "learning_rate": 0.00039542018632008773,
+      "loss": 0.5049,
+      "step": 24045
+    },
+    {
+      "epoch": 1.2209643233364216,
+      "grad_norm": 0.021954597042835224,
+      "learning_rate": 0.00039520357044400595,
+      "loss": 0.4923,
+      "step": 24050
+    },
+    {
+      "epoch": 1.221218159434453,
+      "grad_norm": 0.022076500459662276,
+      "learning_rate": 0.0003949869751389407,
+      "loss": 0.4696,
+      "step": 24055
+    },
+    {
+      "epoch": 1.2214719955324846,
+      "grad_norm": 0.023319825211927294,
+      "learning_rate": 0.0003947704004474085,
+      "loss": 0.5188,
+      "step": 24060
+    },
+    {
+      "epoch": 1.2217258316305162,
+      "grad_norm": 0.02700682357188388,
+      "learning_rate": 0.0003945538464119218,
+      "loss": 0.5128,
+      "step": 24065
+    },
+    {
+      "epoch": 1.2219796677285477,
+      "grad_norm": 0.025886961687645718,
+      "learning_rate": 0.00039433731307498925,
+      "loss": 0.5348,
+      "step": 24070
+    },
+    {
+      "epoch": 1.2222335038265792,
+      "grad_norm": 0.02644977307390592,
+      "learning_rate": 0.00039412080047911526,
+      "loss": 0.5141,
+      "step": 24075
+    },
+    {
+      "epoch": 1.2224873399246108,
+      "grad_norm": 0.028225811453520037,
+      "learning_rate": 0.00039390430866680017,
+      "loss": 0.5302,
+      "step": 24080
+    },
+    {
+      "epoch": 1.222741176022642,
+      "grad_norm": 0.035161610050816194,
+      "learning_rate": 0.00039368783768054005,
+      "loss": 0.4871,
+      "step": 24085
+    },
+    {
+      "epoch": 1.2229950121206736,
+      "grad_norm": 0.08376585705403677,
+      "learning_rate": 0.00039347138756282737,
+      "loss": 0.5101,
+      "step": 24090
+    },
+    {
+      "epoch": 1.2232488482187052,
+      "grad_norm": 0.028243847099455724,
+      "learning_rate": 0.0003932549583561499,
+      "loss": 0.5145,
+      "step": 24095
+    },
+    {
+      "epoch": 1.2235026843167367,
+      "grad_norm": 0.02241171828134657,
+      "learning_rate": 0.00039303855010299187,
+      "loss": 0.4779,
+      "step": 24100
+    },
+    {
+      "epoch": 1.2237565204147682,
+      "grad_norm": 0.027565864630121506,
+      "learning_rate": 0.00039282216284583304,
+      "loss": 0.5191,
+      "step": 24105
+    },
+    {
+      "epoch": 1.2240103565127998,
+      "grad_norm": 0.04141754255679934,
+      "learning_rate": 0.00039260579662714915,
+      "loss": 0.5127,
+      "step": 24110
+    },
+    {
+      "epoch": 1.224264192610831,
+      "grad_norm": 0.03073490220186529,
+      "learning_rate": 0.0003923894514894118,
+      "loss": 0.5125,
+      "step": 24115
+    },
+    {
+      "epoch": 1.2245180287088626,
+      "grad_norm": 0.02681137516278315,
+      "learning_rate": 0.00039217312747508843,
+      "loss": 0.4995,
+      "step": 24120
+    },
+    {
+      "epoch": 1.2247718648068942,
+      "grad_norm": 0.03856684515614741,
+      "learning_rate": 0.00039195682462664225,
+      "loss": 0.4739,
+      "step": 24125
+    },
+    {
+      "epoch": 1.2250257009049257,
+      "grad_norm": 0.039563497611539114,
+      "learning_rate": 0.0003917405429865327,
+      "loss": 0.4841,
+      "step": 24130
+    },
+    {
+      "epoch": 1.2252795370029572,
+      "grad_norm": 0.02634194065105358,
+      "learning_rate": 0.0003915242825972148,
+      "loss": 0.5017,
+      "step": 24135
+    },
+    {
+      "epoch": 1.2255333731009888,
+      "grad_norm": 0.022677125872718785,
+      "learning_rate": 0.0003913080435011392,
+      "loss": 0.5316,
+      "step": 24140
+    },
+    {
+      "epoch": 1.2257872091990203,
+      "grad_norm": 0.021362302216391706,
+      "learning_rate": 0.00039109182574075256,
+      "loss": 0.4939,
+      "step": 24145
+    },
+    {
+      "epoch": 1.2260410452970516,
+      "grad_norm": 0.02490355388687261,
+      "learning_rate": 0.00039087562935849745,
+      "loss": 0.4987,
+      "step": 24150
+    },
+    {
+      "epoch": 1.2262948813950831,
+      "grad_norm": 0.02658190177665588,
+      "learning_rate": 0.00039065945439681213,
+      "loss": 0.4917,
+      "step": 24155
+    },
+    {
+      "epoch": 1.2265487174931147,
+      "grad_norm": 0.020375606787208123,
+      "learning_rate": 0.0003904433008981306,
+      "loss": 0.4781,
+      "step": 24160
+    },
+    {
+      "epoch": 1.2268025535911462,
+      "grad_norm": 0.02515928592237166,
+      "learning_rate": 0.00039022716890488275,
+      "loss": 0.4924,
+      "step": 24165
+    },
+    {
+      "epoch": 1.2270563896891777,
+      "grad_norm": 0.036589589288339636,
+      "learning_rate": 0.0003900110584594942,
+      "loss": 0.5066,
+      "step": 24170
+    },
+    {
+      "epoch": 1.2273102257872093,
+      "grad_norm": 0.023900416252566864,
+      "learning_rate": 0.00038979496960438637,
+      "loss": 0.4742,
+      "step": 24175
+    },
+    {
+      "epoch": 1.2275640618852406,
+      "grad_norm": 0.024765774151169644,
+      "learning_rate": 0.0003895789023819764,
+      "loss": 0.5312,
+      "step": 24180
+    },
+    {
+      "epoch": 1.2278178979832721,
+      "grad_norm": 0.023371819709462876,
+      "learning_rate": 0.0003893628568346771,
+      "loss": 0.495,
+      "step": 24185
+    },
+    {
+      "epoch": 1.2280717340813037,
+      "grad_norm": 0.023492189347015212,
+      "learning_rate": 0.0003891468330048974,
+      "loss": 0.4349,
+      "step": 24190
+    },
+    {
+      "epoch": 1.2283255701793352,
+      "grad_norm": 0.022915433000463895,
+      "learning_rate": 0.00038893083093504154,
+      "loss": 0.4776,
+      "step": 24195
+    },
+    {
+      "epoch": 1.2285794062773667,
+      "grad_norm": 0.03768217141505213,
+      "learning_rate": 0.00038871485066750965,
+      "loss": 0.4771,
+      "step": 24200
+    },
+    {
+      "epoch": 1.2288332423753983,
+      "grad_norm": 0.08372265623632144,
+      "learning_rate": 0.00038849889224469765,
+      "loss": 0.4923,
+      "step": 24205
+    },
+    {
+      "epoch": 1.2290870784734298,
+      "grad_norm": 0.023183814231157947,
+      "learning_rate": 0.000388282955708997,
+      "loss": 0.4936,
+      "step": 24210
+    },
+    {
+      "epoch": 1.2293409145714613,
+      "grad_norm": 0.021976570825354085,
+      "learning_rate": 0.0003880670411027951,
+      "loss": 0.4798,
+      "step": 24215
+    },
+    {
+      "epoch": 1.2295947506694926,
+      "grad_norm": 0.02966175812308913,
+      "learning_rate": 0.0003878511484684747,
+      "loss": 0.4887,
+      "step": 24220
+    },
+    {
+      "epoch": 1.2298485867675242,
+      "grad_norm": 0.02807429193190882,
+      "learning_rate": 0.00038763527784841463,
+      "loss": 0.5163,
+      "step": 24225
+    },
+    {
+      "epoch": 1.2301024228655557,
+      "grad_norm": 0.023104952777379555,
+      "learning_rate": 0.00038741942928498913,
+      "loss": 0.5479,
+      "step": 24230
+    },
+    {
+      "epoch": 1.2303562589635872,
+      "grad_norm": 0.029930387788839295,
+      "learning_rate": 0.0003872036028205683,
+      "loss": 0.4894,
+      "step": 24235
+    },
+    {
+      "epoch": 1.2306100950616188,
+      "grad_norm": 0.024336392055221638,
+      "learning_rate": 0.00038698779849751766,
+      "loss": 0.4459,
+      "step": 24240
+    },
+    {
+      "epoch": 1.23086393115965,
+      "grad_norm": 0.022263929857281015,
+      "learning_rate": 0.0003867720163581983,
+      "loss": 0.4906,
+      "step": 24245
+    },
+    {
+      "epoch": 1.2311177672576816,
+      "grad_norm": 0.03133076104283715,
+      "learning_rate": 0.0003865562564449678,
+      "loss": 0.4989,
+      "step": 24250
+    },
+    {
+      "epoch": 1.2313716033557132,
+      "grad_norm": 0.03260231148423326,
+      "learning_rate": 0.0003863405188001783,
+      "loss": 0.5352,
+      "step": 24255
+    },
+    {
+      "epoch": 1.2316254394537447,
+      "grad_norm": 0.04700860380299025,
+      "learning_rate": 0.00038612480346617825,
+      "loss": 0.477,
+      "step": 24260
+    },
+    {
+      "epoch": 1.2318792755517762,
+      "grad_norm": 0.02740921040477875,
+      "learning_rate": 0.00038590911048531136,
+      "loss": 0.5073,
+      "step": 24265
+    },
+    {
+      "epoch": 1.2321331116498078,
+      "grad_norm": 0.028178131413544947,
+      "learning_rate": 0.00038569343989991705,
+      "loss": 0.5108,
+      "step": 24270
+    },
+    {
+      "epoch": 1.2323869477478393,
+      "grad_norm": 0.03757846827473028,
+      "learning_rate": 0.0003854777917523305,
+      "loss": 0.5107,
+      "step": 24275
+    },
+    {
+      "epoch": 1.2326407838458708,
+      "grad_norm": 0.023537367550062548,
+      "learning_rate": 0.00038526216608488227,
+      "loss": 0.5113,
+      "step": 24280
+    },
+    {
+      "epoch": 1.2328946199439021,
+      "grad_norm": 0.023626230913656128,
+      "learning_rate": 0.0003850465629398987,
+      "loss": 0.5004,
+      "step": 24285
+    },
+    {
+      "epoch": 1.2331484560419337,
+      "grad_norm": 0.0223229940793588,
+      "learning_rate": 0.00038483098235970147,
+      "loss": 0.474,
+      "step": 24290
+    },
+    {
+      "epoch": 1.2334022921399652,
+      "grad_norm": 0.024562899395510082,
+      "learning_rate": 0.00038461542438660815,
+      "loss": 0.5167,
+      "step": 24295
+    },
+    {
+      "epoch": 1.2336561282379968,
+      "grad_norm": 0.028102102596075113,
+      "learning_rate": 0.00038439988906293157,
+      "loss": 0.4756,
+      "step": 24300
+    },
+    {
+      "epoch": 1.2339099643360283,
+      "grad_norm": 0.025740036122312678,
+      "learning_rate": 0.00038418437643098006,
+      "loss": 0.4896,
+      "step": 24305
+    },
+    {
+      "epoch": 1.2341638004340598,
+      "grad_norm": 0.026595707267021335,
+      "learning_rate": 0.0003839688865330581,
+      "loss": 0.4887,
+      "step": 24310
+    },
+    {
+      "epoch": 1.2344176365320911,
+      "grad_norm": 0.024095548845127494,
+      "learning_rate": 0.00038375341941146505,
+      "loss": 0.4901,
+      "step": 24315
+    },
+    {
+      "epoch": 1.2346714726301227,
+      "grad_norm": 0.021876912693680587,
+      "learning_rate": 0.0003835379751084961,
+      "loss": 0.4839,
+      "step": 24320
+    },
+    {
+      "epoch": 1.2349253087281542,
+      "grad_norm": 0.02794133991499325,
+      "learning_rate": 0.00038332255366644175,
+      "loss": 0.4859,
+      "step": 24325
+    },
+    {
+      "epoch": 1.2351791448261857,
+      "grad_norm": 0.03176212918526051,
+      "learning_rate": 0.0003831071551275883,
+      "loss": 0.5001,
+      "step": 24330
+    },
+    {
+      "epoch": 1.2354329809242173,
+      "grad_norm": 0.02008532610763912,
+      "learning_rate": 0.0003828917795342173,
+      "loss": 0.474,
+      "step": 24335
+    },
+    {
+      "epoch": 1.2356868170222488,
+      "grad_norm": 0.02404321499467447,
+      "learning_rate": 0.000382676426928606,
+      "loss": 0.523,
+      "step": 24340
+    },
+    {
+      "epoch": 1.2359406531202803,
+      "grad_norm": 0.020304618971407788,
+      "learning_rate": 0.00038246109735302696,
+      "loss": 0.4927,
+      "step": 24345
+    },
+    {
+      "epoch": 1.2361944892183117,
+      "grad_norm": 0.021943643137327405,
+      "learning_rate": 0.0003822457908497484,
+      "loss": 0.5214,
+      "step": 24350
+    },
+    {
+      "epoch": 1.2364483253163432,
+      "grad_norm": 0.026610455716482826,
+      "learning_rate": 0.00038203050746103386,
+      "loss": 0.5162,
+      "step": 24355
+    },
+    {
+      "epoch": 1.2367021614143747,
+      "grad_norm": 0.028298758995324663,
+      "learning_rate": 0.00038181524722914235,
+      "loss": 0.509,
+      "step": 24360
+    },
+    {
+      "epoch": 1.2369559975124063,
+      "grad_norm": 0.037423844884409904,
+      "learning_rate": 0.0003816000101963282,
+      "loss": 0.5026,
+      "step": 24365
+    },
+    {
+      "epoch": 1.2372098336104378,
+      "grad_norm": 0.022982766278687995,
+      "learning_rate": 0.00038138479640484183,
+      "loss": 0.5009,
+      "step": 24370
+    },
+    {
+      "epoch": 1.2374636697084693,
+      "grad_norm": 0.024787564495660528,
+      "learning_rate": 0.00038116960589692844,
+      "loss": 0.4921,
+      "step": 24375
+    },
+    {
+      "epoch": 1.2377175058065006,
+      "grad_norm": 0.02088814143474573,
+      "learning_rate": 0.00038095443871482876,
+      "loss": 0.4885,
+      "step": 24380
+    },
+    {
+      "epoch": 1.2379713419045322,
+      "grad_norm": 0.021715700250562936,
+      "learning_rate": 0.0003807392949007791,
+      "loss": 0.4914,
+      "step": 24385
+    },
+    {
+      "epoch": 1.2382251780025637,
+      "grad_norm": 0.027943308695689863,
+      "learning_rate": 0.00038052417449701106,
+      "loss": 0.4809,
+      "step": 24390
+    },
+    {
+      "epoch": 1.2384790141005952,
+      "grad_norm": 0.024457173191587765,
+      "learning_rate": 0.00038030907754575173,
+      "loss": 0.4905,
+      "step": 24395
+    },
+    {
+      "epoch": 1.2387328501986268,
+      "grad_norm": 0.02298993158514161,
+      "learning_rate": 0.0003800940040892236,
+      "loss": 0.5076,
+      "step": 24400
+    },
+    {
+      "epoch": 1.2389866862966583,
+      "grad_norm": 0.02407909122160034,
+      "learning_rate": 0.00037987895416964455,
+      "loss": 0.505,
+      "step": 24405
+    },
+    {
+      "epoch": 1.2392405223946898,
+      "grad_norm": 0.023168723950299244,
+      "learning_rate": 0.0003796639278292277,
+      "loss": 0.4801,
+      "step": 24410
+    },
+    {
+      "epoch": 1.2394943584927212,
+      "grad_norm": 0.022695039167580793,
+      "learning_rate": 0.0003794489251101817,
+      "loss": 0.4998,
+      "step": 24415
+    },
+    {
+      "epoch": 1.2397481945907527,
+      "grad_norm": 0.025740573541342248,
+      "learning_rate": 0.00037923394605471057,
+      "loss": 0.4891,
+      "step": 24420
+    },
+    {
+      "epoch": 1.2400020306887842,
+      "grad_norm": 0.022190743593717716,
+      "learning_rate": 0.00037901899070501337,
+      "loss": 0.5166,
+      "step": 24425
+    },
+    {
+      "epoch": 1.2402558667868158,
+      "grad_norm": 0.02273570131152975,
+      "learning_rate": 0.00037880405910328515,
+      "loss": 0.4837,
+      "step": 24430
+    },
+    {
+      "epoch": 1.2405097028848473,
+      "grad_norm": 0.02247267893276436,
+      "learning_rate": 0.0003785891512917157,
+      "loss": 0.503,
+      "step": 24435
+    },
+    {
+      "epoch": 1.2407635389828788,
+      "grad_norm": 0.027397988839106935,
+      "learning_rate": 0.00037837426731249035,
+      "loss": 0.5138,
+      "step": 24440
+    },
+    {
+      "epoch": 1.2410173750809101,
+      "grad_norm": 0.0360747255061236,
+      "learning_rate": 0.0003781594072077899,
+      "loss": 0.5247,
+      "step": 24445
+    },
+    {
+      "epoch": 1.2412712111789417,
+      "grad_norm": 0.020839949347156835,
+      "learning_rate": 0.00037794457101979,
+      "loss": 0.4719,
+      "step": 24450
+    },
+    {
+      "epoch": 1.2415250472769732,
+      "grad_norm": 0.023805826469806165,
+      "learning_rate": 0.00037772975879066224,
+      "loss": 0.528,
+      "step": 24455
+    },
+    {
+      "epoch": 1.2417788833750047,
+      "grad_norm": 0.02229363835470009,
+      "learning_rate": 0.00037751497056257304,
+      "loss": 0.5116,
+      "step": 24460
+    },
+    {
+      "epoch": 1.2420327194730363,
+      "grad_norm": 0.02478864260181994,
+      "learning_rate": 0.0003773002063776843,
+      "loss": 0.4765,
+      "step": 24465
+    },
+    {
+      "epoch": 1.2422865555710678,
+      "grad_norm": 0.026718793626477343,
+      "learning_rate": 0.00037708546627815317,
+      "loss": 0.4911,
+      "step": 24470
+    },
+    {
+      "epoch": 1.2425403916690994,
+      "grad_norm": 0.022347714227584022,
+      "learning_rate": 0.000376870750306132,
+      "loss": 0.5098,
+      "step": 24475
+    },
+    {
+      "epoch": 1.2427942277671309,
+      "grad_norm": 0.02195899584253101,
+      "learning_rate": 0.0003766560585037685,
+      "loss": 0.4948,
+      "step": 24480
+    },
+    {
+      "epoch": 1.2430480638651622,
+      "grad_norm": 0.024116784723832947,
+      "learning_rate": 0.0003764413909132054,
+      "loss": 0.4916,
+      "step": 24485
+    },
+    {
+      "epoch": 1.2433018999631937,
+      "grad_norm": 0.025348617292129703,
+      "learning_rate": 0.00037622674757658127,
+      "loss": 0.4983,
+      "step": 24490
+    },
+    {
+      "epoch": 1.2435557360612253,
+      "grad_norm": 0.028723847683448087,
+      "learning_rate": 0.0003760121285360293,
+      "loss": 0.4974,
+      "step": 24495
+    },
+    {
+      "epoch": 1.2438095721592568,
+      "grad_norm": 0.02549204124081865,
+      "learning_rate": 0.00037579753383367825,
+      "loss": 0.4797,
+      "step": 24500
+    },
+    {
+      "epoch": 1.2440634082572883,
+      "grad_norm": 0.02572550068222192,
+      "learning_rate": 0.0003755829635116519,
+      "loss": 0.4911,
+      "step": 24505
+    },
+    {
+      "epoch": 1.2443172443553197,
+      "grad_norm": 0.02151255214938373,
+      "learning_rate": 0.0003753684176120693,
+      "loss": 0.4818,
+      "step": 24510
+    },
+    {
+      "epoch": 1.2445710804533512,
+      "grad_norm": 0.03485262446331745,
+      "learning_rate": 0.0003751538961770448,
+      "loss": 0.4964,
+      "step": 24515
+    },
+    {
+      "epoch": 1.2448249165513827,
+      "grad_norm": 0.020421404595873636,
+      "learning_rate": 0.0003749393992486879,
+      "loss": 0.4928,
+      "step": 24520
+    },
+    {
+      "epoch": 1.2450787526494143,
+      "grad_norm": 0.02213502791269541,
+      "learning_rate": 0.0003747249268691033,
+      "loss": 0.5011,
+      "step": 24525
+    },
+    {
+      "epoch": 1.2453325887474458,
+      "grad_norm": 0.039753961527774426,
+      "learning_rate": 0.0003745104790803907,
+      "loss": 0.5099,
+      "step": 24530
+    },
+    {
+      "epoch": 1.2455864248454773,
+      "grad_norm": 0.020760234136978934,
+      "learning_rate": 0.0003742960559246453,
+      "loss": 0.471,
+      "step": 24535
+    },
+    {
+      "epoch": 1.2458402609435089,
+      "grad_norm": 0.027835500455752622,
+      "learning_rate": 0.0003740816574439572,
+      "loss": 0.4985,
+      "step": 24540
+    },
+    {
+      "epoch": 1.2460940970415404,
+      "grad_norm": 0.024653367801313206,
+      "learning_rate": 0.00037386728368041185,
+      "loss": 0.5224,
+      "step": 24545
+    },
+    {
+      "epoch": 1.2463479331395717,
+      "grad_norm": 0.025495982916413043,
+      "learning_rate": 0.00037365293467608954,
+      "loss": 0.5011,
+      "step": 24550
+    },
+    {
+      "epoch": 1.2466017692376032,
+      "grad_norm": 0.04123159279622747,
+      "learning_rate": 0.00037343861047306617,
+      "loss": 0.4972,
+      "step": 24555
+    },
+    {
+      "epoch": 1.2468556053356348,
+      "grad_norm": 0.021840485245885177,
+      "learning_rate": 0.00037322431111341245,
+      "loss": 0.484,
+      "step": 24560
+    },
+    {
+      "epoch": 1.2471094414336663,
+      "grad_norm": 0.02574861374734803,
+      "learning_rate": 0.0003730100366391942,
+      "loss": 0.4747,
+      "step": 24565
+    },
+    {
+      "epoch": 1.2473632775316978,
+      "grad_norm": 0.029726658779212437,
+      "learning_rate": 0.0003727957870924724,
+      "loss": 0.468,
+      "step": 24570
+    },
+    {
+      "epoch": 1.2476171136297294,
+      "grad_norm": 0.023386237965567654,
+      "learning_rate": 0.0003725815625153033,
+      "loss": 0.461,
+      "step": 24575
+    },
+    {
+      "epoch": 1.2478709497277607,
+      "grad_norm": 0.02511675497455319,
+      "learning_rate": 0.00037236736294973805,
+      "loss": 0.4921,
+      "step": 24580
+    },
+    {
+      "epoch": 1.2481247858257922,
+      "grad_norm": 0.022973041358262655,
+      "learning_rate": 0.00037215318843782287,
+      "loss": 0.4862,
+      "step": 24585
+    },
+    {
+      "epoch": 1.2483786219238238,
+      "grad_norm": 0.025350959019371976,
+      "learning_rate": 0.0003719390390215993,
+      "loss": 0.4988,
+      "step": 24590
+    },
+    {
+      "epoch": 1.2486324580218553,
+      "grad_norm": 0.022332478902111395,
+      "learning_rate": 0.0003717249147431037,
+      "loss": 0.4785,
+      "step": 24595
+    },
+    {
+      "epoch": 1.2488862941198868,
+      "grad_norm": 0.02605623024379265,
+      "learning_rate": 0.0003715108156443676,
+      "loss": 0.5072,
+      "step": 24600
+    },
+    {
+      "epoch": 1.2491401302179184,
+      "grad_norm": 0.023303658340822323,
+      "learning_rate": 0.0003712967417674177,
+      "loss": 0.5011,
+      "step": 24605
+    },
+    {
+      "epoch": 1.24939396631595,
+      "grad_norm": 0.02406384974114823,
+      "learning_rate": 0.0003710826931542753,
+      "loss": 0.512,
+      "step": 24610
+    },
+    {
+      "epoch": 1.2496478024139812,
+      "grad_norm": 0.02759010527581803,
+      "learning_rate": 0.0003708686698469575,
+      "loss": 0.4894,
+      "step": 24615
+    },
+    {
+      "epoch": 1.2499016385120127,
+      "grad_norm": 0.03146887342339219,
+      "learning_rate": 0.00037065467188747593,
+      "loss": 0.4801,
+      "step": 24620
+    },
+    {
+      "epoch": 1.2501554746100443,
+      "grad_norm": 0.030508984982450908,
+      "learning_rate": 0.0003704406993178371,
+      "loss": 0.4932,
+      "step": 24625
+    },
+    {
+      "epoch": 1.2504093107080758,
+      "grad_norm": 0.02545476005935238,
+      "learning_rate": 0.000370226752180043,
+      "loss": 0.5114,
+      "step": 24630
+    },
+    {
+      "epoch": 1.2506631468061074,
+      "grad_norm": 0.02230948334682719,
+      "learning_rate": 0.0003700128305160901,
+      "loss": 0.5132,
+      "step": 24635
+    },
+    {
+      "epoch": 1.2509169829041387,
+      "grad_norm": 0.02206263568624184,
+      "learning_rate": 0.00036979893436797054,
+      "loss": 0.5015,
+      "step": 24640
+    },
+    {
+      "epoch": 1.2511708190021702,
+      "grad_norm": 0.02398238667525352,
+      "learning_rate": 0.0003695850637776707,
+      "loss": 0.4873,
+      "step": 24645
+    },
+    {
+      "epoch": 1.2514246551002017,
+      "grad_norm": 0.02334469006495267,
+      "learning_rate": 0.0003693712187871725,
+      "loss": 0.4907,
+      "step": 24650
+    },
+    {
+      "epoch": 1.2516784911982333,
+      "grad_norm": 0.02105351477888063,
+      "learning_rate": 0.0003691573994384526,
+      "loss": 0.5158,
+      "step": 24655
+    },
+    {
+      "epoch": 1.2519323272962648,
+      "grad_norm": 0.023611719484646673,
+      "learning_rate": 0.00036894360577348275,
+      "loss": 0.4912,
+      "step": 24660
+    },
+    {
+      "epoch": 1.2521861633942963,
+      "grad_norm": 0.026957081159899983,
+      "learning_rate": 0.00036872983783422944,
+      "loss": 0.5186,
+      "step": 24665
+    },
+    {
+      "epoch": 1.2524399994923279,
+      "grad_norm": 0.022112456631055228,
+      "learning_rate": 0.0003685160956626542,
+      "loss": 0.4708,
+      "step": 24670
+    },
+    {
+      "epoch": 1.2526938355903594,
+      "grad_norm": 0.03960707777333683,
+      "learning_rate": 0.0003683023793007138,
+      "loss": 0.4818,
+      "step": 24675
+    },
+    {
+      "epoch": 1.252947671688391,
+      "grad_norm": 0.024900205832922344,
+      "learning_rate": 0.0003680886887903596,
+      "loss": 0.4882,
+      "step": 24680
+    },
+    {
+      "epoch": 1.2532015077864223,
+      "grad_norm": 0.02600602146903172,
+      "learning_rate": 0.0003678750241735379,
+      "loss": 0.482,
+      "step": 24685
+    },
+    {
+      "epoch": 1.2534553438844538,
+      "grad_norm": 0.020762831712885167,
+      "learning_rate": 0.00036766138549219007,
+      "loss": 0.4721,
+      "step": 24690
+    },
+    {
+      "epoch": 1.2537091799824853,
+      "grad_norm": 0.02365987982582443,
+      "learning_rate": 0.00036744777278825225,
+      "loss": 0.4996,
+      "step": 24695
+    },
+    {
+      "epoch": 1.2539630160805169,
+      "grad_norm": 0.021839292335870203,
+      "learning_rate": 0.0003672341861036557,
+      "loss": 0.5116,
+      "step": 24700
+    },
+    {
+      "epoch": 1.2542168521785484,
+      "grad_norm": 0.02473930369877982,
+      "learning_rate": 0.00036702062548032624,
+      "loss": 0.4747,
+      "step": 24705
+    },
+    {
+      "epoch": 1.2544706882765797,
+      "grad_norm": 0.021843985621647143,
+      "learning_rate": 0.00036680709096018483,
+      "loss": 0.4981,
+      "step": 24710
+    },
+    {
+      "epoch": 1.2547245243746112,
+      "grad_norm": 0.024828258327636316,
+      "learning_rate": 0.0003665935825851473,
+      "loss": 0.5257,
+      "step": 24715
+    },
+    {
+      "epoch": 1.2549783604726428,
+      "grad_norm": 0.02156185794599099,
+      "learning_rate": 0.0003663801003971241,
+      "loss": 0.505,
+      "step": 24720
+    },
+    {
+      "epoch": 1.2552321965706743,
+      "grad_norm": 0.02121222455282886,
+      "learning_rate": 0.0003661666444380209,
+      "loss": 0.4864,
+      "step": 24725
+    },
+    {
+      "epoch": 1.2554860326687058,
+      "grad_norm": 0.028321758337109544,
+      "learning_rate": 0.00036595321474973777,
+      "loss": 0.4814,
+      "step": 24730
+    },
+    {
+      "epoch": 1.2557398687667374,
+      "grad_norm": 0.02973621886010056,
+      "learning_rate": 0.0003657398113741703,
+      "loss": 0.4777,
+      "step": 24735
+    },
+    {
+      "epoch": 1.255993704864769,
+      "grad_norm": 0.03303744106278796,
+      "learning_rate": 0.0003655264343532083,
+      "loss": 0.4917,
+      "step": 24740
+    },
+    {
+      "epoch": 1.2562475409628004,
+      "grad_norm": 0.024275455013697354,
+      "learning_rate": 0.0003653130837287366,
+      "loss": 0.4928,
+      "step": 24745
+    },
+    {
+      "epoch": 1.2565013770608318,
+      "grad_norm": 0.021864363759526228,
+      "learning_rate": 0.00036509975954263486,
+      "loss": 0.5008,
+      "step": 24750
+    },
+    {
+      "epoch": 1.2567552131588633,
+      "grad_norm": 0.023011380155086403,
+      "learning_rate": 0.00036488646183677767,
+      "loss": 0.4985,
+      "step": 24755
+    },
+    {
+      "epoch": 1.2570090492568948,
+      "grad_norm": 0.020669161817962644,
+      "learning_rate": 0.00036467319065303414,
+      "loss": 0.5056,
+      "step": 24760
+    },
+    {
+      "epoch": 1.2572628853549264,
+      "grad_norm": 0.02725897387201023,
+      "learning_rate": 0.00036445994603326835,
+      "loss": 0.5112,
+      "step": 24765
+    },
+    {
+      "epoch": 1.257516721452958,
+      "grad_norm": 0.024508242607115118,
+      "learning_rate": 0.00036424672801933946,
+      "loss": 0.5077,
+      "step": 24770
+    },
+    {
+      "epoch": 1.2577705575509892,
+      "grad_norm": 0.025773782621984185,
+      "learning_rate": 0.0003640335366531007,
+      "loss": 0.4975,
+      "step": 24775
+    },
+    {
+      "epoch": 1.2580243936490207,
+      "grad_norm": 0.023553740478245766,
+      "learning_rate": 0.00036382037197640063,
+      "loss": 0.4949,
+      "step": 24780
+    },
+    {
+      "epoch": 1.2582782297470523,
+      "grad_norm": 0.022708195915233667,
+      "learning_rate": 0.00036360723403108233,
+      "loss": 0.5042,
+      "step": 24785
+    },
+    {
+      "epoch": 1.2585320658450838,
+      "grad_norm": 0.021576770915417216,
+      "learning_rate": 0.00036339412285898363,
+      "loss": 0.4956,
+      "step": 24790
+    },
+    {
+      "epoch": 1.2587859019431153,
+      "grad_norm": 0.024661142091304572,
+      "learning_rate": 0.0003631810385019376,
+      "loss": 0.5243,
+      "step": 24795
+    },
+    {
+      "epoch": 1.2590397380411469,
+      "grad_norm": 0.023783896612178973,
+      "learning_rate": 0.0003629679810017714,
+      "loss": 0.5104,
+      "step": 24800
+    },
+    {
+      "epoch": 1.2592935741391784,
+      "grad_norm": 0.028819292594759488,
+      "learning_rate": 0.0003627549504003072,
+      "loss": 0.4754,
+      "step": 24805
+    },
+    {
+      "epoch": 1.25954741023721,
+      "grad_norm": 0.02512338445035506,
+      "learning_rate": 0.00036254194673936174,
+      "loss": 0.4788,
+      "step": 24810
+    },
+    {
+      "epoch": 1.2598012463352413,
+      "grad_norm": 0.02170026035535352,
+      "learning_rate": 0.0003623289700607466,
+      "loss": 0.5096,
+      "step": 24815
+    },
+    {
+      "epoch": 1.2600550824332728,
+      "grad_norm": 0.026629983889606926,
+      "learning_rate": 0.00036211602040626815,
+      "loss": 0.4805,
+      "step": 24820
+    },
+    {
+      "epoch": 1.2603089185313043,
+      "grad_norm": 0.03586703668787359,
+      "learning_rate": 0.00036190309781772723,
+      "loss": 0.4917,
+      "step": 24825
+    },
+    {
+      "epoch": 1.2605627546293359,
+      "grad_norm": 0.021805993713398496,
+      "learning_rate": 0.00036169020233691953,
+      "loss": 0.4935,
+      "step": 24830
+    },
+    {
+      "epoch": 1.2608165907273674,
+      "grad_norm": 0.02066388286099757,
+      "learning_rate": 0.0003614773340056353,
+      "loss": 0.4849,
+      "step": 24835
+    },
+    {
+      "epoch": 1.2610704268253987,
+      "grad_norm": 0.02456481899175869,
+      "learning_rate": 0.00036126449286565966,
+      "loss": 0.4749,
+      "step": 24840
+    },
+    {
+      "epoch": 1.2613242629234303,
+      "grad_norm": 0.022104636358690293,
+      "learning_rate": 0.0003610516789587722,
+      "loss": 0.5236,
+      "step": 24845
+    },
+    {
+      "epoch": 1.2615780990214618,
+      "grad_norm": 0.023314937158047715,
+      "learning_rate": 0.000360838892326747,
+      "loss": 0.478,
+      "step": 24850
+    },
+    {
+      "epoch": 1.2618319351194933,
+      "grad_norm": 0.02278136876188265,
+      "learning_rate": 0.00036062613301135357,
+      "loss": 0.5083,
+      "step": 24855
+    },
+    {
+      "epoch": 1.2620857712175249,
+      "grad_norm": 0.022721574613654438,
+      "learning_rate": 0.00036041340105435506,
+      "loss": 0.5178,
+      "step": 24860
+    },
+    {
+      "epoch": 1.2623396073155564,
+      "grad_norm": 0.021038721362959267,
+      "learning_rate": 0.00036020069649750976,
+      "loss": 0.4987,
+      "step": 24865
+    },
+    {
+      "epoch": 1.262593443413588,
+      "grad_norm": 0.024677584467370603,
+      "learning_rate": 0.00035998801938257063,
+      "loss": 0.4939,
+      "step": 24870
+    },
+    {
+      "epoch": 1.2628472795116195,
+      "grad_norm": 0.021066193323710593,
+      "learning_rate": 0.000359775369751285,
+      "loss": 0.4553,
+      "step": 24875
+    },
+    {
+      "epoch": 1.2631011156096508,
+      "grad_norm": 0.020772474815759088,
+      "learning_rate": 0.00035956274764539504,
+      "loss": 0.4793,
+      "step": 24880
+    },
+    {
+      "epoch": 1.2633549517076823,
+      "grad_norm": 0.02074020180323779,
+      "learning_rate": 0.0003593501531066373,
+      "loss": 0.4897,
+      "step": 24885
+    },
+    {
+      "epoch": 1.2636087878057138,
+      "grad_norm": 0.022634506709179467,
+      "learning_rate": 0.00035913758617674315,
+      "loss": 0.4656,
+      "step": 24890
+    },
+    {
+      "epoch": 1.2638626239037454,
+      "grad_norm": 0.02442246115751448,
+      "learning_rate": 0.0003589250468974383,
+      "loss": 0.4923,
+      "step": 24895
+    },
+    {
+      "epoch": 1.264116460001777,
+      "grad_norm": 0.02182147534232203,
+      "learning_rate": 0.00035871253531044323,
+      "loss": 0.4827,
+      "step": 24900
+    },
+    {
+      "epoch": 1.2643702960998082,
+      "grad_norm": 0.03540133910293839,
+      "learning_rate": 0.00035850005145747287,
+      "loss": 0.4997,
+      "step": 24905
+    },
+    {
+      "epoch": 1.2646241321978398,
+      "grad_norm": 0.031068992552075844,
+      "learning_rate": 0.00035828759538023653,
+      "loss": 0.4974,
+      "step": 24910
+    },
+    {
+      "epoch": 1.2648779682958713,
+      "grad_norm": 0.024541408824232794,
+      "learning_rate": 0.00035807516712043876,
+      "loss": 0.521,
+      "step": 24915
+    },
+    {
+      "epoch": 1.2651318043939028,
+      "grad_norm": 0.027145012369650882,
+      "learning_rate": 0.00035786276671977786,
+      "loss": 0.4929,
+      "step": 24920
+    },
+    {
+      "epoch": 1.2653856404919344,
+      "grad_norm": 0.03763414082526663,
+      "learning_rate": 0.000357650394219947,
+      "loss": 0.4915,
+      "step": 24925
+    },
+    {
+      "epoch": 1.265639476589966,
+      "grad_norm": 0.04014144903363009,
+      "learning_rate": 0.0003574380496626339,
+      "loss": 0.4897,
+      "step": 24930
+    },
+    {
+      "epoch": 1.2658933126879974,
+      "grad_norm": 0.029764740803608244,
+      "learning_rate": 0.00035722573308952064,
+      "loss": 0.4696,
+      "step": 24935
+    },
+    {
+      "epoch": 1.266147148786029,
+      "grad_norm": 0.030962115180249043,
+      "learning_rate": 0.000357013444542284,
+      "loss": 0.5014,
+      "step": 24940
+    },
+    {
+      "epoch": 1.2664009848840605,
+      "grad_norm": 0.027834184566246624,
+      "learning_rate": 0.00035680118406259515,
+      "loss": 0.4928,
+      "step": 24945
+    },
+    {
+      "epoch": 1.2666548209820918,
+      "grad_norm": 0.021043989423317384,
+      "learning_rate": 0.00035658895169211966,
+      "loss": 0.4762,
+      "step": 24950
+    },
+    {
+      "epoch": 1.2669086570801233,
+      "grad_norm": 0.02449083248247568,
+      "learning_rate": 0.00035637674747251785,
+      "loss": 0.491,
+      "step": 24955
+    },
+    {
+      "epoch": 1.2671624931781549,
+      "grad_norm": 0.02271799388618845,
+      "learning_rate": 0.00035616457144544425,
+      "loss": 0.4848,
+      "step": 24960
+    },
+    {
+      "epoch": 1.2674163292761864,
+      "grad_norm": 0.02138761722620517,
+      "learning_rate": 0.0003559524236525479,
+      "loss": 0.478,
+      "step": 24965
+    },
+    {
+      "epoch": 1.267670165374218,
+      "grad_norm": 0.02379623979828549,
+      "learning_rate": 0.0003557403041354724,
+      "loss": 0.4819,
+      "step": 24970
+    },
+    {
+      "epoch": 1.2679240014722493,
+      "grad_norm": 0.02449724126117815,
+      "learning_rate": 0.0003555282129358558,
+      "loss": 0.4804,
+      "step": 24975
+    },
+    {
+      "epoch": 1.2681778375702808,
+      "grad_norm": 0.021883903501979027,
+      "learning_rate": 0.0003553161500953306,
+      "loss": 0.4858,
+      "step": 24980
+    },
+    {
+      "epoch": 1.2684316736683123,
+      "grad_norm": 0.0209418839992429,
+      "learning_rate": 0.0003551041156555236,
+      "loss": 0.4907,
+      "step": 24985
+    },
+    {
+      "epoch": 1.2686855097663439,
+      "grad_norm": 0.023379422469475157,
+      "learning_rate": 0.000354892109658056,
+      "loss": 0.4686,
+      "step": 24990
+    },
+    {
+      "epoch": 1.2689393458643754,
+      "grad_norm": 0.03050915873944768,
+      "learning_rate": 0.00035468013214454375,
+      "loss": 0.4872,
+      "step": 24995
+    },
+    {
+      "epoch": 1.269193181962407,
+      "grad_norm": 0.019469878265224637,
+      "learning_rate": 0.0003544681831565968,
+      "loss": 0.467,
+      "step": 25000
+    },
+    {
+      "epoch": 1.2694470180604385,
+      "grad_norm": 0.02706167777407716,
+      "learning_rate": 0.0003542562627358197,
+      "loss": 0.5358,
+      "step": 25005
+    },
+    {
+      "epoch": 1.26970085415847,
+      "grad_norm": 0.02607408126692583,
+      "learning_rate": 0.0003540443709238114,
+      "loss": 0.484,
+      "step": 25010
+    },
+    {
+      "epoch": 1.2699546902565013,
+      "grad_norm": 0.026898207511883893,
+      "learning_rate": 0.00035383250776216526,
+      "loss": 0.4652,
+      "step": 25015
+    },
+    {
+      "epoch": 1.2702085263545329,
+      "grad_norm": 0.02822647163163459,
+      "learning_rate": 0.00035362067329246884,
+      "loss": 0.5043,
+      "step": 25020
+    },
+    {
+      "epoch": 1.2704623624525644,
+      "grad_norm": 0.024912999653158442,
+      "learning_rate": 0.0003534088675563043,
+      "loss": 0.4968,
+      "step": 25025
+    },
+    {
+      "epoch": 1.270716198550596,
+      "grad_norm": 0.04104540138730313,
+      "learning_rate": 0.0003531970905952478,
+      "loss": 0.48,
+      "step": 25030
+    },
+    {
+      "epoch": 1.2709700346486275,
+      "grad_norm": 0.04967344080591417,
+      "learning_rate": 0.00035298534245087055,
+      "loss": 0.4968,
+      "step": 25035
+    },
+    {
+      "epoch": 1.2712238707466588,
+      "grad_norm": 0.032041261058398655,
+      "learning_rate": 0.0003527736231647374,
+      "loss": 0.4833,
+      "step": 25040
+    },
+    {
+      "epoch": 1.2714777068446903,
+      "grad_norm": 0.026420537505594786,
+      "learning_rate": 0.0003525619327784078,
+      "loss": 0.5085,
+      "step": 25045
+    },
+    {
+      "epoch": 1.2717315429427218,
+      "grad_norm": 0.02728321137774862,
+      "learning_rate": 0.00035235027133343546,
+      "loss": 0.4869,
+      "step": 25050
+    },
+    {
+      "epoch": 1.2719853790407534,
+      "grad_norm": 0.022824173369318995,
+      "learning_rate": 0.0003521386388713686,
+      "loss": 0.5129,
+      "step": 25055
+    },
+    {
+      "epoch": 1.272239215138785,
+      "grad_norm": 0.027880785233754136,
+      "learning_rate": 0.0003519270354337495,
+      "loss": 0.48,
+      "step": 25060
+    },
+    {
+      "epoch": 1.2724930512368164,
+      "grad_norm": 0.02194127139960059,
+      "learning_rate": 0.0003517154610621149,
+      "loss": 0.4868,
+      "step": 25065
+    },
+    {
+      "epoch": 1.272746887334848,
+      "grad_norm": 0.023243161860758137,
+      "learning_rate": 0.0003515039157979959,
+      "loss": 0.4901,
+      "step": 25070
+    },
+    {
+      "epoch": 1.2730007234328795,
+      "grad_norm": 0.0375654084513337,
+      "learning_rate": 0.0003512923996829176,
+      "loss": 0.4824,
+      "step": 25075
+    },
+    {
+      "epoch": 1.2732545595309108,
+      "grad_norm": 0.02148030589323779,
+      "learning_rate": 0.0003510809127583997,
+      "loss": 0.4985,
+      "step": 25080
+    },
+    {
+      "epoch": 1.2735083956289424,
+      "grad_norm": 0.02366581081516137,
+      "learning_rate": 0.0003508694550659559,
+      "loss": 0.4889,
+      "step": 25085
+    },
+    {
+      "epoch": 1.273762231726974,
+      "grad_norm": 0.026078232012127944,
+      "learning_rate": 0.00035065802664709426,
+      "loss": 0.5308,
+      "step": 25090
+    },
+    {
+      "epoch": 1.2740160678250054,
+      "grad_norm": 0.025296448194318342,
+      "learning_rate": 0.00035044662754331736,
+      "loss": 0.4917,
+      "step": 25095
+    },
+    {
+      "epoch": 1.274269903923037,
+      "grad_norm": 0.032298444356296446,
+      "learning_rate": 0.00035023525779612165,
+      "loss": 0.4935,
+      "step": 25100
+    },
+    {
+      "epoch": 1.2745237400210683,
+      "grad_norm": 0.02559527128726925,
+      "learning_rate": 0.0003500239174469979,
+      "loss": 0.502,
+      "step": 25105
+    },
+    {
+      "epoch": 1.2747775761190998,
+      "grad_norm": 0.024979128795998007,
+      "learning_rate": 0.0003498126065374313,
+      "loss": 0.4913,
+      "step": 25110
+    },
+    {
+      "epoch": 1.2750314122171313,
+      "grad_norm": 0.025909934373053247,
+      "learning_rate": 0.00034960132510890096,
+      "loss": 0.4648,
+      "step": 25115
+    },
+    {
+      "epoch": 1.2752852483151629,
+      "grad_norm": 0.02298436546813651,
+      "learning_rate": 0.0003493900732028806,
+      "loss": 0.5013,
+      "step": 25120
+    },
+    {
+      "epoch": 1.2755390844131944,
+      "grad_norm": 0.02056212672636294,
+      "learning_rate": 0.0003491788508608377,
+      "loss": 0.4907,
+      "step": 25125
+    },
+    {
+      "epoch": 1.275792920511226,
+      "grad_norm": 0.021393425271120866,
+      "learning_rate": 0.00034896765812423425,
+      "loss": 0.4743,
+      "step": 25130
+    },
+    {
+      "epoch": 1.2760467566092575,
+      "grad_norm": 0.02799046607865763,
+      "learning_rate": 0.00034875649503452626,
+      "loss": 0.5046,
+      "step": 25135
+    },
+    {
+      "epoch": 1.276300592707289,
+      "grad_norm": 0.021557706427491538,
+      "learning_rate": 0.0003485453616331641,
+      "loss": 0.445,
+      "step": 25140
+    },
+    {
+      "epoch": 1.2765544288053203,
+      "grad_norm": 0.035754348141408966,
+      "learning_rate": 0.00034833425796159214,
+      "loss": 0.4767,
+      "step": 25145
+    },
+    {
+      "epoch": 1.2768082649033519,
+      "grad_norm": 0.022260297154022193,
+      "learning_rate": 0.00034812318406124876,
+      "loss": 0.4998,
+      "step": 25150
+    },
+    {
+      "epoch": 1.2770621010013834,
+      "grad_norm": 0.020517704282148764,
+      "learning_rate": 0.0003479121399735672,
+      "loss": 0.4705,
+      "step": 25155
+    },
+    {
+      "epoch": 1.277315937099415,
+      "grad_norm": 0.023421817797892777,
+      "learning_rate": 0.00034770112573997405,
+      "loss": 0.486,
+      "step": 25160
+    },
+    {
+      "epoch": 1.2775697731974465,
+      "grad_norm": 0.03614632807366431,
+      "learning_rate": 0.0003474901414018904,
+      "loss": 0.5111,
+      "step": 25165
+    },
+    {
+      "epoch": 1.2778236092954778,
+      "grad_norm": 0.05816272517086025,
+      "learning_rate": 0.00034727918700073145,
+      "loss": 0.4814,
+      "step": 25170
+    },
+    {
+      "epoch": 1.2780774453935093,
+      "grad_norm": 0.025050111858038723,
+      "learning_rate": 0.0003470682625779065,
+      "loss": 0.5197,
+      "step": 25175
+    },
+    {
+      "epoch": 1.2783312814915408,
+      "grad_norm": 0.03165688938324348,
+      "learning_rate": 0.0003468573681748188,
+      "loss": 0.4827,
+      "step": 25180
+    },
+    {
+      "epoch": 1.2785851175895724,
+      "grad_norm": 0.019624471813982844,
+      "learning_rate": 0.00034664650383286615,
+      "loss": 0.4922,
+      "step": 25185
+    },
+    {
+      "epoch": 1.278838953687604,
+      "grad_norm": 0.021835974079306434,
+      "learning_rate": 0.00034643566959343997,
+      "loss": 0.5077,
+      "step": 25190
+    },
+    {
+      "epoch": 1.2790927897856355,
+      "grad_norm": 0.021070204790334,
+      "learning_rate": 0.0003462248654979261,
+      "loss": 0.4665,
+      "step": 25195
+    },
+    {
+      "epoch": 1.279346625883667,
+      "grad_norm": 0.04029683650164615,
+      "learning_rate": 0.0003460140915877041,
+      "loss": 0.4625,
+      "step": 25200
+    },
+    {
+      "epoch": 1.2796004619816985,
+      "grad_norm": 0.02203877571856326,
+      "learning_rate": 0.00034580334790414814,
+      "loss": 0.4589,
+      "step": 25205
+    },
+    {
+      "epoch": 1.2798542980797298,
+      "grad_norm": 0.03135309516087329,
+      "learning_rate": 0.0003455926344886259,
+      "loss": 0.4735,
+      "step": 25210
+    },
+    {
+      "epoch": 1.2801081341777614,
+      "grad_norm": 0.02096327734711526,
+      "learning_rate": 0.0003453819513824995,
+      "loss": 0.4874,
+      "step": 25215
+    },
+    {
+      "epoch": 1.280361970275793,
+      "grad_norm": 0.022733042961183666,
+      "learning_rate": 0.00034517129862712506,
+      "loss": 0.4795,
+      "step": 25220
+    },
+    {
+      "epoch": 1.2806158063738244,
+      "grad_norm": 0.02232352087148914,
+      "learning_rate": 0.00034496067626385254,
+      "loss": 0.4843,
+      "step": 25225
+    },
+    {
+      "epoch": 1.280869642471856,
+      "grad_norm": 0.021657090158753636,
+      "learning_rate": 0.000344750084334026,
+      "loss": 0.4864,
+      "step": 25230
+    },
+    {
+      "epoch": 1.2811234785698873,
+      "grad_norm": 0.02829516889946878,
+      "learning_rate": 0.00034453952287898375,
+      "loss": 0.4628,
+      "step": 25235
+    },
+    {
+      "epoch": 1.2813773146679188,
+      "grad_norm": 0.02189023360857141,
+      "learning_rate": 0.0003443289919400579,
+      "loss": 0.4704,
+      "step": 25240
+    },
+    {
+      "epoch": 1.2816311507659504,
+      "grad_norm": 0.02205365383036312,
+      "learning_rate": 0.0003441184915585746,
+      "loss": 0.4757,
+      "step": 25245
+    },
+    {
+      "epoch": 1.281884986863982,
+      "grad_norm": 0.03272625160720233,
+      "learning_rate": 0.000343908021775854,
+      "loss": 0.462,
+      "step": 25250
+    },
+    {
+      "epoch": 1.2821388229620134,
+      "grad_norm": 0.02110097107472652,
+      "learning_rate": 0.00034369758263321025,
+      "loss": 0.4615,
+      "step": 25255
+    },
+    {
+      "epoch": 1.282392659060045,
+      "grad_norm": 0.02123030495394618,
+      "learning_rate": 0.0003434871741719516,
+      "loss": 0.4795,
+      "step": 25260
+    },
+    {
+      "epoch": 1.2826464951580765,
+      "grad_norm": 0.02373251352589659,
+      "learning_rate": 0.0003432767964333802,
+      "loss": 0.4727,
+      "step": 25265
+    },
+    {
+      "epoch": 1.282900331256108,
+      "grad_norm": 0.03378365804202472,
+      "learning_rate": 0.00034306644945879174,
+      "loss": 0.4789,
+      "step": 25270
+    },
+    {
+      "epoch": 1.2831541673541396,
+      "grad_norm": 0.023436130695674143,
+      "learning_rate": 0.0003428561332894769,
+      "loss": 0.4851,
+      "step": 25275
+    },
+    {
+      "epoch": 1.2834080034521709,
+      "grad_norm": 0.025875198122374644,
+      "learning_rate": 0.0003426458479667194,
+      "loss": 0.4709,
+      "step": 25280
+    },
+    {
+      "epoch": 1.2836618395502024,
+      "grad_norm": 0.030081526490696734,
+      "learning_rate": 0.00034243559353179726,
+      "loss": 0.4867,
+      "step": 25285
+    },
+    {
+      "epoch": 1.283915675648234,
+      "grad_norm": 0.021009383912915302,
+      "learning_rate": 0.00034222537002598233,
+      "loss": 0.4919,
+      "step": 25290
+    },
+    {
+      "epoch": 1.2841695117462655,
+      "grad_norm": 0.020722639706099787,
+      "learning_rate": 0.00034201517749054037,
+      "loss": 0.4786,
+      "step": 25295
+    },
+    {
+      "epoch": 1.284423347844297,
+      "grad_norm": 0.02511803417211432,
+      "learning_rate": 0.0003418050159667313,
+      "loss": 0.4895,
+      "step": 25300
+    },
+    {
+      "epoch": 1.2846771839423283,
+      "grad_norm": 0.023199730249595922,
+      "learning_rate": 0.00034159488549580865,
+      "loss": 0.4975,
+      "step": 25305
+    },
+    {
+      "epoch": 1.2849310200403599,
+      "grad_norm": 0.022681672373907304,
+      "learning_rate": 0.00034138478611902,
+      "loss": 0.4618,
+      "step": 25310
+    },
+    {
+      "epoch": 1.2851848561383914,
+      "grad_norm": 0.025437230351334297,
+      "learning_rate": 0.0003411747178776068,
+      "loss": 0.6043,
+      "step": 25315
+    },
+    {
+      "epoch": 1.285438692236423,
+      "grad_norm": 0.02517031394643481,
+      "learning_rate": 0.00034096468081280443,
+      "loss": 0.4689,
+      "step": 25320
+    },
+    {
+      "epoch": 1.2856925283344545,
+      "grad_norm": 0.0255164084090977,
+      "learning_rate": 0.00034075467496584214,
+      "loss": 0.4664,
+      "step": 25325
+    },
+    {
+      "epoch": 1.285946364432486,
+      "grad_norm": 0.024254877277534864,
+      "learning_rate": 0.00034054470037794284,
+      "loss": 0.4927,
+      "step": 25330
+    },
+    {
+      "epoch": 1.2862002005305175,
+      "grad_norm": 0.02399484634796993,
+      "learning_rate": 0.0003403347570903238,
+      "loss": 0.4992,
+      "step": 25335
+    },
+    {
+      "epoch": 1.286454036628549,
+      "grad_norm": 0.022617445238938865,
+      "learning_rate": 0.0003401248451441957,
+      "loss": 0.4784,
+      "step": 25340
+    },
+    {
+      "epoch": 1.2867078727265804,
+      "grad_norm": 0.02702146861655969,
+      "learning_rate": 0.0003399149645807632,
+      "loss": 0.4879,
+      "step": 25345
+    },
+    {
+      "epoch": 1.286961708824612,
+      "grad_norm": 0.021698144351784757,
+      "learning_rate": 0.00033970511544122476,
+      "loss": 0.5057,
+      "step": 25350
+    },
+    {
+      "epoch": 1.2872155449226435,
+      "grad_norm": 0.02835164720141961,
+      "learning_rate": 0.0003394952977667728,
+      "loss": 0.4854,
+      "step": 25355
+    },
+    {
+      "epoch": 1.287469381020675,
+      "grad_norm": 0.023058090063842972,
+      "learning_rate": 0.0003392855115985935,
+      "loss": 0.5309,
+      "step": 25360
+    },
+    {
+      "epoch": 1.2877232171187065,
+      "grad_norm": 0.043240874010317586,
+      "learning_rate": 0.00033907575697786677,
+      "loss": 0.4942,
+      "step": 25365
+    },
+    {
+      "epoch": 1.2879770532167378,
+      "grad_norm": 0.030301182776510747,
+      "learning_rate": 0.0003388660339457664,
+      "loss": 0.4814,
+      "step": 25370
+    },
+    {
+      "epoch": 1.2882308893147694,
+      "grad_norm": 0.037066055185106375,
+      "learning_rate": 0.00033865634254345996,
+      "loss": 0.4831,
+      "step": 25375
+    },
+    {
+      "epoch": 1.288484725412801,
+      "grad_norm": 0.031338071085445056,
+      "learning_rate": 0.0003384466828121089,
+      "loss": 0.508,
+      "step": 25380
+    },
+    {
+      "epoch": 1.2887385615108324,
+      "grad_norm": 0.02436415302207937,
+      "learning_rate": 0.0003382370547928683,
+      "loss": 0.4708,
+      "step": 25385
+    },
+    {
+      "epoch": 1.288992397608864,
+      "grad_norm": 0.028312170689583957,
+      "learning_rate": 0.000338027458526887,
+      "loss": 0.4962,
+      "step": 25390
+    },
+    {
+      "epoch": 1.2892462337068955,
+      "grad_norm": 0.02167851043165668,
+      "learning_rate": 0.00033781789405530794,
+      "loss": 0.4667,
+      "step": 25395
+    },
+    {
+      "epoch": 1.289500069804927,
+      "grad_norm": 0.023856335129568706,
+      "learning_rate": 0.00033760836141926754,
+      "loss": 0.4736,
+      "step": 25400
+    },
+    {
+      "epoch": 1.2897539059029586,
+      "grad_norm": 0.028628411965523105,
+      "learning_rate": 0.000337398860659896,
+      "loss": 0.5157,
+      "step": 25405
+    },
+    {
+      "epoch": 1.2900077420009899,
+      "grad_norm": 0.021708828812962416,
+      "learning_rate": 0.0003371893918183171,
+      "loss": 0.4964,
+      "step": 25410
+    },
+    {
+      "epoch": 1.2902615780990214,
+      "grad_norm": 0.02198643320874413,
+      "learning_rate": 0.0003369799549356487,
+      "loss": 0.4799,
+      "step": 25415
+    },
+    {
+      "epoch": 1.290515414197053,
+      "grad_norm": 0.02453087273669778,
+      "learning_rate": 0.00033677055005300224,
+      "loss": 0.4635,
+      "step": 25420
+    },
+    {
+      "epoch": 1.2907692502950845,
+      "grad_norm": 0.026346186269380012,
+      "learning_rate": 0.0003365611772114827,
+      "loss": 0.5175,
+      "step": 25425
+    },
+    {
+      "epoch": 1.291023086393116,
+      "grad_norm": 0.029201729373243494,
+      "learning_rate": 0.000336351836452189,
+      "loss": 0.5042,
+      "step": 25430
+    },
+    {
+      "epoch": 1.2912769224911473,
+      "grad_norm": 0.020057883724721514,
+      "learning_rate": 0.00033614252781621374,
+      "loss": 0.4929,
+      "step": 25435
+    },
+    {
+      "epoch": 1.2915307585891789,
+      "grad_norm": 0.023365071609751025,
+      "learning_rate": 0.0003359332513446431,
+      "loss": 0.4701,
+      "step": 25440
+    },
+    {
+      "epoch": 1.2917845946872104,
+      "grad_norm": 0.029195703341591574,
+      "learning_rate": 0.000335724007078557,
+      "loss": 0.4679,
+      "step": 25445
+    },
+    {
+      "epoch": 1.292038430785242,
+      "grad_norm": 0.024061801809005346,
+      "learning_rate": 0.0003355147950590291,
+      "loss": 0.4735,
+      "step": 25450
+    },
+    {
+      "epoch": 1.2922922668832735,
+      "grad_norm": 0.020717453665263782,
+      "learning_rate": 0.00033530561532712653,
+      "loss": 0.5058,
+      "step": 25455
+    },
+    {
+      "epoch": 1.292546102981305,
+      "grad_norm": 0.02955683642722223,
+      "learning_rate": 0.00033509646792391045,
+      "loss": 0.4869,
+      "step": 25460
+    },
+    {
+      "epoch": 1.2927999390793365,
+      "grad_norm": 0.022595682726649052,
+      "learning_rate": 0.0003348873528904353,
+      "loss": 0.4827,
+      "step": 25465
+    },
+    {
+      "epoch": 1.293053775177368,
+      "grad_norm": 0.029808289348049386,
+      "learning_rate": 0.0003346782702677494,
+      "loss": 0.496,
+      "step": 25470
+    },
+    {
+      "epoch": 1.2933076112753994,
+      "grad_norm": 0.02274550903922533,
+      "learning_rate": 0.0003344692200968946,
+      "loss": 0.4972,
+      "step": 25475
+    },
+    {
+      "epoch": 1.293561447373431,
+      "grad_norm": 0.041851383405792156,
+      "learning_rate": 0.00033426020241890636,
+      "loss": 0.4798,
+      "step": 25480
+    },
+    {
+      "epoch": 1.2938152834714625,
+      "grad_norm": 0.033587120320584446,
+      "learning_rate": 0.00033405121727481384,
+      "loss": 0.4889,
+      "step": 25485
+    },
+    {
+      "epoch": 1.294069119569494,
+      "grad_norm": 0.037859152680054645,
+      "learning_rate": 0.00033384226470563983,
+      "loss": 0.492,
+      "step": 25490
+    },
+    {
+      "epoch": 1.2943229556675255,
+      "grad_norm": 0.028727700779199155,
+      "learning_rate": 0.0003336333447524006,
+      "loss": 0.5041,
+      "step": 25495
+    },
+    {
+      "epoch": 1.2945767917655568,
+      "grad_norm": 0.025080840121736163,
+      "learning_rate": 0.0003334244574561061,
+      "loss": 0.5099,
+      "step": 25500
+    },
+    {
+      "epoch": 1.2948306278635884,
+      "grad_norm": 0.02222522508564205,
+      "learning_rate": 0.0003332156028577599,
+      "loss": 0.5108,
+      "step": 25505
+    },
+    {
+      "epoch": 1.29508446396162,
+      "grad_norm": 0.037804332629990035,
+      "learning_rate": 0.00033300678099835914,
+      "loss": 0.472,
+      "step": 25510
+    },
+    {
+      "epoch": 1.2953383000596514,
+      "grad_norm": 0.022133420759234763,
+      "learning_rate": 0.00033279799191889426,
+      "loss": 0.4965,
+      "step": 25515
+    },
+    {
+      "epoch": 1.295592136157683,
+      "grad_norm": 0.027447234503632068,
+      "learning_rate": 0.00033258923566034995,
+      "loss": 0.4852,
+      "step": 25520
+    },
+    {
+      "epoch": 1.2958459722557145,
+      "grad_norm": 0.026292192328524953,
+      "learning_rate": 0.0003323805122637038,
+      "loss": 0.4646,
+      "step": 25525
+    },
+    {
+      "epoch": 1.296099808353746,
+      "grad_norm": 0.02694952032736674,
+      "learning_rate": 0.0003321718217699271,
+      "loss": 0.4964,
+      "step": 25530
+    },
+    {
+      "epoch": 1.2963536444517776,
+      "grad_norm": 0.0217904749033225,
+      "learning_rate": 0.00033196316421998495,
+      "loss": 0.5027,
+      "step": 25535
+    },
+    {
+      "epoch": 1.2966074805498091,
+      "grad_norm": 0.021851950907082737,
+      "learning_rate": 0.0003317545396548356,
+      "loss": 0.4986,
+      "step": 25540
+    },
+    {
+      "epoch": 1.2968613166478404,
+      "grad_norm": 0.023510352499926397,
+      "learning_rate": 0.00033154594811543104,
+      "loss": 0.4833,
+      "step": 25545
+    },
+    {
+      "epoch": 1.297115152745872,
+      "grad_norm": 0.024076269555153962,
+      "learning_rate": 0.00033133738964271687,
+      "loss": 0.4734,
+      "step": 25550
+    },
+    {
+      "epoch": 1.2973689888439035,
+      "grad_norm": 0.022988360192355953,
+      "learning_rate": 0.00033112886427763197,
+      "loss": 0.5029,
+      "step": 25555
+    },
+    {
+      "epoch": 1.297622824941935,
+      "grad_norm": 0.02164944305735075,
+      "learning_rate": 0.0003309203720611088,
+      "loss": 0.5109,
+      "step": 25560
+    },
+    {
+      "epoch": 1.2978766610399666,
+      "grad_norm": 0.038596290229903046,
+      "learning_rate": 0.00033071191303407345,
+      "loss": 0.4938,
+      "step": 25565
+    },
+    {
+      "epoch": 1.2981304971379979,
+      "grad_norm": 0.024119758300361713,
+      "learning_rate": 0.00033050348723744527,
+      "loss": 0.4897,
+      "step": 25570
+    },
+    {
+      "epoch": 1.2983843332360294,
+      "grad_norm": 0.02587506880218007,
+      "learning_rate": 0.00033029509471213726,
+      "loss": 0.505,
+      "step": 25575
+    },
+    {
+      "epoch": 1.298638169334061,
+      "grad_norm": 0.02275936565570734,
+      "learning_rate": 0.00033008673549905586,
+      "loss": 0.5145,
+      "step": 25580
+    },
+    {
+      "epoch": 1.2988920054320925,
+      "grad_norm": 0.023072141573578203,
+      "learning_rate": 0.000329878409639101,
+      "loss": 0.4862,
+      "step": 25585
+    },
+    {
+      "epoch": 1.299145841530124,
+      "grad_norm": 0.021274802226077427,
+      "learning_rate": 0.00032967011717316587,
+      "loss": 0.4965,
+      "step": 25590
+    },
+    {
+      "epoch": 1.2993996776281556,
+      "grad_norm": 0.030502737103929357,
+      "learning_rate": 0.00032946185814213734,
+      "loss": 0.5204,
+      "step": 25595
+    },
+    {
+      "epoch": 1.299653513726187,
+      "grad_norm": 0.020252746392252694,
+      "learning_rate": 0.00032925363258689557,
+      "loss": 0.479,
+      "step": 25600
+    },
+    {
+      "epoch": 1.2999073498242186,
+      "grad_norm": 0.021662085892109512,
+      "learning_rate": 0.0003290454405483142,
+      "loss": 0.4786,
+      "step": 25605
+    },
+    {
+      "epoch": 1.30016118592225,
+      "grad_norm": 0.030512365258254626,
+      "learning_rate": 0.00032883728206726035,
+      "loss": 0.4816,
+      "step": 25610
+    },
+    {
+      "epoch": 1.3004150220202815,
+      "grad_norm": 0.02419501135859568,
+      "learning_rate": 0.00032862915718459443,
+      "loss": 0.4659,
+      "step": 25615
+    },
+    {
+      "epoch": 1.300668858118313,
+      "grad_norm": 0.02205521149147181,
+      "learning_rate": 0.0003284210659411703,
+      "loss": 0.4919,
+      "step": 25620
+    },
+    {
+      "epoch": 1.3009226942163445,
+      "grad_norm": 0.03676879072186463,
+      "learning_rate": 0.0003282130083778352,
+      "loss": 0.4887,
+      "step": 25625
+    },
+    {
+      "epoch": 1.301176530314376,
+      "grad_norm": 0.02352083834323318,
+      "learning_rate": 0.0003280049845354299,
+      "loss": 0.4994,
+      "step": 25630
+    },
+    {
+      "epoch": 1.3014303664124074,
+      "grad_norm": 0.02443911005181168,
+      "learning_rate": 0.00032779699445478826,
+      "loss": 0.4826,
+      "step": 25635
+    },
+    {
+      "epoch": 1.301684202510439,
+      "grad_norm": 0.02197560170718096,
+      "learning_rate": 0.000327589038176738,
+      "loss": 0.4615,
+      "step": 25640
+    },
+    {
+      "epoch": 1.3019380386084705,
+      "grad_norm": 0.022748104689422538,
+      "learning_rate": 0.00032738111574209973,
+      "loss": 0.4801,
+      "step": 25645
+    },
+    {
+      "epoch": 1.302191874706502,
+      "grad_norm": 0.02023037758904707,
+      "learning_rate": 0.0003271732271916876,
+      "loss": 0.4661,
+      "step": 25650
+    },
+    {
+      "epoch": 1.3024457108045335,
+      "grad_norm": 0.029780248485063505,
+      "learning_rate": 0.0003269653725663091,
+      "loss": 0.5154,
+      "step": 25655
+    },
+    {
+      "epoch": 1.302699546902565,
+      "grad_norm": 0.022291598091070952,
+      "learning_rate": 0.000326757551906765,
+      "loss": 0.5173,
+      "step": 25660
+    },
+    {
+      "epoch": 1.3029533830005966,
+      "grad_norm": 0.022819230833246173,
+      "learning_rate": 0.00032654976525384947,
+      "loss": 0.4833,
+      "step": 25665
+    },
+    {
+      "epoch": 1.3032072190986281,
+      "grad_norm": 0.02715749696309478,
+      "learning_rate": 0.0003263420126483501,
+      "loss": 0.4879,
+      "step": 25670
+    },
+    {
+      "epoch": 1.3034610551966594,
+      "grad_norm": 0.023854247107745894,
+      "learning_rate": 0.0003261342941310476,
+      "loss": 0.473,
+      "step": 25675
+    },
+    {
+      "epoch": 1.303714891294691,
+      "grad_norm": 0.022155878634300286,
+      "learning_rate": 0.00032592660974271615,
+      "loss": 0.505,
+      "step": 25680
+    },
+    {
+      "epoch": 1.3039687273927225,
+      "grad_norm": 0.02398677758854186,
+      "learning_rate": 0.000325718959524123,
+      "loss": 0.5156,
+      "step": 25685
+    },
+    {
+      "epoch": 1.304222563490754,
+      "grad_norm": 0.03172998468913407,
+      "learning_rate": 0.000325511343516029,
+      "loss": 0.488,
+      "step": 25690
+    },
+    {
+      "epoch": 1.3044763995887856,
+      "grad_norm": 0.03394625530167979,
+      "learning_rate": 0.00032530376175918794,
+      "loss": 0.4604,
+      "step": 25695
+    },
+    {
+      "epoch": 1.304730235686817,
+      "grad_norm": 0.02947534576412719,
+      "learning_rate": 0.00032509621429434744,
+      "loss": 0.4747,
+      "step": 25700
+    },
+    {
+      "epoch": 1.3049840717848484,
+      "grad_norm": 0.03537345478451157,
+      "learning_rate": 0.0003248887011622478,
+      "loss": 0.4933,
+      "step": 25705
+    },
+    {
+      "epoch": 1.30523790788288,
+      "grad_norm": 0.022484576203938268,
+      "learning_rate": 0.00032468122240362287,
+      "loss": 0.5131,
+      "step": 25710
+    },
+    {
+      "epoch": 1.3054917439809115,
+      "grad_norm": 0.03394847342947618,
+      "learning_rate": 0.00032447377805919957,
+      "loss": 0.4996,
+      "step": 25715
+    },
+    {
+      "epoch": 1.305745580078943,
+      "grad_norm": 0.022258649273689368,
+      "learning_rate": 0.00032426636816969837,
+      "loss": 0.4864,
+      "step": 25720
+    },
+    {
+      "epoch": 1.3059994161769746,
+      "grad_norm": 0.028081809394511537,
+      "learning_rate": 0.0003240589927758327,
+      "loss": 0.4942,
+      "step": 25725
+    },
+    {
+      "epoch": 1.306253252275006,
+      "grad_norm": 0.02227139522287654,
+      "learning_rate": 0.0003238516519183093,
+      "loss": 0.5094,
+      "step": 25730
+    },
+    {
+      "epoch": 1.3065070883730376,
+      "grad_norm": 0.03070685559989823,
+      "learning_rate": 0.0003236443456378282,
+      "loss": 0.4944,
+      "step": 25735
+    },
+    {
+      "epoch": 1.306760924471069,
+      "grad_norm": 0.04102837340472807,
+      "learning_rate": 0.0003234370739750826,
+      "loss": 0.458,
+      "step": 25740
+    },
+    {
+      "epoch": 1.3070147605691005,
+      "grad_norm": 0.021808995007741704,
+      "learning_rate": 0.00032322983697075883,
+      "loss": 0.4717,
+      "step": 25745
+    },
+    {
+      "epoch": 1.307268596667132,
+      "grad_norm": 0.023349246400971222,
+      "learning_rate": 0.0003230226346655365,
+      "loss": 0.4855,
+      "step": 25750
+    },
+    {
+      "epoch": 1.3075224327651636,
+      "grad_norm": 0.023103164390614456,
+      "learning_rate": 0.0003228154671000882,
+      "loss": 0.4876,
+      "step": 25755
+    },
+    {
+      "epoch": 1.307776268863195,
+      "grad_norm": 0.024319382145632857,
+      "learning_rate": 0.0003226083343150803,
+      "loss": 0.4753,
+      "step": 25760
+    },
+    {
+      "epoch": 1.3080301049612264,
+      "grad_norm": 0.02350220796453745,
+      "learning_rate": 0.0003224012363511717,
+      "loss": 0.4565,
+      "step": 25765
+    },
+    {
+      "epoch": 1.308283941059258,
+      "grad_norm": 0.023084276590446533,
+      "learning_rate": 0.0003221941732490148,
+      "loss": 0.4742,
+      "step": 25770
+    },
+    {
+      "epoch": 1.3085377771572895,
+      "grad_norm": 0.023561783980489238,
+      "learning_rate": 0.00032198714504925487,
+      "loss": 0.4818,
+      "step": 25775
+    },
+    {
+      "epoch": 1.308791613255321,
+      "grad_norm": 0.024018962472135568,
+      "learning_rate": 0.0003217801517925307,
+      "loss": 0.454,
+      "step": 25780
+    },
+    {
+      "epoch": 1.3090454493533525,
+      "grad_norm": 0.02133114101515424,
+      "learning_rate": 0.0003215731935194739,
+      "loss": 0.4503,
+      "step": 25785
+    },
+    {
+      "epoch": 1.309299285451384,
+      "grad_norm": 0.026035923942215537,
+      "learning_rate": 0.0003213662702707094,
+      "loss": 0.5116,
+      "step": 25790
+    },
+    {
+      "epoch": 1.3095531215494156,
+      "grad_norm": 0.022735126112242227,
+      "learning_rate": 0.00032115938208685527,
+      "loss": 0.4965,
+      "step": 25795
+    },
+    {
+      "epoch": 1.3098069576474471,
+      "grad_norm": 0.023075574740689275,
+      "learning_rate": 0.0003209525290085226,
+      "loss": 0.4858,
+      "step": 25800
+    },
+    {
+      "epoch": 1.3100607937454787,
+      "grad_norm": 0.02157939323367077,
+      "learning_rate": 0.00032074571107631544,
+      "loss": 0.465,
+      "step": 25805
+    },
+    {
+      "epoch": 1.31031462984351,
+      "grad_norm": 0.02922405968847422,
+      "learning_rate": 0.0003205389283308313,
+      "loss": 0.4925,
+      "step": 25810
+    },
+    {
+      "epoch": 1.3105684659415415,
+      "grad_norm": 0.022879840515332046,
+      "learning_rate": 0.0003203321808126604,
+      "loss": 0.4703,
+      "step": 25815
+    },
+    {
+      "epoch": 1.310822302039573,
+      "grad_norm": 0.023144556467176263,
+      "learning_rate": 0.0003201254685623866,
+      "loss": 0.4869,
+      "step": 25820
+    },
+    {
+      "epoch": 1.3110761381376046,
+      "grad_norm": 0.02520401264527816,
+      "learning_rate": 0.00031991879162058623,
+      "loss": 0.4813,
+      "step": 25825
+    },
+    {
+      "epoch": 1.3113299742356361,
+      "grad_norm": 0.02170783835021678,
+      "learning_rate": 0.00031971215002782907,
+      "loss": 0.4802,
+      "step": 25830
+    },
+    {
+      "epoch": 1.3115838103336674,
+      "grad_norm": 0.028491353202148535,
+      "learning_rate": 0.00031950554382467766,
+      "loss": 0.4757,
+      "step": 25835
+    },
+    {
+      "epoch": 1.311837646431699,
+      "grad_norm": 0.02918056571262906,
+      "learning_rate": 0.000319298973051688,
+      "loss": 0.4895,
+      "step": 25840
+    },
+    {
+      "epoch": 1.3120914825297305,
+      "grad_norm": 0.03246670924753988,
+      "learning_rate": 0.00031909243774940865,
+      "loss": 0.5045,
+      "step": 25845
+    },
+    {
+      "epoch": 1.312345318627762,
+      "grad_norm": 0.04517489478201621,
+      "learning_rate": 0.0003188859379583816,
+      "loss": 0.49,
+      "step": 25850
+    },
+    {
+      "epoch": 1.3125991547257936,
+      "grad_norm": 0.025048916079119958,
+      "learning_rate": 0.0003186794737191418,
+      "loss": 0.4813,
+      "step": 25855
+    },
+    {
+      "epoch": 1.3128529908238251,
+      "grad_norm": 0.02702368574587235,
+      "learning_rate": 0.000318473045072217,
+      "loss": 0.4809,
+      "step": 25860
+    },
+    {
+      "epoch": 1.3131068269218567,
+      "grad_norm": 0.02451499381789933,
+      "learning_rate": 0.00031826665205812824,
+      "loss": 0.4777,
+      "step": 25865
+    },
+    {
+      "epoch": 1.3133606630198882,
+      "grad_norm": 0.027600609156731175,
+      "learning_rate": 0.00031806029471738933,
+      "loss": 0.4788,
+      "step": 25870
+    },
+    {
+      "epoch": 1.3136144991179195,
+      "grad_norm": 0.03131877820295245,
+      "learning_rate": 0.000317853973090507,
+      "loss": 0.4761,
+      "step": 25875
+    },
+    {
+      "epoch": 1.313868335215951,
+      "grad_norm": 0.027123485696514793,
+      "learning_rate": 0.00031764768721798163,
+      "loss": 0.4727,
+      "step": 25880
+    },
+    {
+      "epoch": 1.3141221713139826,
+      "grad_norm": 0.02108556651131552,
+      "learning_rate": 0.00031744143714030606,
+      "loss": 0.4948,
+      "step": 25885
+    },
+    {
+      "epoch": 1.314376007412014,
+      "grad_norm": 0.02128012590382479,
+      "learning_rate": 0.00031723522289796573,
+      "loss": 0.4942,
+      "step": 25890
+    },
+    {
+      "epoch": 1.3146298435100456,
+      "grad_norm": 0.021709980768361257,
+      "learning_rate": 0.00031702904453143976,
+      "loss": 0.4826,
+      "step": 25895
+    },
+    {
+      "epoch": 1.314883679608077,
+      "grad_norm": 0.023018801114703516,
+      "learning_rate": 0.0003168229020811999,
+      "loss": 0.4924,
+      "step": 25900
+    },
+    {
+      "epoch": 1.3151375157061085,
+      "grad_norm": 0.024999798230040154,
+      "learning_rate": 0.00031661679558771076,
+      "loss": 0.4843,
+      "step": 25905
+    },
+    {
+      "epoch": 1.31539135180414,
+      "grad_norm": 0.02466258211924113,
+      "learning_rate": 0.0003164107250914302,
+      "loss": 0.5157,
+      "step": 25910
+    },
+    {
+      "epoch": 1.3156451879021716,
+      "grad_norm": 0.03808085877482315,
+      "learning_rate": 0.0003162046906328087,
+      "loss": 0.4982,
+      "step": 25915
+    },
+    {
+      "epoch": 1.315899024000203,
+      "grad_norm": 0.022915850153156146,
+      "learning_rate": 0.0003159986922522899,
+      "loss": 0.4911,
+      "step": 25920
+    },
+    {
+      "epoch": 1.3161528600982346,
+      "grad_norm": 0.029411931313225212,
+      "learning_rate": 0.0003157927299903102,
+      "loss": 0.5142,
+      "step": 25925
+    },
+    {
+      "epoch": 1.3164066961962662,
+      "grad_norm": 0.022508290380115467,
+      "learning_rate": 0.0003155868038872989,
+      "loss": 0.4781,
+      "step": 25930
+    },
+    {
+      "epoch": 1.3166605322942977,
+      "grad_norm": 0.02367130521025921,
+      "learning_rate": 0.0003153809139836781,
+      "loss": 0.5034,
+      "step": 25935
+    },
+    {
+      "epoch": 1.316914368392329,
+      "grad_norm": 0.02370837844837251,
+      "learning_rate": 0.0003151750603198634,
+      "loss": 0.4825,
+      "step": 25940
+    },
+    {
+      "epoch": 1.3171682044903605,
+      "grad_norm": 0.022145644887770835,
+      "learning_rate": 0.0003149692429362627,
+      "loss": 0.4711,
+      "step": 25945
+    },
+    {
+      "epoch": 1.317422040588392,
+      "grad_norm": 0.030152956850519705,
+      "learning_rate": 0.00031476346187327684,
+      "loss": 0.4648,
+      "step": 25950
+    },
+    {
+      "epoch": 1.3176758766864236,
+      "grad_norm": 0.027631145783320436,
+      "learning_rate": 0.0003145577171712997,
+      "loss": 0.4992,
+      "step": 25955
+    },
+    {
+      "epoch": 1.3179297127844551,
+      "grad_norm": 0.028341925435365112,
+      "learning_rate": 0.00031435200887071786,
+      "loss": 0.4761,
+      "step": 25960
+    },
+    {
+      "epoch": 1.3181835488824865,
+      "grad_norm": 0.026884867627421965,
+      "learning_rate": 0.0003141463370119108,
+      "loss": 0.4643,
+      "step": 25965
+    },
+    {
+      "epoch": 1.318437384980518,
+      "grad_norm": 0.022874705393107205,
+      "learning_rate": 0.00031394070163525095,
+      "loss": 0.4786,
+      "step": 25970
+    },
+    {
+      "epoch": 1.3186912210785495,
+      "grad_norm": 0.027134145494403333,
+      "learning_rate": 0.0003137351027811035,
+      "loss": 0.503,
+      "step": 25975
+    },
+    {
+      "epoch": 1.318945057176581,
+      "grad_norm": 0.024479146918958583,
+      "learning_rate": 0.0003135295404898265,
+      "loss": 0.4983,
+      "step": 25980
+    },
+    {
+      "epoch": 1.3191988932746126,
+      "grad_norm": 0.02172267954854436,
+      "learning_rate": 0.00031332401480177073,
+      "loss": 0.4721,
+      "step": 25985
+    },
+    {
+      "epoch": 1.3194527293726441,
+      "grad_norm": 0.025255869240494836,
+      "learning_rate": 0.0003131185257572799,
+      "loss": 0.4665,
+      "step": 25990
+    },
+    {
+      "epoch": 1.3197065654706757,
+      "grad_norm": 0.023915003608357185,
+      "learning_rate": 0.0003129130733966904,
+      "loss": 0.5186,
+      "step": 25995
+    },
+    {
+      "epoch": 1.3199604015687072,
+      "grad_norm": 0.038739561505479345,
+      "learning_rate": 0.00031270765776033173,
+      "loss": 0.4665,
+      "step": 26000
+    },
+    {
+      "epoch": 1.3202142376667385,
+      "grad_norm": 0.02087696169338501,
+      "learning_rate": 0.00031250227888852576,
+      "loss": 0.4838,
+      "step": 26005
+    },
+    {
+      "epoch": 1.32046807376477,
+      "grad_norm": 0.024862811416681813,
+      "learning_rate": 0.0003122969368215874,
+      "loss": 0.4861,
+      "step": 26010
+    },
+    {
+      "epoch": 1.3207219098628016,
+      "grad_norm": 0.019972495995757008,
+      "learning_rate": 0.0003120916315998243,
+      "loss": 0.4681,
+      "step": 26015
+    },
+    {
+      "epoch": 1.3209757459608331,
+      "grad_norm": 0.026081632082320026,
+      "learning_rate": 0.0003118863632635368,
+      "loss": 0.4591,
+      "step": 26020
+    },
+    {
+      "epoch": 1.3212295820588646,
+      "grad_norm": 0.03877882882655485,
+      "learning_rate": 0.00031168113185301815,
+      "loss": 0.5011,
+      "step": 26025
+    },
+    {
+      "epoch": 1.321483418156896,
+      "grad_norm": 0.035565096811897436,
+      "learning_rate": 0.00031147593740855407,
+      "loss": 0.4884,
+      "step": 26030
+    },
+    {
+      "epoch": 1.3217372542549275,
+      "grad_norm": 0.032673275953771695,
+      "learning_rate": 0.00031127077997042336,
+      "loss": 0.4688,
+      "step": 26035
+    },
+    {
+      "epoch": 1.321991090352959,
+      "grad_norm": 0.02225124249672015,
+      "learning_rate": 0.0003110656595788973,
+      "loss": 0.5164,
+      "step": 26040
+    },
+    {
+      "epoch": 1.3222449264509906,
+      "grad_norm": 0.027980878312804384,
+      "learning_rate": 0.0003108605762742401,
+      "loss": 0.4872,
+      "step": 26045
+    },
+    {
+      "epoch": 1.322498762549022,
+      "grad_norm": 0.025276827695637313,
+      "learning_rate": 0.00031065553009670857,
+      "loss": 0.4903,
+      "step": 26050
+    },
+    {
+      "epoch": 1.3227525986470536,
+      "grad_norm": 0.028706939574934476,
+      "learning_rate": 0.00031045052108655193,
+      "loss": 0.5026,
+      "step": 26055
+    },
+    {
+      "epoch": 1.3230064347450852,
+      "grad_norm": 0.027941893356288494,
+      "learning_rate": 0.0003102455492840129,
+      "loss": 0.4932,
+      "step": 26060
+    },
+    {
+      "epoch": 1.3232602708431167,
+      "grad_norm": 0.026634652222519675,
+      "learning_rate": 0.00031004061472932634,
+      "loss": 0.5158,
+      "step": 26065
+    },
+    {
+      "epoch": 1.3235141069411482,
+      "grad_norm": 0.03025644762717795,
+      "learning_rate": 0.00030983571746271977,
+      "loss": 0.4733,
+      "step": 26070
+    },
+    {
+      "epoch": 1.3237679430391796,
+      "grad_norm": 0.02398881393021072,
+      "learning_rate": 0.0003096308575244135,
+      "loss": 0.4784,
+      "step": 26075
+    },
+    {
+      "epoch": 1.324021779137211,
+      "grad_norm": 0.023325566374430225,
+      "learning_rate": 0.00030942603495462054,
+      "loss": 0.501,
+      "step": 26080
+    },
+    {
+      "epoch": 1.3242756152352426,
+      "grad_norm": 0.02552948646532719,
+      "learning_rate": 0.0003092212497935465,
+      "loss": 0.4894,
+      "step": 26085
+    },
+    {
+      "epoch": 1.3245294513332742,
+      "grad_norm": 0.03612792972246537,
+      "learning_rate": 0.0003090165020813897,
+      "loss": 0.4688,
+      "step": 26090
+    },
+    {
+      "epoch": 1.3247832874313055,
+      "grad_norm": 0.028878202348219067,
+      "learning_rate": 0.00030881179185834114,
+      "loss": 0.5304,
+      "step": 26095
+    },
+    {
+      "epoch": 1.325037123529337,
+      "grad_norm": 0.020583251018650484,
+      "learning_rate": 0.0003086071191645844,
+      "loss": 0.4639,
+      "step": 26100
+    },
+    {
+      "epoch": 1.3252909596273685,
+      "grad_norm": 0.023744717612926514,
+      "learning_rate": 0.00030840248404029563,
+      "loss": 0.4727,
+      "step": 26105
+    },
+    {
+      "epoch": 1.3255447957254,
+      "grad_norm": 0.024384417847231648,
+      "learning_rate": 0.00030819788652564377,
+      "loss": 0.4774,
+      "step": 26110
+    },
+    {
+      "epoch": 1.3257986318234316,
+      "grad_norm": 0.029828723552213644,
+      "learning_rate": 0.00030799332666079016,
+      "loss": 0.4778,
+      "step": 26115
+    },
+    {
+      "epoch": 1.3260524679214631,
+      "grad_norm": 0.0277189472799926,
+      "learning_rate": 0.0003077888044858891,
+      "loss": 0.4987,
+      "step": 26120
+    },
+    {
+      "epoch": 1.3263063040194947,
+      "grad_norm": 0.021616997214167058,
+      "learning_rate": 0.00030758432004108723,
+      "loss": 0.491,
+      "step": 26125
+    },
+    {
+      "epoch": 1.3265601401175262,
+      "grad_norm": 0.03097966201237373,
+      "learning_rate": 0.0003073798733665237,
+      "loss": 0.4743,
+      "step": 26130
+    },
+    {
+      "epoch": 1.3268139762155577,
+      "grad_norm": 0.03448839917044529,
+      "learning_rate": 0.00030717546450233045,
+      "loss": 0.5102,
+      "step": 26135
+    },
+    {
+      "epoch": 1.327067812313589,
+      "grad_norm": 0.027797346027596648,
+      "learning_rate": 0.0003069710934886319,
+      "loss": 0.4805,
+      "step": 26140
+    },
+    {
+      "epoch": 1.3273216484116206,
+      "grad_norm": 0.023865639312762033,
+      "learning_rate": 0.0003067667603655451,
+      "loss": 0.4738,
+      "step": 26145
+    },
+    {
+      "epoch": 1.3275754845096521,
+      "grad_norm": 0.022994781293125738,
+      "learning_rate": 0.0003065624651731795,
+      "loss": 0.4604,
+      "step": 26150
+    },
+    {
+      "epoch": 1.3278293206076837,
+      "grad_norm": 0.02452841527289796,
+      "learning_rate": 0.00030635820795163737,
+      "loss": 0.495,
+      "step": 26155
+    },
+    {
+      "epoch": 1.3280831567057152,
+      "grad_norm": 0.025166790867288878,
+      "learning_rate": 0.0003061539887410133,
+      "loss": 0.4653,
+      "step": 26160
+    },
+    {
+      "epoch": 1.3283369928037465,
+      "grad_norm": 0.025844039701185498,
+      "learning_rate": 0.0003059498075813946,
+      "loss": 0.474,
+      "step": 26165
+    },
+    {
+      "epoch": 1.328590828901778,
+      "grad_norm": 0.022922263461315293,
+      "learning_rate": 0.0003057456645128609,
+      "loss": 0.5229,
+      "step": 26170
+    },
+    {
+      "epoch": 1.3288446649998096,
+      "grad_norm": 0.028735220280624686,
+      "learning_rate": 0.00030554155957548425,
+      "loss": 0.4895,
+      "step": 26175
+    },
+    {
+      "epoch": 1.3290985010978411,
+      "grad_norm": 0.02399502885581283,
+      "learning_rate": 0.00030533749280933,
+      "loss": 0.4898,
+      "step": 26180
+    },
+    {
+      "epoch": 1.3293523371958726,
+      "grad_norm": 0.027489214240756413,
+      "learning_rate": 0.0003051334642544551,
+      "loss": 0.4868,
+      "step": 26185
+    },
+    {
+      "epoch": 1.3296061732939042,
+      "grad_norm": 0.022807932982948913,
+      "learning_rate": 0.0003049294739509093,
+      "loss": 0.4645,
+      "step": 26190
+    },
+    {
+      "epoch": 1.3298600093919357,
+      "grad_norm": 0.02364717870718042,
+      "learning_rate": 0.00030472552193873506,
+      "loss": 0.4874,
+      "step": 26195
+    },
+    {
+      "epoch": 1.3301138454899673,
+      "grad_norm": 0.02421526146734168,
+      "learning_rate": 0.0003045216082579669,
+      "loss": 0.4753,
+      "step": 26200
+    },
+    {
+      "epoch": 1.3303676815879986,
+      "grad_norm": 0.023753864278555786,
+      "learning_rate": 0.0003043177329486323,
+      "loss": 0.4974,
+      "step": 26205
+    },
+    {
+      "epoch": 1.33062151768603,
+      "grad_norm": 0.02405794761573703,
+      "learning_rate": 0.0003041138960507508,
+      "loss": 0.5051,
+      "step": 26210
+    },
+    {
+      "epoch": 1.3308753537840616,
+      "grad_norm": 0.022815391635180565,
+      "learning_rate": 0.0003039100976043346,
+      "loss": 0.491,
+      "step": 26215
+    },
+    {
+      "epoch": 1.3311291898820932,
+      "grad_norm": 0.022343572537838263,
+      "learning_rate": 0.0003037063376493884,
+      "loss": 0.4743,
+      "step": 26220
+    },
+    {
+      "epoch": 1.3313830259801247,
+      "grad_norm": 0.022723276658171112,
+      "learning_rate": 0.00030350261622590926,
+      "loss": 0.4531,
+      "step": 26225
+    },
+    {
+      "epoch": 1.331636862078156,
+      "grad_norm": 0.023454158002978998,
+      "learning_rate": 0.0003032989333738865,
+      "loss": 0.4808,
+      "step": 26230
+    },
+    {
+      "epoch": 1.3318906981761875,
+      "grad_norm": 0.023075690824087954,
+      "learning_rate": 0.0003030952891333021,
+      "loss": 0.4672,
+      "step": 26235
+    },
+    {
+      "epoch": 1.332144534274219,
+      "grad_norm": 0.024106778177103005,
+      "learning_rate": 0.00030289168354413065,
+      "loss": 0.4942,
+      "step": 26240
+    },
+    {
+      "epoch": 1.3323983703722506,
+      "grad_norm": 0.025151670342148585,
+      "learning_rate": 0.00030268811664633865,
+      "loss": 0.4927,
+      "step": 26245
+    },
+    {
+      "epoch": 1.3326522064702822,
+      "grad_norm": 0.030957786516116338,
+      "learning_rate": 0.0003024845884798855,
+      "loss": 0.4773,
+      "step": 26250
+    },
+    {
+      "epoch": 1.3329060425683137,
+      "grad_norm": 0.025925112474195305,
+      "learning_rate": 0.00030228109908472247,
+      "loss": 0.4611,
+      "step": 26255
+    },
+    {
+      "epoch": 1.3331598786663452,
+      "grad_norm": 0.019884408833520717,
+      "learning_rate": 0.00030207764850079374,
+      "loss": 0.4644,
+      "step": 26260
+    },
+    {
+      "epoch": 1.3334137147643768,
+      "grad_norm": 0.025171919728835243,
+      "learning_rate": 0.00030187423676803556,
+      "loss": 0.4628,
+      "step": 26265
+    },
+    {
+      "epoch": 1.333667550862408,
+      "grad_norm": 0.02155136741752636,
+      "learning_rate": 0.00030167086392637665,
+      "loss": 0.4931,
+      "step": 26270
+    },
+    {
+      "epoch": 1.3339213869604396,
+      "grad_norm": 0.03331276638766642,
+      "learning_rate": 0.0003014675300157381,
+      "loss": 0.4617,
+      "step": 26275
+    },
+    {
+      "epoch": 1.3341752230584711,
+      "grad_norm": 0.026091056559288128,
+      "learning_rate": 0.00030126423507603327,
+      "loss": 0.4691,
+      "step": 26280
+    },
+    {
+      "epoch": 1.3344290591565027,
+      "grad_norm": 0.021480786402548066,
+      "learning_rate": 0.00030106097914716804,
+      "loss": 0.4798,
+      "step": 26285
+    },
+    {
+      "epoch": 1.3346828952545342,
+      "grad_norm": 0.026115011721068254,
+      "learning_rate": 0.0003008577622690405,
+      "loss": 0.4778,
+      "step": 26290
+    },
+    {
+      "epoch": 1.3349367313525655,
+      "grad_norm": 0.029188603624534938,
+      "learning_rate": 0.00030065458448154094,
+      "loss": 0.4956,
+      "step": 26295
+    },
+    {
+      "epoch": 1.335190567450597,
+      "grad_norm": 0.02343770485301329,
+      "learning_rate": 0.0003004514458245525,
+      "loss": 0.5235,
+      "step": 26300
+    },
+    {
+      "epoch": 1.3354444035486286,
+      "grad_norm": 0.023629932549987128,
+      "learning_rate": 0.00030024834633795005,
+      "loss": 0.4652,
+      "step": 26305
+    },
+    {
+      "epoch": 1.3356982396466601,
+      "grad_norm": 0.02116981769281469,
+      "learning_rate": 0.0003000452860616011,
+      "loss": 0.4936,
+      "step": 26310
+    },
+    {
+      "epoch": 1.3359520757446917,
+      "grad_norm": 0.019859957952761565,
+      "learning_rate": 0.00029984226503536527,
+      "loss": 0.4681,
+      "step": 26315
+    },
+    {
+      "epoch": 1.3362059118427232,
+      "grad_norm": 0.025886391465037177,
+      "learning_rate": 0.0002996392832990946,
+      "loss": 0.4803,
+      "step": 26320
+    },
+    {
+      "epoch": 1.3364597479407547,
+      "grad_norm": 0.02594128632014247,
+      "learning_rate": 0.00029943634089263355,
+      "loss": 0.4873,
+      "step": 26325
+    },
+    {
+      "epoch": 1.3367135840387863,
+      "grad_norm": 0.03408346447284297,
+      "learning_rate": 0.0002992334378558185,
+      "loss": 0.48,
+      "step": 26330
+    },
+    {
+      "epoch": 1.3369674201368176,
+      "grad_norm": 0.03841155039452,
+      "learning_rate": 0.00029903057422847834,
+      "loss": 0.4702,
+      "step": 26335
+    },
+    {
+      "epoch": 1.337221256234849,
+      "grad_norm": 0.022888290658989296,
+      "learning_rate": 0.0002988277500504343,
+      "loss": 0.4388,
+      "step": 26340
+    },
+    {
+      "epoch": 1.3374750923328806,
+      "grad_norm": 0.03122320066551133,
+      "learning_rate": 0.00029862496536149966,
+      "loss": 0.4969,
+      "step": 26345
+    },
+    {
+      "epoch": 1.3377289284309122,
+      "grad_norm": 0.03769099264555599,
+      "learning_rate": 0.00029842222020148,
+      "loss": 0.489,
+      "step": 26350
+    },
+    {
+      "epoch": 1.3379827645289437,
+      "grad_norm": 0.029431216873870947,
+      "learning_rate": 0.0002982195146101734,
+      "loss": 0.5069,
+      "step": 26355
+    },
+    {
+      "epoch": 1.338236600626975,
+      "grad_norm": 0.024178396636187073,
+      "learning_rate": 0.00029801684862736956,
+      "loss": 0.4598,
+      "step": 26360
+    },
+    {
+      "epoch": 1.3384904367250066,
+      "grad_norm": 0.027229237482082266,
+      "learning_rate": 0.0002978142222928512,
+      "loss": 0.4881,
+      "step": 26365
+    },
+    {
+      "epoch": 1.338744272823038,
+      "grad_norm": 0.02518085523341514,
+      "learning_rate": 0.0002976116356463927,
+      "loss": 0.4734,
+      "step": 26370
+    },
+    {
+      "epoch": 1.3389981089210696,
+      "grad_norm": 0.026449318799886906,
+      "learning_rate": 0.00029740908872776087,
+      "loss": 0.4857,
+      "step": 26375
+    },
+    {
+      "epoch": 1.3392519450191012,
+      "grad_norm": 0.02736453220212371,
+      "learning_rate": 0.00029720658157671447,
+      "loss": 0.4952,
+      "step": 26380
+    },
+    {
+      "epoch": 1.3395057811171327,
+      "grad_norm": 0.02011376143672255,
+      "learning_rate": 0.0002970041142330049,
+      "loss": 0.475,
+      "step": 26385
+    },
+    {
+      "epoch": 1.3397596172151642,
+      "grad_norm": 0.022297115519452224,
+      "learning_rate": 0.0002968016867363753,
+      "loss": 0.4881,
+      "step": 26390
+    },
+    {
+      "epoch": 1.3400134533131958,
+      "grad_norm": 0.028308862230847293,
+      "learning_rate": 0.00029659929912656123,
+      "loss": 0.4707,
+      "step": 26395
+    },
+    {
+      "epoch": 1.3402672894112273,
+      "grad_norm": 0.02418795190807018,
+      "learning_rate": 0.0002963969514432904,
+      "loss": 0.472,
+      "step": 26400
+    },
+    {
+      "epoch": 1.3405211255092586,
+      "grad_norm": 0.03161928209152755,
+      "learning_rate": 0.0002961946437262827,
+      "loss": 0.4715,
+      "step": 26405
+    },
+    {
+      "epoch": 1.3407749616072901,
+      "grad_norm": 0.03500499761857729,
+      "learning_rate": 0.00029599237601525,
+      "loss": 0.4636,
+      "step": 26410
+    },
+    {
+      "epoch": 1.3410287977053217,
+      "grad_norm": 0.03348168345879801,
+      "learning_rate": 0.00029579014834989653,
+      "loss": 0.4979,
+      "step": 26415
+    },
+    {
+      "epoch": 1.3412826338033532,
+      "grad_norm": 0.029165985289788657,
+      "learning_rate": 0.00029558796076991836,
+      "loss": 0.5169,
+      "step": 26420
+    },
+    {
+      "epoch": 1.3415364699013848,
+      "grad_norm": 0.026140837166966736,
+      "learning_rate": 0.00029538581331500427,
+      "loss": 0.4786,
+      "step": 26425
+    },
+    {
+      "epoch": 1.341790305999416,
+      "grad_norm": 0.031152081282308442,
+      "learning_rate": 0.0002951837060248346,
+      "loss": 0.4906,
+      "step": 26430
+    },
+    {
+      "epoch": 1.3420441420974476,
+      "grad_norm": 0.036089699561631966,
+      "learning_rate": 0.000294981638939082,
+      "loss": 0.4527,
+      "step": 26435
+    },
+    {
+      "epoch": 1.3422979781954791,
+      "grad_norm": 0.029379508239414574,
+      "learning_rate": 0.0002947796120974113,
+      "loss": 0.4576,
+      "step": 26440
+    },
+    {
+      "epoch": 1.3425518142935107,
+      "grad_norm": 0.026574119568228798,
+      "learning_rate": 0.0002945776255394793,
+      "loss": 0.487,
+      "step": 26445
+    },
+    {
+      "epoch": 1.3428056503915422,
+      "grad_norm": 0.02274745690514462,
+      "learning_rate": 0.00029437567930493493,
+      "loss": 0.4661,
+      "step": 26450
+    },
+    {
+      "epoch": 1.3430594864895737,
+      "grad_norm": 0.021614065812495135,
+      "learning_rate": 0.0002941737734334193,
+      "loss": 0.496,
+      "step": 26455
+    },
+    {
+      "epoch": 1.3433133225876053,
+      "grad_norm": 0.022482586999432463,
+      "learning_rate": 0.00029397190796456553,
+      "loss": 0.4844,
+      "step": 26460
+    },
+    {
+      "epoch": 1.3435671586856368,
+      "grad_norm": 0.022116336028821203,
+      "learning_rate": 0.00029377008293799865,
+      "loss": 0.52,
+      "step": 26465
+    },
+    {
+      "epoch": 1.3438209947836681,
+      "grad_norm": 0.028349507842694705,
+      "learning_rate": 0.00029356829839333615,
+      "loss": 0.4871,
+      "step": 26470
+    },
+    {
+      "epoch": 1.3440748308816997,
+      "grad_norm": 0.022089539699478508,
+      "learning_rate": 0.0002933665543701871,
+      "loss": 0.4943,
+      "step": 26475
+    },
+    {
+      "epoch": 1.3443286669797312,
+      "grad_norm": 0.026039145829479762,
+      "learning_rate": 0.0002931648509081529,
+      "loss": 0.502,
+      "step": 26480
+    },
+    {
+      "epoch": 1.3445825030777627,
+      "grad_norm": 0.028120685619677774,
+      "learning_rate": 0.0002929631880468271,
+      "loss": 0.4956,
+      "step": 26485
+    },
+    {
+      "epoch": 1.3448363391757943,
+      "grad_norm": 0.02097286192345862,
+      "learning_rate": 0.000292761565825795,
+      "loss": 0.4729,
+      "step": 26490
+    },
+    {
+      "epoch": 1.3450901752738256,
+      "grad_norm": 0.027620567755231737,
+      "learning_rate": 0.000292559984284634,
+      "loss": 0.5011,
+      "step": 26495
+    },
+    {
+      "epoch": 1.345344011371857,
+      "grad_norm": 0.022184671610469525,
+      "learning_rate": 0.0002923584434629136,
+      "loss": 0.4848,
+      "step": 26500
+    },
+    {
+      "epoch": 1.3455978474698886,
+      "grad_norm": 0.027060057039042365,
+      "learning_rate": 0.0002921569434001952,
+      "loss": 0.446,
+      "step": 26505
+    },
+    {
+      "epoch": 1.3458516835679202,
+      "grad_norm": 0.022283856953366245,
+      "learning_rate": 0.00029195548413603236,
+      "loss": 0.4637,
+      "step": 26510
+    },
+    {
+      "epoch": 1.3461055196659517,
+      "grad_norm": 0.02142821011081133,
+      "learning_rate": 0.0002917540657099703,
+      "loss": 0.4999,
+      "step": 26515
+    },
+    {
+      "epoch": 1.3463593557639832,
+      "grad_norm": 0.03140709448224681,
+      "learning_rate": 0.0002915526881615469,
+      "loss": 0.5199,
+      "step": 26520
+    },
+    {
+      "epoch": 1.3466131918620148,
+      "grad_norm": 0.02193762571699132,
+      "learning_rate": 0.000291351351530291,
+      "loss": 0.5037,
+      "step": 26525
+    },
+    {
+      "epoch": 1.3468670279600463,
+      "grad_norm": 0.024860792301576635,
+      "learning_rate": 0.0002911500558557245,
+      "loss": 0.4918,
+      "step": 26530
+    },
+    {
+      "epoch": 1.3471208640580776,
+      "grad_norm": 0.02411891556809714,
+      "learning_rate": 0.0002909488011773603,
+      "loss": 0.48,
+      "step": 26535
+    },
+    {
+      "epoch": 1.3473747001561092,
+      "grad_norm": 0.02334451935870803,
+      "learning_rate": 0.000290747587534704,
+      "loss": 0.5067,
+      "step": 26540
+    },
+    {
+      "epoch": 1.3476285362541407,
+      "grad_norm": 0.033294686217091356,
+      "learning_rate": 0.00029054641496725276,
+      "loss": 0.533,
+      "step": 26545
+    },
+    {
+      "epoch": 1.3478823723521722,
+      "grad_norm": 0.021364746646164178,
+      "learning_rate": 0.00029034528351449564,
+      "loss": 0.4972,
+      "step": 26550
+    },
+    {
+      "epoch": 1.3481362084502038,
+      "grad_norm": 0.022844057569169195,
+      "learning_rate": 0.00029014419321591396,
+      "loss": 0.4839,
+      "step": 26555
+    },
+    {
+      "epoch": 1.348390044548235,
+      "grad_norm": 0.021739041832296423,
+      "learning_rate": 0.00028994314411098044,
+      "loss": 0.4683,
+      "step": 26560
+    },
+    {
+      "epoch": 1.3486438806462666,
+      "grad_norm": 0.022308989962987445,
+      "learning_rate": 0.00028974213623916037,
+      "loss": 0.4752,
+      "step": 26565
+    },
+    {
+      "epoch": 1.3488977167442981,
+      "grad_norm": 0.022250254963704218,
+      "learning_rate": 0.0002895411696399102,
+      "loss": 0.5067,
+      "step": 26570
+    },
+    {
+      "epoch": 1.3491515528423297,
+      "grad_norm": 0.02323169518050859,
+      "learning_rate": 0.000289340244352679,
+      "loss": 0.4824,
+      "step": 26575
+    },
+    {
+      "epoch": 1.3494053889403612,
+      "grad_norm": 0.021304787629742503,
+      "learning_rate": 0.00028913936041690715,
+      "loss": 0.4832,
+      "step": 26580
+    },
+    {
+      "epoch": 1.3496592250383928,
+      "grad_norm": 0.021154944633414424,
+      "learning_rate": 0.00028893851787202746,
+      "loss": 0.496,
+      "step": 26585
+    },
+    {
+      "epoch": 1.3499130611364243,
+      "grad_norm": 0.023742114405543396,
+      "learning_rate": 0.00028873771675746394,
+      "loss": 0.4846,
+      "step": 26590
+    },
+    {
+      "epoch": 1.3501668972344558,
+      "grad_norm": 0.02312754487239046,
+      "learning_rate": 0.0002885369571126333,
+      "loss": 0.4775,
+      "step": 26595
+    },
+    {
+      "epoch": 1.3504207333324871,
+      "grad_norm": 0.026159799755686802,
+      "learning_rate": 0.000288336238976943,
+      "loss": 0.5369,
+      "step": 26600
+    },
+    {
+      "epoch": 1.3506745694305187,
+      "grad_norm": 0.022936903925025648,
+      "learning_rate": 0.00028813556238979377,
+      "loss": 0.5077,
+      "step": 26605
+    },
+    {
+      "epoch": 1.3509284055285502,
+      "grad_norm": 0.023814084119461076,
+      "learning_rate": 0.000287934927390577,
+      "loss": 0.4698,
+      "step": 26610
+    },
+    {
+      "epoch": 1.3511822416265817,
+      "grad_norm": 0.02246864872428122,
+      "learning_rate": 0.0002877343340186765,
+      "loss": 0.482,
+      "step": 26615
+    },
+    {
+      "epoch": 1.3514360777246133,
+      "grad_norm": 0.02288321097342424,
+      "learning_rate": 0.0002875337823134675,
+      "loss": 0.5256,
+      "step": 26620
+    },
+    {
+      "epoch": 1.3516899138226446,
+      "grad_norm": 0.0224047370327657,
+      "learning_rate": 0.0002873332723143177,
+      "loss": 0.5193,
+      "step": 26625
+    },
+    {
+      "epoch": 1.3519437499206761,
+      "grad_norm": 0.025203167907080704,
+      "learning_rate": 0.00028713280406058575,
+      "loss": 0.484,
+      "step": 26630
+    },
+    {
+      "epoch": 1.3521975860187077,
+      "grad_norm": 0.020720560042865592,
+      "learning_rate": 0.00028693237759162295,
+      "loss": 0.4852,
+      "step": 26635
+    },
+    {
+      "epoch": 1.3524514221167392,
+      "grad_norm": 0.021481598559433032,
+      "learning_rate": 0.0002867319929467717,
+      "loss": 0.5115,
+      "step": 26640
+    },
+    {
+      "epoch": 1.3527052582147707,
+      "grad_norm": 0.022428252435808805,
+      "learning_rate": 0.0002865316501653669,
+      "loss": 0.5024,
+      "step": 26645
+    },
+    {
+      "epoch": 1.3529590943128023,
+      "grad_norm": 0.02233749023643697,
+      "learning_rate": 0.0002863313492867344,
+      "loss": 0.476,
+      "step": 26650
+    },
+    {
+      "epoch": 1.3532129304108338,
+      "grad_norm": 0.03464327071026982,
+      "learning_rate": 0.0002861310903501926,
+      "loss": 0.5206,
+      "step": 26655
+    },
+    {
+      "epoch": 1.3534667665088653,
+      "grad_norm": 0.02542767368623182,
+      "learning_rate": 0.0002859308733950511,
+      "loss": 0.4785,
+      "step": 26660
+    },
+    {
+      "epoch": 1.3537206026068969,
+      "grad_norm": 0.021672839236436318,
+      "learning_rate": 0.0002857306984606115,
+      "loss": 0.4777,
+      "step": 26665
+    },
+    {
+      "epoch": 1.3539744387049282,
+      "grad_norm": 0.021627780540600057,
+      "learning_rate": 0.0002855305655861675,
+      "loss": 0.4673,
+      "step": 26670
+    },
+    {
+      "epoch": 1.3542282748029597,
+      "grad_norm": 0.02290542274140012,
+      "learning_rate": 0.0002853304748110037,
+      "loss": 0.4879,
+      "step": 26675
+    },
+    {
+      "epoch": 1.3544821109009912,
+      "grad_norm": 0.024948536996439105,
+      "learning_rate": 0.00028513042617439734,
+      "loss": 0.4777,
+      "step": 26680
+    },
+    {
+      "epoch": 1.3547359469990228,
+      "grad_norm": 0.029359162803752524,
+      "learning_rate": 0.0002849304197156166,
+      "loss": 0.5148,
+      "step": 26685
+    },
+    {
+      "epoch": 1.3549897830970543,
+      "grad_norm": 0.023302687409291374,
+      "learning_rate": 0.00028473045547392205,
+      "loss": 0.482,
+      "step": 26690
+    },
+    {
+      "epoch": 1.3552436191950856,
+      "grad_norm": 0.025299148598432575,
+      "learning_rate": 0.0002845305334885654,
+      "loss": 0.51,
+      "step": 26695
+    },
+    {
+      "epoch": 1.3554974552931172,
+      "grad_norm": 0.02516023607604002,
+      "learning_rate": 0.0002843306537987906,
+      "loss": 0.4369,
+      "step": 26700
+    },
+    {
+      "epoch": 1.3557512913911487,
+      "grad_norm": 0.022672487670563547,
+      "learning_rate": 0.00028413081644383285,
+      "loss": 0.4883,
+      "step": 26705
+    },
+    {
+      "epoch": 1.3560051274891802,
+      "grad_norm": 0.02464631035571871,
+      "learning_rate": 0.0002839310214629194,
+      "loss": 0.5134,
+      "step": 26710
+    },
+    {
+      "epoch": 1.3562589635872118,
+      "grad_norm": 0.021789322301810896,
+      "learning_rate": 0.00028373126889526875,
+      "loss": 0.4862,
+      "step": 26715
+    },
+    {
+      "epoch": 1.3565127996852433,
+      "grad_norm": 0.021183030059470056,
+      "learning_rate": 0.0002835315587800914,
+      "loss": 0.4785,
+      "step": 26720
+    },
+    {
+      "epoch": 1.3567666357832748,
+      "grad_norm": 0.02477154579365883,
+      "learning_rate": 0.00028333189115658966,
+      "loss": 0.4824,
+      "step": 26725
+    },
+    {
+      "epoch": 1.3570204718813064,
+      "grad_norm": 0.023028212328728488,
+      "learning_rate": 0.0002831322660639573,
+      "loss": 0.5141,
+      "step": 26730
+    },
+    {
+      "epoch": 1.3572743079793377,
+      "grad_norm": 0.03275446295412662,
+      "learning_rate": 0.0002829326835413794,
+      "loss": 0.4805,
+      "step": 26735
+    },
+    {
+      "epoch": 1.3575281440773692,
+      "grad_norm": 0.02501122701564825,
+      "learning_rate": 0.00028273314362803337,
+      "loss": 0.4855,
+      "step": 26740
+    },
+    {
+      "epoch": 1.3577819801754007,
+      "grad_norm": 0.02328137101092054,
+      "learning_rate": 0.0002825336463630875,
+      "loss": 0.4999,
+      "step": 26745
+    },
+    {
+      "epoch": 1.3580358162734323,
+      "grad_norm": 0.026605008466514972,
+      "learning_rate": 0.0002823341917857027,
+      "loss": 0.4587,
+      "step": 26750
+    },
+    {
+      "epoch": 1.3582896523714638,
+      "grad_norm": 0.023150456810902106,
+      "learning_rate": 0.0002821347799350302,
+      "loss": 0.4723,
+      "step": 26755
+    },
+    {
+      "epoch": 1.3585434884694951,
+      "grad_norm": 0.02251804410714866,
+      "learning_rate": 0.00028193541085021423,
+      "loss": 0.501,
+      "step": 26760
+    },
+    {
+      "epoch": 1.3587973245675267,
+      "grad_norm": 0.029335610284536826,
+      "learning_rate": 0.00028173608457038936,
+      "loss": 0.5047,
+      "step": 26765
+    },
+    {
+      "epoch": 1.3590511606655582,
+      "grad_norm": 0.022188907618175026,
+      "learning_rate": 0.0002815368011346828,
+      "loss": 0.4884,
+      "step": 26770
+    },
+    {
+      "epoch": 1.3593049967635897,
+      "grad_norm": 0.021239459035322466,
+      "learning_rate": 0.00028133756058221253,
+      "loss": 0.4655,
+      "step": 26775
+    },
+    {
+      "epoch": 1.3595588328616213,
+      "grad_norm": 0.022946805788639644,
+      "learning_rate": 0.0002811383629520887,
+      "loss": 0.4871,
+      "step": 26780
+    },
+    {
+      "epoch": 1.3598126689596528,
+      "grad_norm": 0.023284296559063916,
+      "learning_rate": 0.0002809392082834129,
+      "loss": 0.4965,
+      "step": 26785
+    },
+    {
+      "epoch": 1.3600665050576843,
+      "grad_norm": 0.028648125257779124,
+      "learning_rate": 0.0002807400966152778,
+      "loss": 0.4915,
+      "step": 26790
+    },
+    {
+      "epoch": 1.3603203411557159,
+      "grad_norm": 0.02302269054921378,
+      "learning_rate": 0.0002805410279867686,
+      "loss": 0.459,
+      "step": 26795
+    },
+    {
+      "epoch": 1.3605741772537472,
+      "grad_norm": 0.030405240018795764,
+      "learning_rate": 0.0002803420024369609,
+      "loss": 0.4316,
+      "step": 26800
+    },
+    {
+      "epoch": 1.3608280133517787,
+      "grad_norm": 0.030800198523568283,
+      "learning_rate": 0.00028014302000492285,
+      "loss": 0.4752,
+      "step": 26805
+    },
+    {
+      "epoch": 1.3610818494498103,
+      "grad_norm": 0.02475384060074233,
+      "learning_rate": 0.00027994408072971346,
+      "loss": 0.4718,
+      "step": 26810
+    },
+    {
+      "epoch": 1.3613356855478418,
+      "grad_norm": 0.021700406170621297,
+      "learning_rate": 0.0002797451846503837,
+      "loss": 0.4708,
+      "step": 26815
+    },
+    {
+      "epoch": 1.3615895216458733,
+      "grad_norm": 0.021235969952080012,
+      "learning_rate": 0.00027954633180597564,
+      "loss": 0.495,
+      "step": 26820
+    },
+    {
+      "epoch": 1.3618433577439046,
+      "grad_norm": 0.024028737162551318,
+      "learning_rate": 0.00027934752223552343,
+      "loss": 0.4563,
+      "step": 26825
+    },
+    {
+      "epoch": 1.3620971938419362,
+      "grad_norm": 0.024465335502800833,
+      "learning_rate": 0.0002791487559780521,
+      "loss": 0.4787,
+      "step": 26830
+    },
+    {
+      "epoch": 1.3623510299399677,
+      "grad_norm": 0.022491243446900795,
+      "learning_rate": 0.00027895003307257867,
+      "loss": 0.4787,
+      "step": 26835
+    },
+    {
+      "epoch": 1.3626048660379992,
+      "grad_norm": 0.021133211476665054,
+      "learning_rate": 0.000278751353558111,
+      "loss": 0.4931,
+      "step": 26840
+    },
+    {
+      "epoch": 1.3628587021360308,
+      "grad_norm": 0.02049442999020091,
+      "learning_rate": 0.00027855271747364966,
+      "loss": 0.4676,
+      "step": 26845
+    },
+    {
+      "epoch": 1.3631125382340623,
+      "grad_norm": 0.08243543665279841,
+      "learning_rate": 0.00027835412485818534,
+      "loss": 0.4618,
+      "step": 26850
+    },
+    {
+      "epoch": 1.3633663743320938,
+      "grad_norm": 0.03544850817167854,
+      "learning_rate": 0.00027815557575070117,
+      "loss": 0.4583,
+      "step": 26855
+    },
+    {
+      "epoch": 1.3636202104301254,
+      "grad_norm": 0.020365646726838955,
+      "learning_rate": 0.0002779570701901709,
+      "loss": 0.4487,
+      "step": 26860
+    },
+    {
+      "epoch": 1.3638740465281567,
+      "grad_norm": 0.02299060460650672,
+      "learning_rate": 0.0002777586082155607,
+      "loss": 0.5215,
+      "step": 26865
+    },
+    {
+      "epoch": 1.3641278826261882,
+      "grad_norm": 0.02293172060847605,
+      "learning_rate": 0.00027756018986582715,
+      "loss": 0.4658,
+      "step": 26870
+    },
+    {
+      "epoch": 1.3643817187242198,
+      "grad_norm": 0.024224482941596973,
+      "learning_rate": 0.00027736181517991923,
+      "loss": 0.4582,
+      "step": 26875
+    },
+    {
+      "epoch": 1.3646355548222513,
+      "grad_norm": 0.02013654142332293,
+      "learning_rate": 0.0002771634841967767,
+      "loss": 0.5103,
+      "step": 26880
+    },
+    {
+      "epoch": 1.3648893909202828,
+      "grad_norm": 0.022624447257192006,
+      "learning_rate": 0.00027696519695533074,
+      "loss": 0.5027,
+      "step": 26885
+    },
+    {
+      "epoch": 1.3651432270183141,
+      "grad_norm": 0.023533731523000142,
+      "learning_rate": 0.00027676695349450456,
+      "loss": 0.4481,
+      "step": 26890
+    },
+    {
+      "epoch": 1.3653970631163457,
+      "grad_norm": 0.023179531374566843,
+      "learning_rate": 0.0002765687538532119,
+      "loss": 0.4483,
+      "step": 26895
+    },
+    {
+      "epoch": 1.3656508992143772,
+      "grad_norm": 0.024281101734940795,
+      "learning_rate": 0.0002763705980703586,
+      "loss": 0.4812,
+      "step": 26900
+    },
+    {
+      "epoch": 1.3659047353124087,
+      "grad_norm": 0.026283033797865986,
+      "learning_rate": 0.0002761724861848417,
+      "loss": 0.491,
+      "step": 26905
+    },
+    {
+      "epoch": 1.3661585714104403,
+      "grad_norm": 0.02323522667080857,
+      "learning_rate": 0.0002759744182355498,
+      "loss": 0.4649,
+      "step": 26910
+    },
+    {
+      "epoch": 1.3664124075084718,
+      "grad_norm": 0.021854167824604177,
+      "learning_rate": 0.00027577639426136204,
+      "loss": 0.5056,
+      "step": 26915
+    },
+    {
+      "epoch": 1.3666662436065034,
+      "grad_norm": 0.023141909554007044,
+      "learning_rate": 0.00027557841430115015,
+      "loss": 0.4835,
+      "step": 26920
+    },
+    {
+      "epoch": 1.3669200797045349,
+      "grad_norm": 0.025826171421759428,
+      "learning_rate": 0.0002753804783937762,
+      "loss": 0.5056,
+      "step": 26925
+    },
+    {
+      "epoch": 1.3671739158025664,
+      "grad_norm": 0.022488613024276922,
+      "learning_rate": 0.0002751825865780943,
+      "loss": 0.4655,
+      "step": 26930
+    },
+    {
+      "epoch": 1.3674277519005977,
+      "grad_norm": 0.023927541727324465,
+      "learning_rate": 0.0002749847388929493,
+      "loss": 0.4865,
+      "step": 26935
+    },
+    {
+      "epoch": 1.3676815879986293,
+      "grad_norm": 0.022655438670871445,
+      "learning_rate": 0.0002747869353771781,
+      "loss": 0.4848,
+      "step": 26940
+    },
+    {
+      "epoch": 1.3679354240966608,
+      "grad_norm": 0.022064396845660302,
+      "learning_rate": 0.0002745891760696082,
+      "loss": 0.5048,
+      "step": 26945
+    },
+    {
+      "epoch": 1.3681892601946923,
+      "grad_norm": 0.026679064333024728,
+      "learning_rate": 0.0002743914610090591,
+      "loss": 0.4827,
+      "step": 26950
+    },
+    {
+      "epoch": 1.3684430962927239,
+      "grad_norm": 0.027571108833935626,
+      "learning_rate": 0.0002741937902343409,
+      "loss": 0.4845,
+      "step": 26955
+    },
+    {
+      "epoch": 1.3686969323907552,
+      "grad_norm": 0.025110754378475025,
+      "learning_rate": 0.0002739961637842555,
+      "loss": 0.4623,
+      "step": 26960
+    },
+    {
+      "epoch": 1.3689507684887867,
+      "grad_norm": 0.0203989644652357,
+      "learning_rate": 0.0002737985816975963,
+      "loss": 0.5092,
+      "step": 26965
+    },
+    {
+      "epoch": 1.3692046045868183,
+      "grad_norm": 0.02200896596387883,
+      "learning_rate": 0.00027360104401314735,
+      "loss": 0.4924,
+      "step": 26970
+    },
+    {
+      "epoch": 1.3694584406848498,
+      "grad_norm": 0.027141315110277984,
+      "learning_rate": 0.0002734035507696845,
+      "loss": 0.4874,
+      "step": 26975
+    },
+    {
+      "epoch": 1.3697122767828813,
+      "grad_norm": 0.02848772652258403,
+      "learning_rate": 0.0002732061020059745,
+      "loss": 0.5233,
+      "step": 26980
+    },
+    {
+      "epoch": 1.3699661128809129,
+      "grad_norm": 0.022711189059726297,
+      "learning_rate": 0.00027300869776077574,
+      "loss": 0.5153,
+      "step": 26985
+    },
+    {
+      "epoch": 1.3702199489789444,
+      "grad_norm": 0.023955919803751433,
+      "learning_rate": 0.0002728113380728375,
+      "loss": 0.4977,
+      "step": 26990
+    },
+    {
+      "epoch": 1.370473785076976,
+      "grad_norm": 0.028659928908923184,
+      "learning_rate": 0.0002726140229809008,
+      "loss": 0.515,
+      "step": 26995
+    },
+    {
+      "epoch": 1.3707276211750072,
+      "grad_norm": 0.02508581003225146,
+      "learning_rate": 0.00027241675252369715,
+      "loss": 0.4582,
+      "step": 27000
+    },
+    {
+      "epoch": 1.3709814572730388,
+      "grad_norm": 0.021436214718563537,
+      "learning_rate": 0.0002722195267399502,
+      "loss": 0.4612,
+      "step": 27005
+    },
+    {
+      "epoch": 1.3712352933710703,
+      "grad_norm": 0.02201171442218941,
+      "learning_rate": 0.00027202234566837415,
+      "loss": 0.4995,
+      "step": 27010
+    },
+    {
+      "epoch": 1.3714891294691018,
+      "grad_norm": 0.02526187332817724,
+      "learning_rate": 0.0002718252093476748,
+      "loss": 0.4808,
+      "step": 27015
+    },
+    {
+      "epoch": 1.3717429655671334,
+      "grad_norm": 0.020953262767424676,
+      "learning_rate": 0.0002716281178165486,
+      "loss": 0.4999,
+      "step": 27020
+    },
+    {
+      "epoch": 1.3719968016651647,
+      "grad_norm": 0.022858740436538003,
+      "learning_rate": 0.00027143107111368437,
+      "loss": 0.5066,
+      "step": 27025
+    },
+    {
+      "epoch": 1.3722506377631962,
+      "grad_norm": 0.030228302005709733,
+      "learning_rate": 0.00027123406927776085,
+      "loss": 0.4618,
+      "step": 27030
+    },
+    {
+      "epoch": 1.3725044738612278,
+      "grad_norm": 0.021247447506756632,
+      "learning_rate": 0.0002710371123474488,
+      "loss": 0.4838,
+      "step": 27035
+    },
+    {
+      "epoch": 1.3727583099592593,
+      "grad_norm": 0.029048074402705838,
+      "learning_rate": 0.00027084020036140965,
+      "loss": 0.4537,
+      "step": 27040
+    },
+    {
+      "epoch": 1.3730121460572908,
+      "grad_norm": 0.020897232940356406,
+      "learning_rate": 0.00027064333335829647,
+      "loss": 0.4661,
+      "step": 27045
+    },
+    {
+      "epoch": 1.3732659821553224,
+      "grad_norm": 0.024458604155040357,
+      "learning_rate": 0.00027044651137675304,
+      "loss": 0.4854,
+      "step": 27050
+    },
+    {
+      "epoch": 1.373519818253354,
+      "grad_norm": 0.021452148822171557,
+      "learning_rate": 0.00027024973445541475,
+      "loss": 0.4756,
+      "step": 27055
+    },
+    {
+      "epoch": 1.3737736543513854,
+      "grad_norm": 0.02638487068742648,
+      "learning_rate": 0.00027005300263290764,
+      "loss": 0.5049,
+      "step": 27060
+    },
+    {
+      "epoch": 1.3740274904494167,
+      "grad_norm": 0.030588138064835756,
+      "learning_rate": 0.00026985631594784966,
+      "loss": 0.4904,
+      "step": 27065
+    },
+    {
+      "epoch": 1.3742813265474483,
+      "grad_norm": 0.021448138541180863,
+      "learning_rate": 0.0002696596744388488,
+      "loss": 0.4687,
+      "step": 27070
+    },
+    {
+      "epoch": 1.3745351626454798,
+      "grad_norm": 0.0220331732837399,
+      "learning_rate": 0.0002694630781445054,
+      "loss": 0.5052,
+      "step": 27075
+    },
+    {
+      "epoch": 1.3747889987435113,
+      "grad_norm": 0.021848289466681034,
+      "learning_rate": 0.0002692665271034099,
+      "loss": 0.4823,
+      "step": 27080
+    },
+    {
+      "epoch": 1.3750428348415429,
+      "grad_norm": 0.024123987652464134,
+      "learning_rate": 0.00026907002135414447,
+      "loss": 0.4641,
+      "step": 27085
+    },
+    {
+      "epoch": 1.3752966709395742,
+      "grad_norm": 0.020535646841589324,
+      "learning_rate": 0.00026887356093528237,
+      "loss": 0.4648,
+      "step": 27090
+    },
+    {
+      "epoch": 1.3755505070376057,
+      "grad_norm": 0.02584026244252551,
+      "learning_rate": 0.00026867714588538747,
+      "loss": 0.5047,
+      "step": 27095
+    },
+    {
+      "epoch": 1.3758043431356373,
+      "grad_norm": 0.02367328497332991,
+      "learning_rate": 0.00026848077624301537,
+      "loss": 0.4565,
+      "step": 27100
+    },
+    {
+      "epoch": 1.3760581792336688,
+      "grad_norm": 0.031745274615003184,
+      "learning_rate": 0.00026828445204671216,
+      "loss": 0.4953,
+      "step": 27105
+    },
+    {
+      "epoch": 1.3763120153317003,
+      "grad_norm": 0.029581043724073622,
+      "learning_rate": 0.0002680881733350156,
+      "loss": 0.4612,
+      "step": 27110
+    },
+    {
+      "epoch": 1.3765658514297319,
+      "grad_norm": 0.025514285969248228,
+      "learning_rate": 0.0002678919401464539,
+      "loss": 0.4811,
+      "step": 27115
+    },
+    {
+      "epoch": 1.3768196875277634,
+      "grad_norm": 0.023296903129213998,
+      "learning_rate": 0.00026769575251954703,
+      "loss": 0.5023,
+      "step": 27120
+    },
+    {
+      "epoch": 1.377073523625795,
+      "grad_norm": 0.02227739230704745,
+      "learning_rate": 0.00026749961049280527,
+      "loss": 0.4709,
+      "step": 27125
+    },
+    {
+      "epoch": 1.3773273597238263,
+      "grad_norm": 0.023504231198353757,
+      "learning_rate": 0.0002673035141047306,
+      "loss": 0.4861,
+      "step": 27130
+    },
+    {
+      "epoch": 1.3775811958218578,
+      "grad_norm": 0.02229075980390175,
+      "learning_rate": 0.0002671074633938156,
+      "loss": 0.4808,
+      "step": 27135
+    },
+    {
+      "epoch": 1.3778350319198893,
+      "grad_norm": 0.024181315546383808,
+      "learning_rate": 0.00026691145839854405,
+      "loss": 0.4894,
+      "step": 27140
+    },
+    {
+      "epoch": 1.3780888680179209,
+      "grad_norm": 0.02532811062819618,
+      "learning_rate": 0.00026671549915739076,
+      "loss": 0.472,
+      "step": 27145
+    },
+    {
+      "epoch": 1.3783427041159524,
+      "grad_norm": 0.021472103926306933,
+      "learning_rate": 0.0002665195857088218,
+      "loss": 0.4636,
+      "step": 27150
+    },
+    {
+      "epoch": 1.3785965402139837,
+      "grad_norm": 0.028858062964877782,
+      "learning_rate": 0.0002663237180912936,
+      "loss": 0.4687,
+      "step": 27155
+    },
+    {
+      "epoch": 1.3788503763120152,
+      "grad_norm": 0.022973674462700357,
+      "learning_rate": 0.0002661278963432544,
+      "loss": 0.4748,
+      "step": 27160
+    },
+    {
+      "epoch": 1.3791042124100468,
+      "grad_norm": 0.024592376476516513,
+      "learning_rate": 0.00026593212050314265,
+      "loss": 0.4917,
+      "step": 27165
+    },
+    {
+      "epoch": 1.3793580485080783,
+      "grad_norm": 0.026905170435774242,
+      "learning_rate": 0.0002657363906093886,
+      "loss": 0.461,
+      "step": 27170
+    },
+    {
+      "epoch": 1.3796118846061098,
+      "grad_norm": 0.03238266587023234,
+      "learning_rate": 0.0002655407067004125,
+      "loss": 0.4955,
+      "step": 27175
+    },
+    {
+      "epoch": 1.3798657207041414,
+      "grad_norm": 0.03437683326100972,
+      "learning_rate": 0.00026534506881462674,
+      "loss": 0.4659,
+      "step": 27180
+    },
+    {
+      "epoch": 1.380119556802173,
+      "grad_norm": 0.030372754755464044,
+      "learning_rate": 0.0002651494769904335,
+      "loss": 0.4952,
+      "step": 27185
+    },
+    {
+      "epoch": 1.3803733929002044,
+      "grad_norm": 0.029874387863824136,
+      "learning_rate": 0.00026495393126622685,
+      "loss": 0.4846,
+      "step": 27190
+    },
+    {
+      "epoch": 1.3806272289982358,
+      "grad_norm": 0.0267554731125212,
+      "learning_rate": 0.00026475843168039117,
+      "loss": 0.4723,
+      "step": 27195
+    },
+    {
+      "epoch": 1.3808810650962673,
+      "grad_norm": 0.020690270070420125,
+      "learning_rate": 0.0002645629782713022,
+      "loss": 0.4799,
+      "step": 27200
+    },
+    {
+      "epoch": 1.3811349011942988,
+      "grad_norm": 0.02838109794666923,
+      "learning_rate": 0.00026436757107732665,
+      "loss": 0.4916,
+      "step": 27205
+    },
+    {
+      "epoch": 1.3813887372923304,
+      "grad_norm": 0.02358589235650878,
+      "learning_rate": 0.0002641722101368217,
+      "loss": 0.4953,
+      "step": 27210
+    },
+    {
+      "epoch": 1.381642573390362,
+      "grad_norm": 0.02842708954146116,
+      "learning_rate": 0.000263976895488136,
+      "loss": 0.5203,
+      "step": 27215
+    },
+    {
+      "epoch": 1.3818964094883932,
+      "grad_norm": 0.0220954764308833,
+      "learning_rate": 0.0002637816271696084,
+      "loss": 0.4997,
+      "step": 27220
+    },
+    {
+      "epoch": 1.3821502455864247,
+      "grad_norm": 0.031197550414260204,
+      "learning_rate": 0.0002635864052195696,
+      "loss": 0.5016,
+      "step": 27225
+    },
+    {
+      "epoch": 1.3824040816844563,
+      "grad_norm": 0.02454468885248768,
+      "learning_rate": 0.00026339122967634026,
+      "loss": 0.499,
+      "step": 27230
+    },
+    {
+      "epoch": 1.3826579177824878,
+      "grad_norm": 0.05778464299883465,
+      "learning_rate": 0.0002631961005782328,
+      "loss": 0.4341,
+      "step": 27235
+    },
+    {
+      "epoch": 1.3829117538805193,
+      "grad_norm": 0.023244866350785674,
+      "learning_rate": 0.00026300101796354966,
+      "loss": 0.4571,
+      "step": 27240
+    },
+    {
+      "epoch": 1.3831655899785509,
+      "grad_norm": 0.06533569452053184,
+      "learning_rate": 0.0002628059818705849,
+      "loss": 0.471,
+      "step": 27245
+    },
+    {
+      "epoch": 1.3834194260765824,
+      "grad_norm": 0.026721629818407002,
+      "learning_rate": 0.00026261099233762286,
+      "loss": 0.448,
+      "step": 27250
+    },
+    {
+      "epoch": 1.383673262174614,
+      "grad_norm": 0.023742554699451898,
+      "learning_rate": 0.0002624160494029394,
+      "loss": 0.4868,
+      "step": 27255
+    },
+    {
+      "epoch": 1.3839270982726455,
+      "grad_norm": 0.028301790837454648,
+      "learning_rate": 0.0002622211531048004,
+      "loss": 0.4682,
+      "step": 27260
+    },
+    {
+      "epoch": 1.3841809343706768,
+      "grad_norm": 0.025666697236186767,
+      "learning_rate": 0.0002620263034814632,
+      "loss": 0.4725,
+      "step": 27265
+    },
+    {
+      "epoch": 1.3844347704687083,
+      "grad_norm": 0.02634404140327674,
+      "learning_rate": 0.00026183150057117595,
+      "loss": 0.4929,
+      "step": 27270
+    },
+    {
+      "epoch": 1.3846886065667399,
+      "grad_norm": 0.025227814246512378,
+      "learning_rate": 0.0002616367444121775,
+      "loss": 0.4846,
+      "step": 27275
+    },
+    {
+      "epoch": 1.3849424426647714,
+      "grad_norm": 0.028927076349626928,
+      "learning_rate": 0.0002614420350426973,
+      "loss": 0.4822,
+      "step": 27280
+    },
+    {
+      "epoch": 1.385196278762803,
+      "grad_norm": 0.023294755705656967,
+      "learning_rate": 0.00026124737250095596,
+      "loss": 0.4622,
+      "step": 27285
+    },
+    {
+      "epoch": 1.3854501148608342,
+      "grad_norm": 0.023578850146889208,
+      "learning_rate": 0.0002610527568251647,
+      "loss": 0.457,
+      "step": 27290
+    },
+    {
+      "epoch": 1.3857039509588658,
+      "grad_norm": 0.021005770334938684,
+      "learning_rate": 0.0002608581880535258,
+      "loss": 0.4816,
+      "step": 27295
+    },
+    {
+      "epoch": 1.3859577870568973,
+      "grad_norm": 0.022374150226942276,
+      "learning_rate": 0.00026066366622423177,
+      "loss": 0.4843,
+      "step": 27300
+    },
+    {
+      "epoch": 1.3862116231549289,
+      "grad_norm": 0.027829807992719183,
+      "learning_rate": 0.0002604691913754668,
+      "loss": 0.4921,
+      "step": 27305
+    },
+    {
+      "epoch": 1.3864654592529604,
+      "grad_norm": 0.021996056034670426,
+      "learning_rate": 0.0002602747635454047,
+      "loss": 0.5174,
+      "step": 27310
+    },
+    {
+      "epoch": 1.386719295350992,
+      "grad_norm": 0.021135081168996664,
+      "learning_rate": 0.00026008038277221127,
+      "loss": 0.4732,
+      "step": 27315
+    },
+    {
+      "epoch": 1.3869731314490235,
+      "grad_norm": 0.040553027428796726,
+      "learning_rate": 0.0002598860490940419,
+      "loss": 0.4869,
+      "step": 27320
+    },
+    {
+      "epoch": 1.387226967547055,
+      "grad_norm": 0.02726713196600684,
+      "learning_rate": 0.0002596917625490438,
+      "loss": 0.4646,
+      "step": 27325
+    },
+    {
+      "epoch": 1.3874808036450863,
+      "grad_norm": 0.030567319881294505,
+      "learning_rate": 0.0002594975231753544,
+      "loss": 0.4909,
+      "step": 27330
+    },
+    {
+      "epoch": 1.3877346397431178,
+      "grad_norm": 0.02274111194508241,
+      "learning_rate": 0.00025930333101110173,
+      "loss": 0.476,
+      "step": 27335
+    },
+    {
+      "epoch": 1.3879884758411494,
+      "grad_norm": 0.02044291002475312,
+      "learning_rate": 0.0002591091860944049,
+      "loss": 0.5066,
+      "step": 27340
+    },
+    {
+      "epoch": 1.388242311939181,
+      "grad_norm": 0.028846129971471627,
+      "learning_rate": 0.00025891508846337337,
+      "loss": 0.4598,
+      "step": 27345
+    },
+    {
+      "epoch": 1.3884961480372124,
+      "grad_norm": 0.024004903325607403,
+      "learning_rate": 0.00025872103815610794,
+      "loss": 0.4617,
+      "step": 27350
+    },
+    {
+      "epoch": 1.3887499841352438,
+      "grad_norm": 0.02506170048370503,
+      "learning_rate": 0.0002585270352106992,
+      "loss": 0.4712,
+      "step": 27355
+    },
+    {
+      "epoch": 1.3890038202332753,
+      "grad_norm": 0.02408913428393671,
+      "learning_rate": 0.0002583330796652294,
+      "loss": 0.4632,
+      "step": 27360
+    },
+    {
+      "epoch": 1.3892576563313068,
+      "grad_norm": 0.022166652995046117,
+      "learning_rate": 0.0002581391715577707,
+      "loss": 0.4792,
+      "step": 27365
+    },
+    {
+      "epoch": 1.3895114924293384,
+      "grad_norm": 0.034847220934961225,
+      "learning_rate": 0.00025794531092638667,
+      "loss": 0.4824,
+      "step": 27370
+    },
+    {
+      "epoch": 1.38976532852737,
+      "grad_norm": 0.02419671012650832,
+      "learning_rate": 0.0002577514978091308,
+      "loss": 0.5055,
+      "step": 27375
+    },
+    {
+      "epoch": 1.3900191646254014,
+      "grad_norm": 0.02417967296068721,
+      "learning_rate": 0.000257557732244048,
+      "loss": 0.4874,
+      "step": 27380
+    },
+    {
+      "epoch": 1.390273000723433,
+      "grad_norm": 0.022712157728031215,
+      "learning_rate": 0.00025736401426917286,
+      "loss": 0.4702,
+      "step": 27385
+    },
+    {
+      "epoch": 1.3905268368214645,
+      "grad_norm": 0.027626729533074428,
+      "learning_rate": 0.0002571703439225322,
+      "loss": 0.4904,
+      "step": 27390
+    },
+    {
+      "epoch": 1.3907806729194958,
+      "grad_norm": 0.023010404248220272,
+      "learning_rate": 0.00025697672124214176,
+      "loss": 0.4923,
+      "step": 27395
+    },
+    {
+      "epoch": 1.3910345090175273,
+      "grad_norm": 0.021117714170819323,
+      "learning_rate": 0.00025678314626600924,
+      "loss": 0.4616,
+      "step": 27400
+    },
+    {
+      "epoch": 1.3912883451155589,
+      "grad_norm": 0.023323314097295,
+      "learning_rate": 0.00025658961903213197,
+      "loss": 0.4779,
+      "step": 27405
+    },
+    {
+      "epoch": 1.3915421812135904,
+      "grad_norm": 0.03386308270675858,
+      "learning_rate": 0.0002563961395784987,
+      "loss": 0.4701,
+      "step": 27410
+    },
+    {
+      "epoch": 1.391796017311622,
+      "grad_norm": 0.023072249782764793,
+      "learning_rate": 0.0002562027079430883,
+      "loss": 0.4628,
+      "step": 27415
+    },
+    {
+      "epoch": 1.3920498534096533,
+      "grad_norm": 0.026329411232554476,
+      "learning_rate": 0.0002560093241638707,
+      "loss": 0.5117,
+      "step": 27420
+    },
+    {
+      "epoch": 1.3923036895076848,
+      "grad_norm": 0.02564234053698023,
+      "learning_rate": 0.00025581598827880575,
+      "loss": 0.4713,
+      "step": 27425
+    },
+    {
+      "epoch": 1.3925575256057163,
+      "grad_norm": 0.026068964054194645,
+      "learning_rate": 0.0002556227003258448,
+      "loss": 0.4654,
+      "step": 27430
+    },
+    {
+      "epoch": 1.3928113617037479,
+      "grad_norm": 0.020125692575617058,
+      "learning_rate": 0.0002554294603429288,
+      "loss": 0.4917,
+      "step": 27435
+    },
+    {
+      "epoch": 1.3930651978017794,
+      "grad_norm": 0.02566386436456717,
+      "learning_rate": 0.0002552362683679903,
+      "loss": 0.491,
+      "step": 27440
+    },
+    {
+      "epoch": 1.393319033899811,
+      "grad_norm": 0.022596381640967816,
+      "learning_rate": 0.0002550431244389515,
+      "loss": 0.4924,
+      "step": 27445
+    },
+    {
+      "epoch": 1.3935728699978425,
+      "grad_norm": 0.05194383248908799,
+      "learning_rate": 0.00025485002859372574,
+      "loss": 0.472,
+      "step": 27450
+    },
+    {
+      "epoch": 1.393826706095874,
+      "grad_norm": 0.0222046493642593,
+      "learning_rate": 0.00025465698087021705,
+      "loss": 0.4746,
+      "step": 27455
+    },
+    {
+      "epoch": 1.3940805421939053,
+      "grad_norm": 0.02186007738670582,
+      "learning_rate": 0.0002544639813063193,
+      "loss": 0.4409,
+      "step": 27460
+    },
+    {
+      "epoch": 1.3943343782919368,
+      "grad_norm": 0.023367690449110124,
+      "learning_rate": 0.0002542710299399177,
+      "loss": 0.4573,
+      "step": 27465
+    },
+    {
+      "epoch": 1.3945882143899684,
+      "grad_norm": 0.02272755185104169,
+      "learning_rate": 0.00025407812680888726,
+      "loss": 0.4495,
+      "step": 27470
+    },
+    {
+      "epoch": 1.394842050488,
+      "grad_norm": 0.024020085751695088,
+      "learning_rate": 0.0002538852719510943,
+      "loss": 0.4779,
+      "step": 27475
+    },
+    {
+      "epoch": 1.3950958865860315,
+      "grad_norm": 0.0218924915046459,
+      "learning_rate": 0.00025369246540439495,
+      "loss": 0.4737,
+      "step": 27480
+    },
+    {
+      "epoch": 1.3953497226840628,
+      "grad_norm": 0.02276957102386517,
+      "learning_rate": 0.00025349970720663653,
+      "loss": 0.5073,
+      "step": 27485
+    },
+    {
+      "epoch": 1.3956035587820943,
+      "grad_norm": 0.02384126314470508,
+      "learning_rate": 0.000253306997395656,
+      "loss": 0.4972,
+      "step": 27490
+    },
+    {
+      "epoch": 1.3958573948801258,
+      "grad_norm": 0.03572583730853441,
+      "learning_rate": 0.00025311433600928184,
+      "loss": 0.4611,
+      "step": 27495
+    },
+    {
+      "epoch": 1.3961112309781574,
+      "grad_norm": 0.7840304155881027,
+      "learning_rate": 0.00025292172308533214,
+      "loss": 0.4962,
+      "step": 27500
+    },
+    {
+      "epoch": 1.396365067076189,
+      "grad_norm": 0.0463916296322995,
+      "learning_rate": 0.000252729158661616,
+      "loss": 0.4765,
+      "step": 27505
+    },
+    {
+      "epoch": 1.3966189031742204,
+      "grad_norm": 0.05568946088292396,
+      "learning_rate": 0.0002525366427759329,
+      "loss": 0.4964,
+      "step": 27510
+    },
+    {
+      "epoch": 1.396872739272252,
+      "grad_norm": 0.023057160898420713,
+      "learning_rate": 0.00025234417546607293,
+      "loss": 0.4928,
+      "step": 27515
+    },
+    {
+      "epoch": 1.3971265753702835,
+      "grad_norm": 0.029288129095807625,
+      "learning_rate": 0.000252151756769816,
+      "loss": 0.4895,
+      "step": 27520
+    },
+    {
+      "epoch": 1.397380411468315,
+      "grad_norm": 0.03340361260423104,
+      "learning_rate": 0.00025195938672493344,
+      "loss": 0.479,
+      "step": 27525
+    },
+    {
+      "epoch": 1.3976342475663464,
+      "grad_norm": 0.03739192421428975,
+      "learning_rate": 0.0002517670653691861,
+      "loss": 0.4901,
+      "step": 27530
+    },
+    {
+      "epoch": 1.397888083664378,
+      "grad_norm": 0.023015539091900444,
+      "learning_rate": 0.0002515747927403261,
+      "loss": 0.457,
+      "step": 27535
+    },
+    {
+      "epoch": 1.3981419197624094,
+      "grad_norm": 0.024683773093099548,
+      "learning_rate": 0.00025138256887609513,
+      "loss": 0.4845,
+      "step": 27540
+    },
+    {
+      "epoch": 1.398395755860441,
+      "grad_norm": 0.03275767392537276,
+      "learning_rate": 0.0002511903938142263,
+      "loss": 0.4641,
+      "step": 27545
+    },
+    {
+      "epoch": 1.3986495919584725,
+      "grad_norm": 0.02455035950939294,
+      "learning_rate": 0.0002509982675924421,
+      "loss": 0.4641,
+      "step": 27550
+    },
+    {
+      "epoch": 1.3989034280565038,
+      "grad_norm": 0.021707198486694775,
+      "learning_rate": 0.00025080619024845643,
+      "loss": 0.482,
+      "step": 27555
+    },
+    {
+      "epoch": 1.3991572641545353,
+      "grad_norm": 0.022054949788400096,
+      "learning_rate": 0.0002506141618199727,
+      "loss": 0.4771,
+      "step": 27560
+    },
+    {
+      "epoch": 1.3994111002525669,
+      "grad_norm": 0.023827476345478014,
+      "learning_rate": 0.0002504221823446853,
+      "loss": 0.4608,
+      "step": 27565
+    },
+    {
+      "epoch": 1.3996649363505984,
+      "grad_norm": 0.028504264487398272,
+      "learning_rate": 0.00025023025186027905,
+      "loss": 0.4909,
+      "step": 27570
+    },
+    {
+      "epoch": 1.39991877244863,
+      "grad_norm": 0.029743187667604372,
+      "learning_rate": 0.0002500383704044286,
+      "loss": 0.4695,
+      "step": 27575
+    },
+    {
+      "epoch": 1.4001726085466615,
+      "grad_norm": 0.024573032907380145,
+      "learning_rate": 0.00024984653801479967,
+      "loss": 0.4768,
+      "step": 27580
+    },
+    {
+      "epoch": 1.400426444644693,
+      "grad_norm": 0.04851132365758049,
+      "learning_rate": 0.0002496547547290476,
+      "loss": 0.4658,
+      "step": 27585
+    },
+    {
+      "epoch": 1.4006802807427245,
+      "grad_norm": 0.03611595038168912,
+      "learning_rate": 0.0002494630205848189,
+      "loss": 0.4891,
+      "step": 27590
+    },
+    {
+      "epoch": 1.4009341168407559,
+      "grad_norm": 0.020312909657943287,
+      "learning_rate": 0.0002492713356197497,
+      "loss": 0.4731,
+      "step": 27595
+    },
+    {
+      "epoch": 1.4011879529387874,
+      "grad_norm": 0.02051178256643885,
+      "learning_rate": 0.0002490796998714671,
+      "loss": 0.4633,
+      "step": 27600
+    },
+    {
+      "epoch": 1.401441789036819,
+      "grad_norm": 0.028805213882367668,
+      "learning_rate": 0.0002488881133775878,
+      "loss": 0.4689,
+      "step": 27605
+    },
+    {
+      "epoch": 1.4016956251348505,
+      "grad_norm": 0.03456140699472926,
+      "learning_rate": 0.00024869657617571984,
+      "loss": 0.4732,
+      "step": 27610
+    },
+    {
+      "epoch": 1.401949461232882,
+      "grad_norm": 0.02567267995131493,
+      "learning_rate": 0.00024850508830346046,
+      "loss": 0.4893,
+      "step": 27615
+    },
+    {
+      "epoch": 1.4022032973309133,
+      "grad_norm": 0.027690736913673394,
+      "learning_rate": 0.0002483136497983983,
+      "loss": 0.4773,
+      "step": 27620
+    },
+    {
+      "epoch": 1.4024571334289448,
+      "grad_norm": 0.02059800948274391,
+      "learning_rate": 0.00024812226069811114,
+      "loss": 0.4533,
+      "step": 27625
+    },
+    {
+      "epoch": 1.4027109695269764,
+      "grad_norm": 0.022577872784147853,
+      "learning_rate": 0.00024793092104016844,
+      "loss": 0.4989,
+      "step": 27630
+    },
+    {
+      "epoch": 1.402964805625008,
+      "grad_norm": 0.02564370891867458,
+      "learning_rate": 0.00024773963086212867,
+      "loss": 0.483,
+      "step": 27635
+    },
+    {
+      "epoch": 1.4032186417230395,
+      "grad_norm": 0.022783404442849806,
+      "learning_rate": 0.0002475483902015416,
+      "loss": 0.4517,
+      "step": 27640
+    },
+    {
+      "epoch": 1.403472477821071,
+      "grad_norm": 0.02130850865861562,
+      "learning_rate": 0.00024735719909594635,
+      "loss": 0.5042,
+      "step": 27645
+    },
+    {
+      "epoch": 1.4037263139191025,
+      "grad_norm": 0.025642889265787623,
+      "learning_rate": 0.00024716605758287315,
+      "loss": 0.4993,
+      "step": 27650
+    },
+    {
+      "epoch": 1.403980150017134,
+      "grad_norm": 0.022945742341174302,
+      "learning_rate": 0.00024697496569984177,
+      "loss": 0.4853,
+      "step": 27655
+    },
+    {
+      "epoch": 1.4042339861151654,
+      "grad_norm": 0.022770860368356186,
+      "learning_rate": 0.000246783923484363,
+      "loss": 0.467,
+      "step": 27660
+    },
+    {
+      "epoch": 1.404487822213197,
+      "grad_norm": 0.022660091235407458,
+      "learning_rate": 0.0002465929309739371,
+      "loss": 0.4698,
+      "step": 27665
+    },
+    {
+      "epoch": 1.4047416583112284,
+      "grad_norm": 0.027042451768303048,
+      "learning_rate": 0.0002464019882060553,
+      "loss": 0.4554,
+      "step": 27670
+    },
+    {
+      "epoch": 1.40499549440926,
+      "grad_norm": 0.02246177180890777,
+      "learning_rate": 0.0002462110952181982,
+      "loss": 0.4776,
+      "step": 27675
+    },
+    {
+      "epoch": 1.4052493305072915,
+      "grad_norm": 0.022416923064172367,
+      "learning_rate": 0.0002460202520478378,
+      "loss": 0.4547,
+      "step": 27680
+    },
+    {
+      "epoch": 1.4055031666053228,
+      "grad_norm": 0.02287798343042281,
+      "learning_rate": 0.0002458294587324351,
+      "loss": 0.4598,
+      "step": 27685
+    },
+    {
+      "epoch": 1.4057570027033544,
+      "grad_norm": 0.02035412219965513,
+      "learning_rate": 0.0002456387153094421,
+      "loss": 0.4872,
+      "step": 27690
+    },
+    {
+      "epoch": 1.4060108388013859,
+      "grad_norm": 0.029570301391272168,
+      "learning_rate": 0.000245448021816301,
+      "loss": 0.4461,
+      "step": 27695
+    },
+    {
+      "epoch": 1.4062646748994174,
+      "grad_norm": 0.024728980525368392,
+      "learning_rate": 0.00024525737829044354,
+      "loss": 0.4797,
+      "step": 27700
+    },
+    {
+      "epoch": 1.406518510997449,
+      "grad_norm": 0.022333792784804983,
+      "learning_rate": 0.0002450667847692925,
+      "loss": 0.5004,
+      "step": 27705
+    },
+    {
+      "epoch": 1.4067723470954805,
+      "grad_norm": 0.02145010095522035,
+      "learning_rate": 0.00024487624129026017,
+      "loss": 0.4473,
+      "step": 27710
+    },
+    {
+      "epoch": 1.407026183193512,
+      "grad_norm": 0.02758397602988774,
+      "learning_rate": 0.00024468574789074946,
+      "loss": 0.4705,
+      "step": 27715
+    },
+    {
+      "epoch": 1.4072800192915436,
+      "grad_norm": 0.033109491790319426,
+      "learning_rate": 0.000244495304608153,
+      "loss": 0.4633,
+      "step": 27720
+    },
+    {
+      "epoch": 1.4075338553895749,
+      "grad_norm": 0.029190273291780672,
+      "learning_rate": 0.0002443049114798543,
+      "loss": 0.4585,
+      "step": 27725
+    },
+    {
+      "epoch": 1.4077876914876064,
+      "grad_norm": 0.03253325936543185,
+      "learning_rate": 0.00024411456854322612,
+      "loss": 0.4757,
+      "step": 27730
+    },
+    {
+      "epoch": 1.408041527585638,
+      "grad_norm": 0.020506343909216858,
+      "learning_rate": 0.0002439242758356322,
+      "loss": 0.4797,
+      "step": 27735
+    },
+    {
+      "epoch": 1.4082953636836695,
+      "grad_norm": 0.026935301152816532,
+      "learning_rate": 0.0002437340333944257,
+      "loss": 0.4494,
+      "step": 27740
+    },
+    {
+      "epoch": 1.408549199781701,
+      "grad_norm": 0.029552183804907328,
+      "learning_rate": 0.00024354384125695045,
+      "loss": 0.4962,
+      "step": 27745
+    },
+    {
+      "epoch": 1.4088030358797323,
+      "grad_norm": 0.027966829528624507,
+      "learning_rate": 0.00024335369946054027,
+      "loss": 0.4627,
+      "step": 27750
+    },
+    {
+      "epoch": 1.4090568719777639,
+      "grad_norm": 0.020488294841671674,
+      "learning_rate": 0.00024316360804251907,
+      "loss": 0.4783,
+      "step": 27755
+    },
+    {
+      "epoch": 1.4093107080757954,
+      "grad_norm": 0.02500272619569665,
+      "learning_rate": 0.0002429735670402007,
+      "loss": 0.4523,
+      "step": 27760
+    },
+    {
+      "epoch": 1.409564544173827,
+      "grad_norm": 0.03524132852415086,
+      "learning_rate": 0.00024278357649088945,
+      "loss": 0.4832,
+      "step": 27765
+    },
+    {
+      "epoch": 1.4098183802718585,
+      "grad_norm": 0.029454084316621682,
+      "learning_rate": 0.00024259363643187922,
+      "loss": 0.4866,
+      "step": 27770
+    },
+    {
+      "epoch": 1.41007221636989,
+      "grad_norm": 0.0230214354412747,
+      "learning_rate": 0.00024240374690045468,
+      "loss": 0.4832,
+      "step": 27775
+    },
+    {
+      "epoch": 1.4103260524679215,
+      "grad_norm": 0.022547181126946717,
+      "learning_rate": 0.00024221390793388977,
+      "loss": 0.4898,
+      "step": 27780
+    },
+    {
+      "epoch": 1.410579888565953,
+      "grad_norm": 0.023719925710689885,
+      "learning_rate": 0.00024202411956944937,
+      "loss": 0.4859,
+      "step": 27785
+    },
+    {
+      "epoch": 1.4108337246639846,
+      "grad_norm": 0.026253153450593418,
+      "learning_rate": 0.00024183438184438761,
+      "loss": 0.4943,
+      "step": 27790
+    },
+    {
+      "epoch": 1.411087560762016,
+      "grad_norm": 0.023581771288596627,
+      "learning_rate": 0.00024164469479594935,
+      "loss": 0.5134,
+      "step": 27795
+    },
+    {
+      "epoch": 1.4113413968600474,
+      "grad_norm": 0.024469131177750587,
+      "learning_rate": 0.00024145505846136895,
+      "loss": 0.4991,
+      "step": 27800
+    },
+    {
+      "epoch": 1.411595232958079,
+      "grad_norm": 0.03535425997247597,
+      "learning_rate": 0.0002412654728778712,
+      "loss": 0.4964,
+      "step": 27805
+    },
+    {
+      "epoch": 1.4118490690561105,
+      "grad_norm": 0.024175285871346007,
+      "learning_rate": 0.00024107593808267102,
+      "loss": 0.471,
+      "step": 27810
+    },
+    {
+      "epoch": 1.412102905154142,
+      "grad_norm": 0.023755148541426182,
+      "learning_rate": 0.00024088645411297273,
+      "loss": 0.4849,
+      "step": 27815
+    },
+    {
+      "epoch": 1.4123567412521734,
+      "grad_norm": 0.02516434829571276,
+      "learning_rate": 0.00024069702100597146,
+      "loss": 0.467,
+      "step": 27820
+    },
+    {
+      "epoch": 1.412610577350205,
+      "grad_norm": 0.023488420926309095,
+      "learning_rate": 0.00024050763879885167,
+      "loss": 0.482,
+      "step": 27825
+    },
+    {
+      "epoch": 1.4128644134482364,
+      "grad_norm": 0.022254767215497136,
+      "learning_rate": 0.00024031830752878854,
+      "loss": 0.5116,
+      "step": 27830
+    },
+    {
+      "epoch": 1.413118249546268,
+      "grad_norm": 0.02191194664116013,
+      "learning_rate": 0.00024012902723294632,
+      "loss": 0.4826,
+      "step": 27835
+    },
+    {
+      "epoch": 1.4133720856442995,
+      "grad_norm": 0.03718887288280671,
+      "learning_rate": 0.00023993979794848037,
+      "loss": 0.4901,
+      "step": 27840
+    },
+    {
+      "epoch": 1.413625921742331,
+      "grad_norm": 0.028432289745428645,
+      "learning_rate": 0.00023975061971253492,
+      "loss": 0.4806,
+      "step": 27845
+    },
+    {
+      "epoch": 1.4138797578403626,
+      "grad_norm": 0.023502679374965474,
+      "learning_rate": 0.00023956149256224512,
+      "loss": 0.4374,
+      "step": 27850
+    },
+    {
+      "epoch": 1.414133593938394,
+      "grad_norm": 0.02184921161134165,
+      "learning_rate": 0.0002393724165347354,
+      "loss": 0.4712,
+      "step": 27855
+    },
+    {
+      "epoch": 1.4143874300364254,
+      "grad_norm": 0.02284152025331955,
+      "learning_rate": 0.0002391833916671207,
+      "loss": 0.5026,
+      "step": 27860
+    },
+    {
+      "epoch": 1.414641266134457,
+      "grad_norm": 0.02065393654716191,
+      "learning_rate": 0.0002389944179965052,
+      "loss": 0.4614,
+      "step": 27865
+    },
+    {
+      "epoch": 1.4148951022324885,
+      "grad_norm": 0.021188395785789092,
+      "learning_rate": 0.00023880549555998416,
+      "loss": 0.4681,
+      "step": 27870
+    },
+    {
+      "epoch": 1.41514893833052,
+      "grad_norm": 0.02498643430523538,
+      "learning_rate": 0.00023861662439464155,
+      "loss": 0.5116,
+      "step": 27875
+    },
+    {
+      "epoch": 1.4154027744285516,
+      "grad_norm": 0.03642276120823365,
+      "learning_rate": 0.00023842780453755231,
+      "loss": 0.479,
+      "step": 27880
+    },
+    {
+      "epoch": 1.4156566105265829,
+      "grad_norm": 0.021274402113326943,
+      "learning_rate": 0.00023823903602578035,
+      "loss": 0.4518,
+      "step": 27885
+    },
+    {
+      "epoch": 1.4159104466246144,
+      "grad_norm": 0.02378943254532682,
+      "learning_rate": 0.0002380503188963804,
+      "loss": 0.5052,
+      "step": 27890
+    },
+    {
+      "epoch": 1.416164282722646,
+      "grad_norm": 0.026776271663452332,
+      "learning_rate": 0.00023786165318639635,
+      "loss": 0.4953,
+      "step": 27895
+    },
+    {
+      "epoch": 1.4164181188206775,
+      "grad_norm": 0.022203192350251453,
+      "learning_rate": 0.00023767303893286262,
+      "loss": 0.4893,
+      "step": 27900
+    },
+    {
+      "epoch": 1.416671954918709,
+      "grad_norm": 0.0235156458738656,
+      "learning_rate": 0.00023748447617280322,
+      "loss": 0.4461,
+      "step": 27905
+    },
+    {
+      "epoch": 1.4169257910167405,
+      "grad_norm": 0.021958486852655536,
+      "learning_rate": 0.00023729596494323173,
+      "loss": 0.4653,
+      "step": 27910
+    },
+    {
+      "epoch": 1.417179627114772,
+      "grad_norm": 0.0289039207384805,
+      "learning_rate": 0.00023710750528115244,
+      "loss": 0.4816,
+      "step": 27915
+    },
+    {
+      "epoch": 1.4174334632128036,
+      "grad_norm": 0.024505561379556843,
+      "learning_rate": 0.00023691909722355864,
+      "loss": 0.475,
+      "step": 27920
+    },
+    {
+      "epoch": 1.417687299310835,
+      "grad_norm": 0.021676083599215912,
+      "learning_rate": 0.00023673074080743405,
+      "loss": 0.4999,
+      "step": 27925
+    },
+    {
+      "epoch": 1.4179411354088665,
+      "grad_norm": 0.027958626700585656,
+      "learning_rate": 0.00023654243606975213,
+      "loss": 0.512,
+      "step": 27930
+    },
+    {
+      "epoch": 1.418194971506898,
+      "grad_norm": 0.024897726465202892,
+      "learning_rate": 0.0002363541830474763,
+      "loss": 0.5286,
+      "step": 27935
+    },
+    {
+      "epoch": 1.4184488076049295,
+      "grad_norm": 0.02113190762244004,
+      "learning_rate": 0.00023616598177755938,
+      "loss": 0.4781,
+      "step": 27940
+    },
+    {
+      "epoch": 1.418702643702961,
+      "grad_norm": 0.020658914881807294,
+      "learning_rate": 0.0002359778322969447,
+      "loss": 0.476,
+      "step": 27945
+    },
+    {
+      "epoch": 1.4189564798009924,
+      "grad_norm": 0.02016987097475399,
+      "learning_rate": 0.00023578973464256464,
+      "loss": 0.5082,
+      "step": 27950
+    },
+    {
+      "epoch": 1.419210315899024,
+      "grad_norm": 0.025221457362809504,
+      "learning_rate": 0.0002356016888513423,
+      "loss": 0.4956,
+      "step": 27955
+    },
+    {
+      "epoch": 1.4194641519970554,
+      "grad_norm": 0.027710307385371332,
+      "learning_rate": 0.00023541369496018967,
+      "loss": 0.4782,
+      "step": 27960
+    },
+    {
+      "epoch": 1.419717988095087,
+      "grad_norm": 0.019837189877263843,
+      "learning_rate": 0.0002352257530060094,
+      "loss": 0.4673,
+      "step": 27965
+    },
+    {
+      "epoch": 1.4199718241931185,
+      "grad_norm": 0.022705298855318596,
+      "learning_rate": 0.00023503786302569318,
+      "loss": 0.4746,
+      "step": 27970
+    },
+    {
+      "epoch": 1.42022566029115,
+      "grad_norm": 0.030143315594230304,
+      "learning_rate": 0.0002348500250561233,
+      "loss": 0.4531,
+      "step": 27975
+    },
+    {
+      "epoch": 1.4204794963891816,
+      "grad_norm": 0.02996052140446814,
+      "learning_rate": 0.00023466223913417105,
+      "loss": 0.4657,
+      "step": 27980
+    },
+    {
+      "epoch": 1.4207333324872131,
+      "grad_norm": 0.02310648055769472,
+      "learning_rate": 0.00023447450529669796,
+      "loss": 0.4884,
+      "step": 27985
+    },
+    {
+      "epoch": 1.4209871685852444,
+      "grad_norm": 0.029314484029678897,
+      "learning_rate": 0.00023428682358055553,
+      "loss": 0.4989,
+      "step": 27990
+    },
+    {
+      "epoch": 1.421241004683276,
+      "grad_norm": 0.0252418080632575,
+      "learning_rate": 0.00023409919402258433,
+      "loss": 0.4721,
+      "step": 27995
+    },
+    {
+      "epoch": 1.4214948407813075,
+      "grad_norm": 0.023141843733552995,
+      "learning_rate": 0.00023391161665961546,
+      "loss": 0.4946,
+      "step": 28000
+    },
+    {
+      "epoch": 1.421748676879339,
+      "grad_norm": 0.02792804961488345,
+      "learning_rate": 0.00023372409152846912,
+      "loss": 0.4681,
+      "step": 28005
+    },
+    {
+      "epoch": 1.4220025129773706,
+      "grad_norm": 0.029553149663017995,
+      "learning_rate": 0.00023353661866595582,
+      "loss": 0.5152,
+      "step": 28010
+    },
+    {
+      "epoch": 1.4222563490754019,
+      "grad_norm": 0.020892682323818024,
+      "learning_rate": 0.00023334919810887527,
+      "loss": 0.4211,
+      "step": 28015
+    },
+    {
+      "epoch": 1.4225101851734334,
+      "grad_norm": 0.02707479709867003,
+      "learning_rate": 0.0002331618298940176,
+      "loss": 0.4819,
+      "step": 28020
+    },
+    {
+      "epoch": 1.422764021271465,
+      "grad_norm": 0.02176754481264695,
+      "learning_rate": 0.00023297451405816173,
+      "loss": 0.4723,
+      "step": 28025
+    },
+    {
+      "epoch": 1.4230178573694965,
+      "grad_norm": 0.02889767627713131,
+      "learning_rate": 0.00023278725063807733,
+      "loss": 0.4705,
+      "step": 28030
+    },
+    {
+      "epoch": 1.423271693467528,
+      "grad_norm": 0.020395619422563103,
+      "learning_rate": 0.0002326000396705228,
+      "loss": 0.4619,
+      "step": 28035
+    },
+    {
+      "epoch": 1.4235255295655596,
+      "grad_norm": 0.02300164782732942,
+      "learning_rate": 0.0002324128811922472,
+      "loss": 0.4976,
+      "step": 28040
+    },
+    {
+      "epoch": 1.423779365663591,
+      "grad_norm": 0.026291959890614048,
+      "learning_rate": 0.00023222577523998816,
+      "loss": 0.473,
+      "step": 28045
+    },
+    {
+      "epoch": 1.4240332017616226,
+      "grad_norm": 0.020485251522475535,
+      "learning_rate": 0.00023203872185047442,
+      "loss": 0.4657,
+      "step": 28050
+    },
+    {
+      "epoch": 1.4242870378596542,
+      "grad_norm": 0.025828189579312467,
+      "learning_rate": 0.00023185172106042308,
+      "loss": 0.4625,
+      "step": 28055
+    },
+    {
+      "epoch": 1.4245408739576855,
+      "grad_norm": 0.02218487881694467,
+      "learning_rate": 0.00023166477290654185,
+      "loss": 0.4819,
+      "step": 28060
+    },
+    {
+      "epoch": 1.424794710055717,
+      "grad_norm": 0.0286179264415249,
+      "learning_rate": 0.00023147787742552734,
+      "loss": 0.4737,
+      "step": 28065
+    },
+    {
+      "epoch": 1.4250485461537485,
+      "grad_norm": 0.0228564657619892,
+      "learning_rate": 0.00023129103465406654,
+      "loss": 0.4672,
+      "step": 28070
+    },
+    {
+      "epoch": 1.42530238225178,
+      "grad_norm": 0.029588254076403645,
+      "learning_rate": 0.00023110424462883538,
+      "loss": 0.49,
+      "step": 28075
+    },
+    {
+      "epoch": 1.4255562183498114,
+      "grad_norm": 0.0214582359967556,
+      "learning_rate": 0.00023091750738650024,
+      "loss": 0.4618,
+      "step": 28080
+    },
+    {
+      "epoch": 1.425810054447843,
+      "grad_norm": 0.032133974034998646,
+      "learning_rate": 0.00023073082296371628,
+      "loss": 0.4364,
+      "step": 28085
+    },
+    {
+      "epoch": 1.4260638905458745,
+      "grad_norm": 0.027651903639121147,
+      "learning_rate": 0.0002305441913971291,
+      "loss": 0.4692,
+      "step": 28090
+    },
+    {
+      "epoch": 1.426317726643906,
+      "grad_norm": 0.019039201847567744,
+      "learning_rate": 0.0002303576127233732,
+      "loss": 0.4859,
+      "step": 28095
+    },
+    {
+      "epoch": 1.4265715627419375,
+      "grad_norm": 0.02386583789317677,
+      "learning_rate": 0.0002301710869790734,
+      "loss": 0.4875,
+      "step": 28100
+    },
+    {
+      "epoch": 1.426825398839969,
+      "grad_norm": 0.025025245204414376,
+      "learning_rate": 0.00022998461420084342,
+      "loss": 0.5166,
+      "step": 28105
+    },
+    {
+      "epoch": 1.4270792349380006,
+      "grad_norm": 0.023420261634063733,
+      "learning_rate": 0.00022979819442528715,
+      "loss": 0.4759,
+      "step": 28110
+    },
+    {
+      "epoch": 1.4273330710360321,
+      "grad_norm": 0.02333034761205301,
+      "learning_rate": 0.00022961182768899797,
+      "loss": 0.48,
+      "step": 28115
+    },
+    {
+      "epoch": 1.4275869071340637,
+      "grad_norm": 0.02627854902400825,
+      "learning_rate": 0.00022942551402855839,
+      "loss": 0.4807,
+      "step": 28120
+    },
+    {
+      "epoch": 1.427840743232095,
+      "grad_norm": 0.021412030460517,
+      "learning_rate": 0.0002292392534805412,
+      "loss": 0.5039,
+      "step": 28125
+    },
+    {
+      "epoch": 1.4280945793301265,
+      "grad_norm": 0.022170572716743882,
+      "learning_rate": 0.0002290530460815082,
+      "loss": 0.4968,
+      "step": 28130
+    },
+    {
+      "epoch": 1.428348415428158,
+      "grad_norm": 0.025786423164189262,
+      "learning_rate": 0.00022886689186801113,
+      "loss": 0.4907,
+      "step": 28135
+    },
+    {
+      "epoch": 1.4286022515261896,
+      "grad_norm": 0.021201177325464542,
+      "learning_rate": 0.00022868079087659087,
+      "loss": 0.4856,
+      "step": 28140
+    },
+    {
+      "epoch": 1.4288560876242211,
+      "grad_norm": 0.02309415988605629,
+      "learning_rate": 0.0002284947431437785,
+      "loss": 0.4825,
+      "step": 28145
+    },
+    {
+      "epoch": 1.4291099237222524,
+      "grad_norm": 0.024799580659098996,
+      "learning_rate": 0.00022830874870609385,
+      "loss": 0.4728,
+      "step": 28150
+    },
+    {
+      "epoch": 1.429363759820284,
+      "grad_norm": 0.02544415478890754,
+      "learning_rate": 0.00022812280760004718,
+      "loss": 0.4449,
+      "step": 28155
+    },
+    {
+      "epoch": 1.4296175959183155,
+      "grad_norm": 0.022858736494680027,
+      "learning_rate": 0.00022793691986213726,
+      "loss": 0.4719,
+      "step": 28160
+    },
+    {
+      "epoch": 1.429871432016347,
+      "grad_norm": 0.02868557828847498,
+      "learning_rate": 0.00022775108552885336,
+      "loss": 0.4756,
+      "step": 28165
+    },
+    {
+      "epoch": 1.4301252681143786,
+      "grad_norm": 0.022645438963720982,
+      "learning_rate": 0.00022756530463667336,
+      "loss": 0.4726,
+      "step": 28170
+    },
+    {
+      "epoch": 1.43037910421241,
+      "grad_norm": 0.02751085893232087,
+      "learning_rate": 0.00022737957722206576,
+      "loss": 0.4698,
+      "step": 28175
+    },
+    {
+      "epoch": 1.4306329403104416,
+      "grad_norm": 0.02151280270290485,
+      "learning_rate": 0.00022719390332148743,
+      "loss": 0.4608,
+      "step": 28180
+    },
+    {
+      "epoch": 1.4308867764084732,
+      "grad_norm": 0.02606728512274907,
+      "learning_rate": 0.0002270082829713856,
+      "loss": 0.4721,
+      "step": 28185
+    },
+    {
+      "epoch": 1.4311406125065045,
+      "grad_norm": 0.023997247097793137,
+      "learning_rate": 0.00022682271620819622,
+      "loss": 0.4877,
+      "step": 28190
+    },
+    {
+      "epoch": 1.431394448604536,
+      "grad_norm": 0.022872074241439312,
+      "learning_rate": 0.00022663720306834544,
+      "loss": 0.4929,
+      "step": 28195
+    },
+    {
+      "epoch": 1.4316482847025676,
+      "grad_norm": 0.029600629396223117,
+      "learning_rate": 0.00022645174358824834,
+      "loss": 0.4875,
+      "step": 28200
+    },
+    {
+      "epoch": 1.431902120800599,
+      "grad_norm": 0.024623593274030287,
+      "learning_rate": 0.00022626633780430995,
+      "loss": 0.4856,
+      "step": 28205
+    },
+    {
+      "epoch": 1.4321559568986306,
+      "grad_norm": 0.02466052196985095,
+      "learning_rate": 0.00022608098575292412,
+      "loss": 0.4738,
+      "step": 28210
+    },
+    {
+      "epoch": 1.432409792996662,
+      "grad_norm": 0.02014319185333175,
+      "learning_rate": 0.00022589568747047496,
+      "loss": 0.4535,
+      "step": 28215
+    },
+    {
+      "epoch": 1.4326636290946935,
+      "grad_norm": 0.07392864104661222,
+      "learning_rate": 0.00022571044299333522,
+      "loss": 0.5205,
+      "step": 28220
+    },
+    {
+      "epoch": 1.432917465192725,
+      "grad_norm": 0.02808956874894487,
+      "learning_rate": 0.0002255252523578678,
+      "loss": 0.4641,
+      "step": 28225
+    },
+    {
+      "epoch": 1.4331713012907565,
+      "grad_norm": 0.023026002461065096,
+      "learning_rate": 0.0002253401156004244,
+      "loss": 0.4719,
+      "step": 28230
+    },
+    {
+      "epoch": 1.433425137388788,
+      "grad_norm": 0.025239834221944403,
+      "learning_rate": 0.00022515503275734655,
+      "loss": 0.4812,
+      "step": 28235
+    },
+    {
+      "epoch": 1.4336789734868196,
+      "grad_norm": 0.023740320525211498,
+      "learning_rate": 0.0002249700038649653,
+      "loss": 0.4871,
+      "step": 28240
+    },
+    {
+      "epoch": 1.4339328095848511,
+      "grad_norm": 0.022974485649644116,
+      "learning_rate": 0.00022478502895960056,
+      "loss": 0.5321,
+      "step": 28245
+    },
+    {
+      "epoch": 1.4341866456828827,
+      "grad_norm": 0.038073746528805776,
+      "learning_rate": 0.00022460010807756232,
+      "loss": 0.4879,
+      "step": 28250
+    },
+    {
+      "epoch": 1.434440481780914,
+      "grad_norm": 0.022813896482576042,
+      "learning_rate": 0.00022441524125514924,
+      "loss": 0.4661,
+      "step": 28255
+    },
+    {
+      "epoch": 1.4346943178789455,
+      "grad_norm": 0.023168119706696216,
+      "learning_rate": 0.0002242304285286501,
+      "loss": 0.4958,
+      "step": 28260
+    },
+    {
+      "epoch": 1.434948153976977,
+      "grad_norm": 0.02529122063862638,
+      "learning_rate": 0.0002240456699343425,
+      "loss": 0.477,
+      "step": 28265
+    },
+    {
+      "epoch": 1.4352019900750086,
+      "grad_norm": 0.022195490060621263,
+      "learning_rate": 0.00022386096550849384,
+      "loss": 0.4525,
+      "step": 28270
+    },
+    {
+      "epoch": 1.4354558261730401,
+      "grad_norm": 0.022086437502397378,
+      "learning_rate": 0.00022367631528736037,
+      "loss": 0.4679,
+      "step": 28275
+    },
+    {
+      "epoch": 1.4357096622710714,
+      "grad_norm": 0.02864867259943996,
+      "learning_rate": 0.00022349171930718836,
+      "loss": 0.4855,
+      "step": 28280
+    },
+    {
+      "epoch": 1.435963498369103,
+      "grad_norm": 0.0224610763647068,
+      "learning_rate": 0.0002233071776042127,
+      "loss": 0.4817,
+      "step": 28285
+    },
+    {
+      "epoch": 1.4362173344671345,
+      "grad_norm": 0.02349424018618673,
+      "learning_rate": 0.00022312269021465826,
+      "loss": 0.4559,
+      "step": 28290
+    },
+    {
+      "epoch": 1.436471170565166,
+      "grad_norm": 0.02339458889206688,
+      "learning_rate": 0.00022293825717473891,
+      "loss": 0.4929,
+      "step": 28295
+    },
+    {
+      "epoch": 1.4367250066631976,
+      "grad_norm": 0.031316045398563565,
+      "learning_rate": 0.0002227538785206582,
+      "loss": 0.5114,
+      "step": 28300
+    },
+    {
+      "epoch": 1.4369788427612291,
+      "grad_norm": 0.029663272075113593,
+      "learning_rate": 0.0002225695542886083,
+      "loss": 0.4813,
+      "step": 28305
+    },
+    {
+      "epoch": 1.4372326788592606,
+      "grad_norm": 0.01994990696285228,
+      "learning_rate": 0.00022238528451477152,
+      "loss": 0.4663,
+      "step": 28310
+    },
+    {
+      "epoch": 1.4374865149572922,
+      "grad_norm": 0.02117337091557514,
+      "learning_rate": 0.0002222010692353188,
+      "loss": 0.4757,
+      "step": 28315
+    },
+    {
+      "epoch": 1.4377403510553235,
+      "grad_norm": 0.027257286084898684,
+      "learning_rate": 0.00022201690848641092,
+      "loss": 0.465,
+      "step": 28320
+    },
+    {
+      "epoch": 1.437994187153355,
+      "grad_norm": 0.02068920674069367,
+      "learning_rate": 0.00022183280230419746,
+      "loss": 0.454,
+      "step": 28325
+    },
+    {
+      "epoch": 1.4382480232513866,
+      "grad_norm": 0.026482096977822,
+      "learning_rate": 0.00022164875072481788,
+      "loss": 0.4992,
+      "step": 28330
+    },
+    {
+      "epoch": 1.438501859349418,
+      "grad_norm": 0.025250420834823727,
+      "learning_rate": 0.00022146475378440018,
+      "loss": 0.4845,
+      "step": 28335
+    },
+    {
+      "epoch": 1.4387556954474496,
+      "grad_norm": 0.023124139028515872,
+      "learning_rate": 0.00022128081151906248,
+      "loss": 0.4981,
+      "step": 28340
+    },
+    {
+      "epoch": 1.439009531545481,
+      "grad_norm": 0.0250480716309079,
+      "learning_rate": 0.00022109692396491128,
+      "loss": 0.4932,
+      "step": 28345
+    },
+    {
+      "epoch": 1.4392633676435125,
+      "grad_norm": 0.025172649290414614,
+      "learning_rate": 0.00022091309115804305,
+      "loss": 0.4734,
+      "step": 28350
+    },
+    {
+      "epoch": 1.439517203741544,
+      "grad_norm": 0.023861858060389464,
+      "learning_rate": 0.0002207293131345434,
+      "loss": 0.4958,
+      "step": 28355
+    },
+    {
+      "epoch": 1.4397710398395756,
+      "grad_norm": 0.023276629706840232,
+      "learning_rate": 0.00022054558993048667,
+      "loss": 0.4775,
+      "step": 28360
+    },
+    {
+      "epoch": 1.440024875937607,
+      "grad_norm": 0.021929215795727805,
+      "learning_rate": 0.00022036192158193717,
+      "loss": 0.5146,
+      "step": 28365
+    },
+    {
+      "epoch": 1.4402787120356386,
+      "grad_norm": 0.022629914839638336,
+      "learning_rate": 0.00022017830812494778,
+      "loss": 0.4748,
+      "step": 28370
+    },
+    {
+      "epoch": 1.4405325481336702,
+      "grad_norm": 0.024040761607839467,
+      "learning_rate": 0.0002199947495955612,
+      "loss": 0.5049,
+      "step": 28375
+    },
+    {
+      "epoch": 1.4407863842317017,
+      "grad_norm": 0.02291771186966908,
+      "learning_rate": 0.00021981124602980868,
+      "loss": 0.4945,
+      "step": 28380
+    },
+    {
+      "epoch": 1.4410402203297332,
+      "grad_norm": 0.02136166148441168,
+      "learning_rate": 0.00021962779746371148,
+      "loss": 0.4428,
+      "step": 28385
+    },
+    {
+      "epoch": 1.4412940564277645,
+      "grad_norm": 0.02612556885350365,
+      "learning_rate": 0.0002194444039332792,
+      "loss": 0.4812,
+      "step": 28390
+    },
+    {
+      "epoch": 1.441547892525796,
+      "grad_norm": 0.03403548088384833,
+      "learning_rate": 0.00021926106547451153,
+      "loss": 0.5052,
+      "step": 28395
+    },
+    {
+      "epoch": 1.4418017286238276,
+      "grad_norm": 0.02777524166007245,
+      "learning_rate": 0.00021907778212339646,
+      "loss": 0.4847,
+      "step": 28400
+    },
+    {
+      "epoch": 1.4420555647218591,
+      "grad_norm": 0.02317959016446424,
+      "learning_rate": 0.00021889455391591197,
+      "loss": 0.4857,
+      "step": 28405
+    },
+    {
+      "epoch": 1.4423094008198907,
+      "grad_norm": 0.03177895171595999,
+      "learning_rate": 0.00021871138088802434,
+      "loss": 0.4558,
+      "step": 28410
+    },
+    {
+      "epoch": 1.442563236917922,
+      "grad_norm": 0.02126964198718204,
+      "learning_rate": 0.00021852826307569017,
+      "loss": 0.4506,
+      "step": 28415
+    },
+    {
+      "epoch": 1.4428170730159535,
+      "grad_norm": 0.024740968782442203,
+      "learning_rate": 0.00021834520051485412,
+      "loss": 0.4973,
+      "step": 28420
+    },
+    {
+      "epoch": 1.443070909113985,
+      "grad_norm": 0.027489399167317248,
+      "learning_rate": 0.00021816219324145082,
+      "loss": 0.4751,
+      "step": 28425
+    },
+    {
+      "epoch": 1.4433247452120166,
+      "grad_norm": 0.025374524663172544,
+      "learning_rate": 0.00021797924129140323,
+      "loss": 0.4764,
+      "step": 28430
+    },
+    {
+      "epoch": 1.4435785813100481,
+      "grad_norm": 0.02277464171649709,
+      "learning_rate": 0.00021779634470062433,
+      "loss": 0.4912,
+      "step": 28435
+    },
+    {
+      "epoch": 1.4438324174080797,
+      "grad_norm": 0.021400550142160534,
+      "learning_rate": 0.0002176135035050154,
+      "loss": 0.4559,
+      "step": 28440
+    },
+    {
+      "epoch": 1.4440862535061112,
+      "grad_norm": 0.03045404966662296,
+      "learning_rate": 0.00021743071774046768,
+      "loss": 0.5034,
+      "step": 28445
+    },
+    {
+      "epoch": 1.4443400896041427,
+      "grad_norm": 0.02407917586260191,
+      "learning_rate": 0.00021724798744286072,
+      "loss": 0.4785,
+      "step": 28450
+    },
+    {
+      "epoch": 1.444593925702174,
+      "grad_norm": 0.025205748983629185,
+      "learning_rate": 0.00021706531264806394,
+      "loss": 0.4874,
+      "step": 28455
+    },
+    {
+      "epoch": 1.4448477618002056,
+      "grad_norm": 0.02348829524564938,
+      "learning_rate": 0.00021688269339193513,
+      "loss": 0.4709,
+      "step": 28460
+    },
+    {
+      "epoch": 1.4451015978982371,
+      "grad_norm": 0.0232211398020149,
+      "learning_rate": 0.00021670012971032184,
+      "loss": 0.4774,
+      "step": 28465
+    },
+    {
+      "epoch": 1.4453554339962686,
+      "grad_norm": 0.024930973074259177,
+      "learning_rate": 0.00021651762163906008,
+      "loss": 0.4685,
+      "step": 28470
+    },
+    {
+      "epoch": 1.4456092700943002,
+      "grad_norm": 0.03092756571772325,
+      "learning_rate": 0.0002163351692139755,
+      "loss": 0.4734,
+      "step": 28475
+    },
+    {
+      "epoch": 1.4458631061923315,
+      "grad_norm": 0.022477760693391274,
+      "learning_rate": 0.00021615277247088278,
+      "loss": 0.4869,
+      "step": 28480
+    },
+    {
+      "epoch": 1.446116942290363,
+      "grad_norm": 0.03361966594571212,
+      "learning_rate": 0.00021597043144558505,
+      "loss": 0.4451,
+      "step": 28485
+    },
+    {
+      "epoch": 1.4463707783883946,
+      "grad_norm": 0.030322892044589482,
+      "learning_rate": 0.00021578814617387537,
+      "loss": 0.4828,
+      "step": 28490
+    },
+    {
+      "epoch": 1.446624614486426,
+      "grad_norm": 0.024900585317585298,
+      "learning_rate": 0.00021560591669153505,
+      "loss": 0.4799,
+      "step": 28495
+    },
+    {
+      "epoch": 1.4468784505844576,
+      "grad_norm": 0.022462012440346523,
+      "learning_rate": 0.00021542374303433522,
+      "loss": 0.4528,
+      "step": 28500
+    },
+    {
+      "epoch": 1.4471322866824892,
+      "grad_norm": 0.02385434946419265,
+      "learning_rate": 0.00021524162523803525,
+      "loss": 0.4762,
+      "step": 28505
+    },
+    {
+      "epoch": 1.4473861227805207,
+      "grad_norm": 0.0237899246854406,
+      "learning_rate": 0.00021505956333838432,
+      "loss": 0.5128,
+      "step": 28510
+    },
+    {
+      "epoch": 1.4476399588785522,
+      "grad_norm": 0.022093078514918426,
+      "learning_rate": 0.00021487755737111997,
+      "loss": 0.5007,
+      "step": 28515
+    },
+    {
+      "epoch": 1.4478937949765835,
+      "grad_norm": 0.03720387345715551,
+      "learning_rate": 0.00021469560737196936,
+      "loss": 0.4737,
+      "step": 28520
+    },
+    {
+      "epoch": 1.448147631074615,
+      "grad_norm": 0.02307666838695736,
+      "learning_rate": 0.00021451371337664803,
+      "loss": 0.4847,
+      "step": 28525
+    },
+    {
+      "epoch": 1.4484014671726466,
+      "grad_norm": 0.027202707719082012,
+      "learning_rate": 0.00021433187542086102,
+      "loss": 0.5096,
+      "step": 28530
+    },
+    {
+      "epoch": 1.4486553032706782,
+      "grad_norm": 0.02281167822321718,
+      "learning_rate": 0.0002141500935403023,
+      "loss": 0.4936,
+      "step": 28535
+    },
+    {
+      "epoch": 1.4489091393687097,
+      "grad_norm": 0.022981780525392213,
+      "learning_rate": 0.0002139683677706548,
+      "loss": 0.4721,
+      "step": 28540
+    },
+    {
+      "epoch": 1.449162975466741,
+      "grad_norm": 0.023971573526569936,
+      "learning_rate": 0.00021378669814759016,
+      "loss": 0.4788,
+      "step": 28545
+    },
+    {
+      "epoch": 1.4494168115647725,
+      "grad_norm": 0.0351687277665943,
+      "learning_rate": 0.00021360508470676947,
+      "loss": 0.4275,
+      "step": 28550
+    },
+    {
+      "epoch": 1.449670647662804,
+      "grad_norm": 0.030238615571759643,
+      "learning_rate": 0.00021342352748384224,
+      "loss": 0.4425,
+      "step": 28555
+    },
+    {
+      "epoch": 1.4499244837608356,
+      "grad_norm": 0.019177171938210256,
+      "learning_rate": 0.00021324202651444758,
+      "loss": 0.4624,
+      "step": 28560
+    },
+    {
+      "epoch": 1.4501783198588671,
+      "grad_norm": 0.023734475246235107,
+      "learning_rate": 0.00021306058183421289,
+      "loss": 0.4531,
+      "step": 28565
+    },
+    {
+      "epoch": 1.4504321559568987,
+      "grad_norm": 0.022972344810526544,
+      "learning_rate": 0.00021287919347875517,
+      "loss": 0.5048,
+      "step": 28570
+    },
+    {
+      "epoch": 1.4506859920549302,
+      "grad_norm": 0.022124511197820666,
+      "learning_rate": 0.00021269786148367975,
+      "loss": 0.4901,
+      "step": 28575
+    },
+    {
+      "epoch": 1.4509398281529617,
+      "grad_norm": 0.020604414986246704,
+      "learning_rate": 0.00021251658588458151,
+      "loss": 0.498,
+      "step": 28580
+    },
+    {
+      "epoch": 1.451193664250993,
+      "grad_norm": 0.03208564684499312,
+      "learning_rate": 0.00021233536671704363,
+      "loss": 0.4814,
+      "step": 28585
+    },
+    {
+      "epoch": 1.4514475003490246,
+      "grad_norm": 0.026818436879605508,
+      "learning_rate": 0.00021215420401663864,
+      "loss": 0.494,
+      "step": 28590
+    },
+    {
+      "epoch": 1.4517013364470561,
+      "grad_norm": 0.02278810185462568,
+      "learning_rate": 0.0002119730978189281,
+      "loss": 0.4588,
+      "step": 28595
+    },
+    {
+      "epoch": 1.4519551725450877,
+      "grad_norm": 0.03576004008501719,
+      "learning_rate": 0.0002117920481594619,
+      "loss": 0.4828,
+      "step": 28600
+    },
+    {
+      "epoch": 1.4522090086431192,
+      "grad_norm": 0.02300363802637254,
+      "learning_rate": 0.00021161105507377958,
+      "loss": 0.4625,
+      "step": 28605
+    },
+    {
+      "epoch": 1.4524628447411505,
+      "grad_norm": 0.021251968272071712,
+      "learning_rate": 0.00021143011859740875,
+      "loss": 0.4577,
+      "step": 28610
+    },
+    {
+      "epoch": 1.452716680839182,
+      "grad_norm": 0.02134933721672599,
+      "learning_rate": 0.00021124923876586672,
+      "loss": 0.4804,
+      "step": 28615
+    },
+    {
+      "epoch": 1.4529705169372136,
+      "grad_norm": 0.022574435169212356,
+      "learning_rate": 0.0002110684156146589,
+      "loss": 0.4659,
+      "step": 28620
+    },
+    {
+      "epoch": 1.453224353035245,
+      "grad_norm": 0.021844543840398207,
+      "learning_rate": 0.00021088764917928044,
+      "loss": 0.4765,
+      "step": 28625
+    },
+    {
+      "epoch": 1.4534781891332766,
+      "grad_norm": 0.022297227696848144,
+      "learning_rate": 0.0002107069394952144,
+      "loss": 0.4838,
+      "step": 28630
+    },
+    {
+      "epoch": 1.4537320252313082,
+      "grad_norm": 0.025232582647418608,
+      "learning_rate": 0.00021052628659793367,
+      "loss": 0.4793,
+      "step": 28635
+    },
+    {
+      "epoch": 1.4539858613293397,
+      "grad_norm": 0.02270811870174028,
+      "learning_rate": 0.00021034569052289908,
+      "loss": 0.4736,
+      "step": 28640
+    },
+    {
+      "epoch": 1.4542396974273712,
+      "grad_norm": 0.03638109948966619,
+      "learning_rate": 0.00021016515130556113,
+      "loss": 0.4767,
+      "step": 28645
+    },
+    {
+      "epoch": 1.4544935335254028,
+      "grad_norm": 0.020506178116723225,
+      "learning_rate": 0.0002099846689813582,
+      "loss": 0.4831,
+      "step": 28650
+    },
+    {
+      "epoch": 1.454747369623434,
+      "grad_norm": 0.02980987722615396,
+      "learning_rate": 0.0002098042435857188,
+      "loss": 0.4477,
+      "step": 28655
+    },
+    {
+      "epoch": 1.4550012057214656,
+      "grad_norm": 0.02633014121866591,
+      "learning_rate": 0.000209623875154059,
+      "loss": 0.4758,
+      "step": 28660
+    },
+    {
+      "epoch": 1.4552550418194972,
+      "grad_norm": 0.024442989213594755,
+      "learning_rate": 0.00020944356372178458,
+      "loss": 0.4776,
+      "step": 28665
+    },
+    {
+      "epoch": 1.4555088779175287,
+      "grad_norm": 0.022043581830940656,
+      "learning_rate": 0.00020926330932428944,
+      "loss": 0.4482,
+      "step": 28670
+    },
+    {
+      "epoch": 1.4557627140155602,
+      "grad_norm": 0.023858740881236372,
+      "learning_rate": 0.00020908311199695695,
+      "loss": 0.4835,
+      "step": 28675
+    },
+    {
+      "epoch": 1.4560165501135915,
+      "grad_norm": 0.027411770365613423,
+      "learning_rate": 0.0002089029717751586,
+      "loss": 0.4671,
+      "step": 28680
+    },
+    {
+      "epoch": 1.456270386211623,
+      "grad_norm": 0.023697876508040217,
+      "learning_rate": 0.00020872288869425536,
+      "loss": 0.4911,
+      "step": 28685
+    },
+    {
+      "epoch": 1.4565242223096546,
+      "grad_norm": 0.026627596602070615,
+      "learning_rate": 0.0002085428627895963,
+      "loss": 0.4721,
+      "step": 28690
+    },
+    {
+      "epoch": 1.4567780584076861,
+      "grad_norm": 0.021374243564867154,
+      "learning_rate": 0.00020836289409651993,
+      "loss": 0.4851,
+      "step": 28695
+    },
+    {
+      "epoch": 1.4570318945057177,
+      "grad_norm": 0.026025357905320266,
+      "learning_rate": 0.0002081829826503529,
+      "loss": 0.4711,
+      "step": 28700
+    },
+    {
+      "epoch": 1.4572857306037492,
+      "grad_norm": 0.022910628027651588,
+      "learning_rate": 0.0002080031284864113,
+      "loss": 0.467,
+      "step": 28705
+    },
+    {
+      "epoch": 1.4575395667017808,
+      "grad_norm": 0.020149333711585025,
+      "learning_rate": 0.00020782333163999917,
+      "loss": 0.4638,
+      "step": 28710
+    },
+    {
+      "epoch": 1.4577934027998123,
+      "grad_norm": 0.022348782139786152,
+      "learning_rate": 0.00020764359214640998,
+      "loss": 0.4672,
+      "step": 28715
+    },
+    {
+      "epoch": 1.4580472388978436,
+      "grad_norm": 0.0314334704344233,
+      "learning_rate": 0.0002074639100409258,
+      "loss": 0.4774,
+      "step": 28720
+    },
+    {
+      "epoch": 1.4583010749958751,
+      "grad_norm": 0.022750058450229413,
+      "learning_rate": 0.0002072842853588171,
+      "loss": 0.4761,
+      "step": 28725
+    },
+    {
+      "epoch": 1.4585549110939067,
+      "grad_norm": 0.026687684346631817,
+      "learning_rate": 0.00020710471813534354,
+      "loss": 0.4796,
+      "step": 28730
+    },
+    {
+      "epoch": 1.4588087471919382,
+      "grad_norm": 0.025046655726691035,
+      "learning_rate": 0.00020692520840575297,
+      "loss": 0.48,
+      "step": 28735
+    },
+    {
+      "epoch": 1.4590625832899697,
+      "grad_norm": 0.024616034157493274,
+      "learning_rate": 0.00020674575620528262,
+      "loss": 0.4789,
+      "step": 28740
+    },
+    {
+      "epoch": 1.459316419388001,
+      "grad_norm": 0.022143594499575536,
+      "learning_rate": 0.0002065663615691577,
+      "loss": 0.5053,
+      "step": 28745
+    },
+    {
+      "epoch": 1.4595702554860326,
+      "grad_norm": 0.021547465775443694,
+      "learning_rate": 0.00020638702453259285,
+      "loss": 0.4676,
+      "step": 28750
+    },
+    {
+      "epoch": 1.4598240915840641,
+      "grad_norm": 0.02362010469124488,
+      "learning_rate": 0.0002062077451307906,
+      "loss": 0.4553,
+      "step": 28755
+    },
+    {
+      "epoch": 1.4600779276820957,
+      "grad_norm": 0.021056652145822894,
+      "learning_rate": 0.00020602852339894306,
+      "loss": 0.4686,
+      "step": 28760
+    },
+    {
+      "epoch": 1.4603317637801272,
+      "grad_norm": 0.02196518330009373,
+      "learning_rate": 0.00020584935937223016,
+      "loss": 0.4884,
+      "step": 28765
+    },
+    {
+      "epoch": 1.4605855998781587,
+      "grad_norm": 0.021922777074083848,
+      "learning_rate": 0.0002056702530858211,
+      "loss": 0.4752,
+      "step": 28770
+    },
+    {
+      "epoch": 1.4608394359761903,
+      "grad_norm": 0.02529934917068532,
+      "learning_rate": 0.00020549120457487354,
+      "loss": 0.4553,
+      "step": 28775
+    },
+    {
+      "epoch": 1.4610932720742218,
+      "grad_norm": 0.02498486840017278,
+      "learning_rate": 0.00020531221387453392,
+      "loss": 0.4434,
+      "step": 28780
+    },
+    {
+      "epoch": 1.461347108172253,
+      "grad_norm": 0.021722508591940946,
+      "learning_rate": 0.000205133281019937,
+      "loss": 0.4787,
+      "step": 28785
+    },
+    {
+      "epoch": 1.4616009442702846,
+      "grad_norm": 0.023233582767727436,
+      "learning_rate": 0.0002049544060462067,
+      "loss": 0.4614,
+      "step": 28790
+    },
+    {
+      "epoch": 1.4618547803683162,
+      "grad_norm": 0.02278293695138392,
+      "learning_rate": 0.00020477558898845488,
+      "loss": 0.5037,
+      "step": 28795
+    },
+    {
+      "epoch": 1.4621086164663477,
+      "grad_norm": 0.023402783149172365,
+      "learning_rate": 0.00020459682988178285,
+      "loss": 0.4573,
+      "step": 28800
+    },
+    {
+      "epoch": 1.4623624525643792,
+      "grad_norm": 0.031169336608011022,
+      "learning_rate": 0.0002044181287612798,
+      "loss": 0.4791,
+      "step": 28805
+    },
+    {
+      "epoch": 1.4626162886624106,
+      "grad_norm": 0.028362314190744413,
+      "learning_rate": 0.00020423948566202415,
+      "loss": 0.495,
+      "step": 28810
+    },
+    {
+      "epoch": 1.462870124760442,
+      "grad_norm": 0.02179210935015138,
+      "learning_rate": 0.00020406090061908234,
+      "loss": 0.4638,
+      "step": 28815
+    },
+    {
+      "epoch": 1.4631239608584736,
+      "grad_norm": 0.024277811689262054,
+      "learning_rate": 0.00020388237366751006,
+      "loss": 0.4549,
+      "step": 28820
+    },
+    {
+      "epoch": 1.4633777969565052,
+      "grad_norm": 0.020361262646131383,
+      "learning_rate": 0.00020370390484235096,
+      "loss": 0.479,
+      "step": 28825
+    },
+    {
+      "epoch": 1.4636316330545367,
+      "grad_norm": 0.02705830030106512,
+      "learning_rate": 0.00020352549417863768,
+      "loss": 0.4854,
+      "step": 28830
+    },
+    {
+      "epoch": 1.4638854691525682,
+      "grad_norm": 0.03431989841271053,
+      "learning_rate": 0.00020334714171139158,
+      "loss": 0.4722,
+      "step": 28835
+    },
+    {
+      "epoch": 1.4641393052505998,
+      "grad_norm": 0.02690359771060965,
+      "learning_rate": 0.00020316884747562192,
+      "loss": 0.4755,
+      "step": 28840
+    },
+    {
+      "epoch": 1.4643931413486313,
+      "grad_norm": 0.02164943326089008,
+      "learning_rate": 0.0002029906115063274,
+      "loss": 0.468,
+      "step": 28845
+    },
+    {
+      "epoch": 1.4646469774466626,
+      "grad_norm": 0.024397789571063064,
+      "learning_rate": 0.0002028124338384945,
+      "loss": 0.4625,
+      "step": 28850
+    },
+    {
+      "epoch": 1.4649008135446941,
+      "grad_norm": 0.022519450507155134,
+      "learning_rate": 0.00020263431450709895,
+      "loss": 0.4975,
+      "step": 28855
+    },
+    {
+      "epoch": 1.4651546496427257,
+      "grad_norm": 0.0217210003632832,
+      "learning_rate": 0.00020245625354710435,
+      "loss": 0.4629,
+      "step": 28860
+    },
+    {
+      "epoch": 1.4654084857407572,
+      "grad_norm": 0.025787902107206176,
+      "learning_rate": 0.00020227825099346347,
+      "loss": 0.4741,
+      "step": 28865
+    },
+    {
+      "epoch": 1.4656623218387888,
+      "grad_norm": 0.022185672758679997,
+      "learning_rate": 0.00020210030688111701,
+      "loss": 0.4508,
+      "step": 28870
+    },
+    {
+      "epoch": 1.46591615793682,
+      "grad_norm": 0.023178483612238884,
+      "learning_rate": 0.00020192242124499488,
+      "loss": 0.4874,
+      "step": 28875
+    },
+    {
+      "epoch": 1.4661699940348516,
+      "grad_norm": 0.02582945990064946,
+      "learning_rate": 0.00020174459412001473,
+      "loss": 0.4582,
+      "step": 28880
+    },
+    {
+      "epoch": 1.4664238301328831,
+      "grad_norm": 0.023143750413606654,
+      "learning_rate": 0.00020156682554108357,
+      "loss": 0.4656,
+      "step": 28885
+    },
+    {
+      "epoch": 1.4666776662309147,
+      "grad_norm": 0.02269378720031511,
+      "learning_rate": 0.0002013891155430959,
+      "loss": 0.4668,
+      "step": 28890
+    },
+    {
+      "epoch": 1.4669315023289462,
+      "grad_norm": 0.03448609380058457,
+      "learning_rate": 0.00020121146416093605,
+      "loss": 0.4811,
+      "step": 28895
+    },
+    {
+      "epoch": 1.4671853384269777,
+      "grad_norm": 0.021699709617265108,
+      "learning_rate": 0.00020103387142947555,
+      "loss": 0.5225,
+      "step": 28900
+    },
+    {
+      "epoch": 1.4674391745250093,
+      "grad_norm": 0.026406847147590853,
+      "learning_rate": 0.00020085633738357533,
+      "loss": 0.4825,
+      "step": 28905
+    },
+    {
+      "epoch": 1.4676930106230408,
+      "grad_norm": 0.033192215599298,
+      "learning_rate": 0.00020067886205808405,
+      "loss": 0.4979,
+      "step": 28910
+    },
+    {
+      "epoch": 1.4679468467210723,
+      "grad_norm": 0.02394413280206084,
+      "learning_rate": 0.0002005014454878396,
+      "loss": 0.459,
+      "step": 28915
+    },
+    {
+      "epoch": 1.4682006828191037,
+      "grad_norm": 0.0245933526818191,
+      "learning_rate": 0.0002003240877076677,
+      "loss": 0.4576,
+      "step": 28920
+    },
+    {
+      "epoch": 1.4684545189171352,
+      "grad_norm": 0.02516217005778102,
+      "learning_rate": 0.00020014678875238302,
+      "loss": 0.5001,
+      "step": 28925
+    },
+    {
+      "epoch": 1.4687083550151667,
+      "grad_norm": 0.02361500803054123,
+      "learning_rate": 0.00019996954865678817,
+      "loss": 0.4896,
+      "step": 28930
+    },
+    {
+      "epoch": 1.4689621911131983,
+      "grad_norm": 0.024603894061880013,
+      "learning_rate": 0.00019979236745567487,
+      "loss": 0.5019,
+      "step": 28935
+    },
+    {
+      "epoch": 1.4692160272112298,
+      "grad_norm": 0.020585521169676338,
+      "learning_rate": 0.00019961524518382267,
+      "loss": 0.456,
+      "step": 28940
+    },
+    {
+      "epoch": 1.469469863309261,
+      "grad_norm": 0.02522878829527217,
+      "learning_rate": 0.00019943818187599966,
+      "loss": 0.4565,
+      "step": 28945
+    },
+    {
+      "epoch": 1.4697236994072926,
+      "grad_norm": 0.02118075848457831,
+      "learning_rate": 0.00019926117756696265,
+      "loss": 0.4868,
+      "step": 28950
+    },
+    {
+      "epoch": 1.4699775355053242,
+      "grad_norm": 0.022435152582389803,
+      "learning_rate": 0.00019908423229145672,
+      "loss": 0.4959,
+      "step": 28955
+    },
+    {
+      "epoch": 1.4702313716033557,
+      "grad_norm": 0.02197759398001768,
+      "learning_rate": 0.00019890734608421552,
+      "loss": 0.4627,
+      "step": 28960
+    },
+    {
+      "epoch": 1.4704852077013872,
+      "grad_norm": 0.04773874949155717,
+      "learning_rate": 0.00019873051897996053,
+      "loss": 0.4562,
+      "step": 28965
+    },
+    {
+      "epoch": 1.4707390437994188,
+      "grad_norm": 0.03353710032706133,
+      "learning_rate": 0.0001985537510134024,
+      "loss": 0.4665,
+      "step": 28970
+    },
+    {
+      "epoch": 1.4709928798974503,
+      "grad_norm": 0.026180926097628836,
+      "learning_rate": 0.00019837704221923946,
+      "loss": 0.4479,
+      "step": 28975
+    },
+    {
+      "epoch": 1.4712467159954818,
+      "grad_norm": 0.03126605626141384,
+      "learning_rate": 0.00019820039263215917,
+      "loss": 0.4602,
+      "step": 28980
+    },
+    {
+      "epoch": 1.4715005520935132,
+      "grad_norm": 0.023705023219665274,
+      "learning_rate": 0.00019802380228683646,
+      "loss": 0.4881,
+      "step": 28985
+    },
+    {
+      "epoch": 1.4717543881915447,
+      "grad_norm": 0.023534377244730398,
+      "learning_rate": 0.00019784727121793566,
+      "loss": 0.4828,
+      "step": 28990
+    },
+    {
+      "epoch": 1.4720082242895762,
+      "grad_norm": 0.021276660406363043,
+      "learning_rate": 0.00019767079946010852,
+      "loss": 0.4731,
+      "step": 28995
+    },
+    {
+      "epoch": 1.4722620603876078,
+      "grad_norm": 0.0275960081210604,
+      "learning_rate": 0.00019749438704799588,
+      "loss": 0.4718,
+      "step": 29000
+    },
+    {
+      "epoch": 1.4725158964856393,
+      "grad_norm": 0.021170380103597952,
+      "learning_rate": 0.0001973180340162263,
+      "loss": 0.4525,
+      "step": 29005
+    },
+    {
+      "epoch": 1.4727697325836706,
+      "grad_norm": 0.024756352788350977,
+      "learning_rate": 0.00019714174039941736,
+      "loss": 0.5201,
+      "step": 29010
+    },
+    {
+      "epoch": 1.4730235686817021,
+      "grad_norm": 0.02683451441512577,
+      "learning_rate": 0.00019696550623217403,
+      "loss": 0.4786,
+      "step": 29015
+    },
+    {
+      "epoch": 1.4732774047797337,
+      "grad_norm": 0.02575373794125539,
+      "learning_rate": 0.00019678933154909095,
+      "loss": 0.4784,
+      "step": 29020
+    },
+    {
+      "epoch": 1.4735312408777652,
+      "grad_norm": 0.020364829491525765,
+      "learning_rate": 0.00019661321638475004,
+      "loss": 0.4341,
+      "step": 29025
+    },
+    {
+      "epoch": 1.4737850769757967,
+      "grad_norm": 0.021298525215463972,
+      "learning_rate": 0.00019643716077372153,
+      "loss": 0.4904,
+      "step": 29030
+    },
+    {
+      "epoch": 1.4740389130738283,
+      "grad_norm": 0.021611695522008065,
+      "learning_rate": 0.0001962611647505647,
+      "loss": 0.4968,
+      "step": 29035
+    },
+    {
+      "epoch": 1.4742927491718598,
+      "grad_norm": 0.021550734290455672,
+      "learning_rate": 0.00019608522834982633,
+      "loss": 0.4822,
+      "step": 29040
+    },
+    {
+      "epoch": 1.4745465852698914,
+      "grad_norm": 0.021212859093263894,
+      "learning_rate": 0.00019590935160604218,
+      "loss": 0.4758,
+      "step": 29045
+    },
+    {
+      "epoch": 1.4748004213679227,
+      "grad_norm": 0.02522241453937803,
+      "learning_rate": 0.0001957335345537356,
+      "loss": 0.4638,
+      "step": 29050
+    },
+    {
+      "epoch": 1.4750542574659542,
+      "grad_norm": 0.02216798656322059,
+      "learning_rate": 0.00019555777722741902,
+      "loss": 0.4727,
+      "step": 29055
+    },
+    {
+      "epoch": 1.4753080935639857,
+      "grad_norm": 0.024334896417849243,
+      "learning_rate": 0.00019538207966159234,
+      "loss": 0.4876,
+      "step": 29060
+    },
+    {
+      "epoch": 1.4755619296620173,
+      "grad_norm": 0.020644818633358916,
+      "learning_rate": 0.00019520644189074444,
+      "loss": 0.4633,
+      "step": 29065
+    },
+    {
+      "epoch": 1.4758157657600488,
+      "grad_norm": 0.02651778575033576,
+      "learning_rate": 0.00019503086394935182,
+      "loss": 0.4603,
+      "step": 29070
+    },
+    {
+      "epoch": 1.4760696018580801,
+      "grad_norm": 0.03491940275430201,
+      "learning_rate": 0.00019485534587187977,
+      "loss": 0.4425,
+      "step": 29075
+    },
+    {
+      "epoch": 1.4763234379561117,
+      "grad_norm": 0.023186579903332387,
+      "learning_rate": 0.00019467988769278154,
+      "loss": 0.4549,
+      "step": 29080
+    },
+    {
+      "epoch": 1.4765772740541432,
+      "grad_norm": 0.025307596693011927,
+      "learning_rate": 0.00019450448944649895,
+      "loss": 0.4795,
+      "step": 29085
+    },
+    {
+      "epoch": 1.4768311101521747,
+      "grad_norm": 0.021614617095183405,
+      "learning_rate": 0.00019432915116746136,
+      "loss": 0.4892,
+      "step": 29090
+    },
+    {
+      "epoch": 1.4770849462502063,
+      "grad_norm": 0.02464047963920047,
+      "learning_rate": 0.0001941538728900872,
+      "loss": 0.4628,
+      "step": 29095
+    },
+    {
+      "epoch": 1.4773387823482378,
+      "grad_norm": 0.029044426055459413,
+      "learning_rate": 0.00019397865464878235,
+      "loss": 0.4777,
+      "step": 29100
+    },
+    {
+      "epoch": 1.4775926184462693,
+      "grad_norm": 0.03081511255172388,
+      "learning_rate": 0.00019380349647794165,
+      "loss": 0.4915,
+      "step": 29105
+    },
+    {
+      "epoch": 1.4778464545443009,
+      "grad_norm": 0.02441428752542893,
+      "learning_rate": 0.00019362839841194747,
+      "loss": 0.4716,
+      "step": 29110
+    },
+    {
+      "epoch": 1.4781002906423322,
+      "grad_norm": 0.022749416876083776,
+      "learning_rate": 0.00019345336048517094,
+      "loss": 0.4796,
+      "step": 29115
+    },
+    {
+      "epoch": 1.4783541267403637,
+      "grad_norm": 0.023954531450951794,
+      "learning_rate": 0.00019327838273197078,
+      "loss": 0.4952,
+      "step": 29120
+    },
+    {
+      "epoch": 1.4786079628383952,
+      "grad_norm": 0.02345877737054963,
+      "learning_rate": 0.0001931034651866947,
+      "loss": 0.477,
+      "step": 29125
+    },
+    {
+      "epoch": 1.4788617989364268,
+      "grad_norm": 0.02651510050871646,
+      "learning_rate": 0.00019292860788367773,
+      "loss": 0.4987,
+      "step": 29130
+    },
+    {
+      "epoch": 1.4791156350344583,
+      "grad_norm": 0.02263999113571259,
+      "learning_rate": 0.00019275381085724364,
+      "loss": 0.455,
+      "step": 29135
+    },
+    {
+      "epoch": 1.4793694711324896,
+      "grad_norm": 0.020869618792912294,
+      "learning_rate": 0.00019257907414170445,
+      "loss": 0.4501,
+      "step": 29140
+    },
+    {
+      "epoch": 1.4796233072305212,
+      "grad_norm": 0.02387312884738094,
+      "learning_rate": 0.00019240439777135976,
+      "loss": 0.4511,
+      "step": 29145
+    },
+    {
+      "epoch": 1.4798771433285527,
+      "grad_norm": 0.05515921100199077,
+      "learning_rate": 0.00019222978178049793,
+      "loss": 0.4692,
+      "step": 29150
+    },
+    {
+      "epoch": 1.4801309794265842,
+      "grad_norm": 0.022481925501552334,
+      "learning_rate": 0.00019205522620339494,
+      "loss": 0.4822,
+      "step": 29155
+    },
+    {
+      "epoch": 1.4803848155246158,
+      "grad_norm": 0.029596722532962007,
+      "learning_rate": 0.00019188073107431546,
+      "loss": 0.4648,
+      "step": 29160
+    },
+    {
+      "epoch": 1.4806386516226473,
+      "grad_norm": 0.022443147646602133,
+      "learning_rate": 0.00019170629642751175,
+      "loss": 0.4588,
+      "step": 29165
+    },
+    {
+      "epoch": 1.4808924877206788,
+      "grad_norm": 0.019695512999081097,
+      "learning_rate": 0.00019153192229722478,
+      "loss": 0.47,
+      "step": 29170
+    },
+    {
+      "epoch": 1.4811463238187104,
+      "grad_norm": 0.023069499823679177,
+      "learning_rate": 0.00019135760871768294,
+      "loss": 0.4826,
+      "step": 29175
+    },
+    {
+      "epoch": 1.4814001599167417,
+      "grad_norm": 0.020513414900472458,
+      "learning_rate": 0.00019118335572310347,
+      "loss": 0.4897,
+      "step": 29180
+    },
+    {
+      "epoch": 1.4816539960147732,
+      "grad_norm": 0.02458185146292343,
+      "learning_rate": 0.00019100916334769107,
+      "loss": 0.4503,
+      "step": 29185
+    },
+    {
+      "epoch": 1.4819078321128047,
+      "grad_norm": 0.020161767636643235,
+      "learning_rate": 0.00019083503162563908,
+      "loss": 0.4888,
+      "step": 29190
+    },
+    {
+      "epoch": 1.4821616682108363,
+      "grad_norm": 0.021529664850411454,
+      "learning_rate": 0.0001906609605911283,
+      "loss": 0.4678,
+      "step": 29195
+    },
+    {
+      "epoch": 1.4824155043088678,
+      "grad_norm": 0.022419770519300376,
+      "learning_rate": 0.00019048695027832862,
+      "loss": 0.4569,
+      "step": 29200
+    },
+    {
+      "epoch": 1.4826693404068991,
+      "grad_norm": 0.02111471529503639,
+      "learning_rate": 0.00019031300072139685,
+      "loss": 0.4865,
+      "step": 29205
+    },
+    {
+      "epoch": 1.4829231765049307,
+      "grad_norm": 0.022270474614644514,
+      "learning_rate": 0.00019013911195447887,
+      "loss": 0.4909,
+      "step": 29210
+    },
+    {
+      "epoch": 1.4831770126029622,
+      "grad_norm": 0.0246869876351486,
+      "learning_rate": 0.0001899652840117077,
+      "loss": 0.4963,
+      "step": 29215
+    },
+    {
+      "epoch": 1.4834308487009937,
+      "grad_norm": 0.02576348095397173,
+      "learning_rate": 0.0001897915169272053,
+      "loss": 0.4913,
+      "step": 29220
+    },
+    {
+      "epoch": 1.4836846847990253,
+      "grad_norm": 0.024597752477760727,
+      "learning_rate": 0.000189617810735081,
+      "loss": 0.5088,
+      "step": 29225
+    },
+    {
+      "epoch": 1.4839385208970568,
+      "grad_norm": 0.02344812114499017,
+      "learning_rate": 0.0001894441654694327,
+      "loss": 0.4355,
+      "step": 29230
+    },
+    {
+      "epoch": 1.4841923569950883,
+      "grad_norm": 0.021559453697928024,
+      "learning_rate": 0.00018927058116434588,
+      "loss": 0.4764,
+      "step": 29235
+    },
+    {
+      "epoch": 1.4844461930931199,
+      "grad_norm": 0.02036859797483484,
+      "learning_rate": 0.00018909705785389452,
+      "loss": 0.4684,
+      "step": 29240
+    },
+    {
+      "epoch": 1.4847000291911514,
+      "grad_norm": 0.02205258982868522,
+      "learning_rate": 0.00018892359557214,
+      "loss": 0.4725,
+      "step": 29245
+    },
+    {
+      "epoch": 1.4849538652891827,
+      "grad_norm": 0.020563901133827663,
+      "learning_rate": 0.00018875019435313255,
+      "loss": 0.4731,
+      "step": 29250
+    },
+    {
+      "epoch": 1.4852077013872143,
+      "grad_norm": 0.028685312406026363,
+      "learning_rate": 0.0001885768542309096,
+      "loss": 0.4728,
+      "step": 29255
+    },
+    {
+      "epoch": 1.4854615374852458,
+      "grad_norm": 0.02454713847924293,
+      "learning_rate": 0.0001884035752394971,
+      "loss": 0.4763,
+      "step": 29260
+    },
+    {
+      "epoch": 1.4857153735832773,
+      "grad_norm": 0.02365382394464072,
+      "learning_rate": 0.000188230357412909,
+      "loss": 0.4331,
+      "step": 29265
+    },
+    {
+      "epoch": 1.4859692096813089,
+      "grad_norm": 0.021080550215192822,
+      "learning_rate": 0.00018805720078514677,
+      "loss": 0.4587,
+      "step": 29270
+    },
+    {
+      "epoch": 1.4862230457793402,
+      "grad_norm": 0.022858405035791375,
+      "learning_rate": 0.0001878841053902005,
+      "loss": 0.4602,
+      "step": 29275
+    },
+    {
+      "epoch": 1.4864768818773717,
+      "grad_norm": 0.026334553802682143,
+      "learning_rate": 0.00018771107126204771,
+      "loss": 0.4749,
+      "step": 29280
+    },
+    {
+      "epoch": 1.4867307179754032,
+      "grad_norm": 0.024924349203167863,
+      "learning_rate": 0.00018753809843465442,
+      "loss": 0.4775,
+      "step": 29285
+    },
+    {
+      "epoch": 1.4869845540734348,
+      "grad_norm": 0.031850516565785435,
+      "learning_rate": 0.00018736518694197396,
+      "loss": 0.4589,
+      "step": 29290
+    },
+    {
+      "epoch": 1.4872383901714663,
+      "grad_norm": 0.028274802301810807,
+      "learning_rate": 0.0001871923368179484,
+      "loss": 0.4538,
+      "step": 29295
+    },
+    {
+      "epoch": 1.4874922262694978,
+      "grad_norm": 0.021991555650211984,
+      "learning_rate": 0.000187019548096507,
+      "loss": 0.4906,
+      "step": 29300
+    },
+    {
+      "epoch": 1.4877460623675294,
+      "grad_norm": 0.027104083276996482,
+      "learning_rate": 0.00018684682081156762,
+      "loss": 0.4922,
+      "step": 29305
+    },
+    {
+      "epoch": 1.487999898465561,
+      "grad_norm": 0.02050509919684562,
+      "learning_rate": 0.00018667415499703545,
+      "loss": 0.4614,
+      "step": 29310
+    },
+    {
+      "epoch": 1.4882537345635922,
+      "grad_norm": 0.021216527675499326,
+      "learning_rate": 0.00018650155068680407,
+      "loss": 0.4525,
+      "step": 29315
+    },
+    {
+      "epoch": 1.4885075706616238,
+      "grad_norm": 0.022730076917583136,
+      "learning_rate": 0.00018632900791475492,
+      "loss": 0.4685,
+      "step": 29320
+    },
+    {
+      "epoch": 1.4887614067596553,
+      "grad_norm": 0.023575223517873986,
+      "learning_rate": 0.0001861565267147574,
+      "loss": 0.4376,
+      "step": 29325
+    },
+    {
+      "epoch": 1.4890152428576868,
+      "grad_norm": 0.02277744320446659,
+      "learning_rate": 0.0001859841071206684,
+      "loss": 0.4398,
+      "step": 29330
+    },
+    {
+      "epoch": 1.4892690789557184,
+      "grad_norm": 0.021608319308460473,
+      "learning_rate": 0.0001858117491663333,
+      "loss": 0.489,
+      "step": 29335
+    },
+    {
+      "epoch": 1.4895229150537497,
+      "grad_norm": 0.020073745176878363,
+      "learning_rate": 0.0001856394528855848,
+      "loss": 0.4597,
+      "step": 29340
+    },
+    {
+      "epoch": 1.4897767511517812,
+      "grad_norm": 0.022010323279037017,
+      "learning_rate": 0.00018546721831224424,
+      "loss": 0.4762,
+      "step": 29345
+    },
+    {
+      "epoch": 1.4900305872498127,
+      "grad_norm": 0.02437069862496661,
+      "learning_rate": 0.00018529504548011995,
+      "loss": 0.4831,
+      "step": 29350
+    },
+    {
+      "epoch": 1.4902844233478443,
+      "grad_norm": 0.021983892566894717,
+      "learning_rate": 0.00018512293442300893,
+      "loss": 0.4737,
+      "step": 29355
+    },
+    {
+      "epoch": 1.4905382594458758,
+      "grad_norm": 0.03613783598950718,
+      "learning_rate": 0.00018495088517469545,
+      "loss": 0.4716,
+      "step": 29360
+    },
+    {
+      "epoch": 1.4907920955439073,
+      "grad_norm": 0.02504114958866562,
+      "learning_rate": 0.00018477889776895225,
+      "loss": 0.453,
+      "step": 29365
+    },
+    {
+      "epoch": 1.4910459316419389,
+      "grad_norm": 0.023647519356946433,
+      "learning_rate": 0.0001846069722395392,
+      "loss": 0.4599,
+      "step": 29370
+    },
+    {
+      "epoch": 1.4912997677399704,
+      "grad_norm": 0.02647181372921064,
+      "learning_rate": 0.00018443510862020467,
+      "loss": 0.4836,
+      "step": 29375
+    },
+    {
+      "epoch": 1.4915536038380017,
+      "grad_norm": 0.0222009899568302,
+      "learning_rate": 0.0001842633069446848,
+      "loss": 0.4592,
+      "step": 29380
+    },
+    {
+      "epoch": 1.4918074399360333,
+      "grad_norm": 0.02282816694246597,
+      "learning_rate": 0.00018409156724670295,
+      "loss": 0.5007,
+      "step": 29385
+    },
+    {
+      "epoch": 1.4920612760340648,
+      "grad_norm": 0.02159121780004676,
+      "learning_rate": 0.00018391988955997126,
+      "loss": 0.4567,
+      "step": 29390
+    },
+    {
+      "epoch": 1.4923151121320963,
+      "grad_norm": 0.022260535759933305,
+      "learning_rate": 0.00018374827391818877,
+      "loss": 0.4663,
+      "step": 29395
+    },
+    {
+      "epoch": 1.4925689482301279,
+      "grad_norm": 0.022850427122847246,
+      "learning_rate": 0.00018357672035504313,
+      "loss": 0.4874,
+      "step": 29400
+    },
+    {
+      "epoch": 1.4928227843281592,
+      "grad_norm": 0.019789375321462733,
+      "learning_rate": 0.00018340522890420907,
+      "loss": 0.4172,
+      "step": 29405
+    },
+    {
+      "epoch": 1.4930766204261907,
+      "grad_norm": 0.025613999903597914,
+      "learning_rate": 0.00018323379959934993,
+      "loss": 0.4852,
+      "step": 29410
+    },
+    {
+      "epoch": 1.4933304565242222,
+      "grad_norm": 0.027238755553218558,
+      "learning_rate": 0.0001830624324741161,
+      "loss": 0.4733,
+      "step": 29415
+    },
+    {
+      "epoch": 1.4935842926222538,
+      "grad_norm": 0.025748721906667212,
+      "learning_rate": 0.00018289112756214633,
+      "loss": 0.4633,
+      "step": 29420
+    },
+    {
+      "epoch": 1.4938381287202853,
+      "grad_norm": 0.022524685723581257,
+      "learning_rate": 0.0001827198848970666,
+      "loss": 0.4573,
+      "step": 29425
+    },
+    {
+      "epoch": 1.4940919648183169,
+      "grad_norm": 0.02722240287350484,
+      "learning_rate": 0.00018254870451249138,
+      "loss": 0.4754,
+      "step": 29430
+    },
+    {
+      "epoch": 1.4943458009163484,
+      "grad_norm": 0.024092764906284942,
+      "learning_rate": 0.000182377586442022,
+      "loss": 0.4882,
+      "step": 29435
+    },
+    {
+      "epoch": 1.49459963701438,
+      "grad_norm": 0.023881198609627088,
+      "learning_rate": 0.00018220653071924876,
+      "loss": 0.4624,
+      "step": 29440
+    },
+    {
+      "epoch": 1.4948534731124112,
+      "grad_norm": 0.027775639716056417,
+      "learning_rate": 0.0001820355373777486,
+      "loss": 0.438,
+      "step": 29445
+    },
+    {
+      "epoch": 1.4951073092104428,
+      "grad_norm": 0.028210032309733676,
+      "learning_rate": 0.0001818646064510868,
+      "loss": 0.485,
+      "step": 29450
+    },
+    {
+      "epoch": 1.4953611453084743,
+      "grad_norm": 0.024018687894484327,
+      "learning_rate": 0.00018169373797281618,
+      "loss": 0.4624,
+      "step": 29455
+    },
+    {
+      "epoch": 1.4956149814065058,
+      "grad_norm": 0.02242820966339362,
+      "learning_rate": 0.0001815229319764775,
+      "loss": 0.4483,
+      "step": 29460
+    },
+    {
+      "epoch": 1.4958688175045374,
+      "grad_norm": 0.024756677687856363,
+      "learning_rate": 0.00018135218849559887,
+      "loss": 0.4973,
+      "step": 29465
+    },
+    {
+      "epoch": 1.4961226536025687,
+      "grad_norm": 0.026600698958258437,
+      "learning_rate": 0.00018118150756369673,
+      "loss": 0.4575,
+      "step": 29470
+    },
+    {
+      "epoch": 1.4963764897006002,
+      "grad_norm": 0.021303694946705722,
+      "learning_rate": 0.00018101088921427456,
+      "loss": 0.4561,
+      "step": 29475
+    },
+    {
+      "epoch": 1.4966303257986318,
+      "grad_norm": 0.02617735975797147,
+      "learning_rate": 0.00018084033348082418,
+      "loss": 0.4593,
+      "step": 29480
+    },
+    {
+      "epoch": 1.4968841618966633,
+      "grad_norm": 0.021397946565801608,
+      "learning_rate": 0.00018066984039682456,
+      "loss": 0.4405,
+      "step": 29485
+    },
+    {
+      "epoch": 1.4971379979946948,
+      "grad_norm": 0.024482557930807926,
+      "learning_rate": 0.00018049940999574288,
+      "loss": 0.4788,
+      "step": 29490
+    },
+    {
+      "epoch": 1.4973918340927264,
+      "grad_norm": 0.025781729735713393,
+      "learning_rate": 0.00018032904231103354,
+      "loss": 0.4797,
+      "step": 29495
+    },
+    {
+      "epoch": 1.497645670190758,
+      "grad_norm": 0.01949959677459056,
+      "learning_rate": 0.00018015873737613897,
+      "loss": 0.4558,
+      "step": 29500
+    },
+    {
+      "epoch": 1.4978995062887894,
+      "grad_norm": 0.022386748122245725,
+      "learning_rate": 0.0001799884952244894,
+      "loss": 0.4935,
+      "step": 29505
+    },
+    {
+      "epoch": 1.498153342386821,
+      "grad_norm": 0.021221757126598573,
+      "learning_rate": 0.00017981831588950216,
+      "loss": 0.4462,
+      "step": 29510
+    },
+    {
+      "epoch": 1.4984071784848523,
+      "grad_norm": 0.022333608494739168,
+      "learning_rate": 0.00017964819940458293,
+      "loss": 0.4621,
+      "step": 29515
+    },
+    {
+      "epoch": 1.4986610145828838,
+      "grad_norm": 0.024666056084257013,
+      "learning_rate": 0.00017947814580312438,
+      "loss": 0.4549,
+      "step": 29520
+    },
+    {
+      "epoch": 1.4989148506809153,
+      "grad_norm": 0.02090549438169576,
+      "learning_rate": 0.00017930815511850757,
+      "loss": 0.4689,
+      "step": 29525
+    },
+    {
+      "epoch": 1.4991686867789469,
+      "grad_norm": 0.03565612070546386,
+      "learning_rate": 0.00017913822738410042,
+      "loss": 0.4928,
+      "step": 29530
+    },
+    {
+      "epoch": 1.4994225228769784,
+      "grad_norm": 0.023113435548243647,
+      "learning_rate": 0.00017896836263325928,
+      "loss": 0.4889,
+      "step": 29535
+    },
+    {
+      "epoch": 1.4996763589750097,
+      "grad_norm": 0.023728923010015324,
+      "learning_rate": 0.0001787985608993274,
+      "loss": 0.4956,
+      "step": 29540
+    },
+    {
+      "epoch": 1.4999301950730413,
+      "grad_norm": 0.025393929885262224,
+      "learning_rate": 0.00017862882221563635,
+      "loss": 0.4454,
+      "step": 29545
+    },
+    {
+      "epoch": 1.5001840311710728,
+      "grad_norm": 0.028049319465082344,
+      "learning_rate": 0.00017845914661550466,
+      "loss": 0.4871,
+      "step": 29550
+    },
+    {
+      "epoch": 1.5004378672691043,
+      "grad_norm": 0.031110725136696264,
+      "learning_rate": 0.00017828953413223897,
+      "loss": 0.4707,
+      "step": 29555
+    },
+    {
+      "epoch": 1.5006917033671359,
+      "grad_norm": 0.023067267396166907,
+      "learning_rate": 0.00017811998479913337,
+      "loss": 0.4852,
+      "step": 29560
+    },
+    {
+      "epoch": 1.5009455394651674,
+      "grad_norm": 0.022599329066948542,
+      "learning_rate": 0.0001779504986494697,
+      "loss": 0.4629,
+      "step": 29565
+    },
+    {
+      "epoch": 1.501199375563199,
+      "grad_norm": 0.024939817004019073,
+      "learning_rate": 0.00017778107571651692,
+      "loss": 0.486,
+      "step": 29570
+    },
+    {
+      "epoch": 1.5014532116612305,
+      "grad_norm": 0.0293017112366802,
+      "learning_rate": 0.00017761171603353226,
+      "loss": 0.4563,
+      "step": 29575
+    },
+    {
+      "epoch": 1.501707047759262,
+      "grad_norm": 0.029238115277750512,
+      "learning_rate": 0.00017744241963375986,
+      "loss": 0.4879,
+      "step": 29580
+    },
+    {
+      "epoch": 1.5019608838572933,
+      "grad_norm": 0.02734864802953103,
+      "learning_rate": 0.00017727318655043196,
+      "loss": 0.4643,
+      "step": 29585
+    },
+    {
+      "epoch": 1.5022147199553249,
+      "grad_norm": 0.023307569346119706,
+      "learning_rate": 0.00017710401681676803,
+      "loss": 0.4675,
+      "step": 29590
+    },
+    {
+      "epoch": 1.5024685560533564,
+      "grad_norm": 0.03538931937461218,
+      "learning_rate": 0.00017693491046597544,
+      "loss": 0.4816,
+      "step": 29595
+    },
+    {
+      "epoch": 1.5027223921513877,
+      "grad_norm": 0.02593067797767141,
+      "learning_rate": 0.0001767658675312486,
+      "loss": 0.4854,
+      "step": 29600
+    },
+    {
+      "epoch": 1.5029762282494192,
+      "grad_norm": 0.038262040752877625,
+      "learning_rate": 0.00017659688804577022,
+      "loss": 0.4869,
+      "step": 29605
+    },
+    {
+      "epoch": 1.5032300643474508,
+      "grad_norm": 0.03034656256994557,
+      "learning_rate": 0.00017642797204270972,
+      "loss": 0.4745,
+      "step": 29610
+    },
+    {
+      "epoch": 1.5034839004454823,
+      "grad_norm": 0.02925654062689158,
+      "learning_rate": 0.00017625911955522467,
+      "loss": 0.4796,
+      "step": 29615
+    },
+    {
+      "epoch": 1.5037377365435138,
+      "grad_norm": 0.35865233957359255,
+      "learning_rate": 0.00017609033061646013,
+      "loss": 0.4897,
+      "step": 29620
+    },
+    {
+      "epoch": 1.5039915726415454,
+      "grad_norm": 0.024975856991066987,
+      "learning_rate": 0.0001759216052595482,
+      "loss": 0.4771,
+      "step": 29625
+    },
+    {
+      "epoch": 1.504245408739577,
+      "grad_norm": 0.027727973964852774,
+      "learning_rate": 0.00017575294351760912,
+      "loss": 0.4449,
+      "step": 29630
+    },
+    {
+      "epoch": 1.5044992448376084,
+      "grad_norm": 0.03183878239195353,
+      "learning_rate": 0.00017558434542375002,
+      "loss": 0.4803,
+      "step": 29635
+    },
+    {
+      "epoch": 1.50475308093564,
+      "grad_norm": 0.021762613260108758,
+      "learning_rate": 0.0001754158110110663,
+      "loss": 0.4505,
+      "step": 29640
+    },
+    {
+      "epoch": 1.5050069170336715,
+      "grad_norm": 0.02265057874561824,
+      "learning_rate": 0.00017524734031263995,
+      "loss": 0.4783,
+      "step": 29645
+    },
+    {
+      "epoch": 1.5052607531317028,
+      "grad_norm": 0.024018928533084835,
+      "learning_rate": 0.00017507893336154136,
+      "loss": 0.4687,
+      "step": 29650
+    },
+    {
+      "epoch": 1.5055145892297344,
+      "grad_norm": 0.025100700637922822,
+      "learning_rate": 0.00017491059019082757,
+      "loss": 0.4687,
+      "step": 29655
+    },
+    {
+      "epoch": 1.505768425327766,
+      "grad_norm": 0.025226901808710973,
+      "learning_rate": 0.00017474231083354386,
+      "loss": 0.4664,
+      "step": 29660
+    },
+    {
+      "epoch": 1.5060222614257972,
+      "grad_norm": 0.019733370421100523,
+      "learning_rate": 0.00017457409532272233,
+      "loss": 0.4593,
+      "step": 29665
+    },
+    {
+      "epoch": 1.5062760975238287,
+      "grad_norm": 0.036127873751405706,
+      "learning_rate": 0.00017440594369138318,
+      "loss": 0.4612,
+      "step": 29670
+    },
+    {
+      "epoch": 1.5065299336218603,
+      "grad_norm": 0.027988146551036864,
+      "learning_rate": 0.00017423785597253322,
+      "loss": 0.4712,
+      "step": 29675
+    },
+    {
+      "epoch": 1.5067837697198918,
+      "grad_norm": 0.022722614037556717,
+      "learning_rate": 0.00017406983219916784,
+      "loss": 0.4818,
+      "step": 29680
+    },
+    {
+      "epoch": 1.5070376058179233,
+      "grad_norm": 0.021733007504925076,
+      "learning_rate": 0.00017390187240426885,
+      "loss": 0.4636,
+      "step": 29685
+    },
+    {
+      "epoch": 1.5072914419159549,
+      "grad_norm": 0.022966860647516654,
+      "learning_rate": 0.00017373397662080625,
+      "loss": 0.4507,
+      "step": 29690
+    },
+    {
+      "epoch": 1.5075452780139864,
+      "grad_norm": 0.02313533592545971,
+      "learning_rate": 0.0001735661448817368,
+      "loss": 0.4839,
+      "step": 29695
+    },
+    {
+      "epoch": 1.507799114112018,
+      "grad_norm": 0.02643057262905096,
+      "learning_rate": 0.0001733983772200053,
+      "loss": 0.4706,
+      "step": 29700
+    },
+    {
+      "epoch": 1.5080529502100495,
+      "grad_norm": 0.021347818597730134,
+      "learning_rate": 0.00017323067366854344,
+      "loss": 0.481,
+      "step": 29705
+    },
+    {
+      "epoch": 1.508306786308081,
+      "grad_norm": 0.02075350561553384,
+      "learning_rate": 0.00017306303426027094,
+      "loss": 0.5027,
+      "step": 29710
+    },
+    {
+      "epoch": 1.5085606224061123,
+      "grad_norm": 0.044968810989294526,
+      "learning_rate": 0.00017289545902809416,
+      "loss": 0.4437,
+      "step": 29715
+    },
+    {
+      "epoch": 1.5088144585041439,
+      "grad_norm": 0.02712313510234019,
+      "learning_rate": 0.00017272794800490772,
+      "loss": 0.4846,
+      "step": 29720
+    },
+    {
+      "epoch": 1.5090682946021754,
+      "grad_norm": 0.030525162317820355,
+      "learning_rate": 0.00017256050122359278,
+      "loss": 0.4566,
+      "step": 29725
+    },
+    {
+      "epoch": 1.5093221307002067,
+      "grad_norm": 0.023955850748629902,
+      "learning_rate": 0.00017239311871701868,
+      "loss": 0.453,
+      "step": 29730
+    },
+    {
+      "epoch": 1.5095759667982382,
+      "grad_norm": 0.02040802810169976,
+      "learning_rate": 0.00017222580051804147,
+      "loss": 0.4381,
+      "step": 29735
+    },
+    {
+      "epoch": 1.5098298028962698,
+      "grad_norm": 0.022363301802030486,
+      "learning_rate": 0.000172058546659505,
+      "loss": 0.4707,
+      "step": 29740
+    },
+    {
+      "epoch": 1.5100836389943013,
+      "grad_norm": 0.020755527638949292,
+      "learning_rate": 0.00017189135717424054,
+      "loss": 0.4495,
+      "step": 29745
+    },
+    {
+      "epoch": 1.5103374750923328,
+      "grad_norm": 0.022821381482327964,
+      "learning_rate": 0.0001717242320950662,
+      "loss": 0.4443,
+      "step": 29750
+    },
+    {
+      "epoch": 1.5105913111903644,
+      "grad_norm": 0.02519971954586667,
+      "learning_rate": 0.00017155717145478822,
+      "loss": 0.4724,
+      "step": 29755
+    },
+    {
+      "epoch": 1.510845147288396,
+      "grad_norm": 0.031004863317514574,
+      "learning_rate": 0.00017139017528619932,
+      "loss": 0.4501,
+      "step": 29760
+    },
+    {
+      "epoch": 1.5110989833864275,
+      "grad_norm": 0.022610866791681877,
+      "learning_rate": 0.0001712232436220804,
+      "loss": 0.4789,
+      "step": 29765
+    },
+    {
+      "epoch": 1.511352819484459,
+      "grad_norm": 0.027520059296275306,
+      "learning_rate": 0.000171056376495199,
+      "loss": 0.4893,
+      "step": 29770
+    },
+    {
+      "epoch": 1.5116066555824905,
+      "grad_norm": 0.02082044707695213,
+      "learning_rate": 0.00017088957393831066,
+      "loss": 0.4471,
+      "step": 29775
+    },
+    {
+      "epoch": 1.5118604916805218,
+      "grad_norm": 0.02144250442393683,
+      "learning_rate": 0.0001707228359841575,
+      "loss": 0.4981,
+      "step": 29780
+    },
+    {
+      "epoch": 1.5121143277785534,
+      "grad_norm": 0.021818336086735435,
+      "learning_rate": 0.0001705561626654697,
+      "loss": 0.4511,
+      "step": 29785
+    },
+    {
+      "epoch": 1.512368163876585,
+      "grad_norm": 0.023425182235900163,
+      "learning_rate": 0.00017038955401496404,
+      "loss": 0.4725,
+      "step": 29790
+    },
+    {
+      "epoch": 1.5126219999746164,
+      "grad_norm": 0.0227877965017971,
+      "learning_rate": 0.00017022301006534512,
+      "loss": 0.4842,
+      "step": 29795
+    },
+    {
+      "epoch": 1.5128758360726478,
+      "grad_norm": 0.02442465002826424,
+      "learning_rate": 0.00017005653084930483,
+      "loss": 0.485,
+      "step": 29800
+    },
+    {
+      "epoch": 1.5131296721706793,
+      "grad_norm": 0.020723345058742938,
+      "learning_rate": 0.00016989011639952222,
+      "loss": 0.4655,
+      "step": 29805
+    },
+    {
+      "epoch": 1.5133835082687108,
+      "grad_norm": 0.020479347433989627,
+      "learning_rate": 0.00016972376674866336,
+      "loss": 0.473,
+      "step": 29810
+    },
+    {
+      "epoch": 1.5136373443667424,
+      "grad_norm": 0.02383837108874991,
+      "learning_rate": 0.00016955748192938215,
+      "loss": 0.4885,
+      "step": 29815
+    },
+    {
+      "epoch": 1.513891180464774,
+      "grad_norm": 0.02189974689590864,
+      "learning_rate": 0.00016939126197431916,
+      "loss": 0.4831,
+      "step": 29820
+    },
+    {
+      "epoch": 1.5141450165628054,
+      "grad_norm": 0.022326421553039454,
+      "learning_rate": 0.00016922510691610288,
+      "loss": 0.4914,
+      "step": 29825
+    },
+    {
+      "epoch": 1.514398852660837,
+      "grad_norm": 0.025686926161214048,
+      "learning_rate": 0.00016905901678734836,
+      "loss": 0.4565,
+      "step": 29830
+    },
+    {
+      "epoch": 1.5146526887588685,
+      "grad_norm": 0.02510185893652824,
+      "learning_rate": 0.00016889299162065863,
+      "loss": 0.4412,
+      "step": 29835
+    },
+    {
+      "epoch": 1.5149065248569,
+      "grad_norm": 0.019743756304140575,
+      "learning_rate": 0.00016872703144862322,
+      "loss": 0.4503,
+      "step": 29840
+    },
+    {
+      "epoch": 1.5151603609549316,
+      "grad_norm": 0.023099741532777593,
+      "learning_rate": 0.0001685611363038197,
+      "loss": 0.4646,
+      "step": 29845
+    },
+    {
+      "epoch": 1.5154141970529629,
+      "grad_norm": 0.02167102495604313,
+      "learning_rate": 0.000168395306218812,
+      "loss": 0.4297,
+      "step": 29850
+    },
+    {
+      "epoch": 1.5156680331509944,
+      "grad_norm": 0.021199600428507007,
+      "learning_rate": 0.00016822954122615202,
+      "loss": 0.5018,
+      "step": 29855
+    },
+    {
+      "epoch": 1.515921869249026,
+      "grad_norm": 0.03525431444628682,
+      "learning_rate": 0.0001680638413583787,
+      "loss": 0.4627,
+      "step": 29860
+    },
+    {
+      "epoch": 1.5161757053470573,
+      "grad_norm": 0.021847874025035,
+      "learning_rate": 0.00016789820664801785,
+      "loss": 0.4601,
+      "step": 29865
+    },
+    {
+      "epoch": 1.5164295414450888,
+      "grad_norm": 0.023339409719603202,
+      "learning_rate": 0.00016773263712758298,
+      "loss": 0.4853,
+      "step": 29870
+    },
+    {
+      "epoch": 1.5166833775431203,
+      "grad_norm": 0.028528438177320366,
+      "learning_rate": 0.00016756713282957425,
+      "loss": 0.4764,
+      "step": 29875
+    },
+    {
+      "epoch": 1.5169372136411519,
+      "grad_norm": 0.02968109811483413,
+      "learning_rate": 0.00016740169378647967,
+      "loss": 0.4622,
+      "step": 29880
+    },
+    {
+      "epoch": 1.5171910497391834,
+      "grad_norm": 0.020421612487212134,
+      "learning_rate": 0.00016723632003077382,
+      "loss": 0.4608,
+      "step": 29885
+    },
+    {
+      "epoch": 1.517444885837215,
+      "grad_norm": 0.022429618618365205,
+      "learning_rate": 0.000167071011594919,
+      "loss": 0.4365,
+      "step": 29890
+    },
+    {
+      "epoch": 1.5176987219352465,
+      "grad_norm": 0.022662076262398696,
+      "learning_rate": 0.00016690576851136407,
+      "loss": 0.4606,
+      "step": 29895
+    },
+    {
+      "epoch": 1.517952558033278,
+      "grad_norm": 0.024997243449074575,
+      "learning_rate": 0.00016674059081254588,
+      "loss": 0.494,
+      "step": 29900
+    },
+    {
+      "epoch": 1.5182063941313095,
+      "grad_norm": 0.02074484540640106,
+      "learning_rate": 0.00016657547853088755,
+      "loss": 0.4665,
+      "step": 29905
+    },
+    {
+      "epoch": 1.518460230229341,
+      "grad_norm": 0.02155076942870292,
+      "learning_rate": 0.00016641043169880016,
+      "loss": 0.4733,
+      "step": 29910
+    },
+    {
+      "epoch": 1.5187140663273724,
+      "grad_norm": 0.021665837981174028,
+      "learning_rate": 0.00016624545034868126,
+      "loss": 0.4754,
+      "step": 29915
+    },
+    {
+      "epoch": 1.518967902425404,
+      "grad_norm": 0.021946502878706562,
+      "learning_rate": 0.00016608053451291606,
+      "loss": 0.4603,
+      "step": 29920
+    },
+    {
+      "epoch": 1.5192217385234354,
+      "grad_norm": 0.021175079987024562,
+      "learning_rate": 0.0001659156842238766,
+      "loss": 0.4527,
+      "step": 29925
+    },
+    {
+      "epoch": 1.5194755746214668,
+      "grad_norm": 0.0237496786913426,
+      "learning_rate": 0.00016575089951392246,
+      "loss": 0.4646,
+      "step": 29930
+    },
+    {
+      "epoch": 1.5197294107194983,
+      "grad_norm": 0.023737688121588916,
+      "learning_rate": 0.0001655861804153997,
+      "loss": 0.4924,
+      "step": 29935
+    },
+    {
+      "epoch": 1.5199832468175298,
+      "grad_norm": 0.021122130099155718,
+      "learning_rate": 0.00016542152696064216,
+      "loss": 0.4394,
+      "step": 29940
+    },
+    {
+      "epoch": 1.5202370829155614,
+      "grad_norm": 0.02227799640020005,
+      "learning_rate": 0.00016525693918197017,
+      "loss": 0.4538,
+      "step": 29945
+    },
+    {
+      "epoch": 1.520490919013593,
+      "grad_norm": 0.02210238996254023,
+      "learning_rate": 0.00016509241711169182,
+      "loss": 0.4354,
+      "step": 29950
+    },
+    {
+      "epoch": 1.5207447551116244,
+      "grad_norm": 0.023706052350875976,
+      "learning_rate": 0.00016492796078210165,
+      "loss": 0.4673,
+      "step": 29955
+    },
+    {
+      "epoch": 1.520998591209656,
+      "grad_norm": 0.024032972742291287,
+      "learning_rate": 0.00016476357022548194,
+      "loss": 0.4675,
+      "step": 29960
+    },
+    {
+      "epoch": 1.5212524273076875,
+      "grad_norm": 0.022993782010873133,
+      "learning_rate": 0.0001645992454741016,
+      "loss": 0.467,
+      "step": 29965
+    },
+    {
+      "epoch": 1.521506263405719,
+      "grad_norm": 0.02190020611794009,
+      "learning_rate": 0.0001644349865602165,
+      "loss": 0.4702,
+      "step": 29970
+    },
+    {
+      "epoch": 1.5217600995037506,
+      "grad_norm": 0.028275273809187658,
+      "learning_rate": 0.00016427079351607031,
+      "loss": 0.4871,
+      "step": 29975
+    },
+    {
+      "epoch": 1.5220139356017819,
+      "grad_norm": 0.021445956898613706,
+      "learning_rate": 0.00016410666637389272,
+      "loss": 0.4798,
+      "step": 29980
+    },
+    {
+      "epoch": 1.5222677716998134,
+      "grad_norm": 0.02438123499531666,
+      "learning_rate": 0.00016394260516590175,
+      "loss": 0.4827,
+      "step": 29985
+    },
+    {
+      "epoch": 1.522521607797845,
+      "grad_norm": 0.022720865331639653,
+      "learning_rate": 0.00016377860992430128,
+      "loss": 0.4744,
+      "step": 29990
+    },
+    {
+      "epoch": 1.5227754438958763,
+      "grad_norm": 0.022386692990421366,
+      "learning_rate": 0.00016361468068128314,
+      "loss": 0.487,
+      "step": 29995
+    },
+    {
+      "epoch": 1.5230292799939078,
+      "grad_norm": 0.022169321428591356,
+      "learning_rate": 0.00016345081746902546,
+      "loss": 0.455,
+      "step": 30000
+    },
+    {
+      "epoch": 1.5232831160919393,
+      "grad_norm": 0.023182632494306774,
+      "learning_rate": 0.0001632870203196941,
+      "loss": 0.4439,
+      "step": 30005
+    },
+    {
+      "epoch": 1.5235369521899709,
+      "grad_norm": 0.02371609367353566,
+      "learning_rate": 0.00016312328926544134,
+      "loss": 0.4749,
+      "step": 30010
+    },
+    {
+      "epoch": 1.5237907882880024,
+      "grad_norm": 0.03987679265817027,
+      "learning_rate": 0.00016295962433840705,
+      "loss": 0.4789,
+      "step": 30015
+    },
+    {
+      "epoch": 1.524044624386034,
+      "grad_norm": 0.02301138402117863,
+      "learning_rate": 0.0001627960255707175,
+      "loss": 0.4964,
+      "step": 30020
+    },
+    {
+      "epoch": 1.5242984604840655,
+      "grad_norm": 0.03265353278900402,
+      "learning_rate": 0.0001626324929944867,
+      "loss": 0.4898,
+      "step": 30025
+    },
+    {
+      "epoch": 1.524552296582097,
+      "grad_norm": 0.022592094290072112,
+      "learning_rate": 0.00016246902664181483,
+      "loss": 0.4581,
+      "step": 30030
+    },
+    {
+      "epoch": 1.5248061326801285,
+      "grad_norm": 0.022149041415701717,
+      "learning_rate": 0.00016230562654478997,
+      "loss": 0.501,
+      "step": 30035
+    },
+    {
+      "epoch": 1.52505996877816,
+      "grad_norm": 0.027689045530166045,
+      "learning_rate": 0.00016214229273548626,
+      "loss": 0.4852,
+      "step": 30040
+    },
+    {
+      "epoch": 1.5253138048761914,
+      "grad_norm": 0.021429952552435644,
+      "learning_rate": 0.00016197902524596586,
+      "loss": 0.4657,
+      "step": 30045
+    },
+    {
+      "epoch": 1.525567640974223,
+      "grad_norm": 0.020301150588238986,
+      "learning_rate": 0.0001618158241082771,
+      "loss": 0.4545,
+      "step": 30050
+    },
+    {
+      "epoch": 1.5258214770722545,
+      "grad_norm": 0.031778032501045626,
+      "learning_rate": 0.00016165268935445544,
+      "loss": 0.4439,
+      "step": 30055
+    },
+    {
+      "epoch": 1.526075313170286,
+      "grad_norm": 0.035429628853980294,
+      "learning_rate": 0.00016148962101652364,
+      "loss": 0.4669,
+      "step": 30060
+    },
+    {
+      "epoch": 1.5263291492683173,
+      "grad_norm": 0.02204480950917167,
+      "learning_rate": 0.00016132661912649093,
+      "loss": 0.4762,
+      "step": 30065
+    },
+    {
+      "epoch": 1.5265829853663488,
+      "grad_norm": 0.10463008791075984,
+      "learning_rate": 0.0001611636837163541,
+      "loss": 0.4746,
+      "step": 30070
+    },
+    {
+      "epoch": 1.5268368214643804,
+      "grad_norm": 0.022253586897096424,
+      "learning_rate": 0.0001610008148180962,
+      "loss": 0.4731,
+      "step": 30075
+    },
+    {
+      "epoch": 1.527090657562412,
+      "grad_norm": 0.02649746556621996,
+      "learning_rate": 0.0001608380124636879,
+      "loss": 0.4716,
+      "step": 30080
+    },
+    {
+      "epoch": 1.5273444936604434,
+      "grad_norm": 0.02103303584052773,
+      "learning_rate": 0.00016067527668508624,
+      "loss": 0.4537,
+      "step": 30085
+    },
+    {
+      "epoch": 1.527598329758475,
+      "grad_norm": 0.022990234812475127,
+      "learning_rate": 0.00016051260751423575,
+      "loss": 0.4653,
+      "step": 30090
+    },
+    {
+      "epoch": 1.5278521658565065,
+      "grad_norm": 0.023544638654335974,
+      "learning_rate": 0.00016035000498306712,
+      "loss": 0.4698,
+      "step": 30095
+    },
+    {
+      "epoch": 1.528106001954538,
+      "grad_norm": 0.02200543924407069,
+      "learning_rate": 0.00016018746912349873,
+      "loss": 0.4672,
+      "step": 30100
+    },
+    {
+      "epoch": 1.5283598380525696,
+      "grad_norm": 0.024788258547969233,
+      "learning_rate": 0.00016002499996743553,
+      "loss": 0.4542,
+      "step": 30105
+    },
+    {
+      "epoch": 1.5286136741506011,
+      "grad_norm": 0.020208891949950916,
+      "learning_rate": 0.00015986259754676956,
+      "loss": 0.4441,
+      "step": 30110
+    },
+    {
+      "epoch": 1.5288675102486324,
+      "grad_norm": 0.020897749391169118,
+      "learning_rate": 0.00015970026189337922,
+      "loss": 0.4426,
+      "step": 30115
+    },
+    {
+      "epoch": 1.529121346346664,
+      "grad_norm": 0.029396390617214736,
+      "learning_rate": 0.00015953799303913057,
+      "loss": 0.4663,
+      "step": 30120
+    },
+    {
+      "epoch": 1.5293751824446955,
+      "grad_norm": 0.022145565505644067,
+      "learning_rate": 0.0001593757910158759,
+      "loss": 0.461,
+      "step": 30125
+    },
+    {
+      "epoch": 1.5296290185427268,
+      "grad_norm": 0.025670850634170336,
+      "learning_rate": 0.00015921365585545483,
+      "loss": 0.4842,
+      "step": 30130
+    },
+    {
+      "epoch": 1.5298828546407583,
+      "grad_norm": 0.02368956140515389,
+      "learning_rate": 0.00015905158758969351,
+      "loss": 0.47,
+      "step": 30135
+    },
+    {
+      "epoch": 1.5301366907387899,
+      "grad_norm": 0.029291043930741913,
+      "learning_rate": 0.0001588895862504054,
+      "loss": 0.4698,
+      "step": 30140
+    },
+    {
+      "epoch": 1.5303905268368214,
+      "grad_norm": 0.025337071026864683,
+      "learning_rate": 0.00015872765186939025,
+      "loss": 0.4856,
+      "step": 30145
+    },
+    {
+      "epoch": 1.530644362934853,
+      "grad_norm": 0.026591156717338086,
+      "learning_rate": 0.00015856578447843523,
+      "loss": 0.4747,
+      "step": 30150
+    },
+    {
+      "epoch": 1.5308981990328845,
+      "grad_norm": 0.022063128074845867,
+      "learning_rate": 0.0001584039841093139,
+      "loss": 0.4591,
+      "step": 30155
+    },
+    {
+      "epoch": 1.531152035130916,
+      "grad_norm": 0.021335741188654174,
+      "learning_rate": 0.00015824225079378684,
+      "loss": 0.4508,
+      "step": 30160
+    },
+    {
+      "epoch": 1.5314058712289476,
+      "grad_norm": 0.0223720420156148,
+      "learning_rate": 0.00015808058456360185,
+      "loss": 0.476,
+      "step": 30165
+    },
+    {
+      "epoch": 1.531659707326979,
+      "grad_norm": 0.020840588861711914,
+      "learning_rate": 0.00015791898545049277,
+      "loss": 0.4812,
+      "step": 30170
+    },
+    {
+      "epoch": 1.5319135434250106,
+      "grad_norm": 0.022298891763324967,
+      "learning_rate": 0.0001577574534861811,
+      "loss": 0.4683,
+      "step": 30175
+    },
+    {
+      "epoch": 1.532167379523042,
+      "grad_norm": 0.023587454451723652,
+      "learning_rate": 0.00015759598870237435,
+      "loss": 0.4414,
+      "step": 30180
+    },
+    {
+      "epoch": 1.5324212156210735,
+      "grad_norm": 0.021926001890396517,
+      "learning_rate": 0.00015743459113076757,
+      "loss": 0.4848,
+      "step": 30185
+    },
+    {
+      "epoch": 1.532675051719105,
+      "grad_norm": 0.02060956296914585,
+      "learning_rate": 0.0001572732608030421,
+      "loss": 0.4925,
+      "step": 30190
+    },
+    {
+      "epoch": 1.5329288878171363,
+      "grad_norm": 0.023321532056538852,
+      "learning_rate": 0.0001571119977508665,
+      "loss": 0.4832,
+      "step": 30195
+    },
+    {
+      "epoch": 1.5331827239151679,
+      "grad_norm": 0.020455538805839175,
+      "learning_rate": 0.00015695080200589555,
+      "loss": 0.46,
+      "step": 30200
+    },
+    {
+      "epoch": 1.5334365600131994,
+      "grad_norm": 0.027072650225216113,
+      "learning_rate": 0.0001567896735997716,
+      "loss": 0.5015,
+      "step": 30205
+    },
+    {
+      "epoch": 1.533690396111231,
+      "grad_norm": 0.025146547764616688,
+      "learning_rate": 0.00015662861256412293,
+      "loss": 0.4592,
+      "step": 30210
+    },
+    {
+      "epoch": 1.5339442322092625,
+      "grad_norm": 0.036222624314755626,
+      "learning_rate": 0.0001564676189305654,
+      "loss": 0.4494,
+      "step": 30215
+    },
+    {
+      "epoch": 1.534198068307294,
+      "grad_norm": 0.021401020225457784,
+      "learning_rate": 0.00015630669273070075,
+      "loss": 0.4658,
+      "step": 30220
+    },
+    {
+      "epoch": 1.5344519044053255,
+      "grad_norm": 0.030566491748286028,
+      "learning_rate": 0.00015614583399611864,
+      "loss": 0.4616,
+      "step": 30225
+    },
+    {
+      "epoch": 1.534705740503357,
+      "grad_norm": 0.02095090812110531,
+      "learning_rate": 0.00015598504275839443,
+      "loss": 0.4697,
+      "step": 30230
+    },
+    {
+      "epoch": 1.5349595766013886,
+      "grad_norm": 0.022932148457458463,
+      "learning_rate": 0.00015582431904909082,
+      "loss": 0.4771,
+      "step": 30235
+    },
+    {
+      "epoch": 1.5352134126994201,
+      "grad_norm": 0.03407518278595503,
+      "learning_rate": 0.00015566366289975682,
+      "loss": 0.4951,
+      "step": 30240
+    },
+    {
+      "epoch": 1.5354672487974514,
+      "grad_norm": 0.025253916128979353,
+      "learning_rate": 0.00015550307434192878,
+      "loss": 0.4665,
+      "step": 30245
+    },
+    {
+      "epoch": 1.535721084895483,
+      "grad_norm": 0.02181284888478828,
+      "learning_rate": 0.00015534255340712906,
+      "loss": 0.4712,
+      "step": 30250
+    },
+    {
+      "epoch": 1.5359749209935145,
+      "grad_norm": 0.026511357983416556,
+      "learning_rate": 0.00015518210012686746,
+      "loss": 0.457,
+      "step": 30255
+    },
+    {
+      "epoch": 1.5362287570915458,
+      "grad_norm": 0.02225675314670062,
+      "learning_rate": 0.00015502171453263985,
+      "loss": 0.4655,
+      "step": 30260
+    },
+    {
+      "epoch": 1.5364825931895774,
+      "grad_norm": 0.02579060282785478,
+      "learning_rate": 0.0001548613966559294,
+      "loss": 0.4537,
+      "step": 30265
+    },
+    {
+      "epoch": 1.536736429287609,
+      "grad_norm": 0.022733734170467795,
+      "learning_rate": 0.00015470114652820548,
+      "loss": 0.4893,
+      "step": 30270
+    },
+    {
+      "epoch": 1.5369902653856404,
+      "grad_norm": 0.033369886202445666,
+      "learning_rate": 0.0001545409641809246,
+      "loss": 0.4329,
+      "step": 30275
+    },
+    {
+      "epoch": 1.537244101483672,
+      "grad_norm": 0.02399529836464429,
+      "learning_rate": 0.00015438084964552952,
+      "loss": 0.4608,
+      "step": 30280
+    },
+    {
+      "epoch": 1.5374979375817035,
+      "grad_norm": 0.026731534341703136,
+      "learning_rate": 0.0001542208029534501,
+      "loss": 0.4834,
+      "step": 30285
+    },
+    {
+      "epoch": 1.537751773679735,
+      "grad_norm": 0.02160422482777416,
+      "learning_rate": 0.00015406082413610273,
+      "loss": 0.4872,
+      "step": 30290
+    },
+    {
+      "epoch": 1.5380056097777666,
+      "grad_norm": 0.0316642181786694,
+      "learning_rate": 0.0001539009132248903,
+      "loss": 0.4739,
+      "step": 30295
+    },
+    {
+      "epoch": 1.538259445875798,
+      "grad_norm": 0.021390203234012684,
+      "learning_rate": 0.0001537410702512027,
+      "loss": 0.485,
+      "step": 30300
+    },
+    {
+      "epoch": 1.5385132819738296,
+      "grad_norm": 0.021910472645039536,
+      "learning_rate": 0.00015358129524641612,
+      "loss": 0.4836,
+      "step": 30305
+    },
+    {
+      "epoch": 1.538767118071861,
+      "grad_norm": 0.026086465805670835,
+      "learning_rate": 0.00015342158824189383,
+      "loss": 0.4645,
+      "step": 30310
+    },
+    {
+      "epoch": 1.5390209541698925,
+      "grad_norm": 0.02206117837334602,
+      "learning_rate": 0.00015326194926898524,
+      "loss": 0.4429,
+      "step": 30315
+    },
+    {
+      "epoch": 1.539274790267924,
+      "grad_norm": 0.04439473835947078,
+      "learning_rate": 0.00015310237835902696,
+      "loss": 0.4955,
+      "step": 30320
+    },
+    {
+      "epoch": 1.5395286263659556,
+      "grad_norm": 0.022276905328733724,
+      "learning_rate": 0.0001529428755433417,
+      "loss": 0.4728,
+      "step": 30325
+    },
+    {
+      "epoch": 1.5397824624639869,
+      "grad_norm": 0.022985726029175126,
+      "learning_rate": 0.00015278344085323936,
+      "loss": 0.4855,
+      "step": 30330
+    },
+    {
+      "epoch": 1.5400362985620184,
+      "grad_norm": 0.02518413121712412,
+      "learning_rate": 0.00015262407432001585,
+      "loss": 0.4615,
+      "step": 30335
+    },
+    {
+      "epoch": 1.54029013466005,
+      "grad_norm": 0.02425031078779939,
+      "learning_rate": 0.00015246477597495418,
+      "loss": 0.4743,
+      "step": 30340
+    },
+    {
+      "epoch": 1.5405439707580815,
+      "grad_norm": 0.02735453587465954,
+      "learning_rate": 0.00015230554584932382,
+      "loss": 0.5191,
+      "step": 30345
+    },
+    {
+      "epoch": 1.540797806856113,
+      "grad_norm": 0.019919301710259858,
+      "learning_rate": 0.00015214638397438108,
+      "loss": 0.4306,
+      "step": 30350
+    },
+    {
+      "epoch": 1.5410516429541445,
+      "grad_norm": 0.0254003448653861,
+      "learning_rate": 0.00015198729038136822,
+      "loss": 0.4859,
+      "step": 30355
+    },
+    {
+      "epoch": 1.541305479052176,
+      "grad_norm": 0.025573678483990125,
+      "learning_rate": 0.00015182826510151486,
+      "loss": 0.4723,
+      "step": 30360
+    },
+    {
+      "epoch": 1.5415593151502076,
+      "grad_norm": 0.027652411805370227,
+      "learning_rate": 0.00015166930816603658,
+      "loss": 0.455,
+      "step": 30365
+    },
+    {
+      "epoch": 1.5418131512482391,
+      "grad_norm": 0.020242159971136965,
+      "learning_rate": 0.00015151041960613615,
+      "loss": 0.4484,
+      "step": 30370
+    },
+    {
+      "epoch": 1.5420669873462707,
+      "grad_norm": 0.020441851093310592,
+      "learning_rate": 0.0001513515994530023,
+      "loss": 0.4777,
+      "step": 30375
+    },
+    {
+      "epoch": 1.542320823444302,
+      "grad_norm": 0.02264328559702477,
+      "learning_rate": 0.00015119284773781088,
+      "loss": 0.4692,
+      "step": 30380
+    },
+    {
+      "epoch": 1.5425746595423335,
+      "grad_norm": 0.026801259654149444,
+      "learning_rate": 0.00015103416449172385,
+      "loss": 0.4879,
+      "step": 30385
+    },
+    {
+      "epoch": 1.542828495640365,
+      "grad_norm": 0.029775650566010867,
+      "learning_rate": 0.0001508755497458902,
+      "loss": 0.4733,
+      "step": 30390
+    },
+    {
+      "epoch": 1.5430823317383964,
+      "grad_norm": 0.023702035657112406,
+      "learning_rate": 0.00015071700353144486,
+      "loss": 0.4844,
+      "step": 30395
+    },
+    {
+      "epoch": 1.543336167836428,
+      "grad_norm": 0.02932424299604861,
+      "learning_rate": 0.00015055852587950985,
+      "loss": 0.4498,
+      "step": 30400
+    },
+    {
+      "epoch": 1.5435900039344594,
+      "grad_norm": 0.024319591793915026,
+      "learning_rate": 0.0001504001168211937,
+      "loss": 0.485,
+      "step": 30405
+    },
+    {
+      "epoch": 1.543843840032491,
+      "grad_norm": 0.02366358061890231,
+      "learning_rate": 0.00015024177638759106,
+      "loss": 0.4566,
+      "step": 30410
+    },
+    {
+      "epoch": 1.5440976761305225,
+      "grad_norm": 0.024118758618433097,
+      "learning_rate": 0.00015008350460978358,
+      "loss": 0.4397,
+      "step": 30415
+    },
+    {
+      "epoch": 1.544351512228554,
+      "grad_norm": 0.020666341449150978,
+      "learning_rate": 0.00014992530151883898,
+      "loss": 0.4599,
+      "step": 30420
+    },
+    {
+      "epoch": 1.5446053483265856,
+      "grad_norm": 0.02778511686784231,
+      "learning_rate": 0.000149767167145812,
+      "loss": 0.454,
+      "step": 30425
+    },
+    {
+      "epoch": 1.5448591844246171,
+      "grad_norm": 0.02389674933505474,
+      "learning_rate": 0.0001496091015217434,
+      "loss": 0.463,
+      "step": 30430
+    },
+    {
+      "epoch": 1.5451130205226487,
+      "grad_norm": 0.027551341882567568,
+      "learning_rate": 0.00014945110467766087,
+      "loss": 0.4622,
+      "step": 30435
+    },
+    {
+      "epoch": 1.5453668566206802,
+      "grad_norm": 0.02501687597581702,
+      "learning_rate": 0.0001492931766445782,
+      "loss": 0.4764,
+      "step": 30440
+    },
+    {
+      "epoch": 1.5456206927187115,
+      "grad_norm": 0.0233086153289092,
+      "learning_rate": 0.0001491353174534961,
+      "loss": 0.4686,
+      "step": 30445
+    },
+    {
+      "epoch": 1.545874528816743,
+      "grad_norm": 0.02184154260928588,
+      "learning_rate": 0.0001489775271354013,
+      "loss": 0.4702,
+      "step": 30450
+    },
+    {
+      "epoch": 1.5461283649147746,
+      "grad_norm": 0.024678224162576597,
+      "learning_rate": 0.00014881980572126752,
+      "loss": 0.483,
+      "step": 30455
+    },
+    {
+      "epoch": 1.5463822010128059,
+      "grad_norm": 0.023783148168890343,
+      "learning_rate": 0.00014866215324205423,
+      "loss": 0.4683,
+      "step": 30460
+    },
+    {
+      "epoch": 1.5466360371108374,
+      "grad_norm": 0.022198329720484546,
+      "learning_rate": 0.00014850456972870845,
+      "loss": 0.458,
+      "step": 30465
+    },
+    {
+      "epoch": 1.546889873208869,
+      "grad_norm": 0.02239881749730953,
+      "learning_rate": 0.00014834705521216262,
+      "loss": 0.4715,
+      "step": 30470
+    },
+    {
+      "epoch": 1.5471437093069005,
+      "grad_norm": 0.022857676745027398,
+      "learning_rate": 0.0001481896097233363,
+      "loss": 0.4553,
+      "step": 30475
+    },
+    {
+      "epoch": 1.547397545404932,
+      "grad_norm": 0.021961872351366773,
+      "learning_rate": 0.00014803223329313493,
+      "loss": 0.458,
+      "step": 30480
+    },
+    {
+      "epoch": 1.5476513815029636,
+      "grad_norm": 0.025052913119013042,
+      "learning_rate": 0.00014787492595245107,
+      "loss": 0.4809,
+      "step": 30485
+    },
+    {
+      "epoch": 1.547905217600995,
+      "grad_norm": 0.023440120967819812,
+      "learning_rate": 0.00014771768773216298,
+      "loss": 0.4511,
+      "step": 30490
+    },
+    {
+      "epoch": 1.5481590536990266,
+      "grad_norm": 0.021273780361796304,
+      "learning_rate": 0.00014756051866313618,
+      "loss": 0.4716,
+      "step": 30495
+    },
+    {
+      "epoch": 1.5484128897970582,
+      "grad_norm": 0.17776653515516164,
+      "learning_rate": 0.00014740341877622181,
+      "loss": 0.4469,
+      "step": 30500
+    },
+    {
+      "epoch": 1.5486667258950897,
+      "grad_norm": 0.028505550422634855,
+      "learning_rate": 0.0001472463881022581,
+      "loss": 0.4391,
+      "step": 30505
+    },
+    {
+      "epoch": 1.548920561993121,
+      "grad_norm": 0.021992503155247888,
+      "learning_rate": 0.00014708942667206903,
+      "loss": 0.4806,
+      "step": 30510
+    },
+    {
+      "epoch": 1.5491743980911525,
+      "grad_norm": 0.022741796670375448,
+      "learning_rate": 0.0001469325345164657,
+      "loss": 0.4632,
+      "step": 30515
+    },
+    {
+      "epoch": 1.549428234189184,
+      "grad_norm": 0.020106646227831544,
+      "learning_rate": 0.00014677571166624498,
+      "loss": 0.4321,
+      "step": 30520
+    },
+    {
+      "epoch": 1.5496820702872154,
+      "grad_norm": 0.02404819367108056,
+      "learning_rate": 0.0001466189581521905,
+      "loss": 0.5009,
+      "step": 30525
+    },
+    {
+      "epoch": 1.549935906385247,
+      "grad_norm": 0.020931172832830407,
+      "learning_rate": 0.00014646227400507238,
+      "loss": 0.4553,
+      "step": 30530
+    },
+    {
+      "epoch": 1.5501897424832785,
+      "grad_norm": 0.021373194488072105,
+      "learning_rate": 0.00014630565925564666,
+      "loss": 0.4716,
+      "step": 30535
+    },
+    {
+      "epoch": 1.55044357858131,
+      "grad_norm": 0.0197633540678443,
+      "learning_rate": 0.0001461491139346563,
+      "loss": 0.4438,
+      "step": 30540
+    },
+    {
+      "epoch": 1.5506974146793415,
+      "grad_norm": 0.02187526961268043,
+      "learning_rate": 0.00014599263807283004,
+      "loss": 0.475,
+      "step": 30545
+    },
+    {
+      "epoch": 1.550951250777373,
+      "grad_norm": 0.026271464926687128,
+      "learning_rate": 0.00014583623170088368,
+      "loss": 0.4536,
+      "step": 30550
+    },
+    {
+      "epoch": 1.5512050868754046,
+      "grad_norm": 0.02494946006654002,
+      "learning_rate": 0.00014567989484951866,
+      "loss": 0.4895,
+      "step": 30555
+    },
+    {
+      "epoch": 1.5514589229734361,
+      "grad_norm": 0.021477409162667542,
+      "learning_rate": 0.00014552362754942345,
+      "loss": 0.4711,
+      "step": 30560
+    },
+    {
+      "epoch": 1.5517127590714677,
+      "grad_norm": 0.02423951069240377,
+      "learning_rate": 0.00014536742983127222,
+      "loss": 0.4594,
+      "step": 30565
+    },
+    {
+      "epoch": 1.5519665951694992,
+      "grad_norm": 0.022676565651745723,
+      "learning_rate": 0.0001452113017257261,
+      "loss": 0.4663,
+      "step": 30570
+    },
+    {
+      "epoch": 1.5522204312675305,
+      "grad_norm": 0.02291640098589712,
+      "learning_rate": 0.000145055243263432,
+      "loss": 0.4895,
+      "step": 30575
+    },
+    {
+      "epoch": 1.552474267365562,
+      "grad_norm": 0.028475726341260384,
+      "learning_rate": 0.0001448992544750235,
+      "loss": 0.4935,
+      "step": 30580
+    },
+    {
+      "epoch": 1.5527281034635936,
+      "grad_norm": 0.027950998531424066,
+      "learning_rate": 0.0001447433353911205,
+      "loss": 0.4825,
+      "step": 30585
+    },
+    {
+      "epoch": 1.5529819395616251,
+      "grad_norm": 0.03974053562564052,
+      "learning_rate": 0.00014458748604232924,
+      "loss": 0.4758,
+      "step": 30590
+    },
+    {
+      "epoch": 1.5532357756596564,
+      "grad_norm": 0.022027661241062403,
+      "learning_rate": 0.00014443170645924192,
+      "loss": 0.4816,
+      "step": 30595
+    },
+    {
+      "epoch": 1.553489611757688,
+      "grad_norm": 0.02238767269393634,
+      "learning_rate": 0.0001442759966724375,
+      "loss": 0.4531,
+      "step": 30600
+    },
+    {
+      "epoch": 1.5537434478557195,
+      "grad_norm": 0.026046907786261127,
+      "learning_rate": 0.0001441203567124808,
+      "loss": 0.459,
+      "step": 30605
+    },
+    {
+      "epoch": 1.553997283953751,
+      "grad_norm": 0.023211596978539077,
+      "learning_rate": 0.00014396478660992353,
+      "loss": 0.4597,
+      "step": 30610
+    },
+    {
+      "epoch": 1.5542511200517826,
+      "grad_norm": 0.02283234851673069,
+      "learning_rate": 0.00014380928639530282,
+      "loss": 0.4729,
+      "step": 30615
+    },
+    {
+      "epoch": 1.554504956149814,
+      "grad_norm": 0.022351336504833237,
+      "learning_rate": 0.00014365385609914312,
+      "loss": 0.4719,
+      "step": 30620
+    },
+    {
+      "epoch": 1.5547587922478456,
+      "grad_norm": 0.022992716838049543,
+      "learning_rate": 0.00014349849575195423,
+      "loss": 0.444,
+      "step": 30625
+    },
+    {
+      "epoch": 1.5550126283458772,
+      "grad_norm": 0.024021395143192816,
+      "learning_rate": 0.00014334320538423285,
+      "loss": 0.4633,
+      "step": 30630
+    },
+    {
+      "epoch": 1.5552664644439087,
+      "grad_norm": 0.023643090769376898,
+      "learning_rate": 0.00014318798502646146,
+      "loss": 0.4528,
+      "step": 30635
+    },
+    {
+      "epoch": 1.55552030054194,
+      "grad_norm": 0.01987202638142388,
+      "learning_rate": 0.00014303283470910923,
+      "loss": 0.4648,
+      "step": 30640
+    },
+    {
+      "epoch": 1.5557741366399715,
+      "grad_norm": 0.022557897730457903,
+      "learning_rate": 0.00014287775446263147,
+      "loss": 0.4736,
+      "step": 30645
+    },
+    {
+      "epoch": 1.556027972738003,
+      "grad_norm": 0.045642258711147084,
+      "learning_rate": 0.0001427227443174694,
+      "loss": 0.4524,
+      "step": 30650
+    },
+    {
+      "epoch": 1.5562818088360346,
+      "grad_norm": 0.02528878503814024,
+      "learning_rate": 0.00014256780430405103,
+      "loss": 0.4454,
+      "step": 30655
+    },
+    {
+      "epoch": 1.556535644934066,
+      "grad_norm": 0.026569740501743554,
+      "learning_rate": 0.00014241293445279,
+      "loss": 0.4783,
+      "step": 30660
+    },
+    {
+      "epoch": 1.5567894810320975,
+      "grad_norm": 0.021501409027796143,
+      "learning_rate": 0.00014225813479408684,
+      "loss": 0.4639,
+      "step": 30665
+    },
+    {
+      "epoch": 1.557043317130129,
+      "grad_norm": 0.02015158186890378,
+      "learning_rate": 0.0001421034053583276,
+      "loss": 0.452,
+      "step": 30670
+    },
+    {
+      "epoch": 1.5572971532281605,
+      "grad_norm": 0.02290562338028314,
+      "learning_rate": 0.00014194874617588522,
+      "loss": 0.4564,
+      "step": 30675
+    },
+    {
+      "epoch": 1.557550989326192,
+      "grad_norm": 0.021262535192917962,
+      "learning_rate": 0.0001417941572771182,
+      "loss": 0.4404,
+      "step": 30680
+    },
+    {
+      "epoch": 1.5578048254242236,
+      "grad_norm": 0.023855140132822632,
+      "learning_rate": 0.0001416396386923719,
+      "loss": 0.4657,
+      "step": 30685
+    },
+    {
+      "epoch": 1.5580586615222551,
+      "grad_norm": 0.026001147130948372,
+      "learning_rate": 0.00014148519045197722,
+      "loss": 0.4983,
+      "step": 30690
+    },
+    {
+      "epoch": 1.5583124976202867,
+      "grad_norm": 0.025029246692496336,
+      "learning_rate": 0.00014133081258625192,
+      "loss": 0.4728,
+      "step": 30695
+    },
+    {
+      "epoch": 1.5585663337183182,
+      "grad_norm": 0.02861011211966918,
+      "learning_rate": 0.00014117650512549912,
+      "loss": 0.4667,
+      "step": 30700
+    },
+    {
+      "epoch": 1.5588201698163497,
+      "grad_norm": 0.020989857959052866,
+      "learning_rate": 0.00014102226810000919,
+      "loss": 0.4785,
+      "step": 30705
+    },
+    {
+      "epoch": 1.559074005914381,
+      "grad_norm": 0.02520491934250732,
+      "learning_rate": 0.0001408681015400577,
+      "loss": 0.4715,
+      "step": 30710
+    },
+    {
+      "epoch": 1.5593278420124126,
+      "grad_norm": 0.02884493504793044,
+      "learning_rate": 0.000140714005475907,
+      "loss": 0.4626,
+      "step": 30715
+    },
+    {
+      "epoch": 1.5595816781104441,
+      "grad_norm": 0.023290080100767288,
+      "learning_rate": 0.00014055997993780512,
+      "loss": 0.4761,
+      "step": 30720
+    },
+    {
+      "epoch": 1.5598355142084754,
+      "grad_norm": 0.024842802813832925,
+      "learning_rate": 0.0001404060249559868,
+      "loss": 0.4583,
+      "step": 30725
+    },
+    {
+      "epoch": 1.560089350306507,
+      "grad_norm": 0.02281559318997292,
+      "learning_rate": 0.00014025214056067237,
+      "loss": 0.4806,
+      "step": 30730
+    },
+    {
+      "epoch": 1.5603431864045385,
+      "grad_norm": 0.033399438534170926,
+      "learning_rate": 0.00014009832678206887,
+      "loss": 0.4709,
+      "step": 30735
+    },
+    {
+      "epoch": 1.56059702250257,
+      "grad_norm": 0.028247300578566486,
+      "learning_rate": 0.00013994458365036879,
+      "loss": 0.4727,
+      "step": 30740
+    },
+    {
+      "epoch": 1.5608508586006016,
+      "grad_norm": 0.02286113664149876,
+      "learning_rate": 0.0001397909111957515,
+      "loss": 0.4916,
+      "step": 30745
+    },
+    {
+      "epoch": 1.561104694698633,
+      "grad_norm": 0.02910635163937602,
+      "learning_rate": 0.00013963730944838181,
+      "loss": 0.4586,
+      "step": 30750
+    },
+    {
+      "epoch": 1.5613585307966646,
+      "grad_norm": 0.028491477035486,
+      "learning_rate": 0.00013948377843841137,
+      "loss": 0.4695,
+      "step": 30755
+    },
+    {
+      "epoch": 1.5616123668946962,
+      "grad_norm": 0.0224890085130081,
+      "learning_rate": 0.00013933031819597714,
+      "loss": 0.4666,
+      "step": 30760
+    },
+    {
+      "epoch": 1.5618662029927277,
+      "grad_norm": 0.019158984174170664,
+      "learning_rate": 0.00013917692875120276,
+      "loss": 0.4534,
+      "step": 30765
+    },
+    {
+      "epoch": 1.5621200390907592,
+      "grad_norm": 0.023081415384837618,
+      "learning_rate": 0.00013902361013419807,
+      "loss": 0.4889,
+      "step": 30770
+    },
+    {
+      "epoch": 1.5623738751887906,
+      "grad_norm": 0.023710781179949143,
+      "learning_rate": 0.0001388703623750583,
+      "loss": 0.4508,
+      "step": 30775
+    },
+    {
+      "epoch": 1.562627711286822,
+      "grad_norm": 0.02229210338649228,
+      "learning_rate": 0.00013871718550386564,
+      "loss": 0.4581,
+      "step": 30780
+    },
+    {
+      "epoch": 1.5628815473848536,
+      "grad_norm": 0.02255486124550216,
+      "learning_rate": 0.00013856407955068755,
+      "loss": 0.468,
+      "step": 30785
+    },
+    {
+      "epoch": 1.563135383482885,
+      "grad_norm": 0.028951039267342597,
+      "learning_rate": 0.0001384110445455784,
+      "loss": 0.4481,
+      "step": 30790
+    },
+    {
+      "epoch": 1.5633892195809165,
+      "grad_norm": 0.03206682793554021,
+      "learning_rate": 0.00013825808051857774,
+      "loss": 0.4628,
+      "step": 30795
+    },
+    {
+      "epoch": 1.563643055678948,
+      "grad_norm": 0.03315618731467156,
+      "learning_rate": 0.00013810518749971207,
+      "loss": 0.4913,
+      "step": 30800
+    },
+    {
+      "epoch": 1.5638968917769795,
+      "grad_norm": 0.020986799281081246,
+      "learning_rate": 0.00013795236551899316,
+      "loss": 0.4788,
+      "step": 30805
+    },
+    {
+      "epoch": 1.564150727875011,
+      "grad_norm": 0.027439058381662006,
+      "learning_rate": 0.0001377996146064195,
+      "loss": 0.5025,
+      "step": 30810
+    },
+    {
+      "epoch": 1.5644045639730426,
+      "grad_norm": 0.03575554930996507,
+      "learning_rate": 0.00013764693479197503,
+      "loss": 0.4559,
+      "step": 30815
+    },
+    {
+      "epoch": 1.5646584000710742,
+      "grad_norm": 0.022347257231482485,
+      "learning_rate": 0.00013749432610563045,
+      "loss": 0.464,
+      "step": 30820
+    },
+    {
+      "epoch": 1.5649122361691057,
+      "grad_norm": 0.022141752765691802,
+      "learning_rate": 0.00013734178857734147,
+      "loss": 0.4646,
+      "step": 30825
+    },
+    {
+      "epoch": 1.5651660722671372,
+      "grad_norm": 0.0205841238198598,
+      "learning_rate": 0.0001371893222370511,
+      "loss": 0.4786,
+      "step": 30830
+    },
+    {
+      "epoch": 1.5654199083651688,
+      "grad_norm": 0.02330750400737154,
+      "learning_rate": 0.00013703692711468734,
+      "loss": 0.479,
+      "step": 30835
+    },
+    {
+      "epoch": 1.5656737444632,
+      "grad_norm": 0.02083094398512009,
+      "learning_rate": 0.00013688460324016484,
+      "loss": 0.4422,
+      "step": 30840
+    },
+    {
+      "epoch": 1.5659275805612316,
+      "grad_norm": 0.02046271028364385,
+      "learning_rate": 0.00013673235064338375,
+      "loss": 0.4481,
+      "step": 30845
+    },
+    {
+      "epoch": 1.5661814166592631,
+      "grad_norm": 0.020193980043060795,
+      "learning_rate": 0.00013658016935423067,
+      "loss": 0.4385,
+      "step": 30850
+    },
+    {
+      "epoch": 1.5664352527572944,
+      "grad_norm": 0.023275239337087233,
+      "learning_rate": 0.0001364280594025779,
+      "loss": 0.4721,
+      "step": 30855
+    },
+    {
+      "epoch": 1.566689088855326,
+      "grad_norm": 0.021162190591902365,
+      "learning_rate": 0.00013627602081828412,
+      "loss": 0.4614,
+      "step": 30860
+    },
+    {
+      "epoch": 1.5669429249533575,
+      "grad_norm": 0.020039658211474635,
+      "learning_rate": 0.00013612405363119334,
+      "loss": 0.4461,
+      "step": 30865
+    },
+    {
+      "epoch": 1.567196761051389,
+      "grad_norm": 0.023617453564502285,
+      "learning_rate": 0.00013597215787113638,
+      "loss": 0.4713,
+      "step": 30870
+    },
+    {
+      "epoch": 1.5674505971494206,
+      "grad_norm": 0.025261328277977876,
+      "learning_rate": 0.00013582033356792923,
+      "loss": 0.464,
+      "step": 30875
+    },
+    {
+      "epoch": 1.5677044332474521,
+      "grad_norm": 0.022210536243253937,
+      "learning_rate": 0.00013566858075137462,
+      "loss": 0.4461,
+      "step": 30880
+    },
+    {
+      "epoch": 1.5679582693454837,
+      "grad_norm": 0.02139787157750851,
+      "learning_rate": 0.00013551689945126056,
+      "loss": 0.4619,
+      "step": 30885
+    },
+    {
+      "epoch": 1.5682121054435152,
+      "grad_norm": 0.02347373800866568,
+      "learning_rate": 0.0001353652896973614,
+      "loss": 0.4681,
+      "step": 30890
+    },
+    {
+      "epoch": 1.5684659415415467,
+      "grad_norm": 0.027527125241354457,
+      "learning_rate": 0.00013521375151943766,
+      "loss": 0.4738,
+      "step": 30895
+    },
+    {
+      "epoch": 1.5687197776395783,
+      "grad_norm": 0.023524626568918166,
+      "learning_rate": 0.0001350622849472351,
+      "loss": 0.4821,
+      "step": 30900
+    },
+    {
+      "epoch": 1.5689736137376096,
+      "grad_norm": 0.02466639732929348,
+      "learning_rate": 0.00013491089001048628,
+      "loss": 0.4721,
+      "step": 30905
+    },
+    {
+      "epoch": 1.569227449835641,
+      "grad_norm": 0.027029307313648266,
+      "learning_rate": 0.00013475956673890887,
+      "loss": 0.4678,
+      "step": 30910
+    },
+    {
+      "epoch": 1.5694812859336726,
+      "grad_norm": 0.03282760997469555,
+      "learning_rate": 0.0001346083151622072,
+      "loss": 0.4394,
+      "step": 30915
+    },
+    {
+      "epoch": 1.5697351220317042,
+      "grad_norm": 0.02789891853127172,
+      "learning_rate": 0.00013445713531007092,
+      "loss": 0.4628,
+      "step": 30920
+    },
+    {
+      "epoch": 1.5699889581297355,
+      "grad_norm": 0.02426468888804551,
+      "learning_rate": 0.00013430602721217617,
+      "loss": 0.48,
+      "step": 30925
+    },
+    {
+      "epoch": 1.570242794227767,
+      "grad_norm": 0.021681960512083976,
+      "learning_rate": 0.0001341549908981844,
+      "loss": 0.4885,
+      "step": 30930
+    },
+    {
+      "epoch": 1.5704966303257986,
+      "grad_norm": 0.022832994313996206,
+      "learning_rate": 0.00013400402639774362,
+      "loss": 0.4431,
+      "step": 30935
+    },
+    {
+      "epoch": 1.57075046642383,
+      "grad_norm": 0.024825658736222323,
+      "learning_rate": 0.00013385313374048708,
+      "loss": 0.4614,
+      "step": 30940
+    },
+    {
+      "epoch": 1.5710043025218616,
+      "grad_norm": 0.021382123283392958,
+      "learning_rate": 0.0001337023129560344,
+      "loss": 0.4473,
+      "step": 30945
+    },
+    {
+      "epoch": 1.5712581386198932,
+      "grad_norm": 0.022778600002653724,
+      "learning_rate": 0.000133551564073991,
+      "loss": 0.4876,
+      "step": 30950
+    },
+    {
+      "epoch": 1.5715119747179247,
+      "grad_norm": 0.022378373834901225,
+      "learning_rate": 0.0001334008871239482,
+      "loss": 0.4689,
+      "step": 30955
+    },
+    {
+      "epoch": 1.5717658108159562,
+      "grad_norm": 0.022126622701371928,
+      "learning_rate": 0.0001332502821354829,
+      "loss": 0.4686,
+      "step": 30960
+    },
+    {
+      "epoch": 1.5720196469139878,
+      "grad_norm": 0.02328246824343477,
+      "learning_rate": 0.00013309974913815843,
+      "loss": 0.49,
+      "step": 30965
+    },
+    {
+      "epoch": 1.5722734830120193,
+      "grad_norm": 0.022513981649263024,
+      "learning_rate": 0.0001329492881615233,
+      "loss": 0.4663,
+      "step": 30970
+    },
+    {
+      "epoch": 1.5725273191100506,
+      "grad_norm": 0.02648229761719871,
+      "learning_rate": 0.00013279889923511256,
+      "loss": 0.4824,
+      "step": 30975
+    },
+    {
+      "epoch": 1.5727811552080821,
+      "grad_norm": 0.031697592758533163,
+      "learning_rate": 0.00013264858238844652,
+      "loss": 0.4597,
+      "step": 30980
+    },
+    {
+      "epoch": 1.5730349913061137,
+      "grad_norm": 0.023692598750396664,
+      "learning_rate": 0.0001324983376510319,
+      "loss": 0.4792,
+      "step": 30985
+    },
+    {
+      "epoch": 1.573288827404145,
+      "grad_norm": 0.02253078704005018,
+      "learning_rate": 0.0001323481650523608,
+      "loss": 0.4632,
+      "step": 30990
+    },
+    {
+      "epoch": 1.5735426635021765,
+      "grad_norm": 0.023678013208962723,
+      "learning_rate": 0.00013219806462191154,
+      "loss": 0.4776,
+      "step": 30995
+    },
+    {
+      "epoch": 1.573796499600208,
+      "grad_norm": 0.022573287303128707,
+      "learning_rate": 0.00013204803638914791,
+      "loss": 0.5042,
+      "step": 31000
+    },
+    {
+      "epoch": 1.5740503356982396,
+      "grad_norm": 0.022353046325857568,
+      "learning_rate": 0.00013189808038351953,
+      "loss": 0.468,
+      "step": 31005
+    },
+    {
+      "epoch": 1.5743041717962711,
+      "grad_norm": 0.019324948043265216,
+      "learning_rate": 0.00013174819663446254,
+      "loss": 0.4637,
+      "step": 31010
+    },
+    {
+      "epoch": 1.5745580078943027,
+      "grad_norm": 0.02228322129344453,
+      "learning_rate": 0.00013159838517139795,
+      "loss": 0.4464,
+      "step": 31015
+    },
+    {
+      "epoch": 1.5748118439923342,
+      "grad_norm": 0.020602212019198058,
+      "learning_rate": 0.00013144864602373325,
+      "loss": 0.4768,
+      "step": 31020
+    },
+    {
+      "epoch": 1.5750656800903657,
+      "grad_norm": 0.021221567403693256,
+      "learning_rate": 0.0001312989792208612,
+      "loss": 0.4386,
+      "step": 31025
+    },
+    {
+      "epoch": 1.5753195161883973,
+      "grad_norm": 0.022718156061845098,
+      "learning_rate": 0.00013114938479216105,
+      "loss": 0.4555,
+      "step": 31030
+    },
+    {
+      "epoch": 1.5755733522864288,
+      "grad_norm": 0.026040312386550257,
+      "learning_rate": 0.000130999862766997,
+      "loss": 0.4637,
+      "step": 31035
+    },
+    {
+      "epoch": 1.5758271883844601,
+      "grad_norm": 0.025384578710037565,
+      "learning_rate": 0.00013085041317471984,
+      "loss": 0.4709,
+      "step": 31040
+    },
+    {
+      "epoch": 1.5760810244824917,
+      "grad_norm": 0.019114058763165305,
+      "learning_rate": 0.00013070103604466548,
+      "loss": 0.43,
+      "step": 31045
+    },
+    {
+      "epoch": 1.5763348605805232,
+      "grad_norm": 0.02267175025832458,
+      "learning_rate": 0.00013055173140615623,
+      "loss": 0.4853,
+      "step": 31050
+    },
+    {
+      "epoch": 1.5765886966785545,
+      "grad_norm": 0.02231164473017753,
+      "learning_rate": 0.00013040249928849952,
+      "loss": 0.4755,
+      "step": 31055
+    },
+    {
+      "epoch": 1.576842532776586,
+      "grad_norm": 0.025173835816743373,
+      "learning_rate": 0.00013025333972098912,
+      "loss": 0.4666,
+      "step": 31060
+    },
+    {
+      "epoch": 1.5770963688746176,
+      "grad_norm": 0.022930909710894763,
+      "learning_rate": 0.00013010425273290394,
+      "loss": 0.4614,
+      "step": 31065
+    },
+    {
+      "epoch": 1.577350204972649,
+      "grad_norm": 0.019310788047425583,
+      "learning_rate": 0.00012995523835350958,
+      "loss": 0.4661,
+      "step": 31070
+    },
+    {
+      "epoch": 1.5776040410706806,
+      "grad_norm": 0.033091726187777254,
+      "learning_rate": 0.0001298062966120564,
+      "loss": 0.4882,
+      "step": 31075
+    },
+    {
+      "epoch": 1.5778578771687122,
+      "grad_norm": 0.02326961130506038,
+      "learning_rate": 0.00012965742753778115,
+      "loss": 0.4549,
+      "step": 31080
+    },
+    {
+      "epoch": 1.5781117132667437,
+      "grad_norm": 0.03266197407987793,
+      "learning_rate": 0.00012950863115990602,
+      "loss": 0.4458,
+      "step": 31085
+    },
+    {
+      "epoch": 1.5783655493647752,
+      "grad_norm": 0.021876108652836432,
+      "learning_rate": 0.00012935990750763876,
+      "loss": 0.4695,
+      "step": 31090
+    },
+    {
+      "epoch": 1.5786193854628068,
+      "grad_norm": 0.026393027361750386,
+      "learning_rate": 0.00012921125661017347,
+      "loss": 0.4486,
+      "step": 31095
+    },
+    {
+      "epoch": 1.5788732215608383,
+      "grad_norm": 0.02184839617602591,
+      "learning_rate": 0.0001290626784966892,
+      "loss": 0.4432,
+      "step": 31100
+    },
+    {
+      "epoch": 1.5791270576588696,
+      "grad_norm": 0.029331145194564957,
+      "learning_rate": 0.00012891417319635146,
+      "loss": 0.4636,
+      "step": 31105
+    },
+    {
+      "epoch": 1.5793808937569012,
+      "grad_norm": 0.02774728403803806,
+      "learning_rate": 0.0001287657407383107,
+      "loss": 0.4598,
+      "step": 31110
+    },
+    {
+      "epoch": 1.5796347298549327,
+      "grad_norm": 0.02128441451715541,
+      "learning_rate": 0.0001286173811517039,
+      "loss": 0.461,
+      "step": 31115
+    },
+    {
+      "epoch": 1.579888565952964,
+      "grad_norm": 0.026163707221929008,
+      "learning_rate": 0.00012846909446565297,
+      "loss": 0.4585,
+      "step": 31120
+    },
+    {
+      "epoch": 1.5801424020509955,
+      "grad_norm": 0.022788567101665227,
+      "learning_rate": 0.00012832088070926595,
+      "loss": 0.4514,
+      "step": 31125
+    },
+    {
+      "epoch": 1.580396238149027,
+      "grad_norm": 0.023031836651392787,
+      "learning_rate": 0.00012817273991163648,
+      "loss": 0.4727,
+      "step": 31130
+    },
+    {
+      "epoch": 1.5806500742470586,
+      "grad_norm": 0.022560904451555855,
+      "learning_rate": 0.00012802467210184398,
+      "loss": 0.4728,
+      "step": 31135
+    },
+    {
+      "epoch": 1.5809039103450901,
+      "grad_norm": 0.024809637462144523,
+      "learning_rate": 0.00012787667730895325,
+      "loss": 0.4804,
+      "step": 31140
+    },
+    {
+      "epoch": 1.5811577464431217,
+      "grad_norm": 0.027218959388466347,
+      "learning_rate": 0.00012772875556201507,
+      "loss": 0.4705,
+      "step": 31145
+    },
+    {
+      "epoch": 1.5814115825411532,
+      "grad_norm": 0.022561455510123646,
+      "learning_rate": 0.0001275809068900655,
+      "loss": 0.45,
+      "step": 31150
+    },
+    {
+      "epoch": 1.5816654186391848,
+      "grad_norm": 0.022204515235270998,
+      "learning_rate": 0.00012743313132212685,
+      "loss": 0.4892,
+      "step": 31155
+    },
+    {
+      "epoch": 1.5819192547372163,
+      "grad_norm": 0.022075257403621297,
+      "learning_rate": 0.00012728542888720633,
+      "loss": 0.4829,
+      "step": 31160
+    },
+    {
+      "epoch": 1.5821730908352478,
+      "grad_norm": 0.028454740968265037,
+      "learning_rate": 0.0001271377996142976,
+      "loss": 0.4713,
+      "step": 31165
+    },
+    {
+      "epoch": 1.5824269269332791,
+      "grad_norm": 0.02252943808074146,
+      "learning_rate": 0.00012699024353237921,
+      "loss": 0.4869,
+      "step": 31170
+    },
+    {
+      "epoch": 1.5826807630313107,
+      "grad_norm": 0.02494152110893949,
+      "learning_rate": 0.0001268427606704159,
+      "loss": 0.46,
+      "step": 31175
+    },
+    {
+      "epoch": 1.5829345991293422,
+      "grad_norm": 0.02655046766353095,
+      "learning_rate": 0.00012669535105735763,
+      "loss": 0.4563,
+      "step": 31180
+    },
+    {
+      "epoch": 1.5831884352273737,
+      "grad_norm": 0.023971988910522334,
+      "learning_rate": 0.0001265480147221403,
+      "loss": 0.4794,
+      "step": 31185
+    },
+    {
+      "epoch": 1.583442271325405,
+      "grad_norm": 0.02428090384726735,
+      "learning_rate": 0.00012640075169368536,
+      "loss": 0.4667,
+      "step": 31190
+    },
+    {
+      "epoch": 1.5836961074234366,
+      "grad_norm": 0.02616066326968095,
+      "learning_rate": 0.0001262535620008996,
+      "loss": 0.4711,
+      "step": 31195
+    },
+    {
+      "epoch": 1.5839499435214681,
+      "grad_norm": 0.029430410330922516,
+      "learning_rate": 0.00012610644567267592,
+      "loss": 0.4481,
+      "step": 31200
+    },
+    {
+      "epoch": 1.5842037796194997,
+      "grad_norm": 0.023433715884341892,
+      "learning_rate": 0.0001259594027378922,
+      "loss": 0.4922,
+      "step": 31205
+    },
+    {
+      "epoch": 1.5844576157175312,
+      "grad_norm": 0.022517731868072016,
+      "learning_rate": 0.00012581243322541252,
+      "loss": 0.5017,
+      "step": 31210
+    },
+    {
+      "epoch": 1.5847114518155627,
+      "grad_norm": 0.019566396686039718,
+      "learning_rate": 0.000125665537164086,
+      "loss": 0.4659,
+      "step": 31215
+    },
+    {
+      "epoch": 1.5849652879135943,
+      "grad_norm": 0.02130331252828402,
+      "learning_rate": 0.00012551871458274787,
+      "loss": 0.4998,
+      "step": 31220
+    },
+    {
+      "epoch": 1.5852191240116258,
+      "grad_norm": 0.021977116905423498,
+      "learning_rate": 0.0001253719655102184,
+      "loss": 0.4663,
+      "step": 31225
+    },
+    {
+      "epoch": 1.5854729601096573,
+      "grad_norm": 0.025769773014901368,
+      "learning_rate": 0.0001252252899753039,
+      "loss": 0.4722,
+      "step": 31230
+    },
+    {
+      "epoch": 1.5857267962076889,
+      "grad_norm": 0.028231473801800827,
+      "learning_rate": 0.00012507868800679594,
+      "loss": 0.4624,
+      "step": 31235
+    },
+    {
+      "epoch": 1.5859806323057202,
+      "grad_norm": 0.022650562195746557,
+      "learning_rate": 0.00012493215963347188,
+      "loss": 0.4796,
+      "step": 31240
+    },
+    {
+      "epoch": 1.5862344684037517,
+      "grad_norm": 0.029622948175144603,
+      "learning_rate": 0.00012478570488409413,
+      "loss": 0.4543,
+      "step": 31245
+    },
+    {
+      "epoch": 1.5864883045017832,
+      "grad_norm": 0.021229634657799406,
+      "learning_rate": 0.00012463932378741166,
+      "loss": 0.4534,
+      "step": 31250
+    },
+    {
+      "epoch": 1.5867421405998146,
+      "grad_norm": 0.02759828595048276,
+      "learning_rate": 0.00012449301637215782,
+      "loss": 0.4761,
+      "step": 31255
+    },
+    {
+      "epoch": 1.586995976697846,
+      "grad_norm": 0.022925858589321915,
+      "learning_rate": 0.0001243467826670524,
+      "loss": 0.4405,
+      "step": 31260
+    },
+    {
+      "epoch": 1.5872498127958776,
+      "grad_norm": 0.023105250881910747,
+      "learning_rate": 0.00012420062270079995,
+      "loss": 0.4712,
+      "step": 31265
+    },
+    {
+      "epoch": 1.5875036488939092,
+      "grad_norm": 0.021868594752397733,
+      "learning_rate": 0.00012405453650209136,
+      "loss": 0.4677,
+      "step": 31270
+    },
+    {
+      "epoch": 1.5877574849919407,
+      "grad_norm": 0.03077895247333048,
+      "learning_rate": 0.00012390852409960223,
+      "loss": 0.4539,
+      "step": 31275
+    },
+    {
+      "epoch": 1.5880113210899722,
+      "grad_norm": 0.02329117852026229,
+      "learning_rate": 0.00012376258552199444,
+      "loss": 0.4749,
+      "step": 31280
+    },
+    {
+      "epoch": 1.5882651571880038,
+      "grad_norm": 0.021664079596118983,
+      "learning_rate": 0.00012361672079791469,
+      "loss": 0.4691,
+      "step": 31285
+    },
+    {
+      "epoch": 1.5885189932860353,
+      "grad_norm": 0.025983796005873985,
+      "learning_rate": 0.00012347092995599574,
+      "loss": 0.4481,
+      "step": 31290
+    },
+    {
+      "epoch": 1.5887728293840668,
+      "grad_norm": 0.027325008699016377,
+      "learning_rate": 0.00012332521302485533,
+      "loss": 0.4636,
+      "step": 31295
+    },
+    {
+      "epoch": 1.5890266654820984,
+      "grad_norm": 0.020269269921609292,
+      "learning_rate": 0.00012317957003309726,
+      "loss": 0.425,
+      "step": 31300
+    },
+    {
+      "epoch": 1.5892805015801297,
+      "grad_norm": 0.019317983071545985,
+      "learning_rate": 0.00012303400100931029,
+      "loss": 0.4336,
+      "step": 31305
+    },
+    {
+      "epoch": 1.5895343376781612,
+      "grad_norm": 0.02161940503170865,
+      "learning_rate": 0.00012288850598206902,
+      "loss": 0.4696,
+      "step": 31310
+    },
+    {
+      "epoch": 1.5897881737761927,
+      "grad_norm": 0.026885333685387532,
+      "learning_rate": 0.00012274308497993346,
+      "loss": 0.4598,
+      "step": 31315
+    },
+    {
+      "epoch": 1.590042009874224,
+      "grad_norm": 0.023792733517593017,
+      "learning_rate": 0.0001225977380314488,
+      "loss": 0.4553,
+      "step": 31320
+    },
+    {
+      "epoch": 1.5902958459722556,
+      "grad_norm": 0.02653728675273625,
+      "learning_rate": 0.00012245246516514626,
+      "loss": 0.4675,
+      "step": 31325
+    },
+    {
+      "epoch": 1.5905496820702871,
+      "grad_norm": 0.021186203668840244,
+      "learning_rate": 0.00012230726640954183,
+      "loss": 0.4436,
+      "step": 31330
+    },
+    {
+      "epoch": 1.5908035181683187,
+      "grad_norm": 0.025039321378471157,
+      "learning_rate": 0.0001221621417931375,
+      "loss": 0.4697,
+      "step": 31335
+    },
+    {
+      "epoch": 1.5910573542663502,
+      "grad_norm": 0.024695574034159236,
+      "learning_rate": 0.00012201709134442041,
+      "loss": 0.4479,
+      "step": 31340
+    },
+    {
+      "epoch": 1.5913111903643817,
+      "grad_norm": 0.020718266810303008,
+      "learning_rate": 0.00012187211509186341,
+      "loss": 0.479,
+      "step": 31345
+    },
+    {
+      "epoch": 1.5915650264624133,
+      "grad_norm": 0.02115176797212826,
+      "learning_rate": 0.00012172721306392437,
+      "loss": 0.4714,
+      "step": 31350
+    },
+    {
+      "epoch": 1.5918188625604448,
+      "grad_norm": 0.02176260766701053,
+      "learning_rate": 0.00012158238528904707,
+      "loss": 0.4647,
+      "step": 31355
+    },
+    {
+      "epoch": 1.5920726986584763,
+      "grad_norm": 0.02570121404643556,
+      "learning_rate": 0.00012143763179566026,
+      "loss": 0.456,
+      "step": 31360
+    },
+    {
+      "epoch": 1.5923265347565079,
+      "grad_norm": 0.02737621105346301,
+      "learning_rate": 0.00012129295261217843,
+      "loss": 0.483,
+      "step": 31365
+    },
+    {
+      "epoch": 1.5925803708545392,
+      "grad_norm": 0.026854830654727595,
+      "learning_rate": 0.0001211483477670014,
+      "loss": 0.457,
+      "step": 31370
+    },
+    {
+      "epoch": 1.5928342069525707,
+      "grad_norm": 0.02062145184869095,
+      "learning_rate": 0.0001210038172885145,
+      "loss": 0.4471,
+      "step": 31375
+    },
+    {
+      "epoch": 1.5930880430506023,
+      "grad_norm": 0.02532668196949466,
+      "learning_rate": 0.00012085936120508811,
+      "loss": 0.4475,
+      "step": 31380
+    },
+    {
+      "epoch": 1.5933418791486336,
+      "grad_norm": 0.020702904030837858,
+      "learning_rate": 0.00012071497954507843,
+      "loss": 0.4668,
+      "step": 31385
+    },
+    {
+      "epoch": 1.593595715246665,
+      "grad_norm": 0.031141316623061212,
+      "learning_rate": 0.00012057067233682667,
+      "loss": 0.4714,
+      "step": 31390
+    },
+    {
+      "epoch": 1.5938495513446966,
+      "grad_norm": 0.028963886369253525,
+      "learning_rate": 0.00012042643960865985,
+      "loss": 0.4677,
+      "step": 31395
+    },
+    {
+      "epoch": 1.5941033874427282,
+      "grad_norm": 0.02100153262498024,
+      "learning_rate": 0.00012028228138888986,
+      "loss": 0.4844,
+      "step": 31400
+    },
+    {
+      "epoch": 1.5943572235407597,
+      "grad_norm": 0.021293170094263958,
+      "learning_rate": 0.00012013819770581458,
+      "loss": 0.4344,
+      "step": 31405
+    },
+    {
+      "epoch": 1.5946110596387912,
+      "grad_norm": 0.022079676785804837,
+      "learning_rate": 0.00011999418858771649,
+      "loss": 0.472,
+      "step": 31410
+    },
+    {
+      "epoch": 1.5948648957368228,
+      "grad_norm": 0.020399139507724554,
+      "learning_rate": 0.00011985025406286432,
+      "loss": 0.4412,
+      "step": 31415
+    },
+    {
+      "epoch": 1.5951187318348543,
+      "grad_norm": 0.020737921370516353,
+      "learning_rate": 0.00011970639415951129,
+      "loss": 0.4782,
+      "step": 31420
+    },
+    {
+      "epoch": 1.5953725679328858,
+      "grad_norm": 0.021206624778793808,
+      "learning_rate": 0.00011956260890589655,
+      "loss": 0.459,
+      "step": 31425
+    },
+    {
+      "epoch": 1.5956264040309174,
+      "grad_norm": 0.0245333591767397,
+      "learning_rate": 0.00011941889833024461,
+      "loss": 0.4675,
+      "step": 31430
+    },
+    {
+      "epoch": 1.5958802401289487,
+      "grad_norm": 0.025273714859497986,
+      "learning_rate": 0.0001192752624607648,
+      "loss": 0.4753,
+      "step": 31435
+    },
+    {
+      "epoch": 1.5961340762269802,
+      "grad_norm": 0.02473632336996924,
+      "learning_rate": 0.00011913170132565248,
+      "loss": 0.4207,
+      "step": 31440
+    },
+    {
+      "epoch": 1.5963879123250118,
+      "grad_norm": 0.024196910552343232,
+      "learning_rate": 0.00011898821495308764,
+      "loss": 0.4838,
+      "step": 31445
+    },
+    {
+      "epoch": 1.5966417484230433,
+      "grad_norm": 0.024546317457647595,
+      "learning_rate": 0.00011884480337123621,
+      "loss": 0.4766,
+      "step": 31450
+    },
+    {
+      "epoch": 1.5968955845210746,
+      "grad_norm": 0.021580443853018175,
+      "learning_rate": 0.00011870146660824899,
+      "loss": 0.4618,
+      "step": 31455
+    },
+    {
+      "epoch": 1.5971494206191061,
+      "grad_norm": 0.021030168327500064,
+      "learning_rate": 0.00011855820469226242,
+      "loss": 0.4697,
+      "step": 31460
+    },
+    {
+      "epoch": 1.5974032567171377,
+      "grad_norm": 0.02542968053225913,
+      "learning_rate": 0.00011841501765139795,
+      "loss": 0.4583,
+      "step": 31465
+    },
+    {
+      "epoch": 1.5976570928151692,
+      "grad_norm": 0.0245457635447329,
+      "learning_rate": 0.00011827190551376265,
+      "loss": 0.469,
+      "step": 31470
+    },
+    {
+      "epoch": 1.5979109289132007,
+      "grad_norm": 0.022086900914002925,
+      "learning_rate": 0.00011812886830744846,
+      "loss": 0.404,
+      "step": 31475
+    },
+    {
+      "epoch": 1.5981647650112323,
+      "grad_norm": 0.024647282768808704,
+      "learning_rate": 0.00011798590606053322,
+      "loss": 0.4778,
+      "step": 31480
+    },
+    {
+      "epoch": 1.5984186011092638,
+      "grad_norm": 0.02520833814662268,
+      "learning_rate": 0.00011784301880107917,
+      "loss": 0.4682,
+      "step": 31485
+    },
+    {
+      "epoch": 1.5986724372072953,
+      "grad_norm": 0.030999706879264646,
+      "learning_rate": 0.00011770020655713509,
+      "loss": 0.4543,
+      "step": 31490
+    },
+    {
+      "epoch": 1.5989262733053269,
+      "grad_norm": 0.028404831004490545,
+      "learning_rate": 0.00011755746935673372,
+      "loss": 0.4685,
+      "step": 31495
+    },
+    {
+      "epoch": 1.5991801094033582,
+      "grad_norm": 0.02067687233055602,
+      "learning_rate": 0.00011741480722789405,
+      "loss": 0.4439,
+      "step": 31500
+    },
+    {
+      "epoch": 1.5994339455013897,
+      "grad_norm": 0.03005348926340442,
+      "learning_rate": 0.00011727222019861966,
+      "loss": 0.4475,
+      "step": 31505
+    },
+    {
+      "epoch": 1.5996877815994213,
+      "grad_norm": 0.025221995139169794,
+      "learning_rate": 0.0001171297082968999,
+      "loss": 0.4601,
+      "step": 31510
+    },
+    {
+      "epoch": 1.5999416176974528,
+      "grad_norm": 0.024617531922327757,
+      "learning_rate": 0.00011698727155070888,
+      "loss": 0.485,
+      "step": 31515
+    },
+    {
+      "epoch": 1.6001954537954841,
+      "grad_norm": 0.02522929219903767,
+      "learning_rate": 0.0001168449099880065,
+      "loss": 0.4684,
+      "step": 31520
+    },
+    {
+      "epoch": 1.6004492898935156,
+      "grad_norm": 0.02188538172758521,
+      "learning_rate": 0.0001167026236367374,
+      "loss": 0.4667,
+      "step": 31525
+    },
+    {
+      "epoch": 1.6007031259915472,
+      "grad_norm": 0.03015421394199565,
+      "learning_rate": 0.00011656041252483185,
+      "loss": 0.4609,
+      "step": 31530
+    },
+    {
+      "epoch": 1.6009569620895787,
+      "grad_norm": 0.02771728039572498,
+      "learning_rate": 0.00011641827668020504,
+      "loss": 0.4537,
+      "step": 31535
+    },
+    {
+      "epoch": 1.6012107981876103,
+      "grad_norm": 0.024448943269020997,
+      "learning_rate": 0.00011627621613075772,
+      "loss": 0.4735,
+      "step": 31540
+    },
+    {
+      "epoch": 1.6014646342856418,
+      "grad_norm": 0.024047651615317647,
+      "learning_rate": 0.00011613423090437536,
+      "loss": 0.4797,
+      "step": 31545
+    },
+    {
+      "epoch": 1.6017184703836733,
+      "grad_norm": 0.022908547869978178,
+      "learning_rate": 0.0001159923210289292,
+      "loss": 0.4419,
+      "step": 31550
+    },
+    {
+      "epoch": 1.6019723064817049,
+      "grad_norm": 0.027921713893488884,
+      "learning_rate": 0.00011585048653227548,
+      "loss": 0.4529,
+      "step": 31555
+    },
+    {
+      "epoch": 1.6022261425797364,
+      "grad_norm": 0.022455295711872506,
+      "learning_rate": 0.00011570872744225541,
+      "loss": 0.4689,
+      "step": 31560
+    },
+    {
+      "epoch": 1.602479978677768,
+      "grad_norm": 0.029028863638509785,
+      "learning_rate": 0.0001155670437866958,
+      "loss": 0.4693,
+      "step": 31565
+    },
+    {
+      "epoch": 1.6027338147757992,
+      "grad_norm": 0.026146904467255937,
+      "learning_rate": 0.00011542543559340817,
+      "loss": 0.4802,
+      "step": 31570
+    },
+    {
+      "epoch": 1.6029876508738308,
+      "grad_norm": 0.021361160108741662,
+      "learning_rate": 0.0001152839028901898,
+      "loss": 0.4735,
+      "step": 31575
+    },
+    {
+      "epoch": 1.6032414869718623,
+      "grad_norm": 0.022332608927233596,
+      "learning_rate": 0.00011514244570482263,
+      "loss": 0.4405,
+      "step": 31580
+    },
+    {
+      "epoch": 1.6034953230698936,
+      "grad_norm": 0.023861647792640446,
+      "learning_rate": 0.00011500106406507416,
+      "loss": 0.4384,
+      "step": 31585
+    },
+    {
+      "epoch": 1.6037491591679252,
+      "grad_norm": 0.03455619002140877,
+      "learning_rate": 0.00011485975799869675,
+      "loss": 0.4781,
+      "step": 31590
+    },
+    {
+      "epoch": 1.6040029952659567,
+      "grad_norm": 0.032908225955371004,
+      "learning_rate": 0.00011471852753342826,
+      "loss": 0.4617,
+      "step": 31595
+    },
+    {
+      "epoch": 1.6042568313639882,
+      "grad_norm": 0.0248333675995091,
+      "learning_rate": 0.00011457737269699125,
+      "loss": 0.4736,
+      "step": 31600
+    },
+    {
+      "epoch": 1.6045106674620198,
+      "grad_norm": 0.023654932219431005,
+      "learning_rate": 0.00011443629351709394,
+      "loss": 0.4893,
+      "step": 31605
+    },
+    {
+      "epoch": 1.6047645035600513,
+      "grad_norm": 0.019152490721215262,
+      "learning_rate": 0.00011429529002142941,
+      "loss": 0.4619,
+      "step": 31610
+    },
+    {
+      "epoch": 1.6050183396580828,
+      "grad_norm": 0.024857396949531627,
+      "learning_rate": 0.00011415436223767606,
+      "loss": 0.4452,
+      "step": 31615
+    },
+    {
+      "epoch": 1.6052721757561144,
+      "grad_norm": 0.02790240391915016,
+      "learning_rate": 0.00011401351019349704,
+      "loss": 0.4561,
+      "step": 31620
+    },
+    {
+      "epoch": 1.605526011854146,
+      "grad_norm": 0.024084282543615547,
+      "learning_rate": 0.00011387273391654118,
+      "loss": 0.4774,
+      "step": 31625
+    },
+    {
+      "epoch": 1.6057798479521774,
+      "grad_norm": 0.02920146578824623,
+      "learning_rate": 0.00011373203343444194,
+      "loss": 0.4789,
+      "step": 31630
+    },
+    {
+      "epoch": 1.6060336840502087,
+      "grad_norm": 0.029127523345097956,
+      "learning_rate": 0.00011359140877481833,
+      "loss": 0.464,
+      "step": 31635
+    },
+    {
+      "epoch": 1.6062875201482403,
+      "grad_norm": 0.022090138726677838,
+      "learning_rate": 0.00011345085996527405,
+      "loss": 0.4806,
+      "step": 31640
+    },
+    {
+      "epoch": 1.6065413562462718,
+      "grad_norm": 0.026141716192201573,
+      "learning_rate": 0.00011331038703339836,
+      "loss": 0.4589,
+      "step": 31645
+    },
+    {
+      "epoch": 1.6067951923443031,
+      "grad_norm": 0.02162959028748172,
+      "learning_rate": 0.00011316999000676514,
+      "loss": 0.4543,
+      "step": 31650
+    },
+    {
+      "epoch": 1.6070490284423347,
+      "grad_norm": 0.023063059924878233,
+      "learning_rate": 0.00011302966891293392,
+      "loss": 0.47,
+      "step": 31655
+    },
+    {
+      "epoch": 1.6073028645403662,
+      "grad_norm": 0.027292009363410075,
+      "learning_rate": 0.00011288942377944872,
+      "loss": 0.4909,
+      "step": 31660
+    },
+    {
+      "epoch": 1.6075567006383977,
+      "grad_norm": 0.04681074491676968,
+      "learning_rate": 0.00011274925463383912,
+      "loss": 0.4656,
+      "step": 31665
+    },
+    {
+      "epoch": 1.6078105367364293,
+      "grad_norm": 0.01979398122113408,
+      "learning_rate": 0.00011260916150361977,
+      "loss": 0.4488,
+      "step": 31670
+    },
+    {
+      "epoch": 1.6080643728344608,
+      "grad_norm": 0.020250869022994698,
+      "learning_rate": 0.00011246914441628992,
+      "loss": 0.4782,
+      "step": 31675
+    },
+    {
+      "epoch": 1.6083182089324923,
+      "grad_norm": 0.019872795940748027,
+      "learning_rate": 0.00011232920339933461,
+      "loss": 0.4777,
+      "step": 31680
+    },
+    {
+      "epoch": 1.6085720450305239,
+      "grad_norm": 0.02316643741156545,
+      "learning_rate": 0.00011218933848022317,
+      "loss": 0.493,
+      "step": 31685
+    },
+    {
+      "epoch": 1.6088258811285554,
+      "grad_norm": 0.02234673533736629,
+      "learning_rate": 0.00011204954968641074,
+      "loss": 0.4629,
+      "step": 31690
+    },
+    {
+      "epoch": 1.609079717226587,
+      "grad_norm": 0.022362390983495437,
+      "learning_rate": 0.00011190983704533685,
+      "loss": 0.4572,
+      "step": 31695
+    },
+    {
+      "epoch": 1.6093335533246182,
+      "grad_norm": 0.021690554415178736,
+      "learning_rate": 0.00011177020058442672,
+      "loss": 0.4685,
+      "step": 31700
+    },
+    {
+      "epoch": 1.6095873894226498,
+      "grad_norm": 0.02776519862681794,
+      "learning_rate": 0.00011163064033108994,
+      "loss": 0.4662,
+      "step": 31705
+    },
+    {
+      "epoch": 1.6098412255206813,
+      "grad_norm": 0.022607748514862483,
+      "learning_rate": 0.00011149115631272183,
+      "loss": 0.4853,
+      "step": 31710
+    },
+    {
+      "epoch": 1.6100950616187126,
+      "grad_norm": 0.041421241284070646,
+      "learning_rate": 0.00011135174855670205,
+      "loss": 0.458,
+      "step": 31715
+    },
+    {
+      "epoch": 1.6103488977167442,
+      "grad_norm": 0.022255670798694553,
+      "learning_rate": 0.00011121241709039604,
+      "loss": 0.4626,
+      "step": 31720
+    },
+    {
+      "epoch": 1.6106027338147757,
+      "grad_norm": 0.02393315473991221,
+      "learning_rate": 0.00011107316194115352,
+      "loss": 0.482,
+      "step": 31725
+    },
+    {
+      "epoch": 1.6108565699128072,
+      "grad_norm": 0.022345117574356848,
+      "learning_rate": 0.00011093398313630975,
+      "loss": 0.4787,
+      "step": 31730
+    },
+    {
+      "epoch": 1.6111104060108388,
+      "grad_norm": 0.021278319102440692,
+      "learning_rate": 0.00011079488070318477,
+      "loss": 0.4639,
+      "step": 31735
+    },
+    {
+      "epoch": 1.6113642421088703,
+      "grad_norm": 0.02130312138742558,
+      "learning_rate": 0.00011065585466908395,
+      "loss": 0.4675,
+      "step": 31740
+    },
+    {
+      "epoch": 1.6116180782069018,
+      "grad_norm": 0.020984294776622912,
+      "learning_rate": 0.00011051690506129702,
+      "loss": 0.4446,
+      "step": 31745
+    },
+    {
+      "epoch": 1.6118719143049334,
+      "grad_norm": 0.023292694738733775,
+      "learning_rate": 0.00011037803190709945,
+      "loss": 0.4559,
+      "step": 31750
+    },
+    {
+      "epoch": 1.612125750402965,
+      "grad_norm": 0.021942067180995492,
+      "learning_rate": 0.00011023923523375102,
+      "loss": 0.4573,
+      "step": 31755
+    },
+    {
+      "epoch": 1.6123795865009964,
+      "grad_norm": 0.02815465580690162,
+      "learning_rate": 0.00011010051506849711,
+      "loss": 0.4792,
+      "step": 31760
+    },
+    {
+      "epoch": 1.6126334225990278,
+      "grad_norm": 0.021597418410303274,
+      "learning_rate": 0.0001099618714385675,
+      "loss": 0.4643,
+      "step": 31765
+    },
+    {
+      "epoch": 1.6128872586970593,
+      "grad_norm": 0.02364408717712364,
+      "learning_rate": 0.0001098233043711776,
+      "loss": 0.4706,
+      "step": 31770
+    },
+    {
+      "epoch": 1.6131410947950908,
+      "grad_norm": 0.027433611435712636,
+      "learning_rate": 0.00010968481389352708,
+      "loss": 0.4131,
+      "step": 31775
+    },
+    {
+      "epoch": 1.6133949308931224,
+      "grad_norm": 0.02152935633696659,
+      "learning_rate": 0.00010954640003280125,
+      "loss": 0.44,
+      "step": 31780
+    },
+    {
+      "epoch": 1.6136487669911537,
+      "grad_norm": 0.026076111440461344,
+      "learning_rate": 0.00010940806281616977,
+      "loss": 0.4446,
+      "step": 31785
+    },
+    {
+      "epoch": 1.6139026030891852,
+      "grad_norm": 0.023558985051178333,
+      "learning_rate": 0.00010926980227078765,
+      "loss": 0.4795,
+      "step": 31790
+    },
+    {
+      "epoch": 1.6141564391872167,
+      "grad_norm": 0.025471211297723347,
+      "learning_rate": 0.00010913161842379493,
+      "loss": 0.442,
+      "step": 31795
+    },
+    {
+      "epoch": 1.6144102752852483,
+      "grad_norm": 0.020061384357153312,
+      "learning_rate": 0.00010899351130231611,
+      "loss": 0.4324,
+      "step": 31800
+    },
+    {
+      "epoch": 1.6146641113832798,
+      "grad_norm": 0.032744538736248614,
+      "learning_rate": 0.00010885548093346126,
+      "loss": 0.468,
+      "step": 31805
+    },
+    {
+      "epoch": 1.6149179474813113,
+      "grad_norm": 0.02071627920287155,
+      "learning_rate": 0.00010871752734432466,
+      "loss": 0.4439,
+      "step": 31810
+    },
+    {
+      "epoch": 1.6151717835793429,
+      "grad_norm": 0.031839601183315996,
+      "learning_rate": 0.00010857965056198633,
+      "loss": 0.4447,
+      "step": 31815
+    },
+    {
+      "epoch": 1.6154256196773744,
+      "grad_norm": 0.024140957539309883,
+      "learning_rate": 0.00010844185061351036,
+      "loss": 0.4768,
+      "step": 31820
+    },
+    {
+      "epoch": 1.615679455775406,
+      "grad_norm": 0.020597217507985967,
+      "learning_rate": 0.00010830412752594659,
+      "loss": 0.4638,
+      "step": 31825
+    },
+    {
+      "epoch": 1.6159332918734375,
+      "grad_norm": 0.02099597835732562,
+      "learning_rate": 0.00010816648132632912,
+      "loss": 0.4407,
+      "step": 31830
+    },
+    {
+      "epoch": 1.6161871279714688,
+      "grad_norm": 0.022374131730080137,
+      "learning_rate": 0.00010802891204167736,
+      "loss": 0.4437,
+      "step": 31835
+    },
+    {
+      "epoch": 1.6164409640695003,
+      "grad_norm": 0.022866381042039967,
+      "learning_rate": 0.0001078914196989953,
+      "loss": 0.4499,
+      "step": 31840
+    },
+    {
+      "epoch": 1.6166948001675319,
+      "grad_norm": 0.023954437468624835,
+      "learning_rate": 0.00010775400432527228,
+      "loss": 0.462,
+      "step": 31845
+    },
+    {
+      "epoch": 1.6169486362655632,
+      "grad_norm": 0.023454406216757356,
+      "learning_rate": 0.00010761666594748176,
+      "loss": 0.4746,
+      "step": 31850
+    },
+    {
+      "epoch": 1.6172024723635947,
+      "grad_norm": 0.023827431315221397,
+      "learning_rate": 0.00010747940459258321,
+      "loss": 0.4376,
+      "step": 31855
+    },
+    {
+      "epoch": 1.6174563084616262,
+      "grad_norm": 0.025096747648666724,
+      "learning_rate": 0.00010734222028751989,
+      "loss": 0.4879,
+      "step": 31860
+    },
+    {
+      "epoch": 1.6177101445596578,
+      "grad_norm": 0.021230019197699266,
+      "learning_rate": 0.00010720511305922065,
+      "loss": 0.4286,
+      "step": 31865
+    },
+    {
+      "epoch": 1.6179639806576893,
+      "grad_norm": 0.024734269364125625,
+      "learning_rate": 0.00010706808293459875,
+      "loss": 0.4792,
+      "step": 31870
+    },
+    {
+      "epoch": 1.6182178167557209,
+      "grad_norm": 0.02274730432603367,
+      "learning_rate": 0.00010693112994055277,
+      "loss": 0.4631,
+      "step": 31875
+    },
+    {
+      "epoch": 1.6184716528537524,
+      "grad_norm": 0.023273183427376207,
+      "learning_rate": 0.00010679425410396559,
+      "loss": 0.4611,
+      "step": 31880
+    },
+    {
+      "epoch": 1.618725488951784,
+      "grad_norm": 0.020775817214814158,
+      "learning_rate": 0.00010665745545170557,
+      "loss": 0.4473,
+      "step": 31885
+    },
+    {
+      "epoch": 1.6189793250498155,
+      "grad_norm": 0.02042396438999102,
+      "learning_rate": 0.00010652073401062529,
+      "loss": 0.4245,
+      "step": 31890
+    },
+    {
+      "epoch": 1.619233161147847,
+      "grad_norm": 0.02282973946779475,
+      "learning_rate": 0.00010638408980756281,
+      "loss": 0.4685,
+      "step": 31895
+    },
+    {
+      "epoch": 1.6194869972458783,
+      "grad_norm": 0.020201540964168187,
+      "learning_rate": 0.00010624752286934037,
+      "loss": 0.4285,
+      "step": 31900
+    },
+    {
+      "epoch": 1.6197408333439098,
+      "grad_norm": 0.02332813243361064,
+      "learning_rate": 0.00010611103322276571,
+      "loss": 0.47,
+      "step": 31905
+    },
+    {
+      "epoch": 1.6199946694419414,
+      "grad_norm": 0.0210065909715921,
+      "learning_rate": 0.00010597462089463078,
+      "loss": 0.4695,
+      "step": 31910
+    },
+    {
+      "epoch": 1.6202485055399727,
+      "grad_norm": 0.02246075990254701,
+      "learning_rate": 0.00010583828591171273,
+      "loss": 0.4382,
+      "step": 31915
+    },
+    {
+      "epoch": 1.6205023416380042,
+      "grad_norm": 0.0283987877904691,
+      "learning_rate": 0.00010570202830077363,
+      "loss": 0.4513,
+      "step": 31920
+    },
+    {
+      "epoch": 1.6207561777360358,
+      "grad_norm": 0.023181166904655805,
+      "learning_rate": 0.0001055658480885599,
+      "loss": 0.454,
+      "step": 31925
+    },
+    {
+      "epoch": 1.6210100138340673,
+      "grad_norm": 0.02263188362017243,
+      "learning_rate": 0.00010542974530180327,
+      "loss": 0.4693,
+      "step": 31930
+    },
+    {
+      "epoch": 1.6212638499320988,
+      "grad_norm": 0.020696164165394288,
+      "learning_rate": 0.00010529371996721976,
+      "loss": 0.4531,
+      "step": 31935
+    },
+    {
+      "epoch": 1.6215176860301304,
+      "grad_norm": 0.022616240854955654,
+      "learning_rate": 0.00010515777211151079,
+      "loss": 0.4457,
+      "step": 31940
+    },
+    {
+      "epoch": 1.621771522128162,
+      "grad_norm": 0.023467651643862778,
+      "learning_rate": 0.00010502190176136195,
+      "loss": 0.4472,
+      "step": 31945
+    },
+    {
+      "epoch": 1.6220253582261934,
+      "grad_norm": 0.024225187265112066,
+      "learning_rate": 0.00010488610894344414,
+      "loss": 0.4586,
+      "step": 31950
+    },
+    {
+      "epoch": 1.622279194324225,
+      "grad_norm": 0.02003294507179616,
+      "learning_rate": 0.00010475039368441258,
+      "loss": 0.4476,
+      "step": 31955
+    },
+    {
+      "epoch": 1.6225330304222565,
+      "grad_norm": 0.02746040189534053,
+      "learning_rate": 0.0001046147560109078,
+      "loss": 0.4355,
+      "step": 31960
+    },
+    {
+      "epoch": 1.6227868665202878,
+      "grad_norm": 0.023121259899165414,
+      "learning_rate": 0.00010447919594955452,
+      "loss": 0.4772,
+      "step": 31965
+    },
+    {
+      "epoch": 1.6230407026183193,
+      "grad_norm": 0.021015795475823218,
+      "learning_rate": 0.00010434371352696259,
+      "loss": 0.4599,
+      "step": 31970
+    },
+    {
+      "epoch": 1.6232945387163509,
+      "grad_norm": 0.020515866606887798,
+      "learning_rate": 0.00010420830876972653,
+      "loss": 0.4425,
+      "step": 31975
+    },
+    {
+      "epoch": 1.6235483748143822,
+      "grad_norm": 0.02337407247961481,
+      "learning_rate": 0.0001040729817044258,
+      "loss": 0.4713,
+      "step": 31980
+    },
+    {
+      "epoch": 1.6238022109124137,
+      "grad_norm": 0.02089505650167657,
+      "learning_rate": 0.00010393773235762416,
+      "loss": 0.4621,
+      "step": 31985
+    },
+    {
+      "epoch": 1.6240560470104453,
+      "grad_norm": 0.023288518470522646,
+      "learning_rate": 0.00010380256075587063,
+      "loss": 0.4926,
+      "step": 31990
+    },
+    {
+      "epoch": 1.6243098831084768,
+      "grad_norm": 0.020663246760806137,
+      "learning_rate": 0.00010366746692569845,
+      "loss": 0.4301,
+      "step": 31995
+    },
+    {
+      "epoch": 1.6245637192065083,
+      "grad_norm": 0.021804964895316997,
+      "learning_rate": 0.00010353245089362612,
+      "loss": 0.4608,
+      "step": 32000
+    },
+    {
+      "epoch": 1.6248175553045399,
+      "grad_norm": 0.02226540859695977,
+      "learning_rate": 0.00010339751268615639,
+      "loss": 0.4374,
+      "step": 32005
+    },
+    {
+      "epoch": 1.6250713914025714,
+      "grad_norm": 0.02024366952881605,
+      "learning_rate": 0.00010326265232977717,
+      "loss": 0.4543,
+      "step": 32010
+    },
+    {
+      "epoch": 1.625325227500603,
+      "grad_norm": 0.022002203748430427,
+      "learning_rate": 0.00010312786985096067,
+      "loss": 0.4619,
+      "step": 32015
+    },
+    {
+      "epoch": 1.6255790635986345,
+      "grad_norm": 0.020301191207565898,
+      "learning_rate": 0.00010299316527616426,
+      "loss": 0.4779,
+      "step": 32020
+    },
+    {
+      "epoch": 1.625832899696666,
+      "grad_norm": 0.021523853309415912,
+      "learning_rate": 0.00010285853863182948,
+      "loss": 0.4441,
+      "step": 32025
+    },
+    {
+      "epoch": 1.6260867357946973,
+      "grad_norm": 0.020392254627019237,
+      "learning_rate": 0.00010272398994438303,
+      "loss": 0.4482,
+      "step": 32030
+    },
+    {
+      "epoch": 1.6263405718927288,
+      "grad_norm": 0.028102974962649765,
+      "learning_rate": 0.00010258951924023625,
+      "loss": 0.4494,
+      "step": 32035
+    },
+    {
+      "epoch": 1.6265944079907604,
+      "grad_norm": 0.028653290930459173,
+      "learning_rate": 0.00010245512654578487,
+      "loss": 0.6634,
+      "step": 32040
+    },
+    {
+      "epoch": 1.626848244088792,
+      "grad_norm": 0.03496966873027593,
+      "learning_rate": 0.00010232081188740971,
+      "loss": 0.4563,
+      "step": 32045
+    },
+    {
+      "epoch": 1.6271020801868232,
+      "grad_norm": 0.0444008021284147,
+      "learning_rate": 0.0001021865752914758,
+      "loss": 0.4782,
+      "step": 32050
+    },
+    {
+      "epoch": 1.6273559162848548,
+      "grad_norm": 0.028458235576396235,
+      "learning_rate": 0.00010205241678433341,
+      "loss": 0.4633,
+      "step": 32055
+    },
+    {
+      "epoch": 1.6276097523828863,
+      "grad_norm": 0.033779463143654556,
+      "learning_rate": 0.00010191833639231695,
+      "loss": 0.473,
+      "step": 32060
+    },
+    {
+      "epoch": 1.6278635884809178,
+      "grad_norm": 0.02289380406880714,
+      "learning_rate": 0.00010178433414174593,
+      "loss": 0.4981,
+      "step": 32065
+    },
+    {
+      "epoch": 1.6281174245789494,
+      "grad_norm": 0.023008365122818897,
+      "learning_rate": 0.00010165041005892412,
+      "loss": 0.4632,
+      "step": 32070
+    },
+    {
+      "epoch": 1.628371260676981,
+      "grad_norm": 0.024659825812692805,
+      "learning_rate": 0.00010151656417014033,
+      "loss": 0.4615,
+      "step": 32075
+    },
+    {
+      "epoch": 1.6286250967750124,
+      "grad_norm": 0.023409476356552144,
+      "learning_rate": 0.00010138279650166765,
+      "loss": 0.5097,
+      "step": 32080
+    },
+    {
+      "epoch": 1.628878932873044,
+      "grad_norm": 0.021110774308924826,
+      "learning_rate": 0.00010124910707976426,
+      "loss": 0.4515,
+      "step": 32085
+    },
+    {
+      "epoch": 1.6291327689710755,
+      "grad_norm": 0.02965948111333057,
+      "learning_rate": 0.00010111549593067226,
+      "loss": 0.4821,
+      "step": 32090
+    },
+    {
+      "epoch": 1.629386605069107,
+      "grad_norm": 0.02391127207510598,
+      "learning_rate": 0.00010098196308061953,
+      "loss": 0.4448,
+      "step": 32095
+    },
+    {
+      "epoch": 1.6296404411671384,
+      "grad_norm": 0.02681195362674087,
+      "learning_rate": 0.00010084850855581734,
+      "loss": 0.433,
+      "step": 32100
+    },
+    {
+      "epoch": 1.6298942772651699,
+      "grad_norm": 0.021012222244902096,
+      "learning_rate": 0.00010071513238246255,
+      "loss": 0.4676,
+      "step": 32105
+    },
+    {
+      "epoch": 1.6301481133632014,
+      "grad_norm": 0.02751151777371785,
+      "learning_rate": 0.00010058183458673587,
+      "loss": 0.4638,
+      "step": 32110
+    },
+    {
+      "epoch": 1.6304019494612327,
+      "grad_norm": 0.02956936482420226,
+      "learning_rate": 0.0001004486151948033,
+      "loss": 0.473,
+      "step": 32115
+    },
+    {
+      "epoch": 1.6306557855592643,
+      "grad_norm": 0.034062325657136044,
+      "learning_rate": 0.00010031547423281501,
+      "loss": 0.4819,
+      "step": 32120
+    },
+    {
+      "epoch": 1.6309096216572958,
+      "grad_norm": 0.024178056660624028,
+      "learning_rate": 0.00010018241172690578,
+      "loss": 0.4669,
+      "step": 32125
+    },
+    {
+      "epoch": 1.6311634577553273,
+      "grad_norm": 0.021402418062271347,
+      "learning_rate": 0.00010004942770319536,
+      "loss": 0.4613,
+      "step": 32130
+    },
+    {
+      "epoch": 1.6314172938533589,
+      "grad_norm": 0.02076067212479062,
+      "learning_rate": 9.991652218778762e-05,
+      "loss": 0.4463,
+      "step": 32135
+    },
+    {
+      "epoch": 1.6316711299513904,
+      "grad_norm": 0.028173969272268517,
+      "learning_rate": 9.97836952067715e-05,
+      "loss": 0.4744,
+      "step": 32140
+    },
+    {
+      "epoch": 1.631924966049422,
+      "grad_norm": 0.022830320689664698,
+      "learning_rate": 9.965094678621994e-05,
+      "loss": 0.4921,
+      "step": 32145
+    },
+    {
+      "epoch": 1.6321788021474535,
+      "grad_norm": 0.022425906549558932,
+      "learning_rate": 9.951827695219107e-05,
+      "loss": 0.4392,
+      "step": 32150
+    },
+    {
+      "epoch": 1.632432638245485,
+      "grad_norm": 0.029061636424618946,
+      "learning_rate": 9.938568573072715e-05,
+      "loss": 0.4671,
+      "step": 32155
+    },
+    {
+      "epoch": 1.6326864743435165,
+      "grad_norm": 0.028166002445137558,
+      "learning_rate": 9.925317314785548e-05,
+      "loss": 0.4338,
+      "step": 32160
+    },
+    {
+      "epoch": 1.6329403104415479,
+      "grad_norm": 0.02300375901450975,
+      "learning_rate": 9.91207392295872e-05,
+      "loss": 0.4764,
+      "step": 32165
+    },
+    {
+      "epoch": 1.6331941465395794,
+      "grad_norm": 0.022728515736743903,
+      "learning_rate": 9.898838400191879e-05,
+      "loss": 0.4725,
+      "step": 32170
+    },
+    {
+      "epoch": 1.633447982637611,
+      "grad_norm": 0.02413443910577726,
+      "learning_rate": 9.885610749083063e-05,
+      "loss": 0.468,
+      "step": 32175
+    },
+    {
+      "epoch": 1.6337018187356422,
+      "grad_norm": 0.02197648542631456,
+      "learning_rate": 9.872390972228823e-05,
+      "loss": 0.4635,
+      "step": 32180
+    },
+    {
+      "epoch": 1.6339556548336738,
+      "grad_norm": 0.02750430405496458,
+      "learning_rate": 9.8591790722241e-05,
+      "loss": 0.4992,
+      "step": 32185
+    },
+    {
+      "epoch": 1.6342094909317053,
+      "grad_norm": 0.022925967015699954,
+      "learning_rate": 9.84597505166236e-05,
+      "loss": 0.4749,
+      "step": 32190
+    },
+    {
+      "epoch": 1.6344633270297368,
+      "grad_norm": 0.023864232586490966,
+      "learning_rate": 9.832778913135454e-05,
+      "loss": 0.4432,
+      "step": 32195
+    },
+    {
+      "epoch": 1.6347171631277684,
+      "grad_norm": 0.023429582390276676,
+      "learning_rate": 9.819590659233746e-05,
+      "loss": 0.4689,
+      "step": 32200
+    },
+    {
+      "epoch": 1.6349709992258,
+      "grad_norm": 0.023005759239922634,
+      "learning_rate": 9.806410292546003e-05,
+      "loss": 0.4647,
+      "step": 32205
+    },
+    {
+      "epoch": 1.6352248353238314,
+      "grad_norm": 0.033322292665807295,
+      "learning_rate": 9.793237815659473e-05,
+      "loss": 0.4787,
+      "step": 32210
+    },
+    {
+      "epoch": 1.635478671421863,
+      "grad_norm": 0.02676499418010649,
+      "learning_rate": 9.780073231159864e-05,
+      "loss": 0.4879,
+      "step": 32215
+    },
+    {
+      "epoch": 1.6357325075198945,
+      "grad_norm": 0.023068752325228373,
+      "learning_rate": 9.766916541631288e-05,
+      "loss": 0.4579,
+      "step": 32220
+    },
+    {
+      "epoch": 1.635986343617926,
+      "grad_norm": 0.02443617843704049,
+      "learning_rate": 9.753767749656361e-05,
+      "loss": 0.4236,
+      "step": 32225
+    },
+    {
+      "epoch": 1.6362401797159574,
+      "grad_norm": 0.022098127052591947,
+      "learning_rate": 9.740626857816109e-05,
+      "loss": 0.4596,
+      "step": 32230
+    },
+    {
+      "epoch": 1.636494015813989,
+      "grad_norm": 0.025000807984976732,
+      "learning_rate": 9.727493868690046e-05,
+      "loss": 0.4547,
+      "step": 32235
+    },
+    {
+      "epoch": 1.6367478519120204,
+      "grad_norm": 0.02155154892913524,
+      "learning_rate": 9.714368784856081e-05,
+      "loss": 0.4733,
+      "step": 32240
+    },
+    {
+      "epoch": 1.6370016880100517,
+      "grad_norm": 0.02097638201730721,
+      "learning_rate": 9.701251608890638e-05,
+      "loss": 0.4939,
+      "step": 32245
+    },
+    {
+      "epoch": 1.6372555241080833,
+      "grad_norm": 0.024844539480766835,
+      "learning_rate": 9.688142343368517e-05,
+      "loss": 0.4623,
+      "step": 32250
+    },
+    {
+      "epoch": 1.6375093602061148,
+      "grad_norm": 0.022892093225189144,
+      "learning_rate": 9.675040990863032e-05,
+      "loss": 0.4643,
+      "step": 32255
+    },
+    {
+      "epoch": 1.6377631963041464,
+      "grad_norm": 0.021736869039684987,
+      "learning_rate": 9.661947553945893e-05,
+      "loss": 0.4592,
+      "step": 32260
+    },
+    {
+      "epoch": 1.6380170324021779,
+      "grad_norm": 0.030318571487825602,
+      "learning_rate": 9.648862035187289e-05,
+      "loss": 0.4798,
+      "step": 32265
+    },
+    {
+      "epoch": 1.6382708685002094,
+      "grad_norm": 0.023079937986626196,
+      "learning_rate": 9.635784437155815e-05,
+      "loss": 0.4786,
+      "step": 32270
+    },
+    {
+      "epoch": 1.638524704598241,
+      "grad_norm": 0.020531984789138997,
+      "learning_rate": 9.622714762418588e-05,
+      "loss": 0.4466,
+      "step": 32275
+    },
+    {
+      "epoch": 1.6387785406962725,
+      "grad_norm": 0.02520449611053328,
+      "learning_rate": 9.609653013541076e-05,
+      "loss": 0.4799,
+      "step": 32280
+    },
+    {
+      "epoch": 1.639032376794304,
+      "grad_norm": 0.02276790959983237,
+      "learning_rate": 9.596599193087263e-05,
+      "loss": 0.4371,
+      "step": 32285
+    },
+    {
+      "epoch": 1.6392862128923356,
+      "grad_norm": 0.023814420038553278,
+      "learning_rate": 9.583553303619524e-05,
+      "loss": 0.4874,
+      "step": 32290
+    },
+    {
+      "epoch": 1.6395400489903669,
+      "grad_norm": 0.032769714816805816,
+      "learning_rate": 9.570515347698727e-05,
+      "loss": 0.4549,
+      "step": 32295
+    },
+    {
+      "epoch": 1.6397938850883984,
+      "grad_norm": 0.02497036630007797,
+      "learning_rate": 9.557485327884136e-05,
+      "loss": 0.4573,
+      "step": 32300
+    },
+    {
+      "epoch": 1.64004772118643,
+      "grad_norm": 0.020083465356179893,
+      "learning_rate": 9.544463246733503e-05,
+      "loss": 0.4504,
+      "step": 32305
+    },
+    {
+      "epoch": 1.6403015572844615,
+      "grad_norm": 0.022338564551145575,
+      "learning_rate": 9.531449106802964e-05,
+      "loss": 0.4702,
+      "step": 32310
+    },
+    {
+      "epoch": 1.6405553933824928,
+      "grad_norm": 0.021589374085527493,
+      "learning_rate": 9.518442910647168e-05,
+      "loss": 0.4527,
+      "step": 32315
+    },
+    {
+      "epoch": 1.6408092294805243,
+      "grad_norm": 0.02577068006041862,
+      "learning_rate": 9.50544466081913e-05,
+      "loss": 0.4341,
+      "step": 32320
+    },
+    {
+      "epoch": 1.6410630655785559,
+      "grad_norm": 0.024543726750310888,
+      "learning_rate": 9.492454359870379e-05,
+      "loss": 0.4649,
+      "step": 32325
+    },
+    {
+      "epoch": 1.6413169016765874,
+      "grad_norm": 0.029648235764047823,
+      "learning_rate": 9.479472010350803e-05,
+      "loss": 0.4673,
+      "step": 32330
+    },
+    {
+      "epoch": 1.641570737774619,
+      "grad_norm": 0.021346922529075836,
+      "learning_rate": 9.466497614808806e-05,
+      "loss": 0.4419,
+      "step": 32335
+    },
+    {
+      "epoch": 1.6418245738726505,
+      "grad_norm": 0.02170707567899142,
+      "learning_rate": 9.453531175791191e-05,
+      "loss": 0.4657,
+      "step": 32340
+    },
+    {
+      "epoch": 1.642078409970682,
+      "grad_norm": 0.025382331522061787,
+      "learning_rate": 9.440572695843192e-05,
+      "loss": 0.4543,
+      "step": 32345
+    },
+    {
+      "epoch": 1.6423322460687135,
+      "grad_norm": 0.01955537085234093,
+      "learning_rate": 9.427622177508521e-05,
+      "loss": 0.4287,
+      "step": 32350
+    },
+    {
+      "epoch": 1.642586082166745,
+      "grad_norm": 0.021772913466288676,
+      "learning_rate": 9.414679623329264e-05,
+      "loss": 0.46,
+      "step": 32355
+    },
+    {
+      "epoch": 1.6428399182647766,
+      "grad_norm": 0.02173314708203356,
+      "learning_rate": 9.40174503584601e-05,
+      "loss": 0.4652,
+      "step": 32360
+    },
+    {
+      "epoch": 1.643093754362808,
+      "grad_norm": 0.02028675485464349,
+      "learning_rate": 9.388818417597733e-05,
+      "loss": 0.4648,
+      "step": 32365
+    },
+    {
+      "epoch": 1.6433475904608394,
+      "grad_norm": 0.024655003878005108,
+      "learning_rate": 9.375899771121888e-05,
+      "loss": 0.4126,
+      "step": 32370
+    },
+    {
+      "epoch": 1.643601426558871,
+      "grad_norm": 0.022638057624033838,
+      "learning_rate": 9.362989098954306e-05,
+      "loss": 0.4586,
+      "step": 32375
+    },
+    {
+      "epoch": 1.6438552626569023,
+      "grad_norm": 0.020410137630603085,
+      "learning_rate": 9.350086403629326e-05,
+      "loss": 0.4371,
+      "step": 32380
+    },
+    {
+      "epoch": 1.6441090987549338,
+      "grad_norm": 0.02664816125540432,
+      "learning_rate": 9.337191687679648e-05,
+      "loss": 0.4442,
+      "step": 32385
+    },
+    {
+      "epoch": 1.6443629348529654,
+      "grad_norm": 0.0234300596390381,
+      "learning_rate": 9.324304953636458e-05,
+      "loss": 0.464,
+      "step": 32390
+    },
+    {
+      "epoch": 1.644616770950997,
+      "grad_norm": 0.022576297788024068,
+      "learning_rate": 9.311426204029355e-05,
+      "loss": 0.4434,
+      "step": 32395
+    },
+    {
+      "epoch": 1.6448706070490284,
+      "grad_norm": 0.022495995573104423,
+      "learning_rate": 9.298555441386392e-05,
+      "loss": 0.495,
+      "step": 32400
+    },
+    {
+      "epoch": 1.64512444314706,
+      "grad_norm": 0.020474348549872002,
+      "learning_rate": 9.285692668233997e-05,
+      "loss": 0.4833,
+      "step": 32405
+    },
+    {
+      "epoch": 1.6453782792450915,
+      "grad_norm": 0.024333096423502235,
+      "learning_rate": 9.272837887097108e-05,
+      "loss": 0.4754,
+      "step": 32410
+    },
+    {
+      "epoch": 1.645632115343123,
+      "grad_norm": 0.020618548983006132,
+      "learning_rate": 9.259991100499021e-05,
+      "loss": 0.4721,
+      "step": 32415
+    },
+    {
+      "epoch": 1.6458859514411546,
+      "grad_norm": 0.020652302792662478,
+      "learning_rate": 9.247152310961527e-05,
+      "loss": 0.4418,
+      "step": 32420
+    },
+    {
+      "epoch": 1.646139787539186,
+      "grad_norm": 0.02297733570270565,
+      "learning_rate": 9.234321521004786e-05,
+      "loss": 0.4864,
+      "step": 32425
+    },
+    {
+      "epoch": 1.6463936236372174,
+      "grad_norm": 0.019973132791394135,
+      "learning_rate": 9.221498733147443e-05,
+      "loss": 0.455,
+      "step": 32430
+    },
+    {
+      "epoch": 1.646647459735249,
+      "grad_norm": 0.0213492601346557,
+      "learning_rate": 9.208683949906526e-05,
+      "loss": 0.4294,
+      "step": 32435
+    },
+    {
+      "epoch": 1.6469012958332805,
+      "grad_norm": 0.020742705287425788,
+      "learning_rate": 9.195877173797534e-05,
+      "loss": 0.4499,
+      "step": 32440
+    },
+    {
+      "epoch": 1.6471551319313118,
+      "grad_norm": 0.01942087181042954,
+      "learning_rate": 9.18307840733435e-05,
+      "loss": 0.456,
+      "step": 32445
+    },
+    {
+      "epoch": 1.6474089680293433,
+      "grad_norm": 0.031308669135702176,
+      "learning_rate": 9.170287653029325e-05,
+      "loss": 0.4647,
+      "step": 32450
+    },
+    {
+      "epoch": 1.6476628041273749,
+      "grad_norm": 0.02130109350045376,
+      "learning_rate": 9.157504913393228e-05,
+      "loss": 0.4667,
+      "step": 32455
+    },
+    {
+      "epoch": 1.6479166402254064,
+      "grad_norm": 0.027077864769477565,
+      "learning_rate": 9.14473019093522e-05,
+      "loss": 0.4551,
+      "step": 32460
+    },
+    {
+      "epoch": 1.648170476323438,
+      "grad_norm": 0.023508906357244588,
+      "learning_rate": 9.131963488162942e-05,
+      "loss": 0.4568,
+      "step": 32465
+    },
+    {
+      "epoch": 1.6484243124214695,
+      "grad_norm": 0.021416094495555104,
+      "learning_rate": 9.119204807582415e-05,
+      "loss": 0.4501,
+      "step": 32470
+    },
+    {
+      "epoch": 1.648678148519501,
+      "grad_norm": 0.026119351851691987,
+      "learning_rate": 9.106454151698118e-05,
+      "loss": 0.4582,
+      "step": 32475
+    },
+    {
+      "epoch": 1.6489319846175325,
+      "grad_norm": 0.0208804764024078,
+      "learning_rate": 9.093711523012933e-05,
+      "loss": 0.4448,
+      "step": 32480
+    },
+    {
+      "epoch": 1.649185820715564,
+      "grad_norm": 0.024577116363089128,
+      "learning_rate": 9.080976924028177e-05,
+      "loss": 0.4357,
+      "step": 32485
+    },
+    {
+      "epoch": 1.6494396568135956,
+      "grad_norm": 0.022644134972793314,
+      "learning_rate": 9.068250357243585e-05,
+      "loss": 0.459,
+      "step": 32490
+    },
+    {
+      "epoch": 1.649693492911627,
+      "grad_norm": 0.024600197563246825,
+      "learning_rate": 9.055531825157332e-05,
+      "loss": 0.4454,
+      "step": 32495
+    },
+    {
+      "epoch": 1.6499473290096585,
+      "grad_norm": 0.021916792555752632,
+      "learning_rate": 9.042821330265976e-05,
+      "loss": 0.4652,
+      "step": 32500
+    },
+    {
+      "epoch": 1.65020116510769,
+      "grad_norm": 0.022504308117136322,
+      "learning_rate": 9.030118875064553e-05,
+      "loss": 0.464,
+      "step": 32505
+    },
+    {
+      "epoch": 1.6504550012057213,
+      "grad_norm": 0.023410283807235067,
+      "learning_rate": 9.017424462046453e-05,
+      "loss": 0.4625,
+      "step": 32510
+    },
+    {
+      "epoch": 1.6507088373037528,
+      "grad_norm": 0.023827231525166772,
+      "learning_rate": 9.00473809370358e-05,
+      "loss": 0.4476,
+      "step": 32515
+    },
+    {
+      "epoch": 1.6509626734017844,
+      "grad_norm": 0.024423379173870094,
+      "learning_rate": 8.992059772526163e-05,
+      "loss": 0.4765,
+      "step": 32520
+    },
+    {
+      "epoch": 1.651216509499816,
+      "grad_norm": 0.022352780343270956,
+      "learning_rate": 8.979389501002916e-05,
+      "loss": 0.4578,
+      "step": 32525
+    },
+    {
+      "epoch": 1.6514703455978474,
+      "grad_norm": 0.021988070837923635,
+      "learning_rate": 8.966727281620929e-05,
+      "loss": 0.4696,
+      "step": 32530
+    },
+    {
+      "epoch": 1.651724181695879,
+      "grad_norm": 0.020573849908693122,
+      "learning_rate": 8.954073116865757e-05,
+      "loss": 0.4873,
+      "step": 32535
+    },
+    {
+      "epoch": 1.6519780177939105,
+      "grad_norm": 0.023883137882499617,
+      "learning_rate": 8.941427009221325e-05,
+      "loss": 0.4686,
+      "step": 32540
+    },
+    {
+      "epoch": 1.652231853891942,
+      "grad_norm": 0.027468298536276997,
+      "learning_rate": 8.928788961170025e-05,
+      "loss": 0.4925,
+      "step": 32545
+    },
+    {
+      "epoch": 1.6524856899899736,
+      "grad_norm": 0.027088296741393195,
+      "learning_rate": 8.916158975192618e-05,
+      "loss": 0.4534,
+      "step": 32550
+    },
+    {
+      "epoch": 1.6527395260880051,
+      "grad_norm": 0.02137783275444411,
+      "learning_rate": 8.903537053768329e-05,
+      "loss": 0.4541,
+      "step": 32555
+    },
+    {
+      "epoch": 1.6529933621860364,
+      "grad_norm": 0.02449402913442447,
+      "learning_rate": 8.890923199374756e-05,
+      "loss": 0.4558,
+      "step": 32560
+    },
+    {
+      "epoch": 1.653247198284068,
+      "grad_norm": 0.022228205017642978,
+      "learning_rate": 8.878317414487964e-05,
+      "loss": 0.4577,
+      "step": 32565
+    },
+    {
+      "epoch": 1.6535010343820995,
+      "grad_norm": 0.023535911694491006,
+      "learning_rate": 8.865719701582376e-05,
+      "loss": 0.4531,
+      "step": 32570
+    },
+    {
+      "epoch": 1.653754870480131,
+      "grad_norm": 0.021119661452584808,
+      "learning_rate": 8.85313006313087e-05,
+      "loss": 0.4574,
+      "step": 32575
+    },
+    {
+      "epoch": 1.6540087065781623,
+      "grad_norm": 0.022839968865042536,
+      "learning_rate": 8.84054850160475e-05,
+      "loss": 0.4713,
+      "step": 32580
+    },
+    {
+      "epoch": 1.6542625426761939,
+      "grad_norm": 0.022416424195115817,
+      "learning_rate": 8.827975019473688e-05,
+      "loss": 0.4826,
+      "step": 32585
+    },
+    {
+      "epoch": 1.6545163787742254,
+      "grad_norm": 0.021476787913188244,
+      "learning_rate": 8.815409619205811e-05,
+      "loss": 0.4419,
+      "step": 32590
+    },
+    {
+      "epoch": 1.654770214872257,
+      "grad_norm": 0.028305345188139118,
+      "learning_rate": 8.802852303267634e-05,
+      "loss": 0.465,
+      "step": 32595
+    },
+    {
+      "epoch": 1.6550240509702885,
+      "grad_norm": 0.04040707057128688,
+      "learning_rate": 8.790303074124106e-05,
+      "loss": 0.5121,
+      "step": 32600
+    },
+    {
+      "epoch": 1.65527788706832,
+      "grad_norm": 0.04087985900665181,
+      "learning_rate": 8.77776193423856e-05,
+      "loss": 0.4659,
+      "step": 32605
+    },
+    {
+      "epoch": 1.6555317231663516,
+      "grad_norm": 0.028685293599005325,
+      "learning_rate": 8.765228886072785e-05,
+      "loss": 0.4659,
+      "step": 32610
+    },
+    {
+      "epoch": 1.655785559264383,
+      "grad_norm": 0.022967724296628123,
+      "learning_rate": 8.75270393208693e-05,
+      "loss": 0.4558,
+      "step": 32615
+    },
+    {
+      "epoch": 1.6560393953624146,
+      "grad_norm": 0.02385629905389606,
+      "learning_rate": 8.740187074739609e-05,
+      "loss": 0.4281,
+      "step": 32620
+    },
+    {
+      "epoch": 1.656293231460446,
+      "grad_norm": 0.024276473538365772,
+      "learning_rate": 8.727678316487786e-05,
+      "loss": 0.4542,
+      "step": 32625
+    },
+    {
+      "epoch": 1.6565470675584775,
+      "grad_norm": 0.02582996339953851,
+      "learning_rate": 8.7151776597869e-05,
+      "loss": 0.4921,
+      "step": 32630
+    },
+    {
+      "epoch": 1.656800903656509,
+      "grad_norm": 0.019009514698150008,
+      "learning_rate": 8.702685107090725e-05,
+      "loss": 0.4833,
+      "step": 32635
+    },
+    {
+      "epoch": 1.6570547397545405,
+      "grad_norm": 0.02262802830227219,
+      "learning_rate": 8.690200660851539e-05,
+      "loss": 0.4611,
+      "step": 32640
+    },
+    {
+      "epoch": 1.6573085758525719,
+      "grad_norm": 0.022327220330587837,
+      "learning_rate": 8.677724323519937e-05,
+      "loss": 0.4822,
+      "step": 32645
+    },
+    {
+      "epoch": 1.6575624119506034,
+      "grad_norm": 0.026247061121182514,
+      "learning_rate": 8.665256097544994e-05,
+      "loss": 0.483,
+      "step": 32650
+    },
+    {
+      "epoch": 1.657816248048635,
+      "grad_norm": 0.021890433127245045,
+      "learning_rate": 8.65279598537413e-05,
+      "loss": 0.4744,
+      "step": 32655
+    },
+    {
+      "epoch": 1.6580700841466665,
+      "grad_norm": 0.023375317295645848,
+      "learning_rate": 8.640343989453225e-05,
+      "loss": 0.4488,
+      "step": 32660
+    },
+    {
+      "epoch": 1.658323920244698,
+      "grad_norm": 0.025314102237468532,
+      "learning_rate": 8.627900112226522e-05,
+      "loss": 0.4704,
+      "step": 32665
+    },
+    {
+      "epoch": 1.6585777563427295,
+      "grad_norm": 0.020441167533399623,
+      "learning_rate": 8.61546435613672e-05,
+      "loss": 0.4608,
+      "step": 32670
+    },
+    {
+      "epoch": 1.658831592440761,
+      "grad_norm": 0.022593790331337692,
+      "learning_rate": 8.603036723624868e-05,
+      "loss": 0.4543,
+      "step": 32675
+    },
+    {
+      "epoch": 1.6590854285387926,
+      "grad_norm": 0.023141528922392886,
+      "learning_rate": 8.590617217130469e-05,
+      "loss": 0.4867,
+      "step": 32680
+    },
+    {
+      "epoch": 1.6593392646368241,
+      "grad_norm": 0.019506697724557195,
+      "learning_rate": 8.578205839091397e-05,
+      "loss": 0.4562,
+      "step": 32685
+    },
+    {
+      "epoch": 1.6595931007348557,
+      "grad_norm": 0.021711834234721967,
+      "learning_rate": 8.565802591943955e-05,
+      "loss": 0.4485,
+      "step": 32690
+    },
+    {
+      "epoch": 1.659846936832887,
+      "grad_norm": 0.021652695537899707,
+      "learning_rate": 8.55340747812282e-05,
+      "loss": 0.4609,
+      "step": 32695
+    },
+    {
+      "epoch": 1.6601007729309185,
+      "grad_norm": 0.021747456684810037,
+      "learning_rate": 8.541020500061109e-05,
+      "loss": 0.4705,
+      "step": 32700
+    },
+    {
+      "epoch": 1.66035460902895,
+      "grad_norm": 0.022086611000210216,
+      "learning_rate": 8.528641660190323e-05,
+      "loss": 0.478,
+      "step": 32705
+    },
+    {
+      "epoch": 1.6606084451269814,
+      "grad_norm": 0.022136451230823195,
+      "learning_rate": 8.516270960940353e-05,
+      "loss": 0.4541,
+      "step": 32710
+    },
+    {
+      "epoch": 1.660862281225013,
+      "grad_norm": 0.025866562335245584,
+      "learning_rate": 8.50390840473953e-05,
+      "loss": 0.4446,
+      "step": 32715
+    },
+    {
+      "epoch": 1.6611161173230444,
+      "grad_norm": 0.0228248045581071,
+      "learning_rate": 8.491553994014528e-05,
+      "loss": 0.4186,
+      "step": 32720
+    },
+    {
+      "epoch": 1.661369953421076,
+      "grad_norm": 0.021908437534819313,
+      "learning_rate": 8.479207731190491e-05,
+      "loss": 0.444,
+      "step": 32725
+    },
+    {
+      "epoch": 1.6616237895191075,
+      "grad_norm": 0.022820980730176514,
+      "learning_rate": 8.466869618690898e-05,
+      "loss": 0.4496,
+      "step": 32730
+    },
+    {
+      "epoch": 1.661877625617139,
+      "grad_norm": 0.02083612493092523,
+      "learning_rate": 8.454539658937688e-05,
+      "loss": 0.4318,
+      "step": 32735
+    },
+    {
+      "epoch": 1.6621314617151706,
+      "grad_norm": 0.02223274504726005,
+      "learning_rate": 8.442217854351142e-05,
+      "loss": 0.4465,
+      "step": 32740
+    },
+    {
+      "epoch": 1.662385297813202,
+      "grad_norm": 0.027608070343403793,
+      "learning_rate": 8.429904207349997e-05,
+      "loss": 0.4895,
+      "step": 32745
+    },
+    {
+      "epoch": 1.6626391339112336,
+      "grad_norm": 0.023017127806108626,
+      "learning_rate": 8.417598720351333e-05,
+      "loss": 0.4566,
+      "step": 32750
+    },
+    {
+      "epoch": 1.6628929700092652,
+      "grad_norm": 0.024694217741001077,
+      "learning_rate": 8.40530139577067e-05,
+      "loss": 0.4202,
+      "step": 32755
+    },
+    {
+      "epoch": 1.6631468061072965,
+      "grad_norm": 0.021747446327509346,
+      "learning_rate": 8.393012236021908e-05,
+      "loss": 0.442,
+      "step": 32760
+    },
+    {
+      "epoch": 1.663400642205328,
+      "grad_norm": 0.02285407342505394,
+      "learning_rate": 8.380731243517365e-05,
+      "loss": 0.4849,
+      "step": 32765
+    },
+    {
+      "epoch": 1.6636544783033596,
+      "grad_norm": 0.01880950444315991,
+      "learning_rate": 8.368458420667707e-05,
+      "loss": 0.4462,
+      "step": 32770
+    },
+    {
+      "epoch": 1.6639083144013909,
+      "grad_norm": 0.02321735100250034,
+      "learning_rate": 8.356193769882064e-05,
+      "loss": 0.4659,
+      "step": 32775
+    },
+    {
+      "epoch": 1.6641621504994224,
+      "grad_norm": 0.021688726089099976,
+      "learning_rate": 8.343937293567888e-05,
+      "loss": 0.4518,
+      "step": 32780
+    },
+    {
+      "epoch": 1.664415986597454,
+      "grad_norm": 0.022255121531889792,
+      "learning_rate": 8.331688994131098e-05,
+      "loss": 0.4709,
+      "step": 32785
+    },
+    {
+      "epoch": 1.6646698226954855,
+      "grad_norm": 0.0234424879295081,
+      "learning_rate": 8.319448873975948e-05,
+      "loss": 0.4485,
+      "step": 32790
+    },
+    {
+      "epoch": 1.664923658793517,
+      "grad_norm": 0.01971346035102544,
+      "learning_rate": 8.307216935505135e-05,
+      "loss": 0.4471,
+      "step": 32795
+    },
+    {
+      "epoch": 1.6651774948915485,
+      "grad_norm": 0.027744043981377947,
+      "learning_rate": 8.294993181119703e-05,
+      "loss": 0.4878,
+      "step": 32800
+    },
+    {
+      "epoch": 1.66543133098958,
+      "grad_norm": 0.02538544426279172,
+      "learning_rate": 8.282777613219139e-05,
+      "loss": 0.4614,
+      "step": 32805
+    },
+    {
+      "epoch": 1.6656851670876116,
+      "grad_norm": 0.024257509182234863,
+      "learning_rate": 8.270570234201274e-05,
+      "loss": 0.456,
+      "step": 32810
+    },
+    {
+      "epoch": 1.6659390031856431,
+      "grad_norm": 0.022358059693099067,
+      "learning_rate": 8.25837104646237e-05,
+      "loss": 0.4957,
+      "step": 32815
+    },
+    {
+      "epoch": 1.6661928392836747,
+      "grad_norm": 0.021859775564087938,
+      "learning_rate": 8.246180052397078e-05,
+      "loss": 0.4784,
+      "step": 32820
+    },
+    {
+      "epoch": 1.666446675381706,
+      "grad_norm": 0.020567147747974097,
+      "learning_rate": 8.233997254398401e-05,
+      "loss": 0.4608,
+      "step": 32825
+    },
+    {
+      "epoch": 1.6667005114797375,
+      "grad_norm": 0.023002497460944293,
+      "learning_rate": 8.221822654857786e-05,
+      "loss": 0.441,
+      "step": 32830
+    },
+    {
+      "epoch": 1.666954347577769,
+      "grad_norm": 0.024727893994143192,
+      "learning_rate": 8.209656256165027e-05,
+      "loss": 0.4698,
+      "step": 32835
+    },
+    {
+      "epoch": 1.6672081836758004,
+      "grad_norm": 0.020590000782565322,
+      "learning_rate": 8.197498060708347e-05,
+      "loss": 0.4686,
+      "step": 32840
+    },
+    {
+      "epoch": 1.667462019773832,
+      "grad_norm": 0.026579843381741665,
+      "learning_rate": 8.185348070874316e-05,
+      "loss": 0.4612,
+      "step": 32845
+    },
+    {
+      "epoch": 1.6677158558718634,
+      "grad_norm": 0.01927314772150972,
+      "learning_rate": 8.173206289047947e-05,
+      "loss": 0.4627,
+      "step": 32850
+    },
+    {
+      "epoch": 1.667969691969895,
+      "grad_norm": 0.028426534307149153,
+      "learning_rate": 8.161072717612578e-05,
+      "loss": 0.4565,
+      "step": 32855
+    },
+    {
+      "epoch": 1.6682235280679265,
+      "grad_norm": 0.028414912943784405,
+      "learning_rate": 8.148947358949992e-05,
+      "loss": 0.4608,
+      "step": 32860
+    },
+    {
+      "epoch": 1.668477364165958,
+      "grad_norm": 0.02128169255167679,
+      "learning_rate": 8.136830215440322e-05,
+      "loss": 0.4669,
+      "step": 32865
+    },
+    {
+      "epoch": 1.6687312002639896,
+      "grad_norm": 0.020873791873976224,
+      "learning_rate": 8.124721289462122e-05,
+      "loss": 0.4489,
+      "step": 32870
+    },
+    {
+      "epoch": 1.6689850363620211,
+      "grad_norm": 0.03135989480369304,
+      "learning_rate": 8.112620583392272e-05,
+      "loss": 0.4529,
+      "step": 32875
+    },
+    {
+      "epoch": 1.6692388724600526,
+      "grad_norm": 0.02000361275840586,
+      "learning_rate": 8.100528099606135e-05,
+      "loss": 0.4683,
+      "step": 32880
+    },
+    {
+      "epoch": 1.6694927085580842,
+      "grad_norm": 0.03133371354490263,
+      "learning_rate": 8.088443840477371e-05,
+      "loss": 0.4594,
+      "step": 32885
+    },
+    {
+      "epoch": 1.6697465446561155,
+      "grad_norm": 0.020707782157897564,
+      "learning_rate": 8.076367808378083e-05,
+      "loss": 0.4547,
+      "step": 32890
+    },
+    {
+      "epoch": 1.670000380754147,
+      "grad_norm": 0.022147898036397162,
+      "learning_rate": 8.064300005678705e-05,
+      "loss": 0.4866,
+      "step": 32895
+    },
+    {
+      "epoch": 1.6702542168521786,
+      "grad_norm": 0.02198563076624078,
+      "learning_rate": 8.052240434748114e-05,
+      "loss": 0.4789,
+      "step": 32900
+    },
+    {
+      "epoch": 1.67050805295021,
+      "grad_norm": 0.39720820061969236,
+      "learning_rate": 8.04018909795352e-05,
+      "loss": 0.438,
+      "step": 32905
+    },
+    {
+      "epoch": 1.6707618890482414,
+      "grad_norm": 0.022207069306431813,
+      "learning_rate": 8.028145997660569e-05,
+      "loss": 0.4757,
+      "step": 32910
+    },
+    {
+      "epoch": 1.671015725146273,
+      "grad_norm": 0.024892721760228,
+      "learning_rate": 8.016111136233229e-05,
+      "loss": 0.4694,
+      "step": 32915
+    },
+    {
+      "epoch": 1.6712695612443045,
+      "grad_norm": 0.026240771643922845,
+      "learning_rate": 8.00408451603391e-05,
+      "loss": 0.483,
+      "step": 32920
+    },
+    {
+      "epoch": 1.671523397342336,
+      "grad_norm": 0.027364163639244488,
+      "learning_rate": 7.992066139423359e-05,
+      "loss": 0.4495,
+      "step": 32925
+    },
+    {
+      "epoch": 1.6717772334403675,
+      "grad_norm": 0.022658845760252785,
+      "learning_rate": 7.980056008760744e-05,
+      "loss": 0.4604,
+      "step": 32930
+    },
+    {
+      "epoch": 1.672031069538399,
+      "grad_norm": 0.020613403322029124,
+      "learning_rate": 7.968054126403568e-05,
+      "loss": 0.4436,
+      "step": 32935
+    },
+    {
+      "epoch": 1.6722849056364306,
+      "grad_norm": 0.020569339169032044,
+      "learning_rate": 7.956060494707757e-05,
+      "loss": 0.4611,
+      "step": 32940
+    },
+    {
+      "epoch": 1.6725387417344622,
+      "grad_norm": 0.02209932289337912,
+      "learning_rate": 7.944075116027604e-05,
+      "loss": 0.4665,
+      "step": 32945
+    },
+    {
+      "epoch": 1.6727925778324937,
+      "grad_norm": 0.022726524271639435,
+      "learning_rate": 7.93209799271577e-05,
+      "loss": 0.4564,
+      "step": 32950
+    },
+    {
+      "epoch": 1.6730464139305252,
+      "grad_norm": 0.021952171929792236,
+      "learning_rate": 7.920129127123316e-05,
+      "loss": 0.4485,
+      "step": 32955
+    },
+    {
+      "epoch": 1.6733002500285565,
+      "grad_norm": 0.02051579303808435,
+      "learning_rate": 7.908168521599646e-05,
+      "loss": 0.4584,
+      "step": 32960
+    },
+    {
+      "epoch": 1.673554086126588,
+      "grad_norm": 0.02294166980769807,
+      "learning_rate": 7.896216178492599e-05,
+      "loss": 0.4563,
+      "step": 32965
+    },
+    {
+      "epoch": 1.6738079222246196,
+      "grad_norm": 0.02490011470275378,
+      "learning_rate": 7.884272100148332e-05,
+      "loss": 0.486,
+      "step": 32970
+    },
+    {
+      "epoch": 1.674061758322651,
+      "grad_norm": 0.02500987816795483,
+      "learning_rate": 7.872336288911436e-05,
+      "loss": 0.4521,
+      "step": 32975
+    },
+    {
+      "epoch": 1.6743155944206825,
+      "grad_norm": 0.025048858782017916,
+      "learning_rate": 7.86040874712482e-05,
+      "loss": 0.4468,
+      "step": 32980
+    },
+    {
+      "epoch": 1.674569430518714,
+      "grad_norm": 0.028012775399430874,
+      "learning_rate": 7.848489477129828e-05,
+      "loss": 0.4734,
+      "step": 32985
+    },
+    {
+      "epoch": 1.6748232666167455,
+      "grad_norm": 0.02640398235527484,
+      "learning_rate": 7.836578481266132e-05,
+      "loss": 0.4504,
+      "step": 32990
+    },
+    {
+      "epoch": 1.675077102714777,
+      "grad_norm": 0.023020229066731457,
+      "learning_rate": 7.824675761871814e-05,
+      "loss": 0.4716,
+      "step": 32995
+    },
+    {
+      "epoch": 1.6753309388128086,
+      "grad_norm": 0.02729357114542021,
+      "learning_rate": 7.812781321283319e-05,
+      "loss": 0.4414,
+      "step": 33000
+    },
+    {
+      "epoch": 1.6755847749108401,
+      "grad_norm": 0.02331912928699373,
+      "learning_rate": 7.800895161835469e-05,
+      "loss": 0.5048,
+      "step": 33005
+    },
+    {
+      "epoch": 1.6758386110088717,
+      "grad_norm": 0.020302890548499142,
+      "learning_rate": 7.789017285861439e-05,
+      "loss": 0.4686,
+      "step": 33010
+    },
+    {
+      "epoch": 1.6760924471069032,
+      "grad_norm": 0.030570223530975774,
+      "learning_rate": 7.777147695692827e-05,
+      "loss": 0.4768,
+      "step": 33015
+    },
+    {
+      "epoch": 1.6763462832049347,
+      "grad_norm": 0.021681113676065226,
+      "learning_rate": 7.765286393659543e-05,
+      "loss": 0.4306,
+      "step": 33020
+    },
+    {
+      "epoch": 1.676600119302966,
+      "grad_norm": 0.02236935979091485,
+      "learning_rate": 7.75343338208993e-05,
+      "loss": 0.4734,
+      "step": 33025
+    },
+    {
+      "epoch": 1.6768539554009976,
+      "grad_norm": 0.025693461163144155,
+      "learning_rate": 7.741588663310644e-05,
+      "loss": 0.4494,
+      "step": 33030
+    },
+    {
+      "epoch": 1.677107791499029,
+      "grad_norm": 0.02791486982571019,
+      "learning_rate": 7.729752239646776e-05,
+      "loss": 0.4523,
+      "step": 33035
+    },
+    {
+      "epoch": 1.6773616275970604,
+      "grad_norm": 0.02226158895252821,
+      "learning_rate": 7.717924113421732e-05,
+      "loss": 0.4467,
+      "step": 33040
+    },
+    {
+      "epoch": 1.677615463695092,
+      "grad_norm": 0.02192720169085694,
+      "learning_rate": 7.706104286957333e-05,
+      "loss": 0.4695,
+      "step": 33045
+    },
+    {
+      "epoch": 1.6778692997931235,
+      "grad_norm": 0.022241780707175234,
+      "learning_rate": 7.694292762573729e-05,
+      "loss": 0.4432,
+      "step": 33050
+    },
+    {
+      "epoch": 1.678123135891155,
+      "grad_norm": 0.025844348165324875,
+      "learning_rate": 7.682489542589483e-05,
+      "loss": 0.5004,
+      "step": 33055
+    },
+    {
+      "epoch": 1.6783769719891866,
+      "grad_norm": 0.03152479859049114,
+      "learning_rate": 7.670694629321511e-05,
+      "loss": 0.4711,
+      "step": 33060
+    },
+    {
+      "epoch": 1.678630808087218,
+      "grad_norm": 0.020936664773053695,
+      "learning_rate": 7.658908025085076e-05,
+      "loss": 0.4632,
+      "step": 33065
+    },
+    {
+      "epoch": 1.6788846441852496,
+      "grad_norm": 0.021671379054153835,
+      "learning_rate": 7.647129732193859e-05,
+      "loss": 0.4327,
+      "step": 33070
+    },
+    {
+      "epoch": 1.6791384802832812,
+      "grad_norm": 0.023902127255322248,
+      "learning_rate": 7.635359752959841e-05,
+      "loss": 0.4704,
+      "step": 33075
+    },
+    {
+      "epoch": 1.6793923163813127,
+      "grad_norm": 0.024510636050162397,
+      "learning_rate": 7.623598089693446e-05,
+      "loss": 0.4771,
+      "step": 33080
+    },
+    {
+      "epoch": 1.6796461524793442,
+      "grad_norm": 0.023726057600865935,
+      "learning_rate": 7.611844744703406e-05,
+      "loss": 0.471,
+      "step": 33085
+    },
+    {
+      "epoch": 1.6798999885773755,
+      "grad_norm": 0.022957493744582655,
+      "learning_rate": 7.600099720296866e-05,
+      "loss": 0.4646,
+      "step": 33090
+    },
+    {
+      "epoch": 1.680153824675407,
+      "grad_norm": 0.022157047872468763,
+      "learning_rate": 7.588363018779288e-05,
+      "loss": 0.4636,
+      "step": 33095
+    },
+    {
+      "epoch": 1.6804076607734386,
+      "grad_norm": 0.020795019876852835,
+      "learning_rate": 7.576634642454555e-05,
+      "loss": 0.4648,
+      "step": 33100
+    },
+    {
+      "epoch": 1.68066149687147,
+      "grad_norm": 0.02546400520836241,
+      "learning_rate": 7.564914593624866e-05,
+      "loss": 0.4771,
+      "step": 33105
+    },
+    {
+      "epoch": 1.6809153329695015,
+      "grad_norm": 0.035868249174773666,
+      "learning_rate": 7.553202874590825e-05,
+      "loss": 0.4593,
+      "step": 33110
+    },
+    {
+      "epoch": 1.681169169067533,
+      "grad_norm": 0.029253511552575565,
+      "learning_rate": 7.54149948765136e-05,
+      "loss": 0.4585,
+      "step": 33115
+    },
+    {
+      "epoch": 1.6814230051655645,
+      "grad_norm": 0.021167662879100695,
+      "learning_rate": 7.529804435103831e-05,
+      "loss": 0.4654,
+      "step": 33120
+    },
+    {
+      "epoch": 1.681676841263596,
+      "grad_norm": 0.02498414369849199,
+      "learning_rate": 7.518117719243878e-05,
+      "loss": 0.4542,
+      "step": 33125
+    },
+    {
+      "epoch": 1.6819306773616276,
+      "grad_norm": 0.022195501835238156,
+      "learning_rate": 7.506439342365573e-05,
+      "loss": 0.4739,
+      "step": 33130
+    },
+    {
+      "epoch": 1.6821845134596591,
+      "grad_norm": 0.021987713178080383,
+      "learning_rate": 7.494769306761296e-05,
+      "loss": 0.4555,
+      "step": 33135
+    },
+    {
+      "epoch": 1.6824383495576907,
+      "grad_norm": 0.030374528516733935,
+      "learning_rate": 7.483107614721846e-05,
+      "loss": 0.4736,
+      "step": 33140
+    },
+    {
+      "epoch": 1.6826921856557222,
+      "grad_norm": 0.02149051980457975,
+      "learning_rate": 7.471454268536338e-05,
+      "loss": 0.45,
+      "step": 33145
+    },
+    {
+      "epoch": 1.6829460217537537,
+      "grad_norm": 0.022265453269349957,
+      "learning_rate": 7.459809270492252e-05,
+      "loss": 0.4573,
+      "step": 33150
+    },
+    {
+      "epoch": 1.683199857851785,
+      "grad_norm": 0.03210156404072531,
+      "learning_rate": 7.448172622875477e-05,
+      "loss": 0.462,
+      "step": 33155
+    },
+    {
+      "epoch": 1.6834536939498166,
+      "grad_norm": 0.027726727483721063,
+      "learning_rate": 7.436544327970191e-05,
+      "loss": 0.4343,
+      "step": 33160
+    },
+    {
+      "epoch": 1.6837075300478481,
+      "grad_norm": 0.026983110267163848,
+      "learning_rate": 7.424924388059007e-05,
+      "loss": 0.4746,
+      "step": 33165
+    },
+    {
+      "epoch": 1.6839613661458797,
+      "grad_norm": 0.022811847390947975,
+      "learning_rate": 7.413312805422834e-05,
+      "loss": 0.475,
+      "step": 33170
+    },
+    {
+      "epoch": 1.684215202243911,
+      "grad_norm": 0.022500674142276494,
+      "learning_rate": 7.40170958234097e-05,
+      "loss": 0.4579,
+      "step": 33175
+    },
+    {
+      "epoch": 1.6844690383419425,
+      "grad_norm": 0.021612375802658696,
+      "learning_rate": 7.390114721091084e-05,
+      "loss": 0.4414,
+      "step": 33180
+    },
+    {
+      "epoch": 1.684722874439974,
+      "grad_norm": 0.0248775111800712,
+      "learning_rate": 7.378528223949194e-05,
+      "loss": 0.5039,
+      "step": 33185
+    },
+    {
+      "epoch": 1.6849767105380056,
+      "grad_norm": 0.02388192498972222,
+      "learning_rate": 7.366950093189651e-05,
+      "loss": 0.4526,
+      "step": 33190
+    },
+    {
+      "epoch": 1.685230546636037,
+      "grad_norm": 0.030723879806664092,
+      "learning_rate": 7.355380331085205e-05,
+      "loss": 0.4692,
+      "step": 33195
+    },
+    {
+      "epoch": 1.6854843827340686,
+      "grad_norm": 0.018952594935604992,
+      "learning_rate": 7.343818939906915e-05,
+      "loss": 0.4533,
+      "step": 33200
+    },
+    {
+      "epoch": 1.6857382188321002,
+      "grad_norm": 0.021896935877159233,
+      "learning_rate": 7.332265921924258e-05,
+      "loss": 0.4908,
+      "step": 33205
+    },
+    {
+      "epoch": 1.6859920549301317,
+      "grad_norm": 0.024915232475315307,
+      "learning_rate": 7.320721279405002e-05,
+      "loss": 0.4696,
+      "step": 33210
+    },
+    {
+      "epoch": 1.6862458910281632,
+      "grad_norm": 0.023173557196539376,
+      "learning_rate": 7.309185014615333e-05,
+      "loss": 0.4438,
+      "step": 33215
+    },
+    {
+      "epoch": 1.6864997271261948,
+      "grad_norm": 0.023554891519709124,
+      "learning_rate": 7.29765712981973e-05,
+      "loss": 0.4373,
+      "step": 33220
+    },
+    {
+      "epoch": 1.686753563224226,
+      "grad_norm": 0.019733917611659817,
+      "learning_rate": 7.286137627281092e-05,
+      "loss": 0.4432,
+      "step": 33225
+    },
+    {
+      "epoch": 1.6870073993222576,
+      "grad_norm": 0.02160655258996359,
+      "learning_rate": 7.274626509260612e-05,
+      "loss": 0.4774,
+      "step": 33230
+    },
+    {
+      "epoch": 1.6872612354202892,
+      "grad_norm": 0.02405968481850796,
+      "learning_rate": 7.263123778017877e-05,
+      "loss": 0.4548,
+      "step": 33235
+    },
+    {
+      "epoch": 1.6875150715183205,
+      "grad_norm": 0.024128327278038663,
+      "learning_rate": 7.251629435810825e-05,
+      "loss": 0.4642,
+      "step": 33240
+    },
+    {
+      "epoch": 1.687768907616352,
+      "grad_norm": 0.024693827881499066,
+      "learning_rate": 7.240143484895718e-05,
+      "loss": 0.4463,
+      "step": 33245
+    },
+    {
+      "epoch": 1.6880227437143835,
+      "grad_norm": 0.03310653970465261,
+      "learning_rate": 7.228665927527217e-05,
+      "loss": 0.4786,
+      "step": 33250
+    },
+    {
+      "epoch": 1.688276579812415,
+      "grad_norm": 0.024219301603149616,
+      "learning_rate": 7.217196765958278e-05,
+      "loss": 0.4417,
+      "step": 33255
+    },
+    {
+      "epoch": 1.6885304159104466,
+      "grad_norm": 0.020412058664254072,
+      "learning_rate": 7.205736002440272e-05,
+      "loss": 0.4324,
+      "step": 33260
+    },
+    {
+      "epoch": 1.6887842520084781,
+      "grad_norm": 0.020652073502858727,
+      "learning_rate": 7.19428363922286e-05,
+      "loss": 0.4505,
+      "step": 33265
+    },
+    {
+      "epoch": 1.6890380881065097,
+      "grad_norm": 0.020752974694944956,
+      "learning_rate": 7.18283967855411e-05,
+      "loss": 0.4729,
+      "step": 33270
+    },
+    {
+      "epoch": 1.6892919242045412,
+      "grad_norm": 0.026444187207730747,
+      "learning_rate": 7.171404122680391e-05,
+      "loss": 0.4503,
+      "step": 33275
+    },
+    {
+      "epoch": 1.6895457603025728,
+      "grad_norm": 0.024097349847725474,
+      "learning_rate": 7.159976973846466e-05,
+      "loss": 0.4576,
+      "step": 33280
+    },
+    {
+      "epoch": 1.6897995964006043,
+      "grad_norm": 0.02500842430792348,
+      "learning_rate": 7.14855823429541e-05,
+      "loss": 0.4684,
+      "step": 33285
+    },
+    {
+      "epoch": 1.6900534324986356,
+      "grad_norm": 0.022174452172228635,
+      "learning_rate": 7.137147906268682e-05,
+      "loss": 0.4605,
+      "step": 33290
+    },
+    {
+      "epoch": 1.6903072685966671,
+      "grad_norm": 0.02242173750304481,
+      "learning_rate": 7.125745992006044e-05,
+      "loss": 0.4677,
+      "step": 33295
+    },
+    {
+      "epoch": 1.6905611046946987,
+      "grad_norm": 0.023083145288370695,
+      "learning_rate": 7.114352493745674e-05,
+      "loss": 0.4695,
+      "step": 33300
+    },
+    {
+      "epoch": 1.69081494079273,
+      "grad_norm": 0.02381655314266417,
+      "learning_rate": 7.102967413724027e-05,
+      "loss": 0.4685,
+      "step": 33305
+    },
+    {
+      "epoch": 1.6910687768907615,
+      "grad_norm": 0.024080193683219564,
+      "learning_rate": 7.091590754175963e-05,
+      "loss": 0.445,
+      "step": 33310
+    },
+    {
+      "epoch": 1.691322612988793,
+      "grad_norm": 0.021690243710150815,
+      "learning_rate": 7.080222517334639e-05,
+      "loss": 0.4125,
+      "step": 33315
+    },
+    {
+      "epoch": 1.6915764490868246,
+      "grad_norm": 0.021918256209021103,
+      "learning_rate": 7.068862705431601e-05,
+      "loss": 0.4648,
+      "step": 33320
+    },
+    {
+      "epoch": 1.6918302851848561,
+      "grad_norm": 0.022452404320154225,
+      "learning_rate": 7.057511320696708e-05,
+      "loss": 0.4451,
+      "step": 33325
+    },
+    {
+      "epoch": 1.6920841212828877,
+      "grad_norm": 0.020322901723341114,
+      "learning_rate": 7.046168365358202e-05,
+      "loss": 0.4244,
+      "step": 33330
+    },
+    {
+      "epoch": 1.6923379573809192,
+      "grad_norm": 0.026597028609778042,
+      "learning_rate": 7.034833841642624e-05,
+      "loss": 0.4772,
+      "step": 33335
+    },
+    {
+      "epoch": 1.6925917934789507,
+      "grad_norm": 0.02102494226253916,
+      "learning_rate": 7.023507751774905e-05,
+      "loss": 0.4323,
+      "step": 33340
+    },
+    {
+      "epoch": 1.6928456295769823,
+      "grad_norm": 0.021714702487951786,
+      "learning_rate": 7.012190097978282e-05,
+      "loss": 0.449,
+      "step": 33345
+    },
+    {
+      "epoch": 1.6930994656750138,
+      "grad_norm": 0.02158282445558592,
+      "learning_rate": 7.000880882474375e-05,
+      "loss": 0.447,
+      "step": 33350
+    },
+    {
+      "epoch": 1.693353301773045,
+      "grad_norm": 0.02078501840382419,
+      "learning_rate": 6.989580107483102e-05,
+      "loss": 0.452,
+      "step": 33355
+    },
+    {
+      "epoch": 1.6936071378710766,
+      "grad_norm": 0.022296558827429345,
+      "learning_rate": 6.978287775222758e-05,
+      "loss": 0.4745,
+      "step": 33360
+    },
+    {
+      "epoch": 1.6938609739691082,
+      "grad_norm": 0.022424807973195095,
+      "learning_rate": 6.967003887909989e-05,
+      "loss": 0.4309,
+      "step": 33365
+    },
+    {
+      "epoch": 1.6941148100671395,
+      "grad_norm": 0.022647524848265772,
+      "learning_rate": 6.95572844775974e-05,
+      "loss": 0.4513,
+      "step": 33370
+    },
+    {
+      "epoch": 1.694368646165171,
+      "grad_norm": 0.02604844884193192,
+      "learning_rate": 6.944461456985346e-05,
+      "loss": 0.4496,
+      "step": 33375
+    },
+    {
+      "epoch": 1.6946224822632026,
+      "grad_norm": 0.02231782508277886,
+      "learning_rate": 6.933202917798443e-05,
+      "loss": 0.4345,
+      "step": 33380
+    },
+    {
+      "epoch": 1.694876318361234,
+      "grad_norm": 0.027331402316067194,
+      "learning_rate": 6.92195283240904e-05,
+      "loss": 0.452,
+      "step": 33385
+    },
+    {
+      "epoch": 1.6951301544592656,
+      "grad_norm": 0.02244181112322746,
+      "learning_rate": 6.910711203025455e-05,
+      "loss": 0.4548,
+      "step": 33390
+    },
+    {
+      "epoch": 1.6953839905572972,
+      "grad_norm": 0.031539164754729246,
+      "learning_rate": 6.89947803185439e-05,
+      "loss": 0.4876,
+      "step": 33395
+    },
+    {
+      "epoch": 1.6956378266553287,
+      "grad_norm": 0.029750827597486304,
+      "learning_rate": 6.888253321100829e-05,
+      "loss": 0.4631,
+      "step": 33400
+    },
+    {
+      "epoch": 1.6958916627533602,
+      "grad_norm": 0.02054220426689229,
+      "learning_rate": 6.877037072968157e-05,
+      "loss": 0.433,
+      "step": 33405
+    },
+    {
+      "epoch": 1.6961454988513918,
+      "grad_norm": 0.022921757508354648,
+      "learning_rate": 6.865829289658044e-05,
+      "loss": 0.4793,
+      "step": 33410
+    },
+    {
+      "epoch": 1.6963993349494233,
+      "grad_norm": 0.02272022241679228,
+      "learning_rate": 6.85462997337053e-05,
+      "loss": 0.4618,
+      "step": 33415
+    },
+    {
+      "epoch": 1.6966531710474546,
+      "grad_norm": 0.021335577126739804,
+      "learning_rate": 6.843439126303985e-05,
+      "loss": 0.4481,
+      "step": 33420
+    },
+    {
+      "epoch": 1.6969070071454861,
+      "grad_norm": 0.024313607070824157,
+      "learning_rate": 6.83225675065513e-05,
+      "loss": 0.4399,
+      "step": 33425
+    },
+    {
+      "epoch": 1.6971608432435177,
+      "grad_norm": 0.02095148113976031,
+      "learning_rate": 6.821082848618988e-05,
+      "loss": 0.4599,
+      "step": 33430
+    },
+    {
+      "epoch": 1.6974146793415492,
+      "grad_norm": 0.023138372142731362,
+      "learning_rate": 6.809917422388961e-05,
+      "loss": 0.4756,
+      "step": 33435
+    },
+    {
+      "epoch": 1.6976685154395805,
+      "grad_norm": 0.022979590715020475,
+      "learning_rate": 6.798760474156745e-05,
+      "loss": 0.4845,
+      "step": 33440
+    },
+    {
+      "epoch": 1.697922351537612,
+      "grad_norm": 0.02503552519860586,
+      "learning_rate": 6.787612006112409e-05,
+      "loss": 0.4802,
+      "step": 33445
+    },
+    {
+      "epoch": 1.6981761876356436,
+      "grad_norm": 0.02551716298674583,
+      "learning_rate": 6.77647202044433e-05,
+      "loss": 0.4388,
+      "step": 33450
+    },
+    {
+      "epoch": 1.6984300237336751,
+      "grad_norm": 0.02430410214676648,
+      "learning_rate": 6.765340519339252e-05,
+      "loss": 0.4553,
+      "step": 33455
+    },
+    {
+      "epoch": 1.6986838598317067,
+      "grad_norm": 0.02781257765021097,
+      "learning_rate": 6.754217504982202e-05,
+      "loss": 0.4474,
+      "step": 33460
+    },
+    {
+      "epoch": 1.6989376959297382,
+      "grad_norm": 0.021901755276533954,
+      "learning_rate": 6.743102979556604e-05,
+      "loss": 0.4726,
+      "step": 33465
+    },
+    {
+      "epoch": 1.6991915320277697,
+      "grad_norm": 0.02314556208967541,
+      "learning_rate": 6.731996945244162e-05,
+      "loss": 0.435,
+      "step": 33470
+    },
+    {
+      "epoch": 1.6994453681258013,
+      "grad_norm": 0.0216630417947033,
+      "learning_rate": 6.720899404224934e-05,
+      "loss": 0.4521,
+      "step": 33475
+    },
+    {
+      "epoch": 1.6996992042238328,
+      "grad_norm": 0.02078810198958909,
+      "learning_rate": 6.709810358677337e-05,
+      "loss": 0.4387,
+      "step": 33480
+    },
+    {
+      "epoch": 1.6999530403218641,
+      "grad_norm": 0.021858124288052016,
+      "learning_rate": 6.698729810778065e-05,
+      "loss": 0.4292,
+      "step": 33485
+    },
+    {
+      "epoch": 1.7002068764198957,
+      "grad_norm": 0.02220940183114807,
+      "learning_rate": 6.687657762702203e-05,
+      "loss": 0.4541,
+      "step": 33490
+    },
+    {
+      "epoch": 1.7004607125179272,
+      "grad_norm": 0.021898318644871732,
+      "learning_rate": 6.67659421662311e-05,
+      "loss": 0.4709,
+      "step": 33495
+    },
+    {
+      "epoch": 1.7007145486159587,
+      "grad_norm": 0.019784763924519774,
+      "learning_rate": 6.665539174712532e-05,
+      "loss": 0.4626,
+      "step": 33500
+    },
+    {
+      "epoch": 1.70096838471399,
+      "grad_norm": 0.022919389047645698,
+      "learning_rate": 6.654492639140492e-05,
+      "loss": 0.4507,
+      "step": 33505
+    },
+    {
+      "epoch": 1.7012222208120216,
+      "grad_norm": 0.02271121221914657,
+      "learning_rate": 6.643454612075395e-05,
+      "loss": 0.4652,
+      "step": 33510
+    },
+    {
+      "epoch": 1.701476056910053,
+      "grad_norm": 0.022198266733550638,
+      "learning_rate": 6.632425095683925e-05,
+      "loss": 0.4125,
+      "step": 33515
+    },
+    {
+      "epoch": 1.7017298930080846,
+      "grad_norm": 0.023454135381851583,
+      "learning_rate": 6.62140409213115e-05,
+      "loss": 0.4537,
+      "step": 33520
+    },
+    {
+      "epoch": 1.7019837291061162,
+      "grad_norm": 0.02311814200716394,
+      "learning_rate": 6.610391603580412e-05,
+      "loss": 0.4415,
+      "step": 33525
+    },
+    {
+      "epoch": 1.7022375652041477,
+      "grad_norm": 0.02338916064340492,
+      "learning_rate": 6.599387632193426e-05,
+      "loss": 0.468,
+      "step": 33530
+    },
+    {
+      "epoch": 1.7024914013021792,
+      "grad_norm": 0.022293680542931123,
+      "learning_rate": 6.588392180130198e-05,
+      "loss": 0.4832,
+      "step": 33535
+    },
+    {
+      "epoch": 1.7027452374002108,
+      "grad_norm": 0.02228526819113989,
+      "learning_rate": 6.577405249549096e-05,
+      "loss": 0.4406,
+      "step": 33540
+    },
+    {
+      "epoch": 1.7029990734982423,
+      "grad_norm": 0.02669841047779239,
+      "learning_rate": 6.566426842606793e-05,
+      "loss": 0.4683,
+      "step": 33545
+    },
+    {
+      "epoch": 1.7032529095962738,
+      "grad_norm": 0.020687449495207438,
+      "learning_rate": 6.555456961458311e-05,
+      "loss": 0.4788,
+      "step": 33550
+    },
+    {
+      "epoch": 1.7035067456943052,
+      "grad_norm": 0.02316737056094181,
+      "learning_rate": 6.544495608256957e-05,
+      "loss": 0.4479,
+      "step": 33555
+    },
+    {
+      "epoch": 1.7037605817923367,
+      "grad_norm": 0.02015658685414917,
+      "learning_rate": 6.533542785154412e-05,
+      "loss": 0.4675,
+      "step": 33560
+    },
+    {
+      "epoch": 1.7040144178903682,
+      "grad_norm": 0.024630709903064985,
+      "learning_rate": 6.522598494300647e-05,
+      "loss": 0.4612,
+      "step": 33565
+    },
+    {
+      "epoch": 1.7042682539883995,
+      "grad_norm": 0.021211217998291603,
+      "learning_rate": 6.511662737843981e-05,
+      "loss": 0.4552,
+      "step": 33570
+    },
+    {
+      "epoch": 1.704522090086431,
+      "grad_norm": 0.022897383797772494,
+      "learning_rate": 6.500735517931033e-05,
+      "loss": 0.4553,
+      "step": 33575
+    },
+    {
+      "epoch": 1.7047759261844626,
+      "grad_norm": 0.021403658562759068,
+      "learning_rate": 6.489816836706786e-05,
+      "loss": 0.4417,
+      "step": 33580
+    },
+    {
+      "epoch": 1.7050297622824941,
+      "grad_norm": 0.024079482108871682,
+      "learning_rate": 6.478906696314496e-05,
+      "loss": 0.4456,
+      "step": 33585
+    },
+    {
+      "epoch": 1.7052835983805257,
+      "grad_norm": 0.02474012090426629,
+      "learning_rate": 6.468005098895797e-05,
+      "loss": 0.4528,
+      "step": 33590
+    },
+    {
+      "epoch": 1.7055374344785572,
+      "grad_norm": 0.025738626621193236,
+      "learning_rate": 6.457112046590585e-05,
+      "loss": 0.5034,
+      "step": 33595
+    },
+    {
+      "epoch": 1.7057912705765887,
+      "grad_norm": 0.0219847722352076,
+      "learning_rate": 6.446227541537136e-05,
+      "loss": 0.4519,
+      "step": 33600
+    },
+    {
+      "epoch": 1.7060451066746203,
+      "grad_norm": 0.023838238873203427,
+      "learning_rate": 6.43535158587203e-05,
+      "loss": 0.4685,
+      "step": 33605
+    },
+    {
+      "epoch": 1.7062989427726518,
+      "grad_norm": 0.02133132469936017,
+      "learning_rate": 6.424484181730134e-05,
+      "loss": 0.4578,
+      "step": 33610
+    },
+    {
+      "epoch": 1.7065527788706834,
+      "grad_norm": 0.020615296500899355,
+      "learning_rate": 6.413625331244698e-05,
+      "loss": 0.4472,
+      "step": 33615
+    },
+    {
+      "epoch": 1.7068066149687147,
+      "grad_norm": 0.02205130832162417,
+      "learning_rate": 6.402775036547231e-05,
+      "loss": 0.4635,
+      "step": 33620
+    },
+    {
+      "epoch": 1.7070604510667462,
+      "grad_norm": 0.022030504089429213,
+      "learning_rate": 6.391933299767622e-05,
+      "loss": 0.4566,
+      "step": 33625
+    },
+    {
+      "epoch": 1.7073142871647777,
+      "grad_norm": 0.026096405850323693,
+      "learning_rate": 6.381100123034017e-05,
+      "loss": 0.4668,
+      "step": 33630
+    },
+    {
+      "epoch": 1.707568123262809,
+      "grad_norm": 0.021626998107609114,
+      "learning_rate": 6.370275508472945e-05,
+      "loss": 0.4256,
+      "step": 33635
+    },
+    {
+      "epoch": 1.7078219593608406,
+      "grad_norm": 0.020387795966162997,
+      "learning_rate": 6.359459458209194e-05,
+      "loss": 0.4587,
+      "step": 33640
+    },
+    {
+      "epoch": 1.7080757954588721,
+      "grad_norm": 0.03128743884752836,
+      "learning_rate": 6.348651974365932e-05,
+      "loss": 0.4348,
+      "step": 33645
+    },
+    {
+      "epoch": 1.7083296315569036,
+      "grad_norm": 0.022169939171472575,
+      "learning_rate": 6.337853059064586e-05,
+      "loss": 0.4487,
+      "step": 33650
+    },
+    {
+      "epoch": 1.7085834676549352,
+      "grad_norm": 0.021740231589529504,
+      "learning_rate": 6.327062714424946e-05,
+      "loss": 0.4785,
+      "step": 33655
+    },
+    {
+      "epoch": 1.7088373037529667,
+      "grad_norm": 0.023768265517867246,
+      "learning_rate": 6.31628094256509e-05,
+      "loss": 0.4787,
+      "step": 33660
+    },
+    {
+      "epoch": 1.7090911398509983,
+      "grad_norm": 0.024960047338923673,
+      "learning_rate": 6.305507745601446e-05,
+      "loss": 0.4519,
+      "step": 33665
+    },
+    {
+      "epoch": 1.7093449759490298,
+      "grad_norm": 0.02652925712341185,
+      "learning_rate": 6.294743125648722e-05,
+      "loss": 0.432,
+      "step": 33670
+    },
+    {
+      "epoch": 1.7095988120470613,
+      "grad_norm": 0.021458655669676602,
+      "learning_rate": 6.28398708481997e-05,
+      "loss": 0.4734,
+      "step": 33675
+    },
+    {
+      "epoch": 1.7098526481450929,
+      "grad_norm": 0.02287235529136257,
+      "learning_rate": 6.273239625226534e-05,
+      "loss": 0.4538,
+      "step": 33680
+    },
+    {
+      "epoch": 1.7101064842431242,
+      "grad_norm": 0.02228112960841006,
+      "learning_rate": 6.262500748978106e-05,
+      "loss": 0.4456,
+      "step": 33685
+    },
+    {
+      "epoch": 1.7103603203411557,
+      "grad_norm": 0.021753237335259395,
+      "learning_rate": 6.251770458182654e-05,
+      "loss": 0.4251,
+      "step": 33690
+    },
+    {
+      "epoch": 1.7106141564391872,
+      "grad_norm": 0.02618127618582834,
+      "learning_rate": 6.241048754946493e-05,
+      "loss": 0.4458,
+      "step": 33695
+    },
+    {
+      "epoch": 1.7108679925372186,
+      "grad_norm": 0.022092349146015322,
+      "learning_rate": 6.23033564137423e-05,
+      "loss": 0.4643,
+      "step": 33700
+    },
+    {
+      "epoch": 1.71112182863525,
+      "grad_norm": 0.02659549264775889,
+      "learning_rate": 6.219631119568814e-05,
+      "loss": 0.4926,
+      "step": 33705
+    },
+    {
+      "epoch": 1.7113756647332816,
+      "grad_norm": 0.021634033709659647,
+      "learning_rate": 6.208935191631465e-05,
+      "loss": 0.4538,
+      "step": 33710
+    },
+    {
+      "epoch": 1.7116295008313132,
+      "grad_norm": 0.020836889426777406,
+      "learning_rate": 6.19824785966176e-05,
+      "loss": 0.4703,
+      "step": 33715
+    },
+    {
+      "epoch": 1.7118833369293447,
+      "grad_norm": 0.01971938523489474,
+      "learning_rate": 6.187569125757553e-05,
+      "loss": 0.4523,
+      "step": 33720
+    },
+    {
+      "epoch": 1.7121371730273762,
+      "grad_norm": 0.021843066134417412,
+      "learning_rate": 6.176898992015034e-05,
+      "loss": 0.4278,
+      "step": 33725
+    },
+    {
+      "epoch": 1.7123910091254078,
+      "grad_norm": 0.020570604910276645,
+      "learning_rate": 6.166237460528706e-05,
+      "loss": 0.4728,
+      "step": 33730
+    },
+    {
+      "epoch": 1.7126448452234393,
+      "grad_norm": 0.019151004234894124,
+      "learning_rate": 6.155584533391356e-05,
+      "loss": 0.4478,
+      "step": 33735
+    },
+    {
+      "epoch": 1.7128986813214708,
+      "grad_norm": 0.020268446809525022,
+      "learning_rate": 6.144940212694122e-05,
+      "loss": 0.4503,
+      "step": 33740
+    },
+    {
+      "epoch": 1.7131525174195024,
+      "grad_norm": 0.024437006226734212,
+      "learning_rate": 6.134304500526411e-05,
+      "loss": 0.4756,
+      "step": 33745
+    },
+    {
+      "epoch": 1.7134063535175337,
+      "grad_norm": 0.02201855727722671,
+      "learning_rate": 6.123677398975974e-05,
+      "loss": 0.4507,
+      "step": 33750
+    },
+    {
+      "epoch": 1.7136601896155652,
+      "grad_norm": 0.021612750422246748,
+      "learning_rate": 6.11305891012885e-05,
+      "loss": 0.4466,
+      "step": 33755
+    },
+    {
+      "epoch": 1.7139140257135967,
+      "grad_norm": 0.02317623568444332,
+      "learning_rate": 6.1024490360694016e-05,
+      "loss": 0.4361,
+      "step": 33760
+    },
+    {
+      "epoch": 1.7141678618116283,
+      "grad_norm": 0.022813604709884243,
+      "learning_rate": 6.091847778880283e-05,
+      "loss": 0.4513,
+      "step": 33765
+    },
+    {
+      "epoch": 1.7144216979096596,
+      "grad_norm": 0.02326852728426279,
+      "learning_rate": 6.081255140642483e-05,
+      "loss": 0.4761,
+      "step": 33770
+    },
+    {
+      "epoch": 1.7146755340076911,
+      "grad_norm": 0.02386406329427358,
+      "learning_rate": 6.0706711234352674e-05,
+      "loss": 0.4465,
+      "step": 33775
+    },
+    {
+      "epoch": 1.7149293701057227,
+      "grad_norm": 0.022526987576066877,
+      "learning_rate": 6.06009572933624e-05,
+      "loss": 0.4818,
+      "step": 33780
+    },
+    {
+      "epoch": 1.7151832062037542,
+      "grad_norm": 0.024915335276817413,
+      "learning_rate": 6.0495289604212853e-05,
+      "loss": 0.4499,
+      "step": 33785
+    },
+    {
+      "epoch": 1.7154370423017857,
+      "grad_norm": 0.02046552164736613,
+      "learning_rate": 6.038970818764633e-05,
+      "loss": 0.4566,
+      "step": 33790
+    },
+    {
+      "epoch": 1.7156908783998173,
+      "grad_norm": 0.02288153652110328,
+      "learning_rate": 6.0284213064387586e-05,
+      "loss": 0.4612,
+      "step": 33795
+    },
+    {
+      "epoch": 1.7159447144978488,
+      "grad_norm": 0.02442365282755549,
+      "learning_rate": 6.0178804255145106e-05,
+      "loss": 0.4732,
+      "step": 33800
+    },
+    {
+      "epoch": 1.7161985505958803,
+      "grad_norm": 0.025400195295964864,
+      "learning_rate": 6.007348178060984e-05,
+      "loss": 0.4789,
+      "step": 33805
+    },
+    {
+      "epoch": 1.7164523866939119,
+      "grad_norm": 0.021023893448474346,
+      "learning_rate": 5.996824566145631e-05,
+      "loss": 0.454,
+      "step": 33810
+    },
+    {
+      "epoch": 1.7167062227919434,
+      "grad_norm": 0.02088473207737844,
+      "learning_rate": 5.98630959183416e-05,
+      "loss": 0.4452,
+      "step": 33815
+    },
+    {
+      "epoch": 1.7169600588899747,
+      "grad_norm": 0.021319092582090367,
+      "learning_rate": 5.975803257190632e-05,
+      "loss": 0.452,
+      "step": 33820
+    },
+    {
+      "epoch": 1.7172138949880063,
+      "grad_norm": 0.02336522341312864,
+      "learning_rate": 5.965305564277368e-05,
+      "loss": 0.4185,
+      "step": 33825
+    },
+    {
+      "epoch": 1.7174677310860378,
+      "grad_norm": 0.02176603784713292,
+      "learning_rate": 5.954816515155026e-05,
+      "loss": 0.4783,
+      "step": 33830
+    },
+    {
+      "epoch": 1.717721567184069,
+      "grad_norm": 0.021689798405552758,
+      "learning_rate": 5.944336111882542e-05,
+      "loss": 0.4443,
+      "step": 33835
+    },
+    {
+      "epoch": 1.7179754032821006,
+      "grad_norm": 0.021388359751042172,
+      "learning_rate": 5.933864356517177e-05,
+      "loss": 0.4269,
+      "step": 33840
+    },
+    {
+      "epoch": 1.7182292393801322,
+      "grad_norm": 0.021256453398072323,
+      "learning_rate": 5.923401251114485e-05,
+      "loss": 0.4655,
+      "step": 33845
+    },
+    {
+      "epoch": 1.7184830754781637,
+      "grad_norm": 0.02151794556558625,
+      "learning_rate": 5.9129467977283135e-05,
+      "loss": 0.4529,
+      "step": 33850
+    },
+    {
+      "epoch": 1.7187369115761952,
+      "grad_norm": 0.029363161811987813,
+      "learning_rate": 5.902500998410831e-05,
+      "loss": 0.4275,
+      "step": 33855
+    },
+    {
+      "epoch": 1.7189907476742268,
+      "grad_norm": 0.02440315247691098,
+      "learning_rate": 5.892063855212476e-05,
+      "loss": 0.4658,
+      "step": 33860
+    },
+    {
+      "epoch": 1.7192445837722583,
+      "grad_norm": 0.018975284982618558,
+      "learning_rate": 5.881635370182037e-05,
+      "loss": 0.4563,
+      "step": 33865
+    },
+    {
+      "epoch": 1.7194984198702898,
+      "grad_norm": 0.021764078280474785,
+      "learning_rate": 5.8712155453665426e-05,
+      "loss": 0.4476,
+      "step": 33870
+    },
+    {
+      "epoch": 1.7197522559683214,
+      "grad_norm": 0.020822416739688775,
+      "learning_rate": 5.8608043828113744e-05,
+      "loss": 0.45,
+      "step": 33875
+    },
+    {
+      "epoch": 1.720006092066353,
+      "grad_norm": 0.022089565418501236,
+      "learning_rate": 5.8504018845601804e-05,
+      "loss": 0.4551,
+      "step": 33880
+    },
+    {
+      "epoch": 1.7202599281643842,
+      "grad_norm": 0.02359465062305091,
+      "learning_rate": 5.840008052654927e-05,
+      "loss": 0.4634,
+      "step": 33885
+    },
+    {
+      "epoch": 1.7205137642624158,
+      "grad_norm": 0.029544919593911603,
+      "learning_rate": 5.8296228891358604e-05,
+      "loss": 0.4532,
+      "step": 33890
+    },
+    {
+      "epoch": 1.7207676003604473,
+      "grad_norm": 0.026839978594084928,
+      "learning_rate": 5.81924639604155e-05,
+      "loss": 0.4486,
+      "step": 33895
+    },
+    {
+      "epoch": 1.7210214364584786,
+      "grad_norm": 0.02088653909861625,
+      "learning_rate": 5.808878575408827e-05,
+      "loss": 0.4447,
+      "step": 33900
+    },
+    {
+      "epoch": 1.7212752725565101,
+      "grad_norm": 0.022797571162386343,
+      "learning_rate": 5.798519429272875e-05,
+      "loss": 0.4544,
+      "step": 33905
+    },
+    {
+      "epoch": 1.7215291086545417,
+      "grad_norm": 0.023109589614580234,
+      "learning_rate": 5.7881689596671226e-05,
+      "loss": 0.4803,
+      "step": 33910
+    },
+    {
+      "epoch": 1.7217829447525732,
+      "grad_norm": 0.020498287608990536,
+      "learning_rate": 5.777827168623323e-05,
+      "loss": 0.4638,
+      "step": 33915
+    },
+    {
+      "epoch": 1.7220367808506047,
+      "grad_norm": 0.021809212367703504,
+      "learning_rate": 5.767494058171507e-05,
+      "loss": 0.4412,
+      "step": 33920
+    },
+    {
+      "epoch": 1.7222906169486363,
+      "grad_norm": 0.01961469054428933,
+      "learning_rate": 5.757169630340031e-05,
+      "loss": 0.4349,
+      "step": 33925
+    },
+    {
+      "epoch": 1.7225444530466678,
+      "grad_norm": 0.021210553875649015,
+      "learning_rate": 5.7468538871555064e-05,
+      "loss": 0.4451,
+      "step": 33930
+    },
+    {
+      "epoch": 1.7227982891446993,
+      "grad_norm": 0.023689032069760745,
+      "learning_rate": 5.736546830642886e-05,
+      "loss": 0.4785,
+      "step": 33935
+    },
+    {
+      "epoch": 1.7230521252427309,
+      "grad_norm": 0.022017968208859488,
+      "learning_rate": 5.726248462825373e-05,
+      "loss": 0.4336,
+      "step": 33940
+    },
+    {
+      "epoch": 1.7233059613407624,
+      "grad_norm": 0.021437511540021992,
+      "learning_rate": 5.715958785724501e-05,
+      "loss": 0.4562,
+      "step": 33945
+    },
+    {
+      "epoch": 1.7235597974387937,
+      "grad_norm": 0.021235793273451938,
+      "learning_rate": 5.705677801360065e-05,
+      "loss": 0.4432,
+      "step": 33950
+    },
+    {
+      "epoch": 1.7238136335368253,
+      "grad_norm": 0.02204282859521283,
+      "learning_rate": 5.69540551175019e-05,
+      "loss": 0.4808,
+      "step": 33955
+    },
+    {
+      "epoch": 1.7240674696348568,
+      "grad_norm": 0.022958015134989514,
+      "learning_rate": 5.6851419189112575e-05,
+      "loss": 0.4472,
+      "step": 33960
+    },
+    {
+      "epoch": 1.724321305732888,
+      "grad_norm": 0.022808401192192258,
+      "learning_rate": 5.6748870248579666e-05,
+      "loss": 0.4495,
+      "step": 33965
+    },
+    {
+      "epoch": 1.7245751418309196,
+      "grad_norm": 0.02509495051395949,
+      "learning_rate": 5.6646408316033185e-05,
+      "loss": 0.4662,
+      "step": 33970
+    },
+    {
+      "epoch": 1.7248289779289512,
+      "grad_norm": 0.022427454109016656,
+      "learning_rate": 5.654403341158565e-05,
+      "loss": 0.4887,
+      "step": 33975
+    },
+    {
+      "epoch": 1.7250828140269827,
+      "grad_norm": 0.02867990044631863,
+      "learning_rate": 5.644174555533288e-05,
+      "loss": 0.4325,
+      "step": 33980
+    },
+    {
+      "epoch": 1.7253366501250142,
+      "grad_norm": 0.02527782595636069,
+      "learning_rate": 5.633954476735337e-05,
+      "loss": 0.455,
+      "step": 33985
+    },
+    {
+      "epoch": 1.7255904862230458,
+      "grad_norm": 0.02978500394864718,
+      "learning_rate": 5.623743106770879e-05,
+      "loss": 0.4693,
+      "step": 33990
+    },
+    {
+      "epoch": 1.7258443223210773,
+      "grad_norm": 0.021989421251067075,
+      "learning_rate": 5.6135404476443384e-05,
+      "loss": 0.4501,
+      "step": 33995
+    },
+    {
+      "epoch": 1.7260981584191089,
+      "grad_norm": 0.02414972943152398,
+      "learning_rate": 5.603346501358458e-05,
+      "loss": 0.5015,
+      "step": 34000
+    },
+    {
+      "epoch": 1.7263519945171404,
+      "grad_norm": 0.020959704894745596,
+      "learning_rate": 5.593161269914249e-05,
+      "loss": 0.4601,
+      "step": 34005
+    },
+    {
+      "epoch": 1.726605830615172,
+      "grad_norm": 0.01883199147720315,
+      "learning_rate": 5.5829847553110326e-05,
+      "loss": 0.4297,
+      "step": 34010
+    },
+    {
+      "epoch": 1.7268596667132032,
+      "grad_norm": 0.02135197011443074,
+      "learning_rate": 5.572816959546389e-05,
+      "loss": 0.4358,
+      "step": 34015
+    },
+    {
+      "epoch": 1.7271135028112348,
+      "grad_norm": 0.02107503468536846,
+      "learning_rate": 5.562657884616223e-05,
+      "loss": 0.4669,
+      "step": 34020
+    },
+    {
+      "epoch": 1.7273673389092663,
+      "grad_norm": 0.023771596018301224,
+      "learning_rate": 5.5525075325147054e-05,
+      "loss": 0.4587,
+      "step": 34025
+    },
+    {
+      "epoch": 1.7276211750072978,
+      "grad_norm": 0.021587351694255002,
+      "learning_rate": 5.542365905234309e-05,
+      "loss": 0.4481,
+      "step": 34030
+    },
+    {
+      "epoch": 1.7278750111053292,
+      "grad_norm": 0.020770474747904666,
+      "learning_rate": 5.532233004765763e-05,
+      "loss": 0.4439,
+      "step": 34035
+    },
+    {
+      "epoch": 1.7281288472033607,
+      "grad_norm": 0.01967096579673539,
+      "learning_rate": 5.5221088330981274e-05,
+      "loss": 0.4629,
+      "step": 34040
+    },
+    {
+      "epoch": 1.7283826833013922,
+      "grad_norm": 0.02081313356039374,
+      "learning_rate": 5.5119933922187115e-05,
+      "loss": 0.4447,
+      "step": 34045
+    },
+    {
+      "epoch": 1.7286365193994238,
+      "grad_norm": 0.030103240651800995,
+      "learning_rate": 5.501886684113139e-05,
+      "loss": 0.448,
+      "step": 34050
+    },
+    {
+      "epoch": 1.7288903554974553,
+      "grad_norm": 0.021992967904753014,
+      "learning_rate": 5.491788710765289e-05,
+      "loss": 0.4657,
+      "step": 34055
+    },
+    {
+      "epoch": 1.7291441915954868,
+      "grad_norm": 0.024640038006979705,
+      "learning_rate": 5.481699474157364e-05,
+      "loss": 0.4678,
+      "step": 34060
+    },
+    {
+      "epoch": 1.7293980276935184,
+      "grad_norm": 0.02259317629920808,
+      "learning_rate": 5.4716189762698044e-05,
+      "loss": 0.4526,
+      "step": 34065
+    },
+    {
+      "epoch": 1.72965186379155,
+      "grad_norm": 0.021047781618106692,
+      "learning_rate": 5.461547219081392e-05,
+      "loss": 0.4498,
+      "step": 34070
+    },
+    {
+      "epoch": 1.7299056998895814,
+      "grad_norm": 0.024835964339130236,
+      "learning_rate": 5.4514842045691346e-05,
+      "loss": 0.4491,
+      "step": 34075
+    },
+    {
+      "epoch": 1.730159535987613,
+      "grad_norm": 0.023902080014017398,
+      "learning_rate": 5.441429934708369e-05,
+      "loss": 0.4293,
+      "step": 34080
+    },
+    {
+      "epoch": 1.7304133720856443,
+      "grad_norm": 0.022346544604734297,
+      "learning_rate": 5.431384411472701e-05,
+      "loss": 0.475,
+      "step": 34085
+    },
+    {
+      "epoch": 1.7306672081836758,
+      "grad_norm": 0.022715026835179515,
+      "learning_rate": 5.421347636834001e-05,
+      "loss": 0.4446,
+      "step": 34090
+    },
+    {
+      "epoch": 1.7309210442817073,
+      "grad_norm": 0.022933158827406614,
+      "learning_rate": 5.411319612762455e-05,
+      "loss": 0.4259,
+      "step": 34095
+    },
+    {
+      "epoch": 1.7311748803797387,
+      "grad_norm": 0.024421520760028334,
+      "learning_rate": 5.4013003412265004e-05,
+      "loss": 0.4691,
+      "step": 34100
+    },
+    {
+      "epoch": 1.7314287164777702,
+      "grad_norm": 0.02420532978107086,
+      "learning_rate": 5.3912898241928796e-05,
+      "loss": 0.4405,
+      "step": 34105
+    },
+    {
+      "epoch": 1.7316825525758017,
+      "grad_norm": 0.02170211562927459,
+      "learning_rate": 5.3812880636265935e-05,
+      "loss": 0.4729,
+      "step": 34110
+    },
+    {
+      "epoch": 1.7319363886738333,
+      "grad_norm": 0.02370624185573575,
+      "learning_rate": 5.371295061490961e-05,
+      "loss": 0.4741,
+      "step": 34115
+    },
+    {
+      "epoch": 1.7321902247718648,
+      "grad_norm": 0.021244344362853385,
+      "learning_rate": 5.3613108197475335e-05,
+      "loss": 0.4529,
+      "step": 34120
+    },
+    {
+      "epoch": 1.7324440608698963,
+      "grad_norm": 0.02100785970567331,
+      "learning_rate": 5.3513353403561895e-05,
+      "loss": 0.462,
+      "step": 34125
+    },
+    {
+      "epoch": 1.7326978969679279,
+      "grad_norm": 0.019808061436518504,
+      "learning_rate": 5.3413686252750445e-05,
+      "loss": 0.4525,
+      "step": 34130
+    },
+    {
+      "epoch": 1.7329517330659594,
+      "grad_norm": 0.02187199410993585,
+      "learning_rate": 5.3314106764605354e-05,
+      "loss": 0.4645,
+      "step": 34135
+    },
+    {
+      "epoch": 1.733205569163991,
+      "grad_norm": 0.024006944977098717,
+      "learning_rate": 5.32146149586733e-05,
+      "loss": 0.4742,
+      "step": 34140
+    },
+    {
+      "epoch": 1.7334594052620225,
+      "grad_norm": 0.019976476111784883,
+      "learning_rate": 5.3115210854484394e-05,
+      "loss": 0.4409,
+      "step": 34145
+    },
+    {
+      "epoch": 1.7337132413600538,
+      "grad_norm": 0.026385558195068118,
+      "learning_rate": 5.301589447155092e-05,
+      "loss": 0.4456,
+      "step": 34150
+    },
+    {
+      "epoch": 1.7339670774580853,
+      "grad_norm": 0.021897238374891215,
+      "learning_rate": 5.2916665829368324e-05,
+      "loss": 0.4438,
+      "step": 34155
+    },
+    {
+      "epoch": 1.7342209135561168,
+      "grad_norm": 0.024811420195424226,
+      "learning_rate": 5.281752494741454e-05,
+      "loss": 0.4627,
+      "step": 34160
+    },
+    {
+      "epoch": 1.7344747496541482,
+      "grad_norm": 0.024672823583302583,
+      "learning_rate": 5.2718471845150604e-05,
+      "loss": 0.4759,
+      "step": 34165
+    },
+    {
+      "epoch": 1.7347285857521797,
+      "grad_norm": 0.02181462463091039,
+      "learning_rate": 5.261950654201997e-05,
+      "loss": 0.4559,
+      "step": 34170
+    },
+    {
+      "epoch": 1.7349824218502112,
+      "grad_norm": 0.024684395145582377,
+      "learning_rate": 5.252062905744926e-05,
+      "loss": 0.4422,
+      "step": 34175
+    },
+    {
+      "epoch": 1.7352362579482428,
+      "grad_norm": 0.02230800553146116,
+      "learning_rate": 5.2421839410847436e-05,
+      "loss": 0.4637,
+      "step": 34180
+    },
+    {
+      "epoch": 1.7354900940462743,
+      "grad_norm": 0.023147228626355657,
+      "learning_rate": 5.2323137621606345e-05,
+      "loss": 0.456,
+      "step": 34185
+    },
+    {
+      "epoch": 1.7357439301443058,
+      "grad_norm": 0.02281138844618608,
+      "learning_rate": 5.2224523709100914e-05,
+      "loss": 0.4666,
+      "step": 34190
+    },
+    {
+      "epoch": 1.7359977662423374,
+      "grad_norm": 0.022744297929036067,
+      "learning_rate": 5.212599769268833e-05,
+      "loss": 0.4577,
+      "step": 34195
+    },
+    {
+      "epoch": 1.736251602340369,
+      "grad_norm": 0.026435757976267977,
+      "learning_rate": 5.202755959170885e-05,
+      "loss": 0.4792,
+      "step": 34200
+    },
+    {
+      "epoch": 1.7365054384384004,
+      "grad_norm": 0.02327427177805453,
+      "learning_rate": 5.1929209425485346e-05,
+      "loss": 0.454,
+      "step": 34205
+    },
+    {
+      "epoch": 1.736759274536432,
+      "grad_norm": 0.019262280027333455,
+      "learning_rate": 5.1830947213323656e-05,
+      "loss": 0.4344,
+      "step": 34210
+    },
+    {
+      "epoch": 1.7370131106344633,
+      "grad_norm": 0.02020587242748104,
+      "learning_rate": 5.17327729745119e-05,
+      "loss": 0.4687,
+      "step": 34215
+    },
+    {
+      "epoch": 1.7372669467324948,
+      "grad_norm": 0.022404415922131816,
+      "learning_rate": 5.163468672832139e-05,
+      "loss": 0.444,
+      "step": 34220
+    },
+    {
+      "epoch": 1.7375207828305264,
+      "grad_norm": 0.022425367350743124,
+      "learning_rate": 5.1536688494005835e-05,
+      "loss": 0.4682,
+      "step": 34225
+    },
+    {
+      "epoch": 1.7377746189285577,
+      "grad_norm": 0.022763870580745405,
+      "learning_rate": 5.14387782908019e-05,
+      "loss": 0.4601,
+      "step": 34230
+    },
+    {
+      "epoch": 1.7380284550265892,
+      "grad_norm": 0.023042118858273286,
+      "learning_rate": 5.134095613792872e-05,
+      "loss": 0.4637,
+      "step": 34235
+    },
+    {
+      "epoch": 1.7382822911246207,
+      "grad_norm": 0.02604482240900869,
+      "learning_rate": 5.124322205458848e-05,
+      "loss": 0.4382,
+      "step": 34240
+    },
+    {
+      "epoch": 1.7385361272226523,
+      "grad_norm": 0.021731629118018583,
+      "learning_rate": 5.1145576059965726e-05,
+      "loss": 0.4654,
+      "step": 34245
+    },
+    {
+      "epoch": 1.7387899633206838,
+      "grad_norm": 0.019413019416783566,
+      "learning_rate": 5.1048018173228015e-05,
+      "loss": 0.4529,
+      "step": 34250
+    },
+    {
+      "epoch": 1.7390437994187153,
+      "grad_norm": 0.02664943630561449,
+      "learning_rate": 5.0950548413525365e-05,
+      "loss": 0.4562,
+      "step": 34255
+    },
+    {
+      "epoch": 1.7392976355167469,
+      "grad_norm": 0.02087198766602968,
+      "learning_rate": 5.085316679999064e-05,
+      "loss": 0.4422,
+      "step": 34260
+    },
+    {
+      "epoch": 1.7395514716147784,
+      "grad_norm": 0.021876909463953986,
+      "learning_rate": 5.075587335173948e-05,
+      "loss": 0.4453,
+      "step": 34265
+    },
+    {
+      "epoch": 1.73980530771281,
+      "grad_norm": 0.02240544876246522,
+      "learning_rate": 5.06586680878699e-05,
+      "loss": 0.4302,
+      "step": 34270
+    },
+    {
+      "epoch": 1.7400591438108415,
+      "grad_norm": 0.020994222583535893,
+      "learning_rate": 5.056155102746302e-05,
+      "loss": 0.4592,
+      "step": 34275
+    },
+    {
+      "epoch": 1.7403129799088728,
+      "grad_norm": 0.022701990879272254,
+      "learning_rate": 5.0464522189582194e-05,
+      "loss": 0.4851,
+      "step": 34280
+    },
+    {
+      "epoch": 1.7405668160069043,
+      "grad_norm": 0.027194326586476105,
+      "learning_rate": 5.036758159327398e-05,
+      "loss": 0.4818,
+      "step": 34285
+    },
+    {
+      "epoch": 1.7408206521049359,
+      "grad_norm": 0.02078698225293263,
+      "learning_rate": 5.027072925756709e-05,
+      "loss": 0.4662,
+      "step": 34290
+    },
+    {
+      "epoch": 1.7410744882029674,
+      "grad_norm": 0.02420724997035512,
+      "learning_rate": 5.017396520147333e-05,
+      "loss": 0.4687,
+      "step": 34295
+    },
+    {
+      "epoch": 1.7413283243009987,
+      "grad_norm": 0.031193300288096932,
+      "learning_rate": 5.007728944398682e-05,
+      "loss": 0.4567,
+      "step": 34300
+    },
+    {
+      "epoch": 1.7415821603990302,
+      "grad_norm": 0.020794031282449752,
+      "learning_rate": 4.9980702004084724e-05,
+      "loss": 0.4529,
+      "step": 34305
+    },
+    {
+      "epoch": 1.7418359964970618,
+      "grad_norm": 0.022257297830772214,
+      "learning_rate": 4.9884202900726486e-05,
+      "loss": 0.4428,
+      "step": 34310
+    },
+    {
+      "epoch": 1.7420898325950933,
+      "grad_norm": 0.019932049025810045,
+      "learning_rate": 4.978779215285456e-05,
+      "loss": 0.4386,
+      "step": 34315
+    },
+    {
+      "epoch": 1.7423436686931248,
+      "grad_norm": 0.026391941731528173,
+      "learning_rate": 4.9691469779393706e-05,
+      "loss": 0.464,
+      "step": 34320
+    },
+    {
+      "epoch": 1.7425975047911564,
+      "grad_norm": 0.022393195253757603,
+      "learning_rate": 4.959523579925179e-05,
+      "loss": 0.4803,
+      "step": 34325
+    },
+    {
+      "epoch": 1.742851340889188,
+      "grad_norm": 0.029090252728671346,
+      "learning_rate": 4.949909023131888e-05,
+      "loss": 0.4668,
+      "step": 34330
+    },
+    {
+      "epoch": 1.7431051769872195,
+      "grad_norm": 0.024897631086516752,
+      "learning_rate": 4.940303309446798e-05,
+      "loss": 0.4674,
+      "step": 34335
+    },
+    {
+      "epoch": 1.743359013085251,
+      "grad_norm": 0.020804078169891622,
+      "learning_rate": 4.9307064407554445e-05,
+      "loss": 0.4664,
+      "step": 34340
+    },
+    {
+      "epoch": 1.7436128491832825,
+      "grad_norm": 0.021704515629220097,
+      "learning_rate": 4.921118418941667e-05,
+      "loss": 0.4632,
+      "step": 34345
+    },
+    {
+      "epoch": 1.7438666852813138,
+      "grad_norm": 0.024256706115496882,
+      "learning_rate": 4.911539245887525e-05,
+      "loss": 0.4219,
+      "step": 34350
+    },
+    {
+      "epoch": 1.7441205213793454,
+      "grad_norm": 0.02298081741126669,
+      "learning_rate": 4.901968923473382e-05,
+      "loss": 0.4591,
+      "step": 34355
+    },
+    {
+      "epoch": 1.744374357477377,
+      "grad_norm": 0.03464863168116285,
+      "learning_rate": 4.8924074535778294e-05,
+      "loss": 0.446,
+      "step": 34360
+    },
+    {
+      "epoch": 1.7446281935754082,
+      "grad_norm": 0.02157577331468465,
+      "learning_rate": 4.882854838077755e-05,
+      "loss": 0.4657,
+      "step": 34365
+    },
+    {
+      "epoch": 1.7448820296734397,
+      "grad_norm": 0.019964492489978992,
+      "learning_rate": 4.873311078848264e-05,
+      "loss": 0.4643,
+      "step": 34370
+    },
+    {
+      "epoch": 1.7451358657714713,
+      "grad_norm": 0.022047232259986137,
+      "learning_rate": 4.863776177762769e-05,
+      "loss": 0.4639,
+      "step": 34375
+    },
+    {
+      "epoch": 1.7453897018695028,
+      "grad_norm": 0.03244136132931666,
+      "learning_rate": 4.854250136692912e-05,
+      "loss": 0.4822,
+      "step": 34380
+    },
+    {
+      "epoch": 1.7456435379675344,
+      "grad_norm": 0.026966628720723723,
+      "learning_rate": 4.844732957508607e-05,
+      "loss": 0.4612,
+      "step": 34385
+    },
+    {
+      "epoch": 1.7458973740655659,
+      "grad_norm": 0.021335951919062863,
+      "learning_rate": 4.8352246420780456e-05,
+      "loss": 0.44,
+      "step": 34390
+    },
+    {
+      "epoch": 1.7461512101635974,
+      "grad_norm": 0.021305484071550175,
+      "learning_rate": 4.825725192267638e-05,
+      "loss": 0.4573,
+      "step": 34395
+    },
+    {
+      "epoch": 1.746405046261629,
+      "grad_norm": 0.021326881921091657,
+      "learning_rate": 4.816234609942105e-05,
+      "loss": 0.4524,
+      "step": 34400
+    },
+    {
+      "epoch": 1.7466588823596605,
+      "grad_norm": 0.02188831813845589,
+      "learning_rate": 4.806752896964373e-05,
+      "loss": 0.4707,
+      "step": 34405
+    },
+    {
+      "epoch": 1.746912718457692,
+      "grad_norm": 0.02064699692483605,
+      "learning_rate": 4.79728005519568e-05,
+      "loss": 0.4565,
+      "step": 34410
+    },
+    {
+      "epoch": 1.7471665545557233,
+      "grad_norm": 0.021389953513019002,
+      "learning_rate": 4.787816086495478e-05,
+      "loss": 0.47,
+      "step": 34415
+    },
+    {
+      "epoch": 1.7474203906537549,
+      "grad_norm": 0.019822070124622355,
+      "learning_rate": 4.7783609927215145e-05,
+      "loss": 0.4244,
+      "step": 34420
+    },
+    {
+      "epoch": 1.7476742267517864,
+      "grad_norm": 0.01981125932792789,
+      "learning_rate": 4.7689147757297605e-05,
+      "loss": 0.4369,
+      "step": 34425
+    },
+    {
+      "epoch": 1.7479280628498177,
+      "grad_norm": 0.028737375632746403,
+      "learning_rate": 4.7594774373744766e-05,
+      "loss": 0.4647,
+      "step": 34430
+    },
+    {
+      "epoch": 1.7481818989478493,
+      "grad_norm": 0.031205219363603577,
+      "learning_rate": 4.750048979508148e-05,
+      "loss": 0.4611,
+      "step": 34435
+    },
+    {
+      "epoch": 1.7484357350458808,
+      "grad_norm": 0.022939345811528095,
+      "learning_rate": 4.7406294039815553e-05,
+      "loss": 0.4711,
+      "step": 34440
+    },
+    {
+      "epoch": 1.7486895711439123,
+      "grad_norm": 0.027141259222797177,
+      "learning_rate": 4.731218712643681e-05,
+      "loss": 0.4422,
+      "step": 34445
+    },
+    {
+      "epoch": 1.7489434072419439,
+      "grad_norm": 0.027268634228652108,
+      "learning_rate": 4.721816907341836e-05,
+      "loss": 0.4768,
+      "step": 34450
+    },
+    {
+      "epoch": 1.7491972433399754,
+      "grad_norm": 0.023701231135006304,
+      "learning_rate": 4.712423989921527e-05,
+      "loss": 0.4631,
+      "step": 34455
+    },
+    {
+      "epoch": 1.749451079438007,
+      "grad_norm": 0.02347054862041871,
+      "learning_rate": 4.703039962226541e-05,
+      "loss": 0.4619,
+      "step": 34460
+    },
+    {
+      "epoch": 1.7497049155360385,
+      "grad_norm": 0.028567040630409428,
+      "learning_rate": 4.693664826098909e-05,
+      "loss": 0.4311,
+      "step": 34465
+    },
+    {
+      "epoch": 1.74995875163407,
+      "grad_norm": 0.02148438393708741,
+      "learning_rate": 4.684298583378943e-05,
+      "loss": 0.4438,
+      "step": 34470
+    },
+    {
+      "epoch": 1.7502125877321015,
+      "grad_norm": 0.02216116188881408,
+      "learning_rate": 4.674941235905161e-05,
+      "loss": 0.4676,
+      "step": 34475
+    },
+    {
+      "epoch": 1.7504664238301328,
+      "grad_norm": 0.022665906180228006,
+      "learning_rate": 4.6655927855143886e-05,
+      "loss": 0.4453,
+      "step": 34480
+    },
+    {
+      "epoch": 1.7507202599281644,
+      "grad_norm": 0.022100912241708407,
+      "learning_rate": 4.656253234041663e-05,
+      "loss": 0.4854,
+      "step": 34485
+    },
+    {
+      "epoch": 1.750974096026196,
+      "grad_norm": 0.019646671652917273,
+      "learning_rate": 4.646922583320307e-05,
+      "loss": 0.4738,
+      "step": 34490
+    },
+    {
+      "epoch": 1.7512279321242272,
+      "grad_norm": 0.02278953025792618,
+      "learning_rate": 4.637600835181866e-05,
+      "loss": 0.469,
+      "step": 34495
+    },
+    {
+      "epoch": 1.7514817682222588,
+      "grad_norm": 0.022089585019886025,
+      "learning_rate": 4.6282879914561646e-05,
+      "loss": 0.4497,
+      "step": 34500
+    },
+    {
+      "epoch": 1.7517356043202903,
+      "grad_norm": 0.023445723973391685,
+      "learning_rate": 4.6189840539712534e-05,
+      "loss": 0.4551,
+      "step": 34505
+    },
+    {
+      "epoch": 1.7519894404183218,
+      "grad_norm": 0.02715037290748338,
+      "learning_rate": 4.609689024553459e-05,
+      "loss": 0.462,
+      "step": 34510
+    },
+    {
+      "epoch": 1.7522432765163534,
+      "grad_norm": 0.02174226702680522,
+      "learning_rate": 4.600402905027357e-05,
+      "loss": 0.4662,
+      "step": 34515
+    },
+    {
+      "epoch": 1.752497112614385,
+      "grad_norm": 0.021279501901226617,
+      "learning_rate": 4.5911256972157476e-05,
+      "loss": 0.4485,
+      "step": 34520
+    },
+    {
+      "epoch": 1.7527509487124164,
+      "grad_norm": 0.023627373642052033,
+      "learning_rate": 4.581857402939721e-05,
+      "loss": 0.4522,
+      "step": 34525
+    },
+    {
+      "epoch": 1.753004784810448,
+      "grad_norm": 0.0215796445898145,
+      "learning_rate": 4.572598024018571e-05,
+      "loss": 0.4708,
+      "step": 34530
+    },
+    {
+      "epoch": 1.7532586209084795,
+      "grad_norm": 0.022638555616672124,
+      "learning_rate": 4.563347562269898e-05,
+      "loss": 0.4391,
+      "step": 34535
+    },
+    {
+      "epoch": 1.753512457006511,
+      "grad_norm": 0.02165756905100213,
+      "learning_rate": 4.5541060195094965e-05,
+      "loss": 0.4557,
+      "step": 34540
+    },
+    {
+      "epoch": 1.7537662931045424,
+      "grad_norm": 0.021506773610349895,
+      "learning_rate": 4.5448733975514524e-05,
+      "loss": 0.4493,
+      "step": 34545
+    },
+    {
+      "epoch": 1.7540201292025739,
+      "grad_norm": 0.021106012563365185,
+      "learning_rate": 4.535649698208066e-05,
+      "loss": 0.4566,
+      "step": 34550
+    },
+    {
+      "epoch": 1.7542739653006054,
+      "grad_norm": 0.02090198796312075,
+      "learning_rate": 4.526434923289924e-05,
+      "loss": 0.4457,
+      "step": 34555
+    },
+    {
+      "epoch": 1.754527801398637,
+      "grad_norm": 0.02136853005412373,
+      "learning_rate": 4.517229074605822e-05,
+      "loss": 0.4747,
+      "step": 34560
+    },
+    {
+      "epoch": 1.7547816374966683,
+      "grad_norm": 0.02157527210244312,
+      "learning_rate": 4.508032153962832e-05,
+      "loss": 0.4704,
+      "step": 34565
+    },
+    {
+      "epoch": 1.7550354735946998,
+      "grad_norm": 0.023144728143769867,
+      "learning_rate": 4.49884416316626e-05,
+      "loss": 0.4704,
+      "step": 34570
+    },
+    {
+      "epoch": 1.7552893096927313,
+      "grad_norm": 0.020386305284952208,
+      "learning_rate": 4.489665104019675e-05,
+      "loss": 0.4557,
+      "step": 34575
+    },
+    {
+      "epoch": 1.7555431457907629,
+      "grad_norm": 0.022166335060348437,
+      "learning_rate": 4.4804949783248564e-05,
+      "loss": 0.46,
+      "step": 34580
+    },
+    {
+      "epoch": 1.7557969818887944,
+      "grad_norm": 0.02094210104633068,
+      "learning_rate": 4.471333787881881e-05,
+      "loss": 0.4561,
+      "step": 34585
+    },
+    {
+      "epoch": 1.756050817986826,
+      "grad_norm": 0.02140439011369098,
+      "learning_rate": 4.4621815344890235e-05,
+      "loss": 0.4253,
+      "step": 34590
+    },
+    {
+      "epoch": 1.7563046540848575,
+      "grad_norm": 0.023342207225477016,
+      "learning_rate": 4.453038219942845e-05,
+      "loss": 0.4902,
+      "step": 34595
+    },
+    {
+      "epoch": 1.756558490182889,
+      "grad_norm": 0.019041730139629988,
+      "learning_rate": 4.443903846038111e-05,
+      "loss": 0.4525,
+      "step": 34600
+    },
+    {
+      "epoch": 1.7568123262809205,
+      "grad_norm": 0.024800524118674366,
+      "learning_rate": 4.4347784145678695e-05,
+      "loss": 0.4606,
+      "step": 34605
+    },
+    {
+      "epoch": 1.7570661623789519,
+      "grad_norm": 0.02916453322026752,
+      "learning_rate": 4.425661927323388e-05,
+      "loss": 0.4643,
+      "step": 34610
+    },
+    {
+      "epoch": 1.7573199984769834,
+      "grad_norm": 0.024905988994358454,
+      "learning_rate": 4.416554386094196e-05,
+      "loss": 0.437,
+      "step": 34615
+    },
+    {
+      "epoch": 1.757573834575015,
+      "grad_norm": 0.030406574388976055,
+      "learning_rate": 4.407455792668047e-05,
+      "loss": 0.4883,
+      "step": 34620
+    },
+    {
+      "epoch": 1.7578276706730465,
+      "grad_norm": 0.020939906014040045,
+      "learning_rate": 4.3983661488309565e-05,
+      "loss": 0.4282,
+      "step": 34625
+    },
+    {
+      "epoch": 1.7580815067710778,
+      "grad_norm": 0.022365655581909887,
+      "learning_rate": 4.389285456367181e-05,
+      "loss": 0.478,
+      "step": 34630
+    },
+    {
+      "epoch": 1.7583353428691093,
+      "grad_norm": 0.023054617223912062,
+      "learning_rate": 4.380213717059206e-05,
+      "loss": 0.4749,
+      "step": 34635
+    },
+    {
+      "epoch": 1.7585891789671408,
+      "grad_norm": 0.021279493374970884,
+      "learning_rate": 4.371150932687784e-05,
+      "loss": 0.4535,
+      "step": 34640
+    },
+    {
+      "epoch": 1.7588430150651724,
+      "grad_norm": 0.021608983124179647,
+      "learning_rate": 4.3620971050318706e-05,
+      "loss": 0.452,
+      "step": 34645
+    },
+    {
+      "epoch": 1.759096851163204,
+      "grad_norm": 0.02125252639391023,
+      "learning_rate": 4.3530522358687045e-05,
+      "loss": 0.4427,
+      "step": 34650
+    },
+    {
+      "epoch": 1.7593506872612354,
+      "grad_norm": 0.021716546978771272,
+      "learning_rate": 4.3440163269737374e-05,
+      "loss": 0.4551,
+      "step": 34655
+    },
+    {
+      "epoch": 1.759604523359267,
+      "grad_norm": 0.022723743581024202,
+      "learning_rate": 4.334989380120691e-05,
+      "loss": 0.4373,
+      "step": 34660
+    },
+    {
+      "epoch": 1.7598583594572985,
+      "grad_norm": 0.023783058996820775,
+      "learning_rate": 4.3259713970814904e-05,
+      "loss": 0.471,
+      "step": 34665
+    },
+    {
+      "epoch": 1.76011219555533,
+      "grad_norm": 0.022842675520249336,
+      "learning_rate": 4.316962379626333e-05,
+      "loss": 0.4575,
+      "step": 34670
+    },
+    {
+      "epoch": 1.7603660316533616,
+      "grad_norm": 0.024246326137658224,
+      "learning_rate": 4.3079623295236345e-05,
+      "loss": 0.4028,
+      "step": 34675
+    },
+    {
+      "epoch": 1.760619867751393,
+      "grad_norm": 0.025535918723743688,
+      "learning_rate": 4.298971248540068e-05,
+      "loss": 0.4636,
+      "step": 34680
+    },
+    {
+      "epoch": 1.7608737038494244,
+      "grad_norm": 0.022390889197142872,
+      "learning_rate": 4.2899891384405196e-05,
+      "loss": 0.4283,
+      "step": 34685
+    },
+    {
+      "epoch": 1.761127539947456,
+      "grad_norm": 0.025131303601364605,
+      "learning_rate": 4.281016000988169e-05,
+      "loss": 0.4524,
+      "step": 34690
+    },
+    {
+      "epoch": 1.7613813760454873,
+      "grad_norm": 0.02186743401814803,
+      "learning_rate": 4.2720518379443684e-05,
+      "loss": 0.4497,
+      "step": 34695
+    },
+    {
+      "epoch": 1.7616352121435188,
+      "grad_norm": 0.02114715992999132,
+      "learning_rate": 4.263096651068754e-05,
+      "loss": 0.4662,
+      "step": 34700
+    },
+    {
+      "epoch": 1.7618890482415503,
+      "grad_norm": 0.028404232206161735,
+      "learning_rate": 4.254150442119164e-05,
+      "loss": 0.4742,
+      "step": 34705
+    },
+    {
+      "epoch": 1.7621428843395819,
+      "grad_norm": 0.02630860958767224,
+      "learning_rate": 4.2452132128517226e-05,
+      "loss": 0.4296,
+      "step": 34710
+    },
+    {
+      "epoch": 1.7623967204376134,
+      "grad_norm": 0.024673175731050732,
+      "learning_rate": 4.236284965020737e-05,
+      "loss": 0.4308,
+      "step": 34715
+    },
+    {
+      "epoch": 1.762650556535645,
+      "grad_norm": 0.02180317292962621,
+      "learning_rate": 4.227365700378799e-05,
+      "loss": 0.4469,
+      "step": 34720
+    },
+    {
+      "epoch": 1.7629043926336765,
+      "grad_norm": 0.0214905144095388,
+      "learning_rate": 4.2184554206767034e-05,
+      "loss": 0.4723,
+      "step": 34725
+    },
+    {
+      "epoch": 1.763158228731708,
+      "grad_norm": 0.021720539302091973,
+      "learning_rate": 4.209554127663495e-05,
+      "loss": 0.4465,
+      "step": 34730
+    },
+    {
+      "epoch": 1.7634120648297396,
+      "grad_norm": 0.02368170581782568,
+      "learning_rate": 4.200661823086454e-05,
+      "loss": 0.4699,
+      "step": 34735
+    },
+    {
+      "epoch": 1.763665900927771,
+      "grad_norm": 0.022332063931247034,
+      "learning_rate": 4.191778508691102e-05,
+      "loss": 0.4846,
+      "step": 34740
+    },
+    {
+      "epoch": 1.7639197370258024,
+      "grad_norm": 0.021873430868819217,
+      "learning_rate": 4.182904186221176e-05,
+      "loss": 0.4475,
+      "step": 34745
+    },
+    {
+      "epoch": 1.764173573123834,
+      "grad_norm": 0.018510023864084377,
+      "learning_rate": 4.174038857418666e-05,
+      "loss": 0.4087,
+      "step": 34750
+    },
+    {
+      "epoch": 1.7644274092218655,
+      "grad_norm": 0.020083902539849405,
+      "learning_rate": 4.165182524023803e-05,
+      "loss": 0.4578,
+      "step": 34755
+    },
+    {
+      "epoch": 1.7646812453198968,
+      "grad_norm": 0.01956729272999206,
+      "learning_rate": 4.156335187775029e-05,
+      "loss": 0.432,
+      "step": 34760
+    },
+    {
+      "epoch": 1.7649350814179283,
+      "grad_norm": 0.033417341393678385,
+      "learning_rate": 4.1474968504090385e-05,
+      "loss": 0.4514,
+      "step": 34765
+    },
+    {
+      "epoch": 1.7651889175159599,
+      "grad_norm": 0.027037751978040633,
+      "learning_rate": 4.1386675136607434e-05,
+      "loss": 0.4497,
+      "step": 34770
+    },
+    {
+      "epoch": 1.7654427536139914,
+      "grad_norm": 0.02221486375997609,
+      "learning_rate": 4.129847179263318e-05,
+      "loss": 0.4261,
+      "step": 34775
+    },
+    {
+      "epoch": 1.765696589712023,
+      "grad_norm": 0.02085106880362318,
+      "learning_rate": 4.121035848948124e-05,
+      "loss": 0.4639,
+      "step": 34780
+    },
+    {
+      "epoch": 1.7659504258100545,
+      "grad_norm": 0.023019324461147312,
+      "learning_rate": 4.112233524444803e-05,
+      "loss": 0.4712,
+      "step": 34785
+    },
+    {
+      "epoch": 1.766204261908086,
+      "grad_norm": 0.019887284736623528,
+      "learning_rate": 4.103440207481196e-05,
+      "loss": 0.4421,
+      "step": 34790
+    },
+    {
+      "epoch": 1.7664580980061175,
+      "grad_norm": 0.0244559527736879,
+      "learning_rate": 4.094655899783395e-05,
+      "loss": 0.4429,
+      "step": 34795
+    },
+    {
+      "epoch": 1.766711934104149,
+      "grad_norm": 0.023135446169896694,
+      "learning_rate": 4.085880603075703e-05,
+      "loss": 0.435,
+      "step": 34800
+    },
+    {
+      "epoch": 1.7669657702021806,
+      "grad_norm": 0.020742048909337243,
+      "learning_rate": 4.077114319080671e-05,
+      "loss": 0.4307,
+      "step": 34805
+    },
+    {
+      "epoch": 1.767219606300212,
+      "grad_norm": 0.02392298449389485,
+      "learning_rate": 4.068357049519089e-05,
+      "loss": 0.4433,
+      "step": 34810
+    },
+    {
+      "epoch": 1.7674734423982434,
+      "grad_norm": 0.023622721434002313,
+      "learning_rate": 4.0596087961099595e-05,
+      "loss": 0.4809,
+      "step": 34815
+    },
+    {
+      "epoch": 1.767727278496275,
+      "grad_norm": 0.02007693135153689,
+      "learning_rate": 4.0508695605705136e-05,
+      "loss": 0.443,
+      "step": 34820
+    },
+    {
+      "epoch": 1.7679811145943063,
+      "grad_norm": 0.020604137444585325,
+      "learning_rate": 4.042139344616236e-05,
+      "loss": 0.45,
+      "step": 34825
+    },
+    {
+      "epoch": 1.7682349506923378,
+      "grad_norm": 0.021241405566437443,
+      "learning_rate": 4.033418149960799e-05,
+      "loss": 0.4537,
+      "step": 34830
+    },
+    {
+      "epoch": 1.7684887867903694,
+      "grad_norm": 0.02142638288944903,
+      "learning_rate": 4.0247059783161565e-05,
+      "loss": 0.4272,
+      "step": 34835
+    },
+    {
+      "epoch": 1.768742622888401,
+      "grad_norm": 0.022520249913372568,
+      "learning_rate": 4.0160028313924456e-05,
+      "loss": 0.4789,
+      "step": 34840
+    },
+    {
+      "epoch": 1.7689964589864324,
+      "grad_norm": 0.021175421917149957,
+      "learning_rate": 4.007308710898061e-05,
+      "loss": 0.4452,
+      "step": 34845
+    },
+    {
+      "epoch": 1.769250295084464,
+      "grad_norm": 0.020501338142536445,
+      "learning_rate": 3.998623618539604e-05,
+      "loss": 0.4764,
+      "step": 34850
+    },
+    {
+      "epoch": 1.7695041311824955,
+      "grad_norm": 0.019904516522167997,
+      "learning_rate": 3.9899475560219336e-05,
+      "loss": 0.4533,
+      "step": 34855
+    },
+    {
+      "epoch": 1.769757967280527,
+      "grad_norm": 0.023716865857408845,
+      "learning_rate": 3.981280525048098e-05,
+      "loss": 0.4717,
+      "step": 34860
+    },
+    {
+      "epoch": 1.7700118033785586,
+      "grad_norm": 0.020968802334100606,
+      "learning_rate": 3.972622527319397e-05,
+      "loss": 0.4447,
+      "step": 34865
+    },
+    {
+      "epoch": 1.77026563947659,
+      "grad_norm": 0.0220213821940423,
+      "learning_rate": 3.963973564535361e-05,
+      "loss": 0.4418,
+      "step": 34870
+    },
+    {
+      "epoch": 1.7705194755746214,
+      "grad_norm": 0.020738385170455637,
+      "learning_rate": 3.955333638393732e-05,
+      "loss": 0.434,
+      "step": 34875
+    },
+    {
+      "epoch": 1.770773311672653,
+      "grad_norm": 0.023096359415643572,
+      "learning_rate": 3.9467027505904916e-05,
+      "loss": 0.4299,
+      "step": 34880
+    },
+    {
+      "epoch": 1.7710271477706845,
+      "grad_norm": 0.0199432213339235,
+      "learning_rate": 3.938080902819824e-05,
+      "loss": 0.4624,
+      "step": 34885
+    },
+    {
+      "epoch": 1.771280983868716,
+      "grad_norm": 0.026960951902493344,
+      "learning_rate": 3.929468096774175e-05,
+      "loss": 0.4648,
+      "step": 34890
+    },
+    {
+      "epoch": 1.7715348199667473,
+      "grad_norm": 0.025299529980017853,
+      "learning_rate": 3.92086433414417e-05,
+      "loss": 0.4428,
+      "step": 34895
+    },
+    {
+      "epoch": 1.7717886560647789,
+      "grad_norm": 0.02174184002171596,
+      "learning_rate": 3.9122696166187186e-05,
+      "loss": 0.4534,
+      "step": 34900
+    },
+    {
+      "epoch": 1.7720424921628104,
+      "grad_norm": 0.024345802063285356,
+      "learning_rate": 3.903683945884884e-05,
+      "loss": 0.4454,
+      "step": 34905
+    },
+    {
+      "epoch": 1.772296328260842,
+      "grad_norm": 0.023452333877164416,
+      "learning_rate": 3.895107323628022e-05,
+      "loss": 0.4538,
+      "step": 34910
+    },
+    {
+      "epoch": 1.7725501643588735,
+      "grad_norm": 0.019713235902259953,
+      "learning_rate": 3.8865397515316645e-05,
+      "loss": 0.442,
+      "step": 34915
+    },
+    {
+      "epoch": 1.772804000456905,
+      "grad_norm": 0.01862175117732225,
+      "learning_rate": 3.8779812312775885e-05,
+      "loss": 0.4352,
+      "step": 34920
+    },
+    {
+      "epoch": 1.7730578365549365,
+      "grad_norm": 0.02407320867951667,
+      "learning_rate": 3.869431764545772e-05,
+      "loss": 0.4435,
+      "step": 34925
+    },
+    {
+      "epoch": 1.773311672652968,
+      "grad_norm": 0.02675629662340058,
+      "learning_rate": 3.860891353014462e-05,
+      "loss": 0.4357,
+      "step": 34930
+    },
+    {
+      "epoch": 1.7735655087509996,
+      "grad_norm": 0.027161122772953582,
+      "learning_rate": 3.8523599983600776e-05,
+      "loss": 0.4311,
+      "step": 34935
+    },
+    {
+      "epoch": 1.7738193448490311,
+      "grad_norm": 0.023763895197796703,
+      "learning_rate": 3.843837702257291e-05,
+      "loss": 0.4361,
+      "step": 34940
+    },
+    {
+      "epoch": 1.7740731809470625,
+      "grad_norm": 0.02520329920417153,
+      "learning_rate": 3.835324466378981e-05,
+      "loss": 0.4608,
+      "step": 34945
+    },
+    {
+      "epoch": 1.774327017045094,
+      "grad_norm": 0.026335133730435882,
+      "learning_rate": 3.82682029239626e-05,
+      "loss": 0.4425,
+      "step": 34950
+    },
+    {
+      "epoch": 1.7745808531431255,
+      "grad_norm": 0.02077693291797904,
+      "learning_rate": 3.8183251819784436e-05,
+      "loss": 0.4415,
+      "step": 34955
+    },
+    {
+      "epoch": 1.7748346892411568,
+      "grad_norm": 0.021994750743455038,
+      "learning_rate": 3.8098391367930976e-05,
+      "loss": 0.4816,
+      "step": 34960
+    },
+    {
+      "epoch": 1.7750885253391884,
+      "grad_norm": 0.022292288456564727,
+      "learning_rate": 3.8013621585059665e-05,
+      "loss": 0.4741,
+      "step": 34965
+    },
+    {
+      "epoch": 1.77534236143722,
+      "grad_norm": 0.022199822217541125,
+      "learning_rate": 3.7928942487810594e-05,
+      "loss": 0.4602,
+      "step": 34970
+    },
+    {
+      "epoch": 1.7755961975352514,
+      "grad_norm": 0.026501730743718647,
+      "learning_rate": 3.7844354092805735e-05,
+      "loss": 0.451,
+      "step": 34975
+    },
+    {
+      "epoch": 1.775850033633283,
+      "grad_norm": 0.02266914903426037,
+      "learning_rate": 3.775985641664942e-05,
+      "loss": 0.4661,
+      "step": 34980
+    },
+    {
+      "epoch": 1.7761038697313145,
+      "grad_norm": 0.01942723439600507,
+      "learning_rate": 3.767544947592805e-05,
+      "loss": 0.4289,
+      "step": 34985
+    },
+    {
+      "epoch": 1.776357705829346,
+      "grad_norm": 0.021825648976358702,
+      "learning_rate": 3.759113328721036e-05,
+      "loss": 0.433,
+      "step": 34990
+    },
+    {
+      "epoch": 1.7766115419273776,
+      "grad_norm": 0.023665285889932587,
+      "learning_rate": 3.750690786704725e-05,
+      "loss": 0.4506,
+      "step": 34995
+    },
+    {
+      "epoch": 1.7768653780254091,
+      "grad_norm": 0.022461997989559473,
+      "learning_rate": 3.742277323197158e-05,
+      "loss": 0.4552,
+      "step": 35000
+    },
+    {
+      "epoch": 1.7771192141234406,
+      "grad_norm": 0.024581088277719675,
+      "learning_rate": 3.733872939849875e-05,
+      "loss": 0.4634,
+      "step": 35005
+    },
+    {
+      "epoch": 1.777373050221472,
+      "grad_norm": 0.026586649366082245,
+      "learning_rate": 3.725477638312591e-05,
+      "loss": 0.4461,
+      "step": 35010
+    },
+    {
+      "epoch": 1.7776268863195035,
+      "grad_norm": 0.029002282206786262,
+      "learning_rate": 3.717091420233293e-05,
+      "loss": 0.4392,
+      "step": 35015
+    },
+    {
+      "epoch": 1.777880722417535,
+      "grad_norm": 0.025326290885756917,
+      "learning_rate": 3.708714287258125e-05,
+      "loss": 0.4314,
+      "step": 35020
+    },
+    {
+      "epoch": 1.7781345585155663,
+      "grad_norm": 0.025316121879115406,
+      "learning_rate": 3.700346241031494e-05,
+      "loss": 0.4434,
+      "step": 35025
+    },
+    {
+      "epoch": 1.7783883946135979,
+      "grad_norm": 0.02063467888087572,
+      "learning_rate": 3.691987283195991e-05,
+      "loss": 0.4665,
+      "step": 35030
+    },
+    {
+      "epoch": 1.7786422307116294,
+      "grad_norm": 0.022082244394205426,
+      "learning_rate": 3.68363741539246e-05,
+      "loss": 0.4657,
+      "step": 35035
+    },
+    {
+      "epoch": 1.778896066809661,
+      "grad_norm": 0.024010484572527937,
+      "learning_rate": 3.675296639259912e-05,
+      "loss": 0.4751,
+      "step": 35040
+    },
+    {
+      "epoch": 1.7791499029076925,
+      "grad_norm": 0.020949059566116365,
+      "learning_rate": 3.66696495643562e-05,
+      "loss": 0.4485,
+      "step": 35045
+    },
+    {
+      "epoch": 1.779403739005724,
+      "grad_norm": 0.02321244261448484,
+      "learning_rate": 3.6586423685550374e-05,
+      "loss": 0.4597,
+      "step": 35050
+    },
+    {
+      "epoch": 1.7796575751037556,
+      "grad_norm": 0.02673174516429816,
+      "learning_rate": 3.6503288772518626e-05,
+      "loss": 0.4677,
+      "step": 35055
+    },
+    {
+      "epoch": 1.779911411201787,
+      "grad_norm": 0.020598141046026204,
+      "learning_rate": 3.64202448415798e-05,
+      "loss": 0.4393,
+      "step": 35060
+    },
+    {
+      "epoch": 1.7801652472998186,
+      "grad_norm": 0.02209046479234219,
+      "learning_rate": 3.6337291909035065e-05,
+      "loss": 0.4413,
+      "step": 35065
+    },
+    {
+      "epoch": 1.7804190833978502,
+      "grad_norm": 0.027347973082832013,
+      "learning_rate": 3.625442999116763e-05,
+      "loss": 0.4273,
+      "step": 35070
+    },
+    {
+      "epoch": 1.7806729194958815,
+      "grad_norm": 0.02196378431429979,
+      "learning_rate": 3.6171659104242914e-05,
+      "loss": 0.4579,
+      "step": 35075
+    },
+    {
+      "epoch": 1.780926755593913,
+      "grad_norm": 0.03402444971932555,
+      "learning_rate": 3.608897926450838e-05,
+      "loss": 0.4532,
+      "step": 35080
+    },
+    {
+      "epoch": 1.7811805916919445,
+      "grad_norm": 0.025631046835950563,
+      "learning_rate": 3.600639048819371e-05,
+      "loss": 0.4314,
+      "step": 35085
+    },
+    {
+      "epoch": 1.7814344277899758,
+      "grad_norm": 0.023888969294360688,
+      "learning_rate": 3.592389279151065e-05,
+      "loss": 0.4677,
+      "step": 35090
+    },
+    {
+      "epoch": 1.7816882638880074,
+      "grad_norm": 0.020867066218234958,
+      "learning_rate": 3.584148619065314e-05,
+      "loss": 0.4444,
+      "step": 35095
+    },
+    {
+      "epoch": 1.781942099986039,
+      "grad_norm": 0.023266323128579264,
+      "learning_rate": 3.575917070179702e-05,
+      "loss": 0.4331,
+      "step": 35100
+    },
+    {
+      "epoch": 1.7821959360840705,
+      "grad_norm": 0.02512759163952341,
+      "learning_rate": 3.567694634110058e-05,
+      "loss": 0.4822,
+      "step": 35105
+    },
+    {
+      "epoch": 1.782449772182102,
+      "grad_norm": 0.021294948101753024,
+      "learning_rate": 3.559481312470403e-05,
+      "loss": 0.4744,
+      "step": 35110
+    },
+    {
+      "epoch": 1.7827036082801335,
+      "grad_norm": 0.021426087304758333,
+      "learning_rate": 3.551277106872963e-05,
+      "loss": 0.4677,
+      "step": 35115
+    },
+    {
+      "epoch": 1.782957444378165,
+      "grad_norm": 0.02346702339510721,
+      "learning_rate": 3.5430820189281954e-05,
+      "loss": 0.4692,
+      "step": 35120
+    },
+    {
+      "epoch": 1.7832112804761966,
+      "grad_norm": 0.020925318483005197,
+      "learning_rate": 3.53489605024474e-05,
+      "loss": 0.4804,
+      "step": 35125
+    },
+    {
+      "epoch": 1.7834651165742281,
+      "grad_norm": 0.024026448655141502,
+      "learning_rate": 3.526719202429474e-05,
+      "loss": 0.4871,
+      "step": 35130
+    },
+    {
+      "epoch": 1.7837189526722597,
+      "grad_norm": 0.02135052995935449,
+      "learning_rate": 3.518551477087462e-05,
+      "loss": 0.4451,
+      "step": 35135
+    },
+    {
+      "epoch": 1.783972788770291,
+      "grad_norm": 0.03284714110137373,
+      "learning_rate": 3.5103928758219995e-05,
+      "loss": 0.4731,
+      "step": 35140
+    },
+    {
+      "epoch": 1.7842266248683225,
+      "grad_norm": 0.021457290842253285,
+      "learning_rate": 3.5022434002345615e-05,
+      "loss": 0.4752,
+      "step": 35145
+    },
+    {
+      "epoch": 1.784480460966354,
+      "grad_norm": 0.030062390522041568,
+      "learning_rate": 3.4941030519248685e-05,
+      "loss": 0.4536,
+      "step": 35150
+    },
+    {
+      "epoch": 1.7847342970643856,
+      "grad_norm": 0.02160896181203855,
+      "learning_rate": 3.485971832490814e-05,
+      "loss": 0.4567,
+      "step": 35155
+    },
+    {
+      "epoch": 1.784988133162417,
+      "grad_norm": 0.022948769188133194,
+      "learning_rate": 3.477849743528533e-05,
+      "loss": 0.45,
+      "step": 35160
+    },
+    {
+      "epoch": 1.7852419692604484,
+      "grad_norm": 0.023052274795903466,
+      "learning_rate": 3.469736786632327e-05,
+      "loss": 0.4513,
+      "step": 35165
+    },
+    {
+      "epoch": 1.78549580535848,
+      "grad_norm": 0.02387145233098405,
+      "learning_rate": 3.461632963394756e-05,
+      "loss": 0.4534,
+      "step": 35170
+    },
+    {
+      "epoch": 1.7857496414565115,
+      "grad_norm": 0.02398398740621941,
+      "learning_rate": 3.453538275406542e-05,
+      "loss": 0.454,
+      "step": 35175
+    },
+    {
+      "epoch": 1.786003477554543,
+      "grad_norm": 0.024153894469605582,
+      "learning_rate": 3.445452724256648e-05,
+      "loss": 0.4466,
+      "step": 35180
+    },
+    {
+      "epoch": 1.7862573136525746,
+      "grad_norm": 0.02474750346228495,
+      "learning_rate": 3.437376311532209e-05,
+      "loss": 0.4586,
+      "step": 35185
+    },
+    {
+      "epoch": 1.786511149750606,
+      "grad_norm": 0.021820142241289246,
+      "learning_rate": 3.4293090388185955e-05,
+      "loss": 0.4259,
+      "step": 35190
+    },
+    {
+      "epoch": 1.7867649858486376,
+      "grad_norm": 0.021702335087422333,
+      "learning_rate": 3.421250907699369e-05,
+      "loss": 0.4538,
+      "step": 35195
+    },
+    {
+      "epoch": 1.7870188219466692,
+      "grad_norm": 0.025281488724625104,
+      "learning_rate": 3.413201919756304e-05,
+      "loss": 0.4311,
+      "step": 35200
+    },
+    {
+      "epoch": 1.7872726580447007,
+      "grad_norm": 0.022103622641024265,
+      "learning_rate": 3.4051620765693734e-05,
+      "loss": 0.4543,
+      "step": 35205
+    },
+    {
+      "epoch": 1.787526494142732,
+      "grad_norm": 0.020184392908854817,
+      "learning_rate": 3.3971313797167555e-05,
+      "loss": 0.4551,
+      "step": 35210
+    },
+    {
+      "epoch": 1.7877803302407635,
+      "grad_norm": 0.024327540101643993,
+      "learning_rate": 3.389109830774845e-05,
+      "loss": 0.462,
+      "step": 35215
+    },
+    {
+      "epoch": 1.788034166338795,
+      "grad_norm": 0.018786966111353623,
+      "learning_rate": 3.38109743131822e-05,
+      "loss": 0.4489,
+      "step": 35220
+    },
+    {
+      "epoch": 1.7882880024368264,
+      "grad_norm": 0.024298565666928316,
+      "learning_rate": 3.373094182919678e-05,
+      "loss": 0.4377,
+      "step": 35225
+    },
+    {
+      "epoch": 1.788541838534858,
+      "grad_norm": 0.025256385511008395,
+      "learning_rate": 3.3651000871502245e-05,
+      "loss": 0.4503,
+      "step": 35230
+    },
+    {
+      "epoch": 1.7887956746328895,
+      "grad_norm": 0.02722898777164149,
+      "learning_rate": 3.357115145579059e-05,
+      "loss": 0.4697,
+      "step": 35235
+    },
+    {
+      "epoch": 1.789049510730921,
+      "grad_norm": 0.022525515289120692,
+      "learning_rate": 3.3491393597735786e-05,
+      "loss": 0.4664,
+      "step": 35240
+    },
+    {
+      "epoch": 1.7893033468289525,
+      "grad_norm": 0.02377488171334522,
+      "learning_rate": 3.341172731299402e-05,
+      "loss": 0.4654,
+      "step": 35245
+    },
+    {
+      "epoch": 1.789557182926984,
+      "grad_norm": 0.020576451658282147,
+      "learning_rate": 3.3332152617203237e-05,
+      "loss": 0.4412,
+      "step": 35250
+    },
+    {
+      "epoch": 1.7898110190250156,
+      "grad_norm": 0.019862554906292372,
+      "learning_rate": 3.325266952598366e-05,
+      "loss": 0.4613,
+      "step": 35255
+    },
+    {
+      "epoch": 1.7900648551230471,
+      "grad_norm": 0.025920687548760694,
+      "learning_rate": 3.317327805493736e-05,
+      "loss": 0.4409,
+      "step": 35260
+    },
+    {
+      "epoch": 1.7903186912210787,
+      "grad_norm": 0.025050850533418417,
+      "learning_rate": 3.3093978219648605e-05,
+      "loss": 0.487,
+      "step": 35265
+    },
+    {
+      "epoch": 1.7905725273191102,
+      "grad_norm": 0.021263072671365613,
+      "learning_rate": 3.3014770035683315e-05,
+      "loss": 0.449,
+      "step": 35270
+    },
+    {
+      "epoch": 1.7908263634171415,
+      "grad_norm": 0.021022969296222715,
+      "learning_rate": 3.293565351858996e-05,
+      "loss": 0.4434,
+      "step": 35275
+    },
+    {
+      "epoch": 1.791080199515173,
+      "grad_norm": 0.02133416611869445,
+      "learning_rate": 3.285662868389849e-05,
+      "loss": 0.4612,
+      "step": 35280
+    },
+    {
+      "epoch": 1.7913340356132046,
+      "grad_norm": 0.021363627371597606,
+      "learning_rate": 3.2777695547121236e-05,
+      "loss": 0.4463,
+      "step": 35285
+    },
+    {
+      "epoch": 1.791587871711236,
+      "grad_norm": 0.023294959319734358,
+      "learning_rate": 3.269885412375223e-05,
+      "loss": 0.4843,
+      "step": 35290
+    },
+    {
+      "epoch": 1.7918417078092674,
+      "grad_norm": 0.020389213955970452,
+      "learning_rate": 3.262010442926772e-05,
+      "loss": 0.4228,
+      "step": 35295
+    },
+    {
+      "epoch": 1.792095543907299,
+      "grad_norm": 0.020902326598321523,
+      "learning_rate": 3.254144647912599e-05,
+      "loss": 0.4467,
+      "step": 35300
+    },
+    {
+      "epoch": 1.7923493800053305,
+      "grad_norm": 0.029630242897729194,
+      "learning_rate": 3.246288028876704e-05,
+      "loss": 0.4303,
+      "step": 35305
+    },
+    {
+      "epoch": 1.792603216103362,
+      "grad_norm": 0.02027047342735473,
+      "learning_rate": 3.2384405873613134e-05,
+      "loss": 0.4566,
+      "step": 35310
+    },
+    {
+      "epoch": 1.7928570522013936,
+      "grad_norm": 0.02183801483079023,
+      "learning_rate": 3.2306023249068285e-05,
+      "loss": 0.4414,
+      "step": 35315
+    },
+    {
+      "epoch": 1.793110888299425,
+      "grad_norm": 0.022104767365956915,
+      "learning_rate": 3.22277324305188e-05,
+      "loss": 0.4563,
+      "step": 35320
+    },
+    {
+      "epoch": 1.7933647243974566,
+      "grad_norm": 0.02103106803838072,
+      "learning_rate": 3.214953343333255e-05,
+      "loss": 0.4399,
+      "step": 35325
+    },
+    {
+      "epoch": 1.7936185604954882,
+      "grad_norm": 0.025480846033686194,
+      "learning_rate": 3.20714262728598e-05,
+      "loss": 0.4418,
+      "step": 35330
+    },
+    {
+      "epoch": 1.7938723965935197,
+      "grad_norm": 0.026910726059110684,
+      "learning_rate": 3.1993410964432424e-05,
+      "loss": 0.4787,
+      "step": 35335
+    },
+    {
+      "epoch": 1.794126232691551,
+      "grad_norm": 0.03201969008252627,
+      "learning_rate": 3.1915487523364596e-05,
+      "loss": 0.4484,
+      "step": 35340
+    },
+    {
+      "epoch": 1.7943800687895826,
+      "grad_norm": 0.030531459636693972,
+      "learning_rate": 3.18376559649522e-05,
+      "loss": 0.4485,
+      "step": 35345
+    },
+    {
+      "epoch": 1.794633904887614,
+      "grad_norm": 0.02274451247134649,
+      "learning_rate": 3.175991630447322e-05,
+      "loss": 0.4718,
+      "step": 35350
+    },
+    {
+      "epoch": 1.7948877409856454,
+      "grad_norm": 0.02435117079584466,
+      "learning_rate": 3.1682268557187535e-05,
+      "loss": 0.436,
+      "step": 35355
+    },
+    {
+      "epoch": 1.795141577083677,
+      "grad_norm": 0.024073899530536718,
+      "learning_rate": 3.160471273833709e-05,
+      "loss": 0.4625,
+      "step": 35360
+    },
+    {
+      "epoch": 1.7953954131817085,
+      "grad_norm": 0.023077987822385543,
+      "learning_rate": 3.152724886314562e-05,
+      "loss": 0.4367,
+      "step": 35365
+    },
+    {
+      "epoch": 1.79564924927974,
+      "grad_norm": 0.024014102288831315,
+      "learning_rate": 3.1449876946819e-05,
+      "loss": 0.4771,
+      "step": 35370
+    },
+    {
+      "epoch": 1.7959030853777715,
+      "grad_norm": 0.022994443638095826,
+      "learning_rate": 3.137259700454481e-05,
+      "loss": 0.4728,
+      "step": 35375
+    },
+    {
+      "epoch": 1.796156921475803,
+      "grad_norm": 0.019915951884140817,
+      "learning_rate": 3.129540905149281e-05,
+      "loss": 0.4655,
+      "step": 35380
+    },
+    {
+      "epoch": 1.7964107575738346,
+      "grad_norm": 0.022563204784642146,
+      "learning_rate": 3.121831310281459e-05,
+      "loss": 0.4764,
+      "step": 35385
+    },
+    {
+      "epoch": 1.7966645936718662,
+      "grad_norm": 0.02124227375276603,
+      "learning_rate": 3.114130917364372e-05,
+      "loss": 0.4419,
+      "step": 35390
+    },
+    {
+      "epoch": 1.7969184297698977,
+      "grad_norm": 0.023126356296767688,
+      "learning_rate": 3.10643972790956e-05,
+      "loss": 0.4736,
+      "step": 35395
+    },
+    {
+      "epoch": 1.7971722658679292,
+      "grad_norm": 0.026053218593494146,
+      "learning_rate": 3.098757743426778e-05,
+      "loss": 0.4739,
+      "step": 35400
+    },
+    {
+      "epoch": 1.7974261019659605,
+      "grad_norm": 0.023364066512373553,
+      "learning_rate": 3.0910849654239456e-05,
+      "loss": 0.4504,
+      "step": 35405
+    },
+    {
+      "epoch": 1.797679938063992,
+      "grad_norm": 0.027082071431975455,
+      "learning_rate": 3.0834213954072046e-05,
+      "loss": 0.4452,
+      "step": 35410
+    },
+    {
+      "epoch": 1.7979337741620236,
+      "grad_norm": 0.02380009827362938,
+      "learning_rate": 3.0757670348808774e-05,
+      "loss": 0.4494,
+      "step": 35415
+    },
+    {
+      "epoch": 1.7981876102600551,
+      "grad_norm": 0.033607420854026,
+      "learning_rate": 3.0681218853474636e-05,
+      "loss": 0.4235,
+      "step": 35420
+    },
+    {
+      "epoch": 1.7984414463580864,
+      "grad_norm": 0.02321159683766678,
+      "learning_rate": 3.0604859483076785e-05,
+      "loss": 0.4543,
+      "step": 35425
+    },
+    {
+      "epoch": 1.798695282456118,
+      "grad_norm": 0.03605534174130652,
+      "learning_rate": 3.0528592252604126e-05,
+      "loss": 0.4683,
+      "step": 35430
+    },
+    {
+      "epoch": 1.7989491185541495,
+      "grad_norm": 0.024039033525210644,
+      "learning_rate": 3.045241717702757e-05,
+      "loss": 0.4711,
+      "step": 35435
+    },
+    {
+      "epoch": 1.799202954652181,
+      "grad_norm": 0.022549638573687806,
+      "learning_rate": 3.0376334271299878e-05,
+      "loss": 0.467,
+      "step": 35440
+    },
+    {
+      "epoch": 1.7994567907502126,
+      "grad_norm": 0.021025752959848737,
+      "learning_rate": 3.0300343550355767e-05,
+      "loss": 0.4471,
+      "step": 35445
+    },
+    {
+      "epoch": 1.7997106268482441,
+      "grad_norm": 0.02042717498913915,
+      "learning_rate": 3.0224445029111812e-05,
+      "loss": 0.4593,
+      "step": 35450
+    },
+    {
+      "epoch": 1.7999644629462757,
+      "grad_norm": 0.01959098368966413,
+      "learning_rate": 3.0148638722466593e-05,
+      "loss": 0.4566,
+      "step": 35455
+    },
+    {
+      "epoch": 1.8002182990443072,
+      "grad_norm": 0.0388203089794261,
+      "learning_rate": 3.007292464530037e-05,
+      "loss": 0.4778,
+      "step": 35460
+    },
+    {
+      "epoch": 1.8004721351423387,
+      "grad_norm": 0.022497742943539102,
+      "learning_rate": 2.9997302812475592e-05,
+      "loss": 0.4565,
+      "step": 35465
+    },
+    {
+      "epoch": 1.80072597124037,
+      "grad_norm": 0.023761047149701668,
+      "learning_rate": 2.9921773238836215e-05,
+      "loss": 0.4653,
+      "step": 35470
+    },
+    {
+      "epoch": 1.8009798073384016,
+      "grad_norm": 0.021599826773256652,
+      "learning_rate": 2.9846335939208602e-05,
+      "loss": 0.437,
+      "step": 35475
+    },
+    {
+      "epoch": 1.801233643436433,
+      "grad_norm": 0.02511013717130302,
+      "learning_rate": 2.9770990928400575e-05,
+      "loss": 0.4416,
+      "step": 35480
+    },
+    {
+      "epoch": 1.8014874795344646,
+      "grad_norm": 0.02204277337871591,
+      "learning_rate": 2.969573822120203e-05,
+      "loss": 0.4594,
+      "step": 35485
+    },
+    {
+      "epoch": 1.801741315632496,
+      "grad_norm": 0.022231649077976815,
+      "learning_rate": 2.9620577832384643e-05,
+      "loss": 0.4725,
+      "step": 35490
+    },
+    {
+      "epoch": 1.8019951517305275,
+      "grad_norm": 0.02225484721568446,
+      "learning_rate": 2.9545509776702062e-05,
+      "loss": 0.448,
+      "step": 35495
+    },
+    {
+      "epoch": 1.802248987828559,
+      "grad_norm": 0.022235212450463348,
+      "learning_rate": 2.947053406888972e-05,
+      "loss": 0.4285,
+      "step": 35500
+    },
+    {
+      "epoch": 1.8025028239265906,
+      "grad_norm": 0.020744906938457715,
+      "learning_rate": 2.939565072366507e-05,
+      "loss": 0.4364,
+      "step": 35505
+    },
+    {
+      "epoch": 1.802756660024622,
+      "grad_norm": 0.02218934079642219,
+      "learning_rate": 2.9320859755727238e-05,
+      "loss": 0.4372,
+      "step": 35510
+    },
+    {
+      "epoch": 1.8030104961226536,
+      "grad_norm": 0.021558249002831046,
+      "learning_rate": 2.9246161179757425e-05,
+      "loss": 0.4534,
+      "step": 35515
+    },
+    {
+      "epoch": 1.8032643322206852,
+      "grad_norm": 0.02400375345216303,
+      "learning_rate": 2.9171555010418404e-05,
+      "loss": 0.43,
+      "step": 35520
+    },
+    {
+      "epoch": 1.8035181683187167,
+      "grad_norm": 0.0224200032122597,
+      "learning_rate": 2.909704126235524e-05,
+      "loss": 0.4536,
+      "step": 35525
+    },
+    {
+      "epoch": 1.8037720044167482,
+      "grad_norm": 0.023048378811341624,
+      "learning_rate": 2.9022619950194395e-05,
+      "loss": 0.4554,
+      "step": 35530
+    },
+    {
+      "epoch": 1.8040258405147798,
+      "grad_norm": 0.026029749704731063,
+      "learning_rate": 2.8948291088544522e-05,
+      "loss": 0.4645,
+      "step": 35535
+    },
+    {
+      "epoch": 1.804279676612811,
+      "grad_norm": 0.019985537420418396,
+      "learning_rate": 2.8874054691996054e-05,
+      "loss": 0.445,
+      "step": 35540
+    },
+    {
+      "epoch": 1.8045335127108426,
+      "grad_norm": 0.022666606334347597,
+      "learning_rate": 2.8799910775121008e-05,
+      "loss": 0.4507,
+      "step": 35545
+    },
+    {
+      "epoch": 1.8047873488088741,
+      "grad_norm": 0.020489511769417657,
+      "learning_rate": 2.8725859352473737e-05,
+      "loss": 0.4697,
+      "step": 35550
+    },
+    {
+      "epoch": 1.8050411849069055,
+      "grad_norm": 0.02213113768181123,
+      "learning_rate": 2.865190043858995e-05,
+      "loss": 0.4638,
+      "step": 35555
+    },
+    {
+      "epoch": 1.805295021004937,
+      "grad_norm": 0.028290545411715884,
+      "learning_rate": 2.8578034047987587e-05,
+      "loss": 0.4496,
+      "step": 35560
+    },
+    {
+      "epoch": 1.8055488571029685,
+      "grad_norm": 0.02059738323425534,
+      "learning_rate": 2.8504260195166055e-05,
+      "loss": 0.4598,
+      "step": 35565
+    },
+    {
+      "epoch": 1.805802693201,
+      "grad_norm": 0.02560738205998867,
+      "learning_rate": 2.8430578894606985e-05,
+      "loss": 0.4699,
+      "step": 35570
+    },
+    {
+      "epoch": 1.8060565292990316,
+      "grad_norm": 0.02077551935432808,
+      "learning_rate": 2.8356990160773534e-05,
+      "loss": 0.4605,
+      "step": 35575
+    },
+    {
+      "epoch": 1.8063103653970631,
+      "grad_norm": 0.02698118825215807,
+      "learning_rate": 2.8283494008110867e-05,
+      "loss": 0.4439,
+      "step": 35580
+    },
+    {
+      "epoch": 1.8065642014950947,
+      "grad_norm": 0.021234911775022253,
+      "learning_rate": 2.821009045104578e-05,
+      "loss": 0.4599,
+      "step": 35585
+    },
+    {
+      "epoch": 1.8068180375931262,
+      "grad_norm": 0.023435049130171297,
+      "learning_rate": 2.8136779503987186e-05,
+      "loss": 0.4547,
+      "step": 35590
+    },
+    {
+      "epoch": 1.8070718736911577,
+      "grad_norm": 0.022261106315884807,
+      "learning_rate": 2.8063561181325526e-05,
+      "loss": 0.4291,
+      "step": 35595
+    },
+    {
+      "epoch": 1.8073257097891893,
+      "grad_norm": 0.025676143757082985,
+      "learning_rate": 2.7990435497433408e-05,
+      "loss": 0.4872,
+      "step": 35600
+    },
+    {
+      "epoch": 1.8075795458872206,
+      "grad_norm": 0.02382218211439765,
+      "learning_rate": 2.79174024666648e-05,
+      "loss": 0.4365,
+      "step": 35605
+    },
+    {
+      "epoch": 1.8078333819852521,
+      "grad_norm": 0.022263485578407526,
+      "learning_rate": 2.7844462103355838e-05,
+      "loss": 0.4421,
+      "step": 35610
+    },
+    {
+      "epoch": 1.8080872180832837,
+      "grad_norm": 0.025275647713049482,
+      "learning_rate": 2.7771614421824297e-05,
+      "loss": 0.471,
+      "step": 35615
+    },
+    {
+      "epoch": 1.808341054181315,
+      "grad_norm": 0.019588771701611427,
+      "learning_rate": 2.769885943636996e-05,
+      "loss": 0.4582,
+      "step": 35620
+    },
+    {
+      "epoch": 1.8085948902793465,
+      "grad_norm": 0.020818530567243063,
+      "learning_rate": 2.7626197161274014e-05,
+      "loss": 0.4502,
+      "step": 35625
+    },
+    {
+      "epoch": 1.808848726377378,
+      "grad_norm": 0.022388154883143465,
+      "learning_rate": 2.7553627610799938e-05,
+      "loss": 0.4305,
+      "step": 35630
+    },
+    {
+      "epoch": 1.8091025624754096,
+      "grad_norm": 0.0278131432647123,
+      "learning_rate": 2.748115079919261e-05,
+      "loss": 0.4423,
+      "step": 35635
+    },
+    {
+      "epoch": 1.809356398573441,
+      "grad_norm": 0.0214926109026721,
+      "learning_rate": 2.7408766740678994e-05,
+      "loss": 0.4498,
+      "step": 35640
+    },
+    {
+      "epoch": 1.8096102346714726,
+      "grad_norm": 0.020383587231959203,
+      "learning_rate": 2.73364754494676e-05,
+      "loss": 0.4468,
+      "step": 35645
+    },
+    {
+      "epoch": 1.8098640707695042,
+      "grad_norm": 0.023441844532656696,
+      "learning_rate": 2.7264276939748923e-05,
+      "loss": 0.4376,
+      "step": 35650
+    },
+    {
+      "epoch": 1.8101179068675357,
+      "grad_norm": 0.021868685315593866,
+      "learning_rate": 2.7192171225695172e-05,
+      "loss": 0.4606,
+      "step": 35655
+    },
+    {
+      "epoch": 1.8103717429655672,
+      "grad_norm": 0.023969847865051304,
+      "learning_rate": 2.712015832146031e-05,
+      "loss": 0.4675,
+      "step": 35660
+    },
+    {
+      "epoch": 1.8106255790635988,
+      "grad_norm": 0.022850546221913993,
+      "learning_rate": 2.7048238241180133e-05,
+      "loss": 0.4327,
+      "step": 35665
+    },
+    {
+      "epoch": 1.81087941516163,
+      "grad_norm": 0.02192908510076702,
+      "learning_rate": 2.6976410998972134e-05,
+      "loss": 0.4755,
+      "step": 35670
+    },
+    {
+      "epoch": 1.8111332512596616,
+      "grad_norm": 0.023714661280836886,
+      "learning_rate": 2.690467660893575e-05,
+      "loss": 0.4509,
+      "step": 35675
+    },
+    {
+      "epoch": 1.8113870873576932,
+      "grad_norm": 0.023176233911916783,
+      "learning_rate": 2.6833035085152003e-05,
+      "loss": 0.4615,
+      "step": 35680
+    },
+    {
+      "epoch": 1.8116409234557245,
+      "grad_norm": 0.023676313879505324,
+      "learning_rate": 2.6761486441683802e-05,
+      "loss": 0.4545,
+      "step": 35685
+    },
+    {
+      "epoch": 1.811894759553756,
+      "grad_norm": 0.03016670003145498,
+      "learning_rate": 2.669003069257575e-05,
+      "loss": 0.4299,
+      "step": 35690
+    },
+    {
+      "epoch": 1.8121485956517875,
+      "grad_norm": 0.022836140384253037,
+      "learning_rate": 2.661866785185435e-05,
+      "loss": 0.4544,
+      "step": 35695
+    },
+    {
+      "epoch": 1.812402431749819,
+      "grad_norm": 0.022515164600397174,
+      "learning_rate": 2.6547397933527562e-05,
+      "loss": 0.4482,
+      "step": 35700
+    },
+    {
+      "epoch": 1.8126562678478506,
+      "grad_norm": 0.023784234188635057,
+      "learning_rate": 2.6476220951585582e-05,
+      "loss": 0.4566,
+      "step": 35705
+    },
+    {
+      "epoch": 1.8129101039458821,
+      "grad_norm": 0.021185807136934295,
+      "learning_rate": 2.640513691999985e-05,
+      "loss": 0.454,
+      "step": 35710
+    },
+    {
+      "epoch": 1.8131639400439137,
+      "grad_norm": 0.019955400348529193,
+      "learning_rate": 2.6334145852724035e-05,
+      "loss": 0.4142,
+      "step": 35715
+    },
+    {
+      "epoch": 1.8134177761419452,
+      "grad_norm": 0.032252200455131774,
+      "learning_rate": 2.6263247763693153e-05,
+      "loss": 0.4394,
+      "step": 35720
+    },
+    {
+      "epoch": 1.8136716122399767,
+      "grad_norm": 0.025647649030433494,
+      "learning_rate": 2.61924426668243e-05,
+      "loss": 0.4871,
+      "step": 35725
+    },
+    {
+      "epoch": 1.8139254483380083,
+      "grad_norm": 0.02352222162068727,
+      "learning_rate": 2.6121730576015967e-05,
+      "loss": 0.4726,
+      "step": 35730
+    },
+    {
+      "epoch": 1.8141792844360396,
+      "grad_norm": 0.023536332582882832,
+      "learning_rate": 2.605111150514883e-05,
+      "loss": 0.4894,
+      "step": 35735
+    },
+    {
+      "epoch": 1.8144331205340711,
+      "grad_norm": 0.020218263139566366,
+      "learning_rate": 2.5980585468084795e-05,
+      "loss": 0.4571,
+      "step": 35740
+    },
+    {
+      "epoch": 1.8146869566321027,
+      "grad_norm": 0.022759174498022658,
+      "learning_rate": 2.5910152478668015e-05,
+      "loss": 0.4687,
+      "step": 35745
+    },
+    {
+      "epoch": 1.8149407927301342,
+      "grad_norm": 0.021047033486352878,
+      "learning_rate": 2.5839812550723928e-05,
+      "loss": 0.4586,
+      "step": 35750
+    },
+    {
+      "epoch": 1.8151946288281655,
+      "grad_norm": 0.023654540393796105,
+      "learning_rate": 2.5769565698060047e-05,
+      "loss": 0.4613,
+      "step": 35755
+    },
+    {
+      "epoch": 1.815448464926197,
+      "grad_norm": 0.021804276959855963,
+      "learning_rate": 2.56994119344654e-05,
+      "loss": 0.4454,
+      "step": 35760
+    },
+    {
+      "epoch": 1.8157023010242286,
+      "grad_norm": 0.02108201902634222,
+      "learning_rate": 2.562935127371091e-05,
+      "loss": 0.4555,
+      "step": 35765
+    },
+    {
+      "epoch": 1.8159561371222601,
+      "grad_norm": 0.02148487710666091,
+      "learning_rate": 2.5559383729549025e-05,
+      "loss": 0.4611,
+      "step": 35770
+    },
+    {
+      "epoch": 1.8162099732202917,
+      "grad_norm": 0.029051791837414652,
+      "learning_rate": 2.5489509315714087e-05,
+      "loss": 0.4538,
+      "step": 35775
+    },
+    {
+      "epoch": 1.8164638093183232,
+      "grad_norm": 0.026776408987169115,
+      "learning_rate": 2.5419728045922186e-05,
+      "loss": 0.4421,
+      "step": 35780
+    },
+    {
+      "epoch": 1.8167176454163547,
+      "grad_norm": 0.02023071442340706,
+      "learning_rate": 2.5350039933870805e-05,
+      "loss": 0.4225,
+      "step": 35785
+    },
+    {
+      "epoch": 1.8169714815143863,
+      "grad_norm": 0.02079097271610684,
+      "learning_rate": 2.5280444993239616e-05,
+      "loss": 0.4471,
+      "step": 35790
+    },
+    {
+      "epoch": 1.8172253176124178,
+      "grad_norm": 0.02335976147958995,
+      "learning_rate": 2.5210943237689575e-05,
+      "loss": 0.4645,
+      "step": 35795
+    },
+    {
+      "epoch": 1.8174791537104493,
+      "grad_norm": 0.025662477366910293,
+      "learning_rate": 2.514153468086372e-05,
+      "loss": 0.4523,
+      "step": 35800
+    },
+    {
+      "epoch": 1.8177329898084806,
+      "grad_norm": 0.02096423059117386,
+      "learning_rate": 2.507221933638637e-05,
+      "loss": 0.4512,
+      "step": 35805
+    },
+    {
+      "epoch": 1.8179868259065122,
+      "grad_norm": 0.021587834721081906,
+      "learning_rate": 2.5002997217863975e-05,
+      "loss": 0.4739,
+      "step": 35810
+    },
+    {
+      "epoch": 1.8182406620045437,
+      "grad_norm": 0.02346278616646674,
+      "learning_rate": 2.4933868338884392e-05,
+      "loss": 0.4391,
+      "step": 35815
+    },
+    {
+      "epoch": 1.818494498102575,
+      "grad_norm": 0.02239295958313847,
+      "learning_rate": 2.4864832713017316e-05,
+      "loss": 0.4868,
+      "step": 35820
+    },
+    {
+      "epoch": 1.8187483342006066,
+      "grad_norm": 0.022180419791008956,
+      "learning_rate": 2.479589035381402e-05,
+      "loss": 0.4144,
+      "step": 35825
+    },
+    {
+      "epoch": 1.819002170298638,
+      "grad_norm": 0.02094838835196554,
+      "learning_rate": 2.472704127480768e-05,
+      "loss": 0.4537,
+      "step": 35830
+    },
+    {
+      "epoch": 1.8192560063966696,
+      "grad_norm": 0.02255615727366666,
+      "learning_rate": 2.4658285489512876e-05,
+      "loss": 0.4575,
+      "step": 35835
+    },
+    {
+      "epoch": 1.8195098424947012,
+      "grad_norm": 0.024813587982619647,
+      "learning_rate": 2.45896230114262e-05,
+      "loss": 0.4705,
+      "step": 35840
+    },
+    {
+      "epoch": 1.8197636785927327,
+      "grad_norm": 0.024876925814751656,
+      "learning_rate": 2.4521053854025587e-05,
+      "loss": 0.443,
+      "step": 35845
+    },
+    {
+      "epoch": 1.8200175146907642,
+      "grad_norm": 0.022425131114226664,
+      "learning_rate": 2.4452578030771e-05,
+      "loss": 0.4437,
+      "step": 35850
+    },
+    {
+      "epoch": 1.8202713507887958,
+      "grad_norm": 0.02983896975643497,
+      "learning_rate": 2.4384195555103685e-05,
+      "loss": 0.4525,
+      "step": 35855
+    },
+    {
+      "epoch": 1.8205251868868273,
+      "grad_norm": 0.022587211482833724,
+      "learning_rate": 2.4315906440446956e-05,
+      "loss": 0.4341,
+      "step": 35860
+    },
+    {
+      "epoch": 1.8207790229848588,
+      "grad_norm": 0.024206758618488585,
+      "learning_rate": 2.4247710700205484e-05,
+      "loss": 0.4649,
+      "step": 35865
+    },
+    {
+      "epoch": 1.8210328590828901,
+      "grad_norm": 0.024478336072376698,
+      "learning_rate": 2.4179608347765948e-05,
+      "loss": 0.5033,
+      "step": 35870
+    },
+    {
+      "epoch": 1.8212866951809217,
+      "grad_norm": 0.02070813235705416,
+      "learning_rate": 2.4111599396496263e-05,
+      "loss": 0.4639,
+      "step": 35875
+    },
+    {
+      "epoch": 1.8215405312789532,
+      "grad_norm": 0.021774866578754614,
+      "learning_rate": 2.404368385974648e-05,
+      "loss": 0.4698,
+      "step": 35880
+    },
+    {
+      "epoch": 1.8217943673769845,
+      "grad_norm": 0.0212623746339756,
+      "learning_rate": 2.3975861750847872e-05,
+      "loss": 0.4637,
+      "step": 35885
+    },
+    {
+      "epoch": 1.822048203475016,
+      "grad_norm": 0.02128933031223815,
+      "learning_rate": 2.3908133083113627e-05,
+      "loss": 0.4293,
+      "step": 35890
+    },
+    {
+      "epoch": 1.8223020395730476,
+      "grad_norm": 0.021642236447562767,
+      "learning_rate": 2.3840497869838718e-05,
+      "loss": 0.4228,
+      "step": 35895
+    },
+    {
+      "epoch": 1.8225558756710791,
+      "grad_norm": 0.020495462348286645,
+      "learning_rate": 2.3772956124299416e-05,
+      "loss": 0.4498,
+      "step": 35900
+    },
+    {
+      "epoch": 1.8228097117691107,
+      "grad_norm": 0.019689106987887187,
+      "learning_rate": 2.3705507859753896e-05,
+      "loss": 0.4314,
+      "step": 35905
+    },
+    {
+      "epoch": 1.8230635478671422,
+      "grad_norm": 0.0217185094138313,
+      "learning_rate": 2.3638153089441893e-05,
+      "loss": 0.4535,
+      "step": 35910
+    },
+    {
+      "epoch": 1.8233173839651737,
+      "grad_norm": 0.02237290814529761,
+      "learning_rate": 2.357089182658484e-05,
+      "loss": 0.4633,
+      "step": 35915
+    },
+    {
+      "epoch": 1.8235712200632053,
+      "grad_norm": 0.02187816074803507,
+      "learning_rate": 2.350372408438578e-05,
+      "loss": 0.4592,
+      "step": 35920
+    },
+    {
+      "epoch": 1.8238250561612368,
+      "grad_norm": 0.022184120130896715,
+      "learning_rate": 2.343664987602939e-05,
+      "loss": 0.4646,
+      "step": 35925
+    },
+    {
+      "epoch": 1.8240788922592683,
+      "grad_norm": 0.019843601924149395,
+      "learning_rate": 2.3369669214681977e-05,
+      "loss": 0.4541,
+      "step": 35930
+    },
+    {
+      "epoch": 1.8243327283572996,
+      "grad_norm": 0.022400869249771683,
+      "learning_rate": 2.3302782113491628e-05,
+      "loss": 0.4757,
+      "step": 35935
+    },
+    {
+      "epoch": 1.8245865644553312,
+      "grad_norm": 0.02151836218094333,
+      "learning_rate": 2.3235988585587784e-05,
+      "loss": 0.4753,
+      "step": 35940
+    },
+    {
+      "epoch": 1.8248404005533627,
+      "grad_norm": 0.02301572497170209,
+      "learning_rate": 2.31692886440818e-05,
+      "loss": 0.4432,
+      "step": 35945
+    },
+    {
+      "epoch": 1.825094236651394,
+      "grad_norm": 0.019813493528431883,
+      "learning_rate": 2.3102682302066412e-05,
+      "loss": 0.4371,
+      "step": 35950
+    },
+    {
+      "epoch": 1.8253480727494256,
+      "grad_norm": 0.01992449587850107,
+      "learning_rate": 2.303616957261634e-05,
+      "loss": 0.4603,
+      "step": 35955
+    },
+    {
+      "epoch": 1.825601908847457,
+      "grad_norm": 0.023016076431343672,
+      "learning_rate": 2.2969750468787466e-05,
+      "loss": 0.4326,
+      "step": 35960
+    },
+    {
+      "epoch": 1.8258557449454886,
+      "grad_norm": 0.022191758185042577,
+      "learning_rate": 2.290342500361775e-05,
+      "loss": 0.4524,
+      "step": 35965
+    },
+    {
+      "epoch": 1.8261095810435202,
+      "grad_norm": 0.025349303799634692,
+      "learning_rate": 2.2837193190126282e-05,
+      "loss": 0.4524,
+      "step": 35970
+    },
+    {
+      "epoch": 1.8263634171415517,
+      "grad_norm": 0.020969407434920425,
+      "learning_rate": 2.2771055041314327e-05,
+      "loss": 0.4363,
+      "step": 35975
+    },
+    {
+      "epoch": 1.8266172532395832,
+      "grad_norm": 0.023006042793577052,
+      "learning_rate": 2.270501057016422e-05,
+      "loss": 0.4669,
+      "step": 35980
+    },
+    {
+      "epoch": 1.8268710893376148,
+      "grad_norm": 0.028873927723596555,
+      "learning_rate": 2.263905978964037e-05,
+      "loss": 0.4695,
+      "step": 35985
+    },
+    {
+      "epoch": 1.8271249254356463,
+      "grad_norm": 0.024116668482079474,
+      "learning_rate": 2.2573202712688367e-05,
+      "loss": 0.4777,
+      "step": 35990
+    },
+    {
+      "epoch": 1.8273787615336778,
+      "grad_norm": 0.024030296769852598,
+      "learning_rate": 2.250743935223587e-05,
+      "loss": 0.4963,
+      "step": 35995
+    },
+    {
+      "epoch": 1.8276325976317092,
+      "grad_norm": 0.026746408895009968,
+      "learning_rate": 2.2441769721191662e-05,
+      "loss": 0.4449,
+      "step": 36000
+    },
+    {
+      "epoch": 1.8278864337297407,
+      "grad_norm": 0.023888483474343483,
+      "learning_rate": 2.23761938324466e-05,
+      "loss": 0.4392,
+      "step": 36005
+    },
+    {
+      "epoch": 1.8281402698277722,
+      "grad_norm": 0.035099541692213014,
+      "learning_rate": 2.2310711698872665e-05,
+      "loss": 0.4504,
+      "step": 36010
+    },
+    {
+      "epoch": 1.8283941059258038,
+      "grad_norm": 0.02112342338748428,
+      "learning_rate": 2.224532333332385e-05,
+      "loss": 0.4618,
+      "step": 36015
+    },
+    {
+      "epoch": 1.828647942023835,
+      "grad_norm": 0.020712955873280085,
+      "learning_rate": 2.2180028748635506e-05,
+      "loss": 0.4715,
+      "step": 36020
+    },
+    {
+      "epoch": 1.8289017781218666,
+      "grad_norm": 0.02460324591191274,
+      "learning_rate": 2.2114827957624595e-05,
+      "loss": 0.4728,
+      "step": 36025
+    },
+    {
+      "epoch": 1.8291556142198981,
+      "grad_norm": 0.02098638010680304,
+      "learning_rate": 2.2049720973089825e-05,
+      "loss": 0.4343,
+      "step": 36030
+    },
+    {
+      "epoch": 1.8294094503179297,
+      "grad_norm": 0.021605128746199135,
+      "learning_rate": 2.19847078078112e-05,
+      "loss": 0.475,
+      "step": 36035
+    },
+    {
+      "epoch": 1.8296632864159612,
+      "grad_norm": 0.020744240541715135,
+      "learning_rate": 2.1919788474550673e-05,
+      "loss": 0.4333,
+      "step": 36040
+    },
+    {
+      "epoch": 1.8299171225139927,
+      "grad_norm": 0.021445317479955133,
+      "learning_rate": 2.185496298605144e-05,
+      "loss": 0.4572,
+      "step": 36045
+    },
+    {
+      "epoch": 1.8301709586120243,
+      "grad_norm": 0.025771996958530503,
+      "learning_rate": 2.1790231355038493e-05,
+      "loss": 0.4197,
+      "step": 36050
+    },
+    {
+      "epoch": 1.8304247947100558,
+      "grad_norm": 0.022223270474904383,
+      "learning_rate": 2.172559359421822e-05,
+      "loss": 0.4581,
+      "step": 36055
+    },
+    {
+      "epoch": 1.8306786308080873,
+      "grad_norm": 0.020030488071074923,
+      "learning_rate": 2.166104971627886e-05,
+      "loss": 0.4306,
+      "step": 36060
+    },
+    {
+      "epoch": 1.8309324669061189,
+      "grad_norm": 0.022278981780526303,
+      "learning_rate": 2.1596599733889888e-05,
+      "loss": 0.4406,
+      "step": 36065
+    },
+    {
+      "epoch": 1.8311863030041502,
+      "grad_norm": 0.0221757803883364,
+      "learning_rate": 2.1532243659702634e-05,
+      "loss": 0.4834,
+      "step": 36070
+    },
+    {
+      "epoch": 1.8314401391021817,
+      "grad_norm": 0.023275974115444426,
+      "learning_rate": 2.146798150634982e-05,
+      "loss": 0.4563,
+      "step": 36075
+    },
+    {
+      "epoch": 1.8316939752002133,
+      "grad_norm": 0.023221039443217652,
+      "learning_rate": 2.140381328644586e-05,
+      "loss": 0.4463,
+      "step": 36080
+    },
+    {
+      "epoch": 1.8319478112982446,
+      "grad_norm": 0.020569710112120266,
+      "learning_rate": 2.133973901258651e-05,
+      "loss": 0.4703,
+      "step": 36085
+    },
+    {
+      "epoch": 1.8322016473962761,
+      "grad_norm": 0.021785715748550184,
+      "learning_rate": 2.1275758697349434e-05,
+      "loss": 0.4624,
+      "step": 36090
+    },
+    {
+      "epoch": 1.8324554834943076,
+      "grad_norm": 0.02146143080616713,
+      "learning_rate": 2.1211872353293417e-05,
+      "loss": 0.4476,
+      "step": 36095
+    },
+    {
+      "epoch": 1.8327093195923392,
+      "grad_norm": 0.020350789547040154,
+      "learning_rate": 2.11480799929592e-05,
+      "loss": 0.4511,
+      "step": 36100
+    },
+    {
+      "epoch": 1.8329631556903707,
+      "grad_norm": 0.02933647592875351,
+      "learning_rate": 2.1084381628868833e-05,
+      "loss": 0.4251,
+      "step": 36105
+    },
+    {
+      "epoch": 1.8332169917884023,
+      "grad_norm": 0.02113714142733342,
+      "learning_rate": 2.1020777273526025e-05,
+      "loss": 0.4418,
+      "step": 36110
+    },
+    {
+      "epoch": 1.8334708278864338,
+      "grad_norm": 0.021965707443001067,
+      "learning_rate": 2.0957266939415965e-05,
+      "loss": 0.4415,
+      "step": 36115
+    },
+    {
+      "epoch": 1.8337246639844653,
+      "grad_norm": 0.02268716397256464,
+      "learning_rate": 2.0893850639005453e-05,
+      "loss": 0.457,
+      "step": 36120
+    },
+    {
+      "epoch": 1.8339785000824969,
+      "grad_norm": 0.025935687476320603,
+      "learning_rate": 2.0830528384742697e-05,
+      "loss": 0.4502,
+      "step": 36125
+    },
+    {
+      "epoch": 1.8342323361805284,
+      "grad_norm": 0.02355894973776656,
+      "learning_rate": 2.076730018905759e-05,
+      "loss": 0.4344,
+      "step": 36130
+    },
+    {
+      "epoch": 1.8344861722785597,
+      "grad_norm": 0.022212368167510682,
+      "learning_rate": 2.0704166064361596e-05,
+      "loss": 0.4486,
+      "step": 36135
+    },
+    {
+      "epoch": 1.8347400083765912,
+      "grad_norm": 0.02141121626813041,
+      "learning_rate": 2.0641126023047518e-05,
+      "loss": 0.4587,
+      "step": 36140
+    },
+    {
+      "epoch": 1.8349938444746228,
+      "grad_norm": 0.02163591724705281,
+      "learning_rate": 2.0578180077489905e-05,
+      "loss": 0.4409,
+      "step": 36145
+    },
+    {
+      "epoch": 1.835247680572654,
+      "grad_norm": 0.023066084860677204,
+      "learning_rate": 2.0515328240044594e-05,
+      "loss": 0.4499,
+      "step": 36150
+    },
+    {
+      "epoch": 1.8355015166706856,
+      "grad_norm": 0.022242330077506877,
+      "learning_rate": 2.0452570523049217e-05,
+      "loss": 0.4362,
+      "step": 36155
+    },
+    {
+      "epoch": 1.8357553527687172,
+      "grad_norm": 0.020051623391778836,
+      "learning_rate": 2.03899069388227e-05,
+      "loss": 0.4427,
+      "step": 36160
+    },
+    {
+      "epoch": 1.8360091888667487,
+      "grad_norm": 0.029608579217612622,
+      "learning_rate": 2.03273374996657e-05,
+      "loss": 0.4502,
+      "step": 36165
+    },
+    {
+      "epoch": 1.8362630249647802,
+      "grad_norm": 0.023737953896778508,
+      "learning_rate": 2.026486221786017e-05,
+      "loss": 0.4767,
+      "step": 36170
+    },
+    {
+      "epoch": 1.8365168610628118,
+      "grad_norm": 0.023375619205875876,
+      "learning_rate": 2.02024811056698e-05,
+      "loss": 0.4497,
+      "step": 36175
+    },
+    {
+      "epoch": 1.8367706971608433,
+      "grad_norm": 0.027714733020031652,
+      "learning_rate": 2.0140194175339575e-05,
+      "loss": 0.4693,
+      "step": 36180
+    },
+    {
+      "epoch": 1.8370245332588748,
+      "grad_norm": 0.02427169464452961,
+      "learning_rate": 2.0078001439096218e-05,
+      "loss": 0.4364,
+      "step": 36185
+    },
+    {
+      "epoch": 1.8372783693569064,
+      "grad_norm": 0.020026240969129284,
+      "learning_rate": 2.001590290914779e-05,
+      "loss": 0.4272,
+      "step": 36190
+    },
+    {
+      "epoch": 1.837532205454938,
+      "grad_norm": 0.022495330500840523,
+      "learning_rate": 1.9953898597683927e-05,
+      "loss": 0.4843,
+      "step": 36195
+    },
+    {
+      "epoch": 1.8377860415529692,
+      "grad_norm": 0.020360142174855975,
+      "learning_rate": 1.989198851687579e-05,
+      "loss": 0.4429,
+      "step": 36200
+    },
+    {
+      "epoch": 1.8380398776510007,
+      "grad_norm": 0.023413422223116345,
+      "learning_rate": 1.9830172678876103e-05,
+      "loss": 0.468,
+      "step": 36205
+    },
+    {
+      "epoch": 1.8382937137490323,
+      "grad_norm": 0.02298430069711076,
+      "learning_rate": 1.9768451095818818e-05,
+      "loss": 0.4567,
+      "step": 36210
+    },
+    {
+      "epoch": 1.8385475498470636,
+      "grad_norm": 0.02280937325321792,
+      "learning_rate": 1.9706823779819692e-05,
+      "loss": 0.442,
+      "step": 36215
+    },
+    {
+      "epoch": 1.8388013859450951,
+      "grad_norm": 0.021396894421407745,
+      "learning_rate": 1.964529074297583e-05,
+      "loss": 0.4653,
+      "step": 36220
+    },
+    {
+      "epoch": 1.8390552220431267,
+      "grad_norm": 0.020024815824637394,
+      "learning_rate": 1.9583851997365954e-05,
+      "loss": 0.4385,
+      "step": 36225
+    },
+    {
+      "epoch": 1.8393090581411582,
+      "grad_norm": 0.02174390016299535,
+      "learning_rate": 1.952250755505003e-05,
+      "loss": 0.4606,
+      "step": 36230
+    },
+    {
+      "epoch": 1.8395628942391897,
+      "grad_norm": 0.022477525249405196,
+      "learning_rate": 1.9461257428069755e-05,
+      "loss": 0.4434,
+      "step": 36235
+    },
+    {
+      "epoch": 1.8398167303372213,
+      "grad_norm": 0.0245485389384089,
+      "learning_rate": 1.9400101628448242e-05,
+      "loss": 0.4237,
+      "step": 36240
+    },
+    {
+      "epoch": 1.8400705664352528,
+      "grad_norm": 0.021224620960195745,
+      "learning_rate": 1.9339040168189937e-05,
+      "loss": 0.4724,
+      "step": 36245
+    },
+    {
+      "epoch": 1.8403244025332843,
+      "grad_norm": 0.023242450571638745,
+      "learning_rate": 1.927807305928109e-05,
+      "loss": 0.4638,
+      "step": 36250
+    },
+    {
+      "epoch": 1.8405782386313159,
+      "grad_norm": 0.020915924336955997,
+      "learning_rate": 1.921720031368901e-05,
+      "loss": 0.4405,
+      "step": 36255
+    },
+    {
+      "epoch": 1.8408320747293474,
+      "grad_norm": 0.02248775188519345,
+      "learning_rate": 1.9156421943362924e-05,
+      "loss": 0.4786,
+      "step": 36260
+    },
+    {
+      "epoch": 1.8410859108273787,
+      "grad_norm": 0.02148666149767411,
+      "learning_rate": 1.9095737960233228e-05,
+      "loss": 0.4436,
+      "step": 36265
+    },
+    {
+      "epoch": 1.8413397469254102,
+      "grad_norm": 0.02416587138910249,
+      "learning_rate": 1.903514837621201e-05,
+      "loss": 0.4714,
+      "step": 36270
+    },
+    {
+      "epoch": 1.8415935830234418,
+      "grad_norm": 0.023157173410389604,
+      "learning_rate": 1.897465320319247e-05,
+      "loss": 0.4338,
+      "step": 36275
+    },
+    {
+      "epoch": 1.8418474191214733,
+      "grad_norm": 0.029233910498072987,
+      "learning_rate": 1.891425245304973e-05,
+      "loss": 0.4753,
+      "step": 36280
+    },
+    {
+      "epoch": 1.8421012552195046,
+      "grad_norm": 0.02283659947545247,
+      "learning_rate": 1.8853946137639966e-05,
+      "loss": 0.4239,
+      "step": 36285
+    },
+    {
+      "epoch": 1.8423550913175362,
+      "grad_norm": 0.019978195457050724,
+      "learning_rate": 1.879373426880121e-05,
+      "loss": 0.4456,
+      "step": 36290
+    },
+    {
+      "epoch": 1.8426089274155677,
+      "grad_norm": 0.021687663942728044,
+      "learning_rate": 1.8733616858352564e-05,
+      "loss": 0.4522,
+      "step": 36295
+    },
+    {
+      "epoch": 1.8428627635135992,
+      "grad_norm": 0.019528780287524306,
+      "learning_rate": 1.8673593918094923e-05,
+      "loss": 0.4439,
+      "step": 36300
+    },
+    {
+      "epoch": 1.8431165996116308,
+      "grad_norm": 0.02155234027892195,
+      "learning_rate": 1.8613665459810357e-05,
+      "loss": 0.4574,
+      "step": 36305
+    },
+    {
+      "epoch": 1.8433704357096623,
+      "grad_norm": 0.0261456620147658,
+      "learning_rate": 1.8553831495262685e-05,
+      "loss": 0.4648,
+      "step": 36310
+    },
+    {
+      "epoch": 1.8436242718076938,
+      "grad_norm": 0.026088651944962685,
+      "learning_rate": 1.849409203619673e-05,
+      "loss": 0.4534,
+      "step": 36315
+    },
+    {
+      "epoch": 1.8438781079057254,
+      "grad_norm": 0.02423314397669894,
+      "learning_rate": 1.8434447094339446e-05,
+      "loss": 0.4652,
+      "step": 36320
+    },
+    {
+      "epoch": 1.844131944003757,
+      "grad_norm": 0.023517519839947224,
+      "learning_rate": 1.837489668139858e-05,
+      "loss": 0.4425,
+      "step": 36325
+    },
+    {
+      "epoch": 1.8443857801017884,
+      "grad_norm": 0.02513005496856768,
+      "learning_rate": 1.8315440809063554e-05,
+      "loss": 0.4472,
+      "step": 36330
+    },
+    {
+      "epoch": 1.8446396161998198,
+      "grad_norm": 0.021229880550278774,
+      "learning_rate": 1.8256079489005485e-05,
+      "loss": 0.4364,
+      "step": 36335
+    },
+    {
+      "epoch": 1.8448934522978513,
+      "grad_norm": 0.01841576739055947,
+      "learning_rate": 1.8196812732876434e-05,
+      "loss": 0.4649,
+      "step": 36340
+    },
+    {
+      "epoch": 1.8451472883958828,
+      "grad_norm": 0.022976094393190677,
+      "learning_rate": 1.8137640552310374e-05,
+      "loss": 0.4356,
+      "step": 36345
+    },
+    {
+      "epoch": 1.8454011244939141,
+      "grad_norm": 0.02206953417030893,
+      "learning_rate": 1.807856295892235e-05,
+      "loss": 0.4636,
+      "step": 36350
+    },
+    {
+      "epoch": 1.8456549605919457,
+      "grad_norm": 0.02132833882982846,
+      "learning_rate": 1.801957996430914e-05,
+      "loss": 0.4605,
+      "step": 36355
+    },
+    {
+      "epoch": 1.8459087966899772,
+      "grad_norm": 0.025669102961975235,
+      "learning_rate": 1.7960691580048705e-05,
+      "loss": 0.4491,
+      "step": 36360
+    },
+    {
+      "epoch": 1.8461626327880087,
+      "grad_norm": 0.0256888308875704,
+      "learning_rate": 1.7901897817700685e-05,
+      "loss": 0.4626,
+      "step": 36365
+    },
+    {
+      "epoch": 1.8464164688860403,
+      "grad_norm": 0.022264542254564815,
+      "learning_rate": 1.7843198688805793e-05,
+      "loss": 0.4803,
+      "step": 36370
+    },
+    {
+      "epoch": 1.8466703049840718,
+      "grad_norm": 0.023955069464813527,
+      "learning_rate": 1.7784594204886485e-05,
+      "loss": 0.4728,
+      "step": 36375
+    },
+    {
+      "epoch": 1.8469241410821033,
+      "grad_norm": 0.028377687030451647,
+      "learning_rate": 1.772608437744655e-05,
+      "loss": 0.4523,
+      "step": 36380
+    },
+    {
+      "epoch": 1.8471779771801349,
+      "grad_norm": 0.02256742111449494,
+      "learning_rate": 1.7667669217971195e-05,
+      "loss": 0.4608,
+      "step": 36385
+    },
+    {
+      "epoch": 1.8474318132781664,
+      "grad_norm": 0.020948702040254757,
+      "learning_rate": 1.7609348737926968e-05,
+      "loss": 0.4561,
+      "step": 36390
+    },
+    {
+      "epoch": 1.847685649376198,
+      "grad_norm": 0.02215357030539792,
+      "learning_rate": 1.7551122948761932e-05,
+      "loss": 0.4575,
+      "step": 36395
+    },
+    {
+      "epoch": 1.8479394854742293,
+      "grad_norm": 0.021140043797980356,
+      "learning_rate": 1.7492991861905394e-05,
+      "loss": 0.4541,
+      "step": 36400
+    },
+    {
+      "epoch": 1.8481933215722608,
+      "grad_norm": 0.02126277301347097,
+      "learning_rate": 1.7434955488768445e-05,
+      "loss": 0.4592,
+      "step": 36405
+    },
+    {
+      "epoch": 1.8484471576702923,
+      "grad_norm": 0.020846498654345385,
+      "learning_rate": 1.7377013840743083e-05,
+      "loss": 0.4505,
+      "step": 36410
+    },
+    {
+      "epoch": 1.8487009937683236,
+      "grad_norm": 0.024918017001830127,
+      "learning_rate": 1.73191669292031e-05,
+      "loss": 0.4629,
+      "step": 36415
+    },
+    {
+      "epoch": 1.8489548298663552,
+      "grad_norm": 0.02250400370693143,
+      "learning_rate": 1.726141476550347e-05,
+      "loss": 0.4666,
+      "step": 36420
+    },
+    {
+      "epoch": 1.8492086659643867,
+      "grad_norm": 0.019638524615499656,
+      "learning_rate": 1.720375736098079e-05,
+      "loss": 0.4416,
+      "step": 36425
+    },
+    {
+      "epoch": 1.8494625020624182,
+      "grad_norm": 0.022038205053431632,
+      "learning_rate": 1.7146194726952778e-05,
+      "loss": 0.4335,
+      "step": 36430
+    },
+    {
+      "epoch": 1.8497163381604498,
+      "grad_norm": 0.022686280035144835,
+      "learning_rate": 1.708872687471874e-05,
+      "loss": 0.4578,
+      "step": 36435
+    },
+    {
+      "epoch": 1.8499701742584813,
+      "grad_norm": 0.025248657754128545,
+      "learning_rate": 1.7031353815559425e-05,
+      "loss": 0.4814,
+      "step": 36440
+    },
+    {
+      "epoch": 1.8502240103565128,
+      "grad_norm": 0.021988387262798826,
+      "learning_rate": 1.697407556073671e-05,
+      "loss": 0.4446,
+      "step": 36445
+    },
+    {
+      "epoch": 1.8504778464545444,
+      "grad_norm": 0.021771081888713024,
+      "learning_rate": 1.6916892121494166e-05,
+      "loss": 0.4603,
+      "step": 36450
+    },
+    {
+      "epoch": 1.850731682552576,
+      "grad_norm": 0.019874477923159228,
+      "learning_rate": 1.6859803509056527e-05,
+      "loss": 0.4329,
+      "step": 36455
+    },
+    {
+      "epoch": 1.8509855186506075,
+      "grad_norm": 0.02171798205791076,
+      "learning_rate": 1.680280973463011e-05,
+      "loss": 0.449,
+      "step": 36460
+    },
+    {
+      "epoch": 1.8512393547486388,
+      "grad_norm": 0.02199931724655234,
+      "learning_rate": 1.674591080940241e-05,
+      "loss": 0.4585,
+      "step": 36465
+    },
+    {
+      "epoch": 1.8514931908466703,
+      "grad_norm": 0.02577647787558014,
+      "learning_rate": 1.6689106744542437e-05,
+      "loss": 0.4542,
+      "step": 36470
+    },
+    {
+      "epoch": 1.8517470269447018,
+      "grad_norm": 0.023435984326753145,
+      "learning_rate": 1.6632397551200496e-05,
+      "loss": 0.4436,
+      "step": 36475
+    },
+    {
+      "epoch": 1.8520008630427331,
+      "grad_norm": 0.020355449487897765,
+      "learning_rate": 1.6575783240508458e-05,
+      "loss": 0.4355,
+      "step": 36480
+    },
+    {
+      "epoch": 1.8522546991407647,
+      "grad_norm": 0.0207668042498274,
+      "learning_rate": 1.6519263823579213e-05,
+      "loss": 0.4557,
+      "step": 36485
+    },
+    {
+      "epoch": 1.8525085352387962,
+      "grad_norm": 0.0236812491259185,
+      "learning_rate": 1.6462839311507494e-05,
+      "loss": 0.4519,
+      "step": 36490
+    },
+    {
+      "epoch": 1.8527623713368278,
+      "grad_norm": 0.026595024576517964,
+      "learning_rate": 1.640650971536889e-05,
+      "loss": 0.4644,
+      "step": 36495
+    },
+    {
+      "epoch": 1.8530162074348593,
+      "grad_norm": 0.022762821590263934,
+      "learning_rate": 1.635027504622083e-05,
+      "loss": 0.4323,
+      "step": 36500
+    },
+    {
+      "epoch": 1.8532700435328908,
+      "grad_norm": 0.025364089662246653,
+      "learning_rate": 1.6294135315101765e-05,
+      "loss": 0.4326,
+      "step": 36505
+    },
+    {
+      "epoch": 1.8535238796309224,
+      "grad_norm": 0.021616350106393015,
+      "learning_rate": 1.6238090533031825e-05,
+      "loss": 0.4613,
+      "step": 36510
+    },
+    {
+      "epoch": 1.853777715728954,
+      "grad_norm": 0.022173841898150175,
+      "learning_rate": 1.6182140711012095e-05,
+      "loss": 0.4623,
+      "step": 36515
+    },
+    {
+      "epoch": 1.8540315518269854,
+      "grad_norm": 0.021041461117188336,
+      "learning_rate": 1.6126285860025403e-05,
+      "loss": 0.4418,
+      "step": 36520
+    },
+    {
+      "epoch": 1.854285387925017,
+      "grad_norm": 0.03205910168306328,
+      "learning_rate": 1.6070525991035646e-05,
+      "loss": 0.4613,
+      "step": 36525
+    },
+    {
+      "epoch": 1.8545392240230483,
+      "grad_norm": 0.02175375824381657,
+      "learning_rate": 1.6014861114988343e-05,
+      "loss": 0.4306,
+      "step": 36530
+    },
+    {
+      "epoch": 1.8547930601210798,
+      "grad_norm": 0.024123083015002662,
+      "learning_rate": 1.5959291242810146e-05,
+      "loss": 0.4818,
+      "step": 36535
+    },
+    {
+      "epoch": 1.8550468962191113,
+      "grad_norm": 0.022298076921305057,
+      "learning_rate": 1.590381638540922e-05,
+      "loss": 0.4215,
+      "step": 36540
+    },
+    {
+      "epoch": 1.8553007323171429,
+      "grad_norm": 0.026828035039684898,
+      "learning_rate": 1.5848436553674905e-05,
+      "loss": 0.456,
+      "step": 36545
+    },
+    {
+      "epoch": 1.8555545684151742,
+      "grad_norm": 0.02438509358051607,
+      "learning_rate": 1.5793151758478064e-05,
+      "loss": 0.4759,
+      "step": 36550
+    },
+    {
+      "epoch": 1.8558084045132057,
+      "grad_norm": 0.022150780639142196,
+      "learning_rate": 1.5737962010670738e-05,
+      "loss": 0.43,
+      "step": 36555
+    },
+    {
+      "epoch": 1.8560622406112373,
+      "grad_norm": 0.02587280793684847,
+      "learning_rate": 1.5682867321086482e-05,
+      "loss": 0.4301,
+      "step": 36560
+    },
+    {
+      "epoch": 1.8563160767092688,
+      "grad_norm": 0.02196586338846343,
+      "learning_rate": 1.5627867700540144e-05,
+      "loss": 0.4392,
+      "step": 36565
+    },
+    {
+      "epoch": 1.8565699128073003,
+      "grad_norm": 0.02236881612109358,
+      "learning_rate": 1.557296315982776e-05,
+      "loss": 0.4533,
+      "step": 36570
+    },
+    {
+      "epoch": 1.8568237489053319,
+      "grad_norm": 0.021640783844340757,
+      "learning_rate": 1.5518153709726922e-05,
+      "loss": 0.4356,
+      "step": 36575
+    },
+    {
+      "epoch": 1.8570775850033634,
+      "grad_norm": 0.021850601092974337,
+      "learning_rate": 1.5463439360996367e-05,
+      "loss": 0.4601,
+      "step": 36580
+    },
+    {
+      "epoch": 1.857331421101395,
+      "grad_norm": 0.022863672714919226,
+      "learning_rate": 1.5408820124376277e-05,
+      "loss": 0.4623,
+      "step": 36585
+    },
+    {
+      "epoch": 1.8575852571994265,
+      "grad_norm": 0.02052988156416512,
+      "learning_rate": 1.535429601058813e-05,
+      "loss": 0.4182,
+      "step": 36590
+    },
+    {
+      "epoch": 1.8578390932974578,
+      "grad_norm": 0.01930622632761717,
+      "learning_rate": 1.5299867030334813e-05,
+      "loss": 0.4422,
+      "step": 36595
+    },
+    {
+      "epoch": 1.8580929293954893,
+      "grad_norm": 0.020532303166287503,
+      "learning_rate": 1.5245533194300387e-05,
+      "loss": 0.4742,
+      "step": 36600
+    },
+    {
+      "epoch": 1.8583467654935208,
+      "grad_norm": 0.021453897011547724,
+      "learning_rate": 1.5191294513150322e-05,
+      "loss": 0.4438,
+      "step": 36605
+    },
+    {
+      "epoch": 1.8586006015915524,
+      "grad_norm": 0.021567947200895245,
+      "learning_rate": 1.5137150997531379e-05,
+      "loss": 0.4645,
+      "step": 36610
+    },
+    {
+      "epoch": 1.8588544376895837,
+      "grad_norm": 0.02120114038014877,
+      "learning_rate": 1.5083102658071667e-05,
+      "loss": 0.4417,
+      "step": 36615
+    },
+    {
+      "epoch": 1.8591082737876152,
+      "grad_norm": 0.02211469006670524,
+      "learning_rate": 1.5029149505380647e-05,
+      "loss": 0.4773,
+      "step": 36620
+    },
+    {
+      "epoch": 1.8593621098856468,
+      "grad_norm": 0.023182166431144506,
+      "learning_rate": 1.4975291550049063e-05,
+      "loss": 0.4537,
+      "step": 36625
+    },
+    {
+      "epoch": 1.8596159459836783,
+      "grad_norm": 0.019178819226592782,
+      "learning_rate": 1.492152880264891e-05,
+      "loss": 0.4491,
+      "step": 36630
+    },
+    {
+      "epoch": 1.8598697820817098,
+      "grad_norm": 0.026391292018557753,
+      "learning_rate": 1.4867861273733629e-05,
+      "loss": 0.4437,
+      "step": 36635
+    },
+    {
+      "epoch": 1.8601236181797414,
+      "grad_norm": 0.02545968689149914,
+      "learning_rate": 1.4814288973837742e-05,
+      "loss": 0.4414,
+      "step": 36640
+    },
+    {
+      "epoch": 1.860377454277773,
+      "grad_norm": 0.02262304039898802,
+      "learning_rate": 1.4760811913477389e-05,
+      "loss": 0.4643,
+      "step": 36645
+    },
+    {
+      "epoch": 1.8606312903758044,
+      "grad_norm": 0.022296739978009437,
+      "learning_rate": 1.4707430103149732e-05,
+      "loss": 0.4651,
+      "step": 36650
+    },
+    {
+      "epoch": 1.860885126473836,
+      "grad_norm": 0.025642319286203506,
+      "learning_rate": 1.4654143553333387e-05,
+      "loss": 0.4729,
+      "step": 36655
+    },
+    {
+      "epoch": 1.8611389625718675,
+      "grad_norm": 0.02273311192918183,
+      "learning_rate": 1.4600952274488265e-05,
+      "loss": 0.4587,
+      "step": 36660
+    },
+    {
+      "epoch": 1.8613927986698988,
+      "grad_norm": 0.02116679503489843,
+      "learning_rate": 1.4547856277055571e-05,
+      "loss": 0.4557,
+      "step": 36665
+    },
+    {
+      "epoch": 1.8616466347679304,
+      "grad_norm": 0.021316010404225146,
+      "learning_rate": 1.4494855571457633e-05,
+      "loss": 0.4697,
+      "step": 36670
+    },
+    {
+      "epoch": 1.8619004708659619,
+      "grad_norm": 0.02350879271004433,
+      "learning_rate": 1.4441950168098406e-05,
+      "loss": 0.4464,
+      "step": 36675
+    },
+    {
+      "epoch": 1.8621543069639932,
+      "grad_norm": 0.023258720770063683,
+      "learning_rate": 1.4389140077362916e-05,
+      "loss": 0.4375,
+      "step": 36680
+    },
+    {
+      "epoch": 1.8624081430620247,
+      "grad_norm": 0.021273891527511624,
+      "learning_rate": 1.433642530961743e-05,
+      "loss": 0.4552,
+      "step": 36685
+    },
+    {
+      "epoch": 1.8626619791600563,
+      "grad_norm": 0.02136026789458103,
+      "learning_rate": 1.4283805875209721e-05,
+      "loss": 0.4587,
+      "step": 36690
+    },
+    {
+      "epoch": 1.8629158152580878,
+      "grad_norm": 0.02214731165919245,
+      "learning_rate": 1.4231281784468587e-05,
+      "loss": 0.4264,
+      "step": 36695
+    },
+    {
+      "epoch": 1.8631696513561193,
+      "grad_norm": 0.02350780328623469,
+      "learning_rate": 1.4178853047704388e-05,
+      "loss": 0.4499,
+      "step": 36700
+    },
+    {
+      "epoch": 1.8634234874541509,
+      "grad_norm": 0.03141213231106275,
+      "learning_rate": 1.412651967520845e-05,
+      "loss": 0.4694,
+      "step": 36705
+    },
+    {
+      "epoch": 1.8636773235521824,
+      "grad_norm": 0.023866885124020652,
+      "learning_rate": 1.4074281677253719e-05,
+      "loss": 0.4456,
+      "step": 36710
+    },
+    {
+      "epoch": 1.863931159650214,
+      "grad_norm": 0.020431070665186554,
+      "learning_rate": 1.4022139064094164e-05,
+      "loss": 0.4463,
+      "step": 36715
+    },
+    {
+      "epoch": 1.8641849957482455,
+      "grad_norm": 0.021642398766930707,
+      "learning_rate": 1.3970091845965205e-05,
+      "loss": 0.4466,
+      "step": 36720
+    },
+    {
+      "epoch": 1.864438831846277,
+      "grad_norm": 0.02411974345336584,
+      "learning_rate": 1.3918140033083338e-05,
+      "loss": 0.4958,
+      "step": 36725
+    },
+    {
+      "epoch": 1.8646926679443083,
+      "grad_norm": 0.024224467721648102,
+      "learning_rate": 1.3866283635646515e-05,
+      "loss": 0.4691,
+      "step": 36730
+    },
+    {
+      "epoch": 1.8649465040423399,
+      "grad_norm": 0.02057402120977364,
+      "learning_rate": 1.3814522663833761e-05,
+      "loss": 0.4473,
+      "step": 36735
+    },
+    {
+      "epoch": 1.8652003401403714,
+      "grad_norm": 0.021182543447772897,
+      "learning_rate": 1.3762857127805727e-05,
+      "loss": 0.4408,
+      "step": 36740
+    },
+    {
+      "epoch": 1.8654541762384027,
+      "grad_norm": 0.025082497648913876,
+      "learning_rate": 1.3711287037703913e-05,
+      "loss": 0.4422,
+      "step": 36745
+    },
+    {
+      "epoch": 1.8657080123364342,
+      "grad_norm": 0.023947880929159032,
+      "learning_rate": 1.3659812403651439e-05,
+      "loss": 0.464,
+      "step": 36750
+    },
+    {
+      "epoch": 1.8659618484344658,
+      "grad_norm": 0.0255745124388733,
+      "learning_rate": 1.3608433235752282e-05,
+      "loss": 0.4467,
+      "step": 36755
+    },
+    {
+      "epoch": 1.8662156845324973,
+      "grad_norm": 0.02307802412282208,
+      "learning_rate": 1.355714954409215e-05,
+      "loss": 0.4477,
+      "step": 36760
+    },
+    {
+      "epoch": 1.8664695206305288,
+      "grad_norm": 0.02612322796667025,
+      "learning_rate": 1.3505961338737604e-05,
+      "loss": 0.4463,
+      "step": 36765
+    },
+    {
+      "epoch": 1.8667233567285604,
+      "grad_norm": 0.029078431865484707,
+      "learning_rate": 1.3454868629736771e-05,
+      "loss": 0.4497,
+      "step": 36770
+    },
+    {
+      "epoch": 1.866977192826592,
+      "grad_norm": 0.025822399311491818,
+      "learning_rate": 1.3403871427118798e-05,
+      "loss": 0.4527,
+      "step": 36775
+    },
+    {
+      "epoch": 1.8672310289246234,
+      "grad_norm": 0.022291834000361617,
+      "learning_rate": 1.3352969740894228e-05,
+      "loss": 0.4711,
+      "step": 36780
+    },
+    {
+      "epoch": 1.867484865022655,
+      "grad_norm": 0.02647393100605352,
+      "learning_rate": 1.3302163581054793e-05,
+      "loss": 0.4574,
+      "step": 36785
+    },
+    {
+      "epoch": 1.8677387011206865,
+      "grad_norm": 0.023446096336786612,
+      "learning_rate": 1.3251452957573517e-05,
+      "loss": 0.4738,
+      "step": 36790
+    },
+    {
+      "epoch": 1.8679925372187178,
+      "grad_norm": 0.02127291076715947,
+      "learning_rate": 1.3200837880404548e-05,
+      "loss": 0.4877,
+      "step": 36795
+    },
+    {
+      "epoch": 1.8682463733167494,
+      "grad_norm": 0.024906256232495056,
+      "learning_rate": 1.3150318359483437e-05,
+      "loss": 0.447,
+      "step": 36800
+    },
+    {
+      "epoch": 1.868500209414781,
+      "grad_norm": 0.02057976417447868,
+      "learning_rate": 1.3099894404726976e-05,
+      "loss": 0.4736,
+      "step": 36805
+    },
+    {
+      "epoch": 1.8687540455128122,
+      "grad_norm": 0.02492191020600024,
+      "learning_rate": 1.3049566026033022e-05,
+      "loss": 0.4711,
+      "step": 36810
+    },
+    {
+      "epoch": 1.8690078816108437,
+      "grad_norm": 0.02420601274081891,
+      "learning_rate": 1.2999333233280896e-05,
+      "loss": 0.4365,
+      "step": 36815
+    },
+    {
+      "epoch": 1.8692617177088753,
+      "grad_norm": 0.020254390572854473,
+      "learning_rate": 1.294919603633088e-05,
+      "loss": 0.439,
+      "step": 36820
+    },
+    {
+      "epoch": 1.8695155538069068,
+      "grad_norm": 0.02474683239453854,
+      "learning_rate": 1.2899154445024874e-05,
+      "loss": 0.4541,
+      "step": 36825
+    },
+    {
+      "epoch": 1.8697693899049384,
+      "grad_norm": 0.03001458417928382,
+      "learning_rate": 1.2849208469185636e-05,
+      "loss": 0.4811,
+      "step": 36830
+    },
+    {
+      "epoch": 1.8700232260029699,
+      "grad_norm": 0.021337265901394033,
+      "learning_rate": 1.2799358118617377e-05,
+      "loss": 0.4643,
+      "step": 36835
+    },
+    {
+      "epoch": 1.8702770621010014,
+      "grad_norm": 0.02208717085882638,
+      "learning_rate": 1.2749603403105437e-05,
+      "loss": 0.4623,
+      "step": 36840
+    },
+    {
+      "epoch": 1.870530898199033,
+      "grad_norm": 0.02100059123832777,
+      "learning_rate": 1.2699944332416502e-05,
+      "loss": 0.4391,
+      "step": 36845
+    },
+    {
+      "epoch": 1.8707847342970645,
+      "grad_norm": 0.023998206914377488,
+      "learning_rate": 1.2650380916298222e-05,
+      "loss": 0.4945,
+      "step": 36850
+    },
+    {
+      "epoch": 1.871038570395096,
+      "grad_norm": 0.022644141112716502,
+      "learning_rate": 1.2600913164479811e-05,
+      "loss": 0.443,
+      "step": 36855
+    },
+    {
+      "epoch": 1.8712924064931273,
+      "grad_norm": 0.019765022307224444,
+      "learning_rate": 1.2551541086671447e-05,
+      "loss": 0.4424,
+      "step": 36860
+    },
+    {
+      "epoch": 1.8715462425911589,
+      "grad_norm": 0.021694636824496228,
+      "learning_rate": 1.2502264692564768e-05,
+      "loss": 0.4575,
+      "step": 36865
+    },
+    {
+      "epoch": 1.8718000786891904,
+      "grad_norm": 0.020689782081024555,
+      "learning_rate": 1.2453083991832258e-05,
+      "loss": 0.4464,
+      "step": 36870
+    },
+    {
+      "epoch": 1.872053914787222,
+      "grad_norm": 0.02173020264000965,
+      "learning_rate": 1.2403998994128085e-05,
+      "loss": 0.4299,
+      "step": 36875
+    },
+    {
+      "epoch": 1.8723077508852533,
+      "grad_norm": 0.023926802677357032,
+      "learning_rate": 1.2355009709087205e-05,
+      "loss": 0.4544,
+      "step": 36880
+    },
+    {
+      "epoch": 1.8725615869832848,
+      "grad_norm": 0.024662532571645124,
+      "learning_rate": 1.2306116146326096e-05,
+      "loss": 0.4503,
+      "step": 36885
+    },
+    {
+      "epoch": 1.8728154230813163,
+      "grad_norm": 0.022866472003658402,
+      "learning_rate": 1.225731831544219e-05,
+      "loss": 0.4684,
+      "step": 36890
+    },
+    {
+      "epoch": 1.8730692591793479,
+      "grad_norm": 0.02737229931132882,
+      "learning_rate": 1.220861622601438e-05,
+      "loss": 0.4691,
+      "step": 36895
+    },
+    {
+      "epoch": 1.8733230952773794,
+      "grad_norm": 0.02045519299338699,
+      "learning_rate": 1.2160009887602575e-05,
+      "loss": 0.439,
+      "step": 36900
+    },
+    {
+      "epoch": 1.873576931375411,
+      "grad_norm": 0.02105021426485451,
+      "learning_rate": 1.2111499309747975e-05,
+      "loss": 0.4512,
+      "step": 36905
+    },
+    {
+      "epoch": 1.8738307674734425,
+      "grad_norm": 0.023234909473193354,
+      "learning_rate": 1.2063084501972966e-05,
+      "loss": 0.4605,
+      "step": 36910
+    },
+    {
+      "epoch": 1.874084603571474,
+      "grad_norm": 0.02495377423784143,
+      "learning_rate": 1.2014765473781053e-05,
+      "loss": 0.4709,
+      "step": 36915
+    },
+    {
+      "epoch": 1.8743384396695055,
+      "grad_norm": 0.01982317350160172,
+      "learning_rate": 1.1966542234657208e-05,
+      "loss": 0.4205,
+      "step": 36920
+    },
+    {
+      "epoch": 1.874592275767537,
+      "grad_norm": 0.023080934785440023,
+      "learning_rate": 1.1918414794067244e-05,
+      "loss": 0.4355,
+      "step": 36925
+    },
+    {
+      "epoch": 1.8748461118655684,
+      "grad_norm": 0.022797451618634154,
+      "learning_rate": 1.1870383161458497e-05,
+      "loss": 0.4465,
+      "step": 36930
+    },
+    {
+      "epoch": 1.8750999479636,
+      "grad_norm": 0.0241812112755731,
+      "learning_rate": 1.182244734625909e-05,
+      "loss": 0.4426,
+      "step": 36935
+    },
+    {
+      "epoch": 1.8753537840616314,
+      "grad_norm": 0.023434936541055627,
+      "learning_rate": 1.1774607357878886e-05,
+      "loss": 0.4484,
+      "step": 36940
+    },
+    {
+      "epoch": 1.8756076201596628,
+      "grad_norm": 0.02410828643170135,
+      "learning_rate": 1.1726863205708372e-05,
+      "loss": 0.4408,
+      "step": 36945
+    },
+    {
+      "epoch": 1.8758614562576943,
+      "grad_norm": 0.022411527527885312,
+      "learning_rate": 1.1679214899119605e-05,
+      "loss": 0.4561,
+      "step": 36950
+    },
+    {
+      "epoch": 1.8761152923557258,
+      "grad_norm": 0.021883107784838412,
+      "learning_rate": 1.1631662447465719e-05,
+      "loss": 0.4449,
+      "step": 36955
+    },
+    {
+      "epoch": 1.8763691284537574,
+      "grad_norm": 0.02541344155482163,
+      "learning_rate": 1.1584205860081021e-05,
+      "loss": 0.4598,
+      "step": 36960
+    },
+    {
+      "epoch": 1.876622964551789,
+      "grad_norm": 0.02445820739790313,
+      "learning_rate": 1.153684514628095e-05,
+      "loss": 0.4213,
+      "step": 36965
+    },
+    {
+      "epoch": 1.8768768006498204,
+      "grad_norm": 0.022274003744240302,
+      "learning_rate": 1.1489580315362292e-05,
+      "loss": 0.4674,
+      "step": 36970
+    },
+    {
+      "epoch": 1.877130636747852,
+      "grad_norm": 0.02302479732283661,
+      "learning_rate": 1.1442411376602679e-05,
+      "loss": 0.4447,
+      "step": 36975
+    },
+    {
+      "epoch": 1.8773844728458835,
+      "grad_norm": 0.023099210723607017,
+      "learning_rate": 1.139533833926143e-05,
+      "loss": 0.4334,
+      "step": 36980
+    },
+    {
+      "epoch": 1.877638308943915,
+      "grad_norm": 0.02557626233760811,
+      "learning_rate": 1.1348361212578484e-05,
+      "loss": 0.4435,
+      "step": 36985
+    },
+    {
+      "epoch": 1.8778921450419466,
+      "grad_norm": 0.02197010518349448,
+      "learning_rate": 1.1301480005775412e-05,
+      "loss": 0.4739,
+      "step": 36990
+    },
+    {
+      "epoch": 1.8781459811399779,
+      "grad_norm": 0.025263565398403336,
+      "learning_rate": 1.1254694728054626e-05,
+      "loss": 0.4556,
+      "step": 36995
+    },
+    {
+      "epoch": 1.8783998172380094,
+      "grad_norm": 0.021885371029893786,
+      "learning_rate": 1.1208005388599951e-05,
+      "loss": 0.4652,
+      "step": 37000
+    },
+    {
+      "epoch": 1.878653653336041,
+      "grad_norm": 0.023486472540742757,
+      "learning_rate": 1.1161411996576165e-05,
+      "loss": 0.4667,
+      "step": 37005
+    },
+    {
+      "epoch": 1.8789074894340723,
+      "grad_norm": 0.022717357060794766,
+      "learning_rate": 1.1114914561129396e-05,
+      "loss": 0.4579,
+      "step": 37010
+    },
+    {
+      "epoch": 1.8791613255321038,
+      "grad_norm": 0.021029880026985255,
+      "learning_rate": 1.106851309138679e-05,
+      "loss": 0.4903,
+      "step": 37015
+    },
+    {
+      "epoch": 1.8794151616301353,
+      "grad_norm": 0.020524349147354494,
+      "learning_rate": 1.1022207596456835e-05,
+      "loss": 0.4325,
+      "step": 37020
+    },
+    {
+      "epoch": 1.8796689977281669,
+      "grad_norm": 0.026662156059636153,
+      "learning_rate": 1.0975998085428984e-05,
+      "loss": 0.4224,
+      "step": 37025
+    },
+    {
+      "epoch": 1.8799228338261984,
+      "grad_norm": 0.02148292116153404,
+      "learning_rate": 1.0929884567373927e-05,
+      "loss": 0.4643,
+      "step": 37030
+    },
+    {
+      "epoch": 1.88017666992423,
+      "grad_norm": 0.030321978223338382,
+      "learning_rate": 1.0883867051343533e-05,
+      "loss": 0.4631,
+      "step": 37035
+    },
+    {
+      "epoch": 1.8804305060222615,
+      "grad_norm": 0.02090326478040996,
+      "learning_rate": 1.0837945546370798e-05,
+      "loss": 0.4325,
+      "step": 37040
+    },
+    {
+      "epoch": 1.880684342120293,
+      "grad_norm": 0.02145454006132866,
+      "learning_rate": 1.0792120061469956e-05,
+      "loss": 0.4412,
+      "step": 37045
+    },
+    {
+      "epoch": 1.8809381782183245,
+      "grad_norm": 0.022566197322495922,
+      "learning_rate": 1.0746390605636259e-05,
+      "loss": 0.463,
+      "step": 37050
+    },
+    {
+      "epoch": 1.881192014316356,
+      "grad_norm": 0.027247127025454444,
+      "learning_rate": 1.0700757187846188e-05,
+      "loss": 0.4749,
+      "step": 37055
+    },
+    {
+      "epoch": 1.8814458504143874,
+      "grad_norm": 0.02154384889356073,
+      "learning_rate": 1.065521981705736e-05,
+      "loss": 0.429,
+      "step": 37060
+    },
+    {
+      "epoch": 1.881699686512419,
+      "grad_norm": 0.026084199584670894,
+      "learning_rate": 1.0609778502208512e-05,
+      "loss": 0.4497,
+      "step": 37065
+    },
+    {
+      "epoch": 1.8819535226104505,
+      "grad_norm": 0.020525637238970808,
+      "learning_rate": 1.0564433252219507e-05,
+      "loss": 0.4537,
+      "step": 37070
+    },
+    {
+      "epoch": 1.8822073587084818,
+      "grad_norm": 0.020483461060641535,
+      "learning_rate": 1.0519184075991505e-05,
+      "loss": 0.4505,
+      "step": 37075
+    },
+    {
+      "epoch": 1.8824611948065133,
+      "grad_norm": 0.02326068817822505,
+      "learning_rate": 1.0474030982406624e-05,
+      "loss": 0.4732,
+      "step": 37080
+    },
+    {
+      "epoch": 1.8827150309045448,
+      "grad_norm": 0.019163506578638276,
+      "learning_rate": 1.0428973980328216e-05,
+      "loss": 0.441,
+      "step": 37085
+    },
+    {
+      "epoch": 1.8829688670025764,
+      "grad_norm": 0.018667241598925556,
+      "learning_rate": 1.038401307860065e-05,
+      "loss": 0.435,
+      "step": 37090
+    },
+    {
+      "epoch": 1.883222703100608,
+      "grad_norm": 0.02123912140480303,
+      "learning_rate": 1.0339148286049705e-05,
+      "loss": 0.4478,
+      "step": 37095
+    },
+    {
+      "epoch": 1.8834765391986394,
+      "grad_norm": 0.021748725396268195,
+      "learning_rate": 1.0294379611481885e-05,
+      "loss": 0.4382,
+      "step": 37100
+    },
+    {
+      "epoch": 1.883730375296671,
+      "grad_norm": 0.021313373453137146,
+      "learning_rate": 1.0249707063685277e-05,
+      "loss": 0.4929,
+      "step": 37105
+    },
+    {
+      "epoch": 1.8839842113947025,
+      "grad_norm": 0.020814925335928592,
+      "learning_rate": 1.0205130651428806e-05,
+      "loss": 0.4589,
+      "step": 37110
+    },
+    {
+      "epoch": 1.884238047492734,
+      "grad_norm": 0.019883791376370075,
+      "learning_rate": 1.0160650383462588e-05,
+      "loss": 0.4311,
+      "step": 37115
+    },
+    {
+      "epoch": 1.8844918835907656,
+      "grad_norm": 0.025265388503289,
+      "learning_rate": 1.0116266268517805e-05,
+      "loss": 0.4132,
+      "step": 37120
+    },
+    {
+      "epoch": 1.884745719688797,
+      "grad_norm": 0.024819525773685097,
+      "learning_rate": 1.0071978315306984e-05,
+      "loss": 0.4526,
+      "step": 37125
+    },
+    {
+      "epoch": 1.8849995557868284,
+      "grad_norm": 0.02126190229536664,
+      "learning_rate": 1.0027786532523508e-05,
+      "loss": 0.4589,
+      "step": 37130
+    },
+    {
+      "epoch": 1.88525339188486,
+      "grad_norm": 0.022454913487583175,
+      "learning_rate": 9.983690928842105e-06,
+      "loss": 0.4721,
+      "step": 37135
+    },
+    {
+      "epoch": 1.8855072279828915,
+      "grad_norm": 0.03277420429629867,
+      "learning_rate": 9.939691512918404e-06,
+      "loss": 0.4367,
+      "step": 37140
+    },
+    {
+      "epoch": 1.8857610640809228,
+      "grad_norm": 0.022196508671986213,
+      "learning_rate": 9.895788293389385e-06,
+      "loss": 0.4626,
+      "step": 37145
+    },
+    {
+      "epoch": 1.8860149001789543,
+      "grad_norm": 0.022168660914226923,
+      "learning_rate": 9.851981278872878e-06,
+      "loss": 0.4689,
+      "step": 37150
+    },
+    {
+      "epoch": 1.8862687362769859,
+      "grad_norm": 0.02563443384862334,
+      "learning_rate": 9.808270477968173e-06,
+      "loss": 0.4455,
+      "step": 37155
+    },
+    {
+      "epoch": 1.8865225723750174,
+      "grad_norm": 0.021376673277626792,
+      "learning_rate": 9.764655899255347e-06,
+      "loss": 0.4679,
+      "step": 37160
+    },
+    {
+      "epoch": 1.886776408473049,
+      "grad_norm": 0.026018940765089024,
+      "learning_rate": 9.721137551295778e-06,
+      "loss": 0.4601,
+      "step": 37165
+    },
+    {
+      "epoch": 1.8870302445710805,
+      "grad_norm": 0.023643143077169092,
+      "learning_rate": 9.677715442631962e-06,
+      "loss": 0.4454,
+      "step": 37170
+    },
+    {
+      "epoch": 1.887284080669112,
+      "grad_norm": 0.025955682817119883,
+      "learning_rate": 9.63438958178725e-06,
+      "loss": 0.47,
+      "step": 37175
+    },
+    {
+      "epoch": 1.8875379167671436,
+      "grad_norm": 0.02096935393978991,
+      "learning_rate": 9.591159977266506e-06,
+      "loss": 0.4425,
+      "step": 37180
+    },
+    {
+      "epoch": 1.887791752865175,
+      "grad_norm": 0.018616468472140536,
+      "learning_rate": 9.54802663755533e-06,
+      "loss": 0.4415,
+      "step": 37185
+    },
+    {
+      "epoch": 1.8880455889632066,
+      "grad_norm": 0.02545228535890788,
+      "learning_rate": 9.504989571120726e-06,
+      "loss": 0.4677,
+      "step": 37190
+    },
+    {
+      "epoch": 1.888299425061238,
+      "grad_norm": 0.022999000833790844,
+      "learning_rate": 9.462048786410492e-06,
+      "loss": 0.4482,
+      "step": 37195
+    },
+    {
+      "epoch": 1.8885532611592695,
+      "grad_norm": 0.02212084208825238,
+      "learning_rate": 9.419204291853834e-06,
+      "loss": 0.4339,
+      "step": 37200
+    },
+    {
+      "epoch": 1.888807097257301,
+      "grad_norm": 0.024284937751072763,
+      "learning_rate": 9.376456095860798e-06,
+      "loss": 0.4601,
+      "step": 37205
+    },
+    {
+      "epoch": 1.8890609333553323,
+      "grad_norm": 0.02382615118490866,
+      "learning_rate": 9.333804206822726e-06,
+      "loss": 0.4606,
+      "step": 37210
+    },
+    {
+      "epoch": 1.8893147694533639,
+      "grad_norm": 0.021486485098027707,
+      "learning_rate": 9.291248633111927e-06,
+      "loss": 0.436,
+      "step": 37215
+    },
+    {
+      "epoch": 1.8895686055513954,
+      "grad_norm": 0.02341752435747304,
+      "learning_rate": 9.248789383081879e-06,
+      "loss": 0.4725,
+      "step": 37220
+    },
+    {
+      "epoch": 1.889822441649427,
+      "grad_norm": 0.027777059793671188,
+      "learning_rate": 9.206426465067031e-06,
+      "loss": 0.4456,
+      "step": 37225
+    },
+    {
+      "epoch": 1.8900762777474585,
+      "grad_norm": 0.021359684770520977,
+      "learning_rate": 9.164159887383172e-06,
+      "loss": 0.4423,
+      "step": 37230
+    },
+    {
+      "epoch": 1.89033011384549,
+      "grad_norm": 0.02349495122091391,
+      "learning_rate": 9.12198965832689e-06,
+      "loss": 0.46,
+      "step": 37235
+    },
+    {
+      "epoch": 1.8905839499435215,
+      "grad_norm": 0.022308573567443774,
+      "learning_rate": 9.079915786176063e-06,
+      "loss": 0.4566,
+      "step": 37240
+    },
+    {
+      "epoch": 1.890837786041553,
+      "grad_norm": 0.020770175767240735,
+      "learning_rate": 9.037938279189528e-06,
+      "loss": 0.4211,
+      "step": 37245
+    },
+    {
+      "epoch": 1.8910916221395846,
+      "grad_norm": 0.0222815680616627,
+      "learning_rate": 8.996057145607306e-06,
+      "loss": 0.4519,
+      "step": 37250
+    },
+    {
+      "epoch": 1.8913454582376161,
+      "grad_norm": 0.02139157051540678,
+      "learning_rate": 8.95427239365043e-06,
+      "loss": 0.4423,
+      "step": 37255
+    },
+    {
+      "epoch": 1.8915992943356474,
+      "grad_norm": 0.021039178876140388,
+      "learning_rate": 8.912584031521065e-06,
+      "loss": 0.4599,
+      "step": 37260
+    },
+    {
+      "epoch": 1.891853130433679,
+      "grad_norm": 0.031078033789477582,
+      "learning_rate": 8.870992067402384e-06,
+      "loss": 0.4544,
+      "step": 37265
+    },
+    {
+      "epoch": 1.8921069665317105,
+      "grad_norm": 0.02651545626840905,
+      "learning_rate": 8.82949650945869e-06,
+      "loss": 0.4506,
+      "step": 37270
+    },
+    {
+      "epoch": 1.8923608026297418,
+      "grad_norm": 0.018840248290589404,
+      "learning_rate": 8.788097365835358e-06,
+      "loss": 0.4068,
+      "step": 37275
+    },
+    {
+      "epoch": 1.8926146387277734,
+      "grad_norm": 0.02098145796800753,
+      "learning_rate": 8.746794644658828e-06,
+      "loss": 0.4254,
+      "step": 37280
+    },
+    {
+      "epoch": 1.892868474825805,
+      "grad_norm": 0.023778470835364184,
+      "learning_rate": 8.705588354036676e-06,
+      "loss": 0.4727,
+      "step": 37285
+    },
+    {
+      "epoch": 1.8931223109238364,
+      "grad_norm": 0.019800277404283806,
+      "learning_rate": 8.664478502057427e-06,
+      "loss": 0.4189,
+      "step": 37290
+    },
+    {
+      "epoch": 1.893376147021868,
+      "grad_norm": 0.022580776759673696,
+      "learning_rate": 8.623465096790794e-06,
+      "loss": 0.4688,
+      "step": 37295
+    },
+    {
+      "epoch": 1.8936299831198995,
+      "grad_norm": 0.024504692780699923,
+      "learning_rate": 8.582548146287395e-06,
+      "loss": 0.4415,
+      "step": 37300
+    },
+    {
+      "epoch": 1.893883819217931,
+      "grad_norm": 0.023773581217463372,
+      "learning_rate": 8.541727658579191e-06,
+      "loss": 0.4546,
+      "step": 37305
+    },
+    {
+      "epoch": 1.8941376553159626,
+      "grad_norm": 0.02457580274560056,
+      "learning_rate": 8.501003641678885e-06,
+      "loss": 0.4493,
+      "step": 37310
+    },
+    {
+      "epoch": 1.894391491413994,
+      "grad_norm": 0.023069551301224907,
+      "learning_rate": 8.460376103580526e-06,
+      "loss": 0.4494,
+      "step": 37315
+    },
+    {
+      "epoch": 1.8946453275120256,
+      "grad_norm": 0.02501756577031343,
+      "learning_rate": 8.419845052258956e-06,
+      "loss": 0.4785,
+      "step": 37320
+    },
+    {
+      "epoch": 1.894899163610057,
+      "grad_norm": 0.025329116897176897,
+      "learning_rate": 8.37941049567037e-06,
+      "loss": 0.4603,
+      "step": 37325
+    },
+    {
+      "epoch": 1.8951529997080885,
+      "grad_norm": 0.023975660111064893,
+      "learning_rate": 8.339072441751749e-06,
+      "loss": 0.4487,
+      "step": 37330
+    },
+    {
+      "epoch": 1.89540683580612,
+      "grad_norm": 0.022681786439959096,
+      "learning_rate": 8.298830898421316e-06,
+      "loss": 0.4756,
+      "step": 37335
+    },
+    {
+      "epoch": 1.8956606719041513,
+      "grad_norm": 0.019839744977402773,
+      "learning_rate": 8.258685873578198e-06,
+      "loss": 0.4743,
+      "step": 37340
+    },
+    {
+      "epoch": 1.8959145080021829,
+      "grad_norm": 0.029110595946668913,
+      "learning_rate": 8.218637375102866e-06,
+      "loss": 0.4219,
+      "step": 37345
+    },
+    {
+      "epoch": 1.8961683441002144,
+      "grad_norm": 0.02468212485542408,
+      "learning_rate": 8.178685410856424e-06,
+      "loss": 0.4636,
+      "step": 37350
+    },
+    {
+      "epoch": 1.896422180198246,
+      "grad_norm": 0.023021032923387984,
+      "learning_rate": 8.138829988681318e-06,
+      "loss": 0.4565,
+      "step": 37355
+    },
+    {
+      "epoch": 1.8966760162962775,
+      "grad_norm": 0.021806816479458407,
+      "learning_rate": 8.09907111640107e-06,
+      "loss": 0.4547,
+      "step": 37360
+    },
+    {
+      "epoch": 1.896929852394309,
+      "grad_norm": 0.028433865062280395,
+      "learning_rate": 8.059408801819934e-06,
+      "loss": 0.4506,
+      "step": 37365
+    },
+    {
+      "epoch": 1.8971836884923405,
+      "grad_norm": 0.024983454984375437,
+      "learning_rate": 8.01984305272363e-06,
+      "loss": 0.4492,
+      "step": 37370
+    },
+    {
+      "epoch": 1.897437524590372,
+      "grad_norm": 0.022957101384112662,
+      "learning_rate": 7.98037387687861e-06,
+      "loss": 0.4389,
+      "step": 37375
+    },
+    {
+      "epoch": 1.8976913606884036,
+      "grad_norm": 0.018439604142735946,
+      "learning_rate": 7.941001282032512e-06,
+      "loss": 0.4117,
+      "step": 37380
+    },
+    {
+      "epoch": 1.8979451967864351,
+      "grad_norm": 0.027207353586784774,
+      "learning_rate": 7.90172527591393e-06,
+      "loss": 0.4506,
+      "step": 37385
+    },
+    {
+      "epoch": 1.8981990328844665,
+      "grad_norm": 0.027055180572066115,
+      "learning_rate": 7.862545866232585e-06,
+      "loss": 0.4689,
+      "step": 37390
+    },
+    {
+      "epoch": 1.898452868982498,
+      "grad_norm": 0.026896364230939135,
+      "learning_rate": 7.823463060679215e-06,
+      "loss": 0.4432,
+      "step": 37395
+    },
+    {
+      "epoch": 1.8987067050805295,
+      "grad_norm": 0.02423512351795487,
+      "learning_rate": 7.784476866925571e-06,
+      "loss": 0.4585,
+      "step": 37400
+    },
+    {
+      "epoch": 1.898960541178561,
+      "grad_norm": 0.02515565929962076,
+      "learning_rate": 7.745587292624423e-06,
+      "loss": 0.4585,
+      "step": 37405
+    },
+    {
+      "epoch": 1.8992143772765924,
+      "grad_norm": 0.020797485192393404,
+      "learning_rate": 7.706794345409662e-06,
+      "loss": 0.4633,
+      "step": 37410
+    },
+    {
+      "epoch": 1.899468213374624,
+      "grad_norm": 0.021221061620204006,
+      "learning_rate": 7.668098032896086e-06,
+      "loss": 0.4474,
+      "step": 37415
+    },
+    {
+      "epoch": 1.8997220494726554,
+      "grad_norm": 0.02165854297022104,
+      "learning_rate": 7.629498362679621e-06,
+      "loss": 0.4516,
+      "step": 37420
+    },
+    {
+      "epoch": 1.899975885570687,
+      "grad_norm": 0.021157573658216487,
+      "learning_rate": 7.590995342337148e-06,
+      "loss": 0.451,
+      "step": 37425
+    },
+    {
+      "epoch": 1.9002297216687185,
+      "grad_norm": 0.022283635644036758,
+      "learning_rate": 7.552588979426733e-06,
+      "loss": 0.4755,
+      "step": 37430
+    },
+    {
+      "epoch": 1.90048355776675,
+      "grad_norm": 0.024413711078674752,
+      "learning_rate": 7.514279281487179e-06,
+      "loss": 0.4585,
+      "step": 37435
+    },
+    {
+      "epoch": 1.9007373938647816,
+      "grad_norm": 0.022093763977813735,
+      "learning_rate": 7.476066256038638e-06,
+      "loss": 0.4473,
+      "step": 37440
+    },
+    {
+      "epoch": 1.9009912299628131,
+      "grad_norm": 0.021626286158038993,
+      "learning_rate": 7.437949910581998e-06,
+      "loss": 0.4354,
+      "step": 37445
+    },
+    {
+      "epoch": 1.9012450660608446,
+      "grad_norm": 0.0273165842843997,
+      "learning_rate": 7.399930252599496e-06,
+      "loss": 0.4737,
+      "step": 37450
+    },
+    {
+      "epoch": 1.901498902158876,
+      "grad_norm": 0.023472801346320262,
+      "learning_rate": 7.362007289553996e-06,
+      "loss": 0.4606,
+      "step": 37455
+    },
+    {
+      "epoch": 1.9017527382569075,
+      "grad_norm": 0.025848419421606466,
+      "learning_rate": 7.324181028889709e-06,
+      "loss": 0.4439,
+      "step": 37460
+    },
+    {
+      "epoch": 1.902006574354939,
+      "grad_norm": 0.022098946912295272,
+      "learning_rate": 7.286451478031753e-06,
+      "loss": 0.4525,
+      "step": 37465
+    },
+    {
+      "epoch": 1.9022604104529706,
+      "grad_norm": 0.020132077745634438,
+      "learning_rate": 7.2488186443862015e-06,
+      "loss": 0.4505,
+      "step": 37470
+    },
+    {
+      "epoch": 1.9025142465510019,
+      "grad_norm": 0.021696897376979055,
+      "learning_rate": 7.211282535340202e-06,
+      "loss": 0.4805,
+      "step": 37475
+    },
+    {
+      "epoch": 1.9027680826490334,
+      "grad_norm": 0.027185595143177976,
+      "learning_rate": 7.173843158261861e-06,
+      "loss": 0.4553,
+      "step": 37480
+    },
+    {
+      "epoch": 1.903021918747065,
+      "grad_norm": 0.02056624187081856,
+      "learning_rate": 7.136500520500466e-06,
+      "loss": 0.4245,
+      "step": 37485
+    },
+    {
+      "epoch": 1.9032757548450965,
+      "grad_norm": 0.020926738183475128,
+      "learning_rate": 7.0992546293860425e-06,
+      "loss": 0.4228,
+      "step": 37490
+    },
+    {
+      "epoch": 1.903529590943128,
+      "grad_norm": 0.028740551485723114,
+      "learning_rate": 7.062105492229909e-06,
+      "loss": 0.459,
+      "step": 37495
+    },
+    {
+      "epoch": 1.9037834270411595,
+      "grad_norm": 0.053682388754172104,
+      "learning_rate": 7.02505311632412e-06,
+      "loss": 0.4424,
+      "step": 37500
+    },
+    {
+      "epoch": 1.904037263139191,
+      "grad_norm": 0.02173561656109306,
+      "learning_rate": 6.988097508942026e-06,
+      "loss": 0.4822,
+      "step": 37505
+    },
+    {
+      "epoch": 1.9042910992372226,
+      "grad_norm": 0.020845652238898626,
+      "learning_rate": 6.951238677337657e-06,
+      "loss": 0.4551,
+      "step": 37510
+    },
+    {
+      "epoch": 1.9045449353352542,
+      "grad_norm": 0.022442971026077695,
+      "learning_rate": 6.914476628746391e-06,
+      "loss": 0.4574,
+      "step": 37515
+    },
+    {
+      "epoch": 1.9047987714332857,
+      "grad_norm": 0.019100573728677607,
+      "learning_rate": 6.8778113703842345e-06,
+      "loss": 0.441,
+      "step": 37520
+    },
+    {
+      "epoch": 1.905052607531317,
+      "grad_norm": 0.022661668431873734,
+      "learning_rate": 6.8412429094485975e-06,
+      "loss": 0.4653,
+      "step": 37525
+    },
+    {
+      "epoch": 1.9053064436293485,
+      "grad_norm": 0.022521358187594164,
+      "learning_rate": 6.80477125311757e-06,
+      "loss": 0.459,
+      "step": 37530
+    },
+    {
+      "epoch": 1.90556027972738,
+      "grad_norm": 0.02587584279617253,
+      "learning_rate": 6.768396408550426e-06,
+      "loss": 0.434,
+      "step": 37535
+    },
+    {
+      "epoch": 1.9058141158254114,
+      "grad_norm": 0.02722351063157932,
+      "learning_rate": 6.732118382887287e-06,
+      "loss": 0.4388,
+      "step": 37540
+    },
+    {
+      "epoch": 1.906067951923443,
+      "grad_norm": 0.02166094929641842,
+      "learning_rate": 6.695937183249401e-06,
+      "loss": 0.4502,
+      "step": 37545
+    },
+    {
+      "epoch": 1.9063217880214745,
+      "grad_norm": 0.02157001533468999,
+      "learning_rate": 6.6598528167389205e-06,
+      "loss": 0.4463,
+      "step": 37550
+    },
+    {
+      "epoch": 1.906575624119506,
+      "grad_norm": 0.02241970910672534,
+      "learning_rate": 6.623865290439068e-06,
+      "loss": 0.4456,
+      "step": 37555
+    },
+    {
+      "epoch": 1.9068294602175375,
+      "grad_norm": 0.021738008804449923,
+      "learning_rate": 6.587974611413972e-06,
+      "loss": 0.4634,
+      "step": 37560
+    },
+    {
+      "epoch": 1.907083296315569,
+      "grad_norm": 0.021239611101420534,
+      "learning_rate": 6.552180786708828e-06,
+      "loss": 0.4402,
+      "step": 37565
+    },
+    {
+      "epoch": 1.9073371324136006,
+      "grad_norm": 0.02101083088232794,
+      "learning_rate": 6.516483823349795e-06,
+      "loss": 0.4725,
+      "step": 37570
+    },
+    {
+      "epoch": 1.9075909685116321,
+      "grad_norm": 0.025164203687074636,
+      "learning_rate": 6.480883728343989e-06,
+      "loss": 0.4614,
+      "step": 37575
+    },
+    {
+      "epoch": 1.9078448046096637,
+      "grad_norm": 0.023582969514340343,
+      "learning_rate": 6.445380508679488e-06,
+      "loss": 0.4138,
+      "step": 37580
+    },
+    {
+      "epoch": 1.9080986407076952,
+      "grad_norm": 0.018915533270275835,
+      "learning_rate": 6.4099741713254945e-06,
+      "loss": 0.432,
+      "step": 37585
+    },
+    {
+      "epoch": 1.9083524768057265,
+      "grad_norm": 0.02275092358064681,
+      "learning_rate": 6.374664723232004e-06,
+      "loss": 0.4535,
+      "step": 37590
+    },
+    {
+      "epoch": 1.908606312903758,
+      "grad_norm": 0.022085053146593946,
+      "learning_rate": 6.33945217133014e-06,
+      "loss": 0.4422,
+      "step": 37595
+    },
+    {
+      "epoch": 1.9088601490017896,
+      "grad_norm": 0.020867827787237652,
+      "learning_rate": 6.304336522531928e-06,
+      "loss": 0.462,
+      "step": 37600
+    },
+    {
+      "epoch": 1.9091139850998209,
+      "grad_norm": 0.024270037364154493,
+      "learning_rate": 6.26931778373041e-06,
+      "loss": 0.4475,
+      "step": 37605
+    },
+    {
+      "epoch": 1.9093678211978524,
+      "grad_norm": 0.019787562403379675,
+      "learning_rate": 6.234395961799588e-06,
+      "loss": 0.4427,
+      "step": 37610
+    },
+    {
+      "epoch": 1.909621657295884,
+      "grad_norm": 0.021795151563263483,
+      "learning_rate": 6.199571063594423e-06,
+      "loss": 0.4609,
+      "step": 37615
+    },
+    {
+      "epoch": 1.9098754933939155,
+      "grad_norm": 0.025237377838803695,
+      "learning_rate": 6.164843095950889e-06,
+      "loss": 0.4763,
+      "step": 37620
+    },
+    {
+      "epoch": 1.910129329491947,
+      "grad_norm": 0.0213538500394289,
+      "learning_rate": 6.13021206568587e-06,
+      "loss": 0.4668,
+      "step": 37625
+    },
+    {
+      "epoch": 1.9103831655899786,
+      "grad_norm": 0.019996110614226716,
+      "learning_rate": 6.095677979597314e-06,
+      "loss": 0.4321,
+      "step": 37630
+    },
+    {
+      "epoch": 1.91063700168801,
+      "grad_norm": 0.0239910603152135,
+      "learning_rate": 6.0612408444640775e-06,
+      "loss": 0.4418,
+      "step": 37635
+    },
+    {
+      "epoch": 1.9108908377860416,
+      "grad_norm": 0.01982225962400516,
+      "learning_rate": 6.026900667045976e-06,
+      "loss": 0.452,
+      "step": 37640
+    },
+    {
+      "epoch": 1.9111446738840732,
+      "grad_norm": 0.020978657467764606,
+      "learning_rate": 5.992657454083839e-06,
+      "loss": 0.4343,
+      "step": 37645
+    },
+    {
+      "epoch": 1.9113985099821047,
+      "grad_norm": 0.021612862899188023,
+      "learning_rate": 5.958511212299455e-06,
+      "loss": 0.4522,
+      "step": 37650
+    },
+    {
+      "epoch": 1.911652346080136,
+      "grad_norm": 0.02111789726848896,
+      "learning_rate": 5.9244619483955206e-06,
+      "loss": 0.4635,
+      "step": 37655
+    },
+    {
+      "epoch": 1.9119061821781675,
+      "grad_norm": 0.026163308653677195,
+      "learning_rate": 5.890509669055799e-06,
+      "loss": 0.4711,
+      "step": 37660
+    },
+    {
+      "epoch": 1.912160018276199,
+      "grad_norm": 0.021019925153574526,
+      "learning_rate": 5.856654380944848e-06,
+      "loss": 0.4837,
+      "step": 37665
+    },
+    {
+      "epoch": 1.9124138543742304,
+      "grad_norm": 0.027602896310674144,
+      "learning_rate": 5.822896090708407e-06,
+      "loss": 0.4239,
+      "step": 37670
+    },
+    {
+      "epoch": 1.912667690472262,
+      "grad_norm": 0.021167535487248926,
+      "learning_rate": 5.789234804972954e-06,
+      "loss": 0.4552,
+      "step": 37675
+    },
+    {
+      "epoch": 1.9129215265702935,
+      "grad_norm": 0.023086996155861482,
+      "learning_rate": 5.755670530346146e-06,
+      "loss": 0.4514,
+      "step": 37680
+    },
+    {
+      "epoch": 1.913175362668325,
+      "grad_norm": 0.022681878507942586,
+      "learning_rate": 5.722203273416326e-06,
+      "loss": 0.4658,
+      "step": 37685
+    },
+    {
+      "epoch": 1.9134291987663565,
+      "grad_norm": 0.02359410198455559,
+      "learning_rate": 5.6888330407531275e-06,
+      "loss": 0.4964,
+      "step": 37690
+    },
+    {
+      "epoch": 1.913683034864388,
+      "grad_norm": 0.03200500971381236,
+      "learning_rate": 5.6555598389068656e-06,
+      "loss": 0.4776,
+      "step": 37695
+    },
+    {
+      "epoch": 1.9139368709624196,
+      "grad_norm": 0.022712836996455054,
+      "learning_rate": 5.622383674408871e-06,
+      "loss": 0.4376,
+      "step": 37700
+    },
+    {
+      "epoch": 1.9141907070604511,
+      "grad_norm": 0.025060341466893926,
+      "learning_rate": 5.589304553771546e-06,
+      "loss": 0.458,
+      "step": 37705
+    },
+    {
+      "epoch": 1.9144445431584827,
+      "grad_norm": 0.030278170291182304,
+      "learning_rate": 5.556322483488086e-06,
+      "loss": 0.4499,
+      "step": 37710
+    },
+    {
+      "epoch": 1.9146983792565142,
+      "grad_norm": 0.02740154808328099,
+      "learning_rate": 5.523437470032755e-06,
+      "loss": 0.4415,
+      "step": 37715
+    },
+    {
+      "epoch": 1.9149522153545455,
+      "grad_norm": 0.02153574679548554,
+      "learning_rate": 5.4906495198607246e-06,
+      "loss": 0.4241,
+      "step": 37720
+    },
+    {
+      "epoch": 1.915206051452577,
+      "grad_norm": 0.017912518417006917,
+      "learning_rate": 5.457958639408067e-06,
+      "loss": 0.4443,
+      "step": 37725
+    },
+    {
+      "epoch": 1.9154598875506086,
+      "grad_norm": 0.02149002570361707,
+      "learning_rate": 5.425364835091817e-06,
+      "loss": 0.4269,
+      "step": 37730
+    },
+    {
+      "epoch": 1.9157137236486401,
+      "grad_norm": 0.020655007242153935,
+      "learning_rate": 5.392868113310023e-06,
+      "loss": 0.4742,
+      "step": 37735
+    },
+    {
+      "epoch": 1.9159675597466714,
+      "grad_norm": 0.02053084592337424,
+      "learning_rate": 5.3604684804416385e-06,
+      "loss": 0.4562,
+      "step": 37740
+    },
+    {
+      "epoch": 1.916221395844703,
+      "grad_norm": 0.0241087565783307,
+      "learning_rate": 5.328165942846519e-06,
+      "loss": 0.4665,
+      "step": 37745
+    },
+    {
+      "epoch": 1.9164752319427345,
+      "grad_norm": 0.022566883002394126,
+      "learning_rate": 5.2959605068654825e-06,
+      "loss": 0.4356,
+      "step": 37750
+    },
+    {
+      "epoch": 1.916729068040766,
+      "grad_norm": 0.02242988197114318,
+      "learning_rate": 5.263852178820305e-06,
+      "loss": 0.4291,
+      "step": 37755
+    },
+    {
+      "epoch": 1.9169829041387976,
+      "grad_norm": 0.020849242044697184,
+      "learning_rate": 5.231840965013668e-06,
+      "loss": 0.4608,
+      "step": 37760
+    },
+    {
+      "epoch": 1.917236740236829,
+      "grad_norm": 0.02353852633397079,
+      "learning_rate": 5.199926871729321e-06,
+      "loss": 0.476,
+      "step": 37765
+    },
+    {
+      "epoch": 1.9174905763348606,
+      "grad_norm": 0.022019434675896826,
+      "learning_rate": 5.1681099052317545e-06,
+      "loss": 0.4421,
+      "step": 37770
+    },
+    {
+      "epoch": 1.9177444124328922,
+      "grad_norm": 0.02349710133273863,
+      "learning_rate": 5.136390071766472e-06,
+      "loss": 0.4404,
+      "step": 37775
+    },
+    {
+      "epoch": 1.9179982485309237,
+      "grad_norm": 0.023170632986613243,
+      "learning_rate": 5.104767377559938e-06,
+      "loss": 0.4633,
+      "step": 37780
+    },
+    {
+      "epoch": 1.9182520846289552,
+      "grad_norm": 0.020601169329142837,
+      "learning_rate": 5.073241828819519e-06,
+      "loss": 0.4633,
+      "step": 37785
+    },
+    {
+      "epoch": 1.9185059207269866,
+      "grad_norm": 0.020705639764346858,
+      "learning_rate": 5.041813431733544e-06,
+      "loss": 0.441,
+      "step": 37790
+    },
+    {
+      "epoch": 1.918759756825018,
+      "grad_norm": 0.02556826389734915,
+      "learning_rate": 5.010482192471244e-06,
+      "loss": 0.4671,
+      "step": 37795
+    },
+    {
+      "epoch": 1.9190135929230496,
+      "grad_norm": 0.025036129806181157,
+      "learning_rate": 4.9792481171828105e-06,
+      "loss": 0.4469,
+      "step": 37800
+    },
+    {
+      "epoch": 1.919267429021081,
+      "grad_norm": 0.022980559401410746,
+      "learning_rate": 4.948111211999284e-06,
+      "loss": 0.454,
+      "step": 37805
+    },
+    {
+      "epoch": 1.9195212651191125,
+      "grad_norm": 0.02211175879102414,
+      "learning_rate": 4.917071483032665e-06,
+      "loss": 0.4545,
+      "step": 37810
+    },
+    {
+      "epoch": 1.919775101217144,
+      "grad_norm": 0.021321225074512732,
+      "learning_rate": 4.886128936375966e-06,
+      "loss": 0.4651,
+      "step": 37815
+    },
+    {
+      "epoch": 1.9200289373151755,
+      "grad_norm": 0.021856946004611112,
+      "learning_rate": 4.855283578103054e-06,
+      "loss": 0.4445,
+      "step": 37820
+    },
+    {
+      "epoch": 1.920282773413207,
+      "grad_norm": 0.020187327224539992,
+      "learning_rate": 4.824535414268638e-06,
+      "loss": 0.4632,
+      "step": 37825
+    },
+    {
+      "epoch": 1.9205366095112386,
+      "grad_norm": 0.023155717064296796,
+      "learning_rate": 4.793884450908559e-06,
+      "loss": 0.4565,
+      "step": 37830
+    },
+    {
+      "epoch": 1.9207904456092701,
+      "grad_norm": 0.023520053434749763,
+      "learning_rate": 4.763330694039281e-06,
+      "loss": 0.4615,
+      "step": 37835
+    },
+    {
+      "epoch": 1.9210442817073017,
+      "grad_norm": 0.02152645989454147,
+      "learning_rate": 4.7328741496585615e-06,
+      "loss": 0.4672,
+      "step": 37840
+    },
+    {
+      "epoch": 1.9212981178053332,
+      "grad_norm": 0.020920871384161287,
+      "learning_rate": 4.7025148237446745e-06,
+      "loss": 0.462,
+      "step": 37845
+    },
+    {
+      "epoch": 1.9215519539033648,
+      "grad_norm": 0.02171301262060699,
+      "learning_rate": 4.672252722257076e-06,
+      "loss": 0.4595,
+      "step": 37850
+    },
+    {
+      "epoch": 1.921805790001396,
+      "grad_norm": 0.02813041922076858,
+      "learning_rate": 4.642087851136123e-06,
+      "loss": 0.4462,
+      "step": 37855
+    },
+    {
+      "epoch": 1.9220596260994276,
+      "grad_norm": 0.025961513788743708,
+      "learning_rate": 4.61202021630297e-06,
+      "loss": 0.4425,
+      "step": 37860
+    },
+    {
+      "epoch": 1.9223134621974591,
+      "grad_norm": 0.031507528395826975,
+      "learning_rate": 4.582049823659673e-06,
+      "loss": 0.4624,
+      "step": 37865
+    },
+    {
+      "epoch": 1.9225672982954904,
+      "grad_norm": 0.022048534282361677,
+      "learning_rate": 4.55217667908947e-06,
+      "loss": 0.467,
+      "step": 37870
+    },
+    {
+      "epoch": 1.922821134393522,
+      "grad_norm": 0.02306257762735943,
+      "learning_rate": 4.522400788456115e-06,
+      "loss": 0.4674,
+      "step": 37875
+    },
+    {
+      "epoch": 1.9230749704915535,
+      "grad_norm": 0.02387832694700697,
+      "learning_rate": 4.492722157604545e-06,
+      "loss": 0.4673,
+      "step": 37880
+    },
+    {
+      "epoch": 1.923328806589585,
+      "grad_norm": 0.020366760561243185,
+      "learning_rate": 4.463140792360487e-06,
+      "loss": 0.4333,
+      "step": 37885
+    },
+    {
+      "epoch": 1.9235826426876166,
+      "grad_norm": 0.02130836354004238,
+      "learning_rate": 4.433656698530741e-06,
+      "loss": 0.4599,
+      "step": 37890
+    },
+    {
+      "epoch": 1.9238364787856481,
+      "grad_norm": 0.0206503846500878,
+      "learning_rate": 4.404269881902734e-06,
+      "loss": 0.4508,
+      "step": 37895
+    },
+    {
+      "epoch": 1.9240903148836797,
+      "grad_norm": 0.02300274852597502,
+      "learning_rate": 4.374980348245072e-06,
+      "loss": 0.4446,
+      "step": 37900
+    },
+    {
+      "epoch": 1.9243441509817112,
+      "grad_norm": 0.02138956368703685,
+      "learning_rate": 4.345788103307047e-06,
+      "loss": 0.4589,
+      "step": 37905
+    },
+    {
+      "epoch": 1.9245979870797427,
+      "grad_norm": 0.02227297400993538,
+      "learning_rate": 4.316693152819018e-06,
+      "loss": 0.4654,
+      "step": 37910
+    },
+    {
+      "epoch": 1.9248518231777743,
+      "grad_norm": 0.022468692731550267,
+      "learning_rate": 4.287695502492139e-06,
+      "loss": 0.4457,
+      "step": 37915
+    },
+    {
+      "epoch": 1.9251056592758056,
+      "grad_norm": 0.02069126082899075,
+      "learning_rate": 4.25879515801858e-06,
+      "loss": 0.4567,
+      "step": 37920
+    },
+    {
+      "epoch": 1.925359495373837,
+      "grad_norm": 0.019826909262503828,
+      "learning_rate": 4.229992125071192e-06,
+      "loss": 0.4453,
+      "step": 37925
+    },
+    {
+      "epoch": 1.9256133314718686,
+      "grad_norm": 0.020987878267737438,
+      "learning_rate": 4.201286409304006e-06,
+      "loss": 0.4454,
+      "step": 37930
+    },
+    {
+      "epoch": 1.9258671675699,
+      "grad_norm": 0.020420855142907778,
+      "learning_rate": 4.172678016351683e-06,
+      "loss": 0.433,
+      "step": 37935
+    },
+    {
+      "epoch": 1.9261210036679315,
+      "grad_norm": 0.02284228322904161,
+      "learning_rate": 4.1441669518300086e-06,
+      "loss": 0.4631,
+      "step": 37940
+    },
+    {
+      "epoch": 1.926374839765963,
+      "grad_norm": 0.0222567772359531,
+      "learning_rate": 4.115753221335561e-06,
+      "loss": 0.4523,
+      "step": 37945
+    },
+    {
+      "epoch": 1.9266286758639946,
+      "grad_norm": 0.023189951687417994,
+      "learning_rate": 4.087436830445768e-06,
+      "loss": 0.4458,
+      "step": 37950
+    },
+    {
+      "epoch": 1.926882511962026,
+      "grad_norm": 0.021658902190008947,
+      "learning_rate": 4.059217784719016e-06,
+      "loss": 0.4459,
+      "step": 37955
+    },
+    {
+      "epoch": 1.9271363480600576,
+      "grad_norm": 0.024217930442230858,
+      "learning_rate": 4.0310960896945415e-06,
+      "loss": 0.4538,
+      "step": 37960
+    },
+    {
+      "epoch": 1.9273901841580892,
+      "grad_norm": 0.022206278190921094,
+      "learning_rate": 4.003071750892595e-06,
+      "loss": 0.4552,
+      "step": 37965
+    },
+    {
+      "epoch": 1.9276440202561207,
+      "grad_norm": 0.025828497485879003,
+      "learning_rate": 3.9751447738140545e-06,
+      "loss": 0.4645,
+      "step": 37970
+    },
+    {
+      "epoch": 1.9278978563541522,
+      "grad_norm": 0.019836230031236992,
+      "learning_rate": 3.9473151639409235e-06,
+      "loss": 0.4585,
+      "step": 37975
+    },
+    {
+      "epoch": 1.9281516924521838,
+      "grad_norm": 0.020662066999979745,
+      "learning_rate": 3.919582926735999e-06,
+      "loss": 0.454,
+      "step": 37980
+    },
+    {
+      "epoch": 1.928405528550215,
+      "grad_norm": 0.022723364615094117,
+      "learning_rate": 3.891948067643036e-06,
+      "loss": 0.4606,
+      "step": 37985
+    },
+    {
+      "epoch": 1.9286593646482466,
+      "grad_norm": 0.023952908845065374,
+      "learning_rate": 3.864410592086587e-06,
+      "loss": 0.4559,
+      "step": 37990
+    },
+    {
+      "epoch": 1.9289132007462781,
+      "grad_norm": 0.025670760204192115,
+      "learning_rate": 3.836970505472104e-06,
+      "loss": 0.4263,
+      "step": 37995
+    },
+    {
+      "epoch": 1.9291670368443097,
+      "grad_norm": 0.02416027669647824,
+      "learning_rate": 3.8096278131859452e-06,
+      "loss": 0.4594,
+      "step": 38000
+    },
+    {
+      "epoch": 1.929420872942341,
+      "grad_norm": 0.02400148828175322,
+      "learning_rate": 3.7823825205953177e-06,
+      "loss": 0.4519,
+      "step": 38005
+    },
+    {
+      "epoch": 1.9296747090403725,
+      "grad_norm": 0.020877060418883157,
+      "learning_rate": 3.755234633048388e-06,
+      "loss": 0.4399,
+      "step": 38010
+    },
+    {
+      "epoch": 1.929928545138404,
+      "grad_norm": 0.021079166677847018,
+      "learning_rate": 3.7281841558741147e-06,
+      "loss": 0.4526,
+      "step": 38015
+    },
+    {
+      "epoch": 1.9301823812364356,
+      "grad_norm": 0.02979341790679115,
+      "learning_rate": 3.7012310943824178e-06,
+      "loss": 0.4559,
+      "step": 38020
+    },
+    {
+      "epoch": 1.9304362173344671,
+      "grad_norm": 0.021472840172437824,
+      "learning_rate": 3.6743754538640093e-06,
+      "loss": 0.4546,
+      "step": 38025
+    },
+    {
+      "epoch": 1.9306900534324987,
+      "grad_norm": 0.024965054424825203,
+      "learning_rate": 3.6476172395905615e-06,
+      "loss": 0.4608,
+      "step": 38030
+    },
+    {
+      "epoch": 1.9309438895305302,
+      "grad_norm": 0.030534410696193816,
+      "learning_rate": 3.6209564568144837e-06,
+      "loss": 0.4537,
+      "step": 38035
+    },
+    {
+      "epoch": 1.9311977256285617,
+      "grad_norm": 0.020565392136713392,
+      "learning_rate": 3.5943931107692563e-06,
+      "loss": 0.4551,
+      "step": 38040
+    },
+    {
+      "epoch": 1.9314515617265933,
+      "grad_norm": 0.02990769064293206,
+      "learning_rate": 3.567927206669097e-06,
+      "loss": 0.4534,
+      "step": 38045
+    },
+    {
+      "epoch": 1.9317053978246248,
+      "grad_norm": 0.020744114829942508,
+      "learning_rate": 3.5415587497090727e-06,
+      "loss": 0.4606,
+      "step": 38050
+    },
+    {
+      "epoch": 1.9319592339226561,
+      "grad_norm": 0.02147231946067232,
+      "learning_rate": 3.515287745065321e-06,
+      "loss": 0.4542,
+      "step": 38055
+    },
+    {
+      "epoch": 1.9322130700206877,
+      "grad_norm": 0.02382770440065348,
+      "learning_rate": 3.4891141978945497e-06,
+      "loss": 0.4489,
+      "step": 38060
+    },
+    {
+      "epoch": 1.9324669061187192,
+      "grad_norm": 0.02168451146145817,
+      "learning_rate": 3.463038113334538e-06,
+      "loss": 0.4486,
+      "step": 38065
+    },
+    {
+      "epoch": 1.9327207422167505,
+      "grad_norm": 0.019029522910771698,
+      "learning_rate": 3.437059496503969e-06,
+      "loss": 0.4532,
+      "step": 38070
+    },
+    {
+      "epoch": 1.932974578314782,
+      "grad_norm": 0.02297240339975973,
+      "learning_rate": 3.4111783525022646e-06,
+      "loss": 0.4704,
+      "step": 38075
+    },
+    {
+      "epoch": 1.9332284144128136,
+      "grad_norm": 0.029039053764296118,
+      "learning_rate": 3.3853946864097486e-06,
+      "loss": 0.4711,
+      "step": 38080
+    },
+    {
+      "epoch": 1.933482250510845,
+      "grad_norm": 0.021336084269481418,
+      "learning_rate": 3.3597085032876505e-06,
+      "loss": 0.4505,
+      "step": 38085
+    },
+    {
+      "epoch": 1.9337360866088766,
+      "grad_norm": 0.026153460382982597,
+      "learning_rate": 3.3341198081780487e-06,
+      "loss": 0.4829,
+      "step": 38090
+    },
+    {
+      "epoch": 1.9339899227069082,
+      "grad_norm": 0.026443914139286612,
+      "learning_rate": 3.3086286061038697e-06,
+      "loss": 0.46,
+      "step": 38095
+    },
+    {
+      "epoch": 1.9342437588049397,
+      "grad_norm": 0.022842151943259267,
+      "learning_rate": 3.283234902068888e-06,
+      "loss": 0.4416,
+      "step": 38100
+    },
+    {
+      "epoch": 1.9344975949029712,
+      "grad_norm": 0.020825476467275915,
+      "learning_rate": 3.2579387010577277e-06,
+      "loss": 0.4656,
+      "step": 38105
+    },
+    {
+      "epoch": 1.9347514310010028,
+      "grad_norm": 0.021769918904403884,
+      "learning_rate": 3.2327400080359725e-06,
+      "loss": 0.4631,
+      "step": 38110
+    },
+    {
+      "epoch": 1.9350052670990343,
+      "grad_norm": 0.02082151349347851,
+      "learning_rate": 3.207638827949999e-06,
+      "loss": 0.4353,
+      "step": 38115
+    },
+    {
+      "epoch": 1.9352591031970656,
+      "grad_norm": 0.02513216757499985,
+      "learning_rate": 3.1826351657270323e-06,
+      "loss": 0.4638,
+      "step": 38120
+    },
+    {
+      "epoch": 1.9355129392950972,
+      "grad_norm": 0.022971752053904795,
+      "learning_rate": 3.1577290262750912e-06,
+      "loss": 0.455,
+      "step": 38125
+    },
+    {
+      "epoch": 1.9357667753931287,
+      "grad_norm": 0.023390667458557612,
+      "learning_rate": 3.1329204144832647e-06,
+      "loss": 0.4503,
+      "step": 38130
+    },
+    {
+      "epoch": 1.93602061149116,
+      "grad_norm": 0.027462875293432002,
+      "learning_rate": 3.108209335221268e-06,
+      "loss": 0.4698,
+      "step": 38135
+    },
+    {
+      "epoch": 1.9362744475891915,
+      "grad_norm": 0.022880416789241798,
+      "learning_rate": 3.0835957933397774e-06,
+      "loss": 0.4613,
+      "step": 38140
+    },
+    {
+      "epoch": 1.936528283687223,
+      "grad_norm": 0.021014225306591475,
+      "learning_rate": 3.0590797936703164e-06,
+      "loss": 0.5413,
+      "step": 38145
+    },
+    {
+      "epoch": 1.9367821197852546,
+      "grad_norm": 0.022815293753873547,
+      "learning_rate": 3.034661341025258e-06,
+      "loss": 0.4312,
+      "step": 38150
+    },
+    {
+      "epoch": 1.9370359558832861,
+      "grad_norm": 0.021813853966513634,
+      "learning_rate": 3.010340440197823e-06,
+      "loss": 0.455,
+      "step": 38155
+    },
+    {
+      "epoch": 1.9372897919813177,
+      "grad_norm": 0.027654537630323616,
+      "learning_rate": 2.986117095962082e-06,
+      "loss": 0.4457,
+      "step": 38160
+    },
+    {
+      "epoch": 1.9375436280793492,
+      "grad_norm": 0.020417251873714362,
+      "learning_rate": 2.961991313072898e-06,
+      "loss": 0.4311,
+      "step": 38165
+    },
+    {
+      "epoch": 1.9377974641773807,
+      "grad_norm": 0.02126948120782193,
+      "learning_rate": 2.9379630962661496e-06,
+      "loss": 0.4697,
+      "step": 38170
+    },
+    {
+      "epoch": 1.9380513002754123,
+      "grad_norm": 0.020504217551342806,
+      "learning_rate": 2.914032450258397e-06,
+      "loss": 0.4503,
+      "step": 38175
+    },
+    {
+      "epoch": 1.9383051363734438,
+      "grad_norm": 0.021169339004345845,
+      "learning_rate": 2.890199379747105e-06,
+      "loss": 0.4436,
+      "step": 38180
+    },
+    {
+      "epoch": 1.9385589724714751,
+      "grad_norm": 0.027548188156958732,
+      "learning_rate": 2.8664638894105867e-06,
+      "loss": 0.4585,
+      "step": 38185
+    },
+    {
+      "epoch": 1.9388128085695067,
+      "grad_norm": 0.02785876405422749,
+      "learning_rate": 2.8428259839079486e-06,
+      "loss": 0.4681,
+      "step": 38190
+    },
+    {
+      "epoch": 1.9390666446675382,
+      "grad_norm": 0.025887282870096888,
+      "learning_rate": 2.819285667879312e-06,
+      "loss": 0.4748,
+      "step": 38195
+    },
+    {
+      "epoch": 1.9393204807655695,
+      "grad_norm": 0.021934926759944635,
+      "learning_rate": 2.7958429459454817e-06,
+      "loss": 0.4409,
+      "step": 38200
+    },
+    {
+      "epoch": 1.939574316863601,
+      "grad_norm": 0.02102254390453021,
+      "learning_rate": 2.7724978227081086e-06,
+      "loss": 0.443,
+      "step": 38205
+    },
+    {
+      "epoch": 1.9398281529616326,
+      "grad_norm": 0.025739303931824997,
+      "learning_rate": 2.7492503027496953e-06,
+      "loss": 0.4346,
+      "step": 38210
+    },
+    {
+      "epoch": 1.9400819890596641,
+      "grad_norm": 0.020415985192957675,
+      "learning_rate": 2.726100390633757e-06,
+      "loss": 0.4278,
+      "step": 38215
+    },
+    {
+      "epoch": 1.9403358251576956,
+      "grad_norm": 0.027445653367167453,
+      "learning_rate": 2.7030480909043254e-06,
+      "loss": 0.4677,
+      "step": 38220
+    },
+    {
+      "epoch": 1.9405896612557272,
+      "grad_norm": 0.022194649303696412,
+      "learning_rate": 2.680093408086559e-06,
+      "loss": 0.4588,
+      "step": 38225
+    },
+    {
+      "epoch": 1.9408434973537587,
+      "grad_norm": 0.03334788531458536,
+      "learning_rate": 2.6572363466863534e-06,
+      "loss": 0.48,
+      "step": 38230
+    },
+    {
+      "epoch": 1.9410973334517903,
+      "grad_norm": 0.02165643766225538,
+      "learning_rate": 2.6344769111903975e-06,
+      "loss": 0.4869,
+      "step": 38235
+    },
+    {
+      "epoch": 1.9413511695498218,
+      "grad_norm": 0.02482685913614366,
+      "learning_rate": 2.6118151060662842e-06,
+      "loss": 0.4267,
+      "step": 38240
+    },
+    {
+      "epoch": 1.9416050056478533,
+      "grad_norm": 0.02207024691983292,
+      "learning_rate": 2.589250935762344e-06,
+      "loss": 0.4325,
+      "step": 38245
+    },
+    {
+      "epoch": 1.9418588417458846,
+      "grad_norm": 0.021736842944571975,
+      "learning_rate": 2.566784404707867e-06,
+      "loss": 0.444,
+      "step": 38250
+    },
+    {
+      "epoch": 1.9421126778439162,
+      "grad_norm": 0.033892642112711965,
+      "learning_rate": 2.5444155173129368e-06,
+      "loss": 0.4529,
+      "step": 38255
+    },
+    {
+      "epoch": 1.9423665139419477,
+      "grad_norm": 0.021770668922768112,
+      "learning_rate": 2.52214427796843e-06,
+      "loss": 0.4422,
+      "step": 38260
+    },
+    {
+      "epoch": 1.9426203500399792,
+      "grad_norm": 0.02338794948994561,
+      "learning_rate": 2.499970691046127e-06,
+      "loss": 0.4577,
+      "step": 38265
+    },
+    {
+      "epoch": 1.9428741861380106,
+      "grad_norm": 0.0211563138278504,
+      "learning_rate": 2.4778947608984915e-06,
+      "loss": 0.4691,
+      "step": 38270
+    },
+    {
+      "epoch": 1.943128022236042,
+      "grad_norm": 0.021156732461866418,
+      "learning_rate": 2.4559164918590005e-06,
+      "loss": 0.4505,
+      "step": 38275
+    },
+    {
+      "epoch": 1.9433818583340736,
+      "grad_norm": 0.029251366916451853,
+      "learning_rate": 2.4340358882418144e-06,
+      "loss": 0.436,
+      "step": 38280
+    },
+    {
+      "epoch": 1.9436356944321052,
+      "grad_norm": 0.020800337122009502,
+      "learning_rate": 2.412252954342109e-06,
+      "loss": 0.4635,
+      "step": 38285
+    },
+    {
+      "epoch": 1.9438895305301367,
+      "grad_norm": 0.020582842457094158,
+      "learning_rate": 2.3905676944356303e-06,
+      "loss": 0.4379,
+      "step": 38290
+    },
+    {
+      "epoch": 1.9441433666281682,
+      "grad_norm": 0.021231715330240406,
+      "learning_rate": 2.36898011277914e-06,
+      "loss": 0.4536,
+      "step": 38295
+    },
+    {
+      "epoch": 1.9443972027261998,
+      "grad_norm": 0.019707268959821594,
+      "learning_rate": 2.3474902136101927e-06,
+      "loss": 0.4483,
+      "step": 38300
+    },
+    {
+      "epoch": 1.9446510388242313,
+      "grad_norm": 0.021565038396861153,
+      "learning_rate": 2.3260980011470258e-06,
+      "loss": 0.4508,
+      "step": 38305
+    },
+    {
+      "epoch": 1.9449048749222628,
+      "grad_norm": 0.022307843520456463,
+      "learning_rate": 2.304803479589057e-06,
+      "loss": 0.4469,
+      "step": 38310
+    },
+    {
+      "epoch": 1.9451587110202944,
+      "grad_norm": 0.021662290602322585,
+      "learning_rate": 2.2836066531161104e-06,
+      "loss": 0.4584,
+      "step": 38315
+    },
+    {
+      "epoch": 1.9454125471183257,
+      "grad_norm": 0.022824581912878204,
+      "learning_rate": 2.2625075258890793e-06,
+      "loss": 0.4524,
+      "step": 38320
+    },
+    {
+      "epoch": 1.9456663832163572,
+      "grad_norm": 0.02383320091907116,
+      "learning_rate": 2.2415061020495954e-06,
+      "loss": 0.4545,
+      "step": 38325
+    },
+    {
+      "epoch": 1.9459202193143887,
+      "grad_norm": 0.02259973050640163,
+      "learning_rate": 2.2206023857201386e-06,
+      "loss": 0.4395,
+      "step": 38330
+    },
+    {
+      "epoch": 1.94617405541242,
+      "grad_norm": 0.02451602070891827,
+      "learning_rate": 2.199796381004038e-06,
+      "loss": 0.4463,
+      "step": 38335
+    },
+    {
+      "epoch": 1.9464278915104516,
+      "grad_norm": 0.02350649241970694,
+      "learning_rate": 2.1790880919853595e-06,
+      "loss": 0.4571,
+      "step": 38340
+    },
+    {
+      "epoch": 1.9466817276084831,
+      "grad_norm": 0.024863563292605854,
+      "learning_rate": 2.1584775227290745e-06,
+      "loss": 0.4537,
+      "step": 38345
+    },
+    {
+      "epoch": 1.9469355637065147,
+      "grad_norm": 0.026608746542194056,
+      "learning_rate": 2.1379646772808903e-06,
+      "loss": 0.4525,
+      "step": 38350
+    },
+    {
+      "epoch": 1.9471893998045462,
+      "grad_norm": 0.021420118326157717,
+      "learning_rate": 2.11754955966742e-06,
+      "loss": 0.4788,
+      "step": 38355
+    },
+    {
+      "epoch": 1.9474432359025777,
+      "grad_norm": 0.021686725777661547,
+      "learning_rate": 2.0972321738960687e-06,
+      "loss": 0.4555,
+      "step": 38360
+    },
+    {
+      "epoch": 1.9476970720006093,
+      "grad_norm": 0.020239535964856954,
+      "learning_rate": 2.0770125239549797e-06,
+      "loss": 0.4275,
+      "step": 38365
+    },
+    {
+      "epoch": 1.9479509080986408,
+      "grad_norm": 0.026279491549929514,
+      "learning_rate": 2.0568906138132002e-06,
+      "loss": 0.4728,
+      "step": 38370
+    },
+    {
+      "epoch": 1.9482047441966723,
+      "grad_norm": 0.03912414768353642,
+      "learning_rate": 2.0368664474205157e-06,
+      "loss": 0.4767,
+      "step": 38375
+    },
+    {
+      "epoch": 1.9484585802947039,
+      "grad_norm": 0.02377861368004799,
+      "learning_rate": 2.01694002870767e-06,
+      "loss": 0.4395,
+      "step": 38380
+    },
+    {
+      "epoch": 1.9487124163927352,
+      "grad_norm": 0.022829057653313226,
+      "learning_rate": 1.997111361586035e-06,
+      "loss": 0.4665,
+      "step": 38385
+    },
+    {
+      "epoch": 1.9489662524907667,
+      "grad_norm": 0.02234878999035154,
+      "learning_rate": 1.9773804499478854e-06,
+      "loss": 0.4479,
+      "step": 38390
+    },
+    {
+      "epoch": 1.9492200885887982,
+      "grad_norm": 0.020446858573344908,
+      "learning_rate": 1.957747297666346e-06,
+      "loss": 0.4451,
+      "step": 38395
+    },
+    {
+      "epoch": 1.9494739246868296,
+      "grad_norm": 0.022150719815722406,
+      "learning_rate": 1.9382119085952777e-06,
+      "loss": 0.467,
+      "step": 38400
+    },
+    {
+      "epoch": 1.949727760784861,
+      "grad_norm": 0.02395305631634785,
+      "learning_rate": 1.9187742865693915e-06,
+      "loss": 0.4446,
+      "step": 38405
+    },
+    {
+      "epoch": 1.9499815968828926,
+      "grad_norm": 0.025226111441204044,
+      "learning_rate": 1.899434435404135e-06,
+      "loss": 0.4212,
+      "step": 38410
+    },
+    {
+      "epoch": 1.9502354329809242,
+      "grad_norm": 0.026080819122294808,
+      "learning_rate": 1.8801923588959157e-06,
+      "loss": 0.43,
+      "step": 38415
+    },
+    {
+      "epoch": 1.9504892690789557,
+      "grad_norm": 0.027147258519150435,
+      "learning_rate": 1.8610480608218239e-06,
+      "loss": 0.4371,
+      "step": 38420
+    },
+    {
+      "epoch": 1.9507431051769872,
+      "grad_norm": 0.02461769872993647,
+      "learning_rate": 1.842001544939742e-06,
+      "loss": 0.47,
+      "step": 38425
+    },
+    {
+      "epoch": 1.9509969412750188,
+      "grad_norm": 0.020951902456712867,
+      "learning_rate": 1.8230528149884573e-06,
+      "loss": 0.4701,
+      "step": 38430
+    },
+    {
+      "epoch": 1.9512507773730503,
+      "grad_norm": 0.026972795277747482,
+      "learning_rate": 1.80420187468755e-06,
+      "loss": 0.4404,
+      "step": 38435
+    },
+    {
+      "epoch": 1.9515046134710818,
+      "grad_norm": 0.021207557562487,
+      "learning_rate": 1.7854487277372822e-06,
+      "loss": 0.4366,
+      "step": 38440
+    },
+    {
+      "epoch": 1.9517584495691134,
+      "grad_norm": 0.02498355979793343,
+      "learning_rate": 1.7667933778188206e-06,
+      "loss": 0.4516,
+      "step": 38445
+    },
+    {
+      "epoch": 1.9520122856671447,
+      "grad_norm": 0.02192218125038996,
+      "learning_rate": 1.7482358285941803e-06,
+      "loss": 0.4627,
+      "step": 38450
+    },
+    {
+      "epoch": 1.9522661217651762,
+      "grad_norm": 0.029378815413494033,
+      "learning_rate": 1.729776083706003e-06,
+      "loss": 0.4461,
+      "step": 38455
+    },
+    {
+      "epoch": 1.9525199578632078,
+      "grad_norm": 0.024581468794183266,
+      "learning_rate": 1.7114141467779454e-06,
+      "loss": 0.454,
+      "step": 38460
+    },
+    {
+      "epoch": 1.952773793961239,
+      "grad_norm": 0.025894600364455185,
+      "learning_rate": 1.693150021414347e-06,
+      "loss": 0.4724,
+      "step": 38465
+    },
+    {
+      "epoch": 1.9530276300592706,
+      "grad_norm": 0.02278420673605267,
+      "learning_rate": 1.6749837112003398e-06,
+      "loss": 0.4594,
+      "step": 38470
+    },
+    {
+      "epoch": 1.9532814661573021,
+      "grad_norm": 0.02509311215738138,
+      "learning_rate": 1.656915219701849e-06,
+      "loss": 0.4309,
+      "step": 38475
+    },
+    {
+      "epoch": 1.9535353022553337,
+      "grad_norm": 0.02132393990165603,
+      "learning_rate": 1.6389445504657041e-06,
+      "loss": 0.4423,
+      "step": 38480
+    },
+    {
+      "epoch": 1.9537891383533652,
+      "grad_norm": 0.02171837487424093,
+      "learning_rate": 1.621071707019417e-06,
+      "loss": 0.4358,
+      "step": 38485
+    },
+    {
+      "epoch": 1.9540429744513967,
+      "grad_norm": 0.024981217751365643,
+      "learning_rate": 1.6032966928713477e-06,
+      "loss": 0.4675,
+      "step": 38490
+    },
+    {
+      "epoch": 1.9542968105494283,
+      "grad_norm": 0.020893534776536122,
+      "learning_rate": 1.5856195115105943e-06,
+      "loss": 0.4433,
+      "step": 38495
+    },
+    {
+      "epoch": 1.9545506466474598,
+      "grad_norm": 0.02068830461015465,
+      "learning_rate": 1.5680401664072141e-06,
+      "loss": 0.4629,
+      "step": 38500
+    },
+    {
+      "epoch": 1.9548044827454913,
+      "grad_norm": 0.020953616835626587,
+      "learning_rate": 1.5505586610118361e-06,
+      "loss": 0.4689,
+      "step": 38505
+    },
+    {
+      "epoch": 1.9550583188435229,
+      "grad_norm": 0.02440341051189393,
+      "learning_rate": 1.5331749987560484e-06,
+      "loss": 0.461,
+      "step": 38510
+    },
+    {
+      "epoch": 1.9553121549415542,
+      "grad_norm": 0.029598548396459087,
+      "learning_rate": 1.5158891830521215e-06,
+      "loss": 0.431,
+      "step": 38515
+    },
+    {
+      "epoch": 1.9555659910395857,
+      "grad_norm": 0.02717560220840279,
+      "learning_rate": 1.4987012172932301e-06,
+      "loss": 0.4535,
+      "step": 38520
+    },
+    {
+      "epoch": 1.9558198271376173,
+      "grad_norm": 0.02370839493798561,
+      "learning_rate": 1.481611104853231e-06,
+      "loss": 0.4735,
+      "step": 38525
+    },
+    {
+      "epoch": 1.9560736632356488,
+      "grad_norm": 0.022848184848146453,
+      "learning_rate": 1.4646188490869405e-06,
+      "loss": 0.4754,
+      "step": 38530
+    },
+    {
+      "epoch": 1.95632749933368,
+      "grad_norm": 0.022225861131520797,
+      "learning_rate": 1.4477244533297463e-06,
+      "loss": 0.4466,
+      "step": 38535
+    },
+    {
+      "epoch": 1.9565813354317116,
+      "grad_norm": 0.022152077073412883,
+      "learning_rate": 1.4309279208979398e-06,
+      "loss": 0.4698,
+      "step": 38540
+    },
+    {
+      "epoch": 1.9568351715297432,
+      "grad_norm": 0.02524987360341103,
+      "learning_rate": 1.414229255088606e-06,
+      "loss": 0.4294,
+      "step": 38545
+    },
+    {
+      "epoch": 1.9570890076277747,
+      "grad_norm": 0.023994085753204972,
+      "learning_rate": 1.3976284591796783e-06,
+      "loss": 0.4485,
+      "step": 38550
+    },
+    {
+      "epoch": 1.9573428437258062,
+      "grad_norm": 0.021722894693704954,
+      "learning_rate": 1.381125536429717e-06,
+      "loss": 0.4456,
+      "step": 38555
+    },
+    {
+      "epoch": 1.9575966798238378,
+      "grad_norm": 0.024344062652594294,
+      "learning_rate": 1.3647204900782417e-06,
+      "loss": 0.4338,
+      "step": 38560
+    },
+    {
+      "epoch": 1.9578505159218693,
+      "grad_norm": 0.021929490605915293,
+      "learning_rate": 1.3484133233454544e-06,
+      "loss": 0.4643,
+      "step": 38565
+    },
+    {
+      "epoch": 1.9581043520199009,
+      "grad_norm": 0.02177586782728187,
+      "learning_rate": 1.3322040394323498e-06,
+      "loss": 0.4649,
+      "step": 38570
+    },
+    {
+      "epoch": 1.9583581881179324,
+      "grad_norm": 0.020581067246549605,
+      "learning_rate": 1.3160926415207163e-06,
+      "loss": 0.4485,
+      "step": 38575
+    },
+    {
+      "epoch": 1.9586120242159637,
+      "grad_norm": 0.0215530463340169,
+      "learning_rate": 1.300079132773191e-06,
+      "loss": 0.4468,
+      "step": 38580
+    },
+    {
+      "epoch": 1.9588658603139952,
+      "grad_norm": 0.024185145615631558,
+      "learning_rate": 1.2841635163330922e-06,
+      "loss": 0.4886,
+      "step": 38585
+    },
+    {
+      "epoch": 1.9591196964120268,
+      "grad_norm": 0.02437935153018092,
+      "learning_rate": 1.268345795324588e-06,
+      "loss": 0.469,
+      "step": 38590
+    },
+    {
+      "epoch": 1.9593735325100583,
+      "grad_norm": 0.023546706097560363,
+      "learning_rate": 1.252625972852639e-06,
+      "loss": 0.4888,
+      "step": 38595
+    },
+    {
+      "epoch": 1.9596273686080896,
+      "grad_norm": 0.021139727672287015,
+      "learning_rate": 1.237004052002999e-06,
+      "loss": 0.4337,
+      "step": 38600
+    },
+    {
+      "epoch": 1.9598812047061211,
+      "grad_norm": 0.03423369843513143,
+      "learning_rate": 1.221480035842104e-06,
+      "loss": 0.4259,
+      "step": 38605
+    },
+    {
+      "epoch": 1.9601350408041527,
+      "grad_norm": 0.021383194616096283,
+      "learning_rate": 1.2060539274172944e-06,
+      "loss": 0.4357,
+      "step": 38610
+    },
+    {
+      "epoch": 1.9603888769021842,
+      "grad_norm": 0.020286977515740528,
+      "learning_rate": 1.1907257297566477e-06,
+      "loss": 0.4446,
+      "step": 38615
+    },
+    {
+      "epoch": 1.9606427130002158,
+      "grad_norm": 0.02403465823618508,
+      "learning_rate": 1.1754954458689238e-06,
+      "loss": 0.4564,
+      "step": 38620
+    },
+    {
+      "epoch": 1.9608965490982473,
+      "grad_norm": 0.02086613709755337,
+      "learning_rate": 1.1603630787438424e-06,
+      "loss": 0.4625,
+      "step": 38625
+    },
+    {
+      "epoch": 1.9611503851962788,
+      "grad_norm": 0.02145705596640427,
+      "learning_rate": 1.1453286313517498e-06,
+      "loss": 0.4622,
+      "step": 38630
+    },
+    {
+      "epoch": 1.9614042212943104,
+      "grad_norm": 0.0198366436003919,
+      "learning_rate": 1.130392106643896e-06,
+      "loss": 0.4296,
+      "step": 38635
+    },
+    {
+      "epoch": 1.961658057392342,
+      "grad_norm": 0.021595570641432995,
+      "learning_rate": 1.1155535075522138e-06,
+      "loss": 0.4395,
+      "step": 38640
+    },
+    {
+      "epoch": 1.9619118934903734,
+      "grad_norm": 0.027632249043748877,
+      "learning_rate": 1.1008128369894288e-06,
+      "loss": 0.4571,
+      "step": 38645
+    },
+    {
+      "epoch": 1.9621657295884047,
+      "grad_norm": 0.0211592206862124,
+      "learning_rate": 1.0861700978490596e-06,
+      "loss": 0.4516,
+      "step": 38650
+    },
+    {
+      "epoch": 1.9624195656864363,
+      "grad_norm": 0.020641952024909045,
+      "learning_rate": 1.0716252930054737e-06,
+      "loss": 0.4525,
+      "step": 38655
+    },
+    {
+      "epoch": 1.9626734017844678,
+      "grad_norm": 0.02239625096519365,
+      "learning_rate": 1.0571784253136652e-06,
+      "loss": 0.4538,
+      "step": 38660
+    },
+    {
+      "epoch": 1.9629272378824991,
+      "grad_norm": 0.020935572931792722,
+      "learning_rate": 1.0428294976094766e-06,
+      "loss": 0.4726,
+      "step": 38665
+    },
+    {
+      "epoch": 1.9631810739805307,
+      "grad_norm": 0.023797580850831307,
+      "learning_rate": 1.0285785127095993e-06,
+      "loss": 0.4591,
+      "step": 38670
+    },
+    {
+      "epoch": 1.9634349100785622,
+      "grad_norm": 0.021663436246309797,
+      "learning_rate": 1.0144254734113511e-06,
+      "loss": 0.439,
+      "step": 38675
+    },
+    {
+      "epoch": 1.9636887461765937,
+      "grad_norm": 0.02399748053716495,
+      "learning_rate": 1.00037038249301e-06,
+      "loss": 0.467,
+      "step": 38680
+    },
+    {
+      "epoch": 1.9639425822746253,
+      "grad_norm": 0.02209544242205649,
+      "learning_rate": 9.864132427134243e-07,
+      "loss": 0.4573,
+      "step": 38685
+    },
+    {
+      "epoch": 1.9641964183726568,
+      "grad_norm": 0.02180806139548943,
+      "learning_rate": 9.725540568122915e-07,
+      "loss": 0.4393,
+      "step": 38690
+    },
+    {
+      "epoch": 1.9644502544706883,
+      "grad_norm": 0.025112167834570206,
+      "learning_rate": 9.587928275102132e-07,
+      "loss": 0.4367,
+      "step": 38695
+    },
+    {
+      "epoch": 1.9647040905687199,
+      "grad_norm": 0.019834862643478415,
+      "learning_rate": 9.451295575083618e-07,
+      "loss": 0.4682,
+      "step": 38700
+    },
+    {
+      "epoch": 1.9649579266667514,
+      "grad_norm": 0.019828424235274936,
+      "learning_rate": 9.315642494888144e-07,
+      "loss": 0.424,
+      "step": 38705
+    },
+    {
+      "epoch": 1.965211762764783,
+      "grad_norm": 0.023011146520013428,
+      "learning_rate": 9.180969061143851e-07,
+      "loss": 0.4488,
+      "step": 38710
+    },
+    {
+      "epoch": 1.9654655988628142,
+      "grad_norm": 0.021083449030767323,
+      "learning_rate": 9.047275300285706e-07,
+      "loss": 0.4656,
+      "step": 38715
+    },
+    {
+      "epoch": 1.9657194349608458,
+      "grad_norm": 0.028229164879771167,
+      "learning_rate": 8.914561238557717e-07,
+      "loss": 0.4336,
+      "step": 38720
+    },
+    {
+      "epoch": 1.9659732710588773,
+      "grad_norm": 0.023561467883931243,
+      "learning_rate": 8.78282690201071e-07,
+      "loss": 0.4416,
+      "step": 38725
+    },
+    {
+      "epoch": 1.9662271071569086,
+      "grad_norm": 0.0231890769370734,
+      "learning_rate": 8.652072316503446e-07,
+      "loss": 0.4391,
+      "step": 38730
+    },
+    {
+      "epoch": 1.9664809432549402,
+      "grad_norm": 0.023528700345852093,
+      "learning_rate": 8.52229750770317e-07,
+      "loss": 0.4776,
+      "step": 38735
+    },
+    {
+      "epoch": 1.9667347793529717,
+      "grad_norm": 0.022550152791175105,
+      "learning_rate": 8.39350250108284e-07,
+      "loss": 0.4482,
+      "step": 38740
+    },
+    {
+      "epoch": 1.9669886154510032,
+      "grad_norm": 0.018641810249880507,
+      "learning_rate": 8.265687321925009e-07,
+      "loss": 0.4285,
+      "step": 38745
+    },
+    {
+      "epoch": 1.9672424515490348,
+      "grad_norm": 0.02746049418696759,
+      "learning_rate": 8.138851995319608e-07,
+      "loss": 0.4298,
+      "step": 38750
+    },
+    {
+      "epoch": 1.9674962876470663,
+      "grad_norm": 0.023113644657525075,
+      "learning_rate": 8.012996546162277e-07,
+      "loss": 0.4573,
+      "step": 38755
+    },
+    {
+      "epoch": 1.9677501237450978,
+      "grad_norm": 0.02536365354716488,
+      "learning_rate": 7.888120999159365e-07,
+      "loss": 0.4669,
+      "step": 38760
+    },
+    {
+      "epoch": 1.9680039598431294,
+      "grad_norm": 0.031051578737683758,
+      "learning_rate": 7.764225378822377e-07,
+      "loss": 0.4395,
+      "step": 38765
+    },
+    {
+      "epoch": 1.968257795941161,
+      "grad_norm": 0.022412814259141482,
+      "learning_rate": 7.641309709471855e-07,
+      "loss": 0.4616,
+      "step": 38770
+    },
+    {
+      "epoch": 1.9685116320391924,
+      "grad_norm": 0.029004560100377053,
+      "learning_rate": 7.51937401523517e-07,
+      "loss": 0.4527,
+      "step": 38775
+    },
+    {
+      "epoch": 1.9687654681372238,
+      "grad_norm": 0.03144197827525452,
+      "learning_rate": 7.398418320048173e-07,
+      "loss": 0.4587,
+      "step": 38780
+    },
+    {
+      "epoch": 1.9690193042352553,
+      "grad_norm": 0.020675890106864256,
+      "learning_rate": 7.278442647653538e-07,
+      "loss": 0.4294,
+      "step": 38785
+    },
+    {
+      "epoch": 1.9692731403332868,
+      "grad_norm": 0.022404177027982874,
+      "learning_rate": 7.159447021601872e-07,
+      "loss": 0.4414,
+      "step": 38790
+    },
+    {
+      "epoch": 1.9695269764313181,
+      "grad_norm": 0.025733305949821667,
+      "learning_rate": 7.041431465251713e-07,
+      "loss": 0.4477,
+      "step": 38795
+    },
+    {
+      "epoch": 1.9697808125293497,
+      "grad_norm": 0.027375264236320213,
+      "learning_rate": 6.924396001768418e-07,
+      "loss": 0.4665,
+      "step": 38800
+    },
+    {
+      "epoch": 1.9700346486273812,
+      "grad_norm": 0.025948495378095977,
+      "learning_rate": 6.808340654125833e-07,
+      "loss": 0.4424,
+      "step": 38805
+    },
+    {
+      "epoch": 1.9702884847254127,
+      "grad_norm": 0.023666766635241906,
+      "learning_rate": 6.693265445105179e-07,
+      "loss": 0.4416,
+      "step": 38810
+    },
+    {
+      "epoch": 1.9705423208234443,
+      "grad_norm": 0.02132357302519558,
+      "learning_rate": 6.579170397294498e-07,
+      "loss": 0.4253,
+      "step": 38815
+    },
+    {
+      "epoch": 1.9707961569214758,
+      "grad_norm": 0.030128825836747055,
+      "learning_rate": 6.466055533090875e-07,
+      "loss": 0.4195,
+      "step": 38820
+    },
+    {
+      "epoch": 1.9710499930195073,
+      "grad_norm": 0.02602764759746211,
+      "learning_rate": 6.35392087469766e-07,
+      "loss": 0.442,
+      "step": 38825
+    },
+    {
+      "epoch": 1.9713038291175389,
+      "grad_norm": 0.021881181041248757,
+      "learning_rate": 6.24276644412669e-07,
+      "loss": 0.473,
+      "step": 38830
+    },
+    {
+      "epoch": 1.9715576652155704,
+      "grad_norm": 0.02185523352546431,
+      "learning_rate": 6.132592263196623e-07,
+      "loss": 0.43,
+      "step": 38835
+    },
+    {
+      "epoch": 1.971811501313602,
+      "grad_norm": 0.021680403099536016,
+      "learning_rate": 6.023398353534604e-07,
+      "loss": 0.4553,
+      "step": 38840
+    },
+    {
+      "epoch": 1.9720653374116333,
+      "grad_norm": 0.021974257862993504,
+      "learning_rate": 5.915184736574597e-07,
+      "loss": 0.4562,
+      "step": 38845
+    },
+    {
+      "epoch": 1.9723191735096648,
+      "grad_norm": 0.02096949718491153,
+      "learning_rate": 5.807951433557946e-07,
+      "loss": 0.4408,
+      "step": 38850
+    },
+    {
+      "epoch": 1.9725730096076963,
+      "grad_norm": 0.025939230078918414,
+      "learning_rate": 5.701698465534477e-07,
+      "loss": 0.456,
+      "step": 38855
+    },
+    {
+      "epoch": 1.9728268457057279,
+      "grad_norm": 0.02828624799294119,
+      "learning_rate": 5.596425853361397e-07,
+      "loss": 0.4327,
+      "step": 38860
+    },
+    {
+      "epoch": 1.9730806818037592,
+      "grad_norm": 0.022004046110511963,
+      "learning_rate": 5.492133617702733e-07,
+      "loss": 0.4511,
+      "step": 38865
+    },
+    {
+      "epoch": 1.9733345179017907,
+      "grad_norm": 0.024590538540810992,
+      "learning_rate": 5.388821779030994e-07,
+      "loss": 0.4676,
+      "step": 38870
+    },
+    {
+      "epoch": 1.9735883539998222,
+      "grad_norm": 0.021191903724915806,
+      "learning_rate": 5.286490357624962e-07,
+      "loss": 0.4506,
+      "step": 38875
+    },
+    {
+      "epoch": 1.9738421900978538,
+      "grad_norm": 0.023563631620566922,
+      "learning_rate": 5.185139373572456e-07,
+      "loss": 0.4451,
+      "step": 38880
+    },
+    {
+      "epoch": 1.9740960261958853,
+      "grad_norm": 0.023529886583346264,
+      "learning_rate": 5.084768846768117e-07,
+      "loss": 0.4457,
+      "step": 38885
+    },
+    {
+      "epoch": 1.9743498622939168,
+      "grad_norm": 0.023884990638485995,
+      "learning_rate": 4.985378796913964e-07,
+      "loss": 0.4755,
+      "step": 38890
+    },
+    {
+      "epoch": 1.9746036983919484,
+      "grad_norm": 0.02209835269662074,
+      "learning_rate": 4.886969243519391e-07,
+      "loss": 0.4252,
+      "step": 38895
+    },
+    {
+      "epoch": 1.97485753448998,
+      "grad_norm": 0.019159617450172008,
+      "learning_rate": 4.789540205902831e-07,
+      "loss": 0.4405,
+      "step": 38900
+    },
+    {
+      "epoch": 1.9751113705880115,
+      "grad_norm": 0.02540563968338361,
+      "learning_rate": 4.6930917031878796e-07,
+      "loss": 0.4234,
+      "step": 38905
+    },
+    {
+      "epoch": 1.975365206686043,
+      "grad_norm": 0.025584914989898507,
+      "learning_rate": 4.597623754307723e-07,
+      "loss": 0.4422,
+      "step": 38910
+    },
+    {
+      "epoch": 1.9756190427840743,
+      "grad_norm": 0.02296434045482897,
+      "learning_rate": 4.5031363780023705e-07,
+      "loss": 0.4342,
+      "step": 38915
+    },
+    {
+      "epoch": 1.9758728788821058,
+      "grad_norm": 0.023745961805210634,
+      "learning_rate": 4.4096295928186534e-07,
+      "loss": 0.4681,
+      "step": 38920
+    },
+    {
+      "epoch": 1.9761267149801374,
+      "grad_norm": 0.023731819927084494,
+      "learning_rate": 4.3171034171113346e-07,
+      "loss": 0.4512,
+      "step": 38925
+    },
+    {
+      "epoch": 1.9763805510781687,
+      "grad_norm": 0.022054760692146153,
+      "learning_rate": 4.225557869043661e-07,
+      "loss": 0.4677,
+      "step": 38930
+    },
+    {
+      "epoch": 1.9766343871762002,
+      "grad_norm": 0.023331327072565897,
+      "learning_rate": 4.134992966584594e-07,
+      "loss": 0.4595,
+      "step": 38935
+    },
+    {
+      "epoch": 1.9768882232742317,
+      "grad_norm": 0.022221322234984905,
+      "learning_rate": 4.0454087275121344e-07,
+      "loss": 0.4274,
+      "step": 38940
+    },
+    {
+      "epoch": 1.9771420593722633,
+      "grad_norm": 0.028686436657490943,
+      "learning_rate": 3.956805169411659e-07,
+      "loss": 0.4756,
+      "step": 38945
+    },
+    {
+      "epoch": 1.9773958954702948,
+      "grad_norm": 0.021950488480769904,
+      "learning_rate": 3.8691823096748126e-07,
+      "loss": 0.4263,
+      "step": 38950
+    },
+    {
+      "epoch": 1.9776497315683264,
+      "grad_norm": 0.021704365417338094,
+      "learning_rate": 3.7825401655017246e-07,
+      "loss": 0.4912,
+      "step": 38955
+    },
+    {
+      "epoch": 1.9779035676663579,
+      "grad_norm": 0.029412474606072804,
+      "learning_rate": 3.6968787538999016e-07,
+      "loss": 0.446,
+      "step": 38960
+    },
+    {
+      "epoch": 1.9781574037643894,
+      "grad_norm": 0.021330314142649562,
+      "learning_rate": 3.6121980916842265e-07,
+      "loss": 0.4515,
+      "step": 38965
+    },
+    {
+      "epoch": 1.978411239862421,
+      "grad_norm": 0.02002666201390011,
+      "learning_rate": 3.528498195476959e-07,
+      "loss": 0.4289,
+      "step": 38970
+    },
+    {
+      "epoch": 1.9786650759604525,
+      "grad_norm": 0.022747671492206325,
+      "learning_rate": 3.445779081708844e-07,
+      "loss": 0.4598,
+      "step": 38975
+    },
+    {
+      "epoch": 1.9789189120584838,
+      "grad_norm": 0.023336518609922925,
+      "learning_rate": 3.3640407666157835e-07,
+      "loss": 0.4739,
+      "step": 38980
+    },
+    {
+      "epoch": 1.9791727481565153,
+      "grad_norm": 0.03376020799973892,
+      "learning_rate": 3.283283266243831e-07,
+      "loss": 0.4483,
+      "step": 38985
+    },
+    {
+      "epoch": 1.9794265842545469,
+      "grad_norm": 0.02132524288935295,
+      "learning_rate": 3.203506596444194e-07,
+      "loss": 0.463,
+      "step": 38990
+    },
+    {
+      "epoch": 1.9796804203525782,
+      "grad_norm": 0.020945363773474973,
+      "learning_rate": 3.1247107728776815e-07,
+      "loss": 0.46,
+      "step": 38995
+    },
+    {
+      "epoch": 1.9799342564506097,
+      "grad_norm": 0.019771955108709015,
+      "learning_rate": 3.046895811011363e-07,
+      "loss": 0.4515,
+      "step": 39000
+    },
+    {
+      "epoch": 1.9801880925486413,
+      "grad_norm": 0.02332612793278335,
+      "learning_rate": 2.970061726119133e-07,
+      "loss": 0.434,
+      "step": 39005
+    },
+    {
+      "epoch": 1.9804419286466728,
+      "grad_norm": 0.022591788630783205,
+      "learning_rate": 2.894208533283371e-07,
+      "loss": 0.4332,
+      "step": 39010
+    },
+    {
+      "epoch": 1.9806957647447043,
+      "grad_norm": 0.020037110918590357,
+      "learning_rate": 2.8193362473943885e-07,
+      "loss": 0.4296,
+      "step": 39015
+    },
+    {
+      "epoch": 1.9809496008427359,
+      "grad_norm": 0.023998278576397646,
+      "learning_rate": 2.7454448831487624e-07,
+      "loss": 0.4527,
+      "step": 39020
+    },
+    {
+      "epoch": 1.9812034369407674,
+      "grad_norm": 0.02707422698823243,
+      "learning_rate": 2.672534455051001e-07,
+      "loss": 0.4571,
+      "step": 39025
+    },
+    {
+      "epoch": 1.981457273038799,
+      "grad_norm": 0.025204251574380124,
+      "learning_rate": 2.60060497741299e-07,
+      "loss": 0.461,
+      "step": 39030
+    },
+    {
+      "epoch": 1.9817111091368305,
+      "grad_norm": 0.024377692549952947,
+      "learning_rate": 2.529656464354546e-07,
+      "loss": 0.4683,
+      "step": 39035
+    },
+    {
+      "epoch": 1.981964945234862,
+      "grad_norm": 0.02538043216380801,
+      "learning_rate": 2.459688929802306e-07,
+      "loss": 0.4442,
+      "step": 39040
+    },
+    {
+      "epoch": 1.9822187813328933,
+      "grad_norm": 0.02464209180856968,
+      "learning_rate": 2.3907023874897295e-07,
+      "loss": 0.4245,
+      "step": 39045
+    },
+    {
+      "epoch": 1.9824726174309248,
+      "grad_norm": 0.023932076143716494,
+      "learning_rate": 2.3226968509598712e-07,
+      "loss": 0.4507,
+      "step": 39050
+    },
+    {
+      "epoch": 1.9827264535289564,
+      "grad_norm": 0.020196673491678915,
+      "learning_rate": 2.2556723335609431e-07,
+      "loss": 0.4407,
+      "step": 39055
+    },
+    {
+      "epoch": 1.9829802896269877,
+      "grad_norm": 0.022940306957792315,
+      "learning_rate": 2.1896288484496428e-07,
+      "loss": 0.4575,
+      "step": 39060
+    },
+    {
+      "epoch": 1.9832341257250192,
+      "grad_norm": 0.028907148506462126,
+      "learning_rate": 2.1245664085906002e-07,
+      "loss": 0.4506,
+      "step": 39065
+    },
+    {
+      "epoch": 1.9834879618230508,
+      "grad_norm": 0.023945110409690495,
+      "learning_rate": 2.0604850267547104e-07,
+      "loss": 0.4592,
+      "step": 39070
+    },
+    {
+      "epoch": 1.9837417979210823,
+      "grad_norm": 0.028605852547885884,
+      "learning_rate": 1.9973847155208003e-07,
+      "loss": 0.4588,
+      "step": 39075
+    },
+    {
+      "epoch": 1.9839956340191138,
+      "grad_norm": 0.02150438379068988,
+      "learning_rate": 1.935265487275073e-07,
+      "loss": 0.4286,
+      "step": 39080
+    },
+    {
+      "epoch": 1.9842494701171454,
+      "grad_norm": 0.021743607443387786,
+      "learning_rate": 1.8741273542116633e-07,
+      "loss": 0.4728,
+      "step": 39085
+    },
+    {
+      "epoch": 1.984503306215177,
+      "grad_norm": 0.024759023733061148,
+      "learning_rate": 1.8139703283315267e-07,
+      "loss": 0.4691,
+      "step": 39090
+    },
+    {
+      "epoch": 1.9847571423132084,
+      "grad_norm": 0.021978062996421237,
+      "learning_rate": 1.7547944214429957e-07,
+      "loss": 0.4413,
+      "step": 39095
+    },
+    {
+      "epoch": 1.98501097841124,
+      "grad_norm": 0.02156291605727596,
+      "learning_rate": 1.6965996451623334e-07,
+      "loss": 0.4424,
+      "step": 39100
+    },
+    {
+      "epoch": 1.9852648145092715,
+      "grad_norm": 0.019974654547251697,
+      "learning_rate": 1.6393860109120695e-07,
+      "loss": 0.4581,
+      "step": 39105
+    },
+    {
+      "epoch": 1.9855186506073028,
+      "grad_norm": 0.023303362723310548,
+      "learning_rate": 1.5831535299243304e-07,
+      "loss": 0.4222,
+      "step": 39110
+    },
+    {
+      "epoch": 1.9857724867053343,
+      "grad_norm": 0.021376371336891062,
+      "learning_rate": 1.5279022132358434e-07,
+      "loss": 0.4265,
+      "step": 39115
+    },
+    {
+      "epoch": 1.9860263228033659,
+      "grad_norm": 0.07893795121247377,
+      "learning_rate": 1.473632071692932e-07,
+      "loss": 0.4357,
+      "step": 39120
+    },
+    {
+      "epoch": 1.9862801589013974,
+      "grad_norm": 0.02242228809324414,
+      "learning_rate": 1.4203431159487413e-07,
+      "loss": 0.4276,
+      "step": 39125
+    },
+    {
+      "epoch": 1.9865339949994287,
+      "grad_norm": 0.024029115581471198,
+      "learning_rate": 1.3680353564632375e-07,
+      "loss": 0.4291,
+      "step": 39130
+    },
+    {
+      "epoch": 1.9867878310974603,
+      "grad_norm": 0.02328166827725145,
+      "learning_rate": 1.3167088035037632e-07,
+      "loss": 0.4332,
+      "step": 39135
+    },
+    {
+      "epoch": 1.9870416671954918,
+      "grad_norm": 0.02357094301240424,
+      "learning_rate": 1.266363467146703e-07,
+      "loss": 0.4545,
+      "step": 39140
+    },
+    {
+      "epoch": 1.9872955032935233,
+      "grad_norm": 0.023516549978020867,
+      "learning_rate": 1.216999357273596e-07,
+      "loss": 0.4384,
+      "step": 39145
+    },
+    {
+      "epoch": 1.9875493393915549,
+      "grad_norm": 0.025546553659660538,
+      "learning_rate": 1.1686164835744695e-07,
+      "loss": 0.4447,
+      "step": 39150
+    },
+    {
+      "epoch": 1.9878031754895864,
+      "grad_norm": 0.018763562747325255,
+      "learning_rate": 1.121214855546726e-07,
+      "loss": 0.4391,
+      "step": 39155
+    },
+    {
+      "epoch": 1.988057011587618,
+      "grad_norm": 0.02484040778673898,
+      "learning_rate": 1.074794482495145e-07,
+      "loss": 0.4554,
+      "step": 39160
+    },
+    {
+      "epoch": 1.9883108476856495,
+      "grad_norm": 0.020917177794273913,
+      "learning_rate": 1.0293553735318817e-07,
+      "loss": 0.4487,
+      "step": 39165
+    },
+    {
+      "epoch": 1.988564683783681,
+      "grad_norm": 0.02066839479802247,
+      "learning_rate": 9.84897537576468e-08,
+      "loss": 0.4241,
+      "step": 39170
+    },
+    {
+      "epoch": 1.9888185198817125,
+      "grad_norm": 0.023515586475883914,
+      "learning_rate": 9.414209833552567e-08,
+      "loss": 0.4709,
+      "step": 39175
+    },
+    {
+      "epoch": 1.9890723559797439,
+      "grad_norm": 0.024226686054511816,
+      "learning_rate": 8.989257194030876e-08,
+      "loss": 0.4775,
+      "step": 39180
+    },
+    {
+      "epoch": 1.9893261920777754,
+      "grad_norm": 0.023019658882310785,
+      "learning_rate": 8.57411754061621e-08,
+      "loss": 0.4539,
+      "step": 39185
+    },
+    {
+      "epoch": 1.989580028175807,
+      "grad_norm": 0.02633296041160311,
+      "learning_rate": 8.168790954793392e-08,
+      "loss": 0.4702,
+      "step": 39190
+    },
+    {
+      "epoch": 1.9898338642738382,
+      "grad_norm": 0.02531198180541627,
+      "learning_rate": 7.773277516126553e-08,
+      "loss": 0.4541,
+      "step": 39195
+    },
+    {
+      "epoch": 1.9900877003718698,
+      "grad_norm": 0.025611896685111196,
+      "learning_rate": 7.38757730225359e-08,
+      "loss": 0.4324,
+      "step": 39200
+    },
+    {
+      "epoch": 1.9903415364699013,
+      "grad_norm": 0.02262366655843535,
+      "learning_rate": 7.01169038888616e-08,
+      "loss": 0.4531,
+      "step": 39205
+    },
+    {
+      "epoch": 1.9905953725679328,
+      "grad_norm": 0.022614701857616906,
+      "learning_rate": 6.64561684981524e-08,
+      "loss": 0.4504,
+      "step": 39210
+    },
+    {
+      "epoch": 1.9908492086659644,
+      "grad_norm": 0.024008714004990543,
+      "learning_rate": 6.289356756888908e-08,
+      "loss": 0.4487,
+      "step": 39215
+    },
+    {
+      "epoch": 1.991103044763996,
+      "grad_norm": 0.020603992655022652,
+      "learning_rate": 5.9429101800401174e-08,
+      "loss": 0.4555,
+      "step": 39220
+    },
+    {
+      "epoch": 1.9913568808620274,
+      "grad_norm": 0.020889931974897115,
+      "learning_rate": 5.606277187286679e-08,
+      "loss": 0.463,
+      "step": 39225
+    },
+    {
+      "epoch": 1.991610716960059,
+      "grad_norm": 0.0313564646039055,
+      "learning_rate": 5.2794578446924145e-08,
+      "loss": 0.4804,
+      "step": 39230
+    },
+    {
+      "epoch": 1.9918645530580905,
+      "grad_norm": 0.021498099912898545,
+      "learning_rate": 4.962452216417113e-08,
+      "loss": 0.4825,
+      "step": 39235
+    },
+    {
+      "epoch": 1.992118389156122,
+      "grad_norm": 0.02066023063054367,
+      "learning_rate": 4.655260364694325e-08,
+      "loss": 0.4463,
+      "step": 39240
+    },
+    {
+      "epoch": 1.9923722252541534,
+      "grad_norm": 0.02131649131072014,
+      "learning_rate": 4.357882349809161e-08,
+      "loss": 0.4666,
+      "step": 39245
+    },
+    {
+      "epoch": 1.992626061352185,
+      "grad_norm": 0.02195382258423416,
+      "learning_rate": 4.0703182301482514e-08,
+      "loss": 0.4372,
+      "step": 39250
+    },
+    {
+      "epoch": 1.9928798974502164,
+      "grad_norm": 0.020715639914265123,
+      "learning_rate": 3.792568062155333e-08,
+      "loss": 0.4769,
+      "step": 39255
+    },
+    {
+      "epoch": 1.9931337335482477,
+      "grad_norm": 0.0206383390200722,
+      "learning_rate": 3.524631900347908e-08,
+      "loss": 0.4327,
+      "step": 39260
+    },
+    {
+      "epoch": 1.9933875696462793,
+      "grad_norm": 0.021381354938705657,
+      "learning_rate": 3.266509797328343e-08,
+      "loss": 0.4715,
+      "step": 39265
+    },
+    {
+      "epoch": 1.9936414057443108,
+      "grad_norm": 0.021713933568330897,
+      "learning_rate": 3.018201803756115e-08,
+      "loss": 0.4428,
+      "step": 39270
+    },
+    {
+      "epoch": 1.9938952418423423,
+      "grad_norm": 0.019288162544681065,
+      "learning_rate": 2.7797079683755666e-08,
+      "loss": 0.4647,
+      "step": 39275
+    },
+    {
+      "epoch": 1.9941490779403739,
+      "grad_norm": 0.027710930933847345,
+      "learning_rate": 2.5510283379992505e-08,
+      "loss": 0.4898,
+      "step": 39280
+    },
+    {
+      "epoch": 1.9944029140384054,
+      "grad_norm": 0.021943033021282254,
+      "learning_rate": 2.3321629575245862e-08,
+      "loss": 0.4366,
+      "step": 39285
+    },
+    {
+      "epoch": 1.994656750136437,
+      "grad_norm": 0.02816046395571235,
+      "learning_rate": 2.1231118699061024e-08,
+      "loss": 0.4599,
+      "step": 39290
+    },
+    {
+      "epoch": 1.9949105862344685,
+      "grad_norm": 0.021547960303644455,
+      "learning_rate": 1.9238751161831936e-08,
+      "loss": 0.425,
+      "step": 39295
+    },
+    {
+      "epoch": 1.9951644223325,
+      "grad_norm": 0.020328451114982965,
+      "learning_rate": 1.7344527354634655e-08,
+      "loss": 0.4201,
+      "step": 39300
+    },
+    {
+      "epoch": 1.9954182584305316,
+      "grad_norm": 0.02182918824731763,
+      "learning_rate": 1.554844764928287e-08,
+      "loss": 0.4637,
+      "step": 39305
+    },
+    {
+      "epoch": 1.9956720945285629,
+      "grad_norm": 0.02193159918916923,
+      "learning_rate": 1.3850512398383419e-08,
+      "loss": 0.4707,
+      "step": 39310
+    },
+    {
+      "epoch": 1.9959259306265944,
+      "grad_norm": 0.02149565745718913,
+      "learning_rate": 1.225072193516974e-08,
+      "loss": 0.4406,
+      "step": 39315
+    },
+    {
+      "epoch": 1.996179766724626,
+      "grad_norm": 0.020693611586431505,
+      "learning_rate": 1.0749076573723927e-08,
+      "loss": 0.4555,
+      "step": 39320
+    },
+    {
+      "epoch": 1.9964336028226572,
+      "grad_norm": 0.0239210359312069,
+      "learning_rate": 9.34557660875468e-09,
+      "loss": 0.4554,
+      "step": 39325
+    },
+    {
+      "epoch": 1.9966874389206888,
+      "grad_norm": 0.02080480579062484,
+      "learning_rate": 8.040222315819357e-09,
+      "loss": 0.4551,
+      "step": 39330
+    },
+    {
+      "epoch": 1.9969412750187203,
+      "grad_norm": 0.023148989474884817,
+      "learning_rate": 6.833013951157429e-09,
+      "loss": 0.4525,
+      "step": 39335
+    },
+    {
+      "epoch": 1.9971951111167519,
+      "grad_norm": 0.023818805572660067,
+      "learning_rate": 5.7239517516904925e-09,
+      "loss": 0.4586,
+      "step": 39340
+    },
+    {
+      "epoch": 1.9974489472147834,
+      "grad_norm": 0.029865648276199377,
+      "learning_rate": 4.713035935188792e-09,
+      "loss": 0.4767,
+      "step": 39345
+    },
+    {
+      "epoch": 1.997702783312815,
+      "grad_norm": 0.020425051877142286,
+      "learning_rate": 3.800266699993671e-09,
+      "loss": 0.444,
+      "step": 39350
+    },
+    {
+      "epoch": 1.9979566194108465,
+      "grad_norm": 0.022269852971449254,
+      "learning_rate": 2.9856442253506366e-09,
+      "loss": 0.4632,
+      "step": 39355
+    },
+    {
+      "epoch": 1.998210455508878,
+      "grad_norm": 0.02273454463601086,
+      "learning_rate": 2.2691686711318048e-09,
+      "loss": 0.4404,
+      "step": 39360
+    },
+    {
+      "epoch": 1.9984642916069095,
+      "grad_norm": 0.021280763318387155,
+      "learning_rate": 1.6508401780024329e-09,
+      "loss": 0.4621,
+      "step": 39365
+    },
+    {
+      "epoch": 1.998718127704941,
+      "grad_norm": 0.025913136491658925,
+      "learning_rate": 1.1306588673098972e-09,
+      "loss": 0.4594,
+      "step": 39370
+    },
+    {
+      "epoch": 1.9989719638029724,
+      "grad_norm": 0.02119584869294317,
+      "learning_rate": 7.08624841194716e-10,
+      "loss": 0.456,
+      "step": 39375
+    },
+    {
+      "epoch": 1.999225799901004,
+      "grad_norm": 0.01982643960905116,
+      "learning_rate": 3.8473818242401594e-10,
+      "loss": 0.436,
+      "step": 39380
+    },
+    {
+      "epoch": 1.9994796359990354,
+      "grad_norm": 0.024448883021925015,
+      "learning_rate": 1.5899895472459848e-10,
+      "loss": 0.4621,
+      "step": 39385
+    },
+    {
+      "epoch": 1.999733472097067,
+      "grad_norm": 0.027796596702240602,
+      "learning_rate": 3.140720228334004e-11,
+      "loss": 0.4499,
+      "step": 39390
+    },
+    {
+      "epoch": 1.9999365409754921,
+      "step": 39394,
+      "total_flos": 3.655598871565828e+18,
+      "train_loss": 0.5567794406680932,
+      "train_runtime": 148446.9136,
+      "train_samples_per_second": 2.123,
+      "train_steps_per_second": 0.265
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 39394,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.655598871565828e+18,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}