{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.4444444444444444,
"eval_steps": 500,
"global_step": 325000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0022222222222222222,
"grad_norm": 4388.67578125,
"learning_rate": 5.988e-07,
"loss": 891.7704,
"step": 500
},
{
"epoch": 0.0044444444444444444,
"grad_norm": 1130.9163818359375,
"learning_rate": 1.1988e-06,
"loss": 365.2608,
"step": 1000
},
{
"epoch": 0.006666666666666667,
"grad_norm": 647.9131469726562,
"learning_rate": 1.7988e-06,
"loss": 143.2146,
"step": 1500
},
{
"epoch": 0.008888888888888889,
"grad_norm": 862.5914916992188,
"learning_rate": 2.3988000000000002e-06,
"loss": 101.8926,
"step": 2000
},
{
"epoch": 0.011111111111111112,
"grad_norm": 874.10302734375,
"learning_rate": 2.9988e-06,
"loss": 86.5583,
"step": 2500
},
{
"epoch": 0.013333333333333334,
"grad_norm": 732.438720703125,
"learning_rate": 3.5988e-06,
"loss": 80.9323,
"step": 3000
},
{
"epoch": 0.015555555555555555,
"grad_norm": 493.2248229980469,
"learning_rate": 4.1988e-06,
"loss": 73.8484,
"step": 3500
},
{
"epoch": 0.017777777777777778,
"grad_norm": 678.9496459960938,
"learning_rate": 4.7988e-06,
"loss": 68.8807,
"step": 4000
},
{
"epoch": 0.02,
"grad_norm": 2241.881103515625,
"learning_rate": 5.398800000000001e-06,
"loss": 69.1163,
"step": 4500
},
{
"epoch": 0.022222222222222223,
"grad_norm": 572.26318359375,
"learning_rate": 5.9988e-06,
"loss": 65.9477,
"step": 5000
},
{
"epoch": 0.024444444444444446,
"grad_norm": 472.3359069824219,
"learning_rate": 6.5988e-06,
"loss": 60.6877,
"step": 5500
},
{
"epoch": 0.02666666666666667,
"grad_norm": 713.2996215820312,
"learning_rate": 7.1988000000000004e-06,
"loss": 62.0643,
"step": 6000
},
{
"epoch": 0.028888888888888888,
"grad_norm": 399.187255859375,
"learning_rate": 7.7988e-06,
"loss": 58.1376,
"step": 6500
},
{
"epoch": 0.03111111111111111,
"grad_norm": 494.1978454589844,
"learning_rate": 8.3988e-06,
"loss": 56.4748,
"step": 7000
},
{
"epoch": 0.03333333333333333,
"grad_norm": 338.4364318847656,
"learning_rate": 8.998800000000001e-06,
"loss": 59.7625,
"step": 7500
},
{
"epoch": 0.035555555555555556,
"grad_norm": 287.89202880859375,
"learning_rate": 9.5988e-06,
"loss": 55.0997,
"step": 8000
},
{
"epoch": 0.03777777777777778,
"grad_norm": 213.35813903808594,
"learning_rate": 1.01988e-05,
"loss": 53.2111,
"step": 8500
},
{
"epoch": 0.04,
"grad_norm": 354.8004455566406,
"learning_rate": 1.07988e-05,
"loss": 53.5394,
"step": 9000
},
{
"epoch": 0.042222222222222223,
"grad_norm": 875.28955078125,
"learning_rate": 1.1398800000000002e-05,
"loss": 52.944,
"step": 9500
},
{
"epoch": 0.044444444444444446,
"grad_norm": 523.3621215820312,
"learning_rate": 1.19988e-05,
"loss": 50.8715,
"step": 10000
},
{
"epoch": 0.04666666666666667,
"grad_norm": 545.8438720703125,
"learning_rate": 1.25988e-05,
"loss": 51.0906,
"step": 10500
},
{
"epoch": 0.04888888888888889,
"grad_norm": 371.3891296386719,
"learning_rate": 1.3198800000000001e-05,
"loss": 49.5472,
"step": 11000
},
{
"epoch": 0.051111111111111114,
"grad_norm": 175.73524475097656,
"learning_rate": 1.3798799999999999e-05,
"loss": 47.1287,
"step": 11500
},
{
"epoch": 0.05333333333333334,
"grad_norm": 335.2581481933594,
"learning_rate": 1.43988e-05,
"loss": 47.6528,
"step": 12000
},
{
"epoch": 0.05555555555555555,
"grad_norm": 1022.18115234375,
"learning_rate": 1.4998800000000001e-05,
"loss": 46.9557,
"step": 12500
},
{
"epoch": 0.057777777777777775,
"grad_norm": 380.919677734375,
"learning_rate": 1.55988e-05,
"loss": 44.6385,
"step": 13000
},
{
"epoch": 0.06,
"grad_norm": 305.0384826660156,
"learning_rate": 1.61988e-05,
"loss": 44.5282,
"step": 13500
},
{
"epoch": 0.06222222222222222,
"grad_norm": 458.19122314453125,
"learning_rate": 1.67988e-05,
"loss": 44.6465,
"step": 14000
},
{
"epoch": 0.06444444444444444,
"grad_norm": 143.66160583496094,
"learning_rate": 1.73988e-05,
"loss": 44.0934,
"step": 14500
},
{
"epoch": 0.06666666666666667,
"grad_norm": 436.7533874511719,
"learning_rate": 1.79988e-05,
"loss": 43.5587,
"step": 15000
},
{
"epoch": 0.06888888888888889,
"grad_norm": 455.068359375,
"learning_rate": 1.85988e-05,
"loss": 41.507,
"step": 15500
},
{
"epoch": 0.07111111111111111,
"grad_norm": 394.86676025390625,
"learning_rate": 1.91988e-05,
"loss": 40.521,
"step": 16000
},
{
"epoch": 0.07333333333333333,
"grad_norm": 371.15753173828125,
"learning_rate": 1.97988e-05,
"loss": 40.0934,
"step": 16500
},
{
"epoch": 0.07555555555555556,
"grad_norm": 476.3223571777344,
"learning_rate": 2.0398800000000002e-05,
"loss": 42.2142,
"step": 17000
},
{
"epoch": 0.07777777777777778,
"grad_norm": 498.6954650878906,
"learning_rate": 2.0998800000000003e-05,
"loss": 39.011,
"step": 17500
},
{
"epoch": 0.08,
"grad_norm": 327.6210632324219,
"learning_rate": 2.15988e-05,
"loss": 39.5519,
"step": 18000
},
{
"epoch": 0.08222222222222222,
"grad_norm": 210.87628173828125,
"learning_rate": 2.2198799999999998e-05,
"loss": 39.4893,
"step": 18500
},
{
"epoch": 0.08444444444444445,
"grad_norm": 357.408203125,
"learning_rate": 2.27988e-05,
"loss": 39.7812,
"step": 19000
},
{
"epoch": 0.08666666666666667,
"grad_norm": 312.556640625,
"learning_rate": 2.33988e-05,
"loss": 37.975,
"step": 19500
},
{
"epoch": 0.08888888888888889,
"grad_norm": 363.57891845703125,
"learning_rate": 2.39988e-05,
"loss": 36.2815,
"step": 20000
},
{
"epoch": 0.09111111111111111,
"grad_norm": 332.95977783203125,
"learning_rate": 2.4598800000000002e-05,
"loss": 36.7108,
"step": 20500
},
{
"epoch": 0.09333333333333334,
"grad_norm": 483.03765869140625,
"learning_rate": 2.5198800000000003e-05,
"loss": 36.0883,
"step": 21000
},
{
"epoch": 0.09555555555555556,
"grad_norm": 266.86065673828125,
"learning_rate": 2.5798799999999998e-05,
"loss": 38.5255,
"step": 21500
},
{
"epoch": 0.09777777777777778,
"grad_norm": 371.4537048339844,
"learning_rate": 2.63988e-05,
"loss": 34.8224,
"step": 22000
},
{
"epoch": 0.1,
"grad_norm": 1334.1453857421875,
"learning_rate": 2.69988e-05,
"loss": 36.1617,
"step": 22500
},
{
"epoch": 0.10222222222222223,
"grad_norm": 234.84649658203125,
"learning_rate": 2.75988e-05,
"loss": 35.088,
"step": 23000
},
{
"epoch": 0.10444444444444445,
"grad_norm": 2964.02978515625,
"learning_rate": 2.8198800000000002e-05,
"loss": 34.028,
"step": 23500
},
{
"epoch": 0.10666666666666667,
"grad_norm": 456.6842956542969,
"learning_rate": 2.8798800000000003e-05,
"loss": 36.25,
"step": 24000
},
{
"epoch": 0.10888888888888888,
"grad_norm": 306.76007080078125,
"learning_rate": 2.9398800000000004e-05,
"loss": 33.3643,
"step": 24500
},
{
"epoch": 0.1111111111111111,
"grad_norm": 818.77783203125,
"learning_rate": 2.9998799999999998e-05,
"loss": 36.2583,
"step": 25000
},
{
"epoch": 0.11333333333333333,
"grad_norm": 173.24815368652344,
"learning_rate": 2.9999918308948427e-05,
"loss": 36.2218,
"step": 25500
},
{
"epoch": 0.11555555555555555,
"grad_norm": 542.15234375,
"learning_rate": 2.9999672581521505e-05,
"loss": 33.669,
"step": 26000
},
{
"epoch": 0.11777777777777777,
"grad_norm": 663.7468872070312,
"learning_rate": 2.999926282007839e-05,
"loss": 33.3195,
"step": 26500
},
{
"epoch": 0.12,
"grad_norm": 237.98435974121094,
"learning_rate": 2.9998689029100164e-05,
"loss": 34.6775,
"step": 27000
},
{
"epoch": 0.12222222222222222,
"grad_norm": 350.93109130859375,
"learning_rate": 2.9997951214861724e-05,
"loss": 32.0158,
"step": 27500
},
{
"epoch": 0.12444444444444444,
"grad_norm": 648.1705322265625,
"learning_rate": 2.999704938543168e-05,
"loss": 33.583,
"step": 28000
},
{
"epoch": 0.12666666666666668,
"grad_norm": 263.5220642089844,
"learning_rate": 2.9995983550672296e-05,
"loss": 33.9471,
"step": 28500
},
{
"epoch": 0.1288888888888889,
"grad_norm": 193.79708862304688,
"learning_rate": 2.9994753722239374e-05,
"loss": 32.0882,
"step": 29000
},
{
"epoch": 0.13111111111111112,
"grad_norm": 584.5958862304688,
"learning_rate": 2.999335991358211e-05,
"loss": 32.2817,
"step": 29500
},
{
"epoch": 0.13333333333333333,
"grad_norm": 498.8976745605469,
"learning_rate": 2.999180213994299e-05,
"loss": 31.1343,
"step": 30000
},
{
"epoch": 0.13555555555555557,
"grad_norm": 492.1926574707031,
"learning_rate": 2.9990080418357563e-05,
"loss": 30.703,
"step": 30500
},
{
"epoch": 0.13777777777777778,
"grad_norm": 389.2348937988281,
"learning_rate": 2.99881947676543e-05,
"loss": 32.2483,
"step": 31000
},
{
"epoch": 0.14,
"grad_norm": 687.1718139648438,
"learning_rate": 2.9986145208454382e-05,
"loss": 31.1763,
"step": 31500
},
{
"epoch": 0.14222222222222222,
"grad_norm": 404.84326171875,
"learning_rate": 2.998393176317146e-05,
"loss": 31.7738,
"step": 32000
},
{
"epoch": 0.14444444444444443,
"grad_norm": 492.9033203125,
"learning_rate": 2.9981554456011407e-05,
"loss": 31.7717,
"step": 32500
},
{
"epoch": 0.14666666666666667,
"grad_norm": 393.6338195800781,
"learning_rate": 2.997901331297209e-05,
"loss": 30.5822,
"step": 33000
},
{
"epoch": 0.14888888888888888,
"grad_norm": 510.1676025390625,
"learning_rate": 2.9976308361843024e-05,
"loss": 28.6046,
"step": 33500
},
{
"epoch": 0.1511111111111111,
"grad_norm": 547.7921142578125,
"learning_rate": 2.997343963220513e-05,
"loss": 29.9463,
"step": 34000
},
{
"epoch": 0.15333333333333332,
"grad_norm": 481.76092529296875,
"learning_rate": 2.997040715543038e-05,
"loss": 29.8005,
"step": 34500
},
{
"epoch": 0.15555555555555556,
"grad_norm": 394.83935546875,
"learning_rate": 2.9967210964681447e-05,
"loss": 29.8433,
"step": 35000
},
{
"epoch": 0.15777777777777777,
"grad_norm": 223.97235107421875,
"learning_rate": 2.9963851094911362e-05,
"loss": 30.1751,
"step": 35500
},
{
"epoch": 0.16,
"grad_norm": 587.9564819335938,
"learning_rate": 2.9960327582863126e-05,
"loss": 28.0523,
"step": 36000
},
{
"epoch": 0.1622222222222222,
"grad_norm": 786.5308227539062,
"learning_rate": 2.9956640467069298e-05,
"loss": 30.0858,
"step": 36500
},
{
"epoch": 0.16444444444444445,
"grad_norm": 627.6124267578125,
"learning_rate": 2.995278978785159e-05,
"loss": 27.514,
"step": 37000
},
{
"epoch": 0.16666666666666666,
"grad_norm": 135.85784912109375,
"learning_rate": 2.9948775587320413e-05,
"loss": 29.0652,
"step": 37500
},
{
"epoch": 0.1688888888888889,
"grad_norm": 516.0145874023438,
"learning_rate": 2.9944597909374416e-05,
"loss": 28.7626,
"step": 38000
},
{
"epoch": 0.1711111111111111,
"grad_norm": 381.4872131347656,
"learning_rate": 2.994025679970002e-05,
"loss": 30.4396,
"step": 38500
},
{
"epoch": 0.17333333333333334,
"grad_norm": 612.7399291992188,
"learning_rate": 2.99357523057709e-05,
"loss": 26.5003,
"step": 39000
},
{
"epoch": 0.17555555555555555,
"grad_norm": 365.5273132324219,
"learning_rate": 2.9931084476847486e-05,
"loss": 27.6445,
"step": 39500
},
{
"epoch": 0.17777777777777778,
"grad_norm": 117.53230285644531,
"learning_rate": 2.99262533639764e-05,
"loss": 26.8894,
"step": 40000
},
{
"epoch": 0.18,
"grad_norm": 895.5122680664062,
"learning_rate": 2.9921259019989926e-05,
"loss": 26.3664,
"step": 40500
},
{
"epoch": 0.18222222222222223,
"grad_norm": 493.69683837890625,
"learning_rate": 2.9916101499505408e-05,
"loss": 25.5829,
"step": 41000
},
{
"epoch": 0.18444444444444444,
"grad_norm": 469.6036376953125,
"learning_rate": 2.9910780858924657e-05,
"loss": 27.9183,
"step": 41500
},
{
"epoch": 0.18666666666666668,
"grad_norm": 539.50390625,
"learning_rate": 2.9905297156433357e-05,
"loss": 27.7629,
"step": 42000
},
{
"epoch": 0.18888888888888888,
"grad_norm": 127.55433654785156,
"learning_rate": 2.9899650452000393e-05,
"loss": 26.9212,
"step": 42500
},
{
"epoch": 0.19111111111111112,
"grad_norm": 361.29010009765625,
"learning_rate": 2.9893840807377214e-05,
"loss": 25.828,
"step": 43000
},
{
"epoch": 0.19333333333333333,
"grad_norm": 603.46533203125,
"learning_rate": 2.988786828609718e-05,
"loss": 27.1813,
"step": 43500
},
{
"epoch": 0.19555555555555557,
"grad_norm": 94.64213562011719,
"learning_rate": 2.988173295347481e-05,
"loss": 28.3537,
"step": 44000
},
{
"epoch": 0.19777777777777777,
"grad_norm": 1213.6317138671875,
"learning_rate": 2.987543487660513e-05,
"loss": 25.5299,
"step": 44500
},
{
"epoch": 0.2,
"grad_norm": 504.8955993652344,
"learning_rate": 2.986897412436289e-05,
"loss": 29.0305,
"step": 45000
},
{
"epoch": 0.20222222222222222,
"grad_norm": 734.322021484375,
"learning_rate": 2.9862350767401846e-05,
"loss": 28.3809,
"step": 45500
},
{
"epoch": 0.20444444444444446,
"grad_norm": 1137.0435791015625,
"learning_rate": 2.9855564878153972e-05,
"loss": 26.6201,
"step": 46000
},
{
"epoch": 0.20666666666666667,
"grad_norm": 373.8830871582031,
"learning_rate": 2.984861653082866e-05,
"loss": 25.7129,
"step": 46500
},
{
"epoch": 0.2088888888888889,
"grad_norm": 263.8885498046875,
"learning_rate": 2.9841505801411928e-05,
"loss": 26.2681,
"step": 47000
},
{
"epoch": 0.2111111111111111,
"grad_norm": 1805.83984375,
"learning_rate": 2.983423276766557e-05,
"loss": 26.6592,
"step": 47500
},
{
"epoch": 0.21333333333333335,
"grad_norm": 286.2330627441406,
"learning_rate": 2.982679750912632e-05,
"loss": 25.0459,
"step": 48000
},
{
"epoch": 0.21555555555555556,
"grad_norm": 219.3948516845703,
"learning_rate": 2.9819200107104972e-05,
"loss": 25.5699,
"step": 48500
},
{
"epoch": 0.21777777777777776,
"grad_norm": 412.9397888183594,
"learning_rate": 2.98114406446855e-05,
"loss": 26.1915,
"step": 49000
},
{
"epoch": 0.22,
"grad_norm": 602.8424682617188,
"learning_rate": 2.9803519206724136e-05,
"loss": 27.0685,
"step": 49500
},
{
"epoch": 0.2222222222222222,
"grad_norm": 149.6744384765625,
"learning_rate": 2.9795435879848466e-05,
"loss": 24.8978,
"step": 50000
},
{
"epoch": 0.22444444444444445,
"grad_norm": 339.0307312011719,
"learning_rate": 2.9787190752456448e-05,
"loss": 23.1352,
"step": 50500
},
{
"epoch": 0.22666666666666666,
"grad_norm": 627.1898193359375,
"learning_rate": 2.977878391471548e-05,
"loss": 25.7614,
"step": 51000
},
{
"epoch": 0.2288888888888889,
"grad_norm": 959.9122924804688,
"learning_rate": 2.9770215458561394e-05,
"loss": 23.909,
"step": 51500
},
{
"epoch": 0.2311111111111111,
"grad_norm": 290.6165466308594,
"learning_rate": 2.976148547769745e-05,
"loss": 25.6165,
"step": 52000
},
{
"epoch": 0.23333333333333334,
"grad_norm": 337.4861755371094,
"learning_rate": 2.9752594067593318e-05,
"loss": 24.7856,
"step": 52500
},
{
"epoch": 0.23555555555555555,
"grad_norm": 1252.9945068359375,
"learning_rate": 2.974354132548404e-05,
"loss": 25.353,
"step": 53000
},
{
"epoch": 0.23777777777777778,
"grad_norm": 186.39710998535156,
"learning_rate": 2.973432735036895e-05,
"loss": 24.7965,
"step": 53500
},
{
"epoch": 0.24,
"grad_norm": 795.011962890625,
"learning_rate": 2.9724952243010605e-05,
"loss": 24.6118,
"step": 54000
},
{
"epoch": 0.24222222222222223,
"grad_norm": 217.4955291748047,
"learning_rate": 2.9715416105933675e-05,
"loss": 24.6205,
"step": 54500
},
{
"epoch": 0.24444444444444444,
"grad_norm": 310.7270812988281,
"learning_rate": 2.970571904342383e-05,
"loss": 24.1833,
"step": 55000
},
{
"epoch": 0.24666666666666667,
"grad_norm": 250.29307556152344,
"learning_rate": 2.969586116152659e-05,
"loss": 24.082,
"step": 55500
},
{
"epoch": 0.24888888888888888,
"grad_norm": 243.90106201171875,
"learning_rate": 2.9685842568046167e-05,
"loss": 23.5486,
"step": 56000
},
{
"epoch": 0.2511111111111111,
"grad_norm": 281.5003967285156,
"learning_rate": 2.967566337254431e-05,
"loss": 22.6343,
"step": 56500
},
{
"epoch": 0.25333333333333335,
"grad_norm": 190.99545288085938,
"learning_rate": 2.9665323686339052e-05,
"loss": 25.0189,
"step": 57000
},
{
"epoch": 0.25555555555555554,
"grad_norm": 400.95361328125,
"learning_rate": 2.9654823622503557e-05,
"loss": 23.9388,
"step": 57500
},
{
"epoch": 0.2577777777777778,
"grad_norm": 74.59510040283203,
"learning_rate": 2.9644163295864836e-05,
"loss": 24.4699,
"step": 58000
},
{
"epoch": 0.26,
"grad_norm": 650.9434204101562,
"learning_rate": 2.9633342823002515e-05,
"loss": 22.5825,
"step": 58500
},
{
"epoch": 0.26222222222222225,
"grad_norm": 359.67315673828125,
"learning_rate": 2.9622362322247548e-05,
"loss": 24.1618,
"step": 59000
},
{
"epoch": 0.2644444444444444,
"grad_norm": 0.0,
"learning_rate": 2.9611221913680935e-05,
"loss": 22.4548,
"step": 59500
},
{
"epoch": 0.26666666666666666,
"grad_norm": 392.0536804199219,
"learning_rate": 2.9599921719132397e-05,
"loss": 22.0985,
"step": 60000
},
{
"epoch": 0.2688888888888889,
"grad_norm": 220.76341247558594,
"learning_rate": 2.9588461862179055e-05,
"loss": 22.2635,
"step": 60500
},
{
"epoch": 0.27111111111111114,
"grad_norm": 179.5050048828125,
"learning_rate": 2.9576842468144067e-05,
"loss": 22.9824,
"step": 61000
},
{
"epoch": 0.2733333333333333,
"grad_norm": 625.1077270507812,
"learning_rate": 2.9565063664095265e-05,
"loss": 23.0385,
"step": 61500
},
{
"epoch": 0.27555555555555555,
"grad_norm": 787.576171875,
"learning_rate": 2.955312557884376e-05,
"loss": 23.6391,
"step": 62000
},
{
"epoch": 0.2777777777777778,
"grad_norm": 287.6144714355469,
"learning_rate": 2.954102834294254e-05,
"loss": 22.4223,
"step": 62500
},
{
"epoch": 0.28,
"grad_norm": 598.0758666992188,
"learning_rate": 2.9528772088685042e-05,
"loss": 22.2955,
"step": 63000
},
{
"epoch": 0.2822222222222222,
"grad_norm": 567.0135498046875,
"learning_rate": 2.9516356950103695e-05,
"loss": 22.5473,
"step": 63500
},
{
"epoch": 0.28444444444444444,
"grad_norm": 209.81381225585938,
"learning_rate": 2.950378306296847e-05,
"loss": 23.5631,
"step": 64000
},
{
"epoch": 0.2866666666666667,
"grad_norm": 413.2209167480469,
"learning_rate": 2.9491050564785384e-05,
"loss": 23.1249,
"step": 64500
},
{
"epoch": 0.28888888888888886,
"grad_norm": 140.22494506835938,
"learning_rate": 2.9478159594794985e-05,
"loss": 23.2432,
"step": 65000
},
{
"epoch": 0.2911111111111111,
"grad_norm": 322.0098571777344,
"learning_rate": 2.946511029397087e-05,
"loss": 23.1568,
"step": 65500
},
{
"epoch": 0.29333333333333333,
"grad_norm": 204.205810546875,
"learning_rate": 2.945190280501809e-05,
"loss": 23.9367,
"step": 66000
},
{
"epoch": 0.29555555555555557,
"grad_norm": 247.4243621826172,
"learning_rate": 2.943853727237164e-05,
"loss": 23.2841,
"step": 66500
},
{
"epoch": 0.29777777777777775,
"grad_norm": 767.0619506835938,
"learning_rate": 2.9425013842194833e-05,
"loss": 23.7975,
"step": 67000
},
{
"epoch": 0.3,
"grad_norm": 1255.4112548828125,
"learning_rate": 2.9411332662377744e-05,
"loss": 23.7579,
"step": 67500
},
{
"epoch": 0.3022222222222222,
"grad_norm": 444.0653991699219,
"learning_rate": 2.9397493882535556e-05,
"loss": 22.0943,
"step": 68000
},
{
"epoch": 0.30444444444444446,
"grad_norm": 362.8856506347656,
"learning_rate": 2.9383497654006945e-05,
"loss": 22.6397,
"step": 68500
},
{
"epoch": 0.30666666666666664,
"grad_norm": 450.62237548828125,
"learning_rate": 2.936934412985244e-05,
"loss": 22.2143,
"step": 69000
},
{
"epoch": 0.3088888888888889,
"grad_norm": 148.87391662597656,
"learning_rate": 2.9355033464852697e-05,
"loss": 21.7673,
"step": 69500
},
{
"epoch": 0.3111111111111111,
"grad_norm": 182.1023406982422,
"learning_rate": 2.9340565815506865e-05,
"loss": 22.5551,
"step": 70000
},
{
"epoch": 0.31333333333333335,
"grad_norm": 289.2044677734375,
"learning_rate": 2.932594134003083e-05,
"loss": 22.7895,
"step": 70500
},
{
"epoch": 0.31555555555555553,
"grad_norm": 0.0,
"learning_rate": 2.931116019835553e-05,
"loss": 22.729,
"step": 71000
},
{
"epoch": 0.31777777777777777,
"grad_norm": 361.7475891113281,
"learning_rate": 2.9296222552125148e-05,
"loss": 21.4155,
"step": 71500
},
{
"epoch": 0.32,
"grad_norm": 391.5496520996094,
"learning_rate": 2.928112856469539e-05,
"loss": 22.2849,
"step": 72000
},
{
"epoch": 0.32222222222222224,
"grad_norm": 429.3208923339844,
"learning_rate": 2.9265878401131687e-05,
"loss": 20.7871,
"step": 72500
},
{
"epoch": 0.3244444444444444,
"grad_norm": 912.58154296875,
"learning_rate": 2.9250472228207387e-05,
"loss": 20.8959,
"step": 73000
},
{
"epoch": 0.32666666666666666,
"grad_norm": 145.02476501464844,
"learning_rate": 2.9234910214401926e-05,
"loss": 22.3574,
"step": 73500
},
{
"epoch": 0.3288888888888889,
"grad_norm": 313.38629150390625,
"learning_rate": 2.9219192529899e-05,
"loss": 22.3035,
"step": 74000
},
{
"epoch": 0.33111111111111113,
"grad_norm": 416.150146484375,
"learning_rate": 2.9203319346584673e-05,
"loss": 22.091,
"step": 74500
},
{
"epoch": 0.3333333333333333,
"grad_norm": 125.51025390625,
"learning_rate": 2.9187290838045552e-05,
"loss": 21.6607,
"step": 75000
},
{
"epoch": 0.33555555555555555,
"grad_norm": 256.96875,
"learning_rate": 2.9171107179566826e-05,
"loss": 21.8178,
"step": 75500
},
{
"epoch": 0.3377777777777778,
"grad_norm": 1280.6885986328125,
"learning_rate": 2.91547685481304e-05,
"loss": 21.1816,
"step": 76000
},
{
"epoch": 0.34,
"grad_norm": 276.4981994628906,
"learning_rate": 2.9138275122412927e-05,
"loss": 21.1474,
"step": 76500
},
{
"epoch": 0.3422222222222222,
"grad_norm": 0.0,
"learning_rate": 2.9121627082783864e-05,
"loss": 21.2128,
"step": 77000
},
{
"epoch": 0.34444444444444444,
"grad_norm": 779.9710693359375,
"learning_rate": 2.910482461130351e-05,
"loss": 21.6096,
"step": 77500
},
{
"epoch": 0.3466666666666667,
"grad_norm": 726.4488525390625,
"learning_rate": 2.9087867891721e-05,
"loss": 20.5737,
"step": 78000
},
{
"epoch": 0.3488888888888889,
"grad_norm": 867.9049682617188,
"learning_rate": 2.90707571094723e-05,
"loss": 21.431,
"step": 78500
},
{
"epoch": 0.3511111111111111,
"grad_norm": 1406.6778564453125,
"learning_rate": 2.905349245167819e-05,
"loss": 22.8944,
"step": 79000
},
{
"epoch": 0.35333333333333333,
"grad_norm": 30.834983825683594,
"learning_rate": 2.903607410714219e-05,
"loss": 20.6775,
"step": 79500
},
{
"epoch": 0.35555555555555557,
"grad_norm": 307.98822021484375,
"learning_rate": 2.9018502266348537e-05,
"loss": 19.7868,
"step": 80000
},
{
"epoch": 0.35777777777777775,
"grad_norm": 897.4186401367188,
"learning_rate": 2.900077712146006e-05,
"loss": 22.5855,
"step": 80500
},
{
"epoch": 0.36,
"grad_norm": 203.12339782714844,
"learning_rate": 2.8982898866316107e-05,
"loss": 21.1752,
"step": 81000
},
{
"epoch": 0.3622222222222222,
"grad_norm": 220.3880157470703,
"learning_rate": 2.8964867696430412e-05,
"loss": 21.3629,
"step": 81500
},
{
"epoch": 0.36444444444444446,
"grad_norm": 46.697349548339844,
"learning_rate": 2.8946683808988956e-05,
"loss": 21.3887,
"step": 82000
},
{
"epoch": 0.36666666666666664,
"grad_norm": 179.70164489746094,
"learning_rate": 2.892834740284782e-05,
"loss": 21.825,
"step": 82500
},
{
"epoch": 0.3688888888888889,
"grad_norm": 518.6677856445312,
"learning_rate": 2.8909858678531007e-05,
"loss": 20.7174,
"step": 83000
},
{
"epoch": 0.3711111111111111,
"grad_norm": 643.6600952148438,
"learning_rate": 2.889121783822824e-05,
"loss": 22.1913,
"step": 83500
},
{
"epoch": 0.37333333333333335,
"grad_norm": 262.9464111328125,
"learning_rate": 2.887242508579277e-05,
"loss": 22.0347,
"step": 84000
},
{
"epoch": 0.37555555555555553,
"grad_norm": 1396.894775390625,
"learning_rate": 2.8853480626739115e-05,
"loss": 20.4351,
"step": 84500
},
{
"epoch": 0.37777777777777777,
"grad_norm": 597.5517578125,
"learning_rate": 2.883438466824085e-05,
"loss": 19.2972,
"step": 85000
},
{
"epoch": 0.38,
"grad_norm": 238.96986389160156,
"learning_rate": 2.8815137419128317e-05,
"loss": 20.8544,
"step": 85500
},
{
"epoch": 0.38222222222222224,
"grad_norm": 118.68024444580078,
"learning_rate": 2.8795739089886353e-05,
"loss": 20.0097,
"step": 86000
},
{
"epoch": 0.3844444444444444,
"grad_norm": 281.3915100097656,
"learning_rate": 2.877618989265197e-05,
"loss": 19.3276,
"step": 86500
},
{
"epoch": 0.38666666666666666,
"grad_norm": 412.5798645019531,
"learning_rate": 2.8756490041212067e-05,
"loss": 20.9107,
"step": 87000
},
{
"epoch": 0.3888888888888889,
"grad_norm": 897.23095703125,
"learning_rate": 2.8736639751001056e-05,
"loss": 21.3243,
"step": 87500
},
{
"epoch": 0.39111111111111113,
"grad_norm": 1561.7535400390625,
"learning_rate": 2.871663923909853e-05,
"loss": 20.2997,
"step": 88000
},
{
"epoch": 0.3933333333333333,
"grad_norm": 219.94825744628906,
"learning_rate": 2.8696488724226884e-05,
"loss": 19.0194,
"step": 88500
},
{
"epoch": 0.39555555555555555,
"grad_norm": 175.09353637695312,
"learning_rate": 2.8676188426748923e-05,
"loss": 20.7055,
"step": 89000
},
{
"epoch": 0.3977777777777778,
"grad_norm": 282.50933837890625,
"learning_rate": 2.8655738568665447e-05,
"loss": 19.1337,
"step": 89500
},
{
"epoch": 0.4,
"grad_norm": 60.395172119140625,
"learning_rate": 2.863513937361283e-05,
"loss": 20.728,
"step": 90000
},
{
"epoch": 0.4022222222222222,
"grad_norm": 314.94561767578125,
"learning_rate": 2.861439106686056e-05,
"loss": 19.575,
"step": 90500
},
{
"epoch": 0.40444444444444444,
"grad_norm": 473.822998046875,
"learning_rate": 2.8593493875308805e-05,
"loss": 20.2208,
"step": 91000
},
{
"epoch": 0.4066666666666667,
"grad_norm": 412.5682373046875,
"learning_rate": 2.8572448027485896e-05,
"loss": 19.7487,
"step": 91500
},
{
"epoch": 0.4088888888888889,
"grad_norm": 155.67567443847656,
"learning_rate": 2.855125375354586e-05,
"loss": 18.5899,
"step": 92000
},
{
"epoch": 0.4111111111111111,
"grad_norm": 401.43621826171875,
"learning_rate": 2.8529911285265876e-05,
"loss": 21.001,
"step": 92500
},
{
"epoch": 0.41333333333333333,
"grad_norm": 379.79302978515625,
"learning_rate": 2.8508420856043763e-05,
"loss": 19.6731,
"step": 93000
},
{
"epoch": 0.41555555555555557,
"grad_norm": 224.41383361816406,
"learning_rate": 2.8486782700895407e-05,
"loss": 19.2887,
"step": 93500
},
{
"epoch": 0.4177777777777778,
"grad_norm": 164.6722412109375,
"learning_rate": 2.8464997056452206e-05,
"loss": 20.0013,
"step": 94000
},
{
"epoch": 0.42,
"grad_norm": 241.1973876953125,
"learning_rate": 2.8443064160958483e-05,
"loss": 18.3981,
"step": 94500
},
{
"epoch": 0.4222222222222222,
"grad_norm": 790.732421875,
"learning_rate": 2.8420984254268863e-05,
"loss": 18.5947,
"step": 95000
},
{
"epoch": 0.42444444444444446,
"grad_norm": 446.4692687988281,
"learning_rate": 2.8398757577845665e-05,
"loss": 19.8438,
"step": 95500
},
{
"epoch": 0.4266666666666667,
"grad_norm": 17.384523391723633,
"learning_rate": 2.837638437475627e-05,
"loss": 19.1518,
"step": 96000
},
{
"epoch": 0.4288888888888889,
"grad_norm": 292.8326416015625,
"learning_rate": 2.8353864889670442e-05,
"loss": 18.9518,
"step": 96500
},
{
"epoch": 0.4311111111111111,
"grad_norm": 1216.1114501953125,
"learning_rate": 2.8331199368857656e-05,
"loss": 19.3502,
"step": 97000
},
{
"epoch": 0.43333333333333335,
"grad_norm": 256.9949035644531,
"learning_rate": 2.830838806018442e-05,
"loss": 18.1643,
"step": 97500
},
{
"epoch": 0.43555555555555553,
"grad_norm": 203.0587615966797,
"learning_rate": 2.8285431213111548e-05,
"loss": 19.173,
"step": 98000
},
{
"epoch": 0.43777777777777777,
"grad_norm": 290.00775146484375,
"learning_rate": 2.826232907869145e-05,
"loss": 20.2496,
"step": 98500
},
{
"epoch": 0.44,
"grad_norm": 437.4803771972656,
"learning_rate": 2.823908190956535e-05,
"loss": 19.568,
"step": 99000
},
{
"epoch": 0.44222222222222224,
"grad_norm": 79.48589324951172,
"learning_rate": 2.821568995996058e-05,
"loss": 18.2379,
"step": 99500
},
{
"epoch": 0.4444444444444444,
"grad_norm": 252.00978088378906,
"learning_rate": 2.8192153485687752e-05,
"loss": 19.322,
"step": 100000
},
{
"epoch": 0.44666666666666666,
"grad_norm": 220.2042999267578,
"learning_rate": 2.8168472744137977e-05,
"loss": 18.7556,
"step": 100500
},
{
"epoch": 0.4488888888888889,
"grad_norm": 260.3736572265625,
"learning_rate": 2.814464799428004e-05,
"loss": 18.9124,
"step": 101000
},
{
"epoch": 0.45111111111111113,
"grad_norm": 593.2783203125,
"learning_rate": 2.8120679496657602e-05,
"loss": 19.0002,
"step": 101500
},
{
"epoch": 0.4533333333333333,
"grad_norm": 1167.1844482421875,
"learning_rate": 2.80965675133863e-05,
"loss": 19.2148,
"step": 102000
},
{
"epoch": 0.45555555555555555,
"grad_norm": 15.313830375671387,
"learning_rate": 2.8072312308150934e-05,
"loss": 18.2168,
"step": 102500
},
{
"epoch": 0.4577777777777778,
"grad_norm": 200.6254119873047,
"learning_rate": 2.8047914146202533e-05,
"loss": 19.3346,
"step": 103000
},
{
"epoch": 0.46,
"grad_norm": 426.6332702636719,
"learning_rate": 2.8023373294355492e-05,
"loss": 17.3282,
"step": 103500
},
{
"epoch": 0.4622222222222222,
"grad_norm": 432.8354187011719,
"learning_rate": 2.799869002098463e-05,
"loss": 19.5463,
"step": 104000
},
{
"epoch": 0.46444444444444444,
"grad_norm": 298.2032775878906,
"learning_rate": 2.7973864596022273e-05,
"loss": 18.7725,
"step": 104500
},
{
"epoch": 0.4666666666666667,
"grad_norm": 403.9524841308594,
"learning_rate": 2.7948897290955293e-05,
"loss": 19.5364,
"step": 105000
},
{
"epoch": 0.4688888888888889,
"grad_norm": 51.500240325927734,
"learning_rate": 2.7923788378822135e-05,
"loss": 18.9839,
"step": 105500
},
{
"epoch": 0.4711111111111111,
"grad_norm": 521.7046508789062,
"learning_rate": 2.7898538134209837e-05,
"loss": 18.7831,
"step": 106000
},
{
"epoch": 0.47333333333333333,
"grad_norm": 105.23808288574219,
"learning_rate": 2.787314683325104e-05,
"loss": 18.1615,
"step": 106500
},
{
"epoch": 0.47555555555555556,
"grad_norm": 332.540283203125,
"learning_rate": 2.7847614753620926e-05,
"loss": 19.3657,
"step": 107000
},
{
"epoch": 0.4777777777777778,
"grad_norm": 901.9822387695312,
"learning_rate": 2.7821942174534243e-05,
"loss": 18.9534,
"step": 107500
},
{
"epoch": 0.48,
"grad_norm": 437.5888977050781,
"learning_rate": 2.779612937674219e-05,
"loss": 18.7374,
"step": 108000
},
{
"epoch": 0.4822222222222222,
"grad_norm": 438.2900390625,
"learning_rate": 2.7770176642529397e-05,
"loss": 20.7495,
"step": 108500
},
{
"epoch": 0.48444444444444446,
"grad_norm": 369.8582763671875,
"learning_rate": 2.7744084255710804e-05,
"loss": 17.091,
"step": 109000
},
{
"epoch": 0.4866666666666667,
"grad_norm": 734.362548828125,
"learning_rate": 2.7717852501628574e-05,
"loss": 19.0611,
"step": 109500
},
{
"epoch": 0.4888888888888889,
"grad_norm": 425.8333435058594,
"learning_rate": 2.769148166714897e-05,
"loss": 18.6956,
"step": 110000
},
{
"epoch": 0.4911111111111111,
"grad_norm": 273.7350158691406,
"learning_rate": 2.76649720406592e-05,
"loss": 18.9581,
"step": 110500
},
{
"epoch": 0.49333333333333335,
"grad_norm": 501.64019775390625,
"learning_rate": 2.763832391206431e-05,
"loss": 17.5245,
"step": 111000
},
{
"epoch": 0.4955555555555556,
"grad_norm": 1036.9017333984375,
"learning_rate": 2.7611537572783953e-05,
"loss": 17.9539,
"step": 111500
},
{
"epoch": 0.49777777777777776,
"grad_norm": 63.28369140625,
"learning_rate": 2.7584613315749247e-05,
"loss": 17.5569,
"step": 112000
},
{
"epoch": 0.5,
"grad_norm": 144.62741088867188,
"learning_rate": 2.7557551435399554e-05,
"loss": 18.3981,
"step": 112500
},
{
"epoch": 0.5022222222222222,
"grad_norm": 50.069549560546875,
"learning_rate": 2.753035222767926e-05,
"loss": 18.6216,
"step": 113000
},
{
"epoch": 0.5044444444444445,
"grad_norm": 733.9398193359375,
"learning_rate": 2.7503015990034543e-05,
"loss": 17.1969,
"step": 113500
},
{
"epoch": 0.5066666666666667,
"grad_norm": 444.6294250488281,
"learning_rate": 2.747554302141012e-05,
"loss": 18.0202,
"step": 114000
},
{
"epoch": 0.5088888888888888,
"grad_norm": 59.344337463378906,
"learning_rate": 2.7447933622245974e-05,
"loss": 17.6973,
"step": 114500
},
{
"epoch": 0.5111111111111111,
"grad_norm": 0.0,
"learning_rate": 2.742018809447407e-05,
"loss": 18.7046,
"step": 115000
},
{
"epoch": 0.5133333333333333,
"grad_norm": 421.5881652832031,
"learning_rate": 2.7392306741515056e-05,
"loss": 17.8755,
"step": 115500
},
{
"epoch": 0.5155555555555555,
"grad_norm": 292.31060791015625,
"learning_rate": 2.736428986827494e-05,
"loss": 18.5183,
"step": 116000
},
{
"epoch": 0.5177777777777778,
"grad_norm": 448.3764343261719,
"learning_rate": 2.7336137781141758e-05,
"loss": 18.2446,
"step": 116500
},
{
"epoch": 0.52,
"grad_norm": 312.8506164550781,
"learning_rate": 2.730785078798222e-05,
"loss": 17.2551,
"step": 117000
},
{
"epoch": 0.5222222222222223,
"grad_norm": 198.42645263671875,
"learning_rate": 2.7279429198138368e-05,
"loss": 17.8948,
"step": 117500
},
{
"epoch": 0.5244444444444445,
"grad_norm": 148.22213745117188,
"learning_rate": 2.7250873322424135e-05,
"loss": 17.4501,
"step": 118000
},
{
"epoch": 0.5266666666666666,
"grad_norm": 537.1702270507812,
"learning_rate": 2.7222183473122015e-05,
"loss": 18.9861,
"step": 118500
},
{
"epoch": 0.5288888888888889,
"grad_norm": 363.04833984375,
"learning_rate": 2.71933599639796e-05,
"loss": 18.2579,
"step": 119000
},
{
"epoch": 0.5311111111111111,
"grad_norm": 550.2840576171875,
"learning_rate": 2.7164403110206168e-05,
"loss": 17.3876,
"step": 119500
},
{
"epoch": 0.5333333333333333,
"grad_norm": 99.29381561279297,
"learning_rate": 2.713531322846923e-05,
"loss": 18.4671,
"step": 120000
},
{
"epoch": 0.5355555555555556,
"grad_norm": 267.3313293457031,
"learning_rate": 2.7106090636891077e-05,
"loss": 19.6639,
"step": 120500
},
{
"epoch": 0.5377777777777778,
"grad_norm": 356.0230407714844,
"learning_rate": 2.7076735655045283e-05,
"loss": 18.553,
"step": 121000
},
{
"epoch": 0.54,
"grad_norm": 72.5117416381836,
"learning_rate": 2.7047248603953233e-05,
"loss": 16.9581,
"step": 121500
},
{
"epoch": 0.5422222222222223,
"grad_norm": 283.059326171875,
"learning_rate": 2.701762980608059e-05,
"loss": 17.3513,
"step": 122000
},
{
"epoch": 0.5444444444444444,
"grad_norm": 455.74267578125,
"learning_rate": 2.698787958533378e-05,
"loss": 18.527,
"step": 122500
},
{
"epoch": 0.5466666666666666,
"grad_norm": 264.24700927734375,
"learning_rate": 2.6957998267056454e-05,
"loss": 18.6227,
"step": 123000
},
{
"epoch": 0.5488888888888889,
"grad_norm": 563.1781005859375,
"learning_rate": 2.692798617802592e-05,
"loss": 17.3232,
"step": 123500
},
{
"epoch": 0.5511111111111111,
"grad_norm": 488.3459777832031,
"learning_rate": 2.6897843646449575e-05,
"loss": 17.4262,
"step": 124000
},
{
"epoch": 0.5533333333333333,
"grad_norm": 119.61053466796875,
"learning_rate": 2.6867571001961312e-05,
"loss": 17.022,
"step": 124500
},
{
"epoch": 0.5555555555555556,
"grad_norm": 239.64756774902344,
"learning_rate": 2.683716857561793e-05,
"loss": 17.9908,
"step": 125000
},
{
"epoch": 0.5577777777777778,
"grad_norm": 418.17547607421875,
"learning_rate": 2.6806636699895484e-05,
"loss": 18.6269,
"step": 125500
},
{
"epoch": 0.56,
"grad_norm": 551.5980224609375,
"learning_rate": 2.677597570868568e-05,
"loss": 18.3972,
"step": 126000
},
{
"epoch": 0.5622222222222222,
"grad_norm": 304.7643127441406,
"learning_rate": 2.6745185937292207e-05,
"loss": 18.2829,
"step": 126500
},
{
"epoch": 0.5644444444444444,
"grad_norm": 144.07781982421875,
"learning_rate": 2.6714267722427064e-05,
"loss": 18.218,
"step": 127000
},
{
"epoch": 0.5666666666666667,
"grad_norm": 353.9224548339844,
"learning_rate": 2.66832214022069e-05,
"loss": 18.1345,
"step": 127500
},
{
"epoch": 0.5688888888888889,
"grad_norm": 197.71298217773438,
"learning_rate": 2.66520473161493e-05,
"loss": 17.18,
"step": 128000
},
{
"epoch": 0.5711111111111111,
"grad_norm": 783.7542114257812,
"learning_rate": 2.6620745805169076e-05,
"loss": 16.7577,
"step": 128500
},
{
"epoch": 0.5733333333333334,
"grad_norm": 331.999755859375,
"learning_rate": 2.6589317211574535e-05,
"loss": 16.8293,
"step": 129000
},
{
"epoch": 0.5755555555555556,
"grad_norm": 386.9215393066406,
"learning_rate": 2.6557761879063737e-05,
"loss": 16.7488,
"step": 129500
},
{
"epoch": 0.5777777777777777,
"grad_norm": 670.8016357421875,
"learning_rate": 2.652608015272075e-05,
"loss": 16.6633,
"step": 130000
},
{
"epoch": 0.58,
"grad_norm": 130.0618133544922,
"learning_rate": 2.6494272379011853e-05,
"loss": 17.5815,
"step": 130500
},
{
"epoch": 0.5822222222222222,
"grad_norm": 363.4728698730469,
"learning_rate": 2.6462338905781766e-05,
"loss": 17.5676,
"step": 131000
},
{
"epoch": 0.5844444444444444,
"grad_norm": 194.19207763671875,
"learning_rate": 2.6430280082249832e-05,
"loss": 19.0677,
"step": 131500
},
{
"epoch": 0.5866666666666667,
"grad_norm": 478.40692138671875,
"learning_rate": 2.6398096259006212e-05,
"loss": 16.4278,
"step": 132000
},
{
"epoch": 0.5888888888888889,
"grad_norm": 673.5048828125,
"learning_rate": 2.636578778800804e-05,
"loss": 17.7745,
"step": 132500
},
{
"epoch": 0.5911111111111111,
"grad_norm": 208.15098571777344,
"learning_rate": 2.633335502257558e-05,
"loss": 17.4536,
"step": 133000
},
{
"epoch": 0.5933333333333334,
"grad_norm": 1426.62109375,
"learning_rate": 2.6300798317388357e-05,
"loss": 17.152,
"step": 133500
},
{
"epoch": 0.5955555555555555,
"grad_norm": 253.73455810546875,
"learning_rate": 2.626811802848128e-05,
"loss": 16.4736,
"step": 134000
},
{
"epoch": 0.5977777777777777,
"grad_norm": 890.9122924804688,
"learning_rate": 2.623531451324076e-05,
"loss": 17.913,
"step": 134500
},
{
"epoch": 0.6,
"grad_norm": 880.38671875,
"learning_rate": 2.6202388130400772e-05,
"loss": 17.0165,
"step": 135000
},
{
"epoch": 0.6022222222222222,
"grad_norm": 284.1332702636719,
"learning_rate": 2.616933924003898e-05,
"loss": 17.0189,
"step": 135500
},
{
"epoch": 0.6044444444444445,
"grad_norm": 23.394821166992188,
"learning_rate": 2.6136168203572742e-05,
"loss": 17.2017,
"step": 136000
},
{
"epoch": 0.6066666666666667,
"grad_norm": 790.5655517578125,
"learning_rate": 2.61028753837552e-05,
"loss": 15.7028,
"step": 136500
},
{
"epoch": 0.6088888888888889,
"grad_norm": 196.9662628173828,
"learning_rate": 2.6069461144671298e-05,
"loss": 16.4864,
"step": 137000
},
{
"epoch": 0.6111111111111112,
"grad_norm": 178.7125244140625,
"learning_rate": 2.6035925851733808e-05,
"loss": 17.2559,
"step": 137500
},
{
"epoch": 0.6133333333333333,
"grad_norm": 402.0807800292969,
"learning_rate": 2.600226987167931e-05,
"loss": 17.2757,
"step": 138000
},
{
"epoch": 0.6155555555555555,
"grad_norm": 252.41526794433594,
"learning_rate": 2.5968493572564218e-05,
"loss": 16.8407,
"step": 138500
},
{
"epoch": 0.6177777777777778,
"grad_norm": 0.0,
"learning_rate": 2.593459732376072e-05,
"loss": 16.4473,
"step": 139000
},
{
"epoch": 0.62,
"grad_norm": 324.2782287597656,
"learning_rate": 2.590058149595277e-05,
"loss": 17.0955,
"step": 139500
},
{
"epoch": 0.6222222222222222,
"grad_norm": 259.27532958984375,
"learning_rate": 2.5866446461132007e-05,
"loss": 17.8668,
"step": 140000
},
{
"epoch": 0.6244444444444445,
"grad_norm": 504.20550537109375,
"learning_rate": 2.5832192592593707e-05,
"loss": 18.1582,
"step": 140500
},
{
"epoch": 0.6266666666666667,
"grad_norm": 464.8078918457031,
"learning_rate": 2.5797820264932682e-05,
"loss": 16.0802,
"step": 141000
},
{
"epoch": 0.6288888888888889,
"grad_norm": 294.2264099121094,
"learning_rate": 2.5763329854039204e-05,
"loss": 16.0784,
"step": 141500
},
{
"epoch": 0.6311111111111111,
"grad_norm": 212.64166259765625,
"learning_rate": 2.572872173709488e-05,
"loss": 16.1939,
"step": 142000
},
{
"epoch": 0.6333333333333333,
"grad_norm": 313.9952087402344,
"learning_rate": 2.5693996292568535e-05,
"loss": 16.6863,
"step": 142500
},
{
"epoch": 0.6355555555555555,
"grad_norm": 350.9505615234375,
"learning_rate": 2.565915390021206e-05,
"loss": 15.5249,
"step": 143000
},
{
"epoch": 0.6377777777777778,
"grad_norm": 113.72864532470703,
"learning_rate": 2.562419494105628e-05,
"loss": 17.4712,
"step": 143500
},
{
"epoch": 0.64,
"grad_norm": 439.85784912109375,
"learning_rate": 2.558911979740677e-05,
"loss": 16.1441,
"step": 144000
},
{
"epoch": 0.6422222222222222,
"grad_norm": 107.58014678955078,
"learning_rate": 2.5553928852839686e-05,
"loss": 17.8531,
"step": 144500
},
{
"epoch": 0.6444444444444445,
"grad_norm": 314.7883605957031,
"learning_rate": 2.5518622492197558e-05,
"loss": 16.5554,
"step": 145000
},
{
"epoch": 0.6466666666666666,
"grad_norm": 146.2752227783203,
"learning_rate": 2.5483201101585085e-05,
"loss": 17.0876,
"step": 145500
},
{
"epoch": 0.6488888888888888,
"grad_norm": 493.06488037109375,
"learning_rate": 2.544766506836492e-05,
"loss": 16.4471,
"step": 146000
},
{
"epoch": 0.6511111111111111,
"grad_norm": 331.6954040527344,
"learning_rate": 2.5412014781153433e-05,
"loss": 16.6836,
"step": 146500
},
{
"epoch": 0.6533333333333333,
"grad_norm": 324.4432373046875,
"learning_rate": 2.537625062981645e-05,
"loss": 16.9327,
"step": 147000
},
{
"epoch": 0.6555555555555556,
"grad_norm": 447.0750732421875,
"learning_rate": 2.5340373005465007e-05,
"loss": 16.6021,
"step": 147500
},
{
"epoch": 0.6577777777777778,
"grad_norm": 74.82227325439453,
"learning_rate": 2.530438230045105e-05,
"loss": 16.6877,
"step": 148000
},
{
"epoch": 0.66,
"grad_norm": 408.71380615234375,
"learning_rate": 2.5268278908363157e-05,
"loss": 15.4423,
"step": 148500
},
{
"epoch": 0.6622222222222223,
"grad_norm": 434.0395812988281,
"learning_rate": 2.523206322402225e-05,
"loss": 16.9507,
"step": 149000
},
{
"epoch": 0.6644444444444444,
"grad_norm": 0.0,
"learning_rate": 2.5195735643477244e-05,
"loss": 17.0505,
"step": 149500
},
{
"epoch": 0.6666666666666666,
"grad_norm": 744.4578857421875,
"learning_rate": 2.5159296564000744e-05,
"loss": 16.4468,
"step": 150000
},
{
"epoch": 0.6688888888888889,
"grad_norm": 203.68789672851562,
"learning_rate": 2.5122746384084683e-05,
"loss": 15.6102,
"step": 150500
},
{
"epoch": 0.6711111111111111,
"grad_norm": 304.8150329589844,
"learning_rate": 2.5086085503435973e-05,
"loss": 16.5682,
"step": 151000
},
{
"epoch": 0.6733333333333333,
"grad_norm": 212.24891662597656,
"learning_rate": 2.504931432297213e-05,
"loss": 16.6716,
"step": 151500
},
{
"epoch": 0.6755555555555556,
"grad_norm": 143.3702392578125,
"learning_rate": 2.5012433244816894e-05,
"loss": 17.2561,
"step": 152000
},
{
"epoch": 0.6777777777777778,
"grad_norm": 82.70915985107422,
"learning_rate": 2.4975442672295827e-05,
"loss": 17.7661,
"step": 152500
},
{
"epoch": 0.68,
"grad_norm": 81.59647369384766,
"learning_rate": 2.4938343009931908e-05,
"loss": 15.6807,
"step": 153000
},
{
"epoch": 0.6822222222222222,
"grad_norm": 483.339111328125,
"learning_rate": 2.4901134663441088e-05,
"loss": 16.8148,
"step": 153500
},
{
"epoch": 0.6844444444444444,
"grad_norm": 0.0,
"learning_rate": 2.4863818039727895e-05,
"loss": 17.1794,
"step": 154000
},
{
"epoch": 0.6866666666666666,
"grad_norm": 211.79966735839844,
"learning_rate": 2.482639354688094e-05,
"loss": 15.5973,
"step": 154500
},
{
"epoch": 0.6888888888888889,
"grad_norm": 242.6669464111328,
"learning_rate": 2.4788861594168485e-05,
"loss": 16.9753,
"step": 155000
},
{
"epoch": 0.6911111111111111,
"grad_norm": 186.95126342773438,
"learning_rate": 2.475122259203395e-05,
"loss": 15.0561,
"step": 155500
},
{
"epoch": 0.6933333333333334,
"grad_norm": 332.6864929199219,
"learning_rate": 2.471347695209143e-05,
"loss": 16.4118,
"step": 156000
},
{
"epoch": 0.6955555555555556,
"grad_norm": 373.36944580078125,
"learning_rate": 2.4675625087121204e-05,
"loss": 16.9823,
"step": 156500
},
{
"epoch": 0.6977777777777778,
"grad_norm": 61.25292205810547,
"learning_rate": 2.4637667411065197e-05,
"loss": 16.2012,
"step": 157000
},
{
"epoch": 0.7,
"grad_norm": 549.8672485351562,
"learning_rate": 2.459960433902247e-05,
"loss": 17.6019,
"step": 157500
},
{
"epoch": 0.7022222222222222,
"grad_norm": 478.5077209472656,
"learning_rate": 2.4561436287244685e-05,
"loss": 17.6805,
"step": 158000
},
{
"epoch": 0.7044444444444444,
"grad_norm": 218.25418090820312,
"learning_rate": 2.4523163673131538e-05,
"loss": 15.3333,
"step": 158500
},
{
"epoch": 0.7066666666666667,
"grad_norm": 383.55767822265625,
"learning_rate": 2.4484786915226213e-05,
"loss": 16.3707,
"step": 159000
},
{
"epoch": 0.7088888888888889,
"grad_norm": 729.36474609375,
"learning_rate": 2.444630643321078e-05,
"loss": 15.4495,
"step": 159500
},
{
"epoch": 0.7111111111111111,
"grad_norm": 578.4398193359375,
"learning_rate": 2.4407722647901624e-05,
"loss": 17.7177,
"step": 160000
},
{
"epoch": 0.7133333333333334,
"grad_norm": 284.87823486328125,
"learning_rate": 2.4369035981244836e-05,
"loss": 16.7006,
"step": 160500
},
{
"epoch": 0.7155555555555555,
"grad_norm": 287.9507751464844,
"learning_rate": 2.4330246856311613e-05,
"loss": 16.7623,
"step": 161000
},
{
"epoch": 0.7177777777777777,
"grad_norm": 518.5828857421875,
"learning_rate": 2.429135569729361e-05,
"loss": 18.6743,
"step": 161500
},
{
"epoch": 0.72,
"grad_norm": 741.138916015625,
"learning_rate": 2.42523629294983e-05,
"loss": 15.989,
"step": 162000
},
{
"epoch": 0.7222222222222222,
"grad_norm": 0.0,
"learning_rate": 2.4213268979344362e-05,
"loss": 16.102,
"step": 162500
},
{
"epoch": 0.7244444444444444,
"grad_norm": 358.8752746582031,
"learning_rate": 2.417407427435696e-05,
"loss": 15.923,
"step": 163000
},
{
"epoch": 0.7266666666666667,
"grad_norm": 570.9427490234375,
"learning_rate": 2.4134779243163105e-05,
"loss": 16.5887,
"step": 163500
},
{
"epoch": 0.7288888888888889,
"grad_norm": 435.3963928222656,
"learning_rate": 2.409538431548697e-05,
"loss": 15.2045,
"step": 164000
},
{
"epoch": 0.7311111111111112,
"grad_norm": 298.369140625,
"learning_rate": 2.405588992214517e-05,
"loss": 16.1364,
"step": 164500
},
{
"epoch": 0.7333333333333333,
"grad_norm": 161.4807586669922,
"learning_rate": 2.4016296495042065e-05,
"loss": 16.3397,
"step": 165000
},
{
"epoch": 0.7355555555555555,
"grad_norm": 450.2773742675781,
"learning_rate": 2.3976604467165035e-05,
"loss": 14.8856,
"step": 165500
},
{
"epoch": 0.7377777777777778,
"grad_norm": 62.63951110839844,
"learning_rate": 2.3936814272579718e-05,
"loss": 16.1214,
"step": 166000
},
{
"epoch": 0.74,
"grad_norm": 295.8753662109375,
"learning_rate": 2.389692634642533e-05,
"loss": 16.7177,
"step": 166500
},
{
"epoch": 0.7422222222222222,
"grad_norm": 83.56742858886719,
"learning_rate": 2.385694112490983e-05,
"loss": 16.233,
"step": 167000
},
{
"epoch": 0.7444444444444445,
"grad_norm": 859.1819458007812,
"learning_rate": 2.381685904530519e-05,
"loss": 16.7252,
"step": 167500
},
{
"epoch": 0.7466666666666667,
"grad_norm": 414.3497009277344,
"learning_rate": 2.377668054594262e-05,
"loss": 16.0818,
"step": 168000
},
{
"epoch": 0.7488888888888889,
"grad_norm": 291.54498291015625,
"learning_rate": 2.373640606620775e-05,
"loss": 14.5691,
"step": 168500
},
{
"epoch": 0.7511111111111111,
"grad_norm": 594.7430419921875,
"learning_rate": 2.369603604653583e-05,
"loss": 16.9945,
"step": 169000
},
{
"epoch": 0.7533333333333333,
"grad_norm": 202.13864135742188,
"learning_rate": 2.3655570928406937e-05,
"loss": 15.3943,
"step": 169500
},
{
"epoch": 0.7555555555555555,
"grad_norm": 212.4605712890625,
"learning_rate": 2.361501115434112e-05,
"loss": 16.8734,
"step": 170000
},
{
"epoch": 0.7577777777777778,
"grad_norm": 414.1224060058594,
"learning_rate": 2.357435716789356e-05,
"loss": 15.8502,
"step": 170500
},
{
"epoch": 0.76,
"grad_norm": 92.9588394165039,
"learning_rate": 2.3533609413649745e-05,
"loss": 16.2583,
"step": 171000
},
{
"epoch": 0.7622222222222222,
"grad_norm": 308.7859802246094,
"learning_rate": 2.349276833722059e-05,
"loss": 16.0059,
"step": 171500
},
{
"epoch": 0.7644444444444445,
"grad_norm": 437.89178466796875,
"learning_rate": 2.345183438523756e-05,
"loss": 16.7771,
"step": 172000
},
{
"epoch": 0.7666666666666667,
"grad_norm": 28.078920364379883,
"learning_rate": 2.3410808005347798e-05,
"loss": 17.1159,
"step": 172500
},
{
"epoch": 0.7688888888888888,
"grad_norm": 243.4501495361328,
"learning_rate": 2.336968964620922e-05,
"loss": 17.4442,
"step": 173000
},
{
"epoch": 0.7711111111111111,
"grad_norm": 873.5339965820312,
"learning_rate": 2.3328479757485615e-05,
"loss": 16.389,
"step": 173500
},
{
"epoch": 0.7733333333333333,
"grad_norm": 487.0278015136719,
"learning_rate": 2.328717878984172e-05,
"loss": 15.1246,
"step": 174000
},
{
"epoch": 0.7755555555555556,
"grad_norm": 1256.6805419921875,
"learning_rate": 2.32457871949383e-05,
"loss": 16.0509,
"step": 174500
},
{
"epoch": 0.7777777777777778,
"grad_norm": 437.3548278808594,
"learning_rate": 2.320430542542721e-05,
"loss": 14.2762,
"step": 175000
},
{
"epoch": 0.78,
"grad_norm": 50.979103088378906,
"learning_rate": 2.3162733934946437e-05,
"loss": 15.7425,
"step": 175500
},
{
"epoch": 0.7822222222222223,
"grad_norm": 461.4090270996094,
"learning_rate": 2.3121073178115136e-05,
"loss": 17.1488,
"step": 176000
},
{
"epoch": 0.7844444444444445,
"grad_norm": 163.63095092773438,
"learning_rate": 2.307932361052867e-05,
"loss": 14.9277,
"step": 176500
},
{
"epoch": 0.7866666666666666,
"grad_norm": 349.4720458984375,
"learning_rate": 2.3037485688753623e-05,
"loss": 15.1278,
"step": 177000
},
{
"epoch": 0.7888888888888889,
"grad_norm": 266.4578857421875,
"learning_rate": 2.2995559870322797e-05,
"loss": 14.9445,
"step": 177500
},
{
"epoch": 0.7911111111111111,
"grad_norm": 259.8016357421875,
"learning_rate": 2.2953546613730237e-05,
"loss": 15.8992,
"step": 178000
},
{
"epoch": 0.7933333333333333,
"grad_norm": 302.3138732910156,
"learning_rate": 2.2911446378426177e-05,
"loss": 16.151,
"step": 178500
},
{
"epoch": 0.7955555555555556,
"grad_norm": 302.546142578125,
"learning_rate": 2.286925962481205e-05,
"loss": 15.9711,
"step": 179000
},
{
"epoch": 0.7977777777777778,
"grad_norm": 161.2322998046875,
"learning_rate": 2.282698681423543e-05,
"loss": 15.3818,
"step": 179500
},
{
"epoch": 0.8,
"grad_norm": 338.44873046875,
"learning_rate": 2.2784628408985005e-05,
"loss": 16.7231,
"step": 180000
},
{
"epoch": 0.8022222222222222,
"grad_norm": 331.5046691894531,
"learning_rate": 2.2742184872285507e-05,
"loss": 15.7784,
"step": 180500
},
{
"epoch": 0.8044444444444444,
"grad_norm": 532.013671875,
"learning_rate": 2.2699656668292653e-05,
"loss": 15.8937,
"step": 181000
},
{
"epoch": 0.8066666666666666,
"grad_norm": 30.83024024963379,
"learning_rate": 2.2657044262088068e-05,
"loss": 14.8331,
"step": 181500
},
{
"epoch": 0.8088888888888889,
"grad_norm": 208.97105407714844,
"learning_rate": 2.26143481196742e-05,
"loss": 14.8417,
"step": 182000
},
{
"epoch": 0.8111111111111111,
"grad_norm": 178.349609375,
"learning_rate": 2.2571568707969224e-05,
"loss": 15.9551,
"step": 182500
},
{
"epoch": 0.8133333333333334,
"grad_norm": 191.2917938232422,
"learning_rate": 2.2528706494801933e-05,
"loss": 15.4303,
"step": 183000
},
{
"epoch": 0.8155555555555556,
"grad_norm": 379.2752685546875,
"learning_rate": 2.248576194890661e-05,
"loss": 17.1609,
"step": 183500
},
{
"epoch": 0.8177777777777778,
"grad_norm": 49.782352447509766,
"learning_rate": 2.244273553991795e-05,
"loss": 16.6368,
"step": 184000
},
{
"epoch": 0.82,
"grad_norm": 164.4068603515625,
"learning_rate": 2.239962773836585e-05,
"loss": 16.0915,
"step": 184500
},
{
"epoch": 0.8222222222222222,
"grad_norm": 120.09187316894531,
"learning_rate": 2.2356439015670335e-05,
"loss": 15.3172,
"step": 185000
},
{
"epoch": 0.8244444444444444,
"grad_norm": 119.5110855102539,
"learning_rate": 2.2313169844136342e-05,
"loss": 15.7401,
"step": 185500
},
{
"epoch": 0.8266666666666667,
"grad_norm": 238.2360076904297,
"learning_rate": 2.226982069694861e-05,
"loss": 15.5555,
"step": 186000
},
{
"epoch": 0.8288888888888889,
"grad_norm": 234.07911682128906,
"learning_rate": 2.2226392048166467e-05,
"loss": 15.8124,
"step": 186500
},
{
"epoch": 0.8311111111111111,
"grad_norm": 0.0,
"learning_rate": 2.218288437271865e-05,
"loss": 14.9297,
"step": 187000
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.0,
"learning_rate": 2.213929814639814e-05,
"loss": 14.9676,
"step": 187500
},
{
"epoch": 0.8355555555555556,
"grad_norm": 221.94076538085938,
"learning_rate": 2.2095633845856912e-05,
"loss": 14.5759,
"step": 188000
},
{
"epoch": 0.8377777777777777,
"grad_norm": 798.3099365234375,
"learning_rate": 2.2051891948600773e-05,
"loss": 16.8336,
"step": 188500
},
{
"epoch": 0.84,
"grad_norm": 148.87489318847656,
"learning_rate": 2.2008072932984095e-05,
"loss": 15.6524,
"step": 189000
},
{
"epoch": 0.8422222222222222,
"grad_norm": 979.9264526367188,
"learning_rate": 2.196417727820461e-05,
"loss": 14.5125,
"step": 189500
},
{
"epoch": 0.8444444444444444,
"grad_norm": 273.1609191894531,
"learning_rate": 2.1920205464298174e-05,
"loss": 14.7308,
"step": 190000
},
{
"epoch": 0.8466666666666667,
"grad_norm": 494.7351989746094,
"learning_rate": 2.187615797213349e-05,
"loss": 14.448,
"step": 190500
},
{
"epoch": 0.8488888888888889,
"grad_norm": 2433.17529296875,
"learning_rate": 2.183203528340689e-05,
"loss": 15.0146,
"step": 191000
},
{
"epoch": 0.8511111111111112,
"grad_norm": 446.34490966796875,
"learning_rate": 2.1787837880637014e-05,
"loss": 15.0511,
"step": 191500
},
{
"epoch": 0.8533333333333334,
"grad_norm": 596.4390869140625,
"learning_rate": 2.1743566247159586e-05,
"loss": 14.3164,
"step": 192000
},
{
"epoch": 0.8555555555555555,
"grad_norm": 927.9017333984375,
"learning_rate": 2.1699220867122087e-05,
"loss": 14.7031,
"step": 192500
},
{
"epoch": 0.8577777777777778,
"grad_norm": 174.5888671875,
"learning_rate": 2.16548022254785e-05,
"loss": 14.77,
"step": 193000
},
{
"epoch": 0.86,
"grad_norm": 346.9240417480469,
"learning_rate": 2.161031080798397e-05,
"loss": 14.618,
"step": 193500
},
{
"epoch": 0.8622222222222222,
"grad_norm": 533.3963623046875,
"learning_rate": 2.156574710118951e-05,
"loss": 14.1816,
"step": 194000
},
{
"epoch": 0.8644444444444445,
"grad_norm": 234.50579833984375,
"learning_rate": 2.1521111592436673e-05,
"loss": 15.6746,
"step": 194500
},
{
"epoch": 0.8666666666666667,
"grad_norm": 654.4329833984375,
"learning_rate": 2.1476404769852238e-05,
"loss": 16.4027,
"step": 195000
},
{
"epoch": 0.8688888888888889,
"grad_norm": 97.57040405273438,
"learning_rate": 2.143162712234285e-05,
"loss": 14.6315,
"step": 195500
},
{
"epoch": 0.8711111111111111,
"grad_norm": 347.2988586425781,
"learning_rate": 2.138677913958969e-05,
"loss": 14.8534,
"step": 196000
},
{
"epoch": 0.8733333333333333,
"grad_norm": 61.20378112792969,
"learning_rate": 2.1341861312043116e-05,
"loss": 14.0666,
"step": 196500
},
{
"epoch": 0.8755555555555555,
"grad_norm": 57.949256896972656,
"learning_rate": 2.1296874130917282e-05,
"loss": 13.8681,
"step": 197000
},
{
"epoch": 0.8777777777777778,
"grad_norm": 417.0851745605469,
"learning_rate": 2.1251818088184808e-05,
"loss": 15.6193,
"step": 197500
},
{
"epoch": 0.88,
"grad_norm": 261.3269958496094,
"learning_rate": 2.1206693676571347e-05,
"loss": 15.1966,
"step": 198000
},
{
"epoch": 0.8822222222222222,
"grad_norm": 105.9546890258789,
"learning_rate": 2.1161501389550242e-05,
"loss": 15.0815,
"step": 198500
},
{
"epoch": 0.8844444444444445,
"grad_norm": 453.0606994628906,
"learning_rate": 2.11162417213371e-05,
"loss": 15.7839,
"step": 199000
},
{
"epoch": 0.8866666666666667,
"grad_norm": 0.0,
"learning_rate": 2.10709151668844e-05,
"loss": 15.5458,
"step": 199500
},
{
"epoch": 0.8888888888888888,
"grad_norm": 373.2171630859375,
"learning_rate": 2.1025522221876087e-05,
"loss": 14.8535,
"step": 200000
},
{
"epoch": 0.8911111111111111,
"grad_norm": 182.15408325195312,
"learning_rate": 2.098006338272212e-05,
"loss": 15.9142,
"step": 200500
},
{
"epoch": 0.8933333333333333,
"grad_norm": 159.78123474121094,
"learning_rate": 2.09345391465531e-05,
"loss": 17.2029,
"step": 201000
},
{
"epoch": 0.8955555555555555,
"grad_norm": 761.6434326171875,
"learning_rate": 2.0888950011214763e-05,
"loss": 14.7574,
"step": 201500
},
{
"epoch": 0.8977777777777778,
"grad_norm": 602.9556274414062,
"learning_rate": 2.0843296475262604e-05,
"loss": 15.3703,
"step": 202000
},
{
"epoch": 0.9,
"grad_norm": 44.228267669677734,
"learning_rate": 2.0797579037956364e-05,
"loss": 16.191,
"step": 202500
},
{
"epoch": 0.9022222222222223,
"grad_norm": 191.9353485107422,
"learning_rate": 2.075179819925462e-05,
"loss": 15.4188,
"step": 203000
},
{
"epoch": 0.9044444444444445,
"grad_norm": 41.51668930053711,
"learning_rate": 2.0705954459809293e-05,
"loss": 14.5222,
"step": 203500
},
{
"epoch": 0.9066666666666666,
"grad_norm": 281.99273681640625,
"learning_rate": 2.0660048320960164e-05,
"loss": 15.4986,
"step": 204000
},
{
"epoch": 0.9088888888888889,
"grad_norm": 3.3990941047668457,
"learning_rate": 2.061408028472942e-05,
"loss": 15.7127,
"step": 204500
},
{
"epoch": 0.9111111111111111,
"grad_norm": 151.7320556640625,
"learning_rate": 2.0568050853816137e-05,
"loss": 14.9146,
"step": 205000
},
{
"epoch": 0.9133333333333333,
"grad_norm": 223.80499267578125,
"learning_rate": 2.0521960531590795e-05,
"loss": 15.3864,
"step": 205500
},
{
"epoch": 0.9155555555555556,
"grad_norm": 394.2869567871094,
"learning_rate": 2.0475809822089774e-05,
"loss": 15.7962,
"step": 206000
},
{
"epoch": 0.9177777777777778,
"grad_norm": 471.55072021484375,
"learning_rate": 2.0429599230009844e-05,
"loss": 14.9467,
"step": 206500
},
{
"epoch": 0.92,
"grad_norm": 773.841552734375,
"learning_rate": 2.0383329260702634e-05,
"loss": 14.1642,
"step": 207000
},
{
"epoch": 0.9222222222222223,
"grad_norm": 269.2467346191406,
"learning_rate": 2.0337000420169113e-05,
"loss": 14.8939,
"step": 207500
},
{
"epoch": 0.9244444444444444,
"grad_norm": 262.891357421875,
"learning_rate": 2.0290613215054063e-05,
"loss": 14.6107,
"step": 208000
},
{
"epoch": 0.9266666666666666,
"grad_norm": 370.94036865234375,
"learning_rate": 2.0244168152640522e-05,
"loss": 14.8097,
"step": 208500
},
{
"epoch": 0.9288888888888889,
"grad_norm": 526.1622924804688,
"learning_rate": 2.0197665740844254e-05,
"loss": 13.5514,
"step": 209000
},
{
"epoch": 0.9311111111111111,
"grad_norm": 402.8370361328125,
"learning_rate": 2.0151106488208185e-05,
"loss": 15.5235,
"step": 209500
},
{
"epoch": 0.9333333333333333,
"grad_norm": 240.7682647705078,
"learning_rate": 2.0104490903896834e-05,
"loss": 15.7625,
"step": 210000
},
{
"epoch": 0.9355555555555556,
"grad_norm": 929.83447265625,
"learning_rate": 2.0057819497690778e-05,
"loss": 13.7892,
"step": 210500
},
{
"epoch": 0.9377777777777778,
"grad_norm": 50.330322265625,
"learning_rate": 2.0011092779981027e-05,
"loss": 14.8297,
"step": 211000
},
{
"epoch": 0.94,
"grad_norm": 106.34629821777344,
"learning_rate": 1.9964311261763482e-05,
"loss": 14.0396,
"step": 211500
},
{
"epoch": 0.9422222222222222,
"grad_norm": 519.3964233398438,
"learning_rate": 1.991747545463333e-05,
"loss": 14.4548,
"step": 212000
},
{
"epoch": 0.9444444444444444,
"grad_norm": 496.7522888183594,
"learning_rate": 1.987058587077946e-05,
"loss": 15.0954,
"step": 212500
},
{
"epoch": 0.9466666666666667,
"grad_norm": 79.46224975585938,
"learning_rate": 1.9823643022978844e-05,
"loss": 15.5782,
"step": 213000
},
{
"epoch": 0.9488888888888889,
"grad_norm": 0.0,
"learning_rate": 1.9776647424590937e-05,
"loss": 14.1761,
"step": 213500
},
{
"epoch": 0.9511111111111111,
"grad_norm": 328.0174560546875,
"learning_rate": 1.9729599589552084e-05,
"loss": 14.5482,
"step": 214000
},
{
"epoch": 0.9533333333333334,
"grad_norm": 223.33721923828125,
"learning_rate": 1.968250003236987e-05,
"loss": 14.5949,
"step": 214500
},
{
"epoch": 0.9555555555555556,
"grad_norm": 233.63478088378906,
"learning_rate": 1.9635349268117507e-05,
"loss": 14.8437,
"step": 215000
},
{
"epoch": 0.9577777777777777,
"grad_norm": 4.987401485443115,
"learning_rate": 1.9588147812428197e-05,
"loss": 15.7183,
"step": 215500
},
{
"epoch": 0.96,
"grad_norm": 341.9475402832031,
"learning_rate": 1.954089618148949e-05,
"loss": 15.5074,
"step": 216000
},
{
"epoch": 0.9622222222222222,
"grad_norm": 186.303466796875,
"learning_rate": 1.9493594892037667e-05,
"loss": 14.1594,
"step": 216500
},
{
"epoch": 0.9644444444444444,
"grad_norm": 196.6855010986328,
"learning_rate": 1.9446244461352033e-05,
"loss": 16.0385,
"step": 217000
},
{
"epoch": 0.9666666666666667,
"grad_norm": 536.9638061523438,
"learning_rate": 1.9398845407249326e-05,
"loss": 15.1219,
"step": 217500
},
{
"epoch": 0.9688888888888889,
"grad_norm": 369.9173889160156,
"learning_rate": 1.9351398248078004e-05,
"loss": 14.1767,
"step": 218000
},
{
"epoch": 0.9711111111111111,
"grad_norm": 36.90256118774414,
"learning_rate": 1.9303903502712592e-05,
"loss": 15.2894,
"step": 218500
},
{
"epoch": 0.9733333333333334,
"grad_norm": 475.021240234375,
"learning_rate": 1.9256361690548026e-05,
"loss": 14.8856,
"step": 219000
},
{
"epoch": 0.9755555555555555,
"grad_norm": 805.5115356445312,
"learning_rate": 1.9208773331493938e-05,
"loss": 14.159,
"step": 219500
},
{
"epoch": 0.9777777777777777,
"grad_norm": 767.4393310546875,
"learning_rate": 1.9161138945969007e-05,
"loss": 14.6288,
"step": 220000
},
{
"epoch": 0.98,
"grad_norm": 122.41221618652344,
"learning_rate": 1.911345905489523e-05,
"loss": 13.795,
"step": 220500
},
{
"epoch": 0.9822222222222222,
"grad_norm": 432.9138488769531,
"learning_rate": 1.9065734179692262e-05,
"loss": 14.115,
"step": 221000
},
{
"epoch": 0.9844444444444445,
"grad_norm": 630.0858764648438,
"learning_rate": 1.90179648422717e-05,
"loss": 13.5404,
"step": 221500
},
{
"epoch": 0.9866666666666667,
"grad_norm": 681.5342407226562,
"learning_rate": 1.897015156503135e-05,
"loss": 14.8603,
"step": 222000
},
{
"epoch": 0.9888888888888889,
"grad_norm": 18.26776885986328,
"learning_rate": 1.8922294870849566e-05,
"loss": 14.8978,
"step": 222500
},
{
"epoch": 0.9911111111111112,
"grad_norm": 610.2125244140625,
"learning_rate": 1.8874395283079478e-05,
"loss": 14.0042,
"step": 223000
},
{
"epoch": 0.9933333333333333,
"grad_norm": 236.45591735839844,
"learning_rate": 1.8826453325543308e-05,
"loss": 13.2571,
"step": 223500
},
{
"epoch": 0.9955555555555555,
"grad_norm": 146.5922393798828,
"learning_rate": 1.877846952252662e-05,
"loss": 14.9317,
"step": 224000
},
{
"epoch": 0.9977777777777778,
"grad_norm": 831.205078125,
"learning_rate": 1.8730444398772605e-05,
"loss": 14.2085,
"step": 224500
},
{
"epoch": 1.0,
"grad_norm": 465.5499267578125,
"learning_rate": 1.8682378479476307e-05,
"loss": 15.6298,
"step": 225000
},
{
"epoch": 1.0022222222222221,
"grad_norm": 130.86990356445312,
"learning_rate": 1.8634272290278932e-05,
"loss": 12.7156,
"step": 225500
},
{
"epoch": 1.0044444444444445,
"grad_norm": 394.0591125488281,
"learning_rate": 1.8586126357262054e-05,
"loss": 12.0245,
"step": 226000
},
{
"epoch": 1.0066666666666666,
"grad_norm": 144.7230682373047,
"learning_rate": 1.853794120694187e-05,
"loss": 12.68,
"step": 226500
},
{
"epoch": 1.008888888888889,
"grad_norm": 108.50147247314453,
"learning_rate": 1.8489717366263487e-05,
"loss": 11.755,
"step": 227000
},
{
"epoch": 1.011111111111111,
"grad_norm": 45.11106872558594,
"learning_rate": 1.8441455362595082e-05,
"loss": 12.0449,
"step": 227500
},
{
"epoch": 1.0133333333333334,
"grad_norm": 321.0522155761719,
"learning_rate": 1.8393155723722205e-05,
"loss": 12.5334,
"step": 228000
},
{
"epoch": 1.0155555555555555,
"grad_norm": 409.6867370605469,
"learning_rate": 1.8344818977841967e-05,
"loss": 12.5081,
"step": 228500
},
{
"epoch": 1.0177777777777777,
"grad_norm": 293.31866455078125,
"learning_rate": 1.829644565355727e-05,
"loss": 11.9373,
"step": 229000
},
{
"epoch": 1.02,
"grad_norm": 182.61883544921875,
"learning_rate": 1.8248036279871043e-05,
"loss": 12.3983,
"step": 229500
},
{
"epoch": 1.0222222222222221,
"grad_norm": 152.36061096191406,
"learning_rate": 1.819959138618044e-05,
"loss": 13.1577,
"step": 230000
},
{
"epoch": 1.0244444444444445,
"grad_norm": 31.093074798583984,
"learning_rate": 1.8151111502271063e-05,
"loss": 13.6112,
"step": 230500
},
{
"epoch": 1.0266666666666666,
"grad_norm": 504.9164733886719,
"learning_rate": 1.810259715831115e-05,
"loss": 12.9236,
"step": 231000
},
{
"epoch": 1.028888888888889,
"grad_norm": 118.45124053955078,
"learning_rate": 1.8054048884845784e-05,
"loss": 14.7912,
"step": 231500
},
{
"epoch": 1.031111111111111,
"grad_norm": 247.5614776611328,
"learning_rate": 1.8005467212791124e-05,
"loss": 13.3697,
"step": 232000
},
{
"epoch": 1.0333333333333334,
"grad_norm": 431.06396484375,
"learning_rate": 1.795685267342854e-05,
"loss": 13.0248,
"step": 232500
},
{
"epoch": 1.0355555555555556,
"grad_norm": 209.7031707763672,
"learning_rate": 1.7908205798398853e-05,
"loss": 13.0866,
"step": 233000
},
{
"epoch": 1.0377777777777777,
"grad_norm": 127.96566009521484,
"learning_rate": 1.7859527119696487e-05,
"loss": 13.5331,
"step": 233500
},
{
"epoch": 1.04,
"grad_norm": 117.52790832519531,
"learning_rate": 1.7810817169663676e-05,
"loss": 11.3817,
"step": 234000
},
{
"epoch": 1.0422222222222222,
"grad_norm": 1179.1375732421875,
"learning_rate": 1.7762076480984635e-05,
"loss": 12.7315,
"step": 234500
},
{
"epoch": 1.0444444444444445,
"grad_norm": 357.2664489746094,
"learning_rate": 1.771330558667971e-05,
"loss": 12.4928,
"step": 235000
},
{
"epoch": 1.0466666666666666,
"grad_norm": 230.9121551513672,
"learning_rate": 1.766450502009961e-05,
"loss": 13.6869,
"step": 235500
},
{
"epoch": 1.048888888888889,
"grad_norm": 236.51214599609375,
"learning_rate": 1.7615675314919504e-05,
"loss": 13.8959,
"step": 236000
},
{
"epoch": 1.051111111111111,
"grad_norm": 32.029823303222656,
"learning_rate": 1.7566817005133215e-05,
"loss": 11.7484,
"step": 236500
},
{
"epoch": 1.0533333333333332,
"grad_norm": 487.9048767089844,
"learning_rate": 1.7517930625047403e-05,
"loss": 12.8478,
"step": 237000
},
{
"epoch": 1.0555555555555556,
"grad_norm": 64.5386962890625,
"learning_rate": 1.7469016709275678e-05,
"loss": 13.1321,
"step": 237500
},
{
"epoch": 1.0577777777777777,
"grad_norm": 123.01608276367188,
"learning_rate": 1.7420075792732797e-05,
"loss": 12.7279,
"step": 238000
},
{
"epoch": 1.06,
"grad_norm": 418.50323486328125,
"learning_rate": 1.7371108410628778e-05,
"loss": 12.7196,
"step": 238500
},
{
"epoch": 1.0622222222222222,
"grad_norm": 15.958662986755371,
"learning_rate": 1.732211509846306e-05,
"loss": 12.8302,
"step": 239000
},
{
"epoch": 1.0644444444444445,
"grad_norm": 903.5818481445312,
"learning_rate": 1.7273096392018664e-05,
"loss": 12.5959,
"step": 239500
},
{
"epoch": 1.0666666666666667,
"grad_norm": 132.69081115722656,
"learning_rate": 1.7224052827356306e-05,
"loss": 12.4179,
"step": 240000
},
{
"epoch": 1.068888888888889,
"grad_norm": 72.78104400634766,
"learning_rate": 1.7174984940808555e-05,
"loss": 12.6991,
"step": 240500
},
{
"epoch": 1.0711111111111111,
"grad_norm": 19.8783016204834,
"learning_rate": 1.7125893268973953e-05,
"loss": 12.3093,
"step": 241000
},
{
"epoch": 1.0733333333333333,
"grad_norm": 53.51363754272461,
"learning_rate": 1.707677834871116e-05,
"loss": 12.2946,
"step": 241500
},
{
"epoch": 1.0755555555555556,
"grad_norm": 310.8068542480469,
"learning_rate": 1.7027640717133074e-05,
"loss": 12.9432,
"step": 242000
},
{
"epoch": 1.0777777777777777,
"grad_norm": 448.7236633300781,
"learning_rate": 1.697848091160096e-05,
"loss": 12.162,
"step": 242500
},
{
"epoch": 1.08,
"grad_norm": 802.4764404296875,
"learning_rate": 1.6929299469718585e-05,
"loss": 13.7779,
"step": 243000
},
{
"epoch": 1.0822222222222222,
"grad_norm": 429.84564208984375,
"learning_rate": 1.68800969293263e-05,
"loss": 12.5977,
"step": 243500
},
{
"epoch": 1.0844444444444445,
"grad_norm": 0.0,
"learning_rate": 1.6830873828495226e-05,
"loss": 11.7274,
"step": 244000
},
{
"epoch": 1.0866666666666667,
"grad_norm": 194.27366638183594,
"learning_rate": 1.6781630705521288e-05,
"loss": 13.384,
"step": 244500
},
{
"epoch": 1.0888888888888888,
"grad_norm": 28.86142921447754,
"learning_rate": 1.67323680989194e-05,
"loss": 12.4926,
"step": 245000
},
{
"epoch": 1.0911111111111111,
"grad_norm": 729.71875,
"learning_rate": 1.6683086547417527e-05,
"loss": 12.177,
"step": 245500
},
{
"epoch": 1.0933333333333333,
"grad_norm": 17.39883804321289,
"learning_rate": 1.663378658995083e-05,
"loss": 11.7948,
"step": 246000
},
{
"epoch": 1.0955555555555556,
"grad_norm": 0.0,
"learning_rate": 1.6584468765655737e-05,
"loss": 12.777,
"step": 246500
},
{
"epoch": 1.0977777777777777,
"grad_norm": 214.7503204345703,
"learning_rate": 1.653513361386408e-05,
"loss": 12.8227,
"step": 247000
},
{
"epoch": 1.1,
"grad_norm": 279.39007568359375,
"learning_rate": 1.6485781674097173e-05,
"loss": 12.6121,
"step": 247500
},
{
"epoch": 1.1022222222222222,
"grad_norm": 74.43594360351562,
"learning_rate": 1.643641348605992e-05,
"loss": 11.8667,
"step": 248000
},
{
"epoch": 1.1044444444444443,
"grad_norm": 35.02223587036133,
"learning_rate": 1.638702958963492e-05,
"loss": 12.2564,
"step": 248500
},
{
"epoch": 1.1066666666666667,
"grad_norm": 23.571346282958984,
"learning_rate": 1.6337630524876546e-05,
"loss": 11.9732,
"step": 249000
},
{
"epoch": 1.1088888888888888,
"grad_norm": 15.899101257324219,
"learning_rate": 1.628821683200506e-05,
"loss": 13.1795,
"step": 249500
},
{
"epoch": 1.1111111111111112,
"grad_norm": 272.45257568359375,
"learning_rate": 1.6238789051400688e-05,
"loss": 12.9309,
"step": 250000
},
{
"epoch": 1.1133333333333333,
"grad_norm": 0.0,
"learning_rate": 1.6189347723597725e-05,
"loss": 12.8293,
"step": 250500
},
{
"epoch": 1.1155555555555556,
"grad_norm": 10.567012786865234,
"learning_rate": 1.6139893389278608e-05,
"loss": 11.9302,
"step": 251000
},
{
"epoch": 1.1177777777777778,
"grad_norm": 823.9113159179688,
"learning_rate": 1.609042658926801e-05,
"loss": 11.3798,
"step": 251500
},
{
"epoch": 1.12,
"grad_norm": 449.7940673828125,
"learning_rate": 1.6040947864526935e-05,
"loss": 12.5211,
"step": 252000
},
{
"epoch": 1.1222222222222222,
"grad_norm": 427.29150390625,
"learning_rate": 1.5991457756146786e-05,
"loss": 12.1701,
"step": 252500
},
{
"epoch": 1.1244444444444444,
"grad_norm": 108.2233657836914,
"learning_rate": 1.5941956805343463e-05,
"loss": 12.4913,
"step": 253000
},
{
"epoch": 1.1266666666666667,
"grad_norm": 92.11042022705078,
"learning_rate": 1.589244555345143e-05,
"loss": 11.8749,
"step": 253500
},
{
"epoch": 1.1288888888888888,
"grad_norm": 177.92575073242188,
"learning_rate": 1.584292454191781e-05,
"loss": 13.8006,
"step": 254000
},
{
"epoch": 1.1311111111111112,
"grad_norm": 203.5926513671875,
"learning_rate": 1.5793394312296444e-05,
"loss": 12.2695,
"step": 254500
},
{
"epoch": 1.1333333333333333,
"grad_norm": 339.7933654785156,
"learning_rate": 1.5743855406242e-05,
"loss": 12.3823,
"step": 255000
},
{
"epoch": 1.1355555555555557,
"grad_norm": 334.1343688964844,
"learning_rate": 1.5694308365504e-05,
"loss": 13.8132,
"step": 255500
},
{
"epoch": 1.1377777777777778,
"grad_norm": 206.6999969482422,
"learning_rate": 1.5644753731920954e-05,
"loss": 12.8192,
"step": 256000
},
{
"epoch": 1.1400000000000001,
"grad_norm": 237.3104248046875,
"learning_rate": 1.5595192047414395e-05,
"loss": 11.9175,
"step": 256500
},
{
"epoch": 1.1422222222222222,
"grad_norm": 673.7626953125,
"learning_rate": 1.5545623853982966e-05,
"loss": 13.1039,
"step": 257000
},
{
"epoch": 1.1444444444444444,
"grad_norm": 40.97128677368164,
"learning_rate": 1.549604969369649e-05,
"loss": 11.9416,
"step": 257500
},
{
"epoch": 1.1466666666666667,
"grad_norm": 125.23896789550781,
"learning_rate": 1.544647010869003e-05,
"loss": 12.4299,
"step": 258000
},
{
"epoch": 1.1488888888888888,
"grad_norm": 297.3369140625,
"learning_rate": 1.5396885641158002e-05,
"loss": 12.2724,
"step": 258500
},
{
"epoch": 1.1511111111111112,
"grad_norm": 0.0,
"learning_rate": 1.534729683334818e-05,
"loss": 10.8568,
"step": 259000
},
{
"epoch": 1.1533333333333333,
"grad_norm": 222.6666717529297,
"learning_rate": 1.529770422755583e-05,
"loss": 11.321,
"step": 259500
},
{
"epoch": 1.1555555555555554,
"grad_norm": 258.8761291503906,
"learning_rate": 1.524810836611775e-05,
"loss": 11.3846,
"step": 260000
},
{
"epoch": 1.1577777777777778,
"grad_norm": 362.4846496582031,
"learning_rate": 1.5198509791406325e-05,
"loss": 12.1888,
"step": 260500
},
{
"epoch": 1.16,
"grad_norm": 325.5453186035156,
"learning_rate": 1.5148909045823626e-05,
"loss": 11.6617,
"step": 261000
},
{
"epoch": 1.1622222222222223,
"grad_norm": 346.42791748046875,
"learning_rate": 1.509930667179546e-05,
"loss": 12.4993,
"step": 261500
},
{
"epoch": 1.1644444444444444,
"grad_norm": 427.6278991699219,
"learning_rate": 1.5049703211765442e-05,
"loss": 12.6815,
"step": 262000
},
{
"epoch": 1.1666666666666667,
"grad_norm": 416.53680419921875,
"learning_rate": 1.5000099208189061e-05,
"loss": 12.9896,
"step": 262500
},
{
"epoch": 1.1688888888888889,
"grad_norm": 181.99703979492188,
"learning_rate": 1.4950495203527755e-05,
"loss": 12.7223,
"step": 263000
},
{
"epoch": 1.1711111111111112,
"grad_norm": 38.73680114746094,
"learning_rate": 1.4900891740242976e-05,
"loss": 12.5012,
"step": 263500
},
{
"epoch": 1.1733333333333333,
"grad_norm": 527.49267578125,
"learning_rate": 1.4851289360790243e-05,
"loss": 11.8226,
"step": 264000
},
{
"epoch": 1.1755555555555555,
"grad_norm": 593.9708862304688,
"learning_rate": 1.480168860761324e-05,
"loss": 11.9695,
"step": 264500
},
{
"epoch": 1.1777777777777778,
"grad_norm": 743.0066528320312,
"learning_rate": 1.4752090023137843e-05,
"loss": 12.0286,
"step": 265000
},
{
"epoch": 1.18,
"grad_norm": 201.2530059814453,
"learning_rate": 1.4702494149766239e-05,
"loss": 10.9088,
"step": 265500
},
{
"epoch": 1.1822222222222223,
"grad_norm": 548.9100952148438,
"learning_rate": 1.465290152987095e-05,
"loss": 11.889,
"step": 266000
},
{
"epoch": 1.1844444444444444,
"grad_norm": 233.81863403320312,
"learning_rate": 1.4603312705788917e-05,
"loss": 12.1066,
"step": 266500
},
{
"epoch": 1.1866666666666668,
"grad_norm": 163.2041015625,
"learning_rate": 1.4553728219815586e-05,
"loss": 12.8837,
"step": 267000
},
{
"epoch": 1.1888888888888889,
"grad_norm": 153.75701904296875,
"learning_rate": 1.4504148614198935e-05,
"loss": 11.7215,
"step": 267500
},
{
"epoch": 1.1911111111111112,
"grad_norm": 32.576324462890625,
"learning_rate": 1.4454574431133605e-05,
"loss": 12.7392,
"step": 268000
},
{
"epoch": 1.1933333333333334,
"grad_norm": 690.4747314453125,
"learning_rate": 1.4405006212754901e-05,
"loss": 12.4667,
"step": 268500
},
{
"epoch": 1.1955555555555555,
"grad_norm": 70.4339828491211,
"learning_rate": 1.4355444501132934e-05,
"loss": 12.3897,
"step": 269000
},
{
"epoch": 1.1977777777777778,
"grad_norm": 1018.3383178710938,
"learning_rate": 1.430588983826664e-05,
"loss": 11.7094,
"step": 269500
},
{
"epoch": 1.2,
"grad_norm": 64.98046112060547,
"learning_rate": 1.4256342766077859e-05,
"loss": 11.031,
"step": 270000
},
{
"epoch": 1.2022222222222223,
"grad_norm": 507.530029296875,
"learning_rate": 1.4206803826405453e-05,
"loss": 11.7225,
"step": 270500
},
{
"epoch": 1.2044444444444444,
"grad_norm": 396.6742248535156,
"learning_rate": 1.4157273560999311e-05,
"loss": 12.0661,
"step": 271000
},
{
"epoch": 1.2066666666666666,
"grad_norm": 741.4268188476562,
"learning_rate": 1.4107752511514499e-05,
"loss": 12.1401,
"step": 271500
},
{
"epoch": 1.208888888888889,
"grad_norm": 977.9871826171875,
"learning_rate": 1.405824121950526e-05,
"loss": 11.8266,
"step": 272000
},
{
"epoch": 1.211111111111111,
"grad_norm": 172.49072265625,
"learning_rate": 1.4008740226419166e-05,
"loss": 12.024,
"step": 272500
},
{
"epoch": 1.2133333333333334,
"grad_norm": 148.6393585205078,
"learning_rate": 1.3959250073591146e-05,
"loss": 11.7095,
"step": 273000
},
{
"epoch": 1.2155555555555555,
"grad_norm": 50.63189697265625,
"learning_rate": 1.390977130223757e-05,
"loss": 11.5046,
"step": 273500
},
{
"epoch": 1.2177777777777778,
"grad_norm": 101.87459564208984,
"learning_rate": 1.3860304453450373e-05,
"loss": 11.3638,
"step": 274000
},
{
"epoch": 1.22,
"grad_norm": 274.5159606933594,
"learning_rate": 1.3810850068191069e-05,
"loss": 12.2588,
"step": 274500
},
{
"epoch": 1.2222222222222223,
"grad_norm": 108.9557876586914,
"learning_rate": 1.3761408687284907e-05,
"loss": 12.7642,
"step": 275000
},
{
"epoch": 1.2244444444444444,
"grad_norm": 455.4017028808594,
"learning_rate": 1.3711980851414898e-05,
"loss": 11.3841,
"step": 275500
},
{
"epoch": 1.2266666666666666,
"grad_norm": 239.2037811279297,
"learning_rate": 1.3662567101115934e-05,
"loss": 12.0606,
"step": 276000
},
{
"epoch": 1.228888888888889,
"grad_norm": 56.60507583618164,
"learning_rate": 1.3613167976768886e-05,
"loss": 11.4546,
"step": 276500
},
{
"epoch": 1.231111111111111,
"grad_norm": 310.4095458984375,
"learning_rate": 1.3563784018594645e-05,
"loss": 11.4747,
"step": 277000
},
{
"epoch": 1.2333333333333334,
"grad_norm": 335.875,
"learning_rate": 1.3514415766648284e-05,
"loss": 11.9081,
"step": 277500
},
{
"epoch": 1.2355555555555555,
"grad_norm": 594.1018676757812,
"learning_rate": 1.346506376081308e-05,
"loss": 11.2674,
"step": 278000
},
{
"epoch": 1.2377777777777779,
"grad_norm": 275.7675476074219,
"learning_rate": 1.3415728540794674e-05,
"loss": 10.7813,
"step": 278500
},
{
"epoch": 1.24,
"grad_norm": 214.95712280273438,
"learning_rate": 1.3366410646115118e-05,
"loss": 12.3449,
"step": 279000
},
{
"epoch": 1.2422222222222223,
"grad_norm": 0.0,
"learning_rate": 1.331711061610701e-05,
"loss": 11.6398,
"step": 279500
},
{
"epoch": 1.2444444444444445,
"grad_norm": 15.316904067993164,
"learning_rate": 1.3267828989907592e-05,
"loss": 11.7452,
"step": 280000
},
{
"epoch": 1.2466666666666666,
"grad_norm": 529.526611328125,
"learning_rate": 1.3218566306452813e-05,
"loss": 12.7856,
"step": 280500
},
{
"epoch": 1.248888888888889,
"grad_norm": 4.096035480499268,
"learning_rate": 1.31693231044715e-05,
"loss": 11.2883,
"step": 281000
},
{
"epoch": 1.251111111111111,
"grad_norm": 641.160888671875,
"learning_rate": 1.3120099922479414e-05,
"loss": 12.2018,
"step": 281500
},
{
"epoch": 1.2533333333333334,
"grad_norm": 218.7012939453125,
"learning_rate": 1.3070897298773392e-05,
"loss": 11.9625,
"step": 282000
},
{
"epoch": 1.2555555555555555,
"grad_norm": 1709.0491943359375,
"learning_rate": 1.3021715771425437e-05,
"loss": 11.9818,
"step": 282500
},
{
"epoch": 1.2577777777777777,
"grad_norm": 325.7183532714844,
"learning_rate": 1.2972555878276857e-05,
"loss": 12.171,
"step": 283000
},
{
"epoch": 1.26,
"grad_norm": 463.99432373046875,
"learning_rate": 1.292341815693237e-05,
"loss": 12.996,
"step": 283500
},
{
"epoch": 1.2622222222222224,
"grad_norm": 30.650217056274414,
"learning_rate": 1.2874303144754219e-05,
"loss": 11.0988,
"step": 284000
},
{
"epoch": 1.2644444444444445,
"grad_norm": 308.7669372558594,
"learning_rate": 1.2825211378856311e-05,
"loss": 11.6588,
"step": 284500
},
{
"epoch": 1.2666666666666666,
"grad_norm": 813.3473510742188,
"learning_rate": 1.2776143396098331e-05,
"loss": 11.7966,
"step": 285000
},
{
"epoch": 1.268888888888889,
"grad_norm": 277.6453857421875,
"learning_rate": 1.272709973307988e-05,
"loss": 11.957,
"step": 285500
},
{
"epoch": 1.271111111111111,
"grad_norm": 614.5536499023438,
"learning_rate": 1.2678080926134595e-05,
"loss": 12.0953,
"step": 286000
},
{
"epoch": 1.2733333333333334,
"grad_norm": 600.1682739257812,
"learning_rate": 1.2629087511324295e-05,
"loss": 12.4912,
"step": 286500
},
{
"epoch": 1.2755555555555556,
"grad_norm": 291.91387939453125,
"learning_rate": 1.2580120024433123e-05,
"loss": 11.737,
"step": 287000
},
{
"epoch": 1.2777777777777777,
"grad_norm": 645.7890625,
"learning_rate": 1.2531179000961662e-05,
"loss": 11.1851,
"step": 287500
},
{
"epoch": 1.28,
"grad_norm": 390.1597900390625,
"learning_rate": 1.2482264976121108e-05,
"loss": 11.5208,
"step": 288000
},
{
"epoch": 1.2822222222222222,
"grad_norm": 15.699028968811035,
"learning_rate": 1.2433378484827395e-05,
"loss": 12.3516,
"step": 288500
},
{
"epoch": 1.2844444444444445,
"grad_norm": 35.82905578613281,
"learning_rate": 1.2384520061695367e-05,
"loss": 11.0025,
"step": 289000
},
{
"epoch": 1.2866666666666666,
"grad_norm": 112.55397033691406,
"learning_rate": 1.2335690241032904e-05,
"loss": 11.9212,
"step": 289500
},
{
"epoch": 1.2888888888888888,
"grad_norm": 143.4647979736328,
"learning_rate": 1.2286889556835105e-05,
"loss": 11.8427,
"step": 290000
},
{
"epoch": 1.291111111111111,
"grad_norm": 83.45748138427734,
"learning_rate": 1.2238118542778435e-05,
"loss": 11.4673,
"step": 290500
},
{
"epoch": 1.2933333333333334,
"grad_norm": 128.21621704101562,
"learning_rate": 1.2189377732214886e-05,
"loss": 10.8374,
"step": 291000
},
{
"epoch": 1.2955555555555556,
"grad_norm": 987.85302734375,
"learning_rate": 1.2140667658166162e-05,
"loss": 12.346,
"step": 291500
},
{
"epoch": 1.2977777777777777,
"grad_norm": 250.47520446777344,
"learning_rate": 1.2091988853317817e-05,
"loss": 10.7999,
"step": 292000
},
{
"epoch": 1.3,
"grad_norm": 33.65868377685547,
"learning_rate": 1.2043341850013472e-05,
"loss": 12.6021,
"step": 292500
},
{
"epoch": 1.3022222222222222,
"grad_norm": 207.2305450439453,
"learning_rate": 1.1994727180248953e-05,
"loss": 12.2435,
"step": 293000
},
{
"epoch": 1.3044444444444445,
"grad_norm": 210.83741760253906,
"learning_rate": 1.1946145375666504e-05,
"loss": 11.2422,
"step": 293500
},
{
"epoch": 1.3066666666666666,
"grad_norm": 289.1300964355469,
"learning_rate": 1.189759696754896e-05,
"loss": 11.7366,
"step": 294000
},
{
"epoch": 1.3088888888888888,
"grad_norm": 491.4790954589844,
"learning_rate": 1.1849082486813923e-05,
"loss": 11.8805,
"step": 294500
},
{
"epoch": 1.3111111111111111,
"grad_norm": 286.23681640625,
"learning_rate": 1.1800602464007995e-05,
"loss": 11.8487,
"step": 295000
},
{
"epoch": 1.3133333333333335,
"grad_norm": 150.55995178222656,
"learning_rate": 1.175215742930093e-05,
"loss": 11.2674,
"step": 295500
},
{
"epoch": 1.3155555555555556,
"grad_norm": 90.90438842773438,
"learning_rate": 1.1703747912479867e-05,
"loss": 12.0513,
"step": 296000
},
{
"epoch": 1.3177777777777777,
"grad_norm": 402.916748046875,
"learning_rate": 1.1655374442943526e-05,
"loss": 11.3287,
"step": 296500
},
{
"epoch": 1.32,
"grad_norm": 221.00369262695312,
"learning_rate": 1.160703754969642e-05,
"loss": 10.8907,
"step": 297000
},
{
"epoch": 1.3222222222222222,
"grad_norm": 84.22000885009766,
"learning_rate": 1.1558737761343074e-05,
"loss": 12.0133,
"step": 297500
},
{
"epoch": 1.3244444444444445,
"grad_norm": 19.054018020629883,
"learning_rate": 1.1510475606082226e-05,
"loss": 10.2377,
"step": 298000
},
{
"epoch": 1.3266666666666667,
"grad_norm": 453.34326171875,
"learning_rate": 1.1462251611701084e-05,
"loss": 11.93,
"step": 298500
},
{
"epoch": 1.3288888888888888,
"grad_norm": 275.5953063964844,
"learning_rate": 1.1414066305569514e-05,
"loss": 13.0519,
"step": 299000
},
{
"epoch": 1.3311111111111111,
"grad_norm": 279.2978210449219,
"learning_rate": 1.1365920214634312e-05,
"loss": 11.8949,
"step": 299500
},
{
"epoch": 1.3333333333333333,
"grad_norm": 278.0643310546875,
"learning_rate": 1.1317813865413409e-05,
"loss": 10.4946,
"step": 300000
},
{
"epoch": 1.3355555555555556,
"grad_norm": 685.4400024414062,
"learning_rate": 1.1269747783990135e-05,
"loss": 11.1153,
"step": 300500
},
{
"epoch": 1.3377777777777777,
"grad_norm": 312.36724853515625,
"learning_rate": 1.1221722496007462e-05,
"loss": 12.0323,
"step": 301000
},
{
"epoch": 1.34,
"grad_norm": 1231.820068359375,
"learning_rate": 1.1173738526662234e-05,
"loss": 10.8594,
"step": 301500
},
{
"epoch": 1.3422222222222222,
"grad_norm": 273.9977111816406,
"learning_rate": 1.1125796400699458e-05,
"loss": 11.2889,
"step": 302000
},
{
"epoch": 1.3444444444444446,
"grad_norm": 222.45266723632812,
"learning_rate": 1.1077896642406542e-05,
"loss": 11.6009,
"step": 302500
},
{
"epoch": 1.3466666666666667,
"grad_norm": 1616.0927734375,
"learning_rate": 1.103003977560757e-05,
"loss": 11.7312,
"step": 303000
},
{
"epoch": 1.3488888888888888,
"grad_norm": 172.6010284423828,
"learning_rate": 1.0982226323657565e-05,
"loss": 11.6923,
"step": 303500
},
{
"epoch": 1.3511111111111112,
"grad_norm": 188.0960235595703,
"learning_rate": 1.093445680943678e-05,
"loss": 10.7696,
"step": 304000
},
{
"epoch": 1.3533333333333333,
"grad_norm": 708.9501342773438,
"learning_rate": 1.0886731755344972e-05,
"loss": 11.5035,
"step": 304500
},
{
"epoch": 1.3555555555555556,
"grad_norm": 112.46131896972656,
"learning_rate": 1.0839051683295682e-05,
"loss": 11.0951,
"step": 305000
},
{
"epoch": 1.3577777777777778,
"grad_norm": 42.40409469604492,
"learning_rate": 1.0791417114710543e-05,
"loss": 12.8662,
"step": 305500
},
{
"epoch": 1.3599999999999999,
"grad_norm": 447.1692810058594,
"learning_rate": 1.074382857051356e-05,
"loss": 11.2495,
"step": 306000
},
{
"epoch": 1.3622222222222222,
"grad_norm": 0.0,
"learning_rate": 1.0696286571125437e-05,
"loss": 12.0512,
"step": 306500
},
{
"epoch": 1.3644444444444446,
"grad_norm": 1327.175537109375,
"learning_rate": 1.0648791636457847e-05,
"loss": 11.3486,
"step": 307000
},
{
"epoch": 1.3666666666666667,
"grad_norm": 114.16178894042969,
"learning_rate": 1.0601344285907797e-05,
"loss": 12.0348,
"step": 307500
},
{
"epoch": 1.3688888888888888,
"grad_norm": 410.4014587402344,
"learning_rate": 1.0553945038351914e-05,
"loss": 11.0606,
"step": 308000
},
{
"epoch": 1.3711111111111112,
"grad_norm": 205.14894104003906,
"learning_rate": 1.0506594412140768e-05,
"loss": 12.0553,
"step": 308500
},
{
"epoch": 1.3733333333333333,
"grad_norm": 70.4958267211914,
"learning_rate": 1.0459292925093228e-05,
"loss": 11.5397,
"step": 309000
},
{
"epoch": 1.3755555555555556,
"grad_norm": 194.81698608398438,
"learning_rate": 1.0412041094490767e-05,
"loss": 10.2973,
"step": 309500
},
{
"epoch": 1.3777777777777778,
"grad_norm": 15.8478364944458,
"learning_rate": 1.0364839437071848e-05,
"loss": 11.748,
"step": 310000
},
{
"epoch": 1.38,
"grad_norm": 117.78498840332031,
"learning_rate": 1.0317688469026219e-05,
"loss": 11.4108,
"step": 310500
},
{
"epoch": 1.3822222222222222,
"grad_norm": 497.8285217285156,
"learning_rate": 1.0270588705989322e-05,
"loss": 11.4724,
"step": 311000
},
{
"epoch": 1.3844444444444444,
"grad_norm": 339.4550476074219,
"learning_rate": 1.0223540663036624e-05,
"loss": 12.0662,
"step": 311500
},
{
"epoch": 1.3866666666666667,
"grad_norm": 191.90072631835938,
"learning_rate": 1.017654485467797e-05,
"loss": 12.0687,
"step": 312000
},
{
"epoch": 1.3888888888888888,
"grad_norm": 201.05392456054688,
"learning_rate": 1.0129601794852007e-05,
"loss": 12.6799,
"step": 312500
},
{
"epoch": 1.3911111111111112,
"grad_norm": 0.0,
"learning_rate": 1.00827119969205e-05,
"loss": 11.8095,
"step": 313000
},
{
"epoch": 1.3933333333333333,
"grad_norm": 110.20486450195312,
"learning_rate": 1.0035875973662787e-05,
"loss": 11.1245,
"step": 313500
},
{
"epoch": 1.3955555555555557,
"grad_norm": 142.47384643554688,
"learning_rate": 9.989094237270094e-06,
"loss": 11.5409,
"step": 314000
},
{
"epoch": 1.3977777777777778,
"grad_norm": 408.63031005859375,
"learning_rate": 9.942367299340003e-06,
"loss": 11.8593,
"step": 314500
},
{
"epoch": 1.4,
"grad_norm": 470.6206970214844,
"learning_rate": 9.89569567087083e-06,
"loss": 11.6008,
"step": 315000
},
{
"epoch": 1.4022222222222223,
"grad_norm": 67.78047180175781,
"learning_rate": 9.84907986225601e-06,
"loss": 10.926,
"step": 315500
},
{
"epoch": 1.4044444444444444,
"grad_norm": 293.5436706542969,
"learning_rate": 9.802520383278574e-06,
"loss": 10.8069,
"step": 316000
},
{
"epoch": 1.4066666666666667,
"grad_norm": 480.68389892578125,
"learning_rate": 9.75601774310551e-06,
"loss": 11.2341,
"step": 316500
},
{
"epoch": 1.4088888888888889,
"grad_norm": 235.30406188964844,
"learning_rate": 9.709572450282253e-06,
"loss": 11.3084,
"step": 317000
},
{
"epoch": 1.411111111111111,
"grad_norm": 977.0435180664062,
"learning_rate": 9.663185012727075e-06,
"loss": 12.978,
"step": 317500
},
{
"epoch": 1.4133333333333333,
"grad_norm": 26.692384719848633,
"learning_rate": 9.61685593772556e-06,
"loss": 11.2446,
"step": 318000
},
{
"epoch": 1.4155555555555557,
"grad_norm": 691.8837280273438,
"learning_rate": 9.570585731925064e-06,
"loss": 11.2801,
"step": 318500
},
{
"epoch": 1.4177777777777778,
"grad_norm": 210.51527404785156,
"learning_rate": 9.524374901329125e-06,
"loss": 10.0809,
"step": 319000
},
{
"epoch": 1.42,
"grad_norm": 0.0,
"learning_rate": 9.478223951292001e-06,
"loss": 11.3325,
"step": 319500
},
{
"epoch": 1.4222222222222223,
"grad_norm": 302.1672668457031,
"learning_rate": 9.432133386513075e-06,
"loss": 10.449,
"step": 320000
},
{
"epoch": 1.4244444444444444,
"grad_norm": 210.6238250732422,
"learning_rate": 9.386103711031384e-06,
"loss": 12.6131,
"step": 320500
},
{
"epoch": 1.4266666666666667,
"grad_norm": 615.0394897460938,
"learning_rate": 9.340135428220081e-06,
"loss": 11.892,
"step": 321000
},
{
"epoch": 1.4288888888888889,
"grad_norm": 44.33654022216797,
"learning_rate": 9.294229040780948e-06,
"loss": 11.7791,
"step": 321500
},
{
"epoch": 1.431111111111111,
"grad_norm": 423.60943603515625,
"learning_rate": 9.248385050738874e-06,
"loss": 11.8577,
"step": 322000
},
{
"epoch": 1.4333333333333333,
"grad_norm": 750.2989501953125,
"learning_rate": 9.202603959436398e-06,
"loss": 11.5078,
"step": 322500
},
{
"epoch": 1.4355555555555555,
"grad_norm": 14.56828784942627,
"learning_rate": 9.156886267528198e-06,
"loss": 11.1005,
"step": 323000
},
{
"epoch": 1.4377777777777778,
"grad_norm": 98.75641632080078,
"learning_rate": 9.111232474975624e-06,
"loss": 10.4616,
"step": 323500
},
{
"epoch": 1.44,
"grad_norm": 58.22282409667969,
"learning_rate": 9.065643081041242e-06,
"loss": 10.8385,
"step": 324000
},
{
"epoch": 1.4422222222222223,
"grad_norm": 95.11531066894531,
"learning_rate": 9.020118584283357e-06,
"loss": 10.93,
"step": 324500
},
{
"epoch": 1.4444444444444444,
"grad_norm": 402.2863464355469,
"learning_rate": 8.974659482550576e-06,
"loss": 10.7504,
"step": 325000
}
],
"logging_steps": 500,
"max_steps": 500000,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}