{
"best_global_step": 1200,
"best_metric": 1.777273416519165,
"best_model_checkpoint": "./output_dir/th-Llama-3.1-8B-lr4e-06-atten0.25-ffn0.25_20250430_142946/checkpoint-1200",
"epoch": 0.7199856002879942,
"eval_steps": 50,
"global_step": 1200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005999880002399952,
"grad_norm": 2.234375,
"learning_rate": 0.0,
"loss": 1.7675,
"step": 1
},
{
"epoch": 0.0011999760004799903,
"grad_norm": 2.1875,
"learning_rate": 8e-08,
"loss": 1.6481,
"step": 2
},
{
"epoch": 0.0017999640007199855,
"grad_norm": 2.15625,
"learning_rate": 1.6e-07,
"loss": 1.6866,
"step": 3
},
{
"epoch": 0.0023999520009599807,
"grad_norm": 1.953125,
"learning_rate": 2.4e-07,
"loss": 1.7062,
"step": 4
},
{
"epoch": 0.002999940001199976,
"grad_norm": 2.3125,
"learning_rate": 3.2e-07,
"loss": 1.774,
"step": 5
},
{
"epoch": 0.003599928001439971,
"grad_norm": 2.09375,
"learning_rate": 4e-07,
"loss": 1.6995,
"step": 6
},
{
"epoch": 0.004199916001679967,
"grad_norm": 1.9765625,
"learning_rate": 4.8e-07,
"loss": 1.6063,
"step": 7
},
{
"epoch": 0.004799904001919961,
"grad_norm": 1.9296875,
"learning_rate": 5.6e-07,
"loss": 1.5995,
"step": 8
},
{
"epoch": 0.005399892002159957,
"grad_norm": 2.03125,
"learning_rate": 6.4e-07,
"loss": 1.7045,
"step": 9
},
{
"epoch": 0.005999880002399952,
"grad_norm": 2.421875,
"learning_rate": 7.2e-07,
"loss": 1.5958,
"step": 10
},
{
"epoch": 0.006599868002639947,
"grad_norm": 2.1875,
"learning_rate": 8e-07,
"loss": 1.6149,
"step": 11
},
{
"epoch": 0.007199856002879942,
"grad_norm": 2.09375,
"learning_rate": 8.799999999999999e-07,
"loss": 1.7559,
"step": 12
},
{
"epoch": 0.007799844003119938,
"grad_norm": 2.015625,
"learning_rate": 9.6e-07,
"loss": 1.682,
"step": 13
},
{
"epoch": 0.008399832003359933,
"grad_norm": 2.4375,
"learning_rate": 1.04e-06,
"loss": 1.6184,
"step": 14
},
{
"epoch": 0.008999820003599928,
"grad_norm": 2.0625,
"learning_rate": 1.12e-06,
"loss": 1.7178,
"step": 15
},
{
"epoch": 0.009599808003839923,
"grad_norm": 2.15625,
"learning_rate": 1.2e-06,
"loss": 1.5901,
"step": 16
},
{
"epoch": 0.01019979600407992,
"grad_norm": 1.9375,
"learning_rate": 1.28e-06,
"loss": 1.6869,
"step": 17
},
{
"epoch": 0.010799784004319914,
"grad_norm": 2.3125,
"learning_rate": 1.3600000000000001e-06,
"loss": 1.6398,
"step": 18
},
{
"epoch": 0.011399772004559909,
"grad_norm": 1.953125,
"learning_rate": 1.44e-06,
"loss": 1.7697,
"step": 19
},
{
"epoch": 0.011999760004799903,
"grad_norm": 2.109375,
"learning_rate": 1.5199999999999998e-06,
"loss": 1.7484,
"step": 20
},
{
"epoch": 0.0125997480050399,
"grad_norm": 2.234375,
"learning_rate": 1.6e-06,
"loss": 1.7561,
"step": 21
},
{
"epoch": 0.013199736005279895,
"grad_norm": 2.234375,
"learning_rate": 1.6799999999999998e-06,
"loss": 1.5346,
"step": 22
},
{
"epoch": 0.01379972400551989,
"grad_norm": 2.390625,
"learning_rate": 1.7599999999999999e-06,
"loss": 1.7269,
"step": 23
},
{
"epoch": 0.014399712005759884,
"grad_norm": 1.921875,
"learning_rate": 1.84e-06,
"loss": 1.6799,
"step": 24
},
{
"epoch": 0.01499970000599988,
"grad_norm": 2.25,
"learning_rate": 1.92e-06,
"loss": 1.6713,
"step": 25
},
{
"epoch": 0.015599688006239875,
"grad_norm": 2.140625,
"learning_rate": 2e-06,
"loss": 1.6378,
"step": 26
},
{
"epoch": 0.016199676006479872,
"grad_norm": 2.140625,
"learning_rate": 2.08e-06,
"loss": 1.7315,
"step": 27
},
{
"epoch": 0.016799664006719867,
"grad_norm": 2.34375,
"learning_rate": 2.16e-06,
"loss": 1.7283,
"step": 28
},
{
"epoch": 0.01739965200695986,
"grad_norm": 2.015625,
"learning_rate": 2.24e-06,
"loss": 1.7627,
"step": 29
},
{
"epoch": 0.017999640007199856,
"grad_norm": 2.1875,
"learning_rate": 2.32e-06,
"loss": 1.6382,
"step": 30
},
{
"epoch": 0.01859962800743985,
"grad_norm": 1.78125,
"learning_rate": 2.4e-06,
"loss": 1.6786,
"step": 31
},
{
"epoch": 0.019199616007679846,
"grad_norm": 2.359375,
"learning_rate": 2.48e-06,
"loss": 1.6262,
"step": 32
},
{
"epoch": 0.01979960400791984,
"grad_norm": 2.0,
"learning_rate": 2.56e-06,
"loss": 1.6589,
"step": 33
},
{
"epoch": 0.02039959200815984,
"grad_norm": 2.34375,
"learning_rate": 2.64e-06,
"loss": 1.671,
"step": 34
},
{
"epoch": 0.020999580008399833,
"grad_norm": 2.03125,
"learning_rate": 2.7200000000000002e-06,
"loss": 1.7393,
"step": 35
},
{
"epoch": 0.021599568008639828,
"grad_norm": 2.140625,
"learning_rate": 2.8e-06,
"loss": 1.6027,
"step": 36
},
{
"epoch": 0.022199556008879823,
"grad_norm": 2.0,
"learning_rate": 2.88e-06,
"loss": 1.8158,
"step": 37
},
{
"epoch": 0.022799544009119817,
"grad_norm": 2.015625,
"learning_rate": 2.96e-06,
"loss": 1.7158,
"step": 38
},
{
"epoch": 0.023399532009359812,
"grad_norm": 1.96875,
"learning_rate": 3.0399999999999997e-06,
"loss": 1.7778,
"step": 39
},
{
"epoch": 0.023999520009599807,
"grad_norm": 1.9765625,
"learning_rate": 3.1199999999999998e-06,
"loss": 1.6903,
"step": 40
},
{
"epoch": 0.0245995080098398,
"grad_norm": 2.046875,
"learning_rate": 3.2e-06,
"loss": 1.6403,
"step": 41
},
{
"epoch": 0.0251994960100798,
"grad_norm": 1.859375,
"learning_rate": 3.2799999999999995e-06,
"loss": 1.7292,
"step": 42
},
{
"epoch": 0.025799484010319795,
"grad_norm": 2.09375,
"learning_rate": 3.3599999999999996e-06,
"loss": 1.6956,
"step": 43
},
{
"epoch": 0.02639947201055979,
"grad_norm": 1.84375,
"learning_rate": 3.4399999999999997e-06,
"loss": 1.6927,
"step": 44
},
{
"epoch": 0.026999460010799784,
"grad_norm": 2.15625,
"learning_rate": 3.5199999999999998e-06,
"loss": 1.6794,
"step": 45
},
{
"epoch": 0.02759944801103978,
"grad_norm": 2.046875,
"learning_rate": 3.6e-06,
"loss": 1.7373,
"step": 46
},
{
"epoch": 0.028199436011279774,
"grad_norm": 2.0,
"learning_rate": 3.68e-06,
"loss": 1.6971,
"step": 47
},
{
"epoch": 0.02879942401151977,
"grad_norm": 2.03125,
"learning_rate": 3.7599999999999996e-06,
"loss": 1.7465,
"step": 48
},
{
"epoch": 0.029399412011759767,
"grad_norm": 1.890625,
"learning_rate": 3.84e-06,
"loss": 1.6621,
"step": 49
},
{
"epoch": 0.02999940001199976,
"grad_norm": 1.9140625,
"learning_rate": 3.92e-06,
"loss": 1.6802,
"step": 50
},
{
"epoch": 0.02999940001199976,
"eval_loss": 1.7964001893997192,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 66.1162,
"eval_samples_per_second": 151.249,
"eval_steps_per_second": 25.213,
"step": 50
},
{
"epoch": 0.030599388012239756,
"grad_norm": 1.8203125,
"learning_rate": 4e-06,
"loss": 1.6904,
"step": 51
},
{
"epoch": 0.03119937601247975,
"grad_norm": 2.140625,
"learning_rate": 3.997524752475248e-06,
"loss": 1.6148,
"step": 52
},
{
"epoch": 0.031799364012719745,
"grad_norm": 1.7578125,
"learning_rate": 3.9950495049504945e-06,
"loss": 1.6905,
"step": 53
},
{
"epoch": 0.032399352012959744,
"grad_norm": 2.09375,
"learning_rate": 3.992574257425742e-06,
"loss": 1.7708,
"step": 54
},
{
"epoch": 0.032999340013199735,
"grad_norm": 1.75,
"learning_rate": 3.99009900990099e-06,
"loss": 1.7365,
"step": 55
},
{
"epoch": 0.03359932801343973,
"grad_norm": 2.109375,
"learning_rate": 3.987623762376238e-06,
"loss": 1.5333,
"step": 56
},
{
"epoch": 0.034199316013679724,
"grad_norm": 1.921875,
"learning_rate": 3.985148514851485e-06,
"loss": 1.7474,
"step": 57
},
{
"epoch": 0.03479930401391972,
"grad_norm": 1.90625,
"learning_rate": 3.9826732673267325e-06,
"loss": 1.667,
"step": 58
},
{
"epoch": 0.035399292014159714,
"grad_norm": 2.140625,
"learning_rate": 3.98019801980198e-06,
"loss": 1.652,
"step": 59
},
{
"epoch": 0.03599928001439971,
"grad_norm": 1.9140625,
"learning_rate": 3.977722772277228e-06,
"loss": 1.6359,
"step": 60
},
{
"epoch": 0.03659926801463971,
"grad_norm": 1.9765625,
"learning_rate": 3.975247524752475e-06,
"loss": 1.6885,
"step": 61
},
{
"epoch": 0.0371992560148797,
"grad_norm": 1.9765625,
"learning_rate": 3.972772277227723e-06,
"loss": 1.6758,
"step": 62
},
{
"epoch": 0.0377992440151197,
"grad_norm": 2.0,
"learning_rate": 3.9702970297029705e-06,
"loss": 1.6944,
"step": 63
},
{
"epoch": 0.03839923201535969,
"grad_norm": 1.734375,
"learning_rate": 3.967821782178218e-06,
"loss": 1.5645,
"step": 64
},
{
"epoch": 0.03899922001559969,
"grad_norm": 1.8125,
"learning_rate": 3.965346534653465e-06,
"loss": 1.7534,
"step": 65
},
{
"epoch": 0.03959920801583968,
"grad_norm": 2.078125,
"learning_rate": 3.962871287128713e-06,
"loss": 1.6782,
"step": 66
},
{
"epoch": 0.04019919601607968,
"grad_norm": 1.90625,
"learning_rate": 3.96039603960396e-06,
"loss": 1.6826,
"step": 67
},
{
"epoch": 0.04079918401631968,
"grad_norm": 1.8203125,
"learning_rate": 3.957920792079208e-06,
"loss": 1.692,
"step": 68
},
{
"epoch": 0.04139917201655967,
"grad_norm": 1.8984375,
"learning_rate": 3.955445544554455e-06,
"loss": 1.5594,
"step": 69
},
{
"epoch": 0.041999160016799666,
"grad_norm": 1.7890625,
"learning_rate": 3.952970297029703e-06,
"loss": 1.677,
"step": 70
},
{
"epoch": 0.04259914801703966,
"grad_norm": 1.8984375,
"learning_rate": 3.95049504950495e-06,
"loss": 1.6147,
"step": 71
},
{
"epoch": 0.043199136017279656,
"grad_norm": 1.8125,
"learning_rate": 3.948019801980198e-06,
"loss": 1.6306,
"step": 72
},
{
"epoch": 0.04379912401751965,
"grad_norm": 1.8984375,
"learning_rate": 3.945544554455446e-06,
"loss": 1.5884,
"step": 73
},
{
"epoch": 0.044399112017759645,
"grad_norm": 1.8359375,
"learning_rate": 3.943069306930693e-06,
"loss": 1.5904,
"step": 74
},
{
"epoch": 0.04499910001799964,
"grad_norm": 1.8671875,
"learning_rate": 3.94059405940594e-06,
"loss": 1.628,
"step": 75
},
{
"epoch": 0.045599088018239635,
"grad_norm": 1.875,
"learning_rate": 3.938118811881188e-06,
"loss": 1.7228,
"step": 76
},
{
"epoch": 0.04619907601847963,
"grad_norm": 1.8671875,
"learning_rate": 3.935643564356436e-06,
"loss": 1.7077,
"step": 77
},
{
"epoch": 0.046799064018719624,
"grad_norm": 1.703125,
"learning_rate": 3.933168316831683e-06,
"loss": 1.5831,
"step": 78
},
{
"epoch": 0.04739905201895962,
"grad_norm": 1.6875,
"learning_rate": 3.9306930693069305e-06,
"loss": 1.6097,
"step": 79
},
{
"epoch": 0.047999040019199614,
"grad_norm": 1.765625,
"learning_rate": 3.928217821782178e-06,
"loss": 1.6748,
"step": 80
},
{
"epoch": 0.04859902801943961,
"grad_norm": 1.78125,
"learning_rate": 3.925742574257425e-06,
"loss": 1.598,
"step": 81
},
{
"epoch": 0.0491990160196796,
"grad_norm": 1.6640625,
"learning_rate": 3.923267326732673e-06,
"loss": 1.6844,
"step": 82
},
{
"epoch": 0.0497990040199196,
"grad_norm": 2.03125,
"learning_rate": 3.920792079207921e-06,
"loss": 1.7564,
"step": 83
},
{
"epoch": 0.0503989920201596,
"grad_norm": 1.984375,
"learning_rate": 3.9183168316831685e-06,
"loss": 1.6621,
"step": 84
},
{
"epoch": 0.05099898002039959,
"grad_norm": 1.953125,
"learning_rate": 3.915841584158415e-06,
"loss": 1.6924,
"step": 85
},
{
"epoch": 0.05159896802063959,
"grad_norm": 1.8515625,
"learning_rate": 3.913366336633663e-06,
"loss": 1.6198,
"step": 86
},
{
"epoch": 0.05219895602087958,
"grad_norm": 1.9140625,
"learning_rate": 3.910891089108911e-06,
"loss": 1.6174,
"step": 87
},
{
"epoch": 0.05279894402111958,
"grad_norm": 1.8515625,
"learning_rate": 3.908415841584159e-06,
"loss": 1.67,
"step": 88
},
{
"epoch": 0.05339893202135957,
"grad_norm": 2.453125,
"learning_rate": 3.905940594059406e-06,
"loss": 1.7446,
"step": 89
},
{
"epoch": 0.05399892002159957,
"grad_norm": 1.828125,
"learning_rate": 3.903465346534653e-06,
"loss": 1.6408,
"step": 90
},
{
"epoch": 0.054598908021839566,
"grad_norm": 1.7421875,
"learning_rate": 3.9009900990099e-06,
"loss": 1.7832,
"step": 91
},
{
"epoch": 0.05519889602207956,
"grad_norm": 1.890625,
"learning_rate": 3.898514851485148e-06,
"loss": 1.6397,
"step": 92
},
{
"epoch": 0.055798884022319556,
"grad_norm": 1.6953125,
"learning_rate": 3.896039603960396e-06,
"loss": 1.6805,
"step": 93
},
{
"epoch": 0.05639887202255955,
"grad_norm": 1.84375,
"learning_rate": 3.893564356435644e-06,
"loss": 1.742,
"step": 94
},
{
"epoch": 0.056998860022799545,
"grad_norm": 1.875,
"learning_rate": 3.8910891089108905e-06,
"loss": 1.5952,
"step": 95
},
{
"epoch": 0.05759884802303954,
"grad_norm": 1.9296875,
"learning_rate": 3.888613861386138e-06,
"loss": 1.7769,
"step": 96
},
{
"epoch": 0.058198836023279535,
"grad_norm": 1.6484375,
"learning_rate": 3.886138613861386e-06,
"loss": 1.6168,
"step": 97
},
{
"epoch": 0.05879882402351953,
"grad_norm": 1.984375,
"learning_rate": 3.883663366336634e-06,
"loss": 1.6283,
"step": 98
},
{
"epoch": 0.059398812023759524,
"grad_norm": 1.8828125,
"learning_rate": 3.881188118811881e-06,
"loss": 1.6354,
"step": 99
},
{
"epoch": 0.05999880002399952,
"grad_norm": 1.609375,
"learning_rate": 3.8787128712871285e-06,
"loss": 1.6169,
"step": 100
},
{
"epoch": 0.05999880002399952,
"eval_loss": 1.7915641069412231,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 66.2561,
"eval_samples_per_second": 150.93,
"eval_steps_per_second": 25.16,
"step": 100
},
{
"epoch": 0.060598788024239514,
"grad_norm": 1.703125,
"learning_rate": 3.876237623762376e-06,
"loss": 1.785,
"step": 101
},
{
"epoch": 0.06119877602447951,
"grad_norm": 1.6484375,
"learning_rate": 3.873762376237624e-06,
"loss": 1.7006,
"step": 102
},
{
"epoch": 0.0617987640247195,
"grad_norm": 1.890625,
"learning_rate": 3.871287128712871e-06,
"loss": 1.7633,
"step": 103
},
{
"epoch": 0.0623987520249595,
"grad_norm": 1.6640625,
"learning_rate": 3.868811881188119e-06,
"loss": 1.7084,
"step": 104
},
{
"epoch": 0.06299874002519949,
"grad_norm": 1.75,
"learning_rate": 3.866336633663366e-06,
"loss": 1.7583,
"step": 105
},
{
"epoch": 0.06359872802543949,
"grad_norm": 1.6875,
"learning_rate": 3.8638613861386134e-06,
"loss": 1.6945,
"step": 106
},
{
"epoch": 0.06419871602567949,
"grad_norm": 1.703125,
"learning_rate": 3.861386138613861e-06,
"loss": 1.7145,
"step": 107
},
{
"epoch": 0.06479870402591949,
"grad_norm": 1.6328125,
"learning_rate": 3.858910891089109e-06,
"loss": 1.6482,
"step": 108
},
{
"epoch": 0.06539869202615947,
"grad_norm": 1.6953125,
"learning_rate": 3.856435643564356e-06,
"loss": 1.7366,
"step": 109
},
{
"epoch": 0.06599868002639947,
"grad_norm": 1.84375,
"learning_rate": 3.853960396039604e-06,
"loss": 1.6372,
"step": 110
},
{
"epoch": 0.06659866802663947,
"grad_norm": 1.8359375,
"learning_rate": 3.851485148514851e-06,
"loss": 1.7359,
"step": 111
},
{
"epoch": 0.06719865602687947,
"grad_norm": 1.71875,
"learning_rate": 3.849009900990099e-06,
"loss": 1.6406,
"step": 112
},
{
"epoch": 0.06779864402711945,
"grad_norm": 1.7421875,
"learning_rate": 3.846534653465346e-06,
"loss": 1.696,
"step": 113
},
{
"epoch": 0.06839863202735945,
"grad_norm": 1.703125,
"learning_rate": 3.844059405940594e-06,
"loss": 1.6321,
"step": 114
},
{
"epoch": 0.06899862002759945,
"grad_norm": 1.8828125,
"learning_rate": 3.841584158415842e-06,
"loss": 1.706,
"step": 115
},
{
"epoch": 0.06959860802783945,
"grad_norm": 1.65625,
"learning_rate": 3.839108910891089e-06,
"loss": 1.7333,
"step": 116
},
{
"epoch": 0.07019859602807944,
"grad_norm": 1.9296875,
"learning_rate": 3.836633663366336e-06,
"loss": 1.7489,
"step": 117
},
{
"epoch": 0.07079858402831943,
"grad_norm": 1.7578125,
"learning_rate": 3.834158415841584e-06,
"loss": 1.6628,
"step": 118
},
{
"epoch": 0.07139857202855943,
"grad_norm": 1.7421875,
"learning_rate": 3.831683168316831e-06,
"loss": 1.7741,
"step": 119
},
{
"epoch": 0.07199856002879942,
"grad_norm": 1.6796875,
"learning_rate": 3.829207920792079e-06,
"loss": 1.6783,
"step": 120
},
{
"epoch": 0.07259854802903942,
"grad_norm": 1.921875,
"learning_rate": 3.8267326732673265e-06,
"loss": 1.6955,
"step": 121
},
{
"epoch": 0.07319853602927942,
"grad_norm": 1.984375,
"learning_rate": 3.824257425742574e-06,
"loss": 1.721,
"step": 122
},
{
"epoch": 0.0737985240295194,
"grad_norm": 1.6640625,
"learning_rate": 3.821782178217821e-06,
"loss": 1.6035,
"step": 123
},
{
"epoch": 0.0743985120297594,
"grad_norm": 2.140625,
"learning_rate": 3.819306930693069e-06,
"loss": 1.5864,
"step": 124
},
{
"epoch": 0.0749985000299994,
"grad_norm": 1.546875,
"learning_rate": 3.816831683168317e-06,
"loss": 1.6237,
"step": 125
},
{
"epoch": 0.0755984880302394,
"grad_norm": 1.734375,
"learning_rate": 3.814356435643564e-06,
"loss": 1.6389,
"step": 126
},
{
"epoch": 0.07619847603047938,
"grad_norm": 1.96875,
"learning_rate": 3.8118811881188114e-06,
"loss": 1.6965,
"step": 127
},
{
"epoch": 0.07679846403071938,
"grad_norm": 2.34375,
"learning_rate": 3.809405940594059e-06,
"loss": 1.5421,
"step": 128
},
{
"epoch": 0.07739845203095938,
"grad_norm": 1.875,
"learning_rate": 3.8069306930693065e-06,
"loss": 1.7131,
"step": 129
},
{
"epoch": 0.07799844003119938,
"grad_norm": 1.7265625,
"learning_rate": 3.8044554455445543e-06,
"loss": 1.7201,
"step": 130
},
{
"epoch": 0.07859842803143938,
"grad_norm": 1.8125,
"learning_rate": 3.8019801980198017e-06,
"loss": 1.7214,
"step": 131
},
{
"epoch": 0.07919841603167936,
"grad_norm": 1.71875,
"learning_rate": 3.7995049504950494e-06,
"loss": 1.7073,
"step": 132
},
{
"epoch": 0.07979840403191936,
"grad_norm": 1.578125,
"learning_rate": 3.7970297029702968e-06,
"loss": 1.6527,
"step": 133
},
{
"epoch": 0.08039839203215936,
"grad_norm": 1.7890625,
"learning_rate": 3.7945544554455445e-06,
"loss": 1.6767,
"step": 134
},
{
"epoch": 0.08099838003239936,
"grad_norm": 1.609375,
"learning_rate": 3.792079207920792e-06,
"loss": 1.7262,
"step": 135
},
{
"epoch": 0.08159836803263935,
"grad_norm": 1.65625,
"learning_rate": 3.7896039603960396e-06,
"loss": 1.7001,
"step": 136
},
{
"epoch": 0.08219835603287934,
"grad_norm": 1.6171875,
"learning_rate": 3.7871287128712866e-06,
"loss": 1.6917,
"step": 137
},
{
"epoch": 0.08279834403311934,
"grad_norm": 1.734375,
"learning_rate": 3.7846534653465343e-06,
"loss": 1.6897,
"step": 138
},
{
"epoch": 0.08339833203335933,
"grad_norm": 1.78125,
"learning_rate": 3.7821782178217817e-06,
"loss": 1.6675,
"step": 139
},
{
"epoch": 0.08399832003359933,
"grad_norm": 1.6015625,
"learning_rate": 3.7797029702970294e-06,
"loss": 1.5694,
"step": 140
},
{
"epoch": 0.08459830803383932,
"grad_norm": 1.71875,
"learning_rate": 3.7772277227722768e-06,
"loss": 1.633,
"step": 141
},
{
"epoch": 0.08519829603407932,
"grad_norm": 1.90625,
"learning_rate": 3.7747524752475245e-06,
"loss": 1.7887,
"step": 142
},
{
"epoch": 0.08579828403431931,
"grad_norm": 1.6953125,
"learning_rate": 3.772277227722772e-06,
"loss": 1.593,
"step": 143
},
{
"epoch": 0.08639827203455931,
"grad_norm": 1.796875,
"learning_rate": 3.7698019801980197e-06,
"loss": 1.6052,
"step": 144
},
{
"epoch": 0.08699826003479931,
"grad_norm": 1.6796875,
"learning_rate": 3.767326732673267e-06,
"loss": 1.5389,
"step": 145
},
{
"epoch": 0.0875982480350393,
"grad_norm": 1.6953125,
"learning_rate": 3.7648514851485148e-06,
"loss": 1.6326,
"step": 146
},
{
"epoch": 0.08819823603527929,
"grad_norm": 1.4921875,
"learning_rate": 3.762376237623762e-06,
"loss": 1.6813,
"step": 147
},
{
"epoch": 0.08879822403551929,
"grad_norm": 1.8359375,
"learning_rate": 3.75990099009901e-06,
"loss": 1.6882,
"step": 148
},
{
"epoch": 0.08939821203575929,
"grad_norm": 1.8515625,
"learning_rate": 3.7574257425742572e-06,
"loss": 1.5999,
"step": 149
},
{
"epoch": 0.08999820003599927,
"grad_norm": 1.6796875,
"learning_rate": 3.754950495049505e-06,
"loss": 1.5369,
"step": 150
},
{
"epoch": 0.08999820003599927,
"eval_loss": 1.788702130317688,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 68.5232,
"eval_samples_per_second": 145.936,
"eval_steps_per_second": 24.328,
"step": 150
},
{
"epoch": 0.09059818803623927,
"grad_norm": 1.8046875,
"learning_rate": 3.752475247524752e-06,
"loss": 1.6148,
"step": 151
},
{
"epoch": 0.09119817603647927,
"grad_norm": 1.8203125,
"learning_rate": 3.7499999999999997e-06,
"loss": 1.6697,
"step": 152
},
{
"epoch": 0.09179816403671927,
"grad_norm": 1.75,
"learning_rate": 3.7475247524752474e-06,
"loss": 1.6048,
"step": 153
},
{
"epoch": 0.09239815203695927,
"grad_norm": 1.6953125,
"learning_rate": 3.7450495049504948e-06,
"loss": 1.6984,
"step": 154
},
{
"epoch": 0.09299814003719925,
"grad_norm": 1.7421875,
"learning_rate": 3.7425742574257425e-06,
"loss": 1.5856,
"step": 155
},
{
"epoch": 0.09359812803743925,
"grad_norm": 1.5546875,
"learning_rate": 3.74009900990099e-06,
"loss": 1.6575,
"step": 156
},
{
"epoch": 0.09419811603767925,
"grad_norm": 1.8359375,
"learning_rate": 3.7376237623762377e-06,
"loss": 1.6302,
"step": 157
},
{
"epoch": 0.09479810403791925,
"grad_norm": 1.6640625,
"learning_rate": 3.735148514851485e-06,
"loss": 1.644,
"step": 158
},
{
"epoch": 0.09539809203815924,
"grad_norm": 1.734375,
"learning_rate": 3.7326732673267328e-06,
"loss": 1.7122,
"step": 159
},
{
"epoch": 0.09599808003839923,
"grad_norm": 1.8984375,
"learning_rate": 3.73019801980198e-06,
"loss": 1.6811,
"step": 160
},
{
"epoch": 0.09659806803863923,
"grad_norm": 1.65625,
"learning_rate": 3.727722772277228e-06,
"loss": 1.6262,
"step": 161
},
{
"epoch": 0.09719805603887922,
"grad_norm": 1.6875,
"learning_rate": 3.7252475247524752e-06,
"loss": 1.5868,
"step": 162
},
{
"epoch": 0.09779804403911922,
"grad_norm": 1.6640625,
"learning_rate": 3.722772277227723e-06,
"loss": 1.7002,
"step": 163
},
{
"epoch": 0.0983980320393592,
"grad_norm": 1.6796875,
"learning_rate": 3.72029702970297e-06,
"loss": 1.6849,
"step": 164
},
{
"epoch": 0.0989980200395992,
"grad_norm": 1.7109375,
"learning_rate": 3.7178217821782177e-06,
"loss": 1.7176,
"step": 165
},
{
"epoch": 0.0995980080398392,
"grad_norm": 1.6875,
"learning_rate": 3.715346534653465e-06,
"loss": 1.6203,
"step": 166
},
{
"epoch": 0.1001979960400792,
"grad_norm": 1.6953125,
"learning_rate": 3.7128712871287128e-06,
"loss": 1.715,
"step": 167
},
{
"epoch": 0.1007979840403192,
"grad_norm": 1.640625,
"learning_rate": 3.71039603960396e-06,
"loss": 1.6671,
"step": 168
},
{
"epoch": 0.10139797204055918,
"grad_norm": 1.734375,
"learning_rate": 3.707920792079208e-06,
"loss": 1.7003,
"step": 169
},
{
"epoch": 0.10199796004079918,
"grad_norm": 1.6875,
"learning_rate": 3.7054455445544552e-06,
"loss": 1.658,
"step": 170
},
{
"epoch": 0.10259794804103918,
"grad_norm": 1.8828125,
"learning_rate": 3.702970297029703e-06,
"loss": 1.6661,
"step": 171
},
{
"epoch": 0.10319793604127918,
"grad_norm": 1.859375,
"learning_rate": 3.7004950495049503e-06,
"loss": 1.6916,
"step": 172
},
{
"epoch": 0.10379792404151916,
"grad_norm": 2.03125,
"learning_rate": 3.698019801980198e-06,
"loss": 1.7125,
"step": 173
},
{
"epoch": 0.10439791204175916,
"grad_norm": 1.671875,
"learning_rate": 3.6955445544554455e-06,
"loss": 1.6975,
"step": 174
},
{
"epoch": 0.10499790004199916,
"grad_norm": 1.7578125,
"learning_rate": 3.6930693069306932e-06,
"loss": 1.7408,
"step": 175
},
{
"epoch": 0.10559788804223916,
"grad_norm": 1.625,
"learning_rate": 3.6905940594059406e-06,
"loss": 1.7682,
"step": 176
},
{
"epoch": 0.10619787604247916,
"grad_norm": 1.6796875,
"learning_rate": 3.6881188118811883e-06,
"loss": 1.6703,
"step": 177
},
{
"epoch": 0.10679786404271914,
"grad_norm": 1.90625,
"learning_rate": 3.6856435643564352e-06,
"loss": 1.6898,
"step": 178
},
{
"epoch": 0.10739785204295914,
"grad_norm": 1.890625,
"learning_rate": 3.683168316831683e-06,
"loss": 1.7006,
"step": 179
},
{
"epoch": 0.10799784004319914,
"grad_norm": 1.734375,
"learning_rate": 3.6806930693069304e-06,
"loss": 1.659,
"step": 180
},
{
"epoch": 0.10859782804343913,
"grad_norm": 1.7421875,
"learning_rate": 3.678217821782178e-06,
"loss": 1.685,
"step": 181
},
{
"epoch": 0.10919781604367913,
"grad_norm": 1.796875,
"learning_rate": 3.6757425742574255e-06,
"loss": 1.7294,
"step": 182
},
{
"epoch": 0.10979780404391912,
"grad_norm": 1.890625,
"learning_rate": 3.6732673267326732e-06,
"loss": 1.7036,
"step": 183
},
{
"epoch": 0.11039779204415912,
"grad_norm": 1.765625,
"learning_rate": 3.6707920792079206e-06,
"loss": 1.6733,
"step": 184
},
{
"epoch": 0.11099778004439911,
"grad_norm": 1.6953125,
"learning_rate": 3.6683168316831683e-06,
"loss": 1.7434,
"step": 185
},
{
"epoch": 0.11159776804463911,
"grad_norm": 1.7734375,
"learning_rate": 3.6658415841584157e-06,
"loss": 1.6844,
"step": 186
},
{
"epoch": 0.1121977560448791,
"grad_norm": 1.7421875,
"learning_rate": 3.6633663366336635e-06,
"loss": 1.7077,
"step": 187
},
{
"epoch": 0.1127977440451191,
"grad_norm": 1.75,
"learning_rate": 3.660891089108911e-06,
"loss": 1.7586,
"step": 188
},
{
"epoch": 0.11339773204535909,
"grad_norm": 1.828125,
"learning_rate": 3.6584158415841586e-06,
"loss": 1.6478,
"step": 189
},
{
"epoch": 0.11399772004559909,
"grad_norm": 1.7265625,
"learning_rate": 3.6559405940594055e-06,
"loss": 1.6893,
"step": 190
},
{
"epoch": 0.11459770804583909,
"grad_norm": 1.9375,
"learning_rate": 3.6534653465346532e-06,
"loss": 1.7054,
"step": 191
},
{
"epoch": 0.11519769604607907,
"grad_norm": 1.65625,
"learning_rate": 3.6509900990099006e-06,
"loss": 1.7179,
"step": 192
},
{
"epoch": 0.11579768404631907,
"grad_norm": 1.6796875,
"learning_rate": 3.6485148514851484e-06,
"loss": 1.5688,
"step": 193
},
{
"epoch": 0.11639767204655907,
"grad_norm": 1.8203125,
"learning_rate": 3.6460396039603957e-06,
"loss": 1.807,
"step": 194
},
{
"epoch": 0.11699766004679907,
"grad_norm": 1.734375,
"learning_rate": 3.6435643564356435e-06,
"loss": 1.6499,
"step": 195
},
{
"epoch": 0.11759764804703907,
"grad_norm": 1.796875,
"learning_rate": 3.641089108910891e-06,
"loss": 1.6366,
"step": 196
},
{
"epoch": 0.11819763604727905,
"grad_norm": 1.7890625,
"learning_rate": 3.6386138613861386e-06,
"loss": 1.7076,
"step": 197
},
{
"epoch": 0.11879762404751905,
"grad_norm": 1.671875,
"learning_rate": 3.636138613861386e-06,
"loss": 1.6531,
"step": 198
},
{
"epoch": 0.11939761204775905,
"grad_norm": 1.6640625,
"learning_rate": 3.6336633663366337e-06,
"loss": 1.6723,
"step": 199
},
{
"epoch": 0.11999760004799905,
"grad_norm": 1.5859375,
"learning_rate": 3.631188118811881e-06,
"loss": 1.5718,
"step": 200
},
{
"epoch": 0.11999760004799905,
"eval_loss": 1.7864090204238892,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 65.9536,
"eval_samples_per_second": 151.622,
"eval_steps_per_second": 25.275,
"step": 200
},
{
"epoch": 0.12059758804823903,
"grad_norm": 1.78125,
"learning_rate": 3.628712871287129e-06,
"loss": 1.6047,
"step": 201
},
{
"epoch": 0.12119757604847903,
"grad_norm": 1.5234375,
"learning_rate": 3.626237623762376e-06,
"loss": 1.5836,
"step": 202
},
{
"epoch": 0.12179756404871903,
"grad_norm": 1.625,
"learning_rate": 3.623762376237624e-06,
"loss": 1.7101,
"step": 203
},
{
"epoch": 0.12239755204895902,
"grad_norm": 1.6953125,
"learning_rate": 3.621287128712871e-06,
"loss": 1.7021,
"step": 204
},
{
"epoch": 0.12299754004919902,
"grad_norm": 1.6875,
"learning_rate": 3.6188118811881186e-06,
"loss": 1.673,
"step": 205
},
{
"epoch": 0.123597528049439,
"grad_norm": 1.71875,
"learning_rate": 3.616336633663366e-06,
"loss": 1.6808,
"step": 206
},
{
"epoch": 0.124197516049679,
"grad_norm": 1.6796875,
"learning_rate": 3.6138613861386137e-06,
"loss": 1.6939,
"step": 207
},
{
"epoch": 0.124797504049919,
"grad_norm": 1.6796875,
"learning_rate": 3.611386138613861e-06,
"loss": 1.691,
"step": 208
},
{
"epoch": 0.125397492050159,
"grad_norm": 1.6640625,
"learning_rate": 3.608910891089109e-06,
"loss": 1.6701,
"step": 209
},
{
"epoch": 0.12599748005039899,
"grad_norm": 1.71875,
"learning_rate": 3.606435643564356e-06,
"loss": 1.7267,
"step": 210
},
{
"epoch": 0.12659746805063898,
"grad_norm": 1.6015625,
"learning_rate": 3.603960396039604e-06,
"loss": 1.626,
"step": 211
},
{
"epoch": 0.12719745605087898,
"grad_norm": 1.8828125,
"learning_rate": 3.6014851485148513e-06,
"loss": 1.6452,
"step": 212
},
{
"epoch": 0.12779744405111898,
"grad_norm": 1.625,
"learning_rate": 3.599009900990099e-06,
"loss": 1.6894,
"step": 213
},
{
"epoch": 0.12839743205135898,
"grad_norm": 1.7109375,
"learning_rate": 3.5965346534653464e-06,
"loss": 1.5345,
"step": 214
},
{
"epoch": 0.12899742005159898,
"grad_norm": 1.828125,
"learning_rate": 3.594059405940594e-06,
"loss": 1.6531,
"step": 215
},
{
"epoch": 0.12959740805183897,
"grad_norm": 1.71875,
"learning_rate": 3.5915841584158415e-06,
"loss": 1.6251,
"step": 216
},
{
"epoch": 0.13019739605207895,
"grad_norm": 1.796875,
"learning_rate": 3.589108910891089e-06,
"loss": 1.6431,
"step": 217
},
{
"epoch": 0.13079738405231894,
"grad_norm": 1.5,
"learning_rate": 3.586633663366336e-06,
"loss": 1.6415,
"step": 218
},
{
"epoch": 0.13139737205255894,
"grad_norm": 1.9765625,
"learning_rate": 3.584158415841584e-06,
"loss": 1.777,
"step": 219
},
{
"epoch": 0.13199736005279894,
"grad_norm": 1.7890625,
"learning_rate": 3.5816831683168313e-06,
"loss": 1.6775,
"step": 220
},
{
"epoch": 0.13259734805303894,
"grad_norm": 1.6640625,
"learning_rate": 3.579207920792079e-06,
"loss": 1.6495,
"step": 221
},
{
"epoch": 0.13319733605327894,
"grad_norm": 1.6796875,
"learning_rate": 3.5767326732673264e-06,
"loss": 1.6738,
"step": 222
},
{
"epoch": 0.13379732405351893,
"grad_norm": 1.8046875,
"learning_rate": 3.574257425742574e-06,
"loss": 1.6602,
"step": 223
},
{
"epoch": 0.13439731205375893,
"grad_norm": 1.8046875,
"learning_rate": 3.5717821782178215e-06,
"loss": 1.6473,
"step": 224
},
{
"epoch": 0.13499730005399893,
"grad_norm": 1.796875,
"learning_rate": 3.5693069306930693e-06,
"loss": 1.6683,
"step": 225
},
{
"epoch": 0.1355972880542389,
"grad_norm": 1.5,
"learning_rate": 3.5668316831683166e-06,
"loss": 1.6702,
"step": 226
},
{
"epoch": 0.1361972760544789,
"grad_norm": 1.8125,
"learning_rate": 3.5643564356435644e-06,
"loss": 1.6736,
"step": 227
},
{
"epoch": 0.1367972640547189,
"grad_norm": 1.6875,
"learning_rate": 3.5618811881188117e-06,
"loss": 1.7395,
"step": 228
},
{
"epoch": 0.1373972520549589,
"grad_norm": 1.7734375,
"learning_rate": 3.5594059405940595e-06,
"loss": 1.6352,
"step": 229
},
{
"epoch": 0.1379972400551989,
"grad_norm": 1.703125,
"learning_rate": 3.5569306930693064e-06,
"loss": 1.7035,
"step": 230
},
{
"epoch": 0.1385972280554389,
"grad_norm": 1.859375,
"learning_rate": 3.554455445544554e-06,
"loss": 1.6634,
"step": 231
},
{
"epoch": 0.1391972160556789,
"grad_norm": 1.5703125,
"learning_rate": 3.5519801980198015e-06,
"loss": 1.5949,
"step": 232
},
{
"epoch": 0.1397972040559189,
"grad_norm": 1.6875,
"learning_rate": 3.5495049504950493e-06,
"loss": 1.6848,
"step": 233
},
{
"epoch": 0.1403971920561589,
"grad_norm": 1.7421875,
"learning_rate": 3.5470297029702966e-06,
"loss": 1.6158,
"step": 234
},
{
"epoch": 0.14099718005639889,
"grad_norm": 1.765625,
"learning_rate": 3.5445544554455444e-06,
"loss": 1.6746,
"step": 235
},
{
"epoch": 0.14159716805663886,
"grad_norm": 1.8515625,
"learning_rate": 3.5420792079207917e-06,
"loss": 1.6911,
"step": 236
},
{
"epoch": 0.14219715605687885,
"grad_norm": 1.7734375,
"learning_rate": 3.5396039603960395e-06,
"loss": 1.6304,
"step": 237
},
{
"epoch": 0.14279714405711885,
"grad_norm": 2.03125,
"learning_rate": 3.537128712871287e-06,
"loss": 1.6485,
"step": 238
},
{
"epoch": 0.14339713205735885,
"grad_norm": 1.6875,
"learning_rate": 3.5346534653465346e-06,
"loss": 1.6509,
"step": 239
},
{
"epoch": 0.14399712005759885,
"grad_norm": 1.7578125,
"learning_rate": 3.532178217821782e-06,
"loss": 1.7299,
"step": 240
},
{
"epoch": 0.14459710805783885,
"grad_norm": 1.6171875,
"learning_rate": 3.5297029702970297e-06,
"loss": 1.5903,
"step": 241
},
{
"epoch": 0.14519709605807885,
"grad_norm": 1.8515625,
"learning_rate": 3.527227722772277e-06,
"loss": 1.7034,
"step": 242
},
{
"epoch": 0.14579708405831884,
"grad_norm": 1.6953125,
"learning_rate": 3.524752475247525e-06,
"loss": 1.604,
"step": 243
},
{
"epoch": 0.14639707205855884,
"grad_norm": 1.578125,
"learning_rate": 3.5222772277227717e-06,
"loss": 1.6242,
"step": 244
},
{
"epoch": 0.1469970600587988,
"grad_norm": 1.703125,
"learning_rate": 3.5198019801980195e-06,
"loss": 1.6533,
"step": 245
},
{
"epoch": 0.1475970480590388,
"grad_norm": 1.6015625,
"learning_rate": 3.517326732673267e-06,
"loss": 1.6703,
"step": 246
},
{
"epoch": 0.1481970360592788,
"grad_norm": 1.6875,
"learning_rate": 3.5148514851485146e-06,
"loss": 1.6464,
"step": 247
},
{
"epoch": 0.1487970240595188,
"grad_norm": 1.7421875,
"learning_rate": 3.512376237623762e-06,
"loss": 1.8545,
"step": 248
},
{
"epoch": 0.1493970120597588,
"grad_norm": 1.5,
"learning_rate": 3.5099009900990097e-06,
"loss": 1.7212,
"step": 249
},
{
"epoch": 0.1499970000599988,
"grad_norm": 1.6484375,
"learning_rate": 3.507425742574257e-06,
"loss": 1.637,
"step": 250
},
{
"epoch": 0.1499970000599988,
"eval_loss": 1.7847853899002075,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 65.9306,
"eval_samples_per_second": 151.675,
"eval_steps_per_second": 25.284,
"step": 250
},
{
"epoch": 0.1505969880602388,
"grad_norm": 1.8828125,
"learning_rate": 3.504950495049505e-06,
"loss": 1.7151,
"step": 251
},
{
"epoch": 0.1511969760604788,
"grad_norm": 1.71875,
"learning_rate": 3.502475247524752e-06,
"loss": 1.6645,
"step": 252
},
{
"epoch": 0.1517969640607188,
"grad_norm": 1.6640625,
"learning_rate": 3.5e-06,
"loss": 1.6666,
"step": 253
},
{
"epoch": 0.15239695206095877,
"grad_norm": 1.84375,
"learning_rate": 3.4975247524752477e-06,
"loss": 1.5748,
"step": 254
},
{
"epoch": 0.15299694006119877,
"grad_norm": 1.6796875,
"learning_rate": 3.495049504950495e-06,
"loss": 1.6351,
"step": 255
},
{
"epoch": 0.15359692806143876,
"grad_norm": 1.734375,
"learning_rate": 3.492574257425743e-06,
"loss": 1.6819,
"step": 256
},
{
"epoch": 0.15419691606167876,
"grad_norm": 1.71875,
"learning_rate": 3.4900990099009897e-06,
"loss": 1.714,
"step": 257
},
{
"epoch": 0.15479690406191876,
"grad_norm": 1.78125,
"learning_rate": 3.4876237623762375e-06,
"loss": 1.6284,
"step": 258
},
{
"epoch": 0.15539689206215876,
"grad_norm": 1.8515625,
"learning_rate": 3.485148514851485e-06,
"loss": 1.5917,
"step": 259
},
{
"epoch": 0.15599688006239876,
"grad_norm": 1.828125,
"learning_rate": 3.4826732673267326e-06,
"loss": 1.7185,
"step": 260
},
{
"epoch": 0.15659686806263876,
"grad_norm": 1.734375,
"learning_rate": 3.48019801980198e-06,
"loss": 1.6383,
"step": 261
},
{
"epoch": 0.15719685606287875,
"grad_norm": 1.78125,
"learning_rate": 3.4777227722772277e-06,
"loss": 1.6595,
"step": 262
},
{
"epoch": 0.15779684406311872,
"grad_norm": 1.71875,
"learning_rate": 3.475247524752475e-06,
"loss": 1.6335,
"step": 263
},
{
"epoch": 0.15839683206335872,
"grad_norm": 1.984375,
"learning_rate": 3.472772277227723e-06,
"loss": 1.7455,
"step": 264
},
{
"epoch": 0.15899682006359872,
"grad_norm": 1.5859375,
"learning_rate": 3.47029702970297e-06,
"loss": 1.7527,
"step": 265
},
{
"epoch": 0.15959680806383872,
"grad_norm": 1.6171875,
"learning_rate": 3.467821782178218e-06,
"loss": 1.6564,
"step": 266
},
{
"epoch": 0.16019679606407872,
"grad_norm": 1.5703125,
"learning_rate": 3.4653465346534653e-06,
"loss": 1.6865,
"step": 267
},
{
"epoch": 0.16079678406431872,
"grad_norm": 1.578125,
"learning_rate": 3.462871287128713e-06,
"loss": 1.5996,
"step": 268
},
{
"epoch": 0.1613967720645587,
"grad_norm": 1.5078125,
"learning_rate": 3.4603960396039604e-06,
"loss": 1.5936,
"step": 269
},
{
"epoch": 0.1619967600647987,
"grad_norm": 1.6640625,
"learning_rate": 3.4579207920792077e-06,
"loss": 1.5691,
"step": 270
},
{
"epoch": 0.1625967480650387,
"grad_norm": 1.59375,
"learning_rate": 3.455445544554455e-06,
"loss": 1.5869,
"step": 271
},
{
"epoch": 0.1631967360652787,
"grad_norm": 1.7265625,
"learning_rate": 3.452970297029703e-06,
"loss": 1.675,
"step": 272
},
{
"epoch": 0.16379672406551868,
"grad_norm": 1.65625,
"learning_rate": 3.45049504950495e-06,
"loss": 1.7083,
"step": 273
},
{
"epoch": 0.16439671206575868,
"grad_norm": 1.7734375,
"learning_rate": 3.448019801980198e-06,
"loss": 1.7339,
"step": 274
},
{
"epoch": 0.16499670006599867,
"grad_norm": 1.6640625,
"learning_rate": 3.4455445544554453e-06,
"loss": 1.6974,
"step": 275
},
{
"epoch": 0.16559668806623867,
"grad_norm": 1.65625,
"learning_rate": 3.443069306930693e-06,
"loss": 1.6987,
"step": 276
},
{
"epoch": 0.16619667606647867,
"grad_norm": 1.90625,
"learning_rate": 3.4405940594059404e-06,
"loss": 1.6543,
"step": 277
},
{
"epoch": 0.16679666406671867,
"grad_norm": 2.203125,
"learning_rate": 3.438118811881188e-06,
"loss": 1.7186,
"step": 278
},
{
"epoch": 0.16739665206695867,
"grad_norm": 1.6328125,
"learning_rate": 3.4356435643564355e-06,
"loss": 1.5786,
"step": 279
},
{
"epoch": 0.16799664006719867,
"grad_norm": 1.78125,
"learning_rate": 3.4331683168316833e-06,
"loss": 1.6934,
"step": 280
},
{
"epoch": 0.16859662806743866,
"grad_norm": 2.125,
"learning_rate": 3.4306930693069306e-06,
"loss": 1.6231,
"step": 281
},
{
"epoch": 0.16919661606767863,
"grad_norm": 1.8125,
"learning_rate": 3.4282178217821784e-06,
"loss": 1.7557,
"step": 282
},
{
"epoch": 0.16979660406791863,
"grad_norm": 1.8359375,
"learning_rate": 3.4257425742574253e-06,
"loss": 1.7097,
"step": 283
},
{
"epoch": 0.17039659206815863,
"grad_norm": 1.8828125,
"learning_rate": 3.423267326732673e-06,
"loss": 1.6093,
"step": 284
},
{
"epoch": 0.17099658006839863,
"grad_norm": 1.6953125,
"learning_rate": 3.4207920792079204e-06,
"loss": 1.6127,
"step": 285
},
{
"epoch": 0.17159656806863863,
"grad_norm": 1.578125,
"learning_rate": 3.418316831683168e-06,
"loss": 1.6953,
"step": 286
},
{
"epoch": 0.17219655606887863,
"grad_norm": 1.8515625,
"learning_rate": 3.4158415841584155e-06,
"loss": 1.5976,
"step": 287
},
{
"epoch": 0.17279654406911862,
"grad_norm": 1.5703125,
"learning_rate": 3.4133663366336633e-06,
"loss": 1.696,
"step": 288
},
{
"epoch": 0.17339653206935862,
"grad_norm": 1.578125,
"learning_rate": 3.4108910891089106e-06,
"loss": 1.6185,
"step": 289
},
{
"epoch": 0.17399652006959862,
"grad_norm": 1.6875,
"learning_rate": 3.4084158415841584e-06,
"loss": 1.671,
"step": 290
},
{
"epoch": 0.1745965080698386,
"grad_norm": 1.671875,
"learning_rate": 3.4059405940594058e-06,
"loss": 1.6512,
"step": 291
},
{
"epoch": 0.1751964960700786,
"grad_norm": 1.75,
"learning_rate": 3.4034653465346535e-06,
"loss": 1.6009,
"step": 292
},
{
"epoch": 0.1757964840703186,
"grad_norm": 1.578125,
"learning_rate": 3.400990099009901e-06,
"loss": 1.5156,
"step": 293
},
{
"epoch": 0.17639647207055859,
"grad_norm": 1.796875,
"learning_rate": 3.3985148514851486e-06,
"loss": 1.694,
"step": 294
},
{
"epoch": 0.17699646007079858,
"grad_norm": 1.7109375,
"learning_rate": 3.396039603960396e-06,
"loss": 1.6629,
"step": 295
},
{
"epoch": 0.17759644807103858,
"grad_norm": 1.625,
"learning_rate": 3.3935643564356437e-06,
"loss": 1.697,
"step": 296
},
{
"epoch": 0.17819643607127858,
"grad_norm": 1.78125,
"learning_rate": 3.3910891089108907e-06,
"loss": 1.597,
"step": 297
},
{
"epoch": 0.17879642407151858,
"grad_norm": 1.65625,
"learning_rate": 3.3886138613861384e-06,
"loss": 1.6441,
"step": 298
},
{
"epoch": 0.17939641207175858,
"grad_norm": 1.9140625,
"learning_rate": 3.3861386138613858e-06,
"loss": 1.6373,
"step": 299
},
{
"epoch": 0.17999640007199855,
"grad_norm": 1.703125,
"learning_rate": 3.3836633663366335e-06,
"loss": 1.75,
"step": 300
},
{
"epoch": 0.17999640007199855,
"eval_loss": 1.7828515768051147,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 68.7039,
"eval_samples_per_second": 145.552,
"eval_steps_per_second": 24.264,
"step": 300
},
{
"epoch": 0.18059638807223855,
"grad_norm": 1.6953125,
"learning_rate": 3.381188118811881e-06,
"loss": 1.648,
"step": 301
},
{
"epoch": 0.18119637607247854,
"grad_norm": 1.859375,
"learning_rate": 3.3787128712871286e-06,
"loss": 1.7426,
"step": 302
},
{
"epoch": 0.18179636407271854,
"grad_norm": 1.640625,
"learning_rate": 3.376237623762376e-06,
"loss": 1.6039,
"step": 303
},
{
"epoch": 0.18239635207295854,
"grad_norm": 1.546875,
"learning_rate": 3.3737623762376238e-06,
"loss": 1.6396,
"step": 304
},
{
"epoch": 0.18299634007319854,
"grad_norm": 1.953125,
"learning_rate": 3.371287128712871e-06,
"loss": 1.6994,
"step": 305
},
{
"epoch": 0.18359632807343854,
"grad_norm": 1.5390625,
"learning_rate": 3.368811881188119e-06,
"loss": 1.4593,
"step": 306
},
{
"epoch": 0.18419631607367853,
"grad_norm": 1.6640625,
"learning_rate": 3.366336633663366e-06,
"loss": 1.5756,
"step": 307
},
{
"epoch": 0.18479630407391853,
"grad_norm": 1.78125,
"learning_rate": 3.363861386138614e-06,
"loss": 1.5962,
"step": 308
},
{
"epoch": 0.1853962920741585,
"grad_norm": 1.796875,
"learning_rate": 3.3613861386138613e-06,
"loss": 1.5964,
"step": 309
},
{
"epoch": 0.1859962800743985,
"grad_norm": 1.5390625,
"learning_rate": 3.3589108910891087e-06,
"loss": 1.6447,
"step": 310
},
{
"epoch": 0.1865962680746385,
"grad_norm": 1.5625,
"learning_rate": 3.356435643564356e-06,
"loss": 1.7261,
"step": 311
},
{
"epoch": 0.1871962560748785,
"grad_norm": 1.828125,
"learning_rate": 3.3539603960396038e-06,
"loss": 1.6419,
"step": 312
},
{
"epoch": 0.1877962440751185,
"grad_norm": 1.7734375,
"learning_rate": 3.351485148514851e-06,
"loss": 1.6787,
"step": 313
},
{
"epoch": 0.1883962320753585,
"grad_norm": 1.6328125,
"learning_rate": 3.349009900990099e-06,
"loss": 1.7553,
"step": 314
},
{
"epoch": 0.1889962200755985,
"grad_norm": 1.6796875,
"learning_rate": 3.3465346534653462e-06,
"loss": 1.7045,
"step": 315
},
{
"epoch": 0.1895962080758385,
"grad_norm": 1.6640625,
"learning_rate": 3.344059405940594e-06,
"loss": 1.715,
"step": 316
},
{
"epoch": 0.1901961960760785,
"grad_norm": 1.6328125,
"learning_rate": 3.3415841584158413e-06,
"loss": 1.6036,
"step": 317
},
{
"epoch": 0.1907961840763185,
"grad_norm": 1.8515625,
"learning_rate": 3.339108910891089e-06,
"loss": 1.7363,
"step": 318
},
{
"epoch": 0.19139617207655846,
"grad_norm": 1.5625,
"learning_rate": 3.3366336633663364e-06,
"loss": 1.7064,
"step": 319
},
{
"epoch": 0.19199616007679846,
"grad_norm": 1.5625,
"learning_rate": 3.334158415841584e-06,
"loss": 1.7196,
"step": 320
},
{
"epoch": 0.19259614807703845,
"grad_norm": 1.7265625,
"learning_rate": 3.3316831683168316e-06,
"loss": 1.6718,
"step": 321
},
{
"epoch": 0.19319613607727845,
"grad_norm": 1.546875,
"learning_rate": 3.3292079207920793e-06,
"loss": 1.689,
"step": 322
},
{
"epoch": 0.19379612407751845,
"grad_norm": 1.6328125,
"learning_rate": 3.3267326732673262e-06,
"loss": 1.716,
"step": 323
},
{
"epoch": 0.19439611207775845,
"grad_norm": 1.8046875,
"learning_rate": 3.324257425742574e-06,
"loss": 1.7491,
"step": 324
},
{
"epoch": 0.19499610007799845,
"grad_norm": 1.75,
"learning_rate": 3.3217821782178213e-06,
"loss": 1.6944,
"step": 325
},
{
"epoch": 0.19559608807823844,
"grad_norm": 1.890625,
"learning_rate": 3.319306930693069e-06,
"loss": 1.691,
"step": 326
},
{
"epoch": 0.19619607607847844,
"grad_norm": 2.0625,
"learning_rate": 3.3168316831683165e-06,
"loss": 1.662,
"step": 327
},
{
"epoch": 0.1967960640787184,
"grad_norm": 1.6953125,
"learning_rate": 3.3143564356435642e-06,
"loss": 1.7127,
"step": 328
},
{
"epoch": 0.1973960520789584,
"grad_norm": 1.75,
"learning_rate": 3.3118811881188116e-06,
"loss": 1.6894,
"step": 329
},
{
"epoch": 0.1979960400791984,
"grad_norm": 1.6640625,
"learning_rate": 3.3094059405940593e-06,
"loss": 1.6596,
"step": 330
},
{
"epoch": 0.1985960280794384,
"grad_norm": 1.609375,
"learning_rate": 3.3069306930693067e-06,
"loss": 1.7074,
"step": 331
},
{
"epoch": 0.1991960160796784,
"grad_norm": 1.7734375,
"learning_rate": 3.3044554455445544e-06,
"loss": 1.7924,
"step": 332
},
{
"epoch": 0.1997960040799184,
"grad_norm": 1.890625,
"learning_rate": 3.3019801980198018e-06,
"loss": 1.6684,
"step": 333
},
{
"epoch": 0.2003959920801584,
"grad_norm": 1.875,
"learning_rate": 3.2995049504950496e-06,
"loss": 1.6355,
"step": 334
},
{
"epoch": 0.2009959800803984,
"grad_norm": 1.6484375,
"learning_rate": 3.297029702970297e-06,
"loss": 1.6694,
"step": 335
},
{
"epoch": 0.2015959680806384,
"grad_norm": 1.6015625,
"learning_rate": 3.2945544554455442e-06,
"loss": 1.722,
"step": 336
},
{
"epoch": 0.20219595608087837,
"grad_norm": 1.703125,
"learning_rate": 3.2920792079207916e-06,
"loss": 1.6248,
"step": 337
},
{
"epoch": 0.20279594408111837,
"grad_norm": 1.703125,
"learning_rate": 3.2896039603960393e-06,
"loss": 1.6511,
"step": 338
},
{
"epoch": 0.20339593208135837,
"grad_norm": 1.9375,
"learning_rate": 3.2871287128712867e-06,
"loss": 1.6785,
"step": 339
},
{
"epoch": 0.20399592008159836,
"grad_norm": 1.7734375,
"learning_rate": 3.2846534653465345e-06,
"loss": 1.6779,
"step": 340
},
{
"epoch": 0.20459590808183836,
"grad_norm": 1.859375,
"learning_rate": 3.282178217821782e-06,
"loss": 1.6735,
"step": 341
},
{
"epoch": 0.20519589608207836,
"grad_norm": 1.59375,
"learning_rate": 3.2797029702970296e-06,
"loss": 1.6365,
"step": 342
},
{
"epoch": 0.20579588408231836,
"grad_norm": 1.7578125,
"learning_rate": 3.277227722772277e-06,
"loss": 1.6497,
"step": 343
},
{
"epoch": 0.20639587208255836,
"grad_norm": 1.6015625,
"learning_rate": 3.2747524752475247e-06,
"loss": 1.6381,
"step": 344
},
{
"epoch": 0.20699586008279836,
"grad_norm": 1.8671875,
"learning_rate": 3.272277227722772e-06,
"loss": 1.669,
"step": 345
},
{
"epoch": 0.20759584808303833,
"grad_norm": 1.8359375,
"learning_rate": 3.2698019801980198e-06,
"loss": 1.6485,
"step": 346
},
{
"epoch": 0.20819583608327832,
"grad_norm": 1.6796875,
"learning_rate": 3.267326732673267e-06,
"loss": 1.5217,
"step": 347
},
{
"epoch": 0.20879582408351832,
"grad_norm": 1.7265625,
"learning_rate": 3.264851485148515e-06,
"loss": 1.5988,
"step": 348
},
{
"epoch": 0.20939581208375832,
"grad_norm": 1.6171875,
"learning_rate": 3.262376237623762e-06,
"loss": 1.6752,
"step": 349
},
{
"epoch": 0.20999580008399832,
"grad_norm": 1.6484375,
"learning_rate": 3.2599009900990096e-06,
"loss": 1.6443,
"step": 350
},
{
"epoch": 0.20999580008399832,
"eval_loss": 1.7816474437713623,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 65.9284,
"eval_samples_per_second": 151.68,
"eval_steps_per_second": 25.285,
"step": 350
},
{
"epoch": 0.21059578808423832,
"grad_norm": 1.84375,
"learning_rate": 3.257425742574257e-06,
"loss": 1.6992,
"step": 351
},
{
"epoch": 0.21119577608447831,
"grad_norm": 1.640625,
"learning_rate": 3.2549504950495047e-06,
"loss": 1.6317,
"step": 352
},
{
"epoch": 0.2117957640847183,
"grad_norm": 1.71875,
"learning_rate": 3.252475247524752e-06,
"loss": 1.6773,
"step": 353
},
{
"epoch": 0.2123957520849583,
"grad_norm": 1.8515625,
"learning_rate": 3.25e-06,
"loss": 1.6705,
"step": 354
},
{
"epoch": 0.2129957400851983,
"grad_norm": 1.578125,
"learning_rate": 3.2475247524752476e-06,
"loss": 1.6884,
"step": 355
},
{
"epoch": 0.21359572808543828,
"grad_norm": 1.84375,
"learning_rate": 3.245049504950495e-06,
"loss": 1.6452,
"step": 356
},
{
"epoch": 0.21419571608567828,
"grad_norm": 1.96875,
"learning_rate": 3.2425742574257427e-06,
"loss": 1.6493,
"step": 357
},
{
"epoch": 0.21479570408591828,
"grad_norm": 1.9765625,
"learning_rate": 3.24009900990099e-06,
"loss": 1.648,
"step": 358
},
{
"epoch": 0.21539569208615827,
"grad_norm": 1.5390625,
"learning_rate": 3.2376237623762378e-06,
"loss": 1.633,
"step": 359
},
{
"epoch": 0.21599568008639827,
"grad_norm": 1.765625,
"learning_rate": 3.235148514851485e-06,
"loss": 1.678,
"step": 360
},
{
"epoch": 0.21659566808663827,
"grad_norm": 1.65625,
"learning_rate": 3.232673267326733e-06,
"loss": 1.7183,
"step": 361
},
{
"epoch": 0.21719565608687827,
"grad_norm": 1.75,
"learning_rate": 3.2301980198019802e-06,
"loss": 1.6863,
"step": 362
},
{
"epoch": 0.21779564408711827,
"grad_norm": 1.71875,
"learning_rate": 3.2277227722772276e-06,
"loss": 1.8238,
"step": 363
},
{
"epoch": 0.21839563208735827,
"grad_norm": 1.7265625,
"learning_rate": 3.225247524752475e-06,
"loss": 1.6829,
"step": 364
},
{
"epoch": 0.21899562008759824,
"grad_norm": 1.6953125,
"learning_rate": 3.2227722772277227e-06,
"loss": 1.5743,
"step": 365
},
{
"epoch": 0.21959560808783823,
"grad_norm": 1.6796875,
"learning_rate": 3.22029702970297e-06,
"loss": 1.7275,
"step": 366
},
{
"epoch": 0.22019559608807823,
"grad_norm": 1.84375,
"learning_rate": 3.217821782178218e-06,
"loss": 1.7962,
"step": 367
},
{
"epoch": 0.22079558408831823,
"grad_norm": 1.5703125,
"learning_rate": 3.215346534653465e-06,
"loss": 1.6825,
"step": 368
},
{
"epoch": 0.22139557208855823,
"grad_norm": 1.5234375,
"learning_rate": 3.212871287128713e-06,
"loss": 1.7308,
"step": 369
},
{
"epoch": 0.22199556008879823,
"grad_norm": 1.6953125,
"learning_rate": 3.2103960396039603e-06,
"loss": 1.6639,
"step": 370
},
{
"epoch": 0.22259554808903823,
"grad_norm": 1.546875,
"learning_rate": 3.207920792079208e-06,
"loss": 1.7165,
"step": 371
},
{
"epoch": 0.22319553608927822,
"grad_norm": 1.6796875,
"learning_rate": 3.2054455445544554e-06,
"loss": 1.655,
"step": 372
},
{
"epoch": 0.22379552408951822,
"grad_norm": 1.859375,
"learning_rate": 3.202970297029703e-06,
"loss": 1.6016,
"step": 373
},
{
"epoch": 0.2243955120897582,
"grad_norm": 1.7578125,
"learning_rate": 3.2004950495049505e-06,
"loss": 1.6397,
"step": 374
},
{
"epoch": 0.2249955000899982,
"grad_norm": 1.6328125,
"learning_rate": 3.1980198019801982e-06,
"loss": 1.6711,
"step": 375
},
{
"epoch": 0.2255954880902382,
"grad_norm": 1.65625,
"learning_rate": 3.195544554455445e-06,
"loss": 1.5991,
"step": 376
},
{
"epoch": 0.2261954760904782,
"grad_norm": 1.8203125,
"learning_rate": 3.193069306930693e-06,
"loss": 1.6873,
"step": 377
},
{
"epoch": 0.22679546409071819,
"grad_norm": 1.65625,
"learning_rate": 3.1905940594059403e-06,
"loss": 1.7299,
"step": 378
},
{
"epoch": 0.22739545209095818,
"grad_norm": 1.6015625,
"learning_rate": 3.188118811881188e-06,
"loss": 1.6678,
"step": 379
},
{
"epoch": 0.22799544009119818,
"grad_norm": 1.8046875,
"learning_rate": 3.1856435643564354e-06,
"loss": 1.632,
"step": 380
},
{
"epoch": 0.22859542809143818,
"grad_norm": 1.828125,
"learning_rate": 3.183168316831683e-06,
"loss": 1.6137,
"step": 381
},
{
"epoch": 0.22919541609167818,
"grad_norm": 1.7421875,
"learning_rate": 3.1806930693069305e-06,
"loss": 1.5535,
"step": 382
},
{
"epoch": 0.22979540409191815,
"grad_norm": 1.6484375,
"learning_rate": 3.1782178217821783e-06,
"loss": 1.6658,
"step": 383
},
{
"epoch": 0.23039539209215815,
"grad_norm": 1.7734375,
"learning_rate": 3.1757425742574256e-06,
"loss": 1.636,
"step": 384
},
{
"epoch": 0.23099538009239814,
"grad_norm": 1.53125,
"learning_rate": 3.1732673267326734e-06,
"loss": 1.64,
"step": 385
},
{
"epoch": 0.23159536809263814,
"grad_norm": 1.609375,
"learning_rate": 3.1707920792079207e-06,
"loss": 1.6077,
"step": 386
},
{
"epoch": 0.23219535609287814,
"grad_norm": 1.7109375,
"learning_rate": 3.1683168316831685e-06,
"loss": 1.6389,
"step": 387
},
{
"epoch": 0.23279534409311814,
"grad_norm": 1.625,
"learning_rate": 3.165841584158416e-06,
"loss": 1.6041,
"step": 388
},
{
"epoch": 0.23339533209335814,
"grad_norm": 1.75,
"learning_rate": 3.163366336633663e-06,
"loss": 1.6911,
"step": 389
},
{
"epoch": 0.23399532009359814,
"grad_norm": 1.78125,
"learning_rate": 3.1608910891089105e-06,
"loss": 1.5314,
"step": 390
},
{
"epoch": 0.23459530809383813,
"grad_norm": 1.765625,
"learning_rate": 3.1584158415841583e-06,
"loss": 1.5704,
"step": 391
},
{
"epoch": 0.23519529609407813,
"grad_norm": 1.6171875,
"learning_rate": 3.1559405940594056e-06,
"loss": 1.7246,
"step": 392
},
{
"epoch": 0.2357952840943181,
"grad_norm": 1.6640625,
"learning_rate": 3.1534653465346534e-06,
"loss": 1.6135,
"step": 393
},
{
"epoch": 0.2363952720945581,
"grad_norm": 1.578125,
"learning_rate": 3.1509900990099007e-06,
"loss": 1.7003,
"step": 394
},
{
"epoch": 0.2369952600947981,
"grad_norm": 1.6953125,
"learning_rate": 3.1485148514851485e-06,
"loss": 1.7987,
"step": 395
},
{
"epoch": 0.2375952480950381,
"grad_norm": 1.8125,
"learning_rate": 3.146039603960396e-06,
"loss": 1.6395,
"step": 396
},
{
"epoch": 0.2381952360952781,
"grad_norm": 1.6484375,
"learning_rate": 3.1435643564356436e-06,
"loss": 1.6497,
"step": 397
},
{
"epoch": 0.2387952240955181,
"grad_norm": 1.8203125,
"learning_rate": 3.141089108910891e-06,
"loss": 1.692,
"step": 398
},
{
"epoch": 0.2393952120957581,
"grad_norm": 1.7421875,
"learning_rate": 3.1386138613861387e-06,
"loss": 1.7406,
"step": 399
},
{
"epoch": 0.2399952000959981,
"grad_norm": 1.7265625,
"learning_rate": 3.136138613861386e-06,
"loss": 1.7191,
"step": 400
},
{
"epoch": 0.2399952000959981,
"eval_loss": 1.7807087898254395,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 66.0367,
"eval_samples_per_second": 151.431,
"eval_steps_per_second": 25.244,
"step": 400
},
{
"epoch": 0.2405951880962381,
"grad_norm": 1.7734375,
"learning_rate": 3.133663366336634e-06,
"loss": 1.6603,
"step": 401
},
{
"epoch": 0.24119517609647806,
"grad_norm": 1.609375,
"learning_rate": 3.1311881188118807e-06,
"loss": 1.6492,
"step": 402
},
{
"epoch": 0.24179516409671806,
"grad_norm": 1.671875,
"learning_rate": 3.1287128712871285e-06,
"loss": 1.7367,
"step": 403
},
{
"epoch": 0.24239515209695806,
"grad_norm": 1.6484375,
"learning_rate": 3.126237623762376e-06,
"loss": 1.7123,
"step": 404
},
{
"epoch": 0.24299514009719805,
"grad_norm": 1.7265625,
"learning_rate": 3.1237623762376236e-06,
"loss": 1.7351,
"step": 405
},
{
"epoch": 0.24359512809743805,
"grad_norm": 1.7109375,
"learning_rate": 3.121287128712871e-06,
"loss": 1.7273,
"step": 406
},
{
"epoch": 0.24419511609767805,
"grad_norm": 1.6953125,
"learning_rate": 3.1188118811881187e-06,
"loss": 1.6324,
"step": 407
},
{
"epoch": 0.24479510409791805,
"grad_norm": 1.90625,
"learning_rate": 3.116336633663366e-06,
"loss": 1.6113,
"step": 408
},
{
"epoch": 0.24539509209815805,
"grad_norm": 1.5078125,
"learning_rate": 3.113861386138614e-06,
"loss": 1.6485,
"step": 409
},
{
"epoch": 0.24599508009839804,
"grad_norm": 1.9921875,
"learning_rate": 3.111386138613861e-06,
"loss": 1.664,
"step": 410
},
{
"epoch": 0.24659506809863802,
"grad_norm": 1.6796875,
"learning_rate": 3.108910891089109e-06,
"loss": 1.7174,
"step": 411
},
{
"epoch": 0.247195056098878,
"grad_norm": 1.5390625,
"learning_rate": 3.1064356435643563e-06,
"loss": 1.6438,
"step": 412
},
{
"epoch": 0.247795044099118,
"grad_norm": 1.5859375,
"learning_rate": 3.103960396039604e-06,
"loss": 1.7208,
"step": 413
},
{
"epoch": 0.248395032099358,
"grad_norm": 1.4765625,
"learning_rate": 3.1014851485148514e-06,
"loss": 1.7257,
"step": 414
},
{
"epoch": 0.248995020099598,
"grad_norm": 1.5703125,
"learning_rate": 3.099009900990099e-06,
"loss": 1.683,
"step": 415
},
{
"epoch": 0.249595008099838,
"grad_norm": 1.6953125,
"learning_rate": 3.096534653465346e-06,
"loss": 1.7018,
"step": 416
},
{
"epoch": 0.250194996100078,
"grad_norm": 1.59375,
"learning_rate": 3.094059405940594e-06,
"loss": 1.6217,
"step": 417
},
{
"epoch": 0.250794984100318,
"grad_norm": 1.703125,
"learning_rate": 3.091584158415841e-06,
"loss": 1.6733,
"step": 418
},
{
"epoch": 0.251394972100558,
"grad_norm": 1.703125,
"learning_rate": 3.089108910891089e-06,
"loss": 1.7294,
"step": 419
},
{
"epoch": 0.25199496010079797,
"grad_norm": 1.625,
"learning_rate": 3.0866336633663363e-06,
"loss": 1.6648,
"step": 420
},
{
"epoch": 0.252594948101038,
"grad_norm": 1.625,
"learning_rate": 3.084158415841584e-06,
"loss": 1.6886,
"step": 421
},
{
"epoch": 0.25319493610127797,
"grad_norm": 1.5078125,
"learning_rate": 3.0816831683168314e-06,
"loss": 1.6579,
"step": 422
},
{
"epoch": 0.253794924101518,
"grad_norm": 1.71875,
"learning_rate": 3.079207920792079e-06,
"loss": 1.5933,
"step": 423
},
{
"epoch": 0.25439491210175796,
"grad_norm": 1.625,
"learning_rate": 3.0767326732673265e-06,
"loss": 1.6736,
"step": 424
},
{
"epoch": 0.25499490010199793,
"grad_norm": 1.6953125,
"learning_rate": 3.0742574257425743e-06,
"loss": 1.6689,
"step": 425
},
{
"epoch": 0.25559488810223796,
"grad_norm": 1.875,
"learning_rate": 3.0717821782178216e-06,
"loss": 1.6232,
"step": 426
},
{
"epoch": 0.25619487610247793,
"grad_norm": 1.609375,
"learning_rate": 3.0693069306930694e-06,
"loss": 1.6268,
"step": 427
},
{
"epoch": 0.25679486410271796,
"grad_norm": 1.6953125,
"learning_rate": 3.0668316831683167e-06,
"loss": 1.6636,
"step": 428
},
{
"epoch": 0.2573948521029579,
"grad_norm": 1.4921875,
"learning_rate": 3.064356435643564e-06,
"loss": 1.6045,
"step": 429
},
{
"epoch": 0.25799484010319795,
"grad_norm": 1.7265625,
"learning_rate": 3.0618811881188114e-06,
"loss": 1.6552,
"step": 430
},
{
"epoch": 0.2585948281034379,
"grad_norm": 1.59375,
"learning_rate": 3.059405940594059e-06,
"loss": 1.7048,
"step": 431
},
{
"epoch": 0.25919481610367795,
"grad_norm": 1.7578125,
"learning_rate": 3.0569306930693065e-06,
"loss": 1.6747,
"step": 432
},
{
"epoch": 0.2597948041039179,
"grad_norm": 1.6328125,
"learning_rate": 3.0544554455445543e-06,
"loss": 1.811,
"step": 433
},
{
"epoch": 0.2603947921041579,
"grad_norm": 1.6484375,
"learning_rate": 3.0519801980198016e-06,
"loss": 1.5845,
"step": 434
},
{
"epoch": 0.2609947801043979,
"grad_norm": 1.6796875,
"learning_rate": 3.0495049504950494e-06,
"loss": 1.6937,
"step": 435
},
{
"epoch": 0.2615947681046379,
"grad_norm": 2.078125,
"learning_rate": 3.0470297029702967e-06,
"loss": 1.7166,
"step": 436
},
{
"epoch": 0.2621947561048779,
"grad_norm": 1.8671875,
"learning_rate": 3.0445544554455445e-06,
"loss": 1.5966,
"step": 437
},
{
"epoch": 0.2627947441051179,
"grad_norm": 1.46875,
"learning_rate": 3.042079207920792e-06,
"loss": 1.5997,
"step": 438
},
{
"epoch": 0.2633947321053579,
"grad_norm": 1.6171875,
"learning_rate": 3.0396039603960396e-06,
"loss": 1.5692,
"step": 439
},
{
"epoch": 0.2639947201055979,
"grad_norm": 1.8125,
"learning_rate": 3.037128712871287e-06,
"loss": 1.6428,
"step": 440
},
{
"epoch": 0.2645947081058379,
"grad_norm": 1.6484375,
"learning_rate": 3.0346534653465347e-06,
"loss": 1.5924,
"step": 441
},
{
"epoch": 0.2651946961060779,
"grad_norm": 1.5625,
"learning_rate": 3.0321782178217817e-06,
"loss": 1.6718,
"step": 442
},
{
"epoch": 0.26579468410631785,
"grad_norm": 1.53125,
"learning_rate": 3.0297029702970294e-06,
"loss": 1.7045,
"step": 443
},
{
"epoch": 0.2663946721065579,
"grad_norm": 1.8046875,
"learning_rate": 3.0272277227722768e-06,
"loss": 1.6391,
"step": 444
},
{
"epoch": 0.26699466010679784,
"grad_norm": 1.53125,
"learning_rate": 3.0247524752475245e-06,
"loss": 1.648,
"step": 445
},
{
"epoch": 0.26759464810703787,
"grad_norm": 1.75,
"learning_rate": 3.022277227722772e-06,
"loss": 1.6945,
"step": 446
},
{
"epoch": 0.26819463610727784,
"grad_norm": 1.921875,
"learning_rate": 3.0198019801980196e-06,
"loss": 1.6156,
"step": 447
},
{
"epoch": 0.26879462410751787,
"grad_norm": 1.6796875,
"learning_rate": 3.017326732673267e-06,
"loss": 1.7582,
"step": 448
},
{
"epoch": 0.26939461210775784,
"grad_norm": 1.6328125,
"learning_rate": 3.0148514851485147e-06,
"loss": 1.6294,
"step": 449
},
{
"epoch": 0.26999460010799786,
"grad_norm": 1.6875,
"learning_rate": 3.012376237623762e-06,
"loss": 1.7376,
"step": 450
},
{
"epoch": 0.26999460010799786,
"eval_loss": 1.7797411680221558,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 68.9129,
"eval_samples_per_second": 145.111,
"eval_steps_per_second": 24.19,
"step": 450
},
{
"epoch": 0.27059458810823783,
"grad_norm": 1.6015625,
"learning_rate": 3.00990099009901e-06,
"loss": 1.6089,
"step": 451
},
{
"epoch": 0.2711945761084778,
"grad_norm": 1.6484375,
"learning_rate": 3.007425742574257e-06,
"loss": 1.657,
"step": 452
},
{
"epoch": 0.27179456410871783,
"grad_norm": 1.5859375,
"learning_rate": 3.004950495049505e-06,
"loss": 1.7169,
"step": 453
},
{
"epoch": 0.2723945521089578,
"grad_norm": 1.6484375,
"learning_rate": 3.0024752475247523e-06,
"loss": 1.5906,
"step": 454
},
{
"epoch": 0.2729945401091978,
"grad_norm": 1.828125,
"learning_rate": 3e-06,
"loss": 1.5775,
"step": 455
},
{
"epoch": 0.2735945281094378,
"grad_norm": 1.6484375,
"learning_rate": 2.9975247524752474e-06,
"loss": 1.7177,
"step": 456
},
{
"epoch": 0.2741945161096778,
"grad_norm": 1.6171875,
"learning_rate": 2.9950495049504948e-06,
"loss": 1.6019,
"step": 457
},
{
"epoch": 0.2747945041099178,
"grad_norm": 1.703125,
"learning_rate": 2.9925742574257425e-06,
"loss": 1.6325,
"step": 458
},
{
"epoch": 0.2753944921101578,
"grad_norm": 1.7109375,
"learning_rate": 2.99009900990099e-06,
"loss": 1.7797,
"step": 459
},
{
"epoch": 0.2759944801103978,
"grad_norm": 1.7890625,
"learning_rate": 2.9876237623762376e-06,
"loss": 1.6406,
"step": 460
},
{
"epoch": 0.2765944681106378,
"grad_norm": 1.71875,
"learning_rate": 2.985148514851485e-06,
"loss": 1.745,
"step": 461
},
{
"epoch": 0.2771944561108778,
"grad_norm": 1.7265625,
"learning_rate": 2.9826732673267327e-06,
"loss": 1.7707,
"step": 462
},
{
"epoch": 0.27779444411111776,
"grad_norm": 1.65625,
"learning_rate": 2.98019801980198e-06,
"loss": 1.7372,
"step": 463
},
{
"epoch": 0.2783944321113578,
"grad_norm": 1.71875,
"learning_rate": 2.977722772277228e-06,
"loss": 1.6024,
"step": 464
},
{
"epoch": 0.27899442011159775,
"grad_norm": 1.6171875,
"learning_rate": 2.975247524752475e-06,
"loss": 1.508,
"step": 465
},
{
"epoch": 0.2795944081118378,
"grad_norm": 1.78125,
"learning_rate": 2.972772277227723e-06,
"loss": 1.8252,
"step": 466
},
{
"epoch": 0.28019439611207775,
"grad_norm": 1.609375,
"learning_rate": 2.9702970297029703e-06,
"loss": 1.5962,
"step": 467
},
{
"epoch": 0.2807943841123178,
"grad_norm": 1.8046875,
"learning_rate": 2.967821782178218e-06,
"loss": 1.6764,
"step": 468
},
{
"epoch": 0.28139437211255774,
"grad_norm": 1.5,
"learning_rate": 2.965346534653465e-06,
"loss": 1.6339,
"step": 469
},
{
"epoch": 0.28199436011279777,
"grad_norm": 1.640625,
"learning_rate": 2.9628712871287128e-06,
"loss": 1.5768,
"step": 470
},
{
"epoch": 0.28259434811303774,
"grad_norm": 1.7265625,
"learning_rate": 2.96039603960396e-06,
"loss": 1.6435,
"step": 471
},
{
"epoch": 0.2831943361132777,
"grad_norm": 1.671875,
"learning_rate": 2.957920792079208e-06,
"loss": 1.6312,
"step": 472
},
{
"epoch": 0.28379432411351774,
"grad_norm": 1.625,
"learning_rate": 2.9554455445544552e-06,
"loss": 1.7725,
"step": 473
},
{
"epoch": 0.2843943121137577,
"grad_norm": 1.6953125,
"learning_rate": 2.952970297029703e-06,
"loss": 1.6755,
"step": 474
},
{
"epoch": 0.28499430011399773,
"grad_norm": 1.78125,
"learning_rate": 2.9504950495049503e-06,
"loss": 1.6577,
"step": 475
},
{
"epoch": 0.2855942881142377,
"grad_norm": 1.6171875,
"learning_rate": 2.948019801980198e-06,
"loss": 1.7085,
"step": 476
},
{
"epoch": 0.28619427611447773,
"grad_norm": 1.59375,
"learning_rate": 2.9455445544554454e-06,
"loss": 1.7308,
"step": 477
},
{
"epoch": 0.2867942641147177,
"grad_norm": 1.6171875,
"learning_rate": 2.943069306930693e-06,
"loss": 1.592,
"step": 478
},
{
"epoch": 0.2873942521149577,
"grad_norm": 1.8203125,
"learning_rate": 2.9405940594059405e-06,
"loss": 1.5693,
"step": 479
},
{
"epoch": 0.2879942401151977,
"grad_norm": 1.6328125,
"learning_rate": 2.9381188118811883e-06,
"loss": 1.6561,
"step": 480
},
{
"epoch": 0.28859422811543767,
"grad_norm": 1.7734375,
"learning_rate": 2.9356435643564357e-06,
"loss": 1.6724,
"step": 481
},
{
"epoch": 0.2891942161156777,
"grad_norm": 1.5546875,
"learning_rate": 2.933168316831683e-06,
"loss": 1.6792,
"step": 482
},
{
"epoch": 0.28979420411591766,
"grad_norm": 1.609375,
"learning_rate": 2.9306930693069303e-06,
"loss": 1.5727,
"step": 483
},
{
"epoch": 0.2903941921161577,
"grad_norm": 1.7265625,
"learning_rate": 2.928217821782178e-06,
"loss": 1.6944,
"step": 484
},
{
"epoch": 0.29099418011639766,
"grad_norm": 1.7265625,
"learning_rate": 2.9257425742574254e-06,
"loss": 1.6274,
"step": 485
},
{
"epoch": 0.2915941681166377,
"grad_norm": 1.59375,
"learning_rate": 2.9232673267326732e-06,
"loss": 1.7017,
"step": 486
},
{
"epoch": 0.29219415611687766,
"grad_norm": 1.671875,
"learning_rate": 2.9207920792079206e-06,
"loss": 1.6242,
"step": 487
},
{
"epoch": 0.2927941441171177,
"grad_norm": 1.8125,
"learning_rate": 2.9183168316831683e-06,
"loss": 1.5515,
"step": 488
},
{
"epoch": 0.29339413211735765,
"grad_norm": 1.546875,
"learning_rate": 2.9158415841584157e-06,
"loss": 1.6318,
"step": 489
},
{
"epoch": 0.2939941201175976,
"grad_norm": 1.734375,
"learning_rate": 2.9133663366336634e-06,
"loss": 1.6594,
"step": 490
},
{
"epoch": 0.29459410811783765,
"grad_norm": 1.5703125,
"learning_rate": 2.9108910891089108e-06,
"loss": 1.6918,
"step": 491
},
{
"epoch": 0.2951940961180776,
"grad_norm": 1.53125,
"learning_rate": 2.9084158415841585e-06,
"loss": 1.7203,
"step": 492
},
{
"epoch": 0.29579408411831765,
"grad_norm": 1.703125,
"learning_rate": 2.905940594059406e-06,
"loss": 1.6643,
"step": 493
},
{
"epoch": 0.2963940721185576,
"grad_norm": 1.7890625,
"learning_rate": 2.9034653465346537e-06,
"loss": 1.5509,
"step": 494
},
{
"epoch": 0.29699406011879764,
"grad_norm": 1.7578125,
"learning_rate": 2.9009900990099006e-06,
"loss": 1.7432,
"step": 495
},
{
"epoch": 0.2975940481190376,
"grad_norm": 1.640625,
"learning_rate": 2.8985148514851483e-06,
"loss": 1.6119,
"step": 496
},
{
"epoch": 0.29819403611927764,
"grad_norm": 1.6328125,
"learning_rate": 2.8960396039603957e-06,
"loss": 1.7538,
"step": 497
},
{
"epoch": 0.2987940241195176,
"grad_norm": 1.59375,
"learning_rate": 2.8935643564356434e-06,
"loss": 1.6927,
"step": 498
},
{
"epoch": 0.2993940121197576,
"grad_norm": 1.7265625,
"learning_rate": 2.891089108910891e-06,
"loss": 1.8514,
"step": 499
},
{
"epoch": 0.2999940001199976,
"grad_norm": 1.4921875,
"learning_rate": 2.8886138613861386e-06,
"loss": 1.6127,
"step": 500
},
{
"epoch": 0.2999940001199976,
"eval_loss": 1.7789958715438843,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 66.0169,
"eval_samples_per_second": 151.476,
"eval_steps_per_second": 25.251,
"step": 500
},
{
"epoch": 0.3005939881202376,
"grad_norm": 1.703125,
"learning_rate": 2.886138613861386e-06,
"loss": 1.7337,
"step": 501
},
{
"epoch": 0.3011939761204776,
"grad_norm": 1.59375,
"learning_rate": 2.8836633663366337e-06,
"loss": 1.684,
"step": 502
},
{
"epoch": 0.3017939641207176,
"grad_norm": 1.8203125,
"learning_rate": 2.881188118811881e-06,
"loss": 1.6062,
"step": 503
},
{
"epoch": 0.3023939521209576,
"grad_norm": 1.6015625,
"learning_rate": 2.8787128712871288e-06,
"loss": 1.6014,
"step": 504
},
{
"epoch": 0.30299394012119757,
"grad_norm": 1.671875,
"learning_rate": 2.876237623762376e-06,
"loss": 1.5647,
"step": 505
},
{
"epoch": 0.3035939281214376,
"grad_norm": 1.8671875,
"learning_rate": 2.873762376237624e-06,
"loss": 1.7988,
"step": 506
},
{
"epoch": 0.30419391612167757,
"grad_norm": 1.5859375,
"learning_rate": 2.8712871287128712e-06,
"loss": 1.6926,
"step": 507
},
{
"epoch": 0.30479390412191754,
"grad_norm": 1.8515625,
"learning_rate": 2.868811881188119e-06,
"loss": 1.5877,
"step": 508
},
{
"epoch": 0.30539389212215756,
"grad_norm": 1.8203125,
"learning_rate": 2.866336633663366e-06,
"loss": 1.6823,
"step": 509
},
{
"epoch": 0.30599388012239753,
"grad_norm": 1.578125,
"learning_rate": 2.8638613861386137e-06,
"loss": 1.5566,
"step": 510
},
{
"epoch": 0.30659386812263756,
"grad_norm": 1.7578125,
"learning_rate": 2.861386138613861e-06,
"loss": 1.6614,
"step": 511
},
{
"epoch": 0.30719385612287753,
"grad_norm": 1.65625,
"learning_rate": 2.858910891089109e-06,
"loss": 1.6932,
"step": 512
},
{
"epoch": 0.30779384412311755,
"grad_norm": 1.6640625,
"learning_rate": 2.856435643564356e-06,
"loss": 1.7368,
"step": 513
},
{
"epoch": 0.3083938321233575,
"grad_norm": 1.6328125,
"learning_rate": 2.853960396039604e-06,
"loss": 1.6836,
"step": 514
},
{
"epoch": 0.30899382012359755,
"grad_norm": 1.6953125,
"learning_rate": 2.8514851485148512e-06,
"loss": 1.5781,
"step": 515
},
{
"epoch": 0.3095938081238375,
"grad_norm": 1.75,
"learning_rate": 2.849009900990099e-06,
"loss": 1.6617,
"step": 516
},
{
"epoch": 0.3101937961240775,
"grad_norm": 1.8203125,
"learning_rate": 2.8465346534653464e-06,
"loss": 1.6568,
"step": 517
},
{
"epoch": 0.3107937841243175,
"grad_norm": 1.671875,
"learning_rate": 2.844059405940594e-06,
"loss": 1.6805,
"step": 518
},
{
"epoch": 0.3113937721245575,
"grad_norm": 1.8046875,
"learning_rate": 2.8415841584158415e-06,
"loss": 1.6114,
"step": 519
},
{
"epoch": 0.3119937601247975,
"grad_norm": 1.71875,
"learning_rate": 2.8391089108910892e-06,
"loss": 1.6399,
"step": 520
},
{
"epoch": 0.3125937481250375,
"grad_norm": 1.6640625,
"learning_rate": 2.8366336633663366e-06,
"loss": 1.6753,
"step": 521
},
{
"epoch": 0.3131937361252775,
"grad_norm": 1.6640625,
"learning_rate": 2.834158415841584e-06,
"loss": 1.6818,
"step": 522
},
{
"epoch": 0.3137937241255175,
"grad_norm": 1.7421875,
"learning_rate": 2.8316831683168313e-06,
"loss": 1.6999,
"step": 523
},
{
"epoch": 0.3143937121257575,
"grad_norm": 1.6484375,
"learning_rate": 2.829207920792079e-06,
"loss": 1.5612,
"step": 524
},
{
"epoch": 0.3149937001259975,
"grad_norm": 1.8671875,
"learning_rate": 2.8267326732673264e-06,
"loss": 1.7166,
"step": 525
},
{
"epoch": 0.31559368812623745,
"grad_norm": 1.78125,
"learning_rate": 2.824257425742574e-06,
"loss": 1.6112,
"step": 526
},
{
"epoch": 0.3161936761264775,
"grad_norm": 1.5625,
"learning_rate": 2.8217821782178215e-06,
"loss": 1.5862,
"step": 527
},
{
"epoch": 0.31679366412671744,
"grad_norm": 1.5390625,
"learning_rate": 2.8193069306930692e-06,
"loss": 1.6285,
"step": 528
},
{
"epoch": 0.31739365212695747,
"grad_norm": 1.5234375,
"learning_rate": 2.8168316831683166e-06,
"loss": 1.6099,
"step": 529
},
{
"epoch": 0.31799364012719744,
"grad_norm": 1.7109375,
"learning_rate": 2.8143564356435644e-06,
"loss": 1.6994,
"step": 530
},
{
"epoch": 0.31859362812743747,
"grad_norm": 1.703125,
"learning_rate": 2.8118811881188117e-06,
"loss": 1.5936,
"step": 531
},
{
"epoch": 0.31919361612767744,
"grad_norm": 1.53125,
"learning_rate": 2.8094059405940595e-06,
"loss": 1.6696,
"step": 532
},
{
"epoch": 0.31979360412791746,
"grad_norm": 1.765625,
"learning_rate": 2.806930693069307e-06,
"loss": 1.5878,
"step": 533
},
{
"epoch": 0.32039359212815743,
"grad_norm": 1.765625,
"learning_rate": 2.8044554455445546e-06,
"loss": 1.6611,
"step": 534
},
{
"epoch": 0.3209935801283974,
"grad_norm": 1.71875,
"learning_rate": 2.8019801980198015e-06,
"loss": 1.7178,
"step": 535
},
{
"epoch": 0.32159356812863743,
"grad_norm": 1.734375,
"learning_rate": 2.7995049504950493e-06,
"loss": 1.6074,
"step": 536
},
{
"epoch": 0.3221935561288774,
"grad_norm": 1.6484375,
"learning_rate": 2.7970297029702966e-06,
"loss": 1.6906,
"step": 537
},
{
"epoch": 0.3227935441291174,
"grad_norm": 1.6015625,
"learning_rate": 2.7945544554455444e-06,
"loss": 1.6133,
"step": 538
},
{
"epoch": 0.3233935321293574,
"grad_norm": 1.5390625,
"learning_rate": 2.7920792079207917e-06,
"loss": 1.6716,
"step": 539
},
{
"epoch": 0.3239935201295974,
"grad_norm": 1.78125,
"learning_rate": 2.7896039603960395e-06,
"loss": 1.6822,
"step": 540
},
{
"epoch": 0.3245935081298374,
"grad_norm": 1.671875,
"learning_rate": 2.787128712871287e-06,
"loss": 1.6233,
"step": 541
},
{
"epoch": 0.3251934961300774,
"grad_norm": 1.578125,
"learning_rate": 2.7846534653465346e-06,
"loss": 1.6941,
"step": 542
},
{
"epoch": 0.3257934841303174,
"grad_norm": 1.59375,
"learning_rate": 2.782178217821782e-06,
"loss": 1.7407,
"step": 543
},
{
"epoch": 0.3263934721305574,
"grad_norm": 1.7578125,
"learning_rate": 2.7797029702970297e-06,
"loss": 1.6666,
"step": 544
},
{
"epoch": 0.3269934601307974,
"grad_norm": 1.6875,
"learning_rate": 2.777227722772277e-06,
"loss": 1.5721,
"step": 545
},
{
"epoch": 0.32759344813103736,
"grad_norm": 1.6328125,
"learning_rate": 2.774752475247525e-06,
"loss": 1.6345,
"step": 546
},
{
"epoch": 0.3281934361312774,
"grad_norm": 1.578125,
"learning_rate": 2.772277227722772e-06,
"loss": 1.7061,
"step": 547
},
{
"epoch": 0.32879342413151735,
"grad_norm": 1.921875,
"learning_rate": 2.7698019801980195e-06,
"loss": 1.72,
"step": 548
},
{
"epoch": 0.3293934121317574,
"grad_norm": 1.4765625,
"learning_rate": 2.767326732673267e-06,
"loss": 1.5754,
"step": 549
},
{
"epoch": 0.32999340013199735,
"grad_norm": 1.796875,
"learning_rate": 2.7648514851485146e-06,
"loss": 1.6856,
"step": 550
},
{
"epoch": 0.32999340013199735,
"eval_loss": 1.7785999774932861,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 65.9577,
"eval_samples_per_second": 151.612,
"eval_steps_per_second": 25.274,
"step": 550
},
{
"epoch": 0.3305933881322374,
"grad_norm": 1.75,
"learning_rate": 2.762376237623762e-06,
"loss": 1.6151,
"step": 551
},
{
"epoch": 0.33119337613247735,
"grad_norm": 1.59375,
"learning_rate": 2.7599009900990097e-06,
"loss": 1.6493,
"step": 552
},
{
"epoch": 0.33179336413271737,
"grad_norm": 1.8203125,
"learning_rate": 2.757425742574257e-06,
"loss": 1.7389,
"step": 553
},
{
"epoch": 0.33239335213295734,
"grad_norm": 1.7109375,
"learning_rate": 2.754950495049505e-06,
"loss": 1.6765,
"step": 554
},
{
"epoch": 0.3329933401331973,
"grad_norm": 1.625,
"learning_rate": 2.752475247524752e-06,
"loss": 1.6237,
"step": 555
},
{
"epoch": 0.33359332813343734,
"grad_norm": 1.5,
"learning_rate": 2.75e-06,
"loss": 1.7249,
"step": 556
},
{
"epoch": 0.3341933161336773,
"grad_norm": 1.71875,
"learning_rate": 2.7475247524752477e-06,
"loss": 1.6253,
"step": 557
},
{
"epoch": 0.33479330413391734,
"grad_norm": 1.671875,
"learning_rate": 2.745049504950495e-06,
"loss": 1.6199,
"step": 558
},
{
"epoch": 0.3353932921341573,
"grad_norm": 1.703125,
"learning_rate": 2.742574257425743e-06,
"loss": 1.6945,
"step": 559
},
{
"epoch": 0.33599328013439733,
"grad_norm": 1.5859375,
"learning_rate": 2.74009900990099e-06,
"loss": 1.6295,
"step": 560
},
{
"epoch": 0.3365932681346373,
"grad_norm": 1.5625,
"learning_rate": 2.737623762376238e-06,
"loss": 1.617,
"step": 561
},
{
"epoch": 0.33719325613487733,
"grad_norm": 1.546875,
"learning_rate": 2.735148514851485e-06,
"loss": 1.7085,
"step": 562
},
{
"epoch": 0.3377932441351173,
"grad_norm": 1.765625,
"learning_rate": 2.7326732673267326e-06,
"loss": 1.7428,
"step": 563
},
{
"epoch": 0.33839323213535727,
"grad_norm": 1.5703125,
"learning_rate": 2.73019801980198e-06,
"loss": 1.5509,
"step": 564
},
{
"epoch": 0.3389932201355973,
"grad_norm": 1.65625,
"learning_rate": 2.7277227722772277e-06,
"loss": 1.6567,
"step": 565
},
{
"epoch": 0.33959320813583727,
"grad_norm": 1.515625,
"learning_rate": 2.725247524752475e-06,
"loss": 1.5832,
"step": 566
},
{
"epoch": 0.3401931961360773,
"grad_norm": 1.7109375,
"learning_rate": 2.722772277227723e-06,
"loss": 1.6637,
"step": 567
},
{
"epoch": 0.34079318413631726,
"grad_norm": 1.5390625,
"learning_rate": 2.72029702970297e-06,
"loss": 1.6742,
"step": 568
},
{
"epoch": 0.3413931721365573,
"grad_norm": 1.4375,
"learning_rate": 2.717821782178218e-06,
"loss": 1.6378,
"step": 569
},
{
"epoch": 0.34199316013679726,
"grad_norm": 1.5234375,
"learning_rate": 2.7153465346534653e-06,
"loss": 1.6595,
"step": 570
},
{
"epoch": 0.3425931481370373,
"grad_norm": 1.765625,
"learning_rate": 2.712871287128713e-06,
"loss": 1.7167,
"step": 571
},
{
"epoch": 0.34319313613727725,
"grad_norm": 1.8046875,
"learning_rate": 2.7103960396039604e-06,
"loss": 1.7184,
"step": 572
},
{
"epoch": 0.3437931241375172,
"grad_norm": 1.6796875,
"learning_rate": 2.707920792079208e-06,
"loss": 1.6598,
"step": 573
},
{
"epoch": 0.34439311213775725,
"grad_norm": 1.578125,
"learning_rate": 2.7054455445544555e-06,
"loss": 1.7302,
"step": 574
},
{
"epoch": 0.3449931001379972,
"grad_norm": 2.0,
"learning_rate": 2.702970297029703e-06,
"loss": 1.6286,
"step": 575
},
{
"epoch": 0.34559308813823725,
"grad_norm": 1.6171875,
"learning_rate": 2.70049504950495e-06,
"loss": 1.7379,
"step": 576
},
{
"epoch": 0.3461930761384772,
"grad_norm": 1.8203125,
"learning_rate": 2.698019801980198e-06,
"loss": 1.7016,
"step": 577
},
{
"epoch": 0.34679306413871724,
"grad_norm": 1.890625,
"learning_rate": 2.6955445544554453e-06,
"loss": 1.6231,
"step": 578
},
{
"epoch": 0.3473930521389572,
"grad_norm": 1.9140625,
"learning_rate": 2.693069306930693e-06,
"loss": 1.7478,
"step": 579
},
{
"epoch": 0.34799304013919724,
"grad_norm": 1.6484375,
"learning_rate": 2.6905940594059404e-06,
"loss": 1.6871,
"step": 580
},
{
"epoch": 0.3485930281394372,
"grad_norm": 1.6015625,
"learning_rate": 2.688118811881188e-06,
"loss": 1.6774,
"step": 581
},
{
"epoch": 0.3491930161396772,
"grad_norm": 1.765625,
"learning_rate": 2.6856435643564355e-06,
"loss": 1.6801,
"step": 582
},
{
"epoch": 0.3497930041399172,
"grad_norm": 1.703125,
"learning_rate": 2.6831683168316833e-06,
"loss": 1.6734,
"step": 583
},
{
"epoch": 0.3503929921401572,
"grad_norm": 1.59375,
"learning_rate": 2.6806930693069306e-06,
"loss": 1.6161,
"step": 584
},
{
"epoch": 0.3509929801403972,
"grad_norm": 1.84375,
"learning_rate": 2.6782178217821784e-06,
"loss": 1.621,
"step": 585
},
{
"epoch": 0.3515929681406372,
"grad_norm": 1.71875,
"learning_rate": 2.6757425742574257e-06,
"loss": 1.6603,
"step": 586
},
{
"epoch": 0.3521929561408772,
"grad_norm": 1.9296875,
"learning_rate": 2.6732673267326735e-06,
"loss": 1.7578,
"step": 587
},
{
"epoch": 0.35279294414111717,
"grad_norm": 1.75,
"learning_rate": 2.6707920792079204e-06,
"loss": 1.7356,
"step": 588
},
{
"epoch": 0.3533929321413572,
"grad_norm": 1.7421875,
"learning_rate": 2.668316831683168e-06,
"loss": 1.7648,
"step": 589
},
{
"epoch": 0.35399292014159717,
"grad_norm": 1.5703125,
"learning_rate": 2.6658415841584155e-06,
"loss": 1.6623,
"step": 590
},
{
"epoch": 0.35459290814183714,
"grad_norm": 1.8515625,
"learning_rate": 2.6633663366336633e-06,
"loss": 1.7709,
"step": 591
},
{
"epoch": 0.35519289614207716,
"grad_norm": 1.7890625,
"learning_rate": 2.6608910891089106e-06,
"loss": 1.6477,
"step": 592
},
{
"epoch": 0.35579288414231713,
"grad_norm": 1.5546875,
"learning_rate": 2.6584158415841584e-06,
"loss": 1.5843,
"step": 593
},
{
"epoch": 0.35639287214255716,
"grad_norm": 1.5703125,
"learning_rate": 2.6559405940594057e-06,
"loss": 1.6412,
"step": 594
},
{
"epoch": 0.35699286014279713,
"grad_norm": 1.6328125,
"learning_rate": 2.6534653465346535e-06,
"loss": 1.646,
"step": 595
},
{
"epoch": 0.35759284814303716,
"grad_norm": 1.65625,
"learning_rate": 2.650990099009901e-06,
"loss": 1.753,
"step": 596
},
{
"epoch": 0.3581928361432771,
"grad_norm": 1.609375,
"learning_rate": 2.6485148514851486e-06,
"loss": 1.6465,
"step": 597
},
{
"epoch": 0.35879282414351715,
"grad_norm": 1.6796875,
"learning_rate": 2.646039603960396e-06,
"loss": 1.6874,
"step": 598
},
{
"epoch": 0.3593928121437571,
"grad_norm": 1.6875,
"learning_rate": 2.6435643564356437e-06,
"loss": 1.6842,
"step": 599
},
{
"epoch": 0.3599928001439971,
"grad_norm": 1.53125,
"learning_rate": 2.641089108910891e-06,
"loss": 1.692,
"step": 600
},
{
"epoch": 0.3599928001439971,
"eval_loss": 1.7782986164093018,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 68.6919,
"eval_samples_per_second": 145.577,
"eval_steps_per_second": 24.268,
"step": 600
},
{
"epoch": 0.3605927881442371,
"grad_norm": 1.703125,
"learning_rate": 2.6386138613861384e-06,
"loss": 1.7203,
"step": 601
},
{
"epoch": 0.3611927761444771,
"grad_norm": 1.6953125,
"learning_rate": 2.6361386138613858e-06,
"loss": 1.7563,
"step": 602
},
{
"epoch": 0.3617927641447171,
"grad_norm": 1.625,
"learning_rate": 2.6336633663366335e-06,
"loss": 1.5713,
"step": 603
},
{
"epoch": 0.3623927521449571,
"grad_norm": 1.671875,
"learning_rate": 2.631188118811881e-06,
"loss": 1.7128,
"step": 604
},
{
"epoch": 0.3629927401451971,
"grad_norm": 1.7109375,
"learning_rate": 2.6287128712871286e-06,
"loss": 1.7093,
"step": 605
},
{
"epoch": 0.3635927281454371,
"grad_norm": 1.9765625,
"learning_rate": 2.626237623762376e-06,
"loss": 1.7053,
"step": 606
},
{
"epoch": 0.3641927161456771,
"grad_norm": 1.5703125,
"learning_rate": 2.6237623762376237e-06,
"loss": 1.6861,
"step": 607
},
{
"epoch": 0.3647927041459171,
"grad_norm": 1.515625,
"learning_rate": 2.621287128712871e-06,
"loss": 1.5996,
"step": 608
},
{
"epoch": 0.36539269214615705,
"grad_norm": 1.65625,
"learning_rate": 2.618811881188119e-06,
"loss": 1.6401,
"step": 609
},
{
"epoch": 0.3659926801463971,
"grad_norm": 1.671875,
"learning_rate": 2.616336633663366e-06,
"loss": 1.6489,
"step": 610
},
{
"epoch": 0.36659266814663705,
"grad_norm": 1.59375,
"learning_rate": 2.613861386138614e-06,
"loss": 1.7161,
"step": 611
},
{
"epoch": 0.36719265614687707,
"grad_norm": 1.9453125,
"learning_rate": 2.6113861386138613e-06,
"loss": 1.5981,
"step": 612
},
{
"epoch": 0.36779264414711704,
"grad_norm": 1.7265625,
"learning_rate": 2.608910891089109e-06,
"loss": 1.7098,
"step": 613
},
{
"epoch": 0.36839263214735707,
"grad_norm": 1.796875,
"learning_rate": 2.606435643564356e-06,
"loss": 1.6385,
"step": 614
},
{
"epoch": 0.36899262014759704,
"grad_norm": 1.6328125,
"learning_rate": 2.6039603960396038e-06,
"loss": 1.7776,
"step": 615
},
{
"epoch": 0.36959260814783707,
"grad_norm": 1.765625,
"learning_rate": 2.601485148514851e-06,
"loss": 1.6092,
"step": 616
},
{
"epoch": 0.37019259614807704,
"grad_norm": 1.6015625,
"learning_rate": 2.599009900990099e-06,
"loss": 1.7118,
"step": 617
},
{
"epoch": 0.370792584148317,
"grad_norm": 1.796875,
"learning_rate": 2.596534653465346e-06,
"loss": 1.5554,
"step": 618
},
{
"epoch": 0.37139257214855703,
"grad_norm": 1.671875,
"learning_rate": 2.594059405940594e-06,
"loss": 1.7137,
"step": 619
},
{
"epoch": 0.371992560148797,
"grad_norm": 2.265625,
"learning_rate": 2.5915841584158413e-06,
"loss": 1.568,
"step": 620
},
{
"epoch": 0.37259254814903703,
"grad_norm": 1.6796875,
"learning_rate": 2.589108910891089e-06,
"loss": 1.6098,
"step": 621
},
{
"epoch": 0.373192536149277,
"grad_norm": 1.7265625,
"learning_rate": 2.5866336633663364e-06,
"loss": 1.7429,
"step": 622
},
{
"epoch": 0.373792524149517,
"grad_norm": 1.515625,
"learning_rate": 2.584158415841584e-06,
"loss": 1.5776,
"step": 623
},
{
"epoch": 0.374392512149757,
"grad_norm": 1.6796875,
"learning_rate": 2.5816831683168315e-06,
"loss": 1.7225,
"step": 624
},
{
"epoch": 0.374992500149997,
"grad_norm": 1.671875,
"learning_rate": 2.5792079207920793e-06,
"loss": 1.636,
"step": 625
},
{
"epoch": 0.375592488150237,
"grad_norm": 1.71875,
"learning_rate": 2.5767326732673266e-06,
"loss": 1.6907,
"step": 626
},
{
"epoch": 0.376192476150477,
"grad_norm": 1.515625,
"learning_rate": 2.5742574257425744e-06,
"loss": 1.6,
"step": 627
},
{
"epoch": 0.376792464150717,
"grad_norm": 1.703125,
"learning_rate": 2.5717821782178213e-06,
"loss": 1.7379,
"step": 628
},
{
"epoch": 0.37739245215095696,
"grad_norm": 1.8203125,
"learning_rate": 2.569306930693069e-06,
"loss": 1.7201,
"step": 629
},
{
"epoch": 0.377992440151197,
"grad_norm": 1.75,
"learning_rate": 2.5668316831683164e-06,
"loss": 1.5961,
"step": 630
},
{
"epoch": 0.37859242815143695,
"grad_norm": 1.609375,
"learning_rate": 2.564356435643564e-06,
"loss": 1.6398,
"step": 631
},
{
"epoch": 0.379192416151677,
"grad_norm": 1.84375,
"learning_rate": 2.5618811881188115e-06,
"loss": 1.7187,
"step": 632
},
{
"epoch": 0.37979240415191695,
"grad_norm": 1.7578125,
"learning_rate": 2.5594059405940593e-06,
"loss": 1.6705,
"step": 633
},
{
"epoch": 0.380392392152157,
"grad_norm": 1.5703125,
"learning_rate": 2.5569306930693067e-06,
"loss": 1.6784,
"step": 634
},
{
"epoch": 0.38099238015239695,
"grad_norm": 1.71875,
"learning_rate": 2.5544554455445544e-06,
"loss": 1.6956,
"step": 635
},
{
"epoch": 0.381592368152637,
"grad_norm": 1.5625,
"learning_rate": 2.5519801980198018e-06,
"loss": 1.6573,
"step": 636
},
{
"epoch": 0.38219235615287694,
"grad_norm": 1.53125,
"learning_rate": 2.5495049504950495e-06,
"loss": 1.6514,
"step": 637
},
{
"epoch": 0.3827923441531169,
"grad_norm": 1.6328125,
"learning_rate": 2.547029702970297e-06,
"loss": 1.5906,
"step": 638
},
{
"epoch": 0.38339233215335694,
"grad_norm": 1.6875,
"learning_rate": 2.5445544554455446e-06,
"loss": 1.6045,
"step": 639
},
{
"epoch": 0.3839923201535969,
"grad_norm": 1.8125,
"learning_rate": 2.542079207920792e-06,
"loss": 1.6549,
"step": 640
},
{
"epoch": 0.38459230815383694,
"grad_norm": 1.59375,
"learning_rate": 2.5396039603960393e-06,
"loss": 1.6548,
"step": 641
},
{
"epoch": 0.3851922961540769,
"grad_norm": 1.75,
"learning_rate": 2.5371287128712867e-06,
"loss": 1.6412,
"step": 642
},
{
"epoch": 0.38579228415431693,
"grad_norm": 1.59375,
"learning_rate": 2.5346534653465344e-06,
"loss": 1.6326,
"step": 643
},
{
"epoch": 0.3863922721545569,
"grad_norm": 1.8125,
"learning_rate": 2.5321782178217818e-06,
"loss": 1.5856,
"step": 644
},
{
"epoch": 0.38699226015479693,
"grad_norm": 1.8203125,
"learning_rate": 2.5297029702970295e-06,
"loss": 1.548,
"step": 645
},
{
"epoch": 0.3875922481550369,
"grad_norm": 1.546875,
"learning_rate": 2.527227722772277e-06,
"loss": 1.6333,
"step": 646
},
{
"epoch": 0.38819223615527687,
"grad_norm": 1.796875,
"learning_rate": 2.5247524752475247e-06,
"loss": 1.6828,
"step": 647
},
{
"epoch": 0.3887922241555169,
"grad_norm": 1.6328125,
"learning_rate": 2.522277227722772e-06,
"loss": 1.6181,
"step": 648
},
{
"epoch": 0.38939221215575687,
"grad_norm": 1.625,
"learning_rate": 2.5198019801980198e-06,
"loss": 1.6848,
"step": 649
},
{
"epoch": 0.3899922001559969,
"grad_norm": 1.734375,
"learning_rate": 2.517326732673267e-06,
"loss": 1.7051,
"step": 650
},
{
"epoch": 0.3899922001559969,
"eval_loss": 1.778051733970642,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 65.9746,
"eval_samples_per_second": 151.573,
"eval_steps_per_second": 25.267,
"step": 650
},
{
"epoch": 0.39059218815623686,
"grad_norm": 1.5,
"learning_rate": 2.514851485148515e-06,
"loss": 1.6987,
"step": 651
},
{
"epoch": 0.3911921761564769,
"grad_norm": 1.84375,
"learning_rate": 2.5123762376237622e-06,
"loss": 1.6951,
"step": 652
},
{
"epoch": 0.39179216415671686,
"grad_norm": 1.6015625,
"learning_rate": 2.50990099009901e-06,
"loss": 1.6218,
"step": 653
},
{
"epoch": 0.3923921521569569,
"grad_norm": 1.6015625,
"learning_rate": 2.507425742574257e-06,
"loss": 1.7695,
"step": 654
},
{
"epoch": 0.39299214015719686,
"grad_norm": 1.5625,
"learning_rate": 2.5049504950495047e-06,
"loss": 1.6856,
"step": 655
},
{
"epoch": 0.3935921281574368,
"grad_norm": 1.6640625,
"learning_rate": 2.502475247524752e-06,
"loss": 1.658,
"step": 656
},
{
"epoch": 0.39419211615767685,
"grad_norm": 1.8203125,
"learning_rate": 2.4999999999999998e-06,
"loss": 1.6642,
"step": 657
},
{
"epoch": 0.3947921041579168,
"grad_norm": 1.890625,
"learning_rate": 2.4975247524752475e-06,
"loss": 1.6252,
"step": 658
},
{
"epoch": 0.39539209215815685,
"grad_norm": 1.6484375,
"learning_rate": 2.495049504950495e-06,
"loss": 1.6601,
"step": 659
},
{
"epoch": 0.3959920801583968,
"grad_norm": 1.671875,
"learning_rate": 2.4925742574257427e-06,
"loss": 1.6114,
"step": 660
},
{
"epoch": 0.39659206815863685,
"grad_norm": 1.515625,
"learning_rate": 2.49009900990099e-06,
"loss": 1.6902,
"step": 661
},
{
"epoch": 0.3971920561588768,
"grad_norm": 1.6015625,
"learning_rate": 2.4876237623762378e-06,
"loss": 1.5903,
"step": 662
},
{
"epoch": 0.39779204415911684,
"grad_norm": 1.8359375,
"learning_rate": 2.485148514851485e-06,
"loss": 1.7081,
"step": 663
},
{
"epoch": 0.3983920321593568,
"grad_norm": 1.765625,
"learning_rate": 2.482673267326733e-06,
"loss": 1.7127,
"step": 664
},
{
"epoch": 0.3989920201595968,
"grad_norm": 1.78125,
"learning_rate": 2.4801980198019802e-06,
"loss": 1.538,
"step": 665
},
{
"epoch": 0.3995920081598368,
"grad_norm": 1.609375,
"learning_rate": 2.477722772277228e-06,
"loss": 1.7153,
"step": 666
},
{
"epoch": 0.4001919961600768,
"grad_norm": 1.5703125,
"learning_rate": 2.475247524752475e-06,
"loss": 1.5796,
"step": 667
},
{
"epoch": 0.4007919841603168,
"grad_norm": 1.796875,
"learning_rate": 2.4727722772277227e-06,
"loss": 1.7261,
"step": 668
},
{
"epoch": 0.4013919721605568,
"grad_norm": 1.75,
"learning_rate": 2.47029702970297e-06,
"loss": 1.6201,
"step": 669
},
{
"epoch": 0.4019919601607968,
"grad_norm": 1.78125,
"learning_rate": 2.4678217821782178e-06,
"loss": 1.5838,
"step": 670
},
{
"epoch": 0.4025919481610368,
"grad_norm": 1.828125,
"learning_rate": 2.465346534653465e-06,
"loss": 1.675,
"step": 671
},
{
"epoch": 0.4031919361612768,
"grad_norm": 1.625,
"learning_rate": 2.462871287128713e-06,
"loss": 1.5512,
"step": 672
},
{
"epoch": 0.40379192416151677,
"grad_norm": 1.625,
"learning_rate": 2.4603960396039602e-06,
"loss": 1.7064,
"step": 673
},
{
"epoch": 0.40439191216175674,
"grad_norm": 1.7265625,
"learning_rate": 2.457920792079208e-06,
"loss": 1.6955,
"step": 674
},
{
"epoch": 0.40499190016199677,
"grad_norm": 1.7265625,
"learning_rate": 2.4554455445544553e-06,
"loss": 1.5868,
"step": 675
},
{
"epoch": 0.40559188816223674,
"grad_norm": 1.84375,
"learning_rate": 2.452970297029703e-06,
"loss": 1.6344,
"step": 676
},
{
"epoch": 0.40619187616247676,
"grad_norm": 1.7734375,
"learning_rate": 2.4504950495049505e-06,
"loss": 1.7793,
"step": 677
},
{
"epoch": 0.40679186416271673,
"grad_norm": 1.6328125,
"learning_rate": 2.4480198019801982e-06,
"loss": 1.6707,
"step": 678
},
{
"epoch": 0.40739185216295676,
"grad_norm": 1.453125,
"learning_rate": 2.4455445544554456e-06,
"loss": 1.6521,
"step": 679
},
{
"epoch": 0.40799184016319673,
"grad_norm": 1.6796875,
"learning_rate": 2.4430693069306933e-06,
"loss": 1.6611,
"step": 680
},
{
"epoch": 0.40859182816343675,
"grad_norm": 1.625,
"learning_rate": 2.4405940594059402e-06,
"loss": 1.7121,
"step": 681
},
{
"epoch": 0.4091918161636767,
"grad_norm": 1.6796875,
"learning_rate": 2.438118811881188e-06,
"loss": 1.6899,
"step": 682
},
{
"epoch": 0.4097918041639167,
"grad_norm": 1.609375,
"learning_rate": 2.4356435643564354e-06,
"loss": 1.5913,
"step": 683
},
{
"epoch": 0.4103917921641567,
"grad_norm": 1.59375,
"learning_rate": 2.433168316831683e-06,
"loss": 1.6691,
"step": 684
},
{
"epoch": 0.4109917801643967,
"grad_norm": 1.8359375,
"learning_rate": 2.4306930693069305e-06,
"loss": 1.5859,
"step": 685
},
{
"epoch": 0.4115917681646367,
"grad_norm": 1.703125,
"learning_rate": 2.4282178217821782e-06,
"loss": 1.6688,
"step": 686
},
{
"epoch": 0.4121917561648767,
"grad_norm": 1.6953125,
"learning_rate": 2.4257425742574256e-06,
"loss": 1.6196,
"step": 687
},
{
"epoch": 0.4127917441651167,
"grad_norm": 1.6171875,
"learning_rate": 2.4232673267326733e-06,
"loss": 1.6959,
"step": 688
},
{
"epoch": 0.4133917321653567,
"grad_norm": 1.671875,
"learning_rate": 2.4207920792079207e-06,
"loss": 1.6958,
"step": 689
},
{
"epoch": 0.4139917201655967,
"grad_norm": 1.6875,
"learning_rate": 2.4183168316831685e-06,
"loss": 1.6803,
"step": 690
},
{
"epoch": 0.4145917081658367,
"grad_norm": 1.6484375,
"learning_rate": 2.415841584158416e-06,
"loss": 1.67,
"step": 691
},
{
"epoch": 0.41519169616607665,
"grad_norm": 1.6953125,
"learning_rate": 2.4133663366336636e-06,
"loss": 1.6805,
"step": 692
},
{
"epoch": 0.4157916841663167,
"grad_norm": 1.859375,
"learning_rate": 2.410891089108911e-06,
"loss": 1.67,
"step": 693
},
{
"epoch": 0.41639167216655665,
"grad_norm": 1.8203125,
"learning_rate": 2.4084158415841582e-06,
"loss": 1.5639,
"step": 694
},
{
"epoch": 0.4169916601667967,
"grad_norm": 1.84375,
"learning_rate": 2.4059405940594056e-06,
"loss": 1.7609,
"step": 695
},
{
"epoch": 0.41759164816703664,
"grad_norm": 1.5625,
"learning_rate": 2.4034653465346534e-06,
"loss": 1.6972,
"step": 696
},
{
"epoch": 0.41819163616727667,
"grad_norm": 1.640625,
"learning_rate": 2.4009900990099007e-06,
"loss": 1.5793,
"step": 697
},
{
"epoch": 0.41879162416751664,
"grad_norm": 1.6640625,
"learning_rate": 2.3985148514851485e-06,
"loss": 1.6058,
"step": 698
},
{
"epoch": 0.41939161216775667,
"grad_norm": 1.8671875,
"learning_rate": 2.396039603960396e-06,
"loss": 1.7138,
"step": 699
},
{
"epoch": 0.41999160016799664,
"grad_norm": 1.53125,
"learning_rate": 2.3935643564356436e-06,
"loss": 1.6364,
"step": 700
},
{
"epoch": 0.41999160016799664,
"eval_loss": 1.777879238128662,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 66.0918,
"eval_samples_per_second": 151.305,
"eval_steps_per_second": 25.222,
"step": 700
},
{
"epoch": 0.4205915881682366,
"grad_norm": 1.75,
"learning_rate": 2.391089108910891e-06,
"loss": 1.7899,
"step": 701
},
{
"epoch": 0.42119157616847663,
"grad_norm": 1.8671875,
"learning_rate": 2.3886138613861387e-06,
"loss": 1.6305,
"step": 702
},
{
"epoch": 0.4217915641687166,
"grad_norm": 1.6953125,
"learning_rate": 2.386138613861386e-06,
"loss": 1.6793,
"step": 703
},
{
"epoch": 0.42239155216895663,
"grad_norm": 1.703125,
"learning_rate": 2.383663366336634e-06,
"loss": 1.657,
"step": 704
},
{
"epoch": 0.4229915401691966,
"grad_norm": 1.6484375,
"learning_rate": 2.381188118811881e-06,
"loss": 1.6353,
"step": 705
},
{
"epoch": 0.4235915281694366,
"grad_norm": 1.8515625,
"learning_rate": 2.378712871287129e-06,
"loss": 1.592,
"step": 706
},
{
"epoch": 0.4241915161696766,
"grad_norm": 1.546875,
"learning_rate": 2.376237623762376e-06,
"loss": 1.615,
"step": 707
},
{
"epoch": 0.4247915041699166,
"grad_norm": 1.6875,
"learning_rate": 2.3737623762376236e-06,
"loss": 1.5985,
"step": 708
},
{
"epoch": 0.4253914921701566,
"grad_norm": 1.8046875,
"learning_rate": 2.371287128712871e-06,
"loss": 1.6037,
"step": 709
},
{
"epoch": 0.4259914801703966,
"grad_norm": 1.671875,
"learning_rate": 2.3688118811881187e-06,
"loss": 1.6959,
"step": 710
},
{
"epoch": 0.4265914681706366,
"grad_norm": 1.6796875,
"learning_rate": 2.366336633663366e-06,
"loss": 1.726,
"step": 711
},
{
"epoch": 0.42719145617087656,
"grad_norm": 1.75,
"learning_rate": 2.363861386138614e-06,
"loss": 1.7963,
"step": 712
},
{
"epoch": 0.4277914441711166,
"grad_norm": 1.6328125,
"learning_rate": 2.361386138613861e-06,
"loss": 1.64,
"step": 713
},
{
"epoch": 0.42839143217135656,
"grad_norm": 1.75,
"learning_rate": 2.358910891089109e-06,
"loss": 1.6623,
"step": 714
},
{
"epoch": 0.4289914201715966,
"grad_norm": 1.609375,
"learning_rate": 2.3564356435643563e-06,
"loss": 1.6111,
"step": 715
},
{
"epoch": 0.42959140817183655,
"grad_norm": 1.765625,
"learning_rate": 2.353960396039604e-06,
"loss": 1.7229,
"step": 716
},
{
"epoch": 0.4301913961720766,
"grad_norm": 1.6640625,
"learning_rate": 2.3514851485148514e-06,
"loss": 1.6864,
"step": 717
},
{
"epoch": 0.43079138417231655,
"grad_norm": 1.453125,
"learning_rate": 2.349009900990099e-06,
"loss": 1.7142,
"step": 718
},
{
"epoch": 0.4313913721725566,
"grad_norm": 1.8984375,
"learning_rate": 2.3465346534653465e-06,
"loss": 1.7028,
"step": 719
},
{
"epoch": 0.43199136017279655,
"grad_norm": 1.765625,
"learning_rate": 2.3440594059405942e-06,
"loss": 1.7853,
"step": 720
},
{
"epoch": 0.4325913481730365,
"grad_norm": 1.6875,
"learning_rate": 2.341584158415841e-06,
"loss": 1.6417,
"step": 721
},
{
"epoch": 0.43319133617327654,
"grad_norm": 1.5546875,
"learning_rate": 2.339108910891089e-06,
"loss": 1.6398,
"step": 722
},
{
"epoch": 0.4337913241735165,
"grad_norm": 1.84375,
"learning_rate": 2.3366336633663363e-06,
"loss": 1.6656,
"step": 723
},
{
"epoch": 0.43439131217375654,
"grad_norm": 1.578125,
"learning_rate": 2.334158415841584e-06,
"loss": 1.6787,
"step": 724
},
{
"epoch": 0.4349913001739965,
"grad_norm": 1.7421875,
"learning_rate": 2.3316831683168314e-06,
"loss": 1.6582,
"step": 725
},
{
"epoch": 0.43559128817423654,
"grad_norm": 1.703125,
"learning_rate": 2.329207920792079e-06,
"loss": 1.6612,
"step": 726
},
{
"epoch": 0.4361912761744765,
"grad_norm": 1.7734375,
"learning_rate": 2.3267326732673265e-06,
"loss": 1.7151,
"step": 727
},
{
"epoch": 0.43679126417471653,
"grad_norm": 1.78125,
"learning_rate": 2.3242574257425743e-06,
"loss": 1.6576,
"step": 728
},
{
"epoch": 0.4373912521749565,
"grad_norm": 1.59375,
"learning_rate": 2.3217821782178216e-06,
"loss": 1.7122,
"step": 729
},
{
"epoch": 0.4379912401751965,
"grad_norm": 1.75,
"learning_rate": 2.3193069306930694e-06,
"loss": 1.6774,
"step": 730
},
{
"epoch": 0.4385912281754365,
"grad_norm": 1.578125,
"learning_rate": 2.3168316831683167e-06,
"loss": 1.6655,
"step": 731
},
{
"epoch": 0.43919121617567647,
"grad_norm": 1.78125,
"learning_rate": 2.3143564356435645e-06,
"loss": 1.6039,
"step": 732
},
{
"epoch": 0.4397912041759165,
"grad_norm": 1.671875,
"learning_rate": 2.3118811881188114e-06,
"loss": 1.6186,
"step": 733
},
{
"epoch": 0.44039119217615647,
"grad_norm": 1.9609375,
"learning_rate": 2.309405940594059e-06,
"loss": 1.628,
"step": 734
},
{
"epoch": 0.4409911801763965,
"grad_norm": 1.625,
"learning_rate": 2.3069306930693065e-06,
"loss": 1.6176,
"step": 735
},
{
"epoch": 0.44159116817663646,
"grad_norm": 1.6171875,
"learning_rate": 2.3044554455445543e-06,
"loss": 1.6363,
"step": 736
},
{
"epoch": 0.4421911561768765,
"grad_norm": 1.5546875,
"learning_rate": 2.3019801980198016e-06,
"loss": 1.6177,
"step": 737
},
{
"epoch": 0.44279114417711646,
"grad_norm": 1.5859375,
"learning_rate": 2.2995049504950494e-06,
"loss": 1.8267,
"step": 738
},
{
"epoch": 0.44339113217735643,
"grad_norm": 1.7109375,
"learning_rate": 2.2970297029702967e-06,
"loss": 1.7321,
"step": 739
},
{
"epoch": 0.44399112017759645,
"grad_norm": 1.671875,
"learning_rate": 2.2945544554455445e-06,
"loss": 1.6686,
"step": 740
},
{
"epoch": 0.4445911081778364,
"grad_norm": 1.828125,
"learning_rate": 2.292079207920792e-06,
"loss": 1.5937,
"step": 741
},
{
"epoch": 0.44519109617807645,
"grad_norm": 1.625,
"learning_rate": 2.2896039603960396e-06,
"loss": 1.6786,
"step": 742
},
{
"epoch": 0.4457910841783164,
"grad_norm": 1.671875,
"learning_rate": 2.287128712871287e-06,
"loss": 1.6417,
"step": 743
},
{
"epoch": 0.44639107217855645,
"grad_norm": 1.7109375,
"learning_rate": 2.2846534653465347e-06,
"loss": 1.7238,
"step": 744
},
{
"epoch": 0.4469910601787964,
"grad_norm": 1.8828125,
"learning_rate": 2.282178217821782e-06,
"loss": 1.6931,
"step": 745
},
{
"epoch": 0.44759104817903644,
"grad_norm": 1.5546875,
"learning_rate": 2.27970297029703e-06,
"loss": 1.6201,
"step": 746
},
{
"epoch": 0.4481910361792764,
"grad_norm": 1.5390625,
"learning_rate": 2.2772277227722767e-06,
"loss": 1.6692,
"step": 747
},
{
"epoch": 0.4487910241795164,
"grad_norm": 1.7734375,
"learning_rate": 2.2747524752475245e-06,
"loss": 1.6056,
"step": 748
},
{
"epoch": 0.4493910121797564,
"grad_norm": 1.65625,
"learning_rate": 2.272277227722772e-06,
"loss": 1.7107,
"step": 749
},
{
"epoch": 0.4499910001799964,
"grad_norm": 1.7421875,
"learning_rate": 2.2698019801980196e-06,
"loss": 1.6711,
"step": 750
},
{
"epoch": 0.4499910001799964,
"eval_loss": 1.77778959274292,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 69.2035,
"eval_samples_per_second": 144.501,
"eval_steps_per_second": 24.088,
"step": 750
},
{
"epoch": 0.4505909881802364,
"grad_norm": 1.703125,
"learning_rate": 2.267326732673267e-06,
"loss": 1.6199,
"step": 751
},
{
"epoch": 0.4511909761804764,
"grad_norm": 1.7734375,
"learning_rate": 2.2648514851485147e-06,
"loss": 1.6252,
"step": 752
},
{
"epoch": 0.4517909641807164,
"grad_norm": 1.65625,
"learning_rate": 2.262376237623762e-06,
"loss": 1.6548,
"step": 753
},
{
"epoch": 0.4523909521809564,
"grad_norm": 1.5390625,
"learning_rate": 2.25990099009901e-06,
"loss": 1.6134,
"step": 754
},
{
"epoch": 0.4529909401811964,
"grad_norm": 1.7109375,
"learning_rate": 2.257425742574257e-06,
"loss": 1.6512,
"step": 755
},
{
"epoch": 0.45359092818143637,
"grad_norm": 1.6953125,
"learning_rate": 2.254950495049505e-06,
"loss": 1.717,
"step": 756
},
{
"epoch": 0.45419091618167634,
"grad_norm": 1.6953125,
"learning_rate": 2.2524752475247523e-06,
"loss": 1.667,
"step": 757
},
{
"epoch": 0.45479090418191637,
"grad_norm": 1.84375,
"learning_rate": 2.25e-06,
"loss": 1.6836,
"step": 758
},
{
"epoch": 0.45539089218215634,
"grad_norm": 1.6875,
"learning_rate": 2.247524752475248e-06,
"loss": 1.7515,
"step": 759
},
{
"epoch": 0.45599088018239636,
"grad_norm": 1.78125,
"learning_rate": 2.2450495049504947e-06,
"loss": 1.7191,
"step": 760
},
{
"epoch": 0.45659086818263633,
"grad_norm": 1.9453125,
"learning_rate": 2.2425742574257425e-06,
"loss": 1.6758,
"step": 761
},
{
"epoch": 0.45719085618287636,
"grad_norm": 1.6640625,
"learning_rate": 2.24009900990099e-06,
"loss": 1.7625,
"step": 762
},
{
"epoch": 0.45779084418311633,
"grad_norm": 1.6796875,
"learning_rate": 2.2376237623762376e-06,
"loss": 1.6278,
"step": 763
},
{
"epoch": 0.45839083218335636,
"grad_norm": 1.7734375,
"learning_rate": 2.235148514851485e-06,
"loss": 1.6825,
"step": 764
},
{
"epoch": 0.4589908201835963,
"grad_norm": 1.65625,
"learning_rate": 2.2326732673267327e-06,
"loss": 1.5404,
"step": 765
},
{
"epoch": 0.4595908081838363,
"grad_norm": 1.6640625,
"learning_rate": 2.23019801980198e-06,
"loss": 1.5464,
"step": 766
},
{
"epoch": 0.4601907961840763,
"grad_norm": 1.6640625,
"learning_rate": 2.227722772277228e-06,
"loss": 1.7487,
"step": 767
},
{
"epoch": 0.4607907841843163,
"grad_norm": 1.4296875,
"learning_rate": 2.225247524752475e-06,
"loss": 1.6372,
"step": 768
},
{
"epoch": 0.4613907721845563,
"grad_norm": 1.6484375,
"learning_rate": 2.222772277227723e-06,
"loss": 1.7136,
"step": 769
},
{
"epoch": 0.4619907601847963,
"grad_norm": 1.59375,
"learning_rate": 2.2202970297029703e-06,
"loss": 1.6032,
"step": 770
},
{
"epoch": 0.4625907481850363,
"grad_norm": 1.5078125,
"learning_rate": 2.217821782178218e-06,
"loss": 1.6514,
"step": 771
},
{
"epoch": 0.4631907361852763,
"grad_norm": 1.6171875,
"learning_rate": 2.2153465346534654e-06,
"loss": 1.6665,
"step": 772
},
{
"epoch": 0.4637907241855163,
"grad_norm": 1.796875,
"learning_rate": 2.212871287128713e-06,
"loss": 1.6432,
"step": 773
},
{
"epoch": 0.4643907121857563,
"grad_norm": 1.8515625,
"learning_rate": 2.21039603960396e-06,
"loss": 1.5685,
"step": 774
},
{
"epoch": 0.46499070018599625,
"grad_norm": 1.5546875,
"learning_rate": 2.207920792079208e-06,
"loss": 1.7123,
"step": 775
},
{
"epoch": 0.4655906881862363,
"grad_norm": 1.4921875,
"learning_rate": 2.205445544554455e-06,
"loss": 1.6538,
"step": 776
},
{
"epoch": 0.46619067618647625,
"grad_norm": 1.8046875,
"learning_rate": 2.202970297029703e-06,
"loss": 1.6715,
"step": 777
},
{
"epoch": 0.4667906641867163,
"grad_norm": 1.7421875,
"learning_rate": 2.2004950495049503e-06,
"loss": 1.6058,
"step": 778
},
{
"epoch": 0.46739065218695625,
"grad_norm": 1.7265625,
"learning_rate": 2.198019801980198e-06,
"loss": 1.766,
"step": 779
},
{
"epoch": 0.46799064018719627,
"grad_norm": 1.5546875,
"learning_rate": 2.1955445544554454e-06,
"loss": 1.7647,
"step": 780
},
{
"epoch": 0.46859062818743624,
"grad_norm": 1.6875,
"learning_rate": 2.193069306930693e-06,
"loss": 1.6069,
"step": 781
},
{
"epoch": 0.46919061618767627,
"grad_norm": 1.6171875,
"learning_rate": 2.1905940594059405e-06,
"loss": 1.6362,
"step": 782
},
{
"epoch": 0.46979060418791624,
"grad_norm": 1.75,
"learning_rate": 2.1881188118811883e-06,
"loss": 1.6003,
"step": 783
},
{
"epoch": 0.47039059218815626,
"grad_norm": 1.671875,
"learning_rate": 2.1856435643564356e-06,
"loss": 1.676,
"step": 784
},
{
"epoch": 0.47099058018839624,
"grad_norm": 1.5703125,
"learning_rate": 2.1831683168316834e-06,
"loss": 1.6357,
"step": 785
},
{
"epoch": 0.4715905681886362,
"grad_norm": 1.7578125,
"learning_rate": 2.1806930693069307e-06,
"loss": 1.7,
"step": 786
},
{
"epoch": 0.47219055618887623,
"grad_norm": 1.6875,
"learning_rate": 2.178217821782178e-06,
"loss": 1.5975,
"step": 787
},
{
"epoch": 0.4727905441891162,
"grad_norm": 1.7265625,
"learning_rate": 2.1757425742574254e-06,
"loss": 1.7075,
"step": 788
},
{
"epoch": 0.47339053218935623,
"grad_norm": 1.5546875,
"learning_rate": 2.173267326732673e-06,
"loss": 1.6066,
"step": 789
},
{
"epoch": 0.4739905201895962,
"grad_norm": 1.765625,
"learning_rate": 2.1707920792079205e-06,
"loss": 1.6408,
"step": 790
},
{
"epoch": 0.4745905081898362,
"grad_norm": 1.5,
"learning_rate": 2.1683168316831683e-06,
"loss": 1.6861,
"step": 791
},
{
"epoch": 0.4751904961900762,
"grad_norm": 1.796875,
"learning_rate": 2.1658415841584156e-06,
"loss": 1.6754,
"step": 792
},
{
"epoch": 0.4757904841903162,
"grad_norm": 1.7265625,
"learning_rate": 2.1633663366336634e-06,
"loss": 1.5473,
"step": 793
},
{
"epoch": 0.4763904721905562,
"grad_norm": 1.6328125,
"learning_rate": 2.1608910891089108e-06,
"loss": 1.5907,
"step": 794
},
{
"epoch": 0.47699046019079616,
"grad_norm": 1.4609375,
"learning_rate": 2.1584158415841585e-06,
"loss": 1.5811,
"step": 795
},
{
"epoch": 0.4775904481910362,
"grad_norm": 1.65625,
"learning_rate": 2.155940594059406e-06,
"loss": 1.7476,
"step": 796
},
{
"epoch": 0.47819043619127616,
"grad_norm": 1.7890625,
"learning_rate": 2.1534653465346536e-06,
"loss": 1.6973,
"step": 797
},
{
"epoch": 0.4787904241915162,
"grad_norm": 1.7421875,
"learning_rate": 2.150990099009901e-06,
"loss": 1.6153,
"step": 798
},
{
"epoch": 0.47939041219175615,
"grad_norm": 1.6171875,
"learning_rate": 2.1485148514851487e-06,
"loss": 1.5589,
"step": 799
},
{
"epoch": 0.4799904001919962,
"grad_norm": 1.6953125,
"learning_rate": 2.1460396039603957e-06,
"loss": 1.6918,
"step": 800
},
{
"epoch": 0.4799904001919962,
"eval_loss": 1.7776175737380981,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 66.0238,
"eval_samples_per_second": 151.461,
"eval_steps_per_second": 25.248,
"step": 800
},
{
"epoch": 0.48059038819223615,
"grad_norm": 1.7109375,
"learning_rate": 2.1435643564356434e-06,
"loss": 1.6829,
"step": 801
},
{
"epoch": 0.4811903761924762,
"grad_norm": 1.65625,
"learning_rate": 2.1410891089108908e-06,
"loss": 1.6587,
"step": 802
},
{
"epoch": 0.48179036419271615,
"grad_norm": 1.75,
"learning_rate": 2.1386138613861385e-06,
"loss": 1.6863,
"step": 803
},
{
"epoch": 0.4823903521929561,
"grad_norm": 1.5546875,
"learning_rate": 2.136138613861386e-06,
"loss": 1.7411,
"step": 804
},
{
"epoch": 0.48299034019319614,
"grad_norm": 1.6015625,
"learning_rate": 2.1336633663366336e-06,
"loss": 1.6127,
"step": 805
},
{
"epoch": 0.4835903281934361,
"grad_norm": 1.640625,
"learning_rate": 2.131188118811881e-06,
"loss": 1.6713,
"step": 806
},
{
"epoch": 0.48419031619367614,
"grad_norm": 1.6171875,
"learning_rate": 2.1287128712871288e-06,
"loss": 1.6152,
"step": 807
},
{
"epoch": 0.4847903041939161,
"grad_norm": 1.6796875,
"learning_rate": 2.126237623762376e-06,
"loss": 1.6861,
"step": 808
},
{
"epoch": 0.48539029219415614,
"grad_norm": 1.515625,
"learning_rate": 2.123762376237624e-06,
"loss": 1.6041,
"step": 809
},
{
"epoch": 0.4859902801943961,
"grad_norm": 1.6328125,
"learning_rate": 2.121287128712871e-06,
"loss": 1.6616,
"step": 810
},
{
"epoch": 0.48659026819463613,
"grad_norm": 1.546875,
"learning_rate": 2.118811881188119e-06,
"loss": 1.6977,
"step": 811
},
{
"epoch": 0.4871902561948761,
"grad_norm": 1.7421875,
"learning_rate": 2.1163366336633663e-06,
"loss": 1.7304,
"step": 812
},
{
"epoch": 0.4877902441951161,
"grad_norm": 1.7265625,
"learning_rate": 2.1138613861386137e-06,
"loss": 1.6383,
"step": 813
},
{
"epoch": 0.4883902321953561,
"grad_norm": 1.515625,
"learning_rate": 2.111386138613861e-06,
"loss": 1.646,
"step": 814
},
{
"epoch": 0.48899022019559607,
"grad_norm": 1.7421875,
"learning_rate": 2.1089108910891088e-06,
"loss": 1.6341,
"step": 815
},
{
"epoch": 0.4895902081958361,
"grad_norm": 1.625,
"learning_rate": 2.106435643564356e-06,
"loss": 1.625,
"step": 816
},
{
"epoch": 0.49019019619607607,
"grad_norm": 1.703125,
"learning_rate": 2.103960396039604e-06,
"loss": 1.6915,
"step": 817
},
{
"epoch": 0.4907901841963161,
"grad_norm": 1.859375,
"learning_rate": 2.1014851485148512e-06,
"loss": 1.5856,
"step": 818
},
{
"epoch": 0.49139017219655606,
"grad_norm": 1.7265625,
"learning_rate": 2.099009900990099e-06,
"loss": 1.6453,
"step": 819
},
{
"epoch": 0.4919901601967961,
"grad_norm": 1.6640625,
"learning_rate": 2.0965346534653463e-06,
"loss": 1.6145,
"step": 820
},
{
"epoch": 0.49259014819703606,
"grad_norm": 1.5625,
"learning_rate": 2.094059405940594e-06,
"loss": 1.6025,
"step": 821
},
{
"epoch": 0.49319013619727603,
"grad_norm": 1.65625,
"learning_rate": 2.0915841584158414e-06,
"loss": 1.753,
"step": 822
},
{
"epoch": 0.49379012419751606,
"grad_norm": 1.640625,
"learning_rate": 2.089108910891089e-06,
"loss": 1.6231,
"step": 823
},
{
"epoch": 0.494390112197756,
"grad_norm": 1.5859375,
"learning_rate": 2.0866336633663366e-06,
"loss": 1.6406,
"step": 824
},
{
"epoch": 0.49499010019799605,
"grad_norm": 1.671875,
"learning_rate": 2.0841584158415843e-06,
"loss": 1.6947,
"step": 825
},
{
"epoch": 0.495590088198236,
"grad_norm": 1.671875,
"learning_rate": 2.0816831683168312e-06,
"loss": 1.6547,
"step": 826
},
{
"epoch": 0.49619007619847605,
"grad_norm": 1.6015625,
"learning_rate": 2.079207920792079e-06,
"loss": 1.6789,
"step": 827
},
{
"epoch": 0.496790064198716,
"grad_norm": 1.7890625,
"learning_rate": 2.0767326732673263e-06,
"loss": 1.7236,
"step": 828
},
{
"epoch": 0.49739005219895605,
"grad_norm": 1.6171875,
"learning_rate": 2.074257425742574e-06,
"loss": 1.6554,
"step": 829
},
{
"epoch": 0.497990040199196,
"grad_norm": 1.7421875,
"learning_rate": 2.0717821782178215e-06,
"loss": 1.651,
"step": 830
},
{
"epoch": 0.498590028199436,
"grad_norm": 1.6171875,
"learning_rate": 2.0693069306930692e-06,
"loss": 1.618,
"step": 831
},
{
"epoch": 0.499190016199676,
"grad_norm": 1.7265625,
"learning_rate": 2.0668316831683166e-06,
"loss": 1.5994,
"step": 832
},
{
"epoch": 0.499790004199916,
"grad_norm": 1.8515625,
"learning_rate": 2.0643564356435643e-06,
"loss": 1.6897,
"step": 833
},
{
"epoch": 0.500389992200156,
"grad_norm": 1.6796875,
"learning_rate": 2.0618811881188117e-06,
"loss": 1.5805,
"step": 834
},
{
"epoch": 0.500989980200396,
"grad_norm": 1.65625,
"learning_rate": 2.0594059405940594e-06,
"loss": 1.7514,
"step": 835
},
{
"epoch": 0.501589968200636,
"grad_norm": 1.6015625,
"learning_rate": 2.0569306930693068e-06,
"loss": 1.7048,
"step": 836
},
{
"epoch": 0.502189956200876,
"grad_norm": 1.53125,
"learning_rate": 2.0544554455445546e-06,
"loss": 1.6226,
"step": 837
},
{
"epoch": 0.502789944201116,
"grad_norm": 1.703125,
"learning_rate": 2.051980198019802e-06,
"loss": 1.7454,
"step": 838
},
{
"epoch": 0.503389932201356,
"grad_norm": 1.515625,
"learning_rate": 2.0495049504950497e-06,
"loss": 1.6367,
"step": 839
},
{
"epoch": 0.5039899202015959,
"grad_norm": 1.7578125,
"learning_rate": 2.0470297029702966e-06,
"loss": 1.6435,
"step": 840
},
{
"epoch": 0.5045899082018359,
"grad_norm": 2.0,
"learning_rate": 2.0445544554455443e-06,
"loss": 1.5614,
"step": 841
},
{
"epoch": 0.505189896202076,
"grad_norm": 1.6796875,
"learning_rate": 2.0420792079207917e-06,
"loss": 1.7469,
"step": 842
},
{
"epoch": 0.505789884202316,
"grad_norm": 1.7109375,
"learning_rate": 2.0396039603960395e-06,
"loss": 1.6062,
"step": 843
},
{
"epoch": 0.5063898722025559,
"grad_norm": 1.4140625,
"learning_rate": 2.037128712871287e-06,
"loss": 1.5944,
"step": 844
},
{
"epoch": 0.5069898602027959,
"grad_norm": 1.6953125,
"learning_rate": 2.0346534653465346e-06,
"loss": 1.6536,
"step": 845
},
{
"epoch": 0.507589848203036,
"grad_norm": 1.734375,
"learning_rate": 2.032178217821782e-06,
"loss": 1.6721,
"step": 846
},
{
"epoch": 0.508189836203276,
"grad_norm": 1.8046875,
"learning_rate": 2.0297029702970297e-06,
"loss": 1.6654,
"step": 847
},
{
"epoch": 0.5087898242035159,
"grad_norm": 1.640625,
"learning_rate": 2.027227722772277e-06,
"loss": 1.7078,
"step": 848
},
{
"epoch": 0.5093898122037559,
"grad_norm": 1.6953125,
"learning_rate": 2.0247524752475248e-06,
"loss": 1.7034,
"step": 849
},
{
"epoch": 0.5099898002039959,
"grad_norm": 1.7421875,
"learning_rate": 2.022277227722772e-06,
"loss": 1.6864,
"step": 850
},
{
"epoch": 0.5099898002039959,
"eval_loss": 1.7774574756622314,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 66.0293,
"eval_samples_per_second": 151.448,
"eval_steps_per_second": 25.246,
"step": 850
},
{
"epoch": 0.510589788204236,
"grad_norm": 1.78125,
"learning_rate": 2.01980198019802e-06,
"loss": 1.7012,
"step": 851
},
{
"epoch": 0.5111897762044759,
"grad_norm": 1.6484375,
"learning_rate": 2.0173267326732672e-06,
"loss": 1.7185,
"step": 852
},
{
"epoch": 0.5117897642047159,
"grad_norm": 1.8203125,
"learning_rate": 2.0148514851485146e-06,
"loss": 1.5713,
"step": 853
},
{
"epoch": 0.5123897522049559,
"grad_norm": 1.640625,
"learning_rate": 2.012376237623762e-06,
"loss": 1.6537,
"step": 854
},
{
"epoch": 0.5129897402051959,
"grad_norm": 1.640625,
"learning_rate": 2.0099009900990097e-06,
"loss": 1.6145,
"step": 855
},
{
"epoch": 0.5135897282054359,
"grad_norm": 1.5859375,
"learning_rate": 2.007425742574257e-06,
"loss": 1.6468,
"step": 856
},
{
"epoch": 0.5141897162056759,
"grad_norm": 1.65625,
"learning_rate": 2.004950495049505e-06,
"loss": 1.6845,
"step": 857
},
{
"epoch": 0.5147897042059159,
"grad_norm": 1.5078125,
"learning_rate": 2.002475247524752e-06,
"loss": 1.7128,
"step": 858
},
{
"epoch": 0.5153896922061558,
"grad_norm": 1.8359375,
"learning_rate": 2e-06,
"loss": 1.58,
"step": 859
},
{
"epoch": 0.5159896802063959,
"grad_norm": 1.625,
"learning_rate": 1.9975247524752473e-06,
"loss": 1.5932,
"step": 860
},
{
"epoch": 0.5165896682066359,
"grad_norm": 1.734375,
"learning_rate": 1.995049504950495e-06,
"loss": 1.6651,
"step": 861
},
{
"epoch": 0.5171896562068758,
"grad_norm": 1.6953125,
"learning_rate": 1.9925742574257424e-06,
"loss": 1.6753,
"step": 862
},
{
"epoch": 0.5177896442071158,
"grad_norm": 1.7578125,
"learning_rate": 1.99009900990099e-06,
"loss": 1.7112,
"step": 863
},
{
"epoch": 0.5183896322073559,
"grad_norm": 1.75,
"learning_rate": 1.9876237623762375e-06,
"loss": 1.6007,
"step": 864
},
{
"epoch": 0.5189896202075959,
"grad_norm": 1.8125,
"learning_rate": 1.9851485148514852e-06,
"loss": 1.763,
"step": 865
},
{
"epoch": 0.5195896082078358,
"grad_norm": 1.6796875,
"learning_rate": 1.9826732673267326e-06,
"loss": 1.6699,
"step": 866
},
{
"epoch": 0.5201895962080758,
"grad_norm": 1.6640625,
"learning_rate": 1.98019801980198e-06,
"loss": 1.6485,
"step": 867
},
{
"epoch": 0.5207895842083158,
"grad_norm": 1.75,
"learning_rate": 1.9777227722772277e-06,
"loss": 1.7541,
"step": 868
},
{
"epoch": 0.5213895722085559,
"grad_norm": 1.703125,
"learning_rate": 1.975247524752475e-06,
"loss": 1.7011,
"step": 869
},
{
"epoch": 0.5219895602087958,
"grad_norm": 1.8125,
"learning_rate": 1.972772277227723e-06,
"loss": 1.5922,
"step": 870
},
{
"epoch": 0.5225895482090358,
"grad_norm": 1.5078125,
"learning_rate": 1.97029702970297e-06,
"loss": 1.6786,
"step": 871
},
{
"epoch": 0.5231895362092758,
"grad_norm": 1.4296875,
"learning_rate": 1.967821782178218e-06,
"loss": 1.6818,
"step": 872
},
{
"epoch": 0.5237895242095159,
"grad_norm": 1.6640625,
"learning_rate": 1.9653465346534653e-06,
"loss": 1.6646,
"step": 873
},
{
"epoch": 0.5243895122097558,
"grad_norm": 1.5625,
"learning_rate": 1.9628712871287126e-06,
"loss": 1.7467,
"step": 874
},
{
"epoch": 0.5249895002099958,
"grad_norm": 1.6015625,
"learning_rate": 1.9603960396039604e-06,
"loss": 1.6728,
"step": 875
},
{
"epoch": 0.5255894882102358,
"grad_norm": 1.7109375,
"learning_rate": 1.9579207920792077e-06,
"loss": 1.654,
"step": 876
},
{
"epoch": 0.5261894762104757,
"grad_norm": 1.5390625,
"learning_rate": 1.9554455445544555e-06,
"loss": 1.6831,
"step": 877
},
{
"epoch": 0.5267894642107158,
"grad_norm": 1.609375,
"learning_rate": 1.952970297029703e-06,
"loss": 1.5535,
"step": 878
},
{
"epoch": 0.5273894522109558,
"grad_norm": 1.984375,
"learning_rate": 1.95049504950495e-06,
"loss": 1.6483,
"step": 879
},
{
"epoch": 0.5279894402111958,
"grad_norm": 1.59375,
"learning_rate": 1.948019801980198e-06,
"loss": 1.7029,
"step": 880
},
{
"epoch": 0.5285894282114357,
"grad_norm": 1.7265625,
"learning_rate": 1.9455445544554453e-06,
"loss": 1.6396,
"step": 881
},
{
"epoch": 0.5291894162116758,
"grad_norm": 1.65625,
"learning_rate": 1.943069306930693e-06,
"loss": 1.6633,
"step": 882
},
{
"epoch": 0.5297894042119158,
"grad_norm": 1.5859375,
"learning_rate": 1.9405940594059404e-06,
"loss": 1.6083,
"step": 883
},
{
"epoch": 0.5303893922121558,
"grad_norm": 1.6171875,
"learning_rate": 1.938118811881188e-06,
"loss": 1.684,
"step": 884
},
{
"epoch": 0.5309893802123957,
"grad_norm": 1.5703125,
"learning_rate": 1.9356435643564355e-06,
"loss": 1.6599,
"step": 885
},
{
"epoch": 0.5315893682126357,
"grad_norm": 1.6328125,
"learning_rate": 1.933168316831683e-06,
"loss": 1.6902,
"step": 886
},
{
"epoch": 0.5321893562128758,
"grad_norm": 1.6484375,
"learning_rate": 1.9306930693069306e-06,
"loss": 1.6117,
"step": 887
},
{
"epoch": 0.5327893442131157,
"grad_norm": 1.6875,
"learning_rate": 1.928217821782178e-06,
"loss": 1.6018,
"step": 888
},
{
"epoch": 0.5333893322133557,
"grad_norm": 1.671875,
"learning_rate": 1.9257425742574257e-06,
"loss": 1.7181,
"step": 889
},
{
"epoch": 0.5339893202135957,
"grad_norm": 1.8046875,
"learning_rate": 1.923267326732673e-06,
"loss": 1.6801,
"step": 890
},
{
"epoch": 0.5345893082138358,
"grad_norm": 1.5703125,
"learning_rate": 1.920792079207921e-06,
"loss": 1.6134,
"step": 891
},
{
"epoch": 0.5351892962140757,
"grad_norm": 1.65625,
"learning_rate": 1.918316831683168e-06,
"loss": 1.7768,
"step": 892
},
{
"epoch": 0.5357892842143157,
"grad_norm": 1.65625,
"learning_rate": 1.9158415841584155e-06,
"loss": 1.6964,
"step": 893
},
{
"epoch": 0.5363892722145557,
"grad_norm": 1.6171875,
"learning_rate": 1.9133663366336633e-06,
"loss": 1.6572,
"step": 894
},
{
"epoch": 0.5369892602147956,
"grad_norm": 1.6171875,
"learning_rate": 1.9108910891089106e-06,
"loss": 1.6688,
"step": 895
},
{
"epoch": 0.5375892482150357,
"grad_norm": 1.515625,
"learning_rate": 1.9084158415841584e-06,
"loss": 1.5866,
"step": 896
},
{
"epoch": 0.5381892362152757,
"grad_norm": 1.71875,
"learning_rate": 1.9059405940594057e-06,
"loss": 1.6582,
"step": 897
},
{
"epoch": 0.5387892242155157,
"grad_norm": 1.78125,
"learning_rate": 1.9034653465346533e-06,
"loss": 1.7269,
"step": 898
},
{
"epoch": 0.5393892122157556,
"grad_norm": 1.8125,
"learning_rate": 1.9009900990099008e-06,
"loss": 1.5366,
"step": 899
},
{
"epoch": 0.5399892002159957,
"grad_norm": 1.8046875,
"learning_rate": 1.8985148514851484e-06,
"loss": 1.6964,
"step": 900
},
{
"epoch": 0.5399892002159957,
"eval_loss": 1.7774394750595093,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 68.6659,
"eval_samples_per_second": 145.633,
"eval_steps_per_second": 24.277,
"step": 900
},
{
"epoch": 0.5405891882162357,
"grad_norm": 1.6953125,
"learning_rate": 1.896039603960396e-06,
"loss": 1.7363,
"step": 901
},
{
"epoch": 0.5411891762164757,
"grad_norm": 1.6328125,
"learning_rate": 1.8935643564356433e-06,
"loss": 1.6164,
"step": 902
},
{
"epoch": 0.5417891642167156,
"grad_norm": 1.6328125,
"learning_rate": 1.8910891089108908e-06,
"loss": 1.6276,
"step": 903
},
{
"epoch": 0.5423891522169556,
"grad_norm": 1.765625,
"learning_rate": 1.8886138613861384e-06,
"loss": 1.5128,
"step": 904
},
{
"epoch": 0.5429891402171957,
"grad_norm": 1.7578125,
"learning_rate": 1.886138613861386e-06,
"loss": 1.6413,
"step": 905
},
{
"epoch": 0.5435891282174357,
"grad_norm": 1.8671875,
"learning_rate": 1.8836633663366335e-06,
"loss": 1.6669,
"step": 906
},
{
"epoch": 0.5441891162176756,
"grad_norm": 1.84375,
"learning_rate": 1.881188118811881e-06,
"loss": 1.6869,
"step": 907
},
{
"epoch": 0.5447891042179156,
"grad_norm": 1.625,
"learning_rate": 1.8787128712871286e-06,
"loss": 1.6706,
"step": 908
},
{
"epoch": 0.5453890922181557,
"grad_norm": 1.6328125,
"learning_rate": 1.876237623762376e-06,
"loss": 1.6194,
"step": 909
},
{
"epoch": 0.5459890802183957,
"grad_norm": 1.7421875,
"learning_rate": 1.8737623762376237e-06,
"loss": 1.7302,
"step": 910
},
{
"epoch": 0.5465890682186356,
"grad_norm": 1.7734375,
"learning_rate": 1.8712871287128713e-06,
"loss": 1.698,
"step": 911
},
{
"epoch": 0.5471890562188756,
"grad_norm": 1.7421875,
"learning_rate": 1.8688118811881188e-06,
"loss": 1.6802,
"step": 912
},
{
"epoch": 0.5477890442191157,
"grad_norm": 1.9453125,
"learning_rate": 1.8663366336633664e-06,
"loss": 1.7833,
"step": 913
},
{
"epoch": 0.5483890322193556,
"grad_norm": 1.7734375,
"learning_rate": 1.863861386138614e-06,
"loss": 1.7153,
"step": 914
},
{
"epoch": 0.5489890202195956,
"grad_norm": 1.84375,
"learning_rate": 1.8613861386138615e-06,
"loss": 1.6458,
"step": 915
},
{
"epoch": 0.5495890082198356,
"grad_norm": 1.65625,
"learning_rate": 1.8589108910891088e-06,
"loss": 1.5899,
"step": 916
},
{
"epoch": 0.5501889962200756,
"grad_norm": 1.890625,
"learning_rate": 1.8564356435643564e-06,
"loss": 1.618,
"step": 917
},
{
"epoch": 0.5507889842203156,
"grad_norm": 1.6953125,
"learning_rate": 1.853960396039604e-06,
"loss": 1.6677,
"step": 918
},
{
"epoch": 0.5513889722205556,
"grad_norm": 1.5703125,
"learning_rate": 1.8514851485148515e-06,
"loss": 1.6409,
"step": 919
},
{
"epoch": 0.5519889602207956,
"grad_norm": 1.671875,
"learning_rate": 1.849009900990099e-06,
"loss": 1.6201,
"step": 920
},
{
"epoch": 0.5525889482210355,
"grad_norm": 1.6171875,
"learning_rate": 1.8465346534653466e-06,
"loss": 1.6385,
"step": 921
},
{
"epoch": 0.5531889362212756,
"grad_norm": 1.6328125,
"learning_rate": 1.8440594059405942e-06,
"loss": 1.6725,
"step": 922
},
{
"epoch": 0.5537889242215156,
"grad_norm": 1.6640625,
"learning_rate": 1.8415841584158415e-06,
"loss": 1.6324,
"step": 923
},
{
"epoch": 0.5543889122217556,
"grad_norm": 1.546875,
"learning_rate": 1.839108910891089e-06,
"loss": 1.6393,
"step": 924
},
{
"epoch": 0.5549889002219955,
"grad_norm": 1.578125,
"learning_rate": 1.8366336633663366e-06,
"loss": 1.727,
"step": 925
},
{
"epoch": 0.5555888882222355,
"grad_norm": 1.6953125,
"learning_rate": 1.8341584158415842e-06,
"loss": 1.7974,
"step": 926
},
{
"epoch": 0.5561888762224756,
"grad_norm": 1.5625,
"learning_rate": 1.8316831683168317e-06,
"loss": 1.7538,
"step": 927
},
{
"epoch": 0.5567888642227156,
"grad_norm": 1.859375,
"learning_rate": 1.8292079207920793e-06,
"loss": 1.6599,
"step": 928
},
{
"epoch": 0.5573888522229555,
"grad_norm": 1.546875,
"learning_rate": 1.8267326732673266e-06,
"loss": 1.6722,
"step": 929
},
{
"epoch": 0.5579888402231955,
"grad_norm": 1.8046875,
"learning_rate": 1.8242574257425742e-06,
"loss": 1.6885,
"step": 930
},
{
"epoch": 0.5585888282234356,
"grad_norm": 1.625,
"learning_rate": 1.8217821782178217e-06,
"loss": 1.6151,
"step": 931
},
{
"epoch": 0.5591888162236756,
"grad_norm": 1.5625,
"learning_rate": 1.8193069306930693e-06,
"loss": 1.621,
"step": 932
},
{
"epoch": 0.5597888042239155,
"grad_norm": 1.7578125,
"learning_rate": 1.8168316831683168e-06,
"loss": 1.6494,
"step": 933
},
{
"epoch": 0.5603887922241555,
"grad_norm": 1.5546875,
"learning_rate": 1.8143564356435644e-06,
"loss": 1.6403,
"step": 934
},
{
"epoch": 0.5609887802243955,
"grad_norm": 1.71875,
"learning_rate": 1.811881188118812e-06,
"loss": 1.6198,
"step": 935
},
{
"epoch": 0.5615887682246355,
"grad_norm": 1.6328125,
"learning_rate": 1.8094059405940593e-06,
"loss": 1.6158,
"step": 936
},
{
"epoch": 0.5621887562248755,
"grad_norm": 1.578125,
"learning_rate": 1.8069306930693068e-06,
"loss": 1.7377,
"step": 937
},
{
"epoch": 0.5627887442251155,
"grad_norm": 1.765625,
"learning_rate": 1.8044554455445544e-06,
"loss": 1.6515,
"step": 938
},
{
"epoch": 0.5633887322253555,
"grad_norm": 1.734375,
"learning_rate": 1.801980198019802e-06,
"loss": 1.6708,
"step": 939
},
{
"epoch": 0.5639887202255955,
"grad_norm": 1.7265625,
"learning_rate": 1.7995049504950495e-06,
"loss": 1.6375,
"step": 940
},
{
"epoch": 0.5645887082258355,
"grad_norm": 1.6171875,
"learning_rate": 1.797029702970297e-06,
"loss": 1.7352,
"step": 941
},
{
"epoch": 0.5651886962260755,
"grad_norm": 1.515625,
"learning_rate": 1.7945544554455444e-06,
"loss": 1.5845,
"step": 942
},
{
"epoch": 0.5657886842263155,
"grad_norm": 1.6953125,
"learning_rate": 1.792079207920792e-06,
"loss": 1.6404,
"step": 943
},
{
"epoch": 0.5663886722265554,
"grad_norm": 1.484375,
"learning_rate": 1.7896039603960395e-06,
"loss": 1.693,
"step": 944
},
{
"epoch": 0.5669886602267955,
"grad_norm": 1.65625,
"learning_rate": 1.787128712871287e-06,
"loss": 1.6918,
"step": 945
},
{
"epoch": 0.5675886482270355,
"grad_norm": 1.8125,
"learning_rate": 1.7846534653465346e-06,
"loss": 1.6853,
"step": 946
},
{
"epoch": 0.5681886362272754,
"grad_norm": 1.703125,
"learning_rate": 1.7821782178217822e-06,
"loss": 1.7385,
"step": 947
},
{
"epoch": 0.5687886242275154,
"grad_norm": 1.734375,
"learning_rate": 1.7797029702970297e-06,
"loss": 1.6999,
"step": 948
},
{
"epoch": 0.5693886122277555,
"grad_norm": 1.71875,
"learning_rate": 1.777227722772277e-06,
"loss": 1.5892,
"step": 949
},
{
"epoch": 0.5699886002279955,
"grad_norm": 1.6328125,
"learning_rate": 1.7747524752475246e-06,
"loss": 1.7026,
"step": 950
},
{
"epoch": 0.5699886002279955,
"eval_loss": 1.777536153793335,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 65.9833,
"eval_samples_per_second": 151.554,
"eval_steps_per_second": 25.264,
"step": 950
},
{
"epoch": 0.5705885882282354,
"grad_norm": 1.6328125,
"learning_rate": 1.7722772277227722e-06,
"loss": 1.6403,
"step": 951
},
{
"epoch": 0.5711885762284754,
"grad_norm": 1.5234375,
"learning_rate": 1.7698019801980197e-06,
"loss": 1.5106,
"step": 952
},
{
"epoch": 0.5717885642287154,
"grad_norm": 1.8515625,
"learning_rate": 1.7673267326732673e-06,
"loss": 1.6019,
"step": 953
},
{
"epoch": 0.5723885522289555,
"grad_norm": 1.6484375,
"learning_rate": 1.7648514851485149e-06,
"loss": 1.5817,
"step": 954
},
{
"epoch": 0.5729885402291954,
"grad_norm": 1.5859375,
"learning_rate": 1.7623762376237624e-06,
"loss": 1.7078,
"step": 955
},
{
"epoch": 0.5735885282294354,
"grad_norm": 1.671875,
"learning_rate": 1.7599009900990098e-06,
"loss": 1.6346,
"step": 956
},
{
"epoch": 0.5741885162296754,
"grad_norm": 1.7421875,
"learning_rate": 1.7574257425742573e-06,
"loss": 1.6595,
"step": 957
},
{
"epoch": 0.5747885042299155,
"grad_norm": 1.6484375,
"learning_rate": 1.7549504950495049e-06,
"loss": 1.6867,
"step": 958
},
{
"epoch": 0.5753884922301554,
"grad_norm": 1.7421875,
"learning_rate": 1.7524752475247524e-06,
"loss": 1.6465,
"step": 959
},
{
"epoch": 0.5759884802303954,
"grad_norm": 1.6875,
"learning_rate": 1.75e-06,
"loss": 1.7808,
"step": 960
},
{
"epoch": 0.5765884682306354,
"grad_norm": 1.6015625,
"learning_rate": 1.7475247524752475e-06,
"loss": 1.648,
"step": 961
},
{
"epoch": 0.5771884562308753,
"grad_norm": 1.65625,
"learning_rate": 1.7450495049504949e-06,
"loss": 1.6428,
"step": 962
},
{
"epoch": 0.5777884442311154,
"grad_norm": 1.8046875,
"learning_rate": 1.7425742574257424e-06,
"loss": 1.7126,
"step": 963
},
{
"epoch": 0.5783884322313554,
"grad_norm": 1.578125,
"learning_rate": 1.74009900990099e-06,
"loss": 1.6308,
"step": 964
},
{
"epoch": 0.5789884202315954,
"grad_norm": 1.75,
"learning_rate": 1.7376237623762375e-06,
"loss": 1.7662,
"step": 965
},
{
"epoch": 0.5795884082318353,
"grad_norm": 1.671875,
"learning_rate": 1.735148514851485e-06,
"loss": 1.6381,
"step": 966
},
{
"epoch": 0.5801883962320754,
"grad_norm": 1.65625,
"learning_rate": 1.7326732673267326e-06,
"loss": 1.7187,
"step": 967
},
{
"epoch": 0.5807883842323154,
"grad_norm": 1.6484375,
"learning_rate": 1.7301980198019802e-06,
"loss": 1.6956,
"step": 968
},
{
"epoch": 0.5813883722325554,
"grad_norm": 1.65625,
"learning_rate": 1.7277227722772275e-06,
"loss": 1.6442,
"step": 969
},
{
"epoch": 0.5819883602327953,
"grad_norm": 1.5859375,
"learning_rate": 1.725247524752475e-06,
"loss": 1.6717,
"step": 970
},
{
"epoch": 0.5825883482330353,
"grad_norm": 1.71875,
"learning_rate": 1.7227722772277227e-06,
"loss": 1.704,
"step": 971
},
{
"epoch": 0.5831883362332754,
"grad_norm": 1.765625,
"learning_rate": 1.7202970297029702e-06,
"loss": 1.6427,
"step": 972
},
{
"epoch": 0.5837883242335153,
"grad_norm": 1.671875,
"learning_rate": 1.7178217821782178e-06,
"loss": 1.7412,
"step": 973
},
{
"epoch": 0.5843883122337553,
"grad_norm": 1.46875,
"learning_rate": 1.7153465346534653e-06,
"loss": 1.6321,
"step": 974
},
{
"epoch": 0.5849883002339953,
"grad_norm": 1.7578125,
"learning_rate": 1.7128712871287127e-06,
"loss": 1.6029,
"step": 975
},
{
"epoch": 0.5855882882342354,
"grad_norm": 1.5703125,
"learning_rate": 1.7103960396039602e-06,
"loss": 1.6052,
"step": 976
},
{
"epoch": 0.5861882762344753,
"grad_norm": 1.5546875,
"learning_rate": 1.7079207920792078e-06,
"loss": 1.638,
"step": 977
},
{
"epoch": 0.5867882642347153,
"grad_norm": 1.65625,
"learning_rate": 1.7054455445544553e-06,
"loss": 1.8175,
"step": 978
},
{
"epoch": 0.5873882522349553,
"grad_norm": 1.7265625,
"learning_rate": 1.7029702970297029e-06,
"loss": 1.7441,
"step": 979
},
{
"epoch": 0.5879882402351952,
"grad_norm": 1.53125,
"learning_rate": 1.7004950495049504e-06,
"loss": 1.5586,
"step": 980
},
{
"epoch": 0.5885882282354353,
"grad_norm": 1.71875,
"learning_rate": 1.698019801980198e-06,
"loss": 1.8408,
"step": 981
},
{
"epoch": 0.5891882162356753,
"grad_norm": 1.6171875,
"learning_rate": 1.6955445544554453e-06,
"loss": 1.7207,
"step": 982
},
{
"epoch": 0.5897882042359153,
"grad_norm": 1.796875,
"learning_rate": 1.6930693069306929e-06,
"loss": 1.654,
"step": 983
},
{
"epoch": 0.5903881922361552,
"grad_norm": 1.859375,
"learning_rate": 1.6905940594059404e-06,
"loss": 1.645,
"step": 984
},
{
"epoch": 0.5909881802363953,
"grad_norm": 1.78125,
"learning_rate": 1.688118811881188e-06,
"loss": 1.7415,
"step": 985
},
{
"epoch": 0.5915881682366353,
"grad_norm": 1.578125,
"learning_rate": 1.6856435643564355e-06,
"loss": 1.7001,
"step": 986
},
{
"epoch": 0.5921881562368753,
"grad_norm": 1.6328125,
"learning_rate": 1.683168316831683e-06,
"loss": 1.6615,
"step": 987
},
{
"epoch": 0.5927881442371152,
"grad_norm": 1.78125,
"learning_rate": 1.6806930693069307e-06,
"loss": 1.7398,
"step": 988
},
{
"epoch": 0.5933881322373552,
"grad_norm": 1.6484375,
"learning_rate": 1.678217821782178e-06,
"loss": 1.6766,
"step": 989
},
{
"epoch": 0.5939881202375953,
"grad_norm": 1.8515625,
"learning_rate": 1.6757425742574256e-06,
"loss": 1.6571,
"step": 990
},
{
"epoch": 0.5945881082378353,
"grad_norm": 1.5078125,
"learning_rate": 1.6732673267326731e-06,
"loss": 1.622,
"step": 991
},
{
"epoch": 0.5951880962380752,
"grad_norm": 1.671875,
"learning_rate": 1.6707920792079207e-06,
"loss": 1.6102,
"step": 992
},
{
"epoch": 0.5957880842383152,
"grad_norm": 1.7265625,
"learning_rate": 1.6683168316831682e-06,
"loss": 1.7403,
"step": 993
},
{
"epoch": 0.5963880722385553,
"grad_norm": 1.8359375,
"learning_rate": 1.6658415841584158e-06,
"loss": 1.7306,
"step": 994
},
{
"epoch": 0.5969880602387952,
"grad_norm": 1.6328125,
"learning_rate": 1.6633663366336631e-06,
"loss": 1.6436,
"step": 995
},
{
"epoch": 0.5975880482390352,
"grad_norm": 2.140625,
"learning_rate": 1.6608910891089107e-06,
"loss": 1.6222,
"step": 996
},
{
"epoch": 0.5981880362392752,
"grad_norm": 1.640625,
"learning_rate": 1.6584158415841582e-06,
"loss": 1.6213,
"step": 997
},
{
"epoch": 0.5987880242395152,
"grad_norm": 1.5859375,
"learning_rate": 1.6559405940594058e-06,
"loss": 1.6387,
"step": 998
},
{
"epoch": 0.5993880122397552,
"grad_norm": 1.734375,
"learning_rate": 1.6534653465346533e-06,
"loss": 1.6206,
"step": 999
},
{
"epoch": 0.5999880002399952,
"grad_norm": 1.546875,
"learning_rate": 1.6509900990099009e-06,
"loss": 1.633,
"step": 1000
},
{
"epoch": 0.5999880002399952,
"eval_loss": 1.7774205207824707,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 66.1167,
"eval_samples_per_second": 151.248,
"eval_steps_per_second": 25.213,
"step": 1000
},
{
"epoch": 0.6005879882402352,
"grad_norm": 1.671875,
"learning_rate": 1.6485148514851484e-06,
"loss": 1.7197,
"step": 1001
},
{
"epoch": 0.6011879762404752,
"grad_norm": 1.640625,
"learning_rate": 1.6460396039603958e-06,
"loss": 1.6299,
"step": 1002
},
{
"epoch": 0.6017879642407152,
"grad_norm": 1.6875,
"learning_rate": 1.6435643564356433e-06,
"loss": 1.6693,
"step": 1003
},
{
"epoch": 0.6023879522409552,
"grad_norm": 1.59375,
"learning_rate": 1.641089108910891e-06,
"loss": 1.6508,
"step": 1004
},
{
"epoch": 0.6029879402411952,
"grad_norm": 1.8046875,
"learning_rate": 1.6386138613861385e-06,
"loss": 1.6538,
"step": 1005
},
{
"epoch": 0.6035879282414351,
"grad_norm": 1.875,
"learning_rate": 1.636138613861386e-06,
"loss": 1.624,
"step": 1006
},
{
"epoch": 0.6041879162416751,
"grad_norm": 1.6796875,
"learning_rate": 1.6336633663366336e-06,
"loss": 1.7978,
"step": 1007
},
{
"epoch": 0.6047879042419152,
"grad_norm": 1.953125,
"learning_rate": 1.631188118811881e-06,
"loss": 1.5949,
"step": 1008
},
{
"epoch": 0.6053878922421552,
"grad_norm": 1.6171875,
"learning_rate": 1.6287128712871285e-06,
"loss": 1.71,
"step": 1009
},
{
"epoch": 0.6059878802423951,
"grad_norm": 1.6015625,
"learning_rate": 1.626237623762376e-06,
"loss": 1.7189,
"step": 1010
},
{
"epoch": 0.6065878682426351,
"grad_norm": 1.7109375,
"learning_rate": 1.6237623762376238e-06,
"loss": 1.6493,
"step": 1011
},
{
"epoch": 0.6071878562428752,
"grad_norm": 1.5625,
"learning_rate": 1.6212871287128713e-06,
"loss": 1.6566,
"step": 1012
},
{
"epoch": 0.6077878442431152,
"grad_norm": 1.7421875,
"learning_rate": 1.6188118811881189e-06,
"loss": 1.6018,
"step": 1013
},
{
"epoch": 0.6083878322433551,
"grad_norm": 2.046875,
"learning_rate": 1.6163366336633664e-06,
"loss": 1.6503,
"step": 1014
},
{
"epoch": 0.6089878202435951,
"grad_norm": 1.6640625,
"learning_rate": 1.6138613861386138e-06,
"loss": 1.6243,
"step": 1015
},
{
"epoch": 0.6095878082438351,
"grad_norm": 1.6328125,
"learning_rate": 1.6113861386138613e-06,
"loss": 1.6594,
"step": 1016
},
{
"epoch": 0.6101877962440752,
"grad_norm": 1.5859375,
"learning_rate": 1.608910891089109e-06,
"loss": 1.6897,
"step": 1017
},
{
"epoch": 0.6107877842443151,
"grad_norm": 1.59375,
"learning_rate": 1.6064356435643565e-06,
"loss": 1.692,
"step": 1018
},
{
"epoch": 0.6113877722445551,
"grad_norm": 1.5859375,
"learning_rate": 1.603960396039604e-06,
"loss": 1.6694,
"step": 1019
},
{
"epoch": 0.6119877602447951,
"grad_norm": 1.75,
"learning_rate": 1.6014851485148516e-06,
"loss": 1.7478,
"step": 1020
},
{
"epoch": 0.6125877482450351,
"grad_norm": 1.703125,
"learning_rate": 1.5990099009900991e-06,
"loss": 1.5819,
"step": 1021
},
{
"epoch": 0.6131877362452751,
"grad_norm": 1.8125,
"learning_rate": 1.5965346534653465e-06,
"loss": 1.6587,
"step": 1022
},
{
"epoch": 0.6137877242455151,
"grad_norm": 1.4765625,
"learning_rate": 1.594059405940594e-06,
"loss": 1.6205,
"step": 1023
},
{
"epoch": 0.6143877122457551,
"grad_norm": 1.671875,
"learning_rate": 1.5915841584158416e-06,
"loss": 1.6359,
"step": 1024
},
{
"epoch": 0.614987700245995,
"grad_norm": 1.6171875,
"learning_rate": 1.5891089108910891e-06,
"loss": 1.6837,
"step": 1025
},
{
"epoch": 0.6155876882462351,
"grad_norm": 1.5546875,
"learning_rate": 1.5866336633663367e-06,
"loss": 1.6778,
"step": 1026
},
{
"epoch": 0.6161876762464751,
"grad_norm": 1.9453125,
"learning_rate": 1.5841584158415842e-06,
"loss": 1.6798,
"step": 1027
},
{
"epoch": 0.616787664246715,
"grad_norm": 1.5546875,
"learning_rate": 1.5816831683168316e-06,
"loss": 1.5861,
"step": 1028
},
{
"epoch": 0.617387652246955,
"grad_norm": 1.625,
"learning_rate": 1.5792079207920791e-06,
"loss": 1.6128,
"step": 1029
},
{
"epoch": 0.6179876402471951,
"grad_norm": 1.7578125,
"learning_rate": 1.5767326732673267e-06,
"loss": 1.6962,
"step": 1030
},
{
"epoch": 0.6185876282474351,
"grad_norm": 1.6875,
"learning_rate": 1.5742574257425742e-06,
"loss": 1.5868,
"step": 1031
},
{
"epoch": 0.619187616247675,
"grad_norm": 1.859375,
"learning_rate": 1.5717821782178218e-06,
"loss": 1.627,
"step": 1032
},
{
"epoch": 0.619787604247915,
"grad_norm": 1.71875,
"learning_rate": 1.5693069306930694e-06,
"loss": 1.6367,
"step": 1033
},
{
"epoch": 0.620387592248155,
"grad_norm": 1.65625,
"learning_rate": 1.566831683168317e-06,
"loss": 1.6377,
"step": 1034
},
{
"epoch": 0.6209875802483951,
"grad_norm": 1.71875,
"learning_rate": 1.5643564356435643e-06,
"loss": 1.6437,
"step": 1035
},
{
"epoch": 0.621587568248635,
"grad_norm": 1.625,
"learning_rate": 1.5618811881188118e-06,
"loss": 1.6799,
"step": 1036
},
{
"epoch": 0.622187556248875,
"grad_norm": 1.6640625,
"learning_rate": 1.5594059405940594e-06,
"loss": 1.6935,
"step": 1037
},
{
"epoch": 0.622787544249115,
"grad_norm": 1.6171875,
"learning_rate": 1.556930693069307e-06,
"loss": 1.782,
"step": 1038
},
{
"epoch": 0.6233875322493551,
"grad_norm": 1.8359375,
"learning_rate": 1.5544554455445545e-06,
"loss": 1.7106,
"step": 1039
},
{
"epoch": 0.623987520249595,
"grad_norm": 1.6328125,
"learning_rate": 1.551980198019802e-06,
"loss": 1.6429,
"step": 1040
},
{
"epoch": 0.624587508249835,
"grad_norm": 1.6328125,
"learning_rate": 1.5495049504950496e-06,
"loss": 1.6784,
"step": 1041
},
{
"epoch": 0.625187496250075,
"grad_norm": 1.671875,
"learning_rate": 1.547029702970297e-06,
"loss": 1.6739,
"step": 1042
},
{
"epoch": 0.6257874842503149,
"grad_norm": 1.8671875,
"learning_rate": 1.5445544554455445e-06,
"loss": 1.6582,
"step": 1043
},
{
"epoch": 0.626387472250555,
"grad_norm": 1.640625,
"learning_rate": 1.542079207920792e-06,
"loss": 1.6651,
"step": 1044
},
{
"epoch": 0.626987460250795,
"grad_norm": 1.6328125,
"learning_rate": 1.5396039603960396e-06,
"loss": 1.6378,
"step": 1045
},
{
"epoch": 0.627587448251035,
"grad_norm": 1.625,
"learning_rate": 1.5371287128712871e-06,
"loss": 1.7835,
"step": 1046
},
{
"epoch": 0.6281874362512749,
"grad_norm": 1.734375,
"learning_rate": 1.5346534653465347e-06,
"loss": 1.7662,
"step": 1047
},
{
"epoch": 0.628787424251515,
"grad_norm": 1.59375,
"learning_rate": 1.532178217821782e-06,
"loss": 1.6731,
"step": 1048
},
{
"epoch": 0.629387412251755,
"grad_norm": 1.59375,
"learning_rate": 1.5297029702970296e-06,
"loss": 1.671,
"step": 1049
},
{
"epoch": 0.629987400251995,
"grad_norm": 1.6484375,
"learning_rate": 1.5272277227722771e-06,
"loss": 1.7948,
"step": 1050
},
{
"epoch": 0.629987400251995,
"eval_loss": 1.777391791343689,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 69.2838,
"eval_samples_per_second": 144.334,
"eval_steps_per_second": 24.06,
"step": 1050
},
{
"epoch": 0.6305873882522349,
"grad_norm": 1.90625,
"learning_rate": 1.5247524752475247e-06,
"loss": 1.6642,
"step": 1051
},
{
"epoch": 0.6311873762524749,
"grad_norm": 1.6640625,
"learning_rate": 1.5222772277227723e-06,
"loss": 1.6911,
"step": 1052
},
{
"epoch": 0.631787364252715,
"grad_norm": 1.65625,
"learning_rate": 1.5198019801980198e-06,
"loss": 1.5992,
"step": 1053
},
{
"epoch": 0.632387352252955,
"grad_norm": 1.7421875,
"learning_rate": 1.5173267326732674e-06,
"loss": 1.6522,
"step": 1054
},
{
"epoch": 0.6329873402531949,
"grad_norm": 1.7734375,
"learning_rate": 1.5148514851485147e-06,
"loss": 1.6688,
"step": 1055
},
{
"epoch": 0.6335873282534349,
"grad_norm": 1.6796875,
"learning_rate": 1.5123762376237623e-06,
"loss": 1.6327,
"step": 1056
},
{
"epoch": 0.634187316253675,
"grad_norm": 1.59375,
"learning_rate": 1.5099009900990098e-06,
"loss": 1.6012,
"step": 1057
},
{
"epoch": 0.6347873042539149,
"grad_norm": 1.5625,
"learning_rate": 1.5074257425742574e-06,
"loss": 1.7478,
"step": 1058
},
{
"epoch": 0.6353872922541549,
"grad_norm": 1.7265625,
"learning_rate": 1.504950495049505e-06,
"loss": 1.6642,
"step": 1059
},
{
"epoch": 0.6359872802543949,
"grad_norm": 1.875,
"learning_rate": 1.5024752475247525e-06,
"loss": 1.6575,
"step": 1060
},
{
"epoch": 0.6365872682546349,
"grad_norm": 1.625,
"learning_rate": 1.5e-06,
"loss": 1.682,
"step": 1061
},
{
"epoch": 0.6371872562548749,
"grad_norm": 1.640625,
"learning_rate": 1.4975247524752474e-06,
"loss": 1.6323,
"step": 1062
},
{
"epoch": 0.6377872442551149,
"grad_norm": 1.65625,
"learning_rate": 1.495049504950495e-06,
"loss": 1.6041,
"step": 1063
},
{
"epoch": 0.6383872322553549,
"grad_norm": 1.8203125,
"learning_rate": 1.4925742574257425e-06,
"loss": 1.6278,
"step": 1064
},
{
"epoch": 0.6389872202555948,
"grad_norm": 1.765625,
"learning_rate": 1.49009900990099e-06,
"loss": 1.6691,
"step": 1065
},
{
"epoch": 0.6395872082558349,
"grad_norm": 1.546875,
"learning_rate": 1.4876237623762376e-06,
"loss": 1.6173,
"step": 1066
},
{
"epoch": 0.6401871962560749,
"grad_norm": 1.7734375,
"learning_rate": 1.4851485148514852e-06,
"loss": 1.6719,
"step": 1067
},
{
"epoch": 0.6407871842563149,
"grad_norm": 1.6015625,
"learning_rate": 1.4826732673267325e-06,
"loss": 1.6717,
"step": 1068
},
{
"epoch": 0.6413871722565548,
"grad_norm": 1.5703125,
"learning_rate": 1.48019801980198e-06,
"loss": 1.6107,
"step": 1069
},
{
"epoch": 0.6419871602567948,
"grad_norm": 1.71875,
"learning_rate": 1.4777227722772276e-06,
"loss": 1.6747,
"step": 1070
},
{
"epoch": 0.6425871482570349,
"grad_norm": 1.59375,
"learning_rate": 1.4752475247524752e-06,
"loss": 1.5889,
"step": 1071
},
{
"epoch": 0.6431871362572749,
"grad_norm": 1.5859375,
"learning_rate": 1.4727722772277227e-06,
"loss": 1.6483,
"step": 1072
},
{
"epoch": 0.6437871242575148,
"grad_norm": 1.7578125,
"learning_rate": 1.4702970297029703e-06,
"loss": 1.713,
"step": 1073
},
{
"epoch": 0.6443871122577548,
"grad_norm": 1.6484375,
"learning_rate": 1.4678217821782178e-06,
"loss": 1.6684,
"step": 1074
},
{
"epoch": 0.6449871002579949,
"grad_norm": 1.6640625,
"learning_rate": 1.4653465346534652e-06,
"loss": 1.566,
"step": 1075
},
{
"epoch": 0.6455870882582349,
"grad_norm": 1.5546875,
"learning_rate": 1.4628712871287127e-06,
"loss": 1.6242,
"step": 1076
},
{
"epoch": 0.6461870762584748,
"grad_norm": 1.640625,
"learning_rate": 1.4603960396039603e-06,
"loss": 1.613,
"step": 1077
},
{
"epoch": 0.6467870642587148,
"grad_norm": 1.6640625,
"learning_rate": 1.4579207920792078e-06,
"loss": 1.595,
"step": 1078
},
{
"epoch": 0.6473870522589549,
"grad_norm": 1.640625,
"learning_rate": 1.4554455445544554e-06,
"loss": 1.6096,
"step": 1079
},
{
"epoch": 0.6479870402591948,
"grad_norm": 1.703125,
"learning_rate": 1.452970297029703e-06,
"loss": 1.6353,
"step": 1080
},
{
"epoch": 0.6485870282594348,
"grad_norm": 2.03125,
"learning_rate": 1.4504950495049503e-06,
"loss": 1.6858,
"step": 1081
},
{
"epoch": 0.6491870162596748,
"grad_norm": 1.609375,
"learning_rate": 1.4480198019801978e-06,
"loss": 1.6339,
"step": 1082
},
{
"epoch": 0.6497870042599148,
"grad_norm": 1.8515625,
"learning_rate": 1.4455445544554454e-06,
"loss": 1.6156,
"step": 1083
},
{
"epoch": 0.6503869922601548,
"grad_norm": 1.703125,
"learning_rate": 1.443069306930693e-06,
"loss": 1.6629,
"step": 1084
},
{
"epoch": 0.6509869802603948,
"grad_norm": 1.734375,
"learning_rate": 1.4405940594059405e-06,
"loss": 1.6012,
"step": 1085
},
{
"epoch": 0.6515869682606348,
"grad_norm": 1.75,
"learning_rate": 1.438118811881188e-06,
"loss": 1.7112,
"step": 1086
},
{
"epoch": 0.6521869562608748,
"grad_norm": 1.6796875,
"learning_rate": 1.4356435643564356e-06,
"loss": 1.7422,
"step": 1087
},
{
"epoch": 0.6527869442611148,
"grad_norm": 1.7421875,
"learning_rate": 1.433168316831683e-06,
"loss": 1.5988,
"step": 1088
},
{
"epoch": 0.6533869322613548,
"grad_norm": 1.6640625,
"learning_rate": 1.4306930693069305e-06,
"loss": 1.631,
"step": 1089
},
{
"epoch": 0.6539869202615948,
"grad_norm": 1.84375,
"learning_rate": 1.428217821782178e-06,
"loss": 1.7748,
"step": 1090
},
{
"epoch": 0.6545869082618347,
"grad_norm": 1.6875,
"learning_rate": 1.4257425742574256e-06,
"loss": 1.6101,
"step": 1091
},
{
"epoch": 0.6551868962620747,
"grad_norm": 1.578125,
"learning_rate": 1.4232673267326732e-06,
"loss": 1.6727,
"step": 1092
},
{
"epoch": 0.6557868842623148,
"grad_norm": 1.765625,
"learning_rate": 1.4207920792079207e-06,
"loss": 1.656,
"step": 1093
},
{
"epoch": 0.6563868722625548,
"grad_norm": 1.84375,
"learning_rate": 1.4183168316831683e-06,
"loss": 1.7244,
"step": 1094
},
{
"epoch": 0.6569868602627947,
"grad_norm": 1.703125,
"learning_rate": 1.4158415841584156e-06,
"loss": 1.6891,
"step": 1095
},
{
"epoch": 0.6575868482630347,
"grad_norm": 1.7265625,
"learning_rate": 1.4133663366336632e-06,
"loss": 1.7265,
"step": 1096
},
{
"epoch": 0.6581868362632748,
"grad_norm": 1.6796875,
"learning_rate": 1.4108910891089107e-06,
"loss": 1.5939,
"step": 1097
},
{
"epoch": 0.6587868242635148,
"grad_norm": 1.765625,
"learning_rate": 1.4084158415841583e-06,
"loss": 1.6942,
"step": 1098
},
{
"epoch": 0.6593868122637547,
"grad_norm": 1.6796875,
"learning_rate": 1.4059405940594058e-06,
"loss": 1.5412,
"step": 1099
},
{
"epoch": 0.6599868002639947,
"grad_norm": 1.5703125,
"learning_rate": 1.4034653465346534e-06,
"loss": 1.5955,
"step": 1100
},
{
"epoch": 0.6599868002639947,
"eval_loss": 1.7773131132125854,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 66.1015,
"eval_samples_per_second": 151.283,
"eval_steps_per_second": 25.219,
"step": 1100
},
{
"epoch": 0.6605867882642347,
"grad_norm": 1.65625,
"learning_rate": 1.4009900990099007e-06,
"loss": 1.665,
"step": 1101
},
{
"epoch": 0.6611867762644748,
"grad_norm": 1.515625,
"learning_rate": 1.3985148514851483e-06,
"loss": 1.6523,
"step": 1102
},
{
"epoch": 0.6617867642647147,
"grad_norm": 1.640625,
"learning_rate": 1.3960396039603959e-06,
"loss": 1.5928,
"step": 1103
},
{
"epoch": 0.6623867522649547,
"grad_norm": 1.6796875,
"learning_rate": 1.3935643564356434e-06,
"loss": 1.5513,
"step": 1104
},
{
"epoch": 0.6629867402651947,
"grad_norm": 1.8359375,
"learning_rate": 1.391089108910891e-06,
"loss": 1.7027,
"step": 1105
},
{
"epoch": 0.6635867282654347,
"grad_norm": 1.65625,
"learning_rate": 1.3886138613861385e-06,
"loss": 1.6501,
"step": 1106
},
{
"epoch": 0.6641867162656747,
"grad_norm": 1.5625,
"learning_rate": 1.386138613861386e-06,
"loss": 1.6262,
"step": 1107
},
{
"epoch": 0.6647867042659147,
"grad_norm": 1.546875,
"learning_rate": 1.3836633663366334e-06,
"loss": 1.6506,
"step": 1108
},
{
"epoch": 0.6653866922661547,
"grad_norm": 1.625,
"learning_rate": 1.381188118811881e-06,
"loss": 1.6444,
"step": 1109
},
{
"epoch": 0.6659866802663946,
"grad_norm": 1.640625,
"learning_rate": 1.3787128712871285e-06,
"loss": 1.6403,
"step": 1110
},
{
"epoch": 0.6665866682666347,
"grad_norm": 1.515625,
"learning_rate": 1.376237623762376e-06,
"loss": 1.6624,
"step": 1111
},
{
"epoch": 0.6671866562668747,
"grad_norm": 1.8515625,
"learning_rate": 1.3737623762376238e-06,
"loss": 1.7689,
"step": 1112
},
{
"epoch": 0.6677866442671146,
"grad_norm": 1.7421875,
"learning_rate": 1.3712871287128714e-06,
"loss": 1.661,
"step": 1113
},
{
"epoch": 0.6683866322673546,
"grad_norm": 2.03125,
"learning_rate": 1.368811881188119e-06,
"loss": 1.6815,
"step": 1114
},
{
"epoch": 0.6689866202675947,
"grad_norm": 1.6484375,
"learning_rate": 1.3663366336633663e-06,
"loss": 1.6806,
"step": 1115
},
{
"epoch": 0.6695866082678347,
"grad_norm": 1.625,
"learning_rate": 1.3638613861386139e-06,
"loss": 1.6907,
"step": 1116
},
{
"epoch": 0.6701865962680746,
"grad_norm": 1.6796875,
"learning_rate": 1.3613861386138614e-06,
"loss": 1.6261,
"step": 1117
},
{
"epoch": 0.6707865842683146,
"grad_norm": 1.6171875,
"learning_rate": 1.358910891089109e-06,
"loss": 1.7169,
"step": 1118
},
{
"epoch": 0.6713865722685546,
"grad_norm": 1.7734375,
"learning_rate": 1.3564356435643565e-06,
"loss": 1.5983,
"step": 1119
},
{
"epoch": 0.6719865602687947,
"grad_norm": 1.6328125,
"learning_rate": 1.353960396039604e-06,
"loss": 1.6789,
"step": 1120
},
{
"epoch": 0.6725865482690346,
"grad_norm": 1.703125,
"learning_rate": 1.3514851485148514e-06,
"loss": 1.6336,
"step": 1121
},
{
"epoch": 0.6731865362692746,
"grad_norm": 1.671875,
"learning_rate": 1.349009900990099e-06,
"loss": 1.7563,
"step": 1122
},
{
"epoch": 0.6737865242695146,
"grad_norm": 1.7578125,
"learning_rate": 1.3465346534653465e-06,
"loss": 1.6033,
"step": 1123
},
{
"epoch": 0.6743865122697547,
"grad_norm": 1.6640625,
"learning_rate": 1.344059405940594e-06,
"loss": 1.6741,
"step": 1124
},
{
"epoch": 0.6749865002699946,
"grad_norm": 1.5234375,
"learning_rate": 1.3415841584158416e-06,
"loss": 1.5703,
"step": 1125
},
{
"epoch": 0.6755864882702346,
"grad_norm": 1.5546875,
"learning_rate": 1.3391089108910892e-06,
"loss": 1.7107,
"step": 1126
},
{
"epoch": 0.6761864762704746,
"grad_norm": 1.640625,
"learning_rate": 1.3366336633663367e-06,
"loss": 1.6928,
"step": 1127
},
{
"epoch": 0.6767864642707145,
"grad_norm": 1.640625,
"learning_rate": 1.334158415841584e-06,
"loss": 1.6344,
"step": 1128
},
{
"epoch": 0.6773864522709546,
"grad_norm": 1.6015625,
"learning_rate": 1.3316831683168316e-06,
"loss": 1.5788,
"step": 1129
},
{
"epoch": 0.6779864402711946,
"grad_norm": 1.609375,
"learning_rate": 1.3292079207920792e-06,
"loss": 1.6234,
"step": 1130
},
{
"epoch": 0.6785864282714346,
"grad_norm": 1.8046875,
"learning_rate": 1.3267326732673268e-06,
"loss": 1.6987,
"step": 1131
},
{
"epoch": 0.6791864162716745,
"grad_norm": 1.7265625,
"learning_rate": 1.3242574257425743e-06,
"loss": 1.5971,
"step": 1132
},
{
"epoch": 0.6797864042719146,
"grad_norm": 1.75,
"learning_rate": 1.3217821782178219e-06,
"loss": 1.6059,
"step": 1133
},
{
"epoch": 0.6803863922721546,
"grad_norm": 1.6640625,
"learning_rate": 1.3193069306930692e-06,
"loss": 1.6445,
"step": 1134
},
{
"epoch": 0.6809863802723946,
"grad_norm": 1.828125,
"learning_rate": 1.3168316831683168e-06,
"loss": 1.6312,
"step": 1135
},
{
"epoch": 0.6815863682726345,
"grad_norm": 1.7578125,
"learning_rate": 1.3143564356435643e-06,
"loss": 1.6458,
"step": 1136
},
{
"epoch": 0.6821863562728745,
"grad_norm": 1.796875,
"learning_rate": 1.3118811881188119e-06,
"loss": 1.6534,
"step": 1137
},
{
"epoch": 0.6827863442731146,
"grad_norm": 1.515625,
"learning_rate": 1.3094059405940594e-06,
"loss": 1.6041,
"step": 1138
},
{
"epoch": 0.6833863322733545,
"grad_norm": 1.921875,
"learning_rate": 1.306930693069307e-06,
"loss": 1.7073,
"step": 1139
},
{
"epoch": 0.6839863202735945,
"grad_norm": 1.671875,
"learning_rate": 1.3044554455445545e-06,
"loss": 1.7033,
"step": 1140
},
{
"epoch": 0.6845863082738345,
"grad_norm": 1.546875,
"learning_rate": 1.3019801980198019e-06,
"loss": 1.6317,
"step": 1141
},
{
"epoch": 0.6851862962740746,
"grad_norm": 1.734375,
"learning_rate": 1.2995049504950494e-06,
"loss": 1.6607,
"step": 1142
},
{
"epoch": 0.6857862842743145,
"grad_norm": 1.609375,
"learning_rate": 1.297029702970297e-06,
"loss": 1.6282,
"step": 1143
},
{
"epoch": 0.6863862722745545,
"grad_norm": 1.7109375,
"learning_rate": 1.2945544554455445e-06,
"loss": 1.7765,
"step": 1144
},
{
"epoch": 0.6869862602747945,
"grad_norm": 1.640625,
"learning_rate": 1.292079207920792e-06,
"loss": 1.6843,
"step": 1145
},
{
"epoch": 0.6875862482750345,
"grad_norm": 1.5859375,
"learning_rate": 1.2896039603960396e-06,
"loss": 1.6195,
"step": 1146
},
{
"epoch": 0.6881862362752745,
"grad_norm": 1.6328125,
"learning_rate": 1.2871287128712872e-06,
"loss": 1.6546,
"step": 1147
},
{
"epoch": 0.6887862242755145,
"grad_norm": 1.59375,
"learning_rate": 1.2846534653465345e-06,
"loss": 1.6723,
"step": 1148
},
{
"epoch": 0.6893862122757545,
"grad_norm": 1.6640625,
"learning_rate": 1.282178217821782e-06,
"loss": 1.6211,
"step": 1149
},
{
"epoch": 0.6899862002759944,
"grad_norm": 1.5625,
"learning_rate": 1.2797029702970297e-06,
"loss": 1.7107,
"step": 1150
},
{
"epoch": 0.6899862002759944,
"eval_loss": 1.7773100137710571,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 66.2071,
"eval_samples_per_second": 151.041,
"eval_steps_per_second": 25.179,
"step": 1150
},
{
"epoch": 0.6905861882762345,
"grad_norm": 1.6796875,
"learning_rate": 1.2772277227722772e-06,
"loss": 1.6873,
"step": 1151
},
{
"epoch": 0.6911861762764745,
"grad_norm": 1.5859375,
"learning_rate": 1.2747524752475248e-06,
"loss": 1.5066,
"step": 1152
},
{
"epoch": 0.6917861642767145,
"grad_norm": 1.640625,
"learning_rate": 1.2722772277227723e-06,
"loss": 1.5879,
"step": 1153
},
{
"epoch": 0.6923861522769544,
"grad_norm": 1.5390625,
"learning_rate": 1.2698019801980197e-06,
"loss": 1.7176,
"step": 1154
},
{
"epoch": 0.6929861402771944,
"grad_norm": 1.5625,
"learning_rate": 1.2673267326732672e-06,
"loss": 1.6146,
"step": 1155
},
{
"epoch": 0.6935861282774345,
"grad_norm": 1.8359375,
"learning_rate": 1.2648514851485148e-06,
"loss": 1.6275,
"step": 1156
},
{
"epoch": 0.6941861162776745,
"grad_norm": 1.6171875,
"learning_rate": 1.2623762376237623e-06,
"loss": 1.6011,
"step": 1157
},
{
"epoch": 0.6947861042779144,
"grad_norm": 1.6171875,
"learning_rate": 1.2599009900990099e-06,
"loss": 1.7129,
"step": 1158
},
{
"epoch": 0.6953860922781544,
"grad_norm": 1.671875,
"learning_rate": 1.2574257425742574e-06,
"loss": 1.6008,
"step": 1159
},
{
"epoch": 0.6959860802783945,
"grad_norm": 1.7265625,
"learning_rate": 1.254950495049505e-06,
"loss": 1.6409,
"step": 1160
},
{
"epoch": 0.6965860682786345,
"grad_norm": 1.7578125,
"learning_rate": 1.2524752475247523e-06,
"loss": 1.7206,
"step": 1161
},
{
"epoch": 0.6971860562788744,
"grad_norm": 1.6484375,
"learning_rate": 1.2499999999999999e-06,
"loss": 1.6677,
"step": 1162
},
{
"epoch": 0.6977860442791144,
"grad_norm": 1.71875,
"learning_rate": 1.2475247524752474e-06,
"loss": 1.4656,
"step": 1163
},
{
"epoch": 0.6983860322793544,
"grad_norm": 1.703125,
"learning_rate": 1.245049504950495e-06,
"loss": 1.6027,
"step": 1164
},
{
"epoch": 0.6989860202795944,
"grad_norm": 1.6640625,
"learning_rate": 1.2425742574257426e-06,
"loss": 1.7318,
"step": 1165
},
{
"epoch": 0.6995860082798344,
"grad_norm": 1.734375,
"learning_rate": 1.2400990099009901e-06,
"loss": 1.6006,
"step": 1166
},
{
"epoch": 0.7001859962800744,
"grad_norm": 1.6328125,
"learning_rate": 1.2376237623762375e-06,
"loss": 1.6734,
"step": 1167
},
{
"epoch": 0.7007859842803144,
"grad_norm": 1.7421875,
"learning_rate": 1.235148514851485e-06,
"loss": 1.7149,
"step": 1168
},
{
"epoch": 0.7013859722805544,
"grad_norm": 1.8515625,
"learning_rate": 1.2326732673267326e-06,
"loss": 1.7003,
"step": 1169
},
{
"epoch": 0.7019859602807944,
"grad_norm": 1.578125,
"learning_rate": 1.2301980198019801e-06,
"loss": 1.5693,
"step": 1170
},
{
"epoch": 0.7025859482810344,
"grad_norm": 1.6015625,
"learning_rate": 1.2277227722772277e-06,
"loss": 1.6384,
"step": 1171
},
{
"epoch": 0.7031859362812743,
"grad_norm": 1.6953125,
"learning_rate": 1.2252475247524752e-06,
"loss": 1.6105,
"step": 1172
},
{
"epoch": 0.7037859242815143,
"grad_norm": 1.75,
"learning_rate": 1.2227722772277228e-06,
"loss": 1.5411,
"step": 1173
},
{
"epoch": 0.7043859122817544,
"grad_norm": 1.6796875,
"learning_rate": 1.2202970297029701e-06,
"loss": 1.677,
"step": 1174
},
{
"epoch": 0.7049859002819944,
"grad_norm": 1.6640625,
"learning_rate": 1.2178217821782177e-06,
"loss": 1.7706,
"step": 1175
},
{
"epoch": 0.7055858882822343,
"grad_norm": 1.4921875,
"learning_rate": 1.2153465346534652e-06,
"loss": 1.666,
"step": 1176
},
{
"epoch": 0.7061858762824743,
"grad_norm": 1.578125,
"learning_rate": 1.2128712871287128e-06,
"loss": 1.6809,
"step": 1177
},
{
"epoch": 0.7067858642827144,
"grad_norm": 1.6171875,
"learning_rate": 1.2103960396039603e-06,
"loss": 1.6347,
"step": 1178
},
{
"epoch": 0.7073858522829544,
"grad_norm": 1.640625,
"learning_rate": 1.207920792079208e-06,
"loss": 1.7277,
"step": 1179
},
{
"epoch": 0.7079858402831943,
"grad_norm": 1.75,
"learning_rate": 1.2054455445544555e-06,
"loss": 1.5418,
"step": 1180
},
{
"epoch": 0.7085858282834343,
"grad_norm": 1.5078125,
"learning_rate": 1.2029702970297028e-06,
"loss": 1.6748,
"step": 1181
},
{
"epoch": 0.7091858162836743,
"grad_norm": 1.6953125,
"learning_rate": 1.2004950495049504e-06,
"loss": 1.6577,
"step": 1182
},
{
"epoch": 0.7097858042839144,
"grad_norm": 1.6796875,
"learning_rate": 1.198019801980198e-06,
"loss": 1.7349,
"step": 1183
},
{
"epoch": 0.7103857922841543,
"grad_norm": 1.65625,
"learning_rate": 1.1955445544554455e-06,
"loss": 1.5652,
"step": 1184
},
{
"epoch": 0.7109857802843943,
"grad_norm": 1.6875,
"learning_rate": 1.193069306930693e-06,
"loss": 1.6844,
"step": 1185
},
{
"epoch": 0.7115857682846343,
"grad_norm": 1.625,
"learning_rate": 1.1905940594059406e-06,
"loss": 1.6704,
"step": 1186
},
{
"epoch": 0.7121857562848743,
"grad_norm": 1.6953125,
"learning_rate": 1.188118811881188e-06,
"loss": 1.6941,
"step": 1187
},
{
"epoch": 0.7127857442851143,
"grad_norm": 1.609375,
"learning_rate": 1.1856435643564355e-06,
"loss": 1.6569,
"step": 1188
},
{
"epoch": 0.7133857322853543,
"grad_norm": 1.7578125,
"learning_rate": 1.183168316831683e-06,
"loss": 1.749,
"step": 1189
},
{
"epoch": 0.7139857202855943,
"grad_norm": 1.5703125,
"learning_rate": 1.1806930693069306e-06,
"loss": 1.6929,
"step": 1190
},
{
"epoch": 0.7145857082858342,
"grad_norm": 1.5859375,
"learning_rate": 1.1782178217821781e-06,
"loss": 1.645,
"step": 1191
},
{
"epoch": 0.7151856962860743,
"grad_norm": 1.59375,
"learning_rate": 1.1757425742574257e-06,
"loss": 1.6773,
"step": 1192
},
{
"epoch": 0.7157856842863143,
"grad_norm": 1.6640625,
"learning_rate": 1.1732673267326732e-06,
"loss": 1.7084,
"step": 1193
},
{
"epoch": 0.7163856722865543,
"grad_norm": 1.59375,
"learning_rate": 1.1707920792079206e-06,
"loss": 1.5868,
"step": 1194
},
{
"epoch": 0.7169856602867942,
"grad_norm": 1.7734375,
"learning_rate": 1.1683168316831681e-06,
"loss": 1.5783,
"step": 1195
},
{
"epoch": 0.7175856482870343,
"grad_norm": 1.7421875,
"learning_rate": 1.1658415841584157e-06,
"loss": 1.7283,
"step": 1196
},
{
"epoch": 0.7181856362872743,
"grad_norm": 1.640625,
"learning_rate": 1.1633663366336632e-06,
"loss": 1.6033,
"step": 1197
},
{
"epoch": 0.7187856242875142,
"grad_norm": 1.6640625,
"learning_rate": 1.1608910891089108e-06,
"loss": 1.7633,
"step": 1198
},
{
"epoch": 0.7193856122877542,
"grad_norm": 1.890625,
"learning_rate": 1.1584158415841584e-06,
"loss": 1.6804,
"step": 1199
},
{
"epoch": 0.7199856002879942,
"grad_norm": 1.6328125,
"learning_rate": 1.1559405940594057e-06,
"loss": 1.5985,
"step": 1200
},
{
"epoch": 0.7199856002879942,
"eval_loss": 1.777273416519165,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 68.7207,
"eval_samples_per_second": 145.517,
"eval_steps_per_second": 24.258,
"step": 1200
}
],
"logging_steps": 1,
"max_steps": 1666,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.356578165633057e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}