{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 966,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003105590062111801,
"grad_norm": 60.46764647043296,
"learning_rate": 5.154639175257732e-07,
"loss": 11.059,
"step": 1
},
{
"epoch": 0.006211180124223602,
"grad_norm": 60.76414526284177,
"learning_rate": 1.0309278350515464e-06,
"loss": 11.012,
"step": 2
},
{
"epoch": 0.009316770186335404,
"grad_norm": 59.20725802433676,
"learning_rate": 1.5463917525773197e-06,
"loss": 11.1319,
"step": 3
},
{
"epoch": 0.012422360248447204,
"grad_norm": 60.61459028364047,
"learning_rate": 2.061855670103093e-06,
"loss": 11.03,
"step": 4
},
{
"epoch": 0.015527950310559006,
"grad_norm": 64.67058884756766,
"learning_rate": 2.577319587628866e-06,
"loss": 10.8306,
"step": 5
},
{
"epoch": 0.018633540372670808,
"grad_norm": 70.79205232466896,
"learning_rate": 3.0927835051546395e-06,
"loss": 10.6598,
"step": 6
},
{
"epoch": 0.021739130434782608,
"grad_norm": 98.53755227750479,
"learning_rate": 3.608247422680412e-06,
"loss": 9.4929,
"step": 7
},
{
"epoch": 0.024844720496894408,
"grad_norm": 111.40861108613349,
"learning_rate": 4.123711340206186e-06,
"loss": 9.0865,
"step": 8
},
{
"epoch": 0.027950310559006212,
"grad_norm": 127.60092525337744,
"learning_rate": 4.639175257731959e-06,
"loss": 8.316,
"step": 9
},
{
"epoch": 0.031055900621118012,
"grad_norm": 57.38601512479266,
"learning_rate": 5.154639175257732e-06,
"loss": 3.5203,
"step": 10
},
{
"epoch": 0.034161490683229816,
"grad_norm": 40.087017160526905,
"learning_rate": 5.670103092783505e-06,
"loss": 2.5941,
"step": 11
},
{
"epoch": 0.037267080745341616,
"grad_norm": 36.371937459465364,
"learning_rate": 6.185567010309279e-06,
"loss": 2.3632,
"step": 12
},
{
"epoch": 0.040372670807453416,
"grad_norm": 5.902085049785153,
"learning_rate": 6.701030927835052e-06,
"loss": 1.3097,
"step": 13
},
{
"epoch": 0.043478260869565216,
"grad_norm": 4.395696518971964,
"learning_rate": 7.216494845360824e-06,
"loss": 1.2339,
"step": 14
},
{
"epoch": 0.046583850931677016,
"grad_norm": 3.1604888794138524,
"learning_rate": 7.731958762886599e-06,
"loss": 1.1463,
"step": 15
},
{
"epoch": 0.049689440993788817,
"grad_norm": 2.399444834184007,
"learning_rate": 8.247422680412371e-06,
"loss": 1.0723,
"step": 16
},
{
"epoch": 0.052795031055900624,
"grad_norm": 1.5487302596438641,
"learning_rate": 8.762886597938144e-06,
"loss": 0.9467,
"step": 17
},
{
"epoch": 0.055900621118012424,
"grad_norm": 80.99605354011946,
"learning_rate": 9.278350515463918e-06,
"loss": 0.9669,
"step": 18
},
{
"epoch": 0.059006211180124224,
"grad_norm": 35.062984197987575,
"learning_rate": 9.793814432989691e-06,
"loss": 0.8903,
"step": 19
},
{
"epoch": 0.062111801242236024,
"grad_norm": 1.7039403556284178,
"learning_rate": 1.0309278350515464e-05,
"loss": 0.8611,
"step": 20
},
{
"epoch": 0.06521739130434782,
"grad_norm": 1.1286556630596418,
"learning_rate": 1.0824742268041238e-05,
"loss": 0.7956,
"step": 21
},
{
"epoch": 0.06832298136645963,
"grad_norm": 0.8885421752095347,
"learning_rate": 1.134020618556701e-05,
"loss": 0.7944,
"step": 22
},
{
"epoch": 0.07142857142857142,
"grad_norm": 0.7771123311111944,
"learning_rate": 1.1855670103092783e-05,
"loss": 0.7888,
"step": 23
},
{
"epoch": 0.07453416149068323,
"grad_norm": 0.8290301807562498,
"learning_rate": 1.2371134020618558e-05,
"loss": 0.7524,
"step": 24
},
{
"epoch": 0.07763975155279502,
"grad_norm": 0.9178800987434453,
"learning_rate": 1.2886597938144329e-05,
"loss": 0.7276,
"step": 25
},
{
"epoch": 0.08074534161490683,
"grad_norm": 0.7280169831391284,
"learning_rate": 1.3402061855670103e-05,
"loss": 0.7049,
"step": 26
},
{
"epoch": 0.08385093167701864,
"grad_norm": 0.590489381671068,
"learning_rate": 1.3917525773195878e-05,
"loss": 0.6846,
"step": 27
},
{
"epoch": 0.08695652173913043,
"grad_norm": 0.6956647472162396,
"learning_rate": 1.4432989690721649e-05,
"loss": 0.6518,
"step": 28
},
{
"epoch": 0.09006211180124224,
"grad_norm": 0.7233291715436561,
"learning_rate": 1.4948453608247423e-05,
"loss": 0.6575,
"step": 29
},
{
"epoch": 0.09316770186335403,
"grad_norm": 0.6557463930133224,
"learning_rate": 1.5463917525773197e-05,
"loss": 0.6648,
"step": 30
},
{
"epoch": 0.09627329192546584,
"grad_norm": 0.5940038812473861,
"learning_rate": 1.5979381443298968e-05,
"loss": 0.6414,
"step": 31
},
{
"epoch": 0.09937888198757763,
"grad_norm": 0.514015705745489,
"learning_rate": 1.6494845360824743e-05,
"loss": 0.6178,
"step": 32
},
{
"epoch": 0.10248447204968944,
"grad_norm": 0.5977361497140969,
"learning_rate": 1.7010309278350517e-05,
"loss": 0.6216,
"step": 33
},
{
"epoch": 0.10559006211180125,
"grad_norm": 0.5377773183845758,
"learning_rate": 1.7525773195876288e-05,
"loss": 0.6195,
"step": 34
},
{
"epoch": 0.10869565217391304,
"grad_norm": 0.40152764208172104,
"learning_rate": 1.8041237113402062e-05,
"loss": 0.5758,
"step": 35
},
{
"epoch": 0.11180124223602485,
"grad_norm": 0.40244189444549017,
"learning_rate": 1.8556701030927837e-05,
"loss": 0.6178,
"step": 36
},
{
"epoch": 0.11490683229813664,
"grad_norm": 0.49886656483811526,
"learning_rate": 1.9072164948453608e-05,
"loss": 0.6062,
"step": 37
},
{
"epoch": 0.11801242236024845,
"grad_norm": 0.43178714425173426,
"learning_rate": 1.9587628865979382e-05,
"loss": 0.5929,
"step": 38
},
{
"epoch": 0.12111801242236025,
"grad_norm": 0.37953785852942284,
"learning_rate": 2.0103092783505157e-05,
"loss": 0.57,
"step": 39
},
{
"epoch": 0.12422360248447205,
"grad_norm": 0.3712229743609745,
"learning_rate": 2.0618556701030927e-05,
"loss": 0.5812,
"step": 40
},
{
"epoch": 0.12732919254658384,
"grad_norm": 0.38350882873215847,
"learning_rate": 2.1134020618556702e-05,
"loss": 0.5714,
"step": 41
},
{
"epoch": 0.13043478260869565,
"grad_norm": 0.4036659557430701,
"learning_rate": 2.1649484536082476e-05,
"loss": 0.5813,
"step": 42
},
{
"epoch": 0.13354037267080746,
"grad_norm": 0.33097703493186653,
"learning_rate": 2.2164948453608247e-05,
"loss": 0.5537,
"step": 43
},
{
"epoch": 0.13664596273291926,
"grad_norm": 0.339069211939581,
"learning_rate": 2.268041237113402e-05,
"loss": 0.57,
"step": 44
},
{
"epoch": 0.13975155279503104,
"grad_norm": 0.34080115423530455,
"learning_rate": 2.3195876288659796e-05,
"loss": 0.5434,
"step": 45
},
{
"epoch": 0.14285714285714285,
"grad_norm": 0.3309319915944845,
"learning_rate": 2.3711340206185567e-05,
"loss": 0.5436,
"step": 46
},
{
"epoch": 0.14596273291925466,
"grad_norm": 0.36258498646852527,
"learning_rate": 2.422680412371134e-05,
"loss": 0.5372,
"step": 47
},
{
"epoch": 0.14906832298136646,
"grad_norm": 0.3289309195150263,
"learning_rate": 2.4742268041237116e-05,
"loss": 0.5519,
"step": 48
},
{
"epoch": 0.15217391304347827,
"grad_norm": 0.29200888913110107,
"learning_rate": 2.5257731958762887e-05,
"loss": 0.5269,
"step": 49
},
{
"epoch": 0.15527950310559005,
"grad_norm": 0.2913726775318078,
"learning_rate": 2.5773195876288658e-05,
"loss": 0.5398,
"step": 50
},
{
"epoch": 0.15838509316770186,
"grad_norm": 0.36183923103400334,
"learning_rate": 2.6288659793814435e-05,
"loss": 0.5313,
"step": 51
},
{
"epoch": 0.16149068322981366,
"grad_norm": 0.289832432081365,
"learning_rate": 2.6804123711340206e-05,
"loss": 0.5294,
"step": 52
},
{
"epoch": 0.16459627329192547,
"grad_norm": 0.28159321988499836,
"learning_rate": 2.7319587628865977e-05,
"loss": 0.5102,
"step": 53
},
{
"epoch": 0.16770186335403728,
"grad_norm": 0.33289230730425107,
"learning_rate": 2.7835051546391755e-05,
"loss": 0.5325,
"step": 54
},
{
"epoch": 0.17080745341614906,
"grad_norm": 0.2711500030362234,
"learning_rate": 2.8350515463917526e-05,
"loss": 0.5203,
"step": 55
},
{
"epoch": 0.17391304347826086,
"grad_norm": 0.2675188946251961,
"learning_rate": 2.8865979381443297e-05,
"loss": 0.5224,
"step": 56
},
{
"epoch": 0.17701863354037267,
"grad_norm": 0.26579895922328955,
"learning_rate": 2.9381443298969075e-05,
"loss": 0.5294,
"step": 57
},
{
"epoch": 0.18012422360248448,
"grad_norm": 0.2489797381353846,
"learning_rate": 2.9896907216494846e-05,
"loss": 0.5111,
"step": 58
},
{
"epoch": 0.18322981366459629,
"grad_norm": 0.26304984714934,
"learning_rate": 3.0412371134020617e-05,
"loss": 0.5063,
"step": 59
},
{
"epoch": 0.18633540372670807,
"grad_norm": 0.29536218486713367,
"learning_rate": 3.0927835051546395e-05,
"loss": 0.5278,
"step": 60
},
{
"epoch": 0.18944099378881987,
"grad_norm": 0.2629951129122119,
"learning_rate": 3.1443298969072166e-05,
"loss": 0.5066,
"step": 61
},
{
"epoch": 0.19254658385093168,
"grad_norm": 0.324166573780215,
"learning_rate": 3.1958762886597937e-05,
"loss": 0.5054,
"step": 62
},
{
"epoch": 0.1956521739130435,
"grad_norm": 0.2729720585938641,
"learning_rate": 3.2474226804123714e-05,
"loss": 0.5142,
"step": 63
},
{
"epoch": 0.19875776397515527,
"grad_norm": 0.27422169347085695,
"learning_rate": 3.2989690721649485e-05,
"loss": 0.5119,
"step": 64
},
{
"epoch": 0.20186335403726707,
"grad_norm": 0.26064941279629095,
"learning_rate": 3.3505154639175256e-05,
"loss": 0.5037,
"step": 65
},
{
"epoch": 0.20496894409937888,
"grad_norm": 0.2589323970095713,
"learning_rate": 3.4020618556701034e-05,
"loss": 0.5181,
"step": 66
},
{
"epoch": 0.2080745341614907,
"grad_norm": 0.2795495681392583,
"learning_rate": 3.4536082474226805e-05,
"loss": 0.5006,
"step": 67
},
{
"epoch": 0.2111801242236025,
"grad_norm": 0.2785747261533415,
"learning_rate": 3.5051546391752576e-05,
"loss": 0.483,
"step": 68
},
{
"epoch": 0.21428571428571427,
"grad_norm": 0.25302395243466885,
"learning_rate": 3.5567010309278354e-05,
"loss": 0.4883,
"step": 69
},
{
"epoch": 0.21739130434782608,
"grad_norm": 0.2752883227640764,
"learning_rate": 3.6082474226804125e-05,
"loss": 0.5094,
"step": 70
},
{
"epoch": 0.2204968944099379,
"grad_norm": 0.3024166121222451,
"learning_rate": 3.6597938144329896e-05,
"loss": 0.4881,
"step": 71
},
{
"epoch": 0.2236024844720497,
"grad_norm": 0.3097948500444575,
"learning_rate": 3.7113402061855674e-05,
"loss": 0.4839,
"step": 72
},
{
"epoch": 0.2267080745341615,
"grad_norm": 0.2876918544530116,
"learning_rate": 3.7628865979381445e-05,
"loss": 0.5144,
"step": 73
},
{
"epoch": 0.22981366459627328,
"grad_norm": 0.3416229447277982,
"learning_rate": 3.8144329896907216e-05,
"loss": 0.4961,
"step": 74
},
{
"epoch": 0.2329192546583851,
"grad_norm": 0.3199113220311117,
"learning_rate": 3.865979381443299e-05,
"loss": 0.473,
"step": 75
},
{
"epoch": 0.2360248447204969,
"grad_norm": 0.3005248837372916,
"learning_rate": 3.9175257731958764e-05,
"loss": 0.4869,
"step": 76
},
{
"epoch": 0.2391304347826087,
"grad_norm": 0.3020219962118337,
"learning_rate": 3.9690721649484535e-05,
"loss": 0.5047,
"step": 77
},
{
"epoch": 0.2422360248447205,
"grad_norm": 0.29698825337519646,
"learning_rate": 4.020618556701031e-05,
"loss": 0.5022,
"step": 78
},
{
"epoch": 0.2453416149068323,
"grad_norm": 0.3021333930392965,
"learning_rate": 4.0721649484536084e-05,
"loss": 0.4866,
"step": 79
},
{
"epoch": 0.2484472049689441,
"grad_norm": 0.29250713103592757,
"learning_rate": 4.1237113402061855e-05,
"loss": 0.4896,
"step": 80
},
{
"epoch": 0.2515527950310559,
"grad_norm": 0.27724800469538824,
"learning_rate": 4.175257731958763e-05,
"loss": 0.4836,
"step": 81
},
{
"epoch": 0.2546583850931677,
"grad_norm": 0.3272751041798097,
"learning_rate": 4.2268041237113404e-05,
"loss": 0.5079,
"step": 82
},
{
"epoch": 0.2577639751552795,
"grad_norm": 0.2875779003405876,
"learning_rate": 4.2783505154639175e-05,
"loss": 0.4822,
"step": 83
},
{
"epoch": 0.2608695652173913,
"grad_norm": 0.351548025457743,
"learning_rate": 4.329896907216495e-05,
"loss": 0.4719,
"step": 84
},
{
"epoch": 0.2639751552795031,
"grad_norm": 0.3104421632805538,
"learning_rate": 4.3814432989690723e-05,
"loss": 0.4985,
"step": 85
},
{
"epoch": 0.2670807453416149,
"grad_norm": 0.29340838316836443,
"learning_rate": 4.4329896907216494e-05,
"loss": 0.4506,
"step": 86
},
{
"epoch": 0.2701863354037267,
"grad_norm": 0.31888072280932184,
"learning_rate": 4.484536082474227e-05,
"loss": 0.4718,
"step": 87
},
{
"epoch": 0.2732919254658385,
"grad_norm": 0.2881905604568596,
"learning_rate": 4.536082474226804e-05,
"loss": 0.4718,
"step": 88
},
{
"epoch": 0.27639751552795033,
"grad_norm": 0.382391969622348,
"learning_rate": 4.5876288659793814e-05,
"loss": 0.489,
"step": 89
},
{
"epoch": 0.2795031055900621,
"grad_norm": 0.28677795566141734,
"learning_rate": 4.639175257731959e-05,
"loss": 0.4625,
"step": 90
},
{
"epoch": 0.2826086956521739,
"grad_norm": 0.44192895579293406,
"learning_rate": 4.690721649484536e-05,
"loss": 0.4901,
"step": 91
},
{
"epoch": 0.2857142857142857,
"grad_norm": 0.36788344235249887,
"learning_rate": 4.7422680412371134e-05,
"loss": 0.4682,
"step": 92
},
{
"epoch": 0.2888198757763975,
"grad_norm": 0.5143785581301379,
"learning_rate": 4.793814432989691e-05,
"loss": 0.4748,
"step": 93
},
{
"epoch": 0.2919254658385093,
"grad_norm": 0.3714484820764116,
"learning_rate": 4.845360824742268e-05,
"loss": 0.4733,
"step": 94
},
{
"epoch": 0.2950310559006211,
"grad_norm": 0.4411279949864707,
"learning_rate": 4.8969072164948454e-05,
"loss": 0.4719,
"step": 95
},
{
"epoch": 0.2981366459627329,
"grad_norm": 0.4095900221196949,
"learning_rate": 4.948453608247423e-05,
"loss": 0.4679,
"step": 96
},
{
"epoch": 0.30124223602484473,
"grad_norm": 0.3876387401039132,
"learning_rate": 5e-05,
"loss": 0.4727,
"step": 97
},
{
"epoch": 0.30434782608695654,
"grad_norm": 0.35671507475673714,
"learning_rate": 4.994246260069045e-05,
"loss": 0.4582,
"step": 98
},
{
"epoch": 0.30745341614906835,
"grad_norm": 0.40457113215141677,
"learning_rate": 4.98849252013809e-05,
"loss": 0.4817,
"step": 99
},
{
"epoch": 0.3105590062111801,
"grad_norm": 0.40014058749708475,
"learning_rate": 4.982738780207135e-05,
"loss": 0.4486,
"step": 100
},
{
"epoch": 0.3136645962732919,
"grad_norm": 0.4870121731575367,
"learning_rate": 4.97698504027618e-05,
"loss": 0.4663,
"step": 101
},
{
"epoch": 0.3167701863354037,
"grad_norm": 0.4340851079572886,
"learning_rate": 4.9712313003452246e-05,
"loss": 0.4484,
"step": 102
},
{
"epoch": 0.3198757763975155,
"grad_norm": 0.35686684080021636,
"learning_rate": 4.9654775604142695e-05,
"loss": 0.467,
"step": 103
},
{
"epoch": 0.32298136645962733,
"grad_norm": 0.4494359291517841,
"learning_rate": 4.9597238204833143e-05,
"loss": 0.4694,
"step": 104
},
{
"epoch": 0.32608695652173914,
"grad_norm": 0.4372407930618466,
"learning_rate": 4.953970080552359e-05,
"loss": 0.4648,
"step": 105
},
{
"epoch": 0.32919254658385094,
"grad_norm": 0.34466736034003903,
"learning_rate": 4.948216340621404e-05,
"loss": 0.4444,
"step": 106
},
{
"epoch": 0.33229813664596275,
"grad_norm": 0.4001800803927703,
"learning_rate": 4.942462600690449e-05,
"loss": 0.464,
"step": 107
},
{
"epoch": 0.33540372670807456,
"grad_norm": 0.3577590335432523,
"learning_rate": 4.936708860759494e-05,
"loss": 0.4647,
"step": 108
},
{
"epoch": 0.3385093167701863,
"grad_norm": 0.3827072494556767,
"learning_rate": 4.930955120828539e-05,
"loss": 0.4452,
"step": 109
},
{
"epoch": 0.3416149068322981,
"grad_norm": 0.40554119841147346,
"learning_rate": 4.9252013808975836e-05,
"loss": 0.457,
"step": 110
},
{
"epoch": 0.3447204968944099,
"grad_norm": 0.3980370218198526,
"learning_rate": 4.9194476409666285e-05,
"loss": 0.4566,
"step": 111
},
{
"epoch": 0.34782608695652173,
"grad_norm": 0.38595447982147235,
"learning_rate": 4.913693901035673e-05,
"loss": 0.4436,
"step": 112
},
{
"epoch": 0.35093167701863354,
"grad_norm": 0.3335566121887473,
"learning_rate": 4.907940161104718e-05,
"loss": 0.4525,
"step": 113
},
{
"epoch": 0.35403726708074534,
"grad_norm": 0.44048069823182057,
"learning_rate": 4.902186421173763e-05,
"loss": 0.4775,
"step": 114
},
{
"epoch": 0.35714285714285715,
"grad_norm": 0.3511836624614759,
"learning_rate": 4.896432681242808e-05,
"loss": 0.4529,
"step": 115
},
{
"epoch": 0.36024844720496896,
"grad_norm": 0.40512550088435406,
"learning_rate": 4.890678941311853e-05,
"loss": 0.4856,
"step": 116
},
{
"epoch": 0.36335403726708076,
"grad_norm": 0.4709820706303788,
"learning_rate": 4.884925201380898e-05,
"loss": 0.4613,
"step": 117
},
{
"epoch": 0.36645962732919257,
"grad_norm": 0.3163807878418199,
"learning_rate": 4.8791714614499426e-05,
"loss": 0.476,
"step": 118
},
{
"epoch": 0.3695652173913043,
"grad_norm": 0.421853544537181,
"learning_rate": 4.8734177215189874e-05,
"loss": 0.4675,
"step": 119
},
{
"epoch": 0.37267080745341613,
"grad_norm": 0.37140388109626665,
"learning_rate": 4.867663981588032e-05,
"loss": 0.452,
"step": 120
},
{
"epoch": 0.37577639751552794,
"grad_norm": 0.42352163355515543,
"learning_rate": 4.861910241657077e-05,
"loss": 0.4468,
"step": 121
},
{
"epoch": 0.37888198757763975,
"grad_norm": 0.4144419361914004,
"learning_rate": 4.856156501726122e-05,
"loss": 0.4526,
"step": 122
},
{
"epoch": 0.38198757763975155,
"grad_norm": 0.40675120816526916,
"learning_rate": 4.850402761795167e-05,
"loss": 0.4611,
"step": 123
},
{
"epoch": 0.38509316770186336,
"grad_norm": 0.5826147735025056,
"learning_rate": 4.844649021864212e-05,
"loss": 0.4803,
"step": 124
},
{
"epoch": 0.38819875776397517,
"grad_norm": 0.3282657199624206,
"learning_rate": 4.838895281933257e-05,
"loss": 0.4552,
"step": 125
},
{
"epoch": 0.391304347826087,
"grad_norm": 0.5159501988757971,
"learning_rate": 4.8331415420023015e-05,
"loss": 0.4794,
"step": 126
},
{
"epoch": 0.3944099378881988,
"grad_norm": 0.3620503849683116,
"learning_rate": 4.8273878020713464e-05,
"loss": 0.4631,
"step": 127
},
{
"epoch": 0.39751552795031053,
"grad_norm": 0.4221189340341717,
"learning_rate": 4.821634062140391e-05,
"loss": 0.4696,
"step": 128
},
{
"epoch": 0.40062111801242234,
"grad_norm": 0.46423436394369083,
"learning_rate": 4.815880322209436e-05,
"loss": 0.4573,
"step": 129
},
{
"epoch": 0.40372670807453415,
"grad_norm": 0.4261777248289121,
"learning_rate": 4.810126582278481e-05,
"loss": 0.4608,
"step": 130
},
{
"epoch": 0.40683229813664595,
"grad_norm": 0.45519667338748365,
"learning_rate": 4.804372842347526e-05,
"loss": 0.4621,
"step": 131
},
{
"epoch": 0.40993788819875776,
"grad_norm": 0.4384463354130905,
"learning_rate": 4.798619102416571e-05,
"loss": 0.4656,
"step": 132
},
{
"epoch": 0.41304347826086957,
"grad_norm": 0.41199291319131776,
"learning_rate": 4.7928653624856157e-05,
"loss": 0.4535,
"step": 133
},
{
"epoch": 0.4161490683229814,
"grad_norm": 0.3655597225332361,
"learning_rate": 4.7871116225546605e-05,
"loss": 0.4501,
"step": 134
},
{
"epoch": 0.4192546583850932,
"grad_norm": 0.44932133556116877,
"learning_rate": 4.7813578826237054e-05,
"loss": 0.4767,
"step": 135
},
{
"epoch": 0.422360248447205,
"grad_norm": 0.3329354062585348,
"learning_rate": 4.77560414269275e-05,
"loss": 0.4455,
"step": 136
},
{
"epoch": 0.4254658385093168,
"grad_norm": 0.45152077511616723,
"learning_rate": 4.769850402761795e-05,
"loss": 0.4623,
"step": 137
},
{
"epoch": 0.42857142857142855,
"grad_norm": 0.3188549796798649,
"learning_rate": 4.76409666283084e-05,
"loss": 0.4304,
"step": 138
},
{
"epoch": 0.43167701863354035,
"grad_norm": 0.39747649807961544,
"learning_rate": 4.758342922899885e-05,
"loss": 0.4486,
"step": 139
},
{
"epoch": 0.43478260869565216,
"grad_norm": 0.2901963778324694,
"learning_rate": 4.75258918296893e-05,
"loss": 0.4485,
"step": 140
},
{
"epoch": 0.43788819875776397,
"grad_norm": 0.5357034478107343,
"learning_rate": 4.7468354430379746e-05,
"loss": 0.4773,
"step": 141
},
{
"epoch": 0.4409937888198758,
"grad_norm": 0.3892373232000147,
"learning_rate": 4.7410817031070195e-05,
"loss": 0.4408,
"step": 142
},
{
"epoch": 0.4440993788819876,
"grad_norm": 0.45033922342477917,
"learning_rate": 4.7353279631760644e-05,
"loss": 0.4598,
"step": 143
},
{
"epoch": 0.4472049689440994,
"grad_norm": 0.37908550777510663,
"learning_rate": 4.729574223245109e-05,
"loss": 0.4452,
"step": 144
},
{
"epoch": 0.4503105590062112,
"grad_norm": 0.4290373855109045,
"learning_rate": 4.723820483314154e-05,
"loss": 0.4536,
"step": 145
},
{
"epoch": 0.453416149068323,
"grad_norm": 0.35676947230487216,
"learning_rate": 4.718066743383199e-05,
"loss": 0.4648,
"step": 146
},
{
"epoch": 0.45652173913043476,
"grad_norm": 0.33636058827665144,
"learning_rate": 4.712313003452244e-05,
"loss": 0.444,
"step": 147
},
{
"epoch": 0.45962732919254656,
"grad_norm": 0.3823016634046083,
"learning_rate": 4.706559263521289e-05,
"loss": 0.4406,
"step": 148
},
{
"epoch": 0.46273291925465837,
"grad_norm": 0.3818789119419192,
"learning_rate": 4.700805523590334e-05,
"loss": 0.4488,
"step": 149
},
{
"epoch": 0.4658385093167702,
"grad_norm": 0.33345974040131937,
"learning_rate": 4.6950517836593785e-05,
"loss": 0.4647,
"step": 150
},
{
"epoch": 0.468944099378882,
"grad_norm": 0.47073824185480967,
"learning_rate": 4.689298043728424e-05,
"loss": 0.4534,
"step": 151
},
{
"epoch": 0.4720496894409938,
"grad_norm": 0.40070437909888434,
"learning_rate": 4.683544303797468e-05,
"loss": 0.4367,
"step": 152
},
{
"epoch": 0.4751552795031056,
"grad_norm": 0.407305468388989,
"learning_rate": 4.677790563866514e-05,
"loss": 0.4415,
"step": 153
},
{
"epoch": 0.4782608695652174,
"grad_norm": 0.4058611659098106,
"learning_rate": 4.672036823935558e-05,
"loss": 0.4576,
"step": 154
},
{
"epoch": 0.4813664596273292,
"grad_norm": 0.3967515788115339,
"learning_rate": 4.6662830840046035e-05,
"loss": 0.4524,
"step": 155
},
{
"epoch": 0.484472049689441,
"grad_norm": 0.4407590164610378,
"learning_rate": 4.660529344073648e-05,
"loss": 0.457,
"step": 156
},
{
"epoch": 0.48757763975155277,
"grad_norm": 0.43880737794315955,
"learning_rate": 4.654775604142693e-05,
"loss": 0.4365,
"step": 157
},
{
"epoch": 0.4906832298136646,
"grad_norm": 0.47864526006501984,
"learning_rate": 4.6490218642117375e-05,
"loss": 0.4479,
"step": 158
},
{
"epoch": 0.4937888198757764,
"grad_norm": 0.4692672779398985,
"learning_rate": 4.643268124280783e-05,
"loss": 0.4546,
"step": 159
},
{
"epoch": 0.4968944099378882,
"grad_norm": 0.4097305951007724,
"learning_rate": 4.637514384349827e-05,
"loss": 0.4355,
"step": 160
},
{
"epoch": 0.5,
"grad_norm": 0.43610759922666353,
"learning_rate": 4.631760644418873e-05,
"loss": 0.447,
"step": 161
},
{
"epoch": 0.5031055900621118,
"grad_norm": 0.2978982430601787,
"learning_rate": 4.626006904487917e-05,
"loss": 0.4524,
"step": 162
},
{
"epoch": 0.5062111801242236,
"grad_norm": 0.43653406806069966,
"learning_rate": 4.6202531645569625e-05,
"loss": 0.4171,
"step": 163
},
{
"epoch": 0.5093167701863354,
"grad_norm": 0.40670821189566986,
"learning_rate": 4.614499424626007e-05,
"loss": 0.439,
"step": 164
},
{
"epoch": 0.5124223602484472,
"grad_norm": 0.33901355170318703,
"learning_rate": 4.608745684695052e-05,
"loss": 0.4461,
"step": 165
},
{
"epoch": 0.515527950310559,
"grad_norm": 0.43610331613751346,
"learning_rate": 4.6029919447640965e-05,
"loss": 0.4554,
"step": 166
},
{
"epoch": 0.5186335403726708,
"grad_norm": 0.3625661313466411,
"learning_rate": 4.597238204833142e-05,
"loss": 0.4554,
"step": 167
},
{
"epoch": 0.5217391304347826,
"grad_norm": 0.3394393399478139,
"learning_rate": 4.591484464902186e-05,
"loss": 0.4367,
"step": 168
},
{
"epoch": 0.5248447204968945,
"grad_norm": 0.3588563348596153,
"learning_rate": 4.585730724971232e-05,
"loss": 0.4461,
"step": 169
},
{
"epoch": 0.5279503105590062,
"grad_norm": 0.3802785353634964,
"learning_rate": 4.579976985040276e-05,
"loss": 0.4387,
"step": 170
},
{
"epoch": 0.531055900621118,
"grad_norm": 0.3869023722709017,
"learning_rate": 4.5742232451093215e-05,
"loss": 0.4528,
"step": 171
},
{
"epoch": 0.5341614906832298,
"grad_norm": 0.36676418356051843,
"learning_rate": 4.568469505178366e-05,
"loss": 0.4348,
"step": 172
},
{
"epoch": 0.5372670807453416,
"grad_norm": 0.46126816544453725,
"learning_rate": 4.562715765247411e-05,
"loss": 0.4231,
"step": 173
},
{
"epoch": 0.5403726708074534,
"grad_norm": 0.35343634631539705,
"learning_rate": 4.556962025316456e-05,
"loss": 0.4369,
"step": 174
},
{
"epoch": 0.5434782608695652,
"grad_norm": 0.4549103689048508,
"learning_rate": 4.551208285385501e-05,
"loss": 0.4387,
"step": 175
},
{
"epoch": 0.546583850931677,
"grad_norm": 0.4303714186336393,
"learning_rate": 4.545454545454546e-05,
"loss": 0.4546,
"step": 176
},
{
"epoch": 0.5496894409937888,
"grad_norm": 0.4531267139678119,
"learning_rate": 4.539700805523591e-05,
"loss": 0.4356,
"step": 177
},
{
"epoch": 0.5527950310559007,
"grad_norm": 0.42240540949166944,
"learning_rate": 4.5339470655926356e-05,
"loss": 0.4442,
"step": 178
},
{
"epoch": 0.5559006211180124,
"grad_norm": 0.3163983623110262,
"learning_rate": 4.5281933256616805e-05,
"loss": 0.4255,
"step": 179
},
{
"epoch": 0.5590062111801242,
"grad_norm": 0.37954620340652895,
"learning_rate": 4.5224395857307253e-05,
"loss": 0.4387,
"step": 180
},
{
"epoch": 0.562111801242236,
"grad_norm": 0.33565801845470367,
"learning_rate": 4.51668584579977e-05,
"loss": 0.4415,
"step": 181
},
{
"epoch": 0.5652173913043478,
"grad_norm": 0.3349864414277053,
"learning_rate": 4.510932105868815e-05,
"loss": 0.4082,
"step": 182
},
{
"epoch": 0.5683229813664596,
"grad_norm": 0.439294679014343,
"learning_rate": 4.50517836593786e-05,
"loss": 0.446,
"step": 183
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.28999671538444516,
"learning_rate": 4.499424626006905e-05,
"loss": 0.4103,
"step": 184
},
{
"epoch": 0.5745341614906833,
"grad_norm": 0.40660335920713986,
"learning_rate": 4.49367088607595e-05,
"loss": 0.444,
"step": 185
},
{
"epoch": 0.577639751552795,
"grad_norm": 0.3033161017839996,
"learning_rate": 4.4879171461449946e-05,
"loss": 0.435,
"step": 186
},
{
"epoch": 0.5807453416149069,
"grad_norm": 0.30568413065453626,
"learning_rate": 4.4821634062140395e-05,
"loss": 0.4237,
"step": 187
},
{
"epoch": 0.5838509316770186,
"grad_norm": 0.32587134975274057,
"learning_rate": 4.476409666283084e-05,
"loss": 0.4331,
"step": 188
},
{
"epoch": 0.5869565217391305,
"grad_norm": 0.28290562376532075,
"learning_rate": 4.470655926352129e-05,
"loss": 0.4342,
"step": 189
},
{
"epoch": 0.5900621118012422,
"grad_norm": 0.3630490197737241,
"learning_rate": 4.464902186421174e-05,
"loss": 0.4344,
"step": 190
},
{
"epoch": 0.593167701863354,
"grad_norm": 0.3559890010930286,
"learning_rate": 4.459148446490219e-05,
"loss": 0.4498,
"step": 191
},
{
"epoch": 0.5962732919254659,
"grad_norm": 0.3499252907427838,
"learning_rate": 4.453394706559264e-05,
"loss": 0.4507,
"step": 192
},
{
"epoch": 0.5993788819875776,
"grad_norm": 0.33006303704048223,
"learning_rate": 4.447640966628309e-05,
"loss": 0.4281,
"step": 193
},
{
"epoch": 0.6024844720496895,
"grad_norm": 0.36984061156296816,
"learning_rate": 4.4418872266973536e-05,
"loss": 0.4518,
"step": 194
},
{
"epoch": 0.6055900621118012,
"grad_norm": 0.3439812296873207,
"learning_rate": 4.4361334867663984e-05,
"loss": 0.4448,
"step": 195
},
{
"epoch": 0.6086956521739131,
"grad_norm": 0.30685626808124417,
"learning_rate": 4.430379746835443e-05,
"loss": 0.4377,
"step": 196
},
{
"epoch": 0.6118012422360248,
"grad_norm": 0.3925848437000049,
"learning_rate": 4.424626006904488e-05,
"loss": 0.4396,
"step": 197
},
{
"epoch": 0.6149068322981367,
"grad_norm": 0.32639373809266464,
"learning_rate": 4.418872266973533e-05,
"loss": 0.4321,
"step": 198
},
{
"epoch": 0.6180124223602484,
"grad_norm": 0.376079541285074,
"learning_rate": 4.413118527042578e-05,
"loss": 0.4242,
"step": 199
},
{
"epoch": 0.6211180124223602,
"grad_norm": 0.3749608850464733,
"learning_rate": 4.407364787111623e-05,
"loss": 0.4259,
"step": 200
},
{
"epoch": 0.6242236024844721,
"grad_norm": 0.4461881134050382,
"learning_rate": 4.401611047180668e-05,
"loss": 0.4341,
"step": 201
},
{
"epoch": 0.6273291925465838,
"grad_norm": 0.4877320414028972,
"learning_rate": 4.3958573072497125e-05,
"loss": 0.4344,
"step": 202
},
{
"epoch": 0.6304347826086957,
"grad_norm": 0.4070659780535386,
"learning_rate": 4.3901035673187574e-05,
"loss": 0.4227,
"step": 203
},
{
"epoch": 0.6335403726708074,
"grad_norm": 0.4635439998393952,
"learning_rate": 4.384349827387802e-05,
"loss": 0.4355,
"step": 204
},
{
"epoch": 0.6366459627329193,
"grad_norm": 0.35952245913430025,
"learning_rate": 4.378596087456847e-05,
"loss": 0.423,
"step": 205
},
{
"epoch": 0.639751552795031,
"grad_norm": 0.520771866846795,
"learning_rate": 4.372842347525892e-05,
"loss": 0.4306,
"step": 206
},
{
"epoch": 0.6428571428571429,
"grad_norm": 0.3273697337468707,
"learning_rate": 4.367088607594937e-05,
"loss": 0.4324,
"step": 207
},
{
"epoch": 0.6459627329192547,
"grad_norm": 0.4813614761483608,
"learning_rate": 4.361334867663982e-05,
"loss": 0.4478,
"step": 208
},
{
"epoch": 0.6490683229813664,
"grad_norm": 0.3900984777507702,
"learning_rate": 4.3555811277330267e-05,
"loss": 0.4269,
"step": 209
},
{
"epoch": 0.6521739130434783,
"grad_norm": 0.3853092679143466,
"learning_rate": 4.3498273878020715e-05,
"loss": 0.4407,
"step": 210
},
{
"epoch": 0.65527950310559,
"grad_norm": 0.46225724309871613,
"learning_rate": 4.3440736478711164e-05,
"loss": 0.436,
"step": 211
},
{
"epoch": 0.6583850931677019,
"grad_norm": 0.31651404685134377,
"learning_rate": 4.338319907940161e-05,
"loss": 0.4316,
"step": 212
},
{
"epoch": 0.6614906832298136,
"grad_norm": 0.44516432018668023,
"learning_rate": 4.332566168009206e-05,
"loss": 0.4426,
"step": 213
},
{
"epoch": 0.6645962732919255,
"grad_norm": 0.3462443744991128,
"learning_rate": 4.326812428078251e-05,
"loss": 0.4465,
"step": 214
},
{
"epoch": 0.6677018633540373,
"grad_norm": 0.4436257780311306,
"learning_rate": 4.321058688147296e-05,
"loss": 0.4241,
"step": 215
},
{
"epoch": 0.6708074534161491,
"grad_norm": 0.3788099950107418,
"learning_rate": 4.315304948216341e-05,
"loss": 0.4206,
"step": 216
},
{
"epoch": 0.6739130434782609,
"grad_norm": 0.3667132129478159,
"learning_rate": 4.3095512082853856e-05,
"loss": 0.4336,
"step": 217
},
{
"epoch": 0.6770186335403726,
"grad_norm": 0.43405694529571,
"learning_rate": 4.3037974683544305e-05,
"loss": 0.4285,
"step": 218
},
{
"epoch": 0.6801242236024845,
"grad_norm": 0.37501605794405696,
"learning_rate": 4.2980437284234754e-05,
"loss": 0.4354,
"step": 219
},
{
"epoch": 0.6832298136645962,
"grad_norm": 0.7491502232791192,
"learning_rate": 4.29228998849252e-05,
"loss": 0.4622,
"step": 220
},
{
"epoch": 0.6863354037267081,
"grad_norm": 0.34683109305557713,
"learning_rate": 4.286536248561565e-05,
"loss": 0.4349,
"step": 221
},
{
"epoch": 0.6894409937888198,
"grad_norm": 0.41649862939635707,
"learning_rate": 4.28078250863061e-05,
"loss": 0.4278,
"step": 222
},
{
"epoch": 0.6925465838509317,
"grad_norm": 0.33273645633734766,
"learning_rate": 4.275028768699655e-05,
"loss": 0.4241,
"step": 223
},
{
"epoch": 0.6956521739130435,
"grad_norm": 0.358638671370147,
"learning_rate": 4.2692750287687e-05,
"loss": 0.4294,
"step": 224
},
{
"epoch": 0.6987577639751553,
"grad_norm": 0.3505002399312612,
"learning_rate": 4.2635212888377446e-05,
"loss": 0.4339,
"step": 225
},
{
"epoch": 0.7018633540372671,
"grad_norm": 0.28967971081827765,
"learning_rate": 4.2577675489067895e-05,
"loss": 0.4433,
"step": 226
},
{
"epoch": 0.7049689440993789,
"grad_norm": 0.3792183124094411,
"learning_rate": 4.2520138089758344e-05,
"loss": 0.4263,
"step": 227
},
{
"epoch": 0.7080745341614907,
"grad_norm": 0.2915459102300122,
"learning_rate": 4.246260069044879e-05,
"loss": 0.4226,
"step": 228
},
{
"epoch": 0.7111801242236024,
"grad_norm": 0.357404227614541,
"learning_rate": 4.240506329113924e-05,
"loss": 0.4183,
"step": 229
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.33657909101352584,
"learning_rate": 4.234752589182969e-05,
"loss": 0.4383,
"step": 230
},
{
"epoch": 0.717391304347826,
"grad_norm": 0.28799404238315757,
"learning_rate": 4.228998849252014e-05,
"loss": 0.4059,
"step": 231
},
{
"epoch": 0.7204968944099379,
"grad_norm": 0.3559137710527895,
"learning_rate": 4.223245109321059e-05,
"loss": 0.431,
"step": 232
},
{
"epoch": 0.7236024844720497,
"grad_norm": 0.3571859472649835,
"learning_rate": 4.2174913693901036e-05,
"loss": 0.4365,
"step": 233
},
{
"epoch": 0.7267080745341615,
"grad_norm": 0.27866414620295615,
"learning_rate": 4.2117376294591485e-05,
"loss": 0.4164,
"step": 234
},
{
"epoch": 0.7298136645962733,
"grad_norm": 0.35761820704128017,
"learning_rate": 4.2059838895281933e-05,
"loss": 0.4155,
"step": 235
},
{
"epoch": 0.7329192546583851,
"grad_norm": 0.38239702778323204,
"learning_rate": 4.200230149597238e-05,
"loss": 0.4441,
"step": 236
},
{
"epoch": 0.7360248447204969,
"grad_norm": 0.37338686711282476,
"learning_rate": 4.194476409666283e-05,
"loss": 0.4287,
"step": 237
},
{
"epoch": 0.7391304347826086,
"grad_norm": 0.31078006795719737,
"learning_rate": 4.188722669735328e-05,
"loss": 0.4314,
"step": 238
},
{
"epoch": 0.7422360248447205,
"grad_norm": 0.42962957316409206,
"learning_rate": 4.182968929804373e-05,
"loss": 0.4258,
"step": 239
},
{
"epoch": 0.7453416149068323,
"grad_norm": 0.3531531884915285,
"learning_rate": 4.177215189873418e-05,
"loss": 0.4348,
"step": 240
},
{
"epoch": 0.7484472049689441,
"grad_norm": 0.4645354016036932,
"learning_rate": 4.1714614499424626e-05,
"loss": 0.4204,
"step": 241
},
{
"epoch": 0.7515527950310559,
"grad_norm": 1.09153721353785,
"learning_rate": 4.1657077100115075e-05,
"loss": 0.4386,
"step": 242
},
{
"epoch": 0.7546583850931677,
"grad_norm": 0.32971689202723414,
"learning_rate": 4.159953970080552e-05,
"loss": 0.4286,
"step": 243
},
{
"epoch": 0.7577639751552795,
"grad_norm": 0.47923594956031046,
"learning_rate": 4.154200230149597e-05,
"loss": 0.4355,
"step": 244
},
{
"epoch": 0.7608695652173914,
"grad_norm": 0.3499125189435591,
"learning_rate": 4.148446490218642e-05,
"loss": 0.4363,
"step": 245
},
{
"epoch": 0.7639751552795031,
"grad_norm": 0.3676637215847227,
"learning_rate": 4.142692750287687e-05,
"loss": 0.4351,
"step": 246
},
{
"epoch": 0.7670807453416149,
"grad_norm": 0.3727821108079694,
"learning_rate": 4.136939010356732e-05,
"loss": 0.4418,
"step": 247
},
{
"epoch": 0.7701863354037267,
"grad_norm": 0.3252006506678716,
"learning_rate": 4.131185270425777e-05,
"loss": 0.4158,
"step": 248
},
{
"epoch": 0.7732919254658385,
"grad_norm": 0.6538129311302192,
"learning_rate": 4.1254315304948216e-05,
"loss": 0.457,
"step": 249
},
{
"epoch": 0.7763975155279503,
"grad_norm": 0.33906627374077886,
"learning_rate": 4.1196777905638664e-05,
"loss": 0.4318,
"step": 250
},
{
"epoch": 0.7795031055900621,
"grad_norm": 0.356301991033165,
"learning_rate": 4.113924050632912e-05,
"loss": 0.4236,
"step": 251
},
{
"epoch": 0.782608695652174,
"grad_norm": 0.32783848540999616,
"learning_rate": 4.108170310701956e-05,
"loss": 0.4448,
"step": 252
},
{
"epoch": 0.7857142857142857,
"grad_norm": 0.33633346589846297,
"learning_rate": 4.102416570771002e-05,
"loss": 0.4084,
"step": 253
},
{
"epoch": 0.7888198757763976,
"grad_norm": 0.34262159693990346,
"learning_rate": 4.096662830840046e-05,
"loss": 0.4343,
"step": 254
},
{
"epoch": 0.7919254658385093,
"grad_norm": 0.4238089460532713,
"learning_rate": 4.0909090909090915e-05,
"loss": 0.4197,
"step": 255
},
{
"epoch": 0.7950310559006211,
"grad_norm": 0.34636542219919175,
"learning_rate": 4.085155350978136e-05,
"loss": 0.4216,
"step": 256
},
{
"epoch": 0.7981366459627329,
"grad_norm": 0.5147966796611364,
"learning_rate": 4.079401611047181e-05,
"loss": 0.4335,
"step": 257
},
{
"epoch": 0.8012422360248447,
"grad_norm": 0.3784633526026262,
"learning_rate": 4.0736478711162254e-05,
"loss": 0.4388,
"step": 258
},
{
"epoch": 0.8043478260869565,
"grad_norm": 0.5353215946365089,
"learning_rate": 4.067894131185271e-05,
"loss": 0.4317,
"step": 259
},
{
"epoch": 0.8074534161490683,
"grad_norm": 0.43051530301687313,
"learning_rate": 4.062140391254315e-05,
"loss": 0.4321,
"step": 260
},
{
"epoch": 0.8105590062111802,
"grad_norm": 0.42796425153438244,
"learning_rate": 4.056386651323361e-05,
"loss": 0.4088,
"step": 261
},
{
"epoch": 0.8136645962732919,
"grad_norm": 0.5934293854830046,
"learning_rate": 4.050632911392405e-05,
"loss": 0.4158,
"step": 262
},
{
"epoch": 0.8167701863354038,
"grad_norm": 0.4393755394280156,
"learning_rate": 4.0448791714614505e-05,
"loss": 0.4469,
"step": 263
},
{
"epoch": 0.8198757763975155,
"grad_norm": 0.4605347528048276,
"learning_rate": 4.0391254315304947e-05,
"loss": 0.4346,
"step": 264
},
{
"epoch": 0.8229813664596274,
"grad_norm": 0.35259936117009355,
"learning_rate": 4.03337169159954e-05,
"loss": 0.4168,
"step": 265
},
{
"epoch": 0.8260869565217391,
"grad_norm": 0.4804542899872928,
"learning_rate": 4.0276179516685844e-05,
"loss": 0.4302,
"step": 266
},
{
"epoch": 0.8291925465838509,
"grad_norm": 0.49703176852970304,
"learning_rate": 4.02186421173763e-05,
"loss": 0.4376,
"step": 267
},
{
"epoch": 0.8322981366459627,
"grad_norm": 0.32332376265052126,
"learning_rate": 4.016110471806674e-05,
"loss": 0.4151,
"step": 268
},
{
"epoch": 0.8354037267080745,
"grad_norm": 0.3837962855801273,
"learning_rate": 4.01035673187572e-05,
"loss": 0.4165,
"step": 269
},
{
"epoch": 0.8385093167701864,
"grad_norm": 0.3057885184710408,
"learning_rate": 4.004602991944764e-05,
"loss": 0.4193,
"step": 270
},
{
"epoch": 0.8416149068322981,
"grad_norm": 0.33815716235605003,
"learning_rate": 3.9988492520138094e-05,
"loss": 0.4122,
"step": 271
},
{
"epoch": 0.84472049689441,
"grad_norm": 0.35543954456463683,
"learning_rate": 3.9930955120828536e-05,
"loss": 0.4312,
"step": 272
},
{
"epoch": 0.8478260869565217,
"grad_norm": 0.4061479720971117,
"learning_rate": 3.987341772151899e-05,
"loss": 0.4326,
"step": 273
},
{
"epoch": 0.8509316770186336,
"grad_norm": 0.3293967556583535,
"learning_rate": 3.9815880322209434e-05,
"loss": 0.4162,
"step": 274
},
{
"epoch": 0.8540372670807453,
"grad_norm": 0.32127496899850444,
"learning_rate": 3.975834292289989e-05,
"loss": 0.4064,
"step": 275
},
{
"epoch": 0.8571428571428571,
"grad_norm": 0.3106744319229529,
"learning_rate": 3.970080552359033e-05,
"loss": 0.4219,
"step": 276
},
{
"epoch": 0.860248447204969,
"grad_norm": 0.2851226156515557,
"learning_rate": 3.964326812428079e-05,
"loss": 0.4357,
"step": 277
},
{
"epoch": 0.8633540372670807,
"grad_norm": 0.3367137774364346,
"learning_rate": 3.958573072497123e-05,
"loss": 0.4221,
"step": 278
},
{
"epoch": 0.8664596273291926,
"grad_norm": 0.274716671666842,
"learning_rate": 3.9528193325661684e-05,
"loss": 0.4286,
"step": 279
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.32476182770932666,
"learning_rate": 3.9470655926352126e-05,
"loss": 0.4156,
"step": 280
},
{
"epoch": 0.8726708074534162,
"grad_norm": 0.34109454423469643,
"learning_rate": 3.941311852704258e-05,
"loss": 0.4133,
"step": 281
},
{
"epoch": 0.8757763975155279,
"grad_norm": 0.35511307476273746,
"learning_rate": 3.9355581127733024e-05,
"loss": 0.4317,
"step": 282
},
{
"epoch": 0.8788819875776398,
"grad_norm": 0.3270722625275185,
"learning_rate": 3.929804372842348e-05,
"loss": 0.4182,
"step": 283
},
{
"epoch": 0.8819875776397516,
"grad_norm": 0.30707956127514435,
"learning_rate": 3.924050632911392e-05,
"loss": 0.4128,
"step": 284
},
{
"epoch": 0.8850931677018633,
"grad_norm": 0.352987960191196,
"learning_rate": 3.9182968929804377e-05,
"loss": 0.4202,
"step": 285
},
{
"epoch": 0.8881987577639752,
"grad_norm": 0.3209556725057783,
"learning_rate": 3.912543153049482e-05,
"loss": 0.4531,
"step": 286
},
{
"epoch": 0.8913043478260869,
"grad_norm": 0.3424777350197383,
"learning_rate": 3.9067894131185274e-05,
"loss": 0.4261,
"step": 287
},
{
"epoch": 0.8944099378881988,
"grad_norm": 0.36115235473805046,
"learning_rate": 3.9010356731875716e-05,
"loss": 0.4208,
"step": 288
},
{
"epoch": 0.8975155279503105,
"grad_norm": 0.3345731728145184,
"learning_rate": 3.895281933256617e-05,
"loss": 0.4243,
"step": 289
},
{
"epoch": 0.9006211180124224,
"grad_norm": 0.3479109694931497,
"learning_rate": 3.8895281933256613e-05,
"loss": 0.408,
"step": 290
},
{
"epoch": 0.9037267080745341,
"grad_norm": 0.35901431270989403,
"learning_rate": 3.883774453394707e-05,
"loss": 0.4275,
"step": 291
},
{
"epoch": 0.906832298136646,
"grad_norm": 0.33289357045170126,
"learning_rate": 3.878020713463751e-05,
"loss": 0.4078,
"step": 292
},
{
"epoch": 0.9099378881987578,
"grad_norm": 0.33168510073705165,
"learning_rate": 3.8722669735327966e-05,
"loss": 0.4218,
"step": 293
},
{
"epoch": 0.9130434782608695,
"grad_norm": 0.2975318289744658,
"learning_rate": 3.866513233601841e-05,
"loss": 0.4311,
"step": 294
},
{
"epoch": 0.9161490683229814,
"grad_norm": 0.31426977572692477,
"learning_rate": 3.8607594936708864e-05,
"loss": 0.4297,
"step": 295
},
{
"epoch": 0.9192546583850931,
"grad_norm": 0.3070483941031755,
"learning_rate": 3.8550057537399306e-05,
"loss": 0.4192,
"step": 296
},
{
"epoch": 0.922360248447205,
"grad_norm": 0.2810848054459513,
"learning_rate": 3.849252013808976e-05,
"loss": 0.427,
"step": 297
},
{
"epoch": 0.9254658385093167,
"grad_norm": 0.2991841633857078,
"learning_rate": 3.84349827387802e-05,
"loss": 0.4052,
"step": 298
},
{
"epoch": 0.9285714285714286,
"grad_norm": 0.33847151615147736,
"learning_rate": 3.837744533947066e-05,
"loss": 0.419,
"step": 299
},
{
"epoch": 0.9316770186335404,
"grad_norm": 0.29017927632864937,
"learning_rate": 3.83199079401611e-05,
"loss": 0.4235,
"step": 300
},
{
"epoch": 0.9347826086956522,
"grad_norm": 0.32565509697744177,
"learning_rate": 3.8262370540851556e-05,
"loss": 0.4218,
"step": 301
},
{
"epoch": 0.937888198757764,
"grad_norm": 0.31402325607805354,
"learning_rate": 3.8204833141542005e-05,
"loss": 0.4374,
"step": 302
},
{
"epoch": 0.9409937888198758,
"grad_norm": 0.3147076556719568,
"learning_rate": 3.8147295742232454e-05,
"loss": 0.4155,
"step": 303
},
{
"epoch": 0.9440993788819876,
"grad_norm": 0.29699738407713266,
"learning_rate": 3.80897583429229e-05,
"loss": 0.4111,
"step": 304
},
{
"epoch": 0.9472049689440993,
"grad_norm": 0.2888210602850056,
"learning_rate": 3.803222094361335e-05,
"loss": 0.4221,
"step": 305
},
{
"epoch": 0.9503105590062112,
"grad_norm": 0.2939573629666098,
"learning_rate": 3.79746835443038e-05,
"loss": 0.4032,
"step": 306
},
{
"epoch": 0.953416149068323,
"grad_norm": 0.2962446654764285,
"learning_rate": 3.791714614499425e-05,
"loss": 0.4214,
"step": 307
},
{
"epoch": 0.9565217391304348,
"grad_norm": 0.271891153920885,
"learning_rate": 3.78596087456847e-05,
"loss": 0.4198,
"step": 308
},
{
"epoch": 0.9596273291925466,
"grad_norm": 0.32256951843172593,
"learning_rate": 3.7802071346375146e-05,
"loss": 0.4216,
"step": 309
},
{
"epoch": 0.9627329192546584,
"grad_norm": 0.33232339921643056,
"learning_rate": 3.7744533947065595e-05,
"loss": 0.4177,
"step": 310
},
{
"epoch": 0.9658385093167702,
"grad_norm": 0.35814851356254335,
"learning_rate": 3.7686996547756043e-05,
"loss": 0.425,
"step": 311
},
{
"epoch": 0.968944099378882,
"grad_norm": 0.29938770364659023,
"learning_rate": 3.762945914844649e-05,
"loss": 0.4128,
"step": 312
},
{
"epoch": 0.9720496894409938,
"grad_norm": 0.38739922253123726,
"learning_rate": 3.757192174913694e-05,
"loss": 0.4113,
"step": 313
},
{
"epoch": 0.9751552795031055,
"grad_norm": 0.31386603107673766,
"learning_rate": 3.751438434982739e-05,
"loss": 0.4104,
"step": 314
},
{
"epoch": 0.9782608695652174,
"grad_norm": 0.34687136495142834,
"learning_rate": 3.745684695051784e-05,
"loss": 0.4307,
"step": 315
},
{
"epoch": 0.9813664596273292,
"grad_norm": 0.3492017123521989,
"learning_rate": 3.739930955120829e-05,
"loss": 0.4077,
"step": 316
},
{
"epoch": 0.984472049689441,
"grad_norm": 0.29396206255406326,
"learning_rate": 3.7341772151898736e-05,
"loss": 0.4067,
"step": 317
},
{
"epoch": 0.9875776397515528,
"grad_norm": 0.31882677984452723,
"learning_rate": 3.7284234752589185e-05,
"loss": 0.4207,
"step": 318
},
{
"epoch": 0.9906832298136646,
"grad_norm": 0.37165416285954644,
"learning_rate": 3.722669735327963e-05,
"loss": 0.4339,
"step": 319
},
{
"epoch": 0.9937888198757764,
"grad_norm": 0.3190088839703568,
"learning_rate": 3.716915995397008e-05,
"loss": 0.4079,
"step": 320
},
{
"epoch": 0.9968944099378882,
"grad_norm": 0.3115319771959773,
"learning_rate": 3.711162255466053e-05,
"loss": 0.4322,
"step": 321
},
{
"epoch": 1.0,
"grad_norm": 0.3044086608586031,
"learning_rate": 3.705408515535098e-05,
"loss": 0.4097,
"step": 322
},
{
"epoch": 1.0031055900621118,
"grad_norm": 0.33417590278362963,
"learning_rate": 3.699654775604143e-05,
"loss": 0.3323,
"step": 323
},
{
"epoch": 1.0062111801242235,
"grad_norm": 0.341573477224664,
"learning_rate": 3.693901035673188e-05,
"loss": 0.3571,
"step": 324
},
{
"epoch": 1.0093167701863355,
"grad_norm": 0.27258326161115387,
"learning_rate": 3.6881472957422326e-05,
"loss": 0.3404,
"step": 325
},
{
"epoch": 1.0124223602484472,
"grad_norm": 0.33991178542501627,
"learning_rate": 3.6823935558112774e-05,
"loss": 0.3493,
"step": 326
},
{
"epoch": 1.015527950310559,
"grad_norm": 0.3446263251981706,
"learning_rate": 3.676639815880322e-05,
"loss": 0.3473,
"step": 327
},
{
"epoch": 1.0186335403726707,
"grad_norm": 0.33801547973317314,
"learning_rate": 3.670886075949367e-05,
"loss": 0.3697,
"step": 328
},
{
"epoch": 1.0217391304347827,
"grad_norm": 0.35908354782023477,
"learning_rate": 3.665132336018412e-05,
"loss": 0.3476,
"step": 329
},
{
"epoch": 1.0248447204968945,
"grad_norm": 0.3234656105570385,
"learning_rate": 3.659378596087457e-05,
"loss": 0.3622,
"step": 330
},
{
"epoch": 1.0279503105590062,
"grad_norm": 0.35587249506855595,
"learning_rate": 3.653624856156502e-05,
"loss": 0.3555,
"step": 331
},
{
"epoch": 1.031055900621118,
"grad_norm": 0.31905169592308186,
"learning_rate": 3.647871116225547e-05,
"loss": 0.3461,
"step": 332
},
{
"epoch": 1.0341614906832297,
"grad_norm": 0.36840310397083925,
"learning_rate": 3.6421173762945915e-05,
"loss": 0.3429,
"step": 333
},
{
"epoch": 1.0372670807453417,
"grad_norm": 0.3651205860513462,
"learning_rate": 3.6363636363636364e-05,
"loss": 0.3435,
"step": 334
},
{
"epoch": 1.0403726708074534,
"grad_norm": 0.31066005439052724,
"learning_rate": 3.630609896432681e-05,
"loss": 0.3272,
"step": 335
},
{
"epoch": 1.0434782608695652,
"grad_norm": 0.3759419584351618,
"learning_rate": 3.624856156501726e-05,
"loss": 0.3395,
"step": 336
},
{
"epoch": 1.046583850931677,
"grad_norm": 0.3021549547887614,
"learning_rate": 3.619102416570771e-05,
"loss": 0.3417,
"step": 337
},
{
"epoch": 1.049689440993789,
"grad_norm": 0.3205703918762732,
"learning_rate": 3.613348676639816e-05,
"loss": 0.3433,
"step": 338
},
{
"epoch": 1.0527950310559007,
"grad_norm": 0.4534884210584356,
"learning_rate": 3.607594936708861e-05,
"loss": 0.3594,
"step": 339
},
{
"epoch": 1.0559006211180124,
"grad_norm": 0.367415386580333,
"learning_rate": 3.6018411967779057e-05,
"loss": 0.3524,
"step": 340
},
{
"epoch": 1.0590062111801242,
"grad_norm": 0.3127875635159284,
"learning_rate": 3.5960874568469505e-05,
"loss": 0.333,
"step": 341
},
{
"epoch": 1.062111801242236,
"grad_norm": 0.4511553956189257,
"learning_rate": 3.5903337169159954e-05,
"loss": 0.3454,
"step": 342
},
{
"epoch": 1.065217391304348,
"grad_norm": 0.27133796776358254,
"learning_rate": 3.58457997698504e-05,
"loss": 0.3307,
"step": 343
},
{
"epoch": 1.0683229813664596,
"grad_norm": 0.37172783607468407,
"learning_rate": 3.578826237054085e-05,
"loss": 0.332,
"step": 344
},
{
"epoch": 1.0714285714285714,
"grad_norm": 0.31903478698253923,
"learning_rate": 3.57307249712313e-05,
"loss": 0.3641,
"step": 345
},
{
"epoch": 1.0745341614906831,
"grad_norm": 0.3590599821405197,
"learning_rate": 3.567318757192175e-05,
"loss": 0.3368,
"step": 346
},
{
"epoch": 1.0776397515527951,
"grad_norm": 0.3228666493670707,
"learning_rate": 3.56156501726122e-05,
"loss": 0.3518,
"step": 347
},
{
"epoch": 1.0807453416149069,
"grad_norm": 0.35040485427397144,
"learning_rate": 3.5558112773302646e-05,
"loss": 0.3567,
"step": 348
},
{
"epoch": 1.0838509316770186,
"grad_norm": 0.3223473550373259,
"learning_rate": 3.5500575373993095e-05,
"loss": 0.3292,
"step": 349
},
{
"epoch": 1.0869565217391304,
"grad_norm": 0.3162329124544906,
"learning_rate": 3.5443037974683544e-05,
"loss": 0.3386,
"step": 350
},
{
"epoch": 1.0900621118012421,
"grad_norm": 0.35250805959488396,
"learning_rate": 3.538550057537399e-05,
"loss": 0.3286,
"step": 351
},
{
"epoch": 1.093167701863354,
"grad_norm": 0.31027768437301634,
"learning_rate": 3.532796317606444e-05,
"loss": 0.3411,
"step": 352
},
{
"epoch": 1.0962732919254659,
"grad_norm": 0.28606898633939265,
"learning_rate": 3.52704257767549e-05,
"loss": 0.3407,
"step": 353
},
{
"epoch": 1.0993788819875776,
"grad_norm": 0.3579167421662421,
"learning_rate": 3.521288837744534e-05,
"loss": 0.3262,
"step": 354
},
{
"epoch": 1.1024844720496894,
"grad_norm": 0.3402295001253341,
"learning_rate": 3.5155350978135794e-05,
"loss": 0.3324,
"step": 355
},
{
"epoch": 1.1055900621118013,
"grad_norm": 0.31366685836024,
"learning_rate": 3.5097813578826236e-05,
"loss": 0.3463,
"step": 356
},
{
"epoch": 1.108695652173913,
"grad_norm": 0.46838911104977027,
"learning_rate": 3.504027617951669e-05,
"loss": 0.3565,
"step": 357
},
{
"epoch": 1.1118012422360248,
"grad_norm": 0.3060846523455061,
"learning_rate": 3.4982738780207134e-05,
"loss": 0.357,
"step": 358
},
{
"epoch": 1.1149068322981366,
"grad_norm": 0.4392245103993425,
"learning_rate": 3.492520138089759e-05,
"loss": 0.3568,
"step": 359
},
{
"epoch": 1.1180124223602483,
"grad_norm": 0.3916417909387617,
"learning_rate": 3.486766398158803e-05,
"loss": 0.3446,
"step": 360
},
{
"epoch": 1.1211180124223603,
"grad_norm": 0.3501561418628378,
"learning_rate": 3.4810126582278487e-05,
"loss": 0.3282,
"step": 361
},
{
"epoch": 1.124223602484472,
"grad_norm": 0.37454862360065444,
"learning_rate": 3.475258918296893e-05,
"loss": 0.3543,
"step": 362
},
{
"epoch": 1.1273291925465838,
"grad_norm": 0.2884683302507566,
"learning_rate": 3.4695051783659384e-05,
"loss": 0.3337,
"step": 363
},
{
"epoch": 1.1304347826086956,
"grad_norm": 0.3254717305148171,
"learning_rate": 3.4637514384349826e-05,
"loss": 0.3271,
"step": 364
},
{
"epoch": 1.1335403726708075,
"grad_norm": 0.3256237761211695,
"learning_rate": 3.457997698504028e-05,
"loss": 0.3298,
"step": 365
},
{
"epoch": 1.1366459627329193,
"grad_norm": 0.30981574585542065,
"learning_rate": 3.4522439585730723e-05,
"loss": 0.3685,
"step": 366
},
{
"epoch": 1.139751552795031,
"grad_norm": 0.29936602875383006,
"learning_rate": 3.446490218642118e-05,
"loss": 0.3524,
"step": 367
},
{
"epoch": 1.1428571428571428,
"grad_norm": 0.2961907533597477,
"learning_rate": 3.440736478711162e-05,
"loss": 0.3414,
"step": 368
},
{
"epoch": 1.1459627329192545,
"grad_norm": 0.2898757967419472,
"learning_rate": 3.4349827387802076e-05,
"loss": 0.3275,
"step": 369
},
{
"epoch": 1.1490683229813665,
"grad_norm": 0.35918811245436444,
"learning_rate": 3.429228998849252e-05,
"loss": 0.3502,
"step": 370
},
{
"epoch": 1.1521739130434783,
"grad_norm": 0.2775107307381104,
"learning_rate": 3.4234752589182974e-05,
"loss": 0.3409,
"step": 371
},
{
"epoch": 1.15527950310559,
"grad_norm": 0.2986400287100927,
"learning_rate": 3.4177215189873416e-05,
"loss": 0.3312,
"step": 372
},
{
"epoch": 1.1583850931677018,
"grad_norm": 0.33238801993955036,
"learning_rate": 3.411967779056387e-05,
"loss": 0.3443,
"step": 373
},
{
"epoch": 1.1614906832298137,
"grad_norm": 0.2893594359102009,
"learning_rate": 3.406214039125431e-05,
"loss": 0.3332,
"step": 374
},
{
"epoch": 1.1645962732919255,
"grad_norm": 0.32293840276637376,
"learning_rate": 3.400460299194477e-05,
"loss": 0.3354,
"step": 375
},
{
"epoch": 1.1677018633540373,
"grad_norm": 0.27306219223391365,
"learning_rate": 3.394706559263521e-05,
"loss": 0.3209,
"step": 376
},
{
"epoch": 1.170807453416149,
"grad_norm": 0.3342500084639322,
"learning_rate": 3.3889528193325666e-05,
"loss": 0.3729,
"step": 377
},
{
"epoch": 1.1739130434782608,
"grad_norm": 0.2661392532196279,
"learning_rate": 3.383199079401611e-05,
"loss": 0.3383,
"step": 378
},
{
"epoch": 1.1770186335403727,
"grad_norm": 0.3386471665658259,
"learning_rate": 3.3774453394706564e-05,
"loss": 0.318,
"step": 379
},
{
"epoch": 1.1801242236024845,
"grad_norm": 0.3155587203894488,
"learning_rate": 3.3716915995397006e-05,
"loss": 0.3321,
"step": 380
},
{
"epoch": 1.1832298136645962,
"grad_norm": 0.3451778286777197,
"learning_rate": 3.365937859608746e-05,
"loss": 0.361,
"step": 381
},
{
"epoch": 1.186335403726708,
"grad_norm": 0.3227976748273063,
"learning_rate": 3.36018411967779e-05,
"loss": 0.3349,
"step": 382
},
{
"epoch": 1.18944099378882,
"grad_norm": 0.320511150129644,
"learning_rate": 3.354430379746836e-05,
"loss": 0.3449,
"step": 383
},
{
"epoch": 1.1925465838509317,
"grad_norm": 0.31955908520280063,
"learning_rate": 3.34867663981588e-05,
"loss": 0.3351,
"step": 384
},
{
"epoch": 1.1956521739130435,
"grad_norm": 0.30633810764776365,
"learning_rate": 3.3429228998849256e-05,
"loss": 0.3275,
"step": 385
},
{
"epoch": 1.1987577639751552,
"grad_norm": 0.41299034529321954,
"learning_rate": 3.33716915995397e-05,
"loss": 0.3309,
"step": 386
},
{
"epoch": 1.201863354037267,
"grad_norm": 0.2750482509074482,
"learning_rate": 3.3314154200230153e-05,
"loss": 0.3398,
"step": 387
},
{
"epoch": 1.204968944099379,
"grad_norm": 0.3081268249974453,
"learning_rate": 3.3256616800920595e-05,
"loss": 0.3322,
"step": 388
},
{
"epoch": 1.2080745341614907,
"grad_norm": 0.3520674198029431,
"learning_rate": 3.319907940161105e-05,
"loss": 0.3663,
"step": 389
},
{
"epoch": 1.2111801242236024,
"grad_norm": 0.32565232106148584,
"learning_rate": 3.314154200230149e-05,
"loss": 0.343,
"step": 390
},
{
"epoch": 1.2142857142857142,
"grad_norm": 0.2938812397405531,
"learning_rate": 3.308400460299195e-05,
"loss": 0.3378,
"step": 391
},
{
"epoch": 1.2173913043478262,
"grad_norm": 0.3141073779827861,
"learning_rate": 3.302646720368239e-05,
"loss": 0.3335,
"step": 392
},
{
"epoch": 1.220496894409938,
"grad_norm": 0.3418673255721663,
"learning_rate": 3.2968929804372846e-05,
"loss": 0.36,
"step": 393
},
{
"epoch": 1.2236024844720497,
"grad_norm": 0.24297614998734132,
"learning_rate": 3.291139240506329e-05,
"loss": 0.3387,
"step": 394
},
{
"epoch": 1.2267080745341614,
"grad_norm": 0.3267179467149504,
"learning_rate": 3.285385500575374e-05,
"loss": 0.3488,
"step": 395
},
{
"epoch": 1.2298136645962732,
"grad_norm": 0.3057560458812451,
"learning_rate": 3.2796317606444185e-05,
"loss": 0.3268,
"step": 396
},
{
"epoch": 1.2329192546583851,
"grad_norm": 0.3134897896860434,
"learning_rate": 3.273878020713464e-05,
"loss": 0.3459,
"step": 397
},
{
"epoch": 1.236024844720497,
"grad_norm": 0.3047314985401556,
"learning_rate": 3.268124280782508e-05,
"loss": 0.3291,
"step": 398
},
{
"epoch": 1.2391304347826086,
"grad_norm": 0.31348581848675783,
"learning_rate": 3.262370540851554e-05,
"loss": 0.3446,
"step": 399
},
{
"epoch": 1.2422360248447206,
"grad_norm": 0.3482328869260001,
"learning_rate": 3.256616800920598e-05,
"loss": 0.3561,
"step": 400
},
{
"epoch": 1.2453416149068324,
"grad_norm": 0.31183834841742225,
"learning_rate": 3.2508630609896436e-05,
"loss": 0.3547,
"step": 401
},
{
"epoch": 1.2484472049689441,
"grad_norm": 0.3061676085086065,
"learning_rate": 3.245109321058688e-05,
"loss": 0.3595,
"step": 402
},
{
"epoch": 1.2515527950310559,
"grad_norm": 0.32549148328343397,
"learning_rate": 3.239355581127733e-05,
"loss": 0.3342,
"step": 403
},
{
"epoch": 1.2546583850931676,
"grad_norm": 0.30445969084522895,
"learning_rate": 3.233601841196778e-05,
"loss": 0.3242,
"step": 404
},
{
"epoch": 1.2577639751552794,
"grad_norm": 0.2742819629805248,
"learning_rate": 3.227848101265823e-05,
"loss": 0.3522,
"step": 405
},
{
"epoch": 1.2608695652173914,
"grad_norm": 0.32581875150876105,
"learning_rate": 3.222094361334868e-05,
"loss": 0.3429,
"step": 406
},
{
"epoch": 1.263975155279503,
"grad_norm": 0.2902255052156193,
"learning_rate": 3.216340621403913e-05,
"loss": 0.3369,
"step": 407
},
{
"epoch": 1.2670807453416149,
"grad_norm": 0.284761382807809,
"learning_rate": 3.210586881472958e-05,
"loss": 0.36,
"step": 408
},
{
"epoch": 1.2701863354037268,
"grad_norm": 0.3025552167032939,
"learning_rate": 3.2048331415420025e-05,
"loss": 0.3445,
"step": 409
},
{
"epoch": 1.2732919254658386,
"grad_norm": 0.3305696776607858,
"learning_rate": 3.1990794016110474e-05,
"loss": 0.3463,
"step": 410
},
{
"epoch": 1.2763975155279503,
"grad_norm": 0.3077574972549534,
"learning_rate": 3.193325661680092e-05,
"loss": 0.3594,
"step": 411
},
{
"epoch": 1.279503105590062,
"grad_norm": 0.27442755120830326,
"learning_rate": 3.187571921749137e-05,
"loss": 0.3362,
"step": 412
},
{
"epoch": 1.2826086956521738,
"grad_norm": 0.3038026451556641,
"learning_rate": 3.181818181818182e-05,
"loss": 0.3353,
"step": 413
},
{
"epoch": 1.2857142857142856,
"grad_norm": 0.2758156658151106,
"learning_rate": 3.176064441887227e-05,
"loss": 0.337,
"step": 414
},
{
"epoch": 1.2888198757763976,
"grad_norm": 0.26613400787975794,
"learning_rate": 3.170310701956272e-05,
"loss": 0.3347,
"step": 415
},
{
"epoch": 1.2919254658385093,
"grad_norm": 0.30006243856469433,
"learning_rate": 3.1645569620253167e-05,
"loss": 0.3575,
"step": 416
},
{
"epoch": 1.295031055900621,
"grad_norm": 0.32225619437705794,
"learning_rate": 3.1588032220943615e-05,
"loss": 0.3404,
"step": 417
},
{
"epoch": 1.298136645962733,
"grad_norm": 0.2933513705620206,
"learning_rate": 3.1530494821634064e-05,
"loss": 0.3367,
"step": 418
},
{
"epoch": 1.3012422360248448,
"grad_norm": 0.34221232972865906,
"learning_rate": 3.147295742232451e-05,
"loss": 0.3507,
"step": 419
},
{
"epoch": 1.3043478260869565,
"grad_norm": 0.3207028944029123,
"learning_rate": 3.141542002301496e-05,
"loss": 0.339,
"step": 420
},
{
"epoch": 1.3074534161490683,
"grad_norm": 0.28691874649916205,
"learning_rate": 3.135788262370541e-05,
"loss": 0.3158,
"step": 421
},
{
"epoch": 1.31055900621118,
"grad_norm": 0.32572558244440175,
"learning_rate": 3.130034522439586e-05,
"loss": 0.3425,
"step": 422
},
{
"epoch": 1.3136645962732918,
"grad_norm": 0.2840181269830042,
"learning_rate": 3.124280782508631e-05,
"loss": 0.3446,
"step": 423
},
{
"epoch": 1.3167701863354038,
"grad_norm": 0.314090935226993,
"learning_rate": 3.1185270425776756e-05,
"loss": 0.3315,
"step": 424
},
{
"epoch": 1.3198757763975155,
"grad_norm": 0.31197822717588264,
"learning_rate": 3.1127733026467205e-05,
"loss": 0.3443,
"step": 425
},
{
"epoch": 1.3229813664596273,
"grad_norm": 0.2864210002126174,
"learning_rate": 3.1070195627157654e-05,
"loss": 0.3375,
"step": 426
},
{
"epoch": 1.3260869565217392,
"grad_norm": 0.25519688185589984,
"learning_rate": 3.10126582278481e-05,
"loss": 0.3384,
"step": 427
},
{
"epoch": 1.329192546583851,
"grad_norm": 0.2744740349540228,
"learning_rate": 3.095512082853855e-05,
"loss": 0.3383,
"step": 428
},
{
"epoch": 1.3322981366459627,
"grad_norm": 0.2607087924929348,
"learning_rate": 3.0897583429229e-05,
"loss": 0.3555,
"step": 429
},
{
"epoch": 1.3354037267080745,
"grad_norm": 0.27684287170228183,
"learning_rate": 3.084004602991945e-05,
"loss": 0.3261,
"step": 430
},
{
"epoch": 1.3385093167701863,
"grad_norm": 0.3095550998483706,
"learning_rate": 3.07825086306099e-05,
"loss": 0.3512,
"step": 431
},
{
"epoch": 1.341614906832298,
"grad_norm": 0.25842001969735057,
"learning_rate": 3.0724971231300346e-05,
"loss": 0.3296,
"step": 432
},
{
"epoch": 1.34472049689441,
"grad_norm": 0.30589083199518,
"learning_rate": 3.0667433831990795e-05,
"loss": 0.3329,
"step": 433
},
{
"epoch": 1.3478260869565217,
"grad_norm": 0.2825831249071207,
"learning_rate": 3.0609896432681244e-05,
"loss": 0.3403,
"step": 434
},
{
"epoch": 1.3509316770186335,
"grad_norm": 0.2849649494187899,
"learning_rate": 3.055235903337169e-05,
"loss": 0.3329,
"step": 435
},
{
"epoch": 1.3540372670807455,
"grad_norm": 0.31227992790240827,
"learning_rate": 3.0494821634062144e-05,
"loss": 0.3402,
"step": 436
},
{
"epoch": 1.3571428571428572,
"grad_norm": 0.28830226140066545,
"learning_rate": 3.043728423475259e-05,
"loss": 0.3343,
"step": 437
},
{
"epoch": 1.360248447204969,
"grad_norm": 0.2920144019191934,
"learning_rate": 3.0379746835443042e-05,
"loss": 0.3262,
"step": 438
},
{
"epoch": 1.3633540372670807,
"grad_norm": 0.25016168615415485,
"learning_rate": 3.0322209436133487e-05,
"loss": 0.3394,
"step": 439
},
{
"epoch": 1.3664596273291925,
"grad_norm": 0.30261081735444717,
"learning_rate": 3.026467203682394e-05,
"loss": 0.3462,
"step": 440
},
{
"epoch": 1.3695652173913042,
"grad_norm": 0.2881616381341832,
"learning_rate": 3.0207134637514385e-05,
"loss": 0.3318,
"step": 441
},
{
"epoch": 1.3726708074534162,
"grad_norm": 0.2880320213397424,
"learning_rate": 3.0149597238204837e-05,
"loss": 0.3467,
"step": 442
},
{
"epoch": 1.375776397515528,
"grad_norm": 0.27020350890941985,
"learning_rate": 3.0092059838895282e-05,
"loss": 0.332,
"step": 443
},
{
"epoch": 1.3788819875776397,
"grad_norm": 0.3103789819064371,
"learning_rate": 3.0034522439585734e-05,
"loss": 0.3557,
"step": 444
},
{
"epoch": 1.3819875776397517,
"grad_norm": 0.2689229148068124,
"learning_rate": 2.997698504027618e-05,
"loss": 0.3439,
"step": 445
},
{
"epoch": 1.3850931677018634,
"grad_norm": 0.3284067395525679,
"learning_rate": 2.991944764096663e-05,
"loss": 0.3361,
"step": 446
},
{
"epoch": 1.3881987577639752,
"grad_norm": 0.31384093062312546,
"learning_rate": 2.9861910241657077e-05,
"loss": 0.3397,
"step": 447
},
{
"epoch": 1.391304347826087,
"grad_norm": 0.2699369577142723,
"learning_rate": 2.980437284234753e-05,
"loss": 0.3586,
"step": 448
},
{
"epoch": 1.3944099378881987,
"grad_norm": 0.3036288617772187,
"learning_rate": 2.9746835443037974e-05,
"loss": 0.3565,
"step": 449
},
{
"epoch": 1.3975155279503104,
"grad_norm": 0.3124807578288405,
"learning_rate": 2.9689298043728427e-05,
"loss": 0.3419,
"step": 450
},
{
"epoch": 1.4006211180124224,
"grad_norm": 0.2979650176094835,
"learning_rate": 2.9631760644418872e-05,
"loss": 0.348,
"step": 451
},
{
"epoch": 1.4037267080745341,
"grad_norm": 0.2681659506306783,
"learning_rate": 2.9574223245109324e-05,
"loss": 0.3299,
"step": 452
},
{
"epoch": 1.406832298136646,
"grad_norm": 0.32598633427460977,
"learning_rate": 2.951668584579977e-05,
"loss": 0.3386,
"step": 453
},
{
"epoch": 1.4099378881987579,
"grad_norm": 0.3247280401348239,
"learning_rate": 2.945914844649022e-05,
"loss": 0.3616,
"step": 454
},
{
"epoch": 1.4130434782608696,
"grad_norm": 0.2674177394891557,
"learning_rate": 2.940161104718067e-05,
"loss": 0.3477,
"step": 455
},
{
"epoch": 1.4161490683229814,
"grad_norm": 0.31284120066769544,
"learning_rate": 2.934407364787112e-05,
"loss": 0.3359,
"step": 456
},
{
"epoch": 1.4192546583850931,
"grad_norm": 0.29650681379778476,
"learning_rate": 2.9286536248561568e-05,
"loss": 0.3534,
"step": 457
},
{
"epoch": 1.4223602484472049,
"grad_norm": 0.28958329200728805,
"learning_rate": 2.9228998849252016e-05,
"loss": 0.3414,
"step": 458
},
{
"epoch": 1.4254658385093169,
"grad_norm": 0.3164618567976454,
"learning_rate": 2.9171461449942465e-05,
"loss": 0.351,
"step": 459
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.32604862143805774,
"learning_rate": 2.9113924050632914e-05,
"loss": 0.3185,
"step": 460
},
{
"epoch": 1.4316770186335404,
"grad_norm": 0.2371091150488046,
"learning_rate": 2.9056386651323363e-05,
"loss": 0.3268,
"step": 461
},
{
"epoch": 1.434782608695652,
"grad_norm": 0.28836250577098943,
"learning_rate": 2.899884925201381e-05,
"loss": 0.3203,
"step": 462
},
{
"epoch": 1.437888198757764,
"grad_norm": 0.29935943849859553,
"learning_rate": 2.894131185270426e-05,
"loss": 0.3419,
"step": 463
},
{
"epoch": 1.4409937888198758,
"grad_norm": 0.2678768364941078,
"learning_rate": 2.888377445339471e-05,
"loss": 0.3423,
"step": 464
},
{
"epoch": 1.4440993788819876,
"grad_norm": 0.3004413989905001,
"learning_rate": 2.8826237054085157e-05,
"loss": 0.3448,
"step": 465
},
{
"epoch": 1.4472049689440993,
"grad_norm": 0.3437138642713499,
"learning_rate": 2.8768699654775606e-05,
"loss": 0.3624,
"step": 466
},
{
"epoch": 1.450310559006211,
"grad_norm": 0.27833054674558505,
"learning_rate": 2.8711162255466055e-05,
"loss": 0.3559,
"step": 467
},
{
"epoch": 1.453416149068323,
"grad_norm": 0.30426248134832284,
"learning_rate": 2.8653624856156504e-05,
"loss": 0.3409,
"step": 468
},
{
"epoch": 1.4565217391304348,
"grad_norm": 0.2884530747421473,
"learning_rate": 2.8596087456846952e-05,
"loss": 0.3543,
"step": 469
},
{
"epoch": 1.4596273291925466,
"grad_norm": 0.26674718010863235,
"learning_rate": 2.85385500575374e-05,
"loss": 0.3352,
"step": 470
},
{
"epoch": 1.4627329192546583,
"grad_norm": 0.24962947417256104,
"learning_rate": 2.848101265822785e-05,
"loss": 0.3331,
"step": 471
},
{
"epoch": 1.4658385093167703,
"grad_norm": 0.24321872392892266,
"learning_rate": 2.84234752589183e-05,
"loss": 0.3346,
"step": 472
},
{
"epoch": 1.468944099378882,
"grad_norm": 0.25806201920649635,
"learning_rate": 2.8365937859608747e-05,
"loss": 0.3433,
"step": 473
},
{
"epoch": 1.4720496894409938,
"grad_norm": 0.260107860168702,
"learning_rate": 2.8308400460299196e-05,
"loss": 0.3518,
"step": 474
},
{
"epoch": 1.4751552795031055,
"grad_norm": 0.27151545722001336,
"learning_rate": 2.8250863060989645e-05,
"loss": 0.3222,
"step": 475
},
{
"epoch": 1.4782608695652173,
"grad_norm": 0.2699064437677885,
"learning_rate": 2.8193325661680093e-05,
"loss": 0.3408,
"step": 476
},
{
"epoch": 1.4813664596273293,
"grad_norm": 0.2534825847738341,
"learning_rate": 2.8135788262370542e-05,
"loss": 0.3355,
"step": 477
},
{
"epoch": 1.484472049689441,
"grad_norm": 0.2596248018497863,
"learning_rate": 2.807825086306099e-05,
"loss": 0.3317,
"step": 478
},
{
"epoch": 1.4875776397515528,
"grad_norm": 0.22547358749920884,
"learning_rate": 2.802071346375144e-05,
"loss": 0.3172,
"step": 479
},
{
"epoch": 1.4906832298136645,
"grad_norm": 0.28156958226578077,
"learning_rate": 2.796317606444189e-05,
"loss": 0.3476,
"step": 480
},
{
"epoch": 1.4937888198757765,
"grad_norm": 0.26615285376164327,
"learning_rate": 2.7905638665132337e-05,
"loss": 0.3489,
"step": 481
},
{
"epoch": 1.4968944099378882,
"grad_norm": 0.246150116031317,
"learning_rate": 2.7848101265822786e-05,
"loss": 0.3464,
"step": 482
},
{
"epoch": 1.5,
"grad_norm": 0.25268560596400597,
"learning_rate": 2.7790563866513235e-05,
"loss": 0.3391,
"step": 483
},
{
"epoch": 1.5031055900621118,
"grad_norm": 0.2946205590355613,
"learning_rate": 2.7733026467203683e-05,
"loss": 0.3541,
"step": 484
},
{
"epoch": 1.5062111801242235,
"grad_norm": 0.2750424223242439,
"learning_rate": 2.7675489067894132e-05,
"loss": 0.3276,
"step": 485
},
{
"epoch": 1.5093167701863353,
"grad_norm": 0.28954598608369275,
"learning_rate": 2.761795166858458e-05,
"loss": 0.3554,
"step": 486
},
{
"epoch": 1.5124223602484472,
"grad_norm": 0.29461626033953947,
"learning_rate": 2.756041426927503e-05,
"loss": 0.3293,
"step": 487
},
{
"epoch": 1.515527950310559,
"grad_norm": 0.2407514728215296,
"learning_rate": 2.7502876869965478e-05,
"loss": 0.3263,
"step": 488
},
{
"epoch": 1.518633540372671,
"grad_norm": 0.24475815135162626,
"learning_rate": 2.7445339470655927e-05,
"loss": 0.3423,
"step": 489
},
{
"epoch": 1.5217391304347827,
"grad_norm": 0.32665261682040186,
"learning_rate": 2.7387802071346376e-05,
"loss": 0.3333,
"step": 490
},
{
"epoch": 1.5248447204968945,
"grad_norm": 0.2552003566164109,
"learning_rate": 2.7330264672036824e-05,
"loss": 0.3466,
"step": 491
},
{
"epoch": 1.5279503105590062,
"grad_norm": 0.2871320623730171,
"learning_rate": 2.7272727272727273e-05,
"loss": 0.3447,
"step": 492
},
{
"epoch": 1.531055900621118,
"grad_norm": 0.2440639273175817,
"learning_rate": 2.7215189873417722e-05,
"loss": 0.338,
"step": 493
},
{
"epoch": 1.5341614906832297,
"grad_norm": 0.24115465120440344,
"learning_rate": 2.715765247410817e-05,
"loss": 0.3245,
"step": 494
},
{
"epoch": 1.5372670807453415,
"grad_norm": 0.2781064697786101,
"learning_rate": 2.7100115074798623e-05,
"loss": 0.3637,
"step": 495
},
{
"epoch": 1.5403726708074534,
"grad_norm": 0.27237179201858924,
"learning_rate": 2.7042577675489068e-05,
"loss": 0.353,
"step": 496
},
{
"epoch": 1.5434782608695652,
"grad_norm": 0.3018049050362612,
"learning_rate": 2.698504027617952e-05,
"loss": 0.33,
"step": 497
},
{
"epoch": 1.5465838509316772,
"grad_norm": 0.319532872255584,
"learning_rate": 2.6927502876869965e-05,
"loss": 0.341,
"step": 498
},
{
"epoch": 1.549689440993789,
"grad_norm": 0.23318500669833875,
"learning_rate": 2.6869965477560418e-05,
"loss": 0.3324,
"step": 499
},
{
"epoch": 1.5527950310559007,
"grad_norm": 0.3108509837550317,
"learning_rate": 2.6812428078250863e-05,
"loss": 0.3484,
"step": 500
},
{
"epoch": 1.5559006211180124,
"grad_norm": 0.27432952612163103,
"learning_rate": 2.6754890678941315e-05,
"loss": 0.3301,
"step": 501
},
{
"epoch": 1.5590062111801242,
"grad_norm": 0.2474022932813197,
"learning_rate": 2.669735327963176e-05,
"loss": 0.3335,
"step": 502
},
{
"epoch": 1.562111801242236,
"grad_norm": 0.25918516414740417,
"learning_rate": 2.6639815880322212e-05,
"loss": 0.3531,
"step": 503
},
{
"epoch": 1.5652173913043477,
"grad_norm": 0.28631487498944946,
"learning_rate": 2.6582278481012658e-05,
"loss": 0.3461,
"step": 504
},
{
"epoch": 1.5683229813664596,
"grad_norm": 0.2541239393514543,
"learning_rate": 2.652474108170311e-05,
"loss": 0.3406,
"step": 505
},
{
"epoch": 1.5714285714285714,
"grad_norm": 0.26793962508861174,
"learning_rate": 2.646720368239356e-05,
"loss": 0.3333,
"step": 506
},
{
"epoch": 1.5745341614906834,
"grad_norm": 0.30945895076026697,
"learning_rate": 2.6409666283084007e-05,
"loss": 0.3429,
"step": 507
},
{
"epoch": 1.5776397515527951,
"grad_norm": 0.2625598679112342,
"learning_rate": 2.6352128883774456e-05,
"loss": 0.3475,
"step": 508
},
{
"epoch": 1.5807453416149069,
"grad_norm": 0.3340128053196445,
"learning_rate": 2.6294591484464905e-05,
"loss": 0.3548,
"step": 509
},
{
"epoch": 1.5838509316770186,
"grad_norm": 0.33090740840028027,
"learning_rate": 2.6237054085155354e-05,
"loss": 0.3482,
"step": 510
},
{
"epoch": 1.5869565217391304,
"grad_norm": 0.25904205831808136,
"learning_rate": 2.6179516685845802e-05,
"loss": 0.3322,
"step": 511
},
{
"epoch": 1.5900621118012421,
"grad_norm": 0.3458888736647229,
"learning_rate": 2.612197928653625e-05,
"loss": 0.3466,
"step": 512
},
{
"epoch": 1.5931677018633539,
"grad_norm": 0.26139554234188184,
"learning_rate": 2.60644418872267e-05,
"loss": 0.3357,
"step": 513
},
{
"epoch": 1.5962732919254659,
"grad_norm": 0.24797420076401436,
"learning_rate": 2.600690448791715e-05,
"loss": 0.3263,
"step": 514
},
{
"epoch": 1.5993788819875776,
"grad_norm": 0.25127963694679545,
"learning_rate": 2.5949367088607597e-05,
"loss": 0.3541,
"step": 515
},
{
"epoch": 1.6024844720496896,
"grad_norm": 0.24350107098267543,
"learning_rate": 2.5891829689298046e-05,
"loss": 0.3332,
"step": 516
},
{
"epoch": 1.6055900621118013,
"grad_norm": 0.2597186201230917,
"learning_rate": 2.5834292289988495e-05,
"loss": 0.3357,
"step": 517
},
{
"epoch": 1.608695652173913,
"grad_norm": 0.2553977260875351,
"learning_rate": 2.5776754890678943e-05,
"loss": 0.3381,
"step": 518
},
{
"epoch": 1.6118012422360248,
"grad_norm": 0.2495485503111441,
"learning_rate": 2.5719217491369392e-05,
"loss": 0.3489,
"step": 519
},
{
"epoch": 1.6149068322981366,
"grad_norm": 0.2826237704718821,
"learning_rate": 2.566168009205984e-05,
"loss": 0.3269,
"step": 520
},
{
"epoch": 1.6180124223602483,
"grad_norm": 0.2907559187980417,
"learning_rate": 2.560414269275029e-05,
"loss": 0.353,
"step": 521
},
{
"epoch": 1.62111801242236,
"grad_norm": 0.30078662752184515,
"learning_rate": 2.5546605293440738e-05,
"loss": 0.3344,
"step": 522
},
{
"epoch": 1.624223602484472,
"grad_norm": 0.2494274026603714,
"learning_rate": 2.5489067894131187e-05,
"loss": 0.3262,
"step": 523
},
{
"epoch": 1.6273291925465838,
"grad_norm": 0.22856587280801138,
"learning_rate": 2.5431530494821636e-05,
"loss": 0.3316,
"step": 524
},
{
"epoch": 1.6304347826086958,
"grad_norm": 0.24524446266248454,
"learning_rate": 2.5373993095512084e-05,
"loss": 0.3254,
"step": 525
},
{
"epoch": 1.6335403726708075,
"grad_norm": 0.2781145066258604,
"learning_rate": 2.5316455696202533e-05,
"loss": 0.3343,
"step": 526
},
{
"epoch": 1.6366459627329193,
"grad_norm": 0.24971582793985952,
"learning_rate": 2.5258918296892982e-05,
"loss": 0.3423,
"step": 527
},
{
"epoch": 1.639751552795031,
"grad_norm": 0.2961483358525156,
"learning_rate": 2.520138089758343e-05,
"loss": 0.3554,
"step": 528
},
{
"epoch": 1.6428571428571428,
"grad_norm": 0.30349368090110823,
"learning_rate": 2.514384349827388e-05,
"loss": 0.3563,
"step": 529
},
{
"epoch": 1.6459627329192545,
"grad_norm": 0.28292757074394537,
"learning_rate": 2.5086306098964328e-05,
"loss": 0.352,
"step": 530
},
{
"epoch": 1.6490683229813663,
"grad_norm": 0.25778656185495347,
"learning_rate": 2.5028768699654777e-05,
"loss": 0.3486,
"step": 531
},
{
"epoch": 1.6521739130434783,
"grad_norm": 0.32420346337090605,
"learning_rate": 2.4971231300345226e-05,
"loss": 0.3497,
"step": 532
},
{
"epoch": 1.65527950310559,
"grad_norm": 0.24803469539845557,
"learning_rate": 2.4913693901035674e-05,
"loss": 0.3325,
"step": 533
},
{
"epoch": 1.658385093167702,
"grad_norm": 0.23193714998127715,
"learning_rate": 2.4856156501726123e-05,
"loss": 0.3244,
"step": 534
},
{
"epoch": 1.6614906832298137,
"grad_norm": 0.31410082505061837,
"learning_rate": 2.4798619102416572e-05,
"loss": 0.3295,
"step": 535
},
{
"epoch": 1.6645962732919255,
"grad_norm": 0.29805963194310403,
"learning_rate": 2.474108170310702e-05,
"loss": 0.3576,
"step": 536
},
{
"epoch": 1.6677018633540373,
"grad_norm": 0.2773254453129355,
"learning_rate": 2.468354430379747e-05,
"loss": 0.3382,
"step": 537
},
{
"epoch": 1.670807453416149,
"grad_norm": 0.32678020135127306,
"learning_rate": 2.4626006904487918e-05,
"loss": 0.3196,
"step": 538
},
{
"epoch": 1.6739130434782608,
"grad_norm": 0.3166277691971712,
"learning_rate": 2.4568469505178367e-05,
"loss": 0.3567,
"step": 539
},
{
"epoch": 1.6770186335403725,
"grad_norm": 0.28823972531727493,
"learning_rate": 2.4510932105868815e-05,
"loss": 0.3303,
"step": 540
},
{
"epoch": 1.6801242236024845,
"grad_norm": 0.31416636195922193,
"learning_rate": 2.4453394706559264e-05,
"loss": 0.3468,
"step": 541
},
{
"epoch": 1.6832298136645962,
"grad_norm": 0.29389175839717274,
"learning_rate": 2.4395857307249713e-05,
"loss": 0.3334,
"step": 542
},
{
"epoch": 1.6863354037267082,
"grad_norm": 0.2574868658425901,
"learning_rate": 2.433831990794016e-05,
"loss": 0.3459,
"step": 543
},
{
"epoch": 1.68944099378882,
"grad_norm": 0.43013005229440787,
"learning_rate": 2.428078250863061e-05,
"loss": 0.3663,
"step": 544
},
{
"epoch": 1.6925465838509317,
"grad_norm": 0.29719384149686173,
"learning_rate": 2.422324510932106e-05,
"loss": 0.3227,
"step": 545
},
{
"epoch": 1.6956521739130435,
"grad_norm": 0.2630824870951196,
"learning_rate": 2.4165707710011508e-05,
"loss": 0.3334,
"step": 546
},
{
"epoch": 1.6987577639751552,
"grad_norm": 0.262646615576403,
"learning_rate": 2.4108170310701956e-05,
"loss": 0.3289,
"step": 547
},
{
"epoch": 1.701863354037267,
"grad_norm": 0.29464184604515603,
"learning_rate": 2.4050632911392405e-05,
"loss": 0.3606,
"step": 548
},
{
"epoch": 1.704968944099379,
"grad_norm": 0.270420959511805,
"learning_rate": 2.3993095512082854e-05,
"loss": 0.3344,
"step": 549
},
{
"epoch": 1.7080745341614907,
"grad_norm": 0.25692484125212045,
"learning_rate": 2.3935558112773303e-05,
"loss": 0.3487,
"step": 550
},
{
"epoch": 1.7111801242236024,
"grad_norm": 0.2438708000844588,
"learning_rate": 2.387802071346375e-05,
"loss": 0.3368,
"step": 551
},
{
"epoch": 1.7142857142857144,
"grad_norm": 0.3013744640384419,
"learning_rate": 2.38204833141542e-05,
"loss": 0.3374,
"step": 552
},
{
"epoch": 1.7173913043478262,
"grad_norm": 0.25432846986941376,
"learning_rate": 2.376294591484465e-05,
"loss": 0.3229,
"step": 553
},
{
"epoch": 1.720496894409938,
"grad_norm": 0.25904590947672523,
"learning_rate": 2.3705408515535098e-05,
"loss": 0.3446,
"step": 554
},
{
"epoch": 1.7236024844720497,
"grad_norm": 0.30606145603760704,
"learning_rate": 2.3647871116225546e-05,
"loss": 0.3341,
"step": 555
},
{
"epoch": 1.7267080745341614,
"grad_norm": 0.29538170031014566,
"learning_rate": 2.3590333716915995e-05,
"loss": 0.3289,
"step": 556
},
{
"epoch": 1.7298136645962732,
"grad_norm": 0.27134852683063904,
"learning_rate": 2.3532796317606444e-05,
"loss": 0.3547,
"step": 557
},
{
"epoch": 1.7329192546583851,
"grad_norm": 0.2825681002780479,
"learning_rate": 2.3475258918296892e-05,
"loss": 0.3379,
"step": 558
},
{
"epoch": 1.736024844720497,
"grad_norm": 0.2540853936510712,
"learning_rate": 2.341772151898734e-05,
"loss": 0.3382,
"step": 559
},
{
"epoch": 1.7391304347826086,
"grad_norm": 0.23898673149156113,
"learning_rate": 2.336018411967779e-05,
"loss": 0.3204,
"step": 560
},
{
"epoch": 1.7422360248447206,
"grad_norm": 0.28690037448179756,
"learning_rate": 2.330264672036824e-05,
"loss": 0.3475,
"step": 561
},
{
"epoch": 1.7453416149068324,
"grad_norm": 0.2517436769783244,
"learning_rate": 2.3245109321058687e-05,
"loss": 0.3336,
"step": 562
},
{
"epoch": 1.7484472049689441,
"grad_norm": 0.26052794049930406,
"learning_rate": 2.3187571921749136e-05,
"loss": 0.3659,
"step": 563
},
{
"epoch": 1.7515527950310559,
"grad_norm": 0.2520454393087574,
"learning_rate": 2.3130034522439585e-05,
"loss": 0.341,
"step": 564
},
{
"epoch": 1.7546583850931676,
"grad_norm": 0.24469475054712242,
"learning_rate": 2.3072497123130034e-05,
"loss": 0.3385,
"step": 565
},
{
"epoch": 1.7577639751552794,
"grad_norm": 0.2688171235493825,
"learning_rate": 2.3014959723820482e-05,
"loss": 0.3194,
"step": 566
},
{
"epoch": 1.7608695652173914,
"grad_norm": 0.24660650779589638,
"learning_rate": 2.295742232451093e-05,
"loss": 0.3414,
"step": 567
},
{
"epoch": 1.763975155279503,
"grad_norm": 0.24074948906029303,
"learning_rate": 2.289988492520138e-05,
"loss": 0.3487,
"step": 568
},
{
"epoch": 1.7670807453416149,
"grad_norm": 0.2683374003415654,
"learning_rate": 2.284234752589183e-05,
"loss": 0.3324,
"step": 569
},
{
"epoch": 1.7701863354037268,
"grad_norm": 0.2615920960522321,
"learning_rate": 2.278481012658228e-05,
"loss": 0.3609,
"step": 570
},
{
"epoch": 1.7732919254658386,
"grad_norm": 0.24217423401661245,
"learning_rate": 2.272727272727273e-05,
"loss": 0.3642,
"step": 571
},
{
"epoch": 1.7763975155279503,
"grad_norm": 0.3146547539059143,
"learning_rate": 2.2669735327963178e-05,
"loss": 0.3333,
"step": 572
},
{
"epoch": 1.779503105590062,
"grad_norm": 0.2562957388358894,
"learning_rate": 2.2612197928653627e-05,
"loss": 0.3446,
"step": 573
},
{
"epoch": 1.7826086956521738,
"grad_norm": 0.2514732345893343,
"learning_rate": 2.2554660529344075e-05,
"loss": 0.3313,
"step": 574
},
{
"epoch": 1.7857142857142856,
"grad_norm": 0.27268825204355784,
"learning_rate": 2.2497123130034524e-05,
"loss": 0.3431,
"step": 575
},
{
"epoch": 1.7888198757763976,
"grad_norm": 0.24683018483720148,
"learning_rate": 2.2439585730724973e-05,
"loss": 0.3264,
"step": 576
},
{
"epoch": 1.7919254658385093,
"grad_norm": 0.2511343392474156,
"learning_rate": 2.238204833141542e-05,
"loss": 0.3231,
"step": 577
},
{
"epoch": 1.795031055900621,
"grad_norm": 0.29263014098541856,
"learning_rate": 2.232451093210587e-05,
"loss": 0.3425,
"step": 578
},
{
"epoch": 1.798136645962733,
"grad_norm": 0.26439452081008274,
"learning_rate": 2.226697353279632e-05,
"loss": 0.3404,
"step": 579
},
{
"epoch": 1.8012422360248448,
"grad_norm": 0.26624397225893237,
"learning_rate": 2.2209436133486768e-05,
"loss": 0.3521,
"step": 580
},
{
"epoch": 1.8043478260869565,
"grad_norm": 0.27231006655087864,
"learning_rate": 2.2151898734177217e-05,
"loss": 0.3499,
"step": 581
},
{
"epoch": 1.8074534161490683,
"grad_norm": 0.2717715300694685,
"learning_rate": 2.2094361334867665e-05,
"loss": 0.3488,
"step": 582
},
{
"epoch": 1.81055900621118,
"grad_norm": 0.2555398572654679,
"learning_rate": 2.2036823935558114e-05,
"loss": 0.3484,
"step": 583
},
{
"epoch": 1.8136645962732918,
"grad_norm": 0.27285873888872886,
"learning_rate": 2.1979286536248563e-05,
"loss": 0.3301,
"step": 584
},
{
"epoch": 1.8167701863354038,
"grad_norm": 0.25210730048319585,
"learning_rate": 2.192174913693901e-05,
"loss": 0.3424,
"step": 585
},
{
"epoch": 1.8198757763975155,
"grad_norm": 0.26842055467218623,
"learning_rate": 2.186421173762946e-05,
"loss": 0.3408,
"step": 586
},
{
"epoch": 1.8229813664596275,
"grad_norm": 0.3208770216327945,
"learning_rate": 2.180667433831991e-05,
"loss": 0.3312,
"step": 587
},
{
"epoch": 1.8260869565217392,
"grad_norm": 0.2559541162545561,
"learning_rate": 2.1749136939010358e-05,
"loss": 0.36,
"step": 588
},
{
"epoch": 1.829192546583851,
"grad_norm": 0.32134394732411636,
"learning_rate": 2.1691599539700806e-05,
"loss": 0.3394,
"step": 589
},
{
"epoch": 1.8322981366459627,
"grad_norm": 0.2708594663810051,
"learning_rate": 2.1634062140391255e-05,
"loss": 0.3431,
"step": 590
},
{
"epoch": 1.8354037267080745,
"grad_norm": 0.3010404719152366,
"learning_rate": 2.1576524741081704e-05,
"loss": 0.3417,
"step": 591
},
{
"epoch": 1.8385093167701863,
"grad_norm": 0.3070106173244936,
"learning_rate": 2.1518987341772153e-05,
"loss": 0.3388,
"step": 592
},
{
"epoch": 1.841614906832298,
"grad_norm": 0.24023699838734106,
"learning_rate": 2.14614499424626e-05,
"loss": 0.359,
"step": 593
},
{
"epoch": 1.84472049689441,
"grad_norm": 0.27420152661967667,
"learning_rate": 2.140391254315305e-05,
"loss": 0.3262,
"step": 594
},
{
"epoch": 1.8478260869565217,
"grad_norm": 0.3128089706224423,
"learning_rate": 2.13463751438435e-05,
"loss": 0.3284,
"step": 595
},
{
"epoch": 1.8509316770186337,
"grad_norm": 0.26784524761567324,
"learning_rate": 2.1288837744533947e-05,
"loss": 0.3337,
"step": 596
},
{
"epoch": 1.8540372670807455,
"grad_norm": 0.2897924996984458,
"learning_rate": 2.1231300345224396e-05,
"loss": 0.3325,
"step": 597
},
{
"epoch": 1.8571428571428572,
"grad_norm": 0.25009521464769496,
"learning_rate": 2.1173762945914845e-05,
"loss": 0.3457,
"step": 598
},
{
"epoch": 1.860248447204969,
"grad_norm": 0.3055325339631166,
"learning_rate": 2.1116225546605294e-05,
"loss": 0.3506,
"step": 599
},
{
"epoch": 1.8633540372670807,
"grad_norm": 0.28613812992385934,
"learning_rate": 2.1058688147295742e-05,
"loss": 0.3609,
"step": 600
},
{
"epoch": 1.8664596273291925,
"grad_norm": 0.2705173567449075,
"learning_rate": 2.100115074798619e-05,
"loss": 0.3259,
"step": 601
},
{
"epoch": 1.8695652173913042,
"grad_norm": 0.3180478776075474,
"learning_rate": 2.094361334867664e-05,
"loss": 0.3459,
"step": 602
},
{
"epoch": 1.8726708074534162,
"grad_norm": 0.2667088182720578,
"learning_rate": 2.088607594936709e-05,
"loss": 0.3415,
"step": 603
},
{
"epoch": 1.875776397515528,
"grad_norm": 0.2721287511996052,
"learning_rate": 2.0828538550057537e-05,
"loss": 0.3295,
"step": 604
},
{
"epoch": 1.87888198757764,
"grad_norm": 0.24692799289830528,
"learning_rate": 2.0771001150747986e-05,
"loss": 0.3389,
"step": 605
},
{
"epoch": 1.8819875776397517,
"grad_norm": 0.25562500894154333,
"learning_rate": 2.0713463751438435e-05,
"loss": 0.3319,
"step": 606
},
{
"epoch": 1.8850931677018634,
"grad_norm": 0.2788963760074411,
"learning_rate": 2.0655926352128883e-05,
"loss": 0.3285,
"step": 607
},
{
"epoch": 1.8881987577639752,
"grad_norm": 0.24657052891375197,
"learning_rate": 2.0598388952819332e-05,
"loss": 0.3457,
"step": 608
},
{
"epoch": 1.891304347826087,
"grad_norm": 0.32309129817645427,
"learning_rate": 2.054085155350978e-05,
"loss": 0.3403,
"step": 609
},
{
"epoch": 1.8944099378881987,
"grad_norm": 0.30175425766070024,
"learning_rate": 2.048331415420023e-05,
"loss": 0.3471,
"step": 610
},
{
"epoch": 1.8975155279503104,
"grad_norm": 0.26841376007608464,
"learning_rate": 2.042577675489068e-05,
"loss": 0.3375,
"step": 611
},
{
"epoch": 1.9006211180124224,
"grad_norm": 0.262662943207609,
"learning_rate": 2.0368239355581127e-05,
"loss": 0.3323,
"step": 612
},
{
"epoch": 1.9037267080745341,
"grad_norm": 0.2701013469116013,
"learning_rate": 2.0310701956271576e-05,
"loss": 0.3421,
"step": 613
},
{
"epoch": 1.9068322981366461,
"grad_norm": 0.2714596873719603,
"learning_rate": 2.0253164556962025e-05,
"loss": 0.3459,
"step": 614
},
{
"epoch": 1.9099378881987579,
"grad_norm": 0.27588772682551244,
"learning_rate": 2.0195627157652473e-05,
"loss": 0.3393,
"step": 615
},
{
"epoch": 1.9130434782608696,
"grad_norm": 0.2599899065726882,
"learning_rate": 2.0138089758342922e-05,
"loss": 0.3272,
"step": 616
},
{
"epoch": 1.9161490683229814,
"grad_norm": 0.29859390134967695,
"learning_rate": 2.008055235903337e-05,
"loss": 0.3406,
"step": 617
},
{
"epoch": 1.9192546583850931,
"grad_norm": 0.2506363801804046,
"learning_rate": 2.002301495972382e-05,
"loss": 0.3442,
"step": 618
},
{
"epoch": 1.9223602484472049,
"grad_norm": 0.27643958694894183,
"learning_rate": 1.9965477560414268e-05,
"loss": 0.3266,
"step": 619
},
{
"epoch": 1.9254658385093166,
"grad_norm": 0.24433788612177662,
"learning_rate": 1.9907940161104717e-05,
"loss": 0.3282,
"step": 620
},
{
"epoch": 1.9285714285714286,
"grad_norm": 0.23745008386988042,
"learning_rate": 1.9850402761795166e-05,
"loss": 0.3362,
"step": 621
},
{
"epoch": 1.9316770186335404,
"grad_norm": 0.26121859150272697,
"learning_rate": 1.9792865362485614e-05,
"loss": 0.3286,
"step": 622
},
{
"epoch": 1.9347826086956523,
"grad_norm": 0.2593462473033886,
"learning_rate": 1.9735327963176063e-05,
"loss": 0.3277,
"step": 623
},
{
"epoch": 1.937888198757764,
"grad_norm": 0.251734596039316,
"learning_rate": 1.9677790563866512e-05,
"loss": 0.3243,
"step": 624
},
{
"epoch": 1.9409937888198758,
"grad_norm": 0.2796993503020773,
"learning_rate": 1.962025316455696e-05,
"loss": 0.341,
"step": 625
},
{
"epoch": 1.9440993788819876,
"grad_norm": 0.24405119567007771,
"learning_rate": 1.956271576524741e-05,
"loss": 0.3263,
"step": 626
},
{
"epoch": 1.9472049689440993,
"grad_norm": 0.2551104705286801,
"learning_rate": 1.9505178365937858e-05,
"loss": 0.3273,
"step": 627
},
{
"epoch": 1.950310559006211,
"grad_norm": 0.30160867369839583,
"learning_rate": 1.9447640966628307e-05,
"loss": 0.3461,
"step": 628
},
{
"epoch": 1.9534161490683228,
"grad_norm": 0.2307083280109175,
"learning_rate": 1.9390103567318755e-05,
"loss": 0.3195,
"step": 629
},
{
"epoch": 1.9565217391304348,
"grad_norm": 0.27856475014068843,
"learning_rate": 1.9332566168009204e-05,
"loss": 0.3534,
"step": 630
},
{
"epoch": 1.9596273291925466,
"grad_norm": 0.2422123663936176,
"learning_rate": 1.9275028768699653e-05,
"loss": 0.3444,
"step": 631
},
{
"epoch": 1.9627329192546585,
"grad_norm": 0.2508344560707552,
"learning_rate": 1.92174913693901e-05,
"loss": 0.3316,
"step": 632
},
{
"epoch": 1.9658385093167703,
"grad_norm": 0.25457679605852573,
"learning_rate": 1.915995397008055e-05,
"loss": 0.3511,
"step": 633
},
{
"epoch": 1.968944099378882,
"grad_norm": 0.255448027513179,
"learning_rate": 1.9102416570771002e-05,
"loss": 0.3417,
"step": 634
},
{
"epoch": 1.9720496894409938,
"grad_norm": 0.2655225587813165,
"learning_rate": 1.904487917146145e-05,
"loss": 0.3364,
"step": 635
},
{
"epoch": 1.9751552795031055,
"grad_norm": 0.2644532896395622,
"learning_rate": 1.89873417721519e-05,
"loss": 0.3439,
"step": 636
},
{
"epoch": 1.9782608695652173,
"grad_norm": 0.25431765900047304,
"learning_rate": 1.892980437284235e-05,
"loss": 0.3277,
"step": 637
},
{
"epoch": 1.981366459627329,
"grad_norm": 0.253925081047276,
"learning_rate": 1.8872266973532797e-05,
"loss": 0.345,
"step": 638
},
{
"epoch": 1.984472049689441,
"grad_norm": 0.2542103813230237,
"learning_rate": 1.8814729574223246e-05,
"loss": 0.3458,
"step": 639
},
{
"epoch": 1.9875776397515528,
"grad_norm": 0.298104123148457,
"learning_rate": 1.8757192174913695e-05,
"loss": 0.3278,
"step": 640
},
{
"epoch": 1.9906832298136647,
"grad_norm": 0.2322373735825899,
"learning_rate": 1.8699654775604144e-05,
"loss": 0.3426,
"step": 641
},
{
"epoch": 1.9937888198757765,
"grad_norm": 0.24606988538470728,
"learning_rate": 1.8642117376294592e-05,
"loss": 0.3358,
"step": 642
},
{
"epoch": 1.9968944099378882,
"grad_norm": 0.2849652231363428,
"learning_rate": 1.858457997698504e-05,
"loss": 0.315,
"step": 643
},
{
"epoch": 2.0,
"grad_norm": 0.277308601131606,
"learning_rate": 1.852704257767549e-05,
"loss": 0.3275,
"step": 644
},
{
"epoch": 2.0031055900621118,
"grad_norm": 0.3117446232875321,
"learning_rate": 1.846950517836594e-05,
"loss": 0.2631,
"step": 645
},
{
"epoch": 2.0062111801242235,
"grad_norm": 0.27820371561408924,
"learning_rate": 1.8411967779056387e-05,
"loss": 0.26,
"step": 646
},
{
"epoch": 2.0093167701863353,
"grad_norm": 0.35212508445991075,
"learning_rate": 1.8354430379746836e-05,
"loss": 0.2589,
"step": 647
},
{
"epoch": 2.012422360248447,
"grad_norm": 0.29598296168936833,
"learning_rate": 1.8296892980437285e-05,
"loss": 0.2727,
"step": 648
},
{
"epoch": 2.015527950310559,
"grad_norm": 0.23748767777958518,
"learning_rate": 1.8239355581127733e-05,
"loss": 0.2603,
"step": 649
},
{
"epoch": 2.018633540372671,
"grad_norm": 0.3396040317332316,
"learning_rate": 1.8181818181818182e-05,
"loss": 0.2494,
"step": 650
},
{
"epoch": 2.0217391304347827,
"grad_norm": 0.26773683268799814,
"learning_rate": 1.812428078250863e-05,
"loss": 0.2538,
"step": 651
},
{
"epoch": 2.0248447204968945,
"grad_norm": 0.26908218639639603,
"learning_rate": 1.806674338319908e-05,
"loss": 0.2636,
"step": 652
},
{
"epoch": 2.027950310559006,
"grad_norm": 0.2934635435841592,
"learning_rate": 1.8009205983889528e-05,
"loss": 0.25,
"step": 653
},
{
"epoch": 2.031055900621118,
"grad_norm": 0.2785089363356141,
"learning_rate": 1.7951668584579977e-05,
"loss": 0.2441,
"step": 654
},
{
"epoch": 2.0341614906832297,
"grad_norm": 0.2638349484890508,
"learning_rate": 1.7894131185270426e-05,
"loss": 0.2519,
"step": 655
},
{
"epoch": 2.0372670807453415,
"grad_norm": 0.2586235412884467,
"learning_rate": 1.7836593785960874e-05,
"loss": 0.2509,
"step": 656
},
{
"epoch": 2.040372670807453,
"grad_norm": 0.2472488808463837,
"learning_rate": 1.7779056386651323e-05,
"loss": 0.2538,
"step": 657
},
{
"epoch": 2.0434782608695654,
"grad_norm": 0.2795998982851747,
"learning_rate": 1.7721518987341772e-05,
"loss": 0.245,
"step": 658
},
{
"epoch": 2.046583850931677,
"grad_norm": 0.22761521829171,
"learning_rate": 1.766398158803222e-05,
"loss": 0.257,
"step": 659
},
{
"epoch": 2.049689440993789,
"grad_norm": 0.27491770536559856,
"learning_rate": 1.760644418872267e-05,
"loss": 0.2543,
"step": 660
},
{
"epoch": 2.0527950310559007,
"grad_norm": 0.2694008994446243,
"learning_rate": 1.7548906789413118e-05,
"loss": 0.2523,
"step": 661
},
{
"epoch": 2.0559006211180124,
"grad_norm": 0.2649790265269665,
"learning_rate": 1.7491369390103567e-05,
"loss": 0.2638,
"step": 662
},
{
"epoch": 2.059006211180124,
"grad_norm": 0.2592501818191762,
"learning_rate": 1.7433831990794016e-05,
"loss": 0.2577,
"step": 663
},
{
"epoch": 2.062111801242236,
"grad_norm": 0.2669822602469171,
"learning_rate": 1.7376294591484464e-05,
"loss": 0.2609,
"step": 664
},
{
"epoch": 2.0652173913043477,
"grad_norm": 0.24387894874703012,
"learning_rate": 1.7318757192174913e-05,
"loss": 0.2633,
"step": 665
},
{
"epoch": 2.0683229813664594,
"grad_norm": 0.2492896708129992,
"learning_rate": 1.7261219792865362e-05,
"loss": 0.2574,
"step": 666
},
{
"epoch": 2.0714285714285716,
"grad_norm": 0.23542048799701373,
"learning_rate": 1.720368239355581e-05,
"loss": 0.2472,
"step": 667
},
{
"epoch": 2.0745341614906834,
"grad_norm": 0.23318721989860372,
"learning_rate": 1.714614499424626e-05,
"loss": 0.2492,
"step": 668
},
{
"epoch": 2.077639751552795,
"grad_norm": 0.2184370669246145,
"learning_rate": 1.7088607594936708e-05,
"loss": 0.2614,
"step": 669
},
{
"epoch": 2.080745341614907,
"grad_norm": 0.23606974543337897,
"learning_rate": 1.7031070195627157e-05,
"loss": 0.2669,
"step": 670
},
{
"epoch": 2.0838509316770186,
"grad_norm": 0.24573655537821745,
"learning_rate": 1.6973532796317605e-05,
"loss": 0.2495,
"step": 671
},
{
"epoch": 2.0869565217391304,
"grad_norm": 0.22577283258104885,
"learning_rate": 1.6915995397008054e-05,
"loss": 0.2324,
"step": 672
},
{
"epoch": 2.090062111801242,
"grad_norm": 0.22880524187260692,
"learning_rate": 1.6858457997698503e-05,
"loss": 0.2454,
"step": 673
},
{
"epoch": 2.093167701863354,
"grad_norm": 0.23649506425354394,
"learning_rate": 1.680092059838895e-05,
"loss": 0.2706,
"step": 674
},
{
"epoch": 2.0962732919254656,
"grad_norm": 0.25403811052331426,
"learning_rate": 1.67433831990794e-05,
"loss": 0.2543,
"step": 675
},
{
"epoch": 2.099378881987578,
"grad_norm": 0.2443521353581772,
"learning_rate": 1.668584579976985e-05,
"loss": 0.2484,
"step": 676
},
{
"epoch": 2.1024844720496896,
"grad_norm": 0.21359909291298998,
"learning_rate": 1.6628308400460298e-05,
"loss": 0.242,
"step": 677
},
{
"epoch": 2.1055900621118013,
"grad_norm": 0.2270460742418379,
"learning_rate": 1.6570771001150746e-05,
"loss": 0.248,
"step": 678
},
{
"epoch": 2.108695652173913,
"grad_norm": 0.22643050215454086,
"learning_rate": 1.6513233601841195e-05,
"loss": 0.2477,
"step": 679
},
{
"epoch": 2.111801242236025,
"grad_norm": 0.22299140134872011,
"learning_rate": 1.6455696202531644e-05,
"loss": 0.2568,
"step": 680
},
{
"epoch": 2.1149068322981366,
"grad_norm": 0.21597401049687515,
"learning_rate": 1.6398158803222093e-05,
"loss": 0.2567,
"step": 681
},
{
"epoch": 2.1180124223602483,
"grad_norm": 0.21424964627998483,
"learning_rate": 1.634062140391254e-05,
"loss": 0.2594,
"step": 682
},
{
"epoch": 2.12111801242236,
"grad_norm": 0.22064934278360224,
"learning_rate": 1.628308400460299e-05,
"loss": 0.2548,
"step": 683
},
{
"epoch": 2.124223602484472,
"grad_norm": 0.2226010291399242,
"learning_rate": 1.622554660529344e-05,
"loss": 0.2392,
"step": 684
},
{
"epoch": 2.127329192546584,
"grad_norm": 0.23817934921154135,
"learning_rate": 1.616800920598389e-05,
"loss": 0.263,
"step": 685
},
{
"epoch": 2.130434782608696,
"grad_norm": 0.21811012125800597,
"learning_rate": 1.611047180667434e-05,
"loss": 0.239,
"step": 686
},
{
"epoch": 2.1335403726708075,
"grad_norm": 0.22669201592312113,
"learning_rate": 1.605293440736479e-05,
"loss": 0.2526,
"step": 687
},
{
"epoch": 2.1366459627329193,
"grad_norm": 0.23305634956402152,
"learning_rate": 1.5995397008055237e-05,
"loss": 0.2685,
"step": 688
},
{
"epoch": 2.139751552795031,
"grad_norm": 0.2115884014346869,
"learning_rate": 1.5937859608745686e-05,
"loss": 0.2393,
"step": 689
},
{
"epoch": 2.142857142857143,
"grad_norm": 0.2115627751194399,
"learning_rate": 1.5880322209436135e-05,
"loss": 0.251,
"step": 690
},
{
"epoch": 2.1459627329192545,
"grad_norm": 0.21692908050798426,
"learning_rate": 1.5822784810126583e-05,
"loss": 0.2512,
"step": 691
},
{
"epoch": 2.1490683229813663,
"grad_norm": 0.22462301830943301,
"learning_rate": 1.5765247410817032e-05,
"loss": 0.2511,
"step": 692
},
{
"epoch": 2.1521739130434785,
"grad_norm": 0.19326801109974084,
"learning_rate": 1.570771001150748e-05,
"loss": 0.2452,
"step": 693
},
{
"epoch": 2.1552795031055902,
"grad_norm": 0.23274032126488928,
"learning_rate": 1.565017261219793e-05,
"loss": 0.2529,
"step": 694
},
{
"epoch": 2.158385093167702,
"grad_norm": 0.22582002907669432,
"learning_rate": 1.5592635212888378e-05,
"loss": 0.2597,
"step": 695
},
{
"epoch": 2.1614906832298137,
"grad_norm": 0.21665150454694335,
"learning_rate": 1.5535097813578827e-05,
"loss": 0.258,
"step": 696
},
{
"epoch": 2.1645962732919255,
"grad_norm": 0.21401338541621684,
"learning_rate": 1.5477560414269276e-05,
"loss": 0.2556,
"step": 697
},
{
"epoch": 2.1677018633540373,
"grad_norm": 0.23527336018366451,
"learning_rate": 1.5420023014959724e-05,
"loss": 0.2582,
"step": 698
},
{
"epoch": 2.170807453416149,
"grad_norm": 0.21445105954714194,
"learning_rate": 1.5362485615650173e-05,
"loss": 0.2503,
"step": 699
},
{
"epoch": 2.1739130434782608,
"grad_norm": 0.21511080318136375,
"learning_rate": 1.5304948216340622e-05,
"loss": 0.2566,
"step": 700
},
{
"epoch": 2.1770186335403725,
"grad_norm": 0.24111702768301724,
"learning_rate": 1.5247410817031072e-05,
"loss": 0.2425,
"step": 701
},
{
"epoch": 2.1801242236024843,
"grad_norm": 0.21498468064553858,
"learning_rate": 1.5189873417721521e-05,
"loss": 0.2439,
"step": 702
},
{
"epoch": 2.1832298136645965,
"grad_norm": 0.21967291818581178,
"learning_rate": 1.513233601841197e-05,
"loss": 0.2509,
"step": 703
},
{
"epoch": 2.186335403726708,
"grad_norm": 0.2205935490599146,
"learning_rate": 1.5074798619102418e-05,
"loss": 0.2579,
"step": 704
},
{
"epoch": 2.18944099378882,
"grad_norm": 0.23758914721632698,
"learning_rate": 1.5017261219792867e-05,
"loss": 0.2399,
"step": 705
},
{
"epoch": 2.1925465838509317,
"grad_norm": 0.19571832530537867,
"learning_rate": 1.4959723820483316e-05,
"loss": 0.2404,
"step": 706
},
{
"epoch": 2.1956521739130435,
"grad_norm": 0.20772523111005442,
"learning_rate": 1.4902186421173765e-05,
"loss": 0.2489,
"step": 707
},
{
"epoch": 2.198757763975155,
"grad_norm": 0.2078388868457063,
"learning_rate": 1.4844649021864213e-05,
"loss": 0.2404,
"step": 708
},
{
"epoch": 2.201863354037267,
"grad_norm": 0.24074640885647317,
"learning_rate": 1.4787111622554662e-05,
"loss": 0.2647,
"step": 709
},
{
"epoch": 2.2049689440993787,
"grad_norm": 0.20937990276765678,
"learning_rate": 1.472957422324511e-05,
"loss": 0.2578,
"step": 710
},
{
"epoch": 2.208074534161491,
"grad_norm": 0.2639807190802869,
"learning_rate": 1.467203682393556e-05,
"loss": 0.2607,
"step": 711
},
{
"epoch": 2.2111801242236027,
"grad_norm": 0.26293955788698453,
"learning_rate": 1.4614499424626008e-05,
"loss": 0.2638,
"step": 712
},
{
"epoch": 2.2142857142857144,
"grad_norm": 0.23828883015584687,
"learning_rate": 1.4556962025316457e-05,
"loss": 0.2577,
"step": 713
},
{
"epoch": 2.217391304347826,
"grad_norm": 0.24740324327511762,
"learning_rate": 1.4499424626006906e-05,
"loss": 0.2603,
"step": 714
},
{
"epoch": 2.220496894409938,
"grad_norm": 0.22582258369375163,
"learning_rate": 1.4441887226697354e-05,
"loss": 0.2442,
"step": 715
},
{
"epoch": 2.2236024844720497,
"grad_norm": 0.24839008006413138,
"learning_rate": 1.4384349827387803e-05,
"loss": 0.2591,
"step": 716
},
{
"epoch": 2.2267080745341614,
"grad_norm": 0.2507061092171656,
"learning_rate": 1.4326812428078252e-05,
"loss": 0.2525,
"step": 717
},
{
"epoch": 2.229813664596273,
"grad_norm": 0.214855054431312,
"learning_rate": 1.42692750287687e-05,
"loss": 0.2436,
"step": 718
},
{
"epoch": 2.232919254658385,
"grad_norm": 0.21592522701402342,
"learning_rate": 1.421173762945915e-05,
"loss": 0.2516,
"step": 719
},
{
"epoch": 2.2360248447204967,
"grad_norm": 0.20915695199545198,
"learning_rate": 1.4154200230149598e-05,
"loss": 0.2597,
"step": 720
},
{
"epoch": 2.239130434782609,
"grad_norm": 0.22903634190903957,
"learning_rate": 1.4096662830840047e-05,
"loss": 0.2667,
"step": 721
},
{
"epoch": 2.2422360248447206,
"grad_norm": 0.21439993038647093,
"learning_rate": 1.4039125431530495e-05,
"loss": 0.2436,
"step": 722
},
{
"epoch": 2.2453416149068324,
"grad_norm": 0.21937639860358657,
"learning_rate": 1.3981588032220944e-05,
"loss": 0.2458,
"step": 723
},
{
"epoch": 2.248447204968944,
"grad_norm": 0.2013130017609961,
"learning_rate": 1.3924050632911393e-05,
"loss": 0.2491,
"step": 724
},
{
"epoch": 2.251552795031056,
"grad_norm": 0.22887750081435682,
"learning_rate": 1.3866513233601842e-05,
"loss": 0.2441,
"step": 725
},
{
"epoch": 2.2546583850931676,
"grad_norm": 0.2288064963841507,
"learning_rate": 1.380897583429229e-05,
"loss": 0.2418,
"step": 726
},
{
"epoch": 2.2577639751552794,
"grad_norm": 0.23248918550222136,
"learning_rate": 1.3751438434982739e-05,
"loss": 0.2625,
"step": 727
},
{
"epoch": 2.260869565217391,
"grad_norm": 0.2090094171369587,
"learning_rate": 1.3693901035673188e-05,
"loss": 0.2507,
"step": 728
},
{
"epoch": 2.2639751552795033,
"grad_norm": 0.23154174958563464,
"learning_rate": 1.3636363636363637e-05,
"loss": 0.2586,
"step": 729
},
{
"epoch": 2.267080745341615,
"grad_norm": 0.24350218064576923,
"learning_rate": 1.3578826237054085e-05,
"loss": 0.263,
"step": 730
},
{
"epoch": 2.270186335403727,
"grad_norm": 0.2278585941156764,
"learning_rate": 1.3521288837744534e-05,
"loss": 0.2555,
"step": 731
},
{
"epoch": 2.2732919254658386,
"grad_norm": 0.20801276930170154,
"learning_rate": 1.3463751438434983e-05,
"loss": 0.2572,
"step": 732
},
{
"epoch": 2.2763975155279503,
"grad_norm": 0.22919123007559652,
"learning_rate": 1.3406214039125431e-05,
"loss": 0.2582,
"step": 733
},
{
"epoch": 2.279503105590062,
"grad_norm": 0.21268694380279451,
"learning_rate": 1.334867663981588e-05,
"loss": 0.2512,
"step": 734
},
{
"epoch": 2.282608695652174,
"grad_norm": 0.2182606134520971,
"learning_rate": 1.3291139240506329e-05,
"loss": 0.2536,
"step": 735
},
{
"epoch": 2.2857142857142856,
"grad_norm": 0.2177977754376004,
"learning_rate": 1.323360184119678e-05,
"loss": 0.2589,
"step": 736
},
{
"epoch": 2.2888198757763973,
"grad_norm": 0.2079260936390528,
"learning_rate": 1.3176064441887228e-05,
"loss": 0.2445,
"step": 737
},
{
"epoch": 2.291925465838509,
"grad_norm": 0.21654285079809454,
"learning_rate": 1.3118527042577677e-05,
"loss": 0.2492,
"step": 738
},
{
"epoch": 2.2950310559006213,
"grad_norm": 0.22224222175484207,
"learning_rate": 1.3060989643268126e-05,
"loss": 0.2555,
"step": 739
},
{
"epoch": 2.298136645962733,
"grad_norm": 0.2013544241929392,
"learning_rate": 1.3003452243958574e-05,
"loss": 0.2457,
"step": 740
},
{
"epoch": 2.301242236024845,
"grad_norm": 0.21733404218015004,
"learning_rate": 1.2945914844649023e-05,
"loss": 0.2659,
"step": 741
},
{
"epoch": 2.3043478260869565,
"grad_norm": 0.21179336140885693,
"learning_rate": 1.2888377445339472e-05,
"loss": 0.2426,
"step": 742
},
{
"epoch": 2.3074534161490683,
"grad_norm": 0.2285599698694653,
"learning_rate": 1.283084004602992e-05,
"loss": 0.2429,
"step": 743
},
{
"epoch": 2.31055900621118,
"grad_norm": 0.19835079918909265,
"learning_rate": 1.2773302646720369e-05,
"loss": 0.2489,
"step": 744
},
{
"epoch": 2.313664596273292,
"grad_norm": 0.2298623252387309,
"learning_rate": 1.2715765247410818e-05,
"loss": 0.2655,
"step": 745
},
{
"epoch": 2.3167701863354035,
"grad_norm": 0.23867880872639935,
"learning_rate": 1.2658227848101267e-05,
"loss": 0.2498,
"step": 746
},
{
"epoch": 2.3198757763975157,
"grad_norm": 0.21037856832784158,
"learning_rate": 1.2600690448791715e-05,
"loss": 0.2589,
"step": 747
},
{
"epoch": 2.3229813664596275,
"grad_norm": 0.24695028457966048,
"learning_rate": 1.2543153049482164e-05,
"loss": 0.2502,
"step": 748
},
{
"epoch": 2.3260869565217392,
"grad_norm": 0.23360363557581765,
"learning_rate": 1.2485615650172613e-05,
"loss": 0.259,
"step": 749
},
{
"epoch": 2.329192546583851,
"grad_norm": 0.22335503888847086,
"learning_rate": 1.2428078250863062e-05,
"loss": 0.2456,
"step": 750
},
{
"epoch": 2.3322981366459627,
"grad_norm": 0.21231134626201825,
"learning_rate": 1.237054085155351e-05,
"loss": 0.26,
"step": 751
},
{
"epoch": 2.3354037267080745,
"grad_norm": 0.20990198210516803,
"learning_rate": 1.2313003452243959e-05,
"loss": 0.2441,
"step": 752
},
{
"epoch": 2.3385093167701863,
"grad_norm": 0.221067131454967,
"learning_rate": 1.2255466052934408e-05,
"loss": 0.2469,
"step": 753
},
{
"epoch": 2.341614906832298,
"grad_norm": 0.22138406777470937,
"learning_rate": 1.2197928653624856e-05,
"loss": 0.261,
"step": 754
},
{
"epoch": 2.3447204968944098,
"grad_norm": 0.21398489008508845,
"learning_rate": 1.2140391254315305e-05,
"loss": 0.2566,
"step": 755
},
{
"epoch": 2.3478260869565215,
"grad_norm": 0.20448116831895594,
"learning_rate": 1.2082853855005754e-05,
"loss": 0.2598,
"step": 756
},
{
"epoch": 2.3509316770186337,
"grad_norm": 0.21255766006062407,
"learning_rate": 1.2025316455696203e-05,
"loss": 0.2526,
"step": 757
},
{
"epoch": 2.3540372670807455,
"grad_norm": 0.19087455271546003,
"learning_rate": 1.1967779056386651e-05,
"loss": 0.2537,
"step": 758
},
{
"epoch": 2.357142857142857,
"grad_norm": 0.20379774772998854,
"learning_rate": 1.19102416570771e-05,
"loss": 0.2668,
"step": 759
},
{
"epoch": 2.360248447204969,
"grad_norm": 0.19801295062012142,
"learning_rate": 1.1852704257767549e-05,
"loss": 0.2479,
"step": 760
},
{
"epoch": 2.3633540372670807,
"grad_norm": 0.2053725094185451,
"learning_rate": 1.1795166858457997e-05,
"loss": 0.2597,
"step": 761
},
{
"epoch": 2.3664596273291925,
"grad_norm": 0.19414430502845648,
"learning_rate": 1.1737629459148446e-05,
"loss": 0.2445,
"step": 762
},
{
"epoch": 2.369565217391304,
"grad_norm": 0.20779479767313294,
"learning_rate": 1.1680092059838895e-05,
"loss": 0.2649,
"step": 763
},
{
"epoch": 2.372670807453416,
"grad_norm": 0.20304929332054908,
"learning_rate": 1.1622554660529344e-05,
"loss": 0.2624,
"step": 764
},
{
"epoch": 2.375776397515528,
"grad_norm": 0.20512146624367367,
"learning_rate": 1.1565017261219792e-05,
"loss": 0.2532,
"step": 765
},
{
"epoch": 2.37888198757764,
"grad_norm": 0.1948376797912715,
"learning_rate": 1.1507479861910241e-05,
"loss": 0.2593,
"step": 766
},
{
"epoch": 2.3819875776397517,
"grad_norm": 0.20111608619484334,
"learning_rate": 1.144994246260069e-05,
"loss": 0.2431,
"step": 767
},
{
"epoch": 2.3850931677018634,
"grad_norm": 0.20424563225076126,
"learning_rate": 1.139240506329114e-05,
"loss": 0.239,
"step": 768
},
{
"epoch": 2.388198757763975,
"grad_norm": 0.20385122820209117,
"learning_rate": 1.1334867663981589e-05,
"loss": 0.2519,
"step": 769
},
{
"epoch": 2.391304347826087,
"grad_norm": 0.2169017997179514,
"learning_rate": 1.1277330264672038e-05,
"loss": 0.2599,
"step": 770
},
{
"epoch": 2.3944099378881987,
"grad_norm": 0.20583351351917192,
"learning_rate": 1.1219792865362486e-05,
"loss": 0.2515,
"step": 771
},
{
"epoch": 2.3975155279503104,
"grad_norm": 0.20864268761499544,
"learning_rate": 1.1162255466052935e-05,
"loss": 0.2674,
"step": 772
},
{
"epoch": 2.400621118012422,
"grad_norm": 0.18352483617724127,
"learning_rate": 1.1104718066743384e-05,
"loss": 0.2517,
"step": 773
},
{
"epoch": 2.403726708074534,
"grad_norm": 0.19458848397143083,
"learning_rate": 1.1047180667433833e-05,
"loss": 0.2348,
"step": 774
},
{
"epoch": 2.406832298136646,
"grad_norm": 0.22085258658145707,
"learning_rate": 1.0989643268124281e-05,
"loss": 0.2626,
"step": 775
},
{
"epoch": 2.409937888198758,
"grad_norm": 0.2244287112114885,
"learning_rate": 1.093210586881473e-05,
"loss": 0.2656,
"step": 776
},
{
"epoch": 2.4130434782608696,
"grad_norm": 0.2064604218573695,
"learning_rate": 1.0874568469505179e-05,
"loss": 0.2474,
"step": 777
},
{
"epoch": 2.4161490683229814,
"grad_norm": 0.2170623734624135,
"learning_rate": 1.0817031070195628e-05,
"loss": 0.2673,
"step": 778
},
{
"epoch": 2.419254658385093,
"grad_norm": 0.21813795262022834,
"learning_rate": 1.0759493670886076e-05,
"loss": 0.2566,
"step": 779
},
{
"epoch": 2.422360248447205,
"grad_norm": 0.20015983943955706,
"learning_rate": 1.0701956271576525e-05,
"loss": 0.2433,
"step": 780
},
{
"epoch": 2.4254658385093166,
"grad_norm": 0.2518786075901923,
"learning_rate": 1.0644418872266974e-05,
"loss": 0.2542,
"step": 781
},
{
"epoch": 2.4285714285714284,
"grad_norm": 0.2039696978745147,
"learning_rate": 1.0586881472957422e-05,
"loss": 0.2635,
"step": 782
},
{
"epoch": 2.4316770186335406,
"grad_norm": 0.20193387084839037,
"learning_rate": 1.0529344073647871e-05,
"loss": 0.2529,
"step": 783
},
{
"epoch": 2.4347826086956523,
"grad_norm": 0.22256582381404963,
"learning_rate": 1.047180667433832e-05,
"loss": 0.2502,
"step": 784
},
{
"epoch": 2.437888198757764,
"grad_norm": 0.20375665724837322,
"learning_rate": 1.0414269275028769e-05,
"loss": 0.241,
"step": 785
},
{
"epoch": 2.440993788819876,
"grad_norm": 0.2179110760676831,
"learning_rate": 1.0356731875719217e-05,
"loss": 0.2599,
"step": 786
},
{
"epoch": 2.4440993788819876,
"grad_norm": 0.22202976810767208,
"learning_rate": 1.0299194476409666e-05,
"loss": 0.2505,
"step": 787
},
{
"epoch": 2.4472049689440993,
"grad_norm": 0.42670457942092715,
"learning_rate": 1.0241657077100115e-05,
"loss": 0.241,
"step": 788
},
{
"epoch": 2.450310559006211,
"grad_norm": 0.20784564321109833,
"learning_rate": 1.0184119677790564e-05,
"loss": 0.2567,
"step": 789
},
{
"epoch": 2.453416149068323,
"grad_norm": 0.20121980240137796,
"learning_rate": 1.0126582278481012e-05,
"loss": 0.2451,
"step": 790
},
{
"epoch": 2.4565217391304346,
"grad_norm": 0.21747971229319626,
"learning_rate": 1.0069044879171461e-05,
"loss": 0.2387,
"step": 791
},
{
"epoch": 2.4596273291925463,
"grad_norm": 0.18957130652801002,
"learning_rate": 1.001150747986191e-05,
"loss": 0.2404,
"step": 792
},
{
"epoch": 2.4627329192546585,
"grad_norm": 0.19623974528931779,
"learning_rate": 9.953970080552358e-06,
"loss": 0.2505,
"step": 793
},
{
"epoch": 2.4658385093167703,
"grad_norm": 0.2090564420719582,
"learning_rate": 9.896432681242807e-06,
"loss": 0.259,
"step": 794
},
{
"epoch": 2.468944099378882,
"grad_norm": 0.20995347548362167,
"learning_rate": 9.838895281933256e-06,
"loss": 0.2557,
"step": 795
},
{
"epoch": 2.472049689440994,
"grad_norm": 0.21072680749655628,
"learning_rate": 9.781357882623705e-06,
"loss": 0.2507,
"step": 796
},
{
"epoch": 2.4751552795031055,
"grad_norm": 0.2028138320185975,
"learning_rate": 9.723820483314153e-06,
"loss": 0.2428,
"step": 797
},
{
"epoch": 2.4782608695652173,
"grad_norm": 0.203416816769087,
"learning_rate": 9.666283084004602e-06,
"loss": 0.2549,
"step": 798
},
{
"epoch": 2.481366459627329,
"grad_norm": 0.2114980169350222,
"learning_rate": 9.60874568469505e-06,
"loss": 0.2544,
"step": 799
},
{
"epoch": 2.4844720496894412,
"grad_norm": 0.1947781123063217,
"learning_rate": 9.551208285385501e-06,
"loss": 0.246,
"step": 800
},
{
"epoch": 2.487577639751553,
"grad_norm": 0.2313621289649826,
"learning_rate": 9.49367088607595e-06,
"loss": 0.2688,
"step": 801
},
{
"epoch": 2.4906832298136647,
"grad_norm": 0.2070540850596655,
"learning_rate": 9.436133486766399e-06,
"loss": 0.2594,
"step": 802
},
{
"epoch": 2.4937888198757765,
"grad_norm": 0.21169469541077635,
"learning_rate": 9.378596087456847e-06,
"loss": 0.2493,
"step": 803
},
{
"epoch": 2.4968944099378882,
"grad_norm": 0.19281802475760265,
"learning_rate": 9.321058688147296e-06,
"loss": 0.25,
"step": 804
},
{
"epoch": 2.5,
"grad_norm": 0.2175842957962285,
"learning_rate": 9.263521288837745e-06,
"loss": 0.2678,
"step": 805
},
{
"epoch": 2.5031055900621118,
"grad_norm": 0.1942027851518837,
"learning_rate": 9.205983889528194e-06,
"loss": 0.2505,
"step": 806
},
{
"epoch": 2.5062111801242235,
"grad_norm": 0.2119389750172559,
"learning_rate": 9.148446490218642e-06,
"loss": 0.2647,
"step": 807
},
{
"epoch": 2.5093167701863353,
"grad_norm": 0.20993843490643438,
"learning_rate": 9.090909090909091e-06,
"loss": 0.2381,
"step": 808
},
{
"epoch": 2.512422360248447,
"grad_norm": 0.20387329805308116,
"learning_rate": 9.03337169159954e-06,
"loss": 0.2404,
"step": 809
},
{
"epoch": 2.5155279503105588,
"grad_norm": 0.206875715468925,
"learning_rate": 8.975834292289988e-06,
"loss": 0.2625,
"step": 810
},
{
"epoch": 2.518633540372671,
"grad_norm": 0.20699195679204746,
"learning_rate": 8.918296892980437e-06,
"loss": 0.2496,
"step": 811
},
{
"epoch": 2.5217391304347827,
"grad_norm": 0.20082335227786552,
"learning_rate": 8.860759493670886e-06,
"loss": 0.2472,
"step": 812
},
{
"epoch": 2.5248447204968945,
"grad_norm": 0.20337421721934987,
"learning_rate": 8.803222094361335e-06,
"loss": 0.2465,
"step": 813
},
{
"epoch": 2.527950310559006,
"grad_norm": 0.19690561472031543,
"learning_rate": 8.745684695051783e-06,
"loss": 0.2562,
"step": 814
},
{
"epoch": 2.531055900621118,
"grad_norm": 0.20942292198434145,
"learning_rate": 8.688147295742232e-06,
"loss": 0.2448,
"step": 815
},
{
"epoch": 2.5341614906832297,
"grad_norm": 0.22511418926211027,
"learning_rate": 8.630609896432681e-06,
"loss": 0.2585,
"step": 816
},
{
"epoch": 2.5372670807453415,
"grad_norm": 0.21038192778136464,
"learning_rate": 8.57307249712313e-06,
"loss": 0.2463,
"step": 817
},
{
"epoch": 2.5403726708074537,
"grad_norm": 0.18890075777071388,
"learning_rate": 8.515535097813578e-06,
"loss": 0.2521,
"step": 818
},
{
"epoch": 2.5434782608695654,
"grad_norm": 0.21205002134781,
"learning_rate": 8.457997698504027e-06,
"loss": 0.2585,
"step": 819
},
{
"epoch": 2.546583850931677,
"grad_norm": 0.1941024098027217,
"learning_rate": 8.400460299194476e-06,
"loss": 0.2566,
"step": 820
},
{
"epoch": 2.549689440993789,
"grad_norm": 0.30349180360429645,
"learning_rate": 8.342922899884924e-06,
"loss": 0.2623,
"step": 821
},
{
"epoch": 2.5527950310559007,
"grad_norm": 0.22803507573466544,
"learning_rate": 8.285385500575373e-06,
"loss": 0.2558,
"step": 822
},
{
"epoch": 2.5559006211180124,
"grad_norm": 0.2020632346168216,
"learning_rate": 8.227848101265822e-06,
"loss": 0.2586,
"step": 823
},
{
"epoch": 2.559006211180124,
"grad_norm": 0.19503633689058,
"learning_rate": 8.17031070195627e-06,
"loss": 0.2628,
"step": 824
},
{
"epoch": 2.562111801242236,
"grad_norm": 0.19443407045409983,
"learning_rate": 8.11277330264672e-06,
"loss": 0.2492,
"step": 825
},
{
"epoch": 2.5652173913043477,
"grad_norm": 0.20150007916652513,
"learning_rate": 8.05523590333717e-06,
"loss": 0.256,
"step": 826
},
{
"epoch": 2.5683229813664594,
"grad_norm": 0.20193826865741932,
"learning_rate": 7.997698504027619e-06,
"loss": 0.2716,
"step": 827
},
{
"epoch": 2.571428571428571,
"grad_norm": 0.19621787984261999,
"learning_rate": 7.940161104718067e-06,
"loss": 0.2443,
"step": 828
},
{
"epoch": 2.5745341614906834,
"grad_norm": 0.19610522530135707,
"learning_rate": 7.882623705408516e-06,
"loss": 0.2562,
"step": 829
},
{
"epoch": 2.577639751552795,
"grad_norm": 0.1980537399225623,
"learning_rate": 7.825086306098965e-06,
"loss": 0.2475,
"step": 830
},
{
"epoch": 2.580745341614907,
"grad_norm": 0.19074805307763945,
"learning_rate": 7.767548906789413e-06,
"loss": 0.2557,
"step": 831
},
{
"epoch": 2.5838509316770186,
"grad_norm": 0.19613067044699573,
"learning_rate": 7.710011507479862e-06,
"loss": 0.2633,
"step": 832
},
{
"epoch": 2.5869565217391304,
"grad_norm": 0.19682931877320217,
"learning_rate": 7.652474108170311e-06,
"loss": 0.2378,
"step": 833
},
{
"epoch": 2.590062111801242,
"grad_norm": 0.20053417585734873,
"learning_rate": 7.5949367088607605e-06,
"loss": 0.2498,
"step": 834
},
{
"epoch": 2.593167701863354,
"grad_norm": 0.19178100866522357,
"learning_rate": 7.537399309551209e-06,
"loss": 0.2355,
"step": 835
},
{
"epoch": 2.596273291925466,
"grad_norm": 0.2084827189783707,
"learning_rate": 7.479861910241658e-06,
"loss": 0.2764,
"step": 836
},
{
"epoch": 2.599378881987578,
"grad_norm": 0.19540838307901068,
"learning_rate": 7.422324510932107e-06,
"loss": 0.2437,
"step": 837
},
{
"epoch": 2.6024844720496896,
"grad_norm": 0.19587457349490991,
"learning_rate": 7.364787111622555e-06,
"loss": 0.2489,
"step": 838
},
{
"epoch": 2.6055900621118013,
"grad_norm": 0.25075690817051544,
"learning_rate": 7.307249712313004e-06,
"loss": 0.2648,
"step": 839
},
{
"epoch": 2.608695652173913,
"grad_norm": 0.20743291534086578,
"learning_rate": 7.249712313003453e-06,
"loss": 0.2646,
"step": 840
},
{
"epoch": 2.611801242236025,
"grad_norm": 0.21071395029449075,
"learning_rate": 7.1921749136939016e-06,
"loss": 0.2427,
"step": 841
},
{
"epoch": 2.6149068322981366,
"grad_norm": 0.20235523726201224,
"learning_rate": 7.13463751438435e-06,
"loss": 0.2587,
"step": 842
},
{
"epoch": 2.6180124223602483,
"grad_norm": 0.20149232436113795,
"learning_rate": 7.077100115074799e-06,
"loss": 0.2516,
"step": 843
},
{
"epoch": 2.62111801242236,
"grad_norm": 0.21144648873433503,
"learning_rate": 7.019562715765248e-06,
"loss": 0.2582,
"step": 844
},
{
"epoch": 2.624223602484472,
"grad_norm": 0.2162300937976304,
"learning_rate": 6.9620253164556965e-06,
"loss": 0.2556,
"step": 845
},
{
"epoch": 2.6273291925465836,
"grad_norm": 0.21106771620646603,
"learning_rate": 6.904487917146145e-06,
"loss": 0.2558,
"step": 846
},
{
"epoch": 2.630434782608696,
"grad_norm": 0.23609832773446915,
"learning_rate": 6.846950517836594e-06,
"loss": 0.2572,
"step": 847
},
{
"epoch": 2.6335403726708075,
"grad_norm": 0.21122404379666423,
"learning_rate": 6.789413118527043e-06,
"loss": 0.2434,
"step": 848
},
{
"epoch": 2.6366459627329193,
"grad_norm": 0.2015181805089703,
"learning_rate": 6.731875719217491e-06,
"loss": 0.2418,
"step": 849
},
{
"epoch": 2.639751552795031,
"grad_norm": 0.20647243106844593,
"learning_rate": 6.67433831990794e-06,
"loss": 0.265,
"step": 850
},
{
"epoch": 2.642857142857143,
"grad_norm": 0.2083640341120549,
"learning_rate": 6.61680092059839e-06,
"loss": 0.2531,
"step": 851
},
{
"epoch": 2.6459627329192545,
"grad_norm": 0.20501908976688168,
"learning_rate": 6.559263521288838e-06,
"loss": 0.2514,
"step": 852
},
{
"epoch": 2.6490683229813663,
"grad_norm": 0.19844284276810914,
"learning_rate": 6.501726121979287e-06,
"loss": 0.2644,
"step": 853
},
{
"epoch": 2.6521739130434785,
"grad_norm": 0.21201237882135082,
"learning_rate": 6.444188722669736e-06,
"loss": 0.2568,
"step": 854
},
{
"epoch": 2.6552795031055902,
"grad_norm": 0.22195301360518224,
"learning_rate": 6.3866513233601846e-06,
"loss": 0.261,
"step": 855
},
{
"epoch": 2.658385093167702,
"grad_norm": 0.19287865061356418,
"learning_rate": 6.329113924050633e-06,
"loss": 0.2469,
"step": 856
},
{
"epoch": 2.6614906832298137,
"grad_norm": 0.19640829139853255,
"learning_rate": 6.271576524741082e-06,
"loss": 0.2462,
"step": 857
},
{
"epoch": 2.6645962732919255,
"grad_norm": 0.20101972350059313,
"learning_rate": 6.214039125431531e-06,
"loss": 0.255,
"step": 858
},
{
"epoch": 2.6677018633540373,
"grad_norm": 0.2841326489307957,
"learning_rate": 6.1565017261219795e-06,
"loss": 0.2457,
"step": 859
},
{
"epoch": 2.670807453416149,
"grad_norm": 0.18827454901664883,
"learning_rate": 6.098964326812428e-06,
"loss": 0.2427,
"step": 860
},
{
"epoch": 2.6739130434782608,
"grad_norm": 0.20109847479853832,
"learning_rate": 6.041426927502877e-06,
"loss": 0.2402,
"step": 861
},
{
"epoch": 2.6770186335403725,
"grad_norm": 0.1910402172602598,
"learning_rate": 5.983889528193326e-06,
"loss": 0.2627,
"step": 862
},
{
"epoch": 2.6801242236024843,
"grad_norm": 0.1974312904693097,
"learning_rate": 5.926352128883774e-06,
"loss": 0.2625,
"step": 863
},
{
"epoch": 2.683229813664596,
"grad_norm": 0.19911868656713894,
"learning_rate": 5.868814729574223e-06,
"loss": 0.2368,
"step": 864
},
{
"epoch": 2.686335403726708,
"grad_norm": 0.21362726329843149,
"learning_rate": 5.811277330264672e-06,
"loss": 0.2534,
"step": 865
},
{
"epoch": 2.68944099378882,
"grad_norm": 0.20941798902436187,
"learning_rate": 5.7537399309551206e-06,
"loss": 0.2454,
"step": 866
},
{
"epoch": 2.6925465838509317,
"grad_norm": 0.19014291486371018,
"learning_rate": 5.69620253164557e-06,
"loss": 0.2446,
"step": 867
},
{
"epoch": 2.6956521739130435,
"grad_norm": 0.19597012112115988,
"learning_rate": 5.638665132336019e-06,
"loss": 0.2537,
"step": 868
},
{
"epoch": 2.698757763975155,
"grad_norm": 0.19714293851097728,
"learning_rate": 5.581127733026468e-06,
"loss": 0.2468,
"step": 869
},
{
"epoch": 2.701863354037267,
"grad_norm": 0.19621178971442163,
"learning_rate": 5.523590333716916e-06,
"loss": 0.2507,
"step": 870
},
{
"epoch": 2.704968944099379,
"grad_norm": 0.19491684844874946,
"learning_rate": 5.466052934407365e-06,
"loss": 0.2413,
"step": 871
},
{
"epoch": 2.708074534161491,
"grad_norm": 0.27521581958829827,
"learning_rate": 5.408515535097814e-06,
"loss": 0.2633,
"step": 872
},
{
"epoch": 2.7111801242236027,
"grad_norm": 0.2168313001961523,
"learning_rate": 5.3509781357882625e-06,
"loss": 0.264,
"step": 873
},
{
"epoch": 2.7142857142857144,
"grad_norm": 0.18797210234683806,
"learning_rate": 5.293440736478711e-06,
"loss": 0.2447,
"step": 874
},
{
"epoch": 2.717391304347826,
"grad_norm": 0.21084636753160527,
"learning_rate": 5.23590333716916e-06,
"loss": 0.2619,
"step": 875
},
{
"epoch": 2.720496894409938,
"grad_norm": 0.20635684776280216,
"learning_rate": 5.178365937859609e-06,
"loss": 0.2583,
"step": 876
},
{
"epoch": 2.7236024844720497,
"grad_norm": 0.19468296653400607,
"learning_rate": 5.120828538550057e-06,
"loss": 0.2492,
"step": 877
},
{
"epoch": 2.7267080745341614,
"grad_norm": 0.20398048699150237,
"learning_rate": 5.063291139240506e-06,
"loss": 0.2549,
"step": 878
},
{
"epoch": 2.729813664596273,
"grad_norm": 0.18689815600092072,
"learning_rate": 5.005753739930955e-06,
"loss": 0.2488,
"step": 879
},
{
"epoch": 2.732919254658385,
"grad_norm": 0.20597588086540602,
"learning_rate": 4.948216340621404e-06,
"loss": 0.2667,
"step": 880
},
{
"epoch": 2.7360248447204967,
"grad_norm": 0.196856493986424,
"learning_rate": 4.890678941311852e-06,
"loss": 0.2513,
"step": 881
},
{
"epoch": 2.7391304347826084,
"grad_norm": 0.19482252545749987,
"learning_rate": 4.833141542002301e-06,
"loss": 0.2498,
"step": 882
},
{
"epoch": 2.7422360248447206,
"grad_norm": 0.19795769225255558,
"learning_rate": 4.775604142692751e-06,
"loss": 0.2499,
"step": 883
},
{
"epoch": 2.7453416149068324,
"grad_norm": 0.21689477537897567,
"learning_rate": 4.718066743383199e-06,
"loss": 0.284,
"step": 884
},
{
"epoch": 2.748447204968944,
"grad_norm": 0.2418368942479182,
"learning_rate": 4.660529344073648e-06,
"loss": 0.2766,
"step": 885
},
{
"epoch": 2.751552795031056,
"grad_norm": 0.20477977797718222,
"learning_rate": 4.602991944764097e-06,
"loss": 0.2401,
"step": 886
},
{
"epoch": 2.7546583850931676,
"grad_norm": 0.2006217058218365,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.2471,
"step": 887
},
{
"epoch": 2.7577639751552794,
"grad_norm": 0.20256868584609686,
"learning_rate": 4.487917146144994e-06,
"loss": 0.2441,
"step": 888
},
{
"epoch": 2.7608695652173916,
"grad_norm": 0.21086545356411496,
"learning_rate": 4.430379746835443e-06,
"loss": 0.255,
"step": 889
},
{
"epoch": 2.7639751552795033,
"grad_norm": 0.19012940644030216,
"learning_rate": 4.372842347525892e-06,
"loss": 0.2524,
"step": 890
},
{
"epoch": 2.767080745341615,
"grad_norm": 0.20733606950697256,
"learning_rate": 4.3153049482163404e-06,
"loss": 0.2502,
"step": 891
},
{
"epoch": 2.770186335403727,
"grad_norm": 0.19869202500390978,
"learning_rate": 4.257767548906789e-06,
"loss": 0.2488,
"step": 892
},
{
"epoch": 2.7732919254658386,
"grad_norm": 0.21061575298666055,
"learning_rate": 4.200230149597238e-06,
"loss": 0.2535,
"step": 893
},
{
"epoch": 2.7763975155279503,
"grad_norm": 0.1902154270690934,
"learning_rate": 4.142692750287687e-06,
"loss": 0.2523,
"step": 894
},
{
"epoch": 2.779503105590062,
"grad_norm": 0.19515929958573747,
"learning_rate": 4.085155350978135e-06,
"loss": 0.2544,
"step": 895
},
{
"epoch": 2.782608695652174,
"grad_norm": 0.19252012143661815,
"learning_rate": 4.027617951668585e-06,
"loss": 0.2638,
"step": 896
},
{
"epoch": 2.7857142857142856,
"grad_norm": 0.1923327877416844,
"learning_rate": 3.970080552359034e-06,
"loss": 0.2462,
"step": 897
},
{
"epoch": 2.7888198757763973,
"grad_norm": 0.18586501981252762,
"learning_rate": 3.912543153049482e-06,
"loss": 0.2403,
"step": 898
},
{
"epoch": 2.791925465838509,
"grad_norm": 0.1986091973327919,
"learning_rate": 3.855005753739931e-06,
"loss": 0.2655,
"step": 899
},
{
"epoch": 2.795031055900621,
"grad_norm": 0.18536496942596287,
"learning_rate": 3.7974683544303802e-06,
"loss": 0.2389,
"step": 900
},
{
"epoch": 2.798136645962733,
"grad_norm": 0.19607218549803698,
"learning_rate": 3.739930955120829e-06,
"loss": 0.2542,
"step": 901
},
{
"epoch": 2.801242236024845,
"grad_norm": 0.19944282694872204,
"learning_rate": 3.6823935558112777e-06,
"loss": 0.2434,
"step": 902
},
{
"epoch": 2.8043478260869565,
"grad_norm": 0.1972448743409019,
"learning_rate": 3.6248561565017264e-06,
"loss": 0.247,
"step": 903
},
{
"epoch": 2.8074534161490683,
"grad_norm": 0.19361398823404677,
"learning_rate": 3.567318757192175e-06,
"loss": 0.2597,
"step": 904
},
{
"epoch": 2.81055900621118,
"grad_norm": 0.19293534537923737,
"learning_rate": 3.509781357882624e-06,
"loss": 0.2679,
"step": 905
},
{
"epoch": 2.813664596273292,
"grad_norm": 0.1978927145961964,
"learning_rate": 3.4522439585730726e-06,
"loss": 0.2474,
"step": 906
},
{
"epoch": 2.816770186335404,
"grad_norm": 0.18672700788585406,
"learning_rate": 3.3947065592635213e-06,
"loss": 0.2468,
"step": 907
},
{
"epoch": 2.8198757763975157,
"grad_norm": 0.1856966835076563,
"learning_rate": 3.33716915995397e-06,
"loss": 0.2424,
"step": 908
},
{
"epoch": 2.8229813664596275,
"grad_norm": 0.19224475733121915,
"learning_rate": 3.279631760644419e-06,
"loss": 0.2477,
"step": 909
},
{
"epoch": 2.8260869565217392,
"grad_norm": 0.20470161040078505,
"learning_rate": 3.222094361334868e-06,
"loss": 0.2706,
"step": 910
},
{
"epoch": 2.829192546583851,
"grad_norm": 0.19429220598035837,
"learning_rate": 3.1645569620253167e-06,
"loss": 0.2477,
"step": 911
},
{
"epoch": 2.8322981366459627,
"grad_norm": 0.1894109295691752,
"learning_rate": 3.1070195627157654e-06,
"loss": 0.2528,
"step": 912
},
{
"epoch": 2.8354037267080745,
"grad_norm": 0.18097305550473375,
"learning_rate": 3.049482163406214e-06,
"loss": 0.2559,
"step": 913
},
{
"epoch": 2.8385093167701863,
"grad_norm": 0.19783393465985816,
"learning_rate": 2.991944764096663e-06,
"loss": 0.2594,
"step": 914
},
{
"epoch": 2.841614906832298,
"grad_norm": 0.20897012225810746,
"learning_rate": 2.9344073647871116e-06,
"loss": 0.2653,
"step": 915
},
{
"epoch": 2.8447204968944098,
"grad_norm": 0.1896928698309342,
"learning_rate": 2.8768699654775603e-06,
"loss": 0.2477,
"step": 916
},
{
"epoch": 2.8478260869565215,
"grad_norm": 0.1784496457827597,
"learning_rate": 2.8193325661680094e-06,
"loss": 0.244,
"step": 917
},
{
"epoch": 2.8509316770186337,
"grad_norm": 0.18587478842335634,
"learning_rate": 2.761795166858458e-06,
"loss": 0.25,
"step": 918
},
{
"epoch": 2.8540372670807455,
"grad_norm": 0.20084898371613977,
"learning_rate": 2.704257767548907e-06,
"loss": 0.2577,
"step": 919
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.17885954860642703,
"learning_rate": 2.6467203682393556e-06,
"loss": 0.2407,
"step": 920
},
{
"epoch": 2.860248447204969,
"grad_norm": 0.18561208551570504,
"learning_rate": 2.5891829689298043e-06,
"loss": 0.2659,
"step": 921
},
{
"epoch": 2.8633540372670807,
"grad_norm": 0.21932109217206247,
"learning_rate": 2.531645569620253e-06,
"loss": 0.2403,
"step": 922
},
{
"epoch": 2.8664596273291925,
"grad_norm": 0.20030388225206663,
"learning_rate": 2.474108170310702e-06,
"loss": 0.2544,
"step": 923
},
{
"epoch": 2.869565217391304,
"grad_norm": 0.20878429975095714,
"learning_rate": 2.4165707710011505e-06,
"loss": 0.2497,
"step": 924
},
{
"epoch": 2.8726708074534164,
"grad_norm": 0.22186219053087963,
"learning_rate": 2.3590333716915997e-06,
"loss": 0.2672,
"step": 925
},
{
"epoch": 2.875776397515528,
"grad_norm": 0.18672043459559956,
"learning_rate": 2.3014959723820484e-06,
"loss": 0.2485,
"step": 926
},
{
"epoch": 2.87888198757764,
"grad_norm": 0.18051985217560826,
"learning_rate": 2.243958573072497e-06,
"loss": 0.2479,
"step": 927
},
{
"epoch": 2.8819875776397517,
"grad_norm": 0.20846631011511568,
"learning_rate": 2.186421173762946e-06,
"loss": 0.2632,
"step": 928
},
{
"epoch": 2.8850931677018634,
"grad_norm": 0.17696337319445454,
"learning_rate": 2.1288837744533946e-06,
"loss": 0.2288,
"step": 929
},
{
"epoch": 2.888198757763975,
"grad_norm": 0.18293271617504872,
"learning_rate": 2.0713463751438433e-06,
"loss": 0.2495,
"step": 930
},
{
"epoch": 2.891304347826087,
"grad_norm": 0.17853543153947618,
"learning_rate": 2.0138089758342925e-06,
"loss": 0.2468,
"step": 931
},
{
"epoch": 2.8944099378881987,
"grad_norm": 0.18420177129422804,
"learning_rate": 1.956271576524741e-06,
"loss": 0.2538,
"step": 932
},
{
"epoch": 2.8975155279503104,
"grad_norm": 0.18170230261287915,
"learning_rate": 1.8987341772151901e-06,
"loss": 0.2589,
"step": 933
},
{
"epoch": 2.900621118012422,
"grad_norm": 0.18685594957937918,
"learning_rate": 1.8411967779056388e-06,
"loss": 0.2442,
"step": 934
},
{
"epoch": 2.903726708074534,
"grad_norm": 0.18690296703530773,
"learning_rate": 1.7836593785960876e-06,
"loss": 0.2451,
"step": 935
},
{
"epoch": 2.906832298136646,
"grad_norm": 0.20799939121665842,
"learning_rate": 1.7261219792865363e-06,
"loss": 0.2649,
"step": 936
},
{
"epoch": 2.909937888198758,
"grad_norm": 0.18563245527227562,
"learning_rate": 1.668584579976985e-06,
"loss": 0.2473,
"step": 937
},
{
"epoch": 2.9130434782608696,
"grad_norm": 0.18286501772853814,
"learning_rate": 1.611047180667434e-06,
"loss": 0.2486,
"step": 938
},
{
"epoch": 2.9161490683229814,
"grad_norm": 0.17534488578059473,
"learning_rate": 1.5535097813578827e-06,
"loss": 0.259,
"step": 939
},
{
"epoch": 2.919254658385093,
"grad_norm": 0.19817242903037158,
"learning_rate": 1.4959723820483314e-06,
"loss": 0.2428,
"step": 940
},
{
"epoch": 2.922360248447205,
"grad_norm": 0.18335244034678858,
"learning_rate": 1.4384349827387801e-06,
"loss": 0.252,
"step": 941
},
{
"epoch": 2.9254658385093166,
"grad_norm": 0.17672927011117798,
"learning_rate": 1.380897583429229e-06,
"loss": 0.239,
"step": 942
},
{
"epoch": 2.928571428571429,
"grad_norm": 0.19756419475586987,
"learning_rate": 1.3233601841196778e-06,
"loss": 0.2481,
"step": 943
},
{
"epoch": 2.9316770186335406,
"grad_norm": 0.18227787024732953,
"learning_rate": 1.2658227848101265e-06,
"loss": 0.2503,
"step": 944
},
{
"epoch": 2.9347826086956523,
"grad_norm": 0.17546530423346965,
"learning_rate": 1.2082853855005753e-06,
"loss": 0.2435,
"step": 945
},
{
"epoch": 2.937888198757764,
"grad_norm": 0.17977719939100784,
"learning_rate": 1.1507479861910242e-06,
"loss": 0.2462,
"step": 946
},
{
"epoch": 2.940993788819876,
"grad_norm": 0.182411120857819,
"learning_rate": 1.093210586881473e-06,
"loss": 0.2508,
"step": 947
},
{
"epoch": 2.9440993788819876,
"grad_norm": 0.19191429782536917,
"learning_rate": 1.0356731875719217e-06,
"loss": 0.2419,
"step": 948
},
{
"epoch": 2.9472049689440993,
"grad_norm": 0.17465750941257832,
"learning_rate": 9.781357882623706e-07,
"loss": 0.2394,
"step": 949
},
{
"epoch": 2.950310559006211,
"grad_norm": 0.17956692661649218,
"learning_rate": 9.205983889528194e-07,
"loss": 0.2478,
"step": 950
},
{
"epoch": 2.953416149068323,
"grad_norm": 0.18930304144220808,
"learning_rate": 8.630609896432681e-07,
"loss": 0.2371,
"step": 951
},
{
"epoch": 2.9565217391304346,
"grad_norm": 0.19245986180447752,
"learning_rate": 8.05523590333717e-07,
"loss": 0.2324,
"step": 952
},
{
"epoch": 2.9596273291925463,
"grad_norm": 0.18874027199979995,
"learning_rate": 7.479861910241657e-07,
"loss": 0.2482,
"step": 953
},
{
"epoch": 2.9627329192546585,
"grad_norm": 0.18606185656754726,
"learning_rate": 6.904487917146145e-07,
"loss": 0.2602,
"step": 954
},
{
"epoch": 2.9658385093167703,
"grad_norm": 0.18622516429740096,
"learning_rate": 6.329113924050633e-07,
"loss": 0.2514,
"step": 955
},
{
"epoch": 2.968944099378882,
"grad_norm": 0.1910726758431884,
"learning_rate": 5.753739930955121e-07,
"loss": 0.2526,
"step": 956
},
{
"epoch": 2.972049689440994,
"grad_norm": 0.18715451349236073,
"learning_rate": 5.178365937859608e-07,
"loss": 0.2512,
"step": 957
},
{
"epoch": 2.9751552795031055,
"grad_norm": 0.18279876224123887,
"learning_rate": 4.602991944764097e-07,
"loss": 0.2624,
"step": 958
},
{
"epoch": 2.9782608695652173,
"grad_norm": 0.18112311672532813,
"learning_rate": 4.027617951668585e-07,
"loss": 0.261,
"step": 959
},
{
"epoch": 2.981366459627329,
"grad_norm": 0.17566965768374485,
"learning_rate": 3.4522439585730727e-07,
"loss": 0.2437,
"step": 960
},
{
"epoch": 2.9844720496894412,
"grad_norm": 0.19721806486553123,
"learning_rate": 2.8768699654775605e-07,
"loss": 0.2594,
"step": 961
},
{
"epoch": 2.987577639751553,
"grad_norm": 0.19840508059977566,
"learning_rate": 2.3014959723820486e-07,
"loss": 0.2415,
"step": 962
},
{
"epoch": 2.9906832298136647,
"grad_norm": 0.20273843178894588,
"learning_rate": 1.7261219792865363e-07,
"loss": 0.2523,
"step": 963
},
{
"epoch": 2.9937888198757765,
"grad_norm": 0.19347512173901257,
"learning_rate": 1.1507479861910243e-07,
"loss": 0.256,
"step": 964
},
{
"epoch": 2.9968944099378882,
"grad_norm": 0.17382060962656506,
"learning_rate": 5.7537399309551214e-08,
"loss": 0.2351,
"step": 965
},
{
"epoch": 3.0,
"grad_norm": 0.17740605884855634,
"learning_rate": 0.0,
"loss": 0.2395,
"step": 966
},
{
"epoch": 3.0,
"step": 966,
"total_flos": 8.211023406049526e+17,
"train_loss": 0.4564525331862225,
"train_runtime": 83597.7389,
"train_samples_per_second": 0.185,
"train_steps_per_second": 0.012
}
],
"logging_steps": 1,
"max_steps": 966,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.211023406049526e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}