{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 966, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003105590062111801, "grad_norm": 60.46764647043296, "learning_rate": 5.154639175257732e-07, "loss": 11.059, "step": 1 }, { "epoch": 0.006211180124223602, "grad_norm": 60.76414526284177, "learning_rate": 1.0309278350515464e-06, "loss": 11.012, "step": 2 }, { "epoch": 0.009316770186335404, "grad_norm": 59.20725802433676, "learning_rate": 1.5463917525773197e-06, "loss": 11.1319, "step": 3 }, { "epoch": 0.012422360248447204, "grad_norm": 60.61459028364047, "learning_rate": 2.061855670103093e-06, "loss": 11.03, "step": 4 }, { "epoch": 0.015527950310559006, "grad_norm": 64.67058884756766, "learning_rate": 2.577319587628866e-06, "loss": 10.8306, "step": 5 }, { "epoch": 0.018633540372670808, "grad_norm": 70.79205232466896, "learning_rate": 3.0927835051546395e-06, "loss": 10.6598, "step": 6 }, { "epoch": 0.021739130434782608, "grad_norm": 98.53755227750479, "learning_rate": 3.608247422680412e-06, "loss": 9.4929, "step": 7 }, { "epoch": 0.024844720496894408, "grad_norm": 111.40861108613349, "learning_rate": 4.123711340206186e-06, "loss": 9.0865, "step": 8 }, { "epoch": 0.027950310559006212, "grad_norm": 127.60092525337744, "learning_rate": 4.639175257731959e-06, "loss": 8.316, "step": 9 }, { "epoch": 0.031055900621118012, "grad_norm": 57.38601512479266, "learning_rate": 5.154639175257732e-06, "loss": 3.5203, "step": 10 }, { "epoch": 0.034161490683229816, "grad_norm": 40.087017160526905, "learning_rate": 5.670103092783505e-06, "loss": 2.5941, "step": 11 }, { "epoch": 0.037267080745341616, "grad_norm": 36.371937459465364, "learning_rate": 6.185567010309279e-06, "loss": 2.3632, "step": 12 }, { "epoch": 0.040372670807453416, "grad_norm": 5.902085049785153, "learning_rate": 6.701030927835052e-06, "loss": 1.3097, "step": 13 }, { "epoch": 0.043478260869565216, "grad_norm": 4.395696518971964, "learning_rate": 7.216494845360824e-06, "loss": 1.2339, "step": 14 }, { "epoch": 0.046583850931677016, "grad_norm": 3.1604888794138524, "learning_rate": 7.731958762886599e-06, "loss": 1.1463, "step": 15 }, { "epoch": 0.049689440993788817, "grad_norm": 2.399444834184007, "learning_rate": 8.247422680412371e-06, "loss": 1.0723, "step": 16 }, { "epoch": 0.052795031055900624, "grad_norm": 1.5487302596438641, "learning_rate": 8.762886597938144e-06, "loss": 0.9467, "step": 17 }, { "epoch": 0.055900621118012424, "grad_norm": 80.99605354011946, "learning_rate": 9.278350515463918e-06, "loss": 0.9669, "step": 18 }, { "epoch": 0.059006211180124224, "grad_norm": 35.062984197987575, "learning_rate": 9.793814432989691e-06, "loss": 0.8903, "step": 19 }, { "epoch": 0.062111801242236024, "grad_norm": 1.7039403556284178, "learning_rate": 1.0309278350515464e-05, "loss": 0.8611, "step": 20 }, { "epoch": 0.06521739130434782, "grad_norm": 1.1286556630596418, "learning_rate": 1.0824742268041238e-05, "loss": 0.7956, "step": 21 }, { "epoch": 0.06832298136645963, "grad_norm": 0.8885421752095347, "learning_rate": 1.134020618556701e-05, "loss": 0.7944, "step": 22 }, { "epoch": 0.07142857142857142, "grad_norm": 0.7771123311111944, "learning_rate": 1.1855670103092783e-05, "loss": 0.7888, "step": 23 }, { "epoch": 0.07453416149068323, "grad_norm": 0.8290301807562498, "learning_rate": 1.2371134020618558e-05, "loss": 0.7524, "step": 24 }, { "epoch": 0.07763975155279502, "grad_norm": 0.9178800987434453, "learning_rate": 1.2886597938144329e-05, "loss": 0.7276, "step": 25 }, { "epoch": 0.08074534161490683, "grad_norm": 0.7280169831391284, "learning_rate": 1.3402061855670103e-05, "loss": 0.7049, "step": 26 }, { "epoch": 0.08385093167701864, "grad_norm": 0.590489381671068, "learning_rate": 1.3917525773195878e-05, "loss": 0.6846, "step": 27 }, { "epoch": 0.08695652173913043, "grad_norm": 0.6956647472162396, "learning_rate": 1.4432989690721649e-05, "loss": 0.6518, "step": 28 }, { "epoch": 0.09006211180124224, "grad_norm": 0.7233291715436561, "learning_rate": 1.4948453608247423e-05, "loss": 0.6575, "step": 29 }, { "epoch": 0.09316770186335403, "grad_norm": 0.6557463930133224, "learning_rate": 1.5463917525773197e-05, "loss": 0.6648, "step": 30 }, { "epoch": 0.09627329192546584, "grad_norm": 0.5940038812473861, "learning_rate": 1.5979381443298968e-05, "loss": 0.6414, "step": 31 }, { "epoch": 0.09937888198757763, "grad_norm": 0.514015705745489, "learning_rate": 1.6494845360824743e-05, "loss": 0.6178, "step": 32 }, { "epoch": 0.10248447204968944, "grad_norm": 0.5977361497140969, "learning_rate": 1.7010309278350517e-05, "loss": 0.6216, "step": 33 }, { "epoch": 0.10559006211180125, "grad_norm": 0.5377773183845758, "learning_rate": 1.7525773195876288e-05, "loss": 0.6195, "step": 34 }, { "epoch": 0.10869565217391304, "grad_norm": 0.40152764208172104, "learning_rate": 1.8041237113402062e-05, "loss": 0.5758, "step": 35 }, { "epoch": 0.11180124223602485, "grad_norm": 0.40244189444549017, "learning_rate": 1.8556701030927837e-05, "loss": 0.6178, "step": 36 }, { "epoch": 0.11490683229813664, "grad_norm": 0.49886656483811526, "learning_rate": 1.9072164948453608e-05, "loss": 0.6062, "step": 37 }, { "epoch": 0.11801242236024845, "grad_norm": 0.43178714425173426, "learning_rate": 1.9587628865979382e-05, "loss": 0.5929, "step": 38 }, { "epoch": 0.12111801242236025, "grad_norm": 0.37953785852942284, "learning_rate": 2.0103092783505157e-05, "loss": 0.57, "step": 39 }, { "epoch": 0.12422360248447205, "grad_norm": 0.3712229743609745, "learning_rate": 2.0618556701030927e-05, "loss": 0.5812, "step": 40 }, { "epoch": 0.12732919254658384, "grad_norm": 0.38350882873215847, "learning_rate": 2.1134020618556702e-05, "loss": 0.5714, "step": 41 }, { "epoch": 0.13043478260869565, "grad_norm": 0.4036659557430701, "learning_rate": 2.1649484536082476e-05, "loss": 0.5813, "step": 42 }, { "epoch": 0.13354037267080746, "grad_norm": 0.33097703493186653, "learning_rate": 2.2164948453608247e-05, "loss": 0.5537, "step": 43 }, { "epoch": 0.13664596273291926, "grad_norm": 0.339069211939581, "learning_rate": 2.268041237113402e-05, "loss": 0.57, "step": 44 }, { "epoch": 0.13975155279503104, "grad_norm": 0.34080115423530455, "learning_rate": 2.3195876288659796e-05, "loss": 0.5434, "step": 45 }, { "epoch": 0.14285714285714285, "grad_norm": 0.3309319915944845, "learning_rate": 2.3711340206185567e-05, "loss": 0.5436, "step": 46 }, { "epoch": 0.14596273291925466, "grad_norm": 0.36258498646852527, "learning_rate": 2.422680412371134e-05, "loss": 0.5372, "step": 47 }, { "epoch": 0.14906832298136646, "grad_norm": 0.3289309195150263, "learning_rate": 2.4742268041237116e-05, "loss": 0.5519, "step": 48 }, { "epoch": 0.15217391304347827, "grad_norm": 0.29200888913110107, "learning_rate": 2.5257731958762887e-05, "loss": 0.5269, "step": 49 }, { "epoch": 0.15527950310559005, "grad_norm": 0.2913726775318078, "learning_rate": 2.5773195876288658e-05, "loss": 0.5398, "step": 50 }, { "epoch": 0.15838509316770186, "grad_norm": 0.36183923103400334, "learning_rate": 2.6288659793814435e-05, "loss": 0.5313, "step": 51 }, { "epoch": 0.16149068322981366, "grad_norm": 0.289832432081365, "learning_rate": 2.6804123711340206e-05, "loss": 0.5294, "step": 52 }, { "epoch": 0.16459627329192547, "grad_norm": 0.28159321988499836, "learning_rate": 2.7319587628865977e-05, "loss": 0.5102, "step": 53 }, { "epoch": 0.16770186335403728, "grad_norm": 0.33289230730425107, "learning_rate": 2.7835051546391755e-05, "loss": 0.5325, "step": 54 }, { "epoch": 0.17080745341614906, "grad_norm": 0.2711500030362234, "learning_rate": 2.8350515463917526e-05, "loss": 0.5203, "step": 55 }, { "epoch": 0.17391304347826086, "grad_norm": 0.2675188946251961, "learning_rate": 2.8865979381443297e-05, "loss": 0.5224, "step": 56 }, { "epoch": 0.17701863354037267, "grad_norm": 0.26579895922328955, "learning_rate": 2.9381443298969075e-05, "loss": 0.5294, "step": 57 }, { "epoch": 0.18012422360248448, "grad_norm": 0.2489797381353846, "learning_rate": 2.9896907216494846e-05, "loss": 0.5111, "step": 58 }, { "epoch": 0.18322981366459629, "grad_norm": 0.26304984714934, "learning_rate": 3.0412371134020617e-05, "loss": 0.5063, "step": 59 }, { "epoch": 0.18633540372670807, "grad_norm": 0.29536218486713367, "learning_rate": 3.0927835051546395e-05, "loss": 0.5278, "step": 60 }, { "epoch": 0.18944099378881987, "grad_norm": 0.2629951129122119, "learning_rate": 3.1443298969072166e-05, "loss": 0.5066, "step": 61 }, { "epoch": 0.19254658385093168, "grad_norm": 0.324166573780215, "learning_rate": 3.1958762886597937e-05, "loss": 0.5054, "step": 62 }, { "epoch": 0.1956521739130435, "grad_norm": 0.2729720585938641, "learning_rate": 3.2474226804123714e-05, "loss": 0.5142, "step": 63 }, { "epoch": 0.19875776397515527, "grad_norm": 0.27422169347085695, "learning_rate": 3.2989690721649485e-05, "loss": 0.5119, "step": 64 }, { "epoch": 0.20186335403726707, "grad_norm": 0.26064941279629095, "learning_rate": 3.3505154639175256e-05, "loss": 0.5037, "step": 65 }, { "epoch": 0.20496894409937888, "grad_norm": 0.2589323970095713, "learning_rate": 3.4020618556701034e-05, "loss": 0.5181, "step": 66 }, { "epoch": 0.2080745341614907, "grad_norm": 0.2795495681392583, "learning_rate": 3.4536082474226805e-05, "loss": 0.5006, "step": 67 }, { "epoch": 0.2111801242236025, "grad_norm": 0.2785747261533415, "learning_rate": 3.5051546391752576e-05, "loss": 0.483, "step": 68 }, { "epoch": 0.21428571428571427, "grad_norm": 0.25302395243466885, "learning_rate": 3.5567010309278354e-05, "loss": 0.4883, "step": 69 }, { "epoch": 0.21739130434782608, "grad_norm": 0.2752883227640764, "learning_rate": 3.6082474226804125e-05, "loss": 0.5094, "step": 70 }, { "epoch": 0.2204968944099379, "grad_norm": 0.3024166121222451, "learning_rate": 3.6597938144329896e-05, "loss": 0.4881, "step": 71 }, { "epoch": 0.2236024844720497, "grad_norm": 0.3097948500444575, "learning_rate": 3.7113402061855674e-05, "loss": 0.4839, "step": 72 }, { "epoch": 0.2267080745341615, "grad_norm": 0.2876918544530116, "learning_rate": 3.7628865979381445e-05, "loss": 0.5144, "step": 73 }, { "epoch": 0.22981366459627328, "grad_norm": 0.3416229447277982, "learning_rate": 3.8144329896907216e-05, "loss": 0.4961, "step": 74 }, { "epoch": 0.2329192546583851, "grad_norm": 0.3199113220311117, "learning_rate": 3.865979381443299e-05, "loss": 0.473, "step": 75 }, { "epoch": 0.2360248447204969, "grad_norm": 0.3005248837372916, "learning_rate": 3.9175257731958764e-05, "loss": 0.4869, "step": 76 }, { "epoch": 0.2391304347826087, "grad_norm": 0.3020219962118337, "learning_rate": 3.9690721649484535e-05, "loss": 0.5047, "step": 77 }, { "epoch": 0.2422360248447205, "grad_norm": 0.29698825337519646, "learning_rate": 4.020618556701031e-05, "loss": 0.5022, "step": 78 }, { "epoch": 0.2453416149068323, "grad_norm": 0.3021333930392965, "learning_rate": 4.0721649484536084e-05, "loss": 0.4866, "step": 79 }, { "epoch": 0.2484472049689441, "grad_norm": 0.29250713103592757, "learning_rate": 4.1237113402061855e-05, "loss": 0.4896, "step": 80 }, { "epoch": 0.2515527950310559, "grad_norm": 0.27724800469538824, "learning_rate": 4.175257731958763e-05, "loss": 0.4836, "step": 81 }, { "epoch": 0.2546583850931677, "grad_norm": 0.3272751041798097, "learning_rate": 4.2268041237113404e-05, "loss": 0.5079, "step": 82 }, { "epoch": 0.2577639751552795, "grad_norm": 0.2875779003405876, "learning_rate": 4.2783505154639175e-05, "loss": 0.4822, "step": 83 }, { "epoch": 0.2608695652173913, "grad_norm": 0.351548025457743, "learning_rate": 4.329896907216495e-05, "loss": 0.4719, "step": 84 }, { "epoch": 0.2639751552795031, "grad_norm": 0.3104421632805538, "learning_rate": 4.3814432989690723e-05, "loss": 0.4985, "step": 85 }, { "epoch": 0.2670807453416149, "grad_norm": 0.29340838316836443, "learning_rate": 4.4329896907216494e-05, "loss": 0.4506, "step": 86 }, { "epoch": 0.2701863354037267, "grad_norm": 0.31888072280932184, "learning_rate": 4.484536082474227e-05, "loss": 0.4718, "step": 87 }, { "epoch": 0.2732919254658385, "grad_norm": 0.2881905604568596, "learning_rate": 4.536082474226804e-05, "loss": 0.4718, "step": 88 }, { "epoch": 0.27639751552795033, "grad_norm": 0.382391969622348, "learning_rate": 4.5876288659793814e-05, "loss": 0.489, "step": 89 }, { "epoch": 0.2795031055900621, "grad_norm": 0.28677795566141734, "learning_rate": 4.639175257731959e-05, "loss": 0.4625, "step": 90 }, { "epoch": 0.2826086956521739, "grad_norm": 0.44192895579293406, "learning_rate": 4.690721649484536e-05, "loss": 0.4901, "step": 91 }, { "epoch": 0.2857142857142857, "grad_norm": 0.36788344235249887, "learning_rate": 4.7422680412371134e-05, "loss": 0.4682, "step": 92 }, { "epoch": 0.2888198757763975, "grad_norm": 0.5143785581301379, "learning_rate": 4.793814432989691e-05, "loss": 0.4748, "step": 93 }, { "epoch": 0.2919254658385093, "grad_norm": 0.3714484820764116, "learning_rate": 4.845360824742268e-05, "loss": 0.4733, "step": 94 }, { "epoch": 0.2950310559006211, "grad_norm": 0.4411279949864707, "learning_rate": 4.8969072164948454e-05, "loss": 0.4719, "step": 95 }, { "epoch": 0.2981366459627329, "grad_norm": 0.4095900221196949, "learning_rate": 4.948453608247423e-05, "loss": 0.4679, "step": 96 }, { "epoch": 0.30124223602484473, "grad_norm": 0.3876387401039132, "learning_rate": 5e-05, "loss": 0.4727, "step": 97 }, { "epoch": 0.30434782608695654, "grad_norm": 0.35671507475673714, "learning_rate": 4.994246260069045e-05, "loss": 0.4582, "step": 98 }, { "epoch": 0.30745341614906835, "grad_norm": 0.40457113215141677, "learning_rate": 4.98849252013809e-05, "loss": 0.4817, "step": 99 }, { "epoch": 0.3105590062111801, "grad_norm": 0.40014058749708475, "learning_rate": 4.982738780207135e-05, "loss": 0.4486, "step": 100 }, { "epoch": 0.3136645962732919, "grad_norm": 0.4870121731575367, "learning_rate": 4.97698504027618e-05, "loss": 0.4663, "step": 101 }, { "epoch": 0.3167701863354037, "grad_norm": 0.4340851079572886, "learning_rate": 4.9712313003452246e-05, "loss": 0.4484, "step": 102 }, { "epoch": 0.3198757763975155, "grad_norm": 0.35686684080021636, "learning_rate": 4.9654775604142695e-05, "loss": 0.467, "step": 103 }, { "epoch": 0.32298136645962733, "grad_norm": 0.4494359291517841, "learning_rate": 4.9597238204833143e-05, "loss": 0.4694, "step": 104 }, { "epoch": 0.32608695652173914, "grad_norm": 0.4372407930618466, "learning_rate": 4.953970080552359e-05, "loss": 0.4648, "step": 105 }, { "epoch": 0.32919254658385094, "grad_norm": 0.34466736034003903, "learning_rate": 4.948216340621404e-05, "loss": 0.4444, "step": 106 }, { "epoch": 0.33229813664596275, "grad_norm": 0.4001800803927703, "learning_rate": 4.942462600690449e-05, "loss": 0.464, "step": 107 }, { "epoch": 0.33540372670807456, "grad_norm": 0.3577590335432523, "learning_rate": 4.936708860759494e-05, "loss": 0.4647, "step": 108 }, { "epoch": 0.3385093167701863, "grad_norm": 0.3827072494556767, "learning_rate": 4.930955120828539e-05, "loss": 0.4452, "step": 109 }, { "epoch": 0.3416149068322981, "grad_norm": 0.40554119841147346, "learning_rate": 4.9252013808975836e-05, "loss": 0.457, "step": 110 }, { "epoch": 0.3447204968944099, "grad_norm": 0.3980370218198526, "learning_rate": 4.9194476409666285e-05, "loss": 0.4566, "step": 111 }, { "epoch": 0.34782608695652173, "grad_norm": 0.38595447982147235, "learning_rate": 4.913693901035673e-05, "loss": 0.4436, "step": 112 }, { "epoch": 0.35093167701863354, "grad_norm": 0.3335566121887473, "learning_rate": 4.907940161104718e-05, "loss": 0.4525, "step": 113 }, { "epoch": 0.35403726708074534, "grad_norm": 0.44048069823182057, "learning_rate": 4.902186421173763e-05, "loss": 0.4775, "step": 114 }, { "epoch": 0.35714285714285715, "grad_norm": 0.3511836624614759, "learning_rate": 4.896432681242808e-05, "loss": 0.4529, "step": 115 }, { "epoch": 0.36024844720496896, "grad_norm": 0.40512550088435406, "learning_rate": 4.890678941311853e-05, "loss": 0.4856, "step": 116 }, { "epoch": 0.36335403726708076, "grad_norm": 0.4709820706303788, "learning_rate": 4.884925201380898e-05, "loss": 0.4613, "step": 117 }, { "epoch": 0.36645962732919257, "grad_norm": 0.3163807878418199, "learning_rate": 4.8791714614499426e-05, "loss": 0.476, "step": 118 }, { "epoch": 0.3695652173913043, "grad_norm": 0.421853544537181, "learning_rate": 4.8734177215189874e-05, "loss": 0.4675, "step": 119 }, { "epoch": 0.37267080745341613, "grad_norm": 0.37140388109626665, "learning_rate": 4.867663981588032e-05, "loss": 0.452, "step": 120 }, { "epoch": 0.37577639751552794, "grad_norm": 0.42352163355515543, "learning_rate": 4.861910241657077e-05, "loss": 0.4468, "step": 121 }, { "epoch": 0.37888198757763975, "grad_norm": 0.4144419361914004, "learning_rate": 4.856156501726122e-05, "loss": 0.4526, "step": 122 }, { "epoch": 0.38198757763975155, "grad_norm": 0.40675120816526916, "learning_rate": 4.850402761795167e-05, "loss": 0.4611, "step": 123 }, { "epoch": 0.38509316770186336, "grad_norm": 0.5826147735025056, "learning_rate": 4.844649021864212e-05, "loss": 0.4803, "step": 124 }, { "epoch": 0.38819875776397517, "grad_norm": 0.3282657199624206, "learning_rate": 4.838895281933257e-05, "loss": 0.4552, "step": 125 }, { "epoch": 0.391304347826087, "grad_norm": 0.5159501988757971, "learning_rate": 4.8331415420023015e-05, "loss": 0.4794, "step": 126 }, { "epoch": 0.3944099378881988, "grad_norm": 0.3620503849683116, "learning_rate": 4.8273878020713464e-05, "loss": 0.4631, "step": 127 }, { "epoch": 0.39751552795031053, "grad_norm": 0.4221189340341717, "learning_rate": 4.821634062140391e-05, "loss": 0.4696, "step": 128 }, { "epoch": 0.40062111801242234, "grad_norm": 0.46423436394369083, "learning_rate": 4.815880322209436e-05, "loss": 0.4573, "step": 129 }, { "epoch": 0.40372670807453415, "grad_norm": 0.4261777248289121, "learning_rate": 4.810126582278481e-05, "loss": 0.4608, "step": 130 }, { "epoch": 0.40683229813664595, "grad_norm": 0.45519667338748365, "learning_rate": 4.804372842347526e-05, "loss": 0.4621, "step": 131 }, { "epoch": 0.40993788819875776, "grad_norm": 0.4384463354130905, "learning_rate": 4.798619102416571e-05, "loss": 0.4656, "step": 132 }, { "epoch": 0.41304347826086957, "grad_norm": 0.41199291319131776, "learning_rate": 4.7928653624856157e-05, "loss": 0.4535, "step": 133 }, { "epoch": 0.4161490683229814, "grad_norm": 0.3655597225332361, "learning_rate": 4.7871116225546605e-05, "loss": 0.4501, "step": 134 }, { "epoch": 0.4192546583850932, "grad_norm": 0.44932133556116877, "learning_rate": 4.7813578826237054e-05, "loss": 0.4767, "step": 135 }, { "epoch": 0.422360248447205, "grad_norm": 0.3329354062585348, "learning_rate": 4.77560414269275e-05, "loss": 0.4455, "step": 136 }, { "epoch": 0.4254658385093168, "grad_norm": 0.45152077511616723, "learning_rate": 4.769850402761795e-05, "loss": 0.4623, "step": 137 }, { "epoch": 0.42857142857142855, "grad_norm": 0.3188549796798649, "learning_rate": 4.76409666283084e-05, "loss": 0.4304, "step": 138 }, { "epoch": 0.43167701863354035, "grad_norm": 0.39747649807961544, "learning_rate": 4.758342922899885e-05, "loss": 0.4486, "step": 139 }, { "epoch": 0.43478260869565216, "grad_norm": 0.2901963778324694, "learning_rate": 4.75258918296893e-05, "loss": 0.4485, "step": 140 }, { "epoch": 0.43788819875776397, "grad_norm": 0.5357034478107343, "learning_rate": 4.7468354430379746e-05, "loss": 0.4773, "step": 141 }, { "epoch": 0.4409937888198758, "grad_norm": 0.3892373232000147, "learning_rate": 4.7410817031070195e-05, "loss": 0.4408, "step": 142 }, { "epoch": 0.4440993788819876, "grad_norm": 0.45033922342477917, "learning_rate": 4.7353279631760644e-05, "loss": 0.4598, "step": 143 }, { "epoch": 0.4472049689440994, "grad_norm": 0.37908550777510663, "learning_rate": 4.729574223245109e-05, "loss": 0.4452, "step": 144 }, { "epoch": 0.4503105590062112, "grad_norm": 0.4290373855109045, "learning_rate": 4.723820483314154e-05, "loss": 0.4536, "step": 145 }, { "epoch": 0.453416149068323, "grad_norm": 0.35676947230487216, "learning_rate": 4.718066743383199e-05, "loss": 0.4648, "step": 146 }, { "epoch": 0.45652173913043476, "grad_norm": 0.33636058827665144, "learning_rate": 4.712313003452244e-05, "loss": 0.444, "step": 147 }, { "epoch": 0.45962732919254656, "grad_norm": 0.3823016634046083, "learning_rate": 4.706559263521289e-05, "loss": 0.4406, "step": 148 }, { "epoch": 0.46273291925465837, "grad_norm": 0.3818789119419192, "learning_rate": 4.700805523590334e-05, "loss": 0.4488, "step": 149 }, { "epoch": 0.4658385093167702, "grad_norm": 0.33345974040131937, "learning_rate": 4.6950517836593785e-05, "loss": 0.4647, "step": 150 }, { "epoch": 0.468944099378882, "grad_norm": 0.47073824185480967, "learning_rate": 4.689298043728424e-05, "loss": 0.4534, "step": 151 }, { "epoch": 0.4720496894409938, "grad_norm": 0.40070437909888434, "learning_rate": 4.683544303797468e-05, "loss": 0.4367, "step": 152 }, { "epoch": 0.4751552795031056, "grad_norm": 0.407305468388989, "learning_rate": 4.677790563866514e-05, "loss": 0.4415, "step": 153 }, { "epoch": 0.4782608695652174, "grad_norm": 0.4058611659098106, "learning_rate": 4.672036823935558e-05, "loss": 0.4576, "step": 154 }, { "epoch": 0.4813664596273292, "grad_norm": 0.3967515788115339, "learning_rate": 4.6662830840046035e-05, "loss": 0.4524, "step": 155 }, { "epoch": 0.484472049689441, "grad_norm": 0.4407590164610378, "learning_rate": 4.660529344073648e-05, "loss": 0.457, "step": 156 }, { "epoch": 0.48757763975155277, "grad_norm": 0.43880737794315955, "learning_rate": 4.654775604142693e-05, "loss": 0.4365, "step": 157 }, { "epoch": 0.4906832298136646, "grad_norm": 0.47864526006501984, "learning_rate": 4.6490218642117375e-05, "loss": 0.4479, "step": 158 }, { "epoch": 0.4937888198757764, "grad_norm": 0.4692672779398985, "learning_rate": 4.643268124280783e-05, "loss": 0.4546, "step": 159 }, { "epoch": 0.4968944099378882, "grad_norm": 0.4097305951007724, "learning_rate": 4.637514384349827e-05, "loss": 0.4355, "step": 160 }, { "epoch": 0.5, "grad_norm": 0.43610759922666353, "learning_rate": 4.631760644418873e-05, "loss": 0.447, "step": 161 }, { "epoch": 0.5031055900621118, "grad_norm": 0.2978982430601787, "learning_rate": 4.626006904487917e-05, "loss": 0.4524, "step": 162 }, { "epoch": 0.5062111801242236, "grad_norm": 0.43653406806069966, "learning_rate": 4.6202531645569625e-05, "loss": 0.4171, "step": 163 }, { "epoch": 0.5093167701863354, "grad_norm": 0.40670821189566986, "learning_rate": 4.614499424626007e-05, "loss": 0.439, "step": 164 }, { "epoch": 0.5124223602484472, "grad_norm": 0.33901355170318703, "learning_rate": 4.608745684695052e-05, "loss": 0.4461, "step": 165 }, { "epoch": 0.515527950310559, "grad_norm": 0.43610331613751346, "learning_rate": 4.6029919447640965e-05, "loss": 0.4554, "step": 166 }, { "epoch": 0.5186335403726708, "grad_norm": 0.3625661313466411, "learning_rate": 4.597238204833142e-05, "loss": 0.4554, "step": 167 }, { "epoch": 0.5217391304347826, "grad_norm": 0.3394393399478139, "learning_rate": 4.591484464902186e-05, "loss": 0.4367, "step": 168 }, { "epoch": 0.5248447204968945, "grad_norm": 0.3588563348596153, "learning_rate": 4.585730724971232e-05, "loss": 0.4461, "step": 169 }, { "epoch": 0.5279503105590062, "grad_norm": 0.3802785353634964, "learning_rate": 4.579976985040276e-05, "loss": 0.4387, "step": 170 }, { "epoch": 0.531055900621118, "grad_norm": 0.3869023722709017, "learning_rate": 4.5742232451093215e-05, "loss": 0.4528, "step": 171 }, { "epoch": 0.5341614906832298, "grad_norm": 0.36676418356051843, "learning_rate": 4.568469505178366e-05, "loss": 0.4348, "step": 172 }, { "epoch": 0.5372670807453416, "grad_norm": 0.46126816544453725, "learning_rate": 4.562715765247411e-05, "loss": 0.4231, "step": 173 }, { "epoch": 0.5403726708074534, "grad_norm": 0.35343634631539705, "learning_rate": 4.556962025316456e-05, "loss": 0.4369, "step": 174 }, { "epoch": 0.5434782608695652, "grad_norm": 0.4549103689048508, "learning_rate": 4.551208285385501e-05, "loss": 0.4387, "step": 175 }, { "epoch": 0.546583850931677, "grad_norm": 0.4303714186336393, "learning_rate": 4.545454545454546e-05, "loss": 0.4546, "step": 176 }, { "epoch": 0.5496894409937888, "grad_norm": 0.4531267139678119, "learning_rate": 4.539700805523591e-05, "loss": 0.4356, "step": 177 }, { "epoch": 0.5527950310559007, "grad_norm": 0.42240540949166944, "learning_rate": 4.5339470655926356e-05, "loss": 0.4442, "step": 178 }, { "epoch": 0.5559006211180124, "grad_norm": 0.3163983623110262, "learning_rate": 4.5281933256616805e-05, "loss": 0.4255, "step": 179 }, { "epoch": 0.5590062111801242, "grad_norm": 0.37954620340652895, "learning_rate": 4.5224395857307253e-05, "loss": 0.4387, "step": 180 }, { "epoch": 0.562111801242236, "grad_norm": 0.33565801845470367, "learning_rate": 4.51668584579977e-05, "loss": 0.4415, "step": 181 }, { "epoch": 0.5652173913043478, "grad_norm": 0.3349864414277053, "learning_rate": 4.510932105868815e-05, "loss": 0.4082, "step": 182 }, { "epoch": 0.5683229813664596, "grad_norm": 0.439294679014343, "learning_rate": 4.50517836593786e-05, "loss": 0.446, "step": 183 }, { "epoch": 0.5714285714285714, "grad_norm": 0.28999671538444516, "learning_rate": 4.499424626006905e-05, "loss": 0.4103, "step": 184 }, { "epoch": 0.5745341614906833, "grad_norm": 0.40660335920713986, "learning_rate": 4.49367088607595e-05, "loss": 0.444, "step": 185 }, { "epoch": 0.577639751552795, "grad_norm": 0.3033161017839996, "learning_rate": 4.4879171461449946e-05, "loss": 0.435, "step": 186 }, { "epoch": 0.5807453416149069, "grad_norm": 0.30568413065453626, "learning_rate": 4.4821634062140395e-05, "loss": 0.4237, "step": 187 }, { "epoch": 0.5838509316770186, "grad_norm": 0.32587134975274057, "learning_rate": 4.476409666283084e-05, "loss": 0.4331, "step": 188 }, { "epoch": 0.5869565217391305, "grad_norm": 0.28290562376532075, "learning_rate": 4.470655926352129e-05, "loss": 0.4342, "step": 189 }, { "epoch": 0.5900621118012422, "grad_norm": 0.3630490197737241, "learning_rate": 4.464902186421174e-05, "loss": 0.4344, "step": 190 }, { "epoch": 0.593167701863354, "grad_norm": 0.3559890010930286, "learning_rate": 4.459148446490219e-05, "loss": 0.4498, "step": 191 }, { "epoch": 0.5962732919254659, "grad_norm": 0.3499252907427838, "learning_rate": 4.453394706559264e-05, "loss": 0.4507, "step": 192 }, { "epoch": 0.5993788819875776, "grad_norm": 0.33006303704048223, "learning_rate": 4.447640966628309e-05, "loss": 0.4281, "step": 193 }, { "epoch": 0.6024844720496895, "grad_norm": 0.36984061156296816, "learning_rate": 4.4418872266973536e-05, "loss": 0.4518, "step": 194 }, { "epoch": 0.6055900621118012, "grad_norm": 0.3439812296873207, "learning_rate": 4.4361334867663984e-05, "loss": 0.4448, "step": 195 }, { "epoch": 0.6086956521739131, "grad_norm": 0.30685626808124417, "learning_rate": 4.430379746835443e-05, "loss": 0.4377, "step": 196 }, { "epoch": 0.6118012422360248, "grad_norm": 0.3925848437000049, "learning_rate": 4.424626006904488e-05, "loss": 0.4396, "step": 197 }, { "epoch": 0.6149068322981367, "grad_norm": 0.32639373809266464, "learning_rate": 4.418872266973533e-05, "loss": 0.4321, "step": 198 }, { "epoch": 0.6180124223602484, "grad_norm": 0.376079541285074, "learning_rate": 4.413118527042578e-05, "loss": 0.4242, "step": 199 }, { "epoch": 0.6211180124223602, "grad_norm": 0.3749608850464733, "learning_rate": 4.407364787111623e-05, "loss": 0.4259, "step": 200 }, { "epoch": 0.6242236024844721, "grad_norm": 0.4461881134050382, "learning_rate": 4.401611047180668e-05, "loss": 0.4341, "step": 201 }, { "epoch": 0.6273291925465838, "grad_norm": 0.4877320414028972, "learning_rate": 4.3958573072497125e-05, "loss": 0.4344, "step": 202 }, { "epoch": 0.6304347826086957, "grad_norm": 0.4070659780535386, "learning_rate": 4.3901035673187574e-05, "loss": 0.4227, "step": 203 }, { "epoch": 0.6335403726708074, "grad_norm": 0.4635439998393952, "learning_rate": 4.384349827387802e-05, "loss": 0.4355, "step": 204 }, { "epoch": 0.6366459627329193, "grad_norm": 0.35952245913430025, "learning_rate": 4.378596087456847e-05, "loss": 0.423, "step": 205 }, { "epoch": 0.639751552795031, "grad_norm": 0.520771866846795, "learning_rate": 4.372842347525892e-05, "loss": 0.4306, "step": 206 }, { "epoch": 0.6428571428571429, "grad_norm": 0.3273697337468707, "learning_rate": 4.367088607594937e-05, "loss": 0.4324, "step": 207 }, { "epoch": 0.6459627329192547, "grad_norm": 0.4813614761483608, "learning_rate": 4.361334867663982e-05, "loss": 0.4478, "step": 208 }, { "epoch": 0.6490683229813664, "grad_norm": 0.3900984777507702, "learning_rate": 4.3555811277330267e-05, "loss": 0.4269, "step": 209 }, { "epoch": 0.6521739130434783, "grad_norm": 0.3853092679143466, "learning_rate": 4.3498273878020715e-05, "loss": 0.4407, "step": 210 }, { "epoch": 0.65527950310559, "grad_norm": 0.46225724309871613, "learning_rate": 4.3440736478711164e-05, "loss": 0.436, "step": 211 }, { "epoch": 0.6583850931677019, "grad_norm": 0.31651404685134377, "learning_rate": 4.338319907940161e-05, "loss": 0.4316, "step": 212 }, { "epoch": 0.6614906832298136, "grad_norm": 0.44516432018668023, "learning_rate": 4.332566168009206e-05, "loss": 0.4426, "step": 213 }, { "epoch": 0.6645962732919255, "grad_norm": 0.3462443744991128, "learning_rate": 4.326812428078251e-05, "loss": 0.4465, "step": 214 }, { "epoch": 0.6677018633540373, "grad_norm": 0.4436257780311306, "learning_rate": 4.321058688147296e-05, "loss": 0.4241, "step": 215 }, { "epoch": 0.6708074534161491, "grad_norm": 0.3788099950107418, "learning_rate": 4.315304948216341e-05, "loss": 0.4206, "step": 216 }, { "epoch": 0.6739130434782609, "grad_norm": 0.3667132129478159, "learning_rate": 4.3095512082853856e-05, "loss": 0.4336, "step": 217 }, { "epoch": 0.6770186335403726, "grad_norm": 0.43405694529571, "learning_rate": 4.3037974683544305e-05, "loss": 0.4285, "step": 218 }, { "epoch": 0.6801242236024845, "grad_norm": 0.37501605794405696, "learning_rate": 4.2980437284234754e-05, "loss": 0.4354, "step": 219 }, { "epoch": 0.6832298136645962, "grad_norm": 0.7491502232791192, "learning_rate": 4.29228998849252e-05, "loss": 0.4622, "step": 220 }, { "epoch": 0.6863354037267081, "grad_norm": 0.34683109305557713, "learning_rate": 4.286536248561565e-05, "loss": 0.4349, "step": 221 }, { "epoch": 0.6894409937888198, "grad_norm": 0.41649862939635707, "learning_rate": 4.28078250863061e-05, "loss": 0.4278, "step": 222 }, { "epoch": 0.6925465838509317, "grad_norm": 0.33273645633734766, "learning_rate": 4.275028768699655e-05, "loss": 0.4241, "step": 223 }, { "epoch": 0.6956521739130435, "grad_norm": 0.358638671370147, "learning_rate": 4.2692750287687e-05, "loss": 0.4294, "step": 224 }, { "epoch": 0.6987577639751553, "grad_norm": 0.3505002399312612, "learning_rate": 4.2635212888377446e-05, "loss": 0.4339, "step": 225 }, { "epoch": 0.7018633540372671, "grad_norm": 0.28967971081827765, "learning_rate": 4.2577675489067895e-05, "loss": 0.4433, "step": 226 }, { "epoch": 0.7049689440993789, "grad_norm": 0.3792183124094411, "learning_rate": 4.2520138089758344e-05, "loss": 0.4263, "step": 227 }, { "epoch": 0.7080745341614907, "grad_norm": 0.2915459102300122, "learning_rate": 4.246260069044879e-05, "loss": 0.4226, "step": 228 }, { "epoch": 0.7111801242236024, "grad_norm": 0.357404227614541, "learning_rate": 4.240506329113924e-05, "loss": 0.4183, "step": 229 }, { "epoch": 0.7142857142857143, "grad_norm": 0.33657909101352584, "learning_rate": 4.234752589182969e-05, "loss": 0.4383, "step": 230 }, { "epoch": 0.717391304347826, "grad_norm": 0.28799404238315757, "learning_rate": 4.228998849252014e-05, "loss": 0.4059, "step": 231 }, { "epoch": 0.7204968944099379, "grad_norm": 0.3559137710527895, "learning_rate": 4.223245109321059e-05, "loss": 0.431, "step": 232 }, { "epoch": 0.7236024844720497, "grad_norm": 0.3571859472649835, "learning_rate": 4.2174913693901036e-05, "loss": 0.4365, "step": 233 }, { "epoch": 0.7267080745341615, "grad_norm": 0.27866414620295615, "learning_rate": 4.2117376294591485e-05, "loss": 0.4164, "step": 234 }, { "epoch": 0.7298136645962733, "grad_norm": 0.35761820704128017, "learning_rate": 4.2059838895281933e-05, "loss": 0.4155, "step": 235 }, { "epoch": 0.7329192546583851, "grad_norm": 0.38239702778323204, "learning_rate": 4.200230149597238e-05, "loss": 0.4441, "step": 236 }, { "epoch": 0.7360248447204969, "grad_norm": 0.37338686711282476, "learning_rate": 4.194476409666283e-05, "loss": 0.4287, "step": 237 }, { "epoch": 0.7391304347826086, "grad_norm": 0.31078006795719737, "learning_rate": 4.188722669735328e-05, "loss": 0.4314, "step": 238 }, { "epoch": 0.7422360248447205, "grad_norm": 0.42962957316409206, "learning_rate": 4.182968929804373e-05, "loss": 0.4258, "step": 239 }, { "epoch": 0.7453416149068323, "grad_norm": 0.3531531884915285, "learning_rate": 4.177215189873418e-05, "loss": 0.4348, "step": 240 }, { "epoch": 0.7484472049689441, "grad_norm": 0.4645354016036932, "learning_rate": 4.1714614499424626e-05, "loss": 0.4204, "step": 241 }, { "epoch": 0.7515527950310559, "grad_norm": 1.09153721353785, "learning_rate": 4.1657077100115075e-05, "loss": 0.4386, "step": 242 }, { "epoch": 0.7546583850931677, "grad_norm": 0.32971689202723414, "learning_rate": 4.159953970080552e-05, "loss": 0.4286, "step": 243 }, { "epoch": 0.7577639751552795, "grad_norm": 0.47923594956031046, "learning_rate": 4.154200230149597e-05, "loss": 0.4355, "step": 244 }, { "epoch": 0.7608695652173914, "grad_norm": 0.3499125189435591, "learning_rate": 4.148446490218642e-05, "loss": 0.4363, "step": 245 }, { "epoch": 0.7639751552795031, "grad_norm": 0.3676637215847227, "learning_rate": 4.142692750287687e-05, "loss": 0.4351, "step": 246 }, { "epoch": 0.7670807453416149, "grad_norm": 0.3727821108079694, "learning_rate": 4.136939010356732e-05, "loss": 0.4418, "step": 247 }, { "epoch": 0.7701863354037267, "grad_norm": 0.3252006506678716, "learning_rate": 4.131185270425777e-05, "loss": 0.4158, "step": 248 }, { "epoch": 0.7732919254658385, "grad_norm": 0.6538129311302192, "learning_rate": 4.1254315304948216e-05, "loss": 0.457, "step": 249 }, { "epoch": 0.7763975155279503, "grad_norm": 0.33906627374077886, "learning_rate": 4.1196777905638664e-05, "loss": 0.4318, "step": 250 }, { "epoch": 0.7795031055900621, "grad_norm": 0.356301991033165, "learning_rate": 4.113924050632912e-05, "loss": 0.4236, "step": 251 }, { "epoch": 0.782608695652174, "grad_norm": 0.32783848540999616, "learning_rate": 4.108170310701956e-05, "loss": 0.4448, "step": 252 }, { "epoch": 0.7857142857142857, "grad_norm": 0.33633346589846297, "learning_rate": 4.102416570771002e-05, "loss": 0.4084, "step": 253 }, { "epoch": 0.7888198757763976, "grad_norm": 0.34262159693990346, "learning_rate": 4.096662830840046e-05, "loss": 0.4343, "step": 254 }, { "epoch": 0.7919254658385093, "grad_norm": 0.4238089460532713, "learning_rate": 4.0909090909090915e-05, "loss": 0.4197, "step": 255 }, { "epoch": 0.7950310559006211, "grad_norm": 0.34636542219919175, "learning_rate": 4.085155350978136e-05, "loss": 0.4216, "step": 256 }, { "epoch": 0.7981366459627329, "grad_norm": 0.5147966796611364, "learning_rate": 4.079401611047181e-05, "loss": 0.4335, "step": 257 }, { "epoch": 0.8012422360248447, "grad_norm": 0.3784633526026262, "learning_rate": 4.0736478711162254e-05, "loss": 0.4388, "step": 258 }, { "epoch": 0.8043478260869565, "grad_norm": 0.5353215946365089, "learning_rate": 4.067894131185271e-05, "loss": 0.4317, "step": 259 }, { "epoch": 0.8074534161490683, "grad_norm": 0.43051530301687313, "learning_rate": 4.062140391254315e-05, "loss": 0.4321, "step": 260 }, { "epoch": 0.8105590062111802, "grad_norm": 0.42796425153438244, "learning_rate": 4.056386651323361e-05, "loss": 0.4088, "step": 261 }, { "epoch": 0.8136645962732919, "grad_norm": 0.5934293854830046, "learning_rate": 4.050632911392405e-05, "loss": 0.4158, "step": 262 }, { "epoch": 0.8167701863354038, "grad_norm": 0.4393755394280156, "learning_rate": 4.0448791714614505e-05, "loss": 0.4469, "step": 263 }, { "epoch": 0.8198757763975155, "grad_norm": 0.4605347528048276, "learning_rate": 4.0391254315304947e-05, "loss": 0.4346, "step": 264 }, { "epoch": 0.8229813664596274, "grad_norm": 0.35259936117009355, "learning_rate": 4.03337169159954e-05, "loss": 0.4168, "step": 265 }, { "epoch": 0.8260869565217391, "grad_norm": 0.4804542899872928, "learning_rate": 4.0276179516685844e-05, "loss": 0.4302, "step": 266 }, { "epoch": 0.8291925465838509, "grad_norm": 0.49703176852970304, "learning_rate": 4.02186421173763e-05, "loss": 0.4376, "step": 267 }, { "epoch": 0.8322981366459627, "grad_norm": 0.32332376265052126, "learning_rate": 4.016110471806674e-05, "loss": 0.4151, "step": 268 }, { "epoch": 0.8354037267080745, "grad_norm": 0.3837962855801273, "learning_rate": 4.01035673187572e-05, "loss": 0.4165, "step": 269 }, { "epoch": 0.8385093167701864, "grad_norm": 0.3057885184710408, "learning_rate": 4.004602991944764e-05, "loss": 0.4193, "step": 270 }, { "epoch": 0.8416149068322981, "grad_norm": 0.33815716235605003, "learning_rate": 3.9988492520138094e-05, "loss": 0.4122, "step": 271 }, { "epoch": 0.84472049689441, "grad_norm": 0.35543954456463683, "learning_rate": 3.9930955120828536e-05, "loss": 0.4312, "step": 272 }, { "epoch": 0.8478260869565217, "grad_norm": 0.4061479720971117, "learning_rate": 3.987341772151899e-05, "loss": 0.4326, "step": 273 }, { "epoch": 0.8509316770186336, "grad_norm": 0.3293967556583535, "learning_rate": 3.9815880322209434e-05, "loss": 0.4162, "step": 274 }, { "epoch": 0.8540372670807453, "grad_norm": 0.32127496899850444, "learning_rate": 3.975834292289989e-05, "loss": 0.4064, "step": 275 }, { "epoch": 0.8571428571428571, "grad_norm": 0.3106744319229529, "learning_rate": 3.970080552359033e-05, "loss": 0.4219, "step": 276 }, { "epoch": 0.860248447204969, "grad_norm": 0.2851226156515557, "learning_rate": 3.964326812428079e-05, "loss": 0.4357, "step": 277 }, { "epoch": 0.8633540372670807, "grad_norm": 0.3367137774364346, "learning_rate": 3.958573072497123e-05, "loss": 0.4221, "step": 278 }, { "epoch": 0.8664596273291926, "grad_norm": 0.274716671666842, "learning_rate": 3.9528193325661684e-05, "loss": 0.4286, "step": 279 }, { "epoch": 0.8695652173913043, "grad_norm": 0.32476182770932666, "learning_rate": 3.9470655926352126e-05, "loss": 0.4156, "step": 280 }, { "epoch": 0.8726708074534162, "grad_norm": 0.34109454423469643, "learning_rate": 3.941311852704258e-05, "loss": 0.4133, "step": 281 }, { "epoch": 0.8757763975155279, "grad_norm": 0.35511307476273746, "learning_rate": 3.9355581127733024e-05, "loss": 0.4317, "step": 282 }, { "epoch": 0.8788819875776398, "grad_norm": 0.3270722625275185, "learning_rate": 3.929804372842348e-05, "loss": 0.4182, "step": 283 }, { "epoch": 0.8819875776397516, "grad_norm": 0.30707956127514435, "learning_rate": 3.924050632911392e-05, "loss": 0.4128, "step": 284 }, { "epoch": 0.8850931677018633, "grad_norm": 0.352987960191196, "learning_rate": 3.9182968929804377e-05, "loss": 0.4202, "step": 285 }, { "epoch": 0.8881987577639752, "grad_norm": 0.3209556725057783, "learning_rate": 3.912543153049482e-05, "loss": 0.4531, "step": 286 }, { "epoch": 0.8913043478260869, "grad_norm": 0.3424777350197383, "learning_rate": 3.9067894131185274e-05, "loss": 0.4261, "step": 287 }, { "epoch": 0.8944099378881988, "grad_norm": 0.36115235473805046, "learning_rate": 3.9010356731875716e-05, "loss": 0.4208, "step": 288 }, { "epoch": 0.8975155279503105, "grad_norm": 0.3345731728145184, "learning_rate": 3.895281933256617e-05, "loss": 0.4243, "step": 289 }, { "epoch": 0.9006211180124224, "grad_norm": 0.3479109694931497, "learning_rate": 3.8895281933256613e-05, "loss": 0.408, "step": 290 }, { "epoch": 0.9037267080745341, "grad_norm": 0.35901431270989403, "learning_rate": 3.883774453394707e-05, "loss": 0.4275, "step": 291 }, { "epoch": 0.906832298136646, "grad_norm": 0.33289357045170126, "learning_rate": 3.878020713463751e-05, "loss": 0.4078, "step": 292 }, { "epoch": 0.9099378881987578, "grad_norm": 0.33168510073705165, "learning_rate": 3.8722669735327966e-05, "loss": 0.4218, "step": 293 }, { "epoch": 0.9130434782608695, "grad_norm": 0.2975318289744658, "learning_rate": 3.866513233601841e-05, "loss": 0.4311, "step": 294 }, { "epoch": 0.9161490683229814, "grad_norm": 0.31426977572692477, "learning_rate": 3.8607594936708864e-05, "loss": 0.4297, "step": 295 }, { "epoch": 0.9192546583850931, "grad_norm": 0.3070483941031755, "learning_rate": 3.8550057537399306e-05, "loss": 0.4192, "step": 296 }, { "epoch": 0.922360248447205, "grad_norm": 0.2810848054459513, "learning_rate": 3.849252013808976e-05, "loss": 0.427, "step": 297 }, { "epoch": 0.9254658385093167, "grad_norm": 0.2991841633857078, "learning_rate": 3.84349827387802e-05, "loss": 0.4052, "step": 298 }, { "epoch": 0.9285714285714286, "grad_norm": 0.33847151615147736, "learning_rate": 3.837744533947066e-05, "loss": 0.419, "step": 299 }, { "epoch": 0.9316770186335404, "grad_norm": 0.29017927632864937, "learning_rate": 3.83199079401611e-05, "loss": 0.4235, "step": 300 }, { "epoch": 0.9347826086956522, "grad_norm": 0.32565509697744177, "learning_rate": 3.8262370540851556e-05, "loss": 0.4218, "step": 301 }, { "epoch": 0.937888198757764, "grad_norm": 0.31402325607805354, "learning_rate": 3.8204833141542005e-05, "loss": 0.4374, "step": 302 }, { "epoch": 0.9409937888198758, "grad_norm": 0.3147076556719568, "learning_rate": 3.8147295742232454e-05, "loss": 0.4155, "step": 303 }, { "epoch": 0.9440993788819876, "grad_norm": 0.29699738407713266, "learning_rate": 3.80897583429229e-05, "loss": 0.4111, "step": 304 }, { "epoch": 0.9472049689440993, "grad_norm": 0.2888210602850056, "learning_rate": 3.803222094361335e-05, "loss": 0.4221, "step": 305 }, { "epoch": 0.9503105590062112, "grad_norm": 0.2939573629666098, "learning_rate": 3.79746835443038e-05, "loss": 0.4032, "step": 306 }, { "epoch": 0.953416149068323, "grad_norm": 0.2962446654764285, "learning_rate": 3.791714614499425e-05, "loss": 0.4214, "step": 307 }, { "epoch": 0.9565217391304348, "grad_norm": 0.271891153920885, "learning_rate": 3.78596087456847e-05, "loss": 0.4198, "step": 308 }, { "epoch": 0.9596273291925466, "grad_norm": 0.32256951843172593, "learning_rate": 3.7802071346375146e-05, "loss": 0.4216, "step": 309 }, { "epoch": 0.9627329192546584, "grad_norm": 0.33232339921643056, "learning_rate": 3.7744533947065595e-05, "loss": 0.4177, "step": 310 }, { "epoch": 0.9658385093167702, "grad_norm": 0.35814851356254335, "learning_rate": 3.7686996547756043e-05, "loss": 0.425, "step": 311 }, { "epoch": 0.968944099378882, "grad_norm": 0.29938770364659023, "learning_rate": 3.762945914844649e-05, "loss": 0.4128, "step": 312 }, { "epoch": 0.9720496894409938, "grad_norm": 0.38739922253123726, "learning_rate": 3.757192174913694e-05, "loss": 0.4113, "step": 313 }, { "epoch": 0.9751552795031055, "grad_norm": 0.31386603107673766, "learning_rate": 3.751438434982739e-05, "loss": 0.4104, "step": 314 }, { "epoch": 0.9782608695652174, "grad_norm": 0.34687136495142834, "learning_rate": 3.745684695051784e-05, "loss": 0.4307, "step": 315 }, { "epoch": 0.9813664596273292, "grad_norm": 0.3492017123521989, "learning_rate": 3.739930955120829e-05, "loss": 0.4077, "step": 316 }, { "epoch": 0.984472049689441, "grad_norm": 0.29396206255406326, "learning_rate": 3.7341772151898736e-05, "loss": 0.4067, "step": 317 }, { "epoch": 0.9875776397515528, "grad_norm": 0.31882677984452723, "learning_rate": 3.7284234752589185e-05, "loss": 0.4207, "step": 318 }, { "epoch": 0.9906832298136646, "grad_norm": 0.37165416285954644, "learning_rate": 3.722669735327963e-05, "loss": 0.4339, "step": 319 }, { "epoch": 0.9937888198757764, "grad_norm": 0.3190088839703568, "learning_rate": 3.716915995397008e-05, "loss": 0.4079, "step": 320 }, { "epoch": 0.9968944099378882, "grad_norm": 0.3115319771959773, "learning_rate": 3.711162255466053e-05, "loss": 0.4322, "step": 321 }, { "epoch": 1.0, "grad_norm": 0.3044086608586031, "learning_rate": 3.705408515535098e-05, "loss": 0.4097, "step": 322 }, { "epoch": 1.0031055900621118, "grad_norm": 0.33417590278362963, "learning_rate": 3.699654775604143e-05, "loss": 0.3323, "step": 323 }, { "epoch": 1.0062111801242235, "grad_norm": 0.341573477224664, "learning_rate": 3.693901035673188e-05, "loss": 0.3571, "step": 324 }, { "epoch": 1.0093167701863355, "grad_norm": 0.27258326161115387, "learning_rate": 3.6881472957422326e-05, "loss": 0.3404, "step": 325 }, { "epoch": 1.0124223602484472, "grad_norm": 0.33991178542501627, "learning_rate": 3.6823935558112774e-05, "loss": 0.3493, "step": 326 }, { "epoch": 1.015527950310559, "grad_norm": 0.3446263251981706, "learning_rate": 3.676639815880322e-05, "loss": 0.3473, "step": 327 }, { "epoch": 1.0186335403726707, "grad_norm": 0.33801547973317314, "learning_rate": 3.670886075949367e-05, "loss": 0.3697, "step": 328 }, { "epoch": 1.0217391304347827, "grad_norm": 0.35908354782023477, "learning_rate": 3.665132336018412e-05, "loss": 0.3476, "step": 329 }, { "epoch": 1.0248447204968945, "grad_norm": 0.3234656105570385, "learning_rate": 3.659378596087457e-05, "loss": 0.3622, "step": 330 }, { "epoch": 1.0279503105590062, "grad_norm": 0.35587249506855595, "learning_rate": 3.653624856156502e-05, "loss": 0.3555, "step": 331 }, { "epoch": 1.031055900621118, "grad_norm": 0.31905169592308186, "learning_rate": 3.647871116225547e-05, "loss": 0.3461, "step": 332 }, { "epoch": 1.0341614906832297, "grad_norm": 0.36840310397083925, "learning_rate": 3.6421173762945915e-05, "loss": 0.3429, "step": 333 }, { "epoch": 1.0372670807453417, "grad_norm": 0.3651205860513462, "learning_rate": 3.6363636363636364e-05, "loss": 0.3435, "step": 334 }, { "epoch": 1.0403726708074534, "grad_norm": 0.31066005439052724, "learning_rate": 3.630609896432681e-05, "loss": 0.3272, "step": 335 }, { "epoch": 1.0434782608695652, "grad_norm": 0.3759419584351618, "learning_rate": 3.624856156501726e-05, "loss": 0.3395, "step": 336 }, { "epoch": 1.046583850931677, "grad_norm": 0.3021549547887614, "learning_rate": 3.619102416570771e-05, "loss": 0.3417, "step": 337 }, { "epoch": 1.049689440993789, "grad_norm": 0.3205703918762732, "learning_rate": 3.613348676639816e-05, "loss": 0.3433, "step": 338 }, { "epoch": 1.0527950310559007, "grad_norm": 0.4534884210584356, "learning_rate": 3.607594936708861e-05, "loss": 0.3594, "step": 339 }, { "epoch": 1.0559006211180124, "grad_norm": 0.367415386580333, "learning_rate": 3.6018411967779057e-05, "loss": 0.3524, "step": 340 }, { "epoch": 1.0590062111801242, "grad_norm": 0.3127875635159284, "learning_rate": 3.5960874568469505e-05, "loss": 0.333, "step": 341 }, { "epoch": 1.062111801242236, "grad_norm": 0.4511553956189257, "learning_rate": 3.5903337169159954e-05, "loss": 0.3454, "step": 342 }, { "epoch": 1.065217391304348, "grad_norm": 0.27133796776358254, "learning_rate": 3.58457997698504e-05, "loss": 0.3307, "step": 343 }, { "epoch": 1.0683229813664596, "grad_norm": 0.37172783607468407, "learning_rate": 3.578826237054085e-05, "loss": 0.332, "step": 344 }, { "epoch": 1.0714285714285714, "grad_norm": 0.31903478698253923, "learning_rate": 3.57307249712313e-05, "loss": 0.3641, "step": 345 }, { "epoch": 1.0745341614906831, "grad_norm": 0.3590599821405197, "learning_rate": 3.567318757192175e-05, "loss": 0.3368, "step": 346 }, { "epoch": 1.0776397515527951, "grad_norm": 0.3228666493670707, "learning_rate": 3.56156501726122e-05, "loss": 0.3518, "step": 347 }, { "epoch": 1.0807453416149069, "grad_norm": 0.35040485427397144, "learning_rate": 3.5558112773302646e-05, "loss": 0.3567, "step": 348 }, { "epoch": 1.0838509316770186, "grad_norm": 0.3223473550373259, "learning_rate": 3.5500575373993095e-05, "loss": 0.3292, "step": 349 }, { "epoch": 1.0869565217391304, "grad_norm": 0.3162329124544906, "learning_rate": 3.5443037974683544e-05, "loss": 0.3386, "step": 350 }, { "epoch": 1.0900621118012421, "grad_norm": 0.35250805959488396, "learning_rate": 3.538550057537399e-05, "loss": 0.3286, "step": 351 }, { "epoch": 1.093167701863354, "grad_norm": 0.31027768437301634, "learning_rate": 3.532796317606444e-05, "loss": 0.3411, "step": 352 }, { "epoch": 1.0962732919254659, "grad_norm": 0.28606898633939265, "learning_rate": 3.52704257767549e-05, "loss": 0.3407, "step": 353 }, { "epoch": 1.0993788819875776, "grad_norm": 0.3579167421662421, "learning_rate": 3.521288837744534e-05, "loss": 0.3262, "step": 354 }, { "epoch": 1.1024844720496894, "grad_norm": 0.3402295001253341, "learning_rate": 3.5155350978135794e-05, "loss": 0.3324, "step": 355 }, { "epoch": 1.1055900621118013, "grad_norm": 0.31366685836024, "learning_rate": 3.5097813578826236e-05, "loss": 0.3463, "step": 356 }, { "epoch": 1.108695652173913, "grad_norm": 0.46838911104977027, "learning_rate": 3.504027617951669e-05, "loss": 0.3565, "step": 357 }, { "epoch": 1.1118012422360248, "grad_norm": 0.3060846523455061, "learning_rate": 3.4982738780207134e-05, "loss": 0.357, "step": 358 }, { "epoch": 1.1149068322981366, "grad_norm": 0.4392245103993425, "learning_rate": 3.492520138089759e-05, "loss": 0.3568, "step": 359 }, { "epoch": 1.1180124223602483, "grad_norm": 0.3916417909387617, "learning_rate": 3.486766398158803e-05, "loss": 0.3446, "step": 360 }, { "epoch": 1.1211180124223603, "grad_norm": 0.3501561418628378, "learning_rate": 3.4810126582278487e-05, "loss": 0.3282, "step": 361 }, { "epoch": 1.124223602484472, "grad_norm": 0.37454862360065444, "learning_rate": 3.475258918296893e-05, "loss": 0.3543, "step": 362 }, { "epoch": 1.1273291925465838, "grad_norm": 0.2884683302507566, "learning_rate": 3.4695051783659384e-05, "loss": 0.3337, "step": 363 }, { "epoch": 1.1304347826086956, "grad_norm": 0.3254717305148171, "learning_rate": 3.4637514384349826e-05, "loss": 0.3271, "step": 364 }, { "epoch": 1.1335403726708075, "grad_norm": 0.3256237761211695, "learning_rate": 3.457997698504028e-05, "loss": 0.3298, "step": 365 }, { "epoch": 1.1366459627329193, "grad_norm": 0.30981574585542065, "learning_rate": 3.4522439585730723e-05, "loss": 0.3685, "step": 366 }, { "epoch": 1.139751552795031, "grad_norm": 0.29936602875383006, "learning_rate": 3.446490218642118e-05, "loss": 0.3524, "step": 367 }, { "epoch": 1.1428571428571428, "grad_norm": 0.2961907533597477, "learning_rate": 3.440736478711162e-05, "loss": 0.3414, "step": 368 }, { "epoch": 1.1459627329192545, "grad_norm": 0.2898757967419472, "learning_rate": 3.4349827387802076e-05, "loss": 0.3275, "step": 369 }, { "epoch": 1.1490683229813665, "grad_norm": 0.35918811245436444, "learning_rate": 3.429228998849252e-05, "loss": 0.3502, "step": 370 }, { "epoch": 1.1521739130434783, "grad_norm": 0.2775107307381104, "learning_rate": 3.4234752589182974e-05, "loss": 0.3409, "step": 371 }, { "epoch": 1.15527950310559, "grad_norm": 0.2986400287100927, "learning_rate": 3.4177215189873416e-05, "loss": 0.3312, "step": 372 }, { "epoch": 1.1583850931677018, "grad_norm": 0.33238801993955036, "learning_rate": 3.411967779056387e-05, "loss": 0.3443, "step": 373 }, { "epoch": 1.1614906832298137, "grad_norm": 0.2893594359102009, "learning_rate": 3.406214039125431e-05, "loss": 0.3332, "step": 374 }, { "epoch": 1.1645962732919255, "grad_norm": 0.32293840276637376, "learning_rate": 3.400460299194477e-05, "loss": 0.3354, "step": 375 }, { "epoch": 1.1677018633540373, "grad_norm": 0.27306219223391365, "learning_rate": 3.394706559263521e-05, "loss": 0.3209, "step": 376 }, { "epoch": 1.170807453416149, "grad_norm": 0.3342500084639322, "learning_rate": 3.3889528193325666e-05, "loss": 0.3729, "step": 377 }, { "epoch": 1.1739130434782608, "grad_norm": 0.2661392532196279, "learning_rate": 3.383199079401611e-05, "loss": 0.3383, "step": 378 }, { "epoch": 1.1770186335403727, "grad_norm": 0.3386471665658259, "learning_rate": 3.3774453394706564e-05, "loss": 0.318, "step": 379 }, { "epoch": 1.1801242236024845, "grad_norm": 0.3155587203894488, "learning_rate": 3.3716915995397006e-05, "loss": 0.3321, "step": 380 }, { "epoch": 1.1832298136645962, "grad_norm": 0.3451778286777197, "learning_rate": 3.365937859608746e-05, "loss": 0.361, "step": 381 }, { "epoch": 1.186335403726708, "grad_norm": 0.3227976748273063, "learning_rate": 3.36018411967779e-05, "loss": 0.3349, "step": 382 }, { "epoch": 1.18944099378882, "grad_norm": 0.320511150129644, "learning_rate": 3.354430379746836e-05, "loss": 0.3449, "step": 383 }, { "epoch": 1.1925465838509317, "grad_norm": 0.31955908520280063, "learning_rate": 3.34867663981588e-05, "loss": 0.3351, "step": 384 }, { "epoch": 1.1956521739130435, "grad_norm": 0.30633810764776365, "learning_rate": 3.3429228998849256e-05, "loss": 0.3275, "step": 385 }, { "epoch": 1.1987577639751552, "grad_norm": 0.41299034529321954, "learning_rate": 3.33716915995397e-05, "loss": 0.3309, "step": 386 }, { "epoch": 1.201863354037267, "grad_norm": 0.2750482509074482, "learning_rate": 3.3314154200230153e-05, "loss": 0.3398, "step": 387 }, { "epoch": 1.204968944099379, "grad_norm": 0.3081268249974453, "learning_rate": 3.3256616800920595e-05, "loss": 0.3322, "step": 388 }, { "epoch": 1.2080745341614907, "grad_norm": 0.3520674198029431, "learning_rate": 3.319907940161105e-05, "loss": 0.3663, "step": 389 }, { "epoch": 1.2111801242236024, "grad_norm": 0.32565232106148584, "learning_rate": 3.314154200230149e-05, "loss": 0.343, "step": 390 }, { "epoch": 1.2142857142857142, "grad_norm": 0.2938812397405531, "learning_rate": 3.308400460299195e-05, "loss": 0.3378, "step": 391 }, { "epoch": 1.2173913043478262, "grad_norm": 0.3141073779827861, "learning_rate": 3.302646720368239e-05, "loss": 0.3335, "step": 392 }, { "epoch": 1.220496894409938, "grad_norm": 0.3418673255721663, "learning_rate": 3.2968929804372846e-05, "loss": 0.36, "step": 393 }, { "epoch": 1.2236024844720497, "grad_norm": 0.24297614998734132, "learning_rate": 3.291139240506329e-05, "loss": 0.3387, "step": 394 }, { "epoch": 1.2267080745341614, "grad_norm": 0.3267179467149504, "learning_rate": 3.285385500575374e-05, "loss": 0.3488, "step": 395 }, { "epoch": 1.2298136645962732, "grad_norm": 0.3057560458812451, "learning_rate": 3.2796317606444185e-05, "loss": 0.3268, "step": 396 }, { "epoch": 1.2329192546583851, "grad_norm": 0.3134897896860434, "learning_rate": 3.273878020713464e-05, "loss": 0.3459, "step": 397 }, { "epoch": 1.236024844720497, "grad_norm": 0.3047314985401556, "learning_rate": 3.268124280782508e-05, "loss": 0.3291, "step": 398 }, { "epoch": 1.2391304347826086, "grad_norm": 0.31348581848675783, "learning_rate": 3.262370540851554e-05, "loss": 0.3446, "step": 399 }, { "epoch": 1.2422360248447206, "grad_norm": 0.3482328869260001, "learning_rate": 3.256616800920598e-05, "loss": 0.3561, "step": 400 }, { "epoch": 1.2453416149068324, "grad_norm": 0.31183834841742225, "learning_rate": 3.2508630609896436e-05, "loss": 0.3547, "step": 401 }, { "epoch": 1.2484472049689441, "grad_norm": 0.3061676085086065, "learning_rate": 3.245109321058688e-05, "loss": 0.3595, "step": 402 }, { "epoch": 1.2515527950310559, "grad_norm": 0.32549148328343397, "learning_rate": 3.239355581127733e-05, "loss": 0.3342, "step": 403 }, { "epoch": 1.2546583850931676, "grad_norm": 0.30445969084522895, "learning_rate": 3.233601841196778e-05, "loss": 0.3242, "step": 404 }, { "epoch": 1.2577639751552794, "grad_norm": 0.2742819629805248, "learning_rate": 3.227848101265823e-05, "loss": 0.3522, "step": 405 }, { "epoch": 1.2608695652173914, "grad_norm": 0.32581875150876105, "learning_rate": 3.222094361334868e-05, "loss": 0.3429, "step": 406 }, { "epoch": 1.263975155279503, "grad_norm": 0.2902255052156193, "learning_rate": 3.216340621403913e-05, "loss": 0.3369, "step": 407 }, { "epoch": 1.2670807453416149, "grad_norm": 0.284761382807809, "learning_rate": 3.210586881472958e-05, "loss": 0.36, "step": 408 }, { "epoch": 1.2701863354037268, "grad_norm": 0.3025552167032939, "learning_rate": 3.2048331415420025e-05, "loss": 0.3445, "step": 409 }, { "epoch": 1.2732919254658386, "grad_norm": 0.3305696776607858, "learning_rate": 3.1990794016110474e-05, "loss": 0.3463, "step": 410 }, { "epoch": 1.2763975155279503, "grad_norm": 0.3077574972549534, "learning_rate": 3.193325661680092e-05, "loss": 0.3594, "step": 411 }, { "epoch": 1.279503105590062, "grad_norm": 0.27442755120830326, "learning_rate": 3.187571921749137e-05, "loss": 0.3362, "step": 412 }, { "epoch": 1.2826086956521738, "grad_norm": 0.3038026451556641, "learning_rate": 3.181818181818182e-05, "loss": 0.3353, "step": 413 }, { "epoch": 1.2857142857142856, "grad_norm": 0.2758156658151106, "learning_rate": 3.176064441887227e-05, "loss": 0.337, "step": 414 }, { "epoch": 1.2888198757763976, "grad_norm": 0.26613400787975794, "learning_rate": 3.170310701956272e-05, "loss": 0.3347, "step": 415 }, { "epoch": 1.2919254658385093, "grad_norm": 0.30006243856469433, "learning_rate": 3.1645569620253167e-05, "loss": 0.3575, "step": 416 }, { "epoch": 1.295031055900621, "grad_norm": 0.32225619437705794, "learning_rate": 3.1588032220943615e-05, "loss": 0.3404, "step": 417 }, { "epoch": 1.298136645962733, "grad_norm": 0.2933513705620206, "learning_rate": 3.1530494821634064e-05, "loss": 0.3367, "step": 418 }, { "epoch": 1.3012422360248448, "grad_norm": 0.34221232972865906, "learning_rate": 3.147295742232451e-05, "loss": 0.3507, "step": 419 }, { "epoch": 1.3043478260869565, "grad_norm": 0.3207028944029123, "learning_rate": 3.141542002301496e-05, "loss": 0.339, "step": 420 }, { "epoch": 1.3074534161490683, "grad_norm": 0.28691874649916205, "learning_rate": 3.135788262370541e-05, "loss": 0.3158, "step": 421 }, { "epoch": 1.31055900621118, "grad_norm": 0.32572558244440175, "learning_rate": 3.130034522439586e-05, "loss": 0.3425, "step": 422 }, { "epoch": 1.3136645962732918, "grad_norm": 0.2840181269830042, "learning_rate": 3.124280782508631e-05, "loss": 0.3446, "step": 423 }, { "epoch": 1.3167701863354038, "grad_norm": 0.314090935226993, "learning_rate": 3.1185270425776756e-05, "loss": 0.3315, "step": 424 }, { "epoch": 1.3198757763975155, "grad_norm": 0.31197822717588264, "learning_rate": 3.1127733026467205e-05, "loss": 0.3443, "step": 425 }, { "epoch": 1.3229813664596273, "grad_norm": 0.2864210002126174, "learning_rate": 3.1070195627157654e-05, "loss": 0.3375, "step": 426 }, { "epoch": 1.3260869565217392, "grad_norm": 0.25519688185589984, "learning_rate": 3.10126582278481e-05, "loss": 0.3384, "step": 427 }, { "epoch": 1.329192546583851, "grad_norm": 0.2744740349540228, "learning_rate": 3.095512082853855e-05, "loss": 0.3383, "step": 428 }, { "epoch": 1.3322981366459627, "grad_norm": 0.2607087924929348, "learning_rate": 3.0897583429229e-05, "loss": 0.3555, "step": 429 }, { "epoch": 1.3354037267080745, "grad_norm": 0.27684287170228183, "learning_rate": 3.084004602991945e-05, "loss": 0.3261, "step": 430 }, { "epoch": 1.3385093167701863, "grad_norm": 0.3095550998483706, "learning_rate": 3.07825086306099e-05, "loss": 0.3512, "step": 431 }, { "epoch": 1.341614906832298, "grad_norm": 0.25842001969735057, "learning_rate": 3.0724971231300346e-05, "loss": 0.3296, "step": 432 }, { "epoch": 1.34472049689441, "grad_norm": 0.30589083199518, "learning_rate": 3.0667433831990795e-05, "loss": 0.3329, "step": 433 }, { "epoch": 1.3478260869565217, "grad_norm": 0.2825831249071207, "learning_rate": 3.0609896432681244e-05, "loss": 0.3403, "step": 434 }, { "epoch": 1.3509316770186335, "grad_norm": 0.2849649494187899, "learning_rate": 3.055235903337169e-05, "loss": 0.3329, "step": 435 }, { "epoch": 1.3540372670807455, "grad_norm": 0.31227992790240827, "learning_rate": 3.0494821634062144e-05, "loss": 0.3402, "step": 436 }, { "epoch": 1.3571428571428572, "grad_norm": 0.28830226140066545, "learning_rate": 3.043728423475259e-05, "loss": 0.3343, "step": 437 }, { "epoch": 1.360248447204969, "grad_norm": 0.2920144019191934, "learning_rate": 3.0379746835443042e-05, "loss": 0.3262, "step": 438 }, { "epoch": 1.3633540372670807, "grad_norm": 0.25016168615415485, "learning_rate": 3.0322209436133487e-05, "loss": 0.3394, "step": 439 }, { "epoch": 1.3664596273291925, "grad_norm": 0.30261081735444717, "learning_rate": 3.026467203682394e-05, "loss": 0.3462, "step": 440 }, { "epoch": 1.3695652173913042, "grad_norm": 0.2881616381341832, "learning_rate": 3.0207134637514385e-05, "loss": 0.3318, "step": 441 }, { "epoch": 1.3726708074534162, "grad_norm": 0.2880320213397424, "learning_rate": 3.0149597238204837e-05, "loss": 0.3467, "step": 442 }, { "epoch": 1.375776397515528, "grad_norm": 0.27020350890941985, "learning_rate": 3.0092059838895282e-05, "loss": 0.332, "step": 443 }, { "epoch": 1.3788819875776397, "grad_norm": 0.3103789819064371, "learning_rate": 3.0034522439585734e-05, "loss": 0.3557, "step": 444 }, { "epoch": 1.3819875776397517, "grad_norm": 0.2689229148068124, "learning_rate": 2.997698504027618e-05, "loss": 0.3439, "step": 445 }, { "epoch": 1.3850931677018634, "grad_norm": 0.3284067395525679, "learning_rate": 2.991944764096663e-05, "loss": 0.3361, "step": 446 }, { "epoch": 1.3881987577639752, "grad_norm": 0.31384093062312546, "learning_rate": 2.9861910241657077e-05, "loss": 0.3397, "step": 447 }, { "epoch": 1.391304347826087, "grad_norm": 0.2699369577142723, "learning_rate": 2.980437284234753e-05, "loss": 0.3586, "step": 448 }, { "epoch": 1.3944099378881987, "grad_norm": 0.3036288617772187, "learning_rate": 2.9746835443037974e-05, "loss": 0.3565, "step": 449 }, { "epoch": 1.3975155279503104, "grad_norm": 0.3124807578288405, "learning_rate": 2.9689298043728427e-05, "loss": 0.3419, "step": 450 }, { "epoch": 1.4006211180124224, "grad_norm": 0.2979650176094835, "learning_rate": 2.9631760644418872e-05, "loss": 0.348, "step": 451 }, { "epoch": 1.4037267080745341, "grad_norm": 0.2681659506306783, "learning_rate": 2.9574223245109324e-05, "loss": 0.3299, "step": 452 }, { "epoch": 1.406832298136646, "grad_norm": 0.32598633427460977, "learning_rate": 2.951668584579977e-05, "loss": 0.3386, "step": 453 }, { "epoch": 1.4099378881987579, "grad_norm": 0.3247280401348239, "learning_rate": 2.945914844649022e-05, "loss": 0.3616, "step": 454 }, { "epoch": 1.4130434782608696, "grad_norm": 0.2674177394891557, "learning_rate": 2.940161104718067e-05, "loss": 0.3477, "step": 455 }, { "epoch": 1.4161490683229814, "grad_norm": 0.31284120066769544, "learning_rate": 2.934407364787112e-05, "loss": 0.3359, "step": 456 }, { "epoch": 1.4192546583850931, "grad_norm": 0.29650681379778476, "learning_rate": 2.9286536248561568e-05, "loss": 0.3534, "step": 457 }, { "epoch": 1.4223602484472049, "grad_norm": 0.28958329200728805, "learning_rate": 2.9228998849252016e-05, "loss": 0.3414, "step": 458 }, { "epoch": 1.4254658385093169, "grad_norm": 0.3164618567976454, "learning_rate": 2.9171461449942465e-05, "loss": 0.351, "step": 459 }, { "epoch": 1.4285714285714286, "grad_norm": 0.32604862143805774, "learning_rate": 2.9113924050632914e-05, "loss": 0.3185, "step": 460 }, { "epoch": 1.4316770186335404, "grad_norm": 0.2371091150488046, "learning_rate": 2.9056386651323363e-05, "loss": 0.3268, "step": 461 }, { "epoch": 1.434782608695652, "grad_norm": 0.28836250577098943, "learning_rate": 2.899884925201381e-05, "loss": 0.3203, "step": 462 }, { "epoch": 1.437888198757764, "grad_norm": 0.29935943849859553, "learning_rate": 2.894131185270426e-05, "loss": 0.3419, "step": 463 }, { "epoch": 1.4409937888198758, "grad_norm": 0.2678768364941078, "learning_rate": 2.888377445339471e-05, "loss": 0.3423, "step": 464 }, { "epoch": 1.4440993788819876, "grad_norm": 0.3004413989905001, "learning_rate": 2.8826237054085157e-05, "loss": 0.3448, "step": 465 }, { "epoch": 1.4472049689440993, "grad_norm": 0.3437138642713499, "learning_rate": 2.8768699654775606e-05, "loss": 0.3624, "step": 466 }, { "epoch": 1.450310559006211, "grad_norm": 0.27833054674558505, "learning_rate": 2.8711162255466055e-05, "loss": 0.3559, "step": 467 }, { "epoch": 1.453416149068323, "grad_norm": 0.30426248134832284, "learning_rate": 2.8653624856156504e-05, "loss": 0.3409, "step": 468 }, { "epoch": 1.4565217391304348, "grad_norm": 0.2884530747421473, "learning_rate": 2.8596087456846952e-05, "loss": 0.3543, "step": 469 }, { "epoch": 1.4596273291925466, "grad_norm": 0.26674718010863235, "learning_rate": 2.85385500575374e-05, "loss": 0.3352, "step": 470 }, { "epoch": 1.4627329192546583, "grad_norm": 0.24962947417256104, "learning_rate": 2.848101265822785e-05, "loss": 0.3331, "step": 471 }, { "epoch": 1.4658385093167703, "grad_norm": 0.24321872392892266, "learning_rate": 2.84234752589183e-05, "loss": 0.3346, "step": 472 }, { "epoch": 1.468944099378882, "grad_norm": 0.25806201920649635, "learning_rate": 2.8365937859608747e-05, "loss": 0.3433, "step": 473 }, { "epoch": 1.4720496894409938, "grad_norm": 0.260107860168702, "learning_rate": 2.8308400460299196e-05, "loss": 0.3518, "step": 474 }, { "epoch": 1.4751552795031055, "grad_norm": 0.27151545722001336, "learning_rate": 2.8250863060989645e-05, "loss": 0.3222, "step": 475 }, { "epoch": 1.4782608695652173, "grad_norm": 0.2699064437677885, "learning_rate": 2.8193325661680093e-05, "loss": 0.3408, "step": 476 }, { "epoch": 1.4813664596273293, "grad_norm": 0.2534825847738341, "learning_rate": 2.8135788262370542e-05, "loss": 0.3355, "step": 477 }, { "epoch": 1.484472049689441, "grad_norm": 0.2596248018497863, "learning_rate": 2.807825086306099e-05, "loss": 0.3317, "step": 478 }, { "epoch": 1.4875776397515528, "grad_norm": 0.22547358749920884, "learning_rate": 2.802071346375144e-05, "loss": 0.3172, "step": 479 }, { "epoch": 1.4906832298136645, "grad_norm": 0.28156958226578077, "learning_rate": 2.796317606444189e-05, "loss": 0.3476, "step": 480 }, { "epoch": 1.4937888198757765, "grad_norm": 0.26615285376164327, "learning_rate": 2.7905638665132337e-05, "loss": 0.3489, "step": 481 }, { "epoch": 1.4968944099378882, "grad_norm": 0.246150116031317, "learning_rate": 2.7848101265822786e-05, "loss": 0.3464, "step": 482 }, { "epoch": 1.5, "grad_norm": 0.25268560596400597, "learning_rate": 2.7790563866513235e-05, "loss": 0.3391, "step": 483 }, { "epoch": 1.5031055900621118, "grad_norm": 0.2946205590355613, "learning_rate": 2.7733026467203683e-05, "loss": 0.3541, "step": 484 }, { "epoch": 1.5062111801242235, "grad_norm": 0.2750424223242439, "learning_rate": 2.7675489067894132e-05, "loss": 0.3276, "step": 485 }, { "epoch": 1.5093167701863353, "grad_norm": 0.28954598608369275, "learning_rate": 2.761795166858458e-05, "loss": 0.3554, "step": 486 }, { "epoch": 1.5124223602484472, "grad_norm": 0.29461626033953947, "learning_rate": 2.756041426927503e-05, "loss": 0.3293, "step": 487 }, { "epoch": 1.515527950310559, "grad_norm": 0.2407514728215296, "learning_rate": 2.7502876869965478e-05, "loss": 0.3263, "step": 488 }, { "epoch": 1.518633540372671, "grad_norm": 0.24475815135162626, "learning_rate": 2.7445339470655927e-05, "loss": 0.3423, "step": 489 }, { "epoch": 1.5217391304347827, "grad_norm": 0.32665261682040186, "learning_rate": 2.7387802071346376e-05, "loss": 0.3333, "step": 490 }, { "epoch": 1.5248447204968945, "grad_norm": 0.2552003566164109, "learning_rate": 2.7330264672036824e-05, "loss": 0.3466, "step": 491 }, { "epoch": 1.5279503105590062, "grad_norm": 0.2871320623730171, "learning_rate": 2.7272727272727273e-05, "loss": 0.3447, "step": 492 }, { "epoch": 1.531055900621118, "grad_norm": 0.2440639273175817, "learning_rate": 2.7215189873417722e-05, "loss": 0.338, "step": 493 }, { "epoch": 1.5341614906832297, "grad_norm": 0.24115465120440344, "learning_rate": 2.715765247410817e-05, "loss": 0.3245, "step": 494 }, { "epoch": 1.5372670807453415, "grad_norm": 0.2781064697786101, "learning_rate": 2.7100115074798623e-05, "loss": 0.3637, "step": 495 }, { "epoch": 1.5403726708074534, "grad_norm": 0.27237179201858924, "learning_rate": 2.7042577675489068e-05, "loss": 0.353, "step": 496 }, { "epoch": 1.5434782608695652, "grad_norm": 0.3018049050362612, "learning_rate": 2.698504027617952e-05, "loss": 0.33, "step": 497 }, { "epoch": 1.5465838509316772, "grad_norm": 0.319532872255584, "learning_rate": 2.6927502876869965e-05, "loss": 0.341, "step": 498 }, { "epoch": 1.549689440993789, "grad_norm": 0.23318500669833875, "learning_rate": 2.6869965477560418e-05, "loss": 0.3324, "step": 499 }, { "epoch": 1.5527950310559007, "grad_norm": 0.3108509837550317, "learning_rate": 2.6812428078250863e-05, "loss": 0.3484, "step": 500 }, { "epoch": 1.5559006211180124, "grad_norm": 0.27432952612163103, "learning_rate": 2.6754890678941315e-05, "loss": 0.3301, "step": 501 }, { "epoch": 1.5590062111801242, "grad_norm": 0.2474022932813197, "learning_rate": 2.669735327963176e-05, "loss": 0.3335, "step": 502 }, { "epoch": 1.562111801242236, "grad_norm": 0.25918516414740417, "learning_rate": 2.6639815880322212e-05, "loss": 0.3531, "step": 503 }, { "epoch": 1.5652173913043477, "grad_norm": 0.28631487498944946, "learning_rate": 2.6582278481012658e-05, "loss": 0.3461, "step": 504 }, { "epoch": 1.5683229813664596, "grad_norm": 0.2541239393514543, "learning_rate": 2.652474108170311e-05, "loss": 0.3406, "step": 505 }, { "epoch": 1.5714285714285714, "grad_norm": 0.26793962508861174, "learning_rate": 2.646720368239356e-05, "loss": 0.3333, "step": 506 }, { "epoch": 1.5745341614906834, "grad_norm": 0.30945895076026697, "learning_rate": 2.6409666283084007e-05, "loss": 0.3429, "step": 507 }, { "epoch": 1.5776397515527951, "grad_norm": 0.2625598679112342, "learning_rate": 2.6352128883774456e-05, "loss": 0.3475, "step": 508 }, { "epoch": 1.5807453416149069, "grad_norm": 0.3340128053196445, "learning_rate": 2.6294591484464905e-05, "loss": 0.3548, "step": 509 }, { "epoch": 1.5838509316770186, "grad_norm": 0.33090740840028027, "learning_rate": 2.6237054085155354e-05, "loss": 0.3482, "step": 510 }, { "epoch": 1.5869565217391304, "grad_norm": 0.25904205831808136, "learning_rate": 2.6179516685845802e-05, "loss": 0.3322, "step": 511 }, { "epoch": 1.5900621118012421, "grad_norm": 0.3458888736647229, "learning_rate": 2.612197928653625e-05, "loss": 0.3466, "step": 512 }, { "epoch": 1.5931677018633539, "grad_norm": 0.26139554234188184, "learning_rate": 2.60644418872267e-05, "loss": 0.3357, "step": 513 }, { "epoch": 1.5962732919254659, "grad_norm": 0.24797420076401436, "learning_rate": 2.600690448791715e-05, "loss": 0.3263, "step": 514 }, { "epoch": 1.5993788819875776, "grad_norm": 0.25127963694679545, "learning_rate": 2.5949367088607597e-05, "loss": 0.3541, "step": 515 }, { "epoch": 1.6024844720496896, "grad_norm": 0.24350107098267543, "learning_rate": 2.5891829689298046e-05, "loss": 0.3332, "step": 516 }, { "epoch": 1.6055900621118013, "grad_norm": 0.2597186201230917, "learning_rate": 2.5834292289988495e-05, "loss": 0.3357, "step": 517 }, { "epoch": 1.608695652173913, "grad_norm": 0.2553977260875351, "learning_rate": 2.5776754890678943e-05, "loss": 0.3381, "step": 518 }, { "epoch": 1.6118012422360248, "grad_norm": 0.2495485503111441, "learning_rate": 2.5719217491369392e-05, "loss": 0.3489, "step": 519 }, { "epoch": 1.6149068322981366, "grad_norm": 0.2826237704718821, "learning_rate": 2.566168009205984e-05, "loss": 0.3269, "step": 520 }, { "epoch": 1.6180124223602483, "grad_norm": 0.2907559187980417, "learning_rate": 2.560414269275029e-05, "loss": 0.353, "step": 521 }, { "epoch": 1.62111801242236, "grad_norm": 0.30078662752184515, "learning_rate": 2.5546605293440738e-05, "loss": 0.3344, "step": 522 }, { "epoch": 1.624223602484472, "grad_norm": 0.2494274026603714, "learning_rate": 2.5489067894131187e-05, "loss": 0.3262, "step": 523 }, { "epoch": 1.6273291925465838, "grad_norm": 0.22856587280801138, "learning_rate": 2.5431530494821636e-05, "loss": 0.3316, "step": 524 }, { "epoch": 1.6304347826086958, "grad_norm": 0.24524446266248454, "learning_rate": 2.5373993095512084e-05, "loss": 0.3254, "step": 525 }, { "epoch": 1.6335403726708075, "grad_norm": 0.2781145066258604, "learning_rate": 2.5316455696202533e-05, "loss": 0.3343, "step": 526 }, { "epoch": 1.6366459627329193, "grad_norm": 0.24971582793985952, "learning_rate": 2.5258918296892982e-05, "loss": 0.3423, "step": 527 }, { "epoch": 1.639751552795031, "grad_norm": 0.2961483358525156, "learning_rate": 2.520138089758343e-05, "loss": 0.3554, "step": 528 }, { "epoch": 1.6428571428571428, "grad_norm": 0.30349368090110823, "learning_rate": 2.514384349827388e-05, "loss": 0.3563, "step": 529 }, { "epoch": 1.6459627329192545, "grad_norm": 0.28292757074394537, "learning_rate": 2.5086306098964328e-05, "loss": 0.352, "step": 530 }, { "epoch": 1.6490683229813663, "grad_norm": 0.25778656185495347, "learning_rate": 2.5028768699654777e-05, "loss": 0.3486, "step": 531 }, { "epoch": 1.6521739130434783, "grad_norm": 0.32420346337090605, "learning_rate": 2.4971231300345226e-05, "loss": 0.3497, "step": 532 }, { "epoch": 1.65527950310559, "grad_norm": 0.24803469539845557, "learning_rate": 2.4913693901035674e-05, "loss": 0.3325, "step": 533 }, { "epoch": 1.658385093167702, "grad_norm": 0.23193714998127715, "learning_rate": 2.4856156501726123e-05, "loss": 0.3244, "step": 534 }, { "epoch": 1.6614906832298137, "grad_norm": 0.31410082505061837, "learning_rate": 2.4798619102416572e-05, "loss": 0.3295, "step": 535 }, { "epoch": 1.6645962732919255, "grad_norm": 0.29805963194310403, "learning_rate": 2.474108170310702e-05, "loss": 0.3576, "step": 536 }, { "epoch": 1.6677018633540373, "grad_norm": 0.2773254453129355, "learning_rate": 2.468354430379747e-05, "loss": 0.3382, "step": 537 }, { "epoch": 1.670807453416149, "grad_norm": 0.32678020135127306, "learning_rate": 2.4626006904487918e-05, "loss": 0.3196, "step": 538 }, { "epoch": 1.6739130434782608, "grad_norm": 0.3166277691971712, "learning_rate": 2.4568469505178367e-05, "loss": 0.3567, "step": 539 }, { "epoch": 1.6770186335403725, "grad_norm": 0.28823972531727493, "learning_rate": 2.4510932105868815e-05, "loss": 0.3303, "step": 540 }, { "epoch": 1.6801242236024845, "grad_norm": 0.31416636195922193, "learning_rate": 2.4453394706559264e-05, "loss": 0.3468, "step": 541 }, { "epoch": 1.6832298136645962, "grad_norm": 0.29389175839717274, "learning_rate": 2.4395857307249713e-05, "loss": 0.3334, "step": 542 }, { "epoch": 1.6863354037267082, "grad_norm": 0.2574868658425901, "learning_rate": 2.433831990794016e-05, "loss": 0.3459, "step": 543 }, { "epoch": 1.68944099378882, "grad_norm": 0.43013005229440787, "learning_rate": 2.428078250863061e-05, "loss": 0.3663, "step": 544 }, { "epoch": 1.6925465838509317, "grad_norm": 0.29719384149686173, "learning_rate": 2.422324510932106e-05, "loss": 0.3227, "step": 545 }, { "epoch": 1.6956521739130435, "grad_norm": 0.2630824870951196, "learning_rate": 2.4165707710011508e-05, "loss": 0.3334, "step": 546 }, { "epoch": 1.6987577639751552, "grad_norm": 0.262646615576403, "learning_rate": 2.4108170310701956e-05, "loss": 0.3289, "step": 547 }, { "epoch": 1.701863354037267, "grad_norm": 0.29464184604515603, "learning_rate": 2.4050632911392405e-05, "loss": 0.3606, "step": 548 }, { "epoch": 1.704968944099379, "grad_norm": 0.270420959511805, "learning_rate": 2.3993095512082854e-05, "loss": 0.3344, "step": 549 }, { "epoch": 1.7080745341614907, "grad_norm": 0.25692484125212045, "learning_rate": 2.3935558112773303e-05, "loss": 0.3487, "step": 550 }, { "epoch": 1.7111801242236024, "grad_norm": 0.2438708000844588, "learning_rate": 2.387802071346375e-05, "loss": 0.3368, "step": 551 }, { "epoch": 1.7142857142857144, "grad_norm": 0.3013744640384419, "learning_rate": 2.38204833141542e-05, "loss": 0.3374, "step": 552 }, { "epoch": 1.7173913043478262, "grad_norm": 0.25432846986941376, "learning_rate": 2.376294591484465e-05, "loss": 0.3229, "step": 553 }, { "epoch": 1.720496894409938, "grad_norm": 0.25904590947672523, "learning_rate": 2.3705408515535098e-05, "loss": 0.3446, "step": 554 }, { "epoch": 1.7236024844720497, "grad_norm": 0.30606145603760704, "learning_rate": 2.3647871116225546e-05, "loss": 0.3341, "step": 555 }, { "epoch": 1.7267080745341614, "grad_norm": 0.29538170031014566, "learning_rate": 2.3590333716915995e-05, "loss": 0.3289, "step": 556 }, { "epoch": 1.7298136645962732, "grad_norm": 0.27134852683063904, "learning_rate": 2.3532796317606444e-05, "loss": 0.3547, "step": 557 }, { "epoch": 1.7329192546583851, "grad_norm": 0.2825681002780479, "learning_rate": 2.3475258918296892e-05, "loss": 0.3379, "step": 558 }, { "epoch": 1.736024844720497, "grad_norm": 0.2540853936510712, "learning_rate": 2.341772151898734e-05, "loss": 0.3382, "step": 559 }, { "epoch": 1.7391304347826086, "grad_norm": 0.23898673149156113, "learning_rate": 2.336018411967779e-05, "loss": 0.3204, "step": 560 }, { "epoch": 1.7422360248447206, "grad_norm": 0.28690037448179756, "learning_rate": 2.330264672036824e-05, "loss": 0.3475, "step": 561 }, { "epoch": 1.7453416149068324, "grad_norm": 0.2517436769783244, "learning_rate": 2.3245109321058687e-05, "loss": 0.3336, "step": 562 }, { "epoch": 1.7484472049689441, "grad_norm": 0.26052794049930406, "learning_rate": 2.3187571921749136e-05, "loss": 0.3659, "step": 563 }, { "epoch": 1.7515527950310559, "grad_norm": 0.2520454393087574, "learning_rate": 2.3130034522439585e-05, "loss": 0.341, "step": 564 }, { "epoch": 1.7546583850931676, "grad_norm": 0.24469475054712242, "learning_rate": 2.3072497123130034e-05, "loss": 0.3385, "step": 565 }, { "epoch": 1.7577639751552794, "grad_norm": 0.2688171235493825, "learning_rate": 2.3014959723820482e-05, "loss": 0.3194, "step": 566 }, { "epoch": 1.7608695652173914, "grad_norm": 0.24660650779589638, "learning_rate": 2.295742232451093e-05, "loss": 0.3414, "step": 567 }, { "epoch": 1.763975155279503, "grad_norm": 0.24074948906029303, "learning_rate": 2.289988492520138e-05, "loss": 0.3487, "step": 568 }, { "epoch": 1.7670807453416149, "grad_norm": 0.2683374003415654, "learning_rate": 2.284234752589183e-05, "loss": 0.3324, "step": 569 }, { "epoch": 1.7701863354037268, "grad_norm": 0.2615920960522321, "learning_rate": 2.278481012658228e-05, "loss": 0.3609, "step": 570 }, { "epoch": 1.7732919254658386, "grad_norm": 0.24217423401661245, "learning_rate": 2.272727272727273e-05, "loss": 0.3642, "step": 571 }, { "epoch": 1.7763975155279503, "grad_norm": 0.3146547539059143, "learning_rate": 2.2669735327963178e-05, "loss": 0.3333, "step": 572 }, { "epoch": 1.779503105590062, "grad_norm": 0.2562957388358894, "learning_rate": 2.2612197928653627e-05, "loss": 0.3446, "step": 573 }, { "epoch": 1.7826086956521738, "grad_norm": 0.2514732345893343, "learning_rate": 2.2554660529344075e-05, "loss": 0.3313, "step": 574 }, { "epoch": 1.7857142857142856, "grad_norm": 0.27268825204355784, "learning_rate": 2.2497123130034524e-05, "loss": 0.3431, "step": 575 }, { "epoch": 1.7888198757763976, "grad_norm": 0.24683018483720148, "learning_rate": 2.2439585730724973e-05, "loss": 0.3264, "step": 576 }, { "epoch": 1.7919254658385093, "grad_norm": 0.2511343392474156, "learning_rate": 2.238204833141542e-05, "loss": 0.3231, "step": 577 }, { "epoch": 1.795031055900621, "grad_norm": 0.29263014098541856, "learning_rate": 2.232451093210587e-05, "loss": 0.3425, "step": 578 }, { "epoch": 1.798136645962733, "grad_norm": 0.26439452081008274, "learning_rate": 2.226697353279632e-05, "loss": 0.3404, "step": 579 }, { "epoch": 1.8012422360248448, "grad_norm": 0.26624397225893237, "learning_rate": 2.2209436133486768e-05, "loss": 0.3521, "step": 580 }, { "epoch": 1.8043478260869565, "grad_norm": 0.27231006655087864, "learning_rate": 2.2151898734177217e-05, "loss": 0.3499, "step": 581 }, { "epoch": 1.8074534161490683, "grad_norm": 0.2717715300694685, "learning_rate": 2.2094361334867665e-05, "loss": 0.3488, "step": 582 }, { "epoch": 1.81055900621118, "grad_norm": 0.2555398572654679, "learning_rate": 2.2036823935558114e-05, "loss": 0.3484, "step": 583 }, { "epoch": 1.8136645962732918, "grad_norm": 0.27285873888872886, "learning_rate": 2.1979286536248563e-05, "loss": 0.3301, "step": 584 }, { "epoch": 1.8167701863354038, "grad_norm": 0.25210730048319585, "learning_rate": 2.192174913693901e-05, "loss": 0.3424, "step": 585 }, { "epoch": 1.8198757763975155, "grad_norm": 0.26842055467218623, "learning_rate": 2.186421173762946e-05, "loss": 0.3408, "step": 586 }, { "epoch": 1.8229813664596275, "grad_norm": 0.3208770216327945, "learning_rate": 2.180667433831991e-05, "loss": 0.3312, "step": 587 }, { "epoch": 1.8260869565217392, "grad_norm": 0.2559541162545561, "learning_rate": 2.1749136939010358e-05, "loss": 0.36, "step": 588 }, { "epoch": 1.829192546583851, "grad_norm": 0.32134394732411636, "learning_rate": 2.1691599539700806e-05, "loss": 0.3394, "step": 589 }, { "epoch": 1.8322981366459627, "grad_norm": 0.2708594663810051, "learning_rate": 2.1634062140391255e-05, "loss": 0.3431, "step": 590 }, { "epoch": 1.8354037267080745, "grad_norm": 0.3010404719152366, "learning_rate": 2.1576524741081704e-05, "loss": 0.3417, "step": 591 }, { "epoch": 1.8385093167701863, "grad_norm": 0.3070106173244936, "learning_rate": 2.1518987341772153e-05, "loss": 0.3388, "step": 592 }, { "epoch": 1.841614906832298, "grad_norm": 0.24023699838734106, "learning_rate": 2.14614499424626e-05, "loss": 0.359, "step": 593 }, { "epoch": 1.84472049689441, "grad_norm": 0.27420152661967667, "learning_rate": 2.140391254315305e-05, "loss": 0.3262, "step": 594 }, { "epoch": 1.8478260869565217, "grad_norm": 0.3128089706224423, "learning_rate": 2.13463751438435e-05, "loss": 0.3284, "step": 595 }, { "epoch": 1.8509316770186337, "grad_norm": 0.26784524761567324, "learning_rate": 2.1288837744533947e-05, "loss": 0.3337, "step": 596 }, { "epoch": 1.8540372670807455, "grad_norm": 0.2897924996984458, "learning_rate": 2.1231300345224396e-05, "loss": 0.3325, "step": 597 }, { "epoch": 1.8571428571428572, "grad_norm": 0.25009521464769496, "learning_rate": 2.1173762945914845e-05, "loss": 0.3457, "step": 598 }, { "epoch": 1.860248447204969, "grad_norm": 0.3055325339631166, "learning_rate": 2.1116225546605294e-05, "loss": 0.3506, "step": 599 }, { "epoch": 1.8633540372670807, "grad_norm": 0.28613812992385934, "learning_rate": 2.1058688147295742e-05, "loss": 0.3609, "step": 600 }, { "epoch": 1.8664596273291925, "grad_norm": 0.2705173567449075, "learning_rate": 2.100115074798619e-05, "loss": 0.3259, "step": 601 }, { "epoch": 1.8695652173913042, "grad_norm": 0.3180478776075474, "learning_rate": 2.094361334867664e-05, "loss": 0.3459, "step": 602 }, { "epoch": 1.8726708074534162, "grad_norm": 0.2667088182720578, "learning_rate": 2.088607594936709e-05, "loss": 0.3415, "step": 603 }, { "epoch": 1.875776397515528, "grad_norm": 0.2721287511996052, "learning_rate": 2.0828538550057537e-05, "loss": 0.3295, "step": 604 }, { "epoch": 1.87888198757764, "grad_norm": 0.24692799289830528, "learning_rate": 2.0771001150747986e-05, "loss": 0.3389, "step": 605 }, { "epoch": 1.8819875776397517, "grad_norm": 0.25562500894154333, "learning_rate": 2.0713463751438435e-05, "loss": 0.3319, "step": 606 }, { "epoch": 1.8850931677018634, "grad_norm": 0.2788963760074411, "learning_rate": 2.0655926352128883e-05, "loss": 0.3285, "step": 607 }, { "epoch": 1.8881987577639752, "grad_norm": 0.24657052891375197, "learning_rate": 2.0598388952819332e-05, "loss": 0.3457, "step": 608 }, { "epoch": 1.891304347826087, "grad_norm": 0.32309129817645427, "learning_rate": 2.054085155350978e-05, "loss": 0.3403, "step": 609 }, { "epoch": 1.8944099378881987, "grad_norm": 0.30175425766070024, "learning_rate": 2.048331415420023e-05, "loss": 0.3471, "step": 610 }, { "epoch": 1.8975155279503104, "grad_norm": 0.26841376007608464, "learning_rate": 2.042577675489068e-05, "loss": 0.3375, "step": 611 }, { "epoch": 1.9006211180124224, "grad_norm": 0.262662943207609, "learning_rate": 2.0368239355581127e-05, "loss": 0.3323, "step": 612 }, { "epoch": 1.9037267080745341, "grad_norm": 0.2701013469116013, "learning_rate": 2.0310701956271576e-05, "loss": 0.3421, "step": 613 }, { "epoch": 1.9068322981366461, "grad_norm": 0.2714596873719603, "learning_rate": 2.0253164556962025e-05, "loss": 0.3459, "step": 614 }, { "epoch": 1.9099378881987579, "grad_norm": 0.27588772682551244, "learning_rate": 2.0195627157652473e-05, "loss": 0.3393, "step": 615 }, { "epoch": 1.9130434782608696, "grad_norm": 0.2599899065726882, "learning_rate": 2.0138089758342922e-05, "loss": 0.3272, "step": 616 }, { "epoch": 1.9161490683229814, "grad_norm": 0.29859390134967695, "learning_rate": 2.008055235903337e-05, "loss": 0.3406, "step": 617 }, { "epoch": 1.9192546583850931, "grad_norm": 0.2506363801804046, "learning_rate": 2.002301495972382e-05, "loss": 0.3442, "step": 618 }, { "epoch": 1.9223602484472049, "grad_norm": 0.27643958694894183, "learning_rate": 1.9965477560414268e-05, "loss": 0.3266, "step": 619 }, { "epoch": 1.9254658385093166, "grad_norm": 0.24433788612177662, "learning_rate": 1.9907940161104717e-05, "loss": 0.3282, "step": 620 }, { "epoch": 1.9285714285714286, "grad_norm": 0.23745008386988042, "learning_rate": 1.9850402761795166e-05, "loss": 0.3362, "step": 621 }, { "epoch": 1.9316770186335404, "grad_norm": 0.26121859150272697, "learning_rate": 1.9792865362485614e-05, "loss": 0.3286, "step": 622 }, { "epoch": 1.9347826086956523, "grad_norm": 0.2593462473033886, "learning_rate": 1.9735327963176063e-05, "loss": 0.3277, "step": 623 }, { "epoch": 1.937888198757764, "grad_norm": 0.251734596039316, "learning_rate": 1.9677790563866512e-05, "loss": 0.3243, "step": 624 }, { "epoch": 1.9409937888198758, "grad_norm": 0.2796993503020773, "learning_rate": 1.962025316455696e-05, "loss": 0.341, "step": 625 }, { "epoch": 1.9440993788819876, "grad_norm": 0.24405119567007771, "learning_rate": 1.956271576524741e-05, "loss": 0.3263, "step": 626 }, { "epoch": 1.9472049689440993, "grad_norm": 0.2551104705286801, "learning_rate": 1.9505178365937858e-05, "loss": 0.3273, "step": 627 }, { "epoch": 1.950310559006211, "grad_norm": 0.30160867369839583, "learning_rate": 1.9447640966628307e-05, "loss": 0.3461, "step": 628 }, { "epoch": 1.9534161490683228, "grad_norm": 0.2307083280109175, "learning_rate": 1.9390103567318755e-05, "loss": 0.3195, "step": 629 }, { "epoch": 1.9565217391304348, "grad_norm": 0.27856475014068843, "learning_rate": 1.9332566168009204e-05, "loss": 0.3534, "step": 630 }, { "epoch": 1.9596273291925466, "grad_norm": 0.2422123663936176, "learning_rate": 1.9275028768699653e-05, "loss": 0.3444, "step": 631 }, { "epoch": 1.9627329192546585, "grad_norm": 0.2508344560707552, "learning_rate": 1.92174913693901e-05, "loss": 0.3316, "step": 632 }, { "epoch": 1.9658385093167703, "grad_norm": 0.25457679605852573, "learning_rate": 1.915995397008055e-05, "loss": 0.3511, "step": 633 }, { "epoch": 1.968944099378882, "grad_norm": 0.255448027513179, "learning_rate": 1.9102416570771002e-05, "loss": 0.3417, "step": 634 }, { "epoch": 1.9720496894409938, "grad_norm": 0.2655225587813165, "learning_rate": 1.904487917146145e-05, "loss": 0.3364, "step": 635 }, { "epoch": 1.9751552795031055, "grad_norm": 0.2644532896395622, "learning_rate": 1.89873417721519e-05, "loss": 0.3439, "step": 636 }, { "epoch": 1.9782608695652173, "grad_norm": 0.25431765900047304, "learning_rate": 1.892980437284235e-05, "loss": 0.3277, "step": 637 }, { "epoch": 1.981366459627329, "grad_norm": 0.253925081047276, "learning_rate": 1.8872266973532797e-05, "loss": 0.345, "step": 638 }, { "epoch": 1.984472049689441, "grad_norm": 0.2542103813230237, "learning_rate": 1.8814729574223246e-05, "loss": 0.3458, "step": 639 }, { "epoch": 1.9875776397515528, "grad_norm": 0.298104123148457, "learning_rate": 1.8757192174913695e-05, "loss": 0.3278, "step": 640 }, { "epoch": 1.9906832298136647, "grad_norm": 0.2322373735825899, "learning_rate": 1.8699654775604144e-05, "loss": 0.3426, "step": 641 }, { "epoch": 1.9937888198757765, "grad_norm": 0.24606988538470728, "learning_rate": 1.8642117376294592e-05, "loss": 0.3358, "step": 642 }, { "epoch": 1.9968944099378882, "grad_norm": 0.2849652231363428, "learning_rate": 1.858457997698504e-05, "loss": 0.315, "step": 643 }, { "epoch": 2.0, "grad_norm": 0.277308601131606, "learning_rate": 1.852704257767549e-05, "loss": 0.3275, "step": 644 }, { "epoch": 2.0031055900621118, "grad_norm": 0.3117446232875321, "learning_rate": 1.846950517836594e-05, "loss": 0.2631, "step": 645 }, { "epoch": 2.0062111801242235, "grad_norm": 0.27820371561408924, "learning_rate": 1.8411967779056387e-05, "loss": 0.26, "step": 646 }, { "epoch": 2.0093167701863353, "grad_norm": 0.35212508445991075, "learning_rate": 1.8354430379746836e-05, "loss": 0.2589, "step": 647 }, { "epoch": 2.012422360248447, "grad_norm": 0.29598296168936833, "learning_rate": 1.8296892980437285e-05, "loss": 0.2727, "step": 648 }, { "epoch": 2.015527950310559, "grad_norm": 0.23748767777958518, "learning_rate": 1.8239355581127733e-05, "loss": 0.2603, "step": 649 }, { "epoch": 2.018633540372671, "grad_norm": 0.3396040317332316, "learning_rate": 1.8181818181818182e-05, "loss": 0.2494, "step": 650 }, { "epoch": 2.0217391304347827, "grad_norm": 0.26773683268799814, "learning_rate": 1.812428078250863e-05, "loss": 0.2538, "step": 651 }, { "epoch": 2.0248447204968945, "grad_norm": 0.26908218639639603, "learning_rate": 1.806674338319908e-05, "loss": 0.2636, "step": 652 }, { "epoch": 2.027950310559006, "grad_norm": 0.2934635435841592, "learning_rate": 1.8009205983889528e-05, "loss": 0.25, "step": 653 }, { "epoch": 2.031055900621118, "grad_norm": 0.2785089363356141, "learning_rate": 1.7951668584579977e-05, "loss": 0.2441, "step": 654 }, { "epoch": 2.0341614906832297, "grad_norm": 0.2638349484890508, "learning_rate": 1.7894131185270426e-05, "loss": 0.2519, "step": 655 }, { "epoch": 2.0372670807453415, "grad_norm": 0.2586235412884467, "learning_rate": 1.7836593785960874e-05, "loss": 0.2509, "step": 656 }, { "epoch": 2.040372670807453, "grad_norm": 0.2472488808463837, "learning_rate": 1.7779056386651323e-05, "loss": 0.2538, "step": 657 }, { "epoch": 2.0434782608695654, "grad_norm": 0.2795998982851747, "learning_rate": 1.7721518987341772e-05, "loss": 0.245, "step": 658 }, { "epoch": 2.046583850931677, "grad_norm": 0.22761521829171, "learning_rate": 1.766398158803222e-05, "loss": 0.257, "step": 659 }, { "epoch": 2.049689440993789, "grad_norm": 0.27491770536559856, "learning_rate": 1.760644418872267e-05, "loss": 0.2543, "step": 660 }, { "epoch": 2.0527950310559007, "grad_norm": 0.2694008994446243, "learning_rate": 1.7548906789413118e-05, "loss": 0.2523, "step": 661 }, { "epoch": 2.0559006211180124, "grad_norm": 0.2649790265269665, "learning_rate": 1.7491369390103567e-05, "loss": 0.2638, "step": 662 }, { "epoch": 2.059006211180124, "grad_norm": 0.2592501818191762, "learning_rate": 1.7433831990794016e-05, "loss": 0.2577, "step": 663 }, { "epoch": 2.062111801242236, "grad_norm": 0.2669822602469171, "learning_rate": 1.7376294591484464e-05, "loss": 0.2609, "step": 664 }, { "epoch": 2.0652173913043477, "grad_norm": 0.24387894874703012, "learning_rate": 1.7318757192174913e-05, "loss": 0.2633, "step": 665 }, { "epoch": 2.0683229813664594, "grad_norm": 0.2492896708129992, "learning_rate": 1.7261219792865362e-05, "loss": 0.2574, "step": 666 }, { "epoch": 2.0714285714285716, "grad_norm": 0.23542048799701373, "learning_rate": 1.720368239355581e-05, "loss": 0.2472, "step": 667 }, { "epoch": 2.0745341614906834, "grad_norm": 0.23318721989860372, "learning_rate": 1.714614499424626e-05, "loss": 0.2492, "step": 668 }, { "epoch": 2.077639751552795, "grad_norm": 0.2184370669246145, "learning_rate": 1.7088607594936708e-05, "loss": 0.2614, "step": 669 }, { "epoch": 2.080745341614907, "grad_norm": 0.23606974543337897, "learning_rate": 1.7031070195627157e-05, "loss": 0.2669, "step": 670 }, { "epoch": 2.0838509316770186, "grad_norm": 0.24573655537821745, "learning_rate": 1.6973532796317605e-05, "loss": 0.2495, "step": 671 }, { "epoch": 2.0869565217391304, "grad_norm": 0.22577283258104885, "learning_rate": 1.6915995397008054e-05, "loss": 0.2324, "step": 672 }, { "epoch": 2.090062111801242, "grad_norm": 0.22880524187260692, "learning_rate": 1.6858457997698503e-05, "loss": 0.2454, "step": 673 }, { "epoch": 2.093167701863354, "grad_norm": 0.23649506425354394, "learning_rate": 1.680092059838895e-05, "loss": 0.2706, "step": 674 }, { "epoch": 2.0962732919254656, "grad_norm": 0.25403811052331426, "learning_rate": 1.67433831990794e-05, "loss": 0.2543, "step": 675 }, { "epoch": 2.099378881987578, "grad_norm": 0.2443521353581772, "learning_rate": 1.668584579976985e-05, "loss": 0.2484, "step": 676 }, { "epoch": 2.1024844720496896, "grad_norm": 0.21359909291298998, "learning_rate": 1.6628308400460298e-05, "loss": 0.242, "step": 677 }, { "epoch": 2.1055900621118013, "grad_norm": 0.2270460742418379, "learning_rate": 1.6570771001150746e-05, "loss": 0.248, "step": 678 }, { "epoch": 2.108695652173913, "grad_norm": 0.22643050215454086, "learning_rate": 1.6513233601841195e-05, "loss": 0.2477, "step": 679 }, { "epoch": 2.111801242236025, "grad_norm": 0.22299140134872011, "learning_rate": 1.6455696202531644e-05, "loss": 0.2568, "step": 680 }, { "epoch": 2.1149068322981366, "grad_norm": 0.21597401049687515, "learning_rate": 1.6398158803222093e-05, "loss": 0.2567, "step": 681 }, { "epoch": 2.1180124223602483, "grad_norm": 0.21424964627998483, "learning_rate": 1.634062140391254e-05, "loss": 0.2594, "step": 682 }, { "epoch": 2.12111801242236, "grad_norm": 0.22064934278360224, "learning_rate": 1.628308400460299e-05, "loss": 0.2548, "step": 683 }, { "epoch": 2.124223602484472, "grad_norm": 0.2226010291399242, "learning_rate": 1.622554660529344e-05, "loss": 0.2392, "step": 684 }, { "epoch": 2.127329192546584, "grad_norm": 0.23817934921154135, "learning_rate": 1.616800920598389e-05, "loss": 0.263, "step": 685 }, { "epoch": 2.130434782608696, "grad_norm": 0.21811012125800597, "learning_rate": 1.611047180667434e-05, "loss": 0.239, "step": 686 }, { "epoch": 2.1335403726708075, "grad_norm": 0.22669201592312113, "learning_rate": 1.605293440736479e-05, "loss": 0.2526, "step": 687 }, { "epoch": 2.1366459627329193, "grad_norm": 0.23305634956402152, "learning_rate": 1.5995397008055237e-05, "loss": 0.2685, "step": 688 }, { "epoch": 2.139751552795031, "grad_norm": 0.2115884014346869, "learning_rate": 1.5937859608745686e-05, "loss": 0.2393, "step": 689 }, { "epoch": 2.142857142857143, "grad_norm": 0.2115627751194399, "learning_rate": 1.5880322209436135e-05, "loss": 0.251, "step": 690 }, { "epoch": 2.1459627329192545, "grad_norm": 0.21692908050798426, "learning_rate": 1.5822784810126583e-05, "loss": 0.2512, "step": 691 }, { "epoch": 2.1490683229813663, "grad_norm": 0.22462301830943301, "learning_rate": 1.5765247410817032e-05, "loss": 0.2511, "step": 692 }, { "epoch": 2.1521739130434785, "grad_norm": 0.19326801109974084, "learning_rate": 1.570771001150748e-05, "loss": 0.2452, "step": 693 }, { "epoch": 2.1552795031055902, "grad_norm": 0.23274032126488928, "learning_rate": 1.565017261219793e-05, "loss": 0.2529, "step": 694 }, { "epoch": 2.158385093167702, "grad_norm": 0.22582002907669432, "learning_rate": 1.5592635212888378e-05, "loss": 0.2597, "step": 695 }, { "epoch": 2.1614906832298137, "grad_norm": 0.21665150454694335, "learning_rate": 1.5535097813578827e-05, "loss": 0.258, "step": 696 }, { "epoch": 2.1645962732919255, "grad_norm": 0.21401338541621684, "learning_rate": 1.5477560414269276e-05, "loss": 0.2556, "step": 697 }, { "epoch": 2.1677018633540373, "grad_norm": 0.23527336018366451, "learning_rate": 1.5420023014959724e-05, "loss": 0.2582, "step": 698 }, { "epoch": 2.170807453416149, "grad_norm": 0.21445105954714194, "learning_rate": 1.5362485615650173e-05, "loss": 0.2503, "step": 699 }, { "epoch": 2.1739130434782608, "grad_norm": 0.21511080318136375, "learning_rate": 1.5304948216340622e-05, "loss": 0.2566, "step": 700 }, { "epoch": 2.1770186335403725, "grad_norm": 0.24111702768301724, "learning_rate": 1.5247410817031072e-05, "loss": 0.2425, "step": 701 }, { "epoch": 2.1801242236024843, "grad_norm": 0.21498468064553858, "learning_rate": 1.5189873417721521e-05, "loss": 0.2439, "step": 702 }, { "epoch": 2.1832298136645965, "grad_norm": 0.21967291818581178, "learning_rate": 1.513233601841197e-05, "loss": 0.2509, "step": 703 }, { "epoch": 2.186335403726708, "grad_norm": 0.2205935490599146, "learning_rate": 1.5074798619102418e-05, "loss": 0.2579, "step": 704 }, { "epoch": 2.18944099378882, "grad_norm": 0.23758914721632698, "learning_rate": 1.5017261219792867e-05, "loss": 0.2399, "step": 705 }, { "epoch": 2.1925465838509317, "grad_norm": 0.19571832530537867, "learning_rate": 1.4959723820483316e-05, "loss": 0.2404, "step": 706 }, { "epoch": 2.1956521739130435, "grad_norm": 0.20772523111005442, "learning_rate": 1.4902186421173765e-05, "loss": 0.2489, "step": 707 }, { "epoch": 2.198757763975155, "grad_norm": 0.2078388868457063, "learning_rate": 1.4844649021864213e-05, "loss": 0.2404, "step": 708 }, { "epoch": 2.201863354037267, "grad_norm": 0.24074640885647317, "learning_rate": 1.4787111622554662e-05, "loss": 0.2647, "step": 709 }, { "epoch": 2.2049689440993787, "grad_norm": 0.20937990276765678, "learning_rate": 1.472957422324511e-05, "loss": 0.2578, "step": 710 }, { "epoch": 2.208074534161491, "grad_norm": 0.2639807190802869, "learning_rate": 1.467203682393556e-05, "loss": 0.2607, "step": 711 }, { "epoch": 2.2111801242236027, "grad_norm": 0.26293955788698453, "learning_rate": 1.4614499424626008e-05, "loss": 0.2638, "step": 712 }, { "epoch": 2.2142857142857144, "grad_norm": 0.23828883015584687, "learning_rate": 1.4556962025316457e-05, "loss": 0.2577, "step": 713 }, { "epoch": 2.217391304347826, "grad_norm": 0.24740324327511762, "learning_rate": 1.4499424626006906e-05, "loss": 0.2603, "step": 714 }, { "epoch": 2.220496894409938, "grad_norm": 0.22582258369375163, "learning_rate": 1.4441887226697354e-05, "loss": 0.2442, "step": 715 }, { "epoch": 2.2236024844720497, "grad_norm": 0.24839008006413138, "learning_rate": 1.4384349827387803e-05, "loss": 0.2591, "step": 716 }, { "epoch": 2.2267080745341614, "grad_norm": 0.2507061092171656, "learning_rate": 1.4326812428078252e-05, "loss": 0.2525, "step": 717 }, { "epoch": 2.229813664596273, "grad_norm": 0.214855054431312, "learning_rate": 1.42692750287687e-05, "loss": 0.2436, "step": 718 }, { "epoch": 2.232919254658385, "grad_norm": 0.21592522701402342, "learning_rate": 1.421173762945915e-05, "loss": 0.2516, "step": 719 }, { "epoch": 2.2360248447204967, "grad_norm": 0.20915695199545198, "learning_rate": 1.4154200230149598e-05, "loss": 0.2597, "step": 720 }, { "epoch": 2.239130434782609, "grad_norm": 0.22903634190903957, "learning_rate": 1.4096662830840047e-05, "loss": 0.2667, "step": 721 }, { "epoch": 2.2422360248447206, "grad_norm": 0.21439993038647093, "learning_rate": 1.4039125431530495e-05, "loss": 0.2436, "step": 722 }, { "epoch": 2.2453416149068324, "grad_norm": 0.21937639860358657, "learning_rate": 1.3981588032220944e-05, "loss": 0.2458, "step": 723 }, { "epoch": 2.248447204968944, "grad_norm": 0.2013130017609961, "learning_rate": 1.3924050632911393e-05, "loss": 0.2491, "step": 724 }, { "epoch": 2.251552795031056, "grad_norm": 0.22887750081435682, "learning_rate": 1.3866513233601842e-05, "loss": 0.2441, "step": 725 }, { "epoch": 2.2546583850931676, "grad_norm": 0.2288064963841507, "learning_rate": 1.380897583429229e-05, "loss": 0.2418, "step": 726 }, { "epoch": 2.2577639751552794, "grad_norm": 0.23248918550222136, "learning_rate": 1.3751438434982739e-05, "loss": 0.2625, "step": 727 }, { "epoch": 2.260869565217391, "grad_norm": 0.2090094171369587, "learning_rate": 1.3693901035673188e-05, "loss": 0.2507, "step": 728 }, { "epoch": 2.2639751552795033, "grad_norm": 0.23154174958563464, "learning_rate": 1.3636363636363637e-05, "loss": 0.2586, "step": 729 }, { "epoch": 2.267080745341615, "grad_norm": 0.24350218064576923, "learning_rate": 1.3578826237054085e-05, "loss": 0.263, "step": 730 }, { "epoch": 2.270186335403727, "grad_norm": 0.2278585941156764, "learning_rate": 1.3521288837744534e-05, "loss": 0.2555, "step": 731 }, { "epoch": 2.2732919254658386, "grad_norm": 0.20801276930170154, "learning_rate": 1.3463751438434983e-05, "loss": 0.2572, "step": 732 }, { "epoch": 2.2763975155279503, "grad_norm": 0.22919123007559652, "learning_rate": 1.3406214039125431e-05, "loss": 0.2582, "step": 733 }, { "epoch": 2.279503105590062, "grad_norm": 0.21268694380279451, "learning_rate": 1.334867663981588e-05, "loss": 0.2512, "step": 734 }, { "epoch": 2.282608695652174, "grad_norm": 0.2182606134520971, "learning_rate": 1.3291139240506329e-05, "loss": 0.2536, "step": 735 }, { "epoch": 2.2857142857142856, "grad_norm": 0.2177977754376004, "learning_rate": 1.323360184119678e-05, "loss": 0.2589, "step": 736 }, { "epoch": 2.2888198757763973, "grad_norm": 0.2079260936390528, "learning_rate": 1.3176064441887228e-05, "loss": 0.2445, "step": 737 }, { "epoch": 2.291925465838509, "grad_norm": 0.21654285079809454, "learning_rate": 1.3118527042577677e-05, "loss": 0.2492, "step": 738 }, { "epoch": 2.2950310559006213, "grad_norm": 0.22224222175484207, "learning_rate": 1.3060989643268126e-05, "loss": 0.2555, "step": 739 }, { "epoch": 2.298136645962733, "grad_norm": 0.2013544241929392, "learning_rate": 1.3003452243958574e-05, "loss": 0.2457, "step": 740 }, { "epoch": 2.301242236024845, "grad_norm": 0.21733404218015004, "learning_rate": 1.2945914844649023e-05, "loss": 0.2659, "step": 741 }, { "epoch": 2.3043478260869565, "grad_norm": 0.21179336140885693, "learning_rate": 1.2888377445339472e-05, "loss": 0.2426, "step": 742 }, { "epoch": 2.3074534161490683, "grad_norm": 0.2285599698694653, "learning_rate": 1.283084004602992e-05, "loss": 0.2429, "step": 743 }, { "epoch": 2.31055900621118, "grad_norm": 0.19835079918909265, "learning_rate": 1.2773302646720369e-05, "loss": 0.2489, "step": 744 }, { "epoch": 2.313664596273292, "grad_norm": 0.2298623252387309, "learning_rate": 1.2715765247410818e-05, "loss": 0.2655, "step": 745 }, { "epoch": 2.3167701863354035, "grad_norm": 0.23867880872639935, "learning_rate": 1.2658227848101267e-05, "loss": 0.2498, "step": 746 }, { "epoch": 2.3198757763975157, "grad_norm": 0.21037856832784158, "learning_rate": 1.2600690448791715e-05, "loss": 0.2589, "step": 747 }, { "epoch": 2.3229813664596275, "grad_norm": 0.24695028457966048, "learning_rate": 1.2543153049482164e-05, "loss": 0.2502, "step": 748 }, { "epoch": 2.3260869565217392, "grad_norm": 0.23360363557581765, "learning_rate": 1.2485615650172613e-05, "loss": 0.259, "step": 749 }, { "epoch": 2.329192546583851, "grad_norm": 0.22335503888847086, "learning_rate": 1.2428078250863062e-05, "loss": 0.2456, "step": 750 }, { "epoch": 2.3322981366459627, "grad_norm": 0.21231134626201825, "learning_rate": 1.237054085155351e-05, "loss": 0.26, "step": 751 }, { "epoch": 2.3354037267080745, "grad_norm": 0.20990198210516803, "learning_rate": 1.2313003452243959e-05, "loss": 0.2441, "step": 752 }, { "epoch": 2.3385093167701863, "grad_norm": 0.221067131454967, "learning_rate": 1.2255466052934408e-05, "loss": 0.2469, "step": 753 }, { "epoch": 2.341614906832298, "grad_norm": 0.22138406777470937, "learning_rate": 1.2197928653624856e-05, "loss": 0.261, "step": 754 }, { "epoch": 2.3447204968944098, "grad_norm": 0.21398489008508845, "learning_rate": 1.2140391254315305e-05, "loss": 0.2566, "step": 755 }, { "epoch": 2.3478260869565215, "grad_norm": 0.20448116831895594, "learning_rate": 1.2082853855005754e-05, "loss": 0.2598, "step": 756 }, { "epoch": 2.3509316770186337, "grad_norm": 0.21255766006062407, "learning_rate": 1.2025316455696203e-05, "loss": 0.2526, "step": 757 }, { "epoch": 2.3540372670807455, "grad_norm": 0.19087455271546003, "learning_rate": 1.1967779056386651e-05, "loss": 0.2537, "step": 758 }, { "epoch": 2.357142857142857, "grad_norm": 0.20379774772998854, "learning_rate": 1.19102416570771e-05, "loss": 0.2668, "step": 759 }, { "epoch": 2.360248447204969, "grad_norm": 0.19801295062012142, "learning_rate": 1.1852704257767549e-05, "loss": 0.2479, "step": 760 }, { "epoch": 2.3633540372670807, "grad_norm": 0.2053725094185451, "learning_rate": 1.1795166858457997e-05, "loss": 0.2597, "step": 761 }, { "epoch": 2.3664596273291925, "grad_norm": 0.19414430502845648, "learning_rate": 1.1737629459148446e-05, "loss": 0.2445, "step": 762 }, { "epoch": 2.369565217391304, "grad_norm": 0.20779479767313294, "learning_rate": 1.1680092059838895e-05, "loss": 0.2649, "step": 763 }, { "epoch": 2.372670807453416, "grad_norm": 0.20304929332054908, "learning_rate": 1.1622554660529344e-05, "loss": 0.2624, "step": 764 }, { "epoch": 2.375776397515528, "grad_norm": 0.20512146624367367, "learning_rate": 1.1565017261219792e-05, "loss": 0.2532, "step": 765 }, { "epoch": 2.37888198757764, "grad_norm": 0.1948376797912715, "learning_rate": 1.1507479861910241e-05, "loss": 0.2593, "step": 766 }, { "epoch": 2.3819875776397517, "grad_norm": 0.20111608619484334, "learning_rate": 1.144994246260069e-05, "loss": 0.2431, "step": 767 }, { "epoch": 2.3850931677018634, "grad_norm": 0.20424563225076126, "learning_rate": 1.139240506329114e-05, "loss": 0.239, "step": 768 }, { "epoch": 2.388198757763975, "grad_norm": 0.20385122820209117, "learning_rate": 1.1334867663981589e-05, "loss": 0.2519, "step": 769 }, { "epoch": 2.391304347826087, "grad_norm": 0.2169017997179514, "learning_rate": 1.1277330264672038e-05, "loss": 0.2599, "step": 770 }, { "epoch": 2.3944099378881987, "grad_norm": 0.20583351351917192, "learning_rate": 1.1219792865362486e-05, "loss": 0.2515, "step": 771 }, { "epoch": 2.3975155279503104, "grad_norm": 0.20864268761499544, "learning_rate": 1.1162255466052935e-05, "loss": 0.2674, "step": 772 }, { "epoch": 2.400621118012422, "grad_norm": 0.18352483617724127, "learning_rate": 1.1104718066743384e-05, "loss": 0.2517, "step": 773 }, { "epoch": 2.403726708074534, "grad_norm": 0.19458848397143083, "learning_rate": 1.1047180667433833e-05, "loss": 0.2348, "step": 774 }, { "epoch": 2.406832298136646, "grad_norm": 0.22085258658145707, "learning_rate": 1.0989643268124281e-05, "loss": 0.2626, "step": 775 }, { "epoch": 2.409937888198758, "grad_norm": 0.2244287112114885, "learning_rate": 1.093210586881473e-05, "loss": 0.2656, "step": 776 }, { "epoch": 2.4130434782608696, "grad_norm": 0.2064604218573695, "learning_rate": 1.0874568469505179e-05, "loss": 0.2474, "step": 777 }, { "epoch": 2.4161490683229814, "grad_norm": 0.2170623734624135, "learning_rate": 1.0817031070195628e-05, "loss": 0.2673, "step": 778 }, { "epoch": 2.419254658385093, "grad_norm": 0.21813795262022834, "learning_rate": 1.0759493670886076e-05, "loss": 0.2566, "step": 779 }, { "epoch": 2.422360248447205, "grad_norm": 0.20015983943955706, "learning_rate": 1.0701956271576525e-05, "loss": 0.2433, "step": 780 }, { "epoch": 2.4254658385093166, "grad_norm": 0.2518786075901923, "learning_rate": 1.0644418872266974e-05, "loss": 0.2542, "step": 781 }, { "epoch": 2.4285714285714284, "grad_norm": 0.2039696978745147, "learning_rate": 1.0586881472957422e-05, "loss": 0.2635, "step": 782 }, { "epoch": 2.4316770186335406, "grad_norm": 0.20193387084839037, "learning_rate": 1.0529344073647871e-05, "loss": 0.2529, "step": 783 }, { "epoch": 2.4347826086956523, "grad_norm": 0.22256582381404963, "learning_rate": 1.047180667433832e-05, "loss": 0.2502, "step": 784 }, { "epoch": 2.437888198757764, "grad_norm": 0.20375665724837322, "learning_rate": 1.0414269275028769e-05, "loss": 0.241, "step": 785 }, { "epoch": 2.440993788819876, "grad_norm": 0.2179110760676831, "learning_rate": 1.0356731875719217e-05, "loss": 0.2599, "step": 786 }, { "epoch": 2.4440993788819876, "grad_norm": 0.22202976810767208, "learning_rate": 1.0299194476409666e-05, "loss": 0.2505, "step": 787 }, { "epoch": 2.4472049689440993, "grad_norm": 0.42670457942092715, "learning_rate": 1.0241657077100115e-05, "loss": 0.241, "step": 788 }, { "epoch": 2.450310559006211, "grad_norm": 0.20784564321109833, "learning_rate": 1.0184119677790564e-05, "loss": 0.2567, "step": 789 }, { "epoch": 2.453416149068323, "grad_norm": 0.20121980240137796, "learning_rate": 1.0126582278481012e-05, "loss": 0.2451, "step": 790 }, { "epoch": 2.4565217391304346, "grad_norm": 0.21747971229319626, "learning_rate": 1.0069044879171461e-05, "loss": 0.2387, "step": 791 }, { "epoch": 2.4596273291925463, "grad_norm": 0.18957130652801002, "learning_rate": 1.001150747986191e-05, "loss": 0.2404, "step": 792 }, { "epoch": 2.4627329192546585, "grad_norm": 0.19623974528931779, "learning_rate": 9.953970080552358e-06, "loss": 0.2505, "step": 793 }, { "epoch": 2.4658385093167703, "grad_norm": 0.2090564420719582, "learning_rate": 9.896432681242807e-06, "loss": 0.259, "step": 794 }, { "epoch": 2.468944099378882, "grad_norm": 0.20995347548362167, "learning_rate": 9.838895281933256e-06, "loss": 0.2557, "step": 795 }, { "epoch": 2.472049689440994, "grad_norm": 0.21072680749655628, "learning_rate": 9.781357882623705e-06, "loss": 0.2507, "step": 796 }, { "epoch": 2.4751552795031055, "grad_norm": 0.2028138320185975, "learning_rate": 9.723820483314153e-06, "loss": 0.2428, "step": 797 }, { "epoch": 2.4782608695652173, "grad_norm": 0.203416816769087, "learning_rate": 9.666283084004602e-06, "loss": 0.2549, "step": 798 }, { "epoch": 2.481366459627329, "grad_norm": 0.2114980169350222, "learning_rate": 9.60874568469505e-06, "loss": 0.2544, "step": 799 }, { "epoch": 2.4844720496894412, "grad_norm": 0.1947781123063217, "learning_rate": 9.551208285385501e-06, "loss": 0.246, "step": 800 }, { "epoch": 2.487577639751553, "grad_norm": 0.2313621289649826, "learning_rate": 9.49367088607595e-06, "loss": 0.2688, "step": 801 }, { "epoch": 2.4906832298136647, "grad_norm": 0.2070540850596655, "learning_rate": 9.436133486766399e-06, "loss": 0.2594, "step": 802 }, { "epoch": 2.4937888198757765, "grad_norm": 0.21169469541077635, "learning_rate": 9.378596087456847e-06, "loss": 0.2493, "step": 803 }, { "epoch": 2.4968944099378882, "grad_norm": 0.19281802475760265, "learning_rate": 9.321058688147296e-06, "loss": 0.25, "step": 804 }, { "epoch": 2.5, "grad_norm": 0.2175842957962285, "learning_rate": 9.263521288837745e-06, "loss": 0.2678, "step": 805 }, { "epoch": 2.5031055900621118, "grad_norm": 0.1942027851518837, "learning_rate": 9.205983889528194e-06, "loss": 0.2505, "step": 806 }, { "epoch": 2.5062111801242235, "grad_norm": 0.2119389750172559, "learning_rate": 9.148446490218642e-06, "loss": 0.2647, "step": 807 }, { "epoch": 2.5093167701863353, "grad_norm": 0.20993843490643438, "learning_rate": 9.090909090909091e-06, "loss": 0.2381, "step": 808 }, { "epoch": 2.512422360248447, "grad_norm": 0.20387329805308116, "learning_rate": 9.03337169159954e-06, "loss": 0.2404, "step": 809 }, { "epoch": 2.5155279503105588, "grad_norm": 0.206875715468925, "learning_rate": 8.975834292289988e-06, "loss": 0.2625, "step": 810 }, { "epoch": 2.518633540372671, "grad_norm": 0.20699195679204746, "learning_rate": 8.918296892980437e-06, "loss": 0.2496, "step": 811 }, { "epoch": 2.5217391304347827, "grad_norm": 0.20082335227786552, "learning_rate": 8.860759493670886e-06, "loss": 0.2472, "step": 812 }, { "epoch": 2.5248447204968945, "grad_norm": 0.20337421721934987, "learning_rate": 8.803222094361335e-06, "loss": 0.2465, "step": 813 }, { "epoch": 2.527950310559006, "grad_norm": 0.19690561472031543, "learning_rate": 8.745684695051783e-06, "loss": 0.2562, "step": 814 }, { "epoch": 2.531055900621118, "grad_norm": 0.20942292198434145, "learning_rate": 8.688147295742232e-06, "loss": 0.2448, "step": 815 }, { "epoch": 2.5341614906832297, "grad_norm": 0.22511418926211027, "learning_rate": 8.630609896432681e-06, "loss": 0.2585, "step": 816 }, { "epoch": 2.5372670807453415, "grad_norm": 0.21038192778136464, "learning_rate": 8.57307249712313e-06, "loss": 0.2463, "step": 817 }, { "epoch": 2.5403726708074537, "grad_norm": 0.18890075777071388, "learning_rate": 8.515535097813578e-06, "loss": 0.2521, "step": 818 }, { "epoch": 2.5434782608695654, "grad_norm": 0.21205002134781, "learning_rate": 8.457997698504027e-06, "loss": 0.2585, "step": 819 }, { "epoch": 2.546583850931677, "grad_norm": 0.1941024098027217, "learning_rate": 8.400460299194476e-06, "loss": 0.2566, "step": 820 }, { "epoch": 2.549689440993789, "grad_norm": 0.30349180360429645, "learning_rate": 8.342922899884924e-06, "loss": 0.2623, "step": 821 }, { "epoch": 2.5527950310559007, "grad_norm": 0.22803507573466544, "learning_rate": 8.285385500575373e-06, "loss": 0.2558, "step": 822 }, { "epoch": 2.5559006211180124, "grad_norm": 0.2020632346168216, "learning_rate": 8.227848101265822e-06, "loss": 0.2586, "step": 823 }, { "epoch": 2.559006211180124, "grad_norm": 0.19503633689058, "learning_rate": 8.17031070195627e-06, "loss": 0.2628, "step": 824 }, { "epoch": 2.562111801242236, "grad_norm": 0.19443407045409983, "learning_rate": 8.11277330264672e-06, "loss": 0.2492, "step": 825 }, { "epoch": 2.5652173913043477, "grad_norm": 0.20150007916652513, "learning_rate": 8.05523590333717e-06, "loss": 0.256, "step": 826 }, { "epoch": 2.5683229813664594, "grad_norm": 0.20193826865741932, "learning_rate": 7.997698504027619e-06, "loss": 0.2716, "step": 827 }, { "epoch": 2.571428571428571, "grad_norm": 0.19621787984261999, "learning_rate": 7.940161104718067e-06, "loss": 0.2443, "step": 828 }, { "epoch": 2.5745341614906834, "grad_norm": 0.19610522530135707, "learning_rate": 7.882623705408516e-06, "loss": 0.2562, "step": 829 }, { "epoch": 2.577639751552795, "grad_norm": 0.1980537399225623, "learning_rate": 7.825086306098965e-06, "loss": 0.2475, "step": 830 }, { "epoch": 2.580745341614907, "grad_norm": 0.19074805307763945, "learning_rate": 7.767548906789413e-06, "loss": 0.2557, "step": 831 }, { "epoch": 2.5838509316770186, "grad_norm": 0.19613067044699573, "learning_rate": 7.710011507479862e-06, "loss": 0.2633, "step": 832 }, { "epoch": 2.5869565217391304, "grad_norm": 0.19682931877320217, "learning_rate": 7.652474108170311e-06, "loss": 0.2378, "step": 833 }, { "epoch": 2.590062111801242, "grad_norm": 0.20053417585734873, "learning_rate": 7.5949367088607605e-06, "loss": 0.2498, "step": 834 }, { "epoch": 2.593167701863354, "grad_norm": 0.19178100866522357, "learning_rate": 7.537399309551209e-06, "loss": 0.2355, "step": 835 }, { "epoch": 2.596273291925466, "grad_norm": 0.2084827189783707, "learning_rate": 7.479861910241658e-06, "loss": 0.2764, "step": 836 }, { "epoch": 2.599378881987578, "grad_norm": 0.19540838307901068, "learning_rate": 7.422324510932107e-06, "loss": 0.2437, "step": 837 }, { "epoch": 2.6024844720496896, "grad_norm": 0.19587457349490991, "learning_rate": 7.364787111622555e-06, "loss": 0.2489, "step": 838 }, { "epoch": 2.6055900621118013, "grad_norm": 0.25075690817051544, "learning_rate": 7.307249712313004e-06, "loss": 0.2648, "step": 839 }, { "epoch": 2.608695652173913, "grad_norm": 0.20743291534086578, "learning_rate": 7.249712313003453e-06, "loss": 0.2646, "step": 840 }, { "epoch": 2.611801242236025, "grad_norm": 0.21071395029449075, "learning_rate": 7.1921749136939016e-06, "loss": 0.2427, "step": 841 }, { "epoch": 2.6149068322981366, "grad_norm": 0.20235523726201224, "learning_rate": 7.13463751438435e-06, "loss": 0.2587, "step": 842 }, { "epoch": 2.6180124223602483, "grad_norm": 0.20149232436113795, "learning_rate": 7.077100115074799e-06, "loss": 0.2516, "step": 843 }, { "epoch": 2.62111801242236, "grad_norm": 0.21144648873433503, "learning_rate": 7.019562715765248e-06, "loss": 0.2582, "step": 844 }, { "epoch": 2.624223602484472, "grad_norm": 0.2162300937976304, "learning_rate": 6.9620253164556965e-06, "loss": 0.2556, "step": 845 }, { "epoch": 2.6273291925465836, "grad_norm": 0.21106771620646603, "learning_rate": 6.904487917146145e-06, "loss": 0.2558, "step": 846 }, { "epoch": 2.630434782608696, "grad_norm": 0.23609832773446915, "learning_rate": 6.846950517836594e-06, "loss": 0.2572, "step": 847 }, { "epoch": 2.6335403726708075, "grad_norm": 0.21122404379666423, "learning_rate": 6.789413118527043e-06, "loss": 0.2434, "step": 848 }, { "epoch": 2.6366459627329193, "grad_norm": 0.2015181805089703, "learning_rate": 6.731875719217491e-06, "loss": 0.2418, "step": 849 }, { "epoch": 2.639751552795031, "grad_norm": 0.20647243106844593, "learning_rate": 6.67433831990794e-06, "loss": 0.265, "step": 850 }, { "epoch": 2.642857142857143, "grad_norm": 0.2083640341120549, "learning_rate": 6.61680092059839e-06, "loss": 0.2531, "step": 851 }, { "epoch": 2.6459627329192545, "grad_norm": 0.20501908976688168, "learning_rate": 6.559263521288838e-06, "loss": 0.2514, "step": 852 }, { "epoch": 2.6490683229813663, "grad_norm": 0.19844284276810914, "learning_rate": 6.501726121979287e-06, "loss": 0.2644, "step": 853 }, { "epoch": 2.6521739130434785, "grad_norm": 0.21201237882135082, "learning_rate": 6.444188722669736e-06, "loss": 0.2568, "step": 854 }, { "epoch": 2.6552795031055902, "grad_norm": 0.22195301360518224, "learning_rate": 6.3866513233601846e-06, "loss": 0.261, "step": 855 }, { "epoch": 2.658385093167702, "grad_norm": 0.19287865061356418, "learning_rate": 6.329113924050633e-06, "loss": 0.2469, "step": 856 }, { "epoch": 2.6614906832298137, "grad_norm": 0.19640829139853255, "learning_rate": 6.271576524741082e-06, "loss": 0.2462, "step": 857 }, { "epoch": 2.6645962732919255, "grad_norm": 0.20101972350059313, "learning_rate": 6.214039125431531e-06, "loss": 0.255, "step": 858 }, { "epoch": 2.6677018633540373, "grad_norm": 0.2841326489307957, "learning_rate": 6.1565017261219795e-06, "loss": 0.2457, "step": 859 }, { "epoch": 2.670807453416149, "grad_norm": 0.18827454901664883, "learning_rate": 6.098964326812428e-06, "loss": 0.2427, "step": 860 }, { "epoch": 2.6739130434782608, "grad_norm": 0.20109847479853832, "learning_rate": 6.041426927502877e-06, "loss": 0.2402, "step": 861 }, { "epoch": 2.6770186335403725, "grad_norm": 0.1910402172602598, "learning_rate": 5.983889528193326e-06, "loss": 0.2627, "step": 862 }, { "epoch": 2.6801242236024843, "grad_norm": 0.1974312904693097, "learning_rate": 5.926352128883774e-06, "loss": 0.2625, "step": 863 }, { "epoch": 2.683229813664596, "grad_norm": 0.19911868656713894, "learning_rate": 5.868814729574223e-06, "loss": 0.2368, "step": 864 }, { "epoch": 2.686335403726708, "grad_norm": 0.21362726329843149, "learning_rate": 5.811277330264672e-06, "loss": 0.2534, "step": 865 }, { "epoch": 2.68944099378882, "grad_norm": 0.20941798902436187, "learning_rate": 5.7537399309551206e-06, "loss": 0.2454, "step": 866 }, { "epoch": 2.6925465838509317, "grad_norm": 0.19014291486371018, "learning_rate": 5.69620253164557e-06, "loss": 0.2446, "step": 867 }, { "epoch": 2.6956521739130435, "grad_norm": 0.19597012112115988, "learning_rate": 5.638665132336019e-06, "loss": 0.2537, "step": 868 }, { "epoch": 2.698757763975155, "grad_norm": 0.19714293851097728, "learning_rate": 5.581127733026468e-06, "loss": 0.2468, "step": 869 }, { "epoch": 2.701863354037267, "grad_norm": 0.19621178971442163, "learning_rate": 5.523590333716916e-06, "loss": 0.2507, "step": 870 }, { "epoch": 2.704968944099379, "grad_norm": 0.19491684844874946, "learning_rate": 5.466052934407365e-06, "loss": 0.2413, "step": 871 }, { "epoch": 2.708074534161491, "grad_norm": 0.27521581958829827, "learning_rate": 5.408515535097814e-06, "loss": 0.2633, "step": 872 }, { "epoch": 2.7111801242236027, "grad_norm": 0.2168313001961523, "learning_rate": 5.3509781357882625e-06, "loss": 0.264, "step": 873 }, { "epoch": 2.7142857142857144, "grad_norm": 0.18797210234683806, "learning_rate": 5.293440736478711e-06, "loss": 0.2447, "step": 874 }, { "epoch": 2.717391304347826, "grad_norm": 0.21084636753160527, "learning_rate": 5.23590333716916e-06, "loss": 0.2619, "step": 875 }, { "epoch": 2.720496894409938, "grad_norm": 0.20635684776280216, "learning_rate": 5.178365937859609e-06, "loss": 0.2583, "step": 876 }, { "epoch": 2.7236024844720497, "grad_norm": 0.19468296653400607, "learning_rate": 5.120828538550057e-06, "loss": 0.2492, "step": 877 }, { "epoch": 2.7267080745341614, "grad_norm": 0.20398048699150237, "learning_rate": 5.063291139240506e-06, "loss": 0.2549, "step": 878 }, { "epoch": 2.729813664596273, "grad_norm": 0.18689815600092072, "learning_rate": 5.005753739930955e-06, "loss": 0.2488, "step": 879 }, { "epoch": 2.732919254658385, "grad_norm": 0.20597588086540602, "learning_rate": 4.948216340621404e-06, "loss": 0.2667, "step": 880 }, { "epoch": 2.7360248447204967, "grad_norm": 0.196856493986424, "learning_rate": 4.890678941311852e-06, "loss": 0.2513, "step": 881 }, { "epoch": 2.7391304347826084, "grad_norm": 0.19482252545749987, "learning_rate": 4.833141542002301e-06, "loss": 0.2498, "step": 882 }, { "epoch": 2.7422360248447206, "grad_norm": 0.19795769225255558, "learning_rate": 4.775604142692751e-06, "loss": 0.2499, "step": 883 }, { "epoch": 2.7453416149068324, "grad_norm": 0.21689477537897567, "learning_rate": 4.718066743383199e-06, "loss": 0.284, "step": 884 }, { "epoch": 2.748447204968944, "grad_norm": 0.2418368942479182, "learning_rate": 4.660529344073648e-06, "loss": 0.2766, "step": 885 }, { "epoch": 2.751552795031056, "grad_norm": 0.20477977797718222, "learning_rate": 4.602991944764097e-06, "loss": 0.2401, "step": 886 }, { "epoch": 2.7546583850931676, "grad_norm": 0.2006217058218365, "learning_rate": 4.5454545454545455e-06, "loss": 0.2471, "step": 887 }, { "epoch": 2.7577639751552794, "grad_norm": 0.20256868584609686, "learning_rate": 4.487917146144994e-06, "loss": 0.2441, "step": 888 }, { "epoch": 2.7608695652173916, "grad_norm": 0.21086545356411496, "learning_rate": 4.430379746835443e-06, "loss": 0.255, "step": 889 }, { "epoch": 2.7639751552795033, "grad_norm": 0.19012940644030216, "learning_rate": 4.372842347525892e-06, "loss": 0.2524, "step": 890 }, { "epoch": 2.767080745341615, "grad_norm": 0.20733606950697256, "learning_rate": 4.3153049482163404e-06, "loss": 0.2502, "step": 891 }, { "epoch": 2.770186335403727, "grad_norm": 0.19869202500390978, "learning_rate": 4.257767548906789e-06, "loss": 0.2488, "step": 892 }, { "epoch": 2.7732919254658386, "grad_norm": 0.21061575298666055, "learning_rate": 4.200230149597238e-06, "loss": 0.2535, "step": 893 }, { "epoch": 2.7763975155279503, "grad_norm": 0.1902154270690934, "learning_rate": 4.142692750287687e-06, "loss": 0.2523, "step": 894 }, { "epoch": 2.779503105590062, "grad_norm": 0.19515929958573747, "learning_rate": 4.085155350978135e-06, "loss": 0.2544, "step": 895 }, { "epoch": 2.782608695652174, "grad_norm": 0.19252012143661815, "learning_rate": 4.027617951668585e-06, "loss": 0.2638, "step": 896 }, { "epoch": 2.7857142857142856, "grad_norm": 0.1923327877416844, "learning_rate": 3.970080552359034e-06, "loss": 0.2462, "step": 897 }, { "epoch": 2.7888198757763973, "grad_norm": 0.18586501981252762, "learning_rate": 3.912543153049482e-06, "loss": 0.2403, "step": 898 }, { "epoch": 2.791925465838509, "grad_norm": 0.1986091973327919, "learning_rate": 3.855005753739931e-06, "loss": 0.2655, "step": 899 }, { "epoch": 2.795031055900621, "grad_norm": 0.18536496942596287, "learning_rate": 3.7974683544303802e-06, "loss": 0.2389, "step": 900 }, { "epoch": 2.798136645962733, "grad_norm": 0.19607218549803698, "learning_rate": 3.739930955120829e-06, "loss": 0.2542, "step": 901 }, { "epoch": 2.801242236024845, "grad_norm": 0.19944282694872204, "learning_rate": 3.6823935558112777e-06, "loss": 0.2434, "step": 902 }, { "epoch": 2.8043478260869565, "grad_norm": 0.1972448743409019, "learning_rate": 3.6248561565017264e-06, "loss": 0.247, "step": 903 }, { "epoch": 2.8074534161490683, "grad_norm": 0.19361398823404677, "learning_rate": 3.567318757192175e-06, "loss": 0.2597, "step": 904 }, { "epoch": 2.81055900621118, "grad_norm": 0.19293534537923737, "learning_rate": 3.509781357882624e-06, "loss": 0.2679, "step": 905 }, { "epoch": 2.813664596273292, "grad_norm": 0.1978927145961964, "learning_rate": 3.4522439585730726e-06, "loss": 0.2474, "step": 906 }, { "epoch": 2.816770186335404, "grad_norm": 0.18672700788585406, "learning_rate": 3.3947065592635213e-06, "loss": 0.2468, "step": 907 }, { "epoch": 2.8198757763975157, "grad_norm": 0.1856966835076563, "learning_rate": 3.33716915995397e-06, "loss": 0.2424, "step": 908 }, { "epoch": 2.8229813664596275, "grad_norm": 0.19224475733121915, "learning_rate": 3.279631760644419e-06, "loss": 0.2477, "step": 909 }, { "epoch": 2.8260869565217392, "grad_norm": 0.20470161040078505, "learning_rate": 3.222094361334868e-06, "loss": 0.2706, "step": 910 }, { "epoch": 2.829192546583851, "grad_norm": 0.19429220598035837, "learning_rate": 3.1645569620253167e-06, "loss": 0.2477, "step": 911 }, { "epoch": 2.8322981366459627, "grad_norm": 0.1894109295691752, "learning_rate": 3.1070195627157654e-06, "loss": 0.2528, "step": 912 }, { "epoch": 2.8354037267080745, "grad_norm": 0.18097305550473375, "learning_rate": 3.049482163406214e-06, "loss": 0.2559, "step": 913 }, { "epoch": 2.8385093167701863, "grad_norm": 0.19783393465985816, "learning_rate": 2.991944764096663e-06, "loss": 0.2594, "step": 914 }, { "epoch": 2.841614906832298, "grad_norm": 0.20897012225810746, "learning_rate": 2.9344073647871116e-06, "loss": 0.2653, "step": 915 }, { "epoch": 2.8447204968944098, "grad_norm": 0.1896928698309342, "learning_rate": 2.8768699654775603e-06, "loss": 0.2477, "step": 916 }, { "epoch": 2.8478260869565215, "grad_norm": 0.1784496457827597, "learning_rate": 2.8193325661680094e-06, "loss": 0.244, "step": 917 }, { "epoch": 2.8509316770186337, "grad_norm": 0.18587478842335634, "learning_rate": 2.761795166858458e-06, "loss": 0.25, "step": 918 }, { "epoch": 2.8540372670807455, "grad_norm": 0.20084898371613977, "learning_rate": 2.704257767548907e-06, "loss": 0.2577, "step": 919 }, { "epoch": 2.857142857142857, "grad_norm": 0.17885954860642703, "learning_rate": 2.6467203682393556e-06, "loss": 0.2407, "step": 920 }, { "epoch": 2.860248447204969, "grad_norm": 0.18561208551570504, "learning_rate": 2.5891829689298043e-06, "loss": 0.2659, "step": 921 }, { "epoch": 2.8633540372670807, "grad_norm": 0.21932109217206247, "learning_rate": 2.531645569620253e-06, "loss": 0.2403, "step": 922 }, { "epoch": 2.8664596273291925, "grad_norm": 0.20030388225206663, "learning_rate": 2.474108170310702e-06, "loss": 0.2544, "step": 923 }, { "epoch": 2.869565217391304, "grad_norm": 0.20878429975095714, "learning_rate": 2.4165707710011505e-06, "loss": 0.2497, "step": 924 }, { "epoch": 2.8726708074534164, "grad_norm": 0.22186219053087963, "learning_rate": 2.3590333716915997e-06, "loss": 0.2672, "step": 925 }, { "epoch": 2.875776397515528, "grad_norm": 0.18672043459559956, "learning_rate": 2.3014959723820484e-06, "loss": 0.2485, "step": 926 }, { "epoch": 2.87888198757764, "grad_norm": 0.18051985217560826, "learning_rate": 2.243958573072497e-06, "loss": 0.2479, "step": 927 }, { "epoch": 2.8819875776397517, "grad_norm": 0.20846631011511568, "learning_rate": 2.186421173762946e-06, "loss": 0.2632, "step": 928 }, { "epoch": 2.8850931677018634, "grad_norm": 0.17696337319445454, "learning_rate": 2.1288837744533946e-06, "loss": 0.2288, "step": 929 }, { "epoch": 2.888198757763975, "grad_norm": 0.18293271617504872, "learning_rate": 2.0713463751438433e-06, "loss": 0.2495, "step": 930 }, { "epoch": 2.891304347826087, "grad_norm": 0.17853543153947618, "learning_rate": 2.0138089758342925e-06, "loss": 0.2468, "step": 931 }, { "epoch": 2.8944099378881987, "grad_norm": 0.18420177129422804, "learning_rate": 1.956271576524741e-06, "loss": 0.2538, "step": 932 }, { "epoch": 2.8975155279503104, "grad_norm": 0.18170230261287915, "learning_rate": 1.8987341772151901e-06, "loss": 0.2589, "step": 933 }, { "epoch": 2.900621118012422, "grad_norm": 0.18685594957937918, "learning_rate": 1.8411967779056388e-06, "loss": 0.2442, "step": 934 }, { "epoch": 2.903726708074534, "grad_norm": 0.18690296703530773, "learning_rate": 1.7836593785960876e-06, "loss": 0.2451, "step": 935 }, { "epoch": 2.906832298136646, "grad_norm": 0.20799939121665842, "learning_rate": 1.7261219792865363e-06, "loss": 0.2649, "step": 936 }, { "epoch": 2.909937888198758, "grad_norm": 0.18563245527227562, "learning_rate": 1.668584579976985e-06, "loss": 0.2473, "step": 937 }, { "epoch": 2.9130434782608696, "grad_norm": 0.18286501772853814, "learning_rate": 1.611047180667434e-06, "loss": 0.2486, "step": 938 }, { "epoch": 2.9161490683229814, "grad_norm": 0.17534488578059473, "learning_rate": 1.5535097813578827e-06, "loss": 0.259, "step": 939 }, { "epoch": 2.919254658385093, "grad_norm": 0.19817242903037158, "learning_rate": 1.4959723820483314e-06, "loss": 0.2428, "step": 940 }, { "epoch": 2.922360248447205, "grad_norm": 0.18335244034678858, "learning_rate": 1.4384349827387801e-06, "loss": 0.252, "step": 941 }, { "epoch": 2.9254658385093166, "grad_norm": 0.17672927011117798, "learning_rate": 1.380897583429229e-06, "loss": 0.239, "step": 942 }, { "epoch": 2.928571428571429, "grad_norm": 0.19756419475586987, "learning_rate": 1.3233601841196778e-06, "loss": 0.2481, "step": 943 }, { "epoch": 2.9316770186335406, "grad_norm": 0.18227787024732953, "learning_rate": 1.2658227848101265e-06, "loss": 0.2503, "step": 944 }, { "epoch": 2.9347826086956523, "grad_norm": 0.17546530423346965, "learning_rate": 1.2082853855005753e-06, "loss": 0.2435, "step": 945 }, { "epoch": 2.937888198757764, "grad_norm": 0.17977719939100784, "learning_rate": 1.1507479861910242e-06, "loss": 0.2462, "step": 946 }, { "epoch": 2.940993788819876, "grad_norm": 0.182411120857819, "learning_rate": 1.093210586881473e-06, "loss": 0.2508, "step": 947 }, { "epoch": 2.9440993788819876, "grad_norm": 0.19191429782536917, "learning_rate": 1.0356731875719217e-06, "loss": 0.2419, "step": 948 }, { "epoch": 2.9472049689440993, "grad_norm": 0.17465750941257832, "learning_rate": 9.781357882623706e-07, "loss": 0.2394, "step": 949 }, { "epoch": 2.950310559006211, "grad_norm": 0.17956692661649218, "learning_rate": 9.205983889528194e-07, "loss": 0.2478, "step": 950 }, { "epoch": 2.953416149068323, "grad_norm": 0.18930304144220808, "learning_rate": 8.630609896432681e-07, "loss": 0.2371, "step": 951 }, { "epoch": 2.9565217391304346, "grad_norm": 0.19245986180447752, "learning_rate": 8.05523590333717e-07, "loss": 0.2324, "step": 952 }, { "epoch": 2.9596273291925463, "grad_norm": 0.18874027199979995, "learning_rate": 7.479861910241657e-07, "loss": 0.2482, "step": 953 }, { "epoch": 2.9627329192546585, "grad_norm": 0.18606185656754726, "learning_rate": 6.904487917146145e-07, "loss": 0.2602, "step": 954 }, { "epoch": 2.9658385093167703, "grad_norm": 0.18622516429740096, "learning_rate": 6.329113924050633e-07, "loss": 0.2514, "step": 955 }, { "epoch": 2.968944099378882, "grad_norm": 0.1910726758431884, "learning_rate": 5.753739930955121e-07, "loss": 0.2526, "step": 956 }, { "epoch": 2.972049689440994, "grad_norm": 0.18715451349236073, "learning_rate": 5.178365937859608e-07, "loss": 0.2512, "step": 957 }, { "epoch": 2.9751552795031055, "grad_norm": 0.18279876224123887, "learning_rate": 4.602991944764097e-07, "loss": 0.2624, "step": 958 }, { "epoch": 2.9782608695652173, "grad_norm": 0.18112311672532813, "learning_rate": 4.027617951668585e-07, "loss": 0.261, "step": 959 }, { "epoch": 2.981366459627329, "grad_norm": 0.17566965768374485, "learning_rate": 3.4522439585730727e-07, "loss": 0.2437, "step": 960 }, { "epoch": 2.9844720496894412, "grad_norm": 0.19721806486553123, "learning_rate": 2.8768699654775605e-07, "loss": 0.2594, "step": 961 }, { "epoch": 2.987577639751553, "grad_norm": 0.19840508059977566, "learning_rate": 2.3014959723820486e-07, "loss": 0.2415, "step": 962 }, { "epoch": 2.9906832298136647, "grad_norm": 0.20273843178894588, "learning_rate": 1.7261219792865363e-07, "loss": 0.2523, "step": 963 }, { "epoch": 2.9937888198757765, "grad_norm": 0.19347512173901257, "learning_rate": 1.1507479861910243e-07, "loss": 0.256, "step": 964 }, { "epoch": 2.9968944099378882, "grad_norm": 0.17382060962656506, "learning_rate": 5.7537399309551214e-08, "loss": 0.2351, "step": 965 }, { "epoch": 3.0, "grad_norm": 0.17740605884855634, "learning_rate": 0.0, "loss": 0.2395, "step": 966 }, { "epoch": 3.0, "step": 966, "total_flos": 8.211023406049526e+17, "train_loss": 0.4564525331862225, "train_runtime": 83597.7389, "train_samples_per_second": 0.185, "train_steps_per_second": 0.012 } ], "logging_steps": 1, "max_steps": 966, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.211023406049526e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }