diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6804 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 966, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.003105590062111801, + "grad_norm": 60.46764647043296, + "learning_rate": 5.154639175257732e-07, + "loss": 11.059, + "step": 1 + }, + { + "epoch": 0.006211180124223602, + "grad_norm": 60.76414526284177, + "learning_rate": 1.0309278350515464e-06, + "loss": 11.012, + "step": 2 + }, + { + "epoch": 0.009316770186335404, + "grad_norm": 59.20725802433676, + "learning_rate": 1.5463917525773197e-06, + "loss": 11.1319, + "step": 3 + }, + { + "epoch": 0.012422360248447204, + "grad_norm": 60.61459028364047, + "learning_rate": 2.061855670103093e-06, + "loss": 11.03, + "step": 4 + }, + { + "epoch": 0.015527950310559006, + "grad_norm": 64.67058884756766, + "learning_rate": 2.577319587628866e-06, + "loss": 10.8306, + "step": 5 + }, + { + "epoch": 0.018633540372670808, + "grad_norm": 70.79205232466896, + "learning_rate": 3.0927835051546395e-06, + "loss": 10.6598, + "step": 6 + }, + { + "epoch": 0.021739130434782608, + "grad_norm": 98.53755227750479, + "learning_rate": 3.608247422680412e-06, + "loss": 9.4929, + "step": 7 + }, + { + "epoch": 0.024844720496894408, + "grad_norm": 111.40861108613349, + "learning_rate": 4.123711340206186e-06, + "loss": 9.0865, + "step": 8 + }, + { + "epoch": 0.027950310559006212, + "grad_norm": 127.60092525337744, + "learning_rate": 4.639175257731959e-06, + "loss": 8.316, + "step": 9 + }, + { + "epoch": 0.031055900621118012, + "grad_norm": 57.38601512479266, + "learning_rate": 5.154639175257732e-06, + "loss": 3.5203, + "step": 10 + }, + { + "epoch": 0.034161490683229816, + "grad_norm": 40.087017160526905, + "learning_rate": 5.670103092783505e-06, + "loss": 2.5941, + "step": 11 + }, + { + "epoch": 0.037267080745341616, + "grad_norm": 36.371937459465364, + "learning_rate": 6.185567010309279e-06, + "loss": 2.3632, + "step": 12 + }, + { + "epoch": 0.040372670807453416, + "grad_norm": 5.902085049785153, + "learning_rate": 6.701030927835052e-06, + "loss": 1.3097, + "step": 13 + }, + { + "epoch": 0.043478260869565216, + "grad_norm": 4.395696518971964, + "learning_rate": 7.216494845360824e-06, + "loss": 1.2339, + "step": 14 + }, + { + "epoch": 0.046583850931677016, + "grad_norm": 3.1604888794138524, + "learning_rate": 7.731958762886599e-06, + "loss": 1.1463, + "step": 15 + }, + { + "epoch": 0.049689440993788817, + "grad_norm": 2.399444834184007, + "learning_rate": 8.247422680412371e-06, + "loss": 1.0723, + "step": 16 + }, + { + "epoch": 0.052795031055900624, + "grad_norm": 1.5487302596438641, + "learning_rate": 8.762886597938144e-06, + "loss": 0.9467, + "step": 17 + }, + { + "epoch": 0.055900621118012424, + "grad_norm": 80.99605354011946, + "learning_rate": 9.278350515463918e-06, + "loss": 0.9669, + "step": 18 + }, + { + "epoch": 0.059006211180124224, + "grad_norm": 35.062984197987575, + "learning_rate": 9.793814432989691e-06, + "loss": 0.8903, + "step": 19 + }, + { + "epoch": 0.062111801242236024, + "grad_norm": 1.7039403556284178, + "learning_rate": 1.0309278350515464e-05, + "loss": 0.8611, + "step": 20 + }, + { + "epoch": 0.06521739130434782, + "grad_norm": 1.1286556630596418, + "learning_rate": 1.0824742268041238e-05, + "loss": 0.7956, + "step": 21 + }, + { + "epoch": 0.06832298136645963, + 
"grad_norm": 0.8885421752095347, + "learning_rate": 1.134020618556701e-05, + "loss": 0.7944, + "step": 22 + }, + { + "epoch": 0.07142857142857142, + "grad_norm": 0.7771123311111944, + "learning_rate": 1.1855670103092783e-05, + "loss": 0.7888, + "step": 23 + }, + { + "epoch": 0.07453416149068323, + "grad_norm": 0.8290301807562498, + "learning_rate": 1.2371134020618558e-05, + "loss": 0.7524, + "step": 24 + }, + { + "epoch": 0.07763975155279502, + "grad_norm": 0.9178800987434453, + "learning_rate": 1.2886597938144329e-05, + "loss": 0.7276, + "step": 25 + }, + { + "epoch": 0.08074534161490683, + "grad_norm": 0.7280169831391284, + "learning_rate": 1.3402061855670103e-05, + "loss": 0.7049, + "step": 26 + }, + { + "epoch": 0.08385093167701864, + "grad_norm": 0.590489381671068, + "learning_rate": 1.3917525773195878e-05, + "loss": 0.6846, + "step": 27 + }, + { + "epoch": 0.08695652173913043, + "grad_norm": 0.6956647472162396, + "learning_rate": 1.4432989690721649e-05, + "loss": 0.6518, + "step": 28 + }, + { + "epoch": 0.09006211180124224, + "grad_norm": 0.7233291715436561, + "learning_rate": 1.4948453608247423e-05, + "loss": 0.6575, + "step": 29 + }, + { + "epoch": 0.09316770186335403, + "grad_norm": 0.6557463930133224, + "learning_rate": 1.5463917525773197e-05, + "loss": 0.6648, + "step": 30 + }, + { + "epoch": 0.09627329192546584, + "grad_norm": 0.5940038812473861, + "learning_rate": 1.5979381443298968e-05, + "loss": 0.6414, + "step": 31 + }, + { + "epoch": 0.09937888198757763, + "grad_norm": 0.514015705745489, + "learning_rate": 1.6494845360824743e-05, + "loss": 0.6178, + "step": 32 + }, + { + "epoch": 0.10248447204968944, + "grad_norm": 0.5977361497140969, + "learning_rate": 1.7010309278350517e-05, + "loss": 0.6216, + "step": 33 + }, + { + "epoch": 0.10559006211180125, + "grad_norm": 0.5377773183845758, + "learning_rate": 1.7525773195876288e-05, + "loss": 0.6195, + "step": 34 + }, + { + "epoch": 0.10869565217391304, + "grad_norm": 0.40152764208172104, + "learning_rate": 1.8041237113402062e-05, + "loss": 0.5758, + "step": 35 + }, + { + "epoch": 0.11180124223602485, + "grad_norm": 0.40244189444549017, + "learning_rate": 1.8556701030927837e-05, + "loss": 0.6178, + "step": 36 + }, + { + "epoch": 0.11490683229813664, + "grad_norm": 0.49886656483811526, + "learning_rate": 1.9072164948453608e-05, + "loss": 0.6062, + "step": 37 + }, + { + "epoch": 0.11801242236024845, + "grad_norm": 0.43178714425173426, + "learning_rate": 1.9587628865979382e-05, + "loss": 0.5929, + "step": 38 + }, + { + "epoch": 0.12111801242236025, + "grad_norm": 0.37953785852942284, + "learning_rate": 2.0103092783505157e-05, + "loss": 0.57, + "step": 39 + }, + { + "epoch": 0.12422360248447205, + "grad_norm": 0.3712229743609745, + "learning_rate": 2.0618556701030927e-05, + "loss": 0.5812, + "step": 40 + }, + { + "epoch": 0.12732919254658384, + "grad_norm": 0.38350882873215847, + "learning_rate": 2.1134020618556702e-05, + "loss": 0.5714, + "step": 41 + }, + { + "epoch": 0.13043478260869565, + "grad_norm": 0.4036659557430701, + "learning_rate": 2.1649484536082476e-05, + "loss": 0.5813, + "step": 42 + }, + { + "epoch": 0.13354037267080746, + "grad_norm": 0.33097703493186653, + "learning_rate": 2.2164948453608247e-05, + "loss": 0.5537, + "step": 43 + }, + { + "epoch": 0.13664596273291926, + "grad_norm": 0.339069211939581, + "learning_rate": 2.268041237113402e-05, + "loss": 0.57, + "step": 44 + }, + { + "epoch": 0.13975155279503104, + "grad_norm": 0.34080115423530455, + "learning_rate": 2.3195876288659796e-05, + "loss": 0.5434, + "step": 
45 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 0.3309319915944845, + "learning_rate": 2.3711340206185567e-05, + "loss": 0.5436, + "step": 46 + }, + { + "epoch": 0.14596273291925466, + "grad_norm": 0.36258498646852527, + "learning_rate": 2.422680412371134e-05, + "loss": 0.5372, + "step": 47 + }, + { + "epoch": 0.14906832298136646, + "grad_norm": 0.3289309195150263, + "learning_rate": 2.4742268041237116e-05, + "loss": 0.5519, + "step": 48 + }, + { + "epoch": 0.15217391304347827, + "grad_norm": 0.29200888913110107, + "learning_rate": 2.5257731958762887e-05, + "loss": 0.5269, + "step": 49 + }, + { + "epoch": 0.15527950310559005, + "grad_norm": 0.2913726775318078, + "learning_rate": 2.5773195876288658e-05, + "loss": 0.5398, + "step": 50 + }, + { + "epoch": 0.15838509316770186, + "grad_norm": 0.36183923103400334, + "learning_rate": 2.6288659793814435e-05, + "loss": 0.5313, + "step": 51 + }, + { + "epoch": 0.16149068322981366, + "grad_norm": 0.289832432081365, + "learning_rate": 2.6804123711340206e-05, + "loss": 0.5294, + "step": 52 + }, + { + "epoch": 0.16459627329192547, + "grad_norm": 0.28159321988499836, + "learning_rate": 2.7319587628865977e-05, + "loss": 0.5102, + "step": 53 + }, + { + "epoch": 0.16770186335403728, + "grad_norm": 0.33289230730425107, + "learning_rate": 2.7835051546391755e-05, + "loss": 0.5325, + "step": 54 + }, + { + "epoch": 0.17080745341614906, + "grad_norm": 0.2711500030362234, + "learning_rate": 2.8350515463917526e-05, + "loss": 0.5203, + "step": 55 + }, + { + "epoch": 0.17391304347826086, + "grad_norm": 0.2675188946251961, + "learning_rate": 2.8865979381443297e-05, + "loss": 0.5224, + "step": 56 + }, + { + "epoch": 0.17701863354037267, + "grad_norm": 0.26579895922328955, + "learning_rate": 2.9381443298969075e-05, + "loss": 0.5294, + "step": 57 + }, + { + "epoch": 0.18012422360248448, + "grad_norm": 0.2489797381353846, + "learning_rate": 2.9896907216494846e-05, + "loss": 0.5111, + "step": 58 + }, + { + "epoch": 0.18322981366459629, + "grad_norm": 0.26304984714934, + "learning_rate": 3.0412371134020617e-05, + "loss": 0.5063, + "step": 59 + }, + { + "epoch": 0.18633540372670807, + "grad_norm": 0.29536218486713367, + "learning_rate": 3.0927835051546395e-05, + "loss": 0.5278, + "step": 60 + }, + { + "epoch": 0.18944099378881987, + "grad_norm": 0.2629951129122119, + "learning_rate": 3.1443298969072166e-05, + "loss": 0.5066, + "step": 61 + }, + { + "epoch": 0.19254658385093168, + "grad_norm": 0.324166573780215, + "learning_rate": 3.1958762886597937e-05, + "loss": 0.5054, + "step": 62 + }, + { + "epoch": 0.1956521739130435, + "grad_norm": 0.2729720585938641, + "learning_rate": 3.2474226804123714e-05, + "loss": 0.5142, + "step": 63 + }, + { + "epoch": 0.19875776397515527, + "grad_norm": 0.27422169347085695, + "learning_rate": 3.2989690721649485e-05, + "loss": 0.5119, + "step": 64 + }, + { + "epoch": 0.20186335403726707, + "grad_norm": 0.26064941279629095, + "learning_rate": 3.3505154639175256e-05, + "loss": 0.5037, + "step": 65 + }, + { + "epoch": 0.20496894409937888, + "grad_norm": 0.2589323970095713, + "learning_rate": 3.4020618556701034e-05, + "loss": 0.5181, + "step": 66 + }, + { + "epoch": 0.2080745341614907, + "grad_norm": 0.2795495681392583, + "learning_rate": 3.4536082474226805e-05, + "loss": 0.5006, + "step": 67 + }, + { + "epoch": 0.2111801242236025, + "grad_norm": 0.2785747261533415, + "learning_rate": 3.5051546391752576e-05, + "loss": 0.483, + "step": 68 + }, + { + "epoch": 0.21428571428571427, + "grad_norm": 0.25302395243466885, + "learning_rate": 
3.5567010309278354e-05, + "loss": 0.4883, + "step": 69 + }, + { + "epoch": 0.21739130434782608, + "grad_norm": 0.2752883227640764, + "learning_rate": 3.6082474226804125e-05, + "loss": 0.5094, + "step": 70 + }, + { + "epoch": 0.2204968944099379, + "grad_norm": 0.3024166121222451, + "learning_rate": 3.6597938144329896e-05, + "loss": 0.4881, + "step": 71 + }, + { + "epoch": 0.2236024844720497, + "grad_norm": 0.3097948500444575, + "learning_rate": 3.7113402061855674e-05, + "loss": 0.4839, + "step": 72 + }, + { + "epoch": 0.2267080745341615, + "grad_norm": 0.2876918544530116, + "learning_rate": 3.7628865979381445e-05, + "loss": 0.5144, + "step": 73 + }, + { + "epoch": 0.22981366459627328, + "grad_norm": 0.3416229447277982, + "learning_rate": 3.8144329896907216e-05, + "loss": 0.4961, + "step": 74 + }, + { + "epoch": 0.2329192546583851, + "grad_norm": 0.3199113220311117, + "learning_rate": 3.865979381443299e-05, + "loss": 0.473, + "step": 75 + }, + { + "epoch": 0.2360248447204969, + "grad_norm": 0.3005248837372916, + "learning_rate": 3.9175257731958764e-05, + "loss": 0.4869, + "step": 76 + }, + { + "epoch": 0.2391304347826087, + "grad_norm": 0.3020219962118337, + "learning_rate": 3.9690721649484535e-05, + "loss": 0.5047, + "step": 77 + }, + { + "epoch": 0.2422360248447205, + "grad_norm": 0.29698825337519646, + "learning_rate": 4.020618556701031e-05, + "loss": 0.5022, + "step": 78 + }, + { + "epoch": 0.2453416149068323, + "grad_norm": 0.3021333930392965, + "learning_rate": 4.0721649484536084e-05, + "loss": 0.4866, + "step": 79 + }, + { + "epoch": 0.2484472049689441, + "grad_norm": 0.29250713103592757, + "learning_rate": 4.1237113402061855e-05, + "loss": 0.4896, + "step": 80 + }, + { + "epoch": 0.2515527950310559, + "grad_norm": 0.27724800469538824, + "learning_rate": 4.175257731958763e-05, + "loss": 0.4836, + "step": 81 + }, + { + "epoch": 0.2546583850931677, + "grad_norm": 0.3272751041798097, + "learning_rate": 4.2268041237113404e-05, + "loss": 0.5079, + "step": 82 + }, + { + "epoch": 0.2577639751552795, + "grad_norm": 0.2875779003405876, + "learning_rate": 4.2783505154639175e-05, + "loss": 0.4822, + "step": 83 + }, + { + "epoch": 0.2608695652173913, + "grad_norm": 0.351548025457743, + "learning_rate": 4.329896907216495e-05, + "loss": 0.4719, + "step": 84 + }, + { + "epoch": 0.2639751552795031, + "grad_norm": 0.3104421632805538, + "learning_rate": 4.3814432989690723e-05, + "loss": 0.4985, + "step": 85 + }, + { + "epoch": 0.2670807453416149, + "grad_norm": 0.29340838316836443, + "learning_rate": 4.4329896907216494e-05, + "loss": 0.4506, + "step": 86 + }, + { + "epoch": 0.2701863354037267, + "grad_norm": 0.31888072280932184, + "learning_rate": 4.484536082474227e-05, + "loss": 0.4718, + "step": 87 + }, + { + "epoch": 0.2732919254658385, + "grad_norm": 0.2881905604568596, + "learning_rate": 4.536082474226804e-05, + "loss": 0.4718, + "step": 88 + }, + { + "epoch": 0.27639751552795033, + "grad_norm": 0.382391969622348, + "learning_rate": 4.5876288659793814e-05, + "loss": 0.489, + "step": 89 + }, + { + "epoch": 0.2795031055900621, + "grad_norm": 0.28677795566141734, + "learning_rate": 4.639175257731959e-05, + "loss": 0.4625, + "step": 90 + }, + { + "epoch": 0.2826086956521739, + "grad_norm": 0.44192895579293406, + "learning_rate": 4.690721649484536e-05, + "loss": 0.4901, + "step": 91 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.36788344235249887, + "learning_rate": 4.7422680412371134e-05, + "loss": 0.4682, + "step": 92 + }, + { + "epoch": 0.2888198757763975, + "grad_norm": 
0.5143785581301379, + "learning_rate": 4.793814432989691e-05, + "loss": 0.4748, + "step": 93 + }, + { + "epoch": 0.2919254658385093, + "grad_norm": 0.3714484820764116, + "learning_rate": 4.845360824742268e-05, + "loss": 0.4733, + "step": 94 + }, + { + "epoch": 0.2950310559006211, + "grad_norm": 0.4411279949864707, + "learning_rate": 4.8969072164948454e-05, + "loss": 0.4719, + "step": 95 + }, + { + "epoch": 0.2981366459627329, + "grad_norm": 0.4095900221196949, + "learning_rate": 4.948453608247423e-05, + "loss": 0.4679, + "step": 96 + }, + { + "epoch": 0.30124223602484473, + "grad_norm": 0.3876387401039132, + "learning_rate": 5e-05, + "loss": 0.4727, + "step": 97 + }, + { + "epoch": 0.30434782608695654, + "grad_norm": 0.35671507475673714, + "learning_rate": 4.994246260069045e-05, + "loss": 0.4582, + "step": 98 + }, + { + "epoch": 0.30745341614906835, + "grad_norm": 0.40457113215141677, + "learning_rate": 4.98849252013809e-05, + "loss": 0.4817, + "step": 99 + }, + { + "epoch": 0.3105590062111801, + "grad_norm": 0.40014058749708475, + "learning_rate": 4.982738780207135e-05, + "loss": 0.4486, + "step": 100 + }, + { + "epoch": 0.3136645962732919, + "grad_norm": 0.4870121731575367, + "learning_rate": 4.97698504027618e-05, + "loss": 0.4663, + "step": 101 + }, + { + "epoch": 0.3167701863354037, + "grad_norm": 0.4340851079572886, + "learning_rate": 4.9712313003452246e-05, + "loss": 0.4484, + "step": 102 + }, + { + "epoch": 0.3198757763975155, + "grad_norm": 0.35686684080021636, + "learning_rate": 4.9654775604142695e-05, + "loss": 0.467, + "step": 103 + }, + { + "epoch": 0.32298136645962733, + "grad_norm": 0.4494359291517841, + "learning_rate": 4.9597238204833143e-05, + "loss": 0.4694, + "step": 104 + }, + { + "epoch": 0.32608695652173914, + "grad_norm": 0.4372407930618466, + "learning_rate": 4.953970080552359e-05, + "loss": 0.4648, + "step": 105 + }, + { + "epoch": 0.32919254658385094, + "grad_norm": 0.34466736034003903, + "learning_rate": 4.948216340621404e-05, + "loss": 0.4444, + "step": 106 + }, + { + "epoch": 0.33229813664596275, + "grad_norm": 0.4001800803927703, + "learning_rate": 4.942462600690449e-05, + "loss": 0.464, + "step": 107 + }, + { + "epoch": 0.33540372670807456, + "grad_norm": 0.3577590335432523, + "learning_rate": 4.936708860759494e-05, + "loss": 0.4647, + "step": 108 + }, + { + "epoch": 0.3385093167701863, + "grad_norm": 0.3827072494556767, + "learning_rate": 4.930955120828539e-05, + "loss": 0.4452, + "step": 109 + }, + { + "epoch": 0.3416149068322981, + "grad_norm": 0.40554119841147346, + "learning_rate": 4.9252013808975836e-05, + "loss": 0.457, + "step": 110 + }, + { + "epoch": 0.3447204968944099, + "grad_norm": 0.3980370218198526, + "learning_rate": 4.9194476409666285e-05, + "loss": 0.4566, + "step": 111 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 0.38595447982147235, + "learning_rate": 4.913693901035673e-05, + "loss": 0.4436, + "step": 112 + }, + { + "epoch": 0.35093167701863354, + "grad_norm": 0.3335566121887473, + "learning_rate": 4.907940161104718e-05, + "loss": 0.4525, + "step": 113 + }, + { + "epoch": 0.35403726708074534, + "grad_norm": 0.44048069823182057, + "learning_rate": 4.902186421173763e-05, + "loss": 0.4775, + "step": 114 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 0.3511836624614759, + "learning_rate": 4.896432681242808e-05, + "loss": 0.4529, + "step": 115 + }, + { + "epoch": 0.36024844720496896, + "grad_norm": 0.40512550088435406, + "learning_rate": 4.890678941311853e-05, + "loss": 0.4856, + "step": 116 + }, + { + "epoch": 
0.36335403726708076, + "grad_norm": 0.4709820706303788, + "learning_rate": 4.884925201380898e-05, + "loss": 0.4613, + "step": 117 + }, + { + "epoch": 0.36645962732919257, + "grad_norm": 0.3163807878418199, + "learning_rate": 4.8791714614499426e-05, + "loss": 0.476, + "step": 118 + }, + { + "epoch": 0.3695652173913043, + "grad_norm": 0.421853544537181, + "learning_rate": 4.8734177215189874e-05, + "loss": 0.4675, + "step": 119 + }, + { + "epoch": 0.37267080745341613, + "grad_norm": 0.37140388109626665, + "learning_rate": 4.867663981588032e-05, + "loss": 0.452, + "step": 120 + }, + { + "epoch": 0.37577639751552794, + "grad_norm": 0.42352163355515543, + "learning_rate": 4.861910241657077e-05, + "loss": 0.4468, + "step": 121 + }, + { + "epoch": 0.37888198757763975, + "grad_norm": 0.4144419361914004, + "learning_rate": 4.856156501726122e-05, + "loss": 0.4526, + "step": 122 + }, + { + "epoch": 0.38198757763975155, + "grad_norm": 0.40675120816526916, + "learning_rate": 4.850402761795167e-05, + "loss": 0.4611, + "step": 123 + }, + { + "epoch": 0.38509316770186336, + "grad_norm": 0.5826147735025056, + "learning_rate": 4.844649021864212e-05, + "loss": 0.4803, + "step": 124 + }, + { + "epoch": 0.38819875776397517, + "grad_norm": 0.3282657199624206, + "learning_rate": 4.838895281933257e-05, + "loss": 0.4552, + "step": 125 + }, + { + "epoch": 0.391304347826087, + "grad_norm": 0.5159501988757971, + "learning_rate": 4.8331415420023015e-05, + "loss": 0.4794, + "step": 126 + }, + { + "epoch": 0.3944099378881988, + "grad_norm": 0.3620503849683116, + "learning_rate": 4.8273878020713464e-05, + "loss": 0.4631, + "step": 127 + }, + { + "epoch": 0.39751552795031053, + "grad_norm": 0.4221189340341717, + "learning_rate": 4.821634062140391e-05, + "loss": 0.4696, + "step": 128 + }, + { + "epoch": 0.40062111801242234, + "grad_norm": 0.46423436394369083, + "learning_rate": 4.815880322209436e-05, + "loss": 0.4573, + "step": 129 + }, + { + "epoch": 0.40372670807453415, + "grad_norm": 0.4261777248289121, + "learning_rate": 4.810126582278481e-05, + "loss": 0.4608, + "step": 130 + }, + { + "epoch": 0.40683229813664595, + "grad_norm": 0.45519667338748365, + "learning_rate": 4.804372842347526e-05, + "loss": 0.4621, + "step": 131 + }, + { + "epoch": 0.40993788819875776, + "grad_norm": 0.4384463354130905, + "learning_rate": 4.798619102416571e-05, + "loss": 0.4656, + "step": 132 + }, + { + "epoch": 0.41304347826086957, + "grad_norm": 0.41199291319131776, + "learning_rate": 4.7928653624856157e-05, + "loss": 0.4535, + "step": 133 + }, + { + "epoch": 0.4161490683229814, + "grad_norm": 0.3655597225332361, + "learning_rate": 4.7871116225546605e-05, + "loss": 0.4501, + "step": 134 + }, + { + "epoch": 0.4192546583850932, + "grad_norm": 0.44932133556116877, + "learning_rate": 4.7813578826237054e-05, + "loss": 0.4767, + "step": 135 + }, + { + "epoch": 0.422360248447205, + "grad_norm": 0.3329354062585348, + "learning_rate": 4.77560414269275e-05, + "loss": 0.4455, + "step": 136 + }, + { + "epoch": 0.4254658385093168, + "grad_norm": 0.45152077511616723, + "learning_rate": 4.769850402761795e-05, + "loss": 0.4623, + "step": 137 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 0.3188549796798649, + "learning_rate": 4.76409666283084e-05, + "loss": 0.4304, + "step": 138 + }, + { + "epoch": 0.43167701863354035, + "grad_norm": 0.39747649807961544, + "learning_rate": 4.758342922899885e-05, + "loss": 0.4486, + "step": 139 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 0.2901963778324694, + "learning_rate": 4.75258918296893e-05, + 
"loss": 0.4485, + "step": 140 + }, + { + "epoch": 0.43788819875776397, + "grad_norm": 0.5357034478107343, + "learning_rate": 4.7468354430379746e-05, + "loss": 0.4773, + "step": 141 + }, + { + "epoch": 0.4409937888198758, + "grad_norm": 0.3892373232000147, + "learning_rate": 4.7410817031070195e-05, + "loss": 0.4408, + "step": 142 + }, + { + "epoch": 0.4440993788819876, + "grad_norm": 0.45033922342477917, + "learning_rate": 4.7353279631760644e-05, + "loss": 0.4598, + "step": 143 + }, + { + "epoch": 0.4472049689440994, + "grad_norm": 0.37908550777510663, + "learning_rate": 4.729574223245109e-05, + "loss": 0.4452, + "step": 144 + }, + { + "epoch": 0.4503105590062112, + "grad_norm": 0.4290373855109045, + "learning_rate": 4.723820483314154e-05, + "loss": 0.4536, + "step": 145 + }, + { + "epoch": 0.453416149068323, + "grad_norm": 0.35676947230487216, + "learning_rate": 4.718066743383199e-05, + "loss": 0.4648, + "step": 146 + }, + { + "epoch": 0.45652173913043476, + "grad_norm": 0.33636058827665144, + "learning_rate": 4.712313003452244e-05, + "loss": 0.444, + "step": 147 + }, + { + "epoch": 0.45962732919254656, + "grad_norm": 0.3823016634046083, + "learning_rate": 4.706559263521289e-05, + "loss": 0.4406, + "step": 148 + }, + { + "epoch": 0.46273291925465837, + "grad_norm": 0.3818789119419192, + "learning_rate": 4.700805523590334e-05, + "loss": 0.4488, + "step": 149 + }, + { + "epoch": 0.4658385093167702, + "grad_norm": 0.33345974040131937, + "learning_rate": 4.6950517836593785e-05, + "loss": 0.4647, + "step": 150 + }, + { + "epoch": 0.468944099378882, + "grad_norm": 0.47073824185480967, + "learning_rate": 4.689298043728424e-05, + "loss": 0.4534, + "step": 151 + }, + { + "epoch": 0.4720496894409938, + "grad_norm": 0.40070437909888434, + "learning_rate": 4.683544303797468e-05, + "loss": 0.4367, + "step": 152 + }, + { + "epoch": 0.4751552795031056, + "grad_norm": 0.407305468388989, + "learning_rate": 4.677790563866514e-05, + "loss": 0.4415, + "step": 153 + }, + { + "epoch": 0.4782608695652174, + "grad_norm": 0.4058611659098106, + "learning_rate": 4.672036823935558e-05, + "loss": 0.4576, + "step": 154 + }, + { + "epoch": 0.4813664596273292, + "grad_norm": 0.3967515788115339, + "learning_rate": 4.6662830840046035e-05, + "loss": 0.4524, + "step": 155 + }, + { + "epoch": 0.484472049689441, + "grad_norm": 0.4407590164610378, + "learning_rate": 4.660529344073648e-05, + "loss": 0.457, + "step": 156 + }, + { + "epoch": 0.48757763975155277, + "grad_norm": 0.43880737794315955, + "learning_rate": 4.654775604142693e-05, + "loss": 0.4365, + "step": 157 + }, + { + "epoch": 0.4906832298136646, + "grad_norm": 0.47864526006501984, + "learning_rate": 4.6490218642117375e-05, + "loss": 0.4479, + "step": 158 + }, + { + "epoch": 0.4937888198757764, + "grad_norm": 0.4692672779398985, + "learning_rate": 4.643268124280783e-05, + "loss": 0.4546, + "step": 159 + }, + { + "epoch": 0.4968944099378882, + "grad_norm": 0.4097305951007724, + "learning_rate": 4.637514384349827e-05, + "loss": 0.4355, + "step": 160 + }, + { + "epoch": 0.5, + "grad_norm": 0.43610759922666353, + "learning_rate": 4.631760644418873e-05, + "loss": 0.447, + "step": 161 + }, + { + "epoch": 0.5031055900621118, + "grad_norm": 0.2978982430601787, + "learning_rate": 4.626006904487917e-05, + "loss": 0.4524, + "step": 162 + }, + { + "epoch": 0.5062111801242236, + "grad_norm": 0.43653406806069966, + "learning_rate": 4.6202531645569625e-05, + "loss": 0.4171, + "step": 163 + }, + { + "epoch": 0.5093167701863354, + "grad_norm": 0.40670821189566986, + "learning_rate": 
4.614499424626007e-05, + "loss": 0.439, + "step": 164 + }, + { + "epoch": 0.5124223602484472, + "grad_norm": 0.33901355170318703, + "learning_rate": 4.608745684695052e-05, + "loss": 0.4461, + "step": 165 + }, + { + "epoch": 0.515527950310559, + "grad_norm": 0.43610331613751346, + "learning_rate": 4.6029919447640965e-05, + "loss": 0.4554, + "step": 166 + }, + { + "epoch": 0.5186335403726708, + "grad_norm": 0.3625661313466411, + "learning_rate": 4.597238204833142e-05, + "loss": 0.4554, + "step": 167 + }, + { + "epoch": 0.5217391304347826, + "grad_norm": 0.3394393399478139, + "learning_rate": 4.591484464902186e-05, + "loss": 0.4367, + "step": 168 + }, + { + "epoch": 0.5248447204968945, + "grad_norm": 0.3588563348596153, + "learning_rate": 4.585730724971232e-05, + "loss": 0.4461, + "step": 169 + }, + { + "epoch": 0.5279503105590062, + "grad_norm": 0.3802785353634964, + "learning_rate": 4.579976985040276e-05, + "loss": 0.4387, + "step": 170 + }, + { + "epoch": 0.531055900621118, + "grad_norm": 0.3869023722709017, + "learning_rate": 4.5742232451093215e-05, + "loss": 0.4528, + "step": 171 + }, + { + "epoch": 0.5341614906832298, + "grad_norm": 0.36676418356051843, + "learning_rate": 4.568469505178366e-05, + "loss": 0.4348, + "step": 172 + }, + { + "epoch": 0.5372670807453416, + "grad_norm": 0.46126816544453725, + "learning_rate": 4.562715765247411e-05, + "loss": 0.4231, + "step": 173 + }, + { + "epoch": 0.5403726708074534, + "grad_norm": 0.35343634631539705, + "learning_rate": 4.556962025316456e-05, + "loss": 0.4369, + "step": 174 + }, + { + "epoch": 0.5434782608695652, + "grad_norm": 0.4549103689048508, + "learning_rate": 4.551208285385501e-05, + "loss": 0.4387, + "step": 175 + }, + { + "epoch": 0.546583850931677, + "grad_norm": 0.4303714186336393, + "learning_rate": 4.545454545454546e-05, + "loss": 0.4546, + "step": 176 + }, + { + "epoch": 0.5496894409937888, + "grad_norm": 0.4531267139678119, + "learning_rate": 4.539700805523591e-05, + "loss": 0.4356, + "step": 177 + }, + { + "epoch": 0.5527950310559007, + "grad_norm": 0.42240540949166944, + "learning_rate": 4.5339470655926356e-05, + "loss": 0.4442, + "step": 178 + }, + { + "epoch": 0.5559006211180124, + "grad_norm": 0.3163983623110262, + "learning_rate": 4.5281933256616805e-05, + "loss": 0.4255, + "step": 179 + }, + { + "epoch": 0.5590062111801242, + "grad_norm": 0.37954620340652895, + "learning_rate": 4.5224395857307253e-05, + "loss": 0.4387, + "step": 180 + }, + { + "epoch": 0.562111801242236, + "grad_norm": 0.33565801845470367, + "learning_rate": 4.51668584579977e-05, + "loss": 0.4415, + "step": 181 + }, + { + "epoch": 0.5652173913043478, + "grad_norm": 0.3349864414277053, + "learning_rate": 4.510932105868815e-05, + "loss": 0.4082, + "step": 182 + }, + { + "epoch": 0.5683229813664596, + "grad_norm": 0.439294679014343, + "learning_rate": 4.50517836593786e-05, + "loss": 0.446, + "step": 183 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.28999671538444516, + "learning_rate": 4.499424626006905e-05, + "loss": 0.4103, + "step": 184 + }, + { + "epoch": 0.5745341614906833, + "grad_norm": 0.40660335920713986, + "learning_rate": 4.49367088607595e-05, + "loss": 0.444, + "step": 185 + }, + { + "epoch": 0.577639751552795, + "grad_norm": 0.3033161017839996, + "learning_rate": 4.4879171461449946e-05, + "loss": 0.435, + "step": 186 + }, + { + "epoch": 0.5807453416149069, + "grad_norm": 0.30568413065453626, + "learning_rate": 4.4821634062140395e-05, + "loss": 0.4237, + "step": 187 + }, + { + "epoch": 0.5838509316770186, + "grad_norm": 
0.32587134975274057, + "learning_rate": 4.476409666283084e-05, + "loss": 0.4331, + "step": 188 + }, + { + "epoch": 0.5869565217391305, + "grad_norm": 0.28290562376532075, + "learning_rate": 4.470655926352129e-05, + "loss": 0.4342, + "step": 189 + }, + { + "epoch": 0.5900621118012422, + "grad_norm": 0.3630490197737241, + "learning_rate": 4.464902186421174e-05, + "loss": 0.4344, + "step": 190 + }, + { + "epoch": 0.593167701863354, + "grad_norm": 0.3559890010930286, + "learning_rate": 4.459148446490219e-05, + "loss": 0.4498, + "step": 191 + }, + { + "epoch": 0.5962732919254659, + "grad_norm": 0.3499252907427838, + "learning_rate": 4.453394706559264e-05, + "loss": 0.4507, + "step": 192 + }, + { + "epoch": 0.5993788819875776, + "grad_norm": 0.33006303704048223, + "learning_rate": 4.447640966628309e-05, + "loss": 0.4281, + "step": 193 + }, + { + "epoch": 0.6024844720496895, + "grad_norm": 0.36984061156296816, + "learning_rate": 4.4418872266973536e-05, + "loss": 0.4518, + "step": 194 + }, + { + "epoch": 0.6055900621118012, + "grad_norm": 0.3439812296873207, + "learning_rate": 4.4361334867663984e-05, + "loss": 0.4448, + "step": 195 + }, + { + "epoch": 0.6086956521739131, + "grad_norm": 0.30685626808124417, + "learning_rate": 4.430379746835443e-05, + "loss": 0.4377, + "step": 196 + }, + { + "epoch": 0.6118012422360248, + "grad_norm": 0.3925848437000049, + "learning_rate": 4.424626006904488e-05, + "loss": 0.4396, + "step": 197 + }, + { + "epoch": 0.6149068322981367, + "grad_norm": 0.32639373809266464, + "learning_rate": 4.418872266973533e-05, + "loss": 0.4321, + "step": 198 + }, + { + "epoch": 0.6180124223602484, + "grad_norm": 0.376079541285074, + "learning_rate": 4.413118527042578e-05, + "loss": 0.4242, + "step": 199 + }, + { + "epoch": 0.6211180124223602, + "grad_norm": 0.3749608850464733, + "learning_rate": 4.407364787111623e-05, + "loss": 0.4259, + "step": 200 + }, + { + "epoch": 0.6242236024844721, + "grad_norm": 0.4461881134050382, + "learning_rate": 4.401611047180668e-05, + "loss": 0.4341, + "step": 201 + }, + { + "epoch": 0.6273291925465838, + "grad_norm": 0.4877320414028972, + "learning_rate": 4.3958573072497125e-05, + "loss": 0.4344, + "step": 202 + }, + { + "epoch": 0.6304347826086957, + "grad_norm": 0.4070659780535386, + "learning_rate": 4.3901035673187574e-05, + "loss": 0.4227, + "step": 203 + }, + { + "epoch": 0.6335403726708074, + "grad_norm": 0.4635439998393952, + "learning_rate": 4.384349827387802e-05, + "loss": 0.4355, + "step": 204 + }, + { + "epoch": 0.6366459627329193, + "grad_norm": 0.35952245913430025, + "learning_rate": 4.378596087456847e-05, + "loss": 0.423, + "step": 205 + }, + { + "epoch": 0.639751552795031, + "grad_norm": 0.520771866846795, + "learning_rate": 4.372842347525892e-05, + "loss": 0.4306, + "step": 206 + }, + { + "epoch": 0.6428571428571429, + "grad_norm": 0.3273697337468707, + "learning_rate": 4.367088607594937e-05, + "loss": 0.4324, + "step": 207 + }, + { + "epoch": 0.6459627329192547, + "grad_norm": 0.4813614761483608, + "learning_rate": 4.361334867663982e-05, + "loss": 0.4478, + "step": 208 + }, + { + "epoch": 0.6490683229813664, + "grad_norm": 0.3900984777507702, + "learning_rate": 4.3555811277330267e-05, + "loss": 0.4269, + "step": 209 + }, + { + "epoch": 0.6521739130434783, + "grad_norm": 0.3853092679143466, + "learning_rate": 4.3498273878020715e-05, + "loss": 0.4407, + "step": 210 + }, + { + "epoch": 0.65527950310559, + "grad_norm": 0.46225724309871613, + "learning_rate": 4.3440736478711164e-05, + "loss": 0.436, + "step": 211 + }, + { + "epoch": 
0.6583850931677019, + "grad_norm": 0.31651404685134377, + "learning_rate": 4.338319907940161e-05, + "loss": 0.4316, + "step": 212 + }, + { + "epoch": 0.6614906832298136, + "grad_norm": 0.44516432018668023, + "learning_rate": 4.332566168009206e-05, + "loss": 0.4426, + "step": 213 + }, + { + "epoch": 0.6645962732919255, + "grad_norm": 0.3462443744991128, + "learning_rate": 4.326812428078251e-05, + "loss": 0.4465, + "step": 214 + }, + { + "epoch": 0.6677018633540373, + "grad_norm": 0.4436257780311306, + "learning_rate": 4.321058688147296e-05, + "loss": 0.4241, + "step": 215 + }, + { + "epoch": 0.6708074534161491, + "grad_norm": 0.3788099950107418, + "learning_rate": 4.315304948216341e-05, + "loss": 0.4206, + "step": 216 + }, + { + "epoch": 0.6739130434782609, + "grad_norm": 0.3667132129478159, + "learning_rate": 4.3095512082853856e-05, + "loss": 0.4336, + "step": 217 + }, + { + "epoch": 0.6770186335403726, + "grad_norm": 0.43405694529571, + "learning_rate": 4.3037974683544305e-05, + "loss": 0.4285, + "step": 218 + }, + { + "epoch": 0.6801242236024845, + "grad_norm": 0.37501605794405696, + "learning_rate": 4.2980437284234754e-05, + "loss": 0.4354, + "step": 219 + }, + { + "epoch": 0.6832298136645962, + "grad_norm": 0.7491502232791192, + "learning_rate": 4.29228998849252e-05, + "loss": 0.4622, + "step": 220 + }, + { + "epoch": 0.6863354037267081, + "grad_norm": 0.34683109305557713, + "learning_rate": 4.286536248561565e-05, + "loss": 0.4349, + "step": 221 + }, + { + "epoch": 0.6894409937888198, + "grad_norm": 0.41649862939635707, + "learning_rate": 4.28078250863061e-05, + "loss": 0.4278, + "step": 222 + }, + { + "epoch": 0.6925465838509317, + "grad_norm": 0.33273645633734766, + "learning_rate": 4.275028768699655e-05, + "loss": 0.4241, + "step": 223 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 0.358638671370147, + "learning_rate": 4.2692750287687e-05, + "loss": 0.4294, + "step": 224 + }, + { + "epoch": 0.6987577639751553, + "grad_norm": 0.3505002399312612, + "learning_rate": 4.2635212888377446e-05, + "loss": 0.4339, + "step": 225 + }, + { + "epoch": 0.7018633540372671, + "grad_norm": 0.28967971081827765, + "learning_rate": 4.2577675489067895e-05, + "loss": 0.4433, + "step": 226 + }, + { + "epoch": 0.7049689440993789, + "grad_norm": 0.3792183124094411, + "learning_rate": 4.2520138089758344e-05, + "loss": 0.4263, + "step": 227 + }, + { + "epoch": 0.7080745341614907, + "grad_norm": 0.2915459102300122, + "learning_rate": 4.246260069044879e-05, + "loss": 0.4226, + "step": 228 + }, + { + "epoch": 0.7111801242236024, + "grad_norm": 0.357404227614541, + "learning_rate": 4.240506329113924e-05, + "loss": 0.4183, + "step": 229 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.33657909101352584, + "learning_rate": 4.234752589182969e-05, + "loss": 0.4383, + "step": 230 + }, + { + "epoch": 0.717391304347826, + "grad_norm": 0.28799404238315757, + "learning_rate": 4.228998849252014e-05, + "loss": 0.4059, + "step": 231 + }, + { + "epoch": 0.7204968944099379, + "grad_norm": 0.3559137710527895, + "learning_rate": 4.223245109321059e-05, + "loss": 0.431, + "step": 232 + }, + { + "epoch": 0.7236024844720497, + "grad_norm": 0.3571859472649835, + "learning_rate": 4.2174913693901036e-05, + "loss": 0.4365, + "step": 233 + }, + { + "epoch": 0.7267080745341615, + "grad_norm": 0.27866414620295615, + "learning_rate": 4.2117376294591485e-05, + "loss": 0.4164, + "step": 234 + }, + { + "epoch": 0.7298136645962733, + "grad_norm": 0.35761820704128017, + "learning_rate": 4.2059838895281933e-05, + "loss": 0.4155, + 
"step": 235 + }, + { + "epoch": 0.7329192546583851, + "grad_norm": 0.38239702778323204, + "learning_rate": 4.200230149597238e-05, + "loss": 0.4441, + "step": 236 + }, + { + "epoch": 0.7360248447204969, + "grad_norm": 0.37338686711282476, + "learning_rate": 4.194476409666283e-05, + "loss": 0.4287, + "step": 237 + }, + { + "epoch": 0.7391304347826086, + "grad_norm": 0.31078006795719737, + "learning_rate": 4.188722669735328e-05, + "loss": 0.4314, + "step": 238 + }, + { + "epoch": 0.7422360248447205, + "grad_norm": 0.42962957316409206, + "learning_rate": 4.182968929804373e-05, + "loss": 0.4258, + "step": 239 + }, + { + "epoch": 0.7453416149068323, + "grad_norm": 0.3531531884915285, + "learning_rate": 4.177215189873418e-05, + "loss": 0.4348, + "step": 240 + }, + { + "epoch": 0.7484472049689441, + "grad_norm": 0.4645354016036932, + "learning_rate": 4.1714614499424626e-05, + "loss": 0.4204, + "step": 241 + }, + { + "epoch": 0.7515527950310559, + "grad_norm": 1.09153721353785, + "learning_rate": 4.1657077100115075e-05, + "loss": 0.4386, + "step": 242 + }, + { + "epoch": 0.7546583850931677, + "grad_norm": 0.32971689202723414, + "learning_rate": 4.159953970080552e-05, + "loss": 0.4286, + "step": 243 + }, + { + "epoch": 0.7577639751552795, + "grad_norm": 0.47923594956031046, + "learning_rate": 4.154200230149597e-05, + "loss": 0.4355, + "step": 244 + }, + { + "epoch": 0.7608695652173914, + "grad_norm": 0.3499125189435591, + "learning_rate": 4.148446490218642e-05, + "loss": 0.4363, + "step": 245 + }, + { + "epoch": 0.7639751552795031, + "grad_norm": 0.3676637215847227, + "learning_rate": 4.142692750287687e-05, + "loss": 0.4351, + "step": 246 + }, + { + "epoch": 0.7670807453416149, + "grad_norm": 0.3727821108079694, + "learning_rate": 4.136939010356732e-05, + "loss": 0.4418, + "step": 247 + }, + { + "epoch": 0.7701863354037267, + "grad_norm": 0.3252006506678716, + "learning_rate": 4.131185270425777e-05, + "loss": 0.4158, + "step": 248 + }, + { + "epoch": 0.7732919254658385, + "grad_norm": 0.6538129311302192, + "learning_rate": 4.1254315304948216e-05, + "loss": 0.457, + "step": 249 + }, + { + "epoch": 0.7763975155279503, + "grad_norm": 0.33906627374077886, + "learning_rate": 4.1196777905638664e-05, + "loss": 0.4318, + "step": 250 + }, + { + "epoch": 0.7795031055900621, + "grad_norm": 0.356301991033165, + "learning_rate": 4.113924050632912e-05, + "loss": 0.4236, + "step": 251 + }, + { + "epoch": 0.782608695652174, + "grad_norm": 0.32783848540999616, + "learning_rate": 4.108170310701956e-05, + "loss": 0.4448, + "step": 252 + }, + { + "epoch": 0.7857142857142857, + "grad_norm": 0.33633346589846297, + "learning_rate": 4.102416570771002e-05, + "loss": 0.4084, + "step": 253 + }, + { + "epoch": 0.7888198757763976, + "grad_norm": 0.34262159693990346, + "learning_rate": 4.096662830840046e-05, + "loss": 0.4343, + "step": 254 + }, + { + "epoch": 0.7919254658385093, + "grad_norm": 0.4238089460532713, + "learning_rate": 4.0909090909090915e-05, + "loss": 0.4197, + "step": 255 + }, + { + "epoch": 0.7950310559006211, + "grad_norm": 0.34636542219919175, + "learning_rate": 4.085155350978136e-05, + "loss": 0.4216, + "step": 256 + }, + { + "epoch": 0.7981366459627329, + "grad_norm": 0.5147966796611364, + "learning_rate": 4.079401611047181e-05, + "loss": 0.4335, + "step": 257 + }, + { + "epoch": 0.8012422360248447, + "grad_norm": 0.3784633526026262, + "learning_rate": 4.0736478711162254e-05, + "loss": 0.4388, + "step": 258 + }, + { + "epoch": 0.8043478260869565, + "grad_norm": 0.5353215946365089, + "learning_rate": 
4.067894131185271e-05, + "loss": 0.4317, + "step": 259 + }, + { + "epoch": 0.8074534161490683, + "grad_norm": 0.43051530301687313, + "learning_rate": 4.062140391254315e-05, + "loss": 0.4321, + "step": 260 + }, + { + "epoch": 0.8105590062111802, + "grad_norm": 0.42796425153438244, + "learning_rate": 4.056386651323361e-05, + "loss": 0.4088, + "step": 261 + }, + { + "epoch": 0.8136645962732919, + "grad_norm": 0.5934293854830046, + "learning_rate": 4.050632911392405e-05, + "loss": 0.4158, + "step": 262 + }, + { + "epoch": 0.8167701863354038, + "grad_norm": 0.4393755394280156, + "learning_rate": 4.0448791714614505e-05, + "loss": 0.4469, + "step": 263 + }, + { + "epoch": 0.8198757763975155, + "grad_norm": 0.4605347528048276, + "learning_rate": 4.0391254315304947e-05, + "loss": 0.4346, + "step": 264 + }, + { + "epoch": 0.8229813664596274, + "grad_norm": 0.35259936117009355, + "learning_rate": 4.03337169159954e-05, + "loss": 0.4168, + "step": 265 + }, + { + "epoch": 0.8260869565217391, + "grad_norm": 0.4804542899872928, + "learning_rate": 4.0276179516685844e-05, + "loss": 0.4302, + "step": 266 + }, + { + "epoch": 0.8291925465838509, + "grad_norm": 0.49703176852970304, + "learning_rate": 4.02186421173763e-05, + "loss": 0.4376, + "step": 267 + }, + { + "epoch": 0.8322981366459627, + "grad_norm": 0.32332376265052126, + "learning_rate": 4.016110471806674e-05, + "loss": 0.4151, + "step": 268 + }, + { + "epoch": 0.8354037267080745, + "grad_norm": 0.3837962855801273, + "learning_rate": 4.01035673187572e-05, + "loss": 0.4165, + "step": 269 + }, + { + "epoch": 0.8385093167701864, + "grad_norm": 0.3057885184710408, + "learning_rate": 4.004602991944764e-05, + "loss": 0.4193, + "step": 270 + }, + { + "epoch": 0.8416149068322981, + "grad_norm": 0.33815716235605003, + "learning_rate": 3.9988492520138094e-05, + "loss": 0.4122, + "step": 271 + }, + { + "epoch": 0.84472049689441, + "grad_norm": 0.35543954456463683, + "learning_rate": 3.9930955120828536e-05, + "loss": 0.4312, + "step": 272 + }, + { + "epoch": 0.8478260869565217, + "grad_norm": 0.4061479720971117, + "learning_rate": 3.987341772151899e-05, + "loss": 0.4326, + "step": 273 + }, + { + "epoch": 0.8509316770186336, + "grad_norm": 0.3293967556583535, + "learning_rate": 3.9815880322209434e-05, + "loss": 0.4162, + "step": 274 + }, + { + "epoch": 0.8540372670807453, + "grad_norm": 0.32127496899850444, + "learning_rate": 3.975834292289989e-05, + "loss": 0.4064, + "step": 275 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.3106744319229529, + "learning_rate": 3.970080552359033e-05, + "loss": 0.4219, + "step": 276 + }, + { + "epoch": 0.860248447204969, + "grad_norm": 0.2851226156515557, + "learning_rate": 3.964326812428079e-05, + "loss": 0.4357, + "step": 277 + }, + { + "epoch": 0.8633540372670807, + "grad_norm": 0.3367137774364346, + "learning_rate": 3.958573072497123e-05, + "loss": 0.4221, + "step": 278 + }, + { + "epoch": 0.8664596273291926, + "grad_norm": 0.274716671666842, + "learning_rate": 3.9528193325661684e-05, + "loss": 0.4286, + "step": 279 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.32476182770932666, + "learning_rate": 3.9470655926352126e-05, + "loss": 0.4156, + "step": 280 + }, + { + "epoch": 0.8726708074534162, + "grad_norm": 0.34109454423469643, + "learning_rate": 3.941311852704258e-05, + "loss": 0.4133, + "step": 281 + }, + { + "epoch": 0.8757763975155279, + "grad_norm": 0.35511307476273746, + "learning_rate": 3.9355581127733024e-05, + "loss": 0.4317, + "step": 282 + }, + { + "epoch": 0.8788819875776398, + "grad_norm": 
0.3270722625275185, + "learning_rate": 3.929804372842348e-05, + "loss": 0.4182, + "step": 283 + }, + { + "epoch": 0.8819875776397516, + "grad_norm": 0.30707956127514435, + "learning_rate": 3.924050632911392e-05, + "loss": 0.4128, + "step": 284 + }, + { + "epoch": 0.8850931677018633, + "grad_norm": 0.352987960191196, + "learning_rate": 3.9182968929804377e-05, + "loss": 0.4202, + "step": 285 + }, + { + "epoch": 0.8881987577639752, + "grad_norm": 0.3209556725057783, + "learning_rate": 3.912543153049482e-05, + "loss": 0.4531, + "step": 286 + }, + { + "epoch": 0.8913043478260869, + "grad_norm": 0.3424777350197383, + "learning_rate": 3.9067894131185274e-05, + "loss": 0.4261, + "step": 287 + }, + { + "epoch": 0.8944099378881988, + "grad_norm": 0.36115235473805046, + "learning_rate": 3.9010356731875716e-05, + "loss": 0.4208, + "step": 288 + }, + { + "epoch": 0.8975155279503105, + "grad_norm": 0.3345731728145184, + "learning_rate": 3.895281933256617e-05, + "loss": 0.4243, + "step": 289 + }, + { + "epoch": 0.9006211180124224, + "grad_norm": 0.3479109694931497, + "learning_rate": 3.8895281933256613e-05, + "loss": 0.408, + "step": 290 + }, + { + "epoch": 0.9037267080745341, + "grad_norm": 0.35901431270989403, + "learning_rate": 3.883774453394707e-05, + "loss": 0.4275, + "step": 291 + }, + { + "epoch": 0.906832298136646, + "grad_norm": 0.33289357045170126, + "learning_rate": 3.878020713463751e-05, + "loss": 0.4078, + "step": 292 + }, + { + "epoch": 0.9099378881987578, + "grad_norm": 0.33168510073705165, + "learning_rate": 3.8722669735327966e-05, + "loss": 0.4218, + "step": 293 + }, + { + "epoch": 0.9130434782608695, + "grad_norm": 0.2975318289744658, + "learning_rate": 3.866513233601841e-05, + "loss": 0.4311, + "step": 294 + }, + { + "epoch": 0.9161490683229814, + "grad_norm": 0.31426977572692477, + "learning_rate": 3.8607594936708864e-05, + "loss": 0.4297, + "step": 295 + }, + { + "epoch": 0.9192546583850931, + "grad_norm": 0.3070483941031755, + "learning_rate": 3.8550057537399306e-05, + "loss": 0.4192, + "step": 296 + }, + { + "epoch": 0.922360248447205, + "grad_norm": 0.2810848054459513, + "learning_rate": 3.849252013808976e-05, + "loss": 0.427, + "step": 297 + }, + { + "epoch": 0.9254658385093167, + "grad_norm": 0.2991841633857078, + "learning_rate": 3.84349827387802e-05, + "loss": 0.4052, + "step": 298 + }, + { + "epoch": 0.9285714285714286, + "grad_norm": 0.33847151615147736, + "learning_rate": 3.837744533947066e-05, + "loss": 0.419, + "step": 299 + }, + { + "epoch": 0.9316770186335404, + "grad_norm": 0.29017927632864937, + "learning_rate": 3.83199079401611e-05, + "loss": 0.4235, + "step": 300 + }, + { + "epoch": 0.9347826086956522, + "grad_norm": 0.32565509697744177, + "learning_rate": 3.8262370540851556e-05, + "loss": 0.4218, + "step": 301 + }, + { + "epoch": 0.937888198757764, + "grad_norm": 0.31402325607805354, + "learning_rate": 3.8204833141542005e-05, + "loss": 0.4374, + "step": 302 + }, + { + "epoch": 0.9409937888198758, + "grad_norm": 0.3147076556719568, + "learning_rate": 3.8147295742232454e-05, + "loss": 0.4155, + "step": 303 + }, + { + "epoch": 0.9440993788819876, + "grad_norm": 0.29699738407713266, + "learning_rate": 3.80897583429229e-05, + "loss": 0.4111, + "step": 304 + }, + { + "epoch": 0.9472049689440993, + "grad_norm": 0.2888210602850056, + "learning_rate": 3.803222094361335e-05, + "loss": 0.4221, + "step": 305 + }, + { + "epoch": 0.9503105590062112, + "grad_norm": 0.2939573629666098, + "learning_rate": 3.79746835443038e-05, + "loss": 0.4032, + "step": 306 + }, + { + "epoch": 
0.953416149068323, + "grad_norm": 0.2962446654764285, + "learning_rate": 3.791714614499425e-05, + "loss": 0.4214, + "step": 307 + }, + { + "epoch": 0.9565217391304348, + "grad_norm": 0.271891153920885, + "learning_rate": 3.78596087456847e-05, + "loss": 0.4198, + "step": 308 + }, + { + "epoch": 0.9596273291925466, + "grad_norm": 0.32256951843172593, + "learning_rate": 3.7802071346375146e-05, + "loss": 0.4216, + "step": 309 + }, + { + "epoch": 0.9627329192546584, + "grad_norm": 0.33232339921643056, + "learning_rate": 3.7744533947065595e-05, + "loss": 0.4177, + "step": 310 + }, + { + "epoch": 0.9658385093167702, + "grad_norm": 0.35814851356254335, + "learning_rate": 3.7686996547756043e-05, + "loss": 0.425, + "step": 311 + }, + { + "epoch": 0.968944099378882, + "grad_norm": 0.29938770364659023, + "learning_rate": 3.762945914844649e-05, + "loss": 0.4128, + "step": 312 + }, + { + "epoch": 0.9720496894409938, + "grad_norm": 0.38739922253123726, + "learning_rate": 3.757192174913694e-05, + "loss": 0.4113, + "step": 313 + }, + { + "epoch": 0.9751552795031055, + "grad_norm": 0.31386603107673766, + "learning_rate": 3.751438434982739e-05, + "loss": 0.4104, + "step": 314 + }, + { + "epoch": 0.9782608695652174, + "grad_norm": 0.34687136495142834, + "learning_rate": 3.745684695051784e-05, + "loss": 0.4307, + "step": 315 + }, + { + "epoch": 0.9813664596273292, + "grad_norm": 0.3492017123521989, + "learning_rate": 3.739930955120829e-05, + "loss": 0.4077, + "step": 316 + }, + { + "epoch": 0.984472049689441, + "grad_norm": 0.29396206255406326, + "learning_rate": 3.7341772151898736e-05, + "loss": 0.4067, + "step": 317 + }, + { + "epoch": 0.9875776397515528, + "grad_norm": 0.31882677984452723, + "learning_rate": 3.7284234752589185e-05, + "loss": 0.4207, + "step": 318 + }, + { + "epoch": 0.9906832298136646, + "grad_norm": 0.37165416285954644, + "learning_rate": 3.722669735327963e-05, + "loss": 0.4339, + "step": 319 + }, + { + "epoch": 0.9937888198757764, + "grad_norm": 0.3190088839703568, + "learning_rate": 3.716915995397008e-05, + "loss": 0.4079, + "step": 320 + }, + { + "epoch": 0.9968944099378882, + "grad_norm": 0.3115319771959773, + "learning_rate": 3.711162255466053e-05, + "loss": 0.4322, + "step": 321 + }, + { + "epoch": 1.0, + "grad_norm": 0.3044086608586031, + "learning_rate": 3.705408515535098e-05, + "loss": 0.4097, + "step": 322 + }, + { + "epoch": 1.0031055900621118, + "grad_norm": 0.33417590278362963, + "learning_rate": 3.699654775604143e-05, + "loss": 0.3323, + "step": 323 + }, + { + "epoch": 1.0062111801242235, + "grad_norm": 0.341573477224664, + "learning_rate": 3.693901035673188e-05, + "loss": 0.3571, + "step": 324 + }, + { + "epoch": 1.0093167701863355, + "grad_norm": 0.27258326161115387, + "learning_rate": 3.6881472957422326e-05, + "loss": 0.3404, + "step": 325 + }, + { + "epoch": 1.0124223602484472, + "grad_norm": 0.33991178542501627, + "learning_rate": 3.6823935558112774e-05, + "loss": 0.3493, + "step": 326 + }, + { + "epoch": 1.015527950310559, + "grad_norm": 0.3446263251981706, + "learning_rate": 3.676639815880322e-05, + "loss": 0.3473, + "step": 327 + }, + { + "epoch": 1.0186335403726707, + "grad_norm": 0.33801547973317314, + "learning_rate": 3.670886075949367e-05, + "loss": 0.3697, + "step": 328 + }, + { + "epoch": 1.0217391304347827, + "grad_norm": 0.35908354782023477, + "learning_rate": 3.665132336018412e-05, + "loss": 0.3476, + "step": 329 + }, + { + "epoch": 1.0248447204968945, + "grad_norm": 0.3234656105570385, + "learning_rate": 3.659378596087457e-05, + "loss": 0.3622, + "step": 330 
+ }, + { + "epoch": 1.0279503105590062, + "grad_norm": 0.35587249506855595, + "learning_rate": 3.653624856156502e-05, + "loss": 0.3555, + "step": 331 + }, + { + "epoch": 1.031055900621118, + "grad_norm": 0.31905169592308186, + "learning_rate": 3.647871116225547e-05, + "loss": 0.3461, + "step": 332 + }, + { + "epoch": 1.0341614906832297, + "grad_norm": 0.36840310397083925, + "learning_rate": 3.6421173762945915e-05, + "loss": 0.3429, + "step": 333 + }, + { + "epoch": 1.0372670807453417, + "grad_norm": 0.3651205860513462, + "learning_rate": 3.6363636363636364e-05, + "loss": 0.3435, + "step": 334 + }, + { + "epoch": 1.0403726708074534, + "grad_norm": 0.31066005439052724, + "learning_rate": 3.630609896432681e-05, + "loss": 0.3272, + "step": 335 + }, + { + "epoch": 1.0434782608695652, + "grad_norm": 0.3759419584351618, + "learning_rate": 3.624856156501726e-05, + "loss": 0.3395, + "step": 336 + }, + { + "epoch": 1.046583850931677, + "grad_norm": 0.3021549547887614, + "learning_rate": 3.619102416570771e-05, + "loss": 0.3417, + "step": 337 + }, + { + "epoch": 1.049689440993789, + "grad_norm": 0.3205703918762732, + "learning_rate": 3.613348676639816e-05, + "loss": 0.3433, + "step": 338 + }, + { + "epoch": 1.0527950310559007, + "grad_norm": 0.4534884210584356, + "learning_rate": 3.607594936708861e-05, + "loss": 0.3594, + "step": 339 + }, + { + "epoch": 1.0559006211180124, + "grad_norm": 0.367415386580333, + "learning_rate": 3.6018411967779057e-05, + "loss": 0.3524, + "step": 340 + }, + { + "epoch": 1.0590062111801242, + "grad_norm": 0.3127875635159284, + "learning_rate": 3.5960874568469505e-05, + "loss": 0.333, + "step": 341 + }, + { + "epoch": 1.062111801242236, + "grad_norm": 0.4511553956189257, + "learning_rate": 3.5903337169159954e-05, + "loss": 0.3454, + "step": 342 + }, + { + "epoch": 1.065217391304348, + "grad_norm": 0.27133796776358254, + "learning_rate": 3.58457997698504e-05, + "loss": 0.3307, + "step": 343 + }, + { + "epoch": 1.0683229813664596, + "grad_norm": 0.37172783607468407, + "learning_rate": 3.578826237054085e-05, + "loss": 0.332, + "step": 344 + }, + { + "epoch": 1.0714285714285714, + "grad_norm": 0.31903478698253923, + "learning_rate": 3.57307249712313e-05, + "loss": 0.3641, + "step": 345 + }, + { + "epoch": 1.0745341614906831, + "grad_norm": 0.3590599821405197, + "learning_rate": 3.567318757192175e-05, + "loss": 0.3368, + "step": 346 + }, + { + "epoch": 1.0776397515527951, + "grad_norm": 0.3228666493670707, + "learning_rate": 3.56156501726122e-05, + "loss": 0.3518, + "step": 347 + }, + { + "epoch": 1.0807453416149069, + "grad_norm": 0.35040485427397144, + "learning_rate": 3.5558112773302646e-05, + "loss": 0.3567, + "step": 348 + }, + { + "epoch": 1.0838509316770186, + "grad_norm": 0.3223473550373259, + "learning_rate": 3.5500575373993095e-05, + "loss": 0.3292, + "step": 349 + }, + { + "epoch": 1.0869565217391304, + "grad_norm": 0.3162329124544906, + "learning_rate": 3.5443037974683544e-05, + "loss": 0.3386, + "step": 350 + }, + { + "epoch": 1.0900621118012421, + "grad_norm": 0.35250805959488396, + "learning_rate": 3.538550057537399e-05, + "loss": 0.3286, + "step": 351 + }, + { + "epoch": 1.093167701863354, + "grad_norm": 0.31027768437301634, + "learning_rate": 3.532796317606444e-05, + "loss": 0.3411, + "step": 352 + }, + { + "epoch": 1.0962732919254659, + "grad_norm": 0.28606898633939265, + "learning_rate": 3.52704257767549e-05, + "loss": 0.3407, + "step": 353 + }, + { + "epoch": 1.0993788819875776, + "grad_norm": 0.3579167421662421, + "learning_rate": 3.521288837744534e-05, + 
"loss": 0.3262, + "step": 354 + }, + { + "epoch": 1.1024844720496894, + "grad_norm": 0.3402295001253341, + "learning_rate": 3.5155350978135794e-05, + "loss": 0.3324, + "step": 355 + }, + { + "epoch": 1.1055900621118013, + "grad_norm": 0.31366685836024, + "learning_rate": 3.5097813578826236e-05, + "loss": 0.3463, + "step": 356 + }, + { + "epoch": 1.108695652173913, + "grad_norm": 0.46838911104977027, + "learning_rate": 3.504027617951669e-05, + "loss": 0.3565, + "step": 357 + }, + { + "epoch": 1.1118012422360248, + "grad_norm": 0.3060846523455061, + "learning_rate": 3.4982738780207134e-05, + "loss": 0.357, + "step": 358 + }, + { + "epoch": 1.1149068322981366, + "grad_norm": 0.4392245103993425, + "learning_rate": 3.492520138089759e-05, + "loss": 0.3568, + "step": 359 + }, + { + "epoch": 1.1180124223602483, + "grad_norm": 0.3916417909387617, + "learning_rate": 3.486766398158803e-05, + "loss": 0.3446, + "step": 360 + }, + { + "epoch": 1.1211180124223603, + "grad_norm": 0.3501561418628378, + "learning_rate": 3.4810126582278487e-05, + "loss": 0.3282, + "step": 361 + }, + { + "epoch": 1.124223602484472, + "grad_norm": 0.37454862360065444, + "learning_rate": 3.475258918296893e-05, + "loss": 0.3543, + "step": 362 + }, + { + "epoch": 1.1273291925465838, + "grad_norm": 0.2884683302507566, + "learning_rate": 3.4695051783659384e-05, + "loss": 0.3337, + "step": 363 + }, + { + "epoch": 1.1304347826086956, + "grad_norm": 0.3254717305148171, + "learning_rate": 3.4637514384349826e-05, + "loss": 0.3271, + "step": 364 + }, + { + "epoch": 1.1335403726708075, + "grad_norm": 0.3256237761211695, + "learning_rate": 3.457997698504028e-05, + "loss": 0.3298, + "step": 365 + }, + { + "epoch": 1.1366459627329193, + "grad_norm": 0.30981574585542065, + "learning_rate": 3.4522439585730723e-05, + "loss": 0.3685, + "step": 366 + }, + { + "epoch": 1.139751552795031, + "grad_norm": 0.29936602875383006, + "learning_rate": 3.446490218642118e-05, + "loss": 0.3524, + "step": 367 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.2961907533597477, + "learning_rate": 3.440736478711162e-05, + "loss": 0.3414, + "step": 368 + }, + { + "epoch": 1.1459627329192545, + "grad_norm": 0.2898757967419472, + "learning_rate": 3.4349827387802076e-05, + "loss": 0.3275, + "step": 369 + }, + { + "epoch": 1.1490683229813665, + "grad_norm": 0.35918811245436444, + "learning_rate": 3.429228998849252e-05, + "loss": 0.3502, + "step": 370 + }, + { + "epoch": 1.1521739130434783, + "grad_norm": 0.2775107307381104, + "learning_rate": 3.4234752589182974e-05, + "loss": 0.3409, + "step": 371 + }, + { + "epoch": 1.15527950310559, + "grad_norm": 0.2986400287100927, + "learning_rate": 3.4177215189873416e-05, + "loss": 0.3312, + "step": 372 + }, + { + "epoch": 1.1583850931677018, + "grad_norm": 0.33238801993955036, + "learning_rate": 3.411967779056387e-05, + "loss": 0.3443, + "step": 373 + }, + { + "epoch": 1.1614906832298137, + "grad_norm": 0.2893594359102009, + "learning_rate": 3.406214039125431e-05, + "loss": 0.3332, + "step": 374 + }, + { + "epoch": 1.1645962732919255, + "grad_norm": 0.32293840276637376, + "learning_rate": 3.400460299194477e-05, + "loss": 0.3354, + "step": 375 + }, + { + "epoch": 1.1677018633540373, + "grad_norm": 0.27306219223391365, + "learning_rate": 3.394706559263521e-05, + "loss": 0.3209, + "step": 376 + }, + { + "epoch": 1.170807453416149, + "grad_norm": 0.3342500084639322, + "learning_rate": 3.3889528193325666e-05, + "loss": 0.3729, + "step": 377 + }, + { + "epoch": 1.1739130434782608, + "grad_norm": 0.2661392532196279, + 
"learning_rate": 3.383199079401611e-05, + "loss": 0.3383, + "step": 378 + }, + { + "epoch": 1.1770186335403727, + "grad_norm": 0.3386471665658259, + "learning_rate": 3.3774453394706564e-05, + "loss": 0.318, + "step": 379 + }, + { + "epoch": 1.1801242236024845, + "grad_norm": 0.3155587203894488, + "learning_rate": 3.3716915995397006e-05, + "loss": 0.3321, + "step": 380 + }, + { + "epoch": 1.1832298136645962, + "grad_norm": 0.3451778286777197, + "learning_rate": 3.365937859608746e-05, + "loss": 0.361, + "step": 381 + }, + { + "epoch": 1.186335403726708, + "grad_norm": 0.3227976748273063, + "learning_rate": 3.36018411967779e-05, + "loss": 0.3349, + "step": 382 + }, + { + "epoch": 1.18944099378882, + "grad_norm": 0.320511150129644, + "learning_rate": 3.354430379746836e-05, + "loss": 0.3449, + "step": 383 + }, + { + "epoch": 1.1925465838509317, + "grad_norm": 0.31955908520280063, + "learning_rate": 3.34867663981588e-05, + "loss": 0.3351, + "step": 384 + }, + { + "epoch": 1.1956521739130435, + "grad_norm": 0.30633810764776365, + "learning_rate": 3.3429228998849256e-05, + "loss": 0.3275, + "step": 385 + }, + { + "epoch": 1.1987577639751552, + "grad_norm": 0.41299034529321954, + "learning_rate": 3.33716915995397e-05, + "loss": 0.3309, + "step": 386 + }, + { + "epoch": 1.201863354037267, + "grad_norm": 0.2750482509074482, + "learning_rate": 3.3314154200230153e-05, + "loss": 0.3398, + "step": 387 + }, + { + "epoch": 1.204968944099379, + "grad_norm": 0.3081268249974453, + "learning_rate": 3.3256616800920595e-05, + "loss": 0.3322, + "step": 388 + }, + { + "epoch": 1.2080745341614907, + "grad_norm": 0.3520674198029431, + "learning_rate": 3.319907940161105e-05, + "loss": 0.3663, + "step": 389 + }, + { + "epoch": 1.2111801242236024, + "grad_norm": 0.32565232106148584, + "learning_rate": 3.314154200230149e-05, + "loss": 0.343, + "step": 390 + }, + { + "epoch": 1.2142857142857142, + "grad_norm": 0.2938812397405531, + "learning_rate": 3.308400460299195e-05, + "loss": 0.3378, + "step": 391 + }, + { + "epoch": 1.2173913043478262, + "grad_norm": 0.3141073779827861, + "learning_rate": 3.302646720368239e-05, + "loss": 0.3335, + "step": 392 + }, + { + "epoch": 1.220496894409938, + "grad_norm": 0.3418673255721663, + "learning_rate": 3.2968929804372846e-05, + "loss": 0.36, + "step": 393 + }, + { + "epoch": 1.2236024844720497, + "grad_norm": 0.24297614998734132, + "learning_rate": 3.291139240506329e-05, + "loss": 0.3387, + "step": 394 + }, + { + "epoch": 1.2267080745341614, + "grad_norm": 0.3267179467149504, + "learning_rate": 3.285385500575374e-05, + "loss": 0.3488, + "step": 395 + }, + { + "epoch": 1.2298136645962732, + "grad_norm": 0.3057560458812451, + "learning_rate": 3.2796317606444185e-05, + "loss": 0.3268, + "step": 396 + }, + { + "epoch": 1.2329192546583851, + "grad_norm": 0.3134897896860434, + "learning_rate": 3.273878020713464e-05, + "loss": 0.3459, + "step": 397 + }, + { + "epoch": 1.236024844720497, + "grad_norm": 0.3047314985401556, + "learning_rate": 3.268124280782508e-05, + "loss": 0.3291, + "step": 398 + }, + { + "epoch": 1.2391304347826086, + "grad_norm": 0.31348581848675783, + "learning_rate": 3.262370540851554e-05, + "loss": 0.3446, + "step": 399 + }, + { + "epoch": 1.2422360248447206, + "grad_norm": 0.3482328869260001, + "learning_rate": 3.256616800920598e-05, + "loss": 0.3561, + "step": 400 + }, + { + "epoch": 1.2453416149068324, + "grad_norm": 0.31183834841742225, + "learning_rate": 3.2508630609896436e-05, + "loss": 0.3547, + "step": 401 + }, + { + "epoch": 1.2484472049689441, + "grad_norm": 
0.3061676085086065, + "learning_rate": 3.245109321058688e-05, + "loss": 0.3595, + "step": 402 + }, + { + "epoch": 1.2515527950310559, + "grad_norm": 0.32549148328343397, + "learning_rate": 3.239355581127733e-05, + "loss": 0.3342, + "step": 403 + }, + { + "epoch": 1.2546583850931676, + "grad_norm": 0.30445969084522895, + "learning_rate": 3.233601841196778e-05, + "loss": 0.3242, + "step": 404 + }, + { + "epoch": 1.2577639751552794, + "grad_norm": 0.2742819629805248, + "learning_rate": 3.227848101265823e-05, + "loss": 0.3522, + "step": 405 + }, + { + "epoch": 1.2608695652173914, + "grad_norm": 0.32581875150876105, + "learning_rate": 3.222094361334868e-05, + "loss": 0.3429, + "step": 406 + }, + { + "epoch": 1.263975155279503, + "grad_norm": 0.2902255052156193, + "learning_rate": 3.216340621403913e-05, + "loss": 0.3369, + "step": 407 + }, + { + "epoch": 1.2670807453416149, + "grad_norm": 0.284761382807809, + "learning_rate": 3.210586881472958e-05, + "loss": 0.36, + "step": 408 + }, + { + "epoch": 1.2701863354037268, + "grad_norm": 0.3025552167032939, + "learning_rate": 3.2048331415420025e-05, + "loss": 0.3445, + "step": 409 + }, + { + "epoch": 1.2732919254658386, + "grad_norm": 0.3305696776607858, + "learning_rate": 3.1990794016110474e-05, + "loss": 0.3463, + "step": 410 + }, + { + "epoch": 1.2763975155279503, + "grad_norm": 0.3077574972549534, + "learning_rate": 3.193325661680092e-05, + "loss": 0.3594, + "step": 411 + }, + { + "epoch": 1.279503105590062, + "grad_norm": 0.27442755120830326, + "learning_rate": 3.187571921749137e-05, + "loss": 0.3362, + "step": 412 + }, + { + "epoch": 1.2826086956521738, + "grad_norm": 0.3038026451556641, + "learning_rate": 3.181818181818182e-05, + "loss": 0.3353, + "step": 413 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 0.2758156658151106, + "learning_rate": 3.176064441887227e-05, + "loss": 0.337, + "step": 414 + }, + { + "epoch": 1.2888198757763976, + "grad_norm": 0.26613400787975794, + "learning_rate": 3.170310701956272e-05, + "loss": 0.3347, + "step": 415 + }, + { + "epoch": 1.2919254658385093, + "grad_norm": 0.30006243856469433, + "learning_rate": 3.1645569620253167e-05, + "loss": 0.3575, + "step": 416 + }, + { + "epoch": 1.295031055900621, + "grad_norm": 0.32225619437705794, + "learning_rate": 3.1588032220943615e-05, + "loss": 0.3404, + "step": 417 + }, + { + "epoch": 1.298136645962733, + "grad_norm": 0.2933513705620206, + "learning_rate": 3.1530494821634064e-05, + "loss": 0.3367, + "step": 418 + }, + { + "epoch": 1.3012422360248448, + "grad_norm": 0.34221232972865906, + "learning_rate": 3.147295742232451e-05, + "loss": 0.3507, + "step": 419 + }, + { + "epoch": 1.3043478260869565, + "grad_norm": 0.3207028944029123, + "learning_rate": 3.141542002301496e-05, + "loss": 0.339, + "step": 420 + }, + { + "epoch": 1.3074534161490683, + "grad_norm": 0.28691874649916205, + "learning_rate": 3.135788262370541e-05, + "loss": 0.3158, + "step": 421 + }, + { + "epoch": 1.31055900621118, + "grad_norm": 0.32572558244440175, + "learning_rate": 3.130034522439586e-05, + "loss": 0.3425, + "step": 422 + }, + { + "epoch": 1.3136645962732918, + "grad_norm": 0.2840181269830042, + "learning_rate": 3.124280782508631e-05, + "loss": 0.3446, + "step": 423 + }, + { + "epoch": 1.3167701863354038, + "grad_norm": 0.314090935226993, + "learning_rate": 3.1185270425776756e-05, + "loss": 0.3315, + "step": 424 + }, + { + "epoch": 1.3198757763975155, + "grad_norm": 0.31197822717588264, + "learning_rate": 3.1127733026467205e-05, + "loss": 0.3443, + "step": 425 + }, + { + "epoch": 
1.3229813664596273, + "grad_norm": 0.2864210002126174, + "learning_rate": 3.1070195627157654e-05, + "loss": 0.3375, + "step": 426 + }, + { + "epoch": 1.3260869565217392, + "grad_norm": 0.25519688185589984, + "learning_rate": 3.10126582278481e-05, + "loss": 0.3384, + "step": 427 + }, + { + "epoch": 1.329192546583851, + "grad_norm": 0.2744740349540228, + "learning_rate": 3.095512082853855e-05, + "loss": 0.3383, + "step": 428 + }, + { + "epoch": 1.3322981366459627, + "grad_norm": 0.2607087924929348, + "learning_rate": 3.0897583429229e-05, + "loss": 0.3555, + "step": 429 + }, + { + "epoch": 1.3354037267080745, + "grad_norm": 0.27684287170228183, + "learning_rate": 3.084004602991945e-05, + "loss": 0.3261, + "step": 430 + }, + { + "epoch": 1.3385093167701863, + "grad_norm": 0.3095550998483706, + "learning_rate": 3.07825086306099e-05, + "loss": 0.3512, + "step": 431 + }, + { + "epoch": 1.341614906832298, + "grad_norm": 0.25842001969735057, + "learning_rate": 3.0724971231300346e-05, + "loss": 0.3296, + "step": 432 + }, + { + "epoch": 1.34472049689441, + "grad_norm": 0.30589083199518, + "learning_rate": 3.0667433831990795e-05, + "loss": 0.3329, + "step": 433 + }, + { + "epoch": 1.3478260869565217, + "grad_norm": 0.2825831249071207, + "learning_rate": 3.0609896432681244e-05, + "loss": 0.3403, + "step": 434 + }, + { + "epoch": 1.3509316770186335, + "grad_norm": 0.2849649494187899, + "learning_rate": 3.055235903337169e-05, + "loss": 0.3329, + "step": 435 + }, + { + "epoch": 1.3540372670807455, + "grad_norm": 0.31227992790240827, + "learning_rate": 3.0494821634062144e-05, + "loss": 0.3402, + "step": 436 + }, + { + "epoch": 1.3571428571428572, + "grad_norm": 0.28830226140066545, + "learning_rate": 3.043728423475259e-05, + "loss": 0.3343, + "step": 437 + }, + { + "epoch": 1.360248447204969, + "grad_norm": 0.2920144019191934, + "learning_rate": 3.0379746835443042e-05, + "loss": 0.3262, + "step": 438 + }, + { + "epoch": 1.3633540372670807, + "grad_norm": 0.25016168615415485, + "learning_rate": 3.0322209436133487e-05, + "loss": 0.3394, + "step": 439 + }, + { + "epoch": 1.3664596273291925, + "grad_norm": 0.30261081735444717, + "learning_rate": 3.026467203682394e-05, + "loss": 0.3462, + "step": 440 + }, + { + "epoch": 1.3695652173913042, + "grad_norm": 0.2881616381341832, + "learning_rate": 3.0207134637514385e-05, + "loss": 0.3318, + "step": 441 + }, + { + "epoch": 1.3726708074534162, + "grad_norm": 0.2880320213397424, + "learning_rate": 3.0149597238204837e-05, + "loss": 0.3467, + "step": 442 + }, + { + "epoch": 1.375776397515528, + "grad_norm": 0.27020350890941985, + "learning_rate": 3.0092059838895282e-05, + "loss": 0.332, + "step": 443 + }, + { + "epoch": 1.3788819875776397, + "grad_norm": 0.3103789819064371, + "learning_rate": 3.0034522439585734e-05, + "loss": 0.3557, + "step": 444 + }, + { + "epoch": 1.3819875776397517, + "grad_norm": 0.2689229148068124, + "learning_rate": 2.997698504027618e-05, + "loss": 0.3439, + "step": 445 + }, + { + "epoch": 1.3850931677018634, + "grad_norm": 0.3284067395525679, + "learning_rate": 2.991944764096663e-05, + "loss": 0.3361, + "step": 446 + }, + { + "epoch": 1.3881987577639752, + "grad_norm": 0.31384093062312546, + "learning_rate": 2.9861910241657077e-05, + "loss": 0.3397, + "step": 447 + }, + { + "epoch": 1.391304347826087, + "grad_norm": 0.2699369577142723, + "learning_rate": 2.980437284234753e-05, + "loss": 0.3586, + "step": 448 + }, + { + "epoch": 1.3944099378881987, + "grad_norm": 0.3036288617772187, + "learning_rate": 2.9746835443037974e-05, + "loss": 0.3565, + 
"step": 449 + }, + { + "epoch": 1.3975155279503104, + "grad_norm": 0.3124807578288405, + "learning_rate": 2.9689298043728427e-05, + "loss": 0.3419, + "step": 450 + }, + { + "epoch": 1.4006211180124224, + "grad_norm": 0.2979650176094835, + "learning_rate": 2.9631760644418872e-05, + "loss": 0.348, + "step": 451 + }, + { + "epoch": 1.4037267080745341, + "grad_norm": 0.2681659506306783, + "learning_rate": 2.9574223245109324e-05, + "loss": 0.3299, + "step": 452 + }, + { + "epoch": 1.406832298136646, + "grad_norm": 0.32598633427460977, + "learning_rate": 2.951668584579977e-05, + "loss": 0.3386, + "step": 453 + }, + { + "epoch": 1.4099378881987579, + "grad_norm": 0.3247280401348239, + "learning_rate": 2.945914844649022e-05, + "loss": 0.3616, + "step": 454 + }, + { + "epoch": 1.4130434782608696, + "grad_norm": 0.2674177394891557, + "learning_rate": 2.940161104718067e-05, + "loss": 0.3477, + "step": 455 + }, + { + "epoch": 1.4161490683229814, + "grad_norm": 0.31284120066769544, + "learning_rate": 2.934407364787112e-05, + "loss": 0.3359, + "step": 456 + }, + { + "epoch": 1.4192546583850931, + "grad_norm": 0.29650681379778476, + "learning_rate": 2.9286536248561568e-05, + "loss": 0.3534, + "step": 457 + }, + { + "epoch": 1.4223602484472049, + "grad_norm": 0.28958329200728805, + "learning_rate": 2.9228998849252016e-05, + "loss": 0.3414, + "step": 458 + }, + { + "epoch": 1.4254658385093169, + "grad_norm": 0.3164618567976454, + "learning_rate": 2.9171461449942465e-05, + "loss": 0.351, + "step": 459 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.32604862143805774, + "learning_rate": 2.9113924050632914e-05, + "loss": 0.3185, + "step": 460 + }, + { + "epoch": 1.4316770186335404, + "grad_norm": 0.2371091150488046, + "learning_rate": 2.9056386651323363e-05, + "loss": 0.3268, + "step": 461 + }, + { + "epoch": 1.434782608695652, + "grad_norm": 0.28836250577098943, + "learning_rate": 2.899884925201381e-05, + "loss": 0.3203, + "step": 462 + }, + { + "epoch": 1.437888198757764, + "grad_norm": 0.29935943849859553, + "learning_rate": 2.894131185270426e-05, + "loss": 0.3419, + "step": 463 + }, + { + "epoch": 1.4409937888198758, + "grad_norm": 0.2678768364941078, + "learning_rate": 2.888377445339471e-05, + "loss": 0.3423, + "step": 464 + }, + { + "epoch": 1.4440993788819876, + "grad_norm": 0.3004413989905001, + "learning_rate": 2.8826237054085157e-05, + "loss": 0.3448, + "step": 465 + }, + { + "epoch": 1.4472049689440993, + "grad_norm": 0.3437138642713499, + "learning_rate": 2.8768699654775606e-05, + "loss": 0.3624, + "step": 466 + }, + { + "epoch": 1.450310559006211, + "grad_norm": 0.27833054674558505, + "learning_rate": 2.8711162255466055e-05, + "loss": 0.3559, + "step": 467 + }, + { + "epoch": 1.453416149068323, + "grad_norm": 0.30426248134832284, + "learning_rate": 2.8653624856156504e-05, + "loss": 0.3409, + "step": 468 + }, + { + "epoch": 1.4565217391304348, + "grad_norm": 0.2884530747421473, + "learning_rate": 2.8596087456846952e-05, + "loss": 0.3543, + "step": 469 + }, + { + "epoch": 1.4596273291925466, + "grad_norm": 0.26674718010863235, + "learning_rate": 2.85385500575374e-05, + "loss": 0.3352, + "step": 470 + }, + { + "epoch": 1.4627329192546583, + "grad_norm": 0.24962947417256104, + "learning_rate": 2.848101265822785e-05, + "loss": 0.3331, + "step": 471 + }, + { + "epoch": 1.4658385093167703, + "grad_norm": 0.24321872392892266, + "learning_rate": 2.84234752589183e-05, + "loss": 0.3346, + "step": 472 + }, + { + "epoch": 1.468944099378882, + "grad_norm": 0.25806201920649635, + "learning_rate": 
2.8365937859608747e-05, + "loss": 0.3433, + "step": 473 + }, + { + "epoch": 1.4720496894409938, + "grad_norm": 0.260107860168702, + "learning_rate": 2.8308400460299196e-05, + "loss": 0.3518, + "step": 474 + }, + { + "epoch": 1.4751552795031055, + "grad_norm": 0.27151545722001336, + "learning_rate": 2.8250863060989645e-05, + "loss": 0.3222, + "step": 475 + }, + { + "epoch": 1.4782608695652173, + "grad_norm": 0.2699064437677885, + "learning_rate": 2.8193325661680093e-05, + "loss": 0.3408, + "step": 476 + }, + { + "epoch": 1.4813664596273293, + "grad_norm": 0.2534825847738341, + "learning_rate": 2.8135788262370542e-05, + "loss": 0.3355, + "step": 477 + }, + { + "epoch": 1.484472049689441, + "grad_norm": 0.2596248018497863, + "learning_rate": 2.807825086306099e-05, + "loss": 0.3317, + "step": 478 + }, + { + "epoch": 1.4875776397515528, + "grad_norm": 0.22547358749920884, + "learning_rate": 2.802071346375144e-05, + "loss": 0.3172, + "step": 479 + }, + { + "epoch": 1.4906832298136645, + "grad_norm": 0.28156958226578077, + "learning_rate": 2.796317606444189e-05, + "loss": 0.3476, + "step": 480 + }, + { + "epoch": 1.4937888198757765, + "grad_norm": 0.26615285376164327, + "learning_rate": 2.7905638665132337e-05, + "loss": 0.3489, + "step": 481 + }, + { + "epoch": 1.4968944099378882, + "grad_norm": 0.246150116031317, + "learning_rate": 2.7848101265822786e-05, + "loss": 0.3464, + "step": 482 + }, + { + "epoch": 1.5, + "grad_norm": 0.25268560596400597, + "learning_rate": 2.7790563866513235e-05, + "loss": 0.3391, + "step": 483 + }, + { + "epoch": 1.5031055900621118, + "grad_norm": 0.2946205590355613, + "learning_rate": 2.7733026467203683e-05, + "loss": 0.3541, + "step": 484 + }, + { + "epoch": 1.5062111801242235, + "grad_norm": 0.2750424223242439, + "learning_rate": 2.7675489067894132e-05, + "loss": 0.3276, + "step": 485 + }, + { + "epoch": 1.5093167701863353, + "grad_norm": 0.28954598608369275, + "learning_rate": 2.761795166858458e-05, + "loss": 0.3554, + "step": 486 + }, + { + "epoch": 1.5124223602484472, + "grad_norm": 0.29461626033953947, + "learning_rate": 2.756041426927503e-05, + "loss": 0.3293, + "step": 487 + }, + { + "epoch": 1.515527950310559, + "grad_norm": 0.2407514728215296, + "learning_rate": 2.7502876869965478e-05, + "loss": 0.3263, + "step": 488 + }, + { + "epoch": 1.518633540372671, + "grad_norm": 0.24475815135162626, + "learning_rate": 2.7445339470655927e-05, + "loss": 0.3423, + "step": 489 + }, + { + "epoch": 1.5217391304347827, + "grad_norm": 0.32665261682040186, + "learning_rate": 2.7387802071346376e-05, + "loss": 0.3333, + "step": 490 + }, + { + "epoch": 1.5248447204968945, + "grad_norm": 0.2552003566164109, + "learning_rate": 2.7330264672036824e-05, + "loss": 0.3466, + "step": 491 + }, + { + "epoch": 1.5279503105590062, + "grad_norm": 0.2871320623730171, + "learning_rate": 2.7272727272727273e-05, + "loss": 0.3447, + "step": 492 + }, + { + "epoch": 1.531055900621118, + "grad_norm": 0.2440639273175817, + "learning_rate": 2.7215189873417722e-05, + "loss": 0.338, + "step": 493 + }, + { + "epoch": 1.5341614906832297, + "grad_norm": 0.24115465120440344, + "learning_rate": 2.715765247410817e-05, + "loss": 0.3245, + "step": 494 + }, + { + "epoch": 1.5372670807453415, + "grad_norm": 0.2781064697786101, + "learning_rate": 2.7100115074798623e-05, + "loss": 0.3637, + "step": 495 + }, + { + "epoch": 1.5403726708074534, + "grad_norm": 0.27237179201858924, + "learning_rate": 2.7042577675489068e-05, + "loss": 0.353, + "step": 496 + }, + { + "epoch": 1.5434782608695652, + "grad_norm": 
0.3018049050362612, + "learning_rate": 2.698504027617952e-05, + "loss": 0.33, + "step": 497 + }, + { + "epoch": 1.5465838509316772, + "grad_norm": 0.319532872255584, + "learning_rate": 2.6927502876869965e-05, + "loss": 0.341, + "step": 498 + }, + { + "epoch": 1.549689440993789, + "grad_norm": 0.23318500669833875, + "learning_rate": 2.6869965477560418e-05, + "loss": 0.3324, + "step": 499 + }, + { + "epoch": 1.5527950310559007, + "grad_norm": 0.3108509837550317, + "learning_rate": 2.6812428078250863e-05, + "loss": 0.3484, + "step": 500 + }, + { + "epoch": 1.5559006211180124, + "grad_norm": 0.27432952612163103, + "learning_rate": 2.6754890678941315e-05, + "loss": 0.3301, + "step": 501 + }, + { + "epoch": 1.5590062111801242, + "grad_norm": 0.2474022932813197, + "learning_rate": 2.669735327963176e-05, + "loss": 0.3335, + "step": 502 + }, + { + "epoch": 1.562111801242236, + "grad_norm": 0.25918516414740417, + "learning_rate": 2.6639815880322212e-05, + "loss": 0.3531, + "step": 503 + }, + { + "epoch": 1.5652173913043477, + "grad_norm": 0.28631487498944946, + "learning_rate": 2.6582278481012658e-05, + "loss": 0.3461, + "step": 504 + }, + { + "epoch": 1.5683229813664596, + "grad_norm": 0.2541239393514543, + "learning_rate": 2.652474108170311e-05, + "loss": 0.3406, + "step": 505 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.26793962508861174, + "learning_rate": 2.646720368239356e-05, + "loss": 0.3333, + "step": 506 + }, + { + "epoch": 1.5745341614906834, + "grad_norm": 0.30945895076026697, + "learning_rate": 2.6409666283084007e-05, + "loss": 0.3429, + "step": 507 + }, + { + "epoch": 1.5776397515527951, + "grad_norm": 0.2625598679112342, + "learning_rate": 2.6352128883774456e-05, + "loss": 0.3475, + "step": 508 + }, + { + "epoch": 1.5807453416149069, + "grad_norm": 0.3340128053196445, + "learning_rate": 2.6294591484464905e-05, + "loss": 0.3548, + "step": 509 + }, + { + "epoch": 1.5838509316770186, + "grad_norm": 0.33090740840028027, + "learning_rate": 2.6237054085155354e-05, + "loss": 0.3482, + "step": 510 + }, + { + "epoch": 1.5869565217391304, + "grad_norm": 0.25904205831808136, + "learning_rate": 2.6179516685845802e-05, + "loss": 0.3322, + "step": 511 + }, + { + "epoch": 1.5900621118012421, + "grad_norm": 0.3458888736647229, + "learning_rate": 2.612197928653625e-05, + "loss": 0.3466, + "step": 512 + }, + { + "epoch": 1.5931677018633539, + "grad_norm": 0.26139554234188184, + "learning_rate": 2.60644418872267e-05, + "loss": 0.3357, + "step": 513 + }, + { + "epoch": 1.5962732919254659, + "grad_norm": 0.24797420076401436, + "learning_rate": 2.600690448791715e-05, + "loss": 0.3263, + "step": 514 + }, + { + "epoch": 1.5993788819875776, + "grad_norm": 0.25127963694679545, + "learning_rate": 2.5949367088607597e-05, + "loss": 0.3541, + "step": 515 + }, + { + "epoch": 1.6024844720496896, + "grad_norm": 0.24350107098267543, + "learning_rate": 2.5891829689298046e-05, + "loss": 0.3332, + "step": 516 + }, + { + "epoch": 1.6055900621118013, + "grad_norm": 0.2597186201230917, + "learning_rate": 2.5834292289988495e-05, + "loss": 0.3357, + "step": 517 + }, + { + "epoch": 1.608695652173913, + "grad_norm": 0.2553977260875351, + "learning_rate": 2.5776754890678943e-05, + "loss": 0.3381, + "step": 518 + }, + { + "epoch": 1.6118012422360248, + "grad_norm": 0.2495485503111441, + "learning_rate": 2.5719217491369392e-05, + "loss": 0.3489, + "step": 519 + }, + { + "epoch": 1.6149068322981366, + "grad_norm": 0.2826237704718821, + "learning_rate": 2.566168009205984e-05, + "loss": 0.3269, + "step": 520 + }, + { + 
"epoch": 1.6180124223602483, + "grad_norm": 0.2907559187980417, + "learning_rate": 2.560414269275029e-05, + "loss": 0.353, + "step": 521 + }, + { + "epoch": 1.62111801242236, + "grad_norm": 0.30078662752184515, + "learning_rate": 2.5546605293440738e-05, + "loss": 0.3344, + "step": 522 + }, + { + "epoch": 1.624223602484472, + "grad_norm": 0.2494274026603714, + "learning_rate": 2.5489067894131187e-05, + "loss": 0.3262, + "step": 523 + }, + { + "epoch": 1.6273291925465838, + "grad_norm": 0.22856587280801138, + "learning_rate": 2.5431530494821636e-05, + "loss": 0.3316, + "step": 524 + }, + { + "epoch": 1.6304347826086958, + "grad_norm": 0.24524446266248454, + "learning_rate": 2.5373993095512084e-05, + "loss": 0.3254, + "step": 525 + }, + { + "epoch": 1.6335403726708075, + "grad_norm": 0.2781145066258604, + "learning_rate": 2.5316455696202533e-05, + "loss": 0.3343, + "step": 526 + }, + { + "epoch": 1.6366459627329193, + "grad_norm": 0.24971582793985952, + "learning_rate": 2.5258918296892982e-05, + "loss": 0.3423, + "step": 527 + }, + { + "epoch": 1.639751552795031, + "grad_norm": 0.2961483358525156, + "learning_rate": 2.520138089758343e-05, + "loss": 0.3554, + "step": 528 + }, + { + "epoch": 1.6428571428571428, + "grad_norm": 0.30349368090110823, + "learning_rate": 2.514384349827388e-05, + "loss": 0.3563, + "step": 529 + }, + { + "epoch": 1.6459627329192545, + "grad_norm": 0.28292757074394537, + "learning_rate": 2.5086306098964328e-05, + "loss": 0.352, + "step": 530 + }, + { + "epoch": 1.6490683229813663, + "grad_norm": 0.25778656185495347, + "learning_rate": 2.5028768699654777e-05, + "loss": 0.3486, + "step": 531 + }, + { + "epoch": 1.6521739130434783, + "grad_norm": 0.32420346337090605, + "learning_rate": 2.4971231300345226e-05, + "loss": 0.3497, + "step": 532 + }, + { + "epoch": 1.65527950310559, + "grad_norm": 0.24803469539845557, + "learning_rate": 2.4913693901035674e-05, + "loss": 0.3325, + "step": 533 + }, + { + "epoch": 1.658385093167702, + "grad_norm": 0.23193714998127715, + "learning_rate": 2.4856156501726123e-05, + "loss": 0.3244, + "step": 534 + }, + { + "epoch": 1.6614906832298137, + "grad_norm": 0.31410082505061837, + "learning_rate": 2.4798619102416572e-05, + "loss": 0.3295, + "step": 535 + }, + { + "epoch": 1.6645962732919255, + "grad_norm": 0.29805963194310403, + "learning_rate": 2.474108170310702e-05, + "loss": 0.3576, + "step": 536 + }, + { + "epoch": 1.6677018633540373, + "grad_norm": 0.2773254453129355, + "learning_rate": 2.468354430379747e-05, + "loss": 0.3382, + "step": 537 + }, + { + "epoch": 1.670807453416149, + "grad_norm": 0.32678020135127306, + "learning_rate": 2.4626006904487918e-05, + "loss": 0.3196, + "step": 538 + }, + { + "epoch": 1.6739130434782608, + "grad_norm": 0.3166277691971712, + "learning_rate": 2.4568469505178367e-05, + "loss": 0.3567, + "step": 539 + }, + { + "epoch": 1.6770186335403725, + "grad_norm": 0.28823972531727493, + "learning_rate": 2.4510932105868815e-05, + "loss": 0.3303, + "step": 540 + }, + { + "epoch": 1.6801242236024845, + "grad_norm": 0.31416636195922193, + "learning_rate": 2.4453394706559264e-05, + "loss": 0.3468, + "step": 541 + }, + { + "epoch": 1.6832298136645962, + "grad_norm": 0.29389175839717274, + "learning_rate": 2.4395857307249713e-05, + "loss": 0.3334, + "step": 542 + }, + { + "epoch": 1.6863354037267082, + "grad_norm": 0.2574868658425901, + "learning_rate": 2.433831990794016e-05, + "loss": 0.3459, + "step": 543 + }, + { + "epoch": 1.68944099378882, + "grad_norm": 0.43013005229440787, + "learning_rate": 2.428078250863061e-05, 
+ "loss": 0.3663, + "step": 544 + }, + { + "epoch": 1.6925465838509317, + "grad_norm": 0.29719384149686173, + "learning_rate": 2.422324510932106e-05, + "loss": 0.3227, + "step": 545 + }, + { + "epoch": 1.6956521739130435, + "grad_norm": 0.2630824870951196, + "learning_rate": 2.4165707710011508e-05, + "loss": 0.3334, + "step": 546 + }, + { + "epoch": 1.6987577639751552, + "grad_norm": 0.262646615576403, + "learning_rate": 2.4108170310701956e-05, + "loss": 0.3289, + "step": 547 + }, + { + "epoch": 1.701863354037267, + "grad_norm": 0.29464184604515603, + "learning_rate": 2.4050632911392405e-05, + "loss": 0.3606, + "step": 548 + }, + { + "epoch": 1.704968944099379, + "grad_norm": 0.270420959511805, + "learning_rate": 2.3993095512082854e-05, + "loss": 0.3344, + "step": 549 + }, + { + "epoch": 1.7080745341614907, + "grad_norm": 0.25692484125212045, + "learning_rate": 2.3935558112773303e-05, + "loss": 0.3487, + "step": 550 + }, + { + "epoch": 1.7111801242236024, + "grad_norm": 0.2438708000844588, + "learning_rate": 2.387802071346375e-05, + "loss": 0.3368, + "step": 551 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.3013744640384419, + "learning_rate": 2.38204833141542e-05, + "loss": 0.3374, + "step": 552 + }, + { + "epoch": 1.7173913043478262, + "grad_norm": 0.25432846986941376, + "learning_rate": 2.376294591484465e-05, + "loss": 0.3229, + "step": 553 + }, + { + "epoch": 1.720496894409938, + "grad_norm": 0.25904590947672523, + "learning_rate": 2.3705408515535098e-05, + "loss": 0.3446, + "step": 554 + }, + { + "epoch": 1.7236024844720497, + "grad_norm": 0.30606145603760704, + "learning_rate": 2.3647871116225546e-05, + "loss": 0.3341, + "step": 555 + }, + { + "epoch": 1.7267080745341614, + "grad_norm": 0.29538170031014566, + "learning_rate": 2.3590333716915995e-05, + "loss": 0.3289, + "step": 556 + }, + { + "epoch": 1.7298136645962732, + "grad_norm": 0.27134852683063904, + "learning_rate": 2.3532796317606444e-05, + "loss": 0.3547, + "step": 557 + }, + { + "epoch": 1.7329192546583851, + "grad_norm": 0.2825681002780479, + "learning_rate": 2.3475258918296892e-05, + "loss": 0.3379, + "step": 558 + }, + { + "epoch": 1.736024844720497, + "grad_norm": 0.2540853936510712, + "learning_rate": 2.341772151898734e-05, + "loss": 0.3382, + "step": 559 + }, + { + "epoch": 1.7391304347826086, + "grad_norm": 0.23898673149156113, + "learning_rate": 2.336018411967779e-05, + "loss": 0.3204, + "step": 560 + }, + { + "epoch": 1.7422360248447206, + "grad_norm": 0.28690037448179756, + "learning_rate": 2.330264672036824e-05, + "loss": 0.3475, + "step": 561 + }, + { + "epoch": 1.7453416149068324, + "grad_norm": 0.2517436769783244, + "learning_rate": 2.3245109321058687e-05, + "loss": 0.3336, + "step": 562 + }, + { + "epoch": 1.7484472049689441, + "grad_norm": 0.26052794049930406, + "learning_rate": 2.3187571921749136e-05, + "loss": 0.3659, + "step": 563 + }, + { + "epoch": 1.7515527950310559, + "grad_norm": 0.2520454393087574, + "learning_rate": 2.3130034522439585e-05, + "loss": 0.341, + "step": 564 + }, + { + "epoch": 1.7546583850931676, + "grad_norm": 0.24469475054712242, + "learning_rate": 2.3072497123130034e-05, + "loss": 0.3385, + "step": 565 + }, + { + "epoch": 1.7577639751552794, + "grad_norm": 0.2688171235493825, + "learning_rate": 2.3014959723820482e-05, + "loss": 0.3194, + "step": 566 + }, + { + "epoch": 1.7608695652173914, + "grad_norm": 0.24660650779589638, + "learning_rate": 2.295742232451093e-05, + "loss": 0.3414, + "step": 567 + }, + { + "epoch": 1.763975155279503, + "grad_norm": 0.24074948906029303, 
+ "learning_rate": 2.289988492520138e-05, + "loss": 0.3487, + "step": 568 + }, + { + "epoch": 1.7670807453416149, + "grad_norm": 0.2683374003415654, + "learning_rate": 2.284234752589183e-05, + "loss": 0.3324, + "step": 569 + }, + { + "epoch": 1.7701863354037268, + "grad_norm": 0.2615920960522321, + "learning_rate": 2.278481012658228e-05, + "loss": 0.3609, + "step": 570 + }, + { + "epoch": 1.7732919254658386, + "grad_norm": 0.24217423401661245, + "learning_rate": 2.272727272727273e-05, + "loss": 0.3642, + "step": 571 + }, + { + "epoch": 1.7763975155279503, + "grad_norm": 0.3146547539059143, + "learning_rate": 2.2669735327963178e-05, + "loss": 0.3333, + "step": 572 + }, + { + "epoch": 1.779503105590062, + "grad_norm": 0.2562957388358894, + "learning_rate": 2.2612197928653627e-05, + "loss": 0.3446, + "step": 573 + }, + { + "epoch": 1.7826086956521738, + "grad_norm": 0.2514732345893343, + "learning_rate": 2.2554660529344075e-05, + "loss": 0.3313, + "step": 574 + }, + { + "epoch": 1.7857142857142856, + "grad_norm": 0.27268825204355784, + "learning_rate": 2.2497123130034524e-05, + "loss": 0.3431, + "step": 575 + }, + { + "epoch": 1.7888198757763976, + "grad_norm": 0.24683018483720148, + "learning_rate": 2.2439585730724973e-05, + "loss": 0.3264, + "step": 576 + }, + { + "epoch": 1.7919254658385093, + "grad_norm": 0.2511343392474156, + "learning_rate": 2.238204833141542e-05, + "loss": 0.3231, + "step": 577 + }, + { + "epoch": 1.795031055900621, + "grad_norm": 0.29263014098541856, + "learning_rate": 2.232451093210587e-05, + "loss": 0.3425, + "step": 578 + }, + { + "epoch": 1.798136645962733, + "grad_norm": 0.26439452081008274, + "learning_rate": 2.226697353279632e-05, + "loss": 0.3404, + "step": 579 + }, + { + "epoch": 1.8012422360248448, + "grad_norm": 0.26624397225893237, + "learning_rate": 2.2209436133486768e-05, + "loss": 0.3521, + "step": 580 + }, + { + "epoch": 1.8043478260869565, + "grad_norm": 0.27231006655087864, + "learning_rate": 2.2151898734177217e-05, + "loss": 0.3499, + "step": 581 + }, + { + "epoch": 1.8074534161490683, + "grad_norm": 0.2717715300694685, + "learning_rate": 2.2094361334867665e-05, + "loss": 0.3488, + "step": 582 + }, + { + "epoch": 1.81055900621118, + "grad_norm": 0.2555398572654679, + "learning_rate": 2.2036823935558114e-05, + "loss": 0.3484, + "step": 583 + }, + { + "epoch": 1.8136645962732918, + "grad_norm": 0.27285873888872886, + "learning_rate": 2.1979286536248563e-05, + "loss": 0.3301, + "step": 584 + }, + { + "epoch": 1.8167701863354038, + "grad_norm": 0.25210730048319585, + "learning_rate": 2.192174913693901e-05, + "loss": 0.3424, + "step": 585 + }, + { + "epoch": 1.8198757763975155, + "grad_norm": 0.26842055467218623, + "learning_rate": 2.186421173762946e-05, + "loss": 0.3408, + "step": 586 + }, + { + "epoch": 1.8229813664596275, + "grad_norm": 0.3208770216327945, + "learning_rate": 2.180667433831991e-05, + "loss": 0.3312, + "step": 587 + }, + { + "epoch": 1.8260869565217392, + "grad_norm": 0.2559541162545561, + "learning_rate": 2.1749136939010358e-05, + "loss": 0.36, + "step": 588 + }, + { + "epoch": 1.829192546583851, + "grad_norm": 0.32134394732411636, + "learning_rate": 2.1691599539700806e-05, + "loss": 0.3394, + "step": 589 + }, + { + "epoch": 1.8322981366459627, + "grad_norm": 0.2708594663810051, + "learning_rate": 2.1634062140391255e-05, + "loss": 0.3431, + "step": 590 + }, + { + "epoch": 1.8354037267080745, + "grad_norm": 0.3010404719152366, + "learning_rate": 2.1576524741081704e-05, + "loss": 0.3417, + "step": 591 + }, + { + "epoch": 
1.8385093167701863, + "grad_norm": 0.3070106173244936, + "learning_rate": 2.1518987341772153e-05, + "loss": 0.3388, + "step": 592 + }, + { + "epoch": 1.841614906832298, + "grad_norm": 0.24023699838734106, + "learning_rate": 2.14614499424626e-05, + "loss": 0.359, + "step": 593 + }, + { + "epoch": 1.84472049689441, + "grad_norm": 0.27420152661967667, + "learning_rate": 2.140391254315305e-05, + "loss": 0.3262, + "step": 594 + }, + { + "epoch": 1.8478260869565217, + "grad_norm": 0.3128089706224423, + "learning_rate": 2.13463751438435e-05, + "loss": 0.3284, + "step": 595 + }, + { + "epoch": 1.8509316770186337, + "grad_norm": 0.26784524761567324, + "learning_rate": 2.1288837744533947e-05, + "loss": 0.3337, + "step": 596 + }, + { + "epoch": 1.8540372670807455, + "grad_norm": 0.2897924996984458, + "learning_rate": 2.1231300345224396e-05, + "loss": 0.3325, + "step": 597 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.25009521464769496, + "learning_rate": 2.1173762945914845e-05, + "loss": 0.3457, + "step": 598 + }, + { + "epoch": 1.860248447204969, + "grad_norm": 0.3055325339631166, + "learning_rate": 2.1116225546605294e-05, + "loss": 0.3506, + "step": 599 + }, + { + "epoch": 1.8633540372670807, + "grad_norm": 0.28613812992385934, + "learning_rate": 2.1058688147295742e-05, + "loss": 0.3609, + "step": 600 + }, + { + "epoch": 1.8664596273291925, + "grad_norm": 0.2705173567449075, + "learning_rate": 2.100115074798619e-05, + "loss": 0.3259, + "step": 601 + }, + { + "epoch": 1.8695652173913042, + "grad_norm": 0.3180478776075474, + "learning_rate": 2.094361334867664e-05, + "loss": 0.3459, + "step": 602 + }, + { + "epoch": 1.8726708074534162, + "grad_norm": 0.2667088182720578, + "learning_rate": 2.088607594936709e-05, + "loss": 0.3415, + "step": 603 + }, + { + "epoch": 1.875776397515528, + "grad_norm": 0.2721287511996052, + "learning_rate": 2.0828538550057537e-05, + "loss": 0.3295, + "step": 604 + }, + { + "epoch": 1.87888198757764, + "grad_norm": 0.24692799289830528, + "learning_rate": 2.0771001150747986e-05, + "loss": 0.3389, + "step": 605 + }, + { + "epoch": 1.8819875776397517, + "grad_norm": 0.25562500894154333, + "learning_rate": 2.0713463751438435e-05, + "loss": 0.3319, + "step": 606 + }, + { + "epoch": 1.8850931677018634, + "grad_norm": 0.2788963760074411, + "learning_rate": 2.0655926352128883e-05, + "loss": 0.3285, + "step": 607 + }, + { + "epoch": 1.8881987577639752, + "grad_norm": 0.24657052891375197, + "learning_rate": 2.0598388952819332e-05, + "loss": 0.3457, + "step": 608 + }, + { + "epoch": 1.891304347826087, + "grad_norm": 0.32309129817645427, + "learning_rate": 2.054085155350978e-05, + "loss": 0.3403, + "step": 609 + }, + { + "epoch": 1.8944099378881987, + "grad_norm": 0.30175425766070024, + "learning_rate": 2.048331415420023e-05, + "loss": 0.3471, + "step": 610 + }, + { + "epoch": 1.8975155279503104, + "grad_norm": 0.26841376007608464, + "learning_rate": 2.042577675489068e-05, + "loss": 0.3375, + "step": 611 + }, + { + "epoch": 1.9006211180124224, + "grad_norm": 0.262662943207609, + "learning_rate": 2.0368239355581127e-05, + "loss": 0.3323, + "step": 612 + }, + { + "epoch": 1.9037267080745341, + "grad_norm": 0.2701013469116013, + "learning_rate": 2.0310701956271576e-05, + "loss": 0.3421, + "step": 613 + }, + { + "epoch": 1.9068322981366461, + "grad_norm": 0.2714596873719603, + "learning_rate": 2.0253164556962025e-05, + "loss": 0.3459, + "step": 614 + }, + { + "epoch": 1.9099378881987579, + "grad_norm": 0.27588772682551244, + "learning_rate": 2.0195627157652473e-05, + "loss": 
0.3393, + "step": 615 + }, + { + "epoch": 1.9130434782608696, + "grad_norm": 0.2599899065726882, + "learning_rate": 2.0138089758342922e-05, + "loss": 0.3272, + "step": 616 + }, + { + "epoch": 1.9161490683229814, + "grad_norm": 0.29859390134967695, + "learning_rate": 2.008055235903337e-05, + "loss": 0.3406, + "step": 617 + }, + { + "epoch": 1.9192546583850931, + "grad_norm": 0.2506363801804046, + "learning_rate": 2.002301495972382e-05, + "loss": 0.3442, + "step": 618 + }, + { + "epoch": 1.9223602484472049, + "grad_norm": 0.27643958694894183, + "learning_rate": 1.9965477560414268e-05, + "loss": 0.3266, + "step": 619 + }, + { + "epoch": 1.9254658385093166, + "grad_norm": 0.24433788612177662, + "learning_rate": 1.9907940161104717e-05, + "loss": 0.3282, + "step": 620 + }, + { + "epoch": 1.9285714285714286, + "grad_norm": 0.23745008386988042, + "learning_rate": 1.9850402761795166e-05, + "loss": 0.3362, + "step": 621 + }, + { + "epoch": 1.9316770186335404, + "grad_norm": 0.26121859150272697, + "learning_rate": 1.9792865362485614e-05, + "loss": 0.3286, + "step": 622 + }, + { + "epoch": 1.9347826086956523, + "grad_norm": 0.2593462473033886, + "learning_rate": 1.9735327963176063e-05, + "loss": 0.3277, + "step": 623 + }, + { + "epoch": 1.937888198757764, + "grad_norm": 0.251734596039316, + "learning_rate": 1.9677790563866512e-05, + "loss": 0.3243, + "step": 624 + }, + { + "epoch": 1.9409937888198758, + "grad_norm": 0.2796993503020773, + "learning_rate": 1.962025316455696e-05, + "loss": 0.341, + "step": 625 + }, + { + "epoch": 1.9440993788819876, + "grad_norm": 0.24405119567007771, + "learning_rate": 1.956271576524741e-05, + "loss": 0.3263, + "step": 626 + }, + { + "epoch": 1.9472049689440993, + "grad_norm": 0.2551104705286801, + "learning_rate": 1.9505178365937858e-05, + "loss": 0.3273, + "step": 627 + }, + { + "epoch": 1.950310559006211, + "grad_norm": 0.30160867369839583, + "learning_rate": 1.9447640966628307e-05, + "loss": 0.3461, + "step": 628 + }, + { + "epoch": 1.9534161490683228, + "grad_norm": 0.2307083280109175, + "learning_rate": 1.9390103567318755e-05, + "loss": 0.3195, + "step": 629 + }, + { + "epoch": 1.9565217391304348, + "grad_norm": 0.27856475014068843, + "learning_rate": 1.9332566168009204e-05, + "loss": 0.3534, + "step": 630 + }, + { + "epoch": 1.9596273291925466, + "grad_norm": 0.2422123663936176, + "learning_rate": 1.9275028768699653e-05, + "loss": 0.3444, + "step": 631 + }, + { + "epoch": 1.9627329192546585, + "grad_norm": 0.2508344560707552, + "learning_rate": 1.92174913693901e-05, + "loss": 0.3316, + "step": 632 + }, + { + "epoch": 1.9658385093167703, + "grad_norm": 0.25457679605852573, + "learning_rate": 1.915995397008055e-05, + "loss": 0.3511, + "step": 633 + }, + { + "epoch": 1.968944099378882, + "grad_norm": 0.255448027513179, + "learning_rate": 1.9102416570771002e-05, + "loss": 0.3417, + "step": 634 + }, + { + "epoch": 1.9720496894409938, + "grad_norm": 0.2655225587813165, + "learning_rate": 1.904487917146145e-05, + "loss": 0.3364, + "step": 635 + }, + { + "epoch": 1.9751552795031055, + "grad_norm": 0.2644532896395622, + "learning_rate": 1.89873417721519e-05, + "loss": 0.3439, + "step": 636 + }, + { + "epoch": 1.9782608695652173, + "grad_norm": 0.25431765900047304, + "learning_rate": 1.892980437284235e-05, + "loss": 0.3277, + "step": 637 + }, + { + "epoch": 1.981366459627329, + "grad_norm": 0.253925081047276, + "learning_rate": 1.8872266973532797e-05, + "loss": 0.345, + "step": 638 + }, + { + "epoch": 1.984472049689441, + "grad_norm": 0.2542103813230237, + "learning_rate": 
1.8814729574223246e-05, + "loss": 0.3458, + "step": 639 + }, + { + "epoch": 1.9875776397515528, + "grad_norm": 0.298104123148457, + "learning_rate": 1.8757192174913695e-05, + "loss": 0.3278, + "step": 640 + }, + { + "epoch": 1.9906832298136647, + "grad_norm": 0.2322373735825899, + "learning_rate": 1.8699654775604144e-05, + "loss": 0.3426, + "step": 641 + }, + { + "epoch": 1.9937888198757765, + "grad_norm": 0.24606988538470728, + "learning_rate": 1.8642117376294592e-05, + "loss": 0.3358, + "step": 642 + }, + { + "epoch": 1.9968944099378882, + "grad_norm": 0.2849652231363428, + "learning_rate": 1.858457997698504e-05, + "loss": 0.315, + "step": 643 + }, + { + "epoch": 2.0, + "grad_norm": 0.277308601131606, + "learning_rate": 1.852704257767549e-05, + "loss": 0.3275, + "step": 644 + }, + { + "epoch": 2.0031055900621118, + "grad_norm": 0.3117446232875321, + "learning_rate": 1.846950517836594e-05, + "loss": 0.2631, + "step": 645 + }, + { + "epoch": 2.0062111801242235, + "grad_norm": 0.27820371561408924, + "learning_rate": 1.8411967779056387e-05, + "loss": 0.26, + "step": 646 + }, + { + "epoch": 2.0093167701863353, + "grad_norm": 0.35212508445991075, + "learning_rate": 1.8354430379746836e-05, + "loss": 0.2589, + "step": 647 + }, + { + "epoch": 2.012422360248447, + "grad_norm": 0.29598296168936833, + "learning_rate": 1.8296892980437285e-05, + "loss": 0.2727, + "step": 648 + }, + { + "epoch": 2.015527950310559, + "grad_norm": 0.23748767777958518, + "learning_rate": 1.8239355581127733e-05, + "loss": 0.2603, + "step": 649 + }, + { + "epoch": 2.018633540372671, + "grad_norm": 0.3396040317332316, + "learning_rate": 1.8181818181818182e-05, + "loss": 0.2494, + "step": 650 + }, + { + "epoch": 2.0217391304347827, + "grad_norm": 0.26773683268799814, + "learning_rate": 1.812428078250863e-05, + "loss": 0.2538, + "step": 651 + }, + { + "epoch": 2.0248447204968945, + "grad_norm": 0.26908218639639603, + "learning_rate": 1.806674338319908e-05, + "loss": 0.2636, + "step": 652 + }, + { + "epoch": 2.027950310559006, + "grad_norm": 0.2934635435841592, + "learning_rate": 1.8009205983889528e-05, + "loss": 0.25, + "step": 653 + }, + { + "epoch": 2.031055900621118, + "grad_norm": 0.2785089363356141, + "learning_rate": 1.7951668584579977e-05, + "loss": 0.2441, + "step": 654 + }, + { + "epoch": 2.0341614906832297, + "grad_norm": 0.2638349484890508, + "learning_rate": 1.7894131185270426e-05, + "loss": 0.2519, + "step": 655 + }, + { + "epoch": 2.0372670807453415, + "grad_norm": 0.2586235412884467, + "learning_rate": 1.7836593785960874e-05, + "loss": 0.2509, + "step": 656 + }, + { + "epoch": 2.040372670807453, + "grad_norm": 0.2472488808463837, + "learning_rate": 1.7779056386651323e-05, + "loss": 0.2538, + "step": 657 + }, + { + "epoch": 2.0434782608695654, + "grad_norm": 0.2795998982851747, + "learning_rate": 1.7721518987341772e-05, + "loss": 0.245, + "step": 658 + }, + { + "epoch": 2.046583850931677, + "grad_norm": 0.22761521829171, + "learning_rate": 1.766398158803222e-05, + "loss": 0.257, + "step": 659 + }, + { + "epoch": 2.049689440993789, + "grad_norm": 0.27491770536559856, + "learning_rate": 1.760644418872267e-05, + "loss": 0.2543, + "step": 660 + }, + { + "epoch": 2.0527950310559007, + "grad_norm": 0.2694008994446243, + "learning_rate": 1.7548906789413118e-05, + "loss": 0.2523, + "step": 661 + }, + { + "epoch": 2.0559006211180124, + "grad_norm": 0.2649790265269665, + "learning_rate": 1.7491369390103567e-05, + "loss": 0.2638, + "step": 662 + }, + { + "epoch": 2.059006211180124, + "grad_norm": 0.2592501818191762, + 
"learning_rate": 1.7433831990794016e-05, + "loss": 0.2577, + "step": 663 + }, + { + "epoch": 2.062111801242236, + "grad_norm": 0.2669822602469171, + "learning_rate": 1.7376294591484464e-05, + "loss": 0.2609, + "step": 664 + }, + { + "epoch": 2.0652173913043477, + "grad_norm": 0.24387894874703012, + "learning_rate": 1.7318757192174913e-05, + "loss": 0.2633, + "step": 665 + }, + { + "epoch": 2.0683229813664594, + "grad_norm": 0.2492896708129992, + "learning_rate": 1.7261219792865362e-05, + "loss": 0.2574, + "step": 666 + }, + { + "epoch": 2.0714285714285716, + "grad_norm": 0.23542048799701373, + "learning_rate": 1.720368239355581e-05, + "loss": 0.2472, + "step": 667 + }, + { + "epoch": 2.0745341614906834, + "grad_norm": 0.23318721989860372, + "learning_rate": 1.714614499424626e-05, + "loss": 0.2492, + "step": 668 + }, + { + "epoch": 2.077639751552795, + "grad_norm": 0.2184370669246145, + "learning_rate": 1.7088607594936708e-05, + "loss": 0.2614, + "step": 669 + }, + { + "epoch": 2.080745341614907, + "grad_norm": 0.23606974543337897, + "learning_rate": 1.7031070195627157e-05, + "loss": 0.2669, + "step": 670 + }, + { + "epoch": 2.0838509316770186, + "grad_norm": 0.24573655537821745, + "learning_rate": 1.6973532796317605e-05, + "loss": 0.2495, + "step": 671 + }, + { + "epoch": 2.0869565217391304, + "grad_norm": 0.22577283258104885, + "learning_rate": 1.6915995397008054e-05, + "loss": 0.2324, + "step": 672 + }, + { + "epoch": 2.090062111801242, + "grad_norm": 0.22880524187260692, + "learning_rate": 1.6858457997698503e-05, + "loss": 0.2454, + "step": 673 + }, + { + "epoch": 2.093167701863354, + "grad_norm": 0.23649506425354394, + "learning_rate": 1.680092059838895e-05, + "loss": 0.2706, + "step": 674 + }, + { + "epoch": 2.0962732919254656, + "grad_norm": 0.25403811052331426, + "learning_rate": 1.67433831990794e-05, + "loss": 0.2543, + "step": 675 + }, + { + "epoch": 2.099378881987578, + "grad_norm": 0.2443521353581772, + "learning_rate": 1.668584579976985e-05, + "loss": 0.2484, + "step": 676 + }, + { + "epoch": 2.1024844720496896, + "grad_norm": 0.21359909291298998, + "learning_rate": 1.6628308400460298e-05, + "loss": 0.242, + "step": 677 + }, + { + "epoch": 2.1055900621118013, + "grad_norm": 0.2270460742418379, + "learning_rate": 1.6570771001150746e-05, + "loss": 0.248, + "step": 678 + }, + { + "epoch": 2.108695652173913, + "grad_norm": 0.22643050215454086, + "learning_rate": 1.6513233601841195e-05, + "loss": 0.2477, + "step": 679 + }, + { + "epoch": 2.111801242236025, + "grad_norm": 0.22299140134872011, + "learning_rate": 1.6455696202531644e-05, + "loss": 0.2568, + "step": 680 + }, + { + "epoch": 2.1149068322981366, + "grad_norm": 0.21597401049687515, + "learning_rate": 1.6398158803222093e-05, + "loss": 0.2567, + "step": 681 + }, + { + "epoch": 2.1180124223602483, + "grad_norm": 0.21424964627998483, + "learning_rate": 1.634062140391254e-05, + "loss": 0.2594, + "step": 682 + }, + { + "epoch": 2.12111801242236, + "grad_norm": 0.22064934278360224, + "learning_rate": 1.628308400460299e-05, + "loss": 0.2548, + "step": 683 + }, + { + "epoch": 2.124223602484472, + "grad_norm": 0.2226010291399242, + "learning_rate": 1.622554660529344e-05, + "loss": 0.2392, + "step": 684 + }, + { + "epoch": 2.127329192546584, + "grad_norm": 0.23817934921154135, + "learning_rate": 1.616800920598389e-05, + "loss": 0.263, + "step": 685 + }, + { + "epoch": 2.130434782608696, + "grad_norm": 0.21811012125800597, + "learning_rate": 1.611047180667434e-05, + "loss": 0.239, + "step": 686 + }, + { + "epoch": 2.1335403726708075, + 
"grad_norm": 0.22669201592312113, + "learning_rate": 1.605293440736479e-05, + "loss": 0.2526, + "step": 687 + }, + { + "epoch": 2.1366459627329193, + "grad_norm": 0.23305634956402152, + "learning_rate": 1.5995397008055237e-05, + "loss": 0.2685, + "step": 688 + }, + { + "epoch": 2.139751552795031, + "grad_norm": 0.2115884014346869, + "learning_rate": 1.5937859608745686e-05, + "loss": 0.2393, + "step": 689 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.2115627751194399, + "learning_rate": 1.5880322209436135e-05, + "loss": 0.251, + "step": 690 + }, + { + "epoch": 2.1459627329192545, + "grad_norm": 0.21692908050798426, + "learning_rate": 1.5822784810126583e-05, + "loss": 0.2512, + "step": 691 + }, + { + "epoch": 2.1490683229813663, + "grad_norm": 0.22462301830943301, + "learning_rate": 1.5765247410817032e-05, + "loss": 0.2511, + "step": 692 + }, + { + "epoch": 2.1521739130434785, + "grad_norm": 0.19326801109974084, + "learning_rate": 1.570771001150748e-05, + "loss": 0.2452, + "step": 693 + }, + { + "epoch": 2.1552795031055902, + "grad_norm": 0.23274032126488928, + "learning_rate": 1.565017261219793e-05, + "loss": 0.2529, + "step": 694 + }, + { + "epoch": 2.158385093167702, + "grad_norm": 0.22582002907669432, + "learning_rate": 1.5592635212888378e-05, + "loss": 0.2597, + "step": 695 + }, + { + "epoch": 2.1614906832298137, + "grad_norm": 0.21665150454694335, + "learning_rate": 1.5535097813578827e-05, + "loss": 0.258, + "step": 696 + }, + { + "epoch": 2.1645962732919255, + "grad_norm": 0.21401338541621684, + "learning_rate": 1.5477560414269276e-05, + "loss": 0.2556, + "step": 697 + }, + { + "epoch": 2.1677018633540373, + "grad_norm": 0.23527336018366451, + "learning_rate": 1.5420023014959724e-05, + "loss": 0.2582, + "step": 698 + }, + { + "epoch": 2.170807453416149, + "grad_norm": 0.21445105954714194, + "learning_rate": 1.5362485615650173e-05, + "loss": 0.2503, + "step": 699 + }, + { + "epoch": 2.1739130434782608, + "grad_norm": 0.21511080318136375, + "learning_rate": 1.5304948216340622e-05, + "loss": 0.2566, + "step": 700 + }, + { + "epoch": 2.1770186335403725, + "grad_norm": 0.24111702768301724, + "learning_rate": 1.5247410817031072e-05, + "loss": 0.2425, + "step": 701 + }, + { + "epoch": 2.1801242236024843, + "grad_norm": 0.21498468064553858, + "learning_rate": 1.5189873417721521e-05, + "loss": 0.2439, + "step": 702 + }, + { + "epoch": 2.1832298136645965, + "grad_norm": 0.21967291818581178, + "learning_rate": 1.513233601841197e-05, + "loss": 0.2509, + "step": 703 + }, + { + "epoch": 2.186335403726708, + "grad_norm": 0.2205935490599146, + "learning_rate": 1.5074798619102418e-05, + "loss": 0.2579, + "step": 704 + }, + { + "epoch": 2.18944099378882, + "grad_norm": 0.23758914721632698, + "learning_rate": 1.5017261219792867e-05, + "loss": 0.2399, + "step": 705 + }, + { + "epoch": 2.1925465838509317, + "grad_norm": 0.19571832530537867, + "learning_rate": 1.4959723820483316e-05, + "loss": 0.2404, + "step": 706 + }, + { + "epoch": 2.1956521739130435, + "grad_norm": 0.20772523111005442, + "learning_rate": 1.4902186421173765e-05, + "loss": 0.2489, + "step": 707 + }, + { + "epoch": 2.198757763975155, + "grad_norm": 0.2078388868457063, + "learning_rate": 1.4844649021864213e-05, + "loss": 0.2404, + "step": 708 + }, + { + "epoch": 2.201863354037267, + "grad_norm": 0.24074640885647317, + "learning_rate": 1.4787111622554662e-05, + "loss": 0.2647, + "step": 709 + }, + { + "epoch": 2.2049689440993787, + "grad_norm": 0.20937990276765678, + "learning_rate": 1.472957422324511e-05, + "loss": 0.2578, + 
"step": 710 + }, + { + "epoch": 2.208074534161491, + "grad_norm": 0.2639807190802869, + "learning_rate": 1.467203682393556e-05, + "loss": 0.2607, + "step": 711 + }, + { + "epoch": 2.2111801242236027, + "grad_norm": 0.26293955788698453, + "learning_rate": 1.4614499424626008e-05, + "loss": 0.2638, + "step": 712 + }, + { + "epoch": 2.2142857142857144, + "grad_norm": 0.23828883015584687, + "learning_rate": 1.4556962025316457e-05, + "loss": 0.2577, + "step": 713 + }, + { + "epoch": 2.217391304347826, + "grad_norm": 0.24740324327511762, + "learning_rate": 1.4499424626006906e-05, + "loss": 0.2603, + "step": 714 + }, + { + "epoch": 2.220496894409938, + "grad_norm": 0.22582258369375163, + "learning_rate": 1.4441887226697354e-05, + "loss": 0.2442, + "step": 715 + }, + { + "epoch": 2.2236024844720497, + "grad_norm": 0.24839008006413138, + "learning_rate": 1.4384349827387803e-05, + "loss": 0.2591, + "step": 716 + }, + { + "epoch": 2.2267080745341614, + "grad_norm": 0.2507061092171656, + "learning_rate": 1.4326812428078252e-05, + "loss": 0.2525, + "step": 717 + }, + { + "epoch": 2.229813664596273, + "grad_norm": 0.214855054431312, + "learning_rate": 1.42692750287687e-05, + "loss": 0.2436, + "step": 718 + }, + { + "epoch": 2.232919254658385, + "grad_norm": 0.21592522701402342, + "learning_rate": 1.421173762945915e-05, + "loss": 0.2516, + "step": 719 + }, + { + "epoch": 2.2360248447204967, + "grad_norm": 0.20915695199545198, + "learning_rate": 1.4154200230149598e-05, + "loss": 0.2597, + "step": 720 + }, + { + "epoch": 2.239130434782609, + "grad_norm": 0.22903634190903957, + "learning_rate": 1.4096662830840047e-05, + "loss": 0.2667, + "step": 721 + }, + { + "epoch": 2.2422360248447206, + "grad_norm": 0.21439993038647093, + "learning_rate": 1.4039125431530495e-05, + "loss": 0.2436, + "step": 722 + }, + { + "epoch": 2.2453416149068324, + "grad_norm": 0.21937639860358657, + "learning_rate": 1.3981588032220944e-05, + "loss": 0.2458, + "step": 723 + }, + { + "epoch": 2.248447204968944, + "grad_norm": 0.2013130017609961, + "learning_rate": 1.3924050632911393e-05, + "loss": 0.2491, + "step": 724 + }, + { + "epoch": 2.251552795031056, + "grad_norm": 0.22887750081435682, + "learning_rate": 1.3866513233601842e-05, + "loss": 0.2441, + "step": 725 + }, + { + "epoch": 2.2546583850931676, + "grad_norm": 0.2288064963841507, + "learning_rate": 1.380897583429229e-05, + "loss": 0.2418, + "step": 726 + }, + { + "epoch": 2.2577639751552794, + "grad_norm": 0.23248918550222136, + "learning_rate": 1.3751438434982739e-05, + "loss": 0.2625, + "step": 727 + }, + { + "epoch": 2.260869565217391, + "grad_norm": 0.2090094171369587, + "learning_rate": 1.3693901035673188e-05, + "loss": 0.2507, + "step": 728 + }, + { + "epoch": 2.2639751552795033, + "grad_norm": 0.23154174958563464, + "learning_rate": 1.3636363636363637e-05, + "loss": 0.2586, + "step": 729 + }, + { + "epoch": 2.267080745341615, + "grad_norm": 0.24350218064576923, + "learning_rate": 1.3578826237054085e-05, + "loss": 0.263, + "step": 730 + }, + { + "epoch": 2.270186335403727, + "grad_norm": 0.2278585941156764, + "learning_rate": 1.3521288837744534e-05, + "loss": 0.2555, + "step": 731 + }, + { + "epoch": 2.2732919254658386, + "grad_norm": 0.20801276930170154, + "learning_rate": 1.3463751438434983e-05, + "loss": 0.2572, + "step": 732 + }, + { + "epoch": 2.2763975155279503, + "grad_norm": 0.22919123007559652, + "learning_rate": 1.3406214039125431e-05, + "loss": 0.2582, + "step": 733 + }, + { + "epoch": 2.279503105590062, + "grad_norm": 0.21268694380279451, + "learning_rate": 
1.334867663981588e-05, + "loss": 0.2512, + "step": 734 + }, + { + "epoch": 2.282608695652174, + "grad_norm": 0.2182606134520971, + "learning_rate": 1.3291139240506329e-05, + "loss": 0.2536, + "step": 735 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.2177977754376004, + "learning_rate": 1.323360184119678e-05, + "loss": 0.2589, + "step": 736 + }, + { + "epoch": 2.2888198757763973, + "grad_norm": 0.2079260936390528, + "learning_rate": 1.3176064441887228e-05, + "loss": 0.2445, + "step": 737 + }, + { + "epoch": 2.291925465838509, + "grad_norm": 0.21654285079809454, + "learning_rate": 1.3118527042577677e-05, + "loss": 0.2492, + "step": 738 + }, + { + "epoch": 2.2950310559006213, + "grad_norm": 0.22224222175484207, + "learning_rate": 1.3060989643268126e-05, + "loss": 0.2555, + "step": 739 + }, + { + "epoch": 2.298136645962733, + "grad_norm": 0.2013544241929392, + "learning_rate": 1.3003452243958574e-05, + "loss": 0.2457, + "step": 740 + }, + { + "epoch": 2.301242236024845, + "grad_norm": 0.21733404218015004, + "learning_rate": 1.2945914844649023e-05, + "loss": 0.2659, + "step": 741 + }, + { + "epoch": 2.3043478260869565, + "grad_norm": 0.21179336140885693, + "learning_rate": 1.2888377445339472e-05, + "loss": 0.2426, + "step": 742 + }, + { + "epoch": 2.3074534161490683, + "grad_norm": 0.2285599698694653, + "learning_rate": 1.283084004602992e-05, + "loss": 0.2429, + "step": 743 + }, + { + "epoch": 2.31055900621118, + "grad_norm": 0.19835079918909265, + "learning_rate": 1.2773302646720369e-05, + "loss": 0.2489, + "step": 744 + }, + { + "epoch": 2.313664596273292, + "grad_norm": 0.2298623252387309, + "learning_rate": 1.2715765247410818e-05, + "loss": 0.2655, + "step": 745 + }, + { + "epoch": 2.3167701863354035, + "grad_norm": 0.23867880872639935, + "learning_rate": 1.2658227848101267e-05, + "loss": 0.2498, + "step": 746 + }, + { + "epoch": 2.3198757763975157, + "grad_norm": 0.21037856832784158, + "learning_rate": 1.2600690448791715e-05, + "loss": 0.2589, + "step": 747 + }, + { + "epoch": 2.3229813664596275, + "grad_norm": 0.24695028457966048, + "learning_rate": 1.2543153049482164e-05, + "loss": 0.2502, + "step": 748 + }, + { + "epoch": 2.3260869565217392, + "grad_norm": 0.23360363557581765, + "learning_rate": 1.2485615650172613e-05, + "loss": 0.259, + "step": 749 + }, + { + "epoch": 2.329192546583851, + "grad_norm": 0.22335503888847086, + "learning_rate": 1.2428078250863062e-05, + "loss": 0.2456, + "step": 750 + }, + { + "epoch": 2.3322981366459627, + "grad_norm": 0.21231134626201825, + "learning_rate": 1.237054085155351e-05, + "loss": 0.26, + "step": 751 + }, + { + "epoch": 2.3354037267080745, + "grad_norm": 0.20990198210516803, + "learning_rate": 1.2313003452243959e-05, + "loss": 0.2441, + "step": 752 + }, + { + "epoch": 2.3385093167701863, + "grad_norm": 0.221067131454967, + "learning_rate": 1.2255466052934408e-05, + "loss": 0.2469, + "step": 753 + }, + { + "epoch": 2.341614906832298, + "grad_norm": 0.22138406777470937, + "learning_rate": 1.2197928653624856e-05, + "loss": 0.261, + "step": 754 + }, + { + "epoch": 2.3447204968944098, + "grad_norm": 0.21398489008508845, + "learning_rate": 1.2140391254315305e-05, + "loss": 0.2566, + "step": 755 + }, + { + "epoch": 2.3478260869565215, + "grad_norm": 0.20448116831895594, + "learning_rate": 1.2082853855005754e-05, + "loss": 0.2598, + "step": 756 + }, + { + "epoch": 2.3509316770186337, + "grad_norm": 0.21255766006062407, + "learning_rate": 1.2025316455696203e-05, + "loss": 0.2526, + "step": 757 + }, + { + "epoch": 2.3540372670807455, + 
"grad_norm": 0.19087455271546003, + "learning_rate": 1.1967779056386651e-05, + "loss": 0.2537, + "step": 758 + }, + { + "epoch": 2.357142857142857, + "grad_norm": 0.20379774772998854, + "learning_rate": 1.19102416570771e-05, + "loss": 0.2668, + "step": 759 + }, + { + "epoch": 2.360248447204969, + "grad_norm": 0.19801295062012142, + "learning_rate": 1.1852704257767549e-05, + "loss": 0.2479, + "step": 760 + }, + { + "epoch": 2.3633540372670807, + "grad_norm": 0.2053725094185451, + "learning_rate": 1.1795166858457997e-05, + "loss": 0.2597, + "step": 761 + }, + { + "epoch": 2.3664596273291925, + "grad_norm": 0.19414430502845648, + "learning_rate": 1.1737629459148446e-05, + "loss": 0.2445, + "step": 762 + }, + { + "epoch": 2.369565217391304, + "grad_norm": 0.20779479767313294, + "learning_rate": 1.1680092059838895e-05, + "loss": 0.2649, + "step": 763 + }, + { + "epoch": 2.372670807453416, + "grad_norm": 0.20304929332054908, + "learning_rate": 1.1622554660529344e-05, + "loss": 0.2624, + "step": 764 + }, + { + "epoch": 2.375776397515528, + "grad_norm": 0.20512146624367367, + "learning_rate": 1.1565017261219792e-05, + "loss": 0.2532, + "step": 765 + }, + { + "epoch": 2.37888198757764, + "grad_norm": 0.1948376797912715, + "learning_rate": 1.1507479861910241e-05, + "loss": 0.2593, + "step": 766 + }, + { + "epoch": 2.3819875776397517, + "grad_norm": 0.20111608619484334, + "learning_rate": 1.144994246260069e-05, + "loss": 0.2431, + "step": 767 + }, + { + "epoch": 2.3850931677018634, + "grad_norm": 0.20424563225076126, + "learning_rate": 1.139240506329114e-05, + "loss": 0.239, + "step": 768 + }, + { + "epoch": 2.388198757763975, + "grad_norm": 0.20385122820209117, + "learning_rate": 1.1334867663981589e-05, + "loss": 0.2519, + "step": 769 + }, + { + "epoch": 2.391304347826087, + "grad_norm": 0.2169017997179514, + "learning_rate": 1.1277330264672038e-05, + "loss": 0.2599, + "step": 770 + }, + { + "epoch": 2.3944099378881987, + "grad_norm": 0.20583351351917192, + "learning_rate": 1.1219792865362486e-05, + "loss": 0.2515, + "step": 771 + }, + { + "epoch": 2.3975155279503104, + "grad_norm": 0.20864268761499544, + "learning_rate": 1.1162255466052935e-05, + "loss": 0.2674, + "step": 772 + }, + { + "epoch": 2.400621118012422, + "grad_norm": 0.18352483617724127, + "learning_rate": 1.1104718066743384e-05, + "loss": 0.2517, + "step": 773 + }, + { + "epoch": 2.403726708074534, + "grad_norm": 0.19458848397143083, + "learning_rate": 1.1047180667433833e-05, + "loss": 0.2348, + "step": 774 + }, + { + "epoch": 2.406832298136646, + "grad_norm": 0.22085258658145707, + "learning_rate": 1.0989643268124281e-05, + "loss": 0.2626, + "step": 775 + }, + { + "epoch": 2.409937888198758, + "grad_norm": 0.2244287112114885, + "learning_rate": 1.093210586881473e-05, + "loss": 0.2656, + "step": 776 + }, + { + "epoch": 2.4130434782608696, + "grad_norm": 0.2064604218573695, + "learning_rate": 1.0874568469505179e-05, + "loss": 0.2474, + "step": 777 + }, + { + "epoch": 2.4161490683229814, + "grad_norm": 0.2170623734624135, + "learning_rate": 1.0817031070195628e-05, + "loss": 0.2673, + "step": 778 + }, + { + "epoch": 2.419254658385093, + "grad_norm": 0.21813795262022834, + "learning_rate": 1.0759493670886076e-05, + "loss": 0.2566, + "step": 779 + }, + { + "epoch": 2.422360248447205, + "grad_norm": 0.20015983943955706, + "learning_rate": 1.0701956271576525e-05, + "loss": 0.2433, + "step": 780 + }, + { + "epoch": 2.4254658385093166, + "grad_norm": 0.2518786075901923, + "learning_rate": 1.0644418872266974e-05, + "loss": 0.2542, + "step": 781 
+ }, + { + "epoch": 2.4285714285714284, + "grad_norm": 0.2039696978745147, + "learning_rate": 1.0586881472957422e-05, + "loss": 0.2635, + "step": 782 + }, + { + "epoch": 2.4316770186335406, + "grad_norm": 0.20193387084839037, + "learning_rate": 1.0529344073647871e-05, + "loss": 0.2529, + "step": 783 + }, + { + "epoch": 2.4347826086956523, + "grad_norm": 0.22256582381404963, + "learning_rate": 1.047180667433832e-05, + "loss": 0.2502, + "step": 784 + }, + { + "epoch": 2.437888198757764, + "grad_norm": 0.20375665724837322, + "learning_rate": 1.0414269275028769e-05, + "loss": 0.241, + "step": 785 + }, + { + "epoch": 2.440993788819876, + "grad_norm": 0.2179110760676831, + "learning_rate": 1.0356731875719217e-05, + "loss": 0.2599, + "step": 786 + }, + { + "epoch": 2.4440993788819876, + "grad_norm": 0.22202976810767208, + "learning_rate": 1.0299194476409666e-05, + "loss": 0.2505, + "step": 787 + }, + { + "epoch": 2.4472049689440993, + "grad_norm": 0.42670457942092715, + "learning_rate": 1.0241657077100115e-05, + "loss": 0.241, + "step": 788 + }, + { + "epoch": 2.450310559006211, + "grad_norm": 0.20784564321109833, + "learning_rate": 1.0184119677790564e-05, + "loss": 0.2567, + "step": 789 + }, + { + "epoch": 2.453416149068323, + "grad_norm": 0.20121980240137796, + "learning_rate": 1.0126582278481012e-05, + "loss": 0.2451, + "step": 790 + }, + { + "epoch": 2.4565217391304346, + "grad_norm": 0.21747971229319626, + "learning_rate": 1.0069044879171461e-05, + "loss": 0.2387, + "step": 791 + }, + { + "epoch": 2.4596273291925463, + "grad_norm": 0.18957130652801002, + "learning_rate": 1.001150747986191e-05, + "loss": 0.2404, + "step": 792 + }, + { + "epoch": 2.4627329192546585, + "grad_norm": 0.19623974528931779, + "learning_rate": 9.953970080552358e-06, + "loss": 0.2505, + "step": 793 + }, + { + "epoch": 2.4658385093167703, + "grad_norm": 0.2090564420719582, + "learning_rate": 9.896432681242807e-06, + "loss": 0.259, + "step": 794 + }, + { + "epoch": 2.468944099378882, + "grad_norm": 0.20995347548362167, + "learning_rate": 9.838895281933256e-06, + "loss": 0.2557, + "step": 795 + }, + { + "epoch": 2.472049689440994, + "grad_norm": 0.21072680749655628, + "learning_rate": 9.781357882623705e-06, + "loss": 0.2507, + "step": 796 + }, + { + "epoch": 2.4751552795031055, + "grad_norm": 0.2028138320185975, + "learning_rate": 9.723820483314153e-06, + "loss": 0.2428, + "step": 797 + }, + { + "epoch": 2.4782608695652173, + "grad_norm": 0.203416816769087, + "learning_rate": 9.666283084004602e-06, + "loss": 0.2549, + "step": 798 + }, + { + "epoch": 2.481366459627329, + "grad_norm": 0.2114980169350222, + "learning_rate": 9.60874568469505e-06, + "loss": 0.2544, + "step": 799 + }, + { + "epoch": 2.4844720496894412, + "grad_norm": 0.1947781123063217, + "learning_rate": 9.551208285385501e-06, + "loss": 0.246, + "step": 800 + }, + { + "epoch": 2.487577639751553, + "grad_norm": 0.2313621289649826, + "learning_rate": 9.49367088607595e-06, + "loss": 0.2688, + "step": 801 + }, + { + "epoch": 2.4906832298136647, + "grad_norm": 0.2070540850596655, + "learning_rate": 9.436133486766399e-06, + "loss": 0.2594, + "step": 802 + }, + { + "epoch": 2.4937888198757765, + "grad_norm": 0.21169469541077635, + "learning_rate": 9.378596087456847e-06, + "loss": 0.2493, + "step": 803 + }, + { + "epoch": 2.4968944099378882, + "grad_norm": 0.19281802475760265, + "learning_rate": 9.321058688147296e-06, + "loss": 0.25, + "step": 804 + }, + { + "epoch": 2.5, + "grad_norm": 0.2175842957962285, + "learning_rate": 9.263521288837745e-06, + "loss": 0.2678, + 
"step": 805 + }, + { + "epoch": 2.5031055900621118, + "grad_norm": 0.1942027851518837, + "learning_rate": 9.205983889528194e-06, + "loss": 0.2505, + "step": 806 + }, + { + "epoch": 2.5062111801242235, + "grad_norm": 0.2119389750172559, + "learning_rate": 9.148446490218642e-06, + "loss": 0.2647, + "step": 807 + }, + { + "epoch": 2.5093167701863353, + "grad_norm": 0.20993843490643438, + "learning_rate": 9.090909090909091e-06, + "loss": 0.2381, + "step": 808 + }, + { + "epoch": 2.512422360248447, + "grad_norm": 0.20387329805308116, + "learning_rate": 9.03337169159954e-06, + "loss": 0.2404, + "step": 809 + }, + { + "epoch": 2.5155279503105588, + "grad_norm": 0.206875715468925, + "learning_rate": 8.975834292289988e-06, + "loss": 0.2625, + "step": 810 + }, + { + "epoch": 2.518633540372671, + "grad_norm": 0.20699195679204746, + "learning_rate": 8.918296892980437e-06, + "loss": 0.2496, + "step": 811 + }, + { + "epoch": 2.5217391304347827, + "grad_norm": 0.20082335227786552, + "learning_rate": 8.860759493670886e-06, + "loss": 0.2472, + "step": 812 + }, + { + "epoch": 2.5248447204968945, + "grad_norm": 0.20337421721934987, + "learning_rate": 8.803222094361335e-06, + "loss": 0.2465, + "step": 813 + }, + { + "epoch": 2.527950310559006, + "grad_norm": 0.19690561472031543, + "learning_rate": 8.745684695051783e-06, + "loss": 0.2562, + "step": 814 + }, + { + "epoch": 2.531055900621118, + "grad_norm": 0.20942292198434145, + "learning_rate": 8.688147295742232e-06, + "loss": 0.2448, + "step": 815 + }, + { + "epoch": 2.5341614906832297, + "grad_norm": 0.22511418926211027, + "learning_rate": 8.630609896432681e-06, + "loss": 0.2585, + "step": 816 + }, + { + "epoch": 2.5372670807453415, + "grad_norm": 0.21038192778136464, + "learning_rate": 8.57307249712313e-06, + "loss": 0.2463, + "step": 817 + }, + { + "epoch": 2.5403726708074537, + "grad_norm": 0.18890075777071388, + "learning_rate": 8.515535097813578e-06, + "loss": 0.2521, + "step": 818 + }, + { + "epoch": 2.5434782608695654, + "grad_norm": 0.21205002134781, + "learning_rate": 8.457997698504027e-06, + "loss": 0.2585, + "step": 819 + }, + { + "epoch": 2.546583850931677, + "grad_norm": 0.1941024098027217, + "learning_rate": 8.400460299194476e-06, + "loss": 0.2566, + "step": 820 + }, + { + "epoch": 2.549689440993789, + "grad_norm": 0.30349180360429645, + "learning_rate": 8.342922899884924e-06, + "loss": 0.2623, + "step": 821 + }, + { + "epoch": 2.5527950310559007, + "grad_norm": 0.22803507573466544, + "learning_rate": 8.285385500575373e-06, + "loss": 0.2558, + "step": 822 + }, + { + "epoch": 2.5559006211180124, + "grad_norm": 0.2020632346168216, + "learning_rate": 8.227848101265822e-06, + "loss": 0.2586, + "step": 823 + }, + { + "epoch": 2.559006211180124, + "grad_norm": 0.19503633689058, + "learning_rate": 8.17031070195627e-06, + "loss": 0.2628, + "step": 824 + }, + { + "epoch": 2.562111801242236, + "grad_norm": 0.19443407045409983, + "learning_rate": 8.11277330264672e-06, + "loss": 0.2492, + "step": 825 + }, + { + "epoch": 2.5652173913043477, + "grad_norm": 0.20150007916652513, + "learning_rate": 8.05523590333717e-06, + "loss": 0.256, + "step": 826 + }, + { + "epoch": 2.5683229813664594, + "grad_norm": 0.20193826865741932, + "learning_rate": 7.997698504027619e-06, + "loss": 0.2716, + "step": 827 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.19621787984261999, + "learning_rate": 7.940161104718067e-06, + "loss": 0.2443, + "step": 828 + }, + { + "epoch": 2.5745341614906834, + "grad_norm": 0.19610522530135707, + "learning_rate": 7.882623705408516e-06, + 
"loss": 0.2562, + "step": 829 + }, + { + "epoch": 2.577639751552795, + "grad_norm": 0.1980537399225623, + "learning_rate": 7.825086306098965e-06, + "loss": 0.2475, + "step": 830 + }, + { + "epoch": 2.580745341614907, + "grad_norm": 0.19074805307763945, + "learning_rate": 7.767548906789413e-06, + "loss": 0.2557, + "step": 831 + }, + { + "epoch": 2.5838509316770186, + "grad_norm": 0.19613067044699573, + "learning_rate": 7.710011507479862e-06, + "loss": 0.2633, + "step": 832 + }, + { + "epoch": 2.5869565217391304, + "grad_norm": 0.19682931877320217, + "learning_rate": 7.652474108170311e-06, + "loss": 0.2378, + "step": 833 + }, + { + "epoch": 2.590062111801242, + "grad_norm": 0.20053417585734873, + "learning_rate": 7.5949367088607605e-06, + "loss": 0.2498, + "step": 834 + }, + { + "epoch": 2.593167701863354, + "grad_norm": 0.19178100866522357, + "learning_rate": 7.537399309551209e-06, + "loss": 0.2355, + "step": 835 + }, + { + "epoch": 2.596273291925466, + "grad_norm": 0.2084827189783707, + "learning_rate": 7.479861910241658e-06, + "loss": 0.2764, + "step": 836 + }, + { + "epoch": 2.599378881987578, + "grad_norm": 0.19540838307901068, + "learning_rate": 7.422324510932107e-06, + "loss": 0.2437, + "step": 837 + }, + { + "epoch": 2.6024844720496896, + "grad_norm": 0.19587457349490991, + "learning_rate": 7.364787111622555e-06, + "loss": 0.2489, + "step": 838 + }, + { + "epoch": 2.6055900621118013, + "grad_norm": 0.25075690817051544, + "learning_rate": 7.307249712313004e-06, + "loss": 0.2648, + "step": 839 + }, + { + "epoch": 2.608695652173913, + "grad_norm": 0.20743291534086578, + "learning_rate": 7.249712313003453e-06, + "loss": 0.2646, + "step": 840 + }, + { + "epoch": 2.611801242236025, + "grad_norm": 0.21071395029449075, + "learning_rate": 7.1921749136939016e-06, + "loss": 0.2427, + "step": 841 + }, + { + "epoch": 2.6149068322981366, + "grad_norm": 0.20235523726201224, + "learning_rate": 7.13463751438435e-06, + "loss": 0.2587, + "step": 842 + }, + { + "epoch": 2.6180124223602483, + "grad_norm": 0.20149232436113795, + "learning_rate": 7.077100115074799e-06, + "loss": 0.2516, + "step": 843 + }, + { + "epoch": 2.62111801242236, + "grad_norm": 0.21144648873433503, + "learning_rate": 7.019562715765248e-06, + "loss": 0.2582, + "step": 844 + }, + { + "epoch": 2.624223602484472, + "grad_norm": 0.2162300937976304, + "learning_rate": 6.9620253164556965e-06, + "loss": 0.2556, + "step": 845 + }, + { + "epoch": 2.6273291925465836, + "grad_norm": 0.21106771620646603, + "learning_rate": 6.904487917146145e-06, + "loss": 0.2558, + "step": 846 + }, + { + "epoch": 2.630434782608696, + "grad_norm": 0.23609832773446915, + "learning_rate": 6.846950517836594e-06, + "loss": 0.2572, + "step": 847 + }, + { + "epoch": 2.6335403726708075, + "grad_norm": 0.21122404379666423, + "learning_rate": 6.789413118527043e-06, + "loss": 0.2434, + "step": 848 + }, + { + "epoch": 2.6366459627329193, + "grad_norm": 0.2015181805089703, + "learning_rate": 6.731875719217491e-06, + "loss": 0.2418, + "step": 849 + }, + { + "epoch": 2.639751552795031, + "grad_norm": 0.20647243106844593, + "learning_rate": 6.67433831990794e-06, + "loss": 0.265, + "step": 850 + }, + { + "epoch": 2.642857142857143, + "grad_norm": 0.2083640341120549, + "learning_rate": 6.61680092059839e-06, + "loss": 0.2531, + "step": 851 + }, + { + "epoch": 2.6459627329192545, + "grad_norm": 0.20501908976688168, + "learning_rate": 6.559263521288838e-06, + "loss": 0.2514, + "step": 852 + }, + { + "epoch": 2.6490683229813663, + "grad_norm": 0.19844284276810914, + "learning_rate": 
6.501726121979287e-06, + "loss": 0.2644, + "step": 853 + }, + { + "epoch": 2.6521739130434785, + "grad_norm": 0.21201237882135082, + "learning_rate": 6.444188722669736e-06, + "loss": 0.2568, + "step": 854 + }, + { + "epoch": 2.6552795031055902, + "grad_norm": 0.22195301360518224, + "learning_rate": 6.3866513233601846e-06, + "loss": 0.261, + "step": 855 + }, + { + "epoch": 2.658385093167702, + "grad_norm": 0.19287865061356418, + "learning_rate": 6.329113924050633e-06, + "loss": 0.2469, + "step": 856 + }, + { + "epoch": 2.6614906832298137, + "grad_norm": 0.19640829139853255, + "learning_rate": 6.271576524741082e-06, + "loss": 0.2462, + "step": 857 + }, + { + "epoch": 2.6645962732919255, + "grad_norm": 0.20101972350059313, + "learning_rate": 6.214039125431531e-06, + "loss": 0.255, + "step": 858 + }, + { + "epoch": 2.6677018633540373, + "grad_norm": 0.2841326489307957, + "learning_rate": 6.1565017261219795e-06, + "loss": 0.2457, + "step": 859 + }, + { + "epoch": 2.670807453416149, + "grad_norm": 0.18827454901664883, + "learning_rate": 6.098964326812428e-06, + "loss": 0.2427, + "step": 860 + }, + { + "epoch": 2.6739130434782608, + "grad_norm": 0.20109847479853832, + "learning_rate": 6.041426927502877e-06, + "loss": 0.2402, + "step": 861 + }, + { + "epoch": 2.6770186335403725, + "grad_norm": 0.1910402172602598, + "learning_rate": 5.983889528193326e-06, + "loss": 0.2627, + "step": 862 + }, + { + "epoch": 2.6801242236024843, + "grad_norm": 0.1974312904693097, + "learning_rate": 5.926352128883774e-06, + "loss": 0.2625, + "step": 863 + }, + { + "epoch": 2.683229813664596, + "grad_norm": 0.19911868656713894, + "learning_rate": 5.868814729574223e-06, + "loss": 0.2368, + "step": 864 + }, + { + "epoch": 2.686335403726708, + "grad_norm": 0.21362726329843149, + "learning_rate": 5.811277330264672e-06, + "loss": 0.2534, + "step": 865 + }, + { + "epoch": 2.68944099378882, + "grad_norm": 0.20941798902436187, + "learning_rate": 5.7537399309551206e-06, + "loss": 0.2454, + "step": 866 + }, + { + "epoch": 2.6925465838509317, + "grad_norm": 0.19014291486371018, + "learning_rate": 5.69620253164557e-06, + "loss": 0.2446, + "step": 867 + }, + { + "epoch": 2.6956521739130435, + "grad_norm": 0.19597012112115988, + "learning_rate": 5.638665132336019e-06, + "loss": 0.2537, + "step": 868 + }, + { + "epoch": 2.698757763975155, + "grad_norm": 0.19714293851097728, + "learning_rate": 5.581127733026468e-06, + "loss": 0.2468, + "step": 869 + }, + { + "epoch": 2.701863354037267, + "grad_norm": 0.19621178971442163, + "learning_rate": 5.523590333716916e-06, + "loss": 0.2507, + "step": 870 + }, + { + "epoch": 2.704968944099379, + "grad_norm": 0.19491684844874946, + "learning_rate": 5.466052934407365e-06, + "loss": 0.2413, + "step": 871 + }, + { + "epoch": 2.708074534161491, + "grad_norm": 0.27521581958829827, + "learning_rate": 5.408515535097814e-06, + "loss": 0.2633, + "step": 872 + }, + { + "epoch": 2.7111801242236027, + "grad_norm": 0.2168313001961523, + "learning_rate": 5.3509781357882625e-06, + "loss": 0.264, + "step": 873 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 0.18797210234683806, + "learning_rate": 5.293440736478711e-06, + "loss": 0.2447, + "step": 874 + }, + { + "epoch": 2.717391304347826, + "grad_norm": 0.21084636753160527, + "learning_rate": 5.23590333716916e-06, + "loss": 0.2619, + "step": 875 + }, + { + "epoch": 2.720496894409938, + "grad_norm": 0.20635684776280216, + "learning_rate": 5.178365937859609e-06, + "loss": 0.2583, + "step": 876 + }, + { + "epoch": 2.7236024844720497, + "grad_norm": 
0.19468296653400607, + "learning_rate": 5.120828538550057e-06, + "loss": 0.2492, + "step": 877 + }, + { + "epoch": 2.7267080745341614, + "grad_norm": 0.20398048699150237, + "learning_rate": 5.063291139240506e-06, + "loss": 0.2549, + "step": 878 + }, + { + "epoch": 2.729813664596273, + "grad_norm": 0.18689815600092072, + "learning_rate": 5.005753739930955e-06, + "loss": 0.2488, + "step": 879 + }, + { + "epoch": 2.732919254658385, + "grad_norm": 0.20597588086540602, + "learning_rate": 4.948216340621404e-06, + "loss": 0.2667, + "step": 880 + }, + { + "epoch": 2.7360248447204967, + "grad_norm": 0.196856493986424, + "learning_rate": 4.890678941311852e-06, + "loss": 0.2513, + "step": 881 + }, + { + "epoch": 2.7391304347826084, + "grad_norm": 0.19482252545749987, + "learning_rate": 4.833141542002301e-06, + "loss": 0.2498, + "step": 882 + }, + { + "epoch": 2.7422360248447206, + "grad_norm": 0.19795769225255558, + "learning_rate": 4.775604142692751e-06, + "loss": 0.2499, + "step": 883 + }, + { + "epoch": 2.7453416149068324, + "grad_norm": 0.21689477537897567, + "learning_rate": 4.718066743383199e-06, + "loss": 0.284, + "step": 884 + }, + { + "epoch": 2.748447204968944, + "grad_norm": 0.2418368942479182, + "learning_rate": 4.660529344073648e-06, + "loss": 0.2766, + "step": 885 + }, + { + "epoch": 2.751552795031056, + "grad_norm": 0.20477977797718222, + "learning_rate": 4.602991944764097e-06, + "loss": 0.2401, + "step": 886 + }, + { + "epoch": 2.7546583850931676, + "grad_norm": 0.2006217058218365, + "learning_rate": 4.5454545454545455e-06, + "loss": 0.2471, + "step": 887 + }, + { + "epoch": 2.7577639751552794, + "grad_norm": 0.20256868584609686, + "learning_rate": 4.487917146144994e-06, + "loss": 0.2441, + "step": 888 + }, + { + "epoch": 2.7608695652173916, + "grad_norm": 0.21086545356411496, + "learning_rate": 4.430379746835443e-06, + "loss": 0.255, + "step": 889 + }, + { + "epoch": 2.7639751552795033, + "grad_norm": 0.19012940644030216, + "learning_rate": 4.372842347525892e-06, + "loss": 0.2524, + "step": 890 + }, + { + "epoch": 2.767080745341615, + "grad_norm": 0.20733606950697256, + "learning_rate": 4.3153049482163404e-06, + "loss": 0.2502, + "step": 891 + }, + { + "epoch": 2.770186335403727, + "grad_norm": 0.19869202500390978, + "learning_rate": 4.257767548906789e-06, + "loss": 0.2488, + "step": 892 + }, + { + "epoch": 2.7732919254658386, + "grad_norm": 0.21061575298666055, + "learning_rate": 4.200230149597238e-06, + "loss": 0.2535, + "step": 893 + }, + { + "epoch": 2.7763975155279503, + "grad_norm": 0.1902154270690934, + "learning_rate": 4.142692750287687e-06, + "loss": 0.2523, + "step": 894 + }, + { + "epoch": 2.779503105590062, + "grad_norm": 0.19515929958573747, + "learning_rate": 4.085155350978135e-06, + "loss": 0.2544, + "step": 895 + }, + { + "epoch": 2.782608695652174, + "grad_norm": 0.19252012143661815, + "learning_rate": 4.027617951668585e-06, + "loss": 0.2638, + "step": 896 + }, + { + "epoch": 2.7857142857142856, + "grad_norm": 0.1923327877416844, + "learning_rate": 3.970080552359034e-06, + "loss": 0.2462, + "step": 897 + }, + { + "epoch": 2.7888198757763973, + "grad_norm": 0.18586501981252762, + "learning_rate": 3.912543153049482e-06, + "loss": 0.2403, + "step": 898 + }, + { + "epoch": 2.791925465838509, + "grad_norm": 0.1986091973327919, + "learning_rate": 3.855005753739931e-06, + "loss": 0.2655, + "step": 899 + }, + { + "epoch": 2.795031055900621, + "grad_norm": 0.18536496942596287, + "learning_rate": 3.7974683544303802e-06, + "loss": 0.2389, + "step": 900 + }, + { + "epoch": 
2.798136645962733, + "grad_norm": 0.19607218549803698, + "learning_rate": 3.739930955120829e-06, + "loss": 0.2542, + "step": 901 + }, + { + "epoch": 2.801242236024845, + "grad_norm": 0.19944282694872204, + "learning_rate": 3.6823935558112777e-06, + "loss": 0.2434, + "step": 902 + }, + { + "epoch": 2.8043478260869565, + "grad_norm": 0.1972448743409019, + "learning_rate": 3.6248561565017264e-06, + "loss": 0.247, + "step": 903 + }, + { + "epoch": 2.8074534161490683, + "grad_norm": 0.19361398823404677, + "learning_rate": 3.567318757192175e-06, + "loss": 0.2597, + "step": 904 + }, + { + "epoch": 2.81055900621118, + "grad_norm": 0.19293534537923737, + "learning_rate": 3.509781357882624e-06, + "loss": 0.2679, + "step": 905 + }, + { + "epoch": 2.813664596273292, + "grad_norm": 0.1978927145961964, + "learning_rate": 3.4522439585730726e-06, + "loss": 0.2474, + "step": 906 + }, + { + "epoch": 2.816770186335404, + "grad_norm": 0.18672700788585406, + "learning_rate": 3.3947065592635213e-06, + "loss": 0.2468, + "step": 907 + }, + { + "epoch": 2.8198757763975157, + "grad_norm": 0.1856966835076563, + "learning_rate": 3.33716915995397e-06, + "loss": 0.2424, + "step": 908 + }, + { + "epoch": 2.8229813664596275, + "grad_norm": 0.19224475733121915, + "learning_rate": 3.279631760644419e-06, + "loss": 0.2477, + "step": 909 + }, + { + "epoch": 2.8260869565217392, + "grad_norm": 0.20470161040078505, + "learning_rate": 3.222094361334868e-06, + "loss": 0.2706, + "step": 910 + }, + { + "epoch": 2.829192546583851, + "grad_norm": 0.19429220598035837, + "learning_rate": 3.1645569620253167e-06, + "loss": 0.2477, + "step": 911 + }, + { + "epoch": 2.8322981366459627, + "grad_norm": 0.1894109295691752, + "learning_rate": 3.1070195627157654e-06, + "loss": 0.2528, + "step": 912 + }, + { + "epoch": 2.8354037267080745, + "grad_norm": 0.18097305550473375, + "learning_rate": 3.049482163406214e-06, + "loss": 0.2559, + "step": 913 + }, + { + "epoch": 2.8385093167701863, + "grad_norm": 0.19783393465985816, + "learning_rate": 2.991944764096663e-06, + "loss": 0.2594, + "step": 914 + }, + { + "epoch": 2.841614906832298, + "grad_norm": 0.20897012225810746, + "learning_rate": 2.9344073647871116e-06, + "loss": 0.2653, + "step": 915 + }, + { + "epoch": 2.8447204968944098, + "grad_norm": 0.1896928698309342, + "learning_rate": 2.8768699654775603e-06, + "loss": 0.2477, + "step": 916 + }, + { + "epoch": 2.8478260869565215, + "grad_norm": 0.1784496457827597, + "learning_rate": 2.8193325661680094e-06, + "loss": 0.244, + "step": 917 + }, + { + "epoch": 2.8509316770186337, + "grad_norm": 0.18587478842335634, + "learning_rate": 2.761795166858458e-06, + "loss": 0.25, + "step": 918 + }, + { + "epoch": 2.8540372670807455, + "grad_norm": 0.20084898371613977, + "learning_rate": 2.704257767548907e-06, + "loss": 0.2577, + "step": 919 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.17885954860642703, + "learning_rate": 2.6467203682393556e-06, + "loss": 0.2407, + "step": 920 + }, + { + "epoch": 2.860248447204969, + "grad_norm": 0.18561208551570504, + "learning_rate": 2.5891829689298043e-06, + "loss": 0.2659, + "step": 921 + }, + { + "epoch": 2.8633540372670807, + "grad_norm": 0.21932109217206247, + "learning_rate": 2.531645569620253e-06, + "loss": 0.2403, + "step": 922 + }, + { + "epoch": 2.8664596273291925, + "grad_norm": 0.20030388225206663, + "learning_rate": 2.474108170310702e-06, + "loss": 0.2544, + "step": 923 + }, + { + "epoch": 2.869565217391304, + "grad_norm": 0.20878429975095714, + "learning_rate": 2.4165707710011505e-06, + "loss": 
0.2497, + "step": 924 + }, + { + "epoch": 2.8726708074534164, + "grad_norm": 0.22186219053087963, + "learning_rate": 2.3590333716915997e-06, + "loss": 0.2672, + "step": 925 + }, + { + "epoch": 2.875776397515528, + "grad_norm": 0.18672043459559956, + "learning_rate": 2.3014959723820484e-06, + "loss": 0.2485, + "step": 926 + }, + { + "epoch": 2.87888198757764, + "grad_norm": 0.18051985217560826, + "learning_rate": 2.243958573072497e-06, + "loss": 0.2479, + "step": 927 + }, + { + "epoch": 2.8819875776397517, + "grad_norm": 0.20846631011511568, + "learning_rate": 2.186421173762946e-06, + "loss": 0.2632, + "step": 928 + }, + { + "epoch": 2.8850931677018634, + "grad_norm": 0.17696337319445454, + "learning_rate": 2.1288837744533946e-06, + "loss": 0.2288, + "step": 929 + }, + { + "epoch": 2.888198757763975, + "grad_norm": 0.18293271617504872, + "learning_rate": 2.0713463751438433e-06, + "loss": 0.2495, + "step": 930 + }, + { + "epoch": 2.891304347826087, + "grad_norm": 0.17853543153947618, + "learning_rate": 2.0138089758342925e-06, + "loss": 0.2468, + "step": 931 + }, + { + "epoch": 2.8944099378881987, + "grad_norm": 0.18420177129422804, + "learning_rate": 1.956271576524741e-06, + "loss": 0.2538, + "step": 932 + }, + { + "epoch": 2.8975155279503104, + "grad_norm": 0.18170230261287915, + "learning_rate": 1.8987341772151901e-06, + "loss": 0.2589, + "step": 933 + }, + { + "epoch": 2.900621118012422, + "grad_norm": 0.18685594957937918, + "learning_rate": 1.8411967779056388e-06, + "loss": 0.2442, + "step": 934 + }, + { + "epoch": 2.903726708074534, + "grad_norm": 0.18690296703530773, + "learning_rate": 1.7836593785960876e-06, + "loss": 0.2451, + "step": 935 + }, + { + "epoch": 2.906832298136646, + "grad_norm": 0.20799939121665842, + "learning_rate": 1.7261219792865363e-06, + "loss": 0.2649, + "step": 936 + }, + { + "epoch": 2.909937888198758, + "grad_norm": 0.18563245527227562, + "learning_rate": 1.668584579976985e-06, + "loss": 0.2473, + "step": 937 + }, + { + "epoch": 2.9130434782608696, + "grad_norm": 0.18286501772853814, + "learning_rate": 1.611047180667434e-06, + "loss": 0.2486, + "step": 938 + }, + { + "epoch": 2.9161490683229814, + "grad_norm": 0.17534488578059473, + "learning_rate": 1.5535097813578827e-06, + "loss": 0.259, + "step": 939 + }, + { + "epoch": 2.919254658385093, + "grad_norm": 0.19817242903037158, + "learning_rate": 1.4959723820483314e-06, + "loss": 0.2428, + "step": 940 + }, + { + "epoch": 2.922360248447205, + "grad_norm": 0.18335244034678858, + "learning_rate": 1.4384349827387801e-06, + "loss": 0.252, + "step": 941 + }, + { + "epoch": 2.9254658385093166, + "grad_norm": 0.17672927011117798, + "learning_rate": 1.380897583429229e-06, + "loss": 0.239, + "step": 942 + }, + { + "epoch": 2.928571428571429, + "grad_norm": 0.19756419475586987, + "learning_rate": 1.3233601841196778e-06, + "loss": 0.2481, + "step": 943 + }, + { + "epoch": 2.9316770186335406, + "grad_norm": 0.18227787024732953, + "learning_rate": 1.2658227848101265e-06, + "loss": 0.2503, + "step": 944 + }, + { + "epoch": 2.9347826086956523, + "grad_norm": 0.17546530423346965, + "learning_rate": 1.2082853855005753e-06, + "loss": 0.2435, + "step": 945 + }, + { + "epoch": 2.937888198757764, + "grad_norm": 0.17977719939100784, + "learning_rate": 1.1507479861910242e-06, + "loss": 0.2462, + "step": 946 + }, + { + "epoch": 2.940993788819876, + "grad_norm": 0.182411120857819, + "learning_rate": 1.093210586881473e-06, + "loss": 0.2508, + "step": 947 + }, + { + "epoch": 2.9440993788819876, + "grad_norm": 0.19191429782536917, + 
"learning_rate": 1.0356731875719217e-06, + "loss": 0.2419, + "step": 948 + }, + { + "epoch": 2.9472049689440993, + "grad_norm": 0.17465750941257832, + "learning_rate": 9.781357882623706e-07, + "loss": 0.2394, + "step": 949 + }, + { + "epoch": 2.950310559006211, + "grad_norm": 0.17956692661649218, + "learning_rate": 9.205983889528194e-07, + "loss": 0.2478, + "step": 950 + }, + { + "epoch": 2.953416149068323, + "grad_norm": 0.18930304144220808, + "learning_rate": 8.630609896432681e-07, + "loss": 0.2371, + "step": 951 + }, + { + "epoch": 2.9565217391304346, + "grad_norm": 0.19245986180447752, + "learning_rate": 8.05523590333717e-07, + "loss": 0.2324, + "step": 952 + }, + { + "epoch": 2.9596273291925463, + "grad_norm": 0.18874027199979995, + "learning_rate": 7.479861910241657e-07, + "loss": 0.2482, + "step": 953 + }, + { + "epoch": 2.9627329192546585, + "grad_norm": 0.18606185656754726, + "learning_rate": 6.904487917146145e-07, + "loss": 0.2602, + "step": 954 + }, + { + "epoch": 2.9658385093167703, + "grad_norm": 0.18622516429740096, + "learning_rate": 6.329113924050633e-07, + "loss": 0.2514, + "step": 955 + }, + { + "epoch": 2.968944099378882, + "grad_norm": 0.1910726758431884, + "learning_rate": 5.753739930955121e-07, + "loss": 0.2526, + "step": 956 + }, + { + "epoch": 2.972049689440994, + "grad_norm": 0.18715451349236073, + "learning_rate": 5.178365937859608e-07, + "loss": 0.2512, + "step": 957 + }, + { + "epoch": 2.9751552795031055, + "grad_norm": 0.18279876224123887, + "learning_rate": 4.602991944764097e-07, + "loss": 0.2624, + "step": 958 + }, + { + "epoch": 2.9782608695652173, + "grad_norm": 0.18112311672532813, + "learning_rate": 4.027617951668585e-07, + "loss": 0.261, + "step": 959 + }, + { + "epoch": 2.981366459627329, + "grad_norm": 0.17566965768374485, + "learning_rate": 3.4522439585730727e-07, + "loss": 0.2437, + "step": 960 + }, + { + "epoch": 2.9844720496894412, + "grad_norm": 0.19721806486553123, + "learning_rate": 2.8768699654775605e-07, + "loss": 0.2594, + "step": 961 + }, + { + "epoch": 2.987577639751553, + "grad_norm": 0.19840508059977566, + "learning_rate": 2.3014959723820486e-07, + "loss": 0.2415, + "step": 962 + }, + { + "epoch": 2.9906832298136647, + "grad_norm": 0.20273843178894588, + "learning_rate": 1.7261219792865363e-07, + "loss": 0.2523, + "step": 963 + }, + { + "epoch": 2.9937888198757765, + "grad_norm": 0.19347512173901257, + "learning_rate": 1.1507479861910243e-07, + "loss": 0.256, + "step": 964 + }, + { + "epoch": 2.9968944099378882, + "grad_norm": 0.17382060962656506, + "learning_rate": 5.7537399309551214e-08, + "loss": 0.2351, + "step": 965 + }, + { + "epoch": 3.0, + "grad_norm": 0.17740605884855634, + "learning_rate": 0.0, + "loss": 0.2395, + "step": 966 + }, + { + "epoch": 3.0, + "step": 966, + "total_flos": 8.211023406049526e+17, + "train_loss": 0.4564525331862225, + "train_runtime": 83597.7389, + "train_samples_per_second": 0.185, + "train_steps_per_second": 0.012 + } + ], + "logging_steps": 1, + "max_steps": 966, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8.211023406049526e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}