{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996652159357214, "eval_steps": 150, "global_step": 1493, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006695681285570807, "grad_norm": 14.5649765167057, "learning_rate": 2.2222222222222224e-07, "loss": 1.3945, "step": 1 }, { "epoch": 0.0013391362571141614, "grad_norm": 13.261450634928215, "learning_rate": 4.444444444444445e-07, "loss": 1.455, "step": 2 }, { "epoch": 0.002008704385671242, "grad_norm": 13.204852784872177, "learning_rate": 6.666666666666667e-07, "loss": 1.4171, "step": 3 }, { "epoch": 0.002678272514228323, "grad_norm": 17.815832617044176, "learning_rate": 8.88888888888889e-07, "loss": 1.6959, "step": 4 }, { "epoch": 0.0033478406427854034, "grad_norm": 14.085686179340161, "learning_rate": 1.111111111111111e-06, "loss": 1.4322, "step": 5 }, { "epoch": 0.004017408771342484, "grad_norm": 17.172957721945867, "learning_rate": 1.3333333333333334e-06, "loss": 1.5052, "step": 6 }, { "epoch": 0.004686976899899564, "grad_norm": 14.52736382033092, "learning_rate": 1.5555555555555558e-06, "loss": 1.3753, "step": 7 }, { "epoch": 0.005356545028456646, "grad_norm": 13.175886659984371, "learning_rate": 1.777777777777778e-06, "loss": 1.3446, "step": 8 }, { "epoch": 0.006026113157013726, "grad_norm": 11.773936935367981, "learning_rate": 2.0000000000000003e-06, "loss": 1.2746, "step": 9 }, { "epoch": 0.006695681285570807, "grad_norm": 14.740080357143695, "learning_rate": 2.222222222222222e-06, "loss": 1.1677, "step": 10 }, { "epoch": 0.007365249414127887, "grad_norm": 20.830763001898973, "learning_rate": 2.4444444444444447e-06, "loss": 1.1697, "step": 11 }, { "epoch": 0.008034817542684968, "grad_norm": 12.747395882064952, "learning_rate": 2.666666666666667e-06, "loss": 1.0143, "step": 12 }, { "epoch": 0.008704385671242048, "grad_norm": 16.08549610382324, "learning_rate": 2.888888888888889e-06, "loss": 1.027, "step": 13 }, { "epoch": 0.009373953799799129, "grad_norm": 11.198954779506012, "learning_rate": 3.1111111111111116e-06, "loss": 0.9601, "step": 14 }, { "epoch": 0.010043521928356211, "grad_norm": 7.609653960045835, "learning_rate": 3.3333333333333333e-06, "loss": 0.9373, "step": 15 }, { "epoch": 0.010713090056913292, "grad_norm": 6.035810970590906, "learning_rate": 3.555555555555556e-06, "loss": 0.9306, "step": 16 }, { "epoch": 0.011382658185470372, "grad_norm": 7.270385181148614, "learning_rate": 3.777777777777778e-06, "loss": 0.9484, "step": 17 }, { "epoch": 0.012052226314027453, "grad_norm": 5.5359279195301605, "learning_rate": 4.000000000000001e-06, "loss": 0.8648, "step": 18 }, { "epoch": 0.012721794442584533, "grad_norm": 3.608985716781402, "learning_rate": 4.222222222222223e-06, "loss": 0.8609, "step": 19 }, { "epoch": 0.013391362571141614, "grad_norm": 5.591686101844878, "learning_rate": 4.444444444444444e-06, "loss": 0.9982, "step": 20 }, { "epoch": 0.014060930699698694, "grad_norm": 4.463836585746648, "learning_rate": 4.666666666666667e-06, "loss": 0.8473, "step": 21 }, { "epoch": 0.014730498828255775, "grad_norm": 3.5923227867897998, "learning_rate": 4.888888888888889e-06, "loss": 0.8281, "step": 22 }, { "epoch": 0.015400066956812855, "grad_norm": 3.9734424151527374, "learning_rate": 5.1111111111111115e-06, "loss": 0.8447, "step": 23 }, { "epoch": 0.016069635085369936, "grad_norm": 5.374224459260849, "learning_rate": 5.333333333333334e-06, "loss": 0.8413, "step": 24 }, { "epoch": 0.016739203213927016, "grad_norm": 3.986613697622307, "learning_rate": 5.555555555555557e-06, "loss": 0.7884, "step": 25 }, { "epoch": 0.017408771342484097, "grad_norm": 3.422498538904769, "learning_rate": 5.777777777777778e-06, "loss": 0.7978, "step": 26 }, { "epoch": 0.018078339471041177, "grad_norm": 3.6879137060223175, "learning_rate": 6e-06, "loss": 0.795, "step": 27 }, { "epoch": 0.018747907599598258, "grad_norm": 4.1123534603085385, "learning_rate": 6.222222222222223e-06, "loss": 0.8539, "step": 28 }, { "epoch": 0.019417475728155338, "grad_norm": 3.6636264158253216, "learning_rate": 6.444444444444445e-06, "loss": 0.8192, "step": 29 }, { "epoch": 0.020087043856712422, "grad_norm": 2.8569566741778902, "learning_rate": 6.666666666666667e-06, "loss": 0.7703, "step": 30 }, { "epoch": 0.020756611985269503, "grad_norm": 3.5182581144272804, "learning_rate": 6.88888888888889e-06, "loss": 0.8068, "step": 31 }, { "epoch": 0.021426180113826583, "grad_norm": 3.405440730846777, "learning_rate": 7.111111111111112e-06, "loss": 0.7961, "step": 32 }, { "epoch": 0.022095748242383664, "grad_norm": 3.5431261231147353, "learning_rate": 7.333333333333333e-06, "loss": 0.8075, "step": 33 }, { "epoch": 0.022765316370940744, "grad_norm": 2.499479002092754, "learning_rate": 7.555555555555556e-06, "loss": 0.7998, "step": 34 }, { "epoch": 0.023434884499497825, "grad_norm": 3.1390289919081167, "learning_rate": 7.77777777777778e-06, "loss": 0.7866, "step": 35 }, { "epoch": 0.024104452628054905, "grad_norm": 2.878084925139369, "learning_rate": 8.000000000000001e-06, "loss": 0.7747, "step": 36 }, { "epoch": 0.024774020756611986, "grad_norm": 2.8155245850641037, "learning_rate": 8.222222222222222e-06, "loss": 0.8564, "step": 37 }, { "epoch": 0.025443588885169066, "grad_norm": 2.867941624731225, "learning_rate": 8.444444444444446e-06, "loss": 0.7783, "step": 38 }, { "epoch": 0.026113157013726147, "grad_norm": 4.199007884223074, "learning_rate": 8.666666666666668e-06, "loss": 0.7701, "step": 39 }, { "epoch": 0.026782725142283227, "grad_norm": 2.691392483064756, "learning_rate": 8.888888888888888e-06, "loss": 0.7637, "step": 40 }, { "epoch": 0.027452293270840308, "grad_norm": 2.9146030993605465, "learning_rate": 9.111111111111112e-06, "loss": 0.8667, "step": 41 }, { "epoch": 0.028121861399397388, "grad_norm": 3.0961549324052644, "learning_rate": 9.333333333333334e-06, "loss": 0.8794, "step": 42 }, { "epoch": 0.02879142952795447, "grad_norm": 3.245661778633877, "learning_rate": 9.555555555555556e-06, "loss": 0.7316, "step": 43 }, { "epoch": 0.02946099765651155, "grad_norm": 2.439441850932735, "learning_rate": 9.777777777777779e-06, "loss": 0.7593, "step": 44 }, { "epoch": 0.03013056578506863, "grad_norm": 2.9546751060760212, "learning_rate": 1e-05, "loss": 0.7089, "step": 45 }, { "epoch": 0.03080013391362571, "grad_norm": 3.5342232430476836, "learning_rate": 9.999988232005414e-06, "loss": 0.8428, "step": 46 }, { "epoch": 0.031469702042182794, "grad_norm": 3.1826256274136617, "learning_rate": 9.999952928077044e-06, "loss": 0.7896, "step": 47 }, { "epoch": 0.03213927017073987, "grad_norm": 3.8045960558046037, "learning_rate": 9.999894088381077e-06, "loss": 0.8148, "step": 48 }, { "epoch": 0.032808838299296955, "grad_norm": 2.4933476486265618, "learning_rate": 9.999811713194481e-06, "loss": 0.8089, "step": 49 }, { "epoch": 0.03347840642785403, "grad_norm": 2.500154163189864, "learning_rate": 9.999705802905015e-06, "loss": 0.7788, "step": 50 }, { "epoch": 0.034147974556411116, "grad_norm": 2.6629611132436333, "learning_rate": 9.999576358011216e-06, "loss": 0.8544, "step": 51 }, { "epoch": 0.03481754268496819, "grad_norm": 2.9604021693800497, "learning_rate": 9.999423379122407e-06, "loss": 0.8016, "step": 52 }, { "epoch": 0.03548711081352528, "grad_norm": 3.021938616054198, "learning_rate": 9.999246866958693e-06, "loss": 0.8247, "step": 53 }, { "epoch": 0.036156678942082354, "grad_norm": 2.7884090318150174, "learning_rate": 9.999046822350949e-06, "loss": 0.7423, "step": 54 }, { "epoch": 0.03682624707063944, "grad_norm": 2.6398701437285026, "learning_rate": 9.998823246240826e-06, "loss": 0.7066, "step": 55 }, { "epoch": 0.037495815199196515, "grad_norm": 2.714006251360747, "learning_rate": 9.99857613968074e-06, "loss": 0.7912, "step": 56 }, { "epoch": 0.0381653833277536, "grad_norm": 2.575938091623431, "learning_rate": 9.998305503833872e-06, "loss": 0.8357, "step": 57 }, { "epoch": 0.038834951456310676, "grad_norm": 2.9849826707328946, "learning_rate": 9.998011339974156e-06, "loss": 0.7377, "step": 58 }, { "epoch": 0.03950451958486776, "grad_norm": 2.724375443797874, "learning_rate": 9.99769364948628e-06, "loss": 0.8345, "step": 59 }, { "epoch": 0.040174087713424844, "grad_norm": 2.617961270991231, "learning_rate": 9.997352433865679e-06, "loss": 0.7977, "step": 60 }, { "epoch": 0.04084365584198192, "grad_norm": 2.4391250036903185, "learning_rate": 9.99698769471852e-06, "loss": 0.8058, "step": 61 }, { "epoch": 0.041513223970539005, "grad_norm": 3.246843478972931, "learning_rate": 9.996599433761702e-06, "loss": 0.8147, "step": 62 }, { "epoch": 0.04218279209909608, "grad_norm": 3.022437330428722, "learning_rate": 9.996187652822847e-06, "loss": 0.7765, "step": 63 }, { "epoch": 0.042852360227653166, "grad_norm": 3.4802177782339854, "learning_rate": 9.995752353840288e-06, "loss": 0.7563, "step": 64 }, { "epoch": 0.04352192835621024, "grad_norm": 2.2778226150553955, "learning_rate": 9.995293538863064e-06, "loss": 0.727, "step": 65 }, { "epoch": 0.04419149648476733, "grad_norm": 3.259400478130358, "learning_rate": 9.994811210050911e-06, "loss": 0.8572, "step": 66 }, { "epoch": 0.044861064613324404, "grad_norm": 2.9453160303496944, "learning_rate": 9.994305369674242e-06, "loss": 0.8114, "step": 67 }, { "epoch": 0.04553063274188149, "grad_norm": 3.0284878436597498, "learning_rate": 9.99377602011415e-06, "loss": 0.7956, "step": 68 }, { "epoch": 0.046200200870438565, "grad_norm": 2.5227507263528555, "learning_rate": 9.993223163862385e-06, "loss": 0.7714, "step": 69 }, { "epoch": 0.04686976899899565, "grad_norm": 2.287043781704777, "learning_rate": 9.992646803521355e-06, "loss": 0.7461, "step": 70 }, { "epoch": 0.047539337127552726, "grad_norm": 5.186286680527777, "learning_rate": 9.9920469418041e-06, "loss": 0.7959, "step": 71 }, { "epoch": 0.04820890525610981, "grad_norm": 2.6206474236176733, "learning_rate": 9.991423581534287e-06, "loss": 0.8142, "step": 72 }, { "epoch": 0.04887847338466689, "grad_norm": 2.593516503667619, "learning_rate": 9.9907767256462e-06, "loss": 0.8244, "step": 73 }, { "epoch": 0.04954804151322397, "grad_norm": 2.883807791180433, "learning_rate": 9.990106377184712e-06, "loss": 0.7664, "step": 74 }, { "epoch": 0.05021760964178105, "grad_norm": 3.227079718045557, "learning_rate": 9.989412539305289e-06, "loss": 0.8535, "step": 75 }, { "epoch": 0.05088717777033813, "grad_norm": 2.9272918842603537, "learning_rate": 9.988695215273962e-06, "loss": 0.7691, "step": 76 }, { "epoch": 0.05155674589889521, "grad_norm": 4.971017848055559, "learning_rate": 9.98795440846732e-06, "loss": 0.7748, "step": 77 }, { "epoch": 0.05222631402745229, "grad_norm": 2.862480224503884, "learning_rate": 9.987190122372484e-06, "loss": 0.7976, "step": 78 }, { "epoch": 0.05289588215600938, "grad_norm": 2.113825202758556, "learning_rate": 9.9864023605871e-06, "loss": 0.7545, "step": 79 }, { "epoch": 0.053565450284566454, "grad_norm": 2.447979888590341, "learning_rate": 9.985591126819321e-06, "loss": 0.8305, "step": 80 }, { "epoch": 0.05423501841312354, "grad_norm": 2.041841968990215, "learning_rate": 9.984756424887782e-06, "loss": 0.8048, "step": 81 }, { "epoch": 0.054904586541680615, "grad_norm": 2.527823443599988, "learning_rate": 9.983898258721591e-06, "loss": 0.7228, "step": 82 }, { "epoch": 0.0555741546702377, "grad_norm": 2.369745347022294, "learning_rate": 9.983016632360308e-06, "loss": 0.7946, "step": 83 }, { "epoch": 0.056243722798794776, "grad_norm": 2.820189086170329, "learning_rate": 9.982111549953921e-06, "loss": 0.8522, "step": 84 }, { "epoch": 0.05691329092735186, "grad_norm": 2.556483079228777, "learning_rate": 9.981183015762831e-06, "loss": 0.754, "step": 85 }, { "epoch": 0.05758285905590894, "grad_norm": 2.5351332283610404, "learning_rate": 9.980231034157835e-06, "loss": 0.7684, "step": 86 }, { "epoch": 0.05825242718446602, "grad_norm": 3.0895367001005996, "learning_rate": 9.979255609620095e-06, "loss": 0.7835, "step": 87 }, { "epoch": 0.0589219953130231, "grad_norm": 2.2242034482614215, "learning_rate": 9.978256746741128e-06, "loss": 0.7189, "step": 88 }, { "epoch": 0.05959156344158018, "grad_norm": 2.3947522477531735, "learning_rate": 9.977234450222783e-06, "loss": 0.7509, "step": 89 }, { "epoch": 0.06026113157013726, "grad_norm": 3.8704680197923857, "learning_rate": 9.976188724877208e-06, "loss": 0.7613, "step": 90 }, { "epoch": 0.06093069969869434, "grad_norm": 3.236891268257037, "learning_rate": 9.97511957562684e-06, "loss": 0.8814, "step": 91 }, { "epoch": 0.06160026782725142, "grad_norm": 2.543226425072989, "learning_rate": 9.974027007504378e-06, "loss": 0.7927, "step": 92 }, { "epoch": 0.062269835955808504, "grad_norm": 2.3294085601089356, "learning_rate": 9.972911025652754e-06, "loss": 0.6765, "step": 93 }, { "epoch": 0.06293940408436559, "grad_norm": 3.4603606378980536, "learning_rate": 9.971771635325116e-06, "loss": 0.7996, "step": 94 }, { "epoch": 0.06360897221292267, "grad_norm": 2.213958278612262, "learning_rate": 9.970608841884799e-06, "loss": 0.7688, "step": 95 }, { "epoch": 0.06427854034147974, "grad_norm": 2.428905400178881, "learning_rate": 9.969422650805303e-06, "loss": 0.8201, "step": 96 }, { "epoch": 0.06494810847003682, "grad_norm": 2.5453587064576326, "learning_rate": 9.968213067670265e-06, "loss": 0.8007, "step": 97 }, { "epoch": 0.06561767659859391, "grad_norm": 2.1279609558867096, "learning_rate": 9.96698009817343e-06, "loss": 0.7592, "step": 98 }, { "epoch": 0.06628724472715099, "grad_norm": 2.5780750038161346, "learning_rate": 9.965723748118628e-06, "loss": 0.7863, "step": 99 }, { "epoch": 0.06695681285570806, "grad_norm": 3.3351967505735804, "learning_rate": 9.96444402341975e-06, "loss": 0.7859, "step": 100 }, { "epoch": 0.06762638098426516, "grad_norm": 2.2728239310727103, "learning_rate": 9.963140930100713e-06, "loss": 0.8993, "step": 101 }, { "epoch": 0.06829594911282223, "grad_norm": 2.870290130616898, "learning_rate": 9.961814474295436e-06, "loss": 0.7701, "step": 102 }, { "epoch": 0.06896551724137931, "grad_norm": 1.9318048821606673, "learning_rate": 9.960464662247808e-06, "loss": 0.7628, "step": 103 }, { "epoch": 0.06963508536993639, "grad_norm": 2.5520646043391317, "learning_rate": 9.95909150031166e-06, "loss": 0.7776, "step": 104 }, { "epoch": 0.07030465349849348, "grad_norm": 2.612737753565618, "learning_rate": 9.957694994950738e-06, "loss": 0.8002, "step": 105 }, { "epoch": 0.07097422162705055, "grad_norm": 2.5272765605626892, "learning_rate": 9.956275152738668e-06, "loss": 0.8027, "step": 106 }, { "epoch": 0.07164378975560763, "grad_norm": 2.5463233633522973, "learning_rate": 9.954831980358928e-06, "loss": 0.752, "step": 107 }, { "epoch": 0.07231335788416471, "grad_norm": 3.4239137997793234, "learning_rate": 9.95336548460482e-06, "loss": 0.7908, "step": 108 }, { "epoch": 0.0729829260127218, "grad_norm": 2.4451054857675185, "learning_rate": 9.951875672379424e-06, "loss": 0.7519, "step": 109 }, { "epoch": 0.07365249414127888, "grad_norm": 2.671818715992607, "learning_rate": 9.950362550695586e-06, "loss": 0.7218, "step": 110 }, { "epoch": 0.07432206226983595, "grad_norm": 3.3396868232577015, "learning_rate": 9.948826126675864e-06, "loss": 0.7731, "step": 111 }, { "epoch": 0.07499163039839303, "grad_norm": 2.603761890013899, "learning_rate": 9.947266407552514e-06, "loss": 0.7856, "step": 112 }, { "epoch": 0.07566119852695012, "grad_norm": 2.322839930409185, "learning_rate": 9.94568340066744e-06, "loss": 0.755, "step": 113 }, { "epoch": 0.0763307666555072, "grad_norm": 3.2083340305067582, "learning_rate": 9.94407711347217e-06, "loss": 0.7986, "step": 114 }, { "epoch": 0.07700033478406428, "grad_norm": 2.498002972116349, "learning_rate": 9.942447553527815e-06, "loss": 0.7948, "step": 115 }, { "epoch": 0.07766990291262135, "grad_norm": 3.4498054215744047, "learning_rate": 9.940794728505035e-06, "loss": 0.7293, "step": 116 }, { "epoch": 0.07833947104117844, "grad_norm": 2.1357986796173813, "learning_rate": 9.939118646184007e-06, "loss": 0.76, "step": 117 }, { "epoch": 0.07900903916973552, "grad_norm": 2.5343800368603713, "learning_rate": 9.93741931445438e-06, "loss": 0.7635, "step": 118 }, { "epoch": 0.0796786072982926, "grad_norm": 3.19367458065369, "learning_rate": 9.935696741315244e-06, "loss": 0.718, "step": 119 }, { "epoch": 0.08034817542684969, "grad_norm": 2.8007612395033052, "learning_rate": 9.933950934875094e-06, "loss": 0.7923, "step": 120 }, { "epoch": 0.08101774355540677, "grad_norm": 2.245965623649904, "learning_rate": 9.932181903351786e-06, "loss": 0.7488, "step": 121 }, { "epoch": 0.08168731168396384, "grad_norm": 2.624882041752979, "learning_rate": 9.930389655072498e-06, "loss": 0.7471, "step": 122 }, { "epoch": 0.08235687981252092, "grad_norm": 3.0626595692888485, "learning_rate": 9.928574198473701e-06, "loss": 0.8558, "step": 123 }, { "epoch": 0.08302644794107801, "grad_norm": 2.5796140099673592, "learning_rate": 9.926735542101107e-06, "loss": 0.7858, "step": 124 }, { "epoch": 0.08369601606963509, "grad_norm": 3.394617723299024, "learning_rate": 9.924873694609636e-06, "loss": 0.7448, "step": 125 }, { "epoch": 0.08436558419819216, "grad_norm": 2.704056966435895, "learning_rate": 9.922988664763372e-06, "loss": 0.7897, "step": 126 }, { "epoch": 0.08503515232674924, "grad_norm": 2.7398854360497724, "learning_rate": 9.921080461435522e-06, "loss": 0.7302, "step": 127 }, { "epoch": 0.08570472045530633, "grad_norm": 3.098286874179302, "learning_rate": 9.919149093608376e-06, "loss": 0.7682, "step": 128 }, { "epoch": 0.08637428858386341, "grad_norm": 2.433579191308278, "learning_rate": 9.91719457037327e-06, "loss": 0.753, "step": 129 }, { "epoch": 0.08704385671242049, "grad_norm": 3.5954066805030074, "learning_rate": 9.915216900930524e-06, "loss": 0.7603, "step": 130 }, { "epoch": 0.08771342484097756, "grad_norm": 3.1613237126964, "learning_rate": 9.913216094589427e-06, "loss": 0.8162, "step": 131 }, { "epoch": 0.08838299296953465, "grad_norm": 2.8101921007921042, "learning_rate": 9.911192160768162e-06, "loss": 0.7754, "step": 132 }, { "epoch": 0.08905256109809173, "grad_norm": 2.665000879197837, "learning_rate": 9.909145108993794e-06, "loss": 0.7565, "step": 133 }, { "epoch": 0.08972212922664881, "grad_norm": 3.0484809852682497, "learning_rate": 9.907074948902196e-06, "loss": 0.7806, "step": 134 }, { "epoch": 0.09039169735520589, "grad_norm": 3.047086423260451, "learning_rate": 9.90498169023802e-06, "loss": 0.7108, "step": 135 }, { "epoch": 0.09106126548376298, "grad_norm": 4.643105206780825, "learning_rate": 9.902865342854653e-06, "loss": 0.8337, "step": 136 }, { "epoch": 0.09173083361232005, "grad_norm": 2.348875806688009, "learning_rate": 9.900725916714157e-06, "loss": 0.7175, "step": 137 }, { "epoch": 0.09240040174087713, "grad_norm": 2.506650059589167, "learning_rate": 9.898563421887235e-06, "loss": 0.7962, "step": 138 }, { "epoch": 0.09306996986943422, "grad_norm": 2.6458668595935313, "learning_rate": 9.89637786855318e-06, "loss": 0.7993, "step": 139 }, { "epoch": 0.0937395379979913, "grad_norm": 2.7419309639928104, "learning_rate": 9.894169266999823e-06, "loss": 0.8233, "step": 140 }, { "epoch": 0.09440910612654838, "grad_norm": 2.404432595862172, "learning_rate": 9.891937627623486e-06, "loss": 0.7725, "step": 141 }, { "epoch": 0.09507867425510545, "grad_norm": 2.4383955283368968, "learning_rate": 9.889682960928941e-06, "loss": 0.8504, "step": 142 }, { "epoch": 0.09574824238366254, "grad_norm": 2.444993756249216, "learning_rate": 9.887405277529347e-06, "loss": 0.7625, "step": 143 }, { "epoch": 0.09641781051221962, "grad_norm": 2.105353816392082, "learning_rate": 9.885104588146213e-06, "loss": 0.807, "step": 144 }, { "epoch": 0.0970873786407767, "grad_norm": 3.1069687400719133, "learning_rate": 9.882780903609336e-06, "loss": 0.7651, "step": 145 }, { "epoch": 0.09775694676933377, "grad_norm": 2.479137733772292, "learning_rate": 9.88043423485676e-06, "loss": 0.8316, "step": 146 }, { "epoch": 0.09842651489789087, "grad_norm": 2.317033267705112, "learning_rate": 9.878064592934723e-06, "loss": 0.7502, "step": 147 }, { "epoch": 0.09909608302644794, "grad_norm": 2.433755165545855, "learning_rate": 9.87567198899759e-06, "loss": 0.761, "step": 148 }, { "epoch": 0.09976565115500502, "grad_norm": 2.4161243294048, "learning_rate": 9.873256434307828e-06, "loss": 0.74, "step": 149 }, { "epoch": 0.1004352192835621, "grad_norm": 3.8692875093058228, "learning_rate": 9.87081794023593e-06, "loss": 0.8336, "step": 150 }, { "epoch": 0.1004352192835621, "eval_loss": 0.7690443992614746, "eval_runtime": 483.7834, "eval_samples_per_second": 41.57, "eval_steps_per_second": 0.651, "step": 150 }, { "epoch": 0.10110478741211919, "grad_norm": 2.534002210679788, "learning_rate": 9.868356518260366e-06, "loss": 0.7787, "step": 151 }, { "epoch": 0.10177435554067626, "grad_norm": 3.9251054686892686, "learning_rate": 9.865872179967542e-06, "loss": 0.771, "step": 152 }, { "epoch": 0.10244392366923334, "grad_norm": 2.2374089579926797, "learning_rate": 9.863364937051725e-06, "loss": 0.7645, "step": 153 }, { "epoch": 0.10311349179779042, "grad_norm": 2.2357439287818672, "learning_rate": 9.860834801315005e-06, "loss": 0.7997, "step": 154 }, { "epoch": 0.10378305992634751, "grad_norm": 2.4798760472220476, "learning_rate": 9.858281784667233e-06, "loss": 0.8144, "step": 155 }, { "epoch": 0.10445262805490459, "grad_norm": 3.088066391345331, "learning_rate": 9.85570589912596e-06, "loss": 0.7457, "step": 156 }, { "epoch": 0.10512219618346166, "grad_norm": 2.5092630088734276, "learning_rate": 9.853107156816393e-06, "loss": 0.7523, "step": 157 }, { "epoch": 0.10579176431201875, "grad_norm": 2.3846880556010777, "learning_rate": 9.850485569971322e-06, "loss": 0.7746, "step": 158 }, { "epoch": 0.10646133244057583, "grad_norm": 2.832518846065702, "learning_rate": 9.847841150931078e-06, "loss": 0.7702, "step": 159 }, { "epoch": 0.10713090056913291, "grad_norm": 2.1909459256263872, "learning_rate": 9.845173912143465e-06, "loss": 0.6905, "step": 160 }, { "epoch": 0.10780046869768999, "grad_norm": 3.2635267711756977, "learning_rate": 9.8424838661637e-06, "loss": 0.7408, "step": 161 }, { "epoch": 0.10847003682624708, "grad_norm": 2.347809543734899, "learning_rate": 9.839771025654364e-06, "loss": 0.8081, "step": 162 }, { "epoch": 0.10913960495480415, "grad_norm": 2.488939185507867, "learning_rate": 9.837035403385336e-06, "loss": 0.8417, "step": 163 }, { "epoch": 0.10980917308336123, "grad_norm": 3.181696142226475, "learning_rate": 9.834277012233726e-06, "loss": 0.7505, "step": 164 }, { "epoch": 0.11047874121191831, "grad_norm": 2.300027255640582, "learning_rate": 9.831495865183832e-06, "loss": 0.815, "step": 165 }, { "epoch": 0.1111483093404754, "grad_norm": 2.1582924148485416, "learning_rate": 9.828691975327061e-06, "loss": 0.7772, "step": 166 }, { "epoch": 0.11181787746903248, "grad_norm": 3.4240491839692413, "learning_rate": 9.825865355861878e-06, "loss": 0.7737, "step": 167 }, { "epoch": 0.11248744559758955, "grad_norm": 2.6282648525338326, "learning_rate": 9.823016020093738e-06, "loss": 0.7259, "step": 168 }, { "epoch": 0.11315701372614663, "grad_norm": 2.4790772491207083, "learning_rate": 9.820143981435031e-06, "loss": 0.7267, "step": 169 }, { "epoch": 0.11382658185470372, "grad_norm": 2.1156781816390766, "learning_rate": 9.81724925340501e-06, "loss": 0.7199, "step": 170 }, { "epoch": 0.1144961499832608, "grad_norm": 4.144433304882992, "learning_rate": 9.814331849629732e-06, "loss": 0.7957, "step": 171 }, { "epoch": 0.11516571811181787, "grad_norm": 3.863376125937714, "learning_rate": 9.811391783841995e-06, "loss": 0.8033, "step": 172 }, { "epoch": 0.11583528624037495, "grad_norm": 2.452105120488129, "learning_rate": 9.808429069881267e-06, "loss": 0.7428, "step": 173 }, { "epoch": 0.11650485436893204, "grad_norm": 3.400682423907456, "learning_rate": 9.805443721693633e-06, "loss": 0.7566, "step": 174 }, { "epoch": 0.11717442249748912, "grad_norm": 2.412787702693195, "learning_rate": 9.802435753331717e-06, "loss": 0.7869, "step": 175 }, { "epoch": 0.1178439906260462, "grad_norm": 2.445235314416365, "learning_rate": 9.799405178954618e-06, "loss": 0.766, "step": 176 }, { "epoch": 0.11851355875460329, "grad_norm": 2.244186715936024, "learning_rate": 9.79635201282785e-06, "loss": 0.7335, "step": 177 }, { "epoch": 0.11918312688316036, "grad_norm": 2.8498254206908005, "learning_rate": 9.793276269323273e-06, "loss": 0.8805, "step": 178 }, { "epoch": 0.11985269501171744, "grad_norm": 2.0010910566608153, "learning_rate": 9.790177962919018e-06, "loss": 0.797, "step": 179 }, { "epoch": 0.12052226314027452, "grad_norm": 2.3223271678825457, "learning_rate": 9.787057108199425e-06, "loss": 0.7762, "step": 180 }, { "epoch": 0.12119183126883161, "grad_norm": 2.091856470663959, "learning_rate": 9.783913719854977e-06, "loss": 0.7609, "step": 181 }, { "epoch": 0.12186139939738869, "grad_norm": 2.911256219535097, "learning_rate": 9.780747812682224e-06, "loss": 0.8026, "step": 182 }, { "epoch": 0.12253096752594576, "grad_norm": 2.938392919003108, "learning_rate": 9.777559401583716e-06, "loss": 0.7415, "step": 183 }, { "epoch": 0.12320053565450284, "grad_norm": 2.2698224381890095, "learning_rate": 9.774348501567938e-06, "loss": 0.7747, "step": 184 }, { "epoch": 0.12387010378305993, "grad_norm": 2.6513476660577924, "learning_rate": 9.771115127749228e-06, "loss": 0.7903, "step": 185 }, { "epoch": 0.12453967191161701, "grad_norm": 2.687090734325012, "learning_rate": 9.767859295347717e-06, "loss": 0.7712, "step": 186 }, { "epoch": 0.12520924004017409, "grad_norm": 2.1585161172227667, "learning_rate": 9.764581019689255e-06, "loss": 0.7739, "step": 187 }, { "epoch": 0.12587880816873118, "grad_norm": 2.6778278022265343, "learning_rate": 9.76128031620533e-06, "loss": 0.8614, "step": 188 }, { "epoch": 0.12654837629728824, "grad_norm": 2.661252662924025, "learning_rate": 9.757957200433011e-06, "loss": 0.8445, "step": 189 }, { "epoch": 0.12721794442584533, "grad_norm": 2.2900109141971905, "learning_rate": 9.754611688014856e-06, "loss": 0.6813, "step": 190 }, { "epoch": 0.12788751255440242, "grad_norm": 3.2672656957407193, "learning_rate": 9.751243794698859e-06, "loss": 0.8404, "step": 191 }, { "epoch": 0.12855708068295948, "grad_norm": 2.4539578518963676, "learning_rate": 9.747853536338357e-06, "loss": 0.7707, "step": 192 }, { "epoch": 0.12922664881151658, "grad_norm": 2.3991157340071125, "learning_rate": 9.744440928891967e-06, "loss": 0.6981, "step": 193 }, { "epoch": 0.12989621694007364, "grad_norm": 2.5185588125931746, "learning_rate": 9.74100598842351e-06, "loss": 0.7664, "step": 194 }, { "epoch": 0.13056578506863073, "grad_norm": 2.7781896720116683, "learning_rate": 9.737548731101925e-06, "loss": 0.7457, "step": 195 }, { "epoch": 0.13123535319718782, "grad_norm": 1.9974521107324632, "learning_rate": 9.73406917320121e-06, "loss": 0.7118, "step": 196 }, { "epoch": 0.13190492132574488, "grad_norm": 2.573935105255589, "learning_rate": 9.730567331100333e-06, "loss": 0.777, "step": 197 }, { "epoch": 0.13257448945430197, "grad_norm": 3.3688560383181483, "learning_rate": 9.727043221283157e-06, "loss": 0.8415, "step": 198 }, { "epoch": 0.13324405758285907, "grad_norm": 2.6096103013626446, "learning_rate": 9.723496860338363e-06, "loss": 0.7843, "step": 199 }, { "epoch": 0.13391362571141613, "grad_norm": 2.167257474305748, "learning_rate": 9.719928264959375e-06, "loss": 0.7326, "step": 200 }, { "epoch": 0.13458319383997322, "grad_norm": 2.9562426243694593, "learning_rate": 9.716337451944275e-06, "loss": 0.8229, "step": 201 }, { "epoch": 0.1352527619685303, "grad_norm": 2.830516415835648, "learning_rate": 9.71272443819573e-06, "loss": 0.7991, "step": 202 }, { "epoch": 0.13592233009708737, "grad_norm": 3.455622978675439, "learning_rate": 9.709089240720916e-06, "loss": 0.8114, "step": 203 }, { "epoch": 0.13659189822564446, "grad_norm": 2.9555827758934576, "learning_rate": 9.705431876631421e-06, "loss": 0.6639, "step": 204 }, { "epoch": 0.13726146635420153, "grad_norm": 2.2806533402094056, "learning_rate": 9.701752363143183e-06, "loss": 0.7538, "step": 205 }, { "epoch": 0.13793103448275862, "grad_norm": 2.787577145170046, "learning_rate": 9.698050717576402e-06, "loss": 0.8114, "step": 206 }, { "epoch": 0.1386006026113157, "grad_norm": 2.847946497030917, "learning_rate": 9.694326957355452e-06, "loss": 0.7906, "step": 207 }, { "epoch": 0.13927017073987277, "grad_norm": 2.260418291718414, "learning_rate": 9.690581100008812e-06, "loss": 0.7029, "step": 208 }, { "epoch": 0.13993973886842986, "grad_norm": 2.7257783122833725, "learning_rate": 9.686813163168973e-06, "loss": 0.7639, "step": 209 }, { "epoch": 0.14060930699698695, "grad_norm": 2.181703122464614, "learning_rate": 9.68302316457236e-06, "loss": 0.7812, "step": 210 }, { "epoch": 0.14127887512554402, "grad_norm": 2.2169214483619992, "learning_rate": 9.679211122059244e-06, "loss": 0.8084, "step": 211 }, { "epoch": 0.1419484432541011, "grad_norm": 2.4938021581200878, "learning_rate": 9.675377053573664e-06, "loss": 0.7056, "step": 212 }, { "epoch": 0.14261801138265817, "grad_norm": 2.2187111881543706, "learning_rate": 9.67152097716334e-06, "loss": 0.7364, "step": 213 }, { "epoch": 0.14328757951121526, "grad_norm": 3.0241722492437066, "learning_rate": 9.667642910979586e-06, "loss": 0.756, "step": 214 }, { "epoch": 0.14395714763977235, "grad_norm": 3.131009802311333, "learning_rate": 9.663742873277227e-06, "loss": 0.772, "step": 215 }, { "epoch": 0.14462671576832942, "grad_norm": 2.0581290103077396, "learning_rate": 9.65982088241451e-06, "loss": 0.7121, "step": 216 }, { "epoch": 0.1452962838968865, "grad_norm": 2.591411187401212, "learning_rate": 9.655876956853025e-06, "loss": 0.7909, "step": 217 }, { "epoch": 0.1459658520254436, "grad_norm": 2.440462975554113, "learning_rate": 9.651911115157609e-06, "loss": 0.7774, "step": 218 }, { "epoch": 0.14663542015400066, "grad_norm": 2.4435285721410107, "learning_rate": 9.64792337599626e-06, "loss": 0.7668, "step": 219 }, { "epoch": 0.14730498828255775, "grad_norm": 1.9858859596715717, "learning_rate": 9.64391375814006e-06, "loss": 0.7572, "step": 220 }, { "epoch": 0.14797455641111484, "grad_norm": 1.9277199635459286, "learning_rate": 9.639882280463071e-06, "loss": 0.7412, "step": 221 }, { "epoch": 0.1486441245396719, "grad_norm": 3.2729405844631834, "learning_rate": 9.635828961942257e-06, "loss": 0.8015, "step": 222 }, { "epoch": 0.149313692668229, "grad_norm": 2.427511108313675, "learning_rate": 9.631753821657388e-06, "loss": 0.7312, "step": 223 }, { "epoch": 0.14998326079678606, "grad_norm": 2.478010787001764, "learning_rate": 9.62765687879096e-06, "loss": 0.845, "step": 224 }, { "epoch": 0.15065282892534315, "grad_norm": 2.8119194464862707, "learning_rate": 9.623538152628087e-06, "loss": 0.8167, "step": 225 }, { "epoch": 0.15132239705390024, "grad_norm": 3.562256362353997, "learning_rate": 9.619397662556434e-06, "loss": 0.7418, "step": 226 }, { "epoch": 0.1519919651824573, "grad_norm": 2.504568344186749, "learning_rate": 9.615235428066106e-06, "loss": 0.7479, "step": 227 }, { "epoch": 0.1526615333110144, "grad_norm": 2.8746208874855914, "learning_rate": 9.611051468749559e-06, "loss": 0.7764, "step": 228 }, { "epoch": 0.1533311014395715, "grad_norm": 2.727415396992066, "learning_rate": 9.606845804301523e-06, "loss": 0.7051, "step": 229 }, { "epoch": 0.15400066956812855, "grad_norm": 1.9014998009601116, "learning_rate": 9.60261845451889e-06, "loss": 0.6943, "step": 230 }, { "epoch": 0.15467023769668564, "grad_norm": 2.8505064889469516, "learning_rate": 9.598369439300632e-06, "loss": 0.7641, "step": 231 }, { "epoch": 0.1553398058252427, "grad_norm": 1.9695097843549827, "learning_rate": 9.594098778647706e-06, "loss": 0.7248, "step": 232 }, { "epoch": 0.1560093739537998, "grad_norm": 2.3140289807738696, "learning_rate": 9.589806492662954e-06, "loss": 0.8179, "step": 233 }, { "epoch": 0.1566789420823569, "grad_norm": 3.619196107468729, "learning_rate": 9.585492601551017e-06, "loss": 0.7847, "step": 234 }, { "epoch": 0.15734851021091395, "grad_norm": 2.6424674262204864, "learning_rate": 9.581157125618232e-06, "loss": 0.7592, "step": 235 }, { "epoch": 0.15801807833947104, "grad_norm": 2.0583922312952185, "learning_rate": 9.576800085272544e-06, "loss": 0.6957, "step": 236 }, { "epoch": 0.15868764646802813, "grad_norm": 2.8429168037099446, "learning_rate": 9.572421501023403e-06, "loss": 0.7387, "step": 237 }, { "epoch": 0.1593572145965852, "grad_norm": 2.7371944425122927, "learning_rate": 9.568021393481671e-06, "loss": 0.7781, "step": 238 }, { "epoch": 0.16002678272514229, "grad_norm": 2.2471631493407815, "learning_rate": 9.563599783359526e-06, "loss": 0.835, "step": 239 }, { "epoch": 0.16069635085369938, "grad_norm": 2.976756334298762, "learning_rate": 9.559156691470359e-06, "loss": 0.7692, "step": 240 }, { "epoch": 0.16136591898225644, "grad_norm": 2.341619341244618, "learning_rate": 9.554692138728686e-06, "loss": 0.7544, "step": 241 }, { "epoch": 0.16203548711081353, "grad_norm": 2.173347263979291, "learning_rate": 9.550206146150038e-06, "loss": 0.8074, "step": 242 }, { "epoch": 0.1627050552393706, "grad_norm": 2.427038703170593, "learning_rate": 9.545698734850867e-06, "loss": 0.8508, "step": 243 }, { "epoch": 0.16337462336792768, "grad_norm": 2.092250268632201, "learning_rate": 9.541169926048456e-06, "loss": 0.8053, "step": 244 }, { "epoch": 0.16404419149648478, "grad_norm": 2.7914085062521035, "learning_rate": 9.536619741060799e-06, "loss": 0.7785, "step": 245 }, { "epoch": 0.16471375962504184, "grad_norm": 2.110045830186256, "learning_rate": 9.532048201306519e-06, "loss": 0.7599, "step": 246 }, { "epoch": 0.16538332775359893, "grad_norm": 2.169479555320381, "learning_rate": 9.527455328304756e-06, "loss": 0.7694, "step": 247 }, { "epoch": 0.16605289588215602, "grad_norm": 2.032328864969317, "learning_rate": 9.522841143675074e-06, "loss": 0.6965, "step": 248 }, { "epoch": 0.16672246401071308, "grad_norm": 1.9930372355199912, "learning_rate": 9.518205669137353e-06, "loss": 0.7317, "step": 249 }, { "epoch": 0.16739203213927017, "grad_norm": 2.231988893354748, "learning_rate": 9.513548926511686e-06, "loss": 0.7751, "step": 250 }, { "epoch": 0.16806160026782724, "grad_norm": 2.4145850689643176, "learning_rate": 9.508870937718286e-06, "loss": 0.7475, "step": 251 }, { "epoch": 0.16873116839638433, "grad_norm": 2.7313630450702893, "learning_rate": 9.504171724777367e-06, "loss": 0.7548, "step": 252 }, { "epoch": 0.16940073652494142, "grad_norm": 2.4546952021556887, "learning_rate": 9.499451309809058e-06, "loss": 0.7873, "step": 253 }, { "epoch": 0.17007030465349848, "grad_norm": 3.5149871588629233, "learning_rate": 9.494709715033283e-06, "loss": 0.8447, "step": 254 }, { "epoch": 0.17073987278205557, "grad_norm": 2.1426835322448228, "learning_rate": 9.489946962769669e-06, "loss": 0.7382, "step": 255 }, { "epoch": 0.17140944091061266, "grad_norm": 2.239593570648056, "learning_rate": 9.485163075437434e-06, "loss": 0.8312, "step": 256 }, { "epoch": 0.17207900903916973, "grad_norm": 2.3125628519421646, "learning_rate": 9.480358075555278e-06, "loss": 0.7714, "step": 257 }, { "epoch": 0.17274857716772682, "grad_norm": 2.1853885473762142, "learning_rate": 9.475531985741288e-06, "loss": 0.7617, "step": 258 }, { "epoch": 0.1734181452962839, "grad_norm": 2.330048784059288, "learning_rate": 9.470684828712826e-06, "loss": 0.7277, "step": 259 }, { "epoch": 0.17408771342484097, "grad_norm": 2.4015222564994283, "learning_rate": 9.465816627286418e-06, "loss": 0.8079, "step": 260 }, { "epoch": 0.17475728155339806, "grad_norm": 2.1594051566101093, "learning_rate": 9.460927404377647e-06, "loss": 0.782, "step": 261 }, { "epoch": 0.17542684968195513, "grad_norm": 2.004600825736957, "learning_rate": 9.456017183001057e-06, "loss": 0.7283, "step": 262 }, { "epoch": 0.17609641781051222, "grad_norm": 2.718521315245205, "learning_rate": 9.45108598627003e-06, "loss": 0.6836, "step": 263 }, { "epoch": 0.1767659859390693, "grad_norm": 2.51139799214698, "learning_rate": 9.446133837396685e-06, "loss": 0.9112, "step": 264 }, { "epoch": 0.17743555406762637, "grad_norm": 2.080279631082688, "learning_rate": 9.441160759691768e-06, "loss": 0.7054, "step": 265 }, { "epoch": 0.17810512219618346, "grad_norm": 2.518931754492507, "learning_rate": 9.436166776564534e-06, "loss": 0.8413, "step": 266 }, { "epoch": 0.17877469032474055, "grad_norm": 2.3439221999347577, "learning_rate": 9.431151911522656e-06, "loss": 0.7119, "step": 267 }, { "epoch": 0.17944425845329762, "grad_norm": 2.340550395809966, "learning_rate": 9.426116188172093e-06, "loss": 0.7507, "step": 268 }, { "epoch": 0.1801138265818547, "grad_norm": 2.027654531290725, "learning_rate": 9.421059630216992e-06, "loss": 0.7134, "step": 269 }, { "epoch": 0.18078339471041177, "grad_norm": 2.8156077715211736, "learning_rate": 9.415982261459569e-06, "loss": 0.7297, "step": 270 }, { "epoch": 0.18145296283896886, "grad_norm": 2.584973059610657, "learning_rate": 9.410884105800005e-06, "loss": 0.7771, "step": 271 }, { "epoch": 0.18212253096752595, "grad_norm": 2.2015942313553274, "learning_rate": 9.405765187236328e-06, "loss": 0.7328, "step": 272 }, { "epoch": 0.18279209909608302, "grad_norm": 2.015219774756113, "learning_rate": 9.400625529864302e-06, "loss": 0.779, "step": 273 }, { "epoch": 0.1834616672246401, "grad_norm": 2.357016816846307, "learning_rate": 9.395465157877307e-06, "loss": 0.7113, "step": 274 }, { "epoch": 0.1841312353531972, "grad_norm": 2.4020641506009843, "learning_rate": 9.390284095566237e-06, "loss": 0.7388, "step": 275 }, { "epoch": 0.18480080348175426, "grad_norm": 2.167337464879733, "learning_rate": 9.385082367319377e-06, "loss": 0.7404, "step": 276 }, { "epoch": 0.18547037161031135, "grad_norm": 2.2338310320283266, "learning_rate": 9.37985999762229e-06, "loss": 0.7377, "step": 277 }, { "epoch": 0.18613993973886844, "grad_norm": 2.261818327257077, "learning_rate": 9.374617011057707e-06, "loss": 0.8561, "step": 278 }, { "epoch": 0.1868095078674255, "grad_norm": 2.4635858199342526, "learning_rate": 9.369353432305396e-06, "loss": 0.7963, "step": 279 }, { "epoch": 0.1874790759959826, "grad_norm": 1.7135501729583966, "learning_rate": 9.364069286142072e-06, "loss": 0.7375, "step": 280 }, { "epoch": 0.18814864412453966, "grad_norm": 2.234132543554346, "learning_rate": 9.35876459744125e-06, "loss": 0.7262, "step": 281 }, { "epoch": 0.18881821225309675, "grad_norm": 2.52837368222337, "learning_rate": 9.353439391173152e-06, "loss": 0.8037, "step": 282 }, { "epoch": 0.18948778038165384, "grad_norm": 3.0065284344752286, "learning_rate": 9.348093692404578e-06, "loss": 0.7827, "step": 283 }, { "epoch": 0.1901573485102109, "grad_norm": 2.17597709376436, "learning_rate": 9.342727526298787e-06, "loss": 0.7466, "step": 284 }, { "epoch": 0.190826916638768, "grad_norm": 2.164821041056447, "learning_rate": 9.337340918115385e-06, "loss": 0.7494, "step": 285 }, { "epoch": 0.1914964847673251, "grad_norm": 2.2013607927175323, "learning_rate": 9.331933893210205e-06, "loss": 0.8268, "step": 286 }, { "epoch": 0.19216605289588215, "grad_norm": 2.158989053908063, "learning_rate": 9.326506477035179e-06, "loss": 0.7566, "step": 287 }, { "epoch": 0.19283562102443924, "grad_norm": 2.5242706592571102, "learning_rate": 9.321058695138233e-06, "loss": 0.7505, "step": 288 }, { "epoch": 0.1935051891529963, "grad_norm": 2.105478005184011, "learning_rate": 9.315590573163152e-06, "loss": 0.7653, "step": 289 }, { "epoch": 0.1941747572815534, "grad_norm": 2.8714512095737352, "learning_rate": 9.310102136849468e-06, "loss": 0.7081, "step": 290 }, { "epoch": 0.19484432541011049, "grad_norm": 2.027623898843217, "learning_rate": 9.304593412032336e-06, "loss": 0.7394, "step": 291 }, { "epoch": 0.19551389353866755, "grad_norm": 2.353454967295155, "learning_rate": 9.299064424642415e-06, "loss": 0.7637, "step": 292 }, { "epoch": 0.19618346166722464, "grad_norm": 2.272102425725686, "learning_rate": 9.29351520070574e-06, "loss": 0.8017, "step": 293 }, { "epoch": 0.19685302979578173, "grad_norm": 2.7433949134072817, "learning_rate": 9.287945766343607e-06, "loss": 0.8225, "step": 294 }, { "epoch": 0.1975225979243388, "grad_norm": 5.510412999809797, "learning_rate": 9.282356147772447e-06, "loss": 0.757, "step": 295 }, { "epoch": 0.19819216605289589, "grad_norm": 2.4268921521070896, "learning_rate": 9.2767463713037e-06, "loss": 0.789, "step": 296 }, { "epoch": 0.19886173418145298, "grad_norm": 2.5484502641447397, "learning_rate": 9.271116463343692e-06, "loss": 0.7248, "step": 297 }, { "epoch": 0.19953130231001004, "grad_norm": 3.065960462517364, "learning_rate": 9.265466450393515e-06, "loss": 0.7271, "step": 298 }, { "epoch": 0.20020087043856713, "grad_norm": 2.228721444968851, "learning_rate": 9.259796359048896e-06, "loss": 0.8003, "step": 299 }, { "epoch": 0.2008704385671242, "grad_norm": 3.0026916906257815, "learning_rate": 9.254106216000078e-06, "loss": 0.8393, "step": 300 }, { "epoch": 0.2008704385671242, "eval_loss": 0.7555466294288635, "eval_runtime": 441.3662, "eval_samples_per_second": 45.565, "eval_steps_per_second": 0.714, "step": 300 }, { "epoch": 0.20154000669568128, "grad_norm": 3.2170151836119554, "learning_rate": 9.24839604803169e-06, "loss": 0.7438, "step": 301 }, { "epoch": 0.20220957482423838, "grad_norm": 2.305101354125338, "learning_rate": 9.242665882022623e-06, "loss": 0.7751, "step": 302 }, { "epoch": 0.20287914295279544, "grad_norm": 1.8497639096687861, "learning_rate": 9.2369157449459e-06, "loss": 0.7029, "step": 303 }, { "epoch": 0.20354871108135253, "grad_norm": 2.8596773672742803, "learning_rate": 9.231145663868557e-06, "loss": 0.7876, "step": 304 }, { "epoch": 0.20421827920990962, "grad_norm": 2.476021129611478, "learning_rate": 9.225355665951503e-06, "loss": 0.7572, "step": 305 }, { "epoch": 0.20488784733846668, "grad_norm": 2.099611557670415, "learning_rate": 9.219545778449408e-06, "loss": 0.7635, "step": 306 }, { "epoch": 0.20555741546702377, "grad_norm": 2.234994699415112, "learning_rate": 9.213716028710558e-06, "loss": 0.7382, "step": 307 }, { "epoch": 0.20622698359558084, "grad_norm": 3.210241498605866, "learning_rate": 9.207866444176741e-06, "loss": 0.7823, "step": 308 }, { "epoch": 0.20689655172413793, "grad_norm": 2.4144782170011045, "learning_rate": 9.201997052383107e-06, "loss": 0.7757, "step": 309 }, { "epoch": 0.20756611985269502, "grad_norm": 2.105653480903479, "learning_rate": 9.196107880958046e-06, "loss": 0.7712, "step": 310 }, { "epoch": 0.20823568798125208, "grad_norm": 1.882171471584976, "learning_rate": 9.190198957623051e-06, "loss": 0.76, "step": 311 }, { "epoch": 0.20890525610980917, "grad_norm": 1.8844901686707094, "learning_rate": 9.184270310192595e-06, "loss": 0.6692, "step": 312 }, { "epoch": 0.20957482423836626, "grad_norm": 2.476570561694017, "learning_rate": 9.178321966573993e-06, "loss": 0.7292, "step": 313 }, { "epoch": 0.21024439236692333, "grad_norm": 1.8014315689193638, "learning_rate": 9.172353954767275e-06, "loss": 0.6946, "step": 314 }, { "epoch": 0.21091396049548042, "grad_norm": 2.295123763637967, "learning_rate": 9.166366302865054e-06, "loss": 0.6863, "step": 315 }, { "epoch": 0.2115835286240375, "grad_norm": 2.8814797578193083, "learning_rate": 9.160359039052394e-06, "loss": 0.7231, "step": 316 }, { "epoch": 0.21225309675259457, "grad_norm": 2.4177678329756573, "learning_rate": 9.154332191606671e-06, "loss": 0.7875, "step": 317 }, { "epoch": 0.21292266488115166, "grad_norm": 2.0573714611833465, "learning_rate": 9.148285788897452e-06, "loss": 0.7374, "step": 318 }, { "epoch": 0.21359223300970873, "grad_norm": 2.489410084217292, "learning_rate": 9.142219859386344e-06, "loss": 0.8583, "step": 319 }, { "epoch": 0.21426180113826582, "grad_norm": 1.9649670230454386, "learning_rate": 9.136134431626884e-06, "loss": 0.7343, "step": 320 }, { "epoch": 0.2149313692668229, "grad_norm": 2.2990304829294823, "learning_rate": 9.130029534264381e-06, "loss": 0.8351, "step": 321 }, { "epoch": 0.21560093739537997, "grad_norm": 2.3441648584533104, "learning_rate": 9.123905196035795e-06, "loss": 0.7823, "step": 322 }, { "epoch": 0.21627050552393706, "grad_norm": 2.8971461782991814, "learning_rate": 9.1177614457696e-06, "loss": 0.8081, "step": 323 }, { "epoch": 0.21694007365249415, "grad_norm": 2.301201412988571, "learning_rate": 9.111598312385641e-06, "loss": 0.7342, "step": 324 }, { "epoch": 0.21760964178105122, "grad_norm": 4.427584798709969, "learning_rate": 9.105415824895008e-06, "loss": 0.7294, "step": 325 }, { "epoch": 0.2182792099096083, "grad_norm": 2.1762204219935337, "learning_rate": 9.099214012399892e-06, "loss": 0.7232, "step": 326 }, { "epoch": 0.21894877803816537, "grad_norm": 2.297998226516268, "learning_rate": 9.092992904093451e-06, "loss": 0.6898, "step": 327 }, { "epoch": 0.21961834616672246, "grad_norm": 2.797194850372323, "learning_rate": 9.086752529259673e-06, "loss": 0.7833, "step": 328 }, { "epoch": 0.22028791429527955, "grad_norm": 3.482572423720937, "learning_rate": 9.080492917273238e-06, "loss": 0.8077, "step": 329 }, { "epoch": 0.22095748242383662, "grad_norm": 1.833126051947291, "learning_rate": 9.074214097599375e-06, "loss": 0.7016, "step": 330 }, { "epoch": 0.2216270505523937, "grad_norm": 1.9599383645965922, "learning_rate": 9.067916099793733e-06, "loss": 0.7809, "step": 331 }, { "epoch": 0.2222966186809508, "grad_norm": 2.356386876541469, "learning_rate": 9.061598953502233e-06, "loss": 0.808, "step": 332 }, { "epoch": 0.22296618680950786, "grad_norm": 2.6990748052265023, "learning_rate": 9.055262688460931e-06, "loss": 0.7036, "step": 333 }, { "epoch": 0.22363575493806495, "grad_norm": 2.427998665980746, "learning_rate": 9.048907334495882e-06, "loss": 0.8638, "step": 334 }, { "epoch": 0.22430532306662204, "grad_norm": 2.5413554713019066, "learning_rate": 9.042532921522994e-06, "loss": 0.7743, "step": 335 }, { "epoch": 0.2249748911951791, "grad_norm": 2.491724781600139, "learning_rate": 9.03613947954789e-06, "loss": 0.7302, "step": 336 }, { "epoch": 0.2256444593237362, "grad_norm": 2.291897387111592, "learning_rate": 9.029727038665765e-06, "loss": 0.7447, "step": 337 }, { "epoch": 0.22631402745229326, "grad_norm": 1.7385102048763867, "learning_rate": 9.023295629061248e-06, "loss": 0.6769, "step": 338 }, { "epoch": 0.22698359558085035, "grad_norm": 2.1309156485057272, "learning_rate": 9.016845281008255e-06, "loss": 0.7902, "step": 339 }, { "epoch": 0.22765316370940744, "grad_norm": 2.667103723603422, "learning_rate": 9.010376024869852e-06, "loss": 0.7017, "step": 340 }, { "epoch": 0.2283227318379645, "grad_norm": 2.2823075184954047, "learning_rate": 9.003887891098108e-06, "loss": 0.8098, "step": 341 }, { "epoch": 0.2289922999665216, "grad_norm": 1.8646498203551884, "learning_rate": 8.99738091023395e-06, "loss": 0.6913, "step": 342 }, { "epoch": 0.2296618680950787, "grad_norm": 3.107649635342622, "learning_rate": 8.990855112907026e-06, "loss": 0.7544, "step": 343 }, { "epoch": 0.23033143622363575, "grad_norm": 2.1577594000247857, "learning_rate": 8.984310529835555e-06, "loss": 0.7791, "step": 344 }, { "epoch": 0.23100100435219284, "grad_norm": 2.224108643689627, "learning_rate": 8.977747191826183e-06, "loss": 0.7682, "step": 345 }, { "epoch": 0.2316705724807499, "grad_norm": 2.5097694023319446, "learning_rate": 8.97116512977384e-06, "loss": 0.6975, "step": 346 }, { "epoch": 0.232340140609307, "grad_norm": 2.1288352800411707, "learning_rate": 8.964564374661597e-06, "loss": 0.7347, "step": 347 }, { "epoch": 0.23300970873786409, "grad_norm": 1.9830485568637466, "learning_rate": 8.957944957560514e-06, "loss": 0.7564, "step": 348 }, { "epoch": 0.23367927686642115, "grad_norm": 2.5919912380082755, "learning_rate": 8.951306909629492e-06, "loss": 0.7623, "step": 349 }, { "epoch": 0.23434884499497824, "grad_norm": 2.497679184082089, "learning_rate": 8.944650262115143e-06, "loss": 0.7488, "step": 350 }, { "epoch": 0.23501841312353533, "grad_norm": 1.8612941007543895, "learning_rate": 8.937975046351616e-06, "loss": 0.7284, "step": 351 }, { "epoch": 0.2356879812520924, "grad_norm": 2.1811675399391284, "learning_rate": 8.931281293760479e-06, "loss": 0.7729, "step": 352 }, { "epoch": 0.23635754938064948, "grad_norm": 2.0372871422730063, "learning_rate": 8.924569035850547e-06, "loss": 0.7619, "step": 353 }, { "epoch": 0.23702711750920658, "grad_norm": 2.644153224907542, "learning_rate": 8.917838304217745e-06, "loss": 0.7334, "step": 354 }, { "epoch": 0.23769668563776364, "grad_norm": 2.404294507483102, "learning_rate": 8.91108913054496e-06, "loss": 0.7876, "step": 355 }, { "epoch": 0.23836625376632073, "grad_norm": 2.8600906637111723, "learning_rate": 8.904321546601887e-06, "loss": 0.7681, "step": 356 }, { "epoch": 0.2390358218948778, "grad_norm": 3.1175986556137847, "learning_rate": 8.89753558424488e-06, "loss": 0.8385, "step": 357 }, { "epoch": 0.23970539002343488, "grad_norm": 2.582066235477835, "learning_rate": 8.890731275416813e-06, "loss": 0.7982, "step": 358 }, { "epoch": 0.24037495815199197, "grad_norm": 1.806269710552544, "learning_rate": 8.883908652146907e-06, "loss": 0.8047, "step": 359 }, { "epoch": 0.24104452628054904, "grad_norm": 2.845609255480548, "learning_rate": 8.8770677465506e-06, "loss": 0.7614, "step": 360 }, { "epoch": 0.24171409440910613, "grad_norm": 2.008798306321432, "learning_rate": 8.870208590829395e-06, "loss": 0.7191, "step": 361 }, { "epoch": 0.24238366253766322, "grad_norm": 2.428594535053926, "learning_rate": 8.863331217270685e-06, "loss": 0.76, "step": 362 }, { "epoch": 0.24305323066622028, "grad_norm": 1.9430341365443742, "learning_rate": 8.856435658247636e-06, "loss": 0.7644, "step": 363 }, { "epoch": 0.24372279879477737, "grad_norm": 2.2488331747072094, "learning_rate": 8.849521946219006e-06, "loss": 0.7008, "step": 364 }, { "epoch": 0.24439236692333444, "grad_norm": 2.2378473042895357, "learning_rate": 8.842590113729001e-06, "loss": 0.8178, "step": 365 }, { "epoch": 0.24506193505189153, "grad_norm": 1.9271203469823597, "learning_rate": 8.835640193407136e-06, "loss": 0.7204, "step": 366 }, { "epoch": 0.24573150318044862, "grad_norm": 2.110108349667706, "learning_rate": 8.828672217968055e-06, "loss": 0.7095, "step": 367 }, { "epoch": 0.24640107130900568, "grad_norm": 2.4047507564578705, "learning_rate": 8.821686220211396e-06, "loss": 0.7301, "step": 368 }, { "epoch": 0.24707063943756277, "grad_norm": 2.5423037964638566, "learning_rate": 8.814682233021636e-06, "loss": 0.8049, "step": 369 }, { "epoch": 0.24774020756611986, "grad_norm": 2.241296150990498, "learning_rate": 8.80766028936793e-06, "loss": 0.7803, "step": 370 }, { "epoch": 0.24840977569467693, "grad_norm": 2.2856515695255726, "learning_rate": 8.80062042230395e-06, "loss": 0.7671, "step": 371 }, { "epoch": 0.24907934382323402, "grad_norm": 1.9109712707259574, "learning_rate": 8.793562664967748e-06, "loss": 0.6756, "step": 372 }, { "epoch": 0.2497489119517911, "grad_norm": 2.181057572694745, "learning_rate": 8.786487050581583e-06, "loss": 0.7556, "step": 373 }, { "epoch": 0.25041848008034817, "grad_norm": 1.7837596493469412, "learning_rate": 8.779393612451769e-06, "loss": 0.7225, "step": 374 }, { "epoch": 0.25108804820890523, "grad_norm": 2.0910304476761654, "learning_rate": 8.772282383968524e-06, "loss": 0.685, "step": 375 }, { "epoch": 0.25175761633746235, "grad_norm": 2.3235852027083017, "learning_rate": 8.76515339860581e-06, "loss": 0.7748, "step": 376 }, { "epoch": 0.2524271844660194, "grad_norm": 1.9001308654864533, "learning_rate": 8.75800668992117e-06, "loss": 0.6922, "step": 377 }, { "epoch": 0.2530967525945765, "grad_norm": 2.5333666020114345, "learning_rate": 8.750842291555572e-06, "loss": 0.7001, "step": 378 }, { "epoch": 0.2537663207231336, "grad_norm": 2.7406507381196383, "learning_rate": 8.743660237233262e-06, "loss": 0.7658, "step": 379 }, { "epoch": 0.25443588885169066, "grad_norm": 3.086323052396568, "learning_rate": 8.736460560761589e-06, "loss": 0.7685, "step": 380 }, { "epoch": 0.2551054569802477, "grad_norm": 2.461096342761154, "learning_rate": 8.729243296030851e-06, "loss": 0.6765, "step": 381 }, { "epoch": 0.25577502510880484, "grad_norm": 3.6150117239630264, "learning_rate": 8.722008477014144e-06, "loss": 0.754, "step": 382 }, { "epoch": 0.2564445932373619, "grad_norm": 2.7237546894818268, "learning_rate": 8.714756137767191e-06, "loss": 0.7883, "step": 383 }, { "epoch": 0.25711416136591897, "grad_norm": 1.98301035569131, "learning_rate": 8.70748631242819e-06, "loss": 0.7372, "step": 384 }, { "epoch": 0.2577837294944761, "grad_norm": 2.32096443322657, "learning_rate": 8.700199035217647e-06, "loss": 0.7433, "step": 385 }, { "epoch": 0.25845329762303315, "grad_norm": 2.286401035695198, "learning_rate": 8.692894340438213e-06, "loss": 0.7468, "step": 386 }, { "epoch": 0.2591228657515902, "grad_norm": 2.611287061330126, "learning_rate": 8.685572262474538e-06, "loss": 0.7368, "step": 387 }, { "epoch": 0.2597924338801473, "grad_norm": 2.3027852206758315, "learning_rate": 8.678232835793086e-06, "loss": 0.6907, "step": 388 }, { "epoch": 0.2604620020087044, "grad_norm": 2.0995570904115692, "learning_rate": 8.670876094941991e-06, "loss": 0.7153, "step": 389 }, { "epoch": 0.26113157013726146, "grad_norm": 2.057867877148316, "learning_rate": 8.663502074550891e-06, "loss": 0.7604, "step": 390 }, { "epoch": 0.2618011382658185, "grad_norm": 2.437385150011116, "learning_rate": 8.656110809330758e-06, "loss": 0.8215, "step": 391 }, { "epoch": 0.26247070639437564, "grad_norm": 2.3375747886253437, "learning_rate": 8.648702334073736e-06, "loss": 0.7253, "step": 392 }, { "epoch": 0.2631402745229327, "grad_norm": 2.0332382023316433, "learning_rate": 8.641276683652988e-06, "loss": 0.7167, "step": 393 }, { "epoch": 0.26380984265148977, "grad_norm": 2.141725641078446, "learning_rate": 8.633833893022517e-06, "loss": 0.7796, "step": 394 }, { "epoch": 0.2644794107800469, "grad_norm": 2.708073793757683, "learning_rate": 8.626373997217012e-06, "loss": 0.7268, "step": 395 }, { "epoch": 0.26514897890860395, "grad_norm": 2.223079811565138, "learning_rate": 8.618897031351678e-06, "loss": 0.7279, "step": 396 }, { "epoch": 0.265818547037161, "grad_norm": 2.2388507910755298, "learning_rate": 8.611403030622074e-06, "loss": 0.8196, "step": 397 }, { "epoch": 0.26648811516571813, "grad_norm": 2.0909032687213616, "learning_rate": 8.603892030303943e-06, "loss": 0.7179, "step": 398 }, { "epoch": 0.2671576832942752, "grad_norm": 2.3445139618247874, "learning_rate": 8.596364065753046e-06, "loss": 0.7528, "step": 399 }, { "epoch": 0.26782725142283226, "grad_norm": 2.10324455251196, "learning_rate": 8.588819172405007e-06, "loss": 0.7673, "step": 400 }, { "epoch": 0.2684968195513894, "grad_norm": 1.9245739670324495, "learning_rate": 8.581257385775128e-06, "loss": 0.7339, "step": 401 }, { "epoch": 0.26916638767994644, "grad_norm": 2.9122839241735226, "learning_rate": 8.573678741458236e-06, "loss": 0.7416, "step": 402 }, { "epoch": 0.2698359558085035, "grad_norm": 2.1747596118444568, "learning_rate": 8.56608327512851e-06, "loss": 0.7142, "step": 403 }, { "epoch": 0.2705055239370606, "grad_norm": 2.2309949450992024, "learning_rate": 8.558471022539311e-06, "loss": 0.7847, "step": 404 }, { "epoch": 0.2711750920656177, "grad_norm": 2.236870760950219, "learning_rate": 8.55084201952302e-06, "loss": 0.7416, "step": 405 }, { "epoch": 0.27184466019417475, "grad_norm": 2.076602980361452, "learning_rate": 8.54319630199086e-06, "loss": 0.7679, "step": 406 }, { "epoch": 0.2725142283227318, "grad_norm": 2.0600309336947262, "learning_rate": 8.535533905932739e-06, "loss": 0.7728, "step": 407 }, { "epoch": 0.27318379645128893, "grad_norm": 1.8438545986538568, "learning_rate": 8.527854867417069e-06, "loss": 0.7224, "step": 408 }, { "epoch": 0.273853364579846, "grad_norm": 1.952854159456715, "learning_rate": 8.520159222590605e-06, "loss": 0.7261, "step": 409 }, { "epoch": 0.27452293270840306, "grad_norm": 1.9903089640965324, "learning_rate": 8.512447007678271e-06, "loss": 0.7754, "step": 410 }, { "epoch": 0.2751925008369602, "grad_norm": 1.8742515905302533, "learning_rate": 8.504718258982986e-06, "loss": 0.7445, "step": 411 }, { "epoch": 0.27586206896551724, "grad_norm": 1.9935323141133852, "learning_rate": 8.496973012885498e-06, "loss": 0.7124, "step": 412 }, { "epoch": 0.2765316370940743, "grad_norm": 2.2845543427165502, "learning_rate": 8.489211305844216e-06, "loss": 0.7202, "step": 413 }, { "epoch": 0.2772012052226314, "grad_norm": 2.4542103168179272, "learning_rate": 8.48143317439503e-06, "loss": 0.7536, "step": 414 }, { "epoch": 0.2778707733511885, "grad_norm": 1.8504672206648216, "learning_rate": 8.473638655151142e-06, "loss": 0.7246, "step": 415 }, { "epoch": 0.27854034147974555, "grad_norm": 2.351656170677649, "learning_rate": 8.465827784802895e-06, "loss": 0.775, "step": 416 }, { "epoch": 0.27920990960830266, "grad_norm": 2.10545990848158, "learning_rate": 8.458000600117604e-06, "loss": 0.6879, "step": 417 }, { "epoch": 0.2798794777368597, "grad_norm": 2.1943794750511696, "learning_rate": 8.450157137939375e-06, "loss": 0.7563, "step": 418 }, { "epoch": 0.2805490458654168, "grad_norm": 4.6845763008046895, "learning_rate": 8.442297435188936e-06, "loss": 0.7258, "step": 419 }, { "epoch": 0.2812186139939739, "grad_norm": 2.7200611366890217, "learning_rate": 8.43442152886346e-06, "loss": 0.7661, "step": 420 }, { "epoch": 0.281888182122531, "grad_norm": 2.402728389795057, "learning_rate": 8.4265294560364e-06, "loss": 0.8094, "step": 421 }, { "epoch": 0.28255775025108804, "grad_norm": 2.190921211189467, "learning_rate": 8.418621253857302e-06, "loss": 0.7419, "step": 422 }, { "epoch": 0.28322731837964515, "grad_norm": 2.159026581407067, "learning_rate": 8.410696959551642e-06, "loss": 0.7053, "step": 423 }, { "epoch": 0.2838968865082022, "grad_norm": 2.182967517546507, "learning_rate": 8.402756610420634e-06, "loss": 0.7399, "step": 424 }, { "epoch": 0.2845664546367593, "grad_norm": 2.3890657391031116, "learning_rate": 8.394800243841078e-06, "loss": 0.7746, "step": 425 }, { "epoch": 0.28523602276531634, "grad_norm": 2.0502010225890577, "learning_rate": 8.386827897265163e-06, "loss": 0.7188, "step": 426 }, { "epoch": 0.28590559089387346, "grad_norm": 2.6890913540678936, "learning_rate": 8.378839608220304e-06, "loss": 0.76, "step": 427 }, { "epoch": 0.2865751590224305, "grad_norm": 2.2921971129910244, "learning_rate": 8.370835414308955e-06, "loss": 0.7408, "step": 428 }, { "epoch": 0.2872447271509876, "grad_norm": 2.421688327532607, "learning_rate": 8.362815353208441e-06, "loss": 0.738, "step": 429 }, { "epoch": 0.2879142952795447, "grad_norm": 2.629485833556006, "learning_rate": 8.354779462670778e-06, "loss": 0.7891, "step": 430 }, { "epoch": 0.28858386340810177, "grad_norm": 1.7416004415801858, "learning_rate": 8.346727780522488e-06, "loss": 0.6544, "step": 431 }, { "epoch": 0.28925343153665883, "grad_norm": 1.8489747991703402, "learning_rate": 8.338660344664438e-06, "loss": 0.768, "step": 432 }, { "epoch": 0.28992299966521595, "grad_norm": 2.0470351018474973, "learning_rate": 8.330577193071641e-06, "loss": 0.6534, "step": 433 }, { "epoch": 0.290592567793773, "grad_norm": 1.968450090818657, "learning_rate": 8.32247836379309e-06, "loss": 0.7138, "step": 434 }, { "epoch": 0.2912621359223301, "grad_norm": 2.0960640943162607, "learning_rate": 8.31436389495158e-06, "loss": 0.8035, "step": 435 }, { "epoch": 0.2919317040508872, "grad_norm": 2.391706042038681, "learning_rate": 8.306233824743516e-06, "loss": 0.7438, "step": 436 }, { "epoch": 0.29260127217944426, "grad_norm": 2.338013766689778, "learning_rate": 8.298088191438753e-06, "loss": 0.7597, "step": 437 }, { "epoch": 0.2932708403080013, "grad_norm": 2.2777722026108576, "learning_rate": 8.289927033380395e-06, "loss": 0.7678, "step": 438 }, { "epoch": 0.29394040843655844, "grad_norm": 2.368434489277288, "learning_rate": 8.281750388984627e-06, "loss": 0.7438, "step": 439 }, { "epoch": 0.2946099765651155, "grad_norm": 2.7591410879322016, "learning_rate": 8.273558296740534e-06, "loss": 0.8009, "step": 440 }, { "epoch": 0.29527954469367257, "grad_norm": 2.846355707301233, "learning_rate": 8.265350795209912e-06, "loss": 0.8137, "step": 441 }, { "epoch": 0.2959491128222297, "grad_norm": 2.5481353906612427, "learning_rate": 8.257127923027096e-06, "loss": 0.8736, "step": 442 }, { "epoch": 0.29661868095078675, "grad_norm": 1.8945070766967973, "learning_rate": 8.248889718898773e-06, "loss": 0.7756, "step": 443 }, { "epoch": 0.2972882490793438, "grad_norm": 2.3336373355150632, "learning_rate": 8.2406362216038e-06, "loss": 0.7388, "step": 444 }, { "epoch": 0.2979578172079009, "grad_norm": 2.351778735851431, "learning_rate": 8.23236746999302e-06, "loss": 0.7633, "step": 445 }, { "epoch": 0.298627385336458, "grad_norm": 2.1550280359653837, "learning_rate": 8.22408350298908e-06, "loss": 0.7537, "step": 446 }, { "epoch": 0.29929695346501506, "grad_norm": 2.200023099864876, "learning_rate": 8.215784359586257e-06, "loss": 0.6347, "step": 447 }, { "epoch": 0.2999665215935721, "grad_norm": 1.9928072420628153, "learning_rate": 8.20747007885026e-06, "loss": 0.7176, "step": 448 }, { "epoch": 0.30063608972212924, "grad_norm": 2.7005283374205575, "learning_rate": 8.19914069991805e-06, "loss": 0.6277, "step": 449 }, { "epoch": 0.3013056578506863, "grad_norm": 2.521640686138086, "learning_rate": 8.190796261997662e-06, "loss": 0.777, "step": 450 }, { "epoch": 0.3013056578506863, "eval_loss": 0.7427265048027039, "eval_runtime": 442.6311, "eval_samples_per_second": 45.435, "eval_steps_per_second": 0.712, "step": 450 }, { "epoch": 0.30197522597924337, "grad_norm": 2.0197988443571653, "learning_rate": 8.18243680436802e-06, "loss": 0.7513, "step": 451 }, { "epoch": 0.3026447941078005, "grad_norm": 2.807745305882346, "learning_rate": 8.174062366378741e-06, "loss": 0.6859, "step": 452 }, { "epoch": 0.30331436223635755, "grad_norm": 2.1569245616432013, "learning_rate": 8.165672987449962e-06, "loss": 0.7634, "step": 453 }, { "epoch": 0.3039839303649146, "grad_norm": 2.5717210707863982, "learning_rate": 8.157268707072152e-06, "loss": 0.7114, "step": 454 }, { "epoch": 0.30465349849347173, "grad_norm": 2.4850702821116477, "learning_rate": 8.148849564805916e-06, "loss": 0.7586, "step": 455 }, { "epoch": 0.3053230666220288, "grad_norm": 2.3788894627974853, "learning_rate": 8.140415600281828e-06, "loss": 0.756, "step": 456 }, { "epoch": 0.30599263475058586, "grad_norm": 2.3294416863036473, "learning_rate": 8.131966853200226e-06, "loss": 0.8119, "step": 457 }, { "epoch": 0.306662202879143, "grad_norm": 2.2445934587975644, "learning_rate": 8.123503363331031e-06, "loss": 0.7291, "step": 458 }, { "epoch": 0.30733177100770004, "grad_norm": 2.0009285696654304, "learning_rate": 8.115025170513568e-06, "loss": 0.7656, "step": 459 }, { "epoch": 0.3080013391362571, "grad_norm": 2.082411897225473, "learning_rate": 8.106532314656365e-06, "loss": 0.7902, "step": 460 }, { "epoch": 0.3086709072648142, "grad_norm": 2.036499493354211, "learning_rate": 8.098024835736977e-06, "loss": 0.696, "step": 461 }, { "epoch": 0.3093404753933713, "grad_norm": 1.8765045421029716, "learning_rate": 8.089502773801789e-06, "loss": 0.7167, "step": 462 }, { "epoch": 0.31001004352192835, "grad_norm": 1.9882595374055634, "learning_rate": 8.080966168965832e-06, "loss": 0.7641, "step": 463 }, { "epoch": 0.3106796116504854, "grad_norm": 1.9973122115138333, "learning_rate": 8.072415061412594e-06, "loss": 0.7078, "step": 464 }, { "epoch": 0.31134917977904253, "grad_norm": 2.1649171571700085, "learning_rate": 8.063849491393831e-06, "loss": 0.7375, "step": 465 }, { "epoch": 0.3120187479075996, "grad_norm": 1.736316515572263, "learning_rate": 8.055269499229375e-06, "loss": 0.7128, "step": 466 }, { "epoch": 0.31268831603615665, "grad_norm": 2.086324293880771, "learning_rate": 8.046675125306948e-06, "loss": 0.7557, "step": 467 }, { "epoch": 0.3133578841647138, "grad_norm": 2.104803176003445, "learning_rate": 8.038066410081966e-06, "loss": 0.6985, "step": 468 }, { "epoch": 0.31402745229327084, "grad_norm": 1.9264069192512328, "learning_rate": 8.029443394077356e-06, "loss": 0.7501, "step": 469 }, { "epoch": 0.3146970204218279, "grad_norm": 2.160594926554674, "learning_rate": 8.02080611788336e-06, "loss": 0.7741, "step": 470 }, { "epoch": 0.315366588550385, "grad_norm": 2.2831686540781715, "learning_rate": 8.012154622157345e-06, "loss": 0.7448, "step": 471 }, { "epoch": 0.3160361566789421, "grad_norm": 2.060831037573437, "learning_rate": 8.003488947623615e-06, "loss": 0.7815, "step": 472 }, { "epoch": 0.31670572480749914, "grad_norm": 2.086273251621193, "learning_rate": 7.994809135073211e-06, "loss": 0.7753, "step": 473 }, { "epoch": 0.31737529293605626, "grad_norm": 2.1560668238114715, "learning_rate": 7.986115225363731e-06, "loss": 0.7496, "step": 474 }, { "epoch": 0.3180448610646133, "grad_norm": 2.20836657405377, "learning_rate": 7.977407259419127e-06, "loss": 0.7027, "step": 475 }, { "epoch": 0.3187144291931704, "grad_norm": 2.294726361572952, "learning_rate": 7.96868527822952e-06, "loss": 0.7637, "step": 476 }, { "epoch": 0.3193839973217275, "grad_norm": 1.9354946669836932, "learning_rate": 7.959949322850994e-06, "loss": 0.729, "step": 477 }, { "epoch": 0.32005356545028457, "grad_norm": 1.731248431915094, "learning_rate": 7.951199434405426e-06, "loss": 0.7437, "step": 478 }, { "epoch": 0.32072313357884163, "grad_norm": 1.9728536227108657, "learning_rate": 7.94243565408027e-06, "loss": 0.6863, "step": 479 }, { "epoch": 0.32139270170739875, "grad_norm": 2.0576820971873575, "learning_rate": 7.933658023128372e-06, "loss": 0.732, "step": 480 }, { "epoch": 0.3220622698359558, "grad_norm": 2.109606247808142, "learning_rate": 7.924866582867779e-06, "loss": 0.7739, "step": 481 }, { "epoch": 0.3227318379645129, "grad_norm": 1.8593410800578598, "learning_rate": 7.916061374681538e-06, "loss": 0.6981, "step": 482 }, { "epoch": 0.32340140609306994, "grad_norm": 3.5212020924571354, "learning_rate": 7.907242440017508e-06, "loss": 0.6724, "step": 483 }, { "epoch": 0.32407097422162706, "grad_norm": 2.2688734831667237, "learning_rate": 7.898409820388159e-06, "loss": 0.7451, "step": 484 }, { "epoch": 0.3247405423501841, "grad_norm": 2.131088860470621, "learning_rate": 7.889563557370378e-06, "loss": 0.7273, "step": 485 }, { "epoch": 0.3254101104787412, "grad_norm": 2.4891641486432303, "learning_rate": 7.880703692605275e-06, "loss": 0.7692, "step": 486 }, { "epoch": 0.3260796786072983, "grad_norm": 1.56390958117357, "learning_rate": 7.87183026779799e-06, "loss": 0.6363, "step": 487 }, { "epoch": 0.32674924673585537, "grad_norm": 1.7913132739644757, "learning_rate": 7.86294332471748e-06, "loss": 0.7493, "step": 488 }, { "epoch": 0.32741881486441243, "grad_norm": 2.0983579885742296, "learning_rate": 7.854042905196355e-06, "loss": 0.7399, "step": 489 }, { "epoch": 0.32808838299296955, "grad_norm": 2.3075408243244158, "learning_rate": 7.845129051130642e-06, "loss": 0.7511, "step": 490 }, { "epoch": 0.3287579511215266, "grad_norm": 2.0396100035209144, "learning_rate": 7.83620180447962e-06, "loss": 0.7502, "step": 491 }, { "epoch": 0.3294275192500837, "grad_norm": 2.678074830241829, "learning_rate": 7.827261207265607e-06, "loss": 0.7615, "step": 492 }, { "epoch": 0.3300970873786408, "grad_norm": 2.174057144321036, "learning_rate": 7.818307301573757e-06, "loss": 0.7391, "step": 493 }, { "epoch": 0.33076665550719786, "grad_norm": 2.2146459423024054, "learning_rate": 7.809340129551878e-06, "loss": 0.7516, "step": 494 }, { "epoch": 0.3314362236357549, "grad_norm": 2.246566755390269, "learning_rate": 7.800359733410225e-06, "loss": 0.7308, "step": 495 }, { "epoch": 0.33210579176431204, "grad_norm": 1.8503111326409885, "learning_rate": 7.791366155421296e-06, "loss": 0.7339, "step": 496 }, { "epoch": 0.3327753598928691, "grad_norm": 2.146046193350473, "learning_rate": 7.782359437919644e-06, "loss": 0.7611, "step": 497 }, { "epoch": 0.33344492802142617, "grad_norm": 1.7648520200510291, "learning_rate": 7.77333962330167e-06, "loss": 0.7669, "step": 498 }, { "epoch": 0.3341144961499833, "grad_norm": 2.134134737227648, "learning_rate": 7.764306754025425e-06, "loss": 0.7328, "step": 499 }, { "epoch": 0.33478406427854035, "grad_norm": 2.524415014866301, "learning_rate": 7.755260872610412e-06, "loss": 0.7404, "step": 500 }, { "epoch": 0.3354536324070974, "grad_norm": 1.856597428933483, "learning_rate": 7.746202021637385e-06, "loss": 0.7288, "step": 501 }, { "epoch": 0.3361232005356545, "grad_norm": 2.257503356198489, "learning_rate": 7.737130243748145e-06, "loss": 0.7793, "step": 502 }, { "epoch": 0.3367927686642116, "grad_norm": 2.0025598012052184, "learning_rate": 7.728045581645349e-06, "loss": 0.7403, "step": 503 }, { "epoch": 0.33746233679276866, "grad_norm": 2.629339312633209, "learning_rate": 7.718948078092297e-06, "loss": 0.821, "step": 504 }, { "epoch": 0.3381319049213257, "grad_norm": 4.535508820638718, "learning_rate": 7.709837775912737e-06, "loss": 0.779, "step": 505 }, { "epoch": 0.33880147304988284, "grad_norm": 1.9719002569851647, "learning_rate": 7.700714717990667e-06, "loss": 0.7104, "step": 506 }, { "epoch": 0.3394710411784399, "grad_norm": 2.2941421809162224, "learning_rate": 7.691578947270122e-06, "loss": 0.7454, "step": 507 }, { "epoch": 0.34014060930699697, "grad_norm": 1.8017044480109818, "learning_rate": 7.682430506754982e-06, "loss": 0.7116, "step": 508 }, { "epoch": 0.3408101774355541, "grad_norm": 2.7477187440781212, "learning_rate": 7.67326943950877e-06, "loss": 0.7311, "step": 509 }, { "epoch": 0.34147974556411115, "grad_norm": 1.987680948248994, "learning_rate": 7.66409578865444e-06, "loss": 0.69, "step": 510 }, { "epoch": 0.3421493136926682, "grad_norm": 2.6876841061497614, "learning_rate": 7.65490959737418e-06, "loss": 0.7546, "step": 511 }, { "epoch": 0.34281888182122533, "grad_norm": 2.514962017037335, "learning_rate": 7.645710908909212e-06, "loss": 0.7891, "step": 512 }, { "epoch": 0.3434884499497824, "grad_norm": 2.115071566634725, "learning_rate": 7.636499766559582e-06, "loss": 0.7501, "step": 513 }, { "epoch": 0.34415801807833946, "grad_norm": 3.247764317500305, "learning_rate": 7.627276213683956e-06, "loss": 0.763, "step": 514 }, { "epoch": 0.3448275862068966, "grad_norm": 2.6862951246915543, "learning_rate": 7.618040293699428e-06, "loss": 0.8034, "step": 515 }, { "epoch": 0.34549715433545364, "grad_norm": 1.8011548509601292, "learning_rate": 7.608792050081295e-06, "loss": 0.6725, "step": 516 }, { "epoch": 0.3461667224640107, "grad_norm": 2.247724154639633, "learning_rate": 7.599531526362873e-06, "loss": 0.7334, "step": 517 }, { "epoch": 0.3468362905925678, "grad_norm": 2.372879412174865, "learning_rate": 7.590258766135277e-06, "loss": 0.7732, "step": 518 }, { "epoch": 0.3475058587211249, "grad_norm": 2.7080534118047974, "learning_rate": 7.580973813047225e-06, "loss": 0.7565, "step": 519 }, { "epoch": 0.34817542684968195, "grad_norm": 2.1854614288973773, "learning_rate": 7.571676710804827e-06, "loss": 0.7021, "step": 520 }, { "epoch": 0.348844994978239, "grad_norm": 2.3720594093502965, "learning_rate": 7.562367503171386e-06, "loss": 0.7565, "step": 521 }, { "epoch": 0.34951456310679613, "grad_norm": 2.179322594839869, "learning_rate": 7.553046233967181e-06, "loss": 0.7196, "step": 522 }, { "epoch": 0.3501841312353532, "grad_norm": 1.9213658858269456, "learning_rate": 7.543712947069269e-06, "loss": 0.7777, "step": 523 }, { "epoch": 0.35085369936391025, "grad_norm": 2.351774794865001, "learning_rate": 7.5343676864112795e-06, "loss": 0.7603, "step": 524 }, { "epoch": 0.3515232674924674, "grad_norm": 2.1601537641167394, "learning_rate": 7.525010495983202e-06, "loss": 0.7714, "step": 525 }, { "epoch": 0.35219283562102444, "grad_norm": 2.2507450308585715, "learning_rate": 7.515641419831188e-06, "loss": 0.742, "step": 526 }, { "epoch": 0.3528624037495815, "grad_norm": 2.3618989635727314, "learning_rate": 7.506260502057325e-06, "loss": 0.7169, "step": 527 }, { "epoch": 0.3535319718781386, "grad_norm": 1.8807667613644752, "learning_rate": 7.496867786819456e-06, "loss": 0.7006, "step": 528 }, { "epoch": 0.3542015400066957, "grad_norm": 2.3983344103815027, "learning_rate": 7.487463318330945e-06, "loss": 0.7004, "step": 529 }, { "epoch": 0.35487110813525274, "grad_norm": 2.05595677712876, "learning_rate": 7.478047140860487e-06, "loss": 0.7325, "step": 530 }, { "epoch": 0.35554067626380986, "grad_norm": 2.2110181416978465, "learning_rate": 7.468619298731893e-06, "loss": 0.6664, "step": 531 }, { "epoch": 0.3562102443923669, "grad_norm": 2.794328468259519, "learning_rate": 7.45917983632388e-06, "loss": 0.7435, "step": 532 }, { "epoch": 0.356879812520924, "grad_norm": 1.9197010199811946, "learning_rate": 7.449728798069864e-06, "loss": 0.7425, "step": 533 }, { "epoch": 0.3575493806494811, "grad_norm": 3.156168282981768, "learning_rate": 7.440266228457756e-06, "loss": 0.8212, "step": 534 }, { "epoch": 0.35821894877803817, "grad_norm": 1.8297810984207254, "learning_rate": 7.4307921720297385e-06, "loss": 0.6668, "step": 535 }, { "epoch": 0.35888851690659523, "grad_norm": 2.096107773667456, "learning_rate": 7.421306673382072e-06, "loss": 0.7382, "step": 536 }, { "epoch": 0.35955808503515235, "grad_norm": 1.859276175647247, "learning_rate": 7.411809777164873e-06, "loss": 0.7213, "step": 537 }, { "epoch": 0.3602276531637094, "grad_norm": 2.4091577205901613, "learning_rate": 7.402301528081915e-06, "loss": 0.7761, "step": 538 }, { "epoch": 0.3608972212922665, "grad_norm": 1.6717556948707037, "learning_rate": 7.392781970890404e-06, "loss": 0.6896, "step": 539 }, { "epoch": 0.36156678942082354, "grad_norm": 2.1086496794115126, "learning_rate": 7.383251150400779e-06, "loss": 0.7143, "step": 540 }, { "epoch": 0.36223635754938066, "grad_norm": 2.40960006974007, "learning_rate": 7.373709111476498e-06, "loss": 0.6919, "step": 541 }, { "epoch": 0.3629059256779377, "grad_norm": 2.6172734718071267, "learning_rate": 7.364155899033827e-06, "loss": 0.7824, "step": 542 }, { "epoch": 0.3635754938064948, "grad_norm": 2.346121704743645, "learning_rate": 7.354591558041627e-06, "loss": 0.7901, "step": 543 }, { "epoch": 0.3642450619350519, "grad_norm": 1.6606227032352279, "learning_rate": 7.345016133521141e-06, "loss": 0.6825, "step": 544 }, { "epoch": 0.36491463006360897, "grad_norm": 2.2015649167009204, "learning_rate": 7.3354296705457894e-06, "loss": 0.7643, "step": 545 }, { "epoch": 0.36558419819216603, "grad_norm": 2.36060246241988, "learning_rate": 7.325832214240949e-06, "loss": 0.7697, "step": 546 }, { "epoch": 0.36625376632072315, "grad_norm": 2.255022559582994, "learning_rate": 7.316223809783745e-06, "loss": 0.7828, "step": 547 }, { "epoch": 0.3669233344492802, "grad_norm": 2.1013870462085777, "learning_rate": 7.306604502402835e-06, "loss": 0.6664, "step": 548 }, { "epoch": 0.3675929025778373, "grad_norm": 1.7771856755897881, "learning_rate": 7.296974337378209e-06, "loss": 0.7624, "step": 549 }, { "epoch": 0.3682624707063944, "grad_norm": 1.977540765866053, "learning_rate": 7.287333360040953e-06, "loss": 0.6873, "step": 550 }, { "epoch": 0.36893203883495146, "grad_norm": 2.0477751349497955, "learning_rate": 7.277681615773055e-06, "loss": 0.6901, "step": 551 }, { "epoch": 0.3696016069635085, "grad_norm": 2.0182457783176244, "learning_rate": 7.268019150007189e-06, "loss": 0.7327, "step": 552 }, { "epoch": 0.37027117509206564, "grad_norm": 2.5298770400851502, "learning_rate": 7.25834600822649e-06, "loss": 0.7226, "step": 553 }, { "epoch": 0.3709407432206227, "grad_norm": 2.137474061728339, "learning_rate": 7.24866223596435e-06, "loss": 0.692, "step": 554 }, { "epoch": 0.37161031134917977, "grad_norm": 2.1619958776728967, "learning_rate": 7.2389678788041996e-06, "loss": 0.7541, "step": 555 }, { "epoch": 0.3722798794777369, "grad_norm": 4.0154195266069515, "learning_rate": 7.229262982379298e-06, "loss": 0.7375, "step": 556 }, { "epoch": 0.37294944760629395, "grad_norm": 2.5199221950798876, "learning_rate": 7.219547592372512e-06, "loss": 0.703, "step": 557 }, { "epoch": 0.373619015734851, "grad_norm": 13.874191180588413, "learning_rate": 7.209821754516104e-06, "loss": 0.7084, "step": 558 }, { "epoch": 0.3742885838634081, "grad_norm": 1.708754374404828, "learning_rate": 7.200085514591518e-06, "loss": 0.6473, "step": 559 }, { "epoch": 0.3749581519919652, "grad_norm": 1.6522815874479984, "learning_rate": 7.1903389184291615e-06, "loss": 0.7026, "step": 560 }, { "epoch": 0.37562772012052226, "grad_norm": 1.882159843473132, "learning_rate": 7.180582011908188e-06, "loss": 0.7277, "step": 561 }, { "epoch": 0.3762972882490793, "grad_norm": 2.04309018828362, "learning_rate": 7.1708148409562905e-06, "loss": 0.7784, "step": 562 }, { "epoch": 0.37696685637763644, "grad_norm": 2.363955588051334, "learning_rate": 7.161037451549473e-06, "loss": 0.7483, "step": 563 }, { "epoch": 0.3776364245061935, "grad_norm": 2.933107771273981, "learning_rate": 7.151249889711842e-06, "loss": 0.6933, "step": 564 }, { "epoch": 0.37830599263475057, "grad_norm": 1.825988988030226, "learning_rate": 7.141452201515386e-06, "loss": 0.6782, "step": 565 }, { "epoch": 0.3789755607633077, "grad_norm": 2.267859732494615, "learning_rate": 7.131644433079766e-06, "loss": 0.7607, "step": 566 }, { "epoch": 0.37964512889186475, "grad_norm": 1.8368116805308627, "learning_rate": 7.121826630572084e-06, "loss": 0.7268, "step": 567 }, { "epoch": 0.3803146970204218, "grad_norm": 2.788947698753989, "learning_rate": 7.111998840206681e-06, "loss": 0.708, "step": 568 }, { "epoch": 0.38098426514897893, "grad_norm": 1.6595969576780742, "learning_rate": 7.102161108244907e-06, "loss": 0.6953, "step": 569 }, { "epoch": 0.381653833277536, "grad_norm": 1.8297553781499412, "learning_rate": 7.0923134809949194e-06, "loss": 0.7351, "step": 570 }, { "epoch": 0.38232340140609306, "grad_norm": 2.4088476408305826, "learning_rate": 7.08245600481144e-06, "loss": 0.7126, "step": 571 }, { "epoch": 0.3829929695346502, "grad_norm": 2.93980487820109, "learning_rate": 7.072588726095565e-06, "loss": 0.6813, "step": 572 }, { "epoch": 0.38366253766320724, "grad_norm": 1.8106036164774397, "learning_rate": 7.062711691294525e-06, "loss": 0.7152, "step": 573 }, { "epoch": 0.3843321057917643, "grad_norm": 2.1445544547456143, "learning_rate": 7.052824946901478e-06, "loss": 0.7397, "step": 574 }, { "epoch": 0.3850016739203214, "grad_norm": 1.8902288217190941, "learning_rate": 7.042928539455285e-06, "loss": 0.6943, "step": 575 }, { "epoch": 0.3856712420488785, "grad_norm": 2.452414587347219, "learning_rate": 7.033022515540293e-06, "loss": 0.735, "step": 576 }, { "epoch": 0.38634081017743555, "grad_norm": 1.8719642749729228, "learning_rate": 7.023106921786118e-06, "loss": 0.693, "step": 577 }, { "epoch": 0.3870103783059926, "grad_norm": 2.1788013836656304, "learning_rate": 7.013181804867421e-06, "loss": 0.7376, "step": 578 }, { "epoch": 0.3876799464345497, "grad_norm": 1.9758546746637256, "learning_rate": 7.003247211503691e-06, "loss": 0.7382, "step": 579 }, { "epoch": 0.3883495145631068, "grad_norm": 2.4122881153190416, "learning_rate": 6.993303188459022e-06, "loss": 0.6758, "step": 580 }, { "epoch": 0.38901908269166385, "grad_norm": 2.4006359195340243, "learning_rate": 6.983349782541901e-06, "loss": 0.7152, "step": 581 }, { "epoch": 0.38968865082022097, "grad_norm": 1.9268478219360747, "learning_rate": 6.973387040604978e-06, "loss": 0.7399, "step": 582 }, { "epoch": 0.39035821894877804, "grad_norm": 3.15353195209888, "learning_rate": 6.963415009544847e-06, "loss": 0.7485, "step": 583 }, { "epoch": 0.3910277870773351, "grad_norm": 1.9117157016138058, "learning_rate": 6.953433736301836e-06, "loss": 0.7293, "step": 584 }, { "epoch": 0.3916973552058922, "grad_norm": 2.3699790901899482, "learning_rate": 6.9434432678597704e-06, "loss": 0.7262, "step": 585 }, { "epoch": 0.3923669233344493, "grad_norm": 2.760455900266047, "learning_rate": 6.933443651245761e-06, "loss": 0.693, "step": 586 }, { "epoch": 0.39303649146300634, "grad_norm": 1.9400106973188151, "learning_rate": 6.9234349335299835e-06, "loss": 0.6903, "step": 587 }, { "epoch": 0.39370605959156346, "grad_norm": 2.176991619604154, "learning_rate": 6.913417161825449e-06, "loss": 0.7568, "step": 588 }, { "epoch": 0.3943756277201205, "grad_norm": 3.7380778121873774, "learning_rate": 6.903390383287795e-06, "loss": 0.7771, "step": 589 }, { "epoch": 0.3950451958486776, "grad_norm": 1.9886619276415751, "learning_rate": 6.893354645115048e-06, "loss": 0.7742, "step": 590 }, { "epoch": 0.3957147639772347, "grad_norm": 2.140070696696506, "learning_rate": 6.883309994547415e-06, "loss": 0.7786, "step": 591 }, { "epoch": 0.39638433210579177, "grad_norm": 2.2990663106422264, "learning_rate": 6.873256478867053e-06, "loss": 0.7836, "step": 592 }, { "epoch": 0.39705390023434883, "grad_norm": 2.020364757323173, "learning_rate": 6.863194145397849e-06, "loss": 0.6731, "step": 593 }, { "epoch": 0.39772346836290595, "grad_norm": 2.0083940254603156, "learning_rate": 6.853123041505197e-06, "loss": 0.6951, "step": 594 }, { "epoch": 0.398393036491463, "grad_norm": 2.1533859982214856, "learning_rate": 6.843043214595777e-06, "loss": 0.7374, "step": 595 }, { "epoch": 0.3990626046200201, "grad_norm": 2.095333264524113, "learning_rate": 6.8329547121173264e-06, "loss": 0.7503, "step": 596 }, { "epoch": 0.39973217274857714, "grad_norm": 1.8057274532516507, "learning_rate": 6.822857581558423e-06, "loss": 0.7731, "step": 597 }, { "epoch": 0.40040174087713426, "grad_norm": 2.3424692938883735, "learning_rate": 6.8127518704482595e-06, "loss": 0.7414, "step": 598 }, { "epoch": 0.4010713090056913, "grad_norm": 2.461008454396458, "learning_rate": 6.802637626356414e-06, "loss": 0.7082, "step": 599 }, { "epoch": 0.4017408771342484, "grad_norm": 2.302660296561005, "learning_rate": 6.792514896892639e-06, "loss": 0.7035, "step": 600 }, { "epoch": 0.4017408771342484, "eval_loss": 0.7316640019416809, "eval_runtime": 480.4551, "eval_samples_per_second": 41.858, "eval_steps_per_second": 0.656, "step": 600 }, { "epoch": 0.4024104452628055, "grad_norm": 2.052668153130148, "learning_rate": 6.782383729706619e-06, "loss": 0.7303, "step": 601 }, { "epoch": 0.40308001339136257, "grad_norm": 2.1127956279729996, "learning_rate": 6.772244172487767e-06, "loss": 0.7161, "step": 602 }, { "epoch": 0.40374958151991963, "grad_norm": 2.059231430066789, "learning_rate": 6.762096272964982e-06, "loss": 0.7329, "step": 603 }, { "epoch": 0.40441914964847675, "grad_norm": 2.261760818575989, "learning_rate": 6.7519400789064375e-06, "loss": 0.7466, "step": 604 }, { "epoch": 0.4050887177770338, "grad_norm": 1.9469631014789435, "learning_rate": 6.741775638119345e-06, "loss": 0.6619, "step": 605 }, { "epoch": 0.4057582859055909, "grad_norm": 2.413319423085223, "learning_rate": 6.731602998449739e-06, "loss": 0.7303, "step": 606 }, { "epoch": 0.406427854034148, "grad_norm": 2.192218706679367, "learning_rate": 6.721422207782249e-06, "loss": 0.8397, "step": 607 }, { "epoch": 0.40709742216270506, "grad_norm": 2.1442876827039883, "learning_rate": 6.711233314039868e-06, "loss": 0.7883, "step": 608 }, { "epoch": 0.4077669902912621, "grad_norm": 1.9409091240907506, "learning_rate": 6.701036365183737e-06, "loss": 0.7091, "step": 609 }, { "epoch": 0.40843655841981924, "grad_norm": 2.131815598609806, "learning_rate": 6.6908314092129105e-06, "loss": 0.7178, "step": 610 }, { "epoch": 0.4091061265483763, "grad_norm": 2.2630257980578232, "learning_rate": 6.680618494164135e-06, "loss": 0.802, "step": 611 }, { "epoch": 0.40977569467693337, "grad_norm": 1.873416036176086, "learning_rate": 6.6703976681116204e-06, "loss": 0.7608, "step": 612 }, { "epoch": 0.4104452628054905, "grad_norm": 2.0120115731610513, "learning_rate": 6.66016897916682e-06, "loss": 0.7439, "step": 613 }, { "epoch": 0.41111483093404755, "grad_norm": 1.975758206307219, "learning_rate": 6.649932475478196e-06, "loss": 0.7756, "step": 614 }, { "epoch": 0.4117843990626046, "grad_norm": 1.9174788282627677, "learning_rate": 6.639688205230994e-06, "loss": 0.7155, "step": 615 }, { "epoch": 0.4124539671911617, "grad_norm": 2.034221810217612, "learning_rate": 6.629436216647023e-06, "loss": 0.7132, "step": 616 }, { "epoch": 0.4131235353197188, "grad_norm": 1.9513440264127635, "learning_rate": 6.6191765579844205e-06, "loss": 0.6948, "step": 617 }, { "epoch": 0.41379310344827586, "grad_norm": 2.0899312758965607, "learning_rate": 6.60890927753743e-06, "loss": 0.8003, "step": 618 }, { "epoch": 0.4144626715768329, "grad_norm": 2.274769195021846, "learning_rate": 6.59863442363617e-06, "loss": 0.7892, "step": 619 }, { "epoch": 0.41513223970539004, "grad_norm": 1.976380881232256, "learning_rate": 6.588352044646412e-06, "loss": 0.7215, "step": 620 }, { "epoch": 0.4158018078339471, "grad_norm": 2.298687032778965, "learning_rate": 6.57806218896935e-06, "loss": 0.7219, "step": 621 }, { "epoch": 0.41647137596250416, "grad_norm": 2.514207022139248, "learning_rate": 6.5677649050413675e-06, "loss": 0.7737, "step": 622 }, { "epoch": 0.4171409440910613, "grad_norm": 1.8378531176139836, "learning_rate": 6.557460241333817e-06, "loss": 0.6994, "step": 623 }, { "epoch": 0.41781051221961835, "grad_norm": 2.0974377086306473, "learning_rate": 6.547148246352792e-06, "loss": 0.6903, "step": 624 }, { "epoch": 0.4184800803481754, "grad_norm": 2.1147211902334635, "learning_rate": 6.5368289686388906e-06, "loss": 0.7066, "step": 625 }, { "epoch": 0.41914964847673253, "grad_norm": 1.8543613577240545, "learning_rate": 6.526502456766996e-06, "loss": 0.6816, "step": 626 }, { "epoch": 0.4198192166052896, "grad_norm": 8.745699449024615, "learning_rate": 6.5161687593460395e-06, "loss": 0.7373, "step": 627 }, { "epoch": 0.42048878473384665, "grad_norm": 2.3284886835871874, "learning_rate": 6.505827925018784e-06, "loss": 0.7852, "step": 628 }, { "epoch": 0.4211583528624038, "grad_norm": 2.7367242912968552, "learning_rate": 6.495480002461577e-06, "loss": 0.6827, "step": 629 }, { "epoch": 0.42182792099096084, "grad_norm": 2.0023583657260065, "learning_rate": 6.485125040384144e-06, "loss": 0.7382, "step": 630 }, { "epoch": 0.4224974891195179, "grad_norm": 2.0318993203518465, "learning_rate": 6.474763087529332e-06, "loss": 0.7218, "step": 631 }, { "epoch": 0.423167057248075, "grad_norm": 1.9785703028400952, "learning_rate": 6.4643941926729105e-06, "loss": 0.6941, "step": 632 }, { "epoch": 0.4238366253766321, "grad_norm": 2.1657858161959265, "learning_rate": 6.454018404623313e-06, "loss": 0.6786, "step": 633 }, { "epoch": 0.42450619350518914, "grad_norm": 2.5475124271040888, "learning_rate": 6.443635772221431e-06, "loss": 0.731, "step": 634 }, { "epoch": 0.4251757616337462, "grad_norm": 2.178717731579941, "learning_rate": 6.433246344340366e-06, "loss": 0.7581, "step": 635 }, { "epoch": 0.4258453297623033, "grad_norm": 2.1171591858958916, "learning_rate": 6.422850169885212e-06, "loss": 0.7168, "step": 636 }, { "epoch": 0.4265148978908604, "grad_norm": 2.620865711241223, "learning_rate": 6.412447297792818e-06, "loss": 0.7542, "step": 637 }, { "epoch": 0.42718446601941745, "grad_norm": 2.3397392381611892, "learning_rate": 6.402037777031563e-06, "loss": 0.7334, "step": 638 }, { "epoch": 0.42785403414797457, "grad_norm": 3.1111559844692085, "learning_rate": 6.39162165660112e-06, "loss": 0.8028, "step": 639 }, { "epoch": 0.42852360227653163, "grad_norm": 2.082202537894563, "learning_rate": 6.381198985532226e-06, "loss": 0.7264, "step": 640 }, { "epoch": 0.4291931704050887, "grad_norm": 1.9686119302460052, "learning_rate": 6.370769812886459e-06, "loss": 0.7114, "step": 641 }, { "epoch": 0.4298627385336458, "grad_norm": 2.0279561611759496, "learning_rate": 6.360334187755995e-06, "loss": 0.7414, "step": 642 }, { "epoch": 0.4305323066622029, "grad_norm": 2.0123082567900696, "learning_rate": 6.34989215926339e-06, "loss": 0.7428, "step": 643 }, { "epoch": 0.43120187479075994, "grad_norm": 1.8467948033005894, "learning_rate": 6.3394437765613315e-06, "loss": 0.6739, "step": 644 }, { "epoch": 0.43187144291931706, "grad_norm": 2.2389503412677723, "learning_rate": 6.328989088832431e-06, "loss": 0.7138, "step": 645 }, { "epoch": 0.4325410110478741, "grad_norm": 2.3046125467328706, "learning_rate": 6.318528145288967e-06, "loss": 0.6728, "step": 646 }, { "epoch": 0.4332105791764312, "grad_norm": 2.1994545152138993, "learning_rate": 6.308060995172673e-06, "loss": 0.6992, "step": 647 }, { "epoch": 0.4338801473049883, "grad_norm": 2.449036733550825, "learning_rate": 6.297587687754494e-06, "loss": 0.758, "step": 648 }, { "epoch": 0.43454971543354537, "grad_norm": 2.856821972127845, "learning_rate": 6.28710827233436e-06, "loss": 0.7604, "step": 649 }, { "epoch": 0.43521928356210243, "grad_norm": 2.805420369735936, "learning_rate": 6.276622798240953e-06, "loss": 0.7828, "step": 650 }, { "epoch": 0.43588885169065955, "grad_norm": 2.0280480299494834, "learning_rate": 6.266131314831474e-06, "loss": 0.7451, "step": 651 }, { "epoch": 0.4365584198192166, "grad_norm": 2.284835649544725, "learning_rate": 6.255633871491411e-06, "loss": 0.8106, "step": 652 }, { "epoch": 0.4372279879477737, "grad_norm": 2.061873715445228, "learning_rate": 6.245130517634307e-06, "loss": 0.6492, "step": 653 }, { "epoch": 0.43789755607633074, "grad_norm": 1.8145706719208559, "learning_rate": 6.2346213027015245e-06, "loss": 0.6545, "step": 654 }, { "epoch": 0.43856712420488786, "grad_norm": 2.306281047388641, "learning_rate": 6.224106276162021e-06, "loss": 0.7064, "step": 655 }, { "epoch": 0.4392366923334449, "grad_norm": 3.844657813123945, "learning_rate": 6.213585487512104e-06, "loss": 0.7477, "step": 656 }, { "epoch": 0.439906260462002, "grad_norm": 1.969483659415022, "learning_rate": 6.203058986275207e-06, "loss": 0.7176, "step": 657 }, { "epoch": 0.4405758285905591, "grad_norm": 2.2452467686133013, "learning_rate": 6.192526822001653e-06, "loss": 0.7482, "step": 658 }, { "epoch": 0.44124539671911617, "grad_norm": 2.3377085498348835, "learning_rate": 6.181989044268426e-06, "loss": 0.7182, "step": 659 }, { "epoch": 0.44191496484767323, "grad_norm": 1.907157377681982, "learning_rate": 6.171445702678929e-06, "loss": 0.7151, "step": 660 }, { "epoch": 0.44258453297623035, "grad_norm": 1.9500584169673754, "learning_rate": 6.160896846862754e-06, "loss": 0.777, "step": 661 }, { "epoch": 0.4432541011047874, "grad_norm": 2.763162237113587, "learning_rate": 6.150342526475457e-06, "loss": 0.7146, "step": 662 }, { "epoch": 0.4439236692333445, "grad_norm": 2.1505467326032206, "learning_rate": 6.139782791198309e-06, "loss": 0.7375, "step": 663 }, { "epoch": 0.4445932373619016, "grad_norm": 1.9810085423366004, "learning_rate": 6.129217690738075e-06, "loss": 0.6936, "step": 664 }, { "epoch": 0.44526280549045866, "grad_norm": 1.9509150759189502, "learning_rate": 6.11864727482677e-06, "loss": 0.7125, "step": 665 }, { "epoch": 0.4459323736190157, "grad_norm": 2.0205307347003942, "learning_rate": 6.108071593221436e-06, "loss": 0.7508, "step": 666 }, { "epoch": 0.44660194174757284, "grad_norm": 1.9531350064446338, "learning_rate": 6.097490695703896e-06, "loss": 0.7454, "step": 667 }, { "epoch": 0.4472715098761299, "grad_norm": 2.789841815306139, "learning_rate": 6.08690463208053e-06, "loss": 0.6366, "step": 668 }, { "epoch": 0.44794107800468697, "grad_norm": 2.1039236208805123, "learning_rate": 6.076313452182033e-06, "loss": 0.6911, "step": 669 }, { "epoch": 0.4486106461332441, "grad_norm": 1.6900590116246164, "learning_rate": 6.065717205863184e-06, "loss": 0.7087, "step": 670 }, { "epoch": 0.44928021426180115, "grad_norm": 1.8647345824050408, "learning_rate": 6.055115943002612e-06, "loss": 0.7073, "step": 671 }, { "epoch": 0.4499497823903582, "grad_norm": 2.312522422092974, "learning_rate": 6.044509713502555e-06, "loss": 0.6515, "step": 672 }, { "epoch": 0.4506193505189153, "grad_norm": 2.9603871532213146, "learning_rate": 6.033898567288638e-06, "loss": 0.7426, "step": 673 }, { "epoch": 0.4512889186474724, "grad_norm": 2.404625095513058, "learning_rate": 6.0232825543096215e-06, "loss": 0.7767, "step": 674 }, { "epoch": 0.45195848677602946, "grad_norm": 2.1380127971282787, "learning_rate": 6.012661724537181e-06, "loss": 0.7447, "step": 675 }, { "epoch": 0.4526280549045865, "grad_norm": 2.0060043181901634, "learning_rate": 6.002036127965664e-06, "loss": 0.6774, "step": 676 }, { "epoch": 0.45329762303314364, "grad_norm": 2.0092026061016477, "learning_rate": 5.991405814611855e-06, "loss": 0.714, "step": 677 }, { "epoch": 0.4539671911617007, "grad_norm": 2.0577007070951243, "learning_rate": 5.9807708345147405e-06, "loss": 0.7116, "step": 678 }, { "epoch": 0.45463675929025776, "grad_norm": 2.925386533563134, "learning_rate": 5.970131237735278e-06, "loss": 0.8565, "step": 679 }, { "epoch": 0.4553063274188149, "grad_norm": 2.4541402000266697, "learning_rate": 5.959487074356154e-06, "loss": 0.7274, "step": 680 }, { "epoch": 0.45597589554737195, "grad_norm": 2.2267995478876017, "learning_rate": 5.948838394481551e-06, "loss": 0.7292, "step": 681 }, { "epoch": 0.456645463675929, "grad_norm": 2.1096516573557627, "learning_rate": 5.938185248236911e-06, "loss": 0.7494, "step": 682 }, { "epoch": 0.4573150318044861, "grad_norm": 2.8245263115597266, "learning_rate": 5.9275276857687e-06, "loss": 0.7911, "step": 683 }, { "epoch": 0.4579845999330432, "grad_norm": 1.8723965512763308, "learning_rate": 5.916865757244177e-06, "loss": 0.7146, "step": 684 }, { "epoch": 0.45865416806160025, "grad_norm": 2.1580487457185047, "learning_rate": 5.9061995128511455e-06, "loss": 0.7246, "step": 685 }, { "epoch": 0.4593237361901574, "grad_norm": 1.9120871578189345, "learning_rate": 5.895529002797728e-06, "loss": 0.7183, "step": 686 }, { "epoch": 0.45999330431871444, "grad_norm": 2.095828863329562, "learning_rate": 5.8848542773121285e-06, "loss": 0.7467, "step": 687 }, { "epoch": 0.4606628724472715, "grad_norm": 2.92143521969414, "learning_rate": 5.87417538664239e-06, "loss": 0.7201, "step": 688 }, { "epoch": 0.4613324405758286, "grad_norm": 2.2162327159709254, "learning_rate": 5.863492381056164e-06, "loss": 0.7427, "step": 689 }, { "epoch": 0.4620020087043857, "grad_norm": 2.675500897243712, "learning_rate": 5.852805310840471e-06, "loss": 0.6987, "step": 690 }, { "epoch": 0.46267157683294274, "grad_norm": 2.042230179753257, "learning_rate": 5.842114226301466e-06, "loss": 0.751, "step": 691 }, { "epoch": 0.4633411449614998, "grad_norm": 2.4724057179665544, "learning_rate": 5.831419177764199e-06, "loss": 0.7871, "step": 692 }, { "epoch": 0.4640107130900569, "grad_norm": 2.126641106047325, "learning_rate": 5.820720215572375e-06, "loss": 0.7254, "step": 693 }, { "epoch": 0.464680281218614, "grad_norm": 2.197868472351805, "learning_rate": 5.810017390088133e-06, "loss": 0.7624, "step": 694 }, { "epoch": 0.46534984934717105, "grad_norm": 2.5569464569389733, "learning_rate": 5.799310751691783e-06, "loss": 0.7331, "step": 695 }, { "epoch": 0.46601941747572817, "grad_norm": 2.069568569524508, "learning_rate": 5.788600350781596e-06, "loss": 0.7554, "step": 696 }, { "epoch": 0.46668898560428523, "grad_norm": 1.9457244891915177, "learning_rate": 5.777886237773542e-06, "loss": 0.7227, "step": 697 }, { "epoch": 0.4673585537328423, "grad_norm": 1.9814470457835611, "learning_rate": 5.767168463101076e-06, "loss": 0.6822, "step": 698 }, { "epoch": 0.4680281218613994, "grad_norm": 1.9973661723605594, "learning_rate": 5.75644707721488e-06, "loss": 0.7596, "step": 699 }, { "epoch": 0.4686976899899565, "grad_norm": 2.48590188457019, "learning_rate": 5.7457221305826396e-06, "loss": 0.7568, "step": 700 }, { "epoch": 0.46936725811851354, "grad_norm": 2.1484110427175565, "learning_rate": 5.734993673688801e-06, "loss": 0.8, "step": 701 }, { "epoch": 0.47003682624707066, "grad_norm": 1.979370138844269, "learning_rate": 5.724261757034334e-06, "loss": 0.7164, "step": 702 }, { "epoch": 0.4707063943756277, "grad_norm": 2.1452248026779333, "learning_rate": 5.713526431136491e-06, "loss": 0.7118, "step": 703 }, { "epoch": 0.4713759625041848, "grad_norm": 1.7987878681752634, "learning_rate": 5.702787746528577e-06, "loss": 0.7216, "step": 704 }, { "epoch": 0.4720455306327419, "grad_norm": 2.3896476559676643, "learning_rate": 5.6920457537597015e-06, "loss": 0.7145, "step": 705 }, { "epoch": 0.47271509876129897, "grad_norm": 2.467662889854617, "learning_rate": 5.681300503394555e-06, "loss": 0.7441, "step": 706 }, { "epoch": 0.47338466688985603, "grad_norm": 1.9618173575207924, "learning_rate": 5.670552046013151e-06, "loss": 0.6994, "step": 707 }, { "epoch": 0.47405423501841315, "grad_norm": 1.8112328748996203, "learning_rate": 5.65980043221061e-06, "loss": 0.7142, "step": 708 }, { "epoch": 0.4747238031469702, "grad_norm": 1.8960540871656422, "learning_rate": 5.6490457125969035e-06, "loss": 0.6674, "step": 709 }, { "epoch": 0.4753933712755273, "grad_norm": 2.5297746466734434, "learning_rate": 5.638287937796622e-06, "loss": 0.7734, "step": 710 }, { "epoch": 0.47606293940408434, "grad_norm": 1.8118526980352492, "learning_rate": 5.627527158448742e-06, "loss": 0.7202, "step": 711 }, { "epoch": 0.47673250753264146, "grad_norm": 2.3171696731365476, "learning_rate": 5.61676342520638e-06, "loss": 0.658, "step": 712 }, { "epoch": 0.4774020756611985, "grad_norm": 1.8142903960531491, "learning_rate": 5.60599678873656e-06, "loss": 0.7347, "step": 713 }, { "epoch": 0.4780716437897556, "grad_norm": 2.1738428040978777, "learning_rate": 5.595227299719966e-06, "loss": 0.5792, "step": 714 }, { "epoch": 0.4787412119183127, "grad_norm": 4.915943319910834, "learning_rate": 5.5844550088507175e-06, "loss": 0.6757, "step": 715 }, { "epoch": 0.47941078004686977, "grad_norm": 2.0054578432620533, "learning_rate": 5.573679966836116e-06, "loss": 0.7428, "step": 716 }, { "epoch": 0.48008034817542683, "grad_norm": 2.3597037072590235, "learning_rate": 5.562902224396416e-06, "loss": 0.7606, "step": 717 }, { "epoch": 0.48074991630398395, "grad_norm": 1.983912175567396, "learning_rate": 5.552121832264582e-06, "loss": 0.7061, "step": 718 }, { "epoch": 0.481419484432541, "grad_norm": 2.1765938358958223, "learning_rate": 5.541338841186057e-06, "loss": 0.731, "step": 719 }, { "epoch": 0.4820890525610981, "grad_norm": 2.6929903073427695, "learning_rate": 5.53055330191851e-06, "loss": 0.717, "step": 720 }, { "epoch": 0.4827586206896552, "grad_norm": 1.976518347796674, "learning_rate": 5.519765265231609e-06, "loss": 0.7668, "step": 721 }, { "epoch": 0.48342818881821226, "grad_norm": 2.4169507549078055, "learning_rate": 5.508974781906776e-06, "loss": 0.7877, "step": 722 }, { "epoch": 0.4840977569467693, "grad_norm": 2.3419277581039544, "learning_rate": 5.4981819027369525e-06, "loss": 0.7096, "step": 723 }, { "epoch": 0.48476732507532644, "grad_norm": 1.900468002654568, "learning_rate": 5.487386678526355e-06, "loss": 0.7264, "step": 724 }, { "epoch": 0.4854368932038835, "grad_norm": 3.1923036264191387, "learning_rate": 5.476589160090238e-06, "loss": 0.7118, "step": 725 }, { "epoch": 0.48610646133244056, "grad_norm": 2.239853356972794, "learning_rate": 5.465789398254661e-06, "loss": 0.7701, "step": 726 }, { "epoch": 0.4867760294609977, "grad_norm": 2.760622144597342, "learning_rate": 5.454987443856235e-06, "loss": 0.7669, "step": 727 }, { "epoch": 0.48744559758955475, "grad_norm": 2.103767482817494, "learning_rate": 5.444183347741899e-06, "loss": 0.7836, "step": 728 }, { "epoch": 0.4881151657181118, "grad_norm": 1.710313288113687, "learning_rate": 5.433377160768669e-06, "loss": 0.6839, "step": 729 }, { "epoch": 0.4887847338466689, "grad_norm": 1.8687517769430737, "learning_rate": 5.422568933803407e-06, "loss": 0.7087, "step": 730 }, { "epoch": 0.489454301975226, "grad_norm": 2.1380188971317895, "learning_rate": 5.411758717722575e-06, "loss": 0.756, "step": 731 }, { "epoch": 0.49012387010378305, "grad_norm": 2.799648815856403, "learning_rate": 5.4009465634119975e-06, "loss": 0.6941, "step": 732 }, { "epoch": 0.4907934382323401, "grad_norm": 2.014579154315366, "learning_rate": 5.390132521766626e-06, "loss": 0.671, "step": 733 }, { "epoch": 0.49146300636089724, "grad_norm": 2.1572835951220655, "learning_rate": 5.379316643690292e-06, "loss": 0.6706, "step": 734 }, { "epoch": 0.4921325744894543, "grad_norm": 1.9764659936079838, "learning_rate": 5.368498980095474e-06, "loss": 0.7312, "step": 735 }, { "epoch": 0.49280214261801136, "grad_norm": 1.946189629480173, "learning_rate": 5.357679581903054e-06, "loss": 0.7456, "step": 736 }, { "epoch": 0.4934717107465685, "grad_norm": 2.389228693890119, "learning_rate": 5.34685850004208e-06, "loss": 0.7362, "step": 737 }, { "epoch": 0.49414127887512554, "grad_norm": 2.1052337726102324, "learning_rate": 5.336035785449527e-06, "loss": 0.6334, "step": 738 }, { "epoch": 0.4948108470036826, "grad_norm": 2.992826592107677, "learning_rate": 5.325211489070051e-06, "loss": 0.7196, "step": 739 }, { "epoch": 0.4954804151322397, "grad_norm": 1.9844899629406647, "learning_rate": 5.314385661855759e-06, "loss": 0.6555, "step": 740 }, { "epoch": 0.4961499832607968, "grad_norm": 2.172067392093458, "learning_rate": 5.30355835476596e-06, "loss": 0.6856, "step": 741 }, { "epoch": 0.49681955138935385, "grad_norm": 2.330486209150731, "learning_rate": 5.29272961876693e-06, "loss": 0.7017, "step": 742 }, { "epoch": 0.49748911951791097, "grad_norm": 2.089112315892574, "learning_rate": 5.281899504831672e-06, "loss": 0.6606, "step": 743 }, { "epoch": 0.49815868764646803, "grad_norm": 2.2463004846938173, "learning_rate": 5.271068063939675e-06, "loss": 0.7313, "step": 744 }, { "epoch": 0.4988282557750251, "grad_norm": 2.5763441324003336, "learning_rate": 5.2602353470766764e-06, "loss": 0.7227, "step": 745 }, { "epoch": 0.4994978239035822, "grad_norm": 2.0887083823350046, "learning_rate": 5.249401405234412e-06, "loss": 0.7357, "step": 746 }, { "epoch": 0.5001673920321392, "grad_norm": 3.9924469000816334, "learning_rate": 5.238566289410396e-06, "loss": 0.7087, "step": 747 }, { "epoch": 0.5008369601606963, "grad_norm": 2.0339217143177764, "learning_rate": 5.2277300506076575e-06, "loss": 0.7165, "step": 748 }, { "epoch": 0.5015065282892535, "grad_norm": 2.0517452406988834, "learning_rate": 5.216892739834519e-06, "loss": 0.6718, "step": 749 }, { "epoch": 0.5021760964178105, "grad_norm": 2.312658410114287, "learning_rate": 5.206054408104343e-06, "loss": 0.7261, "step": 750 }, { "epoch": 0.5021760964178105, "eval_loss": 0.7194977402687073, "eval_runtime": 442.0132, "eval_samples_per_second": 45.499, "eval_steps_per_second": 0.713, "step": 750 }, { "epoch": 0.5028456645463676, "grad_norm": 1.877677072177533, "learning_rate": 5.195215106435305e-06, "loss": 0.7468, "step": 751 }, { "epoch": 0.5035152326749247, "grad_norm": 2.1077634438540427, "learning_rate": 5.18437488585014e-06, "loss": 0.727, "step": 752 }, { "epoch": 0.5041848008034817, "grad_norm": 1.9665272168488421, "learning_rate": 5.17353379737591e-06, "loss": 0.6933, "step": 753 }, { "epoch": 0.5048543689320388, "grad_norm": 1.793427168197155, "learning_rate": 5.1626918920437666e-06, "loss": 0.7184, "step": 754 }, { "epoch": 0.505523937060596, "grad_norm": 2.244490885869009, "learning_rate": 5.151849220888701e-06, "loss": 0.7185, "step": 755 }, { "epoch": 0.506193505189153, "grad_norm": 1.831437193881894, "learning_rate": 5.1410058349493115e-06, "loss": 0.7834, "step": 756 }, { "epoch": 0.5068630733177101, "grad_norm": 2.0954530375339897, "learning_rate": 5.13016178526756e-06, "loss": 0.7484, "step": 757 }, { "epoch": 0.5075326414462672, "grad_norm": 2.0008908530891403, "learning_rate": 5.119317122888536e-06, "loss": 0.7072, "step": 758 }, { "epoch": 0.5082022095748242, "grad_norm": 1.7995962538001222, "learning_rate": 5.108471898860209e-06, "loss": 0.6459, "step": 759 }, { "epoch": 0.5088717777033813, "grad_norm": 2.44198210256266, "learning_rate": 5.097626164233195e-06, "loss": 0.7201, "step": 760 }, { "epoch": 0.5095413458319384, "grad_norm": 1.97038966190233, "learning_rate": 5.0867799700605105e-06, "loss": 0.7139, "step": 761 }, { "epoch": 0.5102109139604954, "grad_norm": 1.96315444325917, "learning_rate": 5.075933367397341e-06, "loss": 0.7559, "step": 762 }, { "epoch": 0.5108804820890526, "grad_norm": 2.373878769590976, "learning_rate": 5.065086407300788e-06, "loss": 0.7365, "step": 763 }, { "epoch": 0.5115500502176097, "grad_norm": 1.7417136880839625, "learning_rate": 5.054239140829637e-06, "loss": 0.6512, "step": 764 }, { "epoch": 0.5122196183461667, "grad_norm": 2.2229064911092276, "learning_rate": 5.043391619044122e-06, "loss": 0.6771, "step": 765 }, { "epoch": 0.5128891864747238, "grad_norm": 2.0969025886609516, "learning_rate": 5.032543893005674e-06, "loss": 0.7003, "step": 766 }, { "epoch": 0.5135587546032809, "grad_norm": 2.676153048536881, "learning_rate": 5.0216960137766805e-06, "loss": 0.7013, "step": 767 }, { "epoch": 0.5142283227318379, "grad_norm": 1.7222185180982599, "learning_rate": 5.010848032420258e-06, "loss": 0.6631, "step": 768 }, { "epoch": 0.5148978908603951, "grad_norm": 2.034637475983363, "learning_rate": 5e-06, "loss": 0.6983, "step": 769 }, { "epoch": 0.5155674589889522, "grad_norm": 1.951282656928074, "learning_rate": 4.989151967579744e-06, "loss": 0.7394, "step": 770 }, { "epoch": 0.5162370271175092, "grad_norm": 1.982460678032438, "learning_rate": 4.978303986223322e-06, "loss": 0.6951, "step": 771 }, { "epoch": 0.5169065952460663, "grad_norm": 2.510123052897184, "learning_rate": 4.967456106994329e-06, "loss": 0.7426, "step": 772 }, { "epoch": 0.5175761633746234, "grad_norm": 1.7208831879402, "learning_rate": 4.956608380955877e-06, "loss": 0.6664, "step": 773 }, { "epoch": 0.5182457315031804, "grad_norm": 2.2080673337384296, "learning_rate": 4.9457608591703635e-06, "loss": 0.761, "step": 774 }, { "epoch": 0.5189152996317375, "grad_norm": 1.9610183917990358, "learning_rate": 4.934913592699214e-06, "loss": 0.694, "step": 775 }, { "epoch": 0.5195848677602946, "grad_norm": 1.9351930585718387, "learning_rate": 4.924066632602662e-06, "loss": 0.704, "step": 776 }, { "epoch": 0.5202544358888517, "grad_norm": 2.0822707153342366, "learning_rate": 4.913220029939491e-06, "loss": 0.6812, "step": 777 }, { "epoch": 0.5209240040174088, "grad_norm": 2.293029384310095, "learning_rate": 4.902373835766806e-06, "loss": 0.7302, "step": 778 }, { "epoch": 0.5215935721459658, "grad_norm": 1.8509835783524797, "learning_rate": 4.891528101139793e-06, "loss": 0.6846, "step": 779 }, { "epoch": 0.5222631402745229, "grad_norm": 2.3937053090451252, "learning_rate": 4.8806828771114655e-06, "loss": 0.6553, "step": 780 }, { "epoch": 0.52293270840308, "grad_norm": 2.138915980359658, "learning_rate": 4.869838214732441e-06, "loss": 0.6415, "step": 781 }, { "epoch": 0.523602276531637, "grad_norm": 2.3283739843097253, "learning_rate": 4.85899416505069e-06, "loss": 0.758, "step": 782 }, { "epoch": 0.5242718446601942, "grad_norm": 1.9468672582660926, "learning_rate": 4.848150779111301e-06, "loss": 0.6523, "step": 783 }, { "epoch": 0.5249414127887513, "grad_norm": 2.1765114833836234, "learning_rate": 4.837308107956233e-06, "loss": 0.7101, "step": 784 }, { "epoch": 0.5256109809173083, "grad_norm": 2.459994834064646, "learning_rate": 4.826466202624091e-06, "loss": 0.7523, "step": 785 }, { "epoch": 0.5262805490458654, "grad_norm": 2.286580731711994, "learning_rate": 4.815625114149862e-06, "loss": 0.6707, "step": 786 }, { "epoch": 0.5269501171744225, "grad_norm": 1.883440557905891, "learning_rate": 4.804784893564697e-06, "loss": 0.6106, "step": 787 }, { "epoch": 0.5276196853029795, "grad_norm": 1.8470051173062276, "learning_rate": 4.793945591895659e-06, "loss": 0.7414, "step": 788 }, { "epoch": 0.5282892534315367, "grad_norm": 2.349983366367014, "learning_rate": 4.783107260165483e-06, "loss": 0.677, "step": 789 }, { "epoch": 0.5289588215600938, "grad_norm": 1.8319228240772971, "learning_rate": 4.772269949392345e-06, "loss": 0.6731, "step": 790 }, { "epoch": 0.5296283896886508, "grad_norm": 2.105990897091881, "learning_rate": 4.761433710589606e-06, "loss": 0.6537, "step": 791 }, { "epoch": 0.5302979578172079, "grad_norm": 2.2412299356456837, "learning_rate": 4.750598594765588e-06, "loss": 0.7591, "step": 792 }, { "epoch": 0.530967525945765, "grad_norm": 1.7161729252256834, "learning_rate": 4.739764652923327e-06, "loss": 0.6746, "step": 793 }, { "epoch": 0.531637094074322, "grad_norm": 1.8396388257447545, "learning_rate": 4.728931936060326e-06, "loss": 0.6749, "step": 794 }, { "epoch": 0.5323066622028791, "grad_norm": 1.9135099262982314, "learning_rate": 4.718100495168329e-06, "loss": 0.7398, "step": 795 }, { "epoch": 0.5329762303314363, "grad_norm": 2.004589811011184, "learning_rate": 4.707270381233073e-06, "loss": 0.6949, "step": 796 }, { "epoch": 0.5336457984599933, "grad_norm": 1.8940747176878188, "learning_rate": 4.696441645234042e-06, "loss": 0.6631, "step": 797 }, { "epoch": 0.5343153665885504, "grad_norm": 2.5169930120227506, "learning_rate": 4.685614338144242e-06, "loss": 0.7215, "step": 798 }, { "epoch": 0.5349849347171075, "grad_norm": 2.730940309961668, "learning_rate": 4.6747885109299504e-06, "loss": 0.7504, "step": 799 }, { "epoch": 0.5356545028456645, "grad_norm": 2.0361988145016165, "learning_rate": 4.663964214550475e-06, "loss": 0.7382, "step": 800 }, { "epoch": 0.5363240709742216, "grad_norm": 1.891905593843104, "learning_rate": 4.65314149995792e-06, "loss": 0.7672, "step": 801 }, { "epoch": 0.5369936391027788, "grad_norm": 1.952646313278564, "learning_rate": 4.642320418096948e-06, "loss": 0.6851, "step": 802 }, { "epoch": 0.5376632072313358, "grad_norm": 1.7835616292309695, "learning_rate": 4.631501019904528e-06, "loss": 0.6861, "step": 803 }, { "epoch": 0.5383327753598929, "grad_norm": 2.0783981400739306, "learning_rate": 4.620683356309711e-06, "loss": 0.725, "step": 804 }, { "epoch": 0.53900234348845, "grad_norm": 2.163153857327829, "learning_rate": 4.609867478233377e-06, "loss": 0.7267, "step": 805 }, { "epoch": 0.539671911617007, "grad_norm": 1.878941427068904, "learning_rate": 4.5990534365880024e-06, "loss": 0.6697, "step": 806 }, { "epoch": 0.5403414797455641, "grad_norm": 1.8689351983225624, "learning_rate": 4.588241282277428e-06, "loss": 0.6747, "step": 807 }, { "epoch": 0.5410110478741212, "grad_norm": 1.9151067983613053, "learning_rate": 4.577431066196594e-06, "loss": 0.7261, "step": 808 }, { "epoch": 0.5416806160026783, "grad_norm": 1.7563325563582222, "learning_rate": 4.5666228392313315e-06, "loss": 0.7, "step": 809 }, { "epoch": 0.5423501841312354, "grad_norm": 2.205354348043167, "learning_rate": 4.555816652258103e-06, "loss": 0.7402, "step": 810 }, { "epoch": 0.5430197522597925, "grad_norm": 1.7024661418900893, "learning_rate": 4.545012556143767e-06, "loss": 0.6977, "step": 811 }, { "epoch": 0.5436893203883495, "grad_norm": 1.8677767965306833, "learning_rate": 4.53421060174534e-06, "loss": 0.6932, "step": 812 }, { "epoch": 0.5443588885169066, "grad_norm": 2.815561629618248, "learning_rate": 4.523410839909764e-06, "loss": 0.7132, "step": 813 }, { "epoch": 0.5450284566454636, "grad_norm": 1.912291092111393, "learning_rate": 4.5126133214736466e-06, "loss": 0.7225, "step": 814 }, { "epoch": 0.5456980247740207, "grad_norm": 2.2930690277962578, "learning_rate": 4.501818097263049e-06, "loss": 0.7359, "step": 815 }, { "epoch": 0.5463675929025779, "grad_norm": 2.103337648459214, "learning_rate": 4.491025218093225e-06, "loss": 0.6817, "step": 816 }, { "epoch": 0.5470371610311349, "grad_norm": 2.1720646827458663, "learning_rate": 4.480234734768393e-06, "loss": 0.6923, "step": 817 }, { "epoch": 0.547706729159692, "grad_norm": 2.2630203743327266, "learning_rate": 4.469446698081492e-06, "loss": 0.7256, "step": 818 }, { "epoch": 0.5483762972882491, "grad_norm": 1.9602925825018849, "learning_rate": 4.458661158813944e-06, "loss": 0.6884, "step": 819 }, { "epoch": 0.5490458654168061, "grad_norm": 2.70279763434301, "learning_rate": 4.447878167735418e-06, "loss": 0.8084, "step": 820 }, { "epoch": 0.5497154335453632, "grad_norm": 2.061724330328305, "learning_rate": 4.437097775603587e-06, "loss": 0.7128, "step": 821 }, { "epoch": 0.5503850016739203, "grad_norm": 1.8519934941977576, "learning_rate": 4.426320033163887e-06, "loss": 0.7335, "step": 822 }, { "epoch": 0.5510545698024774, "grad_norm": 1.8239802193257029, "learning_rate": 4.415544991149284e-06, "loss": 0.6618, "step": 823 }, { "epoch": 0.5517241379310345, "grad_norm": 2.2563383595099578, "learning_rate": 4.404772700280035e-06, "loss": 0.7583, "step": 824 }, { "epoch": 0.5523937060595916, "grad_norm": 2.755780237358005, "learning_rate": 4.3940032112634405e-06, "loss": 0.8142, "step": 825 }, { "epoch": 0.5530632741881486, "grad_norm": 1.8806953909747068, "learning_rate": 4.383236574793619e-06, "loss": 0.655, "step": 826 }, { "epoch": 0.5537328423167057, "grad_norm": 1.8640214485634479, "learning_rate": 4.3724728415512585e-06, "loss": 0.6971, "step": 827 }, { "epoch": 0.5544024104452628, "grad_norm": 1.9172815596105899, "learning_rate": 4.3617120622033786e-06, "loss": 0.7687, "step": 828 }, { "epoch": 0.5550719785738198, "grad_norm": 2.0212589068589724, "learning_rate": 4.350954287403099e-06, "loss": 0.6969, "step": 829 }, { "epoch": 0.555741546702377, "grad_norm": 2.3427381077745633, "learning_rate": 4.340199567789391e-06, "loss": 0.7305, "step": 830 }, { "epoch": 0.5564111148309341, "grad_norm": 2.3593244603647943, "learning_rate": 4.329447953986849e-06, "loss": 0.7588, "step": 831 }, { "epoch": 0.5570806829594911, "grad_norm": 1.8295725661100088, "learning_rate": 4.318699496605447e-06, "loss": 0.7143, "step": 832 }, { "epoch": 0.5577502510880482, "grad_norm": 1.828919393959143, "learning_rate": 4.307954246240299e-06, "loss": 0.7176, "step": 833 }, { "epoch": 0.5584198192166053, "grad_norm": 1.6151852704150715, "learning_rate": 4.297212253471426e-06, "loss": 0.6946, "step": 834 }, { "epoch": 0.5590893873451623, "grad_norm": 2.060436034242831, "learning_rate": 4.286473568863511e-06, "loss": 0.6828, "step": 835 }, { "epoch": 0.5597589554737195, "grad_norm": 2.8461310434677287, "learning_rate": 4.2757382429656675e-06, "loss": 0.6943, "step": 836 }, { "epoch": 0.5604285236022766, "grad_norm": 2.013805515461038, "learning_rate": 4.265006326311199e-06, "loss": 0.7014, "step": 837 }, { "epoch": 0.5610980917308336, "grad_norm": 2.086411601190594, "learning_rate": 4.254277869417361e-06, "loss": 0.6675, "step": 838 }, { "epoch": 0.5617676598593907, "grad_norm": 1.8501517397011584, "learning_rate": 4.243552922785121e-06, "loss": 0.6866, "step": 839 }, { "epoch": 0.5624372279879478, "grad_norm": 1.8349338267093758, "learning_rate": 4.232831536898926e-06, "loss": 0.7149, "step": 840 }, { "epoch": 0.5631067961165048, "grad_norm": 2.313082001117094, "learning_rate": 4.22211376222646e-06, "loss": 0.6624, "step": 841 }, { "epoch": 0.563776364245062, "grad_norm": 1.9205339662817493, "learning_rate": 4.211399649218406e-06, "loss": 0.769, "step": 842 }, { "epoch": 0.5644459323736191, "grad_norm": 2.070836362759231, "learning_rate": 4.200689248308219e-06, "loss": 0.7136, "step": 843 }, { "epoch": 0.5651155005021761, "grad_norm": 2.850224688403473, "learning_rate": 4.189982609911869e-06, "loss": 0.736, "step": 844 }, { "epoch": 0.5657850686307332, "grad_norm": 2.6075047412586523, "learning_rate": 4.179279784427625e-06, "loss": 0.7014, "step": 845 }, { "epoch": 0.5664546367592903, "grad_norm": 2.0239979577534384, "learning_rate": 4.168580822235804e-06, "loss": 0.7759, "step": 846 }, { "epoch": 0.5671242048878473, "grad_norm": 1.7701553760846465, "learning_rate": 4.157885773698535e-06, "loss": 0.7248, "step": 847 }, { "epoch": 0.5677937730164044, "grad_norm": 1.8870314314666252, "learning_rate": 4.147194689159528e-06, "loss": 0.7047, "step": 848 }, { "epoch": 0.5684633411449616, "grad_norm": 2.0797898909835153, "learning_rate": 4.1365076189438365e-06, "loss": 0.6947, "step": 849 }, { "epoch": 0.5691329092735186, "grad_norm": 3.0533596188151972, "learning_rate": 4.125824613357611e-06, "loss": 0.7721, "step": 850 }, { "epoch": 0.5698024774020757, "grad_norm": 2.5180263930326436, "learning_rate": 4.115145722687872e-06, "loss": 0.7568, "step": 851 }, { "epoch": 0.5704720455306327, "grad_norm": 2.2179238331347104, "learning_rate": 4.104470997202273e-06, "loss": 0.7207, "step": 852 }, { "epoch": 0.5711416136591898, "grad_norm": 1.948655279140921, "learning_rate": 4.093800487148857e-06, "loss": 0.6849, "step": 853 }, { "epoch": 0.5718111817877469, "grad_norm": 2.107264813403839, "learning_rate": 4.083134242755826e-06, "loss": 0.6952, "step": 854 }, { "epoch": 0.5724807499163039, "grad_norm": 2.1975879958372126, "learning_rate": 4.072472314231301e-06, "loss": 0.7798, "step": 855 }, { "epoch": 0.573150318044861, "grad_norm": 1.7948987398838572, "learning_rate": 4.061814751763092e-06, "loss": 0.6587, "step": 856 }, { "epoch": 0.5738198861734182, "grad_norm": 2.450189889050675, "learning_rate": 4.051161605518453e-06, "loss": 0.8022, "step": 857 }, { "epoch": 0.5744894543019752, "grad_norm": 2.1128975197644793, "learning_rate": 4.040512925643848e-06, "loss": 0.7927, "step": 858 }, { "epoch": 0.5751590224305323, "grad_norm": 1.7037975179979132, "learning_rate": 4.029868762264722e-06, "loss": 0.7692, "step": 859 }, { "epoch": 0.5758285905590894, "grad_norm": 2.0886617289131957, "learning_rate": 4.019229165485261e-06, "loss": 0.7228, "step": 860 }, { "epoch": 0.5764981586876464, "grad_norm": 2.068992656382316, "learning_rate": 4.008594185388146e-06, "loss": 0.7042, "step": 861 }, { "epoch": 0.5771677268162035, "grad_norm": 1.7906563825900237, "learning_rate": 3.997963872034337e-06, "loss": 0.5917, "step": 862 }, { "epoch": 0.5778372949447607, "grad_norm": 2.2421454406297263, "learning_rate": 3.98733827546282e-06, "loss": 0.6974, "step": 863 }, { "epoch": 0.5785068630733177, "grad_norm": 2.0833628601863614, "learning_rate": 3.97671744569038e-06, "loss": 0.7263, "step": 864 }, { "epoch": 0.5791764312018748, "grad_norm": 2.164629645520286, "learning_rate": 3.966101432711363e-06, "loss": 0.6453, "step": 865 }, { "epoch": 0.5798459993304319, "grad_norm": 2.1536202168209186, "learning_rate": 3.955490286497446e-06, "loss": 0.727, "step": 866 }, { "epoch": 0.5805155674589889, "grad_norm": 1.9946115492878227, "learning_rate": 3.94488405699739e-06, "loss": 0.6945, "step": 867 }, { "epoch": 0.581185135587546, "grad_norm": 2.7313012901902542, "learning_rate": 3.934282794136818e-06, "loss": 0.7661, "step": 868 }, { "epoch": 0.5818547037161031, "grad_norm": 1.8389654392101609, "learning_rate": 3.9236865478179685e-06, "loss": 0.6828, "step": 869 }, { "epoch": 0.5825242718446602, "grad_norm": 2.35503408645469, "learning_rate": 3.913095367919471e-06, "loss": 0.7473, "step": 870 }, { "epoch": 0.5831938399732173, "grad_norm": 2.0418449281559674, "learning_rate": 3.902509304296106e-06, "loss": 0.6864, "step": 871 }, { "epoch": 0.5838634081017744, "grad_norm": 2.0924538021832597, "learning_rate": 3.8919284067785655e-06, "loss": 0.7563, "step": 872 }, { "epoch": 0.5845329762303314, "grad_norm": 2.973281967193373, "learning_rate": 3.881352725173231e-06, "loss": 0.7365, "step": 873 }, { "epoch": 0.5852025443588885, "grad_norm": 3.0549266656421885, "learning_rate": 3.8707823092619275e-06, "loss": 0.7492, "step": 874 }, { "epoch": 0.5858721124874456, "grad_norm": 1.9168216733022254, "learning_rate": 3.860217208801692e-06, "loss": 0.705, "step": 875 }, { "epoch": 0.5865416806160026, "grad_norm": 2.008285159572279, "learning_rate": 3.849657473524543e-06, "loss": 0.6891, "step": 876 }, { "epoch": 0.5872112487445598, "grad_norm": 2.33799485384296, "learning_rate": 3.839103153137247e-06, "loss": 0.7501, "step": 877 }, { "epoch": 0.5878808168731169, "grad_norm": 2.52453963624576, "learning_rate": 3.828554297321073e-06, "loss": 0.6838, "step": 878 }, { "epoch": 0.5885503850016739, "grad_norm": 2.010894134157367, "learning_rate": 3.818010955731576e-06, "loss": 0.7226, "step": 879 }, { "epoch": 0.589219953130231, "grad_norm": 2.0684180556805587, "learning_rate": 3.807473177998348e-06, "loss": 0.6701, "step": 880 }, { "epoch": 0.5898895212587881, "grad_norm": 3.9798960090668674, "learning_rate": 3.796941013724795e-06, "loss": 0.6869, "step": 881 }, { "epoch": 0.5905590893873451, "grad_norm": 2.54869178203082, "learning_rate": 3.7864145124878994e-06, "loss": 0.6707, "step": 882 }, { "epoch": 0.5912286575159023, "grad_norm": 1.9011595042519291, "learning_rate": 3.775893723837981e-06, "loss": 0.646, "step": 883 }, { "epoch": 0.5918982256444594, "grad_norm": 1.8369957681644247, "learning_rate": 3.765378697298475e-06, "loss": 0.7153, "step": 884 }, { "epoch": 0.5925677937730164, "grad_norm": 1.9552313183571874, "learning_rate": 3.7548694823656945e-06, "loss": 0.6582, "step": 885 }, { "epoch": 0.5932373619015735, "grad_norm": 2.004869988966541, "learning_rate": 3.7443661285085895e-06, "loss": 0.6767, "step": 886 }, { "epoch": 0.5939069300301306, "grad_norm": 1.6888018854260582, "learning_rate": 3.7338686851685267e-06, "loss": 0.6251, "step": 887 }, { "epoch": 0.5945764981586876, "grad_norm": 2.2082855073224166, "learning_rate": 3.7233772017590487e-06, "loss": 0.7544, "step": 888 }, { "epoch": 0.5952460662872447, "grad_norm": 2.634877784532776, "learning_rate": 3.7128917276656406e-06, "loss": 0.7702, "step": 889 }, { "epoch": 0.5959156344158018, "grad_norm": 1.8174930068601631, "learning_rate": 3.702412312245507e-06, "loss": 0.7185, "step": 890 }, { "epoch": 0.5965852025443589, "grad_norm": 2.078988128612903, "learning_rate": 3.6919390048273286e-06, "loss": 0.6566, "step": 891 }, { "epoch": 0.597254770672916, "grad_norm": 2.186368243043341, "learning_rate": 3.6814718547110338e-06, "loss": 0.7753, "step": 892 }, { "epoch": 0.597924338801473, "grad_norm": 1.8875849927855946, "learning_rate": 3.671010911167572e-06, "loss": 0.7204, "step": 893 }, { "epoch": 0.5985939069300301, "grad_norm": 1.9828553722955158, "learning_rate": 3.6605562234386693e-06, "loss": 0.7206, "step": 894 }, { "epoch": 0.5992634750585872, "grad_norm": 1.8874429074891341, "learning_rate": 3.650107840736612e-06, "loss": 0.6496, "step": 895 }, { "epoch": 0.5999330431871442, "grad_norm": 2.598917417862589, "learning_rate": 3.6396658122440053e-06, "loss": 0.6771, "step": 896 }, { "epoch": 0.6006026113157014, "grad_norm": 2.2234485887357036, "learning_rate": 3.6292301871135425e-06, "loss": 0.6586, "step": 897 }, { "epoch": 0.6012721794442585, "grad_norm": 2.353475054605386, "learning_rate": 3.6188010144677744e-06, "loss": 0.7362, "step": 898 }, { "epoch": 0.6019417475728155, "grad_norm": 1.7788470544716595, "learning_rate": 3.6083783433988817e-06, "loss": 0.6534, "step": 899 }, { "epoch": 0.6026113157013726, "grad_norm": 1.9550735083606614, "learning_rate": 3.5979622229684374e-06, "loss": 0.6966, "step": 900 }, { "epoch": 0.6026113157013726, "eval_loss": 0.7089197635650635, "eval_runtime": 439.9455, "eval_samples_per_second": 45.712, "eval_steps_per_second": 0.716, "step": 900 }, { "epoch": 0.6032808838299297, "grad_norm": 2.4733696565436687, "learning_rate": 3.5875527022071808e-06, "loss": 0.6851, "step": 901 }, { "epoch": 0.6039504519584867, "grad_norm": 2.0569047838856167, "learning_rate": 3.5771498301147888e-06, "loss": 0.7579, "step": 902 }, { "epoch": 0.6046200200870439, "grad_norm": 2.842744741071225, "learning_rate": 3.566753655659635e-06, "loss": 0.7006, "step": 903 }, { "epoch": 0.605289588215601, "grad_norm": 1.8506586055464957, "learning_rate": 3.5563642277785705e-06, "loss": 0.7487, "step": 904 }, { "epoch": 0.605959156344158, "grad_norm": 1.8416588883048732, "learning_rate": 3.5459815953766885e-06, "loss": 0.6266, "step": 905 }, { "epoch": 0.6066287244727151, "grad_norm": 2.1054562773483956, "learning_rate": 3.5356058073270903e-06, "loss": 0.7531, "step": 906 }, { "epoch": 0.6072982926012722, "grad_norm": 1.7819144499763895, "learning_rate": 3.5252369124706697e-06, "loss": 0.6863, "step": 907 }, { "epoch": 0.6079678607298292, "grad_norm": 2.217074776689505, "learning_rate": 3.514874959615858e-06, "loss": 0.7833, "step": 908 }, { "epoch": 0.6086374288583863, "grad_norm": 2.551136792049783, "learning_rate": 3.5045199975384225e-06, "loss": 0.6881, "step": 909 }, { "epoch": 0.6093069969869435, "grad_norm": 1.9415742569104206, "learning_rate": 3.494172074981218e-06, "loss": 0.7175, "step": 910 }, { "epoch": 0.6099765651155005, "grad_norm": 2.175874084296051, "learning_rate": 3.4838312406539613e-06, "loss": 0.7614, "step": 911 }, { "epoch": 0.6106461332440576, "grad_norm": 1.800275394840938, "learning_rate": 3.473497543233005e-06, "loss": 0.7114, "step": 912 }, { "epoch": 0.6113157013726147, "grad_norm": 2.3089304687533123, "learning_rate": 3.4631710313611115e-06, "loss": 0.6925, "step": 913 }, { "epoch": 0.6119852695011717, "grad_norm": 2.1161729776273903, "learning_rate": 3.4528517536472088e-06, "loss": 0.6561, "step": 914 }, { "epoch": 0.6126548376297288, "grad_norm": 2.143793396272445, "learning_rate": 3.4425397586661835e-06, "loss": 0.6424, "step": 915 }, { "epoch": 0.613324405758286, "grad_norm": 2.3132042413980756, "learning_rate": 3.4322350949586346e-06, "loss": 0.677, "step": 916 }, { "epoch": 0.613993973886843, "grad_norm": 2.4110417008991467, "learning_rate": 3.4219378110306523e-06, "loss": 0.6769, "step": 917 }, { "epoch": 0.6146635420154001, "grad_norm": 1.8398475556956846, "learning_rate": 3.4116479553535897e-06, "loss": 0.6181, "step": 918 }, { "epoch": 0.6153331101439572, "grad_norm": 1.9403270277457467, "learning_rate": 3.4013655763638322e-06, "loss": 0.6694, "step": 919 }, { "epoch": 0.6160026782725142, "grad_norm": 2.177886840407438, "learning_rate": 3.3910907224625724e-06, "loss": 0.6732, "step": 920 }, { "epoch": 0.6166722464010713, "grad_norm": 1.9379778618510954, "learning_rate": 3.380823442015582e-06, "loss": 0.6998, "step": 921 }, { "epoch": 0.6173418145296284, "grad_norm": 1.6792354997867955, "learning_rate": 3.3705637833529794e-06, "loss": 0.6942, "step": 922 }, { "epoch": 0.6180113826581854, "grad_norm": 1.7810326396100327, "learning_rate": 3.360311794769007e-06, "loss": 0.6964, "step": 923 }, { "epoch": 0.6186809507867426, "grad_norm": 2.2264577319304526, "learning_rate": 3.3500675245218073e-06, "loss": 0.702, "step": 924 }, { "epoch": 0.6193505189152997, "grad_norm": 1.943764011323427, "learning_rate": 3.3398310208331806e-06, "loss": 0.751, "step": 925 }, { "epoch": 0.6200200870438567, "grad_norm": 1.8368895905537654, "learning_rate": 3.329602331888381e-06, "loss": 0.6853, "step": 926 }, { "epoch": 0.6206896551724138, "grad_norm": 1.6876488691887392, "learning_rate": 3.319381505835868e-06, "loss": 0.7005, "step": 927 }, { "epoch": 0.6213592233009708, "grad_norm": 2.264788800940924, "learning_rate": 3.309168590787092e-06, "loss": 0.7084, "step": 928 }, { "epoch": 0.6220287914295279, "grad_norm": 2.5496391632996636, "learning_rate": 3.2989636348162633e-06, "loss": 0.7156, "step": 929 }, { "epoch": 0.6226983595580851, "grad_norm": 2.2445403569406026, "learning_rate": 3.2887666859601342e-06, "loss": 0.6681, "step": 930 }, { "epoch": 0.6233679276866421, "grad_norm": 1.965173507097812, "learning_rate": 3.278577792217752e-06, "loss": 0.6966, "step": 931 }, { "epoch": 0.6240374958151992, "grad_norm": 2.2888974928302326, "learning_rate": 3.2683970015502626e-06, "loss": 0.6463, "step": 932 }, { "epoch": 0.6247070639437563, "grad_norm": 2.3087988590495687, "learning_rate": 3.2582243618806574e-06, "loss": 0.7292, "step": 933 }, { "epoch": 0.6253766320723133, "grad_norm": 2.017309748691295, "learning_rate": 3.248059921093565e-06, "loss": 0.702, "step": 934 }, { "epoch": 0.6260462002008704, "grad_norm": 2.0025051969471606, "learning_rate": 3.23790372703502e-06, "loss": 0.6915, "step": 935 }, { "epoch": 0.6267157683294275, "grad_norm": 1.953010260016355, "learning_rate": 3.2277558275122345e-06, "loss": 0.6835, "step": 936 }, { "epoch": 0.6273853364579846, "grad_norm": 1.8060399463422134, "learning_rate": 3.2176162702933816e-06, "loss": 0.6847, "step": 937 }, { "epoch": 0.6280549045865417, "grad_norm": 2.216474826722627, "learning_rate": 3.207485103107364e-06, "loss": 0.7008, "step": 938 }, { "epoch": 0.6287244727150988, "grad_norm": 1.9294094474054013, "learning_rate": 3.1973623736435865e-06, "loss": 0.7358, "step": 939 }, { "epoch": 0.6293940408436558, "grad_norm": 1.9227734919753559, "learning_rate": 3.187248129551741e-06, "loss": 0.6789, "step": 940 }, { "epoch": 0.6300636089722129, "grad_norm": 1.7841497055105309, "learning_rate": 3.177142418441578e-06, "loss": 0.6971, "step": 941 }, { "epoch": 0.63073317710077, "grad_norm": 2.054843500873422, "learning_rate": 3.167045287882674e-06, "loss": 0.7478, "step": 942 }, { "epoch": 0.631402745229327, "grad_norm": 2.8044618361347267, "learning_rate": 3.156956785404224e-06, "loss": 0.6913, "step": 943 }, { "epoch": 0.6320723133578842, "grad_norm": 2.118663187135613, "learning_rate": 3.1468769584948044e-06, "loss": 0.6817, "step": 944 }, { "epoch": 0.6327418814864413, "grad_norm": 2.072508358497151, "learning_rate": 3.1368058546021524e-06, "loss": 0.6795, "step": 945 }, { "epoch": 0.6334114496149983, "grad_norm": 1.735166697468566, "learning_rate": 3.126743521132949e-06, "loss": 0.6185, "step": 946 }, { "epoch": 0.6340810177435554, "grad_norm": 2.189589904415291, "learning_rate": 3.1166900054525873e-06, "loss": 0.7104, "step": 947 }, { "epoch": 0.6347505858721125, "grad_norm": 1.7611303571435435, "learning_rate": 3.106645354884953e-06, "loss": 0.6628, "step": 948 }, { "epoch": 0.6354201540006695, "grad_norm": 2.5480610897363194, "learning_rate": 3.096609616712207e-06, "loss": 0.6819, "step": 949 }, { "epoch": 0.6360897221292267, "grad_norm": 2.7645452280422282, "learning_rate": 3.0865828381745515e-06, "loss": 0.7285, "step": 950 }, { "epoch": 0.6367592902577838, "grad_norm": 2.0282248743144113, "learning_rate": 3.0765650664700186e-06, "loss": 0.7171, "step": 951 }, { "epoch": 0.6374288583863408, "grad_norm": 2.4312506726612853, "learning_rate": 3.0665563487542405e-06, "loss": 0.7549, "step": 952 }, { "epoch": 0.6380984265148979, "grad_norm": 1.7380256101560918, "learning_rate": 3.056556732140231e-06, "loss": 0.7245, "step": 953 }, { "epoch": 0.638767994643455, "grad_norm": 1.9593604701367624, "learning_rate": 3.0465662636981643e-06, "loss": 0.6626, "step": 954 }, { "epoch": 0.639437562772012, "grad_norm": 2.012868638549377, "learning_rate": 3.036584990455154e-06, "loss": 0.6562, "step": 955 }, { "epoch": 0.6401071309005691, "grad_norm": 2.2410014827250437, "learning_rate": 3.0266129593950245e-06, "loss": 0.7138, "step": 956 }, { "epoch": 0.6407766990291263, "grad_norm": 2.1401477200013037, "learning_rate": 3.0166502174581012e-06, "loss": 0.678, "step": 957 }, { "epoch": 0.6414462671576833, "grad_norm": 1.9638989624246428, "learning_rate": 3.00669681154098e-06, "loss": 0.678, "step": 958 }, { "epoch": 0.6421158352862404, "grad_norm": 2.573563064523422, "learning_rate": 2.996752788496311e-06, "loss": 0.7902, "step": 959 }, { "epoch": 0.6427854034147975, "grad_norm": 1.8425946866156344, "learning_rate": 2.9868181951325814e-06, "loss": 0.7178, "step": 960 }, { "epoch": 0.6434549715433545, "grad_norm": 2.021136097327583, "learning_rate": 2.976893078213883e-06, "loss": 0.6929, "step": 961 }, { "epoch": 0.6441245396719116, "grad_norm": 2.0052194859146444, "learning_rate": 2.966977484459708e-06, "loss": 0.7231, "step": 962 }, { "epoch": 0.6447941078004688, "grad_norm": 2.3557897474759315, "learning_rate": 2.957071460544717e-06, "loss": 0.7046, "step": 963 }, { "epoch": 0.6454636759290258, "grad_norm": 2.011016935096324, "learning_rate": 2.947175053098524e-06, "loss": 0.6888, "step": 964 }, { "epoch": 0.6461332440575829, "grad_norm": 1.9732285297092296, "learning_rate": 2.937288308705475e-06, "loss": 0.7055, "step": 965 }, { "epoch": 0.6468028121861399, "grad_norm": 1.7622485354090454, "learning_rate": 2.927411273904436e-06, "loss": 0.6979, "step": 966 }, { "epoch": 0.647472380314697, "grad_norm": 3.1187913882304765, "learning_rate": 2.917543995188562e-06, "loss": 0.7185, "step": 967 }, { "epoch": 0.6481419484432541, "grad_norm": 2.0220840377741776, "learning_rate": 2.907686519005082e-06, "loss": 0.726, "step": 968 }, { "epoch": 0.6488115165718111, "grad_norm": 1.774511176220537, "learning_rate": 2.8978388917550936e-06, "loss": 0.6655, "step": 969 }, { "epoch": 0.6494810847003682, "grad_norm": 1.9289461743153298, "learning_rate": 2.8880011597933215e-06, "loss": 0.6691, "step": 970 }, { "epoch": 0.6501506528289254, "grad_norm": 1.6369188671096762, "learning_rate": 2.8781733694279178e-06, "loss": 0.6923, "step": 971 }, { "epoch": 0.6508202209574824, "grad_norm": 1.8197236178343137, "learning_rate": 2.8683555669202355e-06, "loss": 0.6775, "step": 972 }, { "epoch": 0.6514897890860395, "grad_norm": 2.0430251489281477, "learning_rate": 2.858547798484613e-06, "loss": 0.7154, "step": 973 }, { "epoch": 0.6521593572145966, "grad_norm": 1.8617431789085965, "learning_rate": 2.848750110288161e-06, "loss": 0.6289, "step": 974 }, { "epoch": 0.6528289253431536, "grad_norm": 2.4166825182014127, "learning_rate": 2.838962548450528e-06, "loss": 0.7331, "step": 975 }, { "epoch": 0.6534984934717107, "grad_norm": 1.9408776220373694, "learning_rate": 2.82918515904371e-06, "loss": 0.7185, "step": 976 }, { "epoch": 0.6541680616002679, "grad_norm": 1.8784200090543048, "learning_rate": 2.819417988091814e-06, "loss": 0.6806, "step": 977 }, { "epoch": 0.6548376297288249, "grad_norm": 2.1179338285622547, "learning_rate": 2.8096610815708415e-06, "loss": 0.7142, "step": 978 }, { "epoch": 0.655507197857382, "grad_norm": 1.9194248273659682, "learning_rate": 2.7999144854084816e-06, "loss": 0.6903, "step": 979 }, { "epoch": 0.6561767659859391, "grad_norm": 1.887869945146109, "learning_rate": 2.7901782454838965e-06, "loss": 0.6853, "step": 980 }, { "epoch": 0.6568463341144961, "grad_norm": 1.9787574886482675, "learning_rate": 2.7804524076274898e-06, "loss": 0.6928, "step": 981 }, { "epoch": 0.6575159022430532, "grad_norm": 2.115639196586043, "learning_rate": 2.770737017620703e-06, "loss": 0.7, "step": 982 }, { "epoch": 0.6581854703716103, "grad_norm": 1.6923416774167128, "learning_rate": 2.7610321211958017e-06, "loss": 0.633, "step": 983 }, { "epoch": 0.6588550385001674, "grad_norm": 1.6127587181185845, "learning_rate": 2.751337764035652e-06, "loss": 0.6745, "step": 984 }, { "epoch": 0.6595246066287245, "grad_norm": 2.079755830131246, "learning_rate": 2.741653991773513e-06, "loss": 0.7771, "step": 985 }, { "epoch": 0.6601941747572816, "grad_norm": 2.451378717394979, "learning_rate": 2.7319808499928113e-06, "loss": 0.7282, "step": 986 }, { "epoch": 0.6608637428858386, "grad_norm": 2.1011875433736047, "learning_rate": 2.7223183842269442e-06, "loss": 0.674, "step": 987 }, { "epoch": 0.6615333110143957, "grad_norm": 1.8308475703960754, "learning_rate": 2.7126666399590498e-06, "loss": 0.6615, "step": 988 }, { "epoch": 0.6622028791429528, "grad_norm": 1.9993491858103642, "learning_rate": 2.7030256626217932e-06, "loss": 0.6766, "step": 989 }, { "epoch": 0.6628724472715098, "grad_norm": 2.0576922363568517, "learning_rate": 2.6933954975971637e-06, "loss": 0.7626, "step": 990 }, { "epoch": 0.663542015400067, "grad_norm": 1.7870235967474282, "learning_rate": 2.683776190216258e-06, "loss": 0.6962, "step": 991 }, { "epoch": 0.6642115835286241, "grad_norm": 1.9317827493731041, "learning_rate": 2.674167785759053e-06, "loss": 0.7274, "step": 992 }, { "epoch": 0.6648811516571811, "grad_norm": 2.120345162907045, "learning_rate": 2.6645703294542114e-06, "loss": 0.7255, "step": 993 }, { "epoch": 0.6655507197857382, "grad_norm": 2.38261672301773, "learning_rate": 2.6549838664788596e-06, "loss": 0.6657, "step": 994 }, { "epoch": 0.6662202879142953, "grad_norm": 1.9275729198536946, "learning_rate": 2.6454084419583743e-06, "loss": 0.6692, "step": 995 }, { "epoch": 0.6668898560428523, "grad_norm": 3.019052381036727, "learning_rate": 2.6358441009661755e-06, "loss": 0.7052, "step": 996 }, { "epoch": 0.6675594241714095, "grad_norm": 2.205179493386387, "learning_rate": 2.6262908885235046e-06, "loss": 0.7495, "step": 997 }, { "epoch": 0.6682289922999666, "grad_norm": 1.8984438273904956, "learning_rate": 2.616748849599222e-06, "loss": 0.7117, "step": 998 }, { "epoch": 0.6688985604285236, "grad_norm": 2.2358069098620685, "learning_rate": 2.6072180291095996e-06, "loss": 0.6969, "step": 999 }, { "epoch": 0.6695681285570807, "grad_norm": 3.0402876747198846, "learning_rate": 2.5976984719180874e-06, "loss": 0.761, "step": 1000 }, { "epoch": 0.6702376966856378, "grad_norm": 1.9538217956267283, "learning_rate": 2.5881902228351274e-06, "loss": 0.6508, "step": 1001 }, { "epoch": 0.6709072648141948, "grad_norm": 2.056265015003442, "learning_rate": 2.5786933266179306e-06, "loss": 0.678, "step": 1002 }, { "epoch": 0.671576832942752, "grad_norm": 1.820434755093311, "learning_rate": 2.569207827970263e-06, "loss": 0.7057, "step": 1003 }, { "epoch": 0.672246401071309, "grad_norm": 1.7729251507630357, "learning_rate": 2.5597337715422455e-06, "loss": 0.6316, "step": 1004 }, { "epoch": 0.6729159691998661, "grad_norm": 2.022784824605612, "learning_rate": 2.550271201930136e-06, "loss": 0.6972, "step": 1005 }, { "epoch": 0.6735855373284232, "grad_norm": 2.2074381312250106, "learning_rate": 2.540820163676121e-06, "loss": 0.6794, "step": 1006 }, { "epoch": 0.6742551054569802, "grad_norm": 2.2370575941216524, "learning_rate": 2.531380701268108e-06, "loss": 0.7189, "step": 1007 }, { "epoch": 0.6749246735855373, "grad_norm": 2.06327674896634, "learning_rate": 2.5219528591395147e-06, "loss": 0.6497, "step": 1008 }, { "epoch": 0.6755942417140944, "grad_norm": 1.6658280232431273, "learning_rate": 2.5125366816690557e-06, "loss": 0.6415, "step": 1009 }, { "epoch": 0.6762638098426514, "grad_norm": 1.894769927578263, "learning_rate": 2.5031322131805456e-06, "loss": 0.6489, "step": 1010 }, { "epoch": 0.6769333779712086, "grad_norm": 1.7820357462294771, "learning_rate": 2.4937394979426756e-06, "loss": 0.7117, "step": 1011 }, { "epoch": 0.6776029460997657, "grad_norm": 2.7117951147702968, "learning_rate": 2.484358580168814e-06, "loss": 0.7537, "step": 1012 }, { "epoch": 0.6782725142283227, "grad_norm": 2.1028212721158717, "learning_rate": 2.474989504016798e-06, "loss": 0.6368, "step": 1013 }, { "epoch": 0.6789420823568798, "grad_norm": 2.2415734128266886, "learning_rate": 2.465632313588722e-06, "loss": 0.7296, "step": 1014 }, { "epoch": 0.6796116504854369, "grad_norm": 2.5394620188353008, "learning_rate": 2.456287052930733e-06, "loss": 0.7303, "step": 1015 }, { "epoch": 0.6802812186139939, "grad_norm": 2.284075406614801, "learning_rate": 2.4469537660328215e-06, "loss": 0.6561, "step": 1016 }, { "epoch": 0.680950786742551, "grad_norm": 2.101187233167272, "learning_rate": 2.4376324968286154e-06, "loss": 0.6762, "step": 1017 }, { "epoch": 0.6816203548711082, "grad_norm": 1.9611341189842693, "learning_rate": 2.4283232891951723e-06, "loss": 0.6587, "step": 1018 }, { "epoch": 0.6822899229996652, "grad_norm": 2.5262516891197335, "learning_rate": 2.419026186952777e-06, "loss": 0.7324, "step": 1019 }, { "epoch": 0.6829594911282223, "grad_norm": 2.0503910180347633, "learning_rate": 2.4097412338647236e-06, "loss": 0.7068, "step": 1020 }, { "epoch": 0.6836290592567794, "grad_norm": 2.245054761621883, "learning_rate": 2.4004684736371276e-06, "loss": 0.6857, "step": 1021 }, { "epoch": 0.6842986273853364, "grad_norm": 1.9161875774356423, "learning_rate": 2.3912079499187068e-06, "loss": 0.6921, "step": 1022 }, { "epoch": 0.6849681955138935, "grad_norm": 2.0709241564889016, "learning_rate": 2.3819597063005747e-06, "loss": 0.7107, "step": 1023 }, { "epoch": 0.6856377636424507, "grad_norm": 1.9380832016097158, "learning_rate": 2.3727237863160448e-06, "loss": 0.7011, "step": 1024 }, { "epoch": 0.6863073317710077, "grad_norm": 1.8599071797074183, "learning_rate": 2.36350023344042e-06, "loss": 0.7265, "step": 1025 }, { "epoch": 0.6869768998995648, "grad_norm": 1.868113619089663, "learning_rate": 2.3542890910907888e-06, "loss": 0.6929, "step": 1026 }, { "epoch": 0.6876464680281219, "grad_norm": 2.2540271574403716, "learning_rate": 2.345090402625822e-06, "loss": 0.6544, "step": 1027 }, { "epoch": 0.6883160361566789, "grad_norm": 2.5341753152907858, "learning_rate": 2.3359042113455603e-06, "loss": 0.6918, "step": 1028 }, { "epoch": 0.688985604285236, "grad_norm": 2.1945734404256076, "learning_rate": 2.32673056049123e-06, "loss": 0.701, "step": 1029 }, { "epoch": 0.6896551724137931, "grad_norm": 2.1918159180564114, "learning_rate": 2.317569493245019e-06, "loss": 0.7512, "step": 1030 }, { "epoch": 0.6903247405423502, "grad_norm": 1.9575726964935625, "learning_rate": 2.3084210527298806e-06, "loss": 0.7331, "step": 1031 }, { "epoch": 0.6909943086709073, "grad_norm": 1.7754102215230991, "learning_rate": 2.299285282009334e-06, "loss": 0.6636, "step": 1032 }, { "epoch": 0.6916638767994644, "grad_norm": 1.8649410661354935, "learning_rate": 2.2901622240872638e-06, "loss": 0.7302, "step": 1033 }, { "epoch": 0.6923334449280214, "grad_norm": 2.0919371752022906, "learning_rate": 2.2810519219077047e-06, "loss": 0.7105, "step": 1034 }, { "epoch": 0.6930030130565785, "grad_norm": 2.1244016514423847, "learning_rate": 2.2719544183546523e-06, "loss": 0.7326, "step": 1035 }, { "epoch": 0.6936725811851356, "grad_norm": 2.213383157273685, "learning_rate": 2.262869756251856e-06, "loss": 0.7394, "step": 1036 }, { "epoch": 0.6943421493136926, "grad_norm": 1.7802289043996253, "learning_rate": 2.253797978362617e-06, "loss": 0.671, "step": 1037 }, { "epoch": 0.6950117174422498, "grad_norm": 1.6446171767552475, "learning_rate": 2.2447391273895907e-06, "loss": 0.7383, "step": 1038 }, { "epoch": 0.6956812855708069, "grad_norm": 2.029324260009579, "learning_rate": 2.2356932459745758e-06, "loss": 0.7451, "step": 1039 }, { "epoch": 0.6963508536993639, "grad_norm": 2.400857775439718, "learning_rate": 2.2266603766983304e-06, "loss": 0.6354, "step": 1040 }, { "epoch": 0.697020421827921, "grad_norm": 1.8742469566367295, "learning_rate": 2.2176405620803577e-06, "loss": 0.6904, "step": 1041 }, { "epoch": 0.697689989956478, "grad_norm": 2.065199123825437, "learning_rate": 2.208633844578706e-06, "loss": 0.6159, "step": 1042 }, { "epoch": 0.6983595580850351, "grad_norm": 2.019321953789472, "learning_rate": 2.1996402665897753e-06, "loss": 0.6864, "step": 1043 }, { "epoch": 0.6990291262135923, "grad_norm": 1.8411216718929952, "learning_rate": 2.1906598704481236e-06, "loss": 0.721, "step": 1044 }, { "epoch": 0.6996986943421493, "grad_norm": 2.9966383355648007, "learning_rate": 2.1816926984262454e-06, "loss": 0.6762, "step": 1045 }, { "epoch": 0.7003682624707064, "grad_norm": 1.8884294138056426, "learning_rate": 2.172738792734396e-06, "loss": 0.7542, "step": 1046 }, { "epoch": 0.7010378305992635, "grad_norm": 2.086545134227622, "learning_rate": 2.16379819552038e-06, "loss": 0.6813, "step": 1047 }, { "epoch": 0.7017073987278205, "grad_norm": 1.852650054492443, "learning_rate": 2.1548709488693586e-06, "loss": 0.7131, "step": 1048 }, { "epoch": 0.7023769668563776, "grad_norm": 2.119574888330037, "learning_rate": 2.1459570948036486e-06, "loss": 0.658, "step": 1049 }, { "epoch": 0.7030465349849347, "grad_norm": 2.1380615071646907, "learning_rate": 2.1370566752825193e-06, "loss": 0.7051, "step": 1050 }, { "epoch": 0.7030465349849347, "eval_loss": 0.699982762336731, "eval_runtime": 440.9276, "eval_samples_per_second": 45.611, "eval_steps_per_second": 0.714, "step": 1050 }, { "epoch": 0.7037161031134918, "grad_norm": 2.4494989858063607, "learning_rate": 2.1281697322020124e-06, "loss": 0.7025, "step": 1051 }, { "epoch": 0.7043856712420489, "grad_norm": 1.9820640241866512, "learning_rate": 2.119296307394726e-06, "loss": 0.6817, "step": 1052 }, { "epoch": 0.705055239370606, "grad_norm": 1.8654344801801033, "learning_rate": 2.1104364426296237e-06, "loss": 0.6539, "step": 1053 }, { "epoch": 0.705724807499163, "grad_norm": 2.2412469915882376, "learning_rate": 2.1015901796118402e-06, "loss": 0.6019, "step": 1054 }, { "epoch": 0.7063943756277201, "grad_norm": 1.981137503976005, "learning_rate": 2.092757559982493e-06, "loss": 0.6901, "step": 1055 }, { "epoch": 0.7070639437562772, "grad_norm": 2.054366380644569, "learning_rate": 2.083938625318463e-06, "loss": 0.6938, "step": 1056 }, { "epoch": 0.7077335118848342, "grad_norm": 1.9870578341348804, "learning_rate": 2.075133417132223e-06, "loss": 0.7468, "step": 1057 }, { "epoch": 0.7084030800133914, "grad_norm": 2.018293449150148, "learning_rate": 2.0663419768716298e-06, "loss": 0.7343, "step": 1058 }, { "epoch": 0.7090726481419485, "grad_norm": 1.9290862425039246, "learning_rate": 2.057564345919732e-06, "loss": 0.6904, "step": 1059 }, { "epoch": 0.7097422162705055, "grad_norm": 1.880768592913249, "learning_rate": 2.0488005655945765e-06, "loss": 0.6957, "step": 1060 }, { "epoch": 0.7104117843990626, "grad_norm": 1.8775836167346036, "learning_rate": 2.040050677149008e-06, "loss": 0.6538, "step": 1061 }, { "epoch": 0.7110813525276197, "grad_norm": 1.9464369638530912, "learning_rate": 2.0313147217704814e-06, "loss": 0.6953, "step": 1062 }, { "epoch": 0.7117509206561767, "grad_norm": 1.7436553154327847, "learning_rate": 2.022592740580874e-06, "loss": 0.651, "step": 1063 }, { "epoch": 0.7124204887847339, "grad_norm": 2.2866716658496316, "learning_rate": 2.01388477463627e-06, "loss": 0.7244, "step": 1064 }, { "epoch": 0.713090056913291, "grad_norm": 2.6996171209507276, "learning_rate": 2.0051908649267898e-06, "loss": 0.7738, "step": 1065 }, { "epoch": 0.713759625041848, "grad_norm": 2.1198949151090387, "learning_rate": 1.996511052376387e-06, "loss": 0.6892, "step": 1066 }, { "epoch": 0.7144291931704051, "grad_norm": 1.6227941531150796, "learning_rate": 1.987845377842656e-06, "loss": 0.6848, "step": 1067 }, { "epoch": 0.7150987612989622, "grad_norm": 3.427376090553253, "learning_rate": 1.979193882116641e-06, "loss": 0.7449, "step": 1068 }, { "epoch": 0.7157683294275192, "grad_norm": 1.628484252868923, "learning_rate": 1.970556605922645e-06, "loss": 0.7437, "step": 1069 }, { "epoch": 0.7164378975560763, "grad_norm": 1.7570245907097315, "learning_rate": 1.9619335899180346e-06, "loss": 0.6597, "step": 1070 }, { "epoch": 0.7171074656846335, "grad_norm": 2.1239010149262265, "learning_rate": 1.9533248746930528e-06, "loss": 0.7169, "step": 1071 }, { "epoch": 0.7177770338131905, "grad_norm": 1.9009537614585204, "learning_rate": 1.9447305007706262e-06, "loss": 0.7056, "step": 1072 }, { "epoch": 0.7184466019417476, "grad_norm": 2.271593767362845, "learning_rate": 1.9361505086061688e-06, "loss": 0.7128, "step": 1073 }, { "epoch": 0.7191161700703047, "grad_norm": 1.889008461292996, "learning_rate": 1.927584938587408e-06, "loss": 0.7249, "step": 1074 }, { "epoch": 0.7197857381988617, "grad_norm": 5.057681464816618, "learning_rate": 1.9190338310341703e-06, "loss": 0.8077, "step": 1075 }, { "epoch": 0.7204553063274188, "grad_norm": 2.263732233105132, "learning_rate": 1.9104972261982134e-06, "loss": 0.7346, "step": 1076 }, { "epoch": 0.721124874455976, "grad_norm": 2.2088603480505618, "learning_rate": 1.9019751642630252e-06, "loss": 0.7358, "step": 1077 }, { "epoch": 0.721794442584533, "grad_norm": 2.2029888158044497, "learning_rate": 1.8934676853436361e-06, "loss": 0.7785, "step": 1078 }, { "epoch": 0.7224640107130901, "grad_norm": 1.977326004073148, "learning_rate": 1.8849748294864335e-06, "loss": 0.7303, "step": 1079 }, { "epoch": 0.7231335788416471, "grad_norm": 2.0338674265775154, "learning_rate": 1.8764966366689697e-06, "loss": 0.6833, "step": 1080 }, { "epoch": 0.7238031469702042, "grad_norm": 1.7735678040069283, "learning_rate": 1.8680331467997754e-06, "loss": 0.634, "step": 1081 }, { "epoch": 0.7244727150987613, "grad_norm": 1.7915041794469395, "learning_rate": 1.8595843997181717e-06, "loss": 0.7624, "step": 1082 }, { "epoch": 0.7251422832273183, "grad_norm": 1.9926728930088418, "learning_rate": 1.8511504351940852e-06, "loss": 0.6596, "step": 1083 }, { "epoch": 0.7258118513558754, "grad_norm": 1.6388709715784733, "learning_rate": 1.842731292927849e-06, "loss": 0.6529, "step": 1084 }, { "epoch": 0.7264814194844326, "grad_norm": 1.7411276223203842, "learning_rate": 1.8343270125500379e-06, "loss": 0.6972, "step": 1085 }, { "epoch": 0.7271509876129896, "grad_norm": 1.986108261994064, "learning_rate": 1.825937633621261e-06, "loss": 0.6983, "step": 1086 }, { "epoch": 0.7278205557415467, "grad_norm": 2.2118831312034515, "learning_rate": 1.8175631956319823e-06, "loss": 0.6947, "step": 1087 }, { "epoch": 0.7284901238701038, "grad_norm": 2.0198887884653582, "learning_rate": 1.8092037380023386e-06, "loss": 0.7233, "step": 1088 }, { "epoch": 0.7291596919986608, "grad_norm": 2.6047220795424377, "learning_rate": 1.8008593000819519e-06, "loss": 0.6849, "step": 1089 }, { "epoch": 0.7298292601272179, "grad_norm": 2.2805485591733903, "learning_rate": 1.792529921149742e-06, "loss": 0.6417, "step": 1090 }, { "epoch": 0.7304988282557751, "grad_norm": 1.9995600638545488, "learning_rate": 1.784215640413745e-06, "loss": 0.6819, "step": 1091 }, { "epoch": 0.7311683963843321, "grad_norm": 1.8602373470445512, "learning_rate": 1.77591649701092e-06, "loss": 0.6744, "step": 1092 }, { "epoch": 0.7318379645128892, "grad_norm": 2.128837686301292, "learning_rate": 1.7676325300069824e-06, "loss": 0.6633, "step": 1093 }, { "epoch": 0.7325075326414463, "grad_norm": 1.9928074087453413, "learning_rate": 1.759363778396203e-06, "loss": 0.692, "step": 1094 }, { "epoch": 0.7331771007700033, "grad_norm": 2.2081660794542675, "learning_rate": 1.7511102811012287e-06, "loss": 0.6681, "step": 1095 }, { "epoch": 0.7338466688985604, "grad_norm": 1.7584791118210004, "learning_rate": 1.7428720769729035e-06, "loss": 0.6766, "step": 1096 }, { "epoch": 0.7345162370271175, "grad_norm": 2.0003660516495083, "learning_rate": 1.7346492047900897e-06, "loss": 0.7157, "step": 1097 }, { "epoch": 0.7351858051556746, "grad_norm": 1.8945780098090292, "learning_rate": 1.7264417032594683e-06, "loss": 0.6877, "step": 1098 }, { "epoch": 0.7358553732842317, "grad_norm": 1.9823041645237283, "learning_rate": 1.718249611015374e-06, "loss": 0.6272, "step": 1099 }, { "epoch": 0.7365249414127888, "grad_norm": 1.9646330991316499, "learning_rate": 1.7100729666196064e-06, "loss": 0.7353, "step": 1100 }, { "epoch": 0.7371945095413458, "grad_norm": 2.210679198147859, "learning_rate": 1.7019118085612474e-06, "loss": 0.6823, "step": 1101 }, { "epoch": 0.7378640776699029, "grad_norm": 1.6204748781779608, "learning_rate": 1.693766175256485e-06, "loss": 0.6825, "step": 1102 }, { "epoch": 0.73853364579846, "grad_norm": 3.2420940391602326, "learning_rate": 1.685636105048421e-06, "loss": 0.6877, "step": 1103 }, { "epoch": 0.739203213927017, "grad_norm": 2.104053541516314, "learning_rate": 1.6775216362069096e-06, "loss": 0.6567, "step": 1104 }, { "epoch": 0.7398727820555742, "grad_norm": 1.6124853106241792, "learning_rate": 1.6694228069283614e-06, "loss": 0.5983, "step": 1105 }, { "epoch": 0.7405423501841313, "grad_norm": 1.8021765090313755, "learning_rate": 1.6613396553355638e-06, "loss": 0.6244, "step": 1106 }, { "epoch": 0.7412119183126883, "grad_norm": 2.199928999534946, "learning_rate": 1.6532722194775108e-06, "loss": 0.6938, "step": 1107 }, { "epoch": 0.7418814864412454, "grad_norm": 1.9466685806973878, "learning_rate": 1.6452205373292246e-06, "loss": 0.7168, "step": 1108 }, { "epoch": 0.7425510545698025, "grad_norm": 2.4173278483537928, "learning_rate": 1.6371846467915603e-06, "loss": 0.7826, "step": 1109 }, { "epoch": 0.7432206226983595, "grad_norm": 1.824156777382339, "learning_rate": 1.6291645856910465e-06, "loss": 0.7516, "step": 1110 }, { "epoch": 0.7438901908269167, "grad_norm": 1.968861176124366, "learning_rate": 1.6211603917796975e-06, "loss": 0.7125, "step": 1111 }, { "epoch": 0.7445597589554738, "grad_norm": 1.9301504633963562, "learning_rate": 1.6131721027348374e-06, "loss": 0.7313, "step": 1112 }, { "epoch": 0.7452293270840308, "grad_norm": 1.797977156470764, "learning_rate": 1.6051997561589244e-06, "loss": 0.6182, "step": 1113 }, { "epoch": 0.7458988952125879, "grad_norm": 1.92163660489464, "learning_rate": 1.5972433895793666e-06, "loss": 0.6274, "step": 1114 }, { "epoch": 0.746568463341145, "grad_norm": 1.7805871942573204, "learning_rate": 1.58930304044836e-06, "loss": 0.6292, "step": 1115 }, { "epoch": 0.747238031469702, "grad_norm": 1.8551114155648947, "learning_rate": 1.5813787461426988e-06, "loss": 0.7067, "step": 1116 }, { "epoch": 0.7479075995982591, "grad_norm": 1.9289349180366306, "learning_rate": 1.5734705439636017e-06, "loss": 0.6883, "step": 1117 }, { "epoch": 0.7485771677268162, "grad_norm": 1.8081113322859388, "learning_rate": 1.5655784711365413e-06, "loss": 0.6585, "step": 1118 }, { "epoch": 0.7492467358553733, "grad_norm": 1.7225059128394387, "learning_rate": 1.5577025648110666e-06, "loss": 0.7229, "step": 1119 }, { "epoch": 0.7499163039839304, "grad_norm": 2.1035359272954066, "learning_rate": 1.5498428620606264e-06, "loss": 0.7492, "step": 1120 }, { "epoch": 0.7505858721124874, "grad_norm": 1.6891284798868982, "learning_rate": 1.5419993998823968e-06, "loss": 0.686, "step": 1121 }, { "epoch": 0.7512554402410445, "grad_norm": 2.1574171454564723, "learning_rate": 1.5341722151971056e-06, "loss": 0.6957, "step": 1122 }, { "epoch": 0.7519250083696016, "grad_norm": 2.1586401009197713, "learning_rate": 1.5263613448488596e-06, "loss": 0.7353, "step": 1123 }, { "epoch": 0.7525945764981586, "grad_norm": 1.8337751803476419, "learning_rate": 1.5185668256049706e-06, "loss": 0.7296, "step": 1124 }, { "epoch": 0.7532641446267158, "grad_norm": 1.8262518685588027, "learning_rate": 1.5107886941557853e-06, "loss": 0.6958, "step": 1125 }, { "epoch": 0.7539337127552729, "grad_norm": 2.062272032390879, "learning_rate": 1.5030269871145015e-06, "loss": 0.676, "step": 1126 }, { "epoch": 0.7546032808838299, "grad_norm": 1.9320475934866321, "learning_rate": 1.495281741017016e-06, "loss": 0.6685, "step": 1127 }, { "epoch": 0.755272849012387, "grad_norm": 2.1657293183016617, "learning_rate": 1.4875529923217308e-06, "loss": 0.7866, "step": 1128 }, { "epoch": 0.7559424171409441, "grad_norm": 2.176605007927728, "learning_rate": 1.4798407774093954e-06, "loss": 0.712, "step": 1129 }, { "epoch": 0.7566119852695011, "grad_norm": 1.74239280305878, "learning_rate": 1.4721451325829322e-06, "loss": 0.6932, "step": 1130 }, { "epoch": 0.7572815533980582, "grad_norm": 2.181433189994552, "learning_rate": 1.4644660940672628e-06, "loss": 0.707, "step": 1131 }, { "epoch": 0.7579511215266154, "grad_norm": 1.8940478381754027, "learning_rate": 1.4568036980091416e-06, "loss": 0.6643, "step": 1132 }, { "epoch": 0.7586206896551724, "grad_norm": 1.8522102142559074, "learning_rate": 1.4491579804769817e-06, "loss": 0.7295, "step": 1133 }, { "epoch": 0.7592902577837295, "grad_norm": 1.8433788534811, "learning_rate": 1.4415289774606894e-06, "loss": 0.666, "step": 1134 }, { "epoch": 0.7599598259122866, "grad_norm": 2.2148371733560683, "learning_rate": 1.4339167248714902e-06, "loss": 0.6893, "step": 1135 }, { "epoch": 0.7606293940408436, "grad_norm": 2.1640810813338724, "learning_rate": 1.4263212585417653e-06, "loss": 0.6824, "step": 1136 }, { "epoch": 0.7612989621694007, "grad_norm": 2.4405676462424575, "learning_rate": 1.4187426142248723e-06, "loss": 0.7297, "step": 1137 }, { "epoch": 0.7619685302979579, "grad_norm": 2.3550844381644613, "learning_rate": 1.411180827594995e-06, "loss": 0.6385, "step": 1138 }, { "epoch": 0.7626380984265149, "grad_norm": 2.3242116526126004, "learning_rate": 1.4036359342469551e-06, "loss": 0.71, "step": 1139 }, { "epoch": 0.763307666555072, "grad_norm": 1.96515207469545, "learning_rate": 1.39610796969606e-06, "loss": 0.6925, "step": 1140 }, { "epoch": 0.7639772346836291, "grad_norm": 2.028563340971011, "learning_rate": 1.3885969693779277e-06, "loss": 0.649, "step": 1141 }, { "epoch": 0.7646468028121861, "grad_norm": 2.415481987966573, "learning_rate": 1.3811029686483224e-06, "loss": 0.6868, "step": 1142 }, { "epoch": 0.7653163709407432, "grad_norm": 1.8716157888557319, "learning_rate": 1.3736260027829883e-06, "loss": 0.6096, "step": 1143 }, { "epoch": 0.7659859390693003, "grad_norm": 1.9289135511726927, "learning_rate": 1.3661661069774835e-06, "loss": 0.6242, "step": 1144 }, { "epoch": 0.7666555071978574, "grad_norm": 2.079987742505029, "learning_rate": 1.3587233163470126e-06, "loss": 0.7571, "step": 1145 }, { "epoch": 0.7673250753264145, "grad_norm": 2.0535166525363864, "learning_rate": 1.351297665926264e-06, "loss": 0.6895, "step": 1146 }, { "epoch": 0.7679946434549716, "grad_norm": 1.8317555244311081, "learning_rate": 1.3438891906692447e-06, "loss": 0.6682, "step": 1147 }, { "epoch": 0.7686642115835286, "grad_norm": 1.6538894238413249, "learning_rate": 1.33649792544911e-06, "loss": 0.7185, "step": 1148 }, { "epoch": 0.7693337797120857, "grad_norm": 1.7342717631399374, "learning_rate": 1.3291239050580085e-06, "loss": 0.6291, "step": 1149 }, { "epoch": 0.7700033478406428, "grad_norm": 2.283889992878715, "learning_rate": 1.3217671642069163e-06, "loss": 0.7602, "step": 1150 }, { "epoch": 0.7706729159691998, "grad_norm": 1.955439428945688, "learning_rate": 1.3144277375254643e-06, "loss": 0.6544, "step": 1151 }, { "epoch": 0.771342484097757, "grad_norm": 1.9915397584187229, "learning_rate": 1.3071056595617877e-06, "loss": 0.7487, "step": 1152 }, { "epoch": 0.7720120522263141, "grad_norm": 1.7357510870042048, "learning_rate": 1.2998009647823545e-06, "loss": 0.7658, "step": 1153 }, { "epoch": 0.7726816203548711, "grad_norm": 2.8862162741072406, "learning_rate": 1.2925136875718102e-06, "loss": 0.7384, "step": 1154 }, { "epoch": 0.7733511884834282, "grad_norm": 2.2148883053809807, "learning_rate": 1.28524386223281e-06, "loss": 0.7738, "step": 1155 }, { "epoch": 0.7740207566119852, "grad_norm": 2.679593306958505, "learning_rate": 1.277991522985857e-06, "loss": 0.7078, "step": 1156 }, { "epoch": 0.7746903247405423, "grad_norm": 2.703404403482204, "learning_rate": 1.2707567039691505e-06, "loss": 0.6962, "step": 1157 }, { "epoch": 0.7753598928690995, "grad_norm": 2.426519992083522, "learning_rate": 1.2635394392384142e-06, "loss": 0.6488, "step": 1158 }, { "epoch": 0.7760294609976565, "grad_norm": 2.2205259511164344, "learning_rate": 1.2563397627667395e-06, "loss": 0.6747, "step": 1159 }, { "epoch": 0.7766990291262136, "grad_norm": 2.667702018070368, "learning_rate": 1.2491577084444273e-06, "loss": 0.614, "step": 1160 }, { "epoch": 0.7773685972547707, "grad_norm": 2.8693434628870023, "learning_rate": 1.2419933100788323e-06, "loss": 0.6337, "step": 1161 }, { "epoch": 0.7780381653833277, "grad_norm": 3.1657159654081064, "learning_rate": 1.2348466013941907e-06, "loss": 0.6987, "step": 1162 }, { "epoch": 0.7787077335118848, "grad_norm": 2.4144666105794053, "learning_rate": 1.2277176160314762e-06, "loss": 0.6278, "step": 1163 }, { "epoch": 0.7793773016404419, "grad_norm": 2.480474416421138, "learning_rate": 1.2206063875482321e-06, "loss": 0.6373, "step": 1164 }, { "epoch": 0.780046869768999, "grad_norm": 1.8980082389745958, "learning_rate": 1.213512949418419e-06, "loss": 0.7, "step": 1165 }, { "epoch": 0.7807164378975561, "grad_norm": 2.027654474275916, "learning_rate": 1.206437335032254e-06, "loss": 0.7229, "step": 1166 }, { "epoch": 0.7813860060261132, "grad_norm": 1.9307405713920822, "learning_rate": 1.1993795776960498e-06, "loss": 0.6785, "step": 1167 }, { "epoch": 0.7820555741546702, "grad_norm": 1.8612431055557115, "learning_rate": 1.192339710632071e-06, "loss": 0.6426, "step": 1168 }, { "epoch": 0.7827251422832273, "grad_norm": 3.0932347003599063, "learning_rate": 1.1853177669783645e-06, "loss": 0.6892, "step": 1169 }, { "epoch": 0.7833947104117844, "grad_norm": 2.0047642834121024, "learning_rate": 1.1783137797886052e-06, "loss": 0.7268, "step": 1170 }, { "epoch": 0.7840642785403414, "grad_norm": 1.607647150870135, "learning_rate": 1.1713277820319464e-06, "loss": 0.7011, "step": 1171 }, { "epoch": 0.7847338466688986, "grad_norm": 1.6072520306912557, "learning_rate": 1.1643598065928662e-06, "loss": 0.6352, "step": 1172 }, { "epoch": 0.7854034147974557, "grad_norm": 2.4166151081860976, "learning_rate": 1.1574098862709993e-06, "loss": 0.6554, "step": 1173 }, { "epoch": 0.7860729829260127, "grad_norm": 1.9696038690923232, "learning_rate": 1.1504780537809963e-06, "loss": 0.7174, "step": 1174 }, { "epoch": 0.7867425510545698, "grad_norm": 2.792532141939864, "learning_rate": 1.1435643417523646e-06, "loss": 0.7467, "step": 1175 }, { "epoch": 0.7874121191831269, "grad_norm": 1.7742426249022032, "learning_rate": 1.136668782729315e-06, "loss": 0.6801, "step": 1176 }, { "epoch": 0.7880816873116839, "grad_norm": 1.8730845202324546, "learning_rate": 1.1297914091706086e-06, "loss": 0.6803, "step": 1177 }, { "epoch": 0.788751255440241, "grad_norm": 1.9175295006021627, "learning_rate": 1.1229322534494008e-06, "loss": 0.6759, "step": 1178 }, { "epoch": 0.7894208235687982, "grad_norm": 2.0502465746016743, "learning_rate": 1.1160913478530944e-06, "loss": 0.7516, "step": 1179 }, { "epoch": 0.7900903916973552, "grad_norm": 1.8623972185439832, "learning_rate": 1.1092687245831896e-06, "loss": 0.7298, "step": 1180 }, { "epoch": 0.7907599598259123, "grad_norm": 2.341932106216594, "learning_rate": 1.1024644157551206e-06, "loss": 0.7348, "step": 1181 }, { "epoch": 0.7914295279544694, "grad_norm": 1.9628147359598207, "learning_rate": 1.0956784533981153e-06, "loss": 0.6735, "step": 1182 }, { "epoch": 0.7920990960830264, "grad_norm": 3.214658613248316, "learning_rate": 1.0889108694550416e-06, "loss": 0.7222, "step": 1183 }, { "epoch": 0.7927686642115835, "grad_norm": 2.821219794751604, "learning_rate": 1.0821616957822562e-06, "loss": 0.69, "step": 1184 }, { "epoch": 0.7934382323401407, "grad_norm": 1.8822351310193561, "learning_rate": 1.0754309641494543e-06, "loss": 0.6977, "step": 1185 }, { "epoch": 0.7941078004686977, "grad_norm": 7.841786357093528, "learning_rate": 1.0687187062395216e-06, "loss": 0.6994, "step": 1186 }, { "epoch": 0.7947773685972548, "grad_norm": 3.7735400256410934, "learning_rate": 1.062024953648384e-06, "loss": 0.7688, "step": 1187 }, { "epoch": 0.7954469367258119, "grad_norm": 2.0550649561128305, "learning_rate": 1.0553497378848587e-06, "loss": 0.7582, "step": 1188 }, { "epoch": 0.7961165048543689, "grad_norm": 2.1203511585281865, "learning_rate": 1.0486930903705095e-06, "loss": 0.7204, "step": 1189 }, { "epoch": 0.796786072982926, "grad_norm": 1.9967166895100035, "learning_rate": 1.0420550424394876e-06, "loss": 0.7121, "step": 1190 }, { "epoch": 0.7974556411114831, "grad_norm": 2.1021850393099877, "learning_rate": 1.035435625338404e-06, "loss": 0.6489, "step": 1191 }, { "epoch": 0.7981252092400402, "grad_norm": 1.749343915353323, "learning_rate": 1.0288348702261608e-06, "loss": 0.682, "step": 1192 }, { "epoch": 0.7987947773685973, "grad_norm": 2.0692489014966924, "learning_rate": 1.0222528081738186e-06, "loss": 0.7302, "step": 1193 }, { "epoch": 0.7994643454971543, "grad_norm": 2.148561526116731, "learning_rate": 1.015689470164447e-06, "loss": 0.7171, "step": 1194 }, { "epoch": 0.8001339136257114, "grad_norm": 2.200944222783882, "learning_rate": 1.0091448870929749e-06, "loss": 0.6877, "step": 1195 }, { "epoch": 0.8008034817542685, "grad_norm": 1.7788322006690402, "learning_rate": 1.002619089766051e-06, "loss": 0.6847, "step": 1196 }, { "epoch": 0.8014730498828255, "grad_norm": 2.239900527722847, "learning_rate": 9.961121089018933e-07, "loss": 0.6376, "step": 1197 }, { "epoch": 0.8021426180113826, "grad_norm": 1.7542734870550734, "learning_rate": 9.896239751301484e-07, "loss": 0.6788, "step": 1198 }, { "epoch": 0.8028121861399398, "grad_norm": 2.19788770637949, "learning_rate": 9.831547189917456e-07, "loss": 0.6212, "step": 1199 }, { "epoch": 0.8034817542684968, "grad_norm": 3.085549995516455, "learning_rate": 9.767043709387546e-07, "loss": 0.7096, "step": 1200 }, { "epoch": 0.8034817542684968, "eval_loss": 0.6933080554008484, "eval_runtime": 480.9738, "eval_samples_per_second": 41.813, "eval_steps_per_second": 0.655, "step": 1200 }, { "epoch": 0.8041513223970539, "grad_norm": 1.8077256072809937, "learning_rate": 9.70272961334236e-07, "loss": 0.595, "step": 1201 }, { "epoch": 0.804820890525611, "grad_norm": 1.7901684770251451, "learning_rate": 9.638605204521112e-07, "loss": 0.7136, "step": 1202 }, { "epoch": 0.805490458654168, "grad_norm": 1.8780852109779311, "learning_rate": 9.57467078477008e-07, "loss": 0.702, "step": 1203 }, { "epoch": 0.8061600267827251, "grad_norm": 1.8747298807567359, "learning_rate": 9.510926655041191e-07, "loss": 0.647, "step": 1204 }, { "epoch": 0.8068295949112823, "grad_norm": 1.986948392873154, "learning_rate": 9.447373115390702e-07, "loss": 0.7122, "step": 1205 }, { "epoch": 0.8074991630398393, "grad_norm": 2.4245372025590024, "learning_rate": 9.384010464977688e-07, "loss": 0.6729, "step": 1206 }, { "epoch": 0.8081687311683964, "grad_norm": 1.91863118372097, "learning_rate": 9.320839002062682e-07, "loss": 0.6864, "step": 1207 }, { "epoch": 0.8088382992969535, "grad_norm": 2.3277985849816463, "learning_rate": 9.257859024006272e-07, "loss": 0.708, "step": 1208 }, { "epoch": 0.8095078674255105, "grad_norm": 1.7582716968256715, "learning_rate": 9.195070827267633e-07, "loss": 0.646, "step": 1209 }, { "epoch": 0.8101774355540676, "grad_norm": 1.9771178120173754, "learning_rate": 9.132474707403272e-07, "loss": 0.7064, "step": 1210 }, { "epoch": 0.8108470036826247, "grad_norm": 1.873140215158739, "learning_rate": 9.070070959065502e-07, "loss": 0.6185, "step": 1211 }, { "epoch": 0.8115165718111818, "grad_norm": 2.0641048414260275, "learning_rate": 9.007859876001091e-07, "loss": 0.6957, "step": 1212 }, { "epoch": 0.8121861399397389, "grad_norm": 2.037387480158615, "learning_rate": 8.945841751049916e-07, "loss": 0.6329, "step": 1213 }, { "epoch": 0.812855708068296, "grad_norm": 2.92625734519654, "learning_rate": 8.884016876143592e-07, "loss": 0.676, "step": 1214 }, { "epoch": 0.813525276196853, "grad_norm": 1.5155750021120227, "learning_rate": 8.822385542304007e-07, "loss": 0.624, "step": 1215 }, { "epoch": 0.8141948443254101, "grad_norm": 2.306366227817668, "learning_rate": 8.760948039642048e-07, "loss": 0.6795, "step": 1216 }, { "epoch": 0.8148644124539672, "grad_norm": 2.381142958645941, "learning_rate": 8.699704657356195e-07, "loss": 0.6622, "step": 1217 }, { "epoch": 0.8155339805825242, "grad_norm": 2.488204745910118, "learning_rate": 8.638655683731168e-07, "loss": 0.6322, "step": 1218 }, { "epoch": 0.8162035487110814, "grad_norm": 1.764658473681586, "learning_rate": 8.577801406136577e-07, "loss": 0.6143, "step": 1219 }, { "epoch": 0.8168731168396385, "grad_norm": 2.433036280163444, "learning_rate": 8.517142111025506e-07, "loss": 0.7024, "step": 1220 }, { "epoch": 0.8175426849681955, "grad_norm": 3.2214021196132787, "learning_rate": 8.45667808393329e-07, "loss": 0.6873, "step": 1221 }, { "epoch": 0.8182122530967526, "grad_norm": 2.2754834138123625, "learning_rate": 8.396409609476075e-07, "loss": 0.7142, "step": 1222 }, { "epoch": 0.8188818212253097, "grad_norm": 3.145478529679041, "learning_rate": 8.336336971349462e-07, "loss": 0.6876, "step": 1223 }, { "epoch": 0.8195513893538667, "grad_norm": 1.8884721608142852, "learning_rate": 8.276460452327245e-07, "loss": 0.6938, "step": 1224 }, { "epoch": 0.8202209574824239, "grad_norm": 2.1671176025176107, "learning_rate": 8.216780334260088e-07, "loss": 0.6718, "step": 1225 }, { "epoch": 0.820890525610981, "grad_norm": 2.4482670290012556, "learning_rate": 8.157296898074068e-07, "loss": 0.6292, "step": 1226 }, { "epoch": 0.821560093739538, "grad_norm": 1.8159863403266467, "learning_rate": 8.098010423769503e-07, "loss": 0.7188, "step": 1227 }, { "epoch": 0.8222296618680951, "grad_norm": 2.055293382169056, "learning_rate": 8.038921190419557e-07, "loss": 0.6734, "step": 1228 }, { "epoch": 0.8228992299966521, "grad_norm": 2.033187571588303, "learning_rate": 7.980029476168943e-07, "loss": 0.6439, "step": 1229 }, { "epoch": 0.8235687981252092, "grad_norm": 1.7809749812766447, "learning_rate": 7.921335558232618e-07, "loss": 0.6623, "step": 1230 }, { "epoch": 0.8242383662537663, "grad_norm": 2.1771713830893136, "learning_rate": 7.862839712894427e-07, "loss": 0.7209, "step": 1231 }, { "epoch": 0.8249079343823233, "grad_norm": 2.096735219404522, "learning_rate": 7.804542215505934e-07, "loss": 0.7307, "step": 1232 }, { "epoch": 0.8255775025108805, "grad_norm": 1.8495146859305425, "learning_rate": 7.746443340484983e-07, "loss": 0.7459, "step": 1233 }, { "epoch": 0.8262470706394376, "grad_norm": 2.3764807485467974, "learning_rate": 7.688543361314449e-07, "loss": 0.7154, "step": 1234 }, { "epoch": 0.8269166387679946, "grad_norm": 1.884901571448974, "learning_rate": 7.630842550540996e-07, "loss": 0.728, "step": 1235 }, { "epoch": 0.8275862068965517, "grad_norm": 1.820929955867681, "learning_rate": 7.573341179773785e-07, "loss": 0.6461, "step": 1236 }, { "epoch": 0.8282557750251088, "grad_norm": 2.3943789332068595, "learning_rate": 7.516039519683105e-07, "loss": 0.6981, "step": 1237 }, { "epoch": 0.8289253431536658, "grad_norm": 1.7421744330769127, "learning_rate": 7.458937839999231e-07, "loss": 0.6495, "step": 1238 }, { "epoch": 0.829594911282223, "grad_norm": 1.8866960356013724, "learning_rate": 7.402036409511054e-07, "loss": 0.6812, "step": 1239 }, { "epoch": 0.8302644794107801, "grad_norm": 2.09729216610623, "learning_rate": 7.345335496064865e-07, "loss": 0.7207, "step": 1240 }, { "epoch": 0.8309340475393371, "grad_norm": 1.552516551952114, "learning_rate": 7.288835366563102e-07, "loss": 0.6116, "step": 1241 }, { "epoch": 0.8316036156678942, "grad_norm": 2.001419517986908, "learning_rate": 7.232536286963021e-07, "loss": 0.6452, "step": 1242 }, { "epoch": 0.8322731837964513, "grad_norm": 1.887036470609978, "learning_rate": 7.176438522275525e-07, "loss": 0.6821, "step": 1243 }, { "epoch": 0.8329427519250083, "grad_norm": 1.8093578932559364, "learning_rate": 7.120542336563935e-07, "loss": 0.6614, "step": 1244 }, { "epoch": 0.8336123200535654, "grad_norm": 2.6892900748664683, "learning_rate": 7.064847992942614e-07, "loss": 0.7393, "step": 1245 }, { "epoch": 0.8342818881821226, "grad_norm": 2.718630182241123, "learning_rate": 7.009355753575869e-07, "loss": 0.6703, "step": 1246 }, { "epoch": 0.8349514563106796, "grad_norm": 1.8146915863842343, "learning_rate": 6.954065879676653e-07, "loss": 0.5525, "step": 1247 }, { "epoch": 0.8356210244392367, "grad_norm": 1.9588652797536275, "learning_rate": 6.898978631505332e-07, "loss": 0.6571, "step": 1248 }, { "epoch": 0.8362905925677938, "grad_norm": 1.7756252863683, "learning_rate": 6.844094268368484e-07, "loss": 0.6425, "step": 1249 }, { "epoch": 0.8369601606963508, "grad_norm": 1.6536051998373236, "learning_rate": 6.789413048617672e-07, "loss": 0.6869, "step": 1250 }, { "epoch": 0.8376297288249079, "grad_norm": 2.0934928852149315, "learning_rate": 6.734935229648204e-07, "loss": 0.6356, "step": 1251 }, { "epoch": 0.8382992969534651, "grad_norm": 2.104565967434682, "learning_rate": 6.680661067897958e-07, "loss": 0.7791, "step": 1252 }, { "epoch": 0.8389688650820221, "grad_norm": 2.0660017415916014, "learning_rate": 6.626590818846163e-07, "loss": 0.712, "step": 1253 }, { "epoch": 0.8396384332105792, "grad_norm": 1.9530809102507656, "learning_rate": 6.572724737012142e-07, "loss": 0.6712, "step": 1254 }, { "epoch": 0.8403080013391363, "grad_norm": 1.9595437625190206, "learning_rate": 6.519063075954246e-07, "loss": 0.7246, "step": 1255 }, { "epoch": 0.8409775694676933, "grad_norm": 2.3649056755490085, "learning_rate": 6.465606088268489e-07, "loss": 0.6855, "step": 1256 }, { "epoch": 0.8416471375962504, "grad_norm": 2.018269953369677, "learning_rate": 6.412354025587509e-07, "loss": 0.6988, "step": 1257 }, { "epoch": 0.8423167057248075, "grad_norm": 1.8495617742153176, "learning_rate": 6.359307138579295e-07, "loss": 0.6566, "step": 1258 }, { "epoch": 0.8429862738533646, "grad_norm": 1.8169963768887052, "learning_rate": 6.306465676946038e-07, "loss": 0.6679, "step": 1259 }, { "epoch": 0.8436558419819217, "grad_norm": 2.20444109733784, "learning_rate": 6.253829889422952e-07, "loss": 0.7141, "step": 1260 }, { "epoch": 0.8443254101104788, "grad_norm": 1.997611278274971, "learning_rate": 6.201400023777105e-07, "loss": 0.662, "step": 1261 }, { "epoch": 0.8449949782390358, "grad_norm": 1.978565602851472, "learning_rate": 6.149176326806238e-07, "loss": 0.6997, "step": 1262 }, { "epoch": 0.8456645463675929, "grad_norm": 1.9176686753471432, "learning_rate": 6.097159044337641e-07, "loss": 0.6907, "step": 1263 }, { "epoch": 0.84633411449615, "grad_norm": 1.969088801932822, "learning_rate": 6.045348421226949e-07, "loss": 0.6696, "step": 1264 }, { "epoch": 0.847003682624707, "grad_norm": 1.6970441885965069, "learning_rate": 5.993744701356991e-07, "loss": 0.6474, "step": 1265 }, { "epoch": 0.8476732507532642, "grad_norm": 1.714102826946514, "learning_rate": 5.94234812763671e-07, "loss": 0.6063, "step": 1266 }, { "epoch": 0.8483428188818212, "grad_norm": 2.299616104726728, "learning_rate": 5.891158941999959e-07, "loss": 0.6865, "step": 1267 }, { "epoch": 0.8490123870103783, "grad_norm": 2.050626911273336, "learning_rate": 5.840177385404328e-07, "loss": 0.6815, "step": 1268 }, { "epoch": 0.8496819551389354, "grad_norm": 2.0592299797510814, "learning_rate": 5.789403697830104e-07, "loss": 0.709, "step": 1269 }, { "epoch": 0.8503515232674924, "grad_norm": 2.093661088065307, "learning_rate": 5.738838118279083e-07, "loss": 0.7097, "step": 1270 }, { "epoch": 0.8510210913960495, "grad_norm": 3.557387446768381, "learning_rate": 5.688480884773445e-07, "loss": 0.7058, "step": 1271 }, { "epoch": 0.8516906595246067, "grad_norm": 1.919202965495722, "learning_rate": 5.638332234354671e-07, "loss": 0.6641, "step": 1272 }, { "epoch": 0.8523602276531637, "grad_norm": 1.8992261682240739, "learning_rate": 5.588392403082338e-07, "loss": 0.6247, "step": 1273 }, { "epoch": 0.8530297957817208, "grad_norm": 1.890115567433402, "learning_rate": 5.538661626033149e-07, "loss": 0.6645, "step": 1274 }, { "epoch": 0.8536993639102779, "grad_norm": 1.9374419613746152, "learning_rate": 5.489140137299709e-07, "loss": 0.6934, "step": 1275 }, { "epoch": 0.8543689320388349, "grad_norm": 2.735947899158765, "learning_rate": 5.439828169989442e-07, "loss": 0.7537, "step": 1276 }, { "epoch": 0.855038500167392, "grad_norm": 1.8178254969699044, "learning_rate": 5.390725956223531e-07, "loss": 0.6898, "step": 1277 }, { "epoch": 0.8557080682959491, "grad_norm": 1.9770675220620806, "learning_rate": 5.341833727135847e-07, "loss": 0.6724, "step": 1278 }, { "epoch": 0.8563776364245062, "grad_norm": 2.350462017942159, "learning_rate": 5.293151712871747e-07, "loss": 0.6653, "step": 1279 }, { "epoch": 0.8570472045530633, "grad_norm": 1.915267662458703, "learning_rate": 5.244680142587116e-07, "loss": 0.7271, "step": 1280 }, { "epoch": 0.8577167726816204, "grad_norm": 2.190907082611833, "learning_rate": 5.196419244447232e-07, "loss": 0.6103, "step": 1281 }, { "epoch": 0.8583863408101774, "grad_norm": 2.3433096008940146, "learning_rate": 5.148369245625679e-07, "loss": 0.7347, "step": 1282 }, { "epoch": 0.8590559089387345, "grad_norm": 2.586789151005052, "learning_rate": 5.10053037230332e-07, "loss": 0.7002, "step": 1283 }, { "epoch": 0.8597254770672916, "grad_norm": 2.0870244855436315, "learning_rate": 5.052902849667169e-07, "loss": 0.6463, "step": 1284 }, { "epoch": 0.8603950451958486, "grad_norm": 2.526616869274362, "learning_rate": 5.005486901909429e-07, "loss": 0.7345, "step": 1285 }, { "epoch": 0.8610646133244058, "grad_norm": 1.8111574384085571, "learning_rate": 4.95828275222634e-07, "loss": 0.7447, "step": 1286 }, { "epoch": 0.8617341814529629, "grad_norm": 1.9689157373472548, "learning_rate": 4.911290622817161e-07, "loss": 0.7022, "step": 1287 }, { "epoch": 0.8624037495815199, "grad_norm": 2.1244951590281245, "learning_rate": 4.864510734883137e-07, "loss": 0.671, "step": 1288 }, { "epoch": 0.863073317710077, "grad_norm": 2.016133679319044, "learning_rate": 4.817943308626488e-07, "loss": 0.7103, "step": 1289 }, { "epoch": 0.8637428858386341, "grad_norm": 2.309812381896314, "learning_rate": 4.771588563249269e-07, "loss": 0.6314, "step": 1290 }, { "epoch": 0.8644124539671911, "grad_norm": 2.027542178627591, "learning_rate": 4.725446716952448e-07, "loss": 0.6646, "step": 1291 }, { "epoch": 0.8650820220957482, "grad_norm": 2.1058874088630426, "learning_rate": 4.679517986934823e-07, "loss": 0.6248, "step": 1292 }, { "epoch": 0.8657515902243054, "grad_norm": 2.1190277482676416, "learning_rate": 4.6338025893920167e-07, "loss": 0.6971, "step": 1293 }, { "epoch": 0.8664211583528624, "grad_norm": 2.237608865300303, "learning_rate": 4.588300739515456e-07, "loss": 0.6783, "step": 1294 }, { "epoch": 0.8670907264814195, "grad_norm": 1.7467687035944168, "learning_rate": 4.543012651491324e-07, "loss": 0.5972, "step": 1295 }, { "epoch": 0.8677602946099766, "grad_norm": 2.060528408514549, "learning_rate": 4.4979385384996353e-07, "loss": 0.7185, "step": 1296 }, { "epoch": 0.8684298627385336, "grad_norm": 3.259689193996602, "learning_rate": 4.4530786127131575e-07, "loss": 0.7207, "step": 1297 }, { "epoch": 0.8690994308670907, "grad_norm": 1.794593480000491, "learning_rate": 4.408433085296421e-07, "loss": 0.7624, "step": 1298 }, { "epoch": 0.8697689989956479, "grad_norm": 1.8050625798891218, "learning_rate": 4.364002166404757e-07, "loss": 0.7306, "step": 1299 }, { "epoch": 0.8704385671242049, "grad_norm": 2.365053228533847, "learning_rate": 4.319786065183301e-07, "loss": 0.7045, "step": 1300 }, { "epoch": 0.871108135252762, "grad_norm": 1.859352466159344, "learning_rate": 4.275784989765985e-07, "loss": 0.6822, "step": 1301 }, { "epoch": 0.8717777033813191, "grad_norm": 1.7729328489242062, "learning_rate": 4.2319991472745734e-07, "loss": 0.7026, "step": 1302 }, { "epoch": 0.8724472715098761, "grad_norm": 1.9262600751960055, "learning_rate": 4.188428743817691e-07, "loss": 0.6882, "step": 1303 }, { "epoch": 0.8731168396384332, "grad_norm": 2.2004460389495613, "learning_rate": 4.145073984489845e-07, "loss": 0.6802, "step": 1304 }, { "epoch": 0.8737864077669902, "grad_norm": 2.3596506523239333, "learning_rate": 4.1019350733704665e-07, "loss": 0.7159, "step": 1305 }, { "epoch": 0.8744559758955474, "grad_norm": 2.2253012462733874, "learning_rate": 4.059012213522956e-07, "loss": 0.7921, "step": 1306 }, { "epoch": 0.8751255440241045, "grad_norm": 1.8422754353124646, "learning_rate": 4.0163056069936757e-07, "loss": 0.602, "step": 1307 }, { "epoch": 0.8757951121526615, "grad_norm": 1.9813204606942567, "learning_rate": 3.9738154548111087e-07, "loss": 0.6429, "step": 1308 }, { "epoch": 0.8764646802812186, "grad_norm": 1.9262456867666178, "learning_rate": 3.93154195698478e-07, "loss": 0.6749, "step": 1309 }, { "epoch": 0.8771342484097757, "grad_norm": 1.5479630236924127, "learning_rate": 3.889485312504415e-07, "loss": 0.7169, "step": 1310 }, { "epoch": 0.8778038165383327, "grad_norm": 2.025692918996375, "learning_rate": 3.847645719338966e-07, "loss": 0.736, "step": 1311 }, { "epoch": 0.8784733846668898, "grad_norm": 1.8444088544390027, "learning_rate": 3.8060233744356634e-07, "loss": 0.6518, "step": 1312 }, { "epoch": 0.879142952795447, "grad_norm": 2.7540122057997127, "learning_rate": 3.7646184737191284e-07, "loss": 0.7023, "step": 1313 }, { "epoch": 0.879812520924004, "grad_norm": 1.8623407745824738, "learning_rate": 3.723431212090417e-07, "loss": 0.743, "step": 1314 }, { "epoch": 0.8804820890525611, "grad_norm": 2.247066106711546, "learning_rate": 3.682461783426122e-07, "loss": 0.6981, "step": 1315 }, { "epoch": 0.8811516571811182, "grad_norm": 1.67912933621989, "learning_rate": 3.64171038057744e-07, "loss": 0.6611, "step": 1316 }, { "epoch": 0.8818212253096752, "grad_norm": 1.6797872840763144, "learning_rate": 3.6011771953693044e-07, "loss": 0.6302, "step": 1317 }, { "epoch": 0.8824907934382323, "grad_norm": 2.1751751198474665, "learning_rate": 3.5608624185994033e-07, "loss": 0.7239, "step": 1318 }, { "epoch": 0.8831603615667895, "grad_norm": 1.756452603226379, "learning_rate": 3.5207662400374097e-07, "loss": 0.6724, "step": 1319 }, { "epoch": 0.8838299296953465, "grad_norm": 1.9251531813322504, "learning_rate": 3.4808888484239355e-07, "loss": 0.739, "step": 1320 }, { "epoch": 0.8844994978239036, "grad_norm": 1.9019667712389343, "learning_rate": 3.441230431469761e-07, "loss": 0.7249, "step": 1321 }, { "epoch": 0.8851690659524607, "grad_norm": 1.9252601368496347, "learning_rate": 3.401791175854907e-07, "loss": 0.7037, "step": 1322 }, { "epoch": 0.8858386340810177, "grad_norm": 2.008470661809516, "learning_rate": 3.3625712672277435e-07, "loss": 0.6694, "step": 1323 }, { "epoch": 0.8865082022095748, "grad_norm": 2.5710916434103734, "learning_rate": 3.3235708902041473e-07, "loss": 0.6758, "step": 1324 }, { "epoch": 0.8871777703381319, "grad_norm": 1.873681543814161, "learning_rate": 3.284790228366602e-07, "loss": 0.7531, "step": 1325 }, { "epoch": 0.887847338466689, "grad_norm": 1.7888287043265299, "learning_rate": 3.246229464263362e-07, "loss": 0.6892, "step": 1326 }, { "epoch": 0.8885169065952461, "grad_norm": 1.8072318356310084, "learning_rate": 3.20788877940757e-07, "loss": 0.6701, "step": 1327 }, { "epoch": 0.8891864747238032, "grad_norm": 2.245176599194289, "learning_rate": 3.1697683542764145e-07, "loss": 0.7286, "step": 1328 }, { "epoch": 0.8898560428523602, "grad_norm": 1.8677760335321416, "learning_rate": 3.131868368310276e-07, "loss": 0.671, "step": 1329 }, { "epoch": 0.8905256109809173, "grad_norm": 3.2092219938983413, "learning_rate": 3.094188999911879e-07, "loss": 0.716, "step": 1330 }, { "epoch": 0.8911951791094744, "grad_norm": 2.46189922484653, "learning_rate": 3.0567304264454864e-07, "loss": 0.7523, "step": 1331 }, { "epoch": 0.8918647472380314, "grad_norm": 2.7720178143257335, "learning_rate": 3.0194928242359976e-07, "loss": 0.7104, "step": 1332 }, { "epoch": 0.8925343153665886, "grad_norm": 1.7545018089070816, "learning_rate": 2.982476368568177e-07, "loss": 0.6602, "step": 1333 }, { "epoch": 0.8932038834951457, "grad_norm": 2.199946229775323, "learning_rate": 2.9456812336857987e-07, "loss": 0.6269, "step": 1334 }, { "epoch": 0.8938734516237027, "grad_norm": 2.0517256989078834, "learning_rate": 2.9091075927908497e-07, "loss": 0.7011, "step": 1335 }, { "epoch": 0.8945430197522598, "grad_norm": 2.159591690521184, "learning_rate": 2.872755618042705e-07, "loss": 0.6686, "step": 1336 }, { "epoch": 0.8952125878808169, "grad_norm": 2.2930701405014218, "learning_rate": 2.836625480557265e-07, "loss": 0.7237, "step": 1337 }, { "epoch": 0.8958821560093739, "grad_norm": 2.272370977799439, "learning_rate": 2.800717350406268e-07, "loss": 0.7172, "step": 1338 }, { "epoch": 0.896551724137931, "grad_norm": 2.842038207775454, "learning_rate": 2.7650313966163853e-07, "loss": 0.7033, "step": 1339 }, { "epoch": 0.8972212922664882, "grad_norm": 1.8649079159679345, "learning_rate": 2.7295677871684413e-07, "loss": 0.6764, "step": 1340 }, { "epoch": 0.8978908603950452, "grad_norm": 1.7990530941600642, "learning_rate": 2.6943266889966624e-07, "loss": 0.6959, "step": 1341 }, { "epoch": 0.8985604285236023, "grad_norm": 2.8378980492196293, "learning_rate": 2.659308267987898e-07, "loss": 0.6606, "step": 1342 }, { "epoch": 0.8992299966521593, "grad_norm": 2.0943825423730575, "learning_rate": 2.624512688980757e-07, "loss": 0.698, "step": 1343 }, { "epoch": 0.8998995647807164, "grad_norm": 1.8099905423689238, "learning_rate": 2.5899401157649217e-07, "loss": 0.7304, "step": 1344 }, { "epoch": 0.9005691329092735, "grad_norm": 2.1625967949541827, "learning_rate": 2.5555907110803356e-07, "loss": 0.7184, "step": 1345 }, { "epoch": 0.9012387010378305, "grad_norm": 2.0867918060356914, "learning_rate": 2.521464636616439e-07, "loss": 0.7082, "step": 1346 }, { "epoch": 0.9019082691663877, "grad_norm": 1.7038890869356877, "learning_rate": 2.487562053011422e-07, "loss": 0.6298, "step": 1347 }, { "epoch": 0.9025778372949448, "grad_norm": 2.2631868528820687, "learning_rate": 2.453883119851436e-07, "loss": 0.6544, "step": 1348 }, { "epoch": 0.9032474054235018, "grad_norm": 1.9999059149160727, "learning_rate": 2.4204279956698994e-07, "loss": 0.7001, "step": 1349 }, { "epoch": 0.9039169735520589, "grad_norm": 1.903356931558408, "learning_rate": 2.3871968379467035e-07, "loss": 0.6761, "step": 1350 }, { "epoch": 0.9039169735520589, "eval_loss": 0.690680980682373, "eval_runtime": 441.7868, "eval_samples_per_second": 45.522, "eval_steps_per_second": 0.713, "step": 1350 }, { "epoch": 0.904586541680616, "grad_norm": 2.0467141013843237, "learning_rate": 2.354189803107465e-07, "loss": 0.7184, "step": 1351 }, { "epoch": 0.905256109809173, "grad_norm": 1.7998261809713236, "learning_rate": 2.321407046522828e-07, "loss": 0.7211, "step": 1352 }, { "epoch": 0.9059256779377302, "grad_norm": 1.5838412020157826, "learning_rate": 2.288848722507736e-07, "loss": 0.685, "step": 1353 }, { "epoch": 0.9065952460662873, "grad_norm": 1.9043668566407579, "learning_rate": 2.25651498432064e-07, "loss": 0.7207, "step": 1354 }, { "epoch": 0.9072648141948443, "grad_norm": 1.9147037495050374, "learning_rate": 2.2244059841628419e-07, "loss": 0.701, "step": 1355 }, { "epoch": 0.9079343823234014, "grad_norm": 1.8201627848777142, "learning_rate": 2.192521873177772e-07, "loss": 0.633, "step": 1356 }, { "epoch": 0.9086039504519585, "grad_norm": 1.7099425725077149, "learning_rate": 2.1608628014502364e-07, "loss": 0.6969, "step": 1357 }, { "epoch": 0.9092735185805155, "grad_norm": 1.7105264618646547, "learning_rate": 2.1294289180057603e-07, "loss": 0.652, "step": 1358 }, { "epoch": 0.9099430867090726, "grad_norm": 1.6668309352336095, "learning_rate": 2.0982203708098393e-07, "loss": 0.6285, "step": 1359 }, { "epoch": 0.9106126548376298, "grad_norm": 1.7852965085398904, "learning_rate": 2.067237306767278e-07, "loss": 0.6954, "step": 1360 }, { "epoch": 0.9112822229661868, "grad_norm": 3.3173310345719127, "learning_rate": 2.0364798717215085e-07, "loss": 0.6588, "step": 1361 }, { "epoch": 0.9119517910947439, "grad_norm": 2.1954506146717794, "learning_rate": 2.0059482104538396e-07, "loss": 0.6401, "step": 1362 }, { "epoch": 0.912621359223301, "grad_norm": 2.227272874661717, "learning_rate": 1.9756424666828533e-07, "loss": 0.5764, "step": 1363 }, { "epoch": 0.913290927351858, "grad_norm": 2.0729372328812388, "learning_rate": 1.945562783063676e-07, "loss": 0.6366, "step": 1364 }, { "epoch": 0.9139604954804151, "grad_norm": 2.20751234565507, "learning_rate": 1.915709301187335e-07, "loss": 0.6863, "step": 1365 }, { "epoch": 0.9146300636089723, "grad_norm": 1.740765818023042, "learning_rate": 1.8860821615800717e-07, "loss": 0.638, "step": 1366 }, { "epoch": 0.9152996317375293, "grad_norm": 1.8255241860797013, "learning_rate": 1.8566815037026897e-07, "loss": 0.6357, "step": 1367 }, { "epoch": 0.9159691998660864, "grad_norm": 2.4914307455952813, "learning_rate": 1.827507465949907e-07, "loss": 0.6742, "step": 1368 }, { "epoch": 0.9166387679946435, "grad_norm": 2.726461440895067, "learning_rate": 1.7985601856496947e-07, "loss": 0.7147, "step": 1369 }, { "epoch": 0.9173083361232005, "grad_norm": 1.7071888944060425, "learning_rate": 1.769839799062628e-07, "loss": 0.6262, "step": 1370 }, { "epoch": 0.9179779042517576, "grad_norm": 1.8367387336842598, "learning_rate": 1.741346441381231e-07, "loss": 0.6156, "step": 1371 }, { "epoch": 0.9186474723803147, "grad_norm": 1.8177144862020809, "learning_rate": 1.7130802467293983e-07, "loss": 0.6935, "step": 1372 }, { "epoch": 0.9193170405088718, "grad_norm": 2.2407514244692432, "learning_rate": 1.6850413481616868e-07, "loss": 0.6673, "step": 1373 }, { "epoch": 0.9199866086374289, "grad_norm": 2.3041443301825364, "learning_rate": 1.6572298776627417e-07, "loss": 0.6773, "step": 1374 }, { "epoch": 0.920656176765986, "grad_norm": 2.177433904946369, "learning_rate": 1.6296459661466592e-07, "loss": 0.6773, "step": 1375 }, { "epoch": 0.921325744894543, "grad_norm": 2.2091109521091927, "learning_rate": 1.6022897434563644e-07, "loss": 0.7042, "step": 1376 }, { "epoch": 0.9219953130231001, "grad_norm": 1.6986351551994106, "learning_rate": 1.5751613383630128e-07, "loss": 0.6575, "step": 1377 }, { "epoch": 0.9226648811516572, "grad_norm": 2.171903872378935, "learning_rate": 1.548260878565372e-07, "loss": 0.6692, "step": 1378 }, { "epoch": 0.9233344492802142, "grad_norm": 2.052813475524822, "learning_rate": 1.5215884906892298e-07, "loss": 0.7056, "step": 1379 }, { "epoch": 0.9240040174087714, "grad_norm": 1.9771066494164902, "learning_rate": 1.4951443002867884e-07, "loss": 0.6865, "step": 1380 }, { "epoch": 0.9246735855373284, "grad_norm": 1.8962008521253326, "learning_rate": 1.468928431836092e-07, "loss": 0.6808, "step": 1381 }, { "epoch": 0.9253431536658855, "grad_norm": 1.81735955122608, "learning_rate": 1.4429410087404062e-07, "loss": 0.6342, "step": 1382 }, { "epoch": 0.9260127217944426, "grad_norm": 2.0152538937114723, "learning_rate": 1.4171821533276897e-07, "loss": 0.7009, "step": 1383 }, { "epoch": 0.9266822899229996, "grad_norm": 1.8237487691086767, "learning_rate": 1.3916519868499623e-07, "loss": 0.716, "step": 1384 }, { "epoch": 0.9273518580515567, "grad_norm": 2.003660376190248, "learning_rate": 1.3663506294827656e-07, "loss": 0.766, "step": 1385 }, { "epoch": 0.9280214261801139, "grad_norm": 1.6044899339736238, "learning_rate": 1.341278200324597e-07, "loss": 0.6838, "step": 1386 }, { "epoch": 0.9286909943086709, "grad_norm": 1.9304559729644348, "learning_rate": 1.3164348173963392e-07, "loss": 0.6633, "step": 1387 }, { "epoch": 0.929360562437228, "grad_norm": 2.1990717658450865, "learning_rate": 1.291820597640714e-07, "loss": 0.6454, "step": 1388 }, { "epoch": 0.9300301305657851, "grad_norm": 2.464509032822524, "learning_rate": 1.2674356569217282e-07, "loss": 0.6549, "step": 1389 }, { "epoch": 0.9306996986943421, "grad_norm": 1.6505748527218607, "learning_rate": 1.2432801100241033e-07, "loss": 0.6255, "step": 1390 }, { "epoch": 0.9313692668228992, "grad_norm": 2.3759991800709246, "learning_rate": 1.219354070652795e-07, "loss": 0.6688, "step": 1391 }, { "epoch": 0.9320388349514563, "grad_norm": 1.897267247526835, "learning_rate": 1.1956576514324025e-07, "loss": 0.6377, "step": 1392 }, { "epoch": 0.9327084030800133, "grad_norm": 1.6964394877338718, "learning_rate": 1.1721909639066498e-07, "loss": 0.6128, "step": 1393 }, { "epoch": 0.9333779712085705, "grad_norm": 2.194517533483398, "learning_rate": 1.1489541185378816e-07, "loss": 0.7132, "step": 1394 }, { "epoch": 0.9340475393371276, "grad_norm": 1.9758047152443419, "learning_rate": 1.1259472247065306e-07, "loss": 0.6426, "step": 1395 }, { "epoch": 0.9347171074656846, "grad_norm": 2.3114874781989245, "learning_rate": 1.103170390710595e-07, "loss": 0.7918, "step": 1396 }, { "epoch": 0.9353866755942417, "grad_norm": 2.5933402531490173, "learning_rate": 1.080623723765134e-07, "loss": 0.7075, "step": 1397 }, { "epoch": 0.9360562437227988, "grad_norm": 1.9145467238940987, "learning_rate": 1.0583073300017788e-07, "loss": 0.7448, "step": 1398 }, { "epoch": 0.9367258118513558, "grad_norm": 1.8300061685184454, "learning_rate": 1.0362213144681999e-07, "loss": 0.6933, "step": 1399 }, { "epoch": 0.937395379979913, "grad_norm": 2.42626629046995, "learning_rate": 1.0143657811276519e-07, "loss": 0.7035, "step": 1400 }, { "epoch": 0.9380649481084701, "grad_norm": 2.479460877431395, "learning_rate": 9.92740832858441e-08, "loss": 0.6877, "step": 1401 }, { "epoch": 0.9387345162370271, "grad_norm": 2.679566966056242, "learning_rate": 9.713465714534853e-08, "loss": 0.6105, "step": 1402 }, { "epoch": 0.9394040843655842, "grad_norm": 1.652106590299367, "learning_rate": 9.501830976198112e-08, "loss": 0.6456, "step": 1403 }, { "epoch": 0.9400736524941413, "grad_norm": 1.9619691700529782, "learning_rate": 9.292505109780636e-08, "loss": 0.6185, "step": 1404 }, { "epoch": 0.9407432206226983, "grad_norm": 2.5523885002203075, "learning_rate": 9.085489100620737e-08, "loss": 0.6937, "step": 1405 }, { "epoch": 0.9414127887512554, "grad_norm": 2.8169597093076013, "learning_rate": 8.880783923183811e-08, "loss": 0.6516, "step": 1406 }, { "epoch": 0.9420823568798126, "grad_norm": 2.211713929281273, "learning_rate": 8.678390541057512e-08, "loss": 0.7474, "step": 1407 }, { "epoch": 0.9427519250083696, "grad_norm": 3.0220975957771556, "learning_rate": 8.47830990694759e-08, "loss": 0.7202, "step": 1408 }, { "epoch": 0.9434214931369267, "grad_norm": 1.8112984401490213, "learning_rate": 8.280542962673166e-08, "loss": 0.7062, "step": 1409 }, { "epoch": 0.9440910612654838, "grad_norm": 1.976151721810344, "learning_rate": 8.085090639162407e-08, "loss": 0.5886, "step": 1410 }, { "epoch": 0.9447606293940408, "grad_norm": 2.132188652119974, "learning_rate": 7.891953856448032e-08, "loss": 0.6806, "step": 1411 }, { "epoch": 0.9454301975225979, "grad_norm": 2.8108559589224806, "learning_rate": 7.701133523662974e-08, "loss": 0.6709, "step": 1412 }, { "epoch": 0.9460997656511551, "grad_norm": 1.9744525219182456, "learning_rate": 7.512630539036502e-08, "loss": 0.6824, "step": 1413 }, { "epoch": 0.9467693337797121, "grad_norm": 2.5949814580873296, "learning_rate": 7.326445789889336e-08, "loss": 0.7003, "step": 1414 }, { "epoch": 0.9474389019082692, "grad_norm": 1.9847615152556148, "learning_rate": 7.142580152629918e-08, "loss": 0.6375, "step": 1415 }, { "epoch": 0.9481084700368263, "grad_norm": 2.3898083395040715, "learning_rate": 6.961034492750207e-08, "loss": 0.6966, "step": 1416 }, { "epoch": 0.9487780381653833, "grad_norm": 1.943913436449147, "learning_rate": 6.78180966482156e-08, "loss": 0.6409, "step": 1417 }, { "epoch": 0.9494476062939404, "grad_norm": 1.931846094705172, "learning_rate": 6.604906512490628e-08, "loss": 0.6427, "step": 1418 }, { "epoch": 0.9501171744224974, "grad_norm": 2.284158453865949, "learning_rate": 6.43032586847564e-08, "loss": 0.6173, "step": 1419 }, { "epoch": 0.9507867425510546, "grad_norm": 2.2066147923091832, "learning_rate": 6.258068554562124e-08, "loss": 0.6959, "step": 1420 }, { "epoch": 0.9514563106796117, "grad_norm": 1.946400881352082, "learning_rate": 6.088135381599414e-08, "loss": 0.6902, "step": 1421 }, { "epoch": 0.9521258788081687, "grad_norm": 1.918498819644815, "learning_rate": 5.920527149496591e-08, "loss": 0.6711, "step": 1422 }, { "epoch": 0.9527954469367258, "grad_norm": 1.7149155006121386, "learning_rate": 5.755244647218716e-08, "loss": 0.7366, "step": 1423 }, { "epoch": 0.9534650150652829, "grad_norm": 1.7726034649155824, "learning_rate": 5.592288652783162e-08, "loss": 0.6743, "step": 1424 }, { "epoch": 0.9541345831938399, "grad_norm": 2.2227511536420828, "learning_rate": 5.431659933256173e-08, "loss": 0.6306, "step": 1425 }, { "epoch": 0.954804151322397, "grad_norm": 1.8534284567845736, "learning_rate": 5.273359244748755e-08, "loss": 0.6984, "step": 1426 }, { "epoch": 0.9554737194509542, "grad_norm": 1.9841084402726694, "learning_rate": 5.117387332413737e-08, "loss": 0.6264, "step": 1427 }, { "epoch": 0.9561432875795112, "grad_norm": 1.6004831867918912, "learning_rate": 4.9637449304416055e-08, "loss": 0.6682, "step": 1428 }, { "epoch": 0.9568128557080683, "grad_norm": 1.8697069865099611, "learning_rate": 4.8124327620576726e-08, "loss": 0.6573, "step": 1429 }, { "epoch": 0.9574824238366254, "grad_norm": 2.335789480839822, "learning_rate": 4.6634515395181377e-08, "loss": 0.7413, "step": 1430 }, { "epoch": 0.9581519919651824, "grad_norm": 2.2689778948002077, "learning_rate": 4.516801964107198e-08, "loss": 0.7051, "step": 1431 }, { "epoch": 0.9588215600937395, "grad_norm": 2.871640847372224, "learning_rate": 4.3724847261333305e-08, "loss": 0.6826, "step": 1432 }, { "epoch": 0.9594911282222967, "grad_norm": 2.027730031035601, "learning_rate": 4.230500504926405e-08, "loss": 0.7367, "step": 1433 }, { "epoch": 0.9601606963508537, "grad_norm": 2.0736554285854916, "learning_rate": 4.0908499688341876e-08, "loss": 0.6373, "step": 1434 }, { "epoch": 0.9608302644794108, "grad_norm": 2.1171480385964783, "learning_rate": 3.953533775219343e-08, "loss": 0.6548, "step": 1435 }, { "epoch": 0.9614998326079679, "grad_norm": 2.028009544125441, "learning_rate": 3.8185525704564354e-08, "loss": 0.7201, "step": 1436 }, { "epoch": 0.9621694007365249, "grad_norm": 2.3493379801051244, "learning_rate": 3.685906989928656e-08, "loss": 0.7074, "step": 1437 }, { "epoch": 0.962838968865082, "grad_norm": 2.3104176567790486, "learning_rate": 3.55559765802499e-08, "loss": 0.7635, "step": 1438 }, { "epoch": 0.9635085369936391, "grad_norm": 1.6351896411739504, "learning_rate": 3.4276251881372734e-08, "loss": 0.6612, "step": 1439 }, { "epoch": 0.9641781051221961, "grad_norm": 1.6097716220562948, "learning_rate": 3.3019901826572e-08, "loss": 0.695, "step": 1440 }, { "epoch": 0.9648476732507533, "grad_norm": 1.6040754664751462, "learning_rate": 3.178693232973651e-08, "loss": 0.6385, "step": 1441 }, { "epoch": 0.9655172413793104, "grad_norm": 1.772345909730028, "learning_rate": 3.057734919469757e-08, "loss": 0.6406, "step": 1442 }, { "epoch": 0.9661868095078674, "grad_norm": 2.7206752130156366, "learning_rate": 2.939115811520121e-08, "loss": 0.7328, "step": 1443 }, { "epoch": 0.9668563776364245, "grad_norm": 1.9294098115337348, "learning_rate": 2.822836467488488e-08, "loss": 0.674, "step": 1444 }, { "epoch": 0.9675259457649816, "grad_norm": 2.125685395569805, "learning_rate": 2.7088974347246888e-08, "loss": 0.705, "step": 1445 }, { "epoch": 0.9681955138935386, "grad_norm": 1.8160788504989789, "learning_rate": 2.5972992495622573e-08, "loss": 0.6044, "step": 1446 }, { "epoch": 0.9688650820220958, "grad_norm": 1.8481597753827999, "learning_rate": 2.488042437315985e-08, "loss": 0.6763, "step": 1447 }, { "epoch": 0.9695346501506529, "grad_norm": 1.864987702285682, "learning_rate": 2.3811275122792575e-08, "loss": 0.7047, "step": 1448 }, { "epoch": 0.9702042182792099, "grad_norm": 2.046683304788004, "learning_rate": 2.2765549777217787e-08, "loss": 0.759, "step": 1449 }, { "epoch": 0.970873786407767, "grad_norm": 2.7351202993122112, "learning_rate": 2.1743253258871833e-08, "loss": 0.7014, "step": 1450 }, { "epoch": 0.9715433545363241, "grad_norm": 3.153509793043048, "learning_rate": 2.0744390379906498e-08, "loss": 0.6807, "step": 1451 }, { "epoch": 0.9722129226648811, "grad_norm": 2.20126236873377, "learning_rate": 1.976896584216681e-08, "loss": 0.7097, "step": 1452 }, { "epoch": 0.9728824907934382, "grad_norm": 2.1053121499030554, "learning_rate": 1.8816984237169378e-08, "loss": 0.6647, "step": 1453 }, { "epoch": 0.9735520589219954, "grad_norm": 1.8769125569195508, "learning_rate": 1.788845004607964e-08, "loss": 0.7029, "step": 1454 }, { "epoch": 0.9742216270505524, "grad_norm": 2.633122666822801, "learning_rate": 1.698336763969244e-08, "loss": 0.7056, "step": 1455 }, { "epoch": 0.9748911951791095, "grad_norm": 2.1563870700723613, "learning_rate": 1.610174127840869e-08, "loss": 0.6822, "step": 1456 }, { "epoch": 0.9755607633076665, "grad_norm": 2.09411306278069, "learning_rate": 1.5243575112218744e-08, "loss": 0.6885, "step": 1457 }, { "epoch": 0.9762303314362236, "grad_norm": 2.299766340719424, "learning_rate": 1.4408873180680182e-08, "loss": 0.7007, "step": 1458 }, { "epoch": 0.9768998995647807, "grad_norm": 2.442152029719013, "learning_rate": 1.3597639412900598e-08, "loss": 0.7918, "step": 1459 }, { "epoch": 0.9775694676933377, "grad_norm": 1.7807675876305469, "learning_rate": 1.280987762751762e-08, "loss": 0.6681, "step": 1460 }, { "epoch": 0.9782390358218949, "grad_norm": 1.6788718972442116, "learning_rate": 1.2045591532681145e-08, "loss": 0.68, "step": 1461 }, { "epoch": 0.978908603950452, "grad_norm": 1.702014848542504, "learning_rate": 1.1304784726038354e-08, "loss": 0.6058, "step": 1462 }, { "epoch": 0.979578172079009, "grad_norm": 1.9913361146317479, "learning_rate": 1.058746069471206e-08, "loss": 0.6618, "step": 1463 }, { "epoch": 0.9802477402075661, "grad_norm": 1.8882785898178696, "learning_rate": 9.893622815289604e-09, "loss": 0.666, "step": 1464 }, { "epoch": 0.9809173083361232, "grad_norm": 2.1174432135968133, "learning_rate": 9.223274353802326e-09, "loss": 0.6629, "step": 1465 }, { "epoch": 0.9815868764646802, "grad_norm": 2.223259000214255, "learning_rate": 8.576418465712778e-09, "loss": 0.6884, "step": 1466 }, { "epoch": 0.9822564445932374, "grad_norm": 2.1751744225869647, "learning_rate": 7.953058195900864e-09, "loss": 0.7615, "step": 1467 }, { "epoch": 0.9829260127217945, "grad_norm": 2.3296016245014224, "learning_rate": 7.353196478646074e-09, "loss": 0.6519, "step": 1468 }, { "epoch": 0.9835955808503515, "grad_norm": 1.8754617091077903, "learning_rate": 6.7768361376152616e-09, "loss": 0.7136, "step": 1469 }, { "epoch": 0.9842651489789086, "grad_norm": 2.383671971281111, "learning_rate": 6.223979885852105e-09, "loss": 0.6771, "step": 1470 }, { "epoch": 0.9849347171074657, "grad_norm": 1.9842916158618351, "learning_rate": 5.694630325759343e-09, "loss": 0.6373, "step": 1471 }, { "epoch": 0.9856042852360227, "grad_norm": 2.085737236608459, "learning_rate": 5.188789949090445e-09, "loss": 0.753, "step": 1472 }, { "epoch": 0.9862738533645798, "grad_norm": 2.5091766975124177, "learning_rate": 4.706461136935736e-09, "loss": 0.5947, "step": 1473 }, { "epoch": 0.986943421493137, "grad_norm": 2.2000849638680844, "learning_rate": 4.247646159712959e-09, "loss": 0.766, "step": 1474 }, { "epoch": 0.987612989621694, "grad_norm": 1.9804061349023174, "learning_rate": 3.812347177154508e-09, "loss": 0.6953, "step": 1475 }, { "epoch": 0.9882825577502511, "grad_norm": 2.0863727476020966, "learning_rate": 3.400566238299097e-09, "loss": 0.5934, "step": 1476 }, { "epoch": 0.9889521258788082, "grad_norm": 1.975311266876111, "learning_rate": 3.0123052814812203e-09, "loss": 0.6283, "step": 1477 }, { "epoch": 0.9896216940073652, "grad_norm": 2.2140856811539824, "learning_rate": 2.6475661343217107e-09, "loss": 0.6495, "step": 1478 }, { "epoch": 0.9902912621359223, "grad_norm": 1.6359237620778713, "learning_rate": 2.306350513719968e-09, "loss": 0.7051, "step": 1479 }, { "epoch": 0.9909608302644795, "grad_norm": 2.0354617642146353, "learning_rate": 1.9886600258450796e-09, "loss": 0.6414, "step": 1480 }, { "epoch": 0.9916303983930365, "grad_norm": 2.2741246493121987, "learning_rate": 1.694496166129711e-09, "loss": 0.7252, "step": 1481 }, { "epoch": 0.9922999665215936, "grad_norm": 1.9303914983484358, "learning_rate": 1.4238603192606726e-09, "loss": 0.7032, "step": 1482 }, { "epoch": 0.9929695346501507, "grad_norm": 1.8117361976494286, "learning_rate": 1.1767537591750312e-09, "loss": 0.7067, "step": 1483 }, { "epoch": 0.9936391027787077, "grad_norm": 2.5022802796142463, "learning_rate": 9.531776490517841e-10, "loss": 0.6795, "step": 1484 }, { "epoch": 0.9943086709072648, "grad_norm": 2.035808229970454, "learning_rate": 7.53133041307974e-10, "loss": 0.7023, "step": 1485 }, { "epoch": 0.9949782390358219, "grad_norm": 1.8376921032948585, "learning_rate": 5.766208775936921e-10, "loss": 0.737, "step": 1486 }, { "epoch": 0.995647807164379, "grad_norm": 2.0547563193503744, "learning_rate": 4.2364198878597216e-10, "loss": 0.7244, "step": 1487 }, { "epoch": 0.9963173752929361, "grad_norm": 2.1177839299587764, "learning_rate": 2.941970949865702e-10, "loss": 0.6574, "step": 1488 }, { "epoch": 0.9969869434214932, "grad_norm": 1.9510351636805723, "learning_rate": 1.8828680551918887e-10, "loss": 0.6684, "step": 1489 }, { "epoch": 0.9976565115500502, "grad_norm": 2.093068765209126, "learning_rate": 1.059116189233711e-10, "loss": 0.6599, "step": 1490 }, { "epoch": 0.9983260796786073, "grad_norm": 1.9560004253604697, "learning_rate": 4.7071922956165583e-11, "loss": 0.761, "step": 1491 }, { "epoch": 0.9989956478071644, "grad_norm": 1.824858981607754, "learning_rate": 1.1767994587685849e-11, "loss": 0.6896, "step": 1492 }, { "epoch": 0.9996652159357214, "grad_norm": 1.9604355845196693, "learning_rate": 0.0, "loss": 0.6503, "step": 1493 }, { "epoch": 0.9996652159357214, "step": 1493, "total_flos": 4999913794437120.0, "train_loss": 0.0, "train_runtime": 0.383, "train_samples_per_second": 998315.298, "train_steps_per_second": 3898.496 } ], "logging_steps": 1.0, "max_steps": 1493, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 150, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4999913794437120.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }