{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 3252, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009225092250922509, "grad_norm": 12.955489203853885, "learning_rate": 0.0, "loss": 1.4185, "step": 1 }, { "epoch": 0.0018450184501845018, "grad_norm": 15.45613889769327, "learning_rate": 1.5337423312883438e-07, "loss": 1.5687, "step": 2 }, { "epoch": 0.0027675276752767526, "grad_norm": 12.92656381552016, "learning_rate": 3.0674846625766876e-07, "loss": 1.4134, "step": 3 }, { "epoch": 0.0036900369003690036, "grad_norm": 12.98532338060555, "learning_rate": 4.601226993865031e-07, "loss": 1.4595, "step": 4 }, { "epoch": 0.004612546125461255, "grad_norm": 12.338984619051718, "learning_rate": 6.134969325153375e-07, "loss": 1.4031, "step": 5 }, { "epoch": 0.005535055350553505, "grad_norm": 14.85148132005717, "learning_rate": 7.668711656441718e-07, "loss": 1.4832, "step": 6 }, { "epoch": 0.006457564575645757, "grad_norm": 17.072025860539494, "learning_rate": 9.202453987730062e-07, "loss": 1.6873, "step": 7 }, { "epoch": 0.007380073800738007, "grad_norm": 16.431366793672808, "learning_rate": 1.0736196319018406e-06, "loss": 1.6399, "step": 8 }, { "epoch": 0.008302583025830259, "grad_norm": 12.014901856256495, "learning_rate": 1.226993865030675e-06, "loss": 1.3723, "step": 9 }, { "epoch": 0.00922509225092251, "grad_norm": 11.848609763944259, "learning_rate": 1.3803680981595093e-06, "loss": 1.4094, "step": 10 }, { "epoch": 0.01014760147601476, "grad_norm": 11.765507368785373, "learning_rate": 1.5337423312883435e-06, "loss": 1.3755, "step": 11 }, { "epoch": 0.01107011070110701, "grad_norm": 10.590795739479944, "learning_rate": 1.687116564417178e-06, "loss": 1.279, "step": 12 }, { "epoch": 0.011992619926199263, "grad_norm": 12.149753919483338, "learning_rate": 1.8404907975460124e-06, "loss": 1.4244, "step": 13 }, { "epoch": 0.012915129151291513, "grad_norm": 10.6231739703075, "learning_rate": 1.9938650306748465e-06, "loss": 1.2602, "step": 14 }, { "epoch": 0.013837638376383764, "grad_norm": 11.087376374846873, "learning_rate": 2.147239263803681e-06, "loss": 1.2945, "step": 15 }, { "epoch": 0.014760147601476014, "grad_norm": 8.797053743582785, "learning_rate": 2.3006134969325154e-06, "loss": 0.9855, "step": 16 }, { "epoch": 0.015682656826568265, "grad_norm": 9.222973996497377, "learning_rate": 2.45398773006135e-06, "loss": 0.9322, "step": 17 }, { "epoch": 0.016605166051660517, "grad_norm": 8.741210685946207, "learning_rate": 2.607361963190184e-06, "loss": 0.8991, "step": 18 }, { "epoch": 0.017527675276752766, "grad_norm": 7.784987898808541, "learning_rate": 2.7607361963190186e-06, "loss": 0.8354, "step": 19 }, { "epoch": 0.01845018450184502, "grad_norm": 8.114903017788382, "learning_rate": 2.914110429447853e-06, "loss": 0.7483, "step": 20 }, { "epoch": 0.01937269372693727, "grad_norm": 6.985942947652761, "learning_rate": 3.067484662576687e-06, "loss": 0.7405, "step": 21 }, { "epoch": 0.02029520295202952, "grad_norm": 5.057560376950641, "learning_rate": 3.2208588957055217e-06, "loss": 0.523, "step": 22 }, { "epoch": 0.021217712177121772, "grad_norm": 4.30867356704448, "learning_rate": 3.374233128834356e-06, "loss": 0.5054, "step": 23 }, { "epoch": 0.02214022140221402, "grad_norm": 3.186678341527648, "learning_rate": 3.52760736196319e-06, "loss": 0.4204, "step": 24 }, { "epoch": 0.023062730627306273, "grad_norm": 2.547181779371563, "learning_rate": 3.680981595092025e-06, "loss": 0.471, "step": 25 }, { "epoch": 0.023985239852398525, "grad_norm": 1.4892578932038187, "learning_rate": 3.834355828220859e-06, "loss": 0.3852, "step": 26 }, { "epoch": 0.024907749077490774, "grad_norm": 1.6034561918455752, "learning_rate": 3.987730061349693e-06, "loss": 0.447, "step": 27 }, { "epoch": 0.025830258302583026, "grad_norm": 1.3040786609892756, "learning_rate": 4.141104294478528e-06, "loss": 0.4171, "step": 28 }, { "epoch": 0.026752767527675275, "grad_norm": 1.5718209161660188, "learning_rate": 4.294478527607362e-06, "loss": 0.4146, "step": 29 }, { "epoch": 0.027675276752767528, "grad_norm": 1.5226011146024725, "learning_rate": 4.447852760736196e-06, "loss": 0.3771, "step": 30 }, { "epoch": 0.02859778597785978, "grad_norm": 1.2181738630849195, "learning_rate": 4.601226993865031e-06, "loss": 0.4243, "step": 31 }, { "epoch": 0.02952029520295203, "grad_norm": 1.3702765242872075, "learning_rate": 4.7546012269938654e-06, "loss": 0.393, "step": 32 }, { "epoch": 0.03044280442804428, "grad_norm": 1.066008129856778, "learning_rate": 4.9079754601227e-06, "loss": 0.3754, "step": 33 }, { "epoch": 0.03136531365313653, "grad_norm": 0.9407582448390286, "learning_rate": 5.061349693251534e-06, "loss": 0.3492, "step": 34 }, { "epoch": 0.03228782287822878, "grad_norm": 0.9599309117793499, "learning_rate": 5.214723926380368e-06, "loss": 0.3554, "step": 35 }, { "epoch": 0.033210332103321034, "grad_norm": 1.0092310826139268, "learning_rate": 5.368098159509203e-06, "loss": 0.3804, "step": 36 }, { "epoch": 0.03413284132841329, "grad_norm": 0.8244638212070418, "learning_rate": 5.521472392638037e-06, "loss": 0.34, "step": 37 }, { "epoch": 0.03505535055350553, "grad_norm": 0.8247247080222685, "learning_rate": 5.674846625766871e-06, "loss": 0.3395, "step": 38 }, { "epoch": 0.035977859778597784, "grad_norm": 0.8400106617176237, "learning_rate": 5.828220858895706e-06, "loss": 0.3418, "step": 39 }, { "epoch": 0.03690036900369004, "grad_norm": 0.7727224893246134, "learning_rate": 5.98159509202454e-06, "loss": 0.3096, "step": 40 }, { "epoch": 0.03782287822878229, "grad_norm": 0.9087099282488907, "learning_rate": 6.134969325153374e-06, "loss": 0.319, "step": 41 }, { "epoch": 0.03874538745387454, "grad_norm": 0.9932295126911083, "learning_rate": 6.288343558282209e-06, "loss": 0.3221, "step": 42 }, { "epoch": 0.03966789667896679, "grad_norm": 0.6995800325598962, "learning_rate": 6.4417177914110434e-06, "loss": 0.3066, "step": 43 }, { "epoch": 0.04059040590405904, "grad_norm": 0.769126559231979, "learning_rate": 6.595092024539877e-06, "loss": 0.2997, "step": 44 }, { "epoch": 0.04151291512915129, "grad_norm": 0.8854396311491329, "learning_rate": 6.748466257668712e-06, "loss": 0.308, "step": 45 }, { "epoch": 0.042435424354243544, "grad_norm": 0.9106864581523603, "learning_rate": 6.901840490797547e-06, "loss": 0.3079, "step": 46 }, { "epoch": 0.043357933579335796, "grad_norm": 0.9132279319467866, "learning_rate": 7.05521472392638e-06, "loss": 0.2851, "step": 47 }, { "epoch": 0.04428044280442804, "grad_norm": 0.7097422801782368, "learning_rate": 7.208588957055215e-06, "loss": 0.3063, "step": 48 }, { "epoch": 0.045202952029520294, "grad_norm": 0.8888301377889726, "learning_rate": 7.36196319018405e-06, "loss": 0.3064, "step": 49 }, { "epoch": 0.046125461254612546, "grad_norm": 0.8326651435551397, "learning_rate": 7.5153374233128836e-06, "loss": 0.3381, "step": 50 }, { "epoch": 0.0470479704797048, "grad_norm": 0.7823625505280065, "learning_rate": 7.668711656441718e-06, "loss": 0.2976, "step": 51 }, { "epoch": 0.04797047970479705, "grad_norm": 0.7726039404102877, "learning_rate": 7.822085889570554e-06, "loss": 0.2933, "step": 52 }, { "epoch": 0.048892988929889296, "grad_norm": 0.7610651536301617, "learning_rate": 7.975460122699386e-06, "loss": 0.2882, "step": 53 }, { "epoch": 0.04981549815498155, "grad_norm": 1.5278867981943542, "learning_rate": 8.128834355828221e-06, "loss": 0.2891, "step": 54 }, { "epoch": 0.0507380073800738, "grad_norm": 0.6933285259760709, "learning_rate": 8.282208588957055e-06, "loss": 0.2631, "step": 55 }, { "epoch": 0.05166051660516605, "grad_norm": 0.8315242149931074, "learning_rate": 8.435582822085889e-06, "loss": 0.3194, "step": 56 }, { "epoch": 0.052583025830258305, "grad_norm": 0.8014840042460718, "learning_rate": 8.588957055214725e-06, "loss": 0.327, "step": 57 }, { "epoch": 0.05350553505535055, "grad_norm": 0.7542043085568613, "learning_rate": 8.742331288343558e-06, "loss": 0.2902, "step": 58 }, { "epoch": 0.0544280442804428, "grad_norm": 0.8021745207785408, "learning_rate": 8.895705521472392e-06, "loss": 0.2884, "step": 59 }, { "epoch": 0.055350553505535055, "grad_norm": 0.7935265214617545, "learning_rate": 9.049079754601228e-06, "loss": 0.2938, "step": 60 }, { "epoch": 0.05627306273062731, "grad_norm": 0.7945059096047613, "learning_rate": 9.202453987730062e-06, "loss": 0.3169, "step": 61 }, { "epoch": 0.05719557195571956, "grad_norm": 0.6844890656867241, "learning_rate": 9.355828220858897e-06, "loss": 0.2787, "step": 62 }, { "epoch": 0.058118081180811805, "grad_norm": 0.6759065932541566, "learning_rate": 9.509202453987731e-06, "loss": 0.3023, "step": 63 }, { "epoch": 0.05904059040590406, "grad_norm": 0.7068063979581338, "learning_rate": 9.662576687116565e-06, "loss": 0.2876, "step": 64 }, { "epoch": 0.05996309963099631, "grad_norm": 0.8019897278737798, "learning_rate": 9.8159509202454e-06, "loss": 0.2697, "step": 65 }, { "epoch": 0.06088560885608856, "grad_norm": 0.7851152567049443, "learning_rate": 9.969325153374232e-06, "loss": 0.3117, "step": 66 }, { "epoch": 0.061808118081180814, "grad_norm": 0.8443069282860475, "learning_rate": 1.0122699386503068e-05, "loss": 0.3379, "step": 67 }, { "epoch": 0.06273062730627306, "grad_norm": 0.6844260224687486, "learning_rate": 1.0276073619631903e-05, "loss": 0.2864, "step": 68 }, { "epoch": 0.06365313653136531, "grad_norm": 0.6764987660001486, "learning_rate": 1.0429447852760736e-05, "loss": 0.2632, "step": 69 }, { "epoch": 0.06457564575645756, "grad_norm": 0.6735304741507071, "learning_rate": 1.0582822085889571e-05, "loss": 0.2689, "step": 70 }, { "epoch": 0.06549815498154982, "grad_norm": 0.9313719592818234, "learning_rate": 1.0736196319018407e-05, "loss": 0.2969, "step": 71 }, { "epoch": 0.06642066420664207, "grad_norm": 0.7047477227444052, "learning_rate": 1.0889570552147239e-05, "loss": 0.2802, "step": 72 }, { "epoch": 0.06734317343173432, "grad_norm": 0.7511066786918772, "learning_rate": 1.1042944785276074e-05, "loss": 0.2942, "step": 73 }, { "epoch": 0.06826568265682657, "grad_norm": 0.7753312122163863, "learning_rate": 1.119631901840491e-05, "loss": 0.3039, "step": 74 }, { "epoch": 0.06918819188191883, "grad_norm": 0.7898944701574543, "learning_rate": 1.1349693251533742e-05, "loss": 0.3168, "step": 75 }, { "epoch": 0.07011070110701106, "grad_norm": 0.7608450558743302, "learning_rate": 1.1503067484662577e-05, "loss": 0.3159, "step": 76 }, { "epoch": 0.07103321033210332, "grad_norm": 0.6851187018726027, "learning_rate": 1.1656441717791411e-05, "loss": 0.302, "step": 77 }, { "epoch": 0.07195571955719557, "grad_norm": 0.7009343522823712, "learning_rate": 1.1809815950920245e-05, "loss": 0.2858, "step": 78 }, { "epoch": 0.07287822878228782, "grad_norm": 0.7106703686063037, "learning_rate": 1.196319018404908e-05, "loss": 0.2833, "step": 79 }, { "epoch": 0.07380073800738007, "grad_norm": 0.6860438191626305, "learning_rate": 1.2116564417177914e-05, "loss": 0.2836, "step": 80 }, { "epoch": 0.07472324723247233, "grad_norm": 0.6265893388163131, "learning_rate": 1.2269938650306748e-05, "loss": 0.274, "step": 81 }, { "epoch": 0.07564575645756458, "grad_norm": 0.7216235699208722, "learning_rate": 1.2423312883435584e-05, "loss": 0.2814, "step": 82 }, { "epoch": 0.07656826568265683, "grad_norm": 0.6406089500922234, "learning_rate": 1.2576687116564418e-05, "loss": 0.2927, "step": 83 }, { "epoch": 0.07749077490774908, "grad_norm": 0.7439376335847888, "learning_rate": 1.2730061349693251e-05, "loss": 0.2664, "step": 84 }, { "epoch": 0.07841328413284133, "grad_norm": 0.738376581767653, "learning_rate": 1.2883435582822087e-05, "loss": 0.2694, "step": 85 }, { "epoch": 0.07933579335793357, "grad_norm": 0.8230580812584041, "learning_rate": 1.303680981595092e-05, "loss": 0.3308, "step": 86 }, { "epoch": 0.08025830258302583, "grad_norm": 0.7665711642660273, "learning_rate": 1.3190184049079754e-05, "loss": 0.279, "step": 87 }, { "epoch": 0.08118081180811808, "grad_norm": 1.0937041621239993, "learning_rate": 1.334355828220859e-05, "loss": 0.316, "step": 88 }, { "epoch": 0.08210332103321033, "grad_norm": 0.8111056435505324, "learning_rate": 1.3496932515337424e-05, "loss": 0.298, "step": 89 }, { "epoch": 0.08302583025830258, "grad_norm": 0.7327882215407213, "learning_rate": 1.3650306748466258e-05, "loss": 0.3052, "step": 90 }, { "epoch": 0.08394833948339483, "grad_norm": 0.7183449677066613, "learning_rate": 1.3803680981595093e-05, "loss": 0.2996, "step": 91 }, { "epoch": 0.08487084870848709, "grad_norm": 0.7227952480990556, "learning_rate": 1.3957055214723927e-05, "loss": 0.31, "step": 92 }, { "epoch": 0.08579335793357934, "grad_norm": 0.6773404497705654, "learning_rate": 1.411042944785276e-05, "loss": 0.2641, "step": 93 }, { "epoch": 0.08671586715867159, "grad_norm": 0.7237883572445348, "learning_rate": 1.4263803680981596e-05, "loss": 0.28, "step": 94 }, { "epoch": 0.08763837638376384, "grad_norm": 0.6642562423013476, "learning_rate": 1.441717791411043e-05, "loss": 0.2477, "step": 95 }, { "epoch": 0.08856088560885608, "grad_norm": 0.6417741591931398, "learning_rate": 1.4570552147239264e-05, "loss": 0.2697, "step": 96 }, { "epoch": 0.08948339483394833, "grad_norm": 0.8172595640272041, "learning_rate": 1.47239263803681e-05, "loss": 0.2883, "step": 97 }, { "epoch": 0.09040590405904059, "grad_norm": 0.8387869401006764, "learning_rate": 1.4877300613496933e-05, "loss": 0.2861, "step": 98 }, { "epoch": 0.09132841328413284, "grad_norm": 0.7425167902118797, "learning_rate": 1.5030674846625767e-05, "loss": 0.2932, "step": 99 }, { "epoch": 0.09225092250922509, "grad_norm": 0.7529530576926313, "learning_rate": 1.5184049079754603e-05, "loss": 0.2732, "step": 100 }, { "epoch": 0.09317343173431734, "grad_norm": 0.7394624338802493, "learning_rate": 1.5337423312883436e-05, "loss": 0.2962, "step": 101 }, { "epoch": 0.0940959409594096, "grad_norm": 0.9937576615108098, "learning_rate": 1.549079754601227e-05, "loss": 0.3119, "step": 102 }, { "epoch": 0.09501845018450185, "grad_norm": 0.7799933355610369, "learning_rate": 1.5644171779141108e-05, "loss": 0.2782, "step": 103 }, { "epoch": 0.0959409594095941, "grad_norm": 0.6363177894137105, "learning_rate": 1.579754601226994e-05, "loss": 0.2607, "step": 104 }, { "epoch": 0.09686346863468635, "grad_norm": 0.7861983695816259, "learning_rate": 1.5950920245398772e-05, "loss": 0.2857, "step": 105 }, { "epoch": 0.09778597785977859, "grad_norm": 0.7498718582416898, "learning_rate": 1.6104294478527606e-05, "loss": 0.2839, "step": 106 }, { "epoch": 0.09870848708487084, "grad_norm": 0.7855761683296874, "learning_rate": 1.6257668711656443e-05, "loss": 0.3017, "step": 107 }, { "epoch": 0.0996309963099631, "grad_norm": 0.7242477830142006, "learning_rate": 1.6411042944785277e-05, "loss": 0.263, "step": 108 }, { "epoch": 0.10055350553505535, "grad_norm": 0.734020068542186, "learning_rate": 1.656441717791411e-05, "loss": 0.3022, "step": 109 }, { "epoch": 0.1014760147601476, "grad_norm": 0.6281436660404887, "learning_rate": 1.6717791411042948e-05, "loss": 0.3031, "step": 110 }, { "epoch": 0.10239852398523985, "grad_norm": 0.6664367279847719, "learning_rate": 1.6871165644171778e-05, "loss": 0.2709, "step": 111 }, { "epoch": 0.1033210332103321, "grad_norm": 0.8705405825083281, "learning_rate": 1.7024539877300612e-05, "loss": 0.3019, "step": 112 }, { "epoch": 0.10424354243542436, "grad_norm": 0.7455378186922866, "learning_rate": 1.717791411042945e-05, "loss": 0.3137, "step": 113 }, { "epoch": 0.10516605166051661, "grad_norm": 0.7445813146349921, "learning_rate": 1.7331288343558283e-05, "loss": 0.2933, "step": 114 }, { "epoch": 0.10608856088560886, "grad_norm": 0.7649673992573114, "learning_rate": 1.7484662576687117e-05, "loss": 0.3064, "step": 115 }, { "epoch": 0.1070110701107011, "grad_norm": 0.6673700237310717, "learning_rate": 1.7638036809815954e-05, "loss": 0.2859, "step": 116 }, { "epoch": 0.10793357933579335, "grad_norm": 0.7228195303864201, "learning_rate": 1.7791411042944784e-05, "loss": 0.2604, "step": 117 }, { "epoch": 0.1088560885608856, "grad_norm": 0.6731996318063017, "learning_rate": 1.7944785276073618e-05, "loss": 0.2868, "step": 118 }, { "epoch": 0.10977859778597786, "grad_norm": 0.7037771197992196, "learning_rate": 1.8098159509202455e-05, "loss": 0.254, "step": 119 }, { "epoch": 0.11070110701107011, "grad_norm": 0.893157319529324, "learning_rate": 1.825153374233129e-05, "loss": 0.3371, "step": 120 }, { "epoch": 0.11162361623616236, "grad_norm": 0.7109039573324791, "learning_rate": 1.8404907975460123e-05, "loss": 0.2896, "step": 121 }, { "epoch": 0.11254612546125461, "grad_norm": 0.694703757975936, "learning_rate": 1.855828220858896e-05, "loss": 0.3052, "step": 122 }, { "epoch": 0.11346863468634687, "grad_norm": 0.7069453047379004, "learning_rate": 1.8711656441717794e-05, "loss": 0.2659, "step": 123 }, { "epoch": 0.11439114391143912, "grad_norm": 0.7668929032786836, "learning_rate": 1.8865030674846625e-05, "loss": 0.3011, "step": 124 }, { "epoch": 0.11531365313653137, "grad_norm": 0.7800112789686109, "learning_rate": 1.9018404907975462e-05, "loss": 0.3154, "step": 125 }, { "epoch": 0.11623616236162361, "grad_norm": 0.6656177080819063, "learning_rate": 1.9171779141104296e-05, "loss": 0.2746, "step": 126 }, { "epoch": 0.11715867158671586, "grad_norm": 0.6439698740687717, "learning_rate": 1.932515337423313e-05, "loss": 0.2348, "step": 127 }, { "epoch": 0.11808118081180811, "grad_norm": 0.72561609461152, "learning_rate": 1.9478527607361967e-05, "loss": 0.2767, "step": 128 }, { "epoch": 0.11900369003690037, "grad_norm": 0.6902197277603564, "learning_rate": 1.96319018404908e-05, "loss": 0.2771, "step": 129 }, { "epoch": 0.11992619926199262, "grad_norm": 0.693463471274204, "learning_rate": 1.978527607361963e-05, "loss": 0.2956, "step": 130 }, { "epoch": 0.12084870848708487, "grad_norm": 0.6930227234795218, "learning_rate": 1.9938650306748465e-05, "loss": 0.2901, "step": 131 }, { "epoch": 0.12177121771217712, "grad_norm": 0.6499770864096507, "learning_rate": 2.0092024539877302e-05, "loss": 0.2628, "step": 132 }, { "epoch": 0.12269372693726938, "grad_norm": 0.6294074861857037, "learning_rate": 2.0245398773006136e-05, "loss": 0.2844, "step": 133 }, { "epoch": 0.12361623616236163, "grad_norm": 0.7055663181910957, "learning_rate": 2.039877300613497e-05, "loss": 0.2824, "step": 134 }, { "epoch": 0.12453874538745388, "grad_norm": 0.6464646932599035, "learning_rate": 2.0552147239263807e-05, "loss": 0.287, "step": 135 }, { "epoch": 0.12546125461254612, "grad_norm": 0.6453636704346579, "learning_rate": 2.0705521472392637e-05, "loss": 0.2768, "step": 136 }, { "epoch": 0.12638376383763839, "grad_norm": 0.6707939485094121, "learning_rate": 2.085889570552147e-05, "loss": 0.3016, "step": 137 }, { "epoch": 0.12730627306273062, "grad_norm": 0.6481600913545116, "learning_rate": 2.1012269938650308e-05, "loss": 0.303, "step": 138 }, { "epoch": 0.1282287822878229, "grad_norm": 0.6358781587661787, "learning_rate": 2.1165644171779142e-05, "loss": 0.2668, "step": 139 }, { "epoch": 0.12915129151291513, "grad_norm": 0.6091204160554227, "learning_rate": 2.1319018404907976e-05, "loss": 0.2717, "step": 140 }, { "epoch": 0.13007380073800737, "grad_norm": 0.6483601411774305, "learning_rate": 2.1472392638036813e-05, "loss": 0.2676, "step": 141 }, { "epoch": 0.13099630996309963, "grad_norm": 0.6569203594746024, "learning_rate": 2.1625766871165647e-05, "loss": 0.2896, "step": 142 }, { "epoch": 0.13191881918819187, "grad_norm": 0.6524579068823675, "learning_rate": 2.1779141104294477e-05, "loss": 0.287, "step": 143 }, { "epoch": 0.13284132841328414, "grad_norm": 0.6686273011266631, "learning_rate": 2.1932515337423315e-05, "loss": 0.3011, "step": 144 }, { "epoch": 0.13376383763837638, "grad_norm": 0.6425281862252585, "learning_rate": 2.208588957055215e-05, "loss": 0.2974, "step": 145 }, { "epoch": 0.13468634686346864, "grad_norm": 0.6601519038414725, "learning_rate": 2.2239263803680982e-05, "loss": 0.287, "step": 146 }, { "epoch": 0.13560885608856088, "grad_norm": 0.6003217899565634, "learning_rate": 2.239263803680982e-05, "loss": 0.2787, "step": 147 }, { "epoch": 0.13653136531365315, "grad_norm": 0.7129866882084958, "learning_rate": 2.2546012269938653e-05, "loss": 0.3, "step": 148 }, { "epoch": 0.13745387453874539, "grad_norm": 0.7644906844783096, "learning_rate": 2.2699386503067484e-05, "loss": 0.284, "step": 149 }, { "epoch": 0.13837638376383765, "grad_norm": 0.7232617216110071, "learning_rate": 2.285276073619632e-05, "loss": 0.2606, "step": 150 }, { "epoch": 0.1392988929889299, "grad_norm": 0.6829835249098904, "learning_rate": 2.3006134969325155e-05, "loss": 0.2909, "step": 151 }, { "epoch": 0.14022140221402213, "grad_norm": 0.7576335079961564, "learning_rate": 2.315950920245399e-05, "loss": 0.2974, "step": 152 }, { "epoch": 0.1411439114391144, "grad_norm": 0.6914655420763508, "learning_rate": 2.3312883435582822e-05, "loss": 0.2603, "step": 153 }, { "epoch": 0.14206642066420663, "grad_norm": 0.6160441569790551, "learning_rate": 2.346625766871166e-05, "loss": 0.255, "step": 154 }, { "epoch": 0.1429889298892989, "grad_norm": 0.6639332944719932, "learning_rate": 2.361963190184049e-05, "loss": 0.2553, "step": 155 }, { "epoch": 0.14391143911439114, "grad_norm": 0.6751721350055936, "learning_rate": 2.3773006134969324e-05, "loss": 0.2785, "step": 156 }, { "epoch": 0.1448339483394834, "grad_norm": 0.6773724192208138, "learning_rate": 2.392638036809816e-05, "loss": 0.2874, "step": 157 }, { "epoch": 0.14575645756457564, "grad_norm": 0.7490573812895311, "learning_rate": 2.4079754601226995e-05, "loss": 0.3087, "step": 158 }, { "epoch": 0.1466789667896679, "grad_norm": 0.6953581548517865, "learning_rate": 2.423312883435583e-05, "loss": 0.3007, "step": 159 }, { "epoch": 0.14760147601476015, "grad_norm": 0.6499092512394959, "learning_rate": 2.4386503067484666e-05, "loss": 0.2694, "step": 160 }, { "epoch": 0.14852398523985239, "grad_norm": 0.6977503740590972, "learning_rate": 2.4539877300613496e-05, "loss": 0.2922, "step": 161 }, { "epoch": 0.14944649446494465, "grad_norm": 0.6671277451323265, "learning_rate": 2.469325153374233e-05, "loss": 0.2679, "step": 162 }, { "epoch": 0.1503690036900369, "grad_norm": 0.6289487835271185, "learning_rate": 2.4846625766871167e-05, "loss": 0.2914, "step": 163 }, { "epoch": 0.15129151291512916, "grad_norm": 0.6130378431780416, "learning_rate": 2.5e-05, "loss": 0.277, "step": 164 }, { "epoch": 0.1522140221402214, "grad_norm": 0.6893998165404203, "learning_rate": 2.5153374233128835e-05, "loss": 0.2689, "step": 165 }, { "epoch": 0.15313653136531366, "grad_norm": 0.6431972368587816, "learning_rate": 2.530674846625767e-05, "loss": 0.2588, "step": 166 }, { "epoch": 0.1540590405904059, "grad_norm": 0.7442869589662978, "learning_rate": 2.5460122699386503e-05, "loss": 0.2891, "step": 167 }, { "epoch": 0.15498154981549817, "grad_norm": 0.6513268252137925, "learning_rate": 2.561349693251534e-05, "loss": 0.3177, "step": 168 }, { "epoch": 0.1559040590405904, "grad_norm": 0.7489389936139911, "learning_rate": 2.5766871165644174e-05, "loss": 0.2742, "step": 169 }, { "epoch": 0.15682656826568267, "grad_norm": 0.6932053862394971, "learning_rate": 2.5920245398773008e-05, "loss": 0.2731, "step": 170 }, { "epoch": 0.1577490774907749, "grad_norm": 0.708245071323059, "learning_rate": 2.607361963190184e-05, "loss": 0.2862, "step": 171 }, { "epoch": 0.15867158671586715, "grad_norm": 0.6060615887381059, "learning_rate": 2.6226993865030675e-05, "loss": 0.2653, "step": 172 }, { "epoch": 0.1595940959409594, "grad_norm": 0.726730037504758, "learning_rate": 2.638036809815951e-05, "loss": 0.2977, "step": 173 }, { "epoch": 0.16051660516605165, "grad_norm": 0.6593317139268164, "learning_rate": 2.6533742331288346e-05, "loss": 0.2813, "step": 174 }, { "epoch": 0.16143911439114392, "grad_norm": 0.6632254325238431, "learning_rate": 2.668711656441718e-05, "loss": 0.2797, "step": 175 }, { "epoch": 0.16236162361623616, "grad_norm": 0.6922954978124917, "learning_rate": 2.6840490797546014e-05, "loss": 0.2825, "step": 176 }, { "epoch": 0.16328413284132842, "grad_norm": 0.6667884515269213, "learning_rate": 2.6993865030674848e-05, "loss": 0.28, "step": 177 }, { "epoch": 0.16420664206642066, "grad_norm": 0.6277808142578968, "learning_rate": 2.714723926380368e-05, "loss": 0.2862, "step": 178 }, { "epoch": 0.16512915129151293, "grad_norm": 0.616336601612985, "learning_rate": 2.7300613496932515e-05, "loss": 0.2885, "step": 179 }, { "epoch": 0.16605166051660517, "grad_norm": 0.6723978733223379, "learning_rate": 2.7453987730061353e-05, "loss": 0.2868, "step": 180 }, { "epoch": 0.1669741697416974, "grad_norm": 0.6133837567397908, "learning_rate": 2.7607361963190186e-05, "loss": 0.272, "step": 181 }, { "epoch": 0.16789667896678967, "grad_norm": 0.5228829672328914, "learning_rate": 2.776073619631902e-05, "loss": 0.2436, "step": 182 }, { "epoch": 0.1688191881918819, "grad_norm": 0.6538390530329945, "learning_rate": 2.7914110429447854e-05, "loss": 0.2729, "step": 183 }, { "epoch": 0.16974169741697417, "grad_norm": 0.7257778344514123, "learning_rate": 2.8067484662576688e-05, "loss": 0.2804, "step": 184 }, { "epoch": 0.1706642066420664, "grad_norm": 0.602334945645816, "learning_rate": 2.822085889570552e-05, "loss": 0.2771, "step": 185 }, { "epoch": 0.17158671586715868, "grad_norm": 0.6731703893298568, "learning_rate": 2.837423312883436e-05, "loss": 0.2643, "step": 186 }, { "epoch": 0.17250922509225092, "grad_norm": 0.7561284769218707, "learning_rate": 2.8527607361963193e-05, "loss": 0.2931, "step": 187 }, { "epoch": 0.17343173431734318, "grad_norm": 0.5856064222603531, "learning_rate": 2.8680981595092026e-05, "loss": 0.2506, "step": 188 }, { "epoch": 0.17435424354243542, "grad_norm": 0.7093532995617934, "learning_rate": 2.883435582822086e-05, "loss": 0.2721, "step": 189 }, { "epoch": 0.1752767527675277, "grad_norm": 0.6182706972943631, "learning_rate": 2.8987730061349694e-05, "loss": 0.2295, "step": 190 }, { "epoch": 0.17619926199261993, "grad_norm": 0.6029101223776, "learning_rate": 2.9141104294478528e-05, "loss": 0.2719, "step": 191 }, { "epoch": 0.17712177121771217, "grad_norm": 0.5449248192061773, "learning_rate": 2.9294478527607362e-05, "loss": 0.2528, "step": 192 }, { "epoch": 0.17804428044280443, "grad_norm": 0.71239166097286, "learning_rate": 2.94478527607362e-05, "loss": 0.3122, "step": 193 }, { "epoch": 0.17896678966789667, "grad_norm": 0.7133992819416917, "learning_rate": 2.9601226993865033e-05, "loss": 0.2744, "step": 194 }, { "epoch": 0.17988929889298894, "grad_norm": 0.5886007480549781, "learning_rate": 2.9754601226993867e-05, "loss": 0.2749, "step": 195 }, { "epoch": 0.18081180811808117, "grad_norm": 0.5833471661434647, "learning_rate": 2.99079754601227e-05, "loss": 0.2564, "step": 196 }, { "epoch": 0.18173431734317344, "grad_norm": 0.576357377285588, "learning_rate": 3.0061349693251534e-05, "loss": 0.2868, "step": 197 }, { "epoch": 0.18265682656826568, "grad_norm": 0.68431739062317, "learning_rate": 3.0214723926380368e-05, "loss": 0.3071, "step": 198 }, { "epoch": 0.18357933579335795, "grad_norm": 0.6726685646776212, "learning_rate": 3.0368098159509205e-05, "loss": 0.3044, "step": 199 }, { "epoch": 0.18450184501845018, "grad_norm": 0.6380774098448085, "learning_rate": 3.052147239263804e-05, "loss": 0.2919, "step": 200 }, { "epoch": 0.18542435424354242, "grad_norm": 0.6378258091381934, "learning_rate": 3.067484662576687e-05, "loss": 0.2668, "step": 201 }, { "epoch": 0.1863468634686347, "grad_norm": 0.602025433101351, "learning_rate": 3.0828220858895703e-05, "loss": 0.2844, "step": 202 }, { "epoch": 0.18726937269372693, "grad_norm": 0.6017459414288094, "learning_rate": 3.098159509202454e-05, "loss": 0.3055, "step": 203 }, { "epoch": 0.1881918819188192, "grad_norm": 0.6633309714534229, "learning_rate": 3.113496932515337e-05, "loss": 0.2985, "step": 204 }, { "epoch": 0.18911439114391143, "grad_norm": 0.8175145037559346, "learning_rate": 3.1288343558282215e-05, "loss": 0.2697, "step": 205 }, { "epoch": 0.1900369003690037, "grad_norm": 0.7648529536365435, "learning_rate": 3.1441717791411045e-05, "loss": 0.254, "step": 206 }, { "epoch": 0.19095940959409594, "grad_norm": 0.6193845502232096, "learning_rate": 3.159509202453988e-05, "loss": 0.2812, "step": 207 }, { "epoch": 0.1918819188191882, "grad_norm": 0.6029512076439135, "learning_rate": 3.174846625766871e-05, "loss": 0.2945, "step": 208 }, { "epoch": 0.19280442804428044, "grad_norm": 0.5592254753004365, "learning_rate": 3.1901840490797544e-05, "loss": 0.2651, "step": 209 }, { "epoch": 0.1937269372693727, "grad_norm": 0.5710111832264806, "learning_rate": 3.205521472392638e-05, "loss": 0.2609, "step": 210 }, { "epoch": 0.19464944649446494, "grad_norm": 0.6118237476493752, "learning_rate": 3.220858895705521e-05, "loss": 0.2735, "step": 211 }, { "epoch": 0.19557195571955718, "grad_norm": 0.6586039796536416, "learning_rate": 3.2361963190184055e-05, "loss": 0.2912, "step": 212 }, { "epoch": 0.19649446494464945, "grad_norm": 0.7370528603672779, "learning_rate": 3.2515337423312886e-05, "loss": 0.3192, "step": 213 }, { "epoch": 0.1974169741697417, "grad_norm": 0.6274437717194886, "learning_rate": 3.266871165644172e-05, "loss": 0.2956, "step": 214 }, { "epoch": 0.19833948339483395, "grad_norm": 0.6532944820897233, "learning_rate": 3.282208588957055e-05, "loss": 0.2816, "step": 215 }, { "epoch": 0.1992619926199262, "grad_norm": 0.7051377169549334, "learning_rate": 3.2975460122699384e-05, "loss": 0.2919, "step": 216 }, { "epoch": 0.20018450184501846, "grad_norm": 0.6131283518507381, "learning_rate": 3.312883435582822e-05, "loss": 0.3063, "step": 217 }, { "epoch": 0.2011070110701107, "grad_norm": 0.5643760380208747, "learning_rate": 3.328220858895706e-05, "loss": 0.2767, "step": 218 }, { "epoch": 0.20202952029520296, "grad_norm": 0.5360618443762574, "learning_rate": 3.3435582822085895e-05, "loss": 0.2605, "step": 219 }, { "epoch": 0.2029520295202952, "grad_norm": 0.6345677981765382, "learning_rate": 3.3588957055214726e-05, "loss": 0.3078, "step": 220 }, { "epoch": 0.20387453874538744, "grad_norm": 0.699130888028388, "learning_rate": 3.3742331288343556e-05, "loss": 0.2986, "step": 221 }, { "epoch": 0.2047970479704797, "grad_norm": 0.6583162616034626, "learning_rate": 3.3895705521472393e-05, "loss": 0.3105, "step": 222 }, { "epoch": 0.20571955719557194, "grad_norm": 0.6133101318835855, "learning_rate": 3.4049079754601224e-05, "loss": 0.2785, "step": 223 }, { "epoch": 0.2066420664206642, "grad_norm": 0.5312778105673021, "learning_rate": 3.420245398773007e-05, "loss": 0.2745, "step": 224 }, { "epoch": 0.20756457564575645, "grad_norm": 0.6083100303550006, "learning_rate": 3.43558282208589e-05, "loss": 0.2718, "step": 225 }, { "epoch": 0.20848708487084872, "grad_norm": 0.6655989531281572, "learning_rate": 3.4509202453987735e-05, "loss": 0.3118, "step": 226 }, { "epoch": 0.20940959409594095, "grad_norm": 0.6528009026939667, "learning_rate": 3.4662576687116566e-05, "loss": 0.2962, "step": 227 }, { "epoch": 0.21033210332103322, "grad_norm": 0.6638162654461088, "learning_rate": 3.4815950920245396e-05, "loss": 0.3234, "step": 228 }, { "epoch": 0.21125461254612546, "grad_norm": 0.6010271418513191, "learning_rate": 3.4969325153374234e-05, "loss": 0.252, "step": 229 }, { "epoch": 0.21217712177121772, "grad_norm": 0.5670469197127859, "learning_rate": 3.512269938650307e-05, "loss": 0.2859, "step": 230 }, { "epoch": 0.21309963099630996, "grad_norm": 0.5359328549287473, "learning_rate": 3.527607361963191e-05, "loss": 0.2844, "step": 231 }, { "epoch": 0.2140221402214022, "grad_norm": 0.5901864103525112, "learning_rate": 3.542944785276074e-05, "loss": 0.2813, "step": 232 }, { "epoch": 0.21494464944649447, "grad_norm": 0.5238540986476823, "learning_rate": 3.558282208588957e-05, "loss": 0.2548, "step": 233 }, { "epoch": 0.2158671586715867, "grad_norm": 0.7105076419864915, "learning_rate": 3.5736196319018406e-05, "loss": 0.3286, "step": 234 }, { "epoch": 0.21678966789667897, "grad_norm": 0.6690175862737519, "learning_rate": 3.5889570552147236e-05, "loss": 0.2773, "step": 235 }, { "epoch": 0.2177121771217712, "grad_norm": 0.6574628031652334, "learning_rate": 3.6042944785276074e-05, "loss": 0.3139, "step": 236 }, { "epoch": 0.21863468634686348, "grad_norm": 0.5693984692687943, "learning_rate": 3.619631901840491e-05, "loss": 0.2981, "step": 237 }, { "epoch": 0.21955719557195572, "grad_norm": 0.6371295925172662, "learning_rate": 3.634969325153375e-05, "loss": 0.2946, "step": 238 }, { "epoch": 0.22047970479704798, "grad_norm": 0.6273102976578564, "learning_rate": 3.650306748466258e-05, "loss": 0.2981, "step": 239 }, { "epoch": 0.22140221402214022, "grad_norm": 0.6016146989383563, "learning_rate": 3.665644171779141e-05, "loss": 0.2823, "step": 240 }, { "epoch": 0.22232472324723246, "grad_norm": 0.5506463175407609, "learning_rate": 3.6809815950920246e-05, "loss": 0.2766, "step": 241 }, { "epoch": 0.22324723247232472, "grad_norm": 0.6144790265354202, "learning_rate": 3.696319018404908e-05, "loss": 0.2818, "step": 242 }, { "epoch": 0.22416974169741696, "grad_norm": 0.5410857080611227, "learning_rate": 3.711656441717792e-05, "loss": 0.3034, "step": 243 }, { "epoch": 0.22509225092250923, "grad_norm": 0.7090071961863476, "learning_rate": 3.726993865030675e-05, "loss": 0.3308, "step": 244 }, { "epoch": 0.22601476014760147, "grad_norm": 0.5817986077508676, "learning_rate": 3.742331288343559e-05, "loss": 0.2839, "step": 245 }, { "epoch": 0.22693726937269373, "grad_norm": 0.5378634317218598, "learning_rate": 3.757668711656442e-05, "loss": 0.2614, "step": 246 }, { "epoch": 0.22785977859778597, "grad_norm": 0.7089287955511329, "learning_rate": 3.773006134969325e-05, "loss": 0.3018, "step": 247 }, { "epoch": 0.22878228782287824, "grad_norm": 0.5790229588961053, "learning_rate": 3.7883435582822086e-05, "loss": 0.2742, "step": 248 }, { "epoch": 0.22970479704797048, "grad_norm": 0.5226751178463883, "learning_rate": 3.8036809815950924e-05, "loss": 0.3015, "step": 249 }, { "epoch": 0.23062730627306274, "grad_norm": 0.5585042684687114, "learning_rate": 3.819018404907976e-05, "loss": 0.2803, "step": 250 }, { "epoch": 0.23154981549815498, "grad_norm": 0.5266474842246636, "learning_rate": 3.834355828220859e-05, "loss": 0.2964, "step": 251 }, { "epoch": 0.23247232472324722, "grad_norm": 0.5773690989302118, "learning_rate": 3.849693251533742e-05, "loss": 0.2986, "step": 252 }, { "epoch": 0.2333948339483395, "grad_norm": 0.48670222947233316, "learning_rate": 3.865030674846626e-05, "loss": 0.2531, "step": 253 }, { "epoch": 0.23431734317343172, "grad_norm": 0.6421764020323035, "learning_rate": 3.880368098159509e-05, "loss": 0.2806, "step": 254 }, { "epoch": 0.235239852398524, "grad_norm": 0.5922553489710686, "learning_rate": 3.895705521472393e-05, "loss": 0.2713, "step": 255 }, { "epoch": 0.23616236162361623, "grad_norm": 0.6324457275724104, "learning_rate": 3.9110429447852764e-05, "loss": 0.3018, "step": 256 }, { "epoch": 0.2370848708487085, "grad_norm": 0.571837105622298, "learning_rate": 3.92638036809816e-05, "loss": 0.3122, "step": 257 }, { "epoch": 0.23800738007380073, "grad_norm": 0.677008364241822, "learning_rate": 3.941717791411043e-05, "loss": 0.2748, "step": 258 }, { "epoch": 0.238929889298893, "grad_norm": 0.5516744368754908, "learning_rate": 3.957055214723926e-05, "loss": 0.3112, "step": 259 }, { "epoch": 0.23985239852398524, "grad_norm": 0.5635734178386571, "learning_rate": 3.97239263803681e-05, "loss": 0.2418, "step": 260 }, { "epoch": 0.24077490774907748, "grad_norm": 0.579042615236464, "learning_rate": 3.987730061349693e-05, "loss": 0.2678, "step": 261 }, { "epoch": 0.24169741697416974, "grad_norm": 0.6679946243134188, "learning_rate": 4.0030674846625773e-05, "loss": 0.2858, "step": 262 }, { "epoch": 0.24261992619926198, "grad_norm": 0.5964573605703893, "learning_rate": 4.0184049079754604e-05, "loss": 0.2849, "step": 263 }, { "epoch": 0.24354243542435425, "grad_norm": 0.5530986279028629, "learning_rate": 4.033742331288344e-05, "loss": 0.2878, "step": 264 }, { "epoch": 0.2444649446494465, "grad_norm": 0.5728949927481534, "learning_rate": 4.049079754601227e-05, "loss": 0.2683, "step": 265 }, { "epoch": 0.24538745387453875, "grad_norm": 0.5613742323890516, "learning_rate": 4.06441717791411e-05, "loss": 0.2786, "step": 266 }, { "epoch": 0.246309963099631, "grad_norm": 0.5054081322613757, "learning_rate": 4.079754601226994e-05, "loss": 0.2916, "step": 267 }, { "epoch": 0.24723247232472326, "grad_norm": 0.5177290916182169, "learning_rate": 4.0950920245398776e-05, "loss": 0.2751, "step": 268 }, { "epoch": 0.2481549815498155, "grad_norm": 0.49608484582705986, "learning_rate": 4.1104294478527614e-05, "loss": 0.2657, "step": 269 }, { "epoch": 0.24907749077490776, "grad_norm": 0.5951411165866146, "learning_rate": 4.1257668711656444e-05, "loss": 0.312, "step": 270 }, { "epoch": 0.25, "grad_norm": 0.5778971143375377, "learning_rate": 4.1411042944785274e-05, "loss": 0.2479, "step": 271 }, { "epoch": 0.25092250922509224, "grad_norm": 0.5363308179443078, "learning_rate": 4.156441717791411e-05, "loss": 0.282, "step": 272 }, { "epoch": 0.2518450184501845, "grad_norm": 0.5690998644749469, "learning_rate": 4.171779141104294e-05, "loss": 0.3032, "step": 273 }, { "epoch": 0.25276752767527677, "grad_norm": 0.591776250914278, "learning_rate": 4.1871165644171786e-05, "loss": 0.3127, "step": 274 }, { "epoch": 0.253690036900369, "grad_norm": 0.6059632015040894, "learning_rate": 4.2024539877300617e-05, "loss": 0.3166, "step": 275 }, { "epoch": 0.25461254612546125, "grad_norm": 0.6083637647170296, "learning_rate": 4.2177914110429454e-05, "loss": 0.2654, "step": 276 }, { "epoch": 0.2555350553505535, "grad_norm": 0.5890457273173466, "learning_rate": 4.2331288343558284e-05, "loss": 0.2591, "step": 277 }, { "epoch": 0.2564575645756458, "grad_norm": 0.4944776942353346, "learning_rate": 4.2484662576687115e-05, "loss": 0.2517, "step": 278 }, { "epoch": 0.257380073800738, "grad_norm": 0.6643790613200776, "learning_rate": 4.263803680981595e-05, "loss": 0.3287, "step": 279 }, { "epoch": 0.25830258302583026, "grad_norm": 0.5781262612195905, "learning_rate": 4.279141104294479e-05, "loss": 0.2908, "step": 280 }, { "epoch": 0.2592250922509225, "grad_norm": 0.5079966726432265, "learning_rate": 4.2944785276073626e-05, "loss": 0.2437, "step": 281 }, { "epoch": 0.26014760147601473, "grad_norm": 0.6008854116248558, "learning_rate": 4.309815950920246e-05, "loss": 0.2783, "step": 282 }, { "epoch": 0.261070110701107, "grad_norm": 0.5490644205236153, "learning_rate": 4.3251533742331294e-05, "loss": 0.2773, "step": 283 }, { "epoch": 0.26199261992619927, "grad_norm": 0.5990005074423055, "learning_rate": 4.3404907975460124e-05, "loss": 0.342, "step": 284 }, { "epoch": 0.2629151291512915, "grad_norm": 0.6264105081239723, "learning_rate": 4.3558282208588955e-05, "loss": 0.306, "step": 285 }, { "epoch": 0.26383763837638374, "grad_norm": 0.5033413509882185, "learning_rate": 4.371165644171779e-05, "loss": 0.26, "step": 286 }, { "epoch": 0.26476014760147604, "grad_norm": 0.554351447814042, "learning_rate": 4.386503067484663e-05, "loss": 0.2854, "step": 287 }, { "epoch": 0.2656826568265683, "grad_norm": 0.5302773165225984, "learning_rate": 4.4018404907975466e-05, "loss": 0.2562, "step": 288 }, { "epoch": 0.2666051660516605, "grad_norm": 0.5183141955611247, "learning_rate": 4.41717791411043e-05, "loss": 0.2559, "step": 289 }, { "epoch": 0.26752767527675275, "grad_norm": 0.5409284590210018, "learning_rate": 4.432515337423313e-05, "loss": 0.2563, "step": 290 }, { "epoch": 0.26845018450184505, "grad_norm": 0.6393051011337837, "learning_rate": 4.4478527607361964e-05, "loss": 0.3165, "step": 291 }, { "epoch": 0.2693726937269373, "grad_norm": 0.5559185337854181, "learning_rate": 4.4631901840490795e-05, "loss": 0.3145, "step": 292 }, { "epoch": 0.2702952029520295, "grad_norm": 0.48707012990996873, "learning_rate": 4.478527607361964e-05, "loss": 0.286, "step": 293 }, { "epoch": 0.27121771217712176, "grad_norm": 1.1765119695816575, "learning_rate": 4.493865030674847e-05, "loss": 0.2925, "step": 294 }, { "epoch": 0.272140221402214, "grad_norm": 0.5092749730753052, "learning_rate": 4.5092024539877307e-05, "loss": 0.2678, "step": 295 }, { "epoch": 0.2730627306273063, "grad_norm": 0.49642035968887865, "learning_rate": 4.524539877300614e-05, "loss": 0.2772, "step": 296 }, { "epoch": 0.27398523985239853, "grad_norm": 0.5680854452902957, "learning_rate": 4.539877300613497e-05, "loss": 0.2835, "step": 297 }, { "epoch": 0.27490774907749077, "grad_norm": 0.6033918944567603, "learning_rate": 4.5552147239263805e-05, "loss": 0.3041, "step": 298 }, { "epoch": 0.275830258302583, "grad_norm": 0.5920736549312533, "learning_rate": 4.570552147239264e-05, "loss": 0.3011, "step": 299 }, { "epoch": 0.2767527675276753, "grad_norm": 0.5570819697177297, "learning_rate": 4.585889570552148e-05, "loss": 0.2795, "step": 300 }, { "epoch": 0.27767527675276754, "grad_norm": 0.5521443698108845, "learning_rate": 4.601226993865031e-05, "loss": 0.2877, "step": 301 }, { "epoch": 0.2785977859778598, "grad_norm": 0.6494127895380299, "learning_rate": 4.616564417177914e-05, "loss": 0.3129, "step": 302 }, { "epoch": 0.279520295202952, "grad_norm": 0.5642988518110355, "learning_rate": 4.631901840490798e-05, "loss": 0.2724, "step": 303 }, { "epoch": 0.28044280442804426, "grad_norm": 0.5641545725751617, "learning_rate": 4.647239263803681e-05, "loss": 0.2747, "step": 304 }, { "epoch": 0.28136531365313655, "grad_norm": 0.651919658466901, "learning_rate": 4.6625766871165645e-05, "loss": 0.297, "step": 305 }, { "epoch": 0.2822878228782288, "grad_norm": 0.5718174535396574, "learning_rate": 4.677914110429448e-05, "loss": 0.3027, "step": 306 }, { "epoch": 0.283210332103321, "grad_norm": 0.48701493995012424, "learning_rate": 4.693251533742332e-05, "loss": 0.278, "step": 307 }, { "epoch": 0.28413284132841327, "grad_norm": 0.5284403071949798, "learning_rate": 4.708588957055215e-05, "loss": 0.3056, "step": 308 }, { "epoch": 0.28505535055350556, "grad_norm": 0.5913958628137351, "learning_rate": 4.723926380368098e-05, "loss": 0.303, "step": 309 }, { "epoch": 0.2859778597785978, "grad_norm": 0.4965955407068457, "learning_rate": 4.739263803680982e-05, "loss": 0.2499, "step": 310 }, { "epoch": 0.28690036900369004, "grad_norm": 0.5326185286236886, "learning_rate": 4.754601226993865e-05, "loss": 0.2947, "step": 311 }, { "epoch": 0.2878228782287823, "grad_norm": 0.5123442651860254, "learning_rate": 4.769938650306749e-05, "loss": 0.2888, "step": 312 }, { "epoch": 0.2887453874538745, "grad_norm": 0.5538925573268508, "learning_rate": 4.785276073619632e-05, "loss": 0.2939, "step": 313 }, { "epoch": 0.2896678966789668, "grad_norm": 0.5218442693746654, "learning_rate": 4.800613496932516e-05, "loss": 0.3041, "step": 314 }, { "epoch": 0.29059040590405905, "grad_norm": 0.562035249405384, "learning_rate": 4.815950920245399e-05, "loss": 0.2851, "step": 315 }, { "epoch": 0.2915129151291513, "grad_norm": 0.536590531309307, "learning_rate": 4.831288343558282e-05, "loss": 0.3011, "step": 316 }, { "epoch": 0.2924354243542435, "grad_norm": 0.5728269165630729, "learning_rate": 4.846625766871166e-05, "loss": 0.2891, "step": 317 }, { "epoch": 0.2933579335793358, "grad_norm": 0.5526588133095315, "learning_rate": 4.8619631901840495e-05, "loss": 0.2873, "step": 318 }, { "epoch": 0.29428044280442806, "grad_norm": 0.6078040240981145, "learning_rate": 4.877300613496933e-05, "loss": 0.2559, "step": 319 }, { "epoch": 0.2952029520295203, "grad_norm": 0.5684796023103894, "learning_rate": 4.892638036809816e-05, "loss": 0.3157, "step": 320 }, { "epoch": 0.29612546125461253, "grad_norm": 0.5847394173098899, "learning_rate": 4.907975460122699e-05, "loss": 0.3068, "step": 321 }, { "epoch": 0.29704797047970477, "grad_norm": 0.5444187455969381, "learning_rate": 4.923312883435583e-05, "loss": 0.2886, "step": 322 }, { "epoch": 0.29797047970479706, "grad_norm": 0.5453378770392785, "learning_rate": 4.938650306748466e-05, "loss": 0.2962, "step": 323 }, { "epoch": 0.2988929889298893, "grad_norm": 0.5145521435893671, "learning_rate": 4.9539877300613504e-05, "loss": 0.2884, "step": 324 }, { "epoch": 0.29981549815498154, "grad_norm": 0.54156802954864, "learning_rate": 4.9693251533742335e-05, "loss": 0.2766, "step": 325 }, { "epoch": 0.3007380073800738, "grad_norm": 0.548183894892192, "learning_rate": 4.984662576687117e-05, "loss": 0.2851, "step": 326 }, { "epoch": 0.3016605166051661, "grad_norm": 0.5241708687339598, "learning_rate": 5e-05, "loss": 0.3003, "step": 327 }, { "epoch": 0.3025830258302583, "grad_norm": 0.5247623186991769, "learning_rate": 4.999998559009648e-05, "loss": 0.2907, "step": 328 }, { "epoch": 0.30350553505535055, "grad_norm": 0.46550756716924957, "learning_rate": 4.999994236040253e-05, "loss": 0.2773, "step": 329 }, { "epoch": 0.3044280442804428, "grad_norm": 0.5464343854379993, "learning_rate": 4.999987031096798e-05, "loss": 0.2838, "step": 330 }, { "epoch": 0.3053505535055351, "grad_norm": 0.5408842372543923, "learning_rate": 4.999976944187589e-05, "loss": 0.3108, "step": 331 }, { "epoch": 0.3062730627306273, "grad_norm": 0.47661724256796645, "learning_rate": 4.999963975324254e-05, "loss": 0.2778, "step": 332 }, { "epoch": 0.30719557195571956, "grad_norm": 0.5485844438480383, "learning_rate": 4.9999481245217444e-05, "loss": 0.2658, "step": 333 }, { "epoch": 0.3081180811808118, "grad_norm": 0.7530365626718195, "learning_rate": 4.999929391798332e-05, "loss": 0.3133, "step": 334 }, { "epoch": 0.30904059040590404, "grad_norm": 0.6008376648478589, "learning_rate": 4.999907777175612e-05, "loss": 0.2975, "step": 335 }, { "epoch": 0.30996309963099633, "grad_norm": 0.5622146369546329, "learning_rate": 4.999883280678501e-05, "loss": 0.2682, "step": 336 }, { "epoch": 0.31088560885608857, "grad_norm": 0.5895201399404543, "learning_rate": 4.999855902335239e-05, "loss": 0.2942, "step": 337 }, { "epoch": 0.3118081180811808, "grad_norm": 0.5241668276774941, "learning_rate": 4.999825642177387e-05, "loss": 0.3131, "step": 338 }, { "epoch": 0.31273062730627305, "grad_norm": 0.499627002160098, "learning_rate": 4.9997925002398295e-05, "loss": 0.2928, "step": 339 }, { "epoch": 0.31365313653136534, "grad_norm": 0.550188057582933, "learning_rate": 4.9997564765607716e-05, "loss": 0.2991, "step": 340 }, { "epoch": 0.3145756457564576, "grad_norm": 0.5371175592847117, "learning_rate": 4.999717571181742e-05, "loss": 0.295, "step": 341 }, { "epoch": 0.3154981549815498, "grad_norm": 0.5493905636808714, "learning_rate": 4.9996757841475894e-05, "loss": 0.2905, "step": 342 }, { "epoch": 0.31642066420664205, "grad_norm": 0.5253806832374459, "learning_rate": 4.9996311155064856e-05, "loss": 0.2948, "step": 343 }, { "epoch": 0.3173431734317343, "grad_norm": 0.5539799821025654, "learning_rate": 4.9995835653099254e-05, "loss": 0.291, "step": 344 }, { "epoch": 0.3182656826568266, "grad_norm": 0.5550654258039699, "learning_rate": 4.999533133612723e-05, "loss": 0.2821, "step": 345 }, { "epoch": 0.3191881918819188, "grad_norm": 0.7137968310290836, "learning_rate": 4.9994798204730166e-05, "loss": 0.3089, "step": 346 }, { "epoch": 0.32011070110701106, "grad_norm": 0.5139241368756094, "learning_rate": 4.999423625952264e-05, "loss": 0.2847, "step": 347 }, { "epoch": 0.3210332103321033, "grad_norm": 0.5262978328667104, "learning_rate": 4.999364550115248e-05, "loss": 0.2452, "step": 348 }, { "epoch": 0.3219557195571956, "grad_norm": 0.5347337657622876, "learning_rate": 4.9993025930300686e-05, "loss": 0.2763, "step": 349 }, { "epoch": 0.32287822878228783, "grad_norm": 0.6030031508392452, "learning_rate": 4.9992377547681505e-05, "loss": 0.3049, "step": 350 }, { "epoch": 0.3238007380073801, "grad_norm": 0.5946252763444188, "learning_rate": 4.999170035404239e-05, "loss": 0.2984, "step": 351 }, { "epoch": 0.3247232472324723, "grad_norm": 0.5719874620235956, "learning_rate": 4.999099435016399e-05, "loss": 0.275, "step": 352 }, { "epoch": 0.32564575645756455, "grad_norm": 0.7364304283462457, "learning_rate": 4.99902595368602e-05, "loss": 0.2895, "step": 353 }, { "epoch": 0.32656826568265684, "grad_norm": 0.5543236697442724, "learning_rate": 4.998949591497809e-05, "loss": 0.339, "step": 354 }, { "epoch": 0.3274907749077491, "grad_norm": 0.4761992054593626, "learning_rate": 4.998870348539797e-05, "loss": 0.2633, "step": 355 }, { "epoch": 0.3284132841328413, "grad_norm": 0.5726867972200776, "learning_rate": 4.998788224903334e-05, "loss": 0.2848, "step": 356 }, { "epoch": 0.32933579335793356, "grad_norm": 0.4876367361430421, "learning_rate": 4.9987032206830906e-05, "loss": 0.2923, "step": 357 }, { "epoch": 0.33025830258302585, "grad_norm": 0.5039945497900252, "learning_rate": 4.9986153359770614e-05, "loss": 0.2762, "step": 358 }, { "epoch": 0.3311808118081181, "grad_norm": 0.5138969084818104, "learning_rate": 4.998524570886558e-05, "loss": 0.2611, "step": 359 }, { "epoch": 0.33210332103321033, "grad_norm": 0.5593828298420278, "learning_rate": 4.998430925516213e-05, "loss": 0.2837, "step": 360 }, { "epoch": 0.33302583025830257, "grad_norm": 0.5676564634771994, "learning_rate": 4.9983343999739805e-05, "loss": 0.3008, "step": 361 }, { "epoch": 0.3339483394833948, "grad_norm": 0.5540762769745929, "learning_rate": 4.998234994371135e-05, "loss": 0.2901, "step": 362 }, { "epoch": 0.3348708487084871, "grad_norm": 0.5275548687528755, "learning_rate": 4.99813270882227e-05, "loss": 0.294, "step": 363 }, { "epoch": 0.33579335793357934, "grad_norm": 0.4698081370394266, "learning_rate": 4.9980275434452995e-05, "loss": 0.2585, "step": 364 }, { "epoch": 0.3367158671586716, "grad_norm": 0.5518300809849733, "learning_rate": 4.997919498361457e-05, "loss": 0.294, "step": 365 }, { "epoch": 0.3376383763837638, "grad_norm": 0.4924798980807726, "learning_rate": 4.997808573695297e-05, "loss": 0.2931, "step": 366 }, { "epoch": 0.3385608856088561, "grad_norm": 0.4971688337772927, "learning_rate": 4.997694769574692e-05, "loss": 0.2935, "step": 367 }, { "epoch": 0.33948339483394835, "grad_norm": 0.4955383153372864, "learning_rate": 4.997578086130834e-05, "loss": 0.2842, "step": 368 }, { "epoch": 0.3404059040590406, "grad_norm": 0.5577743523602456, "learning_rate": 4.997458523498236e-05, "loss": 0.2781, "step": 369 }, { "epoch": 0.3413284132841328, "grad_norm": 0.5039551863232465, "learning_rate": 4.9973360818147276e-05, "loss": 0.2839, "step": 370 }, { "epoch": 0.3422509225092251, "grad_norm": 0.6590350760855003, "learning_rate": 4.99721076122146e-05, "loss": 0.303, "step": 371 }, { "epoch": 0.34317343173431736, "grad_norm": 0.5389630694407243, "learning_rate": 4.9970825618629e-05, "loss": 0.2997, "step": 372 }, { "epoch": 0.3440959409594096, "grad_norm": 0.5616168426598683, "learning_rate": 4.9969514838868364e-05, "loss": 0.2928, "step": 373 }, { "epoch": 0.34501845018450183, "grad_norm": 0.4749345164297593, "learning_rate": 4.996817527444374e-05, "loss": 0.2891, "step": 374 }, { "epoch": 0.3459409594095941, "grad_norm": 0.5570269133618422, "learning_rate": 4.996680692689938e-05, "loss": 0.3089, "step": 375 }, { "epoch": 0.34686346863468637, "grad_norm": 0.5620682199124284, "learning_rate": 4.996540979781269e-05, "loss": 0.2734, "step": 376 }, { "epoch": 0.3477859778597786, "grad_norm": 0.5092748080272552, "learning_rate": 4.996398388879427e-05, "loss": 0.3017, "step": 377 }, { "epoch": 0.34870848708487084, "grad_norm": 0.47199034787323235, "learning_rate": 4.996252920148791e-05, "loss": 0.2699, "step": 378 }, { "epoch": 0.3496309963099631, "grad_norm": 0.521363279262548, "learning_rate": 4.996104573757054e-05, "loss": 0.3283, "step": 379 }, { "epoch": 0.3505535055350554, "grad_norm": 0.513514483372052, "learning_rate": 4.995953349875232e-05, "loss": 0.2764, "step": 380 }, { "epoch": 0.3514760147601476, "grad_norm": 0.4611726498410872, "learning_rate": 4.9957992486776516e-05, "loss": 0.2941, "step": 381 }, { "epoch": 0.35239852398523985, "grad_norm": 0.439926621612698, "learning_rate": 4.995642270341961e-05, "loss": 0.2696, "step": 382 }, { "epoch": 0.3533210332103321, "grad_norm": 0.47508444380843656, "learning_rate": 4.995482415049123e-05, "loss": 0.2904, "step": 383 }, { "epoch": 0.35424354243542433, "grad_norm": 0.47065356253686386, "learning_rate": 4.995319682983418e-05, "loss": 0.2967, "step": 384 }, { "epoch": 0.3551660516605166, "grad_norm": 0.5298362343070417, "learning_rate": 4.995154074332441e-05, "loss": 0.2865, "step": 385 }, { "epoch": 0.35608856088560886, "grad_norm": 0.4686677587974533, "learning_rate": 4.994985589287107e-05, "loss": 0.2953, "step": 386 }, { "epoch": 0.3570110701107011, "grad_norm": 0.4622899566823007, "learning_rate": 4.994814228041641e-05, "loss": 0.2846, "step": 387 }, { "epoch": 0.35793357933579334, "grad_norm": 0.5023602796309572, "learning_rate": 4.9946399907935894e-05, "loss": 0.2982, "step": 388 }, { "epoch": 0.35885608856088563, "grad_norm": 0.47765997988304104, "learning_rate": 4.9944628777438104e-05, "loss": 0.2631, "step": 389 }, { "epoch": 0.35977859778597787, "grad_norm": 0.5233065519746218, "learning_rate": 4.99428288909648e-05, "loss": 0.3156, "step": 390 }, { "epoch": 0.3607011070110701, "grad_norm": 0.5077270448652319, "learning_rate": 4.994100025059085e-05, "loss": 0.3018, "step": 391 }, { "epoch": 0.36162361623616235, "grad_norm": 0.501254905607358, "learning_rate": 4.993914285842433e-05, "loss": 0.2854, "step": 392 }, { "epoch": 0.3625461254612546, "grad_norm": 0.462697133871905, "learning_rate": 4.9937256716606394e-05, "loss": 0.2777, "step": 393 }, { "epoch": 0.3634686346863469, "grad_norm": 0.5345478233734732, "learning_rate": 4.99353418273114e-05, "loss": 0.2797, "step": 394 }, { "epoch": 0.3643911439114391, "grad_norm": 0.5614041609883599, "learning_rate": 4.993339819274679e-05, "loss": 0.3221, "step": 395 }, { "epoch": 0.36531365313653136, "grad_norm": 0.5208539310813163, "learning_rate": 4.9931425815153205e-05, "loss": 0.2901, "step": 396 }, { "epoch": 0.3662361623616236, "grad_norm": 0.4757548139725171, "learning_rate": 4.992942469680436e-05, "loss": 0.274, "step": 397 }, { "epoch": 0.3671586715867159, "grad_norm": 0.5076300925523486, "learning_rate": 4.992739484000714e-05, "loss": 0.2836, "step": 398 }, { "epoch": 0.36808118081180813, "grad_norm": 0.49140369112610577, "learning_rate": 4.992533624710154e-05, "loss": 0.2572, "step": 399 }, { "epoch": 0.36900369003690037, "grad_norm": 0.4384787991074871, "learning_rate": 4.992324892046069e-05, "loss": 0.2559, "step": 400 }, { "epoch": 0.3699261992619926, "grad_norm": 0.47598607693350403, "learning_rate": 4.992113286249086e-05, "loss": 0.2611, "step": 401 }, { "epoch": 0.37084870848708484, "grad_norm": 0.49850175098728877, "learning_rate": 4.9918988075631404e-05, "loss": 0.2901, "step": 402 }, { "epoch": 0.37177121771217714, "grad_norm": 0.5659124528427997, "learning_rate": 4.991681456235483e-05, "loss": 0.2994, "step": 403 }, { "epoch": 0.3726937269372694, "grad_norm": 0.4704789200086404, "learning_rate": 4.991461232516675e-05, "loss": 0.275, "step": 404 }, { "epoch": 0.3736162361623616, "grad_norm": 0.5042793616248943, "learning_rate": 4.9912381366605876e-05, "loss": 0.2801, "step": 405 }, { "epoch": 0.37453874538745385, "grad_norm": 0.5183087948851817, "learning_rate": 4.991012168924404e-05, "loss": 0.299, "step": 406 }, { "epoch": 0.37546125461254615, "grad_norm": 0.49887343246856986, "learning_rate": 4.9907833295686185e-05, "loss": 0.2726, "step": 407 }, { "epoch": 0.3763837638376384, "grad_norm": 0.5472149090701182, "learning_rate": 4.990551618857035e-05, "loss": 0.2977, "step": 408 }, { "epoch": 0.3773062730627306, "grad_norm": 0.4205940791760018, "learning_rate": 4.990317037056769e-05, "loss": 0.2619, "step": 409 }, { "epoch": 0.37822878228782286, "grad_norm": 0.4628735099185641, "learning_rate": 4.990079584438243e-05, "loss": 0.2777, "step": 410 }, { "epoch": 0.37915129151291516, "grad_norm": 0.44009518829262617, "learning_rate": 4.989839261275191e-05, "loss": 0.2605, "step": 411 }, { "epoch": 0.3800738007380074, "grad_norm": 0.5271666577742473, "learning_rate": 4.989596067844656e-05, "loss": 0.3104, "step": 412 }, { "epoch": 0.38099630996309963, "grad_norm": 0.49637247500333725, "learning_rate": 4.989350004426989e-05, "loss": 0.2911, "step": 413 }, { "epoch": 0.38191881918819187, "grad_norm": 0.8195228166776699, "learning_rate": 4.9891010713058506e-05, "loss": 0.2697, "step": 414 }, { "epoch": 0.3828413284132841, "grad_norm": 0.5735252085658141, "learning_rate": 4.9888492687682096e-05, "loss": 0.2996, "step": 415 }, { "epoch": 0.3837638376383764, "grad_norm": 0.4625217340159638, "learning_rate": 4.98859459710434e-05, "loss": 0.2433, "step": 416 }, { "epoch": 0.38468634686346864, "grad_norm": 0.5189849572409303, "learning_rate": 4.988337056607827e-05, "loss": 0.2974, "step": 417 }, { "epoch": 0.3856088560885609, "grad_norm": 0.5636750135554274, "learning_rate": 4.988076647575562e-05, "loss": 0.3159, "step": 418 }, { "epoch": 0.3865313653136531, "grad_norm": 0.4692607149916364, "learning_rate": 4.987813370307739e-05, "loss": 0.2933, "step": 419 }, { "epoch": 0.3874538745387454, "grad_norm": 0.4625490071520199, "learning_rate": 4.987547225107866e-05, "loss": 0.2913, "step": 420 }, { "epoch": 0.38837638376383765, "grad_norm": 0.5310824156987997, "learning_rate": 4.987278212282751e-05, "loss": 0.2797, "step": 421 }, { "epoch": 0.3892988929889299, "grad_norm": 0.45783594777827885, "learning_rate": 4.9870063321425105e-05, "loss": 0.279, "step": 422 }, { "epoch": 0.39022140221402213, "grad_norm": 0.4546793722346438, "learning_rate": 4.986731585000566e-05, "loss": 0.2727, "step": 423 }, { "epoch": 0.39114391143911437, "grad_norm": 0.5003814360244799, "learning_rate": 4.9864539711736425e-05, "loss": 0.2842, "step": 424 }, { "epoch": 0.39206642066420666, "grad_norm": 0.48676719250433703, "learning_rate": 4.986173490981773e-05, "loss": 0.3123, "step": 425 }, { "epoch": 0.3929889298892989, "grad_norm": 0.44752651152763834, "learning_rate": 4.985890144748292e-05, "loss": 0.2714, "step": 426 }, { "epoch": 0.39391143911439114, "grad_norm": 0.4524183699229073, "learning_rate": 4.985603932799839e-05, "loss": 0.2521, "step": 427 }, { "epoch": 0.3948339483394834, "grad_norm": 0.6702816647145312, "learning_rate": 4.9853148554663564e-05, "loss": 0.2766, "step": 428 }, { "epoch": 0.39575645756457567, "grad_norm": 0.45992023501143997, "learning_rate": 4.985022913081091e-05, "loss": 0.2684, "step": 429 }, { "epoch": 0.3966789667896679, "grad_norm": 0.49294214318656143, "learning_rate": 4.9847281059805914e-05, "loss": 0.2782, "step": 430 }, { "epoch": 0.39760147601476015, "grad_norm": 0.42437535528454967, "learning_rate": 4.98443043450471e-05, "loss": 0.2648, "step": 431 }, { "epoch": 0.3985239852398524, "grad_norm": 0.45541868520627043, "learning_rate": 4.9841298989965984e-05, "loss": 0.2667, "step": 432 }, { "epoch": 0.3994464944649446, "grad_norm": 0.5286063432706287, "learning_rate": 4.983826499802712e-05, "loss": 0.2709, "step": 433 }, { "epoch": 0.4003690036900369, "grad_norm": 0.4543353369578046, "learning_rate": 4.9835202372728086e-05, "loss": 0.2474, "step": 434 }, { "epoch": 0.40129151291512916, "grad_norm": 0.52459663995117, "learning_rate": 4.9832111117599436e-05, "loss": 0.2831, "step": 435 }, { "epoch": 0.4022140221402214, "grad_norm": 0.5242387005681102, "learning_rate": 4.982899123620475e-05, "loss": 0.2708, "step": 436 }, { "epoch": 0.40313653136531363, "grad_norm": 0.507220401515919, "learning_rate": 4.982584273214061e-05, "loss": 0.3001, "step": 437 }, { "epoch": 0.4040590405904059, "grad_norm": 0.44289090289551086, "learning_rate": 4.982266560903657e-05, "loss": 0.2749, "step": 438 }, { "epoch": 0.40498154981549817, "grad_norm": 0.4860542261240817, "learning_rate": 4.981945987055521e-05, "loss": 0.2865, "step": 439 }, { "epoch": 0.4059040590405904, "grad_norm": 0.48611623956271083, "learning_rate": 4.981622552039207e-05, "loss": 0.2704, "step": 440 }, { "epoch": 0.40682656826568264, "grad_norm": 0.5231243198031841, "learning_rate": 4.981296256227569e-05, "loss": 0.2918, "step": 441 }, { "epoch": 0.4077490774907749, "grad_norm": 0.4645772233906956, "learning_rate": 4.980967099996759e-05, "loss": 0.2333, "step": 442 }, { "epoch": 0.4086715867158672, "grad_norm": 0.5057641477236923, "learning_rate": 4.980635083726225e-05, "loss": 0.2866, "step": 443 }, { "epoch": 0.4095940959409594, "grad_norm": 0.5746343137271073, "learning_rate": 4.980300207798711e-05, "loss": 0.2751, "step": 444 }, { "epoch": 0.41051660516605165, "grad_norm": 0.4494986104587851, "learning_rate": 4.979962472600263e-05, "loss": 0.2815, "step": 445 }, { "epoch": 0.4114391143911439, "grad_norm": 0.6074090221600998, "learning_rate": 4.979621878520216e-05, "loss": 0.3311, "step": 446 }, { "epoch": 0.4123616236162362, "grad_norm": 0.43702703064995774, "learning_rate": 4.979278425951207e-05, "loss": 0.2654, "step": 447 }, { "epoch": 0.4132841328413284, "grad_norm": 0.4835542229937427, "learning_rate": 4.978932115289164e-05, "loss": 0.2823, "step": 448 }, { "epoch": 0.41420664206642066, "grad_norm": 0.48416396231619063, "learning_rate": 4.9785829469333116e-05, "loss": 0.2637, "step": 449 }, { "epoch": 0.4151291512915129, "grad_norm": 0.4661535425065205, "learning_rate": 4.978230921286168e-05, "loss": 0.3024, "step": 450 }, { "epoch": 0.4160516605166052, "grad_norm": 0.5035766846654282, "learning_rate": 4.9778760387535465e-05, "loss": 0.3318, "step": 451 }, { "epoch": 0.41697416974169743, "grad_norm": 0.45587434855728826, "learning_rate": 4.977518299744552e-05, "loss": 0.2778, "step": 452 }, { "epoch": 0.41789667896678967, "grad_norm": 0.5226509979413794, "learning_rate": 4.9771577046715846e-05, "loss": 0.3112, "step": 453 }, { "epoch": 0.4188191881918819, "grad_norm": 0.45316940391861665, "learning_rate": 4.976794253950334e-05, "loss": 0.2761, "step": 454 }, { "epoch": 0.41974169741697415, "grad_norm": 0.5061175030402905, "learning_rate": 4.976427947999784e-05, "loss": 0.2869, "step": 455 }, { "epoch": 0.42066420664206644, "grad_norm": 0.5434356789335424, "learning_rate": 4.976058787242209e-05, "loss": 0.2655, "step": 456 }, { "epoch": 0.4215867158671587, "grad_norm": 0.452576602253526, "learning_rate": 4.9756867721031756e-05, "loss": 0.2708, "step": 457 }, { "epoch": 0.4225092250922509, "grad_norm": 0.4961825947084803, "learning_rate": 4.975311903011539e-05, "loss": 0.2806, "step": 458 }, { "epoch": 0.42343173431734316, "grad_norm": 0.4630990428690511, "learning_rate": 4.9749341803994465e-05, "loss": 0.2655, "step": 459 }, { "epoch": 0.42435424354243545, "grad_norm": 0.4956010831171726, "learning_rate": 4.9745536047023324e-05, "loss": 0.2816, "step": 460 }, { "epoch": 0.4252767527675277, "grad_norm": 0.43560042606186716, "learning_rate": 4.974170176358922e-05, "loss": 0.2691, "step": 461 }, { "epoch": 0.4261992619926199, "grad_norm": 0.4878621995774692, "learning_rate": 4.973783895811228e-05, "loss": 0.2893, "step": 462 }, { "epoch": 0.42712177121771217, "grad_norm": 0.4718466074702949, "learning_rate": 4.9733947635045534e-05, "loss": 0.2697, "step": 463 }, { "epoch": 0.4280442804428044, "grad_norm": 0.502923872783246, "learning_rate": 4.9730027798874856e-05, "loss": 0.2944, "step": 464 }, { "epoch": 0.4289667896678967, "grad_norm": 0.5007071278997294, "learning_rate": 4.9726079454119e-05, "loss": 0.2807, "step": 465 }, { "epoch": 0.42988929889298894, "grad_norm": 0.4815546251566584, "learning_rate": 4.97221026053296e-05, "loss": 0.2637, "step": 466 }, { "epoch": 0.4308118081180812, "grad_norm": 0.7239974358847797, "learning_rate": 4.971809725709112e-05, "loss": 0.2806, "step": 467 }, { "epoch": 0.4317343173431734, "grad_norm": 0.4779514843339991, "learning_rate": 4.971406341402091e-05, "loss": 0.3156, "step": 468 }, { "epoch": 0.4326568265682657, "grad_norm": 0.5319628872166638, "learning_rate": 4.9710001080769145e-05, "loss": 0.2581, "step": 469 }, { "epoch": 0.43357933579335795, "grad_norm": 0.4165504237124002, "learning_rate": 4.970591026201884e-05, "loss": 0.2662, "step": 470 }, { "epoch": 0.4345018450184502, "grad_norm": 0.42652362506523284, "learning_rate": 4.970179096248588e-05, "loss": 0.2427, "step": 471 }, { "epoch": 0.4354243542435424, "grad_norm": 0.426956519303912, "learning_rate": 4.969764318691896e-05, "loss": 0.2592, "step": 472 }, { "epoch": 0.43634686346863466, "grad_norm": 0.4964034230546089, "learning_rate": 4.9693466940099596e-05, "loss": 0.2825, "step": 473 }, { "epoch": 0.43726937269372695, "grad_norm": 0.45671563154158606, "learning_rate": 4.968926222684213e-05, "loss": 0.2711, "step": 474 }, { "epoch": 0.4381918819188192, "grad_norm": 0.46933639358843837, "learning_rate": 4.968502905199373e-05, "loss": 0.2746, "step": 475 }, { "epoch": 0.43911439114391143, "grad_norm": 0.47054551084355595, "learning_rate": 4.968076742043437e-05, "loss": 0.2962, "step": 476 }, { "epoch": 0.44003690036900367, "grad_norm": 0.5576416962434739, "learning_rate": 4.967647733707681e-05, "loss": 0.2647, "step": 477 }, { "epoch": 0.44095940959409596, "grad_norm": 0.5280087869537966, "learning_rate": 4.9672158806866645e-05, "loss": 0.3067, "step": 478 }, { "epoch": 0.4418819188191882, "grad_norm": 0.44384985941092053, "learning_rate": 4.9667811834782224e-05, "loss": 0.2678, "step": 479 }, { "epoch": 0.44280442804428044, "grad_norm": 0.5030864516669159, "learning_rate": 4.966343642583472e-05, "loss": 0.3055, "step": 480 }, { "epoch": 0.4437269372693727, "grad_norm": 0.48741228397240355, "learning_rate": 4.965903258506806e-05, "loss": 0.2601, "step": 481 }, { "epoch": 0.4446494464944649, "grad_norm": 0.4955807220446558, "learning_rate": 4.9654600317558965e-05, "loss": 0.2773, "step": 482 }, { "epoch": 0.4455719557195572, "grad_norm": 0.47270716571562477, "learning_rate": 4.9650139628416916e-05, "loss": 0.3089, "step": 483 }, { "epoch": 0.44649446494464945, "grad_norm": 0.4865711142061868, "learning_rate": 4.9645650522784156e-05, "loss": 0.2761, "step": 484 }, { "epoch": 0.4474169741697417, "grad_norm": 0.48214633442910015, "learning_rate": 4.9641133005835696e-05, "loss": 0.2772, "step": 485 }, { "epoch": 0.4483394833948339, "grad_norm": 0.44599604929644415, "learning_rate": 4.963658708277929e-05, "loss": 0.2337, "step": 486 }, { "epoch": 0.4492619926199262, "grad_norm": 0.5107631952533427, "learning_rate": 4.963201275885545e-05, "loss": 0.2595, "step": 487 }, { "epoch": 0.45018450184501846, "grad_norm": 0.4856998967219853, "learning_rate": 4.962741003933742e-05, "loss": 0.2616, "step": 488 }, { "epoch": 0.4511070110701107, "grad_norm": 0.4716394691734046, "learning_rate": 4.962277892953118e-05, "loss": 0.2817, "step": 489 }, { "epoch": 0.45202952029520294, "grad_norm": 0.4449737230991853, "learning_rate": 4.9618119434775436e-05, "loss": 0.2523, "step": 490 }, { "epoch": 0.45295202952029523, "grad_norm": 0.5038411750840269, "learning_rate": 4.961343156044161e-05, "loss": 0.2948, "step": 491 }, { "epoch": 0.45387453874538747, "grad_norm": 0.47723990277212436, "learning_rate": 4.960871531193386e-05, "loss": 0.275, "step": 492 }, { "epoch": 0.4547970479704797, "grad_norm": 0.4874275599114698, "learning_rate": 4.9603970694689036e-05, "loss": 0.2658, "step": 493 }, { "epoch": 0.45571955719557194, "grad_norm": 0.4604630895511357, "learning_rate": 4.959919771417669e-05, "loss": 0.2473, "step": 494 }, { "epoch": 0.4566420664206642, "grad_norm": 0.4433615779641383, "learning_rate": 4.959439637589909e-05, "loss": 0.2648, "step": 495 }, { "epoch": 0.4575645756457565, "grad_norm": 0.47951428273104185, "learning_rate": 4.958956668539117e-05, "loss": 0.2852, "step": 496 }, { "epoch": 0.4584870848708487, "grad_norm": 0.436709393894546, "learning_rate": 4.9584708648220554e-05, "loss": 0.2782, "step": 497 }, { "epoch": 0.45940959409594095, "grad_norm": 0.7256225184114476, "learning_rate": 4.9579822269987574e-05, "loss": 0.2632, "step": 498 }, { "epoch": 0.4603321033210332, "grad_norm": 0.3961328509767162, "learning_rate": 4.9574907556325186e-05, "loss": 0.2534, "step": 499 }, { "epoch": 0.4612546125461255, "grad_norm": 0.41311737848784036, "learning_rate": 4.956996451289906e-05, "loss": 0.2759, "step": 500 }, { "epoch": 0.4621771217712177, "grad_norm": 0.5557796830484363, "learning_rate": 4.956499314540747e-05, "loss": 0.2736, "step": 501 }, { "epoch": 0.46309963099630996, "grad_norm": 0.5150229800472483, "learning_rate": 4.9559993459581375e-05, "loss": 0.3181, "step": 502 }, { "epoch": 0.4640221402214022, "grad_norm": 0.4501951026255143, "learning_rate": 4.955496546118439e-05, "loss": 0.2556, "step": 503 }, { "epoch": 0.46494464944649444, "grad_norm": 0.8463552432381145, "learning_rate": 4.954990915601274e-05, "loss": 0.2482, "step": 504 }, { "epoch": 0.46586715867158673, "grad_norm": 0.43951620309458156, "learning_rate": 4.95448245498953e-05, "loss": 0.2865, "step": 505 }, { "epoch": 0.466789667896679, "grad_norm": 0.46321874246749795, "learning_rate": 4.9539711648693555e-05, "loss": 0.2847, "step": 506 }, { "epoch": 0.4677121771217712, "grad_norm": 0.42236752170429886, "learning_rate": 4.953457045830163e-05, "loss": 0.2766, "step": 507 }, { "epoch": 0.46863468634686345, "grad_norm": 0.5338015589876643, "learning_rate": 4.9529400984646244e-05, "loss": 0.3102, "step": 508 }, { "epoch": 0.46955719557195574, "grad_norm": 0.45534183065412587, "learning_rate": 4.952420323368673e-05, "loss": 0.2976, "step": 509 }, { "epoch": 0.470479704797048, "grad_norm": 0.46562018951031153, "learning_rate": 4.951897721141502e-05, "loss": 0.2961, "step": 510 }, { "epoch": 0.4714022140221402, "grad_norm": 0.4038765927587688, "learning_rate": 4.951372292385561e-05, "loss": 0.2531, "step": 511 }, { "epoch": 0.47232472324723246, "grad_norm": 0.4934140912362845, "learning_rate": 4.950844037706563e-05, "loss": 0.2767, "step": 512 }, { "epoch": 0.4732472324723247, "grad_norm": 0.3985419688294112, "learning_rate": 4.950312957713474e-05, "loss": 0.2293, "step": 513 }, { "epoch": 0.474169741697417, "grad_norm": 0.44235538916203665, "learning_rate": 4.9497790530185194e-05, "loss": 0.2712, "step": 514 }, { "epoch": 0.47509225092250923, "grad_norm": 0.4345617378206702, "learning_rate": 4.9492423242371814e-05, "loss": 0.293, "step": 515 }, { "epoch": 0.47601476014760147, "grad_norm": 0.47623291965082337, "learning_rate": 4.948702771988195e-05, "loss": 0.2989, "step": 516 }, { "epoch": 0.4769372693726937, "grad_norm": 0.5004270904084257, "learning_rate": 4.948160396893553e-05, "loss": 0.2819, "step": 517 }, { "epoch": 0.477859778597786, "grad_norm": 0.43365473204761534, "learning_rate": 4.9476151995785016e-05, "loss": 0.2779, "step": 518 }, { "epoch": 0.47878228782287824, "grad_norm": 0.432061270625897, "learning_rate": 4.9470671806715386e-05, "loss": 0.2705, "step": 519 }, { "epoch": 0.4797047970479705, "grad_norm": 0.511804435991744, "learning_rate": 4.946516340804417e-05, "loss": 0.2963, "step": 520 }, { "epoch": 0.4806273062730627, "grad_norm": 0.4571692558782327, "learning_rate": 4.945962680612142e-05, "loss": 0.2918, "step": 521 }, { "epoch": 0.48154981549815495, "grad_norm": 0.422817092719061, "learning_rate": 4.945406200732966e-05, "loss": 0.2861, "step": 522 }, { "epoch": 0.48247232472324725, "grad_norm": 0.3882985681149562, "learning_rate": 4.9448469018083965e-05, "loss": 0.284, "step": 523 }, { "epoch": 0.4833948339483395, "grad_norm": 0.4059792503571466, "learning_rate": 4.9442847844831884e-05, "loss": 0.2587, "step": 524 }, { "epoch": 0.4843173431734317, "grad_norm": 0.5029319147515259, "learning_rate": 4.9437198494053464e-05, "loss": 0.3026, "step": 525 }, { "epoch": 0.48523985239852396, "grad_norm": 0.501433585973909, "learning_rate": 4.9431520972261236e-05, "loss": 0.2803, "step": 526 }, { "epoch": 0.48616236162361626, "grad_norm": 0.49813327319291245, "learning_rate": 4.94258152860002e-05, "loss": 0.2757, "step": 527 }, { "epoch": 0.4870848708487085, "grad_norm": 0.43818296091000847, "learning_rate": 4.942008144184783e-05, "loss": 0.2547, "step": 528 }, { "epoch": 0.48800738007380073, "grad_norm": 0.4817891496312095, "learning_rate": 4.941431944641405e-05, "loss": 0.2709, "step": 529 }, { "epoch": 0.488929889298893, "grad_norm": 0.45278995667382466, "learning_rate": 4.9408529306341255e-05, "loss": 0.2676, "step": 530 }, { "epoch": 0.48985239852398527, "grad_norm": 0.4683646977976737, "learning_rate": 4.940271102830426e-05, "loss": 0.2736, "step": 531 }, { "epoch": 0.4907749077490775, "grad_norm": 0.5129772082178625, "learning_rate": 4.939686461901034e-05, "loss": 0.3174, "step": 532 }, { "epoch": 0.49169741697416974, "grad_norm": 0.41287356302324596, "learning_rate": 4.9390990085199197e-05, "loss": 0.2495, "step": 533 }, { "epoch": 0.492619926199262, "grad_norm": 0.4634074703994402, "learning_rate": 4.938508743364293e-05, "loss": 0.2689, "step": 534 }, { "epoch": 0.4935424354243542, "grad_norm": 0.44341222256163587, "learning_rate": 4.9379156671146084e-05, "loss": 0.27, "step": 535 }, { "epoch": 0.4944649446494465, "grad_norm": 0.405366223217404, "learning_rate": 4.937319780454559e-05, "loss": 0.2827, "step": 536 }, { "epoch": 0.49538745387453875, "grad_norm": 0.6408259161108691, "learning_rate": 4.936721084071079e-05, "loss": 0.2761, "step": 537 }, { "epoch": 0.496309963099631, "grad_norm": 0.4285821096818319, "learning_rate": 4.936119578654341e-05, "loss": 0.2903, "step": 538 }, { "epoch": 0.49723247232472323, "grad_norm": 0.4718131706229978, "learning_rate": 4.935515264897754e-05, "loss": 0.2527, "step": 539 }, { "epoch": 0.4981549815498155, "grad_norm": 0.4238454146716917, "learning_rate": 4.934908143497969e-05, "loss": 0.2697, "step": 540 }, { "epoch": 0.49907749077490776, "grad_norm": 0.4619380626601982, "learning_rate": 4.934298215154869e-05, "loss": 0.2633, "step": 541 }, { "epoch": 0.5, "grad_norm": 0.4654607007586769, "learning_rate": 4.933685480571575e-05, "loss": 0.2553, "step": 542 }, { "epoch": 0.5009225092250923, "grad_norm": 0.4758385588029141, "learning_rate": 4.933069940454443e-05, "loss": 0.2542, "step": 543 }, { "epoch": 0.5018450184501845, "grad_norm": 0.3888905402060683, "learning_rate": 4.932451595513062e-05, "loss": 0.237, "step": 544 }, { "epoch": 0.5027675276752768, "grad_norm": 0.4446242533662745, "learning_rate": 4.931830446460257e-05, "loss": 0.2769, "step": 545 }, { "epoch": 0.503690036900369, "grad_norm": 0.4179986241757249, "learning_rate": 4.9312064940120825e-05, "loss": 0.2665, "step": 546 }, { "epoch": 0.5046125461254612, "grad_norm": 0.4704726176124594, "learning_rate": 4.9305797388878264e-05, "loss": 0.2698, "step": 547 }, { "epoch": 0.5055350553505535, "grad_norm": 0.42816781624738687, "learning_rate": 4.929950181810008e-05, "loss": 0.2841, "step": 548 }, { "epoch": 0.5064575645756457, "grad_norm": 0.4786245676444808, "learning_rate": 4.929317823504373e-05, "loss": 0.28, "step": 549 }, { "epoch": 0.507380073800738, "grad_norm": 0.48781934756147605, "learning_rate": 4.928682664699904e-05, "loss": 0.2825, "step": 550 }, { "epoch": 0.5083025830258303, "grad_norm": 0.5559869669654536, "learning_rate": 4.928044706128803e-05, "loss": 0.311, "step": 551 }, { "epoch": 0.5092250922509225, "grad_norm": 0.4748900536986569, "learning_rate": 4.927403948526504e-05, "loss": 0.2624, "step": 552 }, { "epoch": 0.5101476014760148, "grad_norm": 0.4957615812794995, "learning_rate": 4.92676039263167e-05, "loss": 0.2369, "step": 553 }, { "epoch": 0.511070110701107, "grad_norm": 0.4130723638312345, "learning_rate": 4.926114039186185e-05, "loss": 0.2626, "step": 554 }, { "epoch": 0.5119926199261993, "grad_norm": 0.40832203191352767, "learning_rate": 4.925464888935162e-05, "loss": 0.2652, "step": 555 }, { "epoch": 0.5129151291512916, "grad_norm": 0.42179080972296734, "learning_rate": 4.924812942626934e-05, "loss": 0.2836, "step": 556 }, { "epoch": 0.5138376383763837, "grad_norm": 0.47362251067172373, "learning_rate": 4.924158201013062e-05, "loss": 0.2822, "step": 557 }, { "epoch": 0.514760147601476, "grad_norm": 0.4585248532810261, "learning_rate": 4.923500664848326e-05, "loss": 0.263, "step": 558 }, { "epoch": 0.5156826568265682, "grad_norm": 0.4612972283763447, "learning_rate": 4.922840334890729e-05, "loss": 0.2869, "step": 559 }, { "epoch": 0.5166051660516605, "grad_norm": 0.4912111054419741, "learning_rate": 4.922177211901494e-05, "loss": 0.2808, "step": 560 }, { "epoch": 0.5175276752767528, "grad_norm": 0.4096288303101279, "learning_rate": 4.921511296645064e-05, "loss": 0.2728, "step": 561 }, { "epoch": 0.518450184501845, "grad_norm": 0.4231251335442949, "learning_rate": 4.920842589889102e-05, "loss": 0.2457, "step": 562 }, { "epoch": 0.5193726937269373, "grad_norm": 0.4714855709651646, "learning_rate": 4.9201710924044865e-05, "loss": 0.254, "step": 563 }, { "epoch": 0.5202952029520295, "grad_norm": 0.4582728610686477, "learning_rate": 4.9194968049653144e-05, "loss": 0.2885, "step": 564 }, { "epoch": 0.5212177121771218, "grad_norm": 0.4365802042178772, "learning_rate": 4.9188197283489015e-05, "loss": 0.2736, "step": 565 }, { "epoch": 0.522140221402214, "grad_norm": 0.39054096278502237, "learning_rate": 4.918139863335774e-05, "loss": 0.2512, "step": 566 }, { "epoch": 0.5230627306273062, "grad_norm": 0.3881999562207738, "learning_rate": 4.917457210709675e-05, "loss": 0.2457, "step": 567 }, { "epoch": 0.5239852398523985, "grad_norm": 0.4230922310859938, "learning_rate": 4.9167717712575635e-05, "loss": 0.2852, "step": 568 }, { "epoch": 0.5249077490774908, "grad_norm": 0.4539687703260704, "learning_rate": 4.916083545769607e-05, "loss": 0.276, "step": 569 }, { "epoch": 0.525830258302583, "grad_norm": 0.4634154156646078, "learning_rate": 4.915392535039187e-05, "loss": 0.2664, "step": 570 }, { "epoch": 0.5267527675276753, "grad_norm": 0.4106928537097177, "learning_rate": 4.914698739862895e-05, "loss": 0.253, "step": 571 }, { "epoch": 0.5276752767527675, "grad_norm": 0.4801224680728872, "learning_rate": 4.9140021610405326e-05, "loss": 0.2962, "step": 572 }, { "epoch": 0.5285977859778598, "grad_norm": 0.4660747634206696, "learning_rate": 4.913302799375112e-05, "loss": 0.2839, "step": 573 }, { "epoch": 0.5295202952029521, "grad_norm": 0.4643565869221289, "learning_rate": 4.91260065567285e-05, "loss": 0.2967, "step": 574 }, { "epoch": 0.5304428044280443, "grad_norm": 0.4565991699458606, "learning_rate": 4.911895730743174e-05, "loss": 0.2828, "step": 575 }, { "epoch": 0.5313653136531366, "grad_norm": 0.4906222604967463, "learning_rate": 4.9111880253987144e-05, "loss": 0.2629, "step": 576 }, { "epoch": 0.5322878228782287, "grad_norm": 0.5290433906880226, "learning_rate": 4.9104775404553096e-05, "loss": 0.3066, "step": 577 }, { "epoch": 0.533210332103321, "grad_norm": 0.38292404341294, "learning_rate": 4.909764276732001e-05, "loss": 0.2509, "step": 578 }, { "epoch": 0.5341328413284133, "grad_norm": 0.470498772870862, "learning_rate": 4.9090482350510336e-05, "loss": 0.2686, "step": 579 }, { "epoch": 0.5350553505535055, "grad_norm": 0.49653229082754774, "learning_rate": 4.908329416237855e-05, "loss": 0.3074, "step": 580 }, { "epoch": 0.5359778597785978, "grad_norm": 0.4672373400263147, "learning_rate": 4.907607821121112e-05, "loss": 0.3088, "step": 581 }, { "epoch": 0.5369003690036901, "grad_norm": 0.4532750252264721, "learning_rate": 4.906883450532657e-05, "loss": 0.3046, "step": 582 }, { "epoch": 0.5378228782287823, "grad_norm": 0.39484226178695664, "learning_rate": 4.906156305307536e-05, "loss": 0.2675, "step": 583 }, { "epoch": 0.5387453874538746, "grad_norm": 0.46409690143882776, "learning_rate": 4.905426386283998e-05, "loss": 0.2799, "step": 584 }, { "epoch": 0.5396678966789668, "grad_norm": 0.42793988061634014, "learning_rate": 4.904693694303488e-05, "loss": 0.2804, "step": 585 }, { "epoch": 0.540590405904059, "grad_norm": 0.4563227359982457, "learning_rate": 4.9039582302106465e-05, "loss": 0.2755, "step": 586 }, { "epoch": 0.5415129151291513, "grad_norm": 0.5135009107400244, "learning_rate": 4.903219994853313e-05, "loss": 0.286, "step": 587 }, { "epoch": 0.5424354243542435, "grad_norm": 0.46841319933602094, "learning_rate": 4.902478989082517e-05, "loss": 0.2903, "step": 588 }, { "epoch": 0.5433579335793358, "grad_norm": 0.43650400749097834, "learning_rate": 4.901735213752486e-05, "loss": 0.2392, "step": 589 }, { "epoch": 0.544280442804428, "grad_norm": 0.43694449569568666, "learning_rate": 4.900988669720637e-05, "loss": 0.2756, "step": 590 }, { "epoch": 0.5452029520295203, "grad_norm": 0.4318039876501152, "learning_rate": 4.9002393578475816e-05, "loss": 0.2898, "step": 591 }, { "epoch": 0.5461254612546126, "grad_norm": 0.5479363128059827, "learning_rate": 4.89948727899712e-05, "loss": 0.3107, "step": 592 }, { "epoch": 0.5470479704797048, "grad_norm": 0.4227935698497923, "learning_rate": 4.898732434036244e-05, "loss": 0.2884, "step": 593 }, { "epoch": 0.5479704797047971, "grad_norm": 0.4238342532266894, "learning_rate": 4.897974823835131e-05, "loss": 0.2528, "step": 594 }, { "epoch": 0.5488929889298892, "grad_norm": 0.47695834978409135, "learning_rate": 4.89721444926715e-05, "loss": 0.2873, "step": 595 }, { "epoch": 0.5498154981549815, "grad_norm": 0.4838457590674877, "learning_rate": 4.896451311208854e-05, "loss": 0.2601, "step": 596 }, { "epoch": 0.5507380073800738, "grad_norm": 0.4332730618581192, "learning_rate": 4.895685410539983e-05, "loss": 0.2478, "step": 597 }, { "epoch": 0.551660516605166, "grad_norm": 0.4375563350114113, "learning_rate": 4.894916748143461e-05, "loss": 0.2638, "step": 598 }, { "epoch": 0.5525830258302583, "grad_norm": 0.45076712957616827, "learning_rate": 4.894145324905396e-05, "loss": 0.2684, "step": 599 }, { "epoch": 0.5535055350553506, "grad_norm": 0.4636336178782477, "learning_rate": 4.89337114171508e-05, "loss": 0.2721, "step": 600 }, { "epoch": 0.5544280442804428, "grad_norm": 0.4391886154375099, "learning_rate": 4.892594199464984e-05, "loss": 0.2637, "step": 601 }, { "epoch": 0.5553505535055351, "grad_norm": 0.43384904186127465, "learning_rate": 4.891814499050762e-05, "loss": 0.2743, "step": 602 }, { "epoch": 0.5562730627306273, "grad_norm": 0.4524629128775071, "learning_rate": 4.891032041371246e-05, "loss": 0.2697, "step": 603 }, { "epoch": 0.5571955719557196, "grad_norm": 0.4392319773444855, "learning_rate": 4.8902468273284475e-05, "loss": 0.2702, "step": 604 }, { "epoch": 0.5581180811808119, "grad_norm": 0.4382414715952929, "learning_rate": 4.8894588578275544e-05, "loss": 0.2442, "step": 605 }, { "epoch": 0.559040590405904, "grad_norm": 0.4274378197084093, "learning_rate": 4.888668133776934e-05, "loss": 0.2704, "step": 606 }, { "epoch": 0.5599630996309963, "grad_norm": 0.4486738805320487, "learning_rate": 4.887874656088124e-05, "loss": 0.257, "step": 607 }, { "epoch": 0.5608856088560885, "grad_norm": 0.46043272436522703, "learning_rate": 4.88707842567584e-05, "loss": 0.2906, "step": 608 }, { "epoch": 0.5618081180811808, "grad_norm": 0.4479495902950971, "learning_rate": 4.8862794434579726e-05, "loss": 0.2642, "step": 609 }, { "epoch": 0.5627306273062731, "grad_norm": 0.4497957687832158, "learning_rate": 4.8854777103555804e-05, "loss": 0.2613, "step": 610 }, { "epoch": 0.5636531365313653, "grad_norm": 0.4203226668395155, "learning_rate": 4.884673227292895e-05, "loss": 0.2425, "step": 611 }, { "epoch": 0.5645756457564576, "grad_norm": 0.4445645750758397, "learning_rate": 4.883865995197319e-05, "loss": 0.2921, "step": 612 }, { "epoch": 0.5654981549815498, "grad_norm": 0.5884497201558954, "learning_rate": 4.883056014999423e-05, "loss": 0.29, "step": 613 }, { "epoch": 0.566420664206642, "grad_norm": 0.3975901321158924, "learning_rate": 4.882243287632947e-05, "loss": 0.2516, "step": 614 }, { "epoch": 0.5673431734317343, "grad_norm": 0.3938164318797877, "learning_rate": 4.881427814034795e-05, "loss": 0.2498, "step": 615 }, { "epoch": 0.5682656826568265, "grad_norm": 0.4512900807352941, "learning_rate": 4.880609595145039e-05, "loss": 0.2555, "step": 616 }, { "epoch": 0.5691881918819188, "grad_norm": 0.39020092611130675, "learning_rate": 4.8797886319069164e-05, "loss": 0.2531, "step": 617 }, { "epoch": 0.5701107011070111, "grad_norm": 0.460118191888693, "learning_rate": 4.8789649252668267e-05, "loss": 0.2666, "step": 618 }, { "epoch": 0.5710332103321033, "grad_norm": 0.3934694887996066, "learning_rate": 4.878138476174333e-05, "loss": 0.2503, "step": 619 }, { "epoch": 0.5719557195571956, "grad_norm": 0.4259182945379848, "learning_rate": 4.877309285582159e-05, "loss": 0.29, "step": 620 }, { "epoch": 0.5728782287822878, "grad_norm": 0.518219509749948, "learning_rate": 4.8764773544461886e-05, "loss": 0.2675, "step": 621 }, { "epoch": 0.5738007380073801, "grad_norm": 0.4193442527427061, "learning_rate": 4.875642683725467e-05, "loss": 0.2424, "step": 622 }, { "epoch": 0.5747232472324724, "grad_norm": 0.4162366808810705, "learning_rate": 4.874805274382196e-05, "loss": 0.2711, "step": 623 }, { "epoch": 0.5756457564575646, "grad_norm": 0.40037988979271777, "learning_rate": 4.8739651273817335e-05, "loss": 0.266, "step": 624 }, { "epoch": 0.5765682656826568, "grad_norm": 0.5577640987449097, "learning_rate": 4.8731222436925946e-05, "loss": 0.2472, "step": 625 }, { "epoch": 0.577490774907749, "grad_norm": 0.4356354148828068, "learning_rate": 4.87227662428645e-05, "loss": 0.2789, "step": 626 }, { "epoch": 0.5784132841328413, "grad_norm": 0.42006007196665196, "learning_rate": 4.871428270138123e-05, "loss": 0.2682, "step": 627 }, { "epoch": 0.5793357933579336, "grad_norm": 0.43660375602062723, "learning_rate": 4.870577182225589e-05, "loss": 0.2736, "step": 628 }, { "epoch": 0.5802583025830258, "grad_norm": 0.39778255799712997, "learning_rate": 4.8697233615299765e-05, "loss": 0.2489, "step": 629 }, { "epoch": 0.5811808118081181, "grad_norm": 0.4616450616405046, "learning_rate": 4.8688668090355626e-05, "loss": 0.2623, "step": 630 }, { "epoch": 0.5821033210332104, "grad_norm": 0.4865535225730099, "learning_rate": 4.868007525729775e-05, "loss": 0.2675, "step": 631 }, { "epoch": 0.5830258302583026, "grad_norm": 0.4568154435508115, "learning_rate": 4.8671455126031896e-05, "loss": 0.2712, "step": 632 }, { "epoch": 0.5839483394833949, "grad_norm": 0.4223709303814713, "learning_rate": 4.8662807706495264e-05, "loss": 0.2748, "step": 633 }, { "epoch": 0.584870848708487, "grad_norm": 0.4072049582485937, "learning_rate": 4.865413300865655e-05, "loss": 0.2626, "step": 634 }, { "epoch": 0.5857933579335793, "grad_norm": 0.43673565956983407, "learning_rate": 4.864543104251587e-05, "loss": 0.2922, "step": 635 }, { "epoch": 0.5867158671586716, "grad_norm": 0.47814222534404105, "learning_rate": 4.863670181810479e-05, "loss": 0.2847, "step": 636 }, { "epoch": 0.5876383763837638, "grad_norm": 0.47852847689852657, "learning_rate": 4.862794534548628e-05, "loss": 0.2558, "step": 637 }, { "epoch": 0.5885608856088561, "grad_norm": 0.4306830707239445, "learning_rate": 4.861916163475475e-05, "loss": 0.2771, "step": 638 }, { "epoch": 0.5894833948339483, "grad_norm": 0.4313832268646541, "learning_rate": 4.861035069603599e-05, "loss": 0.2464, "step": 639 }, { "epoch": 0.5904059040590406, "grad_norm": 0.4061200350798131, "learning_rate": 4.860151253948717e-05, "loss": 0.2571, "step": 640 }, { "epoch": 0.5913284132841329, "grad_norm": 0.4160150362331395, "learning_rate": 4.859264717529686e-05, "loss": 0.2345, "step": 641 }, { "epoch": 0.5922509225092251, "grad_norm": 0.4886974969126436, "learning_rate": 4.858375461368499e-05, "loss": 0.2596, "step": 642 }, { "epoch": 0.5931734317343174, "grad_norm": 0.4106943768640075, "learning_rate": 4.8574834864902816e-05, "loss": 0.2263, "step": 643 }, { "epoch": 0.5940959409594095, "grad_norm": 0.4557579946573745, "learning_rate": 4.856588793923297e-05, "loss": 0.2565, "step": 644 }, { "epoch": 0.5950184501845018, "grad_norm": 0.4413330841053196, "learning_rate": 4.8556913846989394e-05, "loss": 0.2804, "step": 645 }, { "epoch": 0.5959409594095941, "grad_norm": 0.38143228991076894, "learning_rate": 4.854791259851735e-05, "loss": 0.2454, "step": 646 }, { "epoch": 0.5968634686346863, "grad_norm": 0.46491021788622433, "learning_rate": 4.8538884204193426e-05, "loss": 0.2946, "step": 647 }, { "epoch": 0.5977859778597786, "grad_norm": 0.4521473135967776, "learning_rate": 4.852982867442546e-05, "loss": 0.2673, "step": 648 }, { "epoch": 0.5987084870848709, "grad_norm": 0.4154034194245415, "learning_rate": 4.8520746019652605e-05, "loss": 0.2607, "step": 649 }, { "epoch": 0.5996309963099631, "grad_norm": 0.4491837463124351, "learning_rate": 4.8511636250345294e-05, "loss": 0.2858, "step": 650 }, { "epoch": 0.6005535055350554, "grad_norm": 0.4044243243068525, "learning_rate": 4.850249937700517e-05, "loss": 0.2695, "step": 651 }, { "epoch": 0.6014760147601476, "grad_norm": 0.430403757037781, "learning_rate": 4.849333541016516e-05, "loss": 0.2715, "step": 652 }, { "epoch": 0.6023985239852399, "grad_norm": 0.45438956743986286, "learning_rate": 4.8484144360389425e-05, "loss": 0.2622, "step": 653 }, { "epoch": 0.6033210332103321, "grad_norm": 0.4539691622407592, "learning_rate": 4.847492623827333e-05, "loss": 0.2569, "step": 654 }, { "epoch": 0.6042435424354243, "grad_norm": 0.46659023853004006, "learning_rate": 4.846568105444345e-05, "loss": 0.2603, "step": 655 }, { "epoch": 0.6051660516605166, "grad_norm": 0.47799503385597875, "learning_rate": 4.8456408819557564e-05, "loss": 0.2813, "step": 656 }, { "epoch": 0.6060885608856088, "grad_norm": 0.49542981968905914, "learning_rate": 4.8447109544304636e-05, "loss": 0.3191, "step": 657 }, { "epoch": 0.6070110701107011, "grad_norm": 0.40468346171857267, "learning_rate": 4.84377832394048e-05, "loss": 0.2524, "step": 658 }, { "epoch": 0.6079335793357934, "grad_norm": 0.4069524816036995, "learning_rate": 4.8428429915609336e-05, "loss": 0.2643, "step": 659 }, { "epoch": 0.6088560885608856, "grad_norm": 0.40713370506278707, "learning_rate": 4.8419049583700696e-05, "loss": 0.2681, "step": 660 }, { "epoch": 0.6097785977859779, "grad_norm": 0.4519195307988597, "learning_rate": 4.840964225449245e-05, "loss": 0.2779, "step": 661 }, { "epoch": 0.6107011070110702, "grad_norm": 0.4558992574499373, "learning_rate": 4.84002079388293e-05, "loss": 0.2811, "step": 662 }, { "epoch": 0.6116236162361623, "grad_norm": 0.3903183870666297, "learning_rate": 4.839074664758704e-05, "loss": 0.2579, "step": 663 }, { "epoch": 0.6125461254612546, "grad_norm": 0.3808535356695177, "learning_rate": 4.838125839167259e-05, "loss": 0.2498, "step": 664 }, { "epoch": 0.6134686346863468, "grad_norm": 0.41419592574548914, "learning_rate": 4.837174318202392e-05, "loss": 0.2699, "step": 665 }, { "epoch": 0.6143911439114391, "grad_norm": 0.43374303904431827, "learning_rate": 4.836220102961011e-05, "loss": 0.2745, "step": 666 }, { "epoch": 0.6153136531365314, "grad_norm": 0.4469220968657081, "learning_rate": 4.835263194543126e-05, "loss": 0.2857, "step": 667 }, { "epoch": 0.6162361623616236, "grad_norm": 0.4620147290242077, "learning_rate": 4.834303594051854e-05, "loss": 0.2621, "step": 668 }, { "epoch": 0.6171586715867159, "grad_norm": 0.4216662617349631, "learning_rate": 4.833341302593417e-05, "loss": 0.2775, "step": 669 }, { "epoch": 0.6180811808118081, "grad_norm": 0.47013460047624434, "learning_rate": 4.8323763212771354e-05, "loss": 0.3009, "step": 670 }, { "epoch": 0.6190036900369004, "grad_norm": 0.401712384643802, "learning_rate": 4.8314086512154325e-05, "loss": 0.266, "step": 671 }, { "epoch": 0.6199261992619927, "grad_norm": 0.39452738403372956, "learning_rate": 4.83043829352383e-05, "loss": 0.2757, "step": 672 }, { "epoch": 0.6208487084870848, "grad_norm": 0.4541490118997505, "learning_rate": 4.829465249320951e-05, "loss": 0.2768, "step": 673 }, { "epoch": 0.6217712177121771, "grad_norm": 0.3987990087632013, "learning_rate": 4.8284895197285116e-05, "loss": 0.2574, "step": 674 }, { "epoch": 0.6226937269372693, "grad_norm": 0.4191622596571706, "learning_rate": 4.827511105871325e-05, "loss": 0.2453, "step": 675 }, { "epoch": 0.6236162361623616, "grad_norm": 0.40689152241903564, "learning_rate": 4.826530008877301e-05, "loss": 0.2709, "step": 676 }, { "epoch": 0.6245387453874539, "grad_norm": 0.424621539220838, "learning_rate": 4.825546229877439e-05, "loss": 0.253, "step": 677 }, { "epoch": 0.6254612546125461, "grad_norm": 0.4292291438742583, "learning_rate": 4.824559770005833e-05, "loss": 0.2763, "step": 678 }, { "epoch": 0.6263837638376384, "grad_norm": 0.3954778440075958, "learning_rate": 4.823570630399665e-05, "loss": 0.2345, "step": 679 }, { "epoch": 0.6273062730627307, "grad_norm": 0.6001086880801821, "learning_rate": 4.822578812199208e-05, "loss": 0.2674, "step": 680 }, { "epoch": 0.6282287822878229, "grad_norm": 0.40696009845252346, "learning_rate": 4.821584316547824e-05, "loss": 0.2796, "step": 681 }, { "epoch": 0.6291512915129152, "grad_norm": 0.41493478120098126, "learning_rate": 4.820587144591957e-05, "loss": 0.2739, "step": 682 }, { "epoch": 0.6300738007380073, "grad_norm": 0.4095714288542147, "learning_rate": 4.819587297481141e-05, "loss": 0.2797, "step": 683 }, { "epoch": 0.6309963099630996, "grad_norm": 0.46390279747131585, "learning_rate": 4.818584776367993e-05, "loss": 0.2716, "step": 684 }, { "epoch": 0.6319188191881919, "grad_norm": 0.40183433756081804, "learning_rate": 4.817579582408208e-05, "loss": 0.2501, "step": 685 }, { "epoch": 0.6328413284132841, "grad_norm": 0.37094621112893805, "learning_rate": 4.8165717167605694e-05, "loss": 0.2356, "step": 686 }, { "epoch": 0.6337638376383764, "grad_norm": 0.4427415896084445, "learning_rate": 4.815561180586936e-05, "loss": 0.2584, "step": 687 }, { "epoch": 0.6346863468634686, "grad_norm": 0.3941215954984464, "learning_rate": 4.814547975052245e-05, "loss": 0.2525, "step": 688 }, { "epoch": 0.6356088560885609, "grad_norm": 0.381367093297642, "learning_rate": 4.813532101324514e-05, "loss": 0.2563, "step": 689 }, { "epoch": 0.6365313653136532, "grad_norm": 0.38140384507782954, "learning_rate": 4.8125135605748314e-05, "loss": 0.243, "step": 690 }, { "epoch": 0.6374538745387454, "grad_norm": 0.4505634418832179, "learning_rate": 4.811492353977366e-05, "loss": 0.2599, "step": 691 }, { "epoch": 0.6383763837638377, "grad_norm": 0.4528289733421791, "learning_rate": 4.810468482709355e-05, "loss": 0.252, "step": 692 }, { "epoch": 0.6392988929889298, "grad_norm": 0.46057740923035445, "learning_rate": 4.80944194795111e-05, "loss": 0.252, "step": 693 }, { "epoch": 0.6402214022140221, "grad_norm": 0.4481110788137588, "learning_rate": 4.808412750886013e-05, "loss": 0.2569, "step": 694 }, { "epoch": 0.6411439114391144, "grad_norm": 0.44907624769789195, "learning_rate": 4.8073808927005125e-05, "loss": 0.2708, "step": 695 }, { "epoch": 0.6420664206642066, "grad_norm": 0.4255246259068333, "learning_rate": 4.806346374584129e-05, "loss": 0.2808, "step": 696 }, { "epoch": 0.6429889298892989, "grad_norm": 0.44614343193719974, "learning_rate": 4.8053091977294456e-05, "loss": 0.2471, "step": 697 }, { "epoch": 0.6439114391143912, "grad_norm": 0.3894364062272488, "learning_rate": 4.804269363332112e-05, "loss": 0.2385, "step": 698 }, { "epoch": 0.6448339483394834, "grad_norm": 0.4214046028628608, "learning_rate": 4.803226872590841e-05, "loss": 0.2743, "step": 699 }, { "epoch": 0.6457564575645757, "grad_norm": 0.4135698277984648, "learning_rate": 4.8021817267074084e-05, "loss": 0.254, "step": 700 }, { "epoch": 0.6466789667896679, "grad_norm": 0.44485124011855504, "learning_rate": 4.8011339268866505e-05, "loss": 0.2809, "step": 701 }, { "epoch": 0.6476014760147601, "grad_norm": 0.47409527158946313, "learning_rate": 4.800083474336463e-05, "loss": 0.2565, "step": 702 }, { "epoch": 0.6485239852398524, "grad_norm": 0.39316505907870625, "learning_rate": 4.7990303702677976e-05, "loss": 0.274, "step": 703 }, { "epoch": 0.6494464944649446, "grad_norm": 0.4578784316296411, "learning_rate": 4.797974615894667e-05, "loss": 0.2806, "step": 704 }, { "epoch": 0.6503690036900369, "grad_norm": 0.38103820283177153, "learning_rate": 4.796916212434135e-05, "loss": 0.2297, "step": 705 }, { "epoch": 0.6512915129151291, "grad_norm": 0.394140134989682, "learning_rate": 4.795855161106322e-05, "loss": 0.2561, "step": 706 }, { "epoch": 0.6522140221402214, "grad_norm": 0.3976966065031595, "learning_rate": 4.794791463134399e-05, "loss": 0.2821, "step": 707 }, { "epoch": 0.6531365313653137, "grad_norm": 0.4163144717479845, "learning_rate": 4.7937251197445886e-05, "loss": 0.2421, "step": 708 }, { "epoch": 0.6540590405904059, "grad_norm": 0.4158977933033687, "learning_rate": 4.7926561321661646e-05, "loss": 0.26, "step": 709 }, { "epoch": 0.6549815498154982, "grad_norm": 0.4427865277567835, "learning_rate": 4.791584501631447e-05, "loss": 0.2707, "step": 710 }, { "epoch": 0.6559040590405905, "grad_norm": 0.46200166734221765, "learning_rate": 4.790510229375802e-05, "loss": 0.2909, "step": 711 }, { "epoch": 0.6568265682656826, "grad_norm": 0.4285513877878524, "learning_rate": 4.789433316637644e-05, "loss": 0.2833, "step": 712 }, { "epoch": 0.6577490774907749, "grad_norm": 0.4644538463412512, "learning_rate": 4.7883537646584285e-05, "loss": 0.2932, "step": 713 }, { "epoch": 0.6586715867158671, "grad_norm": 0.42937930052684714, "learning_rate": 4.787271574682656e-05, "loss": 0.2826, "step": 714 }, { "epoch": 0.6595940959409594, "grad_norm": 0.4186608552294606, "learning_rate": 4.786186747957866e-05, "loss": 0.2436, "step": 715 }, { "epoch": 0.6605166051660517, "grad_norm": 0.4067942162950451, "learning_rate": 4.785099285734638e-05, "loss": 0.2712, "step": 716 }, { "epoch": 0.6614391143911439, "grad_norm": 0.40470907170911935, "learning_rate": 4.7840091892665904e-05, "loss": 0.2416, "step": 717 }, { "epoch": 0.6623616236162362, "grad_norm": 0.37668928961572784, "learning_rate": 4.782916459810378e-05, "loss": 0.2568, "step": 718 }, { "epoch": 0.6632841328413284, "grad_norm": 0.4176007426824119, "learning_rate": 4.78182109862569e-05, "loss": 0.2656, "step": 719 }, { "epoch": 0.6642066420664207, "grad_norm": 0.4602953981120611, "learning_rate": 4.7807231069752536e-05, "loss": 0.2703, "step": 720 }, { "epoch": 0.665129151291513, "grad_norm": 0.45123474203908875, "learning_rate": 4.7796224861248214e-05, "loss": 0.2771, "step": 721 }, { "epoch": 0.6660516605166051, "grad_norm": 0.3715107067242283, "learning_rate": 4.778519237343182e-05, "loss": 0.2285, "step": 722 }, { "epoch": 0.6669741697416974, "grad_norm": 0.4295792736840619, "learning_rate": 4.7774133619021514e-05, "loss": 0.2679, "step": 723 }, { "epoch": 0.6678966789667896, "grad_norm": 0.5222961706066085, "learning_rate": 4.776304861076576e-05, "loss": 0.2422, "step": 724 }, { "epoch": 0.6688191881918819, "grad_norm": 0.4273618356221251, "learning_rate": 4.775193736144326e-05, "loss": 0.2858, "step": 725 }, { "epoch": 0.6697416974169742, "grad_norm": 0.4580406802259534, "learning_rate": 4.774079988386296e-05, "loss": 0.2676, "step": 726 }, { "epoch": 0.6706642066420664, "grad_norm": 0.43261963264608283, "learning_rate": 4.7729636190864085e-05, "loss": 0.2546, "step": 727 }, { "epoch": 0.6715867158671587, "grad_norm": 0.4023257232553763, "learning_rate": 4.7718446295316044e-05, "loss": 0.2725, "step": 728 }, { "epoch": 0.672509225092251, "grad_norm": 0.41518162426686667, "learning_rate": 4.770723021011846e-05, "loss": 0.2755, "step": 729 }, { "epoch": 0.6734317343173432, "grad_norm": 0.40050411915943995, "learning_rate": 4.769598794820114e-05, "loss": 0.2579, "step": 730 }, { "epoch": 0.6743542435424354, "grad_norm": 0.4016016628876197, "learning_rate": 4.76847195225241e-05, "loss": 0.2746, "step": 731 }, { "epoch": 0.6752767527675276, "grad_norm": 0.4181553089033055, "learning_rate": 4.7673424946077474e-05, "loss": 0.2638, "step": 732 }, { "epoch": 0.6761992619926199, "grad_norm": 0.37434014798856674, "learning_rate": 4.7662104231881574e-05, "loss": 0.2499, "step": 733 }, { "epoch": 0.6771217712177122, "grad_norm": 0.42639642254570276, "learning_rate": 4.765075739298683e-05, "loss": 0.2718, "step": 734 }, { "epoch": 0.6780442804428044, "grad_norm": 0.4440804113794752, "learning_rate": 4.763938444247378e-05, "loss": 0.2453, "step": 735 }, { "epoch": 0.6789667896678967, "grad_norm": 0.4192906925708955, "learning_rate": 4.762798539345309e-05, "loss": 0.2885, "step": 736 }, { "epoch": 0.6798892988929889, "grad_norm": 0.482625341846353, "learning_rate": 4.7616560259065486e-05, "loss": 0.2716, "step": 737 }, { "epoch": 0.6808118081180812, "grad_norm": 0.4165196350508539, "learning_rate": 4.760510905248177e-05, "loss": 0.2698, "step": 738 }, { "epoch": 0.6817343173431735, "grad_norm": 0.38642332048977557, "learning_rate": 4.759363178690282e-05, "loss": 0.2637, "step": 739 }, { "epoch": 0.6826568265682657, "grad_norm": 0.4076451590146458, "learning_rate": 4.758212847555953e-05, "loss": 0.252, "step": 740 }, { "epoch": 0.683579335793358, "grad_norm": 0.45785352706377896, "learning_rate": 4.757059913171282e-05, "loss": 0.2608, "step": 741 }, { "epoch": 0.6845018450184502, "grad_norm": 0.4543916372183251, "learning_rate": 4.755904376865364e-05, "loss": 0.2412, "step": 742 }, { "epoch": 0.6854243542435424, "grad_norm": 0.4356719346714614, "learning_rate": 4.754746239970292e-05, "loss": 0.2443, "step": 743 }, { "epoch": 0.6863468634686347, "grad_norm": 0.390093208524888, "learning_rate": 4.753585503821157e-05, "loss": 0.2588, "step": 744 }, { "epoch": 0.6872693726937269, "grad_norm": 0.42573916131343337, "learning_rate": 4.752422169756048e-05, "loss": 0.2507, "step": 745 }, { "epoch": 0.6881918819188192, "grad_norm": 0.36877784679859316, "learning_rate": 4.751256239116046e-05, "loss": 0.2669, "step": 746 }, { "epoch": 0.6891143911439115, "grad_norm": 0.3889726868456164, "learning_rate": 4.750087713245227e-05, "loss": 0.2385, "step": 747 }, { "epoch": 0.6900369003690037, "grad_norm": 0.42525791717206574, "learning_rate": 4.74891659349066e-05, "loss": 0.2601, "step": 748 }, { "epoch": 0.690959409594096, "grad_norm": 0.42840628871678277, "learning_rate": 4.7477428812024e-05, "loss": 0.281, "step": 749 }, { "epoch": 0.6918819188191881, "grad_norm": 0.42504599248665903, "learning_rate": 4.746566577733497e-05, "loss": 0.3042, "step": 750 }, { "epoch": 0.6928044280442804, "grad_norm": 0.42452760937026107, "learning_rate": 4.7453876844399824e-05, "loss": 0.2447, "step": 751 }, { "epoch": 0.6937269372693727, "grad_norm": 0.4594757150735403, "learning_rate": 4.7442062026808756e-05, "loss": 0.2488, "step": 752 }, { "epoch": 0.6946494464944649, "grad_norm": 0.410268565822323, "learning_rate": 4.743022133818179e-05, "loss": 0.261, "step": 753 }, { "epoch": 0.6955719557195572, "grad_norm": 0.4545436622681227, "learning_rate": 4.7418354792168794e-05, "loss": 0.2171, "step": 754 }, { "epoch": 0.6964944649446494, "grad_norm": 0.40396416205888286, "learning_rate": 4.7406462402449426e-05, "loss": 0.2523, "step": 755 }, { "epoch": 0.6974169741697417, "grad_norm": 0.3943645189708737, "learning_rate": 4.7394544182733144e-05, "loss": 0.2813, "step": 756 }, { "epoch": 0.698339483394834, "grad_norm": 0.40320692754111276, "learning_rate": 4.7382600146759174e-05, "loss": 0.2638, "step": 757 }, { "epoch": 0.6992619926199262, "grad_norm": 0.40025770221592594, "learning_rate": 4.7370630308296505e-05, "loss": 0.2623, "step": 758 }, { "epoch": 0.7001845018450185, "grad_norm": 0.41160142244941034, "learning_rate": 4.735863468114388e-05, "loss": 0.2878, "step": 759 }, { "epoch": 0.7011070110701108, "grad_norm": 0.4099065053363144, "learning_rate": 4.734661327912976e-05, "loss": 0.2576, "step": 760 }, { "epoch": 0.7020295202952029, "grad_norm": 0.3916032675432416, "learning_rate": 4.733456611611233e-05, "loss": 0.2644, "step": 761 }, { "epoch": 0.7029520295202952, "grad_norm": 0.4127119444291852, "learning_rate": 4.732249320597948e-05, "loss": 0.2597, "step": 762 }, { "epoch": 0.7038745387453874, "grad_norm": 0.4158472026504385, "learning_rate": 4.731039456264874e-05, "loss": 0.2596, "step": 763 }, { "epoch": 0.7047970479704797, "grad_norm": 0.44282321593212026, "learning_rate": 4.729827020006735e-05, "loss": 0.2547, "step": 764 }, { "epoch": 0.705719557195572, "grad_norm": 0.4210528058327828, "learning_rate": 4.7286120132212176e-05, "loss": 0.2665, "step": 765 }, { "epoch": 0.7066420664206642, "grad_norm": 0.4103670912984461, "learning_rate": 4.7273944373089724e-05, "loss": 0.2479, "step": 766 }, { "epoch": 0.7075645756457565, "grad_norm": 0.459416682150275, "learning_rate": 4.726174293673612e-05, "loss": 0.2716, "step": 767 }, { "epoch": 0.7084870848708487, "grad_norm": 0.3766584559940595, "learning_rate": 4.724951583721707e-05, "loss": 0.25, "step": 768 }, { "epoch": 0.709409594095941, "grad_norm": 0.4111985223859097, "learning_rate": 4.7237263088627905e-05, "loss": 0.2599, "step": 769 }, { "epoch": 0.7103321033210332, "grad_norm": 0.39176492040102723, "learning_rate": 4.722498470509348e-05, "loss": 0.2336, "step": 770 }, { "epoch": 0.7112546125461254, "grad_norm": 0.4432494114714155, "learning_rate": 4.721268070076822e-05, "loss": 0.2886, "step": 771 }, { "epoch": 0.7121771217712177, "grad_norm": 0.42411795005176356, "learning_rate": 4.720035108983609e-05, "loss": 0.2656, "step": 772 }, { "epoch": 0.7130996309963099, "grad_norm": 0.4078323707771864, "learning_rate": 4.718799588651058e-05, "loss": 0.2735, "step": 773 }, { "epoch": 0.7140221402214022, "grad_norm": 0.4627564835371624, "learning_rate": 4.717561510503466e-05, "loss": 0.2563, "step": 774 }, { "epoch": 0.7149446494464945, "grad_norm": 0.41029118918141744, "learning_rate": 4.716320875968081e-05, "loss": 0.2567, "step": 775 }, { "epoch": 0.7158671586715867, "grad_norm": 0.4205427816104139, "learning_rate": 4.7150776864750956e-05, "loss": 0.263, "step": 776 }, { "epoch": 0.716789667896679, "grad_norm": 0.5414972668050808, "learning_rate": 4.71383194345765e-05, "loss": 0.2393, "step": 777 }, { "epoch": 0.7177121771217713, "grad_norm": 0.3885768505484422, "learning_rate": 4.7125836483518276e-05, "loss": 0.2572, "step": 778 }, { "epoch": 0.7186346863468634, "grad_norm": 0.4161037898367446, "learning_rate": 4.711332802596652e-05, "loss": 0.2506, "step": 779 }, { "epoch": 0.7195571955719557, "grad_norm": 0.38852555170461517, "learning_rate": 4.7100794076340896e-05, "loss": 0.2526, "step": 780 }, { "epoch": 0.7204797047970479, "grad_norm": 0.398585165108261, "learning_rate": 4.708823464909045e-05, "loss": 0.229, "step": 781 }, { "epoch": 0.7214022140221402, "grad_norm": 0.4195912312646104, "learning_rate": 4.7075649758693565e-05, "loss": 0.2544, "step": 782 }, { "epoch": 0.7223247232472325, "grad_norm": 0.38972929452659466, "learning_rate": 4.7063039419658035e-05, "loss": 0.2686, "step": 783 }, { "epoch": 0.7232472324723247, "grad_norm": 0.4057648954307941, "learning_rate": 4.7050403646520944e-05, "loss": 0.2521, "step": 784 }, { "epoch": 0.724169741697417, "grad_norm": 0.3881229279065408, "learning_rate": 4.703774245384873e-05, "loss": 0.2668, "step": 785 }, { "epoch": 0.7250922509225092, "grad_norm": 0.46964517708980524, "learning_rate": 4.70250558562371e-05, "loss": 0.2724, "step": 786 }, { "epoch": 0.7260147601476015, "grad_norm": 0.387501710974925, "learning_rate": 4.701234386831108e-05, "loss": 0.2369, "step": 787 }, { "epoch": 0.7269372693726938, "grad_norm": 0.42745253437867164, "learning_rate": 4.6999606504724944e-05, "loss": 0.2631, "step": 788 }, { "epoch": 0.727859778597786, "grad_norm": 0.35781009260671326, "learning_rate": 4.698684378016222e-05, "loss": 0.2435, "step": 789 }, { "epoch": 0.7287822878228782, "grad_norm": 0.41820542281472733, "learning_rate": 4.6974055709335705e-05, "loss": 0.2591, "step": 790 }, { "epoch": 0.7297047970479705, "grad_norm": 0.4024910437468129, "learning_rate": 4.696124230698736e-05, "loss": 0.249, "step": 791 }, { "epoch": 0.7306273062730627, "grad_norm": 0.395320824058086, "learning_rate": 4.694840358788839e-05, "loss": 0.2558, "step": 792 }, { "epoch": 0.731549815498155, "grad_norm": 0.4373305480576554, "learning_rate": 4.693553956683916e-05, "loss": 0.2599, "step": 793 }, { "epoch": 0.7324723247232472, "grad_norm": 0.41630398217727865, "learning_rate": 4.692265025866923e-05, "loss": 0.2776, "step": 794 }, { "epoch": 0.7333948339483395, "grad_norm": 0.38719793187775475, "learning_rate": 4.6909735678237284e-05, "loss": 0.2635, "step": 795 }, { "epoch": 0.7343173431734318, "grad_norm": 0.3749824756461014, "learning_rate": 4.689679584043115e-05, "loss": 0.2453, "step": 796 }, { "epoch": 0.735239852398524, "grad_norm": 0.6396474148444344, "learning_rate": 4.688383076016778e-05, "loss": 0.2525, "step": 797 }, { "epoch": 0.7361623616236163, "grad_norm": 0.41477012509152505, "learning_rate": 4.687084045239322e-05, "loss": 0.245, "step": 798 }, { "epoch": 0.7370848708487084, "grad_norm": 0.3666873244609313, "learning_rate": 4.6857824932082586e-05, "loss": 0.2361, "step": 799 }, { "epoch": 0.7380073800738007, "grad_norm": 0.3776802233167813, "learning_rate": 4.6844784214240076e-05, "loss": 0.2556, "step": 800 }, { "epoch": 0.738929889298893, "grad_norm": 0.39241143181846977, "learning_rate": 4.683171831389892e-05, "loss": 0.2504, "step": 801 }, { "epoch": 0.7398523985239852, "grad_norm": 0.33691745713939714, "learning_rate": 4.681862724612141e-05, "loss": 0.2177, "step": 802 }, { "epoch": 0.7407749077490775, "grad_norm": 0.36087914762741574, "learning_rate": 4.68055110259988e-05, "loss": 0.2475, "step": 803 }, { "epoch": 0.7416974169741697, "grad_norm": 0.3803088774224962, "learning_rate": 4.6792369668651384e-05, "loss": 0.2574, "step": 804 }, { "epoch": 0.742619926199262, "grad_norm": 0.45272098427208934, "learning_rate": 4.6779203189228417e-05, "loss": 0.2966, "step": 805 }, { "epoch": 0.7435424354243543, "grad_norm": 0.3677471946148316, "learning_rate": 4.6766011602908114e-05, "loss": 0.234, "step": 806 }, { "epoch": 0.7444649446494465, "grad_norm": 0.42506463356607843, "learning_rate": 4.6752794924897624e-05, "loss": 0.2668, "step": 807 }, { "epoch": 0.7453874538745388, "grad_norm": 0.3614067518204027, "learning_rate": 4.6739553170433045e-05, "loss": 0.2316, "step": 808 }, { "epoch": 0.746309963099631, "grad_norm": 0.4618757772460389, "learning_rate": 4.672628635477936e-05, "loss": 0.2652, "step": 809 }, { "epoch": 0.7472324723247232, "grad_norm": 0.3924011015972566, "learning_rate": 4.671299449323045e-05, "loss": 0.2415, "step": 810 }, { "epoch": 0.7481549815498155, "grad_norm": 0.406502684771447, "learning_rate": 4.669967760110908e-05, "loss": 0.2582, "step": 811 }, { "epoch": 0.7490774907749077, "grad_norm": 0.44920857107907514, "learning_rate": 4.668633569376685e-05, "loss": 0.2794, "step": 812 }, { "epoch": 0.75, "grad_norm": 0.41631190647871946, "learning_rate": 4.667296878658423e-05, "loss": 0.2592, "step": 813 }, { "epoch": 0.7509225092250923, "grad_norm": 0.3633834642775942, "learning_rate": 4.665957689497045e-05, "loss": 0.2424, "step": 814 }, { "epoch": 0.7518450184501845, "grad_norm": 0.424235479072311, "learning_rate": 4.664616003436361e-05, "loss": 0.2391, "step": 815 }, { "epoch": 0.7527675276752768, "grad_norm": 0.40557799197958655, "learning_rate": 4.663271822023055e-05, "loss": 0.2551, "step": 816 }, { "epoch": 0.753690036900369, "grad_norm": 0.4167542469925017, "learning_rate": 4.66192514680669e-05, "loss": 0.2633, "step": 817 }, { "epoch": 0.7546125461254612, "grad_norm": 0.4207560114138412, "learning_rate": 4.660575979339701e-05, "loss": 0.2691, "step": 818 }, { "epoch": 0.7555350553505535, "grad_norm": 0.4183650561450747, "learning_rate": 4.6592243211774e-05, "loss": 0.2702, "step": 819 }, { "epoch": 0.7564575645756457, "grad_norm": 0.4320429078035428, "learning_rate": 4.657870173877967e-05, "loss": 0.2856, "step": 820 }, { "epoch": 0.757380073800738, "grad_norm": 0.4243496691378848, "learning_rate": 4.6565135390024515e-05, "loss": 0.2729, "step": 821 }, { "epoch": 0.7583025830258303, "grad_norm": 0.40806346530521626, "learning_rate": 4.6551544181147744e-05, "loss": 0.2381, "step": 822 }, { "epoch": 0.7592250922509225, "grad_norm": 0.37050493027188813, "learning_rate": 4.653792812781717e-05, "loss": 0.2472, "step": 823 }, { "epoch": 0.7601476014760148, "grad_norm": 0.42478917712919395, "learning_rate": 4.6524287245729295e-05, "loss": 0.2696, "step": 824 }, { "epoch": 0.761070110701107, "grad_norm": 0.39453849669994284, "learning_rate": 4.65106215506092e-05, "loss": 0.2507, "step": 825 }, { "epoch": 0.7619926199261993, "grad_norm": 0.3985150630506928, "learning_rate": 4.6496931058210615e-05, "loss": 0.2441, "step": 826 }, { "epoch": 0.7629151291512916, "grad_norm": 0.43537042016071226, "learning_rate": 4.6483215784315826e-05, "loss": 0.26, "step": 827 }, { "epoch": 0.7638376383763837, "grad_norm": 0.39034045876332363, "learning_rate": 4.646947574473569e-05, "loss": 0.2328, "step": 828 }, { "epoch": 0.764760147601476, "grad_norm": 0.4179709095045547, "learning_rate": 4.645571095530963e-05, "loss": 0.268, "step": 829 }, { "epoch": 0.7656826568265682, "grad_norm": 0.39056066444947624, "learning_rate": 4.644192143190558e-05, "loss": 0.2689, "step": 830 }, { "epoch": 0.7666051660516605, "grad_norm": 0.41534205342903163, "learning_rate": 4.642810719041999e-05, "loss": 0.2419, "step": 831 }, { "epoch": 0.7675276752767528, "grad_norm": 0.38688656660125625, "learning_rate": 4.6414268246777824e-05, "loss": 0.2533, "step": 832 }, { "epoch": 0.768450184501845, "grad_norm": 0.38205343347520876, "learning_rate": 4.6400404616932505e-05, "loss": 0.2462, "step": 833 }, { "epoch": 0.7693726937269373, "grad_norm": 0.38784424587097893, "learning_rate": 4.6386516316865916e-05, "loss": 0.2285, "step": 834 }, { "epoch": 0.7702952029520295, "grad_norm": 0.45292745730319345, "learning_rate": 4.637260336258838e-05, "loss": 0.2774, "step": 835 }, { "epoch": 0.7712177121771218, "grad_norm": 0.39512053052710583, "learning_rate": 4.6358665770138664e-05, "loss": 0.2568, "step": 836 }, { "epoch": 0.772140221402214, "grad_norm": 0.43157081923032486, "learning_rate": 4.6344703555583884e-05, "loss": 0.2372, "step": 837 }, { "epoch": 0.7730627306273062, "grad_norm": 0.3467182809262665, "learning_rate": 4.63307167350196e-05, "loss": 0.2308, "step": 838 }, { "epoch": 0.7739852398523985, "grad_norm": 0.41682462805052434, "learning_rate": 4.6316705324569687e-05, "loss": 0.2481, "step": 839 }, { "epoch": 0.7749077490774908, "grad_norm": 0.4292084985479234, "learning_rate": 4.630266934038642e-05, "loss": 0.2765, "step": 840 }, { "epoch": 0.775830258302583, "grad_norm": 0.40443659411602667, "learning_rate": 4.628860879865035e-05, "loss": 0.2575, "step": 841 }, { "epoch": 0.7767527675276753, "grad_norm": 0.44602392736308993, "learning_rate": 4.627452371557036e-05, "loss": 0.2345, "step": 842 }, { "epoch": 0.7776752767527675, "grad_norm": 0.39494373687964907, "learning_rate": 4.6260414107383646e-05, "loss": 0.2538, "step": 843 }, { "epoch": 0.7785977859778598, "grad_norm": 0.39973400621846794, "learning_rate": 4.624627999035563e-05, "loss": 0.2567, "step": 844 }, { "epoch": 0.7795202952029521, "grad_norm": 0.3926013086674455, "learning_rate": 4.6232121380780034e-05, "loss": 0.2263, "step": 845 }, { "epoch": 0.7804428044280443, "grad_norm": 0.3712337226687765, "learning_rate": 4.621793829497879e-05, "loss": 0.2214, "step": 846 }, { "epoch": 0.7813653136531366, "grad_norm": 0.3842637168616991, "learning_rate": 4.6203730749302043e-05, "loss": 0.2395, "step": 847 }, { "epoch": 0.7822878228782287, "grad_norm": 0.3549206951977871, "learning_rate": 4.6189498760128136e-05, "loss": 0.2417, "step": 848 }, { "epoch": 0.783210332103321, "grad_norm": 0.37894061281979396, "learning_rate": 4.617524234386361e-05, "loss": 0.2378, "step": 849 }, { "epoch": 0.7841328413284133, "grad_norm": 0.4168344672554966, "learning_rate": 4.6160961516943145e-05, "loss": 0.2354, "step": 850 }, { "epoch": 0.7850553505535055, "grad_norm": 0.4080512446894384, "learning_rate": 4.614665629582958e-05, "loss": 0.2468, "step": 851 }, { "epoch": 0.7859778597785978, "grad_norm": 0.40911776930832033, "learning_rate": 4.613232669701384e-05, "loss": 0.2691, "step": 852 }, { "epoch": 0.7869003690036901, "grad_norm": 0.39657146630350776, "learning_rate": 4.6117972737014993e-05, "loss": 0.2588, "step": 853 }, { "epoch": 0.7878228782287823, "grad_norm": 0.3676678279956454, "learning_rate": 4.610359443238017e-05, "loss": 0.2633, "step": 854 }, { "epoch": 0.7887453874538746, "grad_norm": 0.4220328616701844, "learning_rate": 4.608919179968457e-05, "loss": 0.2179, "step": 855 }, { "epoch": 0.7896678966789668, "grad_norm": 0.3873623022895507, "learning_rate": 4.6074764855531435e-05, "loss": 0.255, "step": 856 }, { "epoch": 0.790590405904059, "grad_norm": 0.39345781924934875, "learning_rate": 4.606031361655203e-05, "loss": 0.2321, "step": 857 }, { "epoch": 0.7915129151291513, "grad_norm": 0.3641248075979448, "learning_rate": 4.604583809940565e-05, "loss": 0.2359, "step": 858 }, { "epoch": 0.7924354243542435, "grad_norm": 0.3795834365154066, "learning_rate": 4.6031338320779534e-05, "loss": 0.2187, "step": 859 }, { "epoch": 0.7933579335793358, "grad_norm": 0.3748626112020789, "learning_rate": 4.601681429738893e-05, "loss": 0.2537, "step": 860 }, { "epoch": 0.794280442804428, "grad_norm": 0.38057309992880556, "learning_rate": 4.6002266045977015e-05, "loss": 0.2633, "step": 861 }, { "epoch": 0.7952029520295203, "grad_norm": 0.39693638718508983, "learning_rate": 4.598769358331491e-05, "loss": 0.2528, "step": 862 }, { "epoch": 0.7961254612546126, "grad_norm": 0.41350889386382905, "learning_rate": 4.597309692620163e-05, "loss": 0.2627, "step": 863 }, { "epoch": 0.7970479704797048, "grad_norm": 0.38247449539113726, "learning_rate": 4.5958476091464086e-05, "loss": 0.2568, "step": 864 }, { "epoch": 0.7979704797047971, "grad_norm": 0.4104295900334278, "learning_rate": 4.5943831095957066e-05, "loss": 0.2742, "step": 865 }, { "epoch": 0.7988929889298892, "grad_norm": 0.3959010980286953, "learning_rate": 4.592916195656322e-05, "loss": 0.2423, "step": 866 }, { "epoch": 0.7998154981549815, "grad_norm": 0.397778263487737, "learning_rate": 4.5914468690192994e-05, "loss": 0.2829, "step": 867 }, { "epoch": 0.8007380073800738, "grad_norm": 0.3589113580764412, "learning_rate": 4.5899751313784693e-05, "loss": 0.2367, "step": 868 }, { "epoch": 0.801660516605166, "grad_norm": 0.35817034843856266, "learning_rate": 4.5885009844304386e-05, "loss": 0.2241, "step": 869 }, { "epoch": 0.8025830258302583, "grad_norm": 0.4181099949219438, "learning_rate": 4.5870244298745926e-05, "loss": 0.2649, "step": 870 }, { "epoch": 0.8035055350553506, "grad_norm": 0.41552754910798373, "learning_rate": 4.585545469413092e-05, "loss": 0.263, "step": 871 }, { "epoch": 0.8044280442804428, "grad_norm": 0.4000016435818405, "learning_rate": 4.584064104750872e-05, "loss": 0.2596, "step": 872 }, { "epoch": 0.8053505535055351, "grad_norm": 0.4301391513536784, "learning_rate": 4.582580337595636e-05, "loss": 0.2369, "step": 873 }, { "epoch": 0.8062730627306273, "grad_norm": 0.3771770992053558, "learning_rate": 4.5810941696578616e-05, "loss": 0.2679, "step": 874 }, { "epoch": 0.8071955719557196, "grad_norm": 0.3681314252976609, "learning_rate": 4.57960560265079e-05, "loss": 0.2453, "step": 875 }, { "epoch": 0.8081180811808119, "grad_norm": 0.3960434602601055, "learning_rate": 4.5781146382904314e-05, "loss": 0.2459, "step": 876 }, { "epoch": 0.809040590405904, "grad_norm": 0.3651264724900847, "learning_rate": 4.576621278295558e-05, "loss": 0.2293, "step": 877 }, { "epoch": 0.8099630996309963, "grad_norm": 0.3645341896411552, "learning_rate": 4.5751255243877015e-05, "loss": 0.2426, "step": 878 }, { "epoch": 0.8108856088560885, "grad_norm": 0.37810515308621573, "learning_rate": 4.5736273782911575e-05, "loss": 0.2508, "step": 879 }, { "epoch": 0.8118081180811808, "grad_norm": 0.4134745374916864, "learning_rate": 4.572126841732976e-05, "loss": 0.2536, "step": 880 }, { "epoch": 0.8127306273062731, "grad_norm": 0.3897201458683745, "learning_rate": 4.570623916442966e-05, "loss": 0.2801, "step": 881 }, { "epoch": 0.8136531365313653, "grad_norm": 0.3499836209573072, "learning_rate": 4.569118604153686e-05, "loss": 0.2224, "step": 882 }, { "epoch": 0.8145756457564576, "grad_norm": 0.3991348914664004, "learning_rate": 4.567610906600449e-05, "loss": 0.2454, "step": 883 }, { "epoch": 0.8154981549815498, "grad_norm": 0.39310366746476055, "learning_rate": 4.566100825521317e-05, "loss": 0.2343, "step": 884 }, { "epoch": 0.816420664206642, "grad_norm": 0.4209272421254392, "learning_rate": 4.564588362657101e-05, "loss": 0.2634, "step": 885 }, { "epoch": 0.8173431734317343, "grad_norm": 0.36902568189665436, "learning_rate": 4.5630735197513554e-05, "loss": 0.2492, "step": 886 }, { "epoch": 0.8182656826568265, "grad_norm": 0.35839307483214716, "learning_rate": 4.561556298550379e-05, "loss": 0.2408, "step": 887 }, { "epoch": 0.8191881918819188, "grad_norm": 0.3632283103696501, "learning_rate": 4.560036700803213e-05, "loss": 0.2124, "step": 888 }, { "epoch": 0.8201107011070111, "grad_norm": 0.3683588449535609, "learning_rate": 4.558514728261639e-05, "loss": 0.2399, "step": 889 }, { "epoch": 0.8210332103321033, "grad_norm": 0.416135107412475, "learning_rate": 4.556990382680174e-05, "loss": 0.2488, "step": 890 }, { "epoch": 0.8219557195571956, "grad_norm": 0.38767776714892904, "learning_rate": 4.555463665816073e-05, "loss": 0.2451, "step": 891 }, { "epoch": 0.8228782287822878, "grad_norm": 0.38655898937170086, "learning_rate": 4.553934579429322e-05, "loss": 0.2551, "step": 892 }, { "epoch": 0.8238007380073801, "grad_norm": 0.4237419311315628, "learning_rate": 4.552403125282641e-05, "loss": 0.2788, "step": 893 }, { "epoch": 0.8247232472324724, "grad_norm": 0.41798421330220237, "learning_rate": 4.550869305141478e-05, "loss": 0.2413, "step": 894 }, { "epoch": 0.8256457564575646, "grad_norm": 0.41305156789240494, "learning_rate": 4.54933312077401e-05, "loss": 0.2372, "step": 895 }, { "epoch": 0.8265682656826568, "grad_norm": 0.4248949007952841, "learning_rate": 4.547794573951136e-05, "loss": 0.2637, "step": 896 }, { "epoch": 0.827490774907749, "grad_norm": 0.36047701179072617, "learning_rate": 4.546253666446484e-05, "loss": 0.2558, "step": 897 }, { "epoch": 0.8284132841328413, "grad_norm": 0.43732481006930657, "learning_rate": 4.5447104000363985e-05, "loss": 0.2856, "step": 898 }, { "epoch": 0.8293357933579336, "grad_norm": 0.4187175037943844, "learning_rate": 4.5431647764999455e-05, "loss": 0.2464, "step": 899 }, { "epoch": 0.8302583025830258, "grad_norm": 0.3940722356549932, "learning_rate": 4.541616797618907e-05, "loss": 0.2404, "step": 900 }, { "epoch": 0.8311808118081181, "grad_norm": 0.39931432095847397, "learning_rate": 4.5400664651777835e-05, "loss": 0.2628, "step": 901 }, { "epoch": 0.8321033210332104, "grad_norm": 0.410685067676723, "learning_rate": 4.538513780963784e-05, "loss": 0.2384, "step": 902 }, { "epoch": 0.8330258302583026, "grad_norm": 0.42948688546306324, "learning_rate": 4.5369587467668315e-05, "loss": 0.2486, "step": 903 }, { "epoch": 0.8339483394833949, "grad_norm": 0.3922072991864588, "learning_rate": 4.535401364379558e-05, "loss": 0.2296, "step": 904 }, { "epoch": 0.834870848708487, "grad_norm": 0.43030836009249707, "learning_rate": 4.5338416355973006e-05, "loss": 0.269, "step": 905 }, { "epoch": 0.8357933579335793, "grad_norm": 0.4216060941407115, "learning_rate": 4.5322795622181044e-05, "loss": 0.2564, "step": 906 }, { "epoch": 0.8367158671586716, "grad_norm": 0.42802377697733435, "learning_rate": 4.530715146042713e-05, "loss": 0.2479, "step": 907 }, { "epoch": 0.8376383763837638, "grad_norm": 0.4266861366633069, "learning_rate": 4.529148388874577e-05, "loss": 0.2652, "step": 908 }, { "epoch": 0.8385608856088561, "grad_norm": 0.41261864191830294, "learning_rate": 4.5275792925198383e-05, "loss": 0.2716, "step": 909 }, { "epoch": 0.8394833948339483, "grad_norm": 0.3646033560843698, "learning_rate": 4.526007858787341e-05, "loss": 0.2242, "step": 910 }, { "epoch": 0.8404059040590406, "grad_norm": 0.4570957874662083, "learning_rate": 4.5244340894886215e-05, "loss": 0.2694, "step": 911 }, { "epoch": 0.8413284132841329, "grad_norm": 0.3911930239160912, "learning_rate": 4.522857986437909e-05, "loss": 0.2566, "step": 912 }, { "epoch": 0.8422509225092251, "grad_norm": 0.36326227597308736, "learning_rate": 4.521279551452122e-05, "loss": 0.2629, "step": 913 }, { "epoch": 0.8431734317343174, "grad_norm": 0.37918166872103803, "learning_rate": 4.51969878635087e-05, "loss": 0.238, "step": 914 }, { "epoch": 0.8440959409594095, "grad_norm": 0.43316472238041337, "learning_rate": 4.518115692956445e-05, "loss": 0.2676, "step": 915 }, { "epoch": 0.8450184501845018, "grad_norm": 0.42341768992528533, "learning_rate": 4.516530273093825e-05, "loss": 0.2394, "step": 916 }, { "epoch": 0.8459409594095941, "grad_norm": 0.44622071611241565, "learning_rate": 4.514942528590671e-05, "loss": 0.2486, "step": 917 }, { "epoch": 0.8468634686346863, "grad_norm": 0.35345649117366434, "learning_rate": 4.513352461277323e-05, "loss": 0.246, "step": 918 }, { "epoch": 0.8477859778597786, "grad_norm": 0.3880135352024817, "learning_rate": 4.511760072986795e-05, "loss": 0.2601, "step": 919 }, { "epoch": 0.8487084870848709, "grad_norm": 0.3965918262583217, "learning_rate": 4.5101653655547834e-05, "loss": 0.2701, "step": 920 }, { "epoch": 0.8496309963099631, "grad_norm": 0.4381694713773363, "learning_rate": 4.5085683408196535e-05, "loss": 0.2731, "step": 921 }, { "epoch": 0.8505535055350554, "grad_norm": 0.3650583316068512, "learning_rate": 4.5069690006224424e-05, "loss": 0.2383, "step": 922 }, { "epoch": 0.8514760147601476, "grad_norm": 0.46663226029659377, "learning_rate": 4.505367346806858e-05, "loss": 0.2495, "step": 923 }, { "epoch": 0.8523985239852399, "grad_norm": 0.3775521089743212, "learning_rate": 4.503763381219275e-05, "loss": 0.2545, "step": 924 }, { "epoch": 0.8533210332103321, "grad_norm": 0.3597708317746619, "learning_rate": 4.502157105708731e-05, "loss": 0.2513, "step": 925 }, { "epoch": 0.8542435424354243, "grad_norm": 0.40647398411997965, "learning_rate": 4.5005485221269285e-05, "loss": 0.2476, "step": 926 }, { "epoch": 0.8551660516605166, "grad_norm": 0.33447531379059603, "learning_rate": 4.498937632328231e-05, "loss": 0.2145, "step": 927 }, { "epoch": 0.8560885608856088, "grad_norm": 0.3512162886026165, "learning_rate": 4.4973244381696585e-05, "loss": 0.254, "step": 928 }, { "epoch": 0.8570110701107011, "grad_norm": 0.35575023354053653, "learning_rate": 4.49570894151089e-05, "loss": 0.2421, "step": 929 }, { "epoch": 0.8579335793357934, "grad_norm": 0.32325446414884945, "learning_rate": 4.494091144214258e-05, "loss": 0.2154, "step": 930 }, { "epoch": 0.8588560885608856, "grad_norm": 0.4100276190850683, "learning_rate": 4.492471048144744e-05, "loss": 0.2787, "step": 931 }, { "epoch": 0.8597785977859779, "grad_norm": 0.38579462589053465, "learning_rate": 4.490848655169986e-05, "loss": 0.2248, "step": 932 }, { "epoch": 0.8607011070110702, "grad_norm": 0.3981022513100881, "learning_rate": 4.489223967160263e-05, "loss": 0.2792, "step": 933 }, { "epoch": 0.8616236162361623, "grad_norm": 0.3558274930747902, "learning_rate": 4.487596985988505e-05, "loss": 0.2283, "step": 934 }, { "epoch": 0.8625461254612546, "grad_norm": 0.36787697605706837, "learning_rate": 4.485967713530281e-05, "loss": 0.2542, "step": 935 }, { "epoch": 0.8634686346863468, "grad_norm": 0.3990125166173122, "learning_rate": 4.4843361516638075e-05, "loss": 0.262, "step": 936 }, { "epoch": 0.8643911439114391, "grad_norm": 0.42844550257397196, "learning_rate": 4.4827023022699323e-05, "loss": 0.2717, "step": 937 }, { "epoch": 0.8653136531365314, "grad_norm": 0.4345063745058787, "learning_rate": 4.4810661672321466e-05, "loss": 0.2602, "step": 938 }, { "epoch": 0.8662361623616236, "grad_norm": 0.38743451727159217, "learning_rate": 4.4794277484365724e-05, "loss": 0.2461, "step": 939 }, { "epoch": 0.8671586715867159, "grad_norm": 0.3797476635576325, "learning_rate": 4.477787047771969e-05, "loss": 0.2194, "step": 940 }, { "epoch": 0.8680811808118081, "grad_norm": 0.4388668977771248, "learning_rate": 4.476144067129722e-05, "loss": 0.2822, "step": 941 }, { "epoch": 0.8690036900369004, "grad_norm": 0.3420994737833039, "learning_rate": 4.474498808403846e-05, "loss": 0.2155, "step": 942 }, { "epoch": 0.8699261992619927, "grad_norm": 0.3750141322967599, "learning_rate": 4.4728512734909844e-05, "loss": 0.2319, "step": 943 }, { "epoch": 0.8708487084870848, "grad_norm": 0.34705027085406503, "learning_rate": 4.471201464290401e-05, "loss": 0.2106, "step": 944 }, { "epoch": 0.8717712177121771, "grad_norm": 0.3979496693198322, "learning_rate": 4.4695493827039846e-05, "loss": 0.2379, "step": 945 }, { "epoch": 0.8726937269372693, "grad_norm": 0.379676451402185, "learning_rate": 4.4678950306362405e-05, "loss": 0.2554, "step": 946 }, { "epoch": 0.8736162361623616, "grad_norm": 0.41340574010421133, "learning_rate": 4.4662384099942946e-05, "loss": 0.2688, "step": 947 }, { "epoch": 0.8745387453874539, "grad_norm": 0.3408035104441574, "learning_rate": 4.464579522687885e-05, "loss": 0.2254, "step": 948 }, { "epoch": 0.8754612546125461, "grad_norm": 0.3967662981537898, "learning_rate": 4.462918370629365e-05, "loss": 0.2745, "step": 949 }, { "epoch": 0.8763837638376384, "grad_norm": 0.40779663351573675, "learning_rate": 4.4612549557336974e-05, "loss": 0.266, "step": 950 }, { "epoch": 0.8773062730627307, "grad_norm": 0.38684475616778263, "learning_rate": 4.4595892799184546e-05, "loss": 0.2559, "step": 951 }, { "epoch": 0.8782287822878229, "grad_norm": 0.35900223578342305, "learning_rate": 4.457921345103815e-05, "loss": 0.2714, "step": 952 }, { "epoch": 0.8791512915129152, "grad_norm": 0.42653085046234424, "learning_rate": 4.456251153212561e-05, "loss": 0.2694, "step": 953 }, { "epoch": 0.8800738007380073, "grad_norm": 0.48911189024620294, "learning_rate": 4.454578706170075e-05, "loss": 0.2365, "step": 954 }, { "epoch": 0.8809963099630996, "grad_norm": 0.3699032935486172, "learning_rate": 4.4529040059043424e-05, "loss": 0.212, "step": 955 }, { "epoch": 0.8819188191881919, "grad_norm": 0.3603449065440313, "learning_rate": 4.451227054345946e-05, "loss": 0.2258, "step": 956 }, { "epoch": 0.8828413284132841, "grad_norm": 0.3805358233443923, "learning_rate": 4.449547853428061e-05, "loss": 0.2197, "step": 957 }, { "epoch": 0.8837638376383764, "grad_norm": 0.34817693270934846, "learning_rate": 4.4478664050864586e-05, "loss": 0.241, "step": 958 }, { "epoch": 0.8846863468634686, "grad_norm": 0.4123436517213217, "learning_rate": 4.4461827112594974e-05, "loss": 0.258, "step": 959 }, { "epoch": 0.8856088560885609, "grad_norm": 0.36338763462519325, "learning_rate": 4.444496773888128e-05, "loss": 0.2463, "step": 960 }, { "epoch": 0.8865313653136532, "grad_norm": 0.3870641913052104, "learning_rate": 4.442808594915886e-05, "loss": 0.2417, "step": 961 }, { "epoch": 0.8874538745387454, "grad_norm": 0.3569445908716236, "learning_rate": 4.441118176288891e-05, "loss": 0.2504, "step": 962 }, { "epoch": 0.8883763837638377, "grad_norm": 0.3680184801953975, "learning_rate": 4.439425519955844e-05, "loss": 0.2466, "step": 963 }, { "epoch": 0.8892988929889298, "grad_norm": 0.4728595394165681, "learning_rate": 4.437730627868027e-05, "loss": 0.3084, "step": 964 }, { "epoch": 0.8902214022140221, "grad_norm": 0.4015573191764765, "learning_rate": 4.436033501979299e-05, "loss": 0.2364, "step": 965 }, { "epoch": 0.8911439114391144, "grad_norm": 0.4016704048673385, "learning_rate": 4.434334144246092e-05, "loss": 0.2433, "step": 966 }, { "epoch": 0.8920664206642066, "grad_norm": 0.40381267757102457, "learning_rate": 4.432632556627413e-05, "loss": 0.2663, "step": 967 }, { "epoch": 0.8929889298892989, "grad_norm": 0.41218953298312716, "learning_rate": 4.430928741084839e-05, "loss": 0.2321, "step": 968 }, { "epoch": 0.8939114391143912, "grad_norm": 0.3932985646232469, "learning_rate": 4.429222699582517e-05, "loss": 0.2552, "step": 969 }, { "epoch": 0.8948339483394834, "grad_norm": 0.3592400920563191, "learning_rate": 4.4275144340871556e-05, "loss": 0.2403, "step": 970 }, { "epoch": 0.8957564575645757, "grad_norm": 0.3806295572930225, "learning_rate": 4.4258039465680326e-05, "loss": 0.2373, "step": 971 }, { "epoch": 0.8966789667896679, "grad_norm": 0.35209635408855017, "learning_rate": 4.4240912389969833e-05, "loss": 0.2287, "step": 972 }, { "epoch": 0.8976014760147601, "grad_norm": 0.381360988218551, "learning_rate": 4.422376313348405e-05, "loss": 0.2476, "step": 973 }, { "epoch": 0.8985239852398524, "grad_norm": 0.3774398810196574, "learning_rate": 4.42065917159925e-05, "loss": 0.2525, "step": 974 }, { "epoch": 0.8994464944649446, "grad_norm": 0.39437701105031125, "learning_rate": 4.418939815729026e-05, "loss": 0.2372, "step": 975 }, { "epoch": 0.9003690036900369, "grad_norm": 0.42489208182485905, "learning_rate": 4.417218247719794e-05, "loss": 0.2454, "step": 976 }, { "epoch": 0.9012915129151291, "grad_norm": 0.3951313616214242, "learning_rate": 4.415494469556163e-05, "loss": 0.2388, "step": 977 }, { "epoch": 0.9022140221402214, "grad_norm": 0.4328782740433957, "learning_rate": 4.413768483225292e-05, "loss": 0.2811, "step": 978 }, { "epoch": 0.9031365313653137, "grad_norm": 0.41015549722125544, "learning_rate": 4.412040290716884e-05, "loss": 0.2578, "step": 979 }, { "epoch": 0.9040590405904059, "grad_norm": 0.3733772113740058, "learning_rate": 4.410309894023187e-05, "loss": 0.2295, "step": 980 }, { "epoch": 0.9049815498154982, "grad_norm": 0.33251844742181735, "learning_rate": 4.408577295138988e-05, "loss": 0.2392, "step": 981 }, { "epoch": 0.9059040590405905, "grad_norm": 0.33664811795093086, "learning_rate": 4.406842496061615e-05, "loss": 0.249, "step": 982 }, { "epoch": 0.9068265682656826, "grad_norm": 0.4100127469946453, "learning_rate": 4.4051054987909295e-05, "loss": 0.2538, "step": 983 }, { "epoch": 0.9077490774907749, "grad_norm": 0.4038276025334575, "learning_rate": 4.40336630532933e-05, "loss": 0.2671, "step": 984 }, { "epoch": 0.9086715867158671, "grad_norm": 0.4130873622798777, "learning_rate": 4.4016249176817424e-05, "loss": 0.2482, "step": 985 }, { "epoch": 0.9095940959409594, "grad_norm": 0.3438323781857651, "learning_rate": 4.399881337855629e-05, "loss": 0.2379, "step": 986 }, { "epoch": 0.9105166051660517, "grad_norm": 0.3771480958562822, "learning_rate": 4.398135567860972e-05, "loss": 0.2384, "step": 987 }, { "epoch": 0.9114391143911439, "grad_norm": 0.385766779169622, "learning_rate": 4.396387609710283e-05, "loss": 0.23, "step": 988 }, { "epoch": 0.9123616236162362, "grad_norm": 0.42205705134674754, "learning_rate": 4.394637465418594e-05, "loss": 0.2624, "step": 989 }, { "epoch": 0.9132841328413284, "grad_norm": 0.3862877149306351, "learning_rate": 4.392885137003459e-05, "loss": 0.2461, "step": 990 }, { "epoch": 0.9142066420664207, "grad_norm": 0.3459414889329022, "learning_rate": 4.391130626484947e-05, "loss": 0.2346, "step": 991 }, { "epoch": 0.915129151291513, "grad_norm": 0.38016164148453147, "learning_rate": 4.389373935885646e-05, "loss": 0.236, "step": 992 }, { "epoch": 0.9160516605166051, "grad_norm": 0.3841933827116207, "learning_rate": 4.387615067230654e-05, "loss": 0.2696, "step": 993 }, { "epoch": 0.9169741697416974, "grad_norm": 0.41688074113579665, "learning_rate": 4.3858540225475817e-05, "loss": 0.2563, "step": 994 }, { "epoch": 0.9178966789667896, "grad_norm": 0.3849169335820403, "learning_rate": 4.384090803866547e-05, "loss": 0.2593, "step": 995 }, { "epoch": 0.9188191881918819, "grad_norm": 0.38188251977300347, "learning_rate": 4.3823254132201763e-05, "loss": 0.248, "step": 996 }, { "epoch": 0.9197416974169742, "grad_norm": 0.4147197292335253, "learning_rate": 4.380557852643598e-05, "loss": 0.2401, "step": 997 }, { "epoch": 0.9206642066420664, "grad_norm": 0.4440822567873978, "learning_rate": 4.378788124174441e-05, "loss": 0.253, "step": 998 }, { "epoch": 0.9215867158671587, "grad_norm": 0.4169692472216945, "learning_rate": 4.377016229852836e-05, "loss": 0.2335, "step": 999 }, { "epoch": 0.922509225092251, "grad_norm": 0.41047303343206154, "learning_rate": 4.3752421717214085e-05, "loss": 0.2563, "step": 1000 }, { "epoch": 0.9234317343173432, "grad_norm": 0.41813382044338954, "learning_rate": 4.3734659518252787e-05, "loss": 0.2834, "step": 1001 }, { "epoch": 0.9243542435424354, "grad_norm": 0.41427781364888505, "learning_rate": 4.371687572212059e-05, "loss": 0.261, "step": 1002 }, { "epoch": 0.9252767527675276, "grad_norm": 0.4623243834462149, "learning_rate": 4.3699070349318537e-05, "loss": 0.2837, "step": 1003 }, { "epoch": 0.9261992619926199, "grad_norm": 0.399358869629028, "learning_rate": 4.36812434203725e-05, "loss": 0.2581, "step": 1004 }, { "epoch": 0.9271217712177122, "grad_norm": 0.42411862628878405, "learning_rate": 4.3663394955833235e-05, "loss": 0.2694, "step": 1005 }, { "epoch": 0.9280442804428044, "grad_norm": 0.3571473870625449, "learning_rate": 4.364552497627632e-05, "loss": 0.2242, "step": 1006 }, { "epoch": 0.9289667896678967, "grad_norm": 0.3723780793429299, "learning_rate": 4.3627633502302124e-05, "loss": 0.238, "step": 1007 }, { "epoch": 0.9298892988929889, "grad_norm": 0.3957445137280655, "learning_rate": 4.360972055453579e-05, "loss": 0.2537, "step": 1008 }, { "epoch": 0.9308118081180812, "grad_norm": 0.38717717292059817, "learning_rate": 4.3591786153627247e-05, "loss": 0.2225, "step": 1009 }, { "epoch": 0.9317343173431735, "grad_norm": 0.3863531822945893, "learning_rate": 4.357383032025112e-05, "loss": 0.2461, "step": 1010 }, { "epoch": 0.9326568265682657, "grad_norm": 0.3517513341161581, "learning_rate": 4.355585307510675e-05, "loss": 0.2115, "step": 1011 }, { "epoch": 0.933579335793358, "grad_norm": 0.394077914968105, "learning_rate": 4.353785443891818e-05, "loss": 0.2469, "step": 1012 }, { "epoch": 0.9345018450184502, "grad_norm": 0.4000681433541021, "learning_rate": 4.351983443243409e-05, "loss": 0.238, "step": 1013 }, { "epoch": 0.9354243542435424, "grad_norm": 0.38273149497237724, "learning_rate": 4.350179307642781e-05, "loss": 0.256, "step": 1014 }, { "epoch": 0.9363468634686347, "grad_norm": 0.39191957054373927, "learning_rate": 4.3483730391697275e-05, "loss": 0.2546, "step": 1015 }, { "epoch": 0.9372693726937269, "grad_norm": 0.3654054474137455, "learning_rate": 4.346564639906501e-05, "loss": 0.2292, "step": 1016 }, { "epoch": 0.9381918819188192, "grad_norm": 0.38086987242860304, "learning_rate": 4.344754111937809e-05, "loss": 0.2337, "step": 1017 }, { "epoch": 0.9391143911439115, "grad_norm": 0.3891942472846791, "learning_rate": 4.342941457350816e-05, "loss": 0.2579, "step": 1018 }, { "epoch": 0.9400369003690037, "grad_norm": 0.43555445451666, "learning_rate": 4.3411266782351346e-05, "loss": 0.2411, "step": 1019 }, { "epoch": 0.940959409594096, "grad_norm": 0.3664039746636333, "learning_rate": 4.3393097766828293e-05, "loss": 0.2468, "step": 1020 }, { "epoch": 0.9418819188191881, "grad_norm": 0.3723820455291896, "learning_rate": 4.3374907547884095e-05, "loss": 0.2426, "step": 1021 }, { "epoch": 0.9428044280442804, "grad_norm": 0.4054503641813245, "learning_rate": 4.3356696146488304e-05, "loss": 0.2491, "step": 1022 }, { "epoch": 0.9437269372693727, "grad_norm": 0.38516524752919157, "learning_rate": 4.333846358363487e-05, "loss": 0.222, "step": 1023 }, { "epoch": 0.9446494464944649, "grad_norm": 0.40103836099698303, "learning_rate": 4.3320209880342156e-05, "loss": 0.2545, "step": 1024 }, { "epoch": 0.9455719557195572, "grad_norm": 0.38058729358893373, "learning_rate": 4.33019350576529e-05, "loss": 0.2791, "step": 1025 }, { "epoch": 0.9464944649446494, "grad_norm": 0.37402738285722437, "learning_rate": 4.3283639136634167e-05, "loss": 0.237, "step": 1026 }, { "epoch": 0.9474169741697417, "grad_norm": 0.3697120672939745, "learning_rate": 4.3265322138377354e-05, "loss": 0.2524, "step": 1027 }, { "epoch": 0.948339483394834, "grad_norm": 0.3820254627029573, "learning_rate": 4.3246984083998154e-05, "loss": 0.258, "step": 1028 }, { "epoch": 0.9492619926199262, "grad_norm": 0.3523371586640625, "learning_rate": 4.322862499463654e-05, "loss": 0.2347, "step": 1029 }, { "epoch": 0.9501845018450185, "grad_norm": 0.3781888289848851, "learning_rate": 4.321024489145673e-05, "loss": 0.2446, "step": 1030 }, { "epoch": 0.9511070110701108, "grad_norm": 0.4317197990012946, "learning_rate": 4.319184379564716e-05, "loss": 0.2389, "step": 1031 }, { "epoch": 0.9520295202952029, "grad_norm": 0.4410472946030106, "learning_rate": 4.3173421728420464e-05, "loss": 0.2875, "step": 1032 }, { "epoch": 0.9529520295202952, "grad_norm": 0.3812828110364232, "learning_rate": 4.315497871101347e-05, "loss": 0.2505, "step": 1033 }, { "epoch": 0.9538745387453874, "grad_norm": 0.3998281184321331, "learning_rate": 4.313651476468715e-05, "loss": 0.2456, "step": 1034 }, { "epoch": 0.9547970479704797, "grad_norm": 0.3726950564281805, "learning_rate": 4.311802991072659e-05, "loss": 0.2439, "step": 1035 }, { "epoch": 0.955719557195572, "grad_norm": 0.37085693904059847, "learning_rate": 4.309952417044099e-05, "loss": 0.2481, "step": 1036 }, { "epoch": 0.9566420664206642, "grad_norm": 0.41930432385724087, "learning_rate": 4.308099756516361e-05, "loss": 0.2627, "step": 1037 }, { "epoch": 0.9575645756457565, "grad_norm": 0.4012474686060809, "learning_rate": 4.306245011625181e-05, "loss": 0.2836, "step": 1038 }, { "epoch": 0.9584870848708487, "grad_norm": 0.39256009864556235, "learning_rate": 4.304388184508691e-05, "loss": 0.2618, "step": 1039 }, { "epoch": 0.959409594095941, "grad_norm": 0.4018260203344265, "learning_rate": 4.3025292773074294e-05, "loss": 0.2672, "step": 1040 }, { "epoch": 0.9603321033210332, "grad_norm": 0.3796571177233923, "learning_rate": 4.3006682921643296e-05, "loss": 0.2587, "step": 1041 }, { "epoch": 0.9612546125461254, "grad_norm": 0.3296707035409425, "learning_rate": 4.298805231224721e-05, "loss": 0.2033, "step": 1042 }, { "epoch": 0.9621771217712177, "grad_norm": 0.36564967878510296, "learning_rate": 4.296940096636324e-05, "loss": 0.2393, "step": 1043 }, { "epoch": 0.9630996309963099, "grad_norm": 0.4024183037228938, "learning_rate": 4.2950728905492544e-05, "loss": 0.2545, "step": 1044 }, { "epoch": 0.9640221402214022, "grad_norm": 0.359357908219692, "learning_rate": 4.2932036151160104e-05, "loss": 0.2306, "step": 1045 }, { "epoch": 0.9649446494464945, "grad_norm": 0.41006569313717495, "learning_rate": 4.291332272491479e-05, "loss": 0.2496, "step": 1046 }, { "epoch": 0.9658671586715867, "grad_norm": 0.40258702606237634, "learning_rate": 4.289458864832931e-05, "loss": 0.2562, "step": 1047 }, { "epoch": 0.966789667896679, "grad_norm": 0.39459325003198475, "learning_rate": 4.287583394300015e-05, "loss": 0.2526, "step": 1048 }, { "epoch": 0.9677121771217713, "grad_norm": 0.4638042314426276, "learning_rate": 4.2857058630547594e-05, "loss": 0.2695, "step": 1049 }, { "epoch": 0.9686346863468634, "grad_norm": 0.4025697845338391, "learning_rate": 4.283826273261567e-05, "loss": 0.2713, "step": 1050 }, { "epoch": 0.9695571955719557, "grad_norm": 0.3669837154426985, "learning_rate": 4.281944627087214e-05, "loss": 0.2529, "step": 1051 }, { "epoch": 0.9704797047970479, "grad_norm": 0.3972939537424461, "learning_rate": 4.28006092670085e-05, "loss": 0.2467, "step": 1052 }, { "epoch": 0.9714022140221402, "grad_norm": 0.4099812196589974, "learning_rate": 4.2781751742739885e-05, "loss": 0.2642, "step": 1053 }, { "epoch": 0.9723247232472325, "grad_norm": 0.40327491814518013, "learning_rate": 4.27628737198051e-05, "loss": 0.2476, "step": 1054 }, { "epoch": 0.9732472324723247, "grad_norm": 0.3542847438268756, "learning_rate": 4.274397521996658e-05, "loss": 0.2318, "step": 1055 }, { "epoch": 0.974169741697417, "grad_norm": 0.357301905791408, "learning_rate": 4.272505626501039e-05, "loss": 0.2267, "step": 1056 }, { "epoch": 0.9750922509225092, "grad_norm": 0.4057193033011623, "learning_rate": 4.270611687674615e-05, "loss": 0.2655, "step": 1057 }, { "epoch": 0.9760147601476015, "grad_norm": 0.4210452062719313, "learning_rate": 4.268715707700703e-05, "loss": 0.271, "step": 1058 }, { "epoch": 0.9769372693726938, "grad_norm": 0.37462001216192053, "learning_rate": 4.266817688764974e-05, "loss": 0.2188, "step": 1059 }, { "epoch": 0.977859778597786, "grad_norm": 0.384305636910666, "learning_rate": 4.2649176330554505e-05, "loss": 0.2358, "step": 1060 }, { "epoch": 0.9787822878228782, "grad_norm": 0.4023223260591023, "learning_rate": 4.263015542762502e-05, "loss": 0.2253, "step": 1061 }, { "epoch": 0.9797047970479705, "grad_norm": 0.3582360155741272, "learning_rate": 4.261111420078843e-05, "loss": 0.2323, "step": 1062 }, { "epoch": 0.9806273062730627, "grad_norm": 0.34309753558813766, "learning_rate": 4.259205267199532e-05, "loss": 0.2312, "step": 1063 }, { "epoch": 0.981549815498155, "grad_norm": 0.342868960758366, "learning_rate": 4.257297086321967e-05, "loss": 0.2117, "step": 1064 }, { "epoch": 0.9824723247232472, "grad_norm": 0.3756603692103656, "learning_rate": 4.255386879645884e-05, "loss": 0.2511, "step": 1065 }, { "epoch": 0.9833948339483395, "grad_norm": 0.36504578675556676, "learning_rate": 4.2534746493733544e-05, "loss": 0.2623, "step": 1066 }, { "epoch": 0.9843173431734318, "grad_norm": 0.3877827233780789, "learning_rate": 4.2515603977087834e-05, "loss": 0.2484, "step": 1067 }, { "epoch": 0.985239852398524, "grad_norm": 0.4000115235566352, "learning_rate": 4.2496441268589046e-05, "loss": 0.2322, "step": 1068 }, { "epoch": 0.9861623616236163, "grad_norm": 0.3916507964424266, "learning_rate": 4.247725839032781e-05, "loss": 0.2486, "step": 1069 }, { "epoch": 0.9870848708487084, "grad_norm": 0.3672624822476246, "learning_rate": 4.245805536441799e-05, "loss": 0.246, "step": 1070 }, { "epoch": 0.9880073800738007, "grad_norm": 0.424336758506717, "learning_rate": 4.243883221299669e-05, "loss": 0.256, "step": 1071 }, { "epoch": 0.988929889298893, "grad_norm": 0.3925976329361435, "learning_rate": 4.241958895822422e-05, "loss": 0.2327, "step": 1072 }, { "epoch": 0.9898523985239852, "grad_norm": 0.36882062594137904, "learning_rate": 4.240032562228405e-05, "loss": 0.24, "step": 1073 }, { "epoch": 0.9907749077490775, "grad_norm": 0.34872009808112114, "learning_rate": 4.23810422273828e-05, "loss": 0.237, "step": 1074 }, { "epoch": 0.9916974169741697, "grad_norm": 0.3600321823367888, "learning_rate": 4.2361738795750214e-05, "loss": 0.2533, "step": 1075 }, { "epoch": 0.992619926199262, "grad_norm": 0.3744568379901036, "learning_rate": 4.234241534963916e-05, "loss": 0.244, "step": 1076 }, { "epoch": 0.9935424354243543, "grad_norm": 0.35579109010529775, "learning_rate": 4.2323071911325535e-05, "loss": 0.2362, "step": 1077 }, { "epoch": 0.9944649446494465, "grad_norm": 0.34114175573589445, "learning_rate": 4.230370850310832e-05, "loss": 0.2144, "step": 1078 }, { "epoch": 0.9953874538745388, "grad_norm": 0.4020001689070888, "learning_rate": 4.228432514730949e-05, "loss": 0.2648, "step": 1079 }, { "epoch": 0.996309963099631, "grad_norm": 0.33804693122463964, "learning_rate": 4.2264921866274046e-05, "loss": 0.2684, "step": 1080 }, { "epoch": 0.9972324723247232, "grad_norm": 0.3775262235809972, "learning_rate": 4.224549868236993e-05, "loss": 0.2539, "step": 1081 }, { "epoch": 0.9981549815498155, "grad_norm": 0.36410628062196304, "learning_rate": 4.2226055617988024e-05, "loss": 0.2561, "step": 1082 }, { "epoch": 0.9990774907749077, "grad_norm": 0.37350818965624166, "learning_rate": 4.220659269554217e-05, "loss": 0.2529, "step": 1083 }, { "epoch": 1.0, "grad_norm": 0.3419738536638325, "learning_rate": 4.218710993746906e-05, "loss": 0.2375, "step": 1084 }, { "epoch": 1.0009225092250922, "grad_norm": 0.3769368882169451, "learning_rate": 4.2167607366228266e-05, "loss": 0.1954, "step": 1085 }, { "epoch": 1.0018450184501846, "grad_norm": 0.37813463650633805, "learning_rate": 4.2148085004302205e-05, "loss": 0.1987, "step": 1086 }, { "epoch": 1.0027675276752768, "grad_norm": 0.33502446350380966, "learning_rate": 4.212854287419611e-05, "loss": 0.1669, "step": 1087 }, { "epoch": 1.003690036900369, "grad_norm": 0.38269711360360914, "learning_rate": 4.2108980998437984e-05, "loss": 0.1816, "step": 1088 }, { "epoch": 1.0046125461254614, "grad_norm": 0.4259402307903112, "learning_rate": 4.208939939957862e-05, "loss": 0.1729, "step": 1089 }, { "epoch": 1.0055350553505535, "grad_norm": 0.5129546312352509, "learning_rate": 4.2069798100191525e-05, "loss": 0.1661, "step": 1090 }, { "epoch": 1.0064575645756457, "grad_norm": 0.4389264706914863, "learning_rate": 4.2050177122872934e-05, "loss": 0.1952, "step": 1091 }, { "epoch": 1.007380073800738, "grad_norm": 0.41996401878411477, "learning_rate": 4.2030536490241754e-05, "loss": 0.1639, "step": 1092 }, { "epoch": 1.0083025830258303, "grad_norm": 0.40587370232707043, "learning_rate": 4.2010876224939556e-05, "loss": 0.183, "step": 1093 }, { "epoch": 1.0092250922509225, "grad_norm": 0.4216925521611639, "learning_rate": 4.1991196349630536e-05, "loss": 0.1777, "step": 1094 }, { "epoch": 1.0101476014760147, "grad_norm": 0.354670234146558, "learning_rate": 4.19714968870015e-05, "loss": 0.1556, "step": 1095 }, { "epoch": 1.011070110701107, "grad_norm": 0.37794728109822767, "learning_rate": 4.195177785976185e-05, "loss": 0.1615, "step": 1096 }, { "epoch": 1.0119926199261993, "grad_norm": 0.4704906880862343, "learning_rate": 4.193203929064353e-05, "loss": 0.1987, "step": 1097 }, { "epoch": 1.0129151291512914, "grad_norm": 0.3772186034127118, "learning_rate": 4.191228120240099e-05, "loss": 0.1643, "step": 1098 }, { "epoch": 1.0138376383763839, "grad_norm": 0.42250435630940364, "learning_rate": 4.1892503617811216e-05, "loss": 0.1789, "step": 1099 }, { "epoch": 1.014760147601476, "grad_norm": 0.40835588337077483, "learning_rate": 4.1872706559673665e-05, "loss": 0.1671, "step": 1100 }, { "epoch": 1.0156826568265682, "grad_norm": 0.3971438386755592, "learning_rate": 4.185289005081021e-05, "loss": 0.1758, "step": 1101 }, { "epoch": 1.0166051660516606, "grad_norm": 0.42954003229122456, "learning_rate": 4.1833054114065175e-05, "loss": 0.191, "step": 1102 }, { "epoch": 1.0175276752767528, "grad_norm": 0.5067713954677341, "learning_rate": 4.1813198772305284e-05, "loss": 0.1785, "step": 1103 }, { "epoch": 1.018450184501845, "grad_norm": 0.36309220773044926, "learning_rate": 4.1793324048419626e-05, "loss": 0.1654, "step": 1104 }, { "epoch": 1.0193726937269372, "grad_norm": 0.3651840162756363, "learning_rate": 4.177342996531961e-05, "loss": 0.1507, "step": 1105 }, { "epoch": 1.0202952029520296, "grad_norm": 0.4051567508664911, "learning_rate": 4.175351654593899e-05, "loss": 0.1719, "step": 1106 }, { "epoch": 1.0212177121771218, "grad_norm": 0.3881943625337821, "learning_rate": 4.1733583813233815e-05, "loss": 0.1742, "step": 1107 }, { "epoch": 1.022140221402214, "grad_norm": 0.37905275668416616, "learning_rate": 4.1713631790182364e-05, "loss": 0.1635, "step": 1108 }, { "epoch": 1.0230627306273063, "grad_norm": 0.3427845280225927, "learning_rate": 4.169366049978519e-05, "loss": 0.1579, "step": 1109 }, { "epoch": 1.0239852398523985, "grad_norm": 0.3538821348972919, "learning_rate": 4.167366996506503e-05, "loss": 0.1776, "step": 1110 }, { "epoch": 1.0249077490774907, "grad_norm": 0.36124493881262104, "learning_rate": 4.1653660209066835e-05, "loss": 0.1815, "step": 1111 }, { "epoch": 1.0258302583025831, "grad_norm": 0.3934661273597267, "learning_rate": 4.16336312548577e-05, "loss": 0.1672, "step": 1112 }, { "epoch": 1.0267527675276753, "grad_norm": 0.35071002280057345, "learning_rate": 4.161358312552682e-05, "loss": 0.1409, "step": 1113 }, { "epoch": 1.0276752767527675, "grad_norm": 0.3631582571734631, "learning_rate": 4.1593515844185536e-05, "loss": 0.1526, "step": 1114 }, { "epoch": 1.0285977859778597, "grad_norm": 0.4493670419396526, "learning_rate": 4.157342943396728e-05, "loss": 0.1829, "step": 1115 }, { "epoch": 1.029520295202952, "grad_norm": 0.39113684237210034, "learning_rate": 4.155332391802748e-05, "loss": 0.153, "step": 1116 }, { "epoch": 1.0304428044280443, "grad_norm": 0.3407921649948736, "learning_rate": 4.153319931954363e-05, "loss": 0.1448, "step": 1117 }, { "epoch": 1.0313653136531364, "grad_norm": 0.35735453759259145, "learning_rate": 4.1513055661715214e-05, "loss": 0.1642, "step": 1118 }, { "epoch": 1.0322878228782288, "grad_norm": 0.378033416589244, "learning_rate": 4.1492892967763686e-05, "loss": 0.17, "step": 1119 }, { "epoch": 1.033210332103321, "grad_norm": 0.32755535064867, "learning_rate": 4.147271126093243e-05, "loss": 0.1763, "step": 1120 }, { "epoch": 1.0341328413284132, "grad_norm": 0.33184993219665576, "learning_rate": 4.145251056448678e-05, "loss": 0.1641, "step": 1121 }, { "epoch": 1.0350553505535056, "grad_norm": 0.34342127146183604, "learning_rate": 4.1432290901713944e-05, "loss": 0.1599, "step": 1122 }, { "epoch": 1.0359778597785978, "grad_norm": 0.31650953682305155, "learning_rate": 4.141205229592298e-05, "loss": 0.1522, "step": 1123 }, { "epoch": 1.03690036900369, "grad_norm": 0.3884987790776325, "learning_rate": 4.13917947704448e-05, "loss": 0.1784, "step": 1124 }, { "epoch": 1.0378228782287824, "grad_norm": 0.39352618968889314, "learning_rate": 4.137151834863213e-05, "loss": 0.1832, "step": 1125 }, { "epoch": 1.0387453874538746, "grad_norm": 0.40731228868134994, "learning_rate": 4.1351223053859465e-05, "loss": 0.164, "step": 1126 }, { "epoch": 1.0396678966789668, "grad_norm": 0.35681228331776604, "learning_rate": 4.133090890952306e-05, "loss": 0.1737, "step": 1127 }, { "epoch": 1.040590405904059, "grad_norm": 0.38427644358885815, "learning_rate": 4.131057593904092e-05, "loss": 0.1525, "step": 1128 }, { "epoch": 1.0415129151291513, "grad_norm": 0.3735181284156466, "learning_rate": 4.129022416585272e-05, "loss": 0.1689, "step": 1129 }, { "epoch": 1.0424354243542435, "grad_norm": 0.3841245872081137, "learning_rate": 4.126985361341984e-05, "loss": 0.1679, "step": 1130 }, { "epoch": 1.0433579335793357, "grad_norm": 0.4394062794454207, "learning_rate": 4.1249464305225294e-05, "loss": 0.1701, "step": 1131 }, { "epoch": 1.044280442804428, "grad_norm": 0.34635693765690534, "learning_rate": 4.1229056264773705e-05, "loss": 0.1483, "step": 1132 }, { "epoch": 1.0452029520295203, "grad_norm": 0.3689927275176747, "learning_rate": 4.1208629515591316e-05, "loss": 0.173, "step": 1133 }, { "epoch": 1.0461254612546125, "grad_norm": 0.42258687449431287, "learning_rate": 4.118818408122592e-05, "loss": 0.189, "step": 1134 }, { "epoch": 1.0470479704797049, "grad_norm": 0.3591825951560322, "learning_rate": 4.116771998524688e-05, "loss": 0.1768, "step": 1135 }, { "epoch": 1.047970479704797, "grad_norm": 0.3531927142004278, "learning_rate": 4.114723725124501e-05, "loss": 0.1765, "step": 1136 }, { "epoch": 1.0488929889298892, "grad_norm": 0.3829638749984938, "learning_rate": 4.112673590283267e-05, "loss": 0.1816, "step": 1137 }, { "epoch": 1.0498154981549817, "grad_norm": 0.37254171936921016, "learning_rate": 4.1106215963643645e-05, "loss": 0.1709, "step": 1138 }, { "epoch": 1.0507380073800738, "grad_norm": 0.36194452587323034, "learning_rate": 4.108567745733318e-05, "loss": 0.1576, "step": 1139 }, { "epoch": 1.051660516605166, "grad_norm": 0.41177269350486917, "learning_rate": 4.106512040757789e-05, "loss": 0.1664, "step": 1140 }, { "epoch": 1.0525830258302582, "grad_norm": 0.42734114893298225, "learning_rate": 4.1044544838075794e-05, "loss": 0.1702, "step": 1141 }, { "epoch": 1.0535055350553506, "grad_norm": 0.37337649729707373, "learning_rate": 4.102395077254624e-05, "loss": 0.1489, "step": 1142 }, { "epoch": 1.0544280442804428, "grad_norm": 0.39261391909709, "learning_rate": 4.100333823472992e-05, "loss": 0.1749, "step": 1143 }, { "epoch": 1.055350553505535, "grad_norm": 0.42549411406276944, "learning_rate": 4.098270724838879e-05, "loss": 0.1549, "step": 1144 }, { "epoch": 1.0562730627306274, "grad_norm": 0.5053855322170004, "learning_rate": 4.096205783730611e-05, "loss": 0.1779, "step": 1145 }, { "epoch": 1.0571955719557196, "grad_norm": 0.45748588362554227, "learning_rate": 4.094139002528635e-05, "loss": 0.163, "step": 1146 }, { "epoch": 1.0581180811808117, "grad_norm": 0.4068618648456191, "learning_rate": 4.092070383615522e-05, "loss": 0.1813, "step": 1147 }, { "epoch": 1.0590405904059041, "grad_norm": 0.40950624908793243, "learning_rate": 4.089999929375957e-05, "loss": 0.1853, "step": 1148 }, { "epoch": 1.0599630996309963, "grad_norm": 0.38836686198772674, "learning_rate": 4.0879276421967475e-05, "loss": 0.1708, "step": 1149 }, { "epoch": 1.0608856088560885, "grad_norm": 0.3955714676343847, "learning_rate": 4.0858535244668066e-05, "loss": 0.1574, "step": 1150 }, { "epoch": 1.061808118081181, "grad_norm": 0.3783893233613329, "learning_rate": 4.083777578577164e-05, "loss": 0.1761, "step": 1151 }, { "epoch": 1.062730627306273, "grad_norm": 0.40335215045068984, "learning_rate": 4.081699806920951e-05, "loss": 0.1697, "step": 1152 }, { "epoch": 1.0636531365313653, "grad_norm": 0.3969568516698353, "learning_rate": 4.0796202118934105e-05, "loss": 0.1621, "step": 1153 }, { "epoch": 1.0645756457564575, "grad_norm": 0.4575477135581521, "learning_rate": 4.077538795891881e-05, "loss": 0.1622, "step": 1154 }, { "epoch": 1.0654981549815499, "grad_norm": 0.3922931124328152, "learning_rate": 4.075455561315803e-05, "loss": 0.1777, "step": 1155 }, { "epoch": 1.066420664206642, "grad_norm": 0.39111563963326046, "learning_rate": 4.073370510566714e-05, "loss": 0.1577, "step": 1156 }, { "epoch": 1.0673431734317342, "grad_norm": 0.3783621355518027, "learning_rate": 4.071283646048244e-05, "loss": 0.1733, "step": 1157 }, { "epoch": 1.0682656826568266, "grad_norm": 0.3977090300343326, "learning_rate": 4.0691949701661145e-05, "loss": 0.1654, "step": 1158 }, { "epoch": 1.0691881918819188, "grad_norm": 0.38215847988575297, "learning_rate": 4.067104485328135e-05, "loss": 0.1733, "step": 1159 }, { "epoch": 1.070110701107011, "grad_norm": 0.3926697225841087, "learning_rate": 4.065012193944201e-05, "loss": 0.1704, "step": 1160 }, { "epoch": 1.0710332103321034, "grad_norm": 0.46480611520311177, "learning_rate": 4.062918098426288e-05, "loss": 0.1901, "step": 1161 }, { "epoch": 1.0719557195571956, "grad_norm": 0.3888251222505632, "learning_rate": 4.0608222011884545e-05, "loss": 0.1768, "step": 1162 }, { "epoch": 1.0728782287822878, "grad_norm": 0.39571832407283675, "learning_rate": 4.058724504646834e-05, "loss": 0.1859, "step": 1163 }, { "epoch": 1.07380073800738, "grad_norm": 0.40515368230454263, "learning_rate": 4.056625011219636e-05, "loss": 0.184, "step": 1164 }, { "epoch": 1.0747232472324724, "grad_norm": 0.4075951012369609, "learning_rate": 4.0545237233271383e-05, "loss": 0.1898, "step": 1165 }, { "epoch": 1.0756457564575646, "grad_norm": 0.41305306544440507, "learning_rate": 4.052420643391692e-05, "loss": 0.1888, "step": 1166 }, { "epoch": 1.0765682656826567, "grad_norm": 0.45050394410648004, "learning_rate": 4.050315773837708e-05, "loss": 0.1882, "step": 1167 }, { "epoch": 1.0774907749077491, "grad_norm": 0.4168100597899049, "learning_rate": 4.048209117091668e-05, "loss": 0.1991, "step": 1168 }, { "epoch": 1.0784132841328413, "grad_norm": 0.38736925330372834, "learning_rate": 4.0461006755821066e-05, "loss": 0.1768, "step": 1169 }, { "epoch": 1.0793357933579335, "grad_norm": 0.39062853541467635, "learning_rate": 4.043990451739619e-05, "loss": 0.1596, "step": 1170 }, { "epoch": 1.080258302583026, "grad_norm": 0.3859713176984828, "learning_rate": 4.041878447996855e-05, "loss": 0.1723, "step": 1171 }, { "epoch": 1.081180811808118, "grad_norm": 0.4365972706893191, "learning_rate": 4.039764666788518e-05, "loss": 0.1648, "step": 1172 }, { "epoch": 1.0821033210332103, "grad_norm": 0.3972484486373133, "learning_rate": 4.037649110551357e-05, "loss": 0.1728, "step": 1173 }, { "epoch": 1.0830258302583027, "grad_norm": 0.4318672652196214, "learning_rate": 4.03553178172417e-05, "loss": 0.198, "step": 1174 }, { "epoch": 1.0839483394833949, "grad_norm": 0.3699873653140849, "learning_rate": 4.033412682747796e-05, "loss": 0.1626, "step": 1175 }, { "epoch": 1.084870848708487, "grad_norm": 0.36511846754595956, "learning_rate": 4.031291816065117e-05, "loss": 0.1674, "step": 1176 }, { "epoch": 1.0857933579335795, "grad_norm": 0.4059047610587183, "learning_rate": 4.029169184121051e-05, "loss": 0.1856, "step": 1177 }, { "epoch": 1.0867158671586716, "grad_norm": 0.37772960175348175, "learning_rate": 4.027044789362552e-05, "loss": 0.1762, "step": 1178 }, { "epoch": 1.0876383763837638, "grad_norm": 0.36173658775918344, "learning_rate": 4.024918634238606e-05, "loss": 0.1657, "step": 1179 }, { "epoch": 1.088560885608856, "grad_norm": 0.39888987811999554, "learning_rate": 4.022790721200229e-05, "loss": 0.1879, "step": 1180 }, { "epoch": 1.0894833948339484, "grad_norm": 0.3878771144901679, "learning_rate": 4.020661052700461e-05, "loss": 0.1663, "step": 1181 }, { "epoch": 1.0904059040590406, "grad_norm": 0.4059867833781443, "learning_rate": 4.018529631194369e-05, "loss": 0.1758, "step": 1182 }, { "epoch": 1.0913284132841328, "grad_norm": 0.3822778038862252, "learning_rate": 4.016396459139038e-05, "loss": 0.1987, "step": 1183 }, { "epoch": 1.0922509225092252, "grad_norm": 0.37823723096845563, "learning_rate": 4.0142615389935736e-05, "loss": 0.1609, "step": 1184 }, { "epoch": 1.0931734317343174, "grad_norm": 0.4041649814551321, "learning_rate": 4.012124873219094e-05, "loss": 0.1831, "step": 1185 }, { "epoch": 1.0940959409594095, "grad_norm": 0.4081409306127812, "learning_rate": 4.0099864642787324e-05, "loss": 0.1739, "step": 1186 }, { "epoch": 1.095018450184502, "grad_norm": 0.41445508633820266, "learning_rate": 4.0078463146376277e-05, "loss": 0.1607, "step": 1187 }, { "epoch": 1.0959409594095941, "grad_norm": 0.40050046405665934, "learning_rate": 4.005704426762931e-05, "loss": 0.1666, "step": 1188 }, { "epoch": 1.0968634686346863, "grad_norm": 0.421627251814422, "learning_rate": 4.003560803123791e-05, "loss": 0.18, "step": 1189 }, { "epoch": 1.0977859778597785, "grad_norm": 0.3775866601395446, "learning_rate": 4.001415446191363e-05, "loss": 0.1585, "step": 1190 }, { "epoch": 1.098708487084871, "grad_norm": 0.3949171725100949, "learning_rate": 3.999268358438797e-05, "loss": 0.1801, "step": 1191 }, { "epoch": 1.099630996309963, "grad_norm": 0.445238838334624, "learning_rate": 3.997119542341239e-05, "loss": 0.1852, "step": 1192 }, { "epoch": 1.1005535055350553, "grad_norm": 0.4079689080027289, "learning_rate": 3.994969000375828e-05, "loss": 0.1667, "step": 1193 }, { "epoch": 1.1014760147601477, "grad_norm": 0.3955839693352635, "learning_rate": 3.992816735021692e-05, "loss": 0.1898, "step": 1194 }, { "epoch": 1.1023985239852399, "grad_norm": 0.40343859359856044, "learning_rate": 3.990662748759946e-05, "loss": 0.1733, "step": 1195 }, { "epoch": 1.103321033210332, "grad_norm": 0.36049582413990144, "learning_rate": 3.988507044073687e-05, "loss": 0.1634, "step": 1196 }, { "epoch": 1.1042435424354244, "grad_norm": 0.3744445972145401, "learning_rate": 3.986349623447998e-05, "loss": 0.1803, "step": 1197 }, { "epoch": 1.1051660516605166, "grad_norm": 0.3506582654711279, "learning_rate": 3.9841904893699346e-05, "loss": 0.1526, "step": 1198 }, { "epoch": 1.1060885608856088, "grad_norm": 0.44016397693457415, "learning_rate": 3.9820296443285306e-05, "loss": 0.1767, "step": 1199 }, { "epoch": 1.1070110701107012, "grad_norm": 0.3599239979681691, "learning_rate": 3.979867090814791e-05, "loss": 0.1682, "step": 1200 }, { "epoch": 1.1079335793357934, "grad_norm": 0.4117378061959942, "learning_rate": 3.977702831321692e-05, "loss": 0.1764, "step": 1201 }, { "epoch": 1.1088560885608856, "grad_norm": 0.3655808844946291, "learning_rate": 3.9755368683441735e-05, "loss": 0.164, "step": 1202 }, { "epoch": 1.1097785977859778, "grad_norm": 0.40877835704731236, "learning_rate": 3.9733692043791414e-05, "loss": 0.1819, "step": 1203 }, { "epoch": 1.1107011070110702, "grad_norm": 0.37098664695584865, "learning_rate": 3.9711998419254634e-05, "loss": 0.1716, "step": 1204 }, { "epoch": 1.1116236162361623, "grad_norm": 0.4192369944446082, "learning_rate": 3.969028783483962e-05, "loss": 0.1853, "step": 1205 }, { "epoch": 1.1125461254612545, "grad_norm": 0.4238854228544376, "learning_rate": 3.966856031557418e-05, "loss": 0.1713, "step": 1206 }, { "epoch": 1.113468634686347, "grad_norm": 0.43343780566972534, "learning_rate": 3.964681588650562e-05, "loss": 0.186, "step": 1207 }, { "epoch": 1.1143911439114391, "grad_norm": 0.3741644034657715, "learning_rate": 3.9625054572700757e-05, "loss": 0.1675, "step": 1208 }, { "epoch": 1.1153136531365313, "grad_norm": 0.4416771819445561, "learning_rate": 3.960327639924586e-05, "loss": 0.1761, "step": 1209 }, { "epoch": 1.1162361623616237, "grad_norm": 0.39769798121436667, "learning_rate": 3.958148139124664e-05, "loss": 0.1724, "step": 1210 }, { "epoch": 1.117158671586716, "grad_norm": 0.3647395932311634, "learning_rate": 3.9559669573828225e-05, "loss": 0.1602, "step": 1211 }, { "epoch": 1.118081180811808, "grad_norm": 0.4120209918268371, "learning_rate": 3.9537840972135094e-05, "loss": 0.1638, "step": 1212 }, { "epoch": 1.1190036900369003, "grad_norm": 0.37394155614506874, "learning_rate": 3.95159956113311e-05, "loss": 0.1841, "step": 1213 }, { "epoch": 1.1199261992619927, "grad_norm": 0.38930236349935654, "learning_rate": 3.94941335165994e-05, "loss": 0.1609, "step": 1214 }, { "epoch": 1.1208487084870848, "grad_norm": 0.40402129092996436, "learning_rate": 3.9472254713142455e-05, "loss": 0.1758, "step": 1215 }, { "epoch": 1.121771217712177, "grad_norm": 0.40121844035137655, "learning_rate": 3.945035922618197e-05, "loss": 0.169, "step": 1216 }, { "epoch": 1.1226937269372694, "grad_norm": 0.3843359826294688, "learning_rate": 3.942844708095892e-05, "loss": 0.1868, "step": 1217 }, { "epoch": 1.1236162361623616, "grad_norm": 0.37991679506107584, "learning_rate": 3.9406518302733416e-05, "loss": 0.1855, "step": 1218 }, { "epoch": 1.1245387453874538, "grad_norm": 0.3431532616018786, "learning_rate": 3.938457291678482e-05, "loss": 0.1463, "step": 1219 }, { "epoch": 1.1254612546125462, "grad_norm": 0.39247797781305566, "learning_rate": 3.9362610948411585e-05, "loss": 0.1563, "step": 1220 }, { "epoch": 1.1263837638376384, "grad_norm": 0.4455152042959939, "learning_rate": 3.93406324229313e-05, "loss": 0.194, "step": 1221 }, { "epoch": 1.1273062730627306, "grad_norm": 0.3646911500010225, "learning_rate": 3.931863736568065e-05, "loss": 0.1487, "step": 1222 }, { "epoch": 1.128228782287823, "grad_norm": 0.3751401018519618, "learning_rate": 3.9296625802015356e-05, "loss": 0.1659, "step": 1223 }, { "epoch": 1.1291512915129152, "grad_norm": 0.37936272235043617, "learning_rate": 3.9274597757310186e-05, "loss": 0.1777, "step": 1224 }, { "epoch": 1.1300738007380073, "grad_norm": 0.39440103333960363, "learning_rate": 3.925255325695889e-05, "loss": 0.1643, "step": 1225 }, { "epoch": 1.1309963099630997, "grad_norm": 0.4200580796108357, "learning_rate": 3.923049232637421e-05, "loss": 0.1814, "step": 1226 }, { "epoch": 1.131918819188192, "grad_norm": 0.3973096309997986, "learning_rate": 3.920841499098781e-05, "loss": 0.1814, "step": 1227 }, { "epoch": 1.132841328413284, "grad_norm": 0.38341587007083405, "learning_rate": 3.9186321276250274e-05, "loss": 0.1707, "step": 1228 }, { "epoch": 1.1337638376383763, "grad_norm": 0.360186558259523, "learning_rate": 3.916421120763106e-05, "loss": 0.1535, "step": 1229 }, { "epoch": 1.1346863468634687, "grad_norm": 0.450775459207837, "learning_rate": 3.9142084810618495e-05, "loss": 0.1827, "step": 1230 }, { "epoch": 1.1356088560885609, "grad_norm": 0.4002646614014494, "learning_rate": 3.911994211071971e-05, "loss": 0.1614, "step": 1231 }, { "epoch": 1.136531365313653, "grad_norm": 0.3459076477142216, "learning_rate": 3.909778313346064e-05, "loss": 0.1468, "step": 1232 }, { "epoch": 1.1374538745387455, "grad_norm": 0.36252214847000114, "learning_rate": 3.907560790438598e-05, "loss": 0.167, "step": 1233 }, { "epoch": 1.1383763837638377, "grad_norm": 0.35362107841991497, "learning_rate": 3.905341644905918e-05, "loss": 0.1565, "step": 1234 }, { "epoch": 1.1392988929889298, "grad_norm": 0.38456791825871967, "learning_rate": 3.9031208793062354e-05, "loss": 0.1778, "step": 1235 }, { "epoch": 1.140221402214022, "grad_norm": 0.43754192987728807, "learning_rate": 3.900898496199634e-05, "loss": 0.155, "step": 1236 }, { "epoch": 1.1411439114391144, "grad_norm": 0.37634931688449635, "learning_rate": 3.898674498148058e-05, "loss": 0.1738, "step": 1237 }, { "epoch": 1.1420664206642066, "grad_norm": 0.37374035268129, "learning_rate": 3.896448887715316e-05, "loss": 0.1687, "step": 1238 }, { "epoch": 1.1429889298892988, "grad_norm": 0.37830297443523053, "learning_rate": 3.894221667467074e-05, "loss": 0.1546, "step": 1239 }, { "epoch": 1.1439114391143912, "grad_norm": 0.33714399256913596, "learning_rate": 3.891992839970855e-05, "loss": 0.1668, "step": 1240 }, { "epoch": 1.1448339483394834, "grad_norm": 0.3966714697489937, "learning_rate": 3.889762407796034e-05, "loss": 0.1734, "step": 1241 }, { "epoch": 1.1457564575645756, "grad_norm": 0.33789584779157467, "learning_rate": 3.8875303735138355e-05, "loss": 0.1621, "step": 1242 }, { "epoch": 1.146678966789668, "grad_norm": 0.38525392066538, "learning_rate": 3.885296739697332e-05, "loss": 0.2028, "step": 1243 }, { "epoch": 1.1476014760147601, "grad_norm": 0.4074987067807514, "learning_rate": 3.883061508921439e-05, "loss": 0.1746, "step": 1244 }, { "epoch": 1.1485239852398523, "grad_norm": 0.3499613895655867, "learning_rate": 3.880824683762914e-05, "loss": 0.1565, "step": 1245 }, { "epoch": 1.1494464944649447, "grad_norm": 0.35799322013488066, "learning_rate": 3.87858626680035e-05, "loss": 0.1771, "step": 1246 }, { "epoch": 1.150369003690037, "grad_norm": 0.41086514997229345, "learning_rate": 3.876346260614179e-05, "loss": 0.1508, "step": 1247 }, { "epoch": 1.151291512915129, "grad_norm": 0.39013582251282924, "learning_rate": 3.874104667786661e-05, "loss": 0.175, "step": 1248 }, { "epoch": 1.1522140221402215, "grad_norm": 0.3543398516359713, "learning_rate": 3.871861490901888e-05, "loss": 0.1592, "step": 1249 }, { "epoch": 1.1531365313653137, "grad_norm": 0.3951549154139269, "learning_rate": 3.869616732545777e-05, "loss": 0.1762, "step": 1250 }, { "epoch": 1.1540590405904059, "grad_norm": 0.40136372535159015, "learning_rate": 3.867370395306068e-05, "loss": 0.1703, "step": 1251 }, { "epoch": 1.1549815498154983, "grad_norm": 0.353467602286537, "learning_rate": 3.8651224817723194e-05, "loss": 0.1662, "step": 1252 }, { "epoch": 1.1559040590405905, "grad_norm": 0.384811288693674, "learning_rate": 3.862872994535912e-05, "loss": 0.1784, "step": 1253 }, { "epoch": 1.1568265682656826, "grad_norm": 0.37004350457797447, "learning_rate": 3.860621936190035e-05, "loss": 0.1595, "step": 1254 }, { "epoch": 1.1577490774907748, "grad_norm": 0.38635850669837396, "learning_rate": 3.8583693093296914e-05, "loss": 0.1712, "step": 1255 }, { "epoch": 1.1586715867158672, "grad_norm": 0.436738373842028, "learning_rate": 3.8561151165516925e-05, "loss": 0.187, "step": 1256 }, { "epoch": 1.1595940959409594, "grad_norm": 0.37161247978898304, "learning_rate": 3.853859360454654e-05, "loss": 0.1781, "step": 1257 }, { "epoch": 1.1605166051660516, "grad_norm": 0.3596937247805052, "learning_rate": 3.851602043638994e-05, "loss": 0.187, "step": 1258 }, { "epoch": 1.161439114391144, "grad_norm": 0.3690606390753706, "learning_rate": 3.84934316870693e-05, "loss": 0.1481, "step": 1259 }, { "epoch": 1.1623616236162362, "grad_norm": 0.3546601796031542, "learning_rate": 3.847082738262477e-05, "loss": 0.1629, "step": 1260 }, { "epoch": 1.1632841328413284, "grad_norm": 0.32931616066822833, "learning_rate": 3.84482075491144e-05, "loss": 0.1668, "step": 1261 }, { "epoch": 1.1642066420664205, "grad_norm": 0.33058419513964465, "learning_rate": 3.842557221261415e-05, "loss": 0.1495, "step": 1262 }, { "epoch": 1.165129151291513, "grad_norm": 0.3739966388174159, "learning_rate": 3.840292139921789e-05, "loss": 0.1668, "step": 1263 }, { "epoch": 1.1660516605166051, "grad_norm": 0.3945416380278711, "learning_rate": 3.8380255135037285e-05, "loss": 0.159, "step": 1264 }, { "epoch": 1.1669741697416973, "grad_norm": 0.39559579609047163, "learning_rate": 3.8357573446201825e-05, "loss": 0.1769, "step": 1265 }, { "epoch": 1.1678966789667897, "grad_norm": 0.3668179056990549, "learning_rate": 3.833487635885881e-05, "loss": 0.1634, "step": 1266 }, { "epoch": 1.168819188191882, "grad_norm": 0.38166202060178234, "learning_rate": 3.8312163899173234e-05, "loss": 0.1572, "step": 1267 }, { "epoch": 1.169741697416974, "grad_norm": 0.3770071600155405, "learning_rate": 3.828943609332787e-05, "loss": 0.1656, "step": 1268 }, { "epoch": 1.1706642066420665, "grad_norm": 0.34029454152305877, "learning_rate": 3.8266692967523156e-05, "loss": 0.1631, "step": 1269 }, { "epoch": 1.1715867158671587, "grad_norm": 0.3441263852387389, "learning_rate": 3.824393454797718e-05, "loss": 0.1722, "step": 1270 }, { "epoch": 1.1725092250922509, "grad_norm": 0.45946161597400226, "learning_rate": 3.8221160860925666e-05, "loss": 0.1966, "step": 1271 }, { "epoch": 1.1734317343173433, "grad_norm": 0.3743764303189791, "learning_rate": 3.8198371932621965e-05, "loss": 0.1725, "step": 1272 }, { "epoch": 1.1743542435424354, "grad_norm": 0.3789734722999809, "learning_rate": 3.817556778933698e-05, "loss": 0.166, "step": 1273 }, { "epoch": 1.1752767527675276, "grad_norm": 0.3540465811335595, "learning_rate": 3.815274845735912e-05, "loss": 0.1662, "step": 1274 }, { "epoch": 1.17619926199262, "grad_norm": 0.3508442031273123, "learning_rate": 3.812991396299437e-05, "loss": 0.1666, "step": 1275 }, { "epoch": 1.1771217712177122, "grad_norm": 0.39654898942218, "learning_rate": 3.8107064332566136e-05, "loss": 0.1814, "step": 1276 }, { "epoch": 1.1780442804428044, "grad_norm": 0.3878983535558658, "learning_rate": 3.8084199592415305e-05, "loss": 0.1845, "step": 1277 }, { "epoch": 1.1789667896678966, "grad_norm": 0.39812963500678517, "learning_rate": 3.8061319768900175e-05, "loss": 0.1774, "step": 1278 }, { "epoch": 1.179889298892989, "grad_norm": 0.356504948337044, "learning_rate": 3.8038424888396416e-05, "loss": 0.1466, "step": 1279 }, { "epoch": 1.1808118081180812, "grad_norm": 0.37203533726446586, "learning_rate": 3.801551497729709e-05, "loss": 0.1826, "step": 1280 }, { "epoch": 1.1817343173431734, "grad_norm": 0.39282813959728746, "learning_rate": 3.799259006201255e-05, "loss": 0.1681, "step": 1281 }, { "epoch": 1.1826568265682658, "grad_norm": 0.42248503596639037, "learning_rate": 3.796965016897047e-05, "loss": 0.1794, "step": 1282 }, { "epoch": 1.183579335793358, "grad_norm": 0.3418308160696917, "learning_rate": 3.7946695324615775e-05, "loss": 0.1525, "step": 1283 }, { "epoch": 1.1845018450184501, "grad_norm": 0.3725828576686521, "learning_rate": 3.7923725555410636e-05, "loss": 0.1716, "step": 1284 }, { "epoch": 1.1854243542435423, "grad_norm": 0.32818516577429657, "learning_rate": 3.790074088783443e-05, "loss": 0.1512, "step": 1285 }, { "epoch": 1.1863468634686347, "grad_norm": 0.3564761683059801, "learning_rate": 3.78777413483837e-05, "loss": 0.153, "step": 1286 }, { "epoch": 1.187269372693727, "grad_norm": 0.42092652414606263, "learning_rate": 3.785472696357214e-05, "loss": 0.1932, "step": 1287 }, { "epoch": 1.188191881918819, "grad_norm": 0.4086237832286056, "learning_rate": 3.783169775993055e-05, "loss": 0.1823, "step": 1288 }, { "epoch": 1.1891143911439115, "grad_norm": 0.37861293020683706, "learning_rate": 3.780865376400682e-05, "loss": 0.1705, "step": 1289 }, { "epoch": 1.1900369003690037, "grad_norm": 0.4672151127767379, "learning_rate": 3.7785595002365884e-05, "loss": 0.2024, "step": 1290 }, { "epoch": 1.1909594095940959, "grad_norm": 0.3678600637225433, "learning_rate": 3.7762521501589723e-05, "loss": 0.1683, "step": 1291 }, { "epoch": 1.1918819188191883, "grad_norm": 0.38876778522083116, "learning_rate": 3.773943328827728e-05, "loss": 0.1681, "step": 1292 }, { "epoch": 1.1928044280442804, "grad_norm": 0.4205936397419189, "learning_rate": 3.771633038904446e-05, "loss": 0.1882, "step": 1293 }, { "epoch": 1.1937269372693726, "grad_norm": 0.41562563425985843, "learning_rate": 3.769321283052412e-05, "loss": 0.1657, "step": 1294 }, { "epoch": 1.194649446494465, "grad_norm": 0.34433052253776564, "learning_rate": 3.7670080639366004e-05, "loss": 0.1528, "step": 1295 }, { "epoch": 1.1955719557195572, "grad_norm": 0.38136188611696903, "learning_rate": 3.764693384223671e-05, "loss": 0.177, "step": 1296 }, { "epoch": 1.1964944649446494, "grad_norm": 0.4362631823446143, "learning_rate": 3.76237724658197e-05, "loss": 0.1834, "step": 1297 }, { "epoch": 1.1974169741697418, "grad_norm": 0.4490202044552004, "learning_rate": 3.7600596536815224e-05, "loss": 0.1883, "step": 1298 }, { "epoch": 1.198339483394834, "grad_norm": 0.4159580498109915, "learning_rate": 3.7577406081940314e-05, "loss": 0.1821, "step": 1299 }, { "epoch": 1.1992619926199262, "grad_norm": 0.3754259049777767, "learning_rate": 3.7554201127928744e-05, "loss": 0.1637, "step": 1300 }, { "epoch": 1.2001845018450186, "grad_norm": 0.39487303704772997, "learning_rate": 3.753098170153102e-05, "loss": 0.1751, "step": 1301 }, { "epoch": 1.2011070110701108, "grad_norm": 0.38006135161592924, "learning_rate": 3.750774782951431e-05, "loss": 0.1832, "step": 1302 }, { "epoch": 1.202029520295203, "grad_norm": 0.373794173672362, "learning_rate": 3.7484499538662424e-05, "loss": 0.1638, "step": 1303 }, { "epoch": 1.2029520295202951, "grad_norm": 0.39680441858428456, "learning_rate": 3.746123685577585e-05, "loss": 0.1784, "step": 1304 }, { "epoch": 1.2038745387453875, "grad_norm": 0.36870033261938107, "learning_rate": 3.743795980767159e-05, "loss": 0.1682, "step": 1305 }, { "epoch": 1.2047970479704797, "grad_norm": 0.4399212266489506, "learning_rate": 3.741466842118327e-05, "loss": 0.1771, "step": 1306 }, { "epoch": 1.205719557195572, "grad_norm": 0.34685405203127034, "learning_rate": 3.739136272316102e-05, "loss": 0.1706, "step": 1307 }, { "epoch": 1.2066420664206643, "grad_norm": 0.36429649342017056, "learning_rate": 3.736804274047145e-05, "loss": 0.1596, "step": 1308 }, { "epoch": 1.2075645756457565, "grad_norm": 0.3983863474001515, "learning_rate": 3.734470849999767e-05, "loss": 0.1708, "step": 1309 }, { "epoch": 1.2084870848708487, "grad_norm": 0.3655548087385067, "learning_rate": 3.732136002863922e-05, "loss": 0.1698, "step": 1310 }, { "epoch": 1.2094095940959408, "grad_norm": 0.4039655923900139, "learning_rate": 3.729799735331203e-05, "loss": 0.1778, "step": 1311 }, { "epoch": 1.2103321033210332, "grad_norm": 0.35516821761702266, "learning_rate": 3.727462050094841e-05, "loss": 0.1459, "step": 1312 }, { "epoch": 1.2112546125461254, "grad_norm": 0.44642489434454374, "learning_rate": 3.7251229498497e-05, "loss": 0.1767, "step": 1313 }, { "epoch": 1.2121771217712176, "grad_norm": 0.40901821529429044, "learning_rate": 3.72278243729228e-05, "loss": 0.1797, "step": 1314 }, { "epoch": 1.21309963099631, "grad_norm": 0.3801414969463073, "learning_rate": 3.7204405151207036e-05, "loss": 0.1921, "step": 1315 }, { "epoch": 1.2140221402214022, "grad_norm": 0.35808170261872035, "learning_rate": 3.718097186034721e-05, "loss": 0.1663, "step": 1316 }, { "epoch": 1.2149446494464944, "grad_norm": 0.42564468017637885, "learning_rate": 3.715752452735704e-05, "loss": 0.1888, "step": 1317 }, { "epoch": 1.2158671586715868, "grad_norm": 0.37950666546171086, "learning_rate": 3.7134063179266425e-05, "loss": 0.1562, "step": 1318 }, { "epoch": 1.216789667896679, "grad_norm": 0.3843938105095001, "learning_rate": 3.711058784312144e-05, "loss": 0.1777, "step": 1319 }, { "epoch": 1.2177121771217712, "grad_norm": 0.36524969537809376, "learning_rate": 3.708709854598425e-05, "loss": 0.1649, "step": 1320 }, { "epoch": 1.2186346863468636, "grad_norm": 0.4003937659265883, "learning_rate": 3.706359531493316e-05, "loss": 0.1688, "step": 1321 }, { "epoch": 1.2195571955719557, "grad_norm": 0.39405861933597286, "learning_rate": 3.7040078177062484e-05, "loss": 0.1879, "step": 1322 }, { "epoch": 1.220479704797048, "grad_norm": 0.37765746210969386, "learning_rate": 3.701654715948264e-05, "loss": 0.1865, "step": 1323 }, { "epoch": 1.2214022140221403, "grad_norm": 0.4818548000488366, "learning_rate": 3.6993002289319955e-05, "loss": 0.1646, "step": 1324 }, { "epoch": 1.2223247232472325, "grad_norm": 0.39356600986755663, "learning_rate": 3.6969443593716804e-05, "loss": 0.1728, "step": 1325 }, { "epoch": 1.2232472324723247, "grad_norm": 0.5377447965710058, "learning_rate": 3.694587109983147e-05, "loss": 0.1827, "step": 1326 }, { "epoch": 1.2241697416974169, "grad_norm": 0.4146277637023804, "learning_rate": 3.692228483483812e-05, "loss": 0.1821, "step": 1327 }, { "epoch": 1.2250922509225093, "grad_norm": 0.4362453081571424, "learning_rate": 3.689868482592684e-05, "loss": 0.1963, "step": 1328 }, { "epoch": 1.2260147601476015, "grad_norm": 0.36800404753226107, "learning_rate": 3.6875071100303523e-05, "loss": 0.18, "step": 1329 }, { "epoch": 1.2269372693726937, "grad_norm": 0.36808130166836256, "learning_rate": 3.685144368518991e-05, "loss": 0.1481, "step": 1330 }, { "epoch": 1.227859778597786, "grad_norm": 0.3631608839553618, "learning_rate": 3.682780260782348e-05, "loss": 0.177, "step": 1331 }, { "epoch": 1.2287822878228782, "grad_norm": 0.32755840956082244, "learning_rate": 3.680414789545749e-05, "loss": 0.1573, "step": 1332 }, { "epoch": 1.2297047970479704, "grad_norm": 0.41375648373583096, "learning_rate": 3.678047957536092e-05, "loss": 0.1833, "step": 1333 }, { "epoch": 1.2306273062730628, "grad_norm": 0.399469699965947, "learning_rate": 3.675679767481842e-05, "loss": 0.187, "step": 1334 }, { "epoch": 1.231549815498155, "grad_norm": 0.3659805491727511, "learning_rate": 3.67331022211303e-05, "loss": 0.1573, "step": 1335 }, { "epoch": 1.2324723247232472, "grad_norm": 0.3799332849809668, "learning_rate": 3.670939324161251e-05, "loss": 0.1798, "step": 1336 }, { "epoch": 1.2333948339483394, "grad_norm": 0.3972461594400006, "learning_rate": 3.668567076359656e-05, "loss": 0.1681, "step": 1337 }, { "epoch": 1.2343173431734318, "grad_norm": 0.3914441459651774, "learning_rate": 3.666193481442954e-05, "loss": 0.17, "step": 1338 }, { "epoch": 1.235239852398524, "grad_norm": 0.36242286305851845, "learning_rate": 3.6638185421474084e-05, "loss": 0.1712, "step": 1339 }, { "epoch": 1.2361623616236161, "grad_norm": 0.36469642878502373, "learning_rate": 3.66144226121083e-05, "loss": 0.1654, "step": 1340 }, { "epoch": 1.2370848708487086, "grad_norm": 0.3917313439062194, "learning_rate": 3.659064641372576e-05, "loss": 0.168, "step": 1341 }, { "epoch": 1.2380073800738007, "grad_norm": 0.3617694791967727, "learning_rate": 3.6566856853735516e-05, "loss": 0.1726, "step": 1342 }, { "epoch": 1.238929889298893, "grad_norm": 0.4141158252212677, "learning_rate": 3.654305395956195e-05, "loss": 0.1849, "step": 1343 }, { "epoch": 1.2398523985239853, "grad_norm": 0.4059832013250797, "learning_rate": 3.651923775864488e-05, "loss": 0.1519, "step": 1344 }, { "epoch": 1.2407749077490775, "grad_norm": 0.37433954496738203, "learning_rate": 3.6495408278439426e-05, "loss": 0.1625, "step": 1345 }, { "epoch": 1.2416974169741697, "grad_norm": 0.3918972235044372, "learning_rate": 3.647156554641603e-05, "loss": 0.164, "step": 1346 }, { "epoch": 1.242619926199262, "grad_norm": 0.41833676506254874, "learning_rate": 3.644770959006042e-05, "loss": 0.1872, "step": 1347 }, { "epoch": 1.2435424354243543, "grad_norm": 0.4218150024250036, "learning_rate": 3.642384043687356e-05, "loss": 0.163, "step": 1348 }, { "epoch": 1.2444649446494465, "grad_norm": 0.39570660118087275, "learning_rate": 3.6399958114371595e-05, "loss": 0.1816, "step": 1349 }, { "epoch": 1.2453874538745389, "grad_norm": 0.3707416696700744, "learning_rate": 3.637606265008592e-05, "loss": 0.1765, "step": 1350 }, { "epoch": 1.246309963099631, "grad_norm": 0.3740850457534175, "learning_rate": 3.635215407156302e-05, "loss": 0.1795, "step": 1351 }, { "epoch": 1.2472324723247232, "grad_norm": 0.40097805285714844, "learning_rate": 3.632823240636452e-05, "loss": 0.1835, "step": 1352 }, { "epoch": 1.2481549815498154, "grad_norm": 0.3811162867312523, "learning_rate": 3.6304297682067144e-05, "loss": 0.1752, "step": 1353 }, { "epoch": 1.2490774907749078, "grad_norm": 0.33638864950228525, "learning_rate": 3.628034992626265e-05, "loss": 0.1514, "step": 1354 }, { "epoch": 1.25, "grad_norm": 0.4225820180427333, "learning_rate": 3.6256389166557825e-05, "loss": 0.1761, "step": 1355 }, { "epoch": 1.2509225092250922, "grad_norm": 0.41138341726215444, "learning_rate": 3.623241543057444e-05, "loss": 0.1765, "step": 1356 }, { "epoch": 1.2518450184501844, "grad_norm": 0.4001534683532543, "learning_rate": 3.6208428745949255e-05, "loss": 0.1642, "step": 1357 }, { "epoch": 1.2527675276752768, "grad_norm": 0.43869036112274273, "learning_rate": 3.618442914033392e-05, "loss": 0.1895, "step": 1358 }, { "epoch": 1.253690036900369, "grad_norm": 0.3739304339036808, "learning_rate": 3.616041664139499e-05, "loss": 0.1586, "step": 1359 }, { "epoch": 1.2546125461254611, "grad_norm": 0.35109359825230413, "learning_rate": 3.613639127681389e-05, "loss": 0.1753, "step": 1360 }, { "epoch": 1.2555350553505535, "grad_norm": 0.3577963740043067, "learning_rate": 3.61123530742869e-05, "loss": 0.1769, "step": 1361 }, { "epoch": 1.2564575645756457, "grad_norm": 0.3843171510463367, "learning_rate": 3.608830206152503e-05, "loss": 0.1823, "step": 1362 }, { "epoch": 1.257380073800738, "grad_norm": 0.3598594630906758, "learning_rate": 3.6064238266254145e-05, "loss": 0.1795, "step": 1363 }, { "epoch": 1.2583025830258303, "grad_norm": 0.37345021729424344, "learning_rate": 3.6040161716214774e-05, "loss": 0.151, "step": 1364 }, { "epoch": 1.2592250922509225, "grad_norm": 0.364594440421076, "learning_rate": 3.601607243916219e-05, "loss": 0.1564, "step": 1365 }, { "epoch": 1.2601476014760147, "grad_norm": 0.3760450872663813, "learning_rate": 3.599197046286632e-05, "loss": 0.169, "step": 1366 }, { "epoch": 1.261070110701107, "grad_norm": 0.3349224229844502, "learning_rate": 3.596785581511174e-05, "loss": 0.1379, "step": 1367 }, { "epoch": 1.2619926199261993, "grad_norm": 0.4197601137515256, "learning_rate": 3.594372852369763e-05, "loss": 0.1779, "step": 1368 }, { "epoch": 1.2629151291512914, "grad_norm": 0.4252010238904596, "learning_rate": 3.591958861643775e-05, "loss": 0.1819, "step": 1369 }, { "epoch": 1.2638376383763839, "grad_norm": 0.3726501790103951, "learning_rate": 3.5895436121160386e-05, "loss": 0.1709, "step": 1370 }, { "epoch": 1.264760147601476, "grad_norm": 0.3649082841374998, "learning_rate": 3.5871271065708354e-05, "loss": 0.164, "step": 1371 }, { "epoch": 1.2656826568265682, "grad_norm": 0.4709102508547808, "learning_rate": 3.5847093477938956e-05, "loss": 0.1951, "step": 1372 }, { "epoch": 1.2666051660516606, "grad_norm": 0.36340144372771255, "learning_rate": 3.5822903385723904e-05, "loss": 0.1733, "step": 1373 }, { "epoch": 1.2675276752767528, "grad_norm": 0.36838691907955257, "learning_rate": 3.579870081694938e-05, "loss": 0.1707, "step": 1374 }, { "epoch": 1.268450184501845, "grad_norm": 0.379732273767842, "learning_rate": 3.577448579951589e-05, "loss": 0.1584, "step": 1375 }, { "epoch": 1.2693726937269374, "grad_norm": 0.4069962634671663, "learning_rate": 3.575025836133833e-05, "loss": 0.1775, "step": 1376 }, { "epoch": 1.2702952029520296, "grad_norm": 0.36298862782828534, "learning_rate": 3.5726018530345915e-05, "loss": 0.1565, "step": 1377 }, { "epoch": 1.2712177121771218, "grad_norm": 0.35890423156426643, "learning_rate": 3.5701766334482114e-05, "loss": 0.1797, "step": 1378 }, { "epoch": 1.272140221402214, "grad_norm": 0.404840860825177, "learning_rate": 3.5677501801704685e-05, "loss": 0.1996, "step": 1379 }, { "epoch": 1.2730627306273063, "grad_norm": 0.34154877692677005, "learning_rate": 3.565322495998559e-05, "loss": 0.1577, "step": 1380 }, { "epoch": 1.2739852398523985, "grad_norm": 0.37781924095596964, "learning_rate": 3.5628935837310984e-05, "loss": 0.1688, "step": 1381 }, { "epoch": 1.2749077490774907, "grad_norm": 0.3574258267368524, "learning_rate": 3.5604634461681184e-05, "loss": 0.1646, "step": 1382 }, { "epoch": 1.275830258302583, "grad_norm": 0.3885364748872123, "learning_rate": 3.5580320861110625e-05, "loss": 0.1756, "step": 1383 }, { "epoch": 1.2767527675276753, "grad_norm": 0.3809434520908714, "learning_rate": 3.555599506362784e-05, "loss": 0.1548, "step": 1384 }, { "epoch": 1.2776752767527675, "grad_norm": 0.3881374004257548, "learning_rate": 3.5531657097275425e-05, "loss": 0.1685, "step": 1385 }, { "epoch": 1.2785977859778597, "grad_norm": 0.3878220749254127, "learning_rate": 3.550730699010999e-05, "loss": 0.1685, "step": 1386 }, { "epoch": 1.279520295202952, "grad_norm": 0.566907399079411, "learning_rate": 3.5482944770202145e-05, "loss": 0.1975, "step": 1387 }, { "epoch": 1.2804428044280443, "grad_norm": 0.36180413368577086, "learning_rate": 3.545857046563649e-05, "loss": 0.149, "step": 1388 }, { "epoch": 1.2813653136531364, "grad_norm": 0.3552529788203019, "learning_rate": 3.543418410451152e-05, "loss": 0.1789, "step": 1389 }, { "epoch": 1.2822878228782288, "grad_norm": 0.3985813856389354, "learning_rate": 3.540978571493966e-05, "loss": 0.1791, "step": 1390 }, { "epoch": 1.283210332103321, "grad_norm": 0.3654350181181372, "learning_rate": 3.5385375325047166e-05, "loss": 0.1572, "step": 1391 }, { "epoch": 1.2841328413284132, "grad_norm": 0.3651885453567304, "learning_rate": 3.536095296297415e-05, "loss": 0.15, "step": 1392 }, { "epoch": 1.2850553505535056, "grad_norm": 0.41262729710097773, "learning_rate": 3.533651865687454e-05, "loss": 0.1771, "step": 1393 }, { "epoch": 1.2859778597785978, "grad_norm": 0.3895770830415364, "learning_rate": 3.5312072434915986e-05, "loss": 0.1636, "step": 1394 }, { "epoch": 1.28690036900369, "grad_norm": 0.4216840380459189, "learning_rate": 3.528761432527992e-05, "loss": 0.1901, "step": 1395 }, { "epoch": 1.2878228782287824, "grad_norm": 0.370406659545189, "learning_rate": 3.5263144356161476e-05, "loss": 0.1783, "step": 1396 }, { "epoch": 1.2887453874538746, "grad_norm": 0.3570954429973423, "learning_rate": 3.523866255576943e-05, "loss": 0.1706, "step": 1397 }, { "epoch": 1.2896678966789668, "grad_norm": 0.343995922631194, "learning_rate": 3.52141689523262e-05, "loss": 0.1491, "step": 1398 }, { "epoch": 1.2905904059040592, "grad_norm": 0.3900651531256659, "learning_rate": 3.518966357406786e-05, "loss": 0.1892, "step": 1399 }, { "epoch": 1.2915129151291513, "grad_norm": 0.3536298519135728, "learning_rate": 3.516514644924398e-05, "loss": 0.1513, "step": 1400 }, { "epoch": 1.2924354243542435, "grad_norm": 0.42665857944563257, "learning_rate": 3.5140617606117736e-05, "loss": 0.1705, "step": 1401 }, { "epoch": 1.293357933579336, "grad_norm": 0.38719001646314355, "learning_rate": 3.511607707296579e-05, "loss": 0.1779, "step": 1402 }, { "epoch": 1.294280442804428, "grad_norm": 0.3927927302690867, "learning_rate": 3.509152487807826e-05, "loss": 0.1593, "step": 1403 }, { "epoch": 1.2952029520295203, "grad_norm": 0.3848428495517545, "learning_rate": 3.506696104975875e-05, "loss": 0.1832, "step": 1404 }, { "epoch": 1.2961254612546125, "grad_norm": 0.3610052410007685, "learning_rate": 3.504238561632424e-05, "loss": 0.1592, "step": 1405 }, { "epoch": 1.2970479704797047, "grad_norm": 0.3907677657069571, "learning_rate": 3.5017798606105095e-05, "loss": 0.1797, "step": 1406 }, { "epoch": 1.297970479704797, "grad_norm": 0.35151063137831756, "learning_rate": 3.499320004744505e-05, "loss": 0.1687, "step": 1407 }, { "epoch": 1.2988929889298892, "grad_norm": 0.40429797101240167, "learning_rate": 3.496858996870111e-05, "loss": 0.1915, "step": 1408 }, { "epoch": 1.2998154981549814, "grad_norm": 0.38685864677248, "learning_rate": 3.49439683982436e-05, "loss": 0.1639, "step": 1409 }, { "epoch": 1.3007380073800738, "grad_norm": 0.41763155701235083, "learning_rate": 3.491933536445606e-05, "loss": 0.2078, "step": 1410 }, { "epoch": 1.301660516605166, "grad_norm": 0.3897175145433721, "learning_rate": 3.489469089573529e-05, "loss": 0.1899, "step": 1411 }, { "epoch": 1.3025830258302582, "grad_norm": 0.3950918324246369, "learning_rate": 3.487003502049122e-05, "loss": 0.1789, "step": 1412 }, { "epoch": 1.3035055350553506, "grad_norm": 0.3741381176221496, "learning_rate": 3.484536776714694e-05, "loss": 0.1622, "step": 1413 }, { "epoch": 1.3044280442804428, "grad_norm": 0.3572545260636694, "learning_rate": 3.482068916413871e-05, "loss": 0.1677, "step": 1414 }, { "epoch": 1.305350553505535, "grad_norm": 0.33245502360717216, "learning_rate": 3.47959992399158e-05, "loss": 0.1495, "step": 1415 }, { "epoch": 1.3062730627306274, "grad_norm": 0.37138142998504686, "learning_rate": 3.477129802294057e-05, "loss": 0.1591, "step": 1416 }, { "epoch": 1.3071955719557196, "grad_norm": 0.3668631190741448, "learning_rate": 3.47465855416884e-05, "loss": 0.1586, "step": 1417 }, { "epoch": 1.3081180811808117, "grad_norm": 0.3441380348515503, "learning_rate": 3.472186182464765e-05, "loss": 0.16, "step": 1418 }, { "epoch": 1.3090405904059041, "grad_norm": 0.39242981438000046, "learning_rate": 3.469712690031962e-05, "loss": 0.186, "step": 1419 }, { "epoch": 1.3099630996309963, "grad_norm": 0.3818463039421942, "learning_rate": 3.467238079721855e-05, "loss": 0.1602, "step": 1420 }, { "epoch": 1.3108856088560885, "grad_norm": 0.40650210769142753, "learning_rate": 3.464762354387155e-05, "loss": 0.1713, "step": 1421 }, { "epoch": 1.311808118081181, "grad_norm": 0.41220414363195734, "learning_rate": 3.4622855168818586e-05, "loss": 0.1785, "step": 1422 }, { "epoch": 1.312730627306273, "grad_norm": 0.37792728496286754, "learning_rate": 3.459807570061246e-05, "loss": 0.1716, "step": 1423 }, { "epoch": 1.3136531365313653, "grad_norm": 0.41173148841989027, "learning_rate": 3.4573285167818744e-05, "loss": 0.1645, "step": 1424 }, { "epoch": 1.3145756457564577, "grad_norm": 0.3680120838951169, "learning_rate": 3.454848359901578e-05, "loss": 0.1679, "step": 1425 }, { "epoch": 1.3154981549815499, "grad_norm": 0.42016288755986253, "learning_rate": 3.4523671022794616e-05, "loss": 0.1776, "step": 1426 }, { "epoch": 1.316420664206642, "grad_norm": 0.3497717481979851, "learning_rate": 3.4498847467759e-05, "loss": 0.162, "step": 1427 }, { "epoch": 1.3173431734317342, "grad_norm": 0.36083858060108276, "learning_rate": 3.447401296252535e-05, "loss": 0.1675, "step": 1428 }, { "epoch": 1.3182656826568266, "grad_norm": 0.3574519149758475, "learning_rate": 3.444916753572266e-05, "loss": 0.1585, "step": 1429 }, { "epoch": 1.3191881918819188, "grad_norm": 0.4173982461504816, "learning_rate": 3.442431121599259e-05, "loss": 0.1781, "step": 1430 }, { "epoch": 1.320110701107011, "grad_norm": 0.38466774843280815, "learning_rate": 3.439944403198928e-05, "loss": 0.1704, "step": 1431 }, { "epoch": 1.3210332103321032, "grad_norm": 0.44274122553698614, "learning_rate": 3.437456601237943e-05, "loss": 0.1593, "step": 1432 }, { "epoch": 1.3219557195571956, "grad_norm": 0.36275017892653194, "learning_rate": 3.4349677185842245e-05, "loss": 0.1534, "step": 1433 }, { "epoch": 1.3228782287822878, "grad_norm": 0.407549047074853, "learning_rate": 3.4324777581069356e-05, "loss": 0.1761, "step": 1434 }, { "epoch": 1.32380073800738, "grad_norm": 0.4293837842471811, "learning_rate": 3.4299867226764845e-05, "loss": 0.1693, "step": 1435 }, { "epoch": 1.3247232472324724, "grad_norm": 0.39258990949042727, "learning_rate": 3.427494615164518e-05, "loss": 0.1784, "step": 1436 }, { "epoch": 1.3256457564575646, "grad_norm": 0.3650336270383585, "learning_rate": 3.4250014384439175e-05, "loss": 0.1678, "step": 1437 }, { "epoch": 1.3265682656826567, "grad_norm": 0.3894315314074764, "learning_rate": 3.4225071953887976e-05, "loss": 0.1934, "step": 1438 }, { "epoch": 1.3274907749077491, "grad_norm": 0.36263806255056463, "learning_rate": 3.4200118888745045e-05, "loss": 0.1529, "step": 1439 }, { "epoch": 1.3284132841328413, "grad_norm": 0.4263699101442625, "learning_rate": 3.4175155217776055e-05, "loss": 0.1602, "step": 1440 }, { "epoch": 1.3293357933579335, "grad_norm": 0.3239873105222901, "learning_rate": 3.415018096975895e-05, "loss": 0.1504, "step": 1441 }, { "epoch": 1.330258302583026, "grad_norm": 0.41428571031323863, "learning_rate": 3.412519617348384e-05, "loss": 0.1554, "step": 1442 }, { "epoch": 1.331180811808118, "grad_norm": 0.35636351141470096, "learning_rate": 3.4100200857753026e-05, "loss": 0.1511, "step": 1443 }, { "epoch": 1.3321033210332103, "grad_norm": 0.4468881576942684, "learning_rate": 3.407519505138089e-05, "loss": 0.1688, "step": 1444 }, { "epoch": 1.3330258302583027, "grad_norm": 0.3632320728678713, "learning_rate": 3.4050178783193945e-05, "loss": 0.187, "step": 1445 }, { "epoch": 1.3339483394833949, "grad_norm": 0.40674020966682184, "learning_rate": 3.402515208203076e-05, "loss": 0.166, "step": 1446 }, { "epoch": 1.334870848708487, "grad_norm": 0.3535751510672735, "learning_rate": 3.4000114976741906e-05, "loss": 0.1536, "step": 1447 }, { "epoch": 1.3357933579335795, "grad_norm": 0.39981031351083296, "learning_rate": 3.3975067496189965e-05, "loss": 0.2023, "step": 1448 }, { "epoch": 1.3367158671586716, "grad_norm": 0.3868102100726796, "learning_rate": 3.3950009669249497e-05, "loss": 0.1578, "step": 1449 }, { "epoch": 1.3376383763837638, "grad_norm": 0.3707430060161426, "learning_rate": 3.392494152480696e-05, "loss": 0.1654, "step": 1450 }, { "epoch": 1.3385608856088562, "grad_norm": 0.3418089472071561, "learning_rate": 3.3899863091760715e-05, "loss": 0.1735, "step": 1451 }, { "epoch": 1.3394833948339484, "grad_norm": 0.4074415047811184, "learning_rate": 3.387477439902099e-05, "loss": 0.1602, "step": 1452 }, { "epoch": 1.3404059040590406, "grad_norm": 0.39756053217292886, "learning_rate": 3.384967547550984e-05, "loss": 0.1507, "step": 1453 }, { "epoch": 1.3413284132841328, "grad_norm": 0.4054167651529612, "learning_rate": 3.38245663501611e-05, "loss": 0.1834, "step": 1454 }, { "epoch": 1.3422509225092252, "grad_norm": 0.3741082213467395, "learning_rate": 3.379944705192039e-05, "loss": 0.1724, "step": 1455 }, { "epoch": 1.3431734317343174, "grad_norm": 0.3731975734916343, "learning_rate": 3.377431760974503e-05, "loss": 0.1818, "step": 1456 }, { "epoch": 1.3440959409594095, "grad_norm": 0.4148220352374969, "learning_rate": 3.3749178052604045e-05, "loss": 0.1827, "step": 1457 }, { "epoch": 1.3450184501845017, "grad_norm": 0.378581946781744, "learning_rate": 3.372402840947814e-05, "loss": 0.1557, "step": 1458 }, { "epoch": 1.3459409594095941, "grad_norm": 0.3937199976821091, "learning_rate": 3.3698868709359616e-05, "loss": 0.175, "step": 1459 }, { "epoch": 1.3468634686346863, "grad_norm": 0.38797450109320764, "learning_rate": 3.367369898125238e-05, "loss": 0.166, "step": 1460 }, { "epoch": 1.3477859778597785, "grad_norm": 0.4216007726359451, "learning_rate": 3.364851925417191e-05, "loss": 0.192, "step": 1461 }, { "epoch": 1.348708487084871, "grad_norm": 0.34470152781883956, "learning_rate": 3.362332955714519e-05, "loss": 0.1545, "step": 1462 }, { "epoch": 1.349630996309963, "grad_norm": 0.36583757500717157, "learning_rate": 3.359812991921072e-05, "loss": 0.1602, "step": 1463 }, { "epoch": 1.3505535055350553, "grad_norm": 0.5255217283137901, "learning_rate": 3.357292036941844e-05, "loss": 0.1775, "step": 1464 }, { "epoch": 1.3514760147601477, "grad_norm": 0.38245817490974543, "learning_rate": 3.3547700936829726e-05, "loss": 0.1739, "step": 1465 }, { "epoch": 1.3523985239852399, "grad_norm": 0.4061122342060611, "learning_rate": 3.352247165051734e-05, "loss": 0.1725, "step": 1466 }, { "epoch": 1.353321033210332, "grad_norm": 0.3498602144068321, "learning_rate": 3.349723253956542e-05, "loss": 0.188, "step": 1467 }, { "epoch": 1.3542435424354244, "grad_norm": 0.42855975241480126, "learning_rate": 3.347198363306942e-05, "loss": 0.1916, "step": 1468 }, { "epoch": 1.3551660516605166, "grad_norm": 0.38571087950309607, "learning_rate": 3.344672496013606e-05, "loss": 0.1716, "step": 1469 }, { "epoch": 1.3560885608856088, "grad_norm": 0.3909638801847444, "learning_rate": 3.3421456549883366e-05, "loss": 0.1714, "step": 1470 }, { "epoch": 1.3570110701107012, "grad_norm": 0.3983408278731454, "learning_rate": 3.339617843144057e-05, "loss": 0.1637, "step": 1471 }, { "epoch": 1.3579335793357934, "grad_norm": 0.40774414174127693, "learning_rate": 3.337089063394807e-05, "loss": 0.1554, "step": 1472 }, { "epoch": 1.3588560885608856, "grad_norm": 0.38286568434794654, "learning_rate": 3.334559318655746e-05, "loss": 0.1706, "step": 1473 }, { "epoch": 1.359778597785978, "grad_norm": 0.41029740965251404, "learning_rate": 3.3320286118431444e-05, "loss": 0.1837, "step": 1474 }, { "epoch": 1.3607011070110702, "grad_norm": 0.3522151177885343, "learning_rate": 3.32949694587438e-05, "loss": 0.1641, "step": 1475 }, { "epoch": 1.3616236162361623, "grad_norm": 0.41040542329855806, "learning_rate": 3.3269643236679384e-05, "loss": 0.1901, "step": 1476 }, { "epoch": 1.3625461254612545, "grad_norm": 0.3986149445543998, "learning_rate": 3.324430748143409e-05, "loss": 0.1799, "step": 1477 }, { "epoch": 1.363468634686347, "grad_norm": 0.3231908952222806, "learning_rate": 3.321896222221475e-05, "loss": 0.1502, "step": 1478 }, { "epoch": 1.3643911439114391, "grad_norm": 0.3585100260807637, "learning_rate": 3.3193607488239196e-05, "loss": 0.1691, "step": 1479 }, { "epoch": 1.3653136531365313, "grad_norm": 0.3724697410027662, "learning_rate": 3.3168243308736174e-05, "loss": 0.1649, "step": 1480 }, { "epoch": 1.3662361623616235, "grad_norm": 0.43836765114750026, "learning_rate": 3.3142869712945314e-05, "loss": 0.1949, "step": 1481 }, { "epoch": 1.367158671586716, "grad_norm": 0.38698908313357655, "learning_rate": 3.311748673011709e-05, "loss": 0.1785, "step": 1482 }, { "epoch": 1.368081180811808, "grad_norm": 0.40085667062326424, "learning_rate": 3.3092094389512815e-05, "loss": 0.1706, "step": 1483 }, { "epoch": 1.3690036900369003, "grad_norm": 0.4215350852247018, "learning_rate": 3.306669272040459e-05, "loss": 0.1605, "step": 1484 }, { "epoch": 1.3699261992619927, "grad_norm": 0.38755558872265455, "learning_rate": 3.304128175207526e-05, "loss": 0.177, "step": 1485 }, { "epoch": 1.3708487084870848, "grad_norm": 0.33245606825587154, "learning_rate": 3.301586151381839e-05, "loss": 0.1584, "step": 1486 }, { "epoch": 1.371771217712177, "grad_norm": 0.3968505143633735, "learning_rate": 3.2990432034938235e-05, "loss": 0.1731, "step": 1487 }, { "epoch": 1.3726937269372694, "grad_norm": 0.3545579887998813, "learning_rate": 3.29649933447497e-05, "loss": 0.1558, "step": 1488 }, { "epoch": 1.3736162361623616, "grad_norm": 0.3538986320266718, "learning_rate": 3.293954547257832e-05, "loss": 0.1609, "step": 1489 }, { "epoch": 1.3745387453874538, "grad_norm": 0.4036471913431026, "learning_rate": 3.2914088447760194e-05, "loss": 0.184, "step": 1490 }, { "epoch": 1.3754612546125462, "grad_norm": 0.39631473739938977, "learning_rate": 3.288862229964198e-05, "loss": 0.1834, "step": 1491 }, { "epoch": 1.3763837638376384, "grad_norm": 0.37679883371092154, "learning_rate": 3.2863147057580875e-05, "loss": 0.1723, "step": 1492 }, { "epoch": 1.3773062730627306, "grad_norm": 0.33910583495721097, "learning_rate": 3.2837662750944535e-05, "loss": 0.1611, "step": 1493 }, { "epoch": 1.378228782287823, "grad_norm": 0.3449252182028951, "learning_rate": 3.281216940911106e-05, "loss": 0.1555, "step": 1494 }, { "epoch": 1.3791512915129152, "grad_norm": 0.40143260634074024, "learning_rate": 3.2786667061469e-05, "loss": 0.198, "step": 1495 }, { "epoch": 1.3800738007380073, "grad_norm": 0.33285579323659764, "learning_rate": 3.276115573741724e-05, "loss": 0.1453, "step": 1496 }, { "epoch": 1.3809963099630997, "grad_norm": 0.36713778474185194, "learning_rate": 3.2735635466365046e-05, "loss": 0.1747, "step": 1497 }, { "epoch": 1.381918819188192, "grad_norm": 0.3583536746177072, "learning_rate": 3.2710106277732e-05, "loss": 0.1672, "step": 1498 }, { "epoch": 1.382841328413284, "grad_norm": 0.3237529683832653, "learning_rate": 3.268456820094794e-05, "loss": 0.1521, "step": 1499 }, { "epoch": 1.3837638376383765, "grad_norm": 0.3432367392597323, "learning_rate": 3.2659021265452974e-05, "loss": 0.1691, "step": 1500 }, { "epoch": 1.3846863468634687, "grad_norm": 0.34058885505884795, "learning_rate": 3.263346550069741e-05, "loss": 0.1648, "step": 1501 }, { "epoch": 1.3856088560885609, "grad_norm": 0.372009222581819, "learning_rate": 3.2607900936141725e-05, "loss": 0.1831, "step": 1502 }, { "epoch": 1.386531365313653, "grad_norm": 0.34954399449943774, "learning_rate": 3.258232760125657e-05, "loss": 0.1474, "step": 1503 }, { "epoch": 1.3874538745387455, "grad_norm": 0.41213354250385276, "learning_rate": 3.255674552552267e-05, "loss": 0.1805, "step": 1504 }, { "epoch": 1.3883763837638377, "grad_norm": 0.3590923607406116, "learning_rate": 3.253115473843086e-05, "loss": 0.1636, "step": 1505 }, { "epoch": 1.3892988929889298, "grad_norm": 0.5195077376564883, "learning_rate": 3.2505555269481993e-05, "loss": 0.1746, "step": 1506 }, { "epoch": 1.390221402214022, "grad_norm": 0.35726683093834866, "learning_rate": 3.247994714818694e-05, "loss": 0.1728, "step": 1507 }, { "epoch": 1.3911439114391144, "grad_norm": 0.3431429130379043, "learning_rate": 3.2454330404066545e-05, "loss": 0.1489, "step": 1508 }, { "epoch": 1.3920664206642066, "grad_norm": 0.3404992287293656, "learning_rate": 3.2428705066651603e-05, "loss": 0.1455, "step": 1509 }, { "epoch": 1.3929889298892988, "grad_norm": 0.4157171836412933, "learning_rate": 3.240307116548279e-05, "loss": 0.1697, "step": 1510 }, { "epoch": 1.3939114391143912, "grad_norm": 0.3851045240063217, "learning_rate": 3.2377428730110684e-05, "loss": 0.1602, "step": 1511 }, { "epoch": 1.3948339483394834, "grad_norm": 0.36563293785621676, "learning_rate": 3.235177779009567e-05, "loss": 0.1466, "step": 1512 }, { "epoch": 1.3957564575645756, "grad_norm": 0.37769315758092786, "learning_rate": 3.2326118375007965e-05, "loss": 0.1444, "step": 1513 }, { "epoch": 1.396678966789668, "grad_norm": 0.404148742775024, "learning_rate": 3.230045051442754e-05, "loss": 0.1512, "step": 1514 }, { "epoch": 1.3976014760147601, "grad_norm": 0.40149150310085274, "learning_rate": 3.227477423794412e-05, "loss": 0.16, "step": 1515 }, { "epoch": 1.3985239852398523, "grad_norm": 0.3652165868891584, "learning_rate": 3.2249089575157095e-05, "loss": 0.1621, "step": 1516 }, { "epoch": 1.3994464944649447, "grad_norm": 0.38068047916450476, "learning_rate": 3.222339655567556e-05, "loss": 0.1552, "step": 1517 }, { "epoch": 1.400369003690037, "grad_norm": 0.42498426136576395, "learning_rate": 3.2197695209118236e-05, "loss": 0.1744, "step": 1518 }, { "epoch": 1.401291512915129, "grad_norm": 0.4101821448145483, "learning_rate": 3.2171985565113415e-05, "loss": 0.1681, "step": 1519 }, { "epoch": 1.4022140221402215, "grad_norm": 0.4070341768264358, "learning_rate": 3.2146267653299e-05, "loss": 0.18, "step": 1520 }, { "epoch": 1.4031365313653137, "grad_norm": 0.3400012868189246, "learning_rate": 3.212054150332239e-05, "loss": 0.1678, "step": 1521 }, { "epoch": 1.4040590405904059, "grad_norm": 0.433022903329026, "learning_rate": 3.209480714484049e-05, "loss": 0.1774, "step": 1522 }, { "epoch": 1.4049815498154983, "grad_norm": 0.31030208242511237, "learning_rate": 3.206906460751968e-05, "loss": 0.1567, "step": 1523 }, { "epoch": 1.4059040590405905, "grad_norm": 0.34001679021223896, "learning_rate": 3.2043313921035743e-05, "loss": 0.159, "step": 1524 }, { "epoch": 1.4068265682656826, "grad_norm": 0.38322630552346637, "learning_rate": 3.201755511507389e-05, "loss": 0.1669, "step": 1525 }, { "epoch": 1.4077490774907748, "grad_norm": 0.3693576434894572, "learning_rate": 3.199178821932865e-05, "loss": 0.1604, "step": 1526 }, { "epoch": 1.4086715867158672, "grad_norm": 0.3995499629249807, "learning_rate": 3.196601326350393e-05, "loss": 0.1676, "step": 1527 }, { "epoch": 1.4095940959409594, "grad_norm": 0.34824438998827817, "learning_rate": 3.194023027731288e-05, "loss": 0.1432, "step": 1528 }, { "epoch": 1.4105166051660516, "grad_norm": 0.36116120388148987, "learning_rate": 3.191443929047793e-05, "loss": 0.169, "step": 1529 }, { "epoch": 1.4114391143911438, "grad_norm": 0.3560442086436447, "learning_rate": 3.188864033273074e-05, "loss": 0.1731, "step": 1530 }, { "epoch": 1.4123616236162362, "grad_norm": 0.3706163750507851, "learning_rate": 3.186283343381213e-05, "loss": 0.1388, "step": 1531 }, { "epoch": 1.4132841328413284, "grad_norm": 0.38187197667421363, "learning_rate": 3.1837018623472116e-05, "loss": 0.1537, "step": 1532 }, { "epoch": 1.4142066420664205, "grad_norm": 0.42580664179707683, "learning_rate": 3.1811195931469804e-05, "loss": 0.1768, "step": 1533 }, { "epoch": 1.415129151291513, "grad_norm": 0.4084666432032316, "learning_rate": 3.178536538757339e-05, "loss": 0.1887, "step": 1534 }, { "epoch": 1.4160516605166051, "grad_norm": 0.35942118774969967, "learning_rate": 3.1759527021560126e-05, "loss": 0.1804, "step": 1535 }, { "epoch": 1.4169741697416973, "grad_norm": 0.3692777759704592, "learning_rate": 3.173368086321629e-05, "loss": 0.1849, "step": 1536 }, { "epoch": 1.4178966789667897, "grad_norm": 0.3636443925361855, "learning_rate": 3.170782694233712e-05, "loss": 0.1743, "step": 1537 }, { "epoch": 1.418819188191882, "grad_norm": 0.40942980389322786, "learning_rate": 3.168196528872682e-05, "loss": 0.1725, "step": 1538 }, { "epoch": 1.419741697416974, "grad_norm": 0.33833653641281153, "learning_rate": 3.165609593219852e-05, "loss": 0.1743, "step": 1539 }, { "epoch": 1.4206642066420665, "grad_norm": 0.34662617508546745, "learning_rate": 3.16302189025742e-05, "loss": 0.1665, "step": 1540 }, { "epoch": 1.4215867158671587, "grad_norm": 0.3837517105262154, "learning_rate": 3.1604334229684705e-05, "loss": 0.1811, "step": 1541 }, { "epoch": 1.4225092250922509, "grad_norm": 0.3660541015601058, "learning_rate": 3.157844194336968e-05, "loss": 0.1662, "step": 1542 }, { "epoch": 1.4234317343173433, "grad_norm": 0.4041655451243474, "learning_rate": 3.1552542073477555e-05, "loss": 0.1661, "step": 1543 }, { "epoch": 1.4243542435424354, "grad_norm": 0.38054326648318904, "learning_rate": 3.1526634649865514e-05, "loss": 0.1596, "step": 1544 }, { "epoch": 1.4252767527675276, "grad_norm": 0.3849710778859087, "learning_rate": 3.150071970239941e-05, "loss": 0.1609, "step": 1545 }, { "epoch": 1.42619926199262, "grad_norm": 0.41310874387916263, "learning_rate": 3.1474797260953806e-05, "loss": 0.1604, "step": 1546 }, { "epoch": 1.4271217712177122, "grad_norm": 0.3824298243642993, "learning_rate": 3.144886735541191e-05, "loss": 0.1414, "step": 1547 }, { "epoch": 1.4280442804428044, "grad_norm": 0.438248106970847, "learning_rate": 3.1422930015665484e-05, "loss": 0.1707, "step": 1548 }, { "epoch": 1.4289667896678968, "grad_norm": 0.47825328997554417, "learning_rate": 3.1396985271614914e-05, "loss": 0.1933, "step": 1549 }, { "epoch": 1.429889298892989, "grad_norm": 0.395617655243006, "learning_rate": 3.13710331531691e-05, "loss": 0.182, "step": 1550 }, { "epoch": 1.4308118081180812, "grad_norm": 0.36187891165428526, "learning_rate": 3.134507369024543e-05, "loss": 0.1464, "step": 1551 }, { "epoch": 1.4317343173431734, "grad_norm": 0.42839944714948797, "learning_rate": 3.13191069127698e-05, "loss": 0.1694, "step": 1552 }, { "epoch": 1.4326568265682658, "grad_norm": 0.38965828762765414, "learning_rate": 3.1293132850676484e-05, "loss": 0.1714, "step": 1553 }, { "epoch": 1.433579335793358, "grad_norm": 0.39447447361198096, "learning_rate": 3.126715153390819e-05, "loss": 0.187, "step": 1554 }, { "epoch": 1.4345018450184501, "grad_norm": 0.357097085573246, "learning_rate": 3.124116299241598e-05, "loss": 0.1798, "step": 1555 }, { "epoch": 1.4354243542435423, "grad_norm": 0.3825572959600671, "learning_rate": 3.1215167256159245e-05, "loss": 0.1611, "step": 1556 }, { "epoch": 1.4363468634686347, "grad_norm": 0.3546728581900947, "learning_rate": 3.118916435510567e-05, "loss": 0.1813, "step": 1557 }, { "epoch": 1.437269372693727, "grad_norm": 0.36315138657808316, "learning_rate": 3.1163154319231194e-05, "loss": 0.1532, "step": 1558 }, { "epoch": 1.438191881918819, "grad_norm": 0.39610627077159327, "learning_rate": 3.1137137178519985e-05, "loss": 0.1717, "step": 1559 }, { "epoch": 1.4391143911439115, "grad_norm": 0.4748758696309284, "learning_rate": 3.111111296296441e-05, "loss": 0.1557, "step": 1560 }, { "epoch": 1.4400369003690037, "grad_norm": 0.3137228098987481, "learning_rate": 3.1085081702564966e-05, "loss": 0.1489, "step": 1561 }, { "epoch": 1.4409594095940959, "grad_norm": 0.40963764410421305, "learning_rate": 3.105904342733032e-05, "loss": 0.1652, "step": 1562 }, { "epoch": 1.4418819188191883, "grad_norm": 0.3559814030662488, "learning_rate": 3.103299816727716e-05, "loss": 0.17, "step": 1563 }, { "epoch": 1.4428044280442804, "grad_norm": 0.4173167640148135, "learning_rate": 3.100694595243028e-05, "loss": 0.1809, "step": 1564 }, { "epoch": 1.4437269372693726, "grad_norm": 0.3214124947242933, "learning_rate": 3.0980886812822474e-05, "loss": 0.1542, "step": 1565 }, { "epoch": 1.444649446494465, "grad_norm": 0.399532478767107, "learning_rate": 3.0954820778494516e-05, "loss": 0.182, "step": 1566 }, { "epoch": 1.4455719557195572, "grad_norm": 0.42170975126734245, "learning_rate": 3.0928747879495115e-05, "loss": 0.182, "step": 1567 }, { "epoch": 1.4464944649446494, "grad_norm": 0.3615743704404705, "learning_rate": 3.0902668145880924e-05, "loss": 0.1593, "step": 1568 }, { "epoch": 1.4474169741697418, "grad_norm": 0.35225646041206127, "learning_rate": 3.0876581607716456e-05, "loss": 0.1579, "step": 1569 }, { "epoch": 1.448339483394834, "grad_norm": 0.39706475767264293, "learning_rate": 3.085048829507406e-05, "loss": 0.1691, "step": 1570 }, { "epoch": 1.4492619926199262, "grad_norm": 0.4822514269767401, "learning_rate": 3.082438823803392e-05, "loss": 0.1897, "step": 1571 }, { "epoch": 1.4501845018450186, "grad_norm": 0.3723450625164562, "learning_rate": 3.079828146668397e-05, "loss": 0.1832, "step": 1572 }, { "epoch": 1.4511070110701108, "grad_norm": 0.4106009436359365, "learning_rate": 3.07721680111199e-05, "loss": 0.1728, "step": 1573 }, { "epoch": 1.452029520295203, "grad_norm": 0.34347681763677174, "learning_rate": 3.07460479014451e-05, "loss": 0.1614, "step": 1574 }, { "epoch": 1.4529520295202953, "grad_norm": 0.381661507373486, "learning_rate": 3.0719921167770624e-05, "loss": 0.1815, "step": 1575 }, { "epoch": 1.4538745387453875, "grad_norm": 0.44435761234773025, "learning_rate": 3.069378784021518e-05, "loss": 0.1875, "step": 1576 }, { "epoch": 1.4547970479704797, "grad_norm": 0.3624200072470951, "learning_rate": 3.066764794890505e-05, "loss": 0.1644, "step": 1577 }, { "epoch": 1.455719557195572, "grad_norm": 0.36808113934931824, "learning_rate": 3.064150152397412e-05, "loss": 0.1501, "step": 1578 }, { "epoch": 1.456642066420664, "grad_norm": 0.3666267965852813, "learning_rate": 3.061534859556377e-05, "loss": 0.1796, "step": 1579 }, { "epoch": 1.4575645756457565, "grad_norm": 0.392902157755771, "learning_rate": 3.0589189193822895e-05, "loss": 0.19, "step": 1580 }, { "epoch": 1.4584870848708487, "grad_norm": 0.3913033345885448, "learning_rate": 3.056302334890786e-05, "loss": 0.1794, "step": 1581 }, { "epoch": 1.4594095940959408, "grad_norm": 0.3496689337140897, "learning_rate": 3.053685109098245e-05, "loss": 0.16, "step": 1582 }, { "epoch": 1.4603321033210332, "grad_norm": 0.39419972511865325, "learning_rate": 3.051067245021783e-05, "loss": 0.1546, "step": 1583 }, { "epoch": 1.4612546125461254, "grad_norm": 0.41736336497176446, "learning_rate": 3.048448745679255e-05, "loss": 0.1972, "step": 1584 }, { "epoch": 1.4621771217712176, "grad_norm": 0.35073426253376633, "learning_rate": 3.045829614089246e-05, "loss": 0.1669, "step": 1585 }, { "epoch": 1.46309963099631, "grad_norm": 0.4021785887416188, "learning_rate": 3.04320985327107e-05, "loss": 0.1731, "step": 1586 }, { "epoch": 1.4640221402214022, "grad_norm": 0.40666184044352294, "learning_rate": 3.040589466244768e-05, "loss": 0.1901, "step": 1587 }, { "epoch": 1.4649446494464944, "grad_norm": 0.3696675854986723, "learning_rate": 3.0379684560311027e-05, "loss": 0.1521, "step": 1588 }, { "epoch": 1.4658671586715868, "grad_norm": 0.38100821412378033, "learning_rate": 3.035346825651552e-05, "loss": 0.1482, "step": 1589 }, { "epoch": 1.466789667896679, "grad_norm": 0.3983505131281807, "learning_rate": 3.0327245781283136e-05, "loss": 0.1712, "step": 1590 }, { "epoch": 1.4677121771217712, "grad_norm": 0.41094545194443866, "learning_rate": 3.0301017164842932e-05, "loss": 0.1786, "step": 1591 }, { "epoch": 1.4686346863468636, "grad_norm": 0.3687323192999056, "learning_rate": 3.027478243743106e-05, "loss": 0.1708, "step": 1592 }, { "epoch": 1.4695571955719557, "grad_norm": 0.343827240699272, "learning_rate": 3.0248541629290693e-05, "loss": 0.1628, "step": 1593 }, { "epoch": 1.470479704797048, "grad_norm": 0.3260547541598996, "learning_rate": 3.0222294770672053e-05, "loss": 0.1657, "step": 1594 }, { "epoch": 1.4714022140221403, "grad_norm": 0.35540004378286827, "learning_rate": 3.0196041891832312e-05, "loss": 0.1344, "step": 1595 }, { "epoch": 1.4723247232472325, "grad_norm": 0.3527802448027834, "learning_rate": 3.0169783023035577e-05, "loss": 0.1693, "step": 1596 }, { "epoch": 1.4732472324723247, "grad_norm": 0.38113471097611723, "learning_rate": 3.0143518194552873e-05, "loss": 0.1703, "step": 1597 }, { "epoch": 1.474169741697417, "grad_norm": 0.35839469193166723, "learning_rate": 3.01172474366621e-05, "loss": 0.1674, "step": 1598 }, { "epoch": 1.4750922509225093, "grad_norm": 0.37083937938017913, "learning_rate": 3.009097077964797e-05, "loss": 0.1568, "step": 1599 }, { "epoch": 1.4760147601476015, "grad_norm": 0.39238812133174916, "learning_rate": 3.0064688253802026e-05, "loss": 0.1711, "step": 1600 }, { "epoch": 1.4769372693726937, "grad_norm": 0.3355525417090908, "learning_rate": 3.0038399889422553e-05, "loss": 0.1507, "step": 1601 }, { "epoch": 1.477859778597786, "grad_norm": 0.36063956430414745, "learning_rate": 3.001210571681457e-05, "loss": 0.1492, "step": 1602 }, { "epoch": 1.4787822878228782, "grad_norm": 0.3710232857863549, "learning_rate": 2.9985805766289817e-05, "loss": 0.1685, "step": 1603 }, { "epoch": 1.4797047970479704, "grad_norm": 0.42301545914956, "learning_rate": 2.995950006816664e-05, "loss": 0.1862, "step": 1604 }, { "epoch": 1.4806273062730626, "grad_norm": 0.43511270029205984, "learning_rate": 2.9933188652770068e-05, "loss": 0.1947, "step": 1605 }, { "epoch": 1.481549815498155, "grad_norm": 0.38886533606243734, "learning_rate": 2.9906871550431697e-05, "loss": 0.1481, "step": 1606 }, { "epoch": 1.4824723247232472, "grad_norm": 0.329252996048036, "learning_rate": 2.988054879148967e-05, "loss": 0.1656, "step": 1607 }, { "epoch": 1.4833948339483394, "grad_norm": 0.3395919414815041, "learning_rate": 2.9854220406288668e-05, "loss": 0.1695, "step": 1608 }, { "epoch": 1.4843173431734318, "grad_norm": 0.3409522992599561, "learning_rate": 2.9827886425179848e-05, "loss": 0.1649, "step": 1609 }, { "epoch": 1.485239852398524, "grad_norm": 0.33811409950873744, "learning_rate": 2.980154687852082e-05, "loss": 0.1569, "step": 1610 }, { "epoch": 1.4861623616236161, "grad_norm": 0.3673414750977099, "learning_rate": 2.977520179667561e-05, "loss": 0.1749, "step": 1611 }, { "epoch": 1.4870848708487086, "grad_norm": 0.40741286834007223, "learning_rate": 2.9748851210014628e-05, "loss": 0.1768, "step": 1612 }, { "epoch": 1.4880073800738007, "grad_norm": 0.33719593043968854, "learning_rate": 2.972249514891462e-05, "loss": 0.1698, "step": 1613 }, { "epoch": 1.488929889298893, "grad_norm": 0.4098015224299873, "learning_rate": 2.9696133643758662e-05, "loss": 0.1781, "step": 1614 }, { "epoch": 1.4898523985239853, "grad_norm": 0.3378128069918369, "learning_rate": 2.966976672493607e-05, "loss": 0.1642, "step": 1615 }, { "epoch": 1.4907749077490775, "grad_norm": 0.3317008949395516, "learning_rate": 2.964339442284245e-05, "loss": 0.1531, "step": 1616 }, { "epoch": 1.4916974169741697, "grad_norm": 0.31973761938103357, "learning_rate": 2.961701676787958e-05, "loss": 0.1463, "step": 1617 }, { "epoch": 1.492619926199262, "grad_norm": 0.40153379072024, "learning_rate": 2.9590633790455413e-05, "loss": 0.1799, "step": 1618 }, { "epoch": 1.4935424354243543, "grad_norm": 0.37354165237734216, "learning_rate": 2.9564245520984047e-05, "loss": 0.1633, "step": 1619 }, { "epoch": 1.4944649446494465, "grad_norm": 0.3082489787747408, "learning_rate": 2.9537851989885667e-05, "loss": 0.1449, "step": 1620 }, { "epoch": 1.4953874538745389, "grad_norm": 0.3370920265749821, "learning_rate": 2.951145322758654e-05, "loss": 0.1447, "step": 1621 }, { "epoch": 1.496309963099631, "grad_norm": 0.38639008934505387, "learning_rate": 2.948504926451896e-05, "loss": 0.1556, "step": 1622 }, { "epoch": 1.4972324723247232, "grad_norm": 0.39041551437746863, "learning_rate": 2.945864013112119e-05, "loss": 0.1731, "step": 1623 }, { "epoch": 1.4981549815498156, "grad_norm": 0.38232994861502023, "learning_rate": 2.943222585783749e-05, "loss": 0.1621, "step": 1624 }, { "epoch": 1.4990774907749078, "grad_norm": 0.38654601080386863, "learning_rate": 2.9405806475118048e-05, "loss": 0.142, "step": 1625 }, { "epoch": 1.5, "grad_norm": 0.33722181119539885, "learning_rate": 2.9379382013418892e-05, "loss": 0.1591, "step": 1626 }, { "epoch": 1.5009225092250924, "grad_norm": 0.4095673988908001, "learning_rate": 2.935295250320196e-05, "loss": 0.1754, "step": 1627 }, { "epoch": 1.5018450184501844, "grad_norm": 0.4197188700101124, "learning_rate": 2.932651797493498e-05, "loss": 0.1686, "step": 1628 }, { "epoch": 1.5027675276752768, "grad_norm": 0.3595654552689153, "learning_rate": 2.9300078459091462e-05, "loss": 0.1708, "step": 1629 }, { "epoch": 1.503690036900369, "grad_norm": 0.36307084098081793, "learning_rate": 2.9273633986150696e-05, "loss": 0.1656, "step": 1630 }, { "epoch": 1.5046125461254611, "grad_norm": 0.383936016864393, "learning_rate": 2.9247184586597648e-05, "loss": 0.1669, "step": 1631 }, { "epoch": 1.5055350553505535, "grad_norm": 0.4118452867071001, "learning_rate": 2.922073029092299e-05, "loss": 0.1715, "step": 1632 }, { "epoch": 1.5064575645756457, "grad_norm": 0.3875727639529233, "learning_rate": 2.9194271129623034e-05, "loss": 0.1759, "step": 1633 }, { "epoch": 1.507380073800738, "grad_norm": 0.31670060362800706, "learning_rate": 2.9167807133199686e-05, "loss": 0.1561, "step": 1634 }, { "epoch": 1.5083025830258303, "grad_norm": 0.4349756602309646, "learning_rate": 2.914133833216045e-05, "loss": 0.1745, "step": 1635 }, { "epoch": 1.5092250922509225, "grad_norm": 0.36721395758123293, "learning_rate": 2.9114864757018352e-05, "loss": 0.1707, "step": 1636 }, { "epoch": 1.5101476014760147, "grad_norm": 0.37569955946034134, "learning_rate": 2.908838643829191e-05, "loss": 0.1665, "step": 1637 }, { "epoch": 1.511070110701107, "grad_norm": 0.3515764064184136, "learning_rate": 2.9061903406505154e-05, "loss": 0.1652, "step": 1638 }, { "epoch": 1.5119926199261993, "grad_norm": 0.4000005503389113, "learning_rate": 2.9035415692187485e-05, "loss": 0.1642, "step": 1639 }, { "epoch": 1.5129151291512914, "grad_norm": 0.3495958649285323, "learning_rate": 2.9008923325873753e-05, "loss": 0.1629, "step": 1640 }, { "epoch": 1.5138376383763839, "grad_norm": 0.4242767363712676, "learning_rate": 2.8982426338104168e-05, "loss": 0.1856, "step": 1641 }, { "epoch": 1.514760147601476, "grad_norm": 0.39603614592949704, "learning_rate": 2.8955924759424225e-05, "loss": 0.1546, "step": 1642 }, { "epoch": 1.5156826568265682, "grad_norm": 0.3321707220729133, "learning_rate": 2.8929418620384753e-05, "loss": 0.1562, "step": 1643 }, { "epoch": 1.5166051660516606, "grad_norm": 0.39501612838057204, "learning_rate": 2.8902907951541834e-05, "loss": 0.1787, "step": 1644 }, { "epoch": 1.5175276752767528, "grad_norm": 0.4115223591573108, "learning_rate": 2.887639278345674e-05, "loss": 0.159, "step": 1645 }, { "epoch": 1.518450184501845, "grad_norm": 0.3952771894733965, "learning_rate": 2.8849873146695972e-05, "loss": 0.1739, "step": 1646 }, { "epoch": 1.5193726937269374, "grad_norm": 0.35382488689126607, "learning_rate": 2.882334907183115e-05, "loss": 0.1742, "step": 1647 }, { "epoch": 1.5202952029520294, "grad_norm": 0.35980398974927635, "learning_rate": 2.8796820589439027e-05, "loss": 0.1663, "step": 1648 }, { "epoch": 1.5212177121771218, "grad_norm": 0.351634834990234, "learning_rate": 2.877028773010144e-05, "loss": 0.163, "step": 1649 }, { "epoch": 1.5221402214022142, "grad_norm": 0.36216094534186977, "learning_rate": 2.8743750524405254e-05, "loss": 0.1537, "step": 1650 }, { "epoch": 1.5230627306273061, "grad_norm": 0.3820723737397306, "learning_rate": 2.8717209002942357e-05, "loss": 0.1797, "step": 1651 }, { "epoch": 1.5239852398523985, "grad_norm": 0.36131648309341774, "learning_rate": 2.8690663196309615e-05, "loss": 0.1567, "step": 1652 }, { "epoch": 1.524907749077491, "grad_norm": 0.4146918598402644, "learning_rate": 2.866411313510882e-05, "loss": 0.1655, "step": 1653 }, { "epoch": 1.525830258302583, "grad_norm": 0.3872220874047178, "learning_rate": 2.863755884994669e-05, "loss": 0.1684, "step": 1654 }, { "epoch": 1.5267527675276753, "grad_norm": 0.3488782618904229, "learning_rate": 2.8611000371434794e-05, "loss": 0.1621, "step": 1655 }, { "epoch": 1.5276752767527675, "grad_norm": 0.3513312290077386, "learning_rate": 2.8584437730189534e-05, "loss": 0.148, "step": 1656 }, { "epoch": 1.5285977859778597, "grad_norm": 0.4150137776028713, "learning_rate": 2.8557870956832132e-05, "loss": 0.1728, "step": 1657 }, { "epoch": 1.529520295202952, "grad_norm": 0.4118880306886742, "learning_rate": 2.853130008198855e-05, "loss": 0.1669, "step": 1658 }, { "epoch": 1.5304428044280443, "grad_norm": 0.39900580456854273, "learning_rate": 2.850472513628948e-05, "loss": 0.1645, "step": 1659 }, { "epoch": 1.5313653136531364, "grad_norm": 0.34815868384829984, "learning_rate": 2.8478146150370337e-05, "loss": 0.1551, "step": 1660 }, { "epoch": 1.5322878228782288, "grad_norm": 0.387257729505376, "learning_rate": 2.8451563154871148e-05, "loss": 0.1814, "step": 1661 }, { "epoch": 1.533210332103321, "grad_norm": 0.3766488859307147, "learning_rate": 2.8424976180436596e-05, "loss": 0.1739, "step": 1662 }, { "epoch": 1.5341328413284132, "grad_norm": 0.3858891394420731, "learning_rate": 2.8398385257715942e-05, "loss": 0.178, "step": 1663 }, { "epoch": 1.5350553505535056, "grad_norm": 0.3623152374526288, "learning_rate": 2.8371790417362987e-05, "loss": 0.1736, "step": 1664 }, { "epoch": 1.5359778597785978, "grad_norm": 0.3314968152447182, "learning_rate": 2.8345191690036064e-05, "loss": 0.1512, "step": 1665 }, { "epoch": 1.53690036900369, "grad_norm": 0.42404689265015627, "learning_rate": 2.8318589106397987e-05, "loss": 0.1886, "step": 1666 }, { "epoch": 1.5378228782287824, "grad_norm": 0.36779101131292224, "learning_rate": 2.8291982697115986e-05, "loss": 0.171, "step": 1667 }, { "epoch": 1.5387453874538746, "grad_norm": 0.4139048273758788, "learning_rate": 2.826537249286176e-05, "loss": 0.1883, "step": 1668 }, { "epoch": 1.5396678966789668, "grad_norm": 0.388458597327605, "learning_rate": 2.8238758524311314e-05, "loss": 0.1693, "step": 1669 }, { "epoch": 1.5405904059040592, "grad_norm": 0.37003305542652387, "learning_rate": 2.821214082214504e-05, "loss": 0.1662, "step": 1670 }, { "epoch": 1.5415129151291513, "grad_norm": 0.41708058167998435, "learning_rate": 2.8185519417047624e-05, "loss": 0.181, "step": 1671 }, { "epoch": 1.5424354243542435, "grad_norm": 0.3875713861998777, "learning_rate": 2.8158894339708004e-05, "loss": 0.1732, "step": 1672 }, { "epoch": 1.543357933579336, "grad_norm": 0.3953949690930639, "learning_rate": 2.813226562081938e-05, "loss": 0.176, "step": 1673 }, { "epoch": 1.5442804428044279, "grad_norm": 0.33502194674439734, "learning_rate": 2.8105633291079116e-05, "loss": 0.1546, "step": 1674 }, { "epoch": 1.5452029520295203, "grad_norm": 0.3262775436675679, "learning_rate": 2.807899738118876e-05, "loss": 0.1438, "step": 1675 }, { "epoch": 1.5461254612546127, "grad_norm": 0.3862535028697546, "learning_rate": 2.8052357921854e-05, "loss": 0.1779, "step": 1676 }, { "epoch": 1.5470479704797047, "grad_norm": 0.36875700215567225, "learning_rate": 2.802571494378458e-05, "loss": 0.159, "step": 1677 }, { "epoch": 1.547970479704797, "grad_norm": 0.36609032887191695, "learning_rate": 2.799906847769433e-05, "loss": 0.1624, "step": 1678 }, { "epoch": 1.5488929889298892, "grad_norm": 0.3719412064017298, "learning_rate": 2.7972418554301084e-05, "loss": 0.156, "step": 1679 }, { "epoch": 1.5498154981549814, "grad_norm": 0.3968781036045197, "learning_rate": 2.794576520432666e-05, "loss": 0.167, "step": 1680 }, { "epoch": 1.5507380073800738, "grad_norm": 0.4222511906972072, "learning_rate": 2.791910845849686e-05, "loss": 0.1975, "step": 1681 }, { "epoch": 1.551660516605166, "grad_norm": 0.38799129053194403, "learning_rate": 2.7892448347541354e-05, "loss": 0.1696, "step": 1682 }, { "epoch": 1.5525830258302582, "grad_norm": 0.3317047677083327, "learning_rate": 2.7865784902193714e-05, "loss": 0.1732, "step": 1683 }, { "epoch": 1.5535055350553506, "grad_norm": 0.3343018202348186, "learning_rate": 2.7839118153191362e-05, "loss": 0.1633, "step": 1684 }, { "epoch": 1.5544280442804428, "grad_norm": 0.3947024735698901, "learning_rate": 2.781244813127552e-05, "loss": 0.1719, "step": 1685 }, { "epoch": 1.555350553505535, "grad_norm": 0.35244868140225677, "learning_rate": 2.7785774867191172e-05, "loss": 0.1691, "step": 1686 }, { "epoch": 1.5562730627306274, "grad_norm": 0.3439655799420121, "learning_rate": 2.775909839168706e-05, "loss": 0.1827, "step": 1687 }, { "epoch": 1.5571955719557196, "grad_norm": 0.3660175861653934, "learning_rate": 2.7732418735515627e-05, "loss": 0.1504, "step": 1688 }, { "epoch": 1.5581180811808117, "grad_norm": 0.35395738515545716, "learning_rate": 2.7705735929432953e-05, "loss": 0.1573, "step": 1689 }, { "epoch": 1.5590405904059041, "grad_norm": 0.34896194637545036, "learning_rate": 2.7679050004198787e-05, "loss": 0.1609, "step": 1690 }, { "epoch": 1.5599630996309963, "grad_norm": 0.362312918114885, "learning_rate": 2.7652360990576453e-05, "loss": 0.1675, "step": 1691 }, { "epoch": 1.5608856088560885, "grad_norm": 0.3919764217015699, "learning_rate": 2.762566891933285e-05, "loss": 0.1785, "step": 1692 }, { "epoch": 1.561808118081181, "grad_norm": 0.39902352613928777, "learning_rate": 2.7598973821238365e-05, "loss": 0.1535, "step": 1693 }, { "epoch": 1.562730627306273, "grad_norm": 0.33741091519673283, "learning_rate": 2.7572275727066927e-05, "loss": 0.1507, "step": 1694 }, { "epoch": 1.5636531365313653, "grad_norm": 0.3704980091260743, "learning_rate": 2.754557466759589e-05, "loss": 0.1626, "step": 1695 }, { "epoch": 1.5645756457564577, "grad_norm": 0.3923587627762875, "learning_rate": 2.751887067360601e-05, "loss": 0.1694, "step": 1696 }, { "epoch": 1.5654981549815496, "grad_norm": 0.3725058521166386, "learning_rate": 2.7492163775881475e-05, "loss": 0.1638, "step": 1697 }, { "epoch": 1.566420664206642, "grad_norm": 0.3538882141932905, "learning_rate": 2.746545400520977e-05, "loss": 0.1562, "step": 1698 }, { "epoch": 1.5673431734317345, "grad_norm": 0.3582084206694265, "learning_rate": 2.7438741392381705e-05, "loss": 0.1666, "step": 1699 }, { "epoch": 1.5682656826568264, "grad_norm": 0.3325709438632519, "learning_rate": 2.74120259681914e-05, "loss": 0.1598, "step": 1700 }, { "epoch": 1.5691881918819188, "grad_norm": 0.3600940625464994, "learning_rate": 2.7385307763436168e-05, "loss": 0.1576, "step": 1701 }, { "epoch": 1.5701107011070112, "grad_norm": 0.3303856865581989, "learning_rate": 2.7358586808916557e-05, "loss": 0.15, "step": 1702 }, { "epoch": 1.5710332103321032, "grad_norm": 0.36761502472329155, "learning_rate": 2.733186313543628e-05, "loss": 0.1616, "step": 1703 }, { "epoch": 1.5719557195571956, "grad_norm": 0.3488967311542213, "learning_rate": 2.730513677380218e-05, "loss": 0.1602, "step": 1704 }, { "epoch": 1.5728782287822878, "grad_norm": 0.3744373709817195, "learning_rate": 2.7278407754824194e-05, "loss": 0.1653, "step": 1705 }, { "epoch": 1.57380073800738, "grad_norm": 0.3426693420939731, "learning_rate": 2.7251676109315338e-05, "loss": 0.1576, "step": 1706 }, { "epoch": 1.5747232472324724, "grad_norm": 0.3504742590283337, "learning_rate": 2.7224941868091643e-05, "loss": 0.1523, "step": 1707 }, { "epoch": 1.5756457564575646, "grad_norm": 0.30178230668808714, "learning_rate": 2.7198205061972132e-05, "loss": 0.1482, "step": 1708 }, { "epoch": 1.5765682656826567, "grad_norm": 0.42439136751990975, "learning_rate": 2.7171465721778787e-05, "loss": 0.174, "step": 1709 }, { "epoch": 1.5774907749077491, "grad_norm": 0.37755044976708263, "learning_rate": 2.7144723878336524e-05, "loss": 0.1531, "step": 1710 }, { "epoch": 1.5784132841328413, "grad_norm": 0.3686743204129699, "learning_rate": 2.711797956247313e-05, "loss": 0.1628, "step": 1711 }, { "epoch": 1.5793357933579335, "grad_norm": 0.4007196085048192, "learning_rate": 2.7091232805019235e-05, "loss": 0.172, "step": 1712 }, { "epoch": 1.580258302583026, "grad_norm": 0.36940706989551075, "learning_rate": 2.7064483636808313e-05, "loss": 0.1503, "step": 1713 }, { "epoch": 1.581180811808118, "grad_norm": 0.3717310421409479, "learning_rate": 2.7037732088676582e-05, "loss": 0.1609, "step": 1714 }, { "epoch": 1.5821033210332103, "grad_norm": 0.3728801357325129, "learning_rate": 2.7010978191463025e-05, "loss": 0.1649, "step": 1715 }, { "epoch": 1.5830258302583027, "grad_norm": 0.3752575101090936, "learning_rate": 2.698422197600934e-05, "loss": 0.1821, "step": 1716 }, { "epoch": 1.5839483394833949, "grad_norm": 0.4099370798173003, "learning_rate": 2.695746347315987e-05, "loss": 0.1767, "step": 1717 }, { "epoch": 1.584870848708487, "grad_norm": 0.3803739879623954, "learning_rate": 2.6930702713761612e-05, "loss": 0.1739, "step": 1718 }, { "epoch": 1.5857933579335795, "grad_norm": 0.4213106506726271, "learning_rate": 2.6903939728664174e-05, "loss": 0.1786, "step": 1719 }, { "epoch": 1.5867158671586716, "grad_norm": 0.3525768768348475, "learning_rate": 2.6877174548719706e-05, "loss": 0.1598, "step": 1720 }, { "epoch": 1.5876383763837638, "grad_norm": 0.3965576248495974, "learning_rate": 2.6850407204782912e-05, "loss": 0.1742, "step": 1721 }, { "epoch": 1.5885608856088562, "grad_norm": 0.32956801808293157, "learning_rate": 2.6823637727710972e-05, "loss": 0.1397, "step": 1722 }, { "epoch": 1.5894833948339482, "grad_norm": 0.4421425101053271, "learning_rate": 2.6796866148363538e-05, "loss": 0.1559, "step": 1723 }, { "epoch": 1.5904059040590406, "grad_norm": 0.39009858519793944, "learning_rate": 2.677009249760268e-05, "loss": 0.1708, "step": 1724 }, { "epoch": 1.591328413284133, "grad_norm": 0.40742855451442184, "learning_rate": 2.674331680629284e-05, "loss": 0.1715, "step": 1725 }, { "epoch": 1.592250922509225, "grad_norm": 0.3891052404705072, "learning_rate": 2.6716539105300853e-05, "loss": 0.1639, "step": 1726 }, { "epoch": 1.5931734317343174, "grad_norm": 0.31948820814791407, "learning_rate": 2.668975942549583e-05, "loss": 0.151, "step": 1727 }, { "epoch": 1.5940959409594095, "grad_norm": 0.3722932165709747, "learning_rate": 2.6662977797749178e-05, "loss": 0.1779, "step": 1728 }, { "epoch": 1.5950184501845017, "grad_norm": 0.36509476764263094, "learning_rate": 2.663619425293456e-05, "loss": 0.1639, "step": 1729 }, { "epoch": 1.5959409594095941, "grad_norm": 0.3858310656003099, "learning_rate": 2.6609408821927838e-05, "loss": 0.1693, "step": 1730 }, { "epoch": 1.5968634686346863, "grad_norm": 0.3331195088745354, "learning_rate": 2.6582621535607043e-05, "loss": 0.1581, "step": 1731 }, { "epoch": 1.5977859778597785, "grad_norm": 0.3405081382004149, "learning_rate": 2.655583242485236e-05, "loss": 0.1595, "step": 1732 }, { "epoch": 1.598708487084871, "grad_norm": 0.3574851821043332, "learning_rate": 2.652904152054607e-05, "loss": 0.1621, "step": 1733 }, { "epoch": 1.599630996309963, "grad_norm": 0.381571319987199, "learning_rate": 2.650224885357251e-05, "loss": 0.1676, "step": 1734 }, { "epoch": 1.6005535055350553, "grad_norm": 0.35821102821722783, "learning_rate": 2.6475454454818073e-05, "loss": 0.1557, "step": 1735 }, { "epoch": 1.6014760147601477, "grad_norm": 0.3764251524627297, "learning_rate": 2.6448658355171125e-05, "loss": 0.1638, "step": 1736 }, { "epoch": 1.6023985239852399, "grad_norm": 0.38347245125149365, "learning_rate": 2.6421860585522e-05, "loss": 0.1776, "step": 1737 }, { "epoch": 1.603321033210332, "grad_norm": 0.3768996262490828, "learning_rate": 2.6395061176762976e-05, "loss": 0.1597, "step": 1738 }, { "epoch": 1.6042435424354244, "grad_norm": 0.34816471747517486, "learning_rate": 2.6368260159788195e-05, "loss": 0.1551, "step": 1739 }, { "epoch": 1.6051660516605166, "grad_norm": 0.35597562064113825, "learning_rate": 2.6341457565493654e-05, "loss": 0.1668, "step": 1740 }, { "epoch": 1.6060885608856088, "grad_norm": 0.4190496695256457, "learning_rate": 2.6314653424777193e-05, "loss": 0.1706, "step": 1741 }, { "epoch": 1.6070110701107012, "grad_norm": 0.3576242465558538, "learning_rate": 2.628784776853841e-05, "loss": 0.1654, "step": 1742 }, { "epoch": 1.6079335793357934, "grad_norm": 0.34269936127558054, "learning_rate": 2.6261040627678655e-05, "loss": 0.1607, "step": 1743 }, { "epoch": 1.6088560885608856, "grad_norm": 0.36752172633589664, "learning_rate": 2.6234232033101e-05, "loss": 0.1661, "step": 1744 }, { "epoch": 1.609778597785978, "grad_norm": 0.3524255109254831, "learning_rate": 2.620742201571018e-05, "loss": 0.1513, "step": 1745 }, { "epoch": 1.6107011070110702, "grad_norm": 0.33457956119732485, "learning_rate": 2.6180610606412587e-05, "loss": 0.1676, "step": 1746 }, { "epoch": 1.6116236162361623, "grad_norm": 0.36581897070107644, "learning_rate": 2.615379783611619e-05, "loss": 0.1722, "step": 1747 }, { "epoch": 1.6125461254612548, "grad_norm": 0.3775378029415688, "learning_rate": 2.612698373573056e-05, "loss": 0.1726, "step": 1748 }, { "epoch": 1.6134686346863467, "grad_norm": 0.36484098955173416, "learning_rate": 2.610016833616678e-05, "loss": 0.1632, "step": 1749 }, { "epoch": 1.6143911439114391, "grad_norm": 0.37034868726475717, "learning_rate": 2.6073351668337425e-05, "loss": 0.1483, "step": 1750 }, { "epoch": 1.6153136531365315, "grad_norm": 0.3509729304946948, "learning_rate": 2.6046533763156556e-05, "loss": 0.1478, "step": 1751 }, { "epoch": 1.6162361623616235, "grad_norm": 0.3491459694747171, "learning_rate": 2.6019714651539646e-05, "loss": 0.1492, "step": 1752 }, { "epoch": 1.617158671586716, "grad_norm": 0.43367934165324495, "learning_rate": 2.599289436440355e-05, "loss": 0.174, "step": 1753 }, { "epoch": 1.618081180811808, "grad_norm": 0.3929404569119107, "learning_rate": 2.5966072932666496e-05, "loss": 0.1559, "step": 1754 }, { "epoch": 1.6190036900369003, "grad_norm": 0.3504889879631341, "learning_rate": 2.593925038724802e-05, "loss": 0.1624, "step": 1755 }, { "epoch": 1.6199261992619927, "grad_norm": 0.3530763221610121, "learning_rate": 2.5912426759068942e-05, "loss": 0.1628, "step": 1756 }, { "epoch": 1.6208487084870848, "grad_norm": 0.35679727055157445, "learning_rate": 2.5885602079051353e-05, "loss": 0.1646, "step": 1757 }, { "epoch": 1.621771217712177, "grad_norm": 0.39285731384307976, "learning_rate": 2.585877637811851e-05, "loss": 0.1542, "step": 1758 }, { "epoch": 1.6226937269372694, "grad_norm": 0.379985099892189, "learning_rate": 2.5831949687194896e-05, "loss": 0.151, "step": 1759 }, { "epoch": 1.6236162361623616, "grad_norm": 0.4129181801521749, "learning_rate": 2.5805122037206093e-05, "loss": 0.1825, "step": 1760 }, { "epoch": 1.6245387453874538, "grad_norm": 0.403755626049169, "learning_rate": 2.5778293459078828e-05, "loss": 0.1549, "step": 1761 }, { "epoch": 1.6254612546125462, "grad_norm": 0.3828008999461262, "learning_rate": 2.575146398374087e-05, "loss": 0.1591, "step": 1762 }, { "epoch": 1.6263837638376384, "grad_norm": 0.38324101709104025, "learning_rate": 2.5724633642121025e-05, "loss": 0.1548, "step": 1763 }, { "epoch": 1.6273062730627306, "grad_norm": 0.3695771356000055, "learning_rate": 2.5697802465149117e-05, "loss": 0.1537, "step": 1764 }, { "epoch": 1.628228782287823, "grad_norm": 0.36402826938395866, "learning_rate": 2.5670970483755912e-05, "loss": 0.1593, "step": 1765 }, { "epoch": 1.6291512915129152, "grad_norm": 0.367222780146665, "learning_rate": 2.5644137728873107e-05, "loss": 0.1615, "step": 1766 }, { "epoch": 1.6300738007380073, "grad_norm": 0.3175180820245255, "learning_rate": 2.5617304231433305e-05, "loss": 0.1396, "step": 1767 }, { "epoch": 1.6309963099630997, "grad_norm": 0.3579853168539979, "learning_rate": 2.559047002236995e-05, "loss": 0.1728, "step": 1768 }, { "epoch": 1.631918819188192, "grad_norm": 0.34460467282407875, "learning_rate": 2.5563635132617302e-05, "loss": 0.1606, "step": 1769 }, { "epoch": 1.632841328413284, "grad_norm": 0.3523090029542282, "learning_rate": 2.553679959311044e-05, "loss": 0.17, "step": 1770 }, { "epoch": 1.6337638376383765, "grad_norm": 0.34425584939112036, "learning_rate": 2.550996343478514e-05, "loss": 0.169, "step": 1771 }, { "epoch": 1.6346863468634685, "grad_norm": 0.36165025369113873, "learning_rate": 2.5483126688577926e-05, "loss": 0.1629, "step": 1772 }, { "epoch": 1.6356088560885609, "grad_norm": 0.35481197568027084, "learning_rate": 2.5456289385426e-05, "loss": 0.1363, "step": 1773 }, { "epoch": 1.6365313653136533, "grad_norm": 0.34246971926457204, "learning_rate": 2.5429451556267187e-05, "loss": 0.1535, "step": 1774 }, { "epoch": 1.6374538745387452, "grad_norm": 0.3726840562658278, "learning_rate": 2.5402613232039934e-05, "loss": 0.1519, "step": 1775 }, { "epoch": 1.6383763837638377, "grad_norm": 0.38379172397334155, "learning_rate": 2.5375774443683265e-05, "loss": 0.1708, "step": 1776 }, { "epoch": 1.6392988929889298, "grad_norm": 0.34258217495610643, "learning_rate": 2.5348935222136704e-05, "loss": 0.1596, "step": 1777 }, { "epoch": 1.640221402214022, "grad_norm": 0.38480369696157934, "learning_rate": 2.5322095598340322e-05, "loss": 0.1459, "step": 1778 }, { "epoch": 1.6411439114391144, "grad_norm": 0.36182050713057723, "learning_rate": 2.529525560323462e-05, "loss": 0.1694, "step": 1779 }, { "epoch": 1.6420664206642066, "grad_norm": 0.38787979484694496, "learning_rate": 2.5268415267760526e-05, "loss": 0.1599, "step": 1780 }, { "epoch": 1.6429889298892988, "grad_norm": 0.3495537975863066, "learning_rate": 2.5241574622859394e-05, "loss": 0.1702, "step": 1781 }, { "epoch": 1.6439114391143912, "grad_norm": 0.3569848539736059, "learning_rate": 2.521473369947289e-05, "loss": 0.1614, "step": 1782 }, { "epoch": 1.6448339483394834, "grad_norm": 0.4155549926988143, "learning_rate": 2.518789252854305e-05, "loss": 0.1982, "step": 1783 }, { "epoch": 1.6457564575645756, "grad_norm": 0.3795448375787088, "learning_rate": 2.516105114101215e-05, "loss": 0.1626, "step": 1784 }, { "epoch": 1.646678966789668, "grad_norm": 0.371239016889001, "learning_rate": 2.5134209567822724e-05, "loss": 0.1607, "step": 1785 }, { "epoch": 1.6476014760147601, "grad_norm": 0.36075383156663865, "learning_rate": 2.510736783991756e-05, "loss": 0.164, "step": 1786 }, { "epoch": 1.6485239852398523, "grad_norm": 0.3735289734675146, "learning_rate": 2.5080525988239574e-05, "loss": 0.16, "step": 1787 }, { "epoch": 1.6494464944649447, "grad_norm": 0.37814164032652253, "learning_rate": 2.5053684043731847e-05, "loss": 0.156, "step": 1788 }, { "epoch": 1.650369003690037, "grad_norm": 0.40747785575746726, "learning_rate": 2.502684203733758e-05, "loss": 0.162, "step": 1789 }, { "epoch": 1.651291512915129, "grad_norm": 0.3248782786503274, "learning_rate": 2.5e-05, "loss": 0.1421, "step": 1790 }, { "epoch": 1.6522140221402215, "grad_norm": 0.37339369747781037, "learning_rate": 2.4973157962662437e-05, "loss": 0.1287, "step": 1791 }, { "epoch": 1.6531365313653137, "grad_norm": 0.391019477959082, "learning_rate": 2.4946315956268156e-05, "loss": 0.1677, "step": 1792 }, { "epoch": 1.6540590405904059, "grad_norm": 0.3579934657099326, "learning_rate": 2.4919474011760432e-05, "loss": 0.1776, "step": 1793 }, { "epoch": 1.6549815498154983, "grad_norm": 0.40890887145853805, "learning_rate": 2.4892632160082448e-05, "loss": 0.1585, "step": 1794 }, { "epoch": 1.6559040590405905, "grad_norm": 0.349869966597724, "learning_rate": 2.486579043217727e-05, "loss": 0.152, "step": 1795 }, { "epoch": 1.6568265682656826, "grad_norm": 0.4295873001981432, "learning_rate": 2.483894885898786e-05, "loss": 0.181, "step": 1796 }, { "epoch": 1.657749077490775, "grad_norm": 0.3512100711226925, "learning_rate": 2.4812107471456954e-05, "loss": 0.1618, "step": 1797 }, { "epoch": 1.658671586715867, "grad_norm": 0.39208911596233437, "learning_rate": 2.4785266300527105e-05, "loss": 0.1915, "step": 1798 }, { "epoch": 1.6595940959409594, "grad_norm": 0.3469507405655631, "learning_rate": 2.4758425377140612e-05, "loss": 0.1613, "step": 1799 }, { "epoch": 1.6605166051660518, "grad_norm": 0.3661258766022745, "learning_rate": 2.4731584732239486e-05, "loss": 0.1479, "step": 1800 }, { "epoch": 1.6614391143911438, "grad_norm": 0.3071602425954102, "learning_rate": 2.470474439676539e-05, "loss": 0.1536, "step": 1801 }, { "epoch": 1.6623616236162362, "grad_norm": 0.34788042054454105, "learning_rate": 2.4677904401659684e-05, "loss": 0.1552, "step": 1802 }, { "epoch": 1.6632841328413284, "grad_norm": 0.4010228125029169, "learning_rate": 2.4651064777863305e-05, "loss": 0.1753, "step": 1803 }, { "epoch": 1.6642066420664205, "grad_norm": 0.4014536564406406, "learning_rate": 2.4624225556316744e-05, "loss": 0.175, "step": 1804 }, { "epoch": 1.665129151291513, "grad_norm": 0.39391368995880854, "learning_rate": 2.4597386767960075e-05, "loss": 0.1469, "step": 1805 }, { "epoch": 1.6660516605166051, "grad_norm": 0.3662948621635896, "learning_rate": 2.4570548443732825e-05, "loss": 0.1672, "step": 1806 }, { "epoch": 1.6669741697416973, "grad_norm": 0.4180461633069466, "learning_rate": 2.4543710614574005e-05, "loss": 0.1689, "step": 1807 }, { "epoch": 1.6678966789667897, "grad_norm": 0.3733317090949078, "learning_rate": 2.4516873311422083e-05, "loss": 0.1637, "step": 1808 }, { "epoch": 1.668819188191882, "grad_norm": 0.37684961403111317, "learning_rate": 2.4490036565214873e-05, "loss": 0.1565, "step": 1809 }, { "epoch": 1.669741697416974, "grad_norm": 0.37196143979647983, "learning_rate": 2.4463200406889562e-05, "loss": 0.1581, "step": 1810 }, { "epoch": 1.6706642066420665, "grad_norm": 0.421690946853156, "learning_rate": 2.44363648673827e-05, "loss": 0.1847, "step": 1811 }, { "epoch": 1.6715867158671587, "grad_norm": 0.37568701071123256, "learning_rate": 2.440952997763005e-05, "loss": 0.1752, "step": 1812 }, { "epoch": 1.6725092250922509, "grad_norm": 0.3557026738053844, "learning_rate": 2.4382695768566697e-05, "loss": 0.1625, "step": 1813 }, { "epoch": 1.6734317343173433, "grad_norm": 0.3917601421523301, "learning_rate": 2.4355862271126896e-05, "loss": 0.1679, "step": 1814 }, { "epoch": 1.6743542435424354, "grad_norm": 0.406327508192895, "learning_rate": 2.432902951624409e-05, "loss": 0.1481, "step": 1815 }, { "epoch": 1.6752767527675276, "grad_norm": 0.3788673145366441, "learning_rate": 2.4302197534850892e-05, "loss": 0.1507, "step": 1816 }, { "epoch": 1.67619926199262, "grad_norm": 0.3531155351857008, "learning_rate": 2.427536635787898e-05, "loss": 0.1639, "step": 1817 }, { "epoch": 1.6771217712177122, "grad_norm": 0.48329291467772933, "learning_rate": 2.4248536016259135e-05, "loss": 0.1714, "step": 1818 }, { "epoch": 1.6780442804428044, "grad_norm": 0.3666417542770587, "learning_rate": 2.4221706540921178e-05, "loss": 0.1761, "step": 1819 }, { "epoch": 1.6789667896678968, "grad_norm": 0.34653859689705124, "learning_rate": 2.4194877962793913e-05, "loss": 0.1801, "step": 1820 }, { "epoch": 1.6798892988929888, "grad_norm": 0.4110613509102009, "learning_rate": 2.416805031280511e-05, "loss": 0.1616, "step": 1821 }, { "epoch": 1.6808118081180812, "grad_norm": 0.36485843700318177, "learning_rate": 2.4141223621881495e-05, "loss": 0.1544, "step": 1822 }, { "epoch": 1.6817343173431736, "grad_norm": 0.31499028962004866, "learning_rate": 2.4114397920948657e-05, "loss": 0.1595, "step": 1823 }, { "epoch": 1.6826568265682655, "grad_norm": 0.40781315168296495, "learning_rate": 2.4087573240931053e-05, "loss": 0.173, "step": 1824 }, { "epoch": 1.683579335793358, "grad_norm": 0.4175211608803691, "learning_rate": 2.4060749612751988e-05, "loss": 0.1805, "step": 1825 }, { "epoch": 1.6845018450184504, "grad_norm": 0.3268684870697504, "learning_rate": 2.4033927067333513e-05, "loss": 0.1805, "step": 1826 }, { "epoch": 1.6854243542435423, "grad_norm": 0.36505231798978677, "learning_rate": 2.4007105635596454e-05, "loss": 0.1626, "step": 1827 }, { "epoch": 1.6863468634686347, "grad_norm": 0.33455639629313166, "learning_rate": 2.3980285348460363e-05, "loss": 0.1715, "step": 1828 }, { "epoch": 1.687269372693727, "grad_norm": 0.29965711634892783, "learning_rate": 2.395346623684345e-05, "loss": 0.1497, "step": 1829 }, { "epoch": 1.688191881918819, "grad_norm": 0.36809149141297764, "learning_rate": 2.3926648331662578e-05, "loss": 0.1728, "step": 1830 }, { "epoch": 1.6891143911439115, "grad_norm": 0.3576239939689782, "learning_rate": 2.389983166383323e-05, "loss": 0.1642, "step": 1831 }, { "epoch": 1.6900369003690037, "grad_norm": 0.3675704061928072, "learning_rate": 2.387301626426944e-05, "loss": 0.1563, "step": 1832 }, { "epoch": 1.6909594095940959, "grad_norm": 0.3913242233199537, "learning_rate": 2.3846202163883807e-05, "loss": 0.1642, "step": 1833 }, { "epoch": 1.6918819188191883, "grad_norm": 0.3540709417362491, "learning_rate": 2.381938939358742e-05, "loss": 0.1621, "step": 1834 }, { "epoch": 1.6928044280442804, "grad_norm": 0.3618416232702187, "learning_rate": 2.3792577984289825e-05, "loss": 0.1787, "step": 1835 }, { "epoch": 1.6937269372693726, "grad_norm": 0.35382274617948284, "learning_rate": 2.3765767966899e-05, "loss": 0.1676, "step": 1836 }, { "epoch": 1.694649446494465, "grad_norm": 0.36513333624772454, "learning_rate": 2.3738959372321347e-05, "loss": 0.1533, "step": 1837 }, { "epoch": 1.6955719557195572, "grad_norm": 0.408832356741295, "learning_rate": 2.37121522314616e-05, "loss": 0.1648, "step": 1838 }, { "epoch": 1.6964944649446494, "grad_norm": 0.41739708583790197, "learning_rate": 2.368534657522281e-05, "loss": 0.1668, "step": 1839 }, { "epoch": 1.6974169741697418, "grad_norm": 0.33318751572141103, "learning_rate": 2.3658542434506352e-05, "loss": 0.1446, "step": 1840 }, { "epoch": 1.698339483394834, "grad_norm": 0.3521329697255817, "learning_rate": 2.3631739840211817e-05, "loss": 0.1613, "step": 1841 }, { "epoch": 1.6992619926199262, "grad_norm": 0.40553912933042213, "learning_rate": 2.3604938823237023e-05, "loss": 0.1772, "step": 1842 }, { "epoch": 1.7001845018450186, "grad_norm": 0.43117642985897287, "learning_rate": 2.3578139414478002e-05, "loss": 0.1779, "step": 1843 }, { "epoch": 1.7011070110701108, "grad_norm": 0.3564227968003786, "learning_rate": 2.3551341644828884e-05, "loss": 0.1712, "step": 1844 }, { "epoch": 1.702029520295203, "grad_norm": 0.3471065097683919, "learning_rate": 2.3524545545181933e-05, "loss": 0.1592, "step": 1845 }, { "epoch": 1.7029520295202953, "grad_norm": 0.3743744653135974, "learning_rate": 2.3497751146427493e-05, "loss": 0.1706, "step": 1846 }, { "epoch": 1.7038745387453873, "grad_norm": 0.35803695186074397, "learning_rate": 2.3470958479453938e-05, "loss": 0.1495, "step": 1847 }, { "epoch": 1.7047970479704797, "grad_norm": 0.3949075012122784, "learning_rate": 2.344416757514764e-05, "loss": 0.1612, "step": 1848 }, { "epoch": 1.7057195571955721, "grad_norm": 0.36782728105119133, "learning_rate": 2.3417378464392963e-05, "loss": 0.1478, "step": 1849 }, { "epoch": 1.706642066420664, "grad_norm": 0.37391347662986596, "learning_rate": 2.339059117807217e-05, "loss": 0.1654, "step": 1850 }, { "epoch": 1.7075645756457565, "grad_norm": 0.39306774002597317, "learning_rate": 2.3363805747065443e-05, "loss": 0.1808, "step": 1851 }, { "epoch": 1.7084870848708487, "grad_norm": 0.38057434121566486, "learning_rate": 2.3337022202250828e-05, "loss": 0.1814, "step": 1852 }, { "epoch": 1.7094095940959408, "grad_norm": 0.3339814040682833, "learning_rate": 2.3310240574504185e-05, "loss": 0.1373, "step": 1853 }, { "epoch": 1.7103321033210332, "grad_norm": 0.3963449827644891, "learning_rate": 2.3283460894699156e-05, "loss": 0.167, "step": 1854 }, { "epoch": 1.7112546125461254, "grad_norm": 0.40301591658685915, "learning_rate": 2.3256683193707166e-05, "loss": 0.1557, "step": 1855 }, { "epoch": 1.7121771217712176, "grad_norm": 0.4226352086554742, "learning_rate": 2.322990750239733e-05, "loss": 0.1625, "step": 1856 }, { "epoch": 1.71309963099631, "grad_norm": 0.34736340879699523, "learning_rate": 2.3203133851636465e-05, "loss": 0.1607, "step": 1857 }, { "epoch": 1.7140221402214022, "grad_norm": 0.3681170700006526, "learning_rate": 2.317636227228903e-05, "loss": 0.1549, "step": 1858 }, { "epoch": 1.7149446494464944, "grad_norm": 0.3443452415949501, "learning_rate": 2.314959279521709e-05, "loss": 0.1591, "step": 1859 }, { "epoch": 1.7158671586715868, "grad_norm": 0.31743163801936963, "learning_rate": 2.3122825451280296e-05, "loss": 0.1608, "step": 1860 }, { "epoch": 1.716789667896679, "grad_norm": 0.44848989979697457, "learning_rate": 2.3096060271335832e-05, "loss": 0.1847, "step": 1861 }, { "epoch": 1.7177121771217712, "grad_norm": 0.36396283944358304, "learning_rate": 2.306929728623839e-05, "loss": 0.1762, "step": 1862 }, { "epoch": 1.7186346863468636, "grad_norm": 0.38223590312959876, "learning_rate": 2.3042536526840134e-05, "loss": 0.1593, "step": 1863 }, { "epoch": 1.7195571955719557, "grad_norm": 0.36568180560510427, "learning_rate": 2.3015778023990667e-05, "loss": 0.1707, "step": 1864 }, { "epoch": 1.720479704797048, "grad_norm": 0.32810773073454375, "learning_rate": 2.2989021808536974e-05, "loss": 0.1597, "step": 1865 }, { "epoch": 1.7214022140221403, "grad_norm": 0.4709336738948193, "learning_rate": 2.296226791132342e-05, "loss": 0.1704, "step": 1866 }, { "epoch": 1.7223247232472325, "grad_norm": 0.34675317136160727, "learning_rate": 2.2935516363191693e-05, "loss": 0.1772, "step": 1867 }, { "epoch": 1.7232472324723247, "grad_norm": 0.3926172439787084, "learning_rate": 2.2908767194980764e-05, "loss": 0.1717, "step": 1868 }, { "epoch": 1.724169741697417, "grad_norm": 0.35368453034298086, "learning_rate": 2.2882020437526873e-05, "loss": 0.1585, "step": 1869 }, { "epoch": 1.725092250922509, "grad_norm": 0.3639008053704692, "learning_rate": 2.2855276121663485e-05, "loss": 0.1644, "step": 1870 }, { "epoch": 1.7260147601476015, "grad_norm": 0.367283149868555, "learning_rate": 2.2828534278221212e-05, "loss": 0.1617, "step": 1871 }, { "epoch": 1.7269372693726939, "grad_norm": 0.35207757734068457, "learning_rate": 2.2801794938027873e-05, "loss": 0.1574, "step": 1872 }, { "epoch": 1.7278597785977858, "grad_norm": 0.3313244822773851, "learning_rate": 2.277505813190837e-05, "loss": 0.1616, "step": 1873 }, { "epoch": 1.7287822878228782, "grad_norm": 0.37018665631685116, "learning_rate": 2.2748323890684665e-05, "loss": 0.1495, "step": 1874 }, { "epoch": 1.7297047970479706, "grad_norm": 0.35511624838409717, "learning_rate": 2.2721592245175812e-05, "loss": 0.1698, "step": 1875 }, { "epoch": 1.7306273062730626, "grad_norm": 0.3674813704404082, "learning_rate": 2.269486322619783e-05, "loss": 0.1663, "step": 1876 }, { "epoch": 1.731549815498155, "grad_norm": 0.32742579310033787, "learning_rate": 2.266813686456372e-05, "loss": 0.1685, "step": 1877 }, { "epoch": 1.7324723247232472, "grad_norm": 0.3827716207681046, "learning_rate": 2.2641413191083445e-05, "loss": 0.1785, "step": 1878 }, { "epoch": 1.7333948339483394, "grad_norm": 0.3530017875613452, "learning_rate": 2.2614692236563838e-05, "loss": 0.1582, "step": 1879 }, { "epoch": 1.7343173431734318, "grad_norm": 0.39018328169038247, "learning_rate": 2.2587974031808608e-05, "loss": 0.151, "step": 1880 }, { "epoch": 1.735239852398524, "grad_norm": 0.37853261835392593, "learning_rate": 2.2561258607618297e-05, "loss": 0.1657, "step": 1881 }, { "epoch": 1.7361623616236161, "grad_norm": 0.35354906387489693, "learning_rate": 2.2534545994790244e-05, "loss": 0.1537, "step": 1882 }, { "epoch": 1.7370848708487086, "grad_norm": 0.3990954409001275, "learning_rate": 2.250783622411853e-05, "loss": 0.1773, "step": 1883 }, { "epoch": 1.7380073800738007, "grad_norm": 0.3944526491031225, "learning_rate": 2.2481129326393992e-05, "loss": 0.1646, "step": 1884 }, { "epoch": 1.738929889298893, "grad_norm": 0.36893109396357665, "learning_rate": 2.2454425332404122e-05, "loss": 0.1664, "step": 1885 }, { "epoch": 1.7398523985239853, "grad_norm": 0.339583638584494, "learning_rate": 2.2427724272933075e-05, "loss": 0.1602, "step": 1886 }, { "epoch": 1.7407749077490775, "grad_norm": 0.34874589566586056, "learning_rate": 2.240102617876164e-05, "loss": 0.1472, "step": 1887 }, { "epoch": 1.7416974169741697, "grad_norm": 0.3593168206981894, "learning_rate": 2.2374331080667164e-05, "loss": 0.1723, "step": 1888 }, { "epoch": 1.742619926199262, "grad_norm": 0.43778282669439506, "learning_rate": 2.2347639009423553e-05, "loss": 0.1772, "step": 1889 }, { "epoch": 1.7435424354243543, "grad_norm": 0.3387112834557388, "learning_rate": 2.2320949995801222e-05, "loss": 0.1544, "step": 1890 }, { "epoch": 1.7444649446494465, "grad_norm": 0.3849249076716214, "learning_rate": 2.2294264070567056e-05, "loss": 0.1736, "step": 1891 }, { "epoch": 1.7453874538745389, "grad_norm": 0.2938543747815116, "learning_rate": 2.2267581264484382e-05, "loss": 0.1334, "step": 1892 }, { "epoch": 1.746309963099631, "grad_norm": 0.3683306778476559, "learning_rate": 2.2240901608312942e-05, "loss": 0.155, "step": 1893 }, { "epoch": 1.7472324723247232, "grad_norm": 0.39697443182198616, "learning_rate": 2.2214225132808837e-05, "loss": 0.1666, "step": 1894 }, { "epoch": 1.7481549815498156, "grad_norm": 0.3604022476778861, "learning_rate": 2.2187551868724485e-05, "loss": 0.1555, "step": 1895 }, { "epoch": 1.7490774907749076, "grad_norm": 0.35881561561350755, "learning_rate": 2.216088184680864e-05, "loss": 0.1464, "step": 1896 }, { "epoch": 1.75, "grad_norm": 0.33670919990207204, "learning_rate": 2.2134215097806295e-05, "loss": 0.1494, "step": 1897 }, { "epoch": 1.7509225092250924, "grad_norm": 0.33737836275487587, "learning_rate": 2.2107551652458648e-05, "loss": 0.1436, "step": 1898 }, { "epoch": 1.7518450184501844, "grad_norm": 0.35033115584252883, "learning_rate": 2.2080891541503145e-05, "loss": 0.1533, "step": 1899 }, { "epoch": 1.7527675276752768, "grad_norm": 0.39914141234672607, "learning_rate": 2.2054234795673334e-05, "loss": 0.1659, "step": 1900 }, { "epoch": 1.753690036900369, "grad_norm": 0.39865041105966287, "learning_rate": 2.2027581445698922e-05, "loss": 0.1284, "step": 1901 }, { "epoch": 1.7546125461254611, "grad_norm": 0.40139166331439674, "learning_rate": 2.200093152230568e-05, "loss": 0.186, "step": 1902 }, { "epoch": 1.7555350553505535, "grad_norm": 0.3601541632619284, "learning_rate": 2.197428505621542e-05, "loss": 0.1481, "step": 1903 }, { "epoch": 1.7564575645756457, "grad_norm": 0.37537784969448856, "learning_rate": 2.1947642078146004e-05, "loss": 0.1746, "step": 1904 }, { "epoch": 1.757380073800738, "grad_norm": 0.3820863338038734, "learning_rate": 2.1921002618811244e-05, "loss": 0.1788, "step": 1905 }, { "epoch": 1.7583025830258303, "grad_norm": 0.3454516502653024, "learning_rate": 2.1894366708920886e-05, "loss": 0.1612, "step": 1906 }, { "epoch": 1.7592250922509225, "grad_norm": 0.35810941378697064, "learning_rate": 2.1867734379180628e-05, "loss": 0.1556, "step": 1907 }, { "epoch": 1.7601476014760147, "grad_norm": 0.35804285212253945, "learning_rate": 2.1841105660292e-05, "loss": 0.1533, "step": 1908 }, { "epoch": 1.761070110701107, "grad_norm": 0.332110775349014, "learning_rate": 2.1814480582952375e-05, "loss": 0.1279, "step": 1909 }, { "epoch": 1.7619926199261993, "grad_norm": 0.3970112390451209, "learning_rate": 2.1787859177854964e-05, "loss": 0.1645, "step": 1910 }, { "epoch": 1.7629151291512914, "grad_norm": 0.34157855663060777, "learning_rate": 2.1761241475688695e-05, "loss": 0.1612, "step": 1911 }, { "epoch": 1.7638376383763839, "grad_norm": 0.3280523541309268, "learning_rate": 2.1734627507138244e-05, "loss": 0.1419, "step": 1912 }, { "epoch": 1.764760147601476, "grad_norm": 0.3462767125893894, "learning_rate": 2.1708017302884016e-05, "loss": 0.1657, "step": 1913 }, { "epoch": 1.7656826568265682, "grad_norm": 0.3248813728241005, "learning_rate": 2.168141089360203e-05, "loss": 0.1499, "step": 1914 }, { "epoch": 1.7666051660516606, "grad_norm": 0.3851727529303274, "learning_rate": 2.1654808309963938e-05, "loss": 0.187, "step": 1915 }, { "epoch": 1.7675276752767528, "grad_norm": 0.3771866949552074, "learning_rate": 2.1628209582637022e-05, "loss": 0.1625, "step": 1916 }, { "epoch": 1.768450184501845, "grad_norm": 0.3475517371282469, "learning_rate": 2.160161474228407e-05, "loss": 0.1568, "step": 1917 }, { "epoch": 1.7693726937269374, "grad_norm": 0.35822102164648034, "learning_rate": 2.157502381956341e-05, "loss": 0.1461, "step": 1918 }, { "epoch": 1.7702952029520294, "grad_norm": 0.3570314929373, "learning_rate": 2.1548436845128858e-05, "loss": 0.1455, "step": 1919 }, { "epoch": 1.7712177121771218, "grad_norm": 0.34562742110365285, "learning_rate": 2.1521853849629675e-05, "loss": 0.1593, "step": 1920 }, { "epoch": 1.7721402214022142, "grad_norm": 0.3692231165374884, "learning_rate": 2.1495274863710517e-05, "loss": 0.1549, "step": 1921 }, { "epoch": 1.7730627306273061, "grad_norm": 0.37465863169104, "learning_rate": 2.146869991801146e-05, "loss": 0.1708, "step": 1922 }, { "epoch": 1.7739852398523985, "grad_norm": 0.35179057696040306, "learning_rate": 2.1442129043167874e-05, "loss": 0.1449, "step": 1923 }, { "epoch": 1.774907749077491, "grad_norm": 0.3282554209480367, "learning_rate": 2.1415562269810465e-05, "loss": 0.1501, "step": 1924 }, { "epoch": 1.775830258302583, "grad_norm": 0.3792811815541581, "learning_rate": 2.1388999628565212e-05, "loss": 0.1507, "step": 1925 }, { "epoch": 1.7767527675276753, "grad_norm": 0.40143599228679433, "learning_rate": 2.1362441150053312e-05, "loss": 0.1652, "step": 1926 }, { "epoch": 1.7776752767527675, "grad_norm": 0.37802858331533024, "learning_rate": 2.1335886864891182e-05, "loss": 0.1713, "step": 1927 }, { "epoch": 1.7785977859778597, "grad_norm": 0.3525746877132614, "learning_rate": 2.130933680369039e-05, "loss": 0.1623, "step": 1928 }, { "epoch": 1.779520295202952, "grad_norm": 0.39454226179717417, "learning_rate": 2.128279099705765e-05, "loss": 0.1544, "step": 1929 }, { "epoch": 1.7804428044280443, "grad_norm": 0.34775886716768395, "learning_rate": 2.125624947559475e-05, "loss": 0.1474, "step": 1930 }, { "epoch": 1.7813653136531364, "grad_norm": 0.36876573765007625, "learning_rate": 2.1229712269898565e-05, "loss": 0.1551, "step": 1931 }, { "epoch": 1.7822878228782288, "grad_norm": 0.34869551059545806, "learning_rate": 2.120317941056098e-05, "loss": 0.1447, "step": 1932 }, { "epoch": 1.783210332103321, "grad_norm": 0.36587891022317953, "learning_rate": 2.117665092816885e-05, "loss": 0.1671, "step": 1933 }, { "epoch": 1.7841328413284132, "grad_norm": 0.3487945989291348, "learning_rate": 2.1150126853304034e-05, "loss": 0.1509, "step": 1934 }, { "epoch": 1.7850553505535056, "grad_norm": 0.35613918691738616, "learning_rate": 2.112360721654327e-05, "loss": 0.1616, "step": 1935 }, { "epoch": 1.7859778597785978, "grad_norm": 0.4013564724324399, "learning_rate": 2.1097092048458172e-05, "loss": 0.1742, "step": 1936 }, { "epoch": 1.78690036900369, "grad_norm": 0.3151332094994997, "learning_rate": 2.1070581379615253e-05, "loss": 0.1426, "step": 1937 }, { "epoch": 1.7878228782287824, "grad_norm": 0.3655321152134994, "learning_rate": 2.1044075240575787e-05, "loss": 0.1673, "step": 1938 }, { "epoch": 1.7887453874538746, "grad_norm": 0.4302445109058553, "learning_rate": 2.1017573661895838e-05, "loss": 0.1696, "step": 1939 }, { "epoch": 1.7896678966789668, "grad_norm": 0.3767103931313366, "learning_rate": 2.099107667412625e-05, "loss": 0.1689, "step": 1940 }, { "epoch": 1.7905904059040592, "grad_norm": 0.35855837036481364, "learning_rate": 2.0964584307812514e-05, "loss": 0.16, "step": 1941 }, { "epoch": 1.7915129151291513, "grad_norm": 0.3537270043252408, "learning_rate": 2.0938096593494855e-05, "loss": 0.1686, "step": 1942 }, { "epoch": 1.7924354243542435, "grad_norm": 0.37247251035430284, "learning_rate": 2.0911613561708093e-05, "loss": 0.1624, "step": 1943 }, { "epoch": 1.793357933579336, "grad_norm": 0.377132166654423, "learning_rate": 2.088513524298165e-05, "loss": 0.1726, "step": 1944 }, { "epoch": 1.7942804428044279, "grad_norm": 0.3706806558600546, "learning_rate": 2.0858661667839553e-05, "loss": 0.1678, "step": 1945 }, { "epoch": 1.7952029520295203, "grad_norm": 0.3819437189608818, "learning_rate": 2.0832192866800316e-05, "loss": 0.1696, "step": 1946 }, { "epoch": 1.7961254612546127, "grad_norm": 0.37824618313239416, "learning_rate": 2.0805728870376965e-05, "loss": 0.157, "step": 1947 }, { "epoch": 1.7970479704797047, "grad_norm": 0.40439075320316176, "learning_rate": 2.077926970907701e-05, "loss": 0.1767, "step": 1948 }, { "epoch": 1.797970479704797, "grad_norm": 0.35369655441936054, "learning_rate": 2.075281541340236e-05, "loss": 0.1705, "step": 1949 }, { "epoch": 1.7988929889298892, "grad_norm": 0.36162004862398006, "learning_rate": 2.0726366013849313e-05, "loss": 0.1741, "step": 1950 }, { "epoch": 1.7998154981549814, "grad_norm": 0.33726035758550105, "learning_rate": 2.0699921540908544e-05, "loss": 0.1387, "step": 1951 }, { "epoch": 1.8007380073800738, "grad_norm": 0.3707853938334104, "learning_rate": 2.067348202506503e-05, "loss": 0.1716, "step": 1952 }, { "epoch": 1.801660516605166, "grad_norm": 0.3434117297353524, "learning_rate": 2.0647047496798043e-05, "loss": 0.164, "step": 1953 }, { "epoch": 1.8025830258302582, "grad_norm": 0.3764964159000091, "learning_rate": 2.062061798658111e-05, "loss": 0.1606, "step": 1954 }, { "epoch": 1.8035055350553506, "grad_norm": 0.3124660572761698, "learning_rate": 2.059419352488196e-05, "loss": 0.1451, "step": 1955 }, { "epoch": 1.8044280442804428, "grad_norm": 0.33515911908453766, "learning_rate": 2.0567774142162505e-05, "loss": 0.1458, "step": 1956 }, { "epoch": 1.805350553505535, "grad_norm": 0.368634745922285, "learning_rate": 2.0541359868878815e-05, "loss": 0.1473, "step": 1957 }, { "epoch": 1.8062730627306274, "grad_norm": 0.3701635563982187, "learning_rate": 2.0514950735481052e-05, "loss": 0.1546, "step": 1958 }, { "epoch": 1.8071955719557196, "grad_norm": 0.3651591871906541, "learning_rate": 2.0488546772413462e-05, "loss": 0.1565, "step": 1959 }, { "epoch": 1.8081180811808117, "grad_norm": 0.38876955496214516, "learning_rate": 2.046214801011434e-05, "loss": 0.1718, "step": 1960 }, { "epoch": 1.8090405904059041, "grad_norm": 0.36873405275038995, "learning_rate": 2.0435754479015962e-05, "loss": 0.1465, "step": 1961 }, { "epoch": 1.8099630996309963, "grad_norm": 0.4358055520966555, "learning_rate": 2.040936620954459e-05, "loss": 0.1632, "step": 1962 }, { "epoch": 1.8108856088560885, "grad_norm": 0.3819666578014411, "learning_rate": 2.0382983232120422e-05, "loss": 0.1642, "step": 1963 }, { "epoch": 1.811808118081181, "grad_norm": 0.38078572817688405, "learning_rate": 2.0356605577157552e-05, "loss": 0.1498, "step": 1964 }, { "epoch": 1.812730627306273, "grad_norm": 0.3554143708172271, "learning_rate": 2.033023327506393e-05, "loss": 0.1635, "step": 1965 }, { "epoch": 1.8136531365313653, "grad_norm": 0.4513479691720528, "learning_rate": 2.0303866356241347e-05, "loss": 0.1577, "step": 1966 }, { "epoch": 1.8145756457564577, "grad_norm": 0.36347844362261, "learning_rate": 2.0277504851085388e-05, "loss": 0.168, "step": 1967 }, { "epoch": 1.8154981549815496, "grad_norm": 0.3971419529038244, "learning_rate": 2.0251148789985374e-05, "loss": 0.1634, "step": 1968 }, { "epoch": 1.816420664206642, "grad_norm": 0.37177706325342263, "learning_rate": 2.0224798203324392e-05, "loss": 0.1709, "step": 1969 }, { "epoch": 1.8173431734317345, "grad_norm": 0.4059762479232794, "learning_rate": 2.019845312147919e-05, "loss": 0.1576, "step": 1970 }, { "epoch": 1.8182656826568264, "grad_norm": 0.38189613182315707, "learning_rate": 2.017211357482015e-05, "loss": 0.18, "step": 1971 }, { "epoch": 1.8191881918819188, "grad_norm": 0.3413123858587887, "learning_rate": 2.0145779593711338e-05, "loss": 0.1733, "step": 1972 }, { "epoch": 1.8201107011070112, "grad_norm": 0.34528499503103854, "learning_rate": 2.011945120851034e-05, "loss": 0.1523, "step": 1973 }, { "epoch": 1.8210332103321032, "grad_norm": 0.3576523993441041, "learning_rate": 2.0093128449568306e-05, "loss": 0.1482, "step": 1974 }, { "epoch": 1.8219557195571956, "grad_norm": 0.33455730748939116, "learning_rate": 2.006681134722994e-05, "loss": 0.157, "step": 1975 }, { "epoch": 1.8228782287822878, "grad_norm": 0.3824153116538723, "learning_rate": 2.0040499931833373e-05, "loss": 0.1591, "step": 1976 }, { "epoch": 1.82380073800738, "grad_norm": 0.44205789018039365, "learning_rate": 2.0014194233710193e-05, "loss": 0.1636, "step": 1977 }, { "epoch": 1.8247232472324724, "grad_norm": 0.3486851989930659, "learning_rate": 1.9987894283185434e-05, "loss": 0.167, "step": 1978 }, { "epoch": 1.8256457564575646, "grad_norm": 0.39737083923768923, "learning_rate": 1.9961600110577456e-05, "loss": 0.166, "step": 1979 }, { "epoch": 1.8265682656826567, "grad_norm": 0.39316579916896227, "learning_rate": 1.993531174619798e-05, "loss": 0.157, "step": 1980 }, { "epoch": 1.8274907749077491, "grad_norm": 0.3859558417881259, "learning_rate": 1.9909029220352035e-05, "loss": 0.1653, "step": 1981 }, { "epoch": 1.8284132841328413, "grad_norm": 0.3662210447558923, "learning_rate": 1.988275256333791e-05, "loss": 0.1341, "step": 1982 }, { "epoch": 1.8293357933579335, "grad_norm": 0.34513867329028414, "learning_rate": 1.985648180544713e-05, "loss": 0.1427, "step": 1983 }, { "epoch": 1.830258302583026, "grad_norm": 0.38852188782944674, "learning_rate": 1.9830216976964433e-05, "loss": 0.167, "step": 1984 }, { "epoch": 1.831180811808118, "grad_norm": 0.37931259597058387, "learning_rate": 1.9803958108167694e-05, "loss": 0.1723, "step": 1985 }, { "epoch": 1.8321033210332103, "grad_norm": 0.3933102162504835, "learning_rate": 1.9777705229327952e-05, "loss": 0.1773, "step": 1986 }, { "epoch": 1.8330258302583027, "grad_norm": 0.3214932647010383, "learning_rate": 1.9751458370709313e-05, "loss": 0.1565, "step": 1987 }, { "epoch": 1.8339483394833949, "grad_norm": 0.41438090194219784, "learning_rate": 1.9725217562568948e-05, "loss": 0.1824, "step": 1988 }, { "epoch": 1.834870848708487, "grad_norm": 0.3322969835242972, "learning_rate": 1.969898283515707e-05, "loss": 0.1488, "step": 1989 }, { "epoch": 1.8357933579335795, "grad_norm": 0.34637329546270856, "learning_rate": 1.967275421871687e-05, "loss": 0.1623, "step": 1990 }, { "epoch": 1.8367158671586716, "grad_norm": 0.35230432230838765, "learning_rate": 1.9646531743484478e-05, "loss": 0.1642, "step": 1991 }, { "epoch": 1.8376383763837638, "grad_norm": 0.3796304358368079, "learning_rate": 1.962031543968898e-05, "loss": 0.1828, "step": 1992 }, { "epoch": 1.8385608856088562, "grad_norm": 0.36214624107202387, "learning_rate": 1.9594105337552323e-05, "loss": 0.1664, "step": 1993 }, { "epoch": 1.8394833948339482, "grad_norm": 0.37017048469642266, "learning_rate": 1.9567901467289302e-05, "loss": 0.1582, "step": 1994 }, { "epoch": 1.8404059040590406, "grad_norm": 0.3789452226839255, "learning_rate": 1.9541703859107545e-05, "loss": 0.1632, "step": 1995 }, { "epoch": 1.841328413284133, "grad_norm": 0.39124779668089543, "learning_rate": 1.9515512543207453e-05, "loss": 0.1547, "step": 1996 }, { "epoch": 1.842250922509225, "grad_norm": 0.3694093100688777, "learning_rate": 1.9489327549782168e-05, "loss": 0.155, "step": 1997 }, { "epoch": 1.8431734317343174, "grad_norm": 0.3865408499721915, "learning_rate": 1.9463148909017553e-05, "loss": 0.1774, "step": 1998 }, { "epoch": 1.8440959409594095, "grad_norm": 0.3570437529844188, "learning_rate": 1.9436976651092144e-05, "loss": 0.1476, "step": 1999 }, { "epoch": 1.8450184501845017, "grad_norm": 0.3423328317252141, "learning_rate": 1.9410810806177104e-05, "loss": 0.1484, "step": 2000 }, { "epoch": 1.8459409594095941, "grad_norm": 0.4010672629924379, "learning_rate": 1.9384651404436237e-05, "loss": 0.1744, "step": 2001 }, { "epoch": 1.8468634686346863, "grad_norm": 0.3544260848357162, "learning_rate": 1.9358498476025895e-05, "loss": 0.1475, "step": 2002 }, { "epoch": 1.8477859778597785, "grad_norm": 0.38458565244615806, "learning_rate": 1.9332352051094952e-05, "loss": 0.175, "step": 2003 }, { "epoch": 1.848708487084871, "grad_norm": 0.4008291354864782, "learning_rate": 1.9306212159784828e-05, "loss": 0.1605, "step": 2004 }, { "epoch": 1.849630996309963, "grad_norm": 0.357938251421677, "learning_rate": 1.9280078832229388e-05, "loss": 0.1587, "step": 2005 }, { "epoch": 1.8505535055350553, "grad_norm": 0.38562595776978303, "learning_rate": 1.9253952098554903e-05, "loss": 0.1465, "step": 2006 }, { "epoch": 1.8514760147601477, "grad_norm": 0.3410195087635133, "learning_rate": 1.9227831988880107e-05, "loss": 0.1593, "step": 2007 }, { "epoch": 1.8523985239852399, "grad_norm": 0.3682543630360419, "learning_rate": 1.920171853331604e-05, "loss": 0.1614, "step": 2008 }, { "epoch": 1.853321033210332, "grad_norm": 0.33459865670495864, "learning_rate": 1.9175611761966082e-05, "loss": 0.1476, "step": 2009 }, { "epoch": 1.8542435424354244, "grad_norm": 0.3327774582454587, "learning_rate": 1.9149511704925942e-05, "loss": 0.157, "step": 2010 }, { "epoch": 1.8551660516605166, "grad_norm": 0.33415145348760156, "learning_rate": 1.9123418392283553e-05, "loss": 0.1707, "step": 2011 }, { "epoch": 1.8560885608856088, "grad_norm": 0.34689944075535845, "learning_rate": 1.9097331854119078e-05, "loss": 0.1553, "step": 2012 }, { "epoch": 1.8570110701107012, "grad_norm": 0.3515569170426396, "learning_rate": 1.907125212050489e-05, "loss": 0.1502, "step": 2013 }, { "epoch": 1.8579335793357934, "grad_norm": 0.38749293290572323, "learning_rate": 1.9045179221505497e-05, "loss": 0.1766, "step": 2014 }, { "epoch": 1.8588560885608856, "grad_norm": 0.3467288067329626, "learning_rate": 1.901911318717753e-05, "loss": 0.1575, "step": 2015 }, { "epoch": 1.859778597785978, "grad_norm": 0.3705513922755851, "learning_rate": 1.8993054047569726e-05, "loss": 0.164, "step": 2016 }, { "epoch": 1.8607011070110702, "grad_norm": 0.3948406618485568, "learning_rate": 1.896700183272285e-05, "loss": 0.155, "step": 2017 }, { "epoch": 1.8616236162361623, "grad_norm": 0.3327445858175401, "learning_rate": 1.8940956572669692e-05, "loss": 0.1376, "step": 2018 }, { "epoch": 1.8625461254612548, "grad_norm": 0.37760574403241454, "learning_rate": 1.891491829743504e-05, "loss": 0.1746, "step": 2019 }, { "epoch": 1.8634686346863467, "grad_norm": 0.3887404163177847, "learning_rate": 1.8888887037035607e-05, "loss": 0.1813, "step": 2020 }, { "epoch": 1.8643911439114391, "grad_norm": 0.39334239979851965, "learning_rate": 1.8862862821480025e-05, "loss": 0.1731, "step": 2021 }, { "epoch": 1.8653136531365315, "grad_norm": 0.2978329393936861, "learning_rate": 1.8836845680768815e-05, "loss": 0.1383, "step": 2022 }, { "epoch": 1.8662361623616235, "grad_norm": 0.4028454502450053, "learning_rate": 1.8810835644894344e-05, "loss": 0.1733, "step": 2023 }, { "epoch": 1.867158671586716, "grad_norm": 0.331584454188422, "learning_rate": 1.8784832743840757e-05, "loss": 0.1498, "step": 2024 }, { "epoch": 1.868081180811808, "grad_norm": 0.4106832662420921, "learning_rate": 1.8758837007584023e-05, "loss": 0.1796, "step": 2025 }, { "epoch": 1.8690036900369003, "grad_norm": 0.3764091889563927, "learning_rate": 1.8732848466091818e-05, "loss": 0.1673, "step": 2026 }, { "epoch": 1.8699261992619927, "grad_norm": 0.37190067044130726, "learning_rate": 1.870686714932352e-05, "loss": 0.1492, "step": 2027 }, { "epoch": 1.8708487084870848, "grad_norm": 0.3441146253858929, "learning_rate": 1.8680893087230204e-05, "loss": 0.1529, "step": 2028 }, { "epoch": 1.871771217712177, "grad_norm": 0.4040439209904815, "learning_rate": 1.8654926309754566e-05, "loss": 0.162, "step": 2029 }, { "epoch": 1.8726937269372694, "grad_norm": 0.3931622956402532, "learning_rate": 1.8628966846830907e-05, "loss": 0.1676, "step": 2030 }, { "epoch": 1.8736162361623616, "grad_norm": 0.3193108144849364, "learning_rate": 1.8603014728385095e-05, "loss": 0.169, "step": 2031 }, { "epoch": 1.8745387453874538, "grad_norm": 0.3441639223339678, "learning_rate": 1.8577069984334522e-05, "loss": 0.1302, "step": 2032 }, { "epoch": 1.8754612546125462, "grad_norm": 0.45250380780450933, "learning_rate": 1.8551132644588102e-05, "loss": 0.1702, "step": 2033 }, { "epoch": 1.8763837638376384, "grad_norm": 0.3210029209059365, "learning_rate": 1.8525202739046196e-05, "loss": 0.1396, "step": 2034 }, { "epoch": 1.8773062730627306, "grad_norm": 0.3711136600679912, "learning_rate": 1.8499280297600594e-05, "loss": 0.1622, "step": 2035 }, { "epoch": 1.878228782287823, "grad_norm": 0.43656379806291445, "learning_rate": 1.84733653501345e-05, "loss": 0.1755, "step": 2036 }, { "epoch": 1.8791512915129152, "grad_norm": 0.40124015959206116, "learning_rate": 1.8447457926522454e-05, "loss": 0.1593, "step": 2037 }, { "epoch": 1.8800738007380073, "grad_norm": 0.3979470533271776, "learning_rate": 1.8421558056630324e-05, "loss": 0.1666, "step": 2038 }, { "epoch": 1.8809963099630997, "grad_norm": 0.3631975944244946, "learning_rate": 1.8395665770315298e-05, "loss": 0.1678, "step": 2039 }, { "epoch": 1.881918819188192, "grad_norm": 0.3493776189055461, "learning_rate": 1.836978109742581e-05, "loss": 0.1633, "step": 2040 }, { "epoch": 1.882841328413284, "grad_norm": 0.36778378045879667, "learning_rate": 1.8343904067801477e-05, "loss": 0.1614, "step": 2041 }, { "epoch": 1.8837638376383765, "grad_norm": 0.3748701187810373, "learning_rate": 1.831803471127318e-05, "loss": 0.1619, "step": 2042 }, { "epoch": 1.8846863468634685, "grad_norm": 0.35954905741291965, "learning_rate": 1.829217305766289e-05, "loss": 0.171, "step": 2043 }, { "epoch": 1.8856088560885609, "grad_norm": 0.3319221458297196, "learning_rate": 1.8266319136783712e-05, "loss": 0.1585, "step": 2044 }, { "epoch": 1.8865313653136533, "grad_norm": 0.3794837799983876, "learning_rate": 1.8240472978439883e-05, "loss": 0.1452, "step": 2045 }, { "epoch": 1.8874538745387452, "grad_norm": 0.3987585374780179, "learning_rate": 1.8214634612426623e-05, "loss": 0.1737, "step": 2046 }, { "epoch": 1.8883763837638377, "grad_norm": 0.3683407966053583, "learning_rate": 1.8188804068530206e-05, "loss": 0.1576, "step": 2047 }, { "epoch": 1.8892988929889298, "grad_norm": 0.34263755995174733, "learning_rate": 1.8162981376527894e-05, "loss": 0.1696, "step": 2048 }, { "epoch": 1.890221402214022, "grad_norm": 0.314212820389449, "learning_rate": 1.813716656618788e-05, "loss": 0.1475, "step": 2049 }, { "epoch": 1.8911439114391144, "grad_norm": 0.3829724497296785, "learning_rate": 1.8111359667269275e-05, "loss": 0.1592, "step": 2050 }, { "epoch": 1.8920664206642066, "grad_norm": 0.3266759250780846, "learning_rate": 1.8085560709522077e-05, "loss": 0.1539, "step": 2051 }, { "epoch": 1.8929889298892988, "grad_norm": 0.3628235399383464, "learning_rate": 1.805976972268713e-05, "loss": 0.1758, "step": 2052 }, { "epoch": 1.8939114391143912, "grad_norm": 0.37032064120291297, "learning_rate": 1.8033986736496078e-05, "loss": 0.1758, "step": 2053 }, { "epoch": 1.8948339483394834, "grad_norm": 0.3390476198553966, "learning_rate": 1.8008211780671353e-05, "loss": 0.1558, "step": 2054 }, { "epoch": 1.8957564575645756, "grad_norm": 0.39703738351615603, "learning_rate": 1.798244488492612e-05, "loss": 0.1701, "step": 2055 }, { "epoch": 1.896678966789668, "grad_norm": 0.35851108811570903, "learning_rate": 1.795668607896426e-05, "loss": 0.1706, "step": 2056 }, { "epoch": 1.8976014760147601, "grad_norm": 0.3431270651037541, "learning_rate": 1.7930935392480326e-05, "loss": 0.1631, "step": 2057 }, { "epoch": 1.8985239852398523, "grad_norm": 0.3416418505976903, "learning_rate": 1.7905192855159514e-05, "loss": 0.1461, "step": 2058 }, { "epoch": 1.8994464944649447, "grad_norm": 0.32644775615725424, "learning_rate": 1.7879458496677615e-05, "loss": 0.134, "step": 2059 }, { "epoch": 1.900369003690037, "grad_norm": 0.38332095835052227, "learning_rate": 1.7853732346701003e-05, "loss": 0.1551, "step": 2060 }, { "epoch": 1.901291512915129, "grad_norm": 0.39235068169305326, "learning_rate": 1.7828014434886588e-05, "loss": 0.1534, "step": 2061 }, { "epoch": 1.9022140221402215, "grad_norm": 0.34317444492086, "learning_rate": 1.7802304790881773e-05, "loss": 0.1457, "step": 2062 }, { "epoch": 1.9031365313653137, "grad_norm": 0.4447190660701878, "learning_rate": 1.7776603444324445e-05, "loss": 0.1782, "step": 2063 }, { "epoch": 1.9040590405904059, "grad_norm": 0.36598747335495846, "learning_rate": 1.775091042484292e-05, "loss": 0.1503, "step": 2064 }, { "epoch": 1.9049815498154983, "grad_norm": 0.3369529655069667, "learning_rate": 1.7725225762055887e-05, "loss": 0.1452, "step": 2065 }, { "epoch": 1.9059040590405905, "grad_norm": 0.3162265674227008, "learning_rate": 1.7699549485572465e-05, "loss": 0.146, "step": 2066 }, { "epoch": 1.9068265682656826, "grad_norm": 0.41214926742766195, "learning_rate": 1.7673881624992047e-05, "loss": 0.1747, "step": 2067 }, { "epoch": 1.907749077490775, "grad_norm": 0.38817916498204885, "learning_rate": 1.7648222209904338e-05, "loss": 0.1695, "step": 2068 }, { "epoch": 1.908671586715867, "grad_norm": 0.3704083615912522, "learning_rate": 1.7622571269889326e-05, "loss": 0.17, "step": 2069 }, { "epoch": 1.9095940959409594, "grad_norm": 0.4032782003700155, "learning_rate": 1.759692883451721e-05, "loss": 0.1721, "step": 2070 }, { "epoch": 1.9105166051660518, "grad_norm": 0.3698213808043485, "learning_rate": 1.75712949333484e-05, "loss": 0.1713, "step": 2071 }, { "epoch": 1.9114391143911438, "grad_norm": 0.3370359745013529, "learning_rate": 1.754566959593346e-05, "loss": 0.1542, "step": 2072 }, { "epoch": 1.9123616236162362, "grad_norm": 0.3643578033579248, "learning_rate": 1.752005285181306e-05, "loss": 0.1683, "step": 2073 }, { "epoch": 1.9132841328413284, "grad_norm": 0.3662165203448614, "learning_rate": 1.7494444730518012e-05, "loss": 0.1721, "step": 2074 }, { "epoch": 1.9142066420664205, "grad_norm": 0.33697578821573065, "learning_rate": 1.746884526156915e-05, "loss": 0.1353, "step": 2075 }, { "epoch": 1.915129151291513, "grad_norm": 0.3702583026292005, "learning_rate": 1.7443254474477327e-05, "loss": 0.156, "step": 2076 }, { "epoch": 1.9160516605166051, "grad_norm": 0.36325214626895036, "learning_rate": 1.741767239874344e-05, "loss": 0.1677, "step": 2077 }, { "epoch": 1.9169741697416973, "grad_norm": 0.3693694666175546, "learning_rate": 1.7392099063858284e-05, "loss": 0.1852, "step": 2078 }, { "epoch": 1.9178966789667897, "grad_norm": 0.3714584247062507, "learning_rate": 1.7366534499302595e-05, "loss": 0.1513, "step": 2079 }, { "epoch": 1.918819188191882, "grad_norm": 0.31958997192401395, "learning_rate": 1.7340978734547035e-05, "loss": 0.1472, "step": 2080 }, { "epoch": 1.919741697416974, "grad_norm": 0.3487306312338888, "learning_rate": 1.7315431799052066e-05, "loss": 0.1637, "step": 2081 }, { "epoch": 1.9206642066420665, "grad_norm": 0.34585110546026254, "learning_rate": 1.728989372226801e-05, "loss": 0.1511, "step": 2082 }, { "epoch": 1.9215867158671587, "grad_norm": 0.3865905336151693, "learning_rate": 1.7264364533634956e-05, "loss": 0.1665, "step": 2083 }, { "epoch": 1.9225092250922509, "grad_norm": 0.3523667141912192, "learning_rate": 1.723884426258277e-05, "loss": 0.154, "step": 2084 }, { "epoch": 1.9234317343173433, "grad_norm": 0.4303298108479506, "learning_rate": 1.7213332938531012e-05, "loss": 0.1641, "step": 2085 }, { "epoch": 1.9243542435424354, "grad_norm": 0.3534015365860965, "learning_rate": 1.718783059088894e-05, "loss": 0.1403, "step": 2086 }, { "epoch": 1.9252767527675276, "grad_norm": 0.3673903376438012, "learning_rate": 1.7162337249055477e-05, "loss": 0.1579, "step": 2087 }, { "epoch": 1.92619926199262, "grad_norm": 0.342962694727826, "learning_rate": 1.7136852942419127e-05, "loss": 0.1631, "step": 2088 }, { "epoch": 1.9271217712177122, "grad_norm": 0.3753024966381805, "learning_rate": 1.7111377700358022e-05, "loss": 0.1472, "step": 2089 }, { "epoch": 1.9280442804428044, "grad_norm": 0.3364763848904482, "learning_rate": 1.708591155223982e-05, "loss": 0.1518, "step": 2090 }, { "epoch": 1.9289667896678968, "grad_norm": 0.3866136533660062, "learning_rate": 1.7060454527421688e-05, "loss": 0.1713, "step": 2091 }, { "epoch": 1.9298892988929888, "grad_norm": 0.4147150763053682, "learning_rate": 1.7035006655250304e-05, "loss": 0.1563, "step": 2092 }, { "epoch": 1.9308118081180812, "grad_norm": 0.3756944304039137, "learning_rate": 1.7009567965061774e-05, "loss": 0.1565, "step": 2093 }, { "epoch": 1.9317343173431736, "grad_norm": 0.3807925730918648, "learning_rate": 1.698413848618161e-05, "loss": 0.1593, "step": 2094 }, { "epoch": 1.9326568265682655, "grad_norm": 0.38492062922264886, "learning_rate": 1.6958718247924745e-05, "loss": 0.1593, "step": 2095 }, { "epoch": 1.933579335793358, "grad_norm": 0.3772907678044971, "learning_rate": 1.6933307279595413e-05, "loss": 0.1621, "step": 2096 }, { "epoch": 1.9345018450184504, "grad_norm": 0.4084042020189238, "learning_rate": 1.6907905610487184e-05, "loss": 0.1708, "step": 2097 }, { "epoch": 1.9354243542435423, "grad_norm": 0.38837729340567867, "learning_rate": 1.6882513269882917e-05, "loss": 0.1686, "step": 2098 }, { "epoch": 1.9363468634686347, "grad_norm": 0.38721867620945977, "learning_rate": 1.6857130287054702e-05, "loss": 0.1726, "step": 2099 }, { "epoch": 1.937269372693727, "grad_norm": 0.41436134979934947, "learning_rate": 1.683175669126383e-05, "loss": 0.161, "step": 2100 }, { "epoch": 1.938191881918819, "grad_norm": 0.36101101262609697, "learning_rate": 1.6806392511760803e-05, "loss": 0.1426, "step": 2101 }, { "epoch": 1.9391143911439115, "grad_norm": 0.36419907430338827, "learning_rate": 1.678103777778526e-05, "loss": 0.1574, "step": 2102 }, { "epoch": 1.9400369003690037, "grad_norm": 0.2908242224470308, "learning_rate": 1.6755692518565914e-05, "loss": 0.1223, "step": 2103 }, { "epoch": 1.9409594095940959, "grad_norm": 0.35436960934575523, "learning_rate": 1.6730356763320615e-05, "loss": 0.1472, "step": 2104 }, { "epoch": 1.9418819188191883, "grad_norm": 0.3760954645026784, "learning_rate": 1.670503054125621e-05, "loss": 0.1505, "step": 2105 }, { "epoch": 1.9428044280442804, "grad_norm": 0.3489617565462921, "learning_rate": 1.667971388156856e-05, "loss": 0.1569, "step": 2106 }, { "epoch": 1.9437269372693726, "grad_norm": 0.27280654136664284, "learning_rate": 1.6654406813442545e-05, "loss": 0.1325, "step": 2107 }, { "epoch": 1.944649446494465, "grad_norm": 0.3622553045951628, "learning_rate": 1.662910936605194e-05, "loss": 0.1629, "step": 2108 }, { "epoch": 1.9455719557195572, "grad_norm": 0.336857190707021, "learning_rate": 1.6603821568559437e-05, "loss": 0.1441, "step": 2109 }, { "epoch": 1.9464944649446494, "grad_norm": 0.40529479124812, "learning_rate": 1.657854345011664e-05, "loss": 0.164, "step": 2110 }, { "epoch": 1.9474169741697418, "grad_norm": 0.3788435120348254, "learning_rate": 1.655327503986395e-05, "loss": 0.1569, "step": 2111 }, { "epoch": 1.948339483394834, "grad_norm": 0.34957022347909955, "learning_rate": 1.6528016366930592e-05, "loss": 0.1688, "step": 2112 }, { "epoch": 1.9492619926199262, "grad_norm": 0.347731366608516, "learning_rate": 1.6502767460434588e-05, "loss": 0.1534, "step": 2113 }, { "epoch": 1.9501845018450186, "grad_norm": 0.3728756800631118, "learning_rate": 1.6477528349482656e-05, "loss": 0.1845, "step": 2114 }, { "epoch": 1.9511070110701108, "grad_norm": 0.3563021187541003, "learning_rate": 1.6452299063170283e-05, "loss": 0.1581, "step": 2115 }, { "epoch": 1.952029520295203, "grad_norm": 0.3505466294901255, "learning_rate": 1.6427079630581572e-05, "loss": 0.1655, "step": 2116 }, { "epoch": 1.9529520295202953, "grad_norm": 0.3101500827340247, "learning_rate": 1.6401870080789282e-05, "loss": 0.1342, "step": 2117 }, { "epoch": 1.9538745387453873, "grad_norm": 0.39658480605922347, "learning_rate": 1.6376670442854815e-05, "loss": 0.1574, "step": 2118 }, { "epoch": 1.9547970479704797, "grad_norm": 0.33081154898048376, "learning_rate": 1.63514807458281e-05, "loss": 0.1487, "step": 2119 }, { "epoch": 1.9557195571955721, "grad_norm": 0.35135230194523337, "learning_rate": 1.6326301018747623e-05, "loss": 0.1507, "step": 2120 }, { "epoch": 1.956642066420664, "grad_norm": 0.34425650471796426, "learning_rate": 1.6301131290640393e-05, "loss": 0.1617, "step": 2121 }, { "epoch": 1.9575645756457565, "grad_norm": 0.34768864572071606, "learning_rate": 1.627597159052187e-05, "loss": 0.168, "step": 2122 }, { "epoch": 1.9584870848708487, "grad_norm": 0.3781965255314279, "learning_rate": 1.6250821947395954e-05, "loss": 0.1748, "step": 2123 }, { "epoch": 1.9594095940959408, "grad_norm": 0.42719434703034126, "learning_rate": 1.622568239025498e-05, "loss": 0.1494, "step": 2124 }, { "epoch": 1.9603321033210332, "grad_norm": 0.3241759352541061, "learning_rate": 1.620055294807962e-05, "loss": 0.1507, "step": 2125 }, { "epoch": 1.9612546125461254, "grad_norm": 0.3711367189573145, "learning_rate": 1.61754336498389e-05, "loss": 0.1778, "step": 2126 }, { "epoch": 1.9621771217712176, "grad_norm": 0.40511893012609534, "learning_rate": 1.615032452449017e-05, "loss": 0.1554, "step": 2127 }, { "epoch": 1.96309963099631, "grad_norm": 0.3651020044924753, "learning_rate": 1.6125225600979015e-05, "loss": 0.1545, "step": 2128 }, { "epoch": 1.9640221402214022, "grad_norm": 0.3389783590681625, "learning_rate": 1.6100136908239284e-05, "loss": 0.1523, "step": 2129 }, { "epoch": 1.9649446494464944, "grad_norm": 0.32787767426737324, "learning_rate": 1.6075058475193045e-05, "loss": 0.1434, "step": 2130 }, { "epoch": 1.9658671586715868, "grad_norm": 0.35453868294421087, "learning_rate": 1.604999033075051e-05, "loss": 0.1705, "step": 2131 }, { "epoch": 1.966789667896679, "grad_norm": 0.3408667966426956, "learning_rate": 1.602493250381003e-05, "loss": 0.1408, "step": 2132 }, { "epoch": 1.9677121771217712, "grad_norm": 0.37629081425467487, "learning_rate": 1.59998850232581e-05, "loss": 0.1574, "step": 2133 }, { "epoch": 1.9686346863468636, "grad_norm": 0.31639473669869117, "learning_rate": 1.5974847917969253e-05, "loss": 0.1344, "step": 2134 }, { "epoch": 1.9695571955719557, "grad_norm": 0.3894565590429641, "learning_rate": 1.594982121680605e-05, "loss": 0.1562, "step": 2135 }, { "epoch": 1.970479704797048, "grad_norm": 0.33721420188956064, "learning_rate": 1.592480494861911e-05, "loss": 0.1454, "step": 2136 }, { "epoch": 1.9714022140221403, "grad_norm": 0.3658903476299478, "learning_rate": 1.5899799142246987e-05, "loss": 0.161, "step": 2137 }, { "epoch": 1.9723247232472325, "grad_norm": 0.36192956235609736, "learning_rate": 1.5874803826516153e-05, "loss": 0.172, "step": 2138 }, { "epoch": 1.9732472324723247, "grad_norm": 0.35690008919131494, "learning_rate": 1.584981903024106e-05, "loss": 0.1569, "step": 2139 }, { "epoch": 1.974169741697417, "grad_norm": 0.32918971311261697, "learning_rate": 1.5824844782223954e-05, "loss": 0.1541, "step": 2140 }, { "epoch": 1.975092250922509, "grad_norm": 0.3585597257101039, "learning_rate": 1.579988111125496e-05, "loss": 0.1414, "step": 2141 }, { "epoch": 1.9760147601476015, "grad_norm": 0.342128447426705, "learning_rate": 1.5774928046112027e-05, "loss": 0.1466, "step": 2142 }, { "epoch": 1.9769372693726939, "grad_norm": 0.3672701644773694, "learning_rate": 1.5749985615560837e-05, "loss": 0.14, "step": 2143 }, { "epoch": 1.9778597785977858, "grad_norm": 0.3570713683549541, "learning_rate": 1.572505384835482e-05, "loss": 0.1455, "step": 2144 }, { "epoch": 1.9787822878228782, "grad_norm": 0.32429396370851155, "learning_rate": 1.570013277323516e-05, "loss": 0.1584, "step": 2145 }, { "epoch": 1.9797047970479706, "grad_norm": 0.3506385107195562, "learning_rate": 1.5675222418930653e-05, "loss": 0.1341, "step": 2146 }, { "epoch": 1.9806273062730626, "grad_norm": 0.37317718273227996, "learning_rate": 1.5650322814157764e-05, "loss": 0.1616, "step": 2147 }, { "epoch": 1.981549815498155, "grad_norm": 0.353562182142919, "learning_rate": 1.5625433987620577e-05, "loss": 0.1486, "step": 2148 }, { "epoch": 1.9824723247232472, "grad_norm": 0.42413792939965583, "learning_rate": 1.5600555968010734e-05, "loss": 0.1726, "step": 2149 }, { "epoch": 1.9833948339483394, "grad_norm": 0.3622606610836388, "learning_rate": 1.557568878400742e-05, "loss": 0.1537, "step": 2150 }, { "epoch": 1.9843173431734318, "grad_norm": 0.3572809406638548, "learning_rate": 1.555083246427734e-05, "loss": 0.1741, "step": 2151 }, { "epoch": 1.985239852398524, "grad_norm": 0.33778560462753165, "learning_rate": 1.5525987037474667e-05, "loss": 0.1312, "step": 2152 }, { "epoch": 1.9861623616236161, "grad_norm": 0.3740675742270338, "learning_rate": 1.5501152532241005e-05, "loss": 0.1542, "step": 2153 }, { "epoch": 1.9870848708487086, "grad_norm": 0.3119493809349143, "learning_rate": 1.5476328977205397e-05, "loss": 0.1424, "step": 2154 }, { "epoch": 1.9880073800738007, "grad_norm": 0.38111403421490336, "learning_rate": 1.5451516400984235e-05, "loss": 0.1573, "step": 2155 }, { "epoch": 1.988929889298893, "grad_norm": 0.3224861320468959, "learning_rate": 1.5426714832181262e-05, "loss": 0.1412, "step": 2156 }, { "epoch": 1.9898523985239853, "grad_norm": 0.3376414274010515, "learning_rate": 1.540192429938755e-05, "loss": 0.1604, "step": 2157 }, { "epoch": 1.9907749077490775, "grad_norm": 0.36182822271175735, "learning_rate": 1.5377144831181416e-05, "loss": 0.1672, "step": 2158 }, { "epoch": 1.9916974169741697, "grad_norm": 0.3688889212889081, "learning_rate": 1.535237645612846e-05, "loss": 0.1629, "step": 2159 }, { "epoch": 1.992619926199262, "grad_norm": 0.35371931564594805, "learning_rate": 1.5327619202781457e-05, "loss": 0.1417, "step": 2160 }, { "epoch": 1.9935424354243543, "grad_norm": 0.3872381312100595, "learning_rate": 1.5302873099680377e-05, "loss": 0.1533, "step": 2161 }, { "epoch": 1.9944649446494465, "grad_norm": 0.34813821973923315, "learning_rate": 1.5278138175352353e-05, "loss": 0.1609, "step": 2162 }, { "epoch": 1.9953874538745389, "grad_norm": 0.340179721949598, "learning_rate": 1.52534144583116e-05, "loss": 0.1327, "step": 2163 }, { "epoch": 1.996309963099631, "grad_norm": 0.4044504632167899, "learning_rate": 1.5228701977059428e-05, "loss": 0.1735, "step": 2164 }, { "epoch": 1.9972324723247232, "grad_norm": 0.3516746142082308, "learning_rate": 1.5204000760084206e-05, "loss": 0.1633, "step": 2165 }, { "epoch": 1.9981549815498156, "grad_norm": 0.35824635814324857, "learning_rate": 1.5179310835861299e-05, "loss": 0.1534, "step": 2166 }, { "epoch": 1.9990774907749076, "grad_norm": 0.3345091202124253, "learning_rate": 1.5154632232853055e-05, "loss": 0.148, "step": 2167 }, { "epoch": 2.0, "grad_norm": 0.30797574850596654, "learning_rate": 1.5129964979508792e-05, "loss": 0.1199, "step": 2168 }, { "epoch": 2.0009225092250924, "grad_norm": 0.2976360537697981, "learning_rate": 1.5105309104264725e-05, "loss": 0.0703, "step": 2169 }, { "epoch": 2.0018450184501844, "grad_norm": 0.3198873467806432, "learning_rate": 1.5080664635543934e-05, "loss": 0.0854, "step": 2170 }, { "epoch": 2.0027675276752768, "grad_norm": 0.31263105502287075, "learning_rate": 1.5056031601756405e-05, "loss": 0.069, "step": 2171 }, { "epoch": 2.003690036900369, "grad_norm": 0.3083167766286197, "learning_rate": 1.5031410031298898e-05, "loss": 0.0798, "step": 2172 }, { "epoch": 2.004612546125461, "grad_norm": 0.3087797367319195, "learning_rate": 1.5006799952554954e-05, "loss": 0.0674, "step": 2173 }, { "epoch": 2.0055350553505535, "grad_norm": 0.38139646201681016, "learning_rate": 1.4982201393894906e-05, "loss": 0.0761, "step": 2174 }, { "epoch": 2.006457564575646, "grad_norm": 0.46131637088146976, "learning_rate": 1.495761438367577e-05, "loss": 0.0772, "step": 2175 }, { "epoch": 2.007380073800738, "grad_norm": 0.46230756809420753, "learning_rate": 1.4933038950241252e-05, "loss": 0.0656, "step": 2176 }, { "epoch": 2.0083025830258303, "grad_norm": 0.45122782926874755, "learning_rate": 1.4908475121921744e-05, "loss": 0.0676, "step": 2177 }, { "epoch": 2.0092250922509227, "grad_norm": 0.41925272184214457, "learning_rate": 1.4883922927034222e-05, "loss": 0.0631, "step": 2178 }, { "epoch": 2.0101476014760147, "grad_norm": 0.3936049787261394, "learning_rate": 1.485938239388227e-05, "loss": 0.062, "step": 2179 }, { "epoch": 2.011070110701107, "grad_norm": 0.44263563462972005, "learning_rate": 1.4834853550756029e-05, "loss": 0.0793, "step": 2180 }, { "epoch": 2.011992619926199, "grad_norm": 0.4146007752592815, "learning_rate": 1.4810336425932155e-05, "loss": 0.0707, "step": 2181 }, { "epoch": 2.0129151291512914, "grad_norm": 0.3902394436660754, "learning_rate": 1.4785831047673799e-05, "loss": 0.0715, "step": 2182 }, { "epoch": 2.013837638376384, "grad_norm": 0.4844314150823258, "learning_rate": 1.4761337444230583e-05, "loss": 0.0639, "step": 2183 }, { "epoch": 2.014760147601476, "grad_norm": 0.3584547751069646, "learning_rate": 1.4736855643838532e-05, "loss": 0.0646, "step": 2184 }, { "epoch": 2.015682656826568, "grad_norm": 0.38296790522159585, "learning_rate": 1.471238567472008e-05, "loss": 0.0754, "step": 2185 }, { "epoch": 2.0166051660516606, "grad_norm": 0.3735311076171752, "learning_rate": 1.4687927565084022e-05, "loss": 0.074, "step": 2186 }, { "epoch": 2.0175276752767526, "grad_norm": 0.3751760443336041, "learning_rate": 1.4663481343125477e-05, "loss": 0.0742, "step": 2187 }, { "epoch": 2.018450184501845, "grad_norm": 0.31588170716651065, "learning_rate": 1.4639047037025855e-05, "loss": 0.0583, "step": 2188 }, { "epoch": 2.0193726937269374, "grad_norm": 0.3515839610072372, "learning_rate": 1.4614624674952842e-05, "loss": 0.0618, "step": 2189 }, { "epoch": 2.0202952029520294, "grad_norm": 0.3861408374210167, "learning_rate": 1.4590214285060349e-05, "loss": 0.0732, "step": 2190 }, { "epoch": 2.0212177121771218, "grad_norm": 0.3927048337607836, "learning_rate": 1.4565815895488476e-05, "loss": 0.0608, "step": 2191 }, { "epoch": 2.022140221402214, "grad_norm": 0.3531098164929209, "learning_rate": 1.4541429534363515e-05, "loss": 0.0579, "step": 2192 }, { "epoch": 2.023062730627306, "grad_norm": 0.45167105898061277, "learning_rate": 1.4517055229797857e-05, "loss": 0.0751, "step": 2193 }, { "epoch": 2.0239852398523985, "grad_norm": 0.4627100326669043, "learning_rate": 1.4492693009890018e-05, "loss": 0.0748, "step": 2194 }, { "epoch": 2.024907749077491, "grad_norm": 0.46979209382263687, "learning_rate": 1.4468342902724591e-05, "loss": 0.0828, "step": 2195 }, { "epoch": 2.025830258302583, "grad_norm": 0.44744051567035725, "learning_rate": 1.4444004936372165e-05, "loss": 0.0684, "step": 2196 }, { "epoch": 2.0267527675276753, "grad_norm": 0.3403332049495621, "learning_rate": 1.4419679138889378e-05, "loss": 0.0589, "step": 2197 }, { "epoch": 2.0276752767527677, "grad_norm": 0.41763785693758054, "learning_rate": 1.4395365538318829e-05, "loss": 0.0631, "step": 2198 }, { "epoch": 2.0285977859778597, "grad_norm": 0.39019638549907776, "learning_rate": 1.4371064162689024e-05, "loss": 0.0671, "step": 2199 }, { "epoch": 2.029520295202952, "grad_norm": 0.3860442514405348, "learning_rate": 1.4346775040014415e-05, "loss": 0.0669, "step": 2200 }, { "epoch": 2.0304428044280445, "grad_norm": 0.3344238710082706, "learning_rate": 1.4322498198295327e-05, "loss": 0.05, "step": 2201 }, { "epoch": 2.0313653136531364, "grad_norm": 0.40033249044017183, "learning_rate": 1.4298233665517896e-05, "loss": 0.0756, "step": 2202 }, { "epoch": 2.032287822878229, "grad_norm": 0.39955797763735873, "learning_rate": 1.4273981469654093e-05, "loss": 0.0713, "step": 2203 }, { "epoch": 2.0332103321033212, "grad_norm": 0.35323675199493065, "learning_rate": 1.4249741638661679e-05, "loss": 0.0625, "step": 2204 }, { "epoch": 2.034132841328413, "grad_norm": 0.36589217113532685, "learning_rate": 1.4225514200484116e-05, "loss": 0.065, "step": 2205 }, { "epoch": 2.0350553505535056, "grad_norm": 0.30664598609653493, "learning_rate": 1.4201299183050626e-05, "loss": 0.0575, "step": 2206 }, { "epoch": 2.0359778597785976, "grad_norm": 0.386599914617671, "learning_rate": 1.4177096614276097e-05, "loss": 0.0777, "step": 2207 }, { "epoch": 2.03690036900369, "grad_norm": 0.36339789042022713, "learning_rate": 1.4152906522061048e-05, "loss": 0.06, "step": 2208 }, { "epoch": 2.0378228782287824, "grad_norm": 0.3961162554717477, "learning_rate": 1.4128728934291641e-05, "loss": 0.0673, "step": 2209 }, { "epoch": 2.0387453874538743, "grad_norm": 0.4122745090428468, "learning_rate": 1.4104563878839621e-05, "loss": 0.0684, "step": 2210 }, { "epoch": 2.0396678966789668, "grad_norm": 0.44028661517890205, "learning_rate": 1.4080411383562258e-05, "loss": 0.0746, "step": 2211 }, { "epoch": 2.040590405904059, "grad_norm": 0.3942892239056397, "learning_rate": 1.4056271476302368e-05, "loss": 0.0628, "step": 2212 }, { "epoch": 2.041512915129151, "grad_norm": 0.3712665887448281, "learning_rate": 1.4032144184888269e-05, "loss": 0.0581, "step": 2213 }, { "epoch": 2.0424354243542435, "grad_norm": 0.43744992110441555, "learning_rate": 1.4008029537133685e-05, "loss": 0.0627, "step": 2214 }, { "epoch": 2.043357933579336, "grad_norm": 0.4857776711592854, "learning_rate": 1.3983927560837815e-05, "loss": 0.0705, "step": 2215 }, { "epoch": 2.044280442804428, "grad_norm": 0.37197542467309835, "learning_rate": 1.3959838283785237e-05, "loss": 0.0745, "step": 2216 }, { "epoch": 2.0452029520295203, "grad_norm": 0.39177509656673304, "learning_rate": 1.3935761733745865e-05, "loss": 0.0637, "step": 2217 }, { "epoch": 2.0461254612546127, "grad_norm": 0.4065437517107225, "learning_rate": 1.3911697938474966e-05, "loss": 0.0579, "step": 2218 }, { "epoch": 2.0470479704797047, "grad_norm": 0.4002320100803212, "learning_rate": 1.3887646925713116e-05, "loss": 0.0754, "step": 2219 }, { "epoch": 2.047970479704797, "grad_norm": 0.3549424979834139, "learning_rate": 1.3863608723186108e-05, "loss": 0.0707, "step": 2220 }, { "epoch": 2.0488929889298895, "grad_norm": 0.3870244238874869, "learning_rate": 1.3839583358605012e-05, "loss": 0.0711, "step": 2221 }, { "epoch": 2.0498154981549814, "grad_norm": 0.4444638854748495, "learning_rate": 1.3815570859666091e-05, "loss": 0.0688, "step": 2222 }, { "epoch": 2.050738007380074, "grad_norm": 0.40841543971334465, "learning_rate": 1.3791571254050747e-05, "loss": 0.0847, "step": 2223 }, { "epoch": 2.0516605166051662, "grad_norm": 0.34017711339234313, "learning_rate": 1.3767584569425562e-05, "loss": 0.0584, "step": 2224 }, { "epoch": 2.052583025830258, "grad_norm": 0.40919307099781965, "learning_rate": 1.3743610833442182e-05, "loss": 0.0757, "step": 2225 }, { "epoch": 2.0535055350553506, "grad_norm": 0.35418905688832636, "learning_rate": 1.3719650073737352e-05, "loss": 0.0686, "step": 2226 }, { "epoch": 2.054428044280443, "grad_norm": 0.3812889972286595, "learning_rate": 1.3695702317932862e-05, "loss": 0.0751, "step": 2227 }, { "epoch": 2.055350553505535, "grad_norm": 0.36828127241310443, "learning_rate": 1.3671767593635482e-05, "loss": 0.0585, "step": 2228 }, { "epoch": 2.0562730627306274, "grad_norm": 0.4465144692445298, "learning_rate": 1.3647845928436986e-05, "loss": 0.0746, "step": 2229 }, { "epoch": 2.0571955719557193, "grad_norm": 0.30379954883920934, "learning_rate": 1.3623937349914093e-05, "loss": 0.0556, "step": 2230 }, { "epoch": 2.0581180811808117, "grad_norm": 0.41901500267106806, "learning_rate": 1.3600041885628409e-05, "loss": 0.0692, "step": 2231 }, { "epoch": 2.059040590405904, "grad_norm": 0.41402687661233484, "learning_rate": 1.357615956312645e-05, "loss": 0.072, "step": 2232 }, { "epoch": 2.059963099630996, "grad_norm": 0.40045954654873867, "learning_rate": 1.355229040993959e-05, "loss": 0.0672, "step": 2233 }, { "epoch": 2.0608856088560885, "grad_norm": 0.4218320383494676, "learning_rate": 1.3528434453583972e-05, "loss": 0.0634, "step": 2234 }, { "epoch": 2.061808118081181, "grad_norm": 0.3874555064614107, "learning_rate": 1.3504591721560578e-05, "loss": 0.0682, "step": 2235 }, { "epoch": 2.062730627306273, "grad_norm": 0.4112000719178533, "learning_rate": 1.3480762241355132e-05, "loss": 0.0684, "step": 2236 }, { "epoch": 2.0636531365313653, "grad_norm": 0.3738712320639851, "learning_rate": 1.3456946040438057e-05, "loss": 0.0625, "step": 2237 }, { "epoch": 2.0645756457564577, "grad_norm": 0.43171012828868377, "learning_rate": 1.3433143146264493e-05, "loss": 0.0701, "step": 2238 }, { "epoch": 2.0654981549815496, "grad_norm": 0.3524502423148047, "learning_rate": 1.3409353586274243e-05, "loss": 0.0634, "step": 2239 }, { "epoch": 2.066420664206642, "grad_norm": 0.37612818023093847, "learning_rate": 1.338557738789171e-05, "loss": 0.0706, "step": 2240 }, { "epoch": 2.0673431734317345, "grad_norm": 0.41251392187694436, "learning_rate": 1.3361814578525922e-05, "loss": 0.0753, "step": 2241 }, { "epoch": 2.0682656826568264, "grad_norm": 0.339735749819185, "learning_rate": 1.333806518557047e-05, "loss": 0.0619, "step": 2242 }, { "epoch": 2.069188191881919, "grad_norm": 0.41289076151051984, "learning_rate": 1.331432923640345e-05, "loss": 0.0723, "step": 2243 }, { "epoch": 2.0701107011070112, "grad_norm": 0.3387208444780958, "learning_rate": 1.3290606758387498e-05, "loss": 0.0579, "step": 2244 }, { "epoch": 2.071033210332103, "grad_norm": 0.36216492165089403, "learning_rate": 1.3266897778869702e-05, "loss": 0.0596, "step": 2245 }, { "epoch": 2.0719557195571956, "grad_norm": 0.3870132395876351, "learning_rate": 1.324320232518158e-05, "loss": 0.0674, "step": 2246 }, { "epoch": 2.072878228782288, "grad_norm": 0.35777903887924795, "learning_rate": 1.3219520424639076e-05, "loss": 0.0609, "step": 2247 }, { "epoch": 2.07380073800738, "grad_norm": 0.3974356851698215, "learning_rate": 1.3195852104542511e-05, "loss": 0.0638, "step": 2248 }, { "epoch": 2.0747232472324724, "grad_norm": 0.4139265534374722, "learning_rate": 1.3172197392176525e-05, "loss": 0.0675, "step": 2249 }, { "epoch": 2.0756457564575648, "grad_norm": 0.4060327318740307, "learning_rate": 1.3148556314810092e-05, "loss": 0.0615, "step": 2250 }, { "epoch": 2.0765682656826567, "grad_norm": 0.41270482151611887, "learning_rate": 1.3124928899696476e-05, "loss": 0.0618, "step": 2251 }, { "epoch": 2.077490774907749, "grad_norm": 0.40572409594054104, "learning_rate": 1.3101315174073162e-05, "loss": 0.0667, "step": 2252 }, { "epoch": 2.0784132841328415, "grad_norm": 0.3527089118513685, "learning_rate": 1.3077715165161878e-05, "loss": 0.0613, "step": 2253 }, { "epoch": 2.0793357933579335, "grad_norm": 0.3815807082735544, "learning_rate": 1.3054128900168538e-05, "loss": 0.066, "step": 2254 }, { "epoch": 2.080258302583026, "grad_norm": 0.3601800142960928, "learning_rate": 1.3030556406283195e-05, "loss": 0.0697, "step": 2255 }, { "epoch": 2.081180811808118, "grad_norm": 0.3585956071637972, "learning_rate": 1.3006997710680041e-05, "loss": 0.0663, "step": 2256 }, { "epoch": 2.0821033210332103, "grad_norm": 0.36790788748310854, "learning_rate": 1.298345284051737e-05, "loss": 0.0659, "step": 2257 }, { "epoch": 2.0830258302583027, "grad_norm": 0.406472928802804, "learning_rate": 1.295992182293751e-05, "loss": 0.0612, "step": 2258 }, { "epoch": 2.0839483394833946, "grad_norm": 0.4229844842336367, "learning_rate": 1.2936404685066852e-05, "loss": 0.0758, "step": 2259 }, { "epoch": 2.084870848708487, "grad_norm": 0.4137305147479194, "learning_rate": 1.2912901454015752e-05, "loss": 0.0695, "step": 2260 }, { "epoch": 2.0857933579335795, "grad_norm": 0.3884366301219784, "learning_rate": 1.2889412156878566e-05, "loss": 0.0639, "step": 2261 }, { "epoch": 2.0867158671586714, "grad_norm": 0.37571829415748415, "learning_rate": 1.2865936820733582e-05, "loss": 0.0704, "step": 2262 }, { "epoch": 2.087638376383764, "grad_norm": 0.4365705947838686, "learning_rate": 1.2842475472642968e-05, "loss": 0.0718, "step": 2263 }, { "epoch": 2.088560885608856, "grad_norm": 0.3621254157807818, "learning_rate": 1.2819028139652794e-05, "loss": 0.0768, "step": 2264 }, { "epoch": 2.089483394833948, "grad_norm": 0.3571045514206385, "learning_rate": 1.2795594848792975e-05, "loss": 0.0663, "step": 2265 }, { "epoch": 2.0904059040590406, "grad_norm": 0.382496586818103, "learning_rate": 1.2772175627077205e-05, "loss": 0.0618, "step": 2266 }, { "epoch": 2.091328413284133, "grad_norm": 0.35917726450691434, "learning_rate": 1.2748770501502994e-05, "loss": 0.0616, "step": 2267 }, { "epoch": 2.092250922509225, "grad_norm": 0.35707437854815516, "learning_rate": 1.2725379499051603e-05, "loss": 0.0595, "step": 2268 }, { "epoch": 2.0931734317343174, "grad_norm": 0.36622075267297094, "learning_rate": 1.2702002646687976e-05, "loss": 0.054, "step": 2269 }, { "epoch": 2.0940959409594098, "grad_norm": 0.37514244833794425, "learning_rate": 1.2678639971360778e-05, "loss": 0.06, "step": 2270 }, { "epoch": 2.0950184501845017, "grad_norm": 0.41975878863510097, "learning_rate": 1.265529150000233e-05, "loss": 0.0651, "step": 2271 }, { "epoch": 2.095940959409594, "grad_norm": 0.42133748723146647, "learning_rate": 1.2631957259528553e-05, "loss": 0.0629, "step": 2272 }, { "epoch": 2.0968634686346865, "grad_norm": 0.39875129353097116, "learning_rate": 1.2608637276838986e-05, "loss": 0.0725, "step": 2273 }, { "epoch": 2.0977859778597785, "grad_norm": 0.40771910402112077, "learning_rate": 1.2585331578816738e-05, "loss": 0.0658, "step": 2274 }, { "epoch": 2.098708487084871, "grad_norm": 0.34925279844290885, "learning_rate": 1.2562040192328414e-05, "loss": 0.0608, "step": 2275 }, { "epoch": 2.0996309963099633, "grad_norm": 0.36084418302418064, "learning_rate": 1.2538763144224157e-05, "loss": 0.0624, "step": 2276 }, { "epoch": 2.1005535055350553, "grad_norm": 0.3621477738896384, "learning_rate": 1.2515500461337581e-05, "loss": 0.0622, "step": 2277 }, { "epoch": 2.1014760147601477, "grad_norm": 0.4166217819345527, "learning_rate": 1.2492252170485702e-05, "loss": 0.0609, "step": 2278 }, { "epoch": 2.10239852398524, "grad_norm": 0.3963129875773945, "learning_rate": 1.2469018298468982e-05, "loss": 0.0735, "step": 2279 }, { "epoch": 2.103321033210332, "grad_norm": 0.40031229458504575, "learning_rate": 1.244579887207126e-05, "loss": 0.0643, "step": 2280 }, { "epoch": 2.1042435424354244, "grad_norm": 0.38710135688527125, "learning_rate": 1.2422593918059702e-05, "loss": 0.061, "step": 2281 }, { "epoch": 2.1051660516605164, "grad_norm": 0.42932007526489624, "learning_rate": 1.239940346318478e-05, "loss": 0.065, "step": 2282 }, { "epoch": 2.106088560885609, "grad_norm": 0.3549475954255283, "learning_rate": 1.2376227534180309e-05, "loss": 0.0691, "step": 2283 }, { "epoch": 2.107011070110701, "grad_norm": 0.36893883002542316, "learning_rate": 1.2353066157763304e-05, "loss": 0.0647, "step": 2284 }, { "epoch": 2.107933579335793, "grad_norm": 0.38400940343698575, "learning_rate": 1.2329919360634002e-05, "loss": 0.0772, "step": 2285 }, { "epoch": 2.1088560885608856, "grad_norm": 0.41039114987252323, "learning_rate": 1.2306787169475887e-05, "loss": 0.0864, "step": 2286 }, { "epoch": 2.109778597785978, "grad_norm": 0.37516276125973114, "learning_rate": 1.2283669610955542e-05, "loss": 0.0686, "step": 2287 }, { "epoch": 2.11070110701107, "grad_norm": 0.4340295903624818, "learning_rate": 1.2260566711722723e-05, "loss": 0.0649, "step": 2288 }, { "epoch": 2.1116236162361623, "grad_norm": 0.32110411537845807, "learning_rate": 1.2237478498410282e-05, "loss": 0.0527, "step": 2289 }, { "epoch": 2.1125461254612548, "grad_norm": 0.3726146451240176, "learning_rate": 1.2214404997634117e-05, "loss": 0.0668, "step": 2290 }, { "epoch": 2.1134686346863467, "grad_norm": 0.39411252860068463, "learning_rate": 1.2191346235993185e-05, "loss": 0.0611, "step": 2291 }, { "epoch": 2.114391143911439, "grad_norm": 0.4210251988531999, "learning_rate": 1.216830224006946e-05, "loss": 0.0628, "step": 2292 }, { "epoch": 2.1153136531365315, "grad_norm": 0.3555758988732575, "learning_rate": 1.2145273036427865e-05, "loss": 0.0596, "step": 2293 }, { "epoch": 2.1162361623616235, "grad_norm": 0.4227876412490418, "learning_rate": 1.2122258651616306e-05, "loss": 0.06, "step": 2294 }, { "epoch": 2.117158671586716, "grad_norm": 0.42060758733162873, "learning_rate": 1.209925911216557e-05, "loss": 0.0717, "step": 2295 }, { "epoch": 2.1180811808118083, "grad_norm": 0.47720257761139323, "learning_rate": 1.2076274444589361e-05, "loss": 0.0752, "step": 2296 }, { "epoch": 2.1190036900369003, "grad_norm": 0.3681611428166761, "learning_rate": 1.205330467538423e-05, "loss": 0.0678, "step": 2297 }, { "epoch": 2.1199261992619927, "grad_norm": 0.4372909250762327, "learning_rate": 1.2030349831029537e-05, "loss": 0.0623, "step": 2298 }, { "epoch": 2.120848708487085, "grad_norm": 0.3553890499364409, "learning_rate": 1.2007409937987451e-05, "loss": 0.0642, "step": 2299 }, { "epoch": 2.121771217712177, "grad_norm": 0.38781139433456296, "learning_rate": 1.1984485022702918e-05, "loss": 0.0648, "step": 2300 }, { "epoch": 2.1226937269372694, "grad_norm": 0.3891831431988466, "learning_rate": 1.1961575111603588e-05, "loss": 0.0643, "step": 2301 }, { "epoch": 2.123616236162362, "grad_norm": 0.4103801961617336, "learning_rate": 1.1938680231099833e-05, "loss": 0.0624, "step": 2302 }, { "epoch": 2.124538745387454, "grad_norm": 0.3456931795144724, "learning_rate": 1.1915800407584704e-05, "loss": 0.0583, "step": 2303 }, { "epoch": 2.125461254612546, "grad_norm": 0.4040172263992759, "learning_rate": 1.1892935667433871e-05, "loss": 0.0625, "step": 2304 }, { "epoch": 2.126383763837638, "grad_norm": 0.390510944677046, "learning_rate": 1.1870086037005635e-05, "loss": 0.0627, "step": 2305 }, { "epoch": 2.1273062730627306, "grad_norm": 0.42092084980087036, "learning_rate": 1.1847251542640885e-05, "loss": 0.0758, "step": 2306 }, { "epoch": 2.128228782287823, "grad_norm": 0.3625245206687934, "learning_rate": 1.182443221066303e-05, "loss": 0.0628, "step": 2307 }, { "epoch": 2.129151291512915, "grad_norm": 0.3543920209441862, "learning_rate": 1.1801628067378031e-05, "loss": 0.0658, "step": 2308 }, { "epoch": 2.1300738007380073, "grad_norm": 0.3692478597606682, "learning_rate": 1.1778839139074338e-05, "loss": 0.0661, "step": 2309 }, { "epoch": 2.1309963099630997, "grad_norm": 0.3532929371171087, "learning_rate": 1.175606545202283e-05, "loss": 0.0656, "step": 2310 }, { "epoch": 2.1319188191881917, "grad_norm": 0.39556690096193775, "learning_rate": 1.1733307032476848e-05, "loss": 0.0662, "step": 2311 }, { "epoch": 2.132841328413284, "grad_norm": 0.398480730251781, "learning_rate": 1.1710563906672134e-05, "loss": 0.0585, "step": 2312 }, { "epoch": 2.1337638376383765, "grad_norm": 0.34506871906767433, "learning_rate": 1.1687836100826765e-05, "loss": 0.0534, "step": 2313 }, { "epoch": 2.1346863468634685, "grad_norm": 0.3339671541671619, "learning_rate": 1.1665123641141194e-05, "loss": 0.0541, "step": 2314 }, { "epoch": 2.135608856088561, "grad_norm": 0.37804976582248867, "learning_rate": 1.1642426553798174e-05, "loss": 0.0575, "step": 2315 }, { "epoch": 2.1365313653136533, "grad_norm": 0.3580525123782122, "learning_rate": 1.1619744864962727e-05, "loss": 0.0738, "step": 2316 }, { "epoch": 2.1374538745387452, "grad_norm": 0.3593492691535271, "learning_rate": 1.159707860078211e-05, "loss": 0.0694, "step": 2317 }, { "epoch": 2.1383763837638377, "grad_norm": 0.38956493884705623, "learning_rate": 1.1574427787385852e-05, "loss": 0.0661, "step": 2318 }, { "epoch": 2.13929889298893, "grad_norm": 0.3601261188299134, "learning_rate": 1.1551792450885617e-05, "loss": 0.0593, "step": 2319 }, { "epoch": 2.140221402214022, "grad_norm": 0.3725932028380041, "learning_rate": 1.1529172617375234e-05, "loss": 0.0646, "step": 2320 }, { "epoch": 2.1411439114391144, "grad_norm": 0.5115553281623878, "learning_rate": 1.1506568312930698e-05, "loss": 0.0672, "step": 2321 }, { "epoch": 2.142066420664207, "grad_norm": 0.413571928782127, "learning_rate": 1.148397956361007e-05, "loss": 0.059, "step": 2322 }, { "epoch": 2.142988929889299, "grad_norm": 0.4884203786739079, "learning_rate": 1.1461406395453459e-05, "loss": 0.0786, "step": 2323 }, { "epoch": 2.143911439114391, "grad_norm": 0.3768262843453861, "learning_rate": 1.1438848834483081e-05, "loss": 0.0557, "step": 2324 }, { "epoch": 2.1448339483394836, "grad_norm": 0.39452290042589616, "learning_rate": 1.1416306906703097e-05, "loss": 0.0621, "step": 2325 }, { "epoch": 2.1457564575645756, "grad_norm": 0.40143554816102517, "learning_rate": 1.139378063809966e-05, "loss": 0.0622, "step": 2326 }, { "epoch": 2.146678966789668, "grad_norm": 0.40885176319407773, "learning_rate": 1.1371270054640884e-05, "loss": 0.0607, "step": 2327 }, { "epoch": 2.14760147601476, "grad_norm": 0.39687419030518506, "learning_rate": 1.1348775182276802e-05, "loss": 0.0624, "step": 2328 }, { "epoch": 2.1485239852398523, "grad_norm": 0.34413609071406964, "learning_rate": 1.1326296046939333e-05, "loss": 0.0598, "step": 2329 }, { "epoch": 2.1494464944649447, "grad_norm": 0.33814121563544586, "learning_rate": 1.1303832674542236e-05, "loss": 0.0562, "step": 2330 }, { "epoch": 2.1503690036900367, "grad_norm": 0.3825110840593142, "learning_rate": 1.1281385090981119e-05, "loss": 0.0619, "step": 2331 }, { "epoch": 2.151291512915129, "grad_norm": 0.3458059332912318, "learning_rate": 1.1258953322133398e-05, "loss": 0.0552, "step": 2332 }, { "epoch": 2.1522140221402215, "grad_norm": 0.383444247383961, "learning_rate": 1.1236537393858216e-05, "loss": 0.0607, "step": 2333 }, { "epoch": 2.1531365313653135, "grad_norm": 0.40269803579481, "learning_rate": 1.12141373319965e-05, "loss": 0.0688, "step": 2334 }, { "epoch": 2.154059040590406, "grad_norm": 0.36849070485549473, "learning_rate": 1.1191753162370871e-05, "loss": 0.063, "step": 2335 }, { "epoch": 2.1549815498154983, "grad_norm": 0.3671164565183918, "learning_rate": 1.1169384910785614e-05, "loss": 0.061, "step": 2336 }, { "epoch": 2.1559040590405902, "grad_norm": 0.38888831346308533, "learning_rate": 1.114703260302668e-05, "loss": 0.0656, "step": 2337 }, { "epoch": 2.1568265682656826, "grad_norm": 0.37897801285010585, "learning_rate": 1.1124696264861654e-05, "loss": 0.0663, "step": 2338 }, { "epoch": 2.157749077490775, "grad_norm": 0.3468797017958368, "learning_rate": 1.1102375922039665e-05, "loss": 0.0701, "step": 2339 }, { "epoch": 2.158671586715867, "grad_norm": 0.37549681661540163, "learning_rate": 1.1080071600291453e-05, "loss": 0.0689, "step": 2340 }, { "epoch": 2.1595940959409594, "grad_norm": 0.3838873479425886, "learning_rate": 1.1057783325329268e-05, "loss": 0.0571, "step": 2341 }, { "epoch": 2.160516605166052, "grad_norm": 0.4474304276936308, "learning_rate": 1.1035511122846848e-05, "loss": 0.0697, "step": 2342 }, { "epoch": 2.161439114391144, "grad_norm": 0.4840220730012607, "learning_rate": 1.1013255018519425e-05, "loss": 0.073, "step": 2343 }, { "epoch": 2.162361623616236, "grad_norm": 0.3578996455173482, "learning_rate": 1.099101503800367e-05, "loss": 0.0613, "step": 2344 }, { "epoch": 2.1632841328413286, "grad_norm": 0.39934092683814854, "learning_rate": 1.0968791206937645e-05, "loss": 0.0656, "step": 2345 }, { "epoch": 2.1642066420664205, "grad_norm": 0.41187406060511095, "learning_rate": 1.094658355094082e-05, "loss": 0.0648, "step": 2346 }, { "epoch": 2.165129151291513, "grad_norm": 0.4091523390014875, "learning_rate": 1.0924392095614019e-05, "loss": 0.0691, "step": 2347 }, { "epoch": 2.1660516605166054, "grad_norm": 0.44492007884459395, "learning_rate": 1.0902216866539363e-05, "loss": 0.0639, "step": 2348 }, { "epoch": 2.1669741697416973, "grad_norm": 0.38470641398628264, "learning_rate": 1.088005788928029e-05, "loss": 0.0617, "step": 2349 }, { "epoch": 2.1678966789667897, "grad_norm": 0.4269943517763368, "learning_rate": 1.0857915189381513e-05, "loss": 0.0654, "step": 2350 }, { "epoch": 2.1688191881918817, "grad_norm": 0.45813278300867916, "learning_rate": 1.083578879236895e-05, "loss": 0.0758, "step": 2351 }, { "epoch": 2.169741697416974, "grad_norm": 0.4183955750091749, "learning_rate": 1.0813678723749725e-05, "loss": 0.0779, "step": 2352 }, { "epoch": 2.1706642066420665, "grad_norm": 0.3627470265115012, "learning_rate": 1.0791585009012196e-05, "loss": 0.0546, "step": 2353 }, { "epoch": 2.171586715867159, "grad_norm": 0.3546274555675706, "learning_rate": 1.07695076736258e-05, "loss": 0.0557, "step": 2354 }, { "epoch": 2.172509225092251, "grad_norm": 0.41356451482436757, "learning_rate": 1.0747446743041107e-05, "loss": 0.0703, "step": 2355 }, { "epoch": 2.1734317343173433, "grad_norm": 0.40438384325635135, "learning_rate": 1.0725402242689823e-05, "loss": 0.0673, "step": 2356 }, { "epoch": 2.1743542435424352, "grad_norm": 0.3952222934802737, "learning_rate": 1.0703374197984653e-05, "loss": 0.0635, "step": 2357 }, { "epoch": 2.1752767527675276, "grad_norm": 0.40404658865059395, "learning_rate": 1.0681362634319347e-05, "loss": 0.0695, "step": 2358 }, { "epoch": 2.17619926199262, "grad_norm": 0.38566575571588047, "learning_rate": 1.0659367577068702e-05, "loss": 0.0626, "step": 2359 }, { "epoch": 2.177121771217712, "grad_norm": 0.38281627800852175, "learning_rate": 1.0637389051588426e-05, "loss": 0.0698, "step": 2360 }, { "epoch": 2.1780442804428044, "grad_norm": 0.38619544092330665, "learning_rate": 1.0615427083215187e-05, "loss": 0.0597, "step": 2361 }, { "epoch": 2.178966789667897, "grad_norm": 0.40890299425675625, "learning_rate": 1.0593481697266583e-05, "loss": 0.0713, "step": 2362 }, { "epoch": 2.1798892988929888, "grad_norm": 0.37383695344433404, "learning_rate": 1.0571552919041094e-05, "loss": 0.0629, "step": 2363 }, { "epoch": 2.180811808118081, "grad_norm": 0.4037774375780884, "learning_rate": 1.0549640773818029e-05, "loss": 0.0784, "step": 2364 }, { "epoch": 2.1817343173431736, "grad_norm": 0.37167875891239616, "learning_rate": 1.0527745286857549e-05, "loss": 0.0626, "step": 2365 }, { "epoch": 2.1826568265682655, "grad_norm": 0.4258021500571581, "learning_rate": 1.050586648340061e-05, "loss": 0.0644, "step": 2366 }, { "epoch": 2.183579335793358, "grad_norm": 0.399482560989266, "learning_rate": 1.0484004388668909e-05, "loss": 0.0684, "step": 2367 }, { "epoch": 2.1845018450184504, "grad_norm": 0.35748327143410524, "learning_rate": 1.046215902786491e-05, "loss": 0.0579, "step": 2368 }, { "epoch": 2.1854243542435423, "grad_norm": 0.3796983469724346, "learning_rate": 1.0440330426171786e-05, "loss": 0.0615, "step": 2369 }, { "epoch": 2.1863468634686347, "grad_norm": 0.41091979832216075, "learning_rate": 1.0418518608753361e-05, "loss": 0.0585, "step": 2370 }, { "epoch": 2.187269372693727, "grad_norm": 0.4071090560604256, "learning_rate": 1.0396723600754143e-05, "loss": 0.0699, "step": 2371 }, { "epoch": 2.188191881918819, "grad_norm": 0.4126300802092573, "learning_rate": 1.0374945427299242e-05, "loss": 0.0679, "step": 2372 }, { "epoch": 2.1891143911439115, "grad_norm": 0.37866397424997933, "learning_rate": 1.0353184113494386e-05, "loss": 0.0575, "step": 2373 }, { "epoch": 2.190036900369004, "grad_norm": 0.44172252715452986, "learning_rate": 1.0331439684425822e-05, "loss": 0.0646, "step": 2374 }, { "epoch": 2.190959409594096, "grad_norm": 0.33456826651019794, "learning_rate": 1.0309712165160376e-05, "loss": 0.0574, "step": 2375 }, { "epoch": 2.1918819188191883, "grad_norm": 0.4184823796794353, "learning_rate": 1.0288001580745372e-05, "loss": 0.0627, "step": 2376 }, { "epoch": 2.1928044280442807, "grad_norm": 0.3716937237706671, "learning_rate": 1.0266307956208585e-05, "loss": 0.0551, "step": 2377 }, { "epoch": 2.1937269372693726, "grad_norm": 0.38175690518378075, "learning_rate": 1.0244631316558267e-05, "loss": 0.0629, "step": 2378 }, { "epoch": 2.194649446494465, "grad_norm": 0.4593206627948996, "learning_rate": 1.022297168678309e-05, "loss": 0.0565, "step": 2379 }, { "epoch": 2.195571955719557, "grad_norm": 0.38855685156802766, "learning_rate": 1.0201329091852091e-05, "loss": 0.0648, "step": 2380 }, { "epoch": 2.1964944649446494, "grad_norm": 0.38557600892648936, "learning_rate": 1.0179703556714693e-05, "loss": 0.0656, "step": 2381 }, { "epoch": 2.197416974169742, "grad_norm": 0.35698436477503104, "learning_rate": 1.0158095106300658e-05, "loss": 0.0573, "step": 2382 }, { "epoch": 2.1983394833948338, "grad_norm": 0.4095033326518666, "learning_rate": 1.0136503765520023e-05, "loss": 0.0695, "step": 2383 }, { "epoch": 2.199261992619926, "grad_norm": 0.32909955635448535, "learning_rate": 1.0114929559263122e-05, "loss": 0.0627, "step": 2384 }, { "epoch": 2.2001845018450186, "grad_norm": 0.42170322247331393, "learning_rate": 1.0093372512400551e-05, "loss": 0.0729, "step": 2385 }, { "epoch": 2.2011070110701105, "grad_norm": 0.4170031294399854, "learning_rate": 1.0071832649783094e-05, "loss": 0.0648, "step": 2386 }, { "epoch": 2.202029520295203, "grad_norm": 0.42263067841693086, "learning_rate": 1.005030999624172e-05, "loss": 0.0712, "step": 2387 }, { "epoch": 2.2029520295202953, "grad_norm": 0.39586244798219883, "learning_rate": 1.0028804576587613e-05, "loss": 0.0711, "step": 2388 }, { "epoch": 2.2038745387453873, "grad_norm": 0.36370479167557207, "learning_rate": 1.0007316415612039e-05, "loss": 0.0666, "step": 2389 }, { "epoch": 2.2047970479704797, "grad_norm": 0.43220718510201855, "learning_rate": 9.985845538086367e-06, "loss": 0.0632, "step": 2390 }, { "epoch": 2.205719557195572, "grad_norm": 0.42938732546064245, "learning_rate": 9.964391968762091e-06, "loss": 0.0686, "step": 2391 }, { "epoch": 2.206642066420664, "grad_norm": 0.3836159221445964, "learning_rate": 9.942955732370707e-06, "loss": 0.0572, "step": 2392 }, { "epoch": 2.2075645756457565, "grad_norm": 0.4065015429719508, "learning_rate": 9.921536853623719e-06, "loss": 0.0642, "step": 2393 }, { "epoch": 2.208487084870849, "grad_norm": 0.37698155144624346, "learning_rate": 9.900135357212687e-06, "loss": 0.0647, "step": 2394 }, { "epoch": 2.209409594095941, "grad_norm": 0.37442380699492644, "learning_rate": 9.878751267809069e-06, "loss": 0.0628, "step": 2395 }, { "epoch": 2.2103321033210332, "grad_norm": 0.38191813136680935, "learning_rate": 9.857384610064272e-06, "loss": 0.0595, "step": 2396 }, { "epoch": 2.2112546125461257, "grad_norm": 0.3813478518906519, "learning_rate": 9.83603540860962e-06, "loss": 0.0637, "step": 2397 }, { "epoch": 2.2121771217712176, "grad_norm": 0.3727803675915483, "learning_rate": 9.814703688056321e-06, "loss": 0.057, "step": 2398 }, { "epoch": 2.21309963099631, "grad_norm": 0.39278714021478184, "learning_rate": 9.793389472995393e-06, "loss": 0.0615, "step": 2399 }, { "epoch": 2.2140221402214024, "grad_norm": 0.39529353507293297, "learning_rate": 9.772092787997714e-06, "loss": 0.0649, "step": 2400 }, { "epoch": 2.2149446494464944, "grad_norm": 0.4186754277263272, "learning_rate": 9.750813657613944e-06, "loss": 0.0722, "step": 2401 }, { "epoch": 2.215867158671587, "grad_norm": 0.4482267232721739, "learning_rate": 9.729552106374485e-06, "loss": 0.0623, "step": 2402 }, { "epoch": 2.2167896678966788, "grad_norm": 0.5059238984314811, "learning_rate": 9.708308158789494e-06, "loss": 0.0675, "step": 2403 }, { "epoch": 2.217712177121771, "grad_norm": 0.421014149027054, "learning_rate": 9.687081839348841e-06, "loss": 0.0601, "step": 2404 }, { "epoch": 2.2186346863468636, "grad_norm": 0.3930231949813996, "learning_rate": 9.665873172522047e-06, "loss": 0.0697, "step": 2405 }, { "epoch": 2.2195571955719555, "grad_norm": 0.41585105554699614, "learning_rate": 9.644682182758306e-06, "loss": 0.0835, "step": 2406 }, { "epoch": 2.220479704797048, "grad_norm": 0.37615047337233737, "learning_rate": 9.623508894486435e-06, "loss": 0.0619, "step": 2407 }, { "epoch": 2.2214022140221403, "grad_norm": 0.3502035160808852, "learning_rate": 9.602353332114825e-06, "loss": 0.0608, "step": 2408 }, { "epoch": 2.2223247232472323, "grad_norm": 0.408616407033454, "learning_rate": 9.581215520031448e-06, "loss": 0.058, "step": 2409 }, { "epoch": 2.2232472324723247, "grad_norm": 0.4089092435670495, "learning_rate": 9.560095482603823e-06, "loss": 0.0617, "step": 2410 }, { "epoch": 2.224169741697417, "grad_norm": 0.36470344619995576, "learning_rate": 9.538993244178945e-06, "loss": 0.0626, "step": 2411 }, { "epoch": 2.225092250922509, "grad_norm": 0.37057796529587894, "learning_rate": 9.517908829083324e-06, "loss": 0.0677, "step": 2412 }, { "epoch": 2.2260147601476015, "grad_norm": 0.3558886527371803, "learning_rate": 9.496842261622921e-06, "loss": 0.0567, "step": 2413 }, { "epoch": 2.226937269372694, "grad_norm": 0.3742612831864683, "learning_rate": 9.47579356608309e-06, "loss": 0.057, "step": 2414 }, { "epoch": 2.227859778597786, "grad_norm": 0.3952989125863061, "learning_rate": 9.454762766728617e-06, "loss": 0.0715, "step": 2415 }, { "epoch": 2.2287822878228782, "grad_norm": 0.4586153160783086, "learning_rate": 9.433749887803645e-06, "loss": 0.0605, "step": 2416 }, { "epoch": 2.2297047970479706, "grad_norm": 0.3772144695613314, "learning_rate": 9.412754953531663e-06, "loss": 0.0639, "step": 2417 }, { "epoch": 2.2306273062730626, "grad_norm": 0.4140506144692939, "learning_rate": 9.391777988115466e-06, "loss": 0.0645, "step": 2418 }, { "epoch": 2.231549815498155, "grad_norm": 0.3579186048846278, "learning_rate": 9.37081901573712e-06, "loss": 0.0621, "step": 2419 }, { "epoch": 2.2324723247232474, "grad_norm": 0.3934103535754087, "learning_rate": 9.349878060557999e-06, "loss": 0.0609, "step": 2420 }, { "epoch": 2.2333948339483394, "grad_norm": 0.38813103752980976, "learning_rate": 9.328955146718655e-06, "loss": 0.0626, "step": 2421 }, { "epoch": 2.234317343173432, "grad_norm": 0.37357724828486133, "learning_rate": 9.30805029833885e-06, "loss": 0.0647, "step": 2422 }, { "epoch": 2.235239852398524, "grad_norm": 0.36284855719708364, "learning_rate": 9.28716353951756e-06, "loss": 0.0629, "step": 2423 }, { "epoch": 2.236162361623616, "grad_norm": 0.37294511156556737, "learning_rate": 9.26629489433287e-06, "loss": 0.0577, "step": 2424 }, { "epoch": 2.2370848708487086, "grad_norm": 0.3842550911082737, "learning_rate": 9.245444386841966e-06, "loss": 0.0658, "step": 2425 }, { "epoch": 2.2380073800738005, "grad_norm": 0.4017289531736733, "learning_rate": 9.224612041081199e-06, "loss": 0.0692, "step": 2426 }, { "epoch": 2.238929889298893, "grad_norm": 0.36152473426005177, "learning_rate": 9.203797881065906e-06, "loss": 0.0666, "step": 2427 }, { "epoch": 2.2398523985239853, "grad_norm": 0.39368555294441043, "learning_rate": 9.183001930790483e-06, "loss": 0.0615, "step": 2428 }, { "epoch": 2.2407749077490773, "grad_norm": 0.34184825135839725, "learning_rate": 9.16222421422837e-06, "loss": 0.055, "step": 2429 }, { "epoch": 2.2416974169741697, "grad_norm": 0.4035152367587123, "learning_rate": 9.141464755331944e-06, "loss": 0.0654, "step": 2430 }, { "epoch": 2.242619926199262, "grad_norm": 0.3990062896703022, "learning_rate": 9.120723578032536e-06, "loss": 0.0683, "step": 2431 }, { "epoch": 2.243542435424354, "grad_norm": 0.4154132915461665, "learning_rate": 9.10000070624043e-06, "loss": 0.0617, "step": 2432 }, { "epoch": 2.2444649446494465, "grad_norm": 0.3608552225977703, "learning_rate": 9.079296163844794e-06, "loss": 0.0595, "step": 2433 }, { "epoch": 2.245387453874539, "grad_norm": 0.42304298850095023, "learning_rate": 9.058609974713655e-06, "loss": 0.0714, "step": 2434 }, { "epoch": 2.246309963099631, "grad_norm": 0.34194774227206476, "learning_rate": 9.037942162693894e-06, "loss": 0.0608, "step": 2435 }, { "epoch": 2.2472324723247232, "grad_norm": 0.4286068577211741, "learning_rate": 9.01729275161122e-06, "loss": 0.0651, "step": 2436 }, { "epoch": 2.2481549815498156, "grad_norm": 0.3957326714342427, "learning_rate": 8.996661765270092e-06, "loss": 0.0614, "step": 2437 }, { "epoch": 2.2490774907749076, "grad_norm": 0.4157264538009682, "learning_rate": 8.976049227453762e-06, "loss": 0.064, "step": 2438 }, { "epoch": 2.25, "grad_norm": 0.3811542012344819, "learning_rate": 8.955455161924217e-06, "loss": 0.0596, "step": 2439 }, { "epoch": 2.2509225092250924, "grad_norm": 0.43354843989295067, "learning_rate": 8.934879592422113e-06, "loss": 0.0685, "step": 2440 }, { "epoch": 2.2518450184501844, "grad_norm": 0.3880171606411912, "learning_rate": 8.914322542666822e-06, "loss": 0.0634, "step": 2441 }, { "epoch": 2.2527675276752768, "grad_norm": 0.3943942248307995, "learning_rate": 8.893784036356359e-06, "loss": 0.0595, "step": 2442 }, { "epoch": 2.253690036900369, "grad_norm": 0.38605063977501863, "learning_rate": 8.873264097167339e-06, "loss": 0.0578, "step": 2443 }, { "epoch": 2.254612546125461, "grad_norm": 0.3935636105100131, "learning_rate": 8.852762748754994e-06, "loss": 0.0713, "step": 2444 }, { "epoch": 2.2555350553505535, "grad_norm": 0.3663058052845239, "learning_rate": 8.832280014753132e-06, "loss": 0.0564, "step": 2445 }, { "epoch": 2.256457564575646, "grad_norm": 0.38903228868741724, "learning_rate": 8.811815918774077e-06, "loss": 0.0644, "step": 2446 }, { "epoch": 2.257380073800738, "grad_norm": 0.3757239882689567, "learning_rate": 8.791370484408684e-06, "loss": 0.0694, "step": 2447 }, { "epoch": 2.2583025830258303, "grad_norm": 0.4419107192723807, "learning_rate": 8.770943735226303e-06, "loss": 0.0756, "step": 2448 }, { "epoch": 2.2592250922509223, "grad_norm": 0.4040877974243685, "learning_rate": 8.750535694774714e-06, "loss": 0.0703, "step": 2449 }, { "epoch": 2.2601476014760147, "grad_norm": 0.4115440215289224, "learning_rate": 8.730146386580157e-06, "loss": 0.0608, "step": 2450 }, { "epoch": 2.261070110701107, "grad_norm": 0.4137796143232803, "learning_rate": 8.709775834147283e-06, "loss": 0.066, "step": 2451 }, { "epoch": 2.2619926199261995, "grad_norm": 0.35760666076703834, "learning_rate": 8.689424060959082e-06, "loss": 0.0529, "step": 2452 }, { "epoch": 2.2629151291512914, "grad_norm": 0.38983243909343557, "learning_rate": 8.669091090476944e-06, "loss": 0.0738, "step": 2453 }, { "epoch": 2.263837638376384, "grad_norm": 0.3870233680470246, "learning_rate": 8.648776946140544e-06, "loss": 0.0632, "step": 2454 }, { "epoch": 2.264760147601476, "grad_norm": 0.40092473241835, "learning_rate": 8.628481651367876e-06, "loss": 0.0672, "step": 2455 }, { "epoch": 2.265682656826568, "grad_norm": 0.37111043910186203, "learning_rate": 8.608205229555207e-06, "loss": 0.0598, "step": 2456 }, { "epoch": 2.2666051660516606, "grad_norm": 0.36140785385982965, "learning_rate": 8.587947704077018e-06, "loss": 0.0531, "step": 2457 }, { "epoch": 2.2675276752767526, "grad_norm": 0.359218678748364, "learning_rate": 8.567709098286058e-06, "loss": 0.0635, "step": 2458 }, { "epoch": 2.268450184501845, "grad_norm": 0.3563479517059402, "learning_rate": 8.547489435513222e-06, "loss": 0.0585, "step": 2459 }, { "epoch": 2.2693726937269374, "grad_norm": 0.35950359188192726, "learning_rate": 8.527288739067562e-06, "loss": 0.0659, "step": 2460 }, { "epoch": 2.2702952029520294, "grad_norm": 0.36424993799359373, "learning_rate": 8.507107032236322e-06, "loss": 0.0663, "step": 2461 }, { "epoch": 2.2712177121771218, "grad_norm": 0.42770263025619104, "learning_rate": 8.486944338284797e-06, "loss": 0.0637, "step": 2462 }, { "epoch": 2.272140221402214, "grad_norm": 0.406461182290524, "learning_rate": 8.46680068045637e-06, "loss": 0.0657, "step": 2463 }, { "epoch": 2.273062730627306, "grad_norm": 0.3550071200265757, "learning_rate": 8.446676081972526e-06, "loss": 0.0631, "step": 2464 }, { "epoch": 2.2739852398523985, "grad_norm": 0.38647593018164184, "learning_rate": 8.426570566032733e-06, "loss": 0.0636, "step": 2465 }, { "epoch": 2.274907749077491, "grad_norm": 0.40620402765224695, "learning_rate": 8.406484155814465e-06, "loss": 0.0686, "step": 2466 }, { "epoch": 2.275830258302583, "grad_norm": 0.3637540363413214, "learning_rate": 8.386416874473188e-06, "loss": 0.0571, "step": 2467 }, { "epoch": 2.2767527675276753, "grad_norm": 0.40426816905863777, "learning_rate": 8.366368745142316e-06, "loss": 0.0594, "step": 2468 }, { "epoch": 2.2776752767527677, "grad_norm": 0.37627678017473065, "learning_rate": 8.346339790933166e-06, "loss": 0.0577, "step": 2469 }, { "epoch": 2.2785977859778597, "grad_norm": 0.4329990706603838, "learning_rate": 8.326330034934968e-06, "loss": 0.0666, "step": 2470 }, { "epoch": 2.279520295202952, "grad_norm": 0.37292130288204195, "learning_rate": 8.306339500214821e-06, "loss": 0.0549, "step": 2471 }, { "epoch": 2.280442804428044, "grad_norm": 0.3779364943112543, "learning_rate": 8.286368209817644e-06, "loss": 0.0608, "step": 2472 }, { "epoch": 2.2813653136531364, "grad_norm": 0.41512067511116424, "learning_rate": 8.266416186766194e-06, "loss": 0.0632, "step": 2473 }, { "epoch": 2.282287822878229, "grad_norm": 0.397342143265139, "learning_rate": 8.246483454061015e-06, "loss": 0.0595, "step": 2474 }, { "epoch": 2.2832103321033212, "grad_norm": 0.3750701551097111, "learning_rate": 8.226570034680398e-06, "loss": 0.0593, "step": 2475 }, { "epoch": 2.284132841328413, "grad_norm": 0.37482356137981715, "learning_rate": 8.206675951580381e-06, "loss": 0.0536, "step": 2476 }, { "epoch": 2.2850553505535056, "grad_norm": 0.4277169525656212, "learning_rate": 8.186801227694722e-06, "loss": 0.072, "step": 2477 }, { "epoch": 2.2859778597785976, "grad_norm": 0.35039009884090044, "learning_rate": 8.166945885934827e-06, "loss": 0.0557, "step": 2478 }, { "epoch": 2.28690036900369, "grad_norm": 0.37618948434110666, "learning_rate": 8.147109949189793e-06, "loss": 0.0587, "step": 2479 }, { "epoch": 2.2878228782287824, "grad_norm": 0.34429121349112896, "learning_rate": 8.127293440326344e-06, "loss": 0.0524, "step": 2480 }, { "epoch": 2.2887453874538743, "grad_norm": 0.41348640468203074, "learning_rate": 8.107496382188781e-06, "loss": 0.0683, "step": 2481 }, { "epoch": 2.2896678966789668, "grad_norm": 0.35089151485209286, "learning_rate": 8.087718797599006e-06, "loss": 0.0613, "step": 2482 }, { "epoch": 2.290590405904059, "grad_norm": 0.38263238067766436, "learning_rate": 8.067960709356478e-06, "loss": 0.0677, "step": 2483 }, { "epoch": 2.291512915129151, "grad_norm": 0.4303658428177165, "learning_rate": 8.048222140238148e-06, "loss": 0.0801, "step": 2484 }, { "epoch": 2.2924354243542435, "grad_norm": 0.3542670487397755, "learning_rate": 8.028503112998496e-06, "loss": 0.0567, "step": 2485 }, { "epoch": 2.293357933579336, "grad_norm": 0.39400023544371354, "learning_rate": 8.008803650369473e-06, "loss": 0.0605, "step": 2486 }, { "epoch": 2.294280442804428, "grad_norm": 0.3408886467122237, "learning_rate": 7.989123775060453e-06, "loss": 0.0605, "step": 2487 }, { "epoch": 2.2952029520295203, "grad_norm": 0.40111276129887286, "learning_rate": 7.969463509758254e-06, "loss": 0.0661, "step": 2488 }, { "epoch": 2.2961254612546127, "grad_norm": 0.4020667133890386, "learning_rate": 7.949822877127072e-06, "loss": 0.0605, "step": 2489 }, { "epoch": 2.2970479704797047, "grad_norm": 0.34519922888637733, "learning_rate": 7.930201899808475e-06, "loss": 0.0562, "step": 2490 }, { "epoch": 2.297970479704797, "grad_norm": 0.3623006619704617, "learning_rate": 7.910600600421388e-06, "loss": 0.0584, "step": 2491 }, { "epoch": 2.2988929889298895, "grad_norm": 0.36246658853652025, "learning_rate": 7.89101900156202e-06, "loss": 0.062, "step": 2492 }, { "epoch": 2.2998154981549814, "grad_norm": 0.4018204804803805, "learning_rate": 7.871457125803896e-06, "loss": 0.0643, "step": 2493 }, { "epoch": 2.300738007380074, "grad_norm": 0.35305726451785513, "learning_rate": 7.8519149956978e-06, "loss": 0.0672, "step": 2494 }, { "epoch": 2.3016605166051662, "grad_norm": 0.41273237368631244, "learning_rate": 7.83239263377174e-06, "loss": 0.0604, "step": 2495 }, { "epoch": 2.302583025830258, "grad_norm": 0.45513339506555817, "learning_rate": 7.812890062530942e-06, "loss": 0.0664, "step": 2496 }, { "epoch": 2.3035055350553506, "grad_norm": 0.4144182159517466, "learning_rate": 7.793407304457836e-06, "loss": 0.0661, "step": 2497 }, { "epoch": 2.304428044280443, "grad_norm": 0.35440525157499714, "learning_rate": 7.773944382011977e-06, "loss": 0.0687, "step": 2498 }, { "epoch": 2.305350553505535, "grad_norm": 0.397688879723002, "learning_rate": 7.754501317630079e-06, "loss": 0.0635, "step": 2499 }, { "epoch": 2.3062730627306274, "grad_norm": 0.3950733114212165, "learning_rate": 7.735078133725961e-06, "loss": 0.0676, "step": 2500 }, { "epoch": 2.3071955719557193, "grad_norm": 0.3390242865811373, "learning_rate": 7.715674852690511e-06, "loss": 0.0553, "step": 2501 }, { "epoch": 2.3081180811808117, "grad_norm": 0.4163539201466788, "learning_rate": 7.696291496891683e-06, "loss": 0.0631, "step": 2502 }, { "epoch": 2.309040590405904, "grad_norm": 0.40064556557483877, "learning_rate": 7.67692808867447e-06, "loss": 0.072, "step": 2503 }, { "epoch": 2.3099630996309966, "grad_norm": 0.3977443109567292, "learning_rate": 7.657584650360847e-06, "loss": 0.0636, "step": 2504 }, { "epoch": 2.3108856088560885, "grad_norm": 0.3558820806014279, "learning_rate": 7.638261204249784e-06, "loss": 0.0637, "step": 2505 }, { "epoch": 2.311808118081181, "grad_norm": 0.40137296194830435, "learning_rate": 7.618957772617211e-06, "loss": 0.0639, "step": 2506 }, { "epoch": 2.312730627306273, "grad_norm": 0.4098971801635019, "learning_rate": 7.599674377715957e-06, "loss": 0.0693, "step": 2507 }, { "epoch": 2.3136531365313653, "grad_norm": 0.4300474601558293, "learning_rate": 7.580411041775779e-06, "loss": 0.0729, "step": 2508 }, { "epoch": 2.3145756457564577, "grad_norm": 0.3720867720294639, "learning_rate": 7.561167787003312e-06, "loss": 0.0555, "step": 2509 }, { "epoch": 2.3154981549815496, "grad_norm": 0.38623457849609255, "learning_rate": 7.541944635582012e-06, "loss": 0.0562, "step": 2510 }, { "epoch": 2.316420664206642, "grad_norm": 0.4165238986568089, "learning_rate": 7.522741609672193e-06, "loss": 0.057, "step": 2511 }, { "epoch": 2.3173431734317345, "grad_norm": 0.4637665834518347, "learning_rate": 7.503558731410959e-06, "loss": 0.0628, "step": 2512 }, { "epoch": 2.3182656826568264, "grad_norm": 0.39517469580622855, "learning_rate": 7.484396022912168e-06, "loss": 0.0525, "step": 2513 }, { "epoch": 2.319188191881919, "grad_norm": 0.4469403423279296, "learning_rate": 7.465253506266454e-06, "loss": 0.078, "step": 2514 }, { "epoch": 2.3201107011070112, "grad_norm": 0.3546576408290238, "learning_rate": 7.446131203541168e-06, "loss": 0.0545, "step": 2515 }, { "epoch": 2.321033210332103, "grad_norm": 0.35910317163594696, "learning_rate": 7.427029136780333e-06, "loss": 0.059, "step": 2516 }, { "epoch": 2.3219557195571956, "grad_norm": 0.37292966043796955, "learning_rate": 7.40794732800468e-06, "loss": 0.07, "step": 2517 }, { "epoch": 2.322878228782288, "grad_norm": 0.4253684481855254, "learning_rate": 7.388885799211573e-06, "loss": 0.0708, "step": 2518 }, { "epoch": 2.32380073800738, "grad_norm": 0.34809959850854133, "learning_rate": 7.369844572374981e-06, "loss": 0.0588, "step": 2519 }, { "epoch": 2.3247232472324724, "grad_norm": 0.3662135905594016, "learning_rate": 7.350823669445495e-06, "loss": 0.0582, "step": 2520 }, { "epoch": 2.3256457564575648, "grad_norm": 0.3640604087471566, "learning_rate": 7.3318231123502666e-06, "loss": 0.0633, "step": 2521 }, { "epoch": 2.3265682656826567, "grad_norm": 0.40467998469740746, "learning_rate": 7.312842922992977e-06, "loss": 0.0715, "step": 2522 }, { "epoch": 2.327490774907749, "grad_norm": 0.36356399996561506, "learning_rate": 7.293883123253861e-06, "loss": 0.0536, "step": 2523 }, { "epoch": 2.328413284132841, "grad_norm": 0.42665396685165385, "learning_rate": 7.2749437349896115e-06, "loss": 0.0629, "step": 2524 }, { "epoch": 2.3293357933579335, "grad_norm": 0.40459888743575806, "learning_rate": 7.256024780033418e-06, "loss": 0.0791, "step": 2525 }, { "epoch": 2.330258302583026, "grad_norm": 0.4017198772300197, "learning_rate": 7.237126280194914e-06, "loss": 0.0579, "step": 2526 }, { "epoch": 2.3311808118081183, "grad_norm": 0.4056286111976866, "learning_rate": 7.218248257260127e-06, "loss": 0.0623, "step": 2527 }, { "epoch": 2.3321033210332103, "grad_norm": 0.35597187276662773, "learning_rate": 7.199390732991504e-06, "loss": 0.0525, "step": 2528 }, { "epoch": 2.3330258302583027, "grad_norm": 0.4092432831485503, "learning_rate": 7.180553729127862e-06, "loss": 0.0576, "step": 2529 }, { "epoch": 2.3339483394833946, "grad_norm": 0.41150000904800804, "learning_rate": 7.1617372673843354e-06, "loss": 0.0628, "step": 2530 }, { "epoch": 2.334870848708487, "grad_norm": 0.36975495289691623, "learning_rate": 7.142941369452411e-06, "loss": 0.0673, "step": 2531 }, { "epoch": 2.3357933579335795, "grad_norm": 0.39441684219402856, "learning_rate": 7.124166056999854e-06, "loss": 0.062, "step": 2532 }, { "epoch": 2.3367158671586714, "grad_norm": 0.4452527917351628, "learning_rate": 7.105411351670691e-06, "loss": 0.0764, "step": 2533 }, { "epoch": 2.337638376383764, "grad_norm": 0.4404555798940548, "learning_rate": 7.086677275085205e-06, "loss": 0.0584, "step": 2534 }, { "epoch": 2.338560885608856, "grad_norm": 0.39590190275406095, "learning_rate": 7.0679638488399035e-06, "loss": 0.0718, "step": 2535 }, { "epoch": 2.339483394833948, "grad_norm": 0.3960244041804856, "learning_rate": 7.049271094507465e-06, "loss": 0.0686, "step": 2536 }, { "epoch": 2.3404059040590406, "grad_norm": 0.49858709489915143, "learning_rate": 7.030599033636759e-06, "loss": 0.0671, "step": 2537 }, { "epoch": 2.341328413284133, "grad_norm": 0.40343009520783757, "learning_rate": 7.011947687752804e-06, "loss": 0.0717, "step": 2538 }, { "epoch": 2.342250922509225, "grad_norm": 0.36482715970289087, "learning_rate": 6.993317078356709e-06, "loss": 0.0589, "step": 2539 }, { "epoch": 2.3431734317343174, "grad_norm": 0.3615894997654129, "learning_rate": 6.9747072269257054e-06, "loss": 0.0564, "step": 2540 }, { "epoch": 2.3440959409594098, "grad_norm": 0.3889981031366702, "learning_rate": 6.956118154913096e-06, "loss": 0.0615, "step": 2541 }, { "epoch": 2.3450184501845017, "grad_norm": 0.4066475196714116, "learning_rate": 6.937549883748201e-06, "loss": 0.0722, "step": 2542 }, { "epoch": 2.345940959409594, "grad_norm": 0.3538875333416918, "learning_rate": 6.919002434836389e-06, "loss": 0.0576, "step": 2543 }, { "epoch": 2.3468634686346865, "grad_norm": 0.4085945838979749, "learning_rate": 6.900475829559022e-06, "loss": 0.0638, "step": 2544 }, { "epoch": 2.3477859778597785, "grad_norm": 0.3735933754064253, "learning_rate": 6.881970089273418e-06, "loss": 0.0601, "step": 2545 }, { "epoch": 2.348708487084871, "grad_norm": 0.3962004713822194, "learning_rate": 6.863485235312853e-06, "loss": 0.0539, "step": 2546 }, { "epoch": 2.349630996309963, "grad_norm": 0.3844502751625651, "learning_rate": 6.845021288986531e-06, "loss": 0.063, "step": 2547 }, { "epoch": 2.3505535055350553, "grad_norm": 0.4153282393300872, "learning_rate": 6.826578271579537e-06, "loss": 0.0629, "step": 2548 }, { "epoch": 2.3514760147601477, "grad_norm": 0.4150414688277971, "learning_rate": 6.8081562043528445e-06, "loss": 0.0643, "step": 2549 }, { "epoch": 2.35239852398524, "grad_norm": 0.5368005897409298, "learning_rate": 6.789755108543275e-06, "loss": 0.0714, "step": 2550 }, { "epoch": 2.353321033210332, "grad_norm": 0.4053169220112357, "learning_rate": 6.771375005363459e-06, "loss": 0.061, "step": 2551 }, { "epoch": 2.3542435424354244, "grad_norm": 0.3974430160467046, "learning_rate": 6.753015916001842e-06, "loss": 0.0629, "step": 2552 }, { "epoch": 2.3551660516605164, "grad_norm": 0.36042512610834304, "learning_rate": 6.7346778616226515e-06, "loss": 0.0547, "step": 2553 }, { "epoch": 2.356088560885609, "grad_norm": 0.44978400097506505, "learning_rate": 6.716360863365837e-06, "loss": 0.062, "step": 2554 }, { "epoch": 2.357011070110701, "grad_norm": 0.4613801197544788, "learning_rate": 6.698064942347098e-06, "loss": 0.0661, "step": 2555 }, { "epoch": 2.357933579335793, "grad_norm": 0.42809481167463337, "learning_rate": 6.6797901196578475e-06, "loss": 0.0609, "step": 2556 }, { "epoch": 2.3588560885608856, "grad_norm": 0.4222186759732147, "learning_rate": 6.661536416365133e-06, "loss": 0.062, "step": 2557 }, { "epoch": 2.359778597785978, "grad_norm": 0.392109861648142, "learning_rate": 6.643303853511707e-06, "loss": 0.0656, "step": 2558 }, { "epoch": 2.36070110701107, "grad_norm": 0.350509019582078, "learning_rate": 6.625092452115908e-06, "loss": 0.0599, "step": 2559 }, { "epoch": 2.3616236162361623, "grad_norm": 0.37873421265389096, "learning_rate": 6.606902233171711e-06, "loss": 0.0636, "step": 2560 }, { "epoch": 2.3625461254612548, "grad_norm": 0.46300069978001546, "learning_rate": 6.58873321764866e-06, "loss": 0.0634, "step": 2561 }, { "epoch": 2.3634686346863467, "grad_norm": 0.4014439079622017, "learning_rate": 6.570585426491846e-06, "loss": 0.0645, "step": 2562 }, { "epoch": 2.364391143911439, "grad_norm": 0.3568217003146156, "learning_rate": 6.552458880621909e-06, "loss": 0.0646, "step": 2563 }, { "epoch": 2.3653136531365315, "grad_norm": 0.38169279440631276, "learning_rate": 6.534353600934997e-06, "loss": 0.0646, "step": 2564 }, { "epoch": 2.3662361623616235, "grad_norm": 0.3614823307055291, "learning_rate": 6.5162696083027275e-06, "loss": 0.063, "step": 2565 }, { "epoch": 2.367158671586716, "grad_norm": 0.41450615121302437, "learning_rate": 6.498206923572189e-06, "loss": 0.0604, "step": 2566 }, { "epoch": 2.3680811808118083, "grad_norm": 0.3888929140268929, "learning_rate": 6.480165567565913e-06, "loss": 0.0636, "step": 2567 }, { "epoch": 2.3690036900369003, "grad_norm": 0.36675927843802164, "learning_rate": 6.4621455610818225e-06, "loss": 0.0561, "step": 2568 }, { "epoch": 2.3699261992619927, "grad_norm": 0.4407090305739821, "learning_rate": 6.4441469248932515e-06, "loss": 0.0663, "step": 2569 }, { "epoch": 2.3708487084870846, "grad_norm": 0.37352908950202196, "learning_rate": 6.426169679748892e-06, "loss": 0.0726, "step": 2570 }, { "epoch": 2.371771217712177, "grad_norm": 0.3407934217583342, "learning_rate": 6.40821384637276e-06, "loss": 0.0579, "step": 2571 }, { "epoch": 2.3726937269372694, "grad_norm": 0.3496884380757048, "learning_rate": 6.390279445464209e-06, "loss": 0.0537, "step": 2572 }, { "epoch": 2.373616236162362, "grad_norm": 0.4210055489309458, "learning_rate": 6.3723664976978875e-06, "loss": 0.0622, "step": 2573 }, { "epoch": 2.374538745387454, "grad_norm": 0.41545445097543465, "learning_rate": 6.354475023723686e-06, "loss": 0.0672, "step": 2574 }, { "epoch": 2.375461254612546, "grad_norm": 0.40184438229982183, "learning_rate": 6.336605044166763e-06, "loss": 0.0665, "step": 2575 }, { "epoch": 2.376383763837638, "grad_norm": 0.35845198960242597, "learning_rate": 6.318756579627508e-06, "loss": 0.057, "step": 2576 }, { "epoch": 2.3773062730627306, "grad_norm": 0.38322370969285097, "learning_rate": 6.30092965068147e-06, "loss": 0.0697, "step": 2577 }, { "epoch": 2.378228782287823, "grad_norm": 0.4107658977093919, "learning_rate": 6.283124277879407e-06, "loss": 0.0614, "step": 2578 }, { "epoch": 2.3791512915129154, "grad_norm": 0.3804930863220513, "learning_rate": 6.2653404817472226e-06, "loss": 0.065, "step": 2579 }, { "epoch": 2.3800738007380073, "grad_norm": 0.3437078010893973, "learning_rate": 6.247578282785929e-06, "loss": 0.0594, "step": 2580 }, { "epoch": 2.3809963099630997, "grad_norm": 0.4353303768580871, "learning_rate": 6.229837701471644e-06, "loss": 0.0671, "step": 2581 }, { "epoch": 2.3819188191881917, "grad_norm": 0.385517329552986, "learning_rate": 6.212118758255595e-06, "loss": 0.054, "step": 2582 }, { "epoch": 2.382841328413284, "grad_norm": 0.4092193165406145, "learning_rate": 6.194421473564033e-06, "loss": 0.0591, "step": 2583 }, { "epoch": 2.3837638376383765, "grad_norm": 0.3979284458536561, "learning_rate": 6.176745867798234e-06, "loss": 0.0675, "step": 2584 }, { "epoch": 2.3846863468634685, "grad_norm": 0.38250678890596823, "learning_rate": 6.159091961334531e-06, "loss": 0.0676, "step": 2585 }, { "epoch": 2.385608856088561, "grad_norm": 0.41128989688310613, "learning_rate": 6.141459774524194e-06, "loss": 0.0622, "step": 2586 }, { "epoch": 2.3865313653136533, "grad_norm": 0.3592014079130497, "learning_rate": 6.123849327693462e-06, "loss": 0.0534, "step": 2587 }, { "epoch": 2.3874538745387452, "grad_norm": 0.4048581447812899, "learning_rate": 6.106260641143546e-06, "loss": 0.0662, "step": 2588 }, { "epoch": 2.3883763837638377, "grad_norm": 0.3616056918329504, "learning_rate": 6.0886937351505276e-06, "loss": 0.0544, "step": 2589 }, { "epoch": 2.38929889298893, "grad_norm": 0.3542718481167006, "learning_rate": 6.0711486299654095e-06, "loss": 0.0587, "step": 2590 }, { "epoch": 2.390221402214022, "grad_norm": 0.43225454759792764, "learning_rate": 6.053625345814062e-06, "loss": 0.0551, "step": 2591 }, { "epoch": 2.3911439114391144, "grad_norm": 0.34743105596036417, "learning_rate": 6.036123902897172e-06, "loss": 0.0569, "step": 2592 }, { "epoch": 2.392066420664207, "grad_norm": 0.4161427779221691, "learning_rate": 6.018644321390288e-06, "loss": 0.0609, "step": 2593 }, { "epoch": 2.392988929889299, "grad_norm": 0.3695331552011525, "learning_rate": 6.001186621443719e-06, "loss": 0.0563, "step": 2594 }, { "epoch": 2.393911439114391, "grad_norm": 0.396624124983913, "learning_rate": 5.983750823182574e-06, "loss": 0.0635, "step": 2595 }, { "epoch": 2.3948339483394836, "grad_norm": 0.3642112004788074, "learning_rate": 5.966336946706716e-06, "loss": 0.0614, "step": 2596 }, { "epoch": 2.3957564575645756, "grad_norm": 0.3603512900272667, "learning_rate": 5.948945012090709e-06, "loss": 0.0542, "step": 2597 }, { "epoch": 2.396678966789668, "grad_norm": 0.3975643668719549, "learning_rate": 5.931575039383852e-06, "loss": 0.07, "step": 2598 }, { "epoch": 2.39760147601476, "grad_norm": 0.39638831804174796, "learning_rate": 5.914227048610121e-06, "loss": 0.0582, "step": 2599 }, { "epoch": 2.3985239852398523, "grad_norm": 0.35663381843101466, "learning_rate": 5.896901059768134e-06, "loss": 0.0539, "step": 2600 }, { "epoch": 2.3994464944649447, "grad_norm": 0.3962885681918028, "learning_rate": 5.87959709283116e-06, "loss": 0.0661, "step": 2601 }, { "epoch": 2.400369003690037, "grad_norm": 0.3626899065335606, "learning_rate": 5.86231516774709e-06, "loss": 0.0616, "step": 2602 }, { "epoch": 2.401291512915129, "grad_norm": 0.449109122858332, "learning_rate": 5.845055304438377e-06, "loss": 0.0694, "step": 2603 }, { "epoch": 2.4022140221402215, "grad_norm": 0.37200127550749645, "learning_rate": 5.827817522802065e-06, "loss": 0.0636, "step": 2604 }, { "epoch": 2.4031365313653135, "grad_norm": 0.3735178974919831, "learning_rate": 5.810601842709743e-06, "loss": 0.064, "step": 2605 }, { "epoch": 2.404059040590406, "grad_norm": 0.38994941566994723, "learning_rate": 5.793408284007501e-06, "loss": 0.0603, "step": 2606 }, { "epoch": 2.4049815498154983, "grad_norm": 0.3967380432626151, "learning_rate": 5.776236866515947e-06, "loss": 0.0685, "step": 2607 }, { "epoch": 2.4059040590405902, "grad_norm": 0.3459901203489991, "learning_rate": 5.759087610030167e-06, "loss": 0.0621, "step": 2608 }, { "epoch": 2.4068265682656826, "grad_norm": 0.4065639509219356, "learning_rate": 5.741960534319677e-06, "loss": 0.0578, "step": 2609 }, { "epoch": 2.407749077490775, "grad_norm": 0.44235335716406815, "learning_rate": 5.724855659128442e-06, "loss": 0.062, "step": 2610 }, { "epoch": 2.408671586715867, "grad_norm": 0.3263850063837015, "learning_rate": 5.707773004174841e-06, "loss": 0.0569, "step": 2611 }, { "epoch": 2.4095940959409594, "grad_norm": 0.40541025521987767, "learning_rate": 5.6907125891516115e-06, "loss": 0.0598, "step": 2612 }, { "epoch": 2.410516605166052, "grad_norm": 0.4199545942580867, "learning_rate": 5.673674433725873e-06, "loss": 0.0672, "step": 2613 }, { "epoch": 2.411439114391144, "grad_norm": 0.3746463616462845, "learning_rate": 5.656658557539091e-06, "loss": 0.0649, "step": 2614 }, { "epoch": 2.412361623616236, "grad_norm": 0.3579648795946127, "learning_rate": 5.639664980207024e-06, "loss": 0.0569, "step": 2615 }, { "epoch": 2.4132841328413286, "grad_norm": 0.39194718088289654, "learning_rate": 5.622693721319727e-06, "loss": 0.0549, "step": 2616 }, { "epoch": 2.4142066420664205, "grad_norm": 0.3700713222546607, "learning_rate": 5.605744800441562e-06, "loss": 0.062, "step": 2617 }, { "epoch": 2.415129151291513, "grad_norm": 0.3816651720819482, "learning_rate": 5.588818237111102e-06, "loss": 0.0538, "step": 2618 }, { "epoch": 2.4160516605166054, "grad_norm": 0.3697697674957319, "learning_rate": 5.57191405084114e-06, "loss": 0.0668, "step": 2619 }, { "epoch": 2.4169741697416973, "grad_norm": 0.3807310616054102, "learning_rate": 5.5550322611187254e-06, "loss": 0.0616, "step": 2620 }, { "epoch": 2.4178966789667897, "grad_norm": 0.47064848383115176, "learning_rate": 5.538172887405038e-06, "loss": 0.0625, "step": 2621 }, { "epoch": 2.4188191881918817, "grad_norm": 0.44060259255703815, "learning_rate": 5.52133594913542e-06, "loss": 0.0678, "step": 2622 }, { "epoch": 2.419741697416974, "grad_norm": 0.4099851460303192, "learning_rate": 5.5045214657193925e-06, "loss": 0.0661, "step": 2623 }, { "epoch": 2.4206642066420665, "grad_norm": 0.401475210690167, "learning_rate": 5.487729456540547e-06, "loss": 0.059, "step": 2624 }, { "epoch": 2.421586715867159, "grad_norm": 0.35716427098125575, "learning_rate": 5.470959940956572e-06, "loss": 0.0595, "step": 2625 }, { "epoch": 2.422509225092251, "grad_norm": 0.31909506874099236, "learning_rate": 5.454212938299255e-06, "loss": 0.0514, "step": 2626 }, { "epoch": 2.4234317343173433, "grad_norm": 0.38220159344635984, "learning_rate": 5.437488467874407e-06, "loss": 0.0542, "step": 2627 }, { "epoch": 2.4243542435424352, "grad_norm": 0.3859726021013262, "learning_rate": 5.4207865489618565e-06, "loss": 0.0585, "step": 2628 }, { "epoch": 2.4252767527675276, "grad_norm": 0.42450114803191763, "learning_rate": 5.404107200815456e-06, "loss": 0.0647, "step": 2629 }, { "epoch": 2.42619926199262, "grad_norm": 0.3987226969379201, "learning_rate": 5.387450442663025e-06, "loss": 0.0538, "step": 2630 }, { "epoch": 2.427121771217712, "grad_norm": 0.3968955400530196, "learning_rate": 5.370816293706357e-06, "loss": 0.0572, "step": 2631 }, { "epoch": 2.4280442804428044, "grad_norm": 0.40159835731652965, "learning_rate": 5.354204773121155e-06, "loss": 0.0553, "step": 2632 }, { "epoch": 2.428966789667897, "grad_norm": 0.35205301145237317, "learning_rate": 5.337615900057058e-06, "loss": 0.0586, "step": 2633 }, { "epoch": 2.4298892988929888, "grad_norm": 0.37210584463265833, "learning_rate": 5.3210496936376e-06, "loss": 0.0543, "step": 2634 }, { "epoch": 2.430811808118081, "grad_norm": 0.3927689300274618, "learning_rate": 5.304506172960161e-06, "loss": 0.0596, "step": 2635 }, { "epoch": 2.4317343173431736, "grad_norm": 0.3976834621136443, "learning_rate": 5.287985357095989e-06, "loss": 0.0634, "step": 2636 }, { "epoch": 2.4326568265682655, "grad_norm": 0.3850666745291003, "learning_rate": 5.271487265090163e-06, "loss": 0.0613, "step": 2637 }, { "epoch": 2.433579335793358, "grad_norm": 0.3931373334443755, "learning_rate": 5.2550119159615414e-06, "loss": 0.055, "step": 2638 }, { "epoch": 2.4345018450184504, "grad_norm": 0.4441330734470427, "learning_rate": 5.238559328702783e-06, "loss": 0.0607, "step": 2639 }, { "epoch": 2.4354243542435423, "grad_norm": 0.43134887293841717, "learning_rate": 5.222129522280314e-06, "loss": 0.0659, "step": 2640 }, { "epoch": 2.4363468634686347, "grad_norm": 0.42422088503610306, "learning_rate": 5.205722515634276e-06, "loss": 0.0685, "step": 2641 }, { "epoch": 2.437269372693727, "grad_norm": 0.4305265594810753, "learning_rate": 5.189338327678541e-06, "loss": 0.0615, "step": 2642 }, { "epoch": 2.438191881918819, "grad_norm": 0.42859758627537825, "learning_rate": 5.172976977300687e-06, "loss": 0.0666, "step": 2643 }, { "epoch": 2.4391143911439115, "grad_norm": 0.3653846943927543, "learning_rate": 5.156638483361934e-06, "loss": 0.053, "step": 2644 }, { "epoch": 2.4400369003690034, "grad_norm": 0.47522142375272125, "learning_rate": 5.140322864697183e-06, "loss": 0.069, "step": 2645 }, { "epoch": 2.440959409594096, "grad_norm": 0.39587718312963355, "learning_rate": 5.124030140114958e-06, "loss": 0.055, "step": 2646 }, { "epoch": 2.4418819188191883, "grad_norm": 0.39370537094439917, "learning_rate": 5.107760328397371e-06, "loss": 0.0589, "step": 2647 }, { "epoch": 2.4428044280442807, "grad_norm": 0.42584974603072745, "learning_rate": 5.091513448300142e-06, "loss": 0.0717, "step": 2648 }, { "epoch": 2.4437269372693726, "grad_norm": 0.4181594731122546, "learning_rate": 5.075289518552562e-06, "loss": 0.0665, "step": 2649 }, { "epoch": 2.444649446494465, "grad_norm": 0.390316797744379, "learning_rate": 5.059088557857436e-06, "loss": 0.0607, "step": 2650 }, { "epoch": 2.445571955719557, "grad_norm": 0.38943012941645383, "learning_rate": 5.0429105848911e-06, "loss": 0.0651, "step": 2651 }, { "epoch": 2.4464944649446494, "grad_norm": 0.383197867209574, "learning_rate": 5.0267556183034195e-06, "loss": 0.0508, "step": 2652 }, { "epoch": 2.447416974169742, "grad_norm": 0.3843119449443553, "learning_rate": 5.010623676717704e-06, "loss": 0.058, "step": 2653 }, { "epoch": 2.4483394833948338, "grad_norm": 0.3707663558261996, "learning_rate": 4.994514778730719e-06, "loss": 0.06, "step": 2654 }, { "epoch": 2.449261992619926, "grad_norm": 0.4021464290126931, "learning_rate": 4.9784289429127e-06, "loss": 0.0611, "step": 2655 }, { "epoch": 2.4501845018450186, "grad_norm": 0.35954260644458425, "learning_rate": 4.9623661878072635e-06, "loss": 0.055, "step": 2656 }, { "epoch": 2.4511070110701105, "grad_norm": 0.4048880210682503, "learning_rate": 4.946326531931417e-06, "loss": 0.0611, "step": 2657 }, { "epoch": 2.452029520295203, "grad_norm": 0.3987346958997723, "learning_rate": 4.930309993775578e-06, "loss": 0.0654, "step": 2658 }, { "epoch": 2.4529520295202953, "grad_norm": 0.38942327732157767, "learning_rate": 4.914316591803475e-06, "loss": 0.074, "step": 2659 }, { "epoch": 2.4538745387453873, "grad_norm": 0.40193876135548995, "learning_rate": 4.8983463444521705e-06, "loss": 0.06, "step": 2660 }, { "epoch": 2.4547970479704797, "grad_norm": 0.38751800666766895, "learning_rate": 4.882399270132052e-06, "loss": 0.0623, "step": 2661 }, { "epoch": 2.455719557195572, "grad_norm": 0.34738897235755367, "learning_rate": 4.866475387226788e-06, "loss": 0.0597, "step": 2662 }, { "epoch": 2.456642066420664, "grad_norm": 0.38054854983134784, "learning_rate": 4.850574714093292e-06, "loss": 0.0575, "step": 2663 }, { "epoch": 2.4575645756457565, "grad_norm": 0.36201575679205095, "learning_rate": 4.83469726906175e-06, "loss": 0.0574, "step": 2664 }, { "epoch": 2.458487084870849, "grad_norm": 0.35727369220907385, "learning_rate": 4.8188430704355605e-06, "loss": 0.056, "step": 2665 }, { "epoch": 2.459409594095941, "grad_norm": 0.35982158666662806, "learning_rate": 4.803012136491308e-06, "loss": 0.0556, "step": 2666 }, { "epoch": 2.4603321033210332, "grad_norm": 0.339409645190489, "learning_rate": 4.78720448547878e-06, "loss": 0.0566, "step": 2667 }, { "epoch": 2.4612546125461257, "grad_norm": 0.3464214698695882, "learning_rate": 4.771420135620919e-06, "loss": 0.0539, "step": 2668 }, { "epoch": 2.4621771217712176, "grad_norm": 0.36442133108465213, "learning_rate": 4.755659105113788e-06, "loss": 0.0552, "step": 2669 }, { "epoch": 2.46309963099631, "grad_norm": 0.4399996919727431, "learning_rate": 4.739921412126591e-06, "loss": 0.0715, "step": 2670 }, { "epoch": 2.4640221402214024, "grad_norm": 0.3852161273688586, "learning_rate": 4.724207074801623e-06, "loss": 0.0496, "step": 2671 }, { "epoch": 2.4649446494464944, "grad_norm": 0.37967417283926874, "learning_rate": 4.708516111254238e-06, "loss": 0.0645, "step": 2672 }, { "epoch": 2.465867158671587, "grad_norm": 0.35478180776500723, "learning_rate": 4.692848539572866e-06, "loss": 0.052, "step": 2673 }, { "epoch": 2.4667896678966788, "grad_norm": 0.403051447224795, "learning_rate": 4.677204377818961e-06, "loss": 0.0608, "step": 2674 }, { "epoch": 2.467712177121771, "grad_norm": 0.3751198699123934, "learning_rate": 4.661583644026998e-06, "loss": 0.0622, "step": 2675 }, { "epoch": 2.4686346863468636, "grad_norm": 0.3808509667709742, "learning_rate": 4.6459863562044264e-06, "loss": 0.0626, "step": 2676 }, { "epoch": 2.469557195571956, "grad_norm": 0.40468818913039606, "learning_rate": 4.630412532331685e-06, "loss": 0.0596, "step": 2677 }, { "epoch": 2.470479704797048, "grad_norm": 0.3998975816295429, "learning_rate": 4.614862190362165e-06, "loss": 0.0657, "step": 2678 }, { "epoch": 2.4714022140221403, "grad_norm": 0.3994385358961541, "learning_rate": 4.59933534822217e-06, "loss": 0.0601, "step": 2679 }, { "epoch": 2.4723247232472323, "grad_norm": 0.348068920835927, "learning_rate": 4.583832023810925e-06, "loss": 0.0496, "step": 2680 }, { "epoch": 2.4732472324723247, "grad_norm": 0.37236078398166217, "learning_rate": 4.56835223500055e-06, "loss": 0.0581, "step": 2681 }, { "epoch": 2.474169741697417, "grad_norm": 0.4182456478205192, "learning_rate": 4.55289599963602e-06, "loss": 0.0597, "step": 2682 }, { "epoch": 2.475092250922509, "grad_norm": 0.4290296224964595, "learning_rate": 4.537463335535161e-06, "loss": 0.0617, "step": 2683 }, { "epoch": 2.4760147601476015, "grad_norm": 0.4288200041721056, "learning_rate": 4.52205426048864e-06, "loss": 0.0683, "step": 2684 }, { "epoch": 2.476937269372694, "grad_norm": 0.38425239725616517, "learning_rate": 4.506668792259914e-06, "loss": 0.0658, "step": 2685 }, { "epoch": 2.477859778597786, "grad_norm": 0.36189081453617344, "learning_rate": 4.491306948585219e-06, "loss": 0.0603, "step": 2686 }, { "epoch": 2.4787822878228782, "grad_norm": 0.3661967722872685, "learning_rate": 4.475968747173592e-06, "loss": 0.0549, "step": 2687 }, { "epoch": 2.4797047970479706, "grad_norm": 0.39386173428974636, "learning_rate": 4.460654205706785e-06, "loss": 0.0509, "step": 2688 }, { "epoch": 2.4806273062730626, "grad_norm": 0.4086739092191323, "learning_rate": 4.4453633418392705e-06, "loss": 0.0668, "step": 2689 }, { "epoch": 2.481549815498155, "grad_norm": 0.44226148669322224, "learning_rate": 4.430096173198259e-06, "loss": 0.0698, "step": 2690 }, { "epoch": 2.4824723247232474, "grad_norm": 0.41815992188414736, "learning_rate": 4.414852717383616e-06, "loss": 0.0707, "step": 2691 }, { "epoch": 2.4833948339483394, "grad_norm": 0.40710245431221515, "learning_rate": 4.3996329919678666e-06, "loss": 0.0611, "step": 2692 }, { "epoch": 2.484317343173432, "grad_norm": 0.3866036235460156, "learning_rate": 4.384437014496215e-06, "loss": 0.0663, "step": 2693 }, { "epoch": 2.485239852398524, "grad_norm": 0.3925558660095124, "learning_rate": 4.3692648024864585e-06, "loss": 0.0621, "step": 2694 }, { "epoch": 2.486162361623616, "grad_norm": 0.3898878289789644, "learning_rate": 4.3541163734289955e-06, "loss": 0.0631, "step": 2695 }, { "epoch": 2.4870848708487086, "grad_norm": 0.34515468988767445, "learning_rate": 4.33899174478683e-06, "loss": 0.0559, "step": 2696 }, { "epoch": 2.4880073800738005, "grad_norm": 0.38991615115843276, "learning_rate": 4.323890933995517e-06, "loss": 0.0648, "step": 2697 }, { "epoch": 2.488929889298893, "grad_norm": 0.3567292574951863, "learning_rate": 4.308813958463145e-06, "loss": 0.0621, "step": 2698 }, { "epoch": 2.4898523985239853, "grad_norm": 0.3756121323612728, "learning_rate": 4.293760835570343e-06, "loss": 0.0648, "step": 2699 }, { "epoch": 2.4907749077490777, "grad_norm": 0.43625351673224455, "learning_rate": 4.278731582670239e-06, "loss": 0.0604, "step": 2700 }, { "epoch": 2.4916974169741697, "grad_norm": 0.3841168021584028, "learning_rate": 4.263726217088429e-06, "loss": 0.0541, "step": 2701 }, { "epoch": 2.492619926199262, "grad_norm": 0.3654561694274849, "learning_rate": 4.248744756122986e-06, "loss": 0.0544, "step": 2702 }, { "epoch": 2.493542435424354, "grad_norm": 0.4111746551123488, "learning_rate": 4.23378721704443e-06, "loss": 0.0648, "step": 2703 }, { "epoch": 2.4944649446494465, "grad_norm": 0.40144075585536443, "learning_rate": 4.218853617095686e-06, "loss": 0.069, "step": 2704 }, { "epoch": 2.495387453874539, "grad_norm": 0.40482568402285934, "learning_rate": 4.203943973492097e-06, "loss": 0.0632, "step": 2705 }, { "epoch": 2.496309963099631, "grad_norm": 0.38685996424627717, "learning_rate": 4.189058303421392e-06, "loss": 0.0605, "step": 2706 }, { "epoch": 2.4972324723247232, "grad_norm": 0.30920485496123884, "learning_rate": 4.1741966240436445e-06, "loss": 0.0594, "step": 2707 }, { "epoch": 2.4981549815498156, "grad_norm": 0.4536039150701168, "learning_rate": 4.159358952491288e-06, "loss": 0.0747, "step": 2708 }, { "epoch": 2.4990774907749076, "grad_norm": 0.3733730049529338, "learning_rate": 4.144545305869086e-06, "loss": 0.0564, "step": 2709 }, { "epoch": 2.5, "grad_norm": 0.3554560516800568, "learning_rate": 4.129755701254076e-06, "loss": 0.0588, "step": 2710 }, { "epoch": 2.5009225092250924, "grad_norm": 0.36966687007450383, "learning_rate": 4.114990155695617e-06, "loss": 0.0515, "step": 2711 }, { "epoch": 2.5018450184501844, "grad_norm": 0.37330996084536244, "learning_rate": 4.100248686215313e-06, "loss": 0.0547, "step": 2712 }, { "epoch": 2.5027675276752768, "grad_norm": 0.38314398749030215, "learning_rate": 4.085531309807009e-06, "loss": 0.0507, "step": 2713 }, { "epoch": 2.5036900369003687, "grad_norm": 0.3501755082301952, "learning_rate": 4.070838043436786e-06, "loss": 0.0541, "step": 2714 }, { "epoch": 2.504612546125461, "grad_norm": 0.36500233722105163, "learning_rate": 4.056168904042934e-06, "loss": 0.0579, "step": 2715 }, { "epoch": 2.5055350553505535, "grad_norm": 0.3663050507266439, "learning_rate": 4.041523908535916e-06, "loss": 0.0627, "step": 2716 }, { "epoch": 2.506457564575646, "grad_norm": 0.3795564027623454, "learning_rate": 4.026903073798372e-06, "loss": 0.0578, "step": 2717 }, { "epoch": 2.507380073800738, "grad_norm": 0.3327657664195737, "learning_rate": 4.012306416685088e-06, "loss": 0.0597, "step": 2718 }, { "epoch": 2.5083025830258303, "grad_norm": 0.39368371129688207, "learning_rate": 3.997733954022986e-06, "loss": 0.055, "step": 2719 }, { "epoch": 2.5092250922509223, "grad_norm": 0.3744728045239547, "learning_rate": 3.983185702611078e-06, "loss": 0.0614, "step": 2720 }, { "epoch": 2.5101476014760147, "grad_norm": 0.4271760817934105, "learning_rate": 3.968661679220468e-06, "loss": 0.0627, "step": 2721 }, { "epoch": 2.511070110701107, "grad_norm": 0.40795233030440275, "learning_rate": 3.954161900594361e-06, "loss": 0.0652, "step": 2722 }, { "epoch": 2.5119926199261995, "grad_norm": 0.4370878780434889, "learning_rate": 3.9396863834479745e-06, "loss": 0.0644, "step": 2723 }, { "epoch": 2.5129151291512914, "grad_norm": 0.4126537341299734, "learning_rate": 3.925235144468567e-06, "loss": 0.0585, "step": 2724 }, { "epoch": 2.513837638376384, "grad_norm": 0.35351145353358715, "learning_rate": 3.9108082003154325e-06, "loss": 0.0681, "step": 2725 }, { "epoch": 2.514760147601476, "grad_norm": 0.3705626923081052, "learning_rate": 3.896405567619835e-06, "loss": 0.0644, "step": 2726 }, { "epoch": 2.515682656826568, "grad_norm": 0.38780181245816625, "learning_rate": 3.8820272629850056e-06, "loss": 0.0626, "step": 2727 }, { "epoch": 2.5166051660516606, "grad_norm": 0.41241279771867406, "learning_rate": 3.867673302986161e-06, "loss": 0.0514, "step": 2728 }, { "epoch": 2.517527675276753, "grad_norm": 0.37531282930150933, "learning_rate": 3.853343704170431e-06, "loss": 0.0594, "step": 2729 }, { "epoch": 2.518450184501845, "grad_norm": 0.34799920877727736, "learning_rate": 3.839038483056856e-06, "loss": 0.0651, "step": 2730 }, { "epoch": 2.5193726937269374, "grad_norm": 0.3273893877448635, "learning_rate": 3.824757656136391e-06, "loss": 0.0535, "step": 2731 }, { "epoch": 2.5202952029520294, "grad_norm": 0.3935545895818458, "learning_rate": 3.8105012398718694e-06, "loss": 0.0628, "step": 2732 }, { "epoch": 2.5212177121771218, "grad_norm": 0.3785337299041966, "learning_rate": 3.7962692506979645e-06, "loss": 0.0591, "step": 2733 }, { "epoch": 2.522140221402214, "grad_norm": 0.37094130150458543, "learning_rate": 3.7820617050212144e-06, "loss": 0.061, "step": 2734 }, { "epoch": 2.523062730627306, "grad_norm": 0.403354810381643, "learning_rate": 3.7678786192199694e-06, "loss": 0.0603, "step": 2735 }, { "epoch": 2.5239852398523985, "grad_norm": 0.3642577217808013, "learning_rate": 3.753720009644371e-06, "loss": 0.0512, "step": 2736 }, { "epoch": 2.524907749077491, "grad_norm": 0.3834507155368341, "learning_rate": 3.7395858926163594e-06, "loss": 0.0646, "step": 2737 }, { "epoch": 2.525830258302583, "grad_norm": 0.361264708903815, "learning_rate": 3.7254762844296436e-06, "loss": 0.0537, "step": 2738 }, { "epoch": 2.5267527675276753, "grad_norm": 0.3877982026840756, "learning_rate": 3.7113912013496593e-06, "loss": 0.0629, "step": 2739 }, { "epoch": 2.5276752767527677, "grad_norm": 0.4006174457463105, "learning_rate": 3.697330659613588e-06, "loss": 0.0566, "step": 2740 }, { "epoch": 2.5285977859778597, "grad_norm": 0.4005753817571909, "learning_rate": 3.6832946754303154e-06, "loss": 0.0518, "step": 2741 }, { "epoch": 2.529520295202952, "grad_norm": 0.442537675269194, "learning_rate": 3.669283264980408e-06, "loss": 0.0656, "step": 2742 }, { "epoch": 2.530442804428044, "grad_norm": 0.3689980439012489, "learning_rate": 3.6552964444161174e-06, "loss": 0.0569, "step": 2743 }, { "epoch": 2.5313653136531364, "grad_norm": 0.5035218731923568, "learning_rate": 3.641334229861346e-06, "loss": 0.0605, "step": 2744 }, { "epoch": 2.532287822878229, "grad_norm": 0.3206007580427925, "learning_rate": 3.6273966374116175e-06, "loss": 0.0548, "step": 2745 }, { "epoch": 2.5332103321033212, "grad_norm": 0.4163887790182137, "learning_rate": 3.6134836831340836e-06, "loss": 0.0659, "step": 2746 }, { "epoch": 2.534132841328413, "grad_norm": 0.3628238971354742, "learning_rate": 3.5995953830675e-06, "loss": 0.058, "step": 2747 }, { "epoch": 2.5350553505535056, "grad_norm": 0.3837183227515017, "learning_rate": 3.5857317532221794e-06, "loss": 0.0504, "step": 2748 }, { "epoch": 2.5359778597785976, "grad_norm": 0.37920100429254777, "learning_rate": 3.571892809580013e-06, "loss": 0.0445, "step": 2749 }, { "epoch": 2.53690036900369, "grad_norm": 0.3447604406273274, "learning_rate": 3.5580785680944307e-06, "loss": 0.0544, "step": 2750 }, { "epoch": 2.5378228782287824, "grad_norm": 0.38584226115149434, "learning_rate": 3.544289044690377e-06, "loss": 0.0545, "step": 2751 }, { "epoch": 2.538745387453875, "grad_norm": 0.3936622525854521, "learning_rate": 3.530524255264314e-06, "loss": 0.0678, "step": 2752 }, { "epoch": 2.5396678966789668, "grad_norm": 0.39453661403518914, "learning_rate": 3.5167842156841794e-06, "loss": 0.0611, "step": 2753 }, { "epoch": 2.540590405904059, "grad_norm": 0.3764251705184539, "learning_rate": 3.5030689417893863e-06, "loss": 0.0609, "step": 2754 }, { "epoch": 2.541512915129151, "grad_norm": 0.4021122465737506, "learning_rate": 3.4893784493908067e-06, "loss": 0.0562, "step": 2755 }, { "epoch": 2.5424354243542435, "grad_norm": 0.3823096516327499, "learning_rate": 3.475712754270716e-06, "loss": 0.0579, "step": 2756 }, { "epoch": 2.543357933579336, "grad_norm": 0.3957064764294518, "learning_rate": 3.4620718721828345e-06, "loss": 0.0561, "step": 2757 }, { "epoch": 2.544280442804428, "grad_norm": 0.3446733865172133, "learning_rate": 3.448455818852267e-06, "loss": 0.0552, "step": 2758 }, { "epoch": 2.5452029520295203, "grad_norm": 0.35795244377491925, "learning_rate": 3.43486460997548e-06, "loss": 0.0597, "step": 2759 }, { "epoch": 2.5461254612546127, "grad_norm": 0.4484437769876359, "learning_rate": 3.421298261220335e-06, "loss": 0.0593, "step": 2760 }, { "epoch": 2.5470479704797047, "grad_norm": 0.38968243346032577, "learning_rate": 3.4077567882260047e-06, "loss": 0.0723, "step": 2761 }, { "epoch": 2.547970479704797, "grad_norm": 0.3582603007306023, "learning_rate": 3.3942402066029833e-06, "loss": 0.0496, "step": 2762 }, { "epoch": 2.5488929889298895, "grad_norm": 0.4177692339230776, "learning_rate": 3.3807485319331034e-06, "loss": 0.0664, "step": 2763 }, { "epoch": 2.5498154981549814, "grad_norm": 0.41201530708558287, "learning_rate": 3.3672817797694545e-06, "loss": 0.0667, "step": 2764 }, { "epoch": 2.550738007380074, "grad_norm": 0.3489752270331511, "learning_rate": 3.3538399656363932e-06, "loss": 0.0602, "step": 2765 }, { "epoch": 2.551660516605166, "grad_norm": 0.35229813019066014, "learning_rate": 3.3404231050295526e-06, "loss": 0.0514, "step": 2766 }, { "epoch": 2.552583025830258, "grad_norm": 0.38769282705801417, "learning_rate": 3.327031213415785e-06, "loss": 0.065, "step": 2767 }, { "epoch": 2.5535055350553506, "grad_norm": 0.3588379497054201, "learning_rate": 3.3136643062331497e-06, "loss": 0.0492, "step": 2768 }, { "epoch": 2.554428044280443, "grad_norm": 0.36378035249904805, "learning_rate": 3.3003223988909234e-06, "loss": 0.0604, "step": 2769 }, { "epoch": 2.555350553505535, "grad_norm": 0.4028390243773359, "learning_rate": 3.2870055067695556e-06, "loss": 0.0594, "step": 2770 }, { "epoch": 2.5562730627306274, "grad_norm": 0.3927875562598424, "learning_rate": 3.2737136452206495e-06, "loss": 0.0561, "step": 2771 }, { "epoch": 2.5571955719557193, "grad_norm": 0.3638532814167452, "learning_rate": 3.260446829566963e-06, "loss": 0.0581, "step": 2772 }, { "epoch": 2.5581180811808117, "grad_norm": 0.3531343217297762, "learning_rate": 3.247205075102383e-06, "loss": 0.0598, "step": 2773 }, { "epoch": 2.559040590405904, "grad_norm": 0.379302976113349, "learning_rate": 3.233988397091894e-06, "loss": 0.0575, "step": 2774 }, { "epoch": 2.5599630996309966, "grad_norm": 0.419983118539953, "learning_rate": 3.220796810771584e-06, "loss": 0.067, "step": 2775 }, { "epoch": 2.5608856088560885, "grad_norm": 0.4107394147825061, "learning_rate": 3.2076303313486185e-06, "loss": 0.0644, "step": 2776 }, { "epoch": 2.561808118081181, "grad_norm": 0.3445990678509047, "learning_rate": 3.194488974001203e-06, "loss": 0.0559, "step": 2777 }, { "epoch": 2.562730627306273, "grad_norm": 0.3570425194212317, "learning_rate": 3.181372753878595e-06, "loss": 0.0525, "step": 2778 }, { "epoch": 2.5636531365313653, "grad_norm": 0.3767962842818916, "learning_rate": 3.168281686101082e-06, "loss": 0.061, "step": 2779 }, { "epoch": 2.5645756457564577, "grad_norm": 0.35713584848760216, "learning_rate": 3.1552157857599324e-06, "loss": 0.0605, "step": 2780 }, { "epoch": 2.5654981549815496, "grad_norm": 0.3981031620259749, "learning_rate": 3.142175067917419e-06, "loss": 0.0629, "step": 2781 }, { "epoch": 2.566420664206642, "grad_norm": 0.33988134730388825, "learning_rate": 3.1291595476067885e-06, "loss": 0.0581, "step": 2782 }, { "epoch": 2.5673431734317345, "grad_norm": 0.3960214847155239, "learning_rate": 3.116169239832223e-06, "loss": 0.0692, "step": 2783 }, { "epoch": 2.5682656826568264, "grad_norm": 0.36151105722935056, "learning_rate": 3.103204159568851e-06, "loss": 0.0589, "step": 2784 }, { "epoch": 2.569188191881919, "grad_norm": 0.41104212560282516, "learning_rate": 3.090264321762723e-06, "loss": 0.0605, "step": 2785 }, { "epoch": 2.5701107011070112, "grad_norm": 0.39688685526652495, "learning_rate": 3.077349741330776e-06, "loss": 0.056, "step": 2786 }, { "epoch": 2.571033210332103, "grad_norm": 0.4599531096694881, "learning_rate": 3.0644604331608456e-06, "loss": 0.0575, "step": 2787 }, { "epoch": 2.5719557195571956, "grad_norm": 0.36391951034293263, "learning_rate": 3.051596412111618e-06, "loss": 0.0534, "step": 2788 }, { "epoch": 2.5728782287822876, "grad_norm": 0.35370583130882044, "learning_rate": 3.038757693012642e-06, "loss": 0.0524, "step": 2789 }, { "epoch": 2.57380073800738, "grad_norm": 0.376223235497432, "learning_rate": 3.025944290664301e-06, "loss": 0.0665, "step": 2790 }, { "epoch": 2.5747232472324724, "grad_norm": 0.36305580286586053, "learning_rate": 3.013156219837776e-06, "loss": 0.0549, "step": 2791 }, { "epoch": 2.5756457564575648, "grad_norm": 0.37243590037908647, "learning_rate": 3.0003934952750586e-06, "loss": 0.0579, "step": 2792 }, { "epoch": 2.5765682656826567, "grad_norm": 0.4178900830973266, "learning_rate": 2.987656131688926e-06, "loss": 0.0668, "step": 2793 }, { "epoch": 2.577490774907749, "grad_norm": 0.36250689707924433, "learning_rate": 2.9749441437629033e-06, "loss": 0.0459, "step": 2794 }, { "epoch": 2.578413284132841, "grad_norm": 0.35145674746729727, "learning_rate": 2.9622575461512733e-06, "loss": 0.0556, "step": 2795 }, { "epoch": 2.5793357933579335, "grad_norm": 0.3671933764673374, "learning_rate": 2.949596353479059e-06, "loss": 0.0523, "step": 2796 }, { "epoch": 2.580258302583026, "grad_norm": 0.40725450604752705, "learning_rate": 2.9369605803419715e-06, "loss": 0.057, "step": 2797 }, { "epoch": 2.5811808118081183, "grad_norm": 0.40376867023345175, "learning_rate": 2.9243502413064368e-06, "loss": 0.0645, "step": 2798 }, { "epoch": 2.5821033210332103, "grad_norm": 0.3732775675465853, "learning_rate": 2.911765350909565e-06, "loss": 0.0624, "step": 2799 }, { "epoch": 2.5830258302583027, "grad_norm": 0.414186439960148, "learning_rate": 2.899205923659107e-06, "loss": 0.0631, "step": 2800 }, { "epoch": 2.5839483394833946, "grad_norm": 0.357872007207858, "learning_rate": 2.8866719740334807e-06, "loss": 0.0554, "step": 2801 }, { "epoch": 2.584870848708487, "grad_norm": 0.40658643535825106, "learning_rate": 2.8741635164817315e-06, "loss": 0.0676, "step": 2802 }, { "epoch": 2.5857933579335795, "grad_norm": 0.43182062585631215, "learning_rate": 2.8616805654234997e-06, "loss": 0.0612, "step": 2803 }, { "epoch": 2.586715867158672, "grad_norm": 0.39240960661008817, "learning_rate": 2.8492231352490463e-06, "loss": 0.0549, "step": 2804 }, { "epoch": 2.587638376383764, "grad_norm": 0.416096310635408, "learning_rate": 2.8367912403191977e-06, "loss": 0.0627, "step": 2805 }, { "epoch": 2.588560885608856, "grad_norm": 0.4138551349403716, "learning_rate": 2.8243848949653428e-06, "loss": 0.0603, "step": 2806 }, { "epoch": 2.589483394833948, "grad_norm": 0.39622879164551106, "learning_rate": 2.812004113489425e-06, "loss": 0.0528, "step": 2807 }, { "epoch": 2.5904059040590406, "grad_norm": 0.3581763549159933, "learning_rate": 2.7996489101639157e-06, "loss": 0.0583, "step": 2808 }, { "epoch": 2.591328413284133, "grad_norm": 0.40241494830455715, "learning_rate": 2.7873192992317887e-06, "loss": 0.0627, "step": 2809 }, { "epoch": 2.592250922509225, "grad_norm": 0.44573546308002177, "learning_rate": 2.77501529490653e-06, "loss": 0.0673, "step": 2810 }, { "epoch": 2.5931734317343174, "grad_norm": 0.3759217600883933, "learning_rate": 2.7627369113721045e-06, "loss": 0.0533, "step": 2811 }, { "epoch": 2.5940959409594093, "grad_norm": 0.41081070022494737, "learning_rate": 2.750484162782929e-06, "loss": 0.0592, "step": 2812 }, { "epoch": 2.5950184501845017, "grad_norm": 0.417495764332896, "learning_rate": 2.7382570632638854e-06, "loss": 0.0549, "step": 2813 }, { "epoch": 2.595940959409594, "grad_norm": 0.4211177228837055, "learning_rate": 2.7260556269102815e-06, "loss": 0.0657, "step": 2814 }, { "epoch": 2.5968634686346865, "grad_norm": 0.45843874502664883, "learning_rate": 2.7138798677878273e-06, "loss": 0.0609, "step": 2815 }, { "epoch": 2.5977859778597785, "grad_norm": 0.39580377551510015, "learning_rate": 2.7017297999326537e-06, "loss": 0.0574, "step": 2816 }, { "epoch": 2.598708487084871, "grad_norm": 0.3806490365199812, "learning_rate": 2.689605437351267e-06, "loss": 0.0598, "step": 2817 }, { "epoch": 2.599630996309963, "grad_norm": 0.40152279558778614, "learning_rate": 2.6775067940205288e-06, "loss": 0.0606, "step": 2818 }, { "epoch": 2.6005535055350553, "grad_norm": 0.3733066792037547, "learning_rate": 2.6654338838876665e-06, "loss": 0.0585, "step": 2819 }, { "epoch": 2.6014760147601477, "grad_norm": 0.45095338506563537, "learning_rate": 2.6533867208702433e-06, "loss": 0.0654, "step": 2820 }, { "epoch": 2.60239852398524, "grad_norm": 0.4069945137391312, "learning_rate": 2.641365318856126e-06, "loss": 0.0615, "step": 2821 }, { "epoch": 2.603321033210332, "grad_norm": 0.520598095821637, "learning_rate": 2.6293696917035066e-06, "loss": 0.0713, "step": 2822 }, { "epoch": 2.6042435424354244, "grad_norm": 0.35595572132161335, "learning_rate": 2.6173998532408347e-06, "loss": 0.0557, "step": 2823 }, { "epoch": 2.6051660516605164, "grad_norm": 0.42742458159350666, "learning_rate": 2.6054558172668607e-06, "loss": 0.0587, "step": 2824 }, { "epoch": 2.606088560885609, "grad_norm": 0.4849666629112722, "learning_rate": 2.593537597550577e-06, "loss": 0.0636, "step": 2825 }, { "epoch": 2.607011070110701, "grad_norm": 0.40275280892370763, "learning_rate": 2.581645207831204e-06, "loss": 0.0592, "step": 2826 }, { "epoch": 2.6079335793357936, "grad_norm": 0.37392792931653523, "learning_rate": 2.569778661818209e-06, "loss": 0.0613, "step": 2827 }, { "epoch": 2.6088560885608856, "grad_norm": 0.4020759847705154, "learning_rate": 2.5579379731912517e-06, "loss": 0.0582, "step": 2828 }, { "epoch": 2.609778597785978, "grad_norm": 0.38384509311715215, "learning_rate": 2.5461231556001803e-06, "loss": 0.0612, "step": 2829 }, { "epoch": 2.61070110701107, "grad_norm": 0.38682217224419096, "learning_rate": 2.53433422266503e-06, "loss": 0.0649, "step": 2830 }, { "epoch": 2.6116236162361623, "grad_norm": 0.3982787313816613, "learning_rate": 2.522571187975997e-06, "loss": 0.0588, "step": 2831 }, { "epoch": 2.6125461254612548, "grad_norm": 0.39020127803349186, "learning_rate": 2.5108340650934065e-06, "loss": 0.0624, "step": 2832 }, { "epoch": 2.6134686346863467, "grad_norm": 0.35400390451308544, "learning_rate": 2.499122867547729e-06, "loss": 0.0496, "step": 2833 }, { "epoch": 2.614391143911439, "grad_norm": 0.39965035233977825, "learning_rate": 2.487437608839546e-06, "loss": 0.0632, "step": 2834 }, { "epoch": 2.6153136531365315, "grad_norm": 0.3550887873627179, "learning_rate": 2.475778302439524e-06, "loss": 0.0483, "step": 2835 }, { "epoch": 2.6162361623616235, "grad_norm": 0.40614520630097883, "learning_rate": 2.4641449617884257e-06, "loss": 0.0633, "step": 2836 }, { "epoch": 2.617158671586716, "grad_norm": 0.40055305533410873, "learning_rate": 2.4525376002970835e-06, "loss": 0.0656, "step": 2837 }, { "epoch": 2.6180811808118083, "grad_norm": 0.3505541228685418, "learning_rate": 2.4409562313463642e-06, "loss": 0.0558, "step": 2838 }, { "epoch": 2.6190036900369003, "grad_norm": 0.4178637542478256, "learning_rate": 2.429400868287182e-06, "loss": 0.065, "step": 2839 }, { "epoch": 2.6199261992619927, "grad_norm": 0.3666081677039269, "learning_rate": 2.4178715244404794e-06, "loss": 0.0525, "step": 2840 }, { "epoch": 2.6208487084870846, "grad_norm": 0.3871509737468735, "learning_rate": 2.406368213097185e-06, "loss": 0.0607, "step": 2841 }, { "epoch": 2.621771217712177, "grad_norm": 0.36603426397149014, "learning_rate": 2.3948909475182275e-06, "loss": 0.0578, "step": 2842 }, { "epoch": 2.6226937269372694, "grad_norm": 0.39963824073936477, "learning_rate": 2.3834397409345205e-06, "loss": 0.0624, "step": 2843 }, { "epoch": 2.623616236162362, "grad_norm": 0.4236075293068511, "learning_rate": 2.372014606546913e-06, "loss": 0.0645, "step": 2844 }, { "epoch": 2.624538745387454, "grad_norm": 0.35510643010869397, "learning_rate": 2.360615557526219e-06, "loss": 0.0586, "step": 2845 }, { "epoch": 2.625461254612546, "grad_norm": 0.40337374663560444, "learning_rate": 2.3492426070131747e-06, "loss": 0.058, "step": 2846 }, { "epoch": 2.626383763837638, "grad_norm": 0.35887743196715344, "learning_rate": 2.3378957681184283e-06, "loss": 0.0548, "step": 2847 }, { "epoch": 2.6273062730627306, "grad_norm": 0.40549551953993745, "learning_rate": 2.326575053922525e-06, "loss": 0.0691, "step": 2848 }, { "epoch": 2.628228782287823, "grad_norm": 0.3410434892822295, "learning_rate": 2.315280477475906e-06, "loss": 0.0622, "step": 2849 }, { "epoch": 2.6291512915129154, "grad_norm": 0.4234698785178658, "learning_rate": 2.3040120517988593e-06, "loss": 0.0553, "step": 2850 }, { "epoch": 2.6300738007380073, "grad_norm": 0.4480502742154732, "learning_rate": 2.2927697898815465e-06, "loss": 0.0601, "step": 2851 }, { "epoch": 2.6309963099630997, "grad_norm": 0.4183436153605026, "learning_rate": 2.281553704683964e-06, "loss": 0.0667, "step": 2852 }, { "epoch": 2.6319188191881917, "grad_norm": 0.3268256625000969, "learning_rate": 2.270363809135917e-06, "loss": 0.0473, "step": 2853 }, { "epoch": 2.632841328413284, "grad_norm": 0.4243308731347921, "learning_rate": 2.2592001161370392e-06, "loss": 0.0579, "step": 2854 }, { "epoch": 2.6337638376383765, "grad_norm": 0.4114129868395799, "learning_rate": 2.2480626385567525e-06, "loss": 0.0525, "step": 2855 }, { "epoch": 2.6346863468634685, "grad_norm": 0.36434385215921766, "learning_rate": 2.2369513892342458e-06, "loss": 0.0534, "step": 2856 }, { "epoch": 2.635608856088561, "grad_norm": 0.4154254525152973, "learning_rate": 2.2258663809784892e-06, "loss": 0.0589, "step": 2857 }, { "epoch": 2.6365313653136533, "grad_norm": 0.3769354100037087, "learning_rate": 2.2148076265681883e-06, "loss": 0.0533, "step": 2858 }, { "epoch": 2.6374538745387452, "grad_norm": 0.5050526804534307, "learning_rate": 2.2037751387517902e-06, "loss": 0.0581, "step": 2859 }, { "epoch": 2.6383763837638377, "grad_norm": 0.47037972364930525, "learning_rate": 2.1927689302474714e-06, "loss": 0.0718, "step": 2860 }, { "epoch": 2.63929889298893, "grad_norm": 0.38484287178920423, "learning_rate": 2.1817890137430934e-06, "loss": 0.0656, "step": 2861 }, { "epoch": 2.640221402214022, "grad_norm": 0.3731154212738967, "learning_rate": 2.1708354018962236e-06, "loss": 0.0614, "step": 2862 }, { "epoch": 2.6411439114391144, "grad_norm": 0.3610043698344828, "learning_rate": 2.159908107334102e-06, "loss": 0.0572, "step": 2863 }, { "epoch": 2.6420664206642064, "grad_norm": 0.435744217972833, "learning_rate": 2.149007142653625e-06, "loss": 0.059, "step": 2864 }, { "epoch": 2.642988929889299, "grad_norm": 0.412794686187554, "learning_rate": 2.138132520421346e-06, "loss": 0.0542, "step": 2865 }, { "epoch": 2.643911439114391, "grad_norm": 0.39129914778294933, "learning_rate": 2.127284253173445e-06, "loss": 0.0677, "step": 2866 }, { "epoch": 2.6448339483394836, "grad_norm": 0.36160315196238885, "learning_rate": 2.116462353415716e-06, "loss": 0.057, "step": 2867 }, { "epoch": 2.6457564575645756, "grad_norm": 0.485793283197896, "learning_rate": 2.1056668336235622e-06, "loss": 0.0693, "step": 2868 }, { "epoch": 2.646678966789668, "grad_norm": 0.4181192945796221, "learning_rate": 2.0948977062419854e-06, "loss": 0.057, "step": 2869 }, { "epoch": 2.64760147601476, "grad_norm": 0.3830742446174007, "learning_rate": 2.084154983685538e-06, "loss": 0.0592, "step": 2870 }, { "epoch": 2.6485239852398523, "grad_norm": 0.44990348728191754, "learning_rate": 2.0734386783383573e-06, "loss": 0.0575, "step": 2871 }, { "epoch": 2.6494464944649447, "grad_norm": 0.35535204353980193, "learning_rate": 2.0627488025541154e-06, "loss": 0.0541, "step": 2872 }, { "epoch": 2.650369003690037, "grad_norm": 0.387494327233977, "learning_rate": 2.0520853686560178e-06, "loss": 0.0599, "step": 2873 }, { "epoch": 2.651291512915129, "grad_norm": 0.392244538962278, "learning_rate": 2.041448388936784e-06, "loss": 0.055, "step": 2874 }, { "epoch": 2.6522140221402215, "grad_norm": 0.45221744715193424, "learning_rate": 2.030837875658656e-06, "loss": 0.0694, "step": 2875 }, { "epoch": 2.6531365313653135, "grad_norm": 0.40471237010919736, "learning_rate": 2.0202538410533352e-06, "loss": 0.0661, "step": 2876 }, { "epoch": 2.654059040590406, "grad_norm": 0.3860743963032248, "learning_rate": 2.0096962973220225e-06, "loss": 0.0614, "step": 2877 }, { "epoch": 2.6549815498154983, "grad_norm": 0.3652598004060305, "learning_rate": 1.999165256635377e-06, "loss": 0.0592, "step": 2878 }, { "epoch": 2.6559040590405907, "grad_norm": 0.3903898261051929, "learning_rate": 1.988660731133499e-06, "loss": 0.0541, "step": 2879 }, { "epoch": 2.6568265682656826, "grad_norm": 0.401467646881587, "learning_rate": 1.9781827329259125e-06, "loss": 0.0575, "step": 2880 }, { "epoch": 2.657749077490775, "grad_norm": 0.36351105489318314, "learning_rate": 1.9677312740915913e-06, "loss": 0.052, "step": 2881 }, { "epoch": 2.658671586715867, "grad_norm": 0.38888434186610843, "learning_rate": 1.9573063666788875e-06, "loss": 0.0627, "step": 2882 }, { "epoch": 2.6595940959409594, "grad_norm": 0.4088245619118583, "learning_rate": 1.946908022705546e-06, "loss": 0.0727, "step": 2883 }, { "epoch": 2.660516605166052, "grad_norm": 0.3508965388722947, "learning_rate": 1.9365362541587132e-06, "loss": 0.0535, "step": 2884 }, { "epoch": 2.661439114391144, "grad_norm": 0.46244332242395547, "learning_rate": 1.926191072994879e-06, "loss": 0.0666, "step": 2885 }, { "epoch": 2.662361623616236, "grad_norm": 0.4542887467397042, "learning_rate": 1.915872491139875e-06, "loss": 0.0646, "step": 2886 }, { "epoch": 2.663284132841328, "grad_norm": 0.358909025960324, "learning_rate": 1.9055805204889033e-06, "loss": 0.0573, "step": 2887 }, { "epoch": 2.6642066420664205, "grad_norm": 0.4069627859242188, "learning_rate": 1.8953151729064532e-06, "loss": 0.0622, "step": 2888 }, { "epoch": 2.665129151291513, "grad_norm": 0.3507560973723166, "learning_rate": 1.8850764602263426e-06, "loss": 0.0548, "step": 2889 }, { "epoch": 2.6660516605166054, "grad_norm": 0.3981380354869457, "learning_rate": 1.874864394251688e-06, "loss": 0.0525, "step": 2890 }, { "epoch": 2.6669741697416973, "grad_norm": 0.395914315217764, "learning_rate": 1.864678986754867e-06, "loss": 0.0548, "step": 2891 }, { "epoch": 2.6678966789667897, "grad_norm": 0.3588837691563503, "learning_rate": 1.8545202494775509e-06, "loss": 0.0492, "step": 2892 }, { "epoch": 2.6688191881918817, "grad_norm": 0.4808091742446625, "learning_rate": 1.8443881941306417e-06, "loss": 0.0687, "step": 2893 }, { "epoch": 2.669741697416974, "grad_norm": 0.4107640158220843, "learning_rate": 1.8342828323943046e-06, "loss": 0.0525, "step": 2894 }, { "epoch": 2.6706642066420665, "grad_norm": 0.43318566507277145, "learning_rate": 1.8242041759179208e-06, "loss": 0.0581, "step": 2895 }, { "epoch": 2.671586715867159, "grad_norm": 0.36854549918837504, "learning_rate": 1.8141522363200797e-06, "loss": 0.0631, "step": 2896 }, { "epoch": 2.672509225092251, "grad_norm": 0.4240095530003623, "learning_rate": 1.80412702518859e-06, "loss": 0.0656, "step": 2897 }, { "epoch": 2.6734317343173433, "grad_norm": 0.37489214791621156, "learning_rate": 1.7941285540804348e-06, "loss": 0.0644, "step": 2898 }, { "epoch": 2.6743542435424352, "grad_norm": 0.35262465473014704, "learning_rate": 1.784156834521769e-06, "loss": 0.0535, "step": 2899 }, { "epoch": 2.6752767527675276, "grad_norm": 0.4350832985499843, "learning_rate": 1.7742118780079197e-06, "loss": 0.0619, "step": 2900 }, { "epoch": 2.67619926199262, "grad_norm": 0.42795596406317976, "learning_rate": 1.7642936960033578e-06, "loss": 0.0586, "step": 2901 }, { "epoch": 2.6771217712177124, "grad_norm": 0.36301900739582676, "learning_rate": 1.7544022999416792e-06, "loss": 0.0527, "step": 2902 }, { "epoch": 2.6780442804428044, "grad_norm": 0.43278406476884773, "learning_rate": 1.7445377012256126e-06, "loss": 0.0603, "step": 2903 }, { "epoch": 2.678966789667897, "grad_norm": 0.37181142368750203, "learning_rate": 1.7346999112269973e-06, "loss": 0.0571, "step": 2904 }, { "epoch": 2.6798892988929888, "grad_norm": 0.46085662250480114, "learning_rate": 1.7248889412867507e-06, "loss": 0.0659, "step": 2905 }, { "epoch": 2.680811808118081, "grad_norm": 0.35249503133774546, "learning_rate": 1.7151048027148896e-06, "loss": 0.0596, "step": 2906 }, { "epoch": 2.6817343173431736, "grad_norm": 0.3614844575676369, "learning_rate": 1.7053475067904973e-06, "loss": 0.063, "step": 2907 }, { "epoch": 2.6826568265682655, "grad_norm": 0.3419122906463993, "learning_rate": 1.6956170647616982e-06, "loss": 0.0601, "step": 2908 }, { "epoch": 2.683579335793358, "grad_norm": 0.3163027577396601, "learning_rate": 1.6859134878456806e-06, "loss": 0.0617, "step": 2909 }, { "epoch": 2.6845018450184504, "grad_norm": 0.36376314325136916, "learning_rate": 1.6762367872286522e-06, "loss": 0.0582, "step": 2910 }, { "epoch": 2.6854243542435423, "grad_norm": 0.41047964262592385, "learning_rate": 1.6665869740658312e-06, "loss": 0.0683, "step": 2911 }, { "epoch": 2.6863468634686347, "grad_norm": 0.4044684801609656, "learning_rate": 1.6569640594814528e-06, "loss": 0.0466, "step": 2912 }, { "epoch": 2.687269372693727, "grad_norm": 0.3363346378880635, "learning_rate": 1.647368054568743e-06, "loss": 0.0541, "step": 2913 }, { "epoch": 2.688191881918819, "grad_norm": 0.3689758433425425, "learning_rate": 1.6377989703899006e-06, "loss": 0.0569, "step": 2914 }, { "epoch": 2.6891143911439115, "grad_norm": 0.3465649965905109, "learning_rate": 1.6282568179760787e-06, "loss": 0.0512, "step": 2915 }, { "epoch": 2.6900369003690034, "grad_norm": 0.46391072040527415, "learning_rate": 1.6187416083274149e-06, "loss": 0.0549, "step": 2916 }, { "epoch": 2.690959409594096, "grad_norm": 0.38803061785917914, "learning_rate": 1.6092533524129622e-06, "loss": 0.0532, "step": 2917 }, { "epoch": 2.6918819188191883, "grad_norm": 0.3955683583672091, "learning_rate": 1.5997920611707017e-06, "loss": 0.0594, "step": 2918 }, { "epoch": 2.6928044280442807, "grad_norm": 0.37998147874811905, "learning_rate": 1.5903577455075508e-06, "loss": 0.0539, "step": 2919 }, { "epoch": 2.6937269372693726, "grad_norm": 0.3900903992942932, "learning_rate": 1.5809504162993094e-06, "loss": 0.0518, "step": 2920 }, { "epoch": 2.694649446494465, "grad_norm": 0.42179565305103933, "learning_rate": 1.571570084390664e-06, "loss": 0.061, "step": 2921 }, { "epoch": 2.695571955719557, "grad_norm": 0.36424224717684905, "learning_rate": 1.5622167605952086e-06, "loss": 0.056, "step": 2922 }, { "epoch": 2.6964944649446494, "grad_norm": 0.38416630537088486, "learning_rate": 1.552890455695369e-06, "loss": 0.0616, "step": 2923 }, { "epoch": 2.697416974169742, "grad_norm": 0.4007798955554396, "learning_rate": 1.5435911804424357e-06, "loss": 0.0608, "step": 2924 }, { "epoch": 2.698339483394834, "grad_norm": 0.39524893043698506, "learning_rate": 1.5343189455565537e-06, "loss": 0.0608, "step": 2925 }, { "epoch": 2.699261992619926, "grad_norm": 0.3734287745141257, "learning_rate": 1.5250737617266753e-06, "loss": 0.052, "step": 2926 }, { "epoch": 2.7001845018450186, "grad_norm": 0.4006702967524649, "learning_rate": 1.5158556396105749e-06, "loss": 0.0564, "step": 2927 }, { "epoch": 2.7011070110701105, "grad_norm": 0.36161962739478526, "learning_rate": 1.5066645898348385e-06, "loss": 0.0606, "step": 2928 }, { "epoch": 2.702029520295203, "grad_norm": 0.38078822448583644, "learning_rate": 1.497500622994835e-06, "loss": 0.0575, "step": 2929 }, { "epoch": 2.7029520295202953, "grad_norm": 0.4449918524142434, "learning_rate": 1.4883637496547142e-06, "loss": 0.0651, "step": 2930 }, { "epoch": 2.7038745387453873, "grad_norm": 0.384957097979171, "learning_rate": 1.479253980347392e-06, "loss": 0.0696, "step": 2931 }, { "epoch": 2.7047970479704797, "grad_norm": 0.415133375131178, "learning_rate": 1.4701713255745403e-06, "loss": 0.0566, "step": 2932 }, { "epoch": 2.705719557195572, "grad_norm": 0.38360118027466955, "learning_rate": 1.4611157958065807e-06, "loss": 0.0607, "step": 2933 }, { "epoch": 2.706642066420664, "grad_norm": 0.34898906743695224, "learning_rate": 1.4520874014826463e-06, "loss": 0.0536, "step": 2934 }, { "epoch": 2.7075645756457565, "grad_norm": 0.35629998158132603, "learning_rate": 1.4430861530106087e-06, "loss": 0.0542, "step": 2935 }, { "epoch": 2.708487084870849, "grad_norm": 0.36544252176415665, "learning_rate": 1.4341120607670371e-06, "loss": 0.0591, "step": 2936 }, { "epoch": 2.709409594095941, "grad_norm": 0.3524151942164848, "learning_rate": 1.4251651350971896e-06, "loss": 0.0493, "step": 2937 }, { "epoch": 2.7103321033210332, "grad_norm": 0.335656337855431, "learning_rate": 1.4162453863150183e-06, "loss": 0.0537, "step": 2938 }, { "epoch": 2.711254612546125, "grad_norm": 0.3865218249137243, "learning_rate": 1.4073528247031426e-06, "loss": 0.0557, "step": 2939 }, { "epoch": 2.7121771217712176, "grad_norm": 0.3438854336540138, "learning_rate": 1.3984874605128345e-06, "loss": 0.054, "step": 2940 }, { "epoch": 2.71309963099631, "grad_norm": 0.3648875446352297, "learning_rate": 1.3896493039640163e-06, "loss": 0.0589, "step": 2941 }, { "epoch": 2.7140221402214024, "grad_norm": 0.36558746492823224, "learning_rate": 1.3808383652452545e-06, "loss": 0.0558, "step": 2942 }, { "epoch": 2.7149446494464944, "grad_norm": 0.41658909272946937, "learning_rate": 1.3720546545137215e-06, "loss": 0.0566, "step": 2943 }, { "epoch": 2.715867158671587, "grad_norm": 0.3751851162404485, "learning_rate": 1.3632981818952145e-06, "loss": 0.0639, "step": 2944 }, { "epoch": 2.7167896678966788, "grad_norm": 0.3673279764095167, "learning_rate": 1.3545689574841342e-06, "loss": 0.0628, "step": 2945 }, { "epoch": 2.717712177121771, "grad_norm": 0.4019450184845962, "learning_rate": 1.345866991343453e-06, "loss": 0.0587, "step": 2946 }, { "epoch": 2.7186346863468636, "grad_norm": 0.3298962322639587, "learning_rate": 1.3371922935047355e-06, "loss": 0.0596, "step": 2947 }, { "epoch": 2.719557195571956, "grad_norm": 0.4767254384847376, "learning_rate": 1.32854487396811e-06, "loss": 0.0608, "step": 2948 }, { "epoch": 2.720479704797048, "grad_norm": 0.3442916421234061, "learning_rate": 1.3199247427022526e-06, "loss": 0.0545, "step": 2949 }, { "epoch": 2.7214022140221403, "grad_norm": 0.4750998762930767, "learning_rate": 1.3113319096443728e-06, "loss": 0.0634, "step": 2950 }, { "epoch": 2.7223247232472323, "grad_norm": 0.33294417897663703, "learning_rate": 1.302766384700238e-06, "loss": 0.0578, "step": 2951 }, { "epoch": 2.7232472324723247, "grad_norm": 0.3918038809940905, "learning_rate": 1.2942281777441168e-06, "loss": 0.0608, "step": 2952 }, { "epoch": 2.724169741697417, "grad_norm": 0.3904724565577004, "learning_rate": 1.2857172986187744e-06, "loss": 0.0544, "step": 2953 }, { "epoch": 2.725092250922509, "grad_norm": 0.3792220771172754, "learning_rate": 1.2772337571355043e-06, "loss": 0.0671, "step": 2954 }, { "epoch": 2.7260147601476015, "grad_norm": 0.3508443223549421, "learning_rate": 1.2687775630740612e-06, "loss": 0.0537, "step": 2955 }, { "epoch": 2.726937269372694, "grad_norm": 0.3958974325498116, "learning_rate": 1.2603487261826724e-06, "loss": 0.0493, "step": 2956 }, { "epoch": 2.727859778597786, "grad_norm": 0.4394079976531421, "learning_rate": 1.2519472561780488e-06, "loss": 0.0626, "step": 2957 }, { "epoch": 2.7287822878228782, "grad_norm": 0.4059333547531129, "learning_rate": 1.2435731627453345e-06, "loss": 0.0617, "step": 2958 }, { "epoch": 2.7297047970479706, "grad_norm": 0.3699666582867163, "learning_rate": 1.2352264555381132e-06, "loss": 0.061, "step": 2959 }, { "epoch": 2.7306273062730626, "grad_norm": 0.3435908868137269, "learning_rate": 1.2269071441784158e-06, "loss": 0.0485, "step": 2960 }, { "epoch": 2.731549815498155, "grad_norm": 0.35712773344744997, "learning_rate": 1.2186152382566763e-06, "loss": 0.0536, "step": 2961 }, { "epoch": 2.732472324723247, "grad_norm": 0.35871029191430415, "learning_rate": 1.2103507473317371e-06, "loss": 0.0562, "step": 2962 }, { "epoch": 2.7333948339483394, "grad_norm": 0.4105326655298472, "learning_rate": 1.2021136809308386e-06, "loss": 0.0673, "step": 2963 }, { "epoch": 2.734317343173432, "grad_norm": 0.3629105146533966, "learning_rate": 1.1939040485496155e-06, "loss": 0.0597, "step": 2964 }, { "epoch": 2.735239852398524, "grad_norm": 0.47648826594715293, "learning_rate": 1.1857218596520586e-06, "loss": 0.0645, "step": 2965 }, { "epoch": 2.736162361623616, "grad_norm": 0.4057763435449541, "learning_rate": 1.1775671236705365e-06, "loss": 0.0644, "step": 2966 }, { "epoch": 2.7370848708487086, "grad_norm": 0.41786394043315017, "learning_rate": 1.1694398500057714e-06, "loss": 0.0614, "step": 2967 }, { "epoch": 2.7380073800738005, "grad_norm": 0.37695759953636465, "learning_rate": 1.1613400480268099e-06, "loss": 0.0545, "step": 2968 }, { "epoch": 2.738929889298893, "grad_norm": 0.42107806972519235, "learning_rate": 1.1532677270710501e-06, "loss": 0.0579, "step": 2969 }, { "epoch": 2.7398523985239853, "grad_norm": 0.36109942662296984, "learning_rate": 1.1452228964442007e-06, "loss": 0.0552, "step": 2970 }, { "epoch": 2.7407749077490777, "grad_norm": 0.38851677479381025, "learning_rate": 1.1372055654202768e-06, "loss": 0.061, "step": 2971 }, { "epoch": 2.7416974169741697, "grad_norm": 0.4020199566520895, "learning_rate": 1.1292157432415962e-06, "loss": 0.0538, "step": 2972 }, { "epoch": 2.742619926199262, "grad_norm": 0.37460538612402966, "learning_rate": 1.121253439118769e-06, "loss": 0.0545, "step": 2973 }, { "epoch": 2.743542435424354, "grad_norm": 0.3474712521462762, "learning_rate": 1.1133186622306724e-06, "loss": 0.0528, "step": 2974 }, { "epoch": 2.7444649446494465, "grad_norm": 0.3499998332064438, "learning_rate": 1.105411421724456e-06, "loss": 0.0576, "step": 2975 }, { "epoch": 2.745387453874539, "grad_norm": 0.3491895773043017, "learning_rate": 1.0975317267155283e-06, "loss": 0.0557, "step": 2976 }, { "epoch": 2.7463099630996313, "grad_norm": 0.43970555215228807, "learning_rate": 1.0896795862875425e-06, "loss": 0.0649, "step": 2977 }, { "epoch": 2.7472324723247232, "grad_norm": 0.35652163871423376, "learning_rate": 1.081855009492383e-06, "loss": 0.0535, "step": 2978 }, { "epoch": 2.7481549815498156, "grad_norm": 0.3621997806387468, "learning_rate": 1.0740580053501592e-06, "loss": 0.0544, "step": 2979 }, { "epoch": 2.7490774907749076, "grad_norm": 0.3697751229366689, "learning_rate": 1.0662885828492036e-06, "loss": 0.0529, "step": 2980 }, { "epoch": 2.75, "grad_norm": 0.39400988537913456, "learning_rate": 1.0585467509460378e-06, "loss": 0.0525, "step": 2981 }, { "epoch": 2.7509225092250924, "grad_norm": 0.3465690456368866, "learning_rate": 1.0508325185653921e-06, "loss": 0.0551, "step": 2982 }, { "epoch": 2.7518450184501844, "grad_norm": 0.4509089421040558, "learning_rate": 1.0431458946001754e-06, "loss": 0.0578, "step": 2983 }, { "epoch": 2.7527675276752768, "grad_norm": 0.36956723380857287, "learning_rate": 1.035486887911466e-06, "loss": 0.051, "step": 2984 }, { "epoch": 2.7536900369003687, "grad_norm": 0.4224287758338531, "learning_rate": 1.027855507328504e-06, "loss": 0.0603, "step": 2985 }, { "epoch": 2.754612546125461, "grad_norm": 0.3681992863929429, "learning_rate": 1.0202517616486911e-06, "loss": 0.0589, "step": 2986 }, { "epoch": 2.7555350553505535, "grad_norm": 0.3675789213367372, "learning_rate": 1.0126756596375686e-06, "loss": 0.0547, "step": 2987 }, { "epoch": 2.756457564575646, "grad_norm": 0.4003914269444886, "learning_rate": 1.0051272100287974e-06, "loss": 0.0635, "step": 2988 }, { "epoch": 2.757380073800738, "grad_norm": 0.3809235673066969, "learning_rate": 9.97606421524186e-07, "loss": 0.0597, "step": 2989 }, { "epoch": 2.7583025830258303, "grad_norm": 0.34379817476614527, "learning_rate": 9.901133027936326e-07, "loss": 0.0539, "step": 2990 }, { "epoch": 2.7592250922509223, "grad_norm": 0.36702117020991015, "learning_rate": 9.826478624751445e-07, "loss": 0.0532, "step": 2991 }, { "epoch": 2.7601476014760147, "grad_norm": 0.44445769585444944, "learning_rate": 9.752101091748345e-07, "loss": 0.0621, "step": 2992 }, { "epoch": 2.761070110701107, "grad_norm": 0.4537384075371174, "learning_rate": 9.6780005146688e-07, "loss": 0.066, "step": 2993 }, { "epoch": 2.7619926199261995, "grad_norm": 0.39150259327252046, "learning_rate": 9.604176978935343e-07, "loss": 0.0604, "step": 2994 }, { "epoch": 2.7629151291512914, "grad_norm": 0.40388872411212934, "learning_rate": 9.530630569651255e-07, "loss": 0.0618, "step": 2995 }, { "epoch": 2.763837638376384, "grad_norm": 0.34396772103566386, "learning_rate": 9.457361371600249e-07, "loss": 0.0513, "step": 2996 }, { "epoch": 2.764760147601476, "grad_norm": 0.4942270722001889, "learning_rate": 9.384369469246452e-07, "loss": 0.0646, "step": 2997 }, { "epoch": 2.765682656826568, "grad_norm": 0.36904705364153034, "learning_rate": 9.311654946734388e-07, "loss": 0.0626, "step": 2998 }, { "epoch": 2.7666051660516606, "grad_norm": 0.4068666991950199, "learning_rate": 9.23921788788884e-07, "loss": 0.0714, "step": 2999 }, { "epoch": 2.767527675276753, "grad_norm": 0.43526029054214177, "learning_rate": 9.167058376214621e-07, "loss": 0.0593, "step": 3000 }, { "epoch": 2.768450184501845, "grad_norm": 0.37761165148831355, "learning_rate": 9.095176494896663e-07, "loss": 0.056, "step": 3001 }, { "epoch": 2.7693726937269374, "grad_norm": 0.3646292823550303, "learning_rate": 9.023572326799929e-07, "loss": 0.0539, "step": 3002 }, { "epoch": 2.7702952029520294, "grad_norm": 0.3593927712199849, "learning_rate": 8.952245954469057e-07, "loss": 0.054, "step": 3003 }, { "epoch": 2.7712177121771218, "grad_norm": 0.44990180566298993, "learning_rate": 8.881197460128581e-07, "loss": 0.0659, "step": 3004 }, { "epoch": 2.772140221402214, "grad_norm": 0.3712113285117311, "learning_rate": 8.81042692568268e-07, "loss": 0.055, "step": 3005 }, { "epoch": 2.773062730627306, "grad_norm": 0.3705332344243857, "learning_rate": 8.739934432715035e-07, "loss": 0.0632, "step": 3006 }, { "epoch": 2.7739852398523985, "grad_norm": 0.3205372003448543, "learning_rate": 8.66972006248884e-07, "loss": 0.0575, "step": 3007 }, { "epoch": 2.774907749077491, "grad_norm": 0.4018872116949969, "learning_rate": 8.599783895946761e-07, "loss": 0.0638, "step": 3008 }, { "epoch": 2.775830258302583, "grad_norm": 0.36652579061351526, "learning_rate": 8.53012601371056e-07, "loss": 0.0578, "step": 3009 }, { "epoch": 2.7767527675276753, "grad_norm": 0.3202958884783173, "learning_rate": 8.460746496081362e-07, "loss": 0.0498, "step": 3010 }, { "epoch": 2.7776752767527677, "grad_norm": 0.37852921602778405, "learning_rate": 8.391645423039357e-07, "loss": 0.0557, "step": 3011 }, { "epoch": 2.7785977859778597, "grad_norm": 0.35706112242489096, "learning_rate": 8.322822874243686e-07, "loss": 0.0565, "step": 3012 }, { "epoch": 2.779520295202952, "grad_norm": 0.4014573659348028, "learning_rate": 8.254278929032494e-07, "loss": 0.0551, "step": 3013 }, { "epoch": 2.780442804428044, "grad_norm": 0.37341034836485737, "learning_rate": 8.186013666422687e-07, "loss": 0.0553, "step": 3014 }, { "epoch": 2.7813653136531364, "grad_norm": 0.3703888430668142, "learning_rate": 8.118027165109926e-07, "loss": 0.0567, "step": 3015 }, { "epoch": 2.782287822878229, "grad_norm": 0.36963410147160064, "learning_rate": 8.050319503468546e-07, "loss": 0.0609, "step": 3016 }, { "epoch": 2.7832103321033212, "grad_norm": 0.45553415284580095, "learning_rate": 7.982890759551415e-07, "loss": 0.0763, "step": 3017 }, { "epoch": 2.784132841328413, "grad_norm": 0.3528517617568652, "learning_rate": 7.915741011089855e-07, "loss": 0.0559, "step": 3018 }, { "epoch": 2.7850553505535056, "grad_norm": 0.39876071016487835, "learning_rate": 7.848870335493613e-07, "loss": 0.0682, "step": 3019 }, { "epoch": 2.7859778597785976, "grad_norm": 0.344238782439648, "learning_rate": 7.78227880985058e-07, "loss": 0.052, "step": 3020 }, { "epoch": 2.78690036900369, "grad_norm": 0.47872136961864875, "learning_rate": 7.715966510927097e-07, "loss": 0.0602, "step": 3021 }, { "epoch": 2.7878228782287824, "grad_norm": 0.35265544529469695, "learning_rate": 7.649933515167407e-07, "loss": 0.0561, "step": 3022 }, { "epoch": 2.788745387453875, "grad_norm": 0.348002235267397, "learning_rate": 7.584179898693783e-07, "loss": 0.0554, "step": 3023 }, { "epoch": 2.7896678966789668, "grad_norm": 0.38996218600694976, "learning_rate": 7.518705737306591e-07, "loss": 0.0621, "step": 3024 }, { "epoch": 2.790590405904059, "grad_norm": 0.35492438028981754, "learning_rate": 7.453511106483902e-07, "loss": 0.0535, "step": 3025 }, { "epoch": 2.791512915129151, "grad_norm": 0.37061112978174804, "learning_rate": 7.38859608138151e-07, "loss": 0.0557, "step": 3026 }, { "epoch": 2.7924354243542435, "grad_norm": 0.4094905475119617, "learning_rate": 7.323960736833057e-07, "loss": 0.0702, "step": 3027 }, { "epoch": 2.793357933579336, "grad_norm": 0.3645171489168169, "learning_rate": 7.259605147349608e-07, "loss": 0.0578, "step": 3028 }, { "epoch": 2.794280442804428, "grad_norm": 0.4538889363916288, "learning_rate": 7.195529387119815e-07, "loss": 0.0597, "step": 3029 }, { "epoch": 2.7952029520295203, "grad_norm": 0.34334974001270563, "learning_rate": 7.131733530009704e-07, "loss": 0.0547, "step": 3030 }, { "epoch": 2.7961254612546127, "grad_norm": 0.3713757367965027, "learning_rate": 7.06821764956267e-07, "loss": 0.0613, "step": 3031 }, { "epoch": 2.7970479704797047, "grad_norm": 0.3095422789862229, "learning_rate": 7.004981818999279e-07, "loss": 0.0508, "step": 3032 }, { "epoch": 2.797970479704797, "grad_norm": 0.3557566485745798, "learning_rate": 6.942026111217359e-07, "loss": 0.0509, "step": 3033 }, { "epoch": 2.7988929889298895, "grad_norm": 0.412387856193388, "learning_rate": 6.879350598791772e-07, "loss": 0.0572, "step": 3034 }, { "epoch": 2.7998154981549814, "grad_norm": 0.3548976624104618, "learning_rate": 6.816955353974335e-07, "loss": 0.0608, "step": 3035 }, { "epoch": 2.800738007380074, "grad_norm": 0.38439505994481193, "learning_rate": 6.754840448693789e-07, "loss": 0.061, "step": 3036 }, { "epoch": 2.801660516605166, "grad_norm": 0.33877961758384073, "learning_rate": 6.69300595455577e-07, "loss": 0.0543, "step": 3037 }, { "epoch": 2.802583025830258, "grad_norm": 0.41338407939341637, "learning_rate": 6.631451942842565e-07, "loss": 0.0541, "step": 3038 }, { "epoch": 2.8035055350553506, "grad_norm": 0.4324337931055607, "learning_rate": 6.570178484513162e-07, "loss": 0.0556, "step": 3039 }, { "epoch": 2.804428044280443, "grad_norm": 0.3658053320531132, "learning_rate": 6.50918565020317e-07, "loss": 0.0607, "step": 3040 }, { "epoch": 2.805350553505535, "grad_norm": 0.40511232536405956, "learning_rate": 6.448473510224595e-07, "loss": 0.065, "step": 3041 }, { "epoch": 2.8062730627306274, "grad_norm": 0.3544204839763948, "learning_rate": 6.388042134565953e-07, "loss": 0.0555, "step": 3042 }, { "epoch": 2.8071955719557193, "grad_norm": 0.37855795799058395, "learning_rate": 6.327891592892127e-07, "loss": 0.0515, "step": 3043 }, { "epoch": 2.8081180811808117, "grad_norm": 0.4186095737100796, "learning_rate": 6.268021954544096e-07, "loss": 0.06, "step": 3044 }, { "epoch": 2.809040590405904, "grad_norm": 0.34904205929911225, "learning_rate": 6.208433288539178e-07, "loss": 0.0546, "step": 3045 }, { "epoch": 2.8099630996309966, "grad_norm": 0.38973795389147187, "learning_rate": 6.149125663570732e-07, "loss": 0.055, "step": 3046 }, { "epoch": 2.8108856088560885, "grad_norm": 0.4023007894124464, "learning_rate": 6.090099148008094e-07, "loss": 0.0696, "step": 3047 }, { "epoch": 2.811808118081181, "grad_norm": 0.3513069846733514, "learning_rate": 6.031353809896611e-07, "loss": 0.0491, "step": 3048 }, { "epoch": 2.812730627306273, "grad_norm": 0.46942274505022347, "learning_rate": 5.97288971695742e-07, "loss": 0.0774, "step": 3049 }, { "epoch": 2.8136531365313653, "grad_norm": 0.3746075770999299, "learning_rate": 5.914706936587494e-07, "loss": 0.067, "step": 3050 }, { "epoch": 2.8145756457564577, "grad_norm": 0.4334892424883726, "learning_rate": 5.856805535859516e-07, "loss": 0.0543, "step": 3051 }, { "epoch": 2.8154981549815496, "grad_norm": 0.36292956029933826, "learning_rate": 5.799185581521732e-07, "loss": 0.0546, "step": 3052 }, { "epoch": 2.816420664206642, "grad_norm": 0.4310683091039621, "learning_rate": 5.741847139998008e-07, "loss": 0.0653, "step": 3053 }, { "epoch": 2.8173431734317345, "grad_norm": 0.3617562819256405, "learning_rate": 5.684790277387663e-07, "loss": 0.0649, "step": 3054 }, { "epoch": 2.8182656826568264, "grad_norm": 0.3570502861875989, "learning_rate": 5.628015059465363e-07, "loss": 0.053, "step": 3055 }, { "epoch": 2.819188191881919, "grad_norm": 0.41306359141018206, "learning_rate": 5.571521551681169e-07, "loss": 0.0596, "step": 3056 }, { "epoch": 2.8201107011070112, "grad_norm": 0.35171632738645336, "learning_rate": 5.515309819160403e-07, "loss": 0.0551, "step": 3057 }, { "epoch": 2.821033210332103, "grad_norm": 0.38256128198924205, "learning_rate": 5.45937992670345e-07, "loss": 0.0587, "step": 3058 }, { "epoch": 2.8219557195571956, "grad_norm": 0.35631455510809057, "learning_rate": 5.403731938785878e-07, "loss": 0.0534, "step": 3059 }, { "epoch": 2.8228782287822876, "grad_norm": 0.3913909917635343, "learning_rate": 5.348365919558285e-07, "loss": 0.0665, "step": 3060 }, { "epoch": 2.82380073800738, "grad_norm": 0.3976575641492326, "learning_rate": 5.293281932846145e-07, "loss": 0.067, "step": 3061 }, { "epoch": 2.8247232472324724, "grad_norm": 0.3878091048370286, "learning_rate": 5.238480042149913e-07, "loss": 0.0611, "step": 3062 }, { "epoch": 2.8256457564575648, "grad_norm": 0.44775564598488815, "learning_rate": 5.183960310644748e-07, "loss": 0.0642, "step": 3063 }, { "epoch": 2.8265682656826567, "grad_norm": 0.39969690272794917, "learning_rate": 5.129722801180542e-07, "loss": 0.0624, "step": 3064 }, { "epoch": 2.827490774907749, "grad_norm": 0.40145395859816396, "learning_rate": 5.07576757628192e-07, "loss": 0.06, "step": 3065 }, { "epoch": 2.828413284132841, "grad_norm": 0.35038941278206576, "learning_rate": 5.022094698148072e-07, "loss": 0.0603, "step": 3066 }, { "epoch": 2.8293357933579335, "grad_norm": 0.3912530660871608, "learning_rate": 4.968704228652643e-07, "loss": 0.0679, "step": 3067 }, { "epoch": 2.830258302583026, "grad_norm": 0.4503668889861151, "learning_rate": 4.915596229343733e-07, "loss": 0.0713, "step": 3068 }, { "epoch": 2.8311808118081183, "grad_norm": 0.40552994897158007, "learning_rate": 4.862770761443896e-07, "loss": 0.0529, "step": 3069 }, { "epoch": 2.8321033210332103, "grad_norm": 0.3965084132876259, "learning_rate": 4.810227885849866e-07, "loss": 0.0586, "step": 3070 }, { "epoch": 2.8330258302583027, "grad_norm": 0.46853938963440855, "learning_rate": 4.75796766313269e-07, "loss": 0.0713, "step": 3071 }, { "epoch": 2.8339483394833946, "grad_norm": 0.37870988056907734, "learning_rate": 4.705990153537565e-07, "loss": 0.0585, "step": 3072 }, { "epoch": 2.834870848708487, "grad_norm": 0.4048309388437559, "learning_rate": 4.654295416983728e-07, "loss": 0.0608, "step": 3073 }, { "epoch": 2.8357933579335795, "grad_norm": 0.37403675447498214, "learning_rate": 4.602883513064482e-07, "loss": 0.0514, "step": 3074 }, { "epoch": 2.836715867158672, "grad_norm": 0.4200123129227064, "learning_rate": 4.551754501047084e-07, "loss": 0.0607, "step": 3075 }, { "epoch": 2.837638376383764, "grad_norm": 0.35394890850486577, "learning_rate": 4.500908439872664e-07, "loss": 0.0522, "step": 3076 }, { "epoch": 2.838560885608856, "grad_norm": 0.4372989175172387, "learning_rate": 4.4503453881561407e-07, "loss": 0.0607, "step": 3077 }, { "epoch": 2.839483394833948, "grad_norm": 0.4151231584289497, "learning_rate": 4.4000654041862764e-07, "loss": 0.0661, "step": 3078 }, { "epoch": 2.8404059040590406, "grad_norm": 0.38509653531385357, "learning_rate": 4.350068545925373e-07, "loss": 0.0603, "step": 3079 }, { "epoch": 2.841328413284133, "grad_norm": 0.3596935660857444, "learning_rate": 4.300354871009465e-07, "loss": 0.0637, "step": 3080 }, { "epoch": 2.842250922509225, "grad_norm": 0.4119426560230551, "learning_rate": 4.2509244367480994e-07, "loss": 0.0618, "step": 3081 }, { "epoch": 2.8431734317343174, "grad_norm": 0.34913464173804626, "learning_rate": 4.201777300124249e-07, "loss": 0.0519, "step": 3082 }, { "epoch": 2.8440959409594093, "grad_norm": 0.39015850377702105, "learning_rate": 4.152913517794399e-07, "loss": 0.058, "step": 3083 }, { "epoch": 2.8450184501845017, "grad_norm": 0.48286557292257426, "learning_rate": 4.104333146088379e-07, "loss": 0.0673, "step": 3084 }, { "epoch": 2.845940959409594, "grad_norm": 0.4101184601988959, "learning_rate": 4.0560362410091704e-07, "loss": 0.0609, "step": 3085 }, { "epoch": 2.8468634686346865, "grad_norm": 0.4152920275704722, "learning_rate": 4.0080228582331234e-07, "loss": 0.0611, "step": 3086 }, { "epoch": 2.8477859778597785, "grad_norm": 0.3790564708187215, "learning_rate": 3.960293053109687e-07, "loss": 0.06, "step": 3087 }, { "epoch": 2.848708487084871, "grad_norm": 0.4192336015241333, "learning_rate": 3.9128468806614306e-07, "loss": 0.0592, "step": 3088 }, { "epoch": 2.849630996309963, "grad_norm": 0.3809704162794546, "learning_rate": 3.8656843955839075e-07, "loss": 0.0528, "step": 3089 }, { "epoch": 2.8505535055350553, "grad_norm": 0.36785498805492256, "learning_rate": 3.818805652245683e-07, "loss": 0.0592, "step": 3090 }, { "epoch": 2.8514760147601477, "grad_norm": 0.3940379277381955, "learning_rate": 3.7722107046882226e-07, "loss": 0.0511, "step": 3091 }, { "epoch": 2.85239852398524, "grad_norm": 0.39381179296698055, "learning_rate": 3.7258996066258103e-07, "loss": 0.0614, "step": 3092 }, { "epoch": 2.853321033210332, "grad_norm": 0.38861666965095293, "learning_rate": 3.67987241144549e-07, "loss": 0.0589, "step": 3093 }, { "epoch": 2.8542435424354244, "grad_norm": 0.3693220330097373, "learning_rate": 3.6341291722070956e-07, "loss": 0.055, "step": 3094 }, { "epoch": 2.8551660516605164, "grad_norm": 0.4015995540257464, "learning_rate": 3.588669941643086e-07, "loss": 0.0625, "step": 3095 }, { "epoch": 2.856088560885609, "grad_norm": 0.3949330924315789, "learning_rate": 3.5434947721584846e-07, "loss": 0.0566, "step": 3096 }, { "epoch": 2.857011070110701, "grad_norm": 0.41813829293464966, "learning_rate": 3.498603715830884e-07, "loss": 0.062, "step": 3097 }, { "epoch": 2.8579335793357936, "grad_norm": 0.34104975646802455, "learning_rate": 3.453996824410388e-07, "loss": 0.0552, "step": 3098 }, { "epoch": 2.8588560885608856, "grad_norm": 0.37425611677733905, "learning_rate": 3.4096741493194197e-07, "loss": 0.0543, "step": 3099 }, { "epoch": 2.859778597785978, "grad_norm": 0.43738932187425617, "learning_rate": 3.3656357416528285e-07, "loss": 0.0549, "step": 3100 }, { "epoch": 2.86070110701107, "grad_norm": 0.3731623029223221, "learning_rate": 3.321881652177783e-07, "loss": 0.051, "step": 3101 }, { "epoch": 2.8616236162361623, "grad_norm": 0.4723027130587314, "learning_rate": 3.2784119313336305e-07, "loss": 0.0611, "step": 3102 }, { "epoch": 2.8625461254612548, "grad_norm": 0.3851958403736879, "learning_rate": 3.2352266292319243e-07, "loss": 0.0696, "step": 3103 }, { "epoch": 2.8634686346863467, "grad_norm": 0.36550780829170026, "learning_rate": 3.1923257956563703e-07, "loss": 0.0606, "step": 3104 }, { "epoch": 2.864391143911439, "grad_norm": 0.3670853621802748, "learning_rate": 3.1497094800627124e-07, "loss": 0.0474, "step": 3105 }, { "epoch": 2.8653136531365315, "grad_norm": 0.38481821579325054, "learning_rate": 3.107377731578709e-07, "loss": 0.0584, "step": 3106 }, { "epoch": 2.8662361623616235, "grad_norm": 0.43307530254517385, "learning_rate": 3.0653305990040736e-07, "loss": 0.0648, "step": 3107 }, { "epoch": 2.867158671586716, "grad_norm": 0.3870432354135556, "learning_rate": 3.0235681308103945e-07, "loss": 0.0547, "step": 3108 }, { "epoch": 2.8680811808118083, "grad_norm": 0.38574379242952656, "learning_rate": 2.982090375141161e-07, "loss": 0.0494, "step": 3109 }, { "epoch": 2.8690036900369003, "grad_norm": 0.4294302467000169, "learning_rate": 2.9408973798115967e-07, "loss": 0.0678, "step": 3110 }, { "epoch": 2.8699261992619927, "grad_norm": 0.4068011754116402, "learning_rate": 2.899989192308633e-07, "loss": 0.0524, "step": 3111 }, { "epoch": 2.8708487084870846, "grad_norm": 0.3873339654924171, "learning_rate": 2.859365859790963e-07, "loss": 0.0477, "step": 3112 }, { "epoch": 2.871771217712177, "grad_norm": 0.3587163133267895, "learning_rate": 2.819027429088822e-07, "loss": 0.0594, "step": 3113 }, { "epoch": 2.8726937269372694, "grad_norm": 0.377462742092774, "learning_rate": 2.7789739467040666e-07, "loss": 0.0587, "step": 3114 }, { "epoch": 2.873616236162362, "grad_norm": 0.420661122891959, "learning_rate": 2.7392054588100127e-07, "loss": 0.0651, "step": 3115 }, { "epoch": 2.874538745387454, "grad_norm": 0.35825539443343773, "learning_rate": 2.6997220112514877e-07, "loss": 0.065, "step": 3116 }, { "epoch": 2.875461254612546, "grad_norm": 0.36394797721574657, "learning_rate": 2.660523649544666e-07, "loss": 0.0646, "step": 3117 }, { "epoch": 2.876383763837638, "grad_norm": 0.36620511469972156, "learning_rate": 2.6216104188771793e-07, "loss": 0.0623, "step": 3118 }, { "epoch": 2.8773062730627306, "grad_norm": 0.42566183031464044, "learning_rate": 2.5829823641078386e-07, "loss": 0.063, "step": 3119 }, { "epoch": 2.878228782287823, "grad_norm": 0.40045449113591813, "learning_rate": 2.544639529766829e-07, "loss": 0.059, "step": 3120 }, { "epoch": 2.8791512915129154, "grad_norm": 0.43855184624490645, "learning_rate": 2.506581960055432e-07, "loss": 0.0577, "step": 3121 }, { "epoch": 2.8800738007380073, "grad_norm": 0.3590229799292661, "learning_rate": 2.4688096988461084e-07, "loss": 0.057, "step": 3122 }, { "epoch": 2.8809963099630997, "grad_norm": 0.40603600606145884, "learning_rate": 2.431322789682444e-07, "loss": 0.058, "step": 3123 }, { "epoch": 2.8819188191881917, "grad_norm": 0.37206285598652145, "learning_rate": 2.3941212757790934e-07, "loss": 0.0556, "step": 3124 }, { "epoch": 2.882841328413284, "grad_norm": 0.3825211879106796, "learning_rate": 2.3572052000216393e-07, "loss": 0.0523, "step": 3125 }, { "epoch": 2.8837638376383765, "grad_norm": 0.4043895899196592, "learning_rate": 2.3205746049666243e-07, "loss": 0.0503, "step": 3126 }, { "epoch": 2.8846863468634685, "grad_norm": 0.37371240332215466, "learning_rate": 2.284229532841603e-07, "loss": 0.0635, "step": 3127 }, { "epoch": 2.885608856088561, "grad_norm": 0.36449477350329723, "learning_rate": 2.2481700255447825e-07, "loss": 0.0624, "step": 3128 }, { "epoch": 2.8865313653136533, "grad_norm": 0.4207640198889781, "learning_rate": 2.212396124645355e-07, "loss": 0.0639, "step": 3129 }, { "epoch": 2.8874538745387452, "grad_norm": 0.35639617746732283, "learning_rate": 2.1769078713831935e-07, "loss": 0.0521, "step": 3130 }, { "epoch": 2.8883763837638377, "grad_norm": 0.349152719089145, "learning_rate": 2.1417053066688787e-07, "loss": 0.0579, "step": 3131 }, { "epoch": 2.88929889298893, "grad_norm": 0.4019981344649913, "learning_rate": 2.106788471083615e-07, "loss": 0.0535, "step": 3132 }, { "epoch": 2.890221402214022, "grad_norm": 0.40152901733528595, "learning_rate": 2.0721574048793159e-07, "loss": 0.0666, "step": 3133 }, { "epoch": 2.8911439114391144, "grad_norm": 0.36928366702351667, "learning_rate": 2.0378121479783796e-07, "loss": 0.0629, "step": 3134 }, { "epoch": 2.8920664206642064, "grad_norm": 0.4494398328104406, "learning_rate": 2.0037527399737466e-07, "loss": 0.0617, "step": 3135 }, { "epoch": 2.892988929889299, "grad_norm": 0.39146009669205106, "learning_rate": 1.9699792201288703e-07, "loss": 0.0579, "step": 3136 }, { "epoch": 2.893911439114391, "grad_norm": 0.39069058809145935, "learning_rate": 1.936491627377579e-07, "loss": 0.0543, "step": 3137 }, { "epoch": 2.8948339483394836, "grad_norm": 0.39218651628686196, "learning_rate": 1.9032900003241315e-07, "loss": 0.062, "step": 3138 }, { "epoch": 2.8957564575645756, "grad_norm": 0.38327349989050385, "learning_rate": 1.870374377243078e-07, "loss": 0.0669, "step": 3139 }, { "epoch": 2.896678966789668, "grad_norm": 0.38666177814617686, "learning_rate": 1.837744796079288e-07, "loss": 0.0521, "step": 3140 }, { "epoch": 2.89760147601476, "grad_norm": 0.3599162209804183, "learning_rate": 1.8054012944479227e-07, "loss": 0.0538, "step": 3141 }, { "epoch": 2.8985239852398523, "grad_norm": 0.3953192653422239, "learning_rate": 1.7733439096343508e-07, "loss": 0.057, "step": 3142 }, { "epoch": 2.8994464944649447, "grad_norm": 0.3967137925693252, "learning_rate": 1.7415726785939834e-07, "loss": 0.0662, "step": 3143 }, { "epoch": 2.900369003690037, "grad_norm": 0.4078272572296947, "learning_rate": 1.7100876379525232e-07, "loss": 0.0686, "step": 3144 }, { "epoch": 2.901291512915129, "grad_norm": 0.39070239748765256, "learning_rate": 1.6788888240056865e-07, "loss": 0.0591, "step": 3145 }, { "epoch": 2.9022140221402215, "grad_norm": 0.3726091345240525, "learning_rate": 1.647976272719176e-07, "loss": 0.0545, "step": 3146 }, { "epoch": 2.9031365313653135, "grad_norm": 0.41481503787534957, "learning_rate": 1.6173500197287638e-07, "loss": 0.0594, "step": 3147 }, { "epoch": 2.904059040590406, "grad_norm": 0.3794000246422108, "learning_rate": 1.5870101003402084e-07, "loss": 0.0597, "step": 3148 }, { "epoch": 2.9049815498154983, "grad_norm": 0.3369263864135048, "learning_rate": 1.5569565495290318e-07, "loss": 0.0522, "step": 3149 }, { "epoch": 2.9059040590405907, "grad_norm": 0.40960030015767807, "learning_rate": 1.5271894019408262e-07, "loss": 0.0596, "step": 3150 }, { "epoch": 2.9068265682656826, "grad_norm": 0.3441664544752149, "learning_rate": 1.4977086918908923e-07, "loss": 0.0486, "step": 3151 }, { "epoch": 2.907749077490775, "grad_norm": 0.4139275443474721, "learning_rate": 1.4685144533643502e-07, "loss": 0.058, "step": 3152 }, { "epoch": 2.908671586715867, "grad_norm": 0.4035398923339222, "learning_rate": 1.43960672001614e-07, "loss": 0.0594, "step": 3153 }, { "epoch": 2.9095940959409594, "grad_norm": 0.37766409290452785, "learning_rate": 1.410985525170827e-07, "loss": 0.0615, "step": 3154 }, { "epoch": 2.910516605166052, "grad_norm": 0.4460254147855921, "learning_rate": 1.3826509018227128e-07, "loss": 0.0681, "step": 3155 }, { "epoch": 2.911439114391144, "grad_norm": 0.385927928269677, "learning_rate": 1.3546028826357527e-07, "loss": 0.0576, "step": 3156 }, { "epoch": 2.912361623616236, "grad_norm": 0.39903411242617903, "learning_rate": 1.3268414999434985e-07, "loss": 0.0577, "step": 3157 }, { "epoch": 2.913284132841328, "grad_norm": 0.431053626264846, "learning_rate": 1.29936678574899e-07, "loss": 0.0684, "step": 3158 }, { "epoch": 2.9142066420664205, "grad_norm": 0.35442180641360227, "learning_rate": 1.2721787717249466e-07, "loss": 0.0573, "step": 3159 }, { "epoch": 2.915129151291513, "grad_norm": 0.3463713119365184, "learning_rate": 1.2452774892134368e-07, "loss": 0.0589, "step": 3160 }, { "epoch": 2.9160516605166054, "grad_norm": 0.3801898835041715, "learning_rate": 1.2186629692260976e-07, "loss": 0.0582, "step": 3161 }, { "epoch": 2.9169741697416973, "grad_norm": 0.3660826668707689, "learning_rate": 1.192335242443915e-07, "loss": 0.0524, "step": 3162 }, { "epoch": 2.9178966789667897, "grad_norm": 0.3605059072006074, "learning_rate": 1.1662943392173053e-07, "loss": 0.0548, "step": 3163 }, { "epoch": 2.9188191881918817, "grad_norm": 0.3751885832042241, "learning_rate": 1.1405402895660056e-07, "loss": 0.0472, "step": 3164 }, { "epoch": 2.919741697416974, "grad_norm": 0.4226757153010532, "learning_rate": 1.1150731231791279e-07, "loss": 0.0552, "step": 3165 }, { "epoch": 2.9206642066420665, "grad_norm": 0.3982715198752065, "learning_rate": 1.089892869414938e-07, "loss": 0.0624, "step": 3166 }, { "epoch": 2.921586715867159, "grad_norm": 0.41570676216447927, "learning_rate": 1.0649995573011329e-07, "loss": 0.0634, "step": 3167 }, { "epoch": 2.922509225092251, "grad_norm": 0.37943720435725786, "learning_rate": 1.0403932155344798e-07, "loss": 0.0503, "step": 3168 }, { "epoch": 2.9234317343173433, "grad_norm": 0.46216363117558795, "learning_rate": 1.0160738724809549e-07, "loss": 0.0612, "step": 3169 }, { "epoch": 2.9243542435424352, "grad_norm": 0.42033750230395484, "learning_rate": 9.920415561757712e-08, "loss": 0.0595, "step": 3170 }, { "epoch": 2.9252767527675276, "grad_norm": 0.3556062994479878, "learning_rate": 9.682962943231843e-08, "loss": 0.06, "step": 3171 }, { "epoch": 2.92619926199262, "grad_norm": 0.3139807524312909, "learning_rate": 9.448381142965201e-08, "loss": 0.0488, "step": 3172 }, { "epoch": 2.9271217712177124, "grad_norm": 0.3881161090492111, "learning_rate": 9.216670431381747e-08, "loss": 0.0526, "step": 3173 }, { "epoch": 2.9280442804428044, "grad_norm": 0.42774485088737857, "learning_rate": 8.98783107559642e-08, "loss": 0.0683, "step": 3174 }, { "epoch": 2.928966789667897, "grad_norm": 0.36789045233922624, "learning_rate": 8.761863339412924e-08, "loss": 0.0567, "step": 3175 }, { "epoch": 2.9298892988929888, "grad_norm": 0.3992924122347115, "learning_rate": 8.538767483325383e-08, "loss": 0.0587, "step": 3176 }, { "epoch": 2.930811808118081, "grad_norm": 0.35639401224290446, "learning_rate": 8.318543764516961e-08, "loss": 0.0457, "step": 3177 }, { "epoch": 2.9317343173431736, "grad_norm": 0.4049382588541959, "learning_rate": 8.101192436859584e-08, "loss": 0.0597, "step": 3178 }, { "epoch": 2.9326568265682655, "grad_norm": 0.37824866132664375, "learning_rate": 7.886713750914487e-08, "loss": 0.0604, "step": 3179 }, { "epoch": 2.933579335793358, "grad_norm": 0.3785839291989575, "learning_rate": 7.675107953931115e-08, "loss": 0.0617, "step": 3180 }, { "epoch": 2.9345018450184504, "grad_norm": 0.4385737390468286, "learning_rate": 7.466375289846839e-08, "loss": 0.0606, "step": 3181 }, { "epoch": 2.9354243542435423, "grad_norm": 0.385430962252876, "learning_rate": 7.260515999286677e-08, "loss": 0.0639, "step": 3182 }, { "epoch": 2.9363468634686347, "grad_norm": 0.4059174582314441, "learning_rate": 7.057530319564409e-08, "loss": 0.0593, "step": 3183 }, { "epoch": 2.937269372693727, "grad_norm": 0.4007079161368407, "learning_rate": 6.857418484679801e-08, "loss": 0.0589, "step": 3184 }, { "epoch": 2.938191881918819, "grad_norm": 0.41371594833639513, "learning_rate": 6.660180725320542e-08, "loss": 0.0557, "step": 3185 }, { "epoch": 2.9391143911439115, "grad_norm": 0.38720152945378655, "learning_rate": 6.465817268860586e-08, "loss": 0.0611, "step": 3186 }, { "epoch": 2.9400369003690034, "grad_norm": 0.46058747436877173, "learning_rate": 6.274328339360703e-08, "loss": 0.0623, "step": 3187 }, { "epoch": 2.940959409594096, "grad_norm": 0.427316595808526, "learning_rate": 6.085714157567646e-08, "loss": 0.0624, "step": 3188 }, { "epoch": 2.9418819188191883, "grad_norm": 0.36151995421843597, "learning_rate": 5.89997494091471e-08, "loss": 0.0578, "step": 3189 }, { "epoch": 2.9428044280442807, "grad_norm": 0.3929333257867398, "learning_rate": 5.717110903520617e-08, "loss": 0.0513, "step": 3190 }, { "epoch": 2.9437269372693726, "grad_norm": 0.3583213372419836, "learning_rate": 5.53712225618952e-08, "loss": 0.0585, "step": 3191 }, { "epoch": 2.944649446494465, "grad_norm": 0.33733747911177364, "learning_rate": 5.360009206410721e-08, "loss": 0.0576, "step": 3192 }, { "epoch": 2.945571955719557, "grad_norm": 0.42567376818493746, "learning_rate": 5.1857719583592314e-08, "loss": 0.058, "step": 3193 }, { "epoch": 2.9464944649446494, "grad_norm": 0.4124616322920309, "learning_rate": 5.014410712893825e-08, "loss": 0.0604, "step": 3194 }, { "epoch": 2.947416974169742, "grad_norm": 0.39271782211136996, "learning_rate": 4.845925667558981e-08, "loss": 0.0646, "step": 3195 }, { "epoch": 2.948339483394834, "grad_norm": 0.3985025384944136, "learning_rate": 4.680317016582669e-08, "loss": 0.0631, "step": 3196 }, { "epoch": 2.949261992619926, "grad_norm": 0.41581860892776334, "learning_rate": 4.517584950877452e-08, "loss": 0.0581, "step": 3197 }, { "epoch": 2.9501845018450186, "grad_norm": 0.46541482571202747, "learning_rate": 4.357729658039378e-08, "loss": 0.0658, "step": 3198 }, { "epoch": 2.9511070110701105, "grad_norm": 0.3385333526417377, "learning_rate": 4.2007513223485396e-08, "loss": 0.0504, "step": 3199 }, { "epoch": 2.952029520295203, "grad_norm": 0.4109038841918147, "learning_rate": 4.0466501247685143e-08, "loss": 0.0505, "step": 3200 }, { "epoch": 2.9529520295202953, "grad_norm": 0.4007606060640004, "learning_rate": 3.895426242945532e-08, "loss": 0.0585, "step": 3201 }, { "epoch": 2.9538745387453873, "grad_norm": 0.36225177120950675, "learning_rate": 3.74707985120959e-08, "loss": 0.056, "step": 3202 }, { "epoch": 2.9547970479704797, "grad_norm": 0.3804162709721264, "learning_rate": 3.601611120573056e-08, "loss": 0.0588, "step": 3203 }, { "epoch": 2.955719557195572, "grad_norm": 0.41534819889707814, "learning_rate": 3.459020218731512e-08, "loss": 0.0537, "step": 3204 }, { "epoch": 2.956642066420664, "grad_norm": 0.4311040488800863, "learning_rate": 3.319307310062358e-08, "loss": 0.062, "step": 3205 }, { "epoch": 2.9575645756457565, "grad_norm": 0.3028959344330278, "learning_rate": 3.182472555625926e-08, "loss": 0.0523, "step": 3206 }, { "epoch": 2.958487084870849, "grad_norm": 0.4192421837220252, "learning_rate": 3.048516113163813e-08, "loss": 0.0645, "step": 3207 }, { "epoch": 2.959409594095941, "grad_norm": 0.36431487381541733, "learning_rate": 2.9174381370999927e-08, "loss": 0.059, "step": 3208 }, { "epoch": 2.9603321033210332, "grad_norm": 0.33647666805310933, "learning_rate": 2.789238778540537e-08, "loss": 0.0544, "step": 3209 }, { "epoch": 2.961254612546125, "grad_norm": 0.43733262509769244, "learning_rate": 2.663918185272507e-08, "loss": 0.0651, "step": 3210 }, { "epoch": 2.9621771217712176, "grad_norm": 0.38996048249617143, "learning_rate": 2.5414765017642284e-08, "loss": 0.0586, "step": 3211 }, { "epoch": 2.96309963099631, "grad_norm": 0.3769281522187525, "learning_rate": 2.4219138691658482e-08, "loss": 0.0603, "step": 3212 }, { "epoch": 2.9640221402214024, "grad_norm": 0.3847947330110199, "learning_rate": 2.3052304253082246e-08, "loss": 0.0582, "step": 3213 }, { "epoch": 2.9649446494464944, "grad_norm": 0.3997571039214077, "learning_rate": 2.191426304702926e-08, "loss": 0.0695, "step": 3214 }, { "epoch": 2.965867158671587, "grad_norm": 0.3528516194523642, "learning_rate": 2.0805016385427865e-08, "loss": 0.0528, "step": 3215 }, { "epoch": 2.9667896678966788, "grad_norm": 0.36457562727966364, "learning_rate": 1.9724565547007968e-08, "loss": 0.0588, "step": 3216 }, { "epoch": 2.967712177121771, "grad_norm": 0.401797472107959, "learning_rate": 1.8672911777301015e-08, "loss": 0.0589, "step": 3217 }, { "epoch": 2.9686346863468636, "grad_norm": 0.4502992009118647, "learning_rate": 1.7650056288651127e-08, "loss": 0.0688, "step": 3218 }, { "epoch": 2.969557195571956, "grad_norm": 0.3639762154572545, "learning_rate": 1.6656000260195648e-08, "loss": 0.059, "step": 3219 }, { "epoch": 2.970479704797048, "grad_norm": 0.40979035018838866, "learning_rate": 1.5690744837873473e-08, "loss": 0.0635, "step": 3220 }, { "epoch": 2.9714022140221403, "grad_norm": 0.38266671196509183, "learning_rate": 1.4754291134425058e-08, "loss": 0.0555, "step": 3221 }, { "epoch": 2.9723247232472323, "grad_norm": 0.3802321127786767, "learning_rate": 1.3846640229386864e-08, "loss": 0.061, "step": 3222 }, { "epoch": 2.9732472324723247, "grad_norm": 0.381698112696052, "learning_rate": 1.2967793169091358e-08, "loss": 0.0511, "step": 3223 }, { "epoch": 2.974169741697417, "grad_norm": 0.3169210239387918, "learning_rate": 1.2117750966667008e-08, "loss": 0.0527, "step": 3224 }, { "epoch": 2.975092250922509, "grad_norm": 0.36279637299517065, "learning_rate": 1.1296514602038288e-08, "loss": 0.0481, "step": 3225 }, { "epoch": 2.9760147601476015, "grad_norm": 0.4224857235132395, "learning_rate": 1.0504085021914579e-08, "loss": 0.0628, "step": 3226 }, { "epoch": 2.976937269372694, "grad_norm": 0.4217813645086133, "learning_rate": 9.74046313980681e-09, "loss": 0.0589, "step": 3227 }, { "epoch": 2.977859778597786, "grad_norm": 0.33581347684941426, "learning_rate": 9.005649836013597e-09, "loss": 0.0493, "step": 3228 }, { "epoch": 2.9787822878228782, "grad_norm": 0.38548294586419124, "learning_rate": 8.299645957615675e-09, "loss": 0.0622, "step": 3229 }, { "epoch": 2.9797047970479706, "grad_norm": 0.41781985113979636, "learning_rate": 7.622452318495344e-09, "loss": 0.0594, "step": 3230 }, { "epoch": 2.9806273062730626, "grad_norm": 0.4069508964434947, "learning_rate": 6.974069699314245e-09, "loss": 0.0633, "step": 3231 }, { "epoch": 2.981549815498155, "grad_norm": 0.3580843212211675, "learning_rate": 6.354498847521706e-09, "loss": 0.0512, "step": 3232 }, { "epoch": 2.982472324723247, "grad_norm": 0.417975915367572, "learning_rate": 5.763740477357504e-09, "loss": 0.0679, "step": 3233 }, { "epoch": 2.9833948339483394, "grad_norm": 0.40074037578986116, "learning_rate": 5.201795269837995e-09, "loss": 0.0614, "step": 3234 }, { "epoch": 2.984317343173432, "grad_norm": 0.4142629840074768, "learning_rate": 4.66866387277276e-09, "loss": 0.0603, "step": 3235 }, { "epoch": 2.985239852398524, "grad_norm": 0.35881008152437754, "learning_rate": 4.164346900750737e-09, "loss": 0.044, "step": 3236 }, { "epoch": 2.986162361623616, "grad_norm": 0.3612726682414584, "learning_rate": 3.6888449351457633e-09, "loss": 0.0552, "step": 3237 }, { "epoch": 2.9870848708487086, "grad_norm": 0.35277935338398936, "learning_rate": 3.2421585241110276e-09, "loss": 0.0547, "step": 3238 }, { "epoch": 2.9880073800738005, "grad_norm": 0.3235211391456132, "learning_rate": 2.8242881825846223e-09, "loss": 0.0474, "step": 3239 }, { "epoch": 2.988929889298893, "grad_norm": 0.41428593730616464, "learning_rate": 2.4352343922839917e-09, "loss": 0.0629, "step": 3240 }, { "epoch": 2.9898523985239853, "grad_norm": 0.38148708756384203, "learning_rate": 2.07499760170593e-09, "loss": 0.0576, "step": 3241 }, { "epoch": 2.9907749077490777, "grad_norm": 0.36477370715888274, "learning_rate": 1.743578226129361e-09, "loss": 0.0548, "step": 3242 }, { "epoch": 2.9916974169741697, "grad_norm": 0.3853039173542559, "learning_rate": 1.4409766476125575e-09, "loss": 0.0567, "step": 3243 }, { "epoch": 2.992619926199262, "grad_norm": 0.47482229554396677, "learning_rate": 1.1671932149931453e-09, "loss": 0.074, "step": 3244 }, { "epoch": 2.993542435424354, "grad_norm": 0.40637755989313035, "learning_rate": 9.222282438853258e-10, "loss": 0.0545, "step": 3245 }, { "epoch": 2.9944649446494465, "grad_norm": 0.379512699850719, "learning_rate": 7.06082016682652e-10, "loss": 0.0629, "step": 3246 }, { "epoch": 2.995387453874539, "grad_norm": 0.3485901343457483, "learning_rate": 5.187547825580285e-10, "loss": 0.0502, "step": 3247 }, { "epoch": 2.9963099630996313, "grad_norm": 0.3860947708929262, "learning_rate": 3.602467574581603e-10, "loss": 0.0556, "step": 3248 }, { "epoch": 2.9972324723247232, "grad_norm": 0.38267347060083073, "learning_rate": 2.3055812411187926e-10, "loss": 0.0586, "step": 3249 }, { "epoch": 2.9981549815498156, "grad_norm": 0.36726464087860233, "learning_rate": 1.2968903202459358e-10, "loss": 0.0632, "step": 3250 }, { "epoch": 2.9990774907749076, "grad_norm": 0.3714096301031751, "learning_rate": 5.763959747551173e-11, "loss": 0.0502, "step": 3251 }, { "epoch": 3.0, "grad_norm": 0.2703555109604041, "learning_rate": 1.4409903520418155e-11, "loss": 0.0307, "step": 3252 } ], "logging_steps": 1, "max_steps": 3252, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3855346449776640.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }