diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,55195 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9999365409754921, + "eval_steps": 500, + "global_step": 39394, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.076721960630021e-05, + "grad_norm": 0.15623666546780773, + "learning_rate": 2.538071065989848e-07, + "loss": 1.1099, + "step": 1 + }, + { + "epoch": 0.00025383609803150106, + "grad_norm": 0.1494363970160199, + "learning_rate": 1.2690355329949238e-06, + "loss": 1.1138, + "step": 5 + }, + { + "epoch": 0.0005076721960630021, + "grad_norm": 0.15321706073016964, + "learning_rate": 2.5380710659898476e-06, + "loss": 1.1368, + "step": 10 + }, + { + "epoch": 0.0007615082940945031, + "grad_norm": 0.1542643246754322, + "learning_rate": 3.807106598984772e-06, + "loss": 1.1091, + "step": 15 + }, + { + "epoch": 0.0010153443921260042, + "grad_norm": 0.13824138762283186, + "learning_rate": 5.076142131979695e-06, + "loss": 1.0911, + "step": 20 + }, + { + "epoch": 0.0012691804901575053, + "grad_norm": 0.1579877396947511, + "learning_rate": 6.345177664974619e-06, + "loss": 1.1267, + "step": 25 + }, + { + "epoch": 0.0015230165881890063, + "grad_norm": 0.1478480849215318, + "learning_rate": 7.614213197969544e-06, + "loss": 1.0848, + "step": 30 + }, + { + "epoch": 0.0017768526862205075, + "grad_norm": 0.11527528929066594, + "learning_rate": 8.883248730964468e-06, + "loss": 1.07, + "step": 35 + }, + { + "epoch": 0.0020306887842520085, + "grad_norm": 0.10942201134957022, + "learning_rate": 1.015228426395939e-05, + "loss": 1.0856, + "step": 40 + }, + { + "epoch": 0.0022845248822835097, + "grad_norm": 0.1003586160454669, + "learning_rate": 1.1421319796954315e-05, + "loss": 1.0483, + "step": 45 + }, + { + "epoch": 0.0025383609803150105, + "grad_norm": 0.09455563009354048, + "learning_rate": 1.2690355329949238e-05, + "loss": 1.0842, + "step": 50 + }, + { + "epoch": 0.0027921970783465117, + "grad_norm": 0.08578428766129725, + "learning_rate": 1.3959390862944163e-05, + "loss": 1.0387, + "step": 55 + }, + { + "epoch": 0.0030460331763780125, + "grad_norm": 0.08715199888221983, + "learning_rate": 1.5228426395939088e-05, + "loss": 1.0585, + "step": 60 + }, + { + "epoch": 0.0032998692744095138, + "grad_norm": 0.07478324580510415, + "learning_rate": 1.6497461928934012e-05, + "loss": 0.9773, + "step": 65 + }, + { + "epoch": 0.003553705372441015, + "grad_norm": 0.07368740079084418, + "learning_rate": 1.7766497461928935e-05, + "loss": 1.0023, + "step": 70 + }, + { + "epoch": 0.0038075414704725158, + "grad_norm": 0.07301153154637648, + "learning_rate": 1.9035532994923858e-05, + "loss": 0.9723, + "step": 75 + }, + { + "epoch": 0.004061377568504017, + "grad_norm": 0.07289038213742076, + "learning_rate": 2.030456852791878e-05, + "loss": 0.9927, + "step": 80 + }, + { + "epoch": 0.004315213666535518, + "grad_norm": 0.07465327431922439, + "learning_rate": 2.1573604060913704e-05, + "loss": 0.9821, + "step": 85 + }, + { + "epoch": 0.0045690497645670194, + "grad_norm": 0.06774510984286918, + "learning_rate": 2.284263959390863e-05, + "loss": 0.965, + "step": 90 + }, + { + "epoch": 0.00482288586259852, + "grad_norm": 0.06855996583318778, + "learning_rate": 2.4111675126903553e-05, + "loss": 0.9821, + "step": 95 + }, + { + "epoch": 0.005076721960630021, + "grad_norm": 0.07536106861979534, + "learning_rate": 2.5380710659898476e-05, + "loss": 0.9284, + "step": 100 + }, + { + "epoch": 0.005330558058661523, + "grad_norm": 0.06730944873958547, + "learning_rate": 2.6649746192893403e-05, + "loss": 0.9626, + "step": 105 + }, + { + "epoch": 0.0055843941566930235, + "grad_norm": 0.069457861068512, + "learning_rate": 2.7918781725888326e-05, + "loss": 0.956, + "step": 110 + }, + { + "epoch": 0.005838230254724524, + "grad_norm": 0.06693731117055934, + "learning_rate": 2.918781725888325e-05, + "loss": 0.9747, + "step": 115 + }, + { + "epoch": 0.006092066352756025, + "grad_norm": 0.06277015163357891, + "learning_rate": 3.0456852791878175e-05, + "loss": 0.9116, + "step": 120 + }, + { + "epoch": 0.006345902450787527, + "grad_norm": 0.06643101153314454, + "learning_rate": 3.17258883248731e-05, + "loss": 0.9232, + "step": 125 + }, + { + "epoch": 0.0065997385488190275, + "grad_norm": 0.06924486382591209, + "learning_rate": 3.2994923857868024e-05, + "loss": 0.9399, + "step": 130 + }, + { + "epoch": 0.006853574646850528, + "grad_norm": 0.07604177904943392, + "learning_rate": 3.4263959390862944e-05, + "loss": 0.8964, + "step": 135 + }, + { + "epoch": 0.00710741074488203, + "grad_norm": 0.06481060411660645, + "learning_rate": 3.553299492385787e-05, + "loss": 0.948, + "step": 140 + }, + { + "epoch": 0.007361246842913531, + "grad_norm": 0.06555462608634341, + "learning_rate": 3.680203045685279e-05, + "loss": 0.9325, + "step": 145 + }, + { + "epoch": 0.0076150829409450315, + "grad_norm": 0.07121267717524804, + "learning_rate": 3.8071065989847716e-05, + "loss": 0.9379, + "step": 150 + }, + { + "epoch": 0.007868919038976532, + "grad_norm": 0.07190008765656207, + "learning_rate": 3.934010152284264e-05, + "loss": 0.9104, + "step": 155 + }, + { + "epoch": 0.008122755137008034, + "grad_norm": 0.06497388555993808, + "learning_rate": 4.060913705583756e-05, + "loss": 0.9238, + "step": 160 + }, + { + "epoch": 0.008376591235039536, + "grad_norm": 0.06647076390288592, + "learning_rate": 4.187817258883249e-05, + "loss": 0.9081, + "step": 165 + }, + { + "epoch": 0.008630427333071036, + "grad_norm": 0.07078773209118674, + "learning_rate": 4.314720812182741e-05, + "loss": 0.8845, + "step": 170 + }, + { + "epoch": 0.008884263431102537, + "grad_norm": 0.060550162204052516, + "learning_rate": 4.4416243654822335e-05, + "loss": 0.8901, + "step": 175 + }, + { + "epoch": 0.009138099529134039, + "grad_norm": 0.06924081884858772, + "learning_rate": 4.568527918781726e-05, + "loss": 0.924, + "step": 180 + }, + { + "epoch": 0.009391935627165539, + "grad_norm": 0.06720539183021615, + "learning_rate": 4.695431472081219e-05, + "loss": 0.8938, + "step": 185 + }, + { + "epoch": 0.00964577172519704, + "grad_norm": 0.07177203010161029, + "learning_rate": 4.822335025380711e-05, + "loss": 0.9048, + "step": 190 + }, + { + "epoch": 0.009899607823228542, + "grad_norm": 0.07321268374977867, + "learning_rate": 4.949238578680203e-05, + "loss": 0.8666, + "step": 195 + }, + { + "epoch": 0.010153443921260042, + "grad_norm": 0.0704505269850039, + "learning_rate": 5.076142131979695e-05, + "loss": 0.9043, + "step": 200 + }, + { + "epoch": 0.010407280019291544, + "grad_norm": 0.06657597088837011, + "learning_rate": 5.2030456852791886e-05, + "loss": 0.8896, + "step": 205 + }, + { + "epoch": 0.010661116117323045, + "grad_norm": 0.06540442514049039, + "learning_rate": 5.3299492385786806e-05, + "loss": 0.8706, + "step": 210 + }, + { + "epoch": 0.010914952215354545, + "grad_norm": 0.07007986246869724, + "learning_rate": 5.4568527918781725e-05, + "loss": 0.8502, + "step": 215 + }, + { + "epoch": 0.011168788313386047, + "grad_norm": 0.07149965435491132, + "learning_rate": 5.583756345177665e-05, + "loss": 0.8837, + "step": 220 + }, + { + "epoch": 0.011422624411417547, + "grad_norm": 0.06719503318382268, + "learning_rate": 5.710659898477157e-05, + "loss": 0.8644, + "step": 225 + }, + { + "epoch": 0.011676460509449049, + "grad_norm": 0.06782034211539793, + "learning_rate": 5.83756345177665e-05, + "loss": 0.8966, + "step": 230 + }, + { + "epoch": 0.01193029660748055, + "grad_norm": 0.06913885877260682, + "learning_rate": 5.9644670050761424e-05, + "loss": 0.881, + "step": 235 + }, + { + "epoch": 0.01218413270551205, + "grad_norm": 0.06719486728451429, + "learning_rate": 6.091370558375635e-05, + "loss": 0.8766, + "step": 240 + }, + { + "epoch": 0.012437968803543552, + "grad_norm": 0.06653298354073509, + "learning_rate": 6.218274111675127e-05, + "loss": 0.8781, + "step": 245 + }, + { + "epoch": 0.012691804901575053, + "grad_norm": 0.06438730665451305, + "learning_rate": 6.34517766497462e-05, + "loss": 0.8918, + "step": 250 + }, + { + "epoch": 0.012945640999606553, + "grad_norm": 0.06665969371026668, + "learning_rate": 6.472081218274112e-05, + "loss": 0.8583, + "step": 255 + }, + { + "epoch": 0.013199477097638055, + "grad_norm": 0.06715127430825377, + "learning_rate": 6.598984771573605e-05, + "loss": 0.8656, + "step": 260 + }, + { + "epoch": 0.013453313195669557, + "grad_norm": 0.06926262661035425, + "learning_rate": 6.725888324873096e-05, + "loss": 0.8649, + "step": 265 + }, + { + "epoch": 0.013707149293701057, + "grad_norm": 0.06985857406011756, + "learning_rate": 6.852791878172589e-05, + "loss": 0.8729, + "step": 270 + }, + { + "epoch": 0.013960985391732558, + "grad_norm": 0.06860201087036598, + "learning_rate": 6.979695431472081e-05, + "loss": 0.8601, + "step": 275 + }, + { + "epoch": 0.01421482148976406, + "grad_norm": 0.06930283958418837, + "learning_rate": 7.106598984771574e-05, + "loss": 0.8531, + "step": 280 + }, + { + "epoch": 0.01446865758779556, + "grad_norm": 0.06690534995876123, + "learning_rate": 7.233502538071065e-05, + "loss": 0.8696, + "step": 285 + }, + { + "epoch": 0.014722493685827061, + "grad_norm": 0.07121164184727187, + "learning_rate": 7.360406091370558e-05, + "loss": 0.8475, + "step": 290 + }, + { + "epoch": 0.014976329783858563, + "grad_norm": 0.06510867515978803, + "learning_rate": 7.48730964467005e-05, + "loss": 0.8529, + "step": 295 + }, + { + "epoch": 0.015230165881890063, + "grad_norm": 0.07520321408132809, + "learning_rate": 7.614213197969543e-05, + "loss": 0.8385, + "step": 300 + }, + { + "epoch": 0.015484001979921565, + "grad_norm": 0.06551584845379264, + "learning_rate": 7.741116751269036e-05, + "loss": 0.8687, + "step": 305 + }, + { + "epoch": 0.015737838077953065, + "grad_norm": 0.07490825713450537, + "learning_rate": 7.868020304568529e-05, + "loss": 0.8349, + "step": 310 + }, + { + "epoch": 0.015991674175984568, + "grad_norm": 0.06896780239118258, + "learning_rate": 7.994923857868021e-05, + "loss": 0.8212, + "step": 315 + }, + { + "epoch": 0.016245510274016068, + "grad_norm": 0.07054318367146102, + "learning_rate": 8.121827411167512e-05, + "loss": 0.8385, + "step": 320 + }, + { + "epoch": 0.016499346372047568, + "grad_norm": 0.07148603385955352, + "learning_rate": 8.248730964467005e-05, + "loss": 0.821, + "step": 325 + }, + { + "epoch": 0.01675318247007907, + "grad_norm": 0.07322626942475637, + "learning_rate": 8.375634517766498e-05, + "loss": 0.8307, + "step": 330 + }, + { + "epoch": 0.01700701856811057, + "grad_norm": 0.0656810178548751, + "learning_rate": 8.50253807106599e-05, + "loss": 0.8172, + "step": 335 + }, + { + "epoch": 0.01726085466614207, + "grad_norm": 0.08377135110233941, + "learning_rate": 8.629441624365482e-05, + "loss": 0.8464, + "step": 340 + }, + { + "epoch": 0.017514690764173575, + "grad_norm": 0.075227118003797, + "learning_rate": 8.756345177664974e-05, + "loss": 0.8505, + "step": 345 + }, + { + "epoch": 0.017768526862205074, + "grad_norm": 0.06433840181173646, + "learning_rate": 8.883248730964467e-05, + "loss": 0.8511, + "step": 350 + }, + { + "epoch": 0.018022362960236574, + "grad_norm": 0.06922064842556071, + "learning_rate": 9.01015228426396e-05, + "loss": 0.8391, + "step": 355 + }, + { + "epoch": 0.018276199058268078, + "grad_norm": 0.0755697527347306, + "learning_rate": 9.137055837563452e-05, + "loss": 0.821, + "step": 360 + }, + { + "epoch": 0.018530035156299578, + "grad_norm": 0.07031238898852438, + "learning_rate": 9.263959390862945e-05, + "loss": 0.8231, + "step": 365 + }, + { + "epoch": 0.018783871254331078, + "grad_norm": 0.0736219488733804, + "learning_rate": 9.390862944162437e-05, + "loss": 0.822, + "step": 370 + }, + { + "epoch": 0.01903770735236258, + "grad_norm": 0.06790722048303544, + "learning_rate": 9.517766497461929e-05, + "loss": 0.8175, + "step": 375 + }, + { + "epoch": 0.01929154345039408, + "grad_norm": 0.06650805003812338, + "learning_rate": 9.644670050761421e-05, + "loss": 0.821, + "step": 380 + }, + { + "epoch": 0.01954537954842558, + "grad_norm": 0.0655885804187293, + "learning_rate": 9.771573604060914e-05, + "loss": 0.8174, + "step": 385 + }, + { + "epoch": 0.019799215646457084, + "grad_norm": 0.07059246576832452, + "learning_rate": 9.898477157360407e-05, + "loss": 0.8255, + "step": 390 + }, + { + "epoch": 0.020053051744488584, + "grad_norm": 0.08379327242669067, + "learning_rate": 0.00010025380710659898, + "loss": 0.8401, + "step": 395 + }, + { + "epoch": 0.020306887842520084, + "grad_norm": 0.06727444934015664, + "learning_rate": 0.0001015228426395939, + "loss": 0.8304, + "step": 400 + }, + { + "epoch": 0.020560723940551588, + "grad_norm": 0.0772208129389319, + "learning_rate": 0.00010279187817258883, + "loss": 0.8172, + "step": 405 + }, + { + "epoch": 0.020814560038583087, + "grad_norm": 0.07506045651461135, + "learning_rate": 0.00010406091370558377, + "loss": 0.8148, + "step": 410 + }, + { + "epoch": 0.021068396136614587, + "grad_norm": 0.07444802891412307, + "learning_rate": 0.00010532994923857868, + "loss": 0.8642, + "step": 415 + }, + { + "epoch": 0.02132223223464609, + "grad_norm": 0.06410369753897495, + "learning_rate": 0.00010659898477157361, + "loss": 0.8523, + "step": 420 + }, + { + "epoch": 0.02157606833267759, + "grad_norm": 0.06581099995077219, + "learning_rate": 0.00010786802030456854, + "loss": 0.8214, + "step": 425 + }, + { + "epoch": 0.02182990443070909, + "grad_norm": 0.06664549211252414, + "learning_rate": 0.00010913705583756345, + "loss": 0.8083, + "step": 430 + }, + { + "epoch": 0.02208374052874059, + "grad_norm": 0.06658304689511382, + "learning_rate": 0.00011040609137055838, + "loss": 0.8143, + "step": 435 + }, + { + "epoch": 0.022337576626772094, + "grad_norm": 0.0683685494048942, + "learning_rate": 0.0001116751269035533, + "loss": 0.8239, + "step": 440 + }, + { + "epoch": 0.022591412724803594, + "grad_norm": 0.06845204574468279, + "learning_rate": 0.00011294416243654823, + "loss": 0.8109, + "step": 445 + }, + { + "epoch": 0.022845248822835094, + "grad_norm": 0.0677741143702505, + "learning_rate": 0.00011421319796954314, + "loss": 0.8244, + "step": 450 + }, + { + "epoch": 0.023099084920866597, + "grad_norm": 0.06606869955263507, + "learning_rate": 0.00011548223350253807, + "loss": 0.8075, + "step": 455 + }, + { + "epoch": 0.023352921018898097, + "grad_norm": 0.07205738377017837, + "learning_rate": 0.000116751269035533, + "loss": 0.7934, + "step": 460 + }, + { + "epoch": 0.023606757116929597, + "grad_norm": 0.06872617393230794, + "learning_rate": 0.00011802030456852793, + "loss": 0.8011, + "step": 465 + }, + { + "epoch": 0.0238605932149611, + "grad_norm": 0.08044019183258687, + "learning_rate": 0.00011928934010152285, + "loss": 0.8127, + "step": 470 + }, + { + "epoch": 0.0241144293129926, + "grad_norm": 0.07828212067502605, + "learning_rate": 0.00012055837563451777, + "loss": 0.787, + "step": 475 + }, + { + "epoch": 0.0243682654110241, + "grad_norm": 0.07606104395777182, + "learning_rate": 0.0001218274111675127, + "loss": 0.8251, + "step": 480 + }, + { + "epoch": 0.024622101509055604, + "grad_norm": 0.06583732816278534, + "learning_rate": 0.0001230964467005076, + "loss": 0.8246, + "step": 485 + }, + { + "epoch": 0.024875937607087104, + "grad_norm": 0.06441670784951158, + "learning_rate": 0.00012436548223350254, + "loss": 0.7876, + "step": 490 + }, + { + "epoch": 0.025129773705118603, + "grad_norm": 0.07758246886806429, + "learning_rate": 0.00012563451776649747, + "loss": 0.8221, + "step": 495 + }, + { + "epoch": 0.025383609803150107, + "grad_norm": 0.06322158391207536, + "learning_rate": 0.0001269035532994924, + "loss": 0.7881, + "step": 500 + }, + { + "epoch": 0.025637445901181607, + "grad_norm": 0.07176275752432852, + "learning_rate": 0.00012817258883248732, + "loss": 0.8097, + "step": 505 + }, + { + "epoch": 0.025891281999213107, + "grad_norm": 0.06256825439016349, + "learning_rate": 0.00012944162436548224, + "loss": 0.7871, + "step": 510 + }, + { + "epoch": 0.02614511809724461, + "grad_norm": 0.06274483186816927, + "learning_rate": 0.00013071065989847717, + "loss": 0.8353, + "step": 515 + }, + { + "epoch": 0.02639895419527611, + "grad_norm": 0.06260104403606175, + "learning_rate": 0.0001319796954314721, + "loss": 0.8062, + "step": 520 + }, + { + "epoch": 0.02665279029330761, + "grad_norm": 0.0692807834839823, + "learning_rate": 0.00013324873096446702, + "loss": 0.8184, + "step": 525 + }, + { + "epoch": 0.026906626391339113, + "grad_norm": 0.06940858744959649, + "learning_rate": 0.00013451776649746192, + "loss": 0.7818, + "step": 530 + }, + { + "epoch": 0.027160462489370613, + "grad_norm": 0.06227095786073527, + "learning_rate": 0.00013578680203045685, + "loss": 0.7997, + "step": 535 + }, + { + "epoch": 0.027414298587402113, + "grad_norm": 0.06606458794872055, + "learning_rate": 0.00013705583756345178, + "loss": 0.791, + "step": 540 + }, + { + "epoch": 0.027668134685433617, + "grad_norm": 0.07921383531960545, + "learning_rate": 0.0001383248730964467, + "loss": 0.7865, + "step": 545 + }, + { + "epoch": 0.027921970783465117, + "grad_norm": 0.07202541973928622, + "learning_rate": 0.00013959390862944163, + "loss": 0.799, + "step": 550 + }, + { + "epoch": 0.028175806881496616, + "grad_norm": 0.07446687633156211, + "learning_rate": 0.00014086294416243656, + "loss": 0.8005, + "step": 555 + }, + { + "epoch": 0.02842964297952812, + "grad_norm": 0.06806792304243547, + "learning_rate": 0.00014213197969543148, + "loss": 0.7807, + "step": 560 + }, + { + "epoch": 0.02868347907755962, + "grad_norm": 0.06961466568049711, + "learning_rate": 0.0001434010152284264, + "loss": 0.798, + "step": 565 + }, + { + "epoch": 0.02893731517559112, + "grad_norm": 0.06993114177456627, + "learning_rate": 0.0001446700507614213, + "loss": 0.8141, + "step": 570 + }, + { + "epoch": 0.029191151273622623, + "grad_norm": 0.07132008217126029, + "learning_rate": 0.00014593908629441623, + "loss": 0.7732, + "step": 575 + }, + { + "epoch": 0.029444987371654123, + "grad_norm": 0.0634032782262794, + "learning_rate": 0.00014720812182741116, + "loss": 0.7833, + "step": 580 + }, + { + "epoch": 0.029698823469685623, + "grad_norm": 0.06291224826282306, + "learning_rate": 0.00014847715736040609, + "loss": 0.7713, + "step": 585 + }, + { + "epoch": 0.029952659567717126, + "grad_norm": 0.06955080092630633, + "learning_rate": 0.000149746192893401, + "loss": 0.812, + "step": 590 + }, + { + "epoch": 0.030206495665748626, + "grad_norm": 0.06642744415224454, + "learning_rate": 0.00015101522842639594, + "loss": 0.7816, + "step": 595 + }, + { + "epoch": 0.030460331763780126, + "grad_norm": 0.07915699035230568, + "learning_rate": 0.00015228426395939087, + "loss": 0.815, + "step": 600 + }, + { + "epoch": 0.03071416786181163, + "grad_norm": 0.07162405450532806, + "learning_rate": 0.0001535532994923858, + "loss": 0.7785, + "step": 605 + }, + { + "epoch": 0.03096800395984313, + "grad_norm": 0.06622725095139695, + "learning_rate": 0.00015482233502538072, + "loss": 0.7698, + "step": 610 + }, + { + "epoch": 0.03122184005787463, + "grad_norm": 0.06718329111593666, + "learning_rate": 0.00015609137055837564, + "loss": 0.7938, + "step": 615 + }, + { + "epoch": 0.03147567615590613, + "grad_norm": 0.0834368594300888, + "learning_rate": 0.00015736040609137057, + "loss": 0.8093, + "step": 620 + }, + { + "epoch": 0.03172951225393763, + "grad_norm": 0.09950838339388045, + "learning_rate": 0.0001586294416243655, + "loss": 0.7822, + "step": 625 + }, + { + "epoch": 0.031983348351969136, + "grad_norm": 0.08312710734927868, + "learning_rate": 0.00015989847715736042, + "loss": 0.7726, + "step": 630 + }, + { + "epoch": 0.032237184450000636, + "grad_norm": 0.06335807274711928, + "learning_rate": 0.00016116751269035535, + "loss": 0.7934, + "step": 635 + }, + { + "epoch": 0.032491020548032136, + "grad_norm": 0.07178135932549387, + "learning_rate": 0.00016243654822335025, + "loss": 0.7719, + "step": 640 + }, + { + "epoch": 0.032744856646063636, + "grad_norm": 0.07635177651816702, + "learning_rate": 0.00016370558375634518, + "loss": 0.7969, + "step": 645 + }, + { + "epoch": 0.032998692744095136, + "grad_norm": 0.17477231843517774, + "learning_rate": 0.0001649746192893401, + "loss": 0.7771, + "step": 650 + }, + { + "epoch": 0.033252528842126636, + "grad_norm": 0.06434201144677448, + "learning_rate": 0.00016624365482233503, + "loss": 0.7829, + "step": 655 + }, + { + "epoch": 0.03350636494015814, + "grad_norm": 0.0663036035191937, + "learning_rate": 0.00016751269035532995, + "loss": 0.7742, + "step": 660 + }, + { + "epoch": 0.03376020103818964, + "grad_norm": 0.061908820772137774, + "learning_rate": 0.00016878172588832488, + "loss": 0.7748, + "step": 665 + }, + { + "epoch": 0.03401403713622114, + "grad_norm": 0.07340604363772968, + "learning_rate": 0.0001700507614213198, + "loss": 0.7663, + "step": 670 + }, + { + "epoch": 0.03426787323425264, + "grad_norm": 0.06471552425628542, + "learning_rate": 0.0001713197969543147, + "loss": 0.738, + "step": 675 + }, + { + "epoch": 0.03452170933228414, + "grad_norm": 0.0628486951271683, + "learning_rate": 0.00017258883248730963, + "loss": 0.7693, + "step": 680 + }, + { + "epoch": 0.03477554543031564, + "grad_norm": 0.0821063953061165, + "learning_rate": 0.00017385786802030456, + "loss": 0.7937, + "step": 685 + }, + { + "epoch": 0.03502938152834715, + "grad_norm": 0.0677477739831011, + "learning_rate": 0.00017512690355329949, + "loss": 0.7516, + "step": 690 + }, + { + "epoch": 0.03528321762637865, + "grad_norm": 0.06601728867845341, + "learning_rate": 0.0001763959390862944, + "loss": 0.7711, + "step": 695 + }, + { + "epoch": 0.03553705372441015, + "grad_norm": 0.06337314268961115, + "learning_rate": 0.00017766497461928934, + "loss": 0.7725, + "step": 700 + }, + { + "epoch": 0.03579088982244165, + "grad_norm": 0.06700277491271579, + "learning_rate": 0.00017893401015228426, + "loss": 0.7779, + "step": 705 + }, + { + "epoch": 0.03604472592047315, + "grad_norm": 0.06439978678064547, + "learning_rate": 0.0001802030456852792, + "loss": 0.7377, + "step": 710 + }, + { + "epoch": 0.03629856201850465, + "grad_norm": 0.08022019987059843, + "learning_rate": 0.00018147208121827412, + "loss": 0.7891, + "step": 715 + }, + { + "epoch": 0.036552398116536156, + "grad_norm": 0.06618773295124729, + "learning_rate": 0.00018274111675126904, + "loss": 0.7847, + "step": 720 + }, + { + "epoch": 0.036806234214567655, + "grad_norm": 0.06785165350325073, + "learning_rate": 0.00018401015228426397, + "loss": 0.7709, + "step": 725 + }, + { + "epoch": 0.037060070312599155, + "grad_norm": 0.06446068323928258, + "learning_rate": 0.0001852791878172589, + "loss": 0.7466, + "step": 730 + }, + { + "epoch": 0.037313906410630655, + "grad_norm": 0.0743985429884066, + "learning_rate": 0.00018654822335025382, + "loss": 0.7495, + "step": 735 + }, + { + "epoch": 0.037567742508662155, + "grad_norm": 0.06381394090876102, + "learning_rate": 0.00018781725888324875, + "loss": 0.7724, + "step": 740 + }, + { + "epoch": 0.037821578606693655, + "grad_norm": 0.07609937482268822, + "learning_rate": 0.00018908629441624368, + "loss": 0.8044, + "step": 745 + }, + { + "epoch": 0.03807541470472516, + "grad_norm": 0.07080823262148744, + "learning_rate": 0.00019035532994923857, + "loss": 0.7392, + "step": 750 + }, + { + "epoch": 0.03832925080275666, + "grad_norm": 0.08986296323961589, + "learning_rate": 0.0001916243654822335, + "loss": 0.7506, + "step": 755 + }, + { + "epoch": 0.03858308690078816, + "grad_norm": 0.0633546160075049, + "learning_rate": 0.00019289340101522843, + "loss": 0.775, + "step": 760 + }, + { + "epoch": 0.03883692299881966, + "grad_norm": 0.06672741053597235, + "learning_rate": 0.00019416243654822335, + "loss": 0.7918, + "step": 765 + }, + { + "epoch": 0.03909075909685116, + "grad_norm": 0.0667100508132584, + "learning_rate": 0.00019543147208121828, + "loss": 0.7857, + "step": 770 + }, + { + "epoch": 0.03934459519488266, + "grad_norm": 0.07489251478473465, + "learning_rate": 0.0001967005076142132, + "loss": 0.7601, + "step": 775 + }, + { + "epoch": 0.03959843129291417, + "grad_norm": 0.07514829685101908, + "learning_rate": 0.00019796954314720813, + "loss": 0.7734, + "step": 780 + }, + { + "epoch": 0.03985226739094567, + "grad_norm": 0.06072516621359633, + "learning_rate": 0.00019923857868020303, + "loss": 0.7716, + "step": 785 + }, + { + "epoch": 0.04010610348897717, + "grad_norm": 0.10545789364607022, + "learning_rate": 0.00020050761421319796, + "loss": 0.6972, + "step": 790 + }, + { + "epoch": 0.04035993958700867, + "grad_norm": 0.06621028315139817, + "learning_rate": 0.00020177664974619288, + "loss": 0.7619, + "step": 795 + }, + { + "epoch": 0.04061377568504017, + "grad_norm": 0.07339900729710998, + "learning_rate": 0.0002030456852791878, + "loss": 0.7748, + "step": 800 + }, + { + "epoch": 0.04086761178307167, + "grad_norm": 0.07861308330289217, + "learning_rate": 0.00020431472081218274, + "loss": 0.7492, + "step": 805 + }, + { + "epoch": 0.041121447881103175, + "grad_norm": 0.06891500592846019, + "learning_rate": 0.00020558375634517766, + "loss": 0.7389, + "step": 810 + }, + { + "epoch": 0.041375283979134675, + "grad_norm": 0.061487916642653274, + "learning_rate": 0.0002068527918781726, + "loss": 0.7533, + "step": 815 + }, + { + "epoch": 0.041629120077166175, + "grad_norm": 0.06175106319498476, + "learning_rate": 0.00020812182741116754, + "loss": 0.775, + "step": 820 + }, + { + "epoch": 0.041882956175197675, + "grad_norm": 0.07347500552851026, + "learning_rate": 0.00020939086294416244, + "loss": 0.7565, + "step": 825 + }, + { + "epoch": 0.042136792273229175, + "grad_norm": 0.06765223170162757, + "learning_rate": 0.00021065989847715737, + "loss": 0.7441, + "step": 830 + }, + { + "epoch": 0.042390628371260675, + "grad_norm": 0.07847030528657203, + "learning_rate": 0.0002119289340101523, + "loss": 0.7446, + "step": 835 + }, + { + "epoch": 0.04264446446929218, + "grad_norm": 0.07339106925074805, + "learning_rate": 0.00021319796954314722, + "loss": 0.7719, + "step": 840 + }, + { + "epoch": 0.04289830056732368, + "grad_norm": 0.06531782875804885, + "learning_rate": 0.00021446700507614215, + "loss": 0.7569, + "step": 845 + }, + { + "epoch": 0.04315213666535518, + "grad_norm": 0.06579891597056135, + "learning_rate": 0.00021573604060913707, + "loss": 0.7495, + "step": 850 + }, + { + "epoch": 0.04340597276338668, + "grad_norm": 0.07284604345072491, + "learning_rate": 0.000217005076142132, + "loss": 0.7566, + "step": 855 + }, + { + "epoch": 0.04365980886141818, + "grad_norm": 0.06726864956360354, + "learning_rate": 0.0002182741116751269, + "loss": 0.7742, + "step": 860 + }, + { + "epoch": 0.04391364495944968, + "grad_norm": 0.06299243656591876, + "learning_rate": 0.00021954314720812183, + "loss": 0.7676, + "step": 865 + }, + { + "epoch": 0.04416748105748118, + "grad_norm": 0.07127446574398555, + "learning_rate": 0.00022081218274111675, + "loss": 0.7328, + "step": 870 + }, + { + "epoch": 0.04442131715551269, + "grad_norm": 0.059333625669680895, + "learning_rate": 0.00022208121827411168, + "loss": 0.7528, + "step": 875 + }, + { + "epoch": 0.04467515325354419, + "grad_norm": 0.06769961067024773, + "learning_rate": 0.0002233502538071066, + "loss": 0.7702, + "step": 880 + }, + { + "epoch": 0.04492898935157569, + "grad_norm": 0.05797151521946424, + "learning_rate": 0.00022461928934010153, + "loss": 0.7565, + "step": 885 + }, + { + "epoch": 0.04518282544960719, + "grad_norm": 0.06349124685160816, + "learning_rate": 0.00022588832487309646, + "loss": 0.7386, + "step": 890 + }, + { + "epoch": 0.04543666154763869, + "grad_norm": 0.07021759323373769, + "learning_rate": 0.00022715736040609136, + "loss": 0.7306, + "step": 895 + }, + { + "epoch": 0.04569049764567019, + "grad_norm": 0.0637596966680832, + "learning_rate": 0.00022842639593908628, + "loss": 0.7699, + "step": 900 + }, + { + "epoch": 0.045944333743701694, + "grad_norm": 0.06500213255962216, + "learning_rate": 0.0002296954314720812, + "loss": 0.7816, + "step": 905 + }, + { + "epoch": 0.046198169841733194, + "grad_norm": 0.06309605572576463, + "learning_rate": 0.00023096446700507614, + "loss": 0.749, + "step": 910 + }, + { + "epoch": 0.046452005939764694, + "grad_norm": 0.05963186976020712, + "learning_rate": 0.00023223350253807106, + "loss": 0.7287, + "step": 915 + }, + { + "epoch": 0.046705842037796194, + "grad_norm": 0.06576264741938838, + "learning_rate": 0.000233502538071066, + "loss": 0.7624, + "step": 920 + }, + { + "epoch": 0.046959678135827694, + "grad_norm": 0.0660867978726128, + "learning_rate": 0.00023477157360406092, + "loss": 0.7249, + "step": 925 + }, + { + "epoch": 0.047213514233859194, + "grad_norm": 0.08980020895136409, + "learning_rate": 0.00023604060913705587, + "loss": 0.7694, + "step": 930 + }, + { + "epoch": 0.0474673503318907, + "grad_norm": 0.06355755979289256, + "learning_rate": 0.00023730964467005077, + "loss": 0.7519, + "step": 935 + }, + { + "epoch": 0.0477211864299222, + "grad_norm": 0.0726778191121942, + "learning_rate": 0.0002385786802030457, + "loss": 0.7576, + "step": 940 + }, + { + "epoch": 0.0479750225279537, + "grad_norm": 0.058859928139284715, + "learning_rate": 0.00023984771573604062, + "loss": 0.7244, + "step": 945 + }, + { + "epoch": 0.0482288586259852, + "grad_norm": 0.06081900024027338, + "learning_rate": 0.00024111675126903555, + "loss": 0.7373, + "step": 950 + }, + { + "epoch": 0.0484826947240167, + "grad_norm": 0.061248646181951284, + "learning_rate": 0.00024238578680203047, + "loss": 0.7822, + "step": 955 + }, + { + "epoch": 0.0487365308220482, + "grad_norm": 0.06672583029591972, + "learning_rate": 0.0002436548223350254, + "loss": 0.7488, + "step": 960 + }, + { + "epoch": 0.04899036692007971, + "grad_norm": 0.0572603692641915, + "learning_rate": 0.0002449238578680203, + "loss": 0.7198, + "step": 965 + }, + { + "epoch": 0.04924420301811121, + "grad_norm": 0.06681173443942526, + "learning_rate": 0.0002461928934010152, + "loss": 0.7525, + "step": 970 + }, + { + "epoch": 0.04949803911614271, + "grad_norm": 0.07257418993579445, + "learning_rate": 0.00024746192893401015, + "loss": 0.7395, + "step": 975 + }, + { + "epoch": 0.04975187521417421, + "grad_norm": 0.07302937635349391, + "learning_rate": 0.0002487309644670051, + "loss": 0.7372, + "step": 980 + }, + { + "epoch": 0.05000571131220571, + "grad_norm": 0.06996794132833503, + "learning_rate": 0.00025, + "loss": 0.729, + "step": 985 + }, + { + "epoch": 0.05025954741023721, + "grad_norm": 0.05788422679267036, + "learning_rate": 0.00025126903553299493, + "loss": 0.7263, + "step": 990 + }, + { + "epoch": 0.050513383508268714, + "grad_norm": 0.06234697260625598, + "learning_rate": 0.00025253807106598986, + "loss": 0.7245, + "step": 995 + }, + { + "epoch": 0.050767219606300214, + "grad_norm": 0.058460398878268476, + "learning_rate": 0.0002538071065989848, + "loss": 0.7382, + "step": 1000 + }, + { + "epoch": 0.051021055704331714, + "grad_norm": 0.05925618041244326, + "learning_rate": 0.0002550761421319797, + "loss": 0.7431, + "step": 1005 + }, + { + "epoch": 0.051274891802363214, + "grad_norm": 0.059944269822413306, + "learning_rate": 0.00025634517766497464, + "loss": 0.7556, + "step": 1010 + }, + { + "epoch": 0.051528727900394714, + "grad_norm": 0.05405315094033796, + "learning_rate": 0.00025761421319796956, + "loss": 0.6909, + "step": 1015 + }, + { + "epoch": 0.05178256399842621, + "grad_norm": 0.05752428711722885, + "learning_rate": 0.0002588832487309645, + "loss": 0.7232, + "step": 1020 + }, + { + "epoch": 0.05203640009645772, + "grad_norm": 0.06068982302190885, + "learning_rate": 0.00026015228426395936, + "loss": 0.7159, + "step": 1025 + }, + { + "epoch": 0.05229023619448922, + "grad_norm": 0.06926054723257352, + "learning_rate": 0.00026142131979695434, + "loss": 0.7097, + "step": 1030 + }, + { + "epoch": 0.05254407229252072, + "grad_norm": 0.05944640426895496, + "learning_rate": 0.0002626903553299492, + "loss": 0.7266, + "step": 1035 + }, + { + "epoch": 0.05279790839055222, + "grad_norm": 0.05911056311965479, + "learning_rate": 0.0002639593908629442, + "loss": 0.7034, + "step": 1040 + }, + { + "epoch": 0.05305174448858372, + "grad_norm": 0.05404273390108656, + "learning_rate": 0.00026522842639593907, + "loss": 0.7263, + "step": 1045 + }, + { + "epoch": 0.05330558058661522, + "grad_norm": 0.0681569692981572, + "learning_rate": 0.00026649746192893405, + "loss": 0.709, + "step": 1050 + }, + { + "epoch": 0.05355941668464673, + "grad_norm": 0.061027222712780306, + "learning_rate": 0.0002677664974619289, + "loss": 0.745, + "step": 1055 + }, + { + "epoch": 0.05381325278267823, + "grad_norm": 0.060423144178792326, + "learning_rate": 0.00026903553299492385, + "loss": 0.7525, + "step": 1060 + }, + { + "epoch": 0.05406708888070973, + "grad_norm": 0.05967597970626861, + "learning_rate": 0.00027030456852791877, + "loss": 0.739, + "step": 1065 + }, + { + "epoch": 0.05432092497874123, + "grad_norm": 0.06674697326887237, + "learning_rate": 0.0002715736040609137, + "loss": 0.754, + "step": 1070 + }, + { + "epoch": 0.054574761076772726, + "grad_norm": 0.06984166791857392, + "learning_rate": 0.0002728426395939086, + "loss": 0.7347, + "step": 1075 + }, + { + "epoch": 0.054828597174804226, + "grad_norm": 0.05660547722828721, + "learning_rate": 0.00027411167512690355, + "loss": 0.7372, + "step": 1080 + }, + { + "epoch": 0.05508243327283573, + "grad_norm": 0.061731160855571954, + "learning_rate": 0.0002753807106598985, + "loss": 0.7273, + "step": 1085 + }, + { + "epoch": 0.05533626937086723, + "grad_norm": 0.05843567186487286, + "learning_rate": 0.0002766497461928934, + "loss": 0.7405, + "step": 1090 + }, + { + "epoch": 0.05559010546889873, + "grad_norm": 0.06160351540468081, + "learning_rate": 0.0002779187817258883, + "loss": 0.7392, + "step": 1095 + }, + { + "epoch": 0.05584394156693023, + "grad_norm": 0.06302649918893416, + "learning_rate": 0.00027918781725888326, + "loss": 0.7169, + "step": 1100 + }, + { + "epoch": 0.05609777766496173, + "grad_norm": 0.26748522478789255, + "learning_rate": 0.0002804568527918782, + "loss": 0.7258, + "step": 1105 + }, + { + "epoch": 0.05635161376299323, + "grad_norm": 0.060283031147792536, + "learning_rate": 0.0002817258883248731, + "loss": 0.7216, + "step": 1110 + }, + { + "epoch": 0.05660544986102474, + "grad_norm": 0.06726444207819834, + "learning_rate": 0.00028299492385786804, + "loss": 0.751, + "step": 1115 + }, + { + "epoch": 0.05685928595905624, + "grad_norm": 0.09905598633925246, + "learning_rate": 0.00028426395939086296, + "loss": 0.7714, + "step": 1120 + }, + { + "epoch": 0.05711312205708774, + "grad_norm": 0.1277329704703035, + "learning_rate": 0.0002855329949238579, + "loss": 0.7602, + "step": 1125 + }, + { + "epoch": 0.05736695815511924, + "grad_norm": 0.20723659499952354, + "learning_rate": 0.0002868020304568528, + "loss": 0.7063, + "step": 1130 + }, + { + "epoch": 0.05762079425315074, + "grad_norm": 0.08585174275787383, + "learning_rate": 0.00028807106598984774, + "loss": 0.7199, + "step": 1135 + }, + { + "epoch": 0.05787463035118224, + "grad_norm": 0.0660384074730241, + "learning_rate": 0.0002893401015228426, + "loss": 0.7308, + "step": 1140 + }, + { + "epoch": 0.05812846644921374, + "grad_norm": 0.19383133097044608, + "learning_rate": 0.0002906091370558376, + "loss": 0.9052, + "step": 1145 + }, + { + "epoch": 0.058382302547245246, + "grad_norm": 0.24673064263656, + "learning_rate": 0.00029187817258883247, + "loss": 0.7624, + "step": 1150 + }, + { + "epoch": 0.058636138645276746, + "grad_norm": 32.757348375145966, + "learning_rate": 0.00029314720812182745, + "loss": 0.7527, + "step": 1155 + }, + { + "epoch": 0.058889974743308246, + "grad_norm": 0.09916235027198479, + "learning_rate": 0.0002944162436548223, + "loss": 0.7613, + "step": 1160 + }, + { + "epoch": 0.059143810841339746, + "grad_norm": 0.12917795113084668, + "learning_rate": 0.0002956852791878173, + "loss": 0.7631, + "step": 1165 + }, + { + "epoch": 0.059397646939371246, + "grad_norm": 0.08883151007488581, + "learning_rate": 0.00029695431472081217, + "loss": 0.7412, + "step": 1170 + }, + { + "epoch": 0.059651483037402746, + "grad_norm": 0.08449886407082698, + "learning_rate": 0.0002982233502538071, + "loss": 0.7789, + "step": 1175 + }, + { + "epoch": 0.05990531913543425, + "grad_norm": 0.12057734525050375, + "learning_rate": 0.000299492385786802, + "loss": 0.7513, + "step": 1180 + }, + { + "epoch": 0.06015915523346575, + "grad_norm": 0.16122305869580125, + "learning_rate": 0.00030076142131979695, + "loss": 0.8202, + "step": 1185 + }, + { + "epoch": 0.06041299133149725, + "grad_norm": 0.09478283020933216, + "learning_rate": 0.0003020304568527919, + "loss": 0.7556, + "step": 1190 + }, + { + "epoch": 0.06066682742952875, + "grad_norm": 1.1482190144535993, + "learning_rate": 0.0003032994923857868, + "loss": 0.742, + "step": 1195 + }, + { + "epoch": 0.06092066352756025, + "grad_norm": 0.08948971775561998, + "learning_rate": 0.00030456852791878173, + "loss": 0.7897, + "step": 1200 + }, + { + "epoch": 0.06117449962559175, + "grad_norm": 0.11543924190628982, + "learning_rate": 0.00030583756345177666, + "loss": 0.7372, + "step": 1205 + }, + { + "epoch": 0.06142833572362326, + "grad_norm": 0.06522079820810416, + "learning_rate": 0.0003071065989847716, + "loss": 0.7409, + "step": 1210 + }, + { + "epoch": 0.06168217182165476, + "grad_norm": 0.05653686696839828, + "learning_rate": 0.0003083756345177665, + "loss": 0.7432, + "step": 1215 + }, + { + "epoch": 0.06193600791968626, + "grad_norm": 0.06485576389961441, + "learning_rate": 0.00030964467005076144, + "loss": 0.7637, + "step": 1220 + }, + { + "epoch": 0.06218984401771776, + "grad_norm": 0.06287754156459976, + "learning_rate": 0.00031091370558375636, + "loss": 0.7496, + "step": 1225 + }, + { + "epoch": 0.06244368011574926, + "grad_norm": 0.06275169467007742, + "learning_rate": 0.0003121827411167513, + "loss": 0.7356, + "step": 1230 + }, + { + "epoch": 0.06269751621378077, + "grad_norm": 0.1431156053310293, + "learning_rate": 0.0003134517766497462, + "loss": 0.7021, + "step": 1235 + }, + { + "epoch": 0.06295135231181226, + "grad_norm": 0.0698152906671079, + "learning_rate": 0.00031472081218274114, + "loss": 0.7369, + "step": 1240 + }, + { + "epoch": 0.06320518840984377, + "grad_norm": 0.06474169533744434, + "learning_rate": 0.000315989847715736, + "loss": 0.7431, + "step": 1245 + }, + { + "epoch": 0.06345902450787526, + "grad_norm": 0.05656964302149857, + "learning_rate": 0.000317258883248731, + "loss": 0.7279, + "step": 1250 + }, + { + "epoch": 0.06371286060590677, + "grad_norm": 0.06008444247403088, + "learning_rate": 0.00031852791878172587, + "loss": 0.7255, + "step": 1255 + }, + { + "epoch": 0.06396669670393827, + "grad_norm": 0.06258676825297353, + "learning_rate": 0.00031979695431472085, + "loss": 0.7294, + "step": 1260 + }, + { + "epoch": 0.06422053280196977, + "grad_norm": 0.06101644314132169, + "learning_rate": 0.0003210659898477157, + "loss": 0.7495, + "step": 1265 + }, + { + "epoch": 0.06447436890000127, + "grad_norm": 0.05784297759132409, + "learning_rate": 0.0003223350253807107, + "loss": 0.7089, + "step": 1270 + }, + { + "epoch": 0.06472820499803277, + "grad_norm": 0.06608430212446814, + "learning_rate": 0.00032360406091370557, + "loss": 0.729, + "step": 1275 + }, + { + "epoch": 0.06498204109606427, + "grad_norm": 0.06682999306491608, + "learning_rate": 0.0003248730964467005, + "loss": 0.7659, + "step": 1280 + }, + { + "epoch": 0.06523587719409578, + "grad_norm": 0.05567632533610063, + "learning_rate": 0.0003261421319796954, + "loss": 0.7043, + "step": 1285 + }, + { + "epoch": 0.06548971329212727, + "grad_norm": 0.08049433253072921, + "learning_rate": 0.00032741116751269035, + "loss": 0.7045, + "step": 1290 + }, + { + "epoch": 0.06574354939015878, + "grad_norm": 0.06943993107179286, + "learning_rate": 0.0003286802030456853, + "loss": 0.7442, + "step": 1295 + }, + { + "epoch": 0.06599738548819027, + "grad_norm": 0.3124576265680848, + "learning_rate": 0.0003299492385786802, + "loss": 0.7156, + "step": 1300 + }, + { + "epoch": 0.06625122158622178, + "grad_norm": 0.05993979750837367, + "learning_rate": 0.00033121827411167513, + "loss": 0.7378, + "step": 1305 + }, + { + "epoch": 0.06650505768425327, + "grad_norm": 0.05955364139261034, + "learning_rate": 0.00033248730964467006, + "loss": 0.7258, + "step": 1310 + }, + { + "epoch": 0.06675889378228478, + "grad_norm": 0.08613170534764741, + "learning_rate": 0.00033375634517766493, + "loss": 0.7505, + "step": 1315 + }, + { + "epoch": 0.06701272988031629, + "grad_norm": 0.06998830116145732, + "learning_rate": 0.0003350253807106599, + "loss": 0.7261, + "step": 1320 + }, + { + "epoch": 0.06726656597834778, + "grad_norm": 0.05795740324311744, + "learning_rate": 0.00033629441624365484, + "loss": 0.7025, + "step": 1325 + }, + { + "epoch": 0.06752040207637929, + "grad_norm": 0.055676641145626066, + "learning_rate": 0.00033756345177664976, + "loss": 0.72, + "step": 1330 + }, + { + "epoch": 0.06777423817441078, + "grad_norm": 0.05604862800727641, + "learning_rate": 0.0003388324873096447, + "loss": 0.7285, + "step": 1335 + }, + { + "epoch": 0.06802807427244228, + "grad_norm": 0.05356518336455629, + "learning_rate": 0.0003401015228426396, + "loss": 0.7386, + "step": 1340 + }, + { + "epoch": 0.06828191037047379, + "grad_norm": 0.09605226497693169, + "learning_rate": 0.00034137055837563454, + "loss": 0.7118, + "step": 1345 + }, + { + "epoch": 0.06853574646850528, + "grad_norm": 0.06982897061697936, + "learning_rate": 0.0003426395939086294, + "loss": 0.6877, + "step": 1350 + }, + { + "epoch": 0.06878958256653679, + "grad_norm": 0.07303904652751834, + "learning_rate": 0.0003439086294416244, + "loss": 0.7174, + "step": 1355 + }, + { + "epoch": 0.06904341866456828, + "grad_norm": 0.07291571688385211, + "learning_rate": 0.00034517766497461927, + "loss": 0.7344, + "step": 1360 + }, + { + "epoch": 0.06929725476259979, + "grad_norm": 0.07543740254911013, + "learning_rate": 0.00034644670050761425, + "loss": 0.7053, + "step": 1365 + }, + { + "epoch": 0.06955109086063128, + "grad_norm": 0.06625493235058802, + "learning_rate": 0.0003477157360406091, + "loss": 0.7, + "step": 1370 + }, + { + "epoch": 0.06980492695866279, + "grad_norm": 0.05544221962273842, + "learning_rate": 0.0003489847715736041, + "loss": 0.7211, + "step": 1375 + }, + { + "epoch": 0.0700587630566943, + "grad_norm": 0.07915130432819355, + "learning_rate": 0.00035025380710659897, + "loss": 0.7113, + "step": 1380 + }, + { + "epoch": 0.07031259915472579, + "grad_norm": 0.37754027559534814, + "learning_rate": 0.00035152284263959395, + "loss": 0.7025, + "step": 1385 + }, + { + "epoch": 0.0705664352527573, + "grad_norm": 0.12584003699209365, + "learning_rate": 0.0003527918781725888, + "loss": 0.7465, + "step": 1390 + }, + { + "epoch": 0.07082027135078879, + "grad_norm": 0.06582449891223112, + "learning_rate": 0.00035406091370558375, + "loss": 0.7178, + "step": 1395 + }, + { + "epoch": 0.0710741074488203, + "grad_norm": 0.06342116685715943, + "learning_rate": 0.0003553299492385787, + "loss": 0.7553, + "step": 1400 + }, + { + "epoch": 0.0713279435468518, + "grad_norm": 0.062008656515146095, + "learning_rate": 0.0003565989847715736, + "loss": 0.7584, + "step": 1405 + }, + { + "epoch": 0.0715817796448833, + "grad_norm": 0.15575295778070705, + "learning_rate": 0.00035786802030456853, + "loss": 0.7721, + "step": 1410 + }, + { + "epoch": 0.0718356157429148, + "grad_norm": 0.07024393682133559, + "learning_rate": 0.00035913705583756346, + "loss": 0.716, + "step": 1415 + }, + { + "epoch": 0.0720894518409463, + "grad_norm": 0.06141309488998922, + "learning_rate": 0.0003604060913705584, + "loss": 0.7537, + "step": 1420 + }, + { + "epoch": 0.0723432879389778, + "grad_norm": 0.05848534147436462, + "learning_rate": 0.0003616751269035533, + "loss": 0.7358, + "step": 1425 + }, + { + "epoch": 0.0725971240370093, + "grad_norm": 0.05896313633341948, + "learning_rate": 0.00036294416243654823, + "loss": 0.7163, + "step": 1430 + }, + { + "epoch": 0.0728509601350408, + "grad_norm": 0.0612049129333866, + "learning_rate": 0.00036421319796954316, + "loss": 0.7278, + "step": 1435 + }, + { + "epoch": 0.07310479623307231, + "grad_norm": 0.20510721585245476, + "learning_rate": 0.0003654822335025381, + "loss": 0.7204, + "step": 1440 + }, + { + "epoch": 0.0733586323311038, + "grad_norm": 0.06353888649819851, + "learning_rate": 0.000366751269035533, + "loss": 0.703, + "step": 1445 + }, + { + "epoch": 0.07361246842913531, + "grad_norm": 0.11598595193927975, + "learning_rate": 0.00036802030456852794, + "loss": 0.7353, + "step": 1450 + }, + { + "epoch": 0.0738663045271668, + "grad_norm": 0.06735218740803854, + "learning_rate": 0.00036928934010152287, + "loss": 0.7387, + "step": 1455 + }, + { + "epoch": 0.07412014062519831, + "grad_norm": 0.056456597607688834, + "learning_rate": 0.0003705583756345178, + "loss": 0.7244, + "step": 1460 + }, + { + "epoch": 0.0743739767232298, + "grad_norm": 0.06695445879742773, + "learning_rate": 0.00037182741116751266, + "loss": 0.733, + "step": 1465 + }, + { + "epoch": 0.07462781282126131, + "grad_norm": 0.062469347353580236, + "learning_rate": 0.00037309644670050765, + "loss": 0.7283, + "step": 1470 + }, + { + "epoch": 0.07488164891929282, + "grad_norm": 0.10743187268032847, + "learning_rate": 0.0003743654822335025, + "loss": 0.7627, + "step": 1475 + }, + { + "epoch": 0.07513548501732431, + "grad_norm": 0.06985614418900853, + "learning_rate": 0.0003756345177664975, + "loss": 0.7398, + "step": 1480 + }, + { + "epoch": 0.07538932111535582, + "grad_norm": 4.112242925569732, + "learning_rate": 0.00037690355329949237, + "loss": 0.7503, + "step": 1485 + }, + { + "epoch": 0.07564315721338731, + "grad_norm": 3.056249205800858, + "learning_rate": 0.00037817258883248735, + "loss": 0.7365, + "step": 1490 + }, + { + "epoch": 0.07589699331141882, + "grad_norm": 0.08367423229399179, + "learning_rate": 0.0003794416243654822, + "loss": 0.751, + "step": 1495 + }, + { + "epoch": 0.07615082940945032, + "grad_norm": 0.17368019423571526, + "learning_rate": 0.00038071065989847715, + "loss": 0.7447, + "step": 1500 + }, + { + "epoch": 0.07640466550748182, + "grad_norm": 0.06787177882481868, + "learning_rate": 0.0003819796954314721, + "loss": 0.7489, + "step": 1505 + }, + { + "epoch": 0.07665850160551332, + "grad_norm": 0.07169631398568231, + "learning_rate": 0.000383248730964467, + "loss": 0.7393, + "step": 1510 + }, + { + "epoch": 0.07691233770354482, + "grad_norm": 0.05749839111352872, + "learning_rate": 0.00038451776649746193, + "loss": 0.721, + "step": 1515 + }, + { + "epoch": 0.07716617380157632, + "grad_norm": 0.08575887204955374, + "learning_rate": 0.00038578680203045685, + "loss": 0.7128, + "step": 1520 + }, + { + "epoch": 0.07742000989960782, + "grad_norm": 0.05613491181829679, + "learning_rate": 0.0003870558375634518, + "loss": 0.7338, + "step": 1525 + }, + { + "epoch": 0.07767384599763932, + "grad_norm": 0.06300181911559392, + "learning_rate": 0.0003883248730964467, + "loss": 0.7337, + "step": 1530 + }, + { + "epoch": 0.07792768209567083, + "grad_norm": 0.07134915135851151, + "learning_rate": 0.00038959390862944163, + "loss": 0.7629, + "step": 1535 + }, + { + "epoch": 0.07818151819370232, + "grad_norm": 0.05162935081471609, + "learning_rate": 0.00039086294416243656, + "loss": 0.6955, + "step": 1540 + }, + { + "epoch": 0.07843535429173383, + "grad_norm": 0.06414129871881698, + "learning_rate": 0.0003921319796954315, + "loss": 0.7301, + "step": 1545 + }, + { + "epoch": 0.07868919038976532, + "grad_norm": 0.05373549484924304, + "learning_rate": 0.0003934010152284264, + "loss": 0.6976, + "step": 1550 + }, + { + "epoch": 0.07894302648779683, + "grad_norm": 0.06837620727230255, + "learning_rate": 0.00039467005076142134, + "loss": 0.6985, + "step": 1555 + }, + { + "epoch": 0.07919686258582834, + "grad_norm": 0.07846375652980406, + "learning_rate": 0.00039593908629441627, + "loss": 0.7338, + "step": 1560 + }, + { + "epoch": 0.07945069868385983, + "grad_norm": 0.06174362803606399, + "learning_rate": 0.0003972081218274112, + "loss": 0.7168, + "step": 1565 + }, + { + "epoch": 0.07970453478189134, + "grad_norm": 0.05882865445136937, + "learning_rate": 0.00039847715736040606, + "loss": 0.7184, + "step": 1570 + }, + { + "epoch": 0.07995837087992283, + "grad_norm": 0.0493701696839989, + "learning_rate": 0.00039974619289340104, + "loss": 0.7053, + "step": 1575 + }, + { + "epoch": 0.08021220697795434, + "grad_norm": 0.054577428336826876, + "learning_rate": 0.0004010152284263959, + "loss": 0.7315, + "step": 1580 + }, + { + "epoch": 0.08046604307598583, + "grad_norm": 0.05891870223720398, + "learning_rate": 0.0004022842639593909, + "loss": 0.708, + "step": 1585 + }, + { + "epoch": 0.08071987917401734, + "grad_norm": 0.09122730379802985, + "learning_rate": 0.00040355329949238577, + "loss": 0.6886, + "step": 1590 + }, + { + "epoch": 0.08097371527204884, + "grad_norm": 0.05434428539871062, + "learning_rate": 0.00040482233502538075, + "loss": 0.7235, + "step": 1595 + }, + { + "epoch": 0.08122755137008034, + "grad_norm": 0.057406938749746325, + "learning_rate": 0.0004060913705583756, + "loss": 0.6998, + "step": 1600 + }, + { + "epoch": 0.08148138746811184, + "grad_norm": 0.05624064887105565, + "learning_rate": 0.0004073604060913706, + "loss": 0.7167, + "step": 1605 + }, + { + "epoch": 0.08173522356614334, + "grad_norm": 0.08475156677013762, + "learning_rate": 0.0004086294416243655, + "loss": 0.717, + "step": 1610 + }, + { + "epoch": 0.08198905966417484, + "grad_norm": 0.061648445637681494, + "learning_rate": 0.0004098984771573604, + "loss": 0.7219, + "step": 1615 + }, + { + "epoch": 0.08224289576220635, + "grad_norm": 0.05747063059303642, + "learning_rate": 0.00041116751269035533, + "loss": 0.698, + "step": 1620 + }, + { + "epoch": 0.08249673186023784, + "grad_norm": 0.0531564390368294, + "learning_rate": 0.00041243654822335025, + "loss": 0.6933, + "step": 1625 + }, + { + "epoch": 0.08275056795826935, + "grad_norm": 0.0531546385182133, + "learning_rate": 0.0004137055837563452, + "loss": 0.7269, + "step": 1630 + }, + { + "epoch": 0.08300440405630084, + "grad_norm": 0.05287338545106486, + "learning_rate": 0.0004149746192893401, + "loss": 0.7225, + "step": 1635 + }, + { + "epoch": 0.08325824015433235, + "grad_norm": 0.05148433899438861, + "learning_rate": 0.0004162436548223351, + "loss": 0.7105, + "step": 1640 + }, + { + "epoch": 0.08351207625236384, + "grad_norm": 0.05687600911185101, + "learning_rate": 0.00041751269035532996, + "loss": 0.7124, + "step": 1645 + }, + { + "epoch": 0.08376591235039535, + "grad_norm": 0.05524705398314237, + "learning_rate": 0.0004187817258883249, + "loss": 0.7175, + "step": 1650 + }, + { + "epoch": 0.08401974844842686, + "grad_norm": 0.05772786142305999, + "learning_rate": 0.0004200507614213198, + "loss": 0.72, + "step": 1655 + }, + { + "epoch": 0.08427358454645835, + "grad_norm": 0.06460258179361435, + "learning_rate": 0.00042131979695431474, + "loss": 0.7155, + "step": 1660 + }, + { + "epoch": 0.08452742064448986, + "grad_norm": 0.08154444806905058, + "learning_rate": 0.00042258883248730967, + "loss": 0.7249, + "step": 1665 + }, + { + "epoch": 0.08478125674252135, + "grad_norm": 0.07036154319504197, + "learning_rate": 0.0004238578680203046, + "loss": 0.7018, + "step": 1670 + }, + { + "epoch": 0.08503509284055286, + "grad_norm": 0.07974859335868449, + "learning_rate": 0.0004251269035532995, + "loss": 0.6848, + "step": 1675 + }, + { + "epoch": 0.08528892893858436, + "grad_norm": 0.07106622592720147, + "learning_rate": 0.00042639593908629444, + "loss": 0.7306, + "step": 1680 + }, + { + "epoch": 0.08554276503661586, + "grad_norm": 0.07520683130587769, + "learning_rate": 0.0004276649746192893, + "loss": 0.6777, + "step": 1685 + }, + { + "epoch": 0.08579660113464736, + "grad_norm": 0.060430901146499016, + "learning_rate": 0.0004289340101522843, + "loss": 0.7051, + "step": 1690 + }, + { + "epoch": 0.08605043723267886, + "grad_norm": 0.0635450282792058, + "learning_rate": 0.00043020304568527917, + "loss": 0.6913, + "step": 1695 + }, + { + "epoch": 0.08630427333071036, + "grad_norm": 0.04972958060129512, + "learning_rate": 0.00043147208121827415, + "loss": 0.7135, + "step": 1700 + }, + { + "epoch": 0.08655810942874186, + "grad_norm": 0.04944823830530392, + "learning_rate": 0.000432741116751269, + "loss": 0.7038, + "step": 1705 + }, + { + "epoch": 0.08681194552677336, + "grad_norm": 0.0567985625555269, + "learning_rate": 0.000434010152284264, + "loss": 0.7287, + "step": 1710 + }, + { + "epoch": 0.08706578162480487, + "grad_norm": 0.051682155754569976, + "learning_rate": 0.0004352791878172589, + "loss": 0.6989, + "step": 1715 + }, + { + "epoch": 0.08731961772283636, + "grad_norm": 0.057888169540962604, + "learning_rate": 0.0004365482233502538, + "loss": 0.6968, + "step": 1720 + }, + { + "epoch": 0.08757345382086787, + "grad_norm": 0.04580235344300138, + "learning_rate": 0.00043781725888324873, + "loss": 0.7062, + "step": 1725 + }, + { + "epoch": 0.08782728991889936, + "grad_norm": 0.05599349020439277, + "learning_rate": 0.00043908629441624365, + "loss": 0.7211, + "step": 1730 + }, + { + "epoch": 0.08808112601693087, + "grad_norm": 0.06293126880283036, + "learning_rate": 0.0004403553299492386, + "loss": 0.7035, + "step": 1735 + }, + { + "epoch": 0.08833496211496236, + "grad_norm": 0.05082245406055062, + "learning_rate": 0.0004416243654822335, + "loss": 0.674, + "step": 1740 + }, + { + "epoch": 0.08858879821299387, + "grad_norm": 0.05354641526756831, + "learning_rate": 0.00044289340101522843, + "loss": 0.6947, + "step": 1745 + }, + { + "epoch": 0.08884263431102538, + "grad_norm": 0.05188231038829293, + "learning_rate": 0.00044416243654822336, + "loss": 0.6985, + "step": 1750 + }, + { + "epoch": 0.08909647040905687, + "grad_norm": 0.28571205654683896, + "learning_rate": 0.0004454314720812183, + "loss": 0.7366, + "step": 1755 + }, + { + "epoch": 0.08935030650708838, + "grad_norm": 0.05009396851866736, + "learning_rate": 0.0004467005076142132, + "loss": 0.7037, + "step": 1760 + }, + { + "epoch": 0.08960414260511987, + "grad_norm": 0.057008248968552507, + "learning_rate": 0.00044796954314720814, + "loss": 0.7004, + "step": 1765 + }, + { + "epoch": 0.08985797870315138, + "grad_norm": 0.08570192402076927, + "learning_rate": 0.00044923857868020306, + "loss": 0.6962, + "step": 1770 + }, + { + "epoch": 0.09011181480118288, + "grad_norm": 0.07466469422515871, + "learning_rate": 0.000450507614213198, + "loss": 0.6782, + "step": 1775 + }, + { + "epoch": 0.09036565089921438, + "grad_norm": 0.050896499358629874, + "learning_rate": 0.0004517766497461929, + "loss": 0.6832, + "step": 1780 + }, + { + "epoch": 0.09061948699724588, + "grad_norm": 0.05086157843514099, + "learning_rate": 0.00045304568527918784, + "loss": 0.7398, + "step": 1785 + }, + { + "epoch": 0.09087332309527738, + "grad_norm": 0.048846685380405284, + "learning_rate": 0.0004543147208121827, + "loss": 0.7125, + "step": 1790 + }, + { + "epoch": 0.09112715919330888, + "grad_norm": 0.06253755883118842, + "learning_rate": 0.0004555837563451777, + "loss": 0.7122, + "step": 1795 + }, + { + "epoch": 0.09138099529134038, + "grad_norm": 0.0511560475400958, + "learning_rate": 0.00045685279187817257, + "loss": 0.7291, + "step": 1800 + }, + { + "epoch": 0.09163483138937188, + "grad_norm": 0.052121572152721875, + "learning_rate": 0.00045812182741116755, + "loss": 0.6912, + "step": 1805 + }, + { + "epoch": 0.09188866748740339, + "grad_norm": 0.049287056641165186, + "learning_rate": 0.0004593908629441624, + "loss": 0.6776, + "step": 1810 + }, + { + "epoch": 0.09214250358543488, + "grad_norm": 0.04890540112646688, + "learning_rate": 0.0004606598984771574, + "loss": 0.6833, + "step": 1815 + }, + { + "epoch": 0.09239633968346639, + "grad_norm": 0.0640968907329142, + "learning_rate": 0.0004619289340101523, + "loss": 0.7165, + "step": 1820 + }, + { + "epoch": 0.09265017578149788, + "grad_norm": 0.09373858909563126, + "learning_rate": 0.0004631979695431472, + "loss": 0.6946, + "step": 1825 + }, + { + "epoch": 0.09290401187952939, + "grad_norm": 0.0502898696426856, + "learning_rate": 0.0004644670050761421, + "loss": 0.7373, + "step": 1830 + }, + { + "epoch": 0.0931578479775609, + "grad_norm": 0.052116651580975205, + "learning_rate": 0.00046573604060913705, + "loss": 0.6888, + "step": 1835 + }, + { + "epoch": 0.09341168407559239, + "grad_norm": 0.06418059125424298, + "learning_rate": 0.000467005076142132, + "loss": 0.6905, + "step": 1840 + }, + { + "epoch": 0.0936655201736239, + "grad_norm": 0.06311317514924268, + "learning_rate": 0.0004682741116751269, + "loss": 0.6741, + "step": 1845 + }, + { + "epoch": 0.09391935627165539, + "grad_norm": 0.04939022002771504, + "learning_rate": 0.00046954314720812183, + "loss": 0.7203, + "step": 1850 + }, + { + "epoch": 0.0941731923696869, + "grad_norm": 0.05910410775267137, + "learning_rate": 0.00047081218274111676, + "loss": 0.7099, + "step": 1855 + }, + { + "epoch": 0.09442702846771839, + "grad_norm": 0.0814151828326972, + "learning_rate": 0.00047208121827411174, + "loss": 0.7248, + "step": 1860 + }, + { + "epoch": 0.0946808645657499, + "grad_norm": 0.8076656010487256, + "learning_rate": 0.0004733502538071066, + "loss": 0.7253, + "step": 1865 + }, + { + "epoch": 0.0949347006637814, + "grad_norm": 0.06765805196660898, + "learning_rate": 0.00047461928934010154, + "loss": 0.6885, + "step": 1870 + }, + { + "epoch": 0.0951885367618129, + "grad_norm": 0.05841136910440899, + "learning_rate": 0.00047588832487309646, + "loss": 0.7059, + "step": 1875 + }, + { + "epoch": 0.0954423728598444, + "grad_norm": 0.06300415339926192, + "learning_rate": 0.0004771573604060914, + "loss": 0.7087, + "step": 1880 + }, + { + "epoch": 0.0956962089578759, + "grad_norm": 0.0740423426972598, + "learning_rate": 0.0004784263959390863, + "loss": 0.6992, + "step": 1885 + }, + { + "epoch": 0.0959500450559074, + "grad_norm": 0.06612814847151591, + "learning_rate": 0.00047969543147208124, + "loss": 0.713, + "step": 1890 + }, + { + "epoch": 0.09620388115393891, + "grad_norm": 0.05216005232585783, + "learning_rate": 0.00048096446700507617, + "loss": 0.707, + "step": 1895 + }, + { + "epoch": 0.0964577172519704, + "grad_norm": 0.05326644260558058, + "learning_rate": 0.0004822335025380711, + "loss": 0.7025, + "step": 1900 + }, + { + "epoch": 0.09671155335000191, + "grad_norm": 0.06471751560857093, + "learning_rate": 0.00048350253807106597, + "loss": 0.7059, + "step": 1905 + }, + { + "epoch": 0.0969653894480334, + "grad_norm": 0.06373553800707545, + "learning_rate": 0.00048477157360406095, + "loss": 0.7124, + "step": 1910 + }, + { + "epoch": 0.09721922554606491, + "grad_norm": 0.07107687339224335, + "learning_rate": 0.0004860406091370558, + "loss": 0.7199, + "step": 1915 + }, + { + "epoch": 0.0974730616440964, + "grad_norm": 0.07062703466342107, + "learning_rate": 0.0004873096446700508, + "loss": 0.7306, + "step": 1920 + }, + { + "epoch": 0.09772689774212791, + "grad_norm": 0.05938067294182585, + "learning_rate": 0.0004885786802030457, + "loss": 0.683, + "step": 1925 + }, + { + "epoch": 0.09798073384015941, + "grad_norm": 0.07174216980411427, + "learning_rate": 0.0004898477157360406, + "loss": 0.7168, + "step": 1930 + }, + { + "epoch": 0.09823456993819091, + "grad_norm": 0.06009740027025641, + "learning_rate": 0.0004911167512690356, + "loss": 0.6961, + "step": 1935 + }, + { + "epoch": 0.09848840603622241, + "grad_norm": 0.05206994171495008, + "learning_rate": 0.0004923857868020305, + "loss": 0.6947, + "step": 1940 + }, + { + "epoch": 0.09874224213425391, + "grad_norm": 0.07987693756335262, + "learning_rate": 0.0004936548223350254, + "loss": 0.6895, + "step": 1945 + }, + { + "epoch": 0.09899607823228541, + "grad_norm": 0.05144943467180292, + "learning_rate": 0.0004949238578680203, + "loss": 0.7062, + "step": 1950 + }, + { + "epoch": 0.09924991433031692, + "grad_norm": 0.05082445787484508, + "learning_rate": 0.0004961928934010153, + "loss": 0.6943, + "step": 1955 + }, + { + "epoch": 0.09950375042834841, + "grad_norm": 0.047141885648804095, + "learning_rate": 0.0004974619289340102, + "loss": 0.6917, + "step": 1960 + }, + { + "epoch": 0.09975758652637992, + "grad_norm": 0.047197680147561615, + "learning_rate": 0.0004987309644670051, + "loss": 0.6746, + "step": 1965 + }, + { + "epoch": 0.10001142262441141, + "grad_norm": 0.07137820513966749, + "learning_rate": 0.0005, + "loss": 0.6841, + "step": 1970 + }, + { + "epoch": 0.10026525872244292, + "grad_norm": 0.052618795283310475, + "learning_rate": 0.000501269035532995, + "loss": 0.6897, + "step": 1975 + }, + { + "epoch": 0.10051909482047441, + "grad_norm": 0.05796055424874039, + "learning_rate": 0.0005025380710659899, + "loss": 0.6861, + "step": 1980 + }, + { + "epoch": 0.10077293091850592, + "grad_norm": 0.050447551050472175, + "learning_rate": 0.0005038071065989847, + "loss": 0.6834, + "step": 1985 + }, + { + "epoch": 0.10102676701653743, + "grad_norm": 0.048726174649120095, + "learning_rate": 0.0005050761421319797, + "loss": 0.6813, + "step": 1990 + }, + { + "epoch": 0.10128060311456892, + "grad_norm": 0.07440137618882016, + "learning_rate": 0.0005063451776649747, + "loss": 0.6647, + "step": 1995 + }, + { + "epoch": 0.10153443921260043, + "grad_norm": 0.05076461271762827, + "learning_rate": 0.0005076142131979696, + "loss": 0.7137, + "step": 2000 + }, + { + "epoch": 0.10178827531063192, + "grad_norm": 0.0471884569121775, + "learning_rate": 0.0005088832487309644, + "loss": 0.6887, + "step": 2005 + }, + { + "epoch": 0.10204211140866343, + "grad_norm": 0.11880359913639196, + "learning_rate": 0.0005101522842639594, + "loss": 0.6919, + "step": 2010 + }, + { + "epoch": 0.10229594750669492, + "grad_norm": 0.20996719664722965, + "learning_rate": 0.0005114213197969543, + "loss": 0.6802, + "step": 2015 + }, + { + "epoch": 0.10254978360472643, + "grad_norm": 0.047803725376976, + "learning_rate": 0.0005126903553299493, + "loss": 0.7059, + "step": 2020 + }, + { + "epoch": 0.10280361970275793, + "grad_norm": 0.0449286937065325, + "learning_rate": 0.0005139593908629441, + "loss": 0.7061, + "step": 2025 + }, + { + "epoch": 0.10305745580078943, + "grad_norm": 0.052746260634763203, + "learning_rate": 0.0005152284263959391, + "loss": 0.6967, + "step": 2030 + }, + { + "epoch": 0.10331129189882093, + "grad_norm": 0.04943616988366681, + "learning_rate": 0.000516497461928934, + "loss": 0.717, + "step": 2035 + }, + { + "epoch": 0.10356512799685243, + "grad_norm": 0.049716392637897996, + "learning_rate": 0.000517766497461929, + "loss": 0.6805, + "step": 2040 + }, + { + "epoch": 0.10381896409488393, + "grad_norm": 0.04637995606001784, + "learning_rate": 0.0005190355329949239, + "loss": 0.6779, + "step": 2045 + }, + { + "epoch": 0.10407280019291544, + "grad_norm": 0.04774106805011772, + "learning_rate": 0.0005203045685279187, + "loss": 0.7253, + "step": 2050 + }, + { + "epoch": 0.10432663629094693, + "grad_norm": 0.051793990117296816, + "learning_rate": 0.0005215736040609137, + "loss": 0.6626, + "step": 2055 + }, + { + "epoch": 0.10458047238897844, + "grad_norm": 0.045546210431367966, + "learning_rate": 0.0005228426395939087, + "loss": 0.6842, + "step": 2060 + }, + { + "epoch": 0.10483430848700993, + "grad_norm": 0.048518075416691675, + "learning_rate": 0.0005241116751269036, + "loss": 0.7328, + "step": 2065 + }, + { + "epoch": 0.10508814458504144, + "grad_norm": 0.05679454235982254, + "learning_rate": 0.0005253807106598984, + "loss": 0.6876, + "step": 2070 + }, + { + "epoch": 0.10534198068307293, + "grad_norm": 0.05098370288454924, + "learning_rate": 0.0005266497461928934, + "loss": 0.6929, + "step": 2075 + }, + { + "epoch": 0.10559581678110444, + "grad_norm": 0.04168186472739525, + "learning_rate": 0.0005279187817258884, + "loss": 0.6769, + "step": 2080 + }, + { + "epoch": 0.10584965287913595, + "grad_norm": 0.04581891098414401, + "learning_rate": 0.0005291878172588833, + "loss": 0.6754, + "step": 2085 + }, + { + "epoch": 0.10610348897716744, + "grad_norm": 0.043468753842354504, + "learning_rate": 0.0005304568527918781, + "loss": 0.7335, + "step": 2090 + }, + { + "epoch": 0.10635732507519895, + "grad_norm": 0.05308604744136886, + "learning_rate": 0.0005317258883248731, + "loss": 0.6921, + "step": 2095 + }, + { + "epoch": 0.10661116117323044, + "grad_norm": 0.07051381632192111, + "learning_rate": 0.0005329949238578681, + "loss": 0.6755, + "step": 2100 + }, + { + "epoch": 0.10686499727126195, + "grad_norm": 0.046426623183754255, + "learning_rate": 0.000534263959390863, + "loss": 0.7157, + "step": 2105 + }, + { + "epoch": 0.10711883336929345, + "grad_norm": 0.052422754447883815, + "learning_rate": 0.0005355329949238578, + "loss": 0.7047, + "step": 2110 + }, + { + "epoch": 0.10737266946732495, + "grad_norm": 0.07613534981689268, + "learning_rate": 0.0005368020304568528, + "loss": 0.7077, + "step": 2115 + }, + { + "epoch": 0.10762650556535645, + "grad_norm": 0.053514202239991294, + "learning_rate": 0.0005380710659898477, + "loss": 0.6843, + "step": 2120 + }, + { + "epoch": 0.10788034166338795, + "grad_norm": 0.04913041286768531, + "learning_rate": 0.0005393401015228427, + "loss": 0.6961, + "step": 2125 + }, + { + "epoch": 0.10813417776141945, + "grad_norm": 0.0568300226701408, + "learning_rate": 0.0005406091370558375, + "loss": 0.6692, + "step": 2130 + }, + { + "epoch": 0.10838801385945095, + "grad_norm": 0.04636934678676007, + "learning_rate": 0.0005418781725888325, + "loss": 0.6763, + "step": 2135 + }, + { + "epoch": 0.10864184995748245, + "grad_norm": 0.05136400672323533, + "learning_rate": 0.0005431472081218274, + "loss": 0.6894, + "step": 2140 + }, + { + "epoch": 0.10889568605551396, + "grad_norm": 0.04344668075028007, + "learning_rate": 0.0005444162436548224, + "loss": 0.6631, + "step": 2145 + }, + { + "epoch": 0.10914952215354545, + "grad_norm": 0.0555972048428014, + "learning_rate": 0.0005456852791878173, + "loss": 0.6784, + "step": 2150 + }, + { + "epoch": 0.10940335825157696, + "grad_norm": 0.06620172522346869, + "learning_rate": 0.0005469543147208121, + "loss": 0.6911, + "step": 2155 + }, + { + "epoch": 0.10965719434960845, + "grad_norm": 0.0601809692146959, + "learning_rate": 0.0005482233502538071, + "loss": 0.7259, + "step": 2160 + }, + { + "epoch": 0.10991103044763996, + "grad_norm": 0.044989749038749825, + "learning_rate": 0.0005494923857868021, + "loss": 0.7114, + "step": 2165 + }, + { + "epoch": 0.11016486654567147, + "grad_norm": 0.04741683660493615, + "learning_rate": 0.000550761421319797, + "loss": 0.6949, + "step": 2170 + }, + { + "epoch": 0.11041870264370296, + "grad_norm": 0.054064091770256034, + "learning_rate": 0.0005520304568527918, + "loss": 0.6774, + "step": 2175 + }, + { + "epoch": 0.11067253874173447, + "grad_norm": 0.050772197507611055, + "learning_rate": 0.0005532994923857868, + "loss": 0.6747, + "step": 2180 + }, + { + "epoch": 0.11092637483976596, + "grad_norm": 0.07122688113002712, + "learning_rate": 0.0005545685279187818, + "loss": 0.6751, + "step": 2185 + }, + { + "epoch": 0.11118021093779747, + "grad_norm": 0.056605934899527, + "learning_rate": 0.0005558375634517766, + "loss": 0.7048, + "step": 2190 + }, + { + "epoch": 0.11143404703582896, + "grad_norm": 0.049631694115174936, + "learning_rate": 0.0005571065989847715, + "loss": 0.6795, + "step": 2195 + }, + { + "epoch": 0.11168788313386047, + "grad_norm": 0.05830993033392446, + "learning_rate": 0.0005583756345177665, + "loss": 0.6846, + "step": 2200 + }, + { + "epoch": 0.11194171923189197, + "grad_norm": 0.044352746447960285, + "learning_rate": 0.0005596446700507615, + "loss": 0.6956, + "step": 2205 + }, + { + "epoch": 0.11219555532992347, + "grad_norm": 0.04064168684790446, + "learning_rate": 0.0005609137055837564, + "loss": 0.6584, + "step": 2210 + }, + { + "epoch": 0.11244939142795497, + "grad_norm": 0.05355535018905834, + "learning_rate": 0.0005621827411167512, + "loss": 0.6551, + "step": 2215 + }, + { + "epoch": 0.11270322752598647, + "grad_norm": 0.04823668883816903, + "learning_rate": 0.0005634517766497462, + "loss": 0.7101, + "step": 2220 + }, + { + "epoch": 0.11295706362401797, + "grad_norm": 0.04507099918690335, + "learning_rate": 0.0005647208121827412, + "loss": 0.6928, + "step": 2225 + }, + { + "epoch": 0.11321089972204948, + "grad_norm": 0.04388054794584642, + "learning_rate": 0.0005659898477157361, + "loss": 0.6937, + "step": 2230 + }, + { + "epoch": 0.11346473582008097, + "grad_norm": 0.04682924867200663, + "learning_rate": 0.0005672588832487309, + "loss": 0.6785, + "step": 2235 + }, + { + "epoch": 0.11371857191811248, + "grad_norm": 0.06604610799465334, + "learning_rate": 0.0005685279187817259, + "loss": 0.6779, + "step": 2240 + }, + { + "epoch": 0.11397240801614397, + "grad_norm": 0.06120863018511214, + "learning_rate": 0.0005697969543147208, + "loss": 0.6533, + "step": 2245 + }, + { + "epoch": 0.11422624411417548, + "grad_norm": 0.05270150831774229, + "learning_rate": 0.0005710659898477158, + "loss": 0.6622, + "step": 2250 + }, + { + "epoch": 0.11448008021220697, + "grad_norm": 0.054970737100826304, + "learning_rate": 0.0005723350253807107, + "loss": 0.682, + "step": 2255 + }, + { + "epoch": 0.11473391631023848, + "grad_norm": 0.04567202978046955, + "learning_rate": 0.0005736040609137056, + "loss": 0.6642, + "step": 2260 + }, + { + "epoch": 0.11498775240826999, + "grad_norm": 0.06810657291430826, + "learning_rate": 0.0005748730964467005, + "loss": 0.7241, + "step": 2265 + }, + { + "epoch": 0.11524158850630148, + "grad_norm": 0.051106730998495775, + "learning_rate": 0.0005761421319796955, + "loss": 0.6964, + "step": 2270 + }, + { + "epoch": 0.11549542460433299, + "grad_norm": 0.05289405047092881, + "learning_rate": 0.0005774111675126904, + "loss": 0.6951, + "step": 2275 + }, + { + "epoch": 0.11574926070236448, + "grad_norm": 0.04803879297706406, + "learning_rate": 0.0005786802030456852, + "loss": 0.668, + "step": 2280 + }, + { + "epoch": 0.11600309680039599, + "grad_norm": 0.05408216249103949, + "learning_rate": 0.0005799492385786802, + "loss": 0.6976, + "step": 2285 + }, + { + "epoch": 0.11625693289842748, + "grad_norm": 0.046893742742678574, + "learning_rate": 0.0005812182741116752, + "loss": 0.6492, + "step": 2290 + }, + { + "epoch": 0.11651076899645899, + "grad_norm": 0.05546942162432547, + "learning_rate": 0.0005824873096446702, + "loss": 0.6905, + "step": 2295 + }, + { + "epoch": 0.11676460509449049, + "grad_norm": 0.044480364722733014, + "learning_rate": 0.0005837563451776649, + "loss": 0.6774, + "step": 2300 + }, + { + "epoch": 0.11701844119252199, + "grad_norm": 0.058930815501948106, + "learning_rate": 0.0005850253807106599, + "loss": 0.7125, + "step": 2305 + }, + { + "epoch": 0.11727227729055349, + "grad_norm": 0.06993707438691858, + "learning_rate": 0.0005862944162436549, + "loss": 0.6672, + "step": 2310 + }, + { + "epoch": 0.11752611338858499, + "grad_norm": 0.07094737512117218, + "learning_rate": 0.0005875634517766498, + "loss": 0.6874, + "step": 2315 + }, + { + "epoch": 0.11777994948661649, + "grad_norm": 0.04776737380254815, + "learning_rate": 0.0005888324873096446, + "loss": 0.6743, + "step": 2320 + }, + { + "epoch": 0.118033785584648, + "grad_norm": 0.04754319568857845, + "learning_rate": 0.0005901015228426396, + "loss": 0.6955, + "step": 2325 + }, + { + "epoch": 0.11828762168267949, + "grad_norm": 0.0530012975882747, + "learning_rate": 0.0005913705583756346, + "loss": 0.6741, + "step": 2330 + }, + { + "epoch": 0.118541457780711, + "grad_norm": 0.047906343503061846, + "learning_rate": 0.0005926395939086295, + "loss": 0.6932, + "step": 2335 + }, + { + "epoch": 0.11879529387874249, + "grad_norm": 0.04482089427430776, + "learning_rate": 0.0005939086294416243, + "loss": 0.6872, + "step": 2340 + }, + { + "epoch": 0.119049129976774, + "grad_norm": 0.04529373048320046, + "learning_rate": 0.0005951776649746193, + "loss": 0.6626, + "step": 2345 + }, + { + "epoch": 0.11930296607480549, + "grad_norm": 0.0476388588473774, + "learning_rate": 0.0005964467005076142, + "loss": 0.6791, + "step": 2350 + }, + { + "epoch": 0.119556802172837, + "grad_norm": 0.04934267321682797, + "learning_rate": 0.0005977157360406092, + "loss": 0.686, + "step": 2355 + }, + { + "epoch": 0.1198106382708685, + "grad_norm": 0.048244390716089255, + "learning_rate": 0.000598984771573604, + "loss": 0.6926, + "step": 2360 + }, + { + "epoch": 0.1200644743689, + "grad_norm": 0.062491881852921594, + "learning_rate": 0.000600253807106599, + "loss": 0.7189, + "step": 2365 + }, + { + "epoch": 0.1203183104669315, + "grad_norm": 0.06931903303967604, + "learning_rate": 0.0006015228426395939, + "loss": 0.712, + "step": 2370 + }, + { + "epoch": 0.120572146564963, + "grad_norm": 0.05423697704542445, + "learning_rate": 0.0006027918781725889, + "loss": 0.6846, + "step": 2375 + }, + { + "epoch": 0.1208259826629945, + "grad_norm": 0.05778931196460261, + "learning_rate": 0.0006040609137055838, + "loss": 0.693, + "step": 2380 + }, + { + "epoch": 0.12107981876102601, + "grad_norm": 0.06091151475086323, + "learning_rate": 0.0006053299492385786, + "loss": 0.6676, + "step": 2385 + }, + { + "epoch": 0.1213336548590575, + "grad_norm": 0.055216368348407444, + "learning_rate": 0.0006065989847715736, + "loss": 0.6724, + "step": 2390 + }, + { + "epoch": 0.12158749095708901, + "grad_norm": 0.04670707090251174, + "learning_rate": 0.0006078680203045686, + "loss": 0.6651, + "step": 2395 + }, + { + "epoch": 0.1218413270551205, + "grad_norm": 0.05746162859057901, + "learning_rate": 0.0006091370558375635, + "loss": 0.6942, + "step": 2400 + }, + { + "epoch": 0.12209516315315201, + "grad_norm": 0.056859124253096104, + "learning_rate": 0.0006104060913705583, + "loss": 0.6698, + "step": 2405 + }, + { + "epoch": 0.1223489992511835, + "grad_norm": 0.05339650908828867, + "learning_rate": 0.0006116751269035533, + "loss": 0.6844, + "step": 2410 + }, + { + "epoch": 0.12260283534921501, + "grad_norm": 0.04544174695117297, + "learning_rate": 0.0006129441624365483, + "loss": 0.681, + "step": 2415 + }, + { + "epoch": 0.12285667144724652, + "grad_norm": 0.047853644230645295, + "learning_rate": 0.0006142131979695432, + "loss": 0.6736, + "step": 2420 + }, + { + "epoch": 0.12311050754527801, + "grad_norm": 0.044716395502977895, + "learning_rate": 0.000615482233502538, + "loss": 0.6692, + "step": 2425 + }, + { + "epoch": 0.12336434364330952, + "grad_norm": 0.0440973721411255, + "learning_rate": 0.000616751269035533, + "loss": 0.6751, + "step": 2430 + }, + { + "epoch": 0.12361817974134101, + "grad_norm": 0.04429030828005958, + "learning_rate": 0.000618020304568528, + "loss": 0.6906, + "step": 2435 + }, + { + "epoch": 0.12387201583937252, + "grad_norm": 0.04787659061724324, + "learning_rate": 0.0006192893401015229, + "loss": 0.6818, + "step": 2440 + }, + { + "epoch": 0.12412585193740402, + "grad_norm": 0.04128004321338866, + "learning_rate": 0.0006205583756345177, + "loss": 0.6588, + "step": 2445 + }, + { + "epoch": 0.12437968803543552, + "grad_norm": 0.061281310808498836, + "learning_rate": 0.0006218274111675127, + "loss": 0.6794, + "step": 2450 + }, + { + "epoch": 0.12463352413346702, + "grad_norm": 0.0452210479730751, + "learning_rate": 0.0006230964467005076, + "loss": 0.6981, + "step": 2455 + }, + { + "epoch": 0.12488736023149852, + "grad_norm": 0.07279557551578562, + "learning_rate": 0.0006243654822335026, + "loss": 0.6425, + "step": 2460 + }, + { + "epoch": 0.12514119632953002, + "grad_norm": 0.050867733799510144, + "learning_rate": 0.0006256345177664974, + "loss": 0.6822, + "step": 2465 + }, + { + "epoch": 0.12539503242756153, + "grad_norm": 0.03997161618281022, + "learning_rate": 0.0006269035532994924, + "loss": 0.6798, + "step": 2470 + }, + { + "epoch": 0.12564886852559304, + "grad_norm": 0.04666231998293777, + "learning_rate": 0.0006281725888324873, + "loss": 0.6669, + "step": 2475 + }, + { + "epoch": 0.12590270462362452, + "grad_norm": 0.06993861990367087, + "learning_rate": 0.0006294416243654823, + "loss": 0.6769, + "step": 2480 + }, + { + "epoch": 0.12615654072165602, + "grad_norm": 0.04460301577499877, + "learning_rate": 0.0006307106598984772, + "loss": 0.6702, + "step": 2485 + }, + { + "epoch": 0.12641037681968753, + "grad_norm": 0.046300009587602504, + "learning_rate": 0.000631979695431472, + "loss": 0.6485, + "step": 2490 + }, + { + "epoch": 0.12666421291771904, + "grad_norm": 0.04525513100008759, + "learning_rate": 0.000633248730964467, + "loss": 0.7022, + "step": 2495 + }, + { + "epoch": 0.12691804901575052, + "grad_norm": 0.046659707052599364, + "learning_rate": 0.000634517766497462, + "loss": 0.6636, + "step": 2500 + }, + { + "epoch": 0.12717188511378202, + "grad_norm": 0.04824843023874754, + "learning_rate": 0.0006357868020304569, + "loss": 0.6485, + "step": 2505 + }, + { + "epoch": 0.12742572121181353, + "grad_norm": 0.04746300692436404, + "learning_rate": 0.0006370558375634517, + "loss": 0.7173, + "step": 2510 + }, + { + "epoch": 0.12767955730984504, + "grad_norm": 0.04431626679908001, + "learning_rate": 0.0006383248730964467, + "loss": 0.6767, + "step": 2515 + }, + { + "epoch": 0.12793339340787654, + "grad_norm": 0.04212599191521632, + "learning_rate": 0.0006395939086294417, + "loss": 0.6486, + "step": 2520 + }, + { + "epoch": 0.12818722950590802, + "grad_norm": 0.044688664389786185, + "learning_rate": 0.0006408629441624366, + "loss": 0.6343, + "step": 2525 + }, + { + "epoch": 0.12844106560393953, + "grad_norm": 0.04418877402589161, + "learning_rate": 0.0006421319796954314, + "loss": 0.6908, + "step": 2530 + }, + { + "epoch": 0.12869490170197104, + "grad_norm": 0.04434768157491682, + "learning_rate": 0.0006434010152284264, + "loss": 0.6997, + "step": 2535 + }, + { + "epoch": 0.12894873780000254, + "grad_norm": 0.04818401670766656, + "learning_rate": 0.0006446700507614214, + "loss": 0.6832, + "step": 2540 + }, + { + "epoch": 0.12920257389803405, + "grad_norm": 0.04591714580956639, + "learning_rate": 0.0006459390862944163, + "loss": 0.662, + "step": 2545 + }, + { + "epoch": 0.12945640999606553, + "grad_norm": 0.050750089447246204, + "learning_rate": 0.0006472081218274111, + "loss": 0.6859, + "step": 2550 + }, + { + "epoch": 0.12971024609409704, + "grad_norm": 0.049535561340610365, + "learning_rate": 0.0006484771573604061, + "loss": 0.6929, + "step": 2555 + }, + { + "epoch": 0.12996408219212854, + "grad_norm": 0.04625672713487381, + "learning_rate": 0.000649746192893401, + "loss": 0.6712, + "step": 2560 + }, + { + "epoch": 0.13021791829016005, + "grad_norm": 0.0443189289054786, + "learning_rate": 0.000651015228426396, + "loss": 0.7355, + "step": 2565 + }, + { + "epoch": 0.13047175438819156, + "grad_norm": 0.05280014157523752, + "learning_rate": 0.0006522842639593908, + "loss": 0.701, + "step": 2570 + }, + { + "epoch": 0.13072559048622304, + "grad_norm": 0.04705146570879352, + "learning_rate": 0.0006535532994923858, + "loss": 0.6695, + "step": 2575 + }, + { + "epoch": 0.13097942658425454, + "grad_norm": 0.0460576955250553, + "learning_rate": 0.0006548223350253807, + "loss": 0.692, + "step": 2580 + }, + { + "epoch": 0.13123326268228605, + "grad_norm": 0.03813344619291145, + "learning_rate": 0.0006560913705583757, + "loss": 0.66, + "step": 2585 + }, + { + "epoch": 0.13148709878031756, + "grad_norm": 0.04969973984569192, + "learning_rate": 0.0006573604060913706, + "loss": 0.6934, + "step": 2590 + }, + { + "epoch": 0.13174093487834904, + "grad_norm": 0.042656040318584894, + "learning_rate": 0.0006586294416243654, + "loss": 0.6615, + "step": 2595 + }, + { + "epoch": 0.13199477097638054, + "grad_norm": 0.04426457599994935, + "learning_rate": 0.0006598984771573604, + "loss": 0.7075, + "step": 2600 + }, + { + "epoch": 0.13224860707441205, + "grad_norm": 0.04911188070281771, + "learning_rate": 0.0006611675126903554, + "loss": 0.663, + "step": 2605 + }, + { + "epoch": 0.13250244317244356, + "grad_norm": 0.045086612880435376, + "learning_rate": 0.0006624365482233503, + "loss": 0.6769, + "step": 2610 + }, + { + "epoch": 0.13275627927047506, + "grad_norm": 0.0806263949064106, + "learning_rate": 0.0006637055837563451, + "loss": 0.6701, + "step": 2615 + }, + { + "epoch": 0.13301011536850654, + "grad_norm": 0.09236793730312937, + "learning_rate": 0.0006649746192893401, + "loss": 0.648, + "step": 2620 + }, + { + "epoch": 0.13326395146653805, + "grad_norm": 0.045322993172678835, + "learning_rate": 0.0006662436548223351, + "loss": 0.6718, + "step": 2625 + }, + { + "epoch": 0.13351778756456956, + "grad_norm": 0.04182199738451879, + "learning_rate": 0.0006675126903553299, + "loss": 0.6724, + "step": 2630 + }, + { + "epoch": 0.13377162366260106, + "grad_norm": 0.042921119598924244, + "learning_rate": 0.0006687817258883248, + "loss": 0.6813, + "step": 2635 + }, + { + "epoch": 0.13402545976063257, + "grad_norm": 0.10282609717664869, + "learning_rate": 0.0006700507614213198, + "loss": 0.6714, + "step": 2640 + }, + { + "epoch": 0.13427929585866405, + "grad_norm": 0.05297792239316466, + "learning_rate": 0.0006713197969543148, + "loss": 0.6872, + "step": 2645 + }, + { + "epoch": 0.13453313195669556, + "grad_norm": 0.10242225950893688, + "learning_rate": 0.0006725888324873097, + "loss": 0.6771, + "step": 2650 + }, + { + "epoch": 0.13478696805472706, + "grad_norm": 0.0603931866469296, + "learning_rate": 0.0006738578680203045, + "loss": 0.6974, + "step": 2655 + }, + { + "epoch": 0.13504080415275857, + "grad_norm": 0.04679504191127658, + "learning_rate": 0.0006751269035532995, + "loss": 0.7012, + "step": 2660 + }, + { + "epoch": 0.13529464025079008, + "grad_norm": 0.04709498367472285, + "learning_rate": 0.0006763959390862944, + "loss": 0.6706, + "step": 2665 + }, + { + "epoch": 0.13554847634882156, + "grad_norm": 0.0564473112589115, + "learning_rate": 0.0006776649746192894, + "loss": 0.6901, + "step": 2670 + }, + { + "epoch": 0.13580231244685306, + "grad_norm": 0.06027753857126253, + "learning_rate": 0.0006789340101522842, + "loss": 0.6819, + "step": 2675 + }, + { + "epoch": 0.13605614854488457, + "grad_norm": 0.06314337231356607, + "learning_rate": 0.0006802030456852792, + "loss": 0.6699, + "step": 2680 + }, + { + "epoch": 0.13630998464291608, + "grad_norm": 0.057435222380299925, + "learning_rate": 0.0006814720812182741, + "loss": 0.6497, + "step": 2685 + }, + { + "epoch": 0.13656382074094758, + "grad_norm": 0.04260996003530418, + "learning_rate": 0.0006827411167512691, + "loss": 0.6658, + "step": 2690 + }, + { + "epoch": 0.13681765683897906, + "grad_norm": 0.04258191163717926, + "learning_rate": 0.000684010152284264, + "loss": 0.6924, + "step": 2695 + }, + { + "epoch": 0.13707149293701057, + "grad_norm": 0.05691730259771199, + "learning_rate": 0.0006852791878172588, + "loss": 0.6983, + "step": 2700 + }, + { + "epoch": 0.13732532903504208, + "grad_norm": 0.043909736858620214, + "learning_rate": 0.0006865482233502538, + "loss": 0.6979, + "step": 2705 + }, + { + "epoch": 0.13757916513307358, + "grad_norm": 0.047090946914990356, + "learning_rate": 0.0006878172588832488, + "loss": 0.6779, + "step": 2710 + }, + { + "epoch": 0.13783300123110506, + "grad_norm": 0.047548729011293284, + "learning_rate": 0.0006890862944162437, + "loss": 0.6499, + "step": 2715 + }, + { + "epoch": 0.13808683732913657, + "grad_norm": 0.06616744593038089, + "learning_rate": 0.0006903553299492385, + "loss": 0.6969, + "step": 2720 + }, + { + "epoch": 0.13834067342716808, + "grad_norm": 0.0703509579934864, + "learning_rate": 0.0006916243654822335, + "loss": 0.652, + "step": 2725 + }, + { + "epoch": 0.13859450952519958, + "grad_norm": 0.04246140238019912, + "learning_rate": 0.0006928934010152285, + "loss": 0.6682, + "step": 2730 + }, + { + "epoch": 0.1388483456232311, + "grad_norm": 0.04135755013487853, + "learning_rate": 0.0006941624365482235, + "loss": 0.6556, + "step": 2735 + }, + { + "epoch": 0.13910218172126257, + "grad_norm": 0.04205045142396158, + "learning_rate": 0.0006954314720812182, + "loss": 0.7063, + "step": 2740 + }, + { + "epoch": 0.13935601781929408, + "grad_norm": 0.05001019855676361, + "learning_rate": 0.0006967005076142132, + "loss": 0.7266, + "step": 2745 + }, + { + "epoch": 0.13960985391732558, + "grad_norm": 0.05281826891896312, + "learning_rate": 0.0006979695431472082, + "loss": 0.685, + "step": 2750 + }, + { + "epoch": 0.1398636900153571, + "grad_norm": 0.08165529805729099, + "learning_rate": 0.0006992385786802031, + "loss": 0.6946, + "step": 2755 + }, + { + "epoch": 0.1401175261133886, + "grad_norm": 0.06359142712647757, + "learning_rate": 0.0007005076142131979, + "loss": 0.691, + "step": 2760 + }, + { + "epoch": 0.14037136221142008, + "grad_norm": 0.04783534944187642, + "learning_rate": 0.0007017766497461929, + "loss": 0.6572, + "step": 2765 + }, + { + "epoch": 0.14062519830945158, + "grad_norm": 0.04836474183739573, + "learning_rate": 0.0007030456852791879, + "loss": 0.6499, + "step": 2770 + }, + { + "epoch": 0.1408790344074831, + "grad_norm": 0.04622813948916543, + "learning_rate": 0.0007043147208121828, + "loss": 0.6722, + "step": 2775 + }, + { + "epoch": 0.1411328705055146, + "grad_norm": 0.044196925779096605, + "learning_rate": 0.0007055837563451776, + "loss": 0.6927, + "step": 2780 + }, + { + "epoch": 0.1413867066035461, + "grad_norm": 0.0563954116918076, + "learning_rate": 0.0007068527918781726, + "loss": 0.6773, + "step": 2785 + }, + { + "epoch": 0.14164054270157758, + "grad_norm": 0.05630324142592108, + "learning_rate": 0.0007081218274111675, + "loss": 0.6948, + "step": 2790 + }, + { + "epoch": 0.1418943787996091, + "grad_norm": 0.04380530703415174, + "learning_rate": 0.0007093908629441625, + "loss": 0.6608, + "step": 2795 + }, + { + "epoch": 0.1421482148976406, + "grad_norm": 0.09238879549470941, + "learning_rate": 0.0007106598984771574, + "loss": 0.7379, + "step": 2800 + }, + { + "epoch": 0.1424020509956721, + "grad_norm": 0.08771073436704688, + "learning_rate": 0.0007119289340101523, + "loss": 0.7167, + "step": 2805 + }, + { + "epoch": 0.1426558870937036, + "grad_norm": 0.06359201377032256, + "learning_rate": 0.0007131979695431472, + "loss": 0.7088, + "step": 2810 + }, + { + "epoch": 0.1429097231917351, + "grad_norm": 0.1200886744339994, + "learning_rate": 0.0007144670050761422, + "loss": 0.6847, + "step": 2815 + }, + { + "epoch": 0.1431635592897666, + "grad_norm": 0.138281486633561, + "learning_rate": 0.0007157360406091371, + "loss": 0.6848, + "step": 2820 + }, + { + "epoch": 0.1434173953877981, + "grad_norm": 0.04480938074303019, + "learning_rate": 0.0007170050761421319, + "loss": 0.6904, + "step": 2825 + }, + { + "epoch": 0.1436712314858296, + "grad_norm": 21.175586578852744, + "learning_rate": 0.0007182741116751269, + "loss": 0.7594, + "step": 2830 + }, + { + "epoch": 0.1439250675838611, + "grad_norm": 0.06295691464093359, + "learning_rate": 0.0007195431472081219, + "loss": 0.7236, + "step": 2835 + }, + { + "epoch": 0.1441789036818926, + "grad_norm": 0.10004928301513774, + "learning_rate": 0.0007208121827411168, + "loss": 0.7397, + "step": 2840 + }, + { + "epoch": 0.1444327397799241, + "grad_norm": 0.46538792031189885, + "learning_rate": 0.0007220812182741116, + "loss": 0.8646, + "step": 2845 + }, + { + "epoch": 0.1446865758779556, + "grad_norm": 0.4358423651644742, + "learning_rate": 0.0007233502538071066, + "loss": 0.8406, + "step": 2850 + }, + { + "epoch": 0.14494041197598712, + "grad_norm": 0.10845670895421451, + "learning_rate": 0.0007246192893401016, + "loss": 0.785, + "step": 2855 + }, + { + "epoch": 0.1451942480740186, + "grad_norm": 0.06788040013815881, + "learning_rate": 0.0007258883248730965, + "loss": 0.7328, + "step": 2860 + }, + { + "epoch": 0.1454480841720501, + "grad_norm": 0.07189189427731303, + "learning_rate": 0.0007271573604060913, + "loss": 0.7376, + "step": 2865 + }, + { + "epoch": 0.1457019202700816, + "grad_norm": 0.04909596292317751, + "learning_rate": 0.0007284263959390863, + "loss": 0.7061, + "step": 2870 + }, + { + "epoch": 0.14595575636811312, + "grad_norm": 0.05537834807483057, + "learning_rate": 0.0007296954314720813, + "loss": 0.7313, + "step": 2875 + }, + { + "epoch": 0.14620959246614462, + "grad_norm": 0.2299897910231885, + "learning_rate": 0.0007309644670050762, + "loss": 1.4098, + "step": 2880 + }, + { + "epoch": 0.1464634285641761, + "grad_norm": 0.23582707917647705, + "learning_rate": 0.000732233502538071, + "loss": 0.8452, + "step": 2885 + }, + { + "epoch": 0.1467172646622076, + "grad_norm": 0.11644453166636645, + "learning_rate": 0.000733502538071066, + "loss": 0.7672, + "step": 2890 + }, + { + "epoch": 0.14697110076023912, + "grad_norm": 0.09392008019229685, + "learning_rate": 0.0007347715736040609, + "loss": 0.744, + "step": 2895 + }, + { + "epoch": 0.14722493685827062, + "grad_norm": 0.09298026761511366, + "learning_rate": 0.0007360406091370559, + "loss": 0.7396, + "step": 2900 + }, + { + "epoch": 0.14747877295630213, + "grad_norm": 0.058376952983496454, + "learning_rate": 0.0007373096446700508, + "loss": 0.7174, + "step": 2905 + }, + { + "epoch": 0.1477326090543336, + "grad_norm": 0.05509134155546014, + "learning_rate": 0.0007385786802030457, + "loss": 0.7056, + "step": 2910 + }, + { + "epoch": 0.14798644515236511, + "grad_norm": 0.04295937409763232, + "learning_rate": 0.0007398477157360406, + "loss": 0.7003, + "step": 2915 + }, + { + "epoch": 0.14824028125039662, + "grad_norm": 0.050074822465611714, + "learning_rate": 0.0007411167512690356, + "loss": 0.6936, + "step": 2920 + }, + { + "epoch": 0.14849411734842813, + "grad_norm": 0.06561188123909512, + "learning_rate": 0.0007423857868020305, + "loss": 0.69, + "step": 2925 + }, + { + "epoch": 0.1487479534464596, + "grad_norm": 0.05110790900431651, + "learning_rate": 0.0007436548223350253, + "loss": 0.7239, + "step": 2930 + }, + { + "epoch": 0.14900178954449111, + "grad_norm": 0.053991352186369024, + "learning_rate": 0.0007449238578680203, + "loss": 0.7211, + "step": 2935 + }, + { + "epoch": 0.14925562564252262, + "grad_norm": 0.07363083960749695, + "learning_rate": 0.0007461928934010153, + "loss": 0.6989, + "step": 2940 + }, + { + "epoch": 0.14950946174055413, + "grad_norm": 0.06727590148650675, + "learning_rate": 0.0007474619289340102, + "loss": 0.7114, + "step": 2945 + }, + { + "epoch": 0.14976329783858564, + "grad_norm": 0.057979540617668884, + "learning_rate": 0.000748730964467005, + "loss": 0.7061, + "step": 2950 + }, + { + "epoch": 0.15001713393661711, + "grad_norm": 0.059049940642629514, + "learning_rate": 0.00075, + "loss": 0.7095, + "step": 2955 + }, + { + "epoch": 0.15027097003464862, + "grad_norm": 0.0653578877080519, + "learning_rate": 0.000751269035532995, + "loss": 0.7319, + "step": 2960 + }, + { + "epoch": 0.15052480613268013, + "grad_norm": 0.06333871135346043, + "learning_rate": 0.0007525380710659899, + "loss": 0.7417, + "step": 2965 + }, + { + "epoch": 0.15077864223071163, + "grad_norm": 0.0728096512372998, + "learning_rate": 0.0007538071065989847, + "loss": 0.7424, + "step": 2970 + }, + { + "epoch": 0.15103247832874314, + "grad_norm": 0.041806345943965685, + "learning_rate": 0.0007550761421319797, + "loss": 0.6842, + "step": 2975 + }, + { + "epoch": 0.15128631442677462, + "grad_norm": 0.048509525473583434, + "learning_rate": 0.0007563451776649747, + "loss": 0.7402, + "step": 2980 + }, + { + "epoch": 0.15154015052480613, + "grad_norm": 0.06583937758938846, + "learning_rate": 0.0007576142131979696, + "loss": 0.7292, + "step": 2985 + }, + { + "epoch": 0.15179398662283763, + "grad_norm": 0.04468189699925895, + "learning_rate": 0.0007588832487309644, + "loss": 0.7317, + "step": 2990 + }, + { + "epoch": 0.15204782272086914, + "grad_norm": 0.047287822361576956, + "learning_rate": 0.0007601522842639594, + "loss": 0.6964, + "step": 2995 + }, + { + "epoch": 0.15230165881890065, + "grad_norm": 0.03938327148057227, + "learning_rate": 0.0007614213197969543, + "loss": 0.7265, + "step": 3000 + }, + { + "epoch": 0.15255549491693213, + "grad_norm": 0.03972449267379906, + "learning_rate": 0.0007626903553299493, + "loss": 0.6827, + "step": 3005 + }, + { + "epoch": 0.15280933101496363, + "grad_norm": 0.04711089033081518, + "learning_rate": 0.0007639593908629442, + "loss": 0.7007, + "step": 3010 + }, + { + "epoch": 0.15306316711299514, + "grad_norm": 0.06336031295625702, + "learning_rate": 0.0007652284263959391, + "loss": 0.6908, + "step": 3015 + }, + { + "epoch": 0.15331700321102665, + "grad_norm": 0.03736772924042978, + "learning_rate": 0.000766497461928934, + "loss": 0.689, + "step": 3020 + }, + { + "epoch": 0.15357083930905815, + "grad_norm": 0.041934899038226205, + "learning_rate": 0.000767766497461929, + "loss": 0.7015, + "step": 3025 + }, + { + "epoch": 0.15382467540708963, + "grad_norm": 0.04211578318800713, + "learning_rate": 0.0007690355329949239, + "loss": 0.6677, + "step": 3030 + }, + { + "epoch": 0.15407851150512114, + "grad_norm": 0.04215189545395842, + "learning_rate": 0.0007703045685279187, + "loss": 0.6836, + "step": 3035 + }, + { + "epoch": 0.15433234760315265, + "grad_norm": 0.05201347380185946, + "learning_rate": 0.0007715736040609137, + "loss": 0.7107, + "step": 3040 + }, + { + "epoch": 0.15458618370118415, + "grad_norm": 0.04761766831274283, + "learning_rate": 0.0007728426395939087, + "loss": 0.7119, + "step": 3045 + }, + { + "epoch": 0.15484001979921563, + "grad_norm": 0.03933638281586551, + "learning_rate": 0.0007741116751269036, + "loss": 0.6738, + "step": 3050 + }, + { + "epoch": 0.15509385589724714, + "grad_norm": 0.11996962799432084, + "learning_rate": 0.0007753807106598984, + "loss": 0.6781, + "step": 3055 + }, + { + "epoch": 0.15534769199527865, + "grad_norm": 0.05837163303706869, + "learning_rate": 0.0007766497461928934, + "loss": 0.6902, + "step": 3060 + }, + { + "epoch": 0.15560152809331015, + "grad_norm": 0.06028633251581502, + "learning_rate": 0.0007779187817258884, + "loss": 0.6698, + "step": 3065 + }, + { + "epoch": 0.15585536419134166, + "grad_norm": 0.07092481098060303, + "learning_rate": 0.0007791878172588833, + "loss": 0.6567, + "step": 3070 + }, + { + "epoch": 0.15610920028937314, + "grad_norm": 0.04707521754835134, + "learning_rate": 0.0007804568527918781, + "loss": 0.6966, + "step": 3075 + }, + { + "epoch": 0.15636303638740465, + "grad_norm": 0.047329839412153664, + "learning_rate": 0.0007817258883248731, + "loss": 0.7177, + "step": 3080 + }, + { + "epoch": 0.15661687248543615, + "grad_norm": 0.04088170174383998, + "learning_rate": 0.0007829949238578681, + "loss": 0.6803, + "step": 3085 + }, + { + "epoch": 0.15687070858346766, + "grad_norm": 0.038345774566105946, + "learning_rate": 0.000784263959390863, + "loss": 0.6656, + "step": 3090 + }, + { + "epoch": 0.15712454468149917, + "grad_norm": 0.04514482638739989, + "learning_rate": 0.0007855329949238578, + "loss": 0.7314, + "step": 3095 + }, + { + "epoch": 0.15737838077953065, + "grad_norm": 0.04006321580374188, + "learning_rate": 0.0007868020304568528, + "loss": 0.6747, + "step": 3100 + }, + { + "epoch": 0.15763221687756215, + "grad_norm": 0.05067763436233774, + "learning_rate": 0.0007880710659898477, + "loss": 0.7088, + "step": 3105 + }, + { + "epoch": 0.15788605297559366, + "grad_norm": 0.03882503010997677, + "learning_rate": 0.0007893401015228427, + "loss": 0.6857, + "step": 3110 + }, + { + "epoch": 0.15813988907362517, + "grad_norm": 0.041915161141761616, + "learning_rate": 0.0007906091370558376, + "loss": 0.6804, + "step": 3115 + }, + { + "epoch": 0.15839372517165667, + "grad_norm": 0.055655601622924857, + "learning_rate": 0.0007918781725888325, + "loss": 0.7128, + "step": 3120 + }, + { + "epoch": 0.15864756126968815, + "grad_norm": 0.09341393989640125, + "learning_rate": 0.0007931472081218274, + "loss": 0.6803, + "step": 3125 + }, + { + "epoch": 0.15890139736771966, + "grad_norm": 0.06199349132436108, + "learning_rate": 0.0007944162436548224, + "loss": 0.7121, + "step": 3130 + }, + { + "epoch": 0.15915523346575117, + "grad_norm": 0.08342488615098623, + "learning_rate": 0.0007956852791878173, + "loss": 0.631, + "step": 3135 + }, + { + "epoch": 0.15940906956378267, + "grad_norm": 0.06702222456990227, + "learning_rate": 0.0007969543147208121, + "loss": 0.681, + "step": 3140 + }, + { + "epoch": 0.15966290566181415, + "grad_norm": 0.051322791325766115, + "learning_rate": 0.0007982233502538071, + "loss": 0.6961, + "step": 3145 + }, + { + "epoch": 0.15991674175984566, + "grad_norm": 0.05093510864847829, + "learning_rate": 0.0007994923857868021, + "loss": 0.6924, + "step": 3150 + }, + { + "epoch": 0.16017057785787717, + "grad_norm": 0.05191372708283371, + "learning_rate": 0.000800761421319797, + "loss": 0.6651, + "step": 3155 + }, + { + "epoch": 0.16042441395590867, + "grad_norm": 0.065309480406257, + "learning_rate": 0.0008020304568527918, + "loss": 0.6873, + "step": 3160 + }, + { + "epoch": 0.16067825005394018, + "grad_norm": 0.04035360012723001, + "learning_rate": 0.0008032994923857868, + "loss": 0.7014, + "step": 3165 + }, + { + "epoch": 0.16093208615197166, + "grad_norm": 0.03787636570491385, + "learning_rate": 0.0008045685279187818, + "loss": 0.7184, + "step": 3170 + }, + { + "epoch": 0.16118592225000317, + "grad_norm": 0.04892199114261499, + "learning_rate": 0.0008058375634517766, + "loss": 0.7074, + "step": 3175 + }, + { + "epoch": 0.16143975834803467, + "grad_norm": 0.044454591373735045, + "learning_rate": 0.0008071065989847715, + "loss": 0.6681, + "step": 3180 + }, + { + "epoch": 0.16169359444606618, + "grad_norm": 0.05477810689691977, + "learning_rate": 0.0008083756345177665, + "loss": 0.7073, + "step": 3185 + }, + { + "epoch": 0.1619474305440977, + "grad_norm": 0.1561533627636135, + "learning_rate": 0.0008096446700507615, + "loss": 0.7062, + "step": 3190 + }, + { + "epoch": 0.16220126664212917, + "grad_norm": 0.07695082260270414, + "learning_rate": 0.0008109137055837564, + "loss": 0.6695, + "step": 3195 + }, + { + "epoch": 0.16245510274016067, + "grad_norm": 0.0675592941811925, + "learning_rate": 0.0008121827411167512, + "loss": 0.6852, + "step": 3200 + }, + { + "epoch": 0.16270893883819218, + "grad_norm": 0.052732094884202066, + "learning_rate": 0.0008134517766497462, + "loss": 0.7046, + "step": 3205 + }, + { + "epoch": 0.1629627749362237, + "grad_norm": 0.055496608441699284, + "learning_rate": 0.0008147208121827412, + "loss": 0.6916, + "step": 3210 + }, + { + "epoch": 0.1632166110342552, + "grad_norm": 0.043427396820308814, + "learning_rate": 0.0008159898477157361, + "loss": 0.7338, + "step": 3215 + }, + { + "epoch": 0.16347044713228667, + "grad_norm": 0.04788846665606786, + "learning_rate": 0.000817258883248731, + "loss": 0.7164, + "step": 3220 + }, + { + "epoch": 0.16372428323031818, + "grad_norm": 0.05499792725011934, + "learning_rate": 0.0008185279187817259, + "loss": 0.6441, + "step": 3225 + }, + { + "epoch": 0.1639781193283497, + "grad_norm": 0.07060079550345069, + "learning_rate": 0.0008197969543147208, + "loss": 0.6704, + "step": 3230 + }, + { + "epoch": 0.1642319554263812, + "grad_norm": 0.04697902634882019, + "learning_rate": 0.0008210659898477158, + "loss": 0.6896, + "step": 3235 + }, + { + "epoch": 0.1644857915244127, + "grad_norm": 0.03693026821420421, + "learning_rate": 0.0008223350253807107, + "loss": 0.6979, + "step": 3240 + }, + { + "epoch": 0.16473962762244418, + "grad_norm": 0.044236930949549086, + "learning_rate": 0.0008236040609137056, + "loss": 0.6727, + "step": 3245 + }, + { + "epoch": 0.16499346372047569, + "grad_norm": 0.04297691167933406, + "learning_rate": 0.0008248730964467005, + "loss": 0.7183, + "step": 3250 + }, + { + "epoch": 0.1652472998185072, + "grad_norm": 0.03926872438952914, + "learning_rate": 0.0008261421319796955, + "loss": 0.6678, + "step": 3255 + }, + { + "epoch": 0.1655011359165387, + "grad_norm": 0.058748589368983826, + "learning_rate": 0.0008274111675126904, + "loss": 0.6859, + "step": 3260 + }, + { + "epoch": 0.16575497201457018, + "grad_norm": 0.043071867704478226, + "learning_rate": 0.0008286802030456852, + "loss": 0.6584, + "step": 3265 + }, + { + "epoch": 0.16600880811260169, + "grad_norm": 0.036727442528139344, + "learning_rate": 0.0008299492385786802, + "loss": 0.6644, + "step": 3270 + }, + { + "epoch": 0.1662626442106332, + "grad_norm": 0.045406805927585635, + "learning_rate": 0.0008312182741116752, + "loss": 0.6902, + "step": 3275 + }, + { + "epoch": 0.1665164803086647, + "grad_norm": 0.03886588985080634, + "learning_rate": 0.0008324873096446702, + "loss": 0.6912, + "step": 3280 + }, + { + "epoch": 0.1667703164066962, + "grad_norm": 0.03860450090032993, + "learning_rate": 0.0008337563451776649, + "loss": 0.6918, + "step": 3285 + }, + { + "epoch": 0.16702415250472769, + "grad_norm": 0.03919405708529676, + "learning_rate": 0.0008350253807106599, + "loss": 0.7097, + "step": 3290 + }, + { + "epoch": 0.1672779886027592, + "grad_norm": 0.06384676391238123, + "learning_rate": 0.0008362944162436549, + "loss": 0.6769, + "step": 3295 + }, + { + "epoch": 0.1675318247007907, + "grad_norm": 0.04387951589525683, + "learning_rate": 0.0008375634517766498, + "loss": 0.7001, + "step": 3300 + }, + { + "epoch": 0.1677856607988222, + "grad_norm": 0.04020760640485976, + "learning_rate": 0.0008388324873096446, + "loss": 0.7, + "step": 3305 + }, + { + "epoch": 0.1680394968968537, + "grad_norm": 0.04343991292299841, + "learning_rate": 0.0008401015228426396, + "loss": 0.6653, + "step": 3310 + }, + { + "epoch": 0.1682933329948852, + "grad_norm": 0.04877171128678498, + "learning_rate": 0.0008413705583756346, + "loss": 0.6491, + "step": 3315 + }, + { + "epoch": 0.1685471690929167, + "grad_norm": 0.04489329625384213, + "learning_rate": 0.0008426395939086295, + "loss": 0.6944, + "step": 3320 + }, + { + "epoch": 0.1688010051909482, + "grad_norm": 0.03924346550703404, + "learning_rate": 0.0008439086294416243, + "loss": 0.7004, + "step": 3325 + }, + { + "epoch": 0.1690548412889797, + "grad_norm": 0.0351682478604759, + "learning_rate": 0.0008451776649746193, + "loss": 0.6289, + "step": 3330 + }, + { + "epoch": 0.16930867738701122, + "grad_norm": 0.05804265280485358, + "learning_rate": 0.0008464467005076142, + "loss": 0.6819, + "step": 3335 + }, + { + "epoch": 0.1695625134850427, + "grad_norm": 0.05558949651884164, + "learning_rate": 0.0008477157360406092, + "loss": 0.6995, + "step": 3340 + }, + { + "epoch": 0.1698163495830742, + "grad_norm": 0.07239860999828403, + "learning_rate": 0.0008489847715736041, + "loss": 0.698, + "step": 3345 + }, + { + "epoch": 0.1700701856811057, + "grad_norm": 0.04268065537159936, + "learning_rate": 0.000850253807106599, + "loss": 0.6724, + "step": 3350 + }, + { + "epoch": 0.17032402177913722, + "grad_norm": 0.04596502747992512, + "learning_rate": 0.0008515228426395939, + "loss": 0.715, + "step": 3355 + }, + { + "epoch": 0.17057785787716873, + "grad_norm": 0.07978824559788683, + "learning_rate": 0.0008527918781725889, + "loss": 0.6687, + "step": 3360 + }, + { + "epoch": 0.1708316939752002, + "grad_norm": 0.03941742907400371, + "learning_rate": 0.0008540609137055838, + "loss": 0.6799, + "step": 3365 + }, + { + "epoch": 0.1710855300732317, + "grad_norm": 0.0527239971063264, + "learning_rate": 0.0008553299492385786, + "loss": 0.7063, + "step": 3370 + }, + { + "epoch": 0.17133936617126322, + "grad_norm": 3.822834197350998, + "learning_rate": 0.0008565989847715736, + "loss": 0.696, + "step": 3375 + }, + { + "epoch": 0.17159320226929473, + "grad_norm": 0.083121560754106, + "learning_rate": 0.0008578680203045686, + "loss": 0.6914, + "step": 3380 + }, + { + "epoch": 0.1718470383673262, + "grad_norm": 0.06454785794212209, + "learning_rate": 0.0008591370558375635, + "loss": 0.7085, + "step": 3385 + }, + { + "epoch": 0.1721008744653577, + "grad_norm": 0.03945210916226416, + "learning_rate": 0.0008604060913705583, + "loss": 0.682, + "step": 3390 + }, + { + "epoch": 0.17235471056338922, + "grad_norm": 0.04193981751233244, + "learning_rate": 0.0008616751269035533, + "loss": 0.666, + "step": 3395 + }, + { + "epoch": 0.17260854666142073, + "grad_norm": 0.03967018442585797, + "learning_rate": 0.0008629441624365483, + "loss": 0.6608, + "step": 3400 + }, + { + "epoch": 0.17286238275945223, + "grad_norm": 0.04644004597796404, + "learning_rate": 0.0008642131979695432, + "loss": 0.6535, + "step": 3405 + }, + { + "epoch": 0.1731162188574837, + "grad_norm": 0.03873588952866688, + "learning_rate": 0.000865482233502538, + "loss": 0.7228, + "step": 3410 + }, + { + "epoch": 0.17337005495551522, + "grad_norm": 0.31833461711624433, + "learning_rate": 0.000866751269035533, + "loss": 0.7115, + "step": 3415 + }, + { + "epoch": 0.17362389105354673, + "grad_norm": 0.35935016596831215, + "learning_rate": 0.000868020304568528, + "loss": 0.6879, + "step": 3420 + }, + { + "epoch": 0.17387772715157823, + "grad_norm": 0.04780743742889909, + "learning_rate": 0.0008692893401015229, + "loss": 0.6907, + "step": 3425 + }, + { + "epoch": 0.17413156324960974, + "grad_norm": 0.04110231970405351, + "learning_rate": 0.0008705583756345177, + "loss": 0.7238, + "step": 3430 + }, + { + "epoch": 0.17438539934764122, + "grad_norm": 0.046525580251483005, + "learning_rate": 0.0008718274111675127, + "loss": 0.6778, + "step": 3435 + }, + { + "epoch": 0.17463923544567272, + "grad_norm": 0.041718923819856486, + "learning_rate": 0.0008730964467005076, + "loss": 0.6486, + "step": 3440 + }, + { + "epoch": 0.17489307154370423, + "grad_norm": 0.04055383524511006, + "learning_rate": 0.0008743654822335026, + "loss": 0.6968, + "step": 3445 + }, + { + "epoch": 0.17514690764173574, + "grad_norm": 0.045736208992562415, + "learning_rate": 0.0008756345177664975, + "loss": 0.6675, + "step": 3450 + }, + { + "epoch": 0.17540074373976725, + "grad_norm": 0.06303424703405444, + "learning_rate": 0.0008769035532994924, + "loss": 0.7083, + "step": 3455 + }, + { + "epoch": 0.17565457983779872, + "grad_norm": 0.03565589845558804, + "learning_rate": 0.0008781725888324873, + "loss": 0.6513, + "step": 3460 + }, + { + "epoch": 0.17590841593583023, + "grad_norm": 0.039028072883575, + "learning_rate": 0.0008794416243654823, + "loss": 0.6768, + "step": 3465 + }, + { + "epoch": 0.17616225203386174, + "grad_norm": 0.03826237652273635, + "learning_rate": 0.0008807106598984772, + "loss": 0.6931, + "step": 3470 + }, + { + "epoch": 0.17641608813189325, + "grad_norm": 0.03786297862349843, + "learning_rate": 0.000881979695431472, + "loss": 0.6665, + "step": 3475 + }, + { + "epoch": 0.17666992422992472, + "grad_norm": 0.035346860951597725, + "learning_rate": 0.000883248730964467, + "loss": 0.6739, + "step": 3480 + }, + { + "epoch": 0.17692376032795623, + "grad_norm": 0.038526863874646516, + "learning_rate": 0.000884517766497462, + "loss": 0.665, + "step": 3485 + }, + { + "epoch": 0.17717759642598774, + "grad_norm": 0.06026897162610092, + "learning_rate": 0.0008857868020304569, + "loss": 0.68, + "step": 3490 + }, + { + "epoch": 0.17743143252401924, + "grad_norm": 0.05069239821159444, + "learning_rate": 0.0008870558375634517, + "loss": 0.7229, + "step": 3495 + }, + { + "epoch": 0.17768526862205075, + "grad_norm": 0.09985009714735808, + "learning_rate": 0.0008883248730964467, + "loss": 0.6917, + "step": 3500 + }, + { + "epoch": 0.17793910472008223, + "grad_norm": 0.10170892988817608, + "learning_rate": 0.0008895939086294417, + "loss": 0.7233, + "step": 3505 + }, + { + "epoch": 0.17819294081811374, + "grad_norm": 0.07075066250119805, + "learning_rate": 0.0008908629441624366, + "loss": 0.669, + "step": 3510 + }, + { + "epoch": 0.17844677691614524, + "grad_norm": 0.08757273613751611, + "learning_rate": 0.0008921319796954314, + "loss": 0.6676, + "step": 3515 + }, + { + "epoch": 0.17870061301417675, + "grad_norm": 0.038714672890372746, + "learning_rate": 0.0008934010152284264, + "loss": 0.7115, + "step": 3520 + }, + { + "epoch": 0.17895444911220826, + "grad_norm": 0.04036233500547512, + "learning_rate": 0.0008946700507614214, + "loss": 0.6783, + "step": 3525 + }, + { + "epoch": 0.17920828521023974, + "grad_norm": 0.06230842789641548, + "learning_rate": 0.0008959390862944163, + "loss": 0.669, + "step": 3530 + }, + { + "epoch": 0.17946212130827124, + "grad_norm": 0.07454575938873274, + "learning_rate": 0.0008972081218274111, + "loss": 0.7506, + "step": 3535 + }, + { + "epoch": 0.17971595740630275, + "grad_norm": 0.06240895100177482, + "learning_rate": 0.0008984771573604061, + "loss": 0.7194, + "step": 3540 + }, + { + "epoch": 0.17996979350433426, + "grad_norm": 0.04630554537296931, + "learning_rate": 0.000899746192893401, + "loss": 0.7211, + "step": 3545 + }, + { + "epoch": 0.18022362960236576, + "grad_norm": 0.056718557931670986, + "learning_rate": 0.000901015228426396, + "loss": 0.7443, + "step": 3550 + }, + { + "epoch": 0.18047746570039724, + "grad_norm": 0.053855068524236355, + "learning_rate": 0.0009022842639593909, + "loss": 0.6938, + "step": 3555 + }, + { + "epoch": 0.18073130179842875, + "grad_norm": 0.04098910289114666, + "learning_rate": 0.0009035532994923858, + "loss": 0.6882, + "step": 3560 + }, + { + "epoch": 0.18098513789646026, + "grad_norm": 0.043572400017178894, + "learning_rate": 0.0009048223350253807, + "loss": 0.7237, + "step": 3565 + }, + { + "epoch": 0.18123897399449176, + "grad_norm": 0.05122515570696231, + "learning_rate": 0.0009060913705583757, + "loss": 0.6965, + "step": 3570 + }, + { + "epoch": 0.18149281009252327, + "grad_norm": 0.0486904734197597, + "learning_rate": 0.0009073604060913706, + "loss": 0.6821, + "step": 3575 + }, + { + "epoch": 0.18174664619055475, + "grad_norm": 0.14211715338289158, + "learning_rate": 0.0009086294416243654, + "loss": 0.6776, + "step": 3580 + }, + { + "epoch": 0.18200048228858626, + "grad_norm": 0.04140861099483773, + "learning_rate": 0.0009098984771573604, + "loss": 0.716, + "step": 3585 + }, + { + "epoch": 0.18225431838661776, + "grad_norm": 0.05317734506655789, + "learning_rate": 0.0009111675126903554, + "loss": 0.7328, + "step": 3590 + }, + { + "epoch": 0.18250815448464927, + "grad_norm": 0.04412671048588959, + "learning_rate": 0.0009124365482233503, + "loss": 0.7013, + "step": 3595 + }, + { + "epoch": 0.18276199058268075, + "grad_norm": 0.043226638147343656, + "learning_rate": 0.0009137055837563451, + "loss": 0.7068, + "step": 3600 + }, + { + "epoch": 0.18301582668071226, + "grad_norm": 0.03626076158298662, + "learning_rate": 0.0009149746192893401, + "loss": 0.6853, + "step": 3605 + }, + { + "epoch": 0.18326966277874376, + "grad_norm": 0.047674224438480246, + "learning_rate": 0.0009162436548223351, + "loss": 0.6745, + "step": 3610 + }, + { + "epoch": 0.18352349887677527, + "grad_norm": 0.04231046030159459, + "learning_rate": 0.0009175126903553299, + "loss": 0.6849, + "step": 3615 + }, + { + "epoch": 0.18377733497480678, + "grad_norm": 0.04195288389214527, + "learning_rate": 0.0009187817258883248, + "loss": 0.7041, + "step": 3620 + }, + { + "epoch": 0.18403117107283826, + "grad_norm": 0.039251889309433935, + "learning_rate": 0.0009200507614213198, + "loss": 0.6504, + "step": 3625 + }, + { + "epoch": 0.18428500717086976, + "grad_norm": 0.03738914770413547, + "learning_rate": 0.0009213197969543148, + "loss": 0.7048, + "step": 3630 + }, + { + "epoch": 0.18453884326890127, + "grad_norm": 0.0436948152095552, + "learning_rate": 0.0009225888324873097, + "loss": 0.6771, + "step": 3635 + }, + { + "epoch": 0.18479267936693278, + "grad_norm": 0.03767490570058011, + "learning_rate": 0.0009238578680203045, + "loss": 0.6494, + "step": 3640 + }, + { + "epoch": 0.18504651546496428, + "grad_norm": 0.037090867993691726, + "learning_rate": 0.0009251269035532995, + "loss": 0.6545, + "step": 3645 + }, + { + "epoch": 0.18530035156299576, + "grad_norm": 0.042846186747705906, + "learning_rate": 0.0009263959390862944, + "loss": 0.6573, + "step": 3650 + }, + { + "epoch": 0.18555418766102727, + "grad_norm": 0.046132833928787344, + "learning_rate": 0.0009276649746192894, + "loss": 0.6716, + "step": 3655 + }, + { + "epoch": 0.18580802375905878, + "grad_norm": 0.04810476990314317, + "learning_rate": 0.0009289340101522843, + "loss": 0.6552, + "step": 3660 + }, + { + "epoch": 0.18606185985709028, + "grad_norm": 0.05725224131003678, + "learning_rate": 0.0009302030456852792, + "loss": 0.7188, + "step": 3665 + }, + { + "epoch": 0.1863156959551218, + "grad_norm": 0.03760096816148779, + "learning_rate": 0.0009314720812182741, + "loss": 0.679, + "step": 3670 + }, + { + "epoch": 0.18656953205315327, + "grad_norm": 0.0414336045399388, + "learning_rate": 0.0009327411167512691, + "loss": 0.703, + "step": 3675 + }, + { + "epoch": 0.18682336815118478, + "grad_norm": 0.04888803816351589, + "learning_rate": 0.000934010152284264, + "loss": 0.6984, + "step": 3680 + }, + { + "epoch": 0.18707720424921628, + "grad_norm": 0.03932910243932595, + "learning_rate": 0.0009352791878172588, + "loss": 0.6935, + "step": 3685 + }, + { + "epoch": 0.1873310403472478, + "grad_norm": 0.053329295837259566, + "learning_rate": 0.0009365482233502538, + "loss": 0.6654, + "step": 3690 + }, + { + "epoch": 0.18758487644527927, + "grad_norm": 0.04033686402339195, + "learning_rate": 0.0009378172588832488, + "loss": 0.6656, + "step": 3695 + }, + { + "epoch": 0.18783871254331078, + "grad_norm": 0.05078207384763503, + "learning_rate": 0.0009390862944162437, + "loss": 0.6616, + "step": 3700 + }, + { + "epoch": 0.18809254864134228, + "grad_norm": 0.03612814331688169, + "learning_rate": 0.0009403553299492385, + "loss": 0.6856, + "step": 3705 + }, + { + "epoch": 0.1883463847393738, + "grad_norm": 0.04152827694147588, + "learning_rate": 0.0009416243654822335, + "loss": 0.707, + "step": 3710 + }, + { + "epoch": 0.1886002208374053, + "grad_norm": 0.03476616159624936, + "learning_rate": 0.0009428934010152285, + "loss": 0.6786, + "step": 3715 + }, + { + "epoch": 0.18885405693543678, + "grad_norm": 0.03981634185742135, + "learning_rate": 0.0009441624365482235, + "loss": 0.7042, + "step": 3720 + }, + { + "epoch": 0.18910789303346828, + "grad_norm": 0.04204868547098972, + "learning_rate": 0.0009454314720812182, + "loss": 0.6841, + "step": 3725 + }, + { + "epoch": 0.1893617291314998, + "grad_norm": 0.039850666477108665, + "learning_rate": 0.0009467005076142132, + "loss": 0.6583, + "step": 3730 + }, + { + "epoch": 0.1896155652295313, + "grad_norm": 0.03866838212345305, + "learning_rate": 0.0009479695431472082, + "loss": 0.7246, + "step": 3735 + }, + { + "epoch": 0.1898694013275628, + "grad_norm": 0.040841413496425324, + "learning_rate": 0.0009492385786802031, + "loss": 0.6598, + "step": 3740 + }, + { + "epoch": 0.19012323742559428, + "grad_norm": 0.04010141554319208, + "learning_rate": 0.000950507614213198, + "loss": 0.6813, + "step": 3745 + }, + { + "epoch": 0.1903770735236258, + "grad_norm": 0.06530308653395914, + "learning_rate": 0.0009517766497461929, + "loss": 0.6685, + "step": 3750 + }, + { + "epoch": 0.1906309096216573, + "grad_norm": 0.07450856843136024, + "learning_rate": 0.0009530456852791879, + "loss": 0.6874, + "step": 3755 + }, + { + "epoch": 0.1908847457196888, + "grad_norm": 0.035499974530579376, + "learning_rate": 0.0009543147208121828, + "loss": 0.681, + "step": 3760 + }, + { + "epoch": 0.1911385818177203, + "grad_norm": 0.0437247777174162, + "learning_rate": 0.0009555837563451777, + "loss": 0.6848, + "step": 3765 + }, + { + "epoch": 0.1913924179157518, + "grad_norm": 0.035676219852568955, + "learning_rate": 0.0009568527918781726, + "loss": 0.627, + "step": 3770 + }, + { + "epoch": 0.1916462540137833, + "grad_norm": 0.03768653840820619, + "learning_rate": 0.0009581218274111675, + "loss": 0.6689, + "step": 3775 + }, + { + "epoch": 0.1919000901118148, + "grad_norm": 0.03823207024695021, + "learning_rate": 0.0009593908629441625, + "loss": 0.6335, + "step": 3780 + }, + { + "epoch": 0.1921539262098463, + "grad_norm": 0.04684915109347205, + "learning_rate": 0.0009606598984771574, + "loss": 0.6975, + "step": 3785 + }, + { + "epoch": 0.19240776230787782, + "grad_norm": 0.04745389321415872, + "learning_rate": 0.0009619289340101523, + "loss": 0.7062, + "step": 3790 + }, + { + "epoch": 0.1926615984059093, + "grad_norm": 0.03760091854535719, + "learning_rate": 0.0009631979695431472, + "loss": 0.6646, + "step": 3795 + }, + { + "epoch": 0.1929154345039408, + "grad_norm": 0.03932186553338342, + "learning_rate": 0.0009644670050761422, + "loss": 0.6743, + "step": 3800 + }, + { + "epoch": 0.1931692706019723, + "grad_norm": 0.03951929870880764, + "learning_rate": 0.0009657360406091371, + "loss": 0.6831, + "step": 3805 + }, + { + "epoch": 0.19342310670000382, + "grad_norm": 0.03553090190878169, + "learning_rate": 0.0009670050761421319, + "loss": 0.6571, + "step": 3810 + }, + { + "epoch": 0.1936769427980353, + "grad_norm": 0.06138620851402293, + "learning_rate": 0.0009682741116751269, + "loss": 0.6986, + "step": 3815 + }, + { + "epoch": 0.1939307788960668, + "grad_norm": 0.24733244873927648, + "learning_rate": 0.0009695431472081219, + "loss": 0.6976, + "step": 3820 + }, + { + "epoch": 0.1941846149940983, + "grad_norm": 0.0666364936631057, + "learning_rate": 0.0009708121827411168, + "loss": 0.7391, + "step": 3825 + }, + { + "epoch": 0.19443845109212982, + "grad_norm": 0.07967106187155334, + "learning_rate": 0.0009720812182741116, + "loss": 0.6993, + "step": 3830 + }, + { + "epoch": 0.19469228719016132, + "grad_norm": 0.051138778909318464, + "learning_rate": 0.0009733502538071066, + "loss": 0.7191, + "step": 3835 + }, + { + "epoch": 0.1949461232881928, + "grad_norm": 0.5413258576711231, + "learning_rate": 0.0009746192893401016, + "loss": 0.7592, + "step": 3840 + }, + { + "epoch": 0.1951999593862243, + "grad_norm": 0.1471254894588324, + "learning_rate": 0.0009758883248730965, + "loss": 0.7572, + "step": 3845 + }, + { + "epoch": 0.19545379548425582, + "grad_norm": 0.2706851022164675, + "learning_rate": 0.0009771573604060915, + "loss": 0.6907, + "step": 3850 + }, + { + "epoch": 0.19570763158228732, + "grad_norm": 0.14415862874984905, + "learning_rate": 0.0009784263959390863, + "loss": 0.7062, + "step": 3855 + }, + { + "epoch": 0.19596146768031883, + "grad_norm": 0.14791455727385014, + "learning_rate": 0.0009796954314720812, + "loss": 0.7451, + "step": 3860 + }, + { + "epoch": 0.1962153037783503, + "grad_norm": 0.12586017082802645, + "learning_rate": 0.000980964467005076, + "loss": 0.6971, + "step": 3865 + }, + { + "epoch": 0.19646913987638182, + "grad_norm": 0.0826384222127971, + "learning_rate": 0.0009822335025380712, + "loss": 0.7288, + "step": 3870 + }, + { + "epoch": 0.19672297597441332, + "grad_norm": 0.4096552727669512, + "learning_rate": 0.000983502538071066, + "loss": 0.6983, + "step": 3875 + }, + { + "epoch": 0.19697681207244483, + "grad_norm": 0.126131262033545, + "learning_rate": 0.000984771573604061, + "loss": 0.7836, + "step": 3880 + }, + { + "epoch": 0.19723064817047634, + "grad_norm": 0.1721626969232291, + "learning_rate": 0.0009860406091370558, + "loss": 0.7095, + "step": 3885 + }, + { + "epoch": 0.19748448426850782, + "grad_norm": 0.1375632163342092, + "learning_rate": 0.0009873096446700509, + "loss": 0.7249, + "step": 3890 + }, + { + "epoch": 0.19773832036653932, + "grad_norm": 0.16481287346009105, + "learning_rate": 0.0009885786802030457, + "loss": 0.7144, + "step": 3895 + }, + { + "epoch": 0.19799215646457083, + "grad_norm": 0.08436713045812702, + "learning_rate": 0.0009898477157360406, + "loss": 0.7119, + "step": 3900 + }, + { + "epoch": 0.19824599256260234, + "grad_norm": 0.05626919966189561, + "learning_rate": 0.0009911167512690355, + "loss": 0.7135, + "step": 3905 + }, + { + "epoch": 0.19849982866063384, + "grad_norm": 0.04456845230313172, + "learning_rate": 0.0009923857868020306, + "loss": 0.6619, + "step": 3910 + }, + { + "epoch": 0.19875366475866532, + "grad_norm": 0.07101056929004967, + "learning_rate": 0.0009936548223350254, + "loss": 0.7022, + "step": 3915 + }, + { + "epoch": 0.19900750085669683, + "grad_norm": 0.04108441619780391, + "learning_rate": 0.0009949238578680203, + "loss": 0.7024, + "step": 3920 + }, + { + "epoch": 0.19926133695472834, + "grad_norm": 0.07150668017794924, + "learning_rate": 0.0009961928934010152, + "loss": 0.7146, + "step": 3925 + }, + { + "epoch": 0.19951517305275984, + "grad_norm": 0.058887395407361695, + "learning_rate": 0.0009974619289340103, + "loss": 0.7013, + "step": 3930 + }, + { + "epoch": 0.19976900915079132, + "grad_norm": 0.08301973441191342, + "learning_rate": 0.0009987309644670051, + "loss": 0.7245, + "step": 3935 + }, + { + "epoch": 0.20002284524882283, + "grad_norm": 0.13425731803767796, + "learning_rate": 0.001, + "loss": 0.7384, + "step": 3940 + }, + { + "epoch": 0.20027668134685433, + "grad_norm": 0.0954240553881326, + "learning_rate": 0.0009999999509262467, + "loss": 0.7082, + "step": 3945 + }, + { + "epoch": 0.20053051744488584, + "grad_norm": 0.09001615927204533, + "learning_rate": 0.0009999998037049968, + "loss": 0.7479, + "step": 3950 + }, + { + "epoch": 0.20078435354291735, + "grad_norm": 0.055536250218606135, + "learning_rate": 0.0009999995583362786, + "loss": 0.7217, + "step": 3955 + }, + { + "epoch": 0.20103818964094883, + "grad_norm": 0.039988384096081575, + "learning_rate": 0.0009999992148201407, + "loss": 0.7023, + "step": 3960 + }, + { + "epoch": 0.20129202573898033, + "grad_norm": 0.07631095658372448, + "learning_rate": 0.0009999987731566505, + "loss": 0.6593, + "step": 3965 + }, + { + "epoch": 0.20154586183701184, + "grad_norm": 0.055019840260049975, + "learning_rate": 0.0009999982333458942, + "loss": 0.7141, + "step": 3970 + }, + { + "epoch": 0.20179969793504335, + "grad_norm": 0.13389800181440178, + "learning_rate": 0.0009999975953879788, + "loss": 0.8059, + "step": 3975 + }, + { + "epoch": 0.20205353403307486, + "grad_norm": 0.10002422022594674, + "learning_rate": 0.0009999968592830286, + "loss": 0.7418, + "step": 3980 + }, + { + "epoch": 0.20230737013110633, + "grad_norm": 0.065235305911112, + "learning_rate": 0.0009999960250311885, + "loss": 0.7388, + "step": 3985 + }, + { + "epoch": 0.20256120622913784, + "grad_norm": 0.07700805891945464, + "learning_rate": 0.0009999950926326221, + "loss": 0.7619, + "step": 3990 + }, + { + "epoch": 0.20281504232716935, + "grad_norm": 0.05585759735632213, + "learning_rate": 0.0009999940620875124, + "loss": 0.7262, + "step": 3995 + }, + { + "epoch": 0.20306887842520085, + "grad_norm": 0.04210144533533203, + "learning_rate": 0.0009999929333960617, + "loss": 0.7003, + "step": 4000 + }, + { + "epoch": 0.20332271452323236, + "grad_norm": 0.04939954030213548, + "learning_rate": 0.0009999917065584918, + "loss": 0.7223, + "step": 4005 + }, + { + "epoch": 0.20357655062126384, + "grad_norm": 0.05682863013701724, + "learning_rate": 0.0009999903815750436, + "loss": 0.7366, + "step": 4010 + }, + { + "epoch": 0.20383038671929535, + "grad_norm": 0.07201666259364202, + "learning_rate": 0.0009999889584459765, + "loss": 0.7237, + "step": 4015 + }, + { + "epoch": 0.20408422281732685, + "grad_norm": 0.05415233755750322, + "learning_rate": 0.0009999874371715706, + "loss": 0.7099, + "step": 4020 + }, + { + "epoch": 0.20433805891535836, + "grad_norm": 0.09491299760351525, + "learning_rate": 0.0009999858177521242, + "loss": 0.9184, + "step": 4025 + }, + { + "epoch": 0.20459189501338984, + "grad_norm": 0.09564328359072853, + "learning_rate": 0.0009999841001879551, + "loss": 0.799, + "step": 4030 + }, + { + "epoch": 0.20484573111142135, + "grad_norm": 2.399109555715659, + "learning_rate": 0.0009999822844794005, + "loss": 1.0065, + "step": 4035 + }, + { + "epoch": 0.20509956720945285, + "grad_norm": 0.5227791347356591, + "learning_rate": 0.000999980370626817, + "loss": 0.8278, + "step": 4040 + }, + { + "epoch": 0.20535340330748436, + "grad_norm": 0.16651871729788892, + "learning_rate": 0.00099997835863058, + "loss": 0.8614, + "step": 4045 + }, + { + "epoch": 0.20560723940551587, + "grad_norm": 0.18221807327257578, + "learning_rate": 0.0009999762484910846, + "loss": 0.8153, + "step": 4050 + }, + { + "epoch": 0.20586107550354735, + "grad_norm": 0.10310087152591199, + "learning_rate": 0.0009999740402087452, + "loss": 0.8029, + "step": 4055 + }, + { + "epoch": 0.20611491160157885, + "grad_norm": 0.0703395551608927, + "learning_rate": 0.0009999717337839948, + "loss": 0.7577, + "step": 4060 + }, + { + "epoch": 0.20636874769961036, + "grad_norm": 0.08012644608333368, + "learning_rate": 0.0009999693292172865, + "loss": 0.7503, + "step": 4065 + }, + { + "epoch": 0.20662258379764187, + "grad_norm": 0.06287680300777991, + "learning_rate": 0.0009999668265090924, + "loss": 0.7382, + "step": 4070 + }, + { + "epoch": 0.20687641989567337, + "grad_norm": 0.05354965471224598, + "learning_rate": 0.0009999642256599034, + "loss": 0.7085, + "step": 4075 + }, + { + "epoch": 0.20713025599370485, + "grad_norm": 0.0482492491840279, + "learning_rate": 0.0009999615266702302, + "loss": 0.7315, + "step": 4080 + }, + { + "epoch": 0.20738409209173636, + "grad_norm": 0.05181020090220497, + "learning_rate": 0.0009999587295406026, + "loss": 0.7227, + "step": 4085 + }, + { + "epoch": 0.20763792818976787, + "grad_norm": 0.040021587065134735, + "learning_rate": 0.00099995583427157, + "loss": 0.7407, + "step": 4090 + }, + { + "epoch": 0.20789176428779937, + "grad_norm": 0.04000165166533322, + "learning_rate": 0.0009999528408637, + "loss": 0.7042, + "step": 4095 + }, + { + "epoch": 0.20814560038583088, + "grad_norm": 0.03943135892336847, + "learning_rate": 0.0009999497493175808, + "loss": 0.7267, + "step": 4100 + }, + { + "epoch": 0.20839943648386236, + "grad_norm": 0.0634943990185323, + "learning_rate": 0.0009999465596338191, + "loss": 0.6844, + "step": 4105 + }, + { + "epoch": 0.20865327258189387, + "grad_norm": 0.053343219962136215, + "learning_rate": 0.000999943271813041, + "loss": 0.6986, + "step": 4110 + }, + { + "epoch": 0.20890710867992537, + "grad_norm": 0.051181086485765775, + "learning_rate": 0.0009999398858558917, + "loss": 0.7183, + "step": 4115 + }, + { + "epoch": 0.20916094477795688, + "grad_norm": 0.057765287845386676, + "learning_rate": 0.0009999364017630361, + "loss": 0.7451, + "step": 4120 + }, + { + "epoch": 0.2094147808759884, + "grad_norm": 0.564887138188438, + "learning_rate": 0.0009999328195351579, + "loss": 0.7099, + "step": 4125 + }, + { + "epoch": 0.20966861697401987, + "grad_norm": 0.04854697031445982, + "learning_rate": 0.0009999291391729606, + "loss": 0.7045, + "step": 4130 + }, + { + "epoch": 0.20992245307205137, + "grad_norm": 0.06786359137742107, + "learning_rate": 0.0009999253606771661, + "loss": 0.7074, + "step": 4135 + }, + { + "epoch": 0.21017628917008288, + "grad_norm": 0.07218662791060168, + "learning_rate": 0.0009999214840485167, + "loss": 0.6902, + "step": 4140 + }, + { + "epoch": 0.2104301252681144, + "grad_norm": 0.1208003703070213, + "learning_rate": 0.000999917509287773, + "loss": 0.7087, + "step": 4145 + }, + { + "epoch": 0.21068396136614587, + "grad_norm": 0.05878145753992489, + "learning_rate": 0.0009999134363957152, + "loss": 0.674, + "step": 4150 + }, + { + "epoch": 0.21093779746417737, + "grad_norm": 0.046846061065139784, + "learning_rate": 0.0009999092653731432, + "loss": 0.7174, + "step": 4155 + }, + { + "epoch": 0.21119163356220888, + "grad_norm": 0.06346911438232657, + "learning_rate": 0.0009999049962208751, + "loss": 0.708, + "step": 4160 + }, + { + "epoch": 0.2114454696602404, + "grad_norm": 0.05215116911139721, + "learning_rate": 0.0009999006289397494, + "loss": 0.7164, + "step": 4165 + }, + { + "epoch": 0.2116993057582719, + "grad_norm": 0.04940387903306439, + "learning_rate": 0.0009998961635306234, + "loss": 0.7235, + "step": 4170 + }, + { + "epoch": 0.21195314185630337, + "grad_norm": 0.05729960864593158, + "learning_rate": 0.0009998915999943733, + "loss": 0.655, + "step": 4175 + }, + { + "epoch": 0.21220697795433488, + "grad_norm": 0.03918310878918639, + "learning_rate": 0.0009998869383318952, + "loss": 0.6842, + "step": 4180 + }, + { + "epoch": 0.2124608140523664, + "grad_norm": 0.06071981733933351, + "learning_rate": 0.0009998821785441039, + "loss": 0.7281, + "step": 4185 + }, + { + "epoch": 0.2127146501503979, + "grad_norm": 0.04728543724507953, + "learning_rate": 0.000999877320631934, + "loss": 0.684, + "step": 4190 + }, + { + "epoch": 0.2129684862484294, + "grad_norm": 0.0404988884730302, + "learning_rate": 0.0009998723645963388, + "loss": 0.7254, + "step": 4195 + }, + { + "epoch": 0.21322232234646088, + "grad_norm": 0.047811829648568324, + "learning_rate": 0.0009998673104382912, + "loss": 0.6826, + "step": 4200 + }, + { + "epoch": 0.2134761584444924, + "grad_norm": 0.049104308173568804, + "learning_rate": 0.0009998621581587836, + "loss": 0.7314, + "step": 4205 + }, + { + "epoch": 0.2137299945425239, + "grad_norm": 0.03823024902121797, + "learning_rate": 0.000999856907758827, + "loss": 0.72, + "step": 4210 + }, + { + "epoch": 0.2139838306405554, + "grad_norm": 0.057874634046649116, + "learning_rate": 0.0009998515592394524, + "loss": 0.713, + "step": 4215 + }, + { + "epoch": 0.2142376667385869, + "grad_norm": 0.07434713454091414, + "learning_rate": 0.0009998461126017094, + "loss": 0.7125, + "step": 4220 + }, + { + "epoch": 0.2144915028366184, + "grad_norm": 0.06272529032155742, + "learning_rate": 0.0009998405678466671, + "loss": 0.6889, + "step": 4225 + }, + { + "epoch": 0.2147453389346499, + "grad_norm": 0.033595008050920305, + "learning_rate": 0.0009998349249754142, + "loss": 0.6796, + "step": 4230 + }, + { + "epoch": 0.2149991750326814, + "grad_norm": 0.04029569391079619, + "learning_rate": 0.0009998291839890582, + "loss": 0.7028, + "step": 4235 + }, + { + "epoch": 0.2152530111307129, + "grad_norm": 0.056686671703328986, + "learning_rate": 0.000999823344888726, + "loss": 0.7324, + "step": 4240 + }, + { + "epoch": 0.21550684722874439, + "grad_norm": 0.06609484871358738, + "learning_rate": 0.0009998174076755637, + "loss": 0.6927, + "step": 4245 + }, + { + "epoch": 0.2157606833267759, + "grad_norm": 0.03952252087967003, + "learning_rate": 0.000999811372350737, + "loss": 0.6872, + "step": 4250 + }, + { + "epoch": 0.2160145194248074, + "grad_norm": 0.0421816140235618, + "learning_rate": 0.0009998052389154303, + "loss": 0.6629, + "step": 4255 + }, + { + "epoch": 0.2162683555228389, + "grad_norm": 0.0392009074437296, + "learning_rate": 0.0009997990073708479, + "loss": 0.6661, + "step": 4260 + }, + { + "epoch": 0.2165221916208704, + "grad_norm": 0.04466501671098514, + "learning_rate": 0.0009997926777182127, + "loss": 0.6814, + "step": 4265 + }, + { + "epoch": 0.2167760277189019, + "grad_norm": 0.04350807791667501, + "learning_rate": 0.0009997862499587673, + "loss": 0.7324, + "step": 4270 + }, + { + "epoch": 0.2170298638169334, + "grad_norm": 0.04149859498803727, + "learning_rate": 0.0009997797240937736, + "loss": 0.6829, + "step": 4275 + }, + { + "epoch": 0.2172836999149649, + "grad_norm": 0.04436311666473786, + "learning_rate": 0.0009997731001245124, + "loss": 0.6919, + "step": 4280 + }, + { + "epoch": 0.2175375360129964, + "grad_norm": 0.08118673796209477, + "learning_rate": 0.0009997663780522842, + "loss": 0.6884, + "step": 4285 + }, + { + "epoch": 0.21779137211102792, + "grad_norm": 1.4346902161122619, + "learning_rate": 0.000999759557878408, + "loss": 0.9, + "step": 4290 + }, + { + "epoch": 0.2180452082090594, + "grad_norm": 0.2203188373312744, + "learning_rate": 0.0009997526396042231, + "loss": 0.7955, + "step": 4295 + }, + { + "epoch": 0.2182990443070909, + "grad_norm": 0.10935101729406492, + "learning_rate": 0.000999745623231087, + "loss": 0.799, + "step": 4300 + }, + { + "epoch": 0.2185528804051224, + "grad_norm": 0.08157477584763652, + "learning_rate": 0.0009997385087603776, + "loss": 0.7428, + "step": 4305 + }, + { + "epoch": 0.21880671650315392, + "grad_norm": 0.33104745046572154, + "learning_rate": 0.0009997312961934912, + "loss": 0.7342, + "step": 4310 + }, + { + "epoch": 0.21906055260118543, + "grad_norm": 0.11235803791274827, + "learning_rate": 0.000999723985531843, + "loss": 0.7372, + "step": 4315 + }, + { + "epoch": 0.2193143886992169, + "grad_norm": 0.5110586261632338, + "learning_rate": 0.0009997165767768692, + "loss": 0.7387, + "step": 4320 + }, + { + "epoch": 0.2195682247972484, + "grad_norm": 0.12206962291351009, + "learning_rate": 0.000999709069930023, + "loss": 0.7563, + "step": 4325 + }, + { + "epoch": 0.21982206089527992, + "grad_norm": 0.08087926540403238, + "learning_rate": 0.0009997014649927786, + "loss": 0.729, + "step": 4330 + }, + { + "epoch": 0.22007589699331143, + "grad_norm": 0.14194129116961565, + "learning_rate": 0.0009996937619666287, + "loss": 0.763, + "step": 4335 + }, + { + "epoch": 0.22032973309134293, + "grad_norm": 0.05593580754563041, + "learning_rate": 0.0009996859608530852, + "loss": 0.738, + "step": 4340 + }, + { + "epoch": 0.2205835691893744, + "grad_norm": 0.05744534028157504, + "learning_rate": 0.0009996780616536795, + "loss": 0.7737, + "step": 4345 + }, + { + "epoch": 0.22083740528740592, + "grad_norm": 0.1748737919710415, + "learning_rate": 0.0009996700643699623, + "loss": 0.8663, + "step": 4350 + }, + { + "epoch": 0.22109124138543743, + "grad_norm": 0.21064202815185434, + "learning_rate": 0.0009996619690035033, + "loss": 0.7204, + "step": 4355 + }, + { + "epoch": 0.22134507748346893, + "grad_norm": 0.10370603131130617, + "learning_rate": 0.0009996537755558915, + "loss": 0.7197, + "step": 4360 + }, + { + "epoch": 0.2215989135815004, + "grad_norm": 0.06885717236979047, + "learning_rate": 0.0009996454840287355, + "loss": 0.744, + "step": 4365 + }, + { + "epoch": 0.22185274967953192, + "grad_norm": 0.07087821700739863, + "learning_rate": 0.0009996370944236625, + "loss": 0.7341, + "step": 4370 + }, + { + "epoch": 0.22210658577756343, + "grad_norm": 0.08283414484063312, + "learning_rate": 0.0009996286067423196, + "loss": 0.7271, + "step": 4375 + }, + { + "epoch": 0.22236042187559493, + "grad_norm": 0.03861660942385861, + "learning_rate": 0.000999620020986373, + "loss": 0.6911, + "step": 4380 + }, + { + "epoch": 0.22261425797362644, + "grad_norm": 0.06635541968079052, + "learning_rate": 0.0009996113371575075, + "loss": 0.7413, + "step": 4385 + }, + { + "epoch": 0.22286809407165792, + "grad_norm": 0.064436083193186, + "learning_rate": 0.0009996025552574284, + "loss": 0.687, + "step": 4390 + }, + { + "epoch": 0.22312193016968943, + "grad_norm": 0.07447130960554259, + "learning_rate": 0.000999593675287859, + "loss": 0.7174, + "step": 4395 + }, + { + "epoch": 0.22337576626772093, + "grad_norm": 0.04227245688144544, + "learning_rate": 0.0009995846972505429, + "loss": 0.7318, + "step": 4400 + }, + { + "epoch": 0.22362960236575244, + "grad_norm": 0.04336627684359626, + "learning_rate": 0.000999575621147242, + "loss": 0.758, + "step": 4405 + }, + { + "epoch": 0.22388343846378395, + "grad_norm": 0.052749435841111, + "learning_rate": 0.000999566446979738, + "loss": 0.7302, + "step": 4410 + }, + { + "epoch": 0.22413727456181542, + "grad_norm": 0.046319696344097425, + "learning_rate": 0.0009995571747498319, + "loss": 0.6572, + "step": 4415 + }, + { + "epoch": 0.22439111065984693, + "grad_norm": 0.036417375306608095, + "learning_rate": 0.0009995478044593435, + "loss": 0.7134, + "step": 4420 + }, + { + "epoch": 0.22464494675787844, + "grad_norm": 0.03453022900900243, + "learning_rate": 0.0009995383361101125, + "loss": 0.7012, + "step": 4425 + }, + { + "epoch": 0.22489878285590995, + "grad_norm": 0.07427033738679807, + "learning_rate": 0.0009995287697039973, + "loss": 0.7013, + "step": 4430 + }, + { + "epoch": 0.22515261895394145, + "grad_norm": 0.054569394891019696, + "learning_rate": 0.0009995191052428758, + "loss": 0.7198, + "step": 4435 + }, + { + "epoch": 0.22540645505197293, + "grad_norm": 0.0393817069157149, + "learning_rate": 0.0009995093427286447, + "loss": 0.6906, + "step": 4440 + }, + { + "epoch": 0.22566029115000444, + "grad_norm": 0.061316789682121224, + "learning_rate": 0.000999499482163221, + "loss": 0.7074, + "step": 4445 + }, + { + "epoch": 0.22591412724803595, + "grad_norm": 0.05219263283094663, + "learning_rate": 0.00099948952354854, + "loss": 0.7237, + "step": 4450 + }, + { + "epoch": 0.22616796334606745, + "grad_norm": 0.041749572353213034, + "learning_rate": 0.0009994794668865563, + "loss": 0.7215, + "step": 4455 + }, + { + "epoch": 0.22642179944409896, + "grad_norm": 0.06621048957460272, + "learning_rate": 0.0009994693121792443, + "loss": 0.7196, + "step": 4460 + }, + { + "epoch": 0.22667563554213044, + "grad_norm": 0.04745304839755999, + "learning_rate": 0.000999459059428597, + "loss": 0.7145, + "step": 4465 + }, + { + "epoch": 0.22692947164016194, + "grad_norm": 0.03774244254610854, + "learning_rate": 0.0009994487086366272, + "loss": 0.684, + "step": 4470 + }, + { + "epoch": 0.22718330773819345, + "grad_norm": 0.05065779440911714, + "learning_rate": 0.0009994382598053665, + "loss": 0.676, + "step": 4475 + }, + { + "epoch": 0.22743714383622496, + "grad_norm": 0.04660423097011398, + "learning_rate": 0.0009994277129368664, + "loss": 0.6876, + "step": 4480 + }, + { + "epoch": 0.22769097993425644, + "grad_norm": 0.039430766510009256, + "learning_rate": 0.0009994170680331968, + "loss": 0.6723, + "step": 4485 + }, + { + "epoch": 0.22794481603228794, + "grad_norm": 0.043461044788944345, + "learning_rate": 0.0009994063250964472, + "loss": 0.6599, + "step": 4490 + }, + { + "epoch": 0.22819865213031945, + "grad_norm": 0.05440998730009002, + "learning_rate": 0.0009993954841287266, + "loss": 0.6705, + "step": 4495 + }, + { + "epoch": 0.22845248822835096, + "grad_norm": 0.058972665330176754, + "learning_rate": 0.000999384545132163, + "loss": 0.6872, + "step": 4500 + }, + { + "epoch": 0.22870632432638247, + "grad_norm": 0.07456158646652374, + "learning_rate": 0.0009993735081089035, + "loss": 0.6873, + "step": 4505 + }, + { + "epoch": 0.22896016042441394, + "grad_norm": 0.05068153364562075, + "learning_rate": 0.0009993623730611147, + "loss": 0.6833, + "step": 4510 + }, + { + "epoch": 0.22921399652244545, + "grad_norm": 0.06571910537543511, + "learning_rate": 0.0009993511399909825, + "loss": 0.6663, + "step": 4515 + }, + { + "epoch": 0.22946783262047696, + "grad_norm": 0.1595236692921901, + "learning_rate": 0.0009993398089007117, + "loss": 0.6632, + "step": 4520 + }, + { + "epoch": 0.22972166871850846, + "grad_norm": 0.057680648886969166, + "learning_rate": 0.0009993283797925267, + "loss": 0.6815, + "step": 4525 + }, + { + "epoch": 0.22997550481653997, + "grad_norm": 0.05224803456061352, + "learning_rate": 0.0009993168526686708, + "loss": 0.7055, + "step": 4530 + }, + { + "epoch": 0.23022934091457145, + "grad_norm": 0.03339058561257075, + "learning_rate": 0.000999305227531407, + "loss": 0.6897, + "step": 4535 + }, + { + "epoch": 0.23048317701260296, + "grad_norm": 0.03485057296061167, + "learning_rate": 0.000999293504383017, + "loss": 0.6382, + "step": 4540 + }, + { + "epoch": 0.23073701311063446, + "grad_norm": 0.06126014494022501, + "learning_rate": 0.000999281683225802, + "loss": 0.7143, + "step": 4545 + }, + { + "epoch": 0.23099084920866597, + "grad_norm": 0.045158627169637415, + "learning_rate": 0.0009992697640620824, + "loss": 0.7128, + "step": 4550 + }, + { + "epoch": 0.23124468530669748, + "grad_norm": 0.0453807537141564, + "learning_rate": 0.000999257746894198, + "loss": 0.7073, + "step": 4555 + }, + { + "epoch": 0.23149852140472896, + "grad_norm": 0.038686364819667377, + "learning_rate": 0.0009992456317245077, + "loss": 0.7112, + "step": 4560 + }, + { + "epoch": 0.23175235750276046, + "grad_norm": 0.04309918961411153, + "learning_rate": 0.0009992334185553898, + "loss": 0.7091, + "step": 4565 + }, + { + "epoch": 0.23200619360079197, + "grad_norm": 0.03768367614783312, + "learning_rate": 0.0009992211073892414, + "loss": 0.6774, + "step": 4570 + }, + { + "epoch": 0.23226002969882348, + "grad_norm": 0.05953571982775251, + "learning_rate": 0.000999208698228479, + "loss": 0.6822, + "step": 4575 + }, + { + "epoch": 0.23251386579685496, + "grad_norm": 0.033857197586949536, + "learning_rate": 0.0009991961910755392, + "loss": 0.663, + "step": 4580 + }, + { + "epoch": 0.23276770189488646, + "grad_norm": 0.039971596166750424, + "learning_rate": 0.0009991835859328763, + "loss": 0.6908, + "step": 4585 + }, + { + "epoch": 0.23302153799291797, + "grad_norm": 0.050452966613271054, + "learning_rate": 0.0009991708828029648, + "loss": 0.642, + "step": 4590 + }, + { + "epoch": 0.23327537409094948, + "grad_norm": 0.0579358422967732, + "learning_rate": 0.0009991580816882983, + "loss": 0.6956, + "step": 4595 + }, + { + "epoch": 0.23352921018898098, + "grad_norm": 0.034608348704914735, + "learning_rate": 0.00099914518259139, + "loss": 0.7036, + "step": 4600 + }, + { + "epoch": 0.23378304628701246, + "grad_norm": 0.04500468096828464, + "learning_rate": 0.0009991321855147713, + "loss": 0.7096, + "step": 4605 + }, + { + "epoch": 0.23403688238504397, + "grad_norm": 0.06192464258502063, + "learning_rate": 0.0009991190904609939, + "loss": 0.6963, + "step": 4610 + }, + { + "epoch": 0.23429071848307548, + "grad_norm": 0.036824417923052896, + "learning_rate": 0.0009991058974326281, + "loss": 0.6719, + "step": 4615 + }, + { + "epoch": 0.23454455458110698, + "grad_norm": 0.034966580229344214, + "learning_rate": 0.0009990926064322636, + "loss": 0.6867, + "step": 4620 + }, + { + "epoch": 0.2347983906791385, + "grad_norm": 0.03553588744447904, + "learning_rate": 0.0009990792174625095, + "loss": 0.6936, + "step": 4625 + }, + { + "epoch": 0.23505222677716997, + "grad_norm": 0.03744697297486917, + "learning_rate": 0.000999065730525994, + "loss": 0.725, + "step": 4630 + }, + { + "epoch": 0.23530606287520148, + "grad_norm": 0.03140251768780669, + "learning_rate": 0.0009990521456253643, + "loss": 0.6593, + "step": 4635 + }, + { + "epoch": 0.23555989897323298, + "grad_norm": 0.039136334308997274, + "learning_rate": 0.0009990384627632872, + "loss": 0.6846, + "step": 4640 + }, + { + "epoch": 0.2358137350712645, + "grad_norm": 0.03616571384042556, + "learning_rate": 0.0009990246819424487, + "loss": 0.6877, + "step": 4645 + }, + { + "epoch": 0.236067571169296, + "grad_norm": 0.038208828921227095, + "learning_rate": 0.0009990108031655536, + "loss": 0.6841, + "step": 4650 + }, + { + "epoch": 0.23632140726732748, + "grad_norm": 0.03751589202735807, + "learning_rate": 0.0009989968264353265, + "loss": 0.6737, + "step": 4655 + }, + { + "epoch": 0.23657524336535898, + "grad_norm": 0.06015453571891426, + "learning_rate": 0.0009989827517545107, + "loss": 0.6753, + "step": 4660 + }, + { + "epoch": 0.2368290794633905, + "grad_norm": 0.03457380624142321, + "learning_rate": 0.0009989685791258693, + "loss": 0.6835, + "step": 4665 + }, + { + "epoch": 0.237082915561422, + "grad_norm": 0.033252457228367796, + "learning_rate": 0.0009989543085521843, + "loss": 0.676, + "step": 4670 + }, + { + "epoch": 0.2373367516594535, + "grad_norm": 0.05093290561964852, + "learning_rate": 0.0009989399400362566, + "loss": 0.7078, + "step": 4675 + }, + { + "epoch": 0.23759058775748498, + "grad_norm": 0.030107770564768616, + "learning_rate": 0.0009989254735809068, + "loss": 0.6547, + "step": 4680 + }, + { + "epoch": 0.2378444238555165, + "grad_norm": 0.03606716641926515, + "learning_rate": 0.000998910909188975, + "loss": 0.7156, + "step": 4685 + }, + { + "epoch": 0.238098259953548, + "grad_norm": 0.030847705095771165, + "learning_rate": 0.0009988962468633195, + "loss": 0.6556, + "step": 4690 + }, + { + "epoch": 0.2383520960515795, + "grad_norm": 0.03107834681283107, + "learning_rate": 0.000998881486606819, + "loss": 0.6654, + "step": 4695 + }, + { + "epoch": 0.23860593214961098, + "grad_norm": 0.03269129778574898, + "learning_rate": 0.0009988666284223703, + "loss": 0.646, + "step": 4700 + }, + { + "epoch": 0.2388597682476425, + "grad_norm": 0.03326143191194153, + "learning_rate": 0.0009988516723128905, + "loss": 0.6804, + "step": 4705 + }, + { + "epoch": 0.239113604345674, + "grad_norm": 0.032004462939776754, + "learning_rate": 0.0009988366182813152, + "loss": 0.6756, + "step": 4710 + }, + { + "epoch": 0.2393674404437055, + "grad_norm": 0.03226471228597121, + "learning_rate": 0.0009988214663305991, + "loss": 0.6736, + "step": 4715 + }, + { + "epoch": 0.239621276541737, + "grad_norm": 0.03050888275116761, + "learning_rate": 0.000998806216463717, + "loss": 0.6593, + "step": 4720 + }, + { + "epoch": 0.2398751126397685, + "grad_norm": 0.06624380514348276, + "learning_rate": 0.0009987908686836622, + "loss": 0.666, + "step": 4725 + }, + { + "epoch": 0.2401289487378, + "grad_norm": 0.04639500533331041, + "learning_rate": 0.0009987754229934473, + "loss": 0.6432, + "step": 4730 + }, + { + "epoch": 0.2403827848358315, + "grad_norm": 0.03651599837288676, + "learning_rate": 0.0009987598793961044, + "loss": 0.6984, + "step": 4735 + }, + { + "epoch": 0.240636620933863, + "grad_norm": 0.05906840836987052, + "learning_rate": 0.0009987442378946842, + "loss": 0.6734, + "step": 4740 + }, + { + "epoch": 0.24089045703189452, + "grad_norm": 0.0448969230153068, + "learning_rate": 0.0009987284984922576, + "loss": 0.6632, + "step": 4745 + }, + { + "epoch": 0.241144293129926, + "grad_norm": 0.0369703497767851, + "learning_rate": 0.0009987126611919136, + "loss": 0.6797, + "step": 4750 + }, + { + "epoch": 0.2413981292279575, + "grad_norm": 0.03187137783090064, + "learning_rate": 0.0009986967259967617, + "loss": 0.6988, + "step": 4755 + }, + { + "epoch": 0.241651965325989, + "grad_norm": 0.046567965068659574, + "learning_rate": 0.0009986806929099291, + "loss": 0.6878, + "step": 4760 + }, + { + "epoch": 0.24190580142402052, + "grad_norm": 0.03459321103042977, + "learning_rate": 0.0009986645619345636, + "loss": 0.678, + "step": 4765 + }, + { + "epoch": 0.24215963752205202, + "grad_norm": 0.03218614265241276, + "learning_rate": 0.0009986483330738313, + "loss": 0.6708, + "step": 4770 + }, + { + "epoch": 0.2424134736200835, + "grad_norm": 0.032098655982760134, + "learning_rate": 0.0009986320063309182, + "loss": 0.6975, + "step": 4775 + }, + { + "epoch": 0.242667309718115, + "grad_norm": 0.03147966656169076, + "learning_rate": 0.0009986155817090288, + "loss": 0.6792, + "step": 4780 + }, + { + "epoch": 0.24292114581614652, + "grad_norm": 0.036497678142445616, + "learning_rate": 0.0009985990592113873, + "loss": 0.6736, + "step": 4785 + }, + { + "epoch": 0.24317498191417802, + "grad_norm": 0.037003131869676374, + "learning_rate": 0.000998582438841237, + "loss": 0.7012, + "step": 4790 + }, + { + "epoch": 0.2434288180122095, + "grad_norm": 0.02977697312157573, + "learning_rate": 0.0009985657206018404, + "loss": 0.6579, + "step": 4795 + }, + { + "epoch": 0.243682654110241, + "grad_norm": 0.033364670469226705, + "learning_rate": 0.0009985489044964792, + "loss": 0.6868, + "step": 4800 + }, + { + "epoch": 0.24393649020827252, + "grad_norm": 0.03611311028340616, + "learning_rate": 0.0009985319905284542, + "loss": 0.7035, + "step": 4805 + }, + { + "epoch": 0.24419032630630402, + "grad_norm": 0.03271327971441799, + "learning_rate": 0.0009985149787010857, + "loss": 0.682, + "step": 4810 + }, + { + "epoch": 0.24444416240433553, + "grad_norm": 0.045957037298533635, + "learning_rate": 0.000998497869017713, + "loss": 0.6568, + "step": 4815 + }, + { + "epoch": 0.244697998502367, + "grad_norm": 0.058416697755835315, + "learning_rate": 0.0009984806614816944, + "loss": 0.6353, + "step": 4820 + }, + { + "epoch": 0.24495183460039852, + "grad_norm": 0.04141568212799508, + "learning_rate": 0.000998463356096408, + "loss": 0.6997, + "step": 4825 + }, + { + "epoch": 0.24520567069843002, + "grad_norm": 0.03429415259338343, + "learning_rate": 0.0009984459528652508, + "loss": 0.6693, + "step": 4830 + }, + { + "epoch": 0.24545950679646153, + "grad_norm": 0.04246659526708136, + "learning_rate": 0.0009984284517916386, + "loss": 0.6749, + "step": 4835 + }, + { + "epoch": 0.24571334289449304, + "grad_norm": 0.044954596862590616, + "learning_rate": 0.000998410852879007, + "loss": 0.6954, + "step": 4840 + }, + { + "epoch": 0.24596717899252452, + "grad_norm": 0.04365653720088704, + "learning_rate": 0.0009983931561308105, + "loss": 0.6583, + "step": 4845 + }, + { + "epoch": 0.24622101509055602, + "grad_norm": 0.04095497127619676, + "learning_rate": 0.0009983753615505232, + "loss": 0.6648, + "step": 4850 + }, + { + "epoch": 0.24647485118858753, + "grad_norm": 0.037206051604634686, + "learning_rate": 0.0009983574691416377, + "loss": 0.7103, + "step": 4855 + }, + { + "epoch": 0.24672868728661904, + "grad_norm": 0.033774627357303716, + "learning_rate": 0.0009983394789076663, + "loss": 0.6761, + "step": 4860 + }, + { + "epoch": 0.24698252338465054, + "grad_norm": 0.04836628251413319, + "learning_rate": 0.0009983213908521403, + "loss": 0.6753, + "step": 4865 + }, + { + "epoch": 0.24723635948268202, + "grad_norm": 0.05466352076606845, + "learning_rate": 0.0009983032049786106, + "loss": 0.6939, + "step": 4870 + }, + { + "epoch": 0.24749019558071353, + "grad_norm": 0.03600417172852513, + "learning_rate": 0.0009982849212906465, + "loss": 0.6242, + "step": 4875 + }, + { + "epoch": 0.24774403167874504, + "grad_norm": 0.03575528198191203, + "learning_rate": 0.0009982665397918376, + "loss": 0.6428, + "step": 4880 + }, + { + "epoch": 0.24799786777677654, + "grad_norm": 0.0444096053237933, + "learning_rate": 0.0009982480604857915, + "loss": 0.6974, + "step": 4885 + }, + { + "epoch": 0.24825170387480805, + "grad_norm": 0.0467175869465778, + "learning_rate": 0.000998229483376136, + "loss": 0.6542, + "step": 4890 + }, + { + "epoch": 0.24850553997283953, + "grad_norm": 0.03386552164563125, + "learning_rate": 0.0009982108084665177, + "loss": 0.6855, + "step": 4895 + }, + { + "epoch": 0.24875937607087104, + "grad_norm": 0.04919740684938901, + "learning_rate": 0.0009981920357606023, + "loss": 0.6631, + "step": 4900 + }, + { + "epoch": 0.24901321216890254, + "grad_norm": 0.057577585211750874, + "learning_rate": 0.0009981731652620746, + "loss": 0.6562, + "step": 4905 + }, + { + "epoch": 0.24926704826693405, + "grad_norm": 0.036406073723948565, + "learning_rate": 0.0009981541969746389, + "loss": 0.647, + "step": 4910 + }, + { + "epoch": 0.24952088436496553, + "grad_norm": 0.05465235293761601, + "learning_rate": 0.0009981351309020189, + "loss": 0.6631, + "step": 4915 + }, + { + "epoch": 0.24977472046299704, + "grad_norm": 0.048518701182604415, + "learning_rate": 0.0009981159670479566, + "loss": 0.6637, + "step": 4920 + }, + { + "epoch": 0.25002855656102857, + "grad_norm": 0.0440335328009626, + "learning_rate": 0.0009980967054162141, + "loss": 0.6347, + "step": 4925 + }, + { + "epoch": 0.25028239265906005, + "grad_norm": 0.035049011739473654, + "learning_rate": 0.0009980773460105726, + "loss": 0.6469, + "step": 4930 + }, + { + "epoch": 0.25053622875709153, + "grad_norm": 0.030844174743583502, + "learning_rate": 0.0009980578888348318, + "loss": 0.6618, + "step": 4935 + }, + { + "epoch": 0.25079006485512306, + "grad_norm": 0.04942299263880296, + "learning_rate": 0.000998038333892811, + "loss": 0.659, + "step": 4940 + }, + { + "epoch": 0.25104390095315454, + "grad_norm": 0.03920588146136878, + "learning_rate": 0.0009980186811883495, + "loss": 0.6439, + "step": 4945 + }, + { + "epoch": 0.2512977370511861, + "grad_norm": 0.037389944860020057, + "learning_rate": 0.000997998930725304, + "loss": 0.6883, + "step": 4950 + }, + { + "epoch": 0.25155157314921756, + "grad_norm": 0.035236188832557136, + "learning_rate": 0.0009979790825075522, + "loss": 0.6553, + "step": 4955 + }, + { + "epoch": 0.25180540924724903, + "grad_norm": 0.040826549993116246, + "learning_rate": 0.0009979591365389898, + "loss": 0.6896, + "step": 4960 + }, + { + "epoch": 0.25205924534528057, + "grad_norm": 0.08332873012050707, + "learning_rate": 0.0009979390928235323, + "loss": 0.6845, + "step": 4965 + }, + { + "epoch": 0.25231308144331205, + "grad_norm": 0.033440948748920125, + "learning_rate": 0.000997918951365114, + "loss": 0.639, + "step": 4970 + }, + { + "epoch": 0.2525669175413435, + "grad_norm": 0.03031028775821842, + "learning_rate": 0.0009978987121676889, + "loss": 0.6361, + "step": 4975 + }, + { + "epoch": 0.25282075363937506, + "grad_norm": 0.031208187381832196, + "learning_rate": 0.0009978783752352294, + "loss": 0.652, + "step": 4980 + }, + { + "epoch": 0.25307458973740654, + "grad_norm": 0.02872166512981619, + "learning_rate": 0.0009978579405717277, + "loss": 0.6724, + "step": 4985 + }, + { + "epoch": 0.2533284258354381, + "grad_norm": 0.03216058478323663, + "learning_rate": 0.0009978374081811951, + "loss": 0.6371, + "step": 4990 + }, + { + "epoch": 0.25358226193346955, + "grad_norm": 0.044909528815065505, + "learning_rate": 0.000997816778067662, + "loss": 0.6411, + "step": 4995 + }, + { + "epoch": 0.25383609803150103, + "grad_norm": 0.05288790528971391, + "learning_rate": 0.0009977960502351782, + "loss": 0.649, + "step": 5000 + }, + { + "epoch": 0.25408993412953257, + "grad_norm": 0.053954926716690006, + "learning_rate": 0.000997775224687812, + "loss": 0.6916, + "step": 5005 + }, + { + "epoch": 0.25434377022756405, + "grad_norm": 0.03358524756795457, + "learning_rate": 0.0009977543014296516, + "loss": 0.6414, + "step": 5010 + }, + { + "epoch": 0.2545976063255956, + "grad_norm": 0.04116556703883807, + "learning_rate": 0.0009977332804648044, + "loss": 0.629, + "step": 5015 + }, + { + "epoch": 0.25485144242362706, + "grad_norm": 0.0607964434321599, + "learning_rate": 0.000997712161797396, + "loss": 0.6169, + "step": 5020 + }, + { + "epoch": 0.25510527852165854, + "grad_norm": 0.03948191550117113, + "learning_rate": 0.0009976909454315727, + "loss": 0.6516, + "step": 5025 + }, + { + "epoch": 0.2553591146196901, + "grad_norm": 0.03816107077659965, + "learning_rate": 0.0009976696313714986, + "loss": 0.6208, + "step": 5030 + }, + { + "epoch": 0.25561295071772155, + "grad_norm": 0.03580194239027954, + "learning_rate": 0.0009976482196213578, + "loss": 0.6565, + "step": 5035 + }, + { + "epoch": 0.2558667868157531, + "grad_norm": 0.03359792481796965, + "learning_rate": 0.0009976267101853534, + "loss": 0.6612, + "step": 5040 + }, + { + "epoch": 0.25612062291378457, + "grad_norm": 0.07818021673605252, + "learning_rate": 0.000997605103067707, + "loss": 0.6803, + "step": 5045 + }, + { + "epoch": 0.25637445901181605, + "grad_norm": 0.05189617302318715, + "learning_rate": 0.000997583398272661, + "loss": 0.6446, + "step": 5050 + }, + { + "epoch": 0.2566282951098476, + "grad_norm": 0.040877169883040965, + "learning_rate": 0.000997561595804475, + "loss": 0.6738, + "step": 5055 + }, + { + "epoch": 0.25688213120787906, + "grad_norm": 0.03223264560242032, + "learning_rate": 0.0009975396956674292, + "loss": 0.6671, + "step": 5060 + }, + { + "epoch": 0.2571359673059106, + "grad_norm": 0.0293074841987577, + "learning_rate": 0.0009975176978658223, + "loss": 0.6393, + "step": 5065 + }, + { + "epoch": 0.2573898034039421, + "grad_norm": 0.029454580799978237, + "learning_rate": 0.0009974956024039723, + "loss": 0.668, + "step": 5070 + }, + { + "epoch": 0.25764363950197355, + "grad_norm": 0.036264462292175975, + "learning_rate": 0.0009974734092862167, + "loss": 0.6323, + "step": 5075 + }, + { + "epoch": 0.2578974756000051, + "grad_norm": 0.03401180337813072, + "learning_rate": 0.0009974511185169119, + "loss": 0.6179, + "step": 5080 + }, + { + "epoch": 0.25815131169803657, + "grad_norm": 0.035116697500188865, + "learning_rate": 0.0009974287301004333, + "loss": 0.6568, + "step": 5085 + }, + { + "epoch": 0.2584051477960681, + "grad_norm": 0.032907495214095854, + "learning_rate": 0.0009974062440411754, + "loss": 0.6608, + "step": 5090 + }, + { + "epoch": 0.2586589838940996, + "grad_norm": 0.03230228212232674, + "learning_rate": 0.0009973836603435525, + "loss": 0.6692, + "step": 5095 + }, + { + "epoch": 0.25891281999213106, + "grad_norm": 0.036175243285705705, + "learning_rate": 0.0009973609790119974, + "loss": 0.6115, + "step": 5100 + }, + { + "epoch": 0.2591666560901626, + "grad_norm": 0.04087747325856648, + "learning_rate": 0.0009973382000509627, + "loss": 0.6158, + "step": 5105 + }, + { + "epoch": 0.2594204921881941, + "grad_norm": 0.05131461463304514, + "learning_rate": 0.0009973153234649195, + "loss": 0.6733, + "step": 5110 + }, + { + "epoch": 0.2596743282862256, + "grad_norm": 0.042101949951047324, + "learning_rate": 0.0009972923492583582, + "loss": 0.6649, + "step": 5115 + }, + { + "epoch": 0.2599281643842571, + "grad_norm": 0.032414502454393444, + "learning_rate": 0.0009972692774357888, + "loss": 0.6502, + "step": 5120 + }, + { + "epoch": 0.26018200048228857, + "grad_norm": 0.031594756495697617, + "learning_rate": 0.0009972461080017404, + "loss": 0.6269, + "step": 5125 + }, + { + "epoch": 0.2604358365803201, + "grad_norm": 0.029578356553082198, + "learning_rate": 0.0009972228409607605, + "loss": 0.658, + "step": 5130 + }, + { + "epoch": 0.2606896726783516, + "grad_norm": 0.04180221452673936, + "learning_rate": 0.0009971994763174165, + "loss": 0.6404, + "step": 5135 + }, + { + "epoch": 0.2609435087763831, + "grad_norm": 0.034100422332999716, + "learning_rate": 0.0009971760140762948, + "loss": 0.6475, + "step": 5140 + }, + { + "epoch": 0.2611973448744146, + "grad_norm": 0.04081751959411303, + "learning_rate": 0.0009971524542420013, + "loss": 0.64, + "step": 5145 + }, + { + "epoch": 0.2614511809724461, + "grad_norm": 0.028872945512903075, + "learning_rate": 0.00099712879681916, + "loss": 0.6393, + "step": 5150 + }, + { + "epoch": 0.2617050170704776, + "grad_norm": 0.035240956930609156, + "learning_rate": 0.0009971050418124152, + "loss": 0.6333, + "step": 5155 + }, + { + "epoch": 0.2619588531685091, + "grad_norm": 0.044136885746740954, + "learning_rate": 0.0009970811892264298, + "loss": 0.635, + "step": 5160 + }, + { + "epoch": 0.2622126892665406, + "grad_norm": 0.0459739244555897, + "learning_rate": 0.0009970572390658858, + "loss": 0.6544, + "step": 5165 + }, + { + "epoch": 0.2624665253645721, + "grad_norm": 0.04065361653486883, + "learning_rate": 0.0009970331913354846, + "loss": 0.6466, + "step": 5170 + }, + { + "epoch": 0.2627203614626036, + "grad_norm": 0.06797018866692801, + "learning_rate": 0.0009970090460399467, + "loss": 0.635, + "step": 5175 + }, + { + "epoch": 0.2629741975606351, + "grad_norm": 0.037934420949089415, + "learning_rate": 0.0009969848031840117, + "loss": 0.6785, + "step": 5180 + }, + { + "epoch": 0.2632280336586666, + "grad_norm": 0.041773499604102336, + "learning_rate": 0.000996960462772438, + "loss": 0.6269, + "step": 5185 + }, + { + "epoch": 0.2634818697566981, + "grad_norm": 0.03264370334223641, + "learning_rate": 0.000996936024810004, + "loss": 0.6399, + "step": 5190 + }, + { + "epoch": 0.2637357058547296, + "grad_norm": 0.05100248961195995, + "learning_rate": 0.0009969114893015065, + "loss": 0.6499, + "step": 5195 + }, + { + "epoch": 0.2639895419527611, + "grad_norm": 0.052788735735977844, + "learning_rate": 0.000996886856251762, + "loss": 0.6809, + "step": 5200 + }, + { + "epoch": 0.2642433780507926, + "grad_norm": 0.04842406245587388, + "learning_rate": 0.0009968621256656051, + "loss": 0.6552, + "step": 5205 + }, + { + "epoch": 0.2644972141488241, + "grad_norm": 0.03292511247465923, + "learning_rate": 0.0009968372975478913, + "loss": 0.6661, + "step": 5210 + }, + { + "epoch": 0.2647510502468556, + "grad_norm": 0.058627129552400285, + "learning_rate": 0.0009968123719034934, + "loss": 0.6759, + "step": 5215 + }, + { + "epoch": 0.2650048863448871, + "grad_norm": 0.04585833544931062, + "learning_rate": 0.0009967873487373045, + "loss": 0.6838, + "step": 5220 + }, + { + "epoch": 0.2652587224429186, + "grad_norm": 0.06213801760761045, + "learning_rate": 0.0009967622280542365, + "loss": 0.686, + "step": 5225 + }, + { + "epoch": 0.2655125585409501, + "grad_norm": 0.0465036803469171, + "learning_rate": 0.0009967370098592206, + "loss": 0.6789, + "step": 5230 + }, + { + "epoch": 0.2657663946389816, + "grad_norm": 0.036623256236247154, + "learning_rate": 0.000996711694157207, + "loss": 0.6537, + "step": 5235 + }, + { + "epoch": 0.2660202307370131, + "grad_norm": 0.036813613833440686, + "learning_rate": 0.0009966862809531647, + "loss": 0.6605, + "step": 5240 + }, + { + "epoch": 0.2662740668350446, + "grad_norm": 0.033104325193404505, + "learning_rate": 0.0009966607702520825, + "loss": 0.6667, + "step": 5245 + }, + { + "epoch": 0.2665279029330761, + "grad_norm": 0.03616645613143295, + "learning_rate": 0.0009966351620589679, + "loss": 0.6442, + "step": 5250 + }, + { + "epoch": 0.26678173903110763, + "grad_norm": 0.1824434683613809, + "learning_rate": 0.0009966094563788478, + "loss": 0.7154, + "step": 5255 + }, + { + "epoch": 0.2670355751291391, + "grad_norm": 0.08584590008440728, + "learning_rate": 0.0009965836532167679, + "loss": 0.7169, + "step": 5260 + }, + { + "epoch": 0.2672894112271706, + "grad_norm": 0.08992314008060993, + "learning_rate": 0.0009965577525777934, + "loss": 0.7042, + "step": 5265 + }, + { + "epoch": 0.2675432473252021, + "grad_norm": 0.05488924202012461, + "learning_rate": 0.0009965317544670083, + "loss": 0.6752, + "step": 5270 + }, + { + "epoch": 0.2677970834232336, + "grad_norm": 0.045608215425613115, + "learning_rate": 0.000996505658889516, + "loss": 0.6711, + "step": 5275 + }, + { + "epoch": 0.26805091952126514, + "grad_norm": 0.04073278787103573, + "learning_rate": 0.000996479465850439, + "loss": 0.6915, + "step": 5280 + }, + { + "epoch": 0.2683047556192966, + "grad_norm": 0.04291579060984302, + "learning_rate": 0.000996453175354919, + "loss": 0.6813, + "step": 5285 + }, + { + "epoch": 0.2685585917173281, + "grad_norm": 0.037038830744694835, + "learning_rate": 0.000996426787408116, + "loss": 0.7183, + "step": 5290 + }, + { + "epoch": 0.26881242781535963, + "grad_norm": 0.03993617745746424, + "learning_rate": 0.0009964003020152107, + "loss": 0.7023, + "step": 5295 + }, + { + "epoch": 0.2690662639133911, + "grad_norm": 0.043203555024935315, + "learning_rate": 0.0009963737191814015, + "loss": 0.6355, + "step": 5300 + }, + { + "epoch": 0.26932010001142265, + "grad_norm": 0.03774583913684898, + "learning_rate": 0.0009963470389119068, + "loss": 0.6524, + "step": 5305 + }, + { + "epoch": 0.2695739361094541, + "grad_norm": 0.03395726169338956, + "learning_rate": 0.0009963202612119635, + "loss": 0.6512, + "step": 5310 + }, + { + "epoch": 0.2698277722074856, + "grad_norm": 0.043530029342712495, + "learning_rate": 0.000996293386086828, + "loss": 0.6618, + "step": 5315 + }, + { + "epoch": 0.27008160830551714, + "grad_norm": 0.03403180797982201, + "learning_rate": 0.0009962664135417761, + "loss": 0.6811, + "step": 5320 + }, + { + "epoch": 0.2703354444035486, + "grad_norm": 0.04666110465558761, + "learning_rate": 0.0009962393435821017, + "loss": 0.6615, + "step": 5325 + }, + { + "epoch": 0.27058928050158015, + "grad_norm": 0.032556938435121995, + "learning_rate": 0.0009962121762131192, + "loss": 0.6395, + "step": 5330 + }, + { + "epoch": 0.27084311659961163, + "grad_norm": 0.04419252183394335, + "learning_rate": 0.0009961849114401612, + "loss": 0.6119, + "step": 5335 + }, + { + "epoch": 0.2710969526976431, + "grad_norm": 0.035595422315825305, + "learning_rate": 0.0009961575492685793, + "loss": 0.6484, + "step": 5340 + }, + { + "epoch": 0.27135078879567465, + "grad_norm": 0.02926242788618824, + "learning_rate": 0.0009961300897037449, + "loss": 0.6407, + "step": 5345 + }, + { + "epoch": 0.2716046248937061, + "grad_norm": 0.03602952233417284, + "learning_rate": 0.000996102532751048, + "loss": 0.6641, + "step": 5350 + }, + { + "epoch": 0.27185846099173766, + "grad_norm": 0.045258498714948955, + "learning_rate": 0.000996074878415898, + "loss": 0.6636, + "step": 5355 + }, + { + "epoch": 0.27211229708976914, + "grad_norm": 0.055406038005293826, + "learning_rate": 0.0009960471267037234, + "loss": 0.6462, + "step": 5360 + }, + { + "epoch": 0.2723661331878006, + "grad_norm": 0.05518321218015624, + "learning_rate": 0.0009960192776199716, + "loss": 0.6265, + "step": 5365 + }, + { + "epoch": 0.27261996928583215, + "grad_norm": 0.03701471191581659, + "learning_rate": 0.0009959913311701092, + "loss": 0.6311, + "step": 5370 + }, + { + "epoch": 0.27287380538386363, + "grad_norm": 0.04247484585566076, + "learning_rate": 0.000995963287359622, + "loss": 0.663, + "step": 5375 + }, + { + "epoch": 0.27312764148189517, + "grad_norm": 0.03805986357905345, + "learning_rate": 0.0009959351461940149, + "loss": 0.6367, + "step": 5380 + }, + { + "epoch": 0.27338147757992665, + "grad_norm": 0.041801835611640255, + "learning_rate": 0.0009959069076788118, + "loss": 0.6948, + "step": 5385 + }, + { + "epoch": 0.2736353136779581, + "grad_norm": 0.034657777527560114, + "learning_rate": 0.0009958785718195559, + "loss": 0.6829, + "step": 5390 + }, + { + "epoch": 0.27388914977598966, + "grad_norm": 0.18006607610520042, + "learning_rate": 0.000995850138621809, + "loss": 0.6177, + "step": 5395 + }, + { + "epoch": 0.27414298587402114, + "grad_norm": 0.10543857125691686, + "learning_rate": 0.0009958216080911528, + "loss": 0.6223, + "step": 5400 + }, + { + "epoch": 0.2743968219720526, + "grad_norm": 0.0900614057284698, + "learning_rate": 0.0009957929802331877, + "loss": 0.6623, + "step": 5405 + }, + { + "epoch": 0.27465065807008415, + "grad_norm": 0.04819979788344021, + "learning_rate": 0.000995764255053533, + "loss": 0.7131, + "step": 5410 + }, + { + "epoch": 0.27490449416811563, + "grad_norm": 0.03076997581248242, + "learning_rate": 0.0009957354325578276, + "loss": 0.6477, + "step": 5415 + }, + { + "epoch": 0.27515833026614717, + "grad_norm": 0.041385162650395144, + "learning_rate": 0.000995706512751729, + "loss": 0.6565, + "step": 5420 + }, + { + "epoch": 0.27541216636417865, + "grad_norm": 0.03721380433063758, + "learning_rate": 0.0009956774956409139, + "loss": 0.6774, + "step": 5425 + }, + { + "epoch": 0.2756660024622101, + "grad_norm": 0.0354007938770567, + "learning_rate": 0.0009956483812310782, + "loss": 0.6286, + "step": 5430 + }, + { + "epoch": 0.27591983856024166, + "grad_norm": 0.04494906800664398, + "learning_rate": 0.0009956191695279374, + "loss": 0.6499, + "step": 5435 + }, + { + "epoch": 0.27617367465827314, + "grad_norm": 0.06723158886677959, + "learning_rate": 0.0009955898605372249, + "loss": 0.6385, + "step": 5440 + }, + { + "epoch": 0.2764275107563047, + "grad_norm": 0.04401273742367783, + "learning_rate": 0.0009955604542646946, + "loss": 0.7135, + "step": 5445 + }, + { + "epoch": 0.27668134685433615, + "grad_norm": 0.05565588713522223, + "learning_rate": 0.0009955309507161184, + "loss": 0.6492, + "step": 5450 + }, + { + "epoch": 0.27693518295236763, + "grad_norm": 0.0719042454362043, + "learning_rate": 0.0009955013498972876, + "loss": 0.6936, + "step": 5455 + }, + { + "epoch": 0.27718901905039917, + "grad_norm": 0.03740121789307142, + "learning_rate": 0.000995471651814013, + "loss": 0.6424, + "step": 5460 + }, + { + "epoch": 0.27744285514843064, + "grad_norm": 0.03850759086931981, + "learning_rate": 0.0009954418564721242, + "loss": 0.6759, + "step": 5465 + }, + { + "epoch": 0.2776966912464622, + "grad_norm": 0.036923309271708, + "learning_rate": 0.0009954119638774695, + "loss": 0.6807, + "step": 5470 + }, + { + "epoch": 0.27795052734449366, + "grad_norm": 0.042899936320904095, + "learning_rate": 0.000995381974035917, + "loss": 0.6874, + "step": 5475 + }, + { + "epoch": 0.27820436344252514, + "grad_norm": 0.0784299633425732, + "learning_rate": 0.0009953518869533536, + "loss": 0.6906, + "step": 5480 + }, + { + "epoch": 0.2784581995405567, + "grad_norm": 0.05844919432805366, + "learning_rate": 0.0009953217026356848, + "loss": 0.6793, + "step": 5485 + }, + { + "epoch": 0.27871203563858815, + "grad_norm": 0.040898154688727616, + "learning_rate": 0.0009952914210888363, + "loss": 0.6796, + "step": 5490 + }, + { + "epoch": 0.2789658717366197, + "grad_norm": 0.056560854563487195, + "learning_rate": 0.0009952610423187517, + "loss": 0.6589, + "step": 5495 + }, + { + "epoch": 0.27921970783465117, + "grad_norm": 0.07111930095820192, + "learning_rate": 0.0009952305663313943, + "loss": 0.665, + "step": 5500 + }, + { + "epoch": 0.27947354393268264, + "grad_norm": 0.06949742084914043, + "learning_rate": 0.0009951999931327464, + "loss": 0.6705, + "step": 5505 + }, + { + "epoch": 0.2797273800307142, + "grad_norm": 0.05060283874594893, + "learning_rate": 0.0009951693227288096, + "loss": 0.7088, + "step": 5510 + }, + { + "epoch": 0.27998121612874566, + "grad_norm": 0.03589194724274505, + "learning_rate": 0.0009951385551256041, + "loss": 0.6602, + "step": 5515 + }, + { + "epoch": 0.2802350522267772, + "grad_norm": 0.0327414416620521, + "learning_rate": 0.0009951076903291693, + "loss": 0.6571, + "step": 5520 + }, + { + "epoch": 0.28048888832480867, + "grad_norm": 0.052789013483981816, + "learning_rate": 0.000995076728345564, + "loss": 0.6725, + "step": 5525 + }, + { + "epoch": 0.28074272442284015, + "grad_norm": 0.05677596579461743, + "learning_rate": 0.000995045669180866, + "loss": 0.6413, + "step": 5530 + }, + { + "epoch": 0.2809965605208717, + "grad_norm": 0.03745942908058603, + "learning_rate": 0.000995014512841172, + "loss": 0.6909, + "step": 5535 + }, + { + "epoch": 0.28125039661890316, + "grad_norm": 0.14627372454354676, + "learning_rate": 0.0009949832593325978, + "loss": 0.6459, + "step": 5540 + }, + { + "epoch": 0.2815042327169347, + "grad_norm": 0.04133886420786463, + "learning_rate": 0.000994951908661278, + "loss": 0.6771, + "step": 5545 + }, + { + "epoch": 0.2817580688149662, + "grad_norm": 0.038383767135896224, + "learning_rate": 0.0009949204608333672, + "loss": 0.6659, + "step": 5550 + }, + { + "epoch": 0.28201190491299766, + "grad_norm": 0.03870804386622496, + "learning_rate": 0.0009948889158550376, + "loss": 0.6628, + "step": 5555 + }, + { + "epoch": 0.2822657410110292, + "grad_norm": 0.03666141726099845, + "learning_rate": 0.0009948572737324822, + "loss": 0.6783, + "step": 5560 + }, + { + "epoch": 0.28251957710906067, + "grad_norm": 0.13133641642519978, + "learning_rate": 0.0009948255344719118, + "loss": 0.675, + "step": 5565 + }, + { + "epoch": 0.2827734132070922, + "grad_norm": 0.12174050657136312, + "learning_rate": 0.0009947936980795565, + "loss": 0.7195, + "step": 5570 + }, + { + "epoch": 0.2830272493051237, + "grad_norm": 0.058601197176472185, + "learning_rate": 0.000994761764561666, + "loss": 0.6784, + "step": 5575 + }, + { + "epoch": 0.28328108540315516, + "grad_norm": 0.16508393923966153, + "learning_rate": 0.0009947297339245084, + "loss": 0.7202, + "step": 5580 + }, + { + "epoch": 0.2835349215011867, + "grad_norm": 0.0766947212292105, + "learning_rate": 0.0009946976061743712, + "loss": 0.6596, + "step": 5585 + }, + { + "epoch": 0.2837887575992182, + "grad_norm": 0.03340983687623817, + "learning_rate": 0.000994665381317561, + "loss": 0.6599, + "step": 5590 + }, + { + "epoch": 0.2840425936972497, + "grad_norm": 0.037573302023749436, + "learning_rate": 0.0009946330593604033, + "loss": 0.6594, + "step": 5595 + }, + { + "epoch": 0.2842964297952812, + "grad_norm": 0.03926723139173781, + "learning_rate": 0.000994600640309243, + "loss": 0.6436, + "step": 5600 + }, + { + "epoch": 0.28455026589331267, + "grad_norm": 0.033758520131084185, + "learning_rate": 0.0009945681241704434, + "loss": 0.6706, + "step": 5605 + }, + { + "epoch": 0.2848041019913442, + "grad_norm": 0.035833106887649714, + "learning_rate": 0.0009945355109503872, + "loss": 0.6757, + "step": 5610 + }, + { + "epoch": 0.2850579380893757, + "grad_norm": 0.03439574893245049, + "learning_rate": 0.0009945028006554768, + "loss": 0.674, + "step": 5615 + }, + { + "epoch": 0.2853117741874072, + "grad_norm": 0.04115472629656225, + "learning_rate": 0.0009944699932921326, + "loss": 0.6901, + "step": 5620 + }, + { + "epoch": 0.2855656102854387, + "grad_norm": 0.042426909272189464, + "learning_rate": 0.0009944370888667947, + "loss": 0.6709, + "step": 5625 + }, + { + "epoch": 0.2858194463834702, + "grad_norm": 0.05644132704585222, + "learning_rate": 0.0009944040873859218, + "loss": 0.6765, + "step": 5630 + }, + { + "epoch": 0.2860732824815017, + "grad_norm": 0.050269965909692245, + "learning_rate": 0.0009943709888559922, + "loss": 0.6463, + "step": 5635 + }, + { + "epoch": 0.2863271185795332, + "grad_norm": 0.03990106543413605, + "learning_rate": 0.000994337793283503, + "loss": 0.6788, + "step": 5640 + }, + { + "epoch": 0.28658095467756467, + "grad_norm": 0.039426229153061605, + "learning_rate": 0.0009943045006749703, + "loss": 0.6477, + "step": 5645 + }, + { + "epoch": 0.2868347907755962, + "grad_norm": 0.037701616134469464, + "learning_rate": 0.0009942711110369291, + "loss": 0.6609, + "step": 5650 + }, + { + "epoch": 0.2870886268736277, + "grad_norm": 0.036088987975116615, + "learning_rate": 0.0009942376243759336, + "loss": 0.6237, + "step": 5655 + }, + { + "epoch": 0.2873424629716592, + "grad_norm": 0.039086804987042645, + "learning_rate": 0.0009942040406985574, + "loss": 0.6313, + "step": 5660 + }, + { + "epoch": 0.2875962990696907, + "grad_norm": 0.05039912347585908, + "learning_rate": 0.0009941703600113926, + "loss": 0.6786, + "step": 5665 + }, + { + "epoch": 0.2878501351677222, + "grad_norm": 0.05153135663276811, + "learning_rate": 0.0009941365823210506, + "loss": 0.6796, + "step": 5670 + }, + { + "epoch": 0.2881039712657537, + "grad_norm": 0.048537339426450846, + "learning_rate": 0.0009941027076341615, + "loss": 0.6335, + "step": 5675 + }, + { + "epoch": 0.2883578073637852, + "grad_norm": 0.05231418629671102, + "learning_rate": 0.0009940687359573752, + "loss": 0.689, + "step": 5680 + }, + { + "epoch": 0.2886116434618167, + "grad_norm": 0.03179751321808205, + "learning_rate": 0.00099403466729736, + "loss": 0.6398, + "step": 5685 + }, + { + "epoch": 0.2888654795598482, + "grad_norm": 0.056582858417932216, + "learning_rate": 0.000994000501660803, + "loss": 0.6721, + "step": 5690 + }, + { + "epoch": 0.2891193156578797, + "grad_norm": 0.02974455078712651, + "learning_rate": 0.0009939662390544115, + "loss": 0.6327, + "step": 5695 + }, + { + "epoch": 0.2893731517559112, + "grad_norm": 0.05164849531976483, + "learning_rate": 0.0009939318794849104, + "loss": 0.6492, + "step": 5700 + }, + { + "epoch": 0.2896269878539427, + "grad_norm": 0.03957457040909659, + "learning_rate": 0.0009938974229590446, + "loss": 0.6436, + "step": 5705 + }, + { + "epoch": 0.28988082395197423, + "grad_norm": 0.04242341712591207, + "learning_rate": 0.000993862869483578, + "loss": 0.6644, + "step": 5710 + }, + { + "epoch": 0.2901346600500057, + "grad_norm": 0.1372690604293506, + "learning_rate": 0.0009938282190652928, + "loss": 0.681, + "step": 5715 + }, + { + "epoch": 0.2903884961480372, + "grad_norm": 0.06712065575391647, + "learning_rate": 0.0009937934717109912, + "loss": 0.6436, + "step": 5720 + }, + { + "epoch": 0.2906423322460687, + "grad_norm": 0.0657319128974874, + "learning_rate": 0.0009937586274274932, + "loss": 0.646, + "step": 5725 + }, + { + "epoch": 0.2908961683441002, + "grad_norm": 0.032380117007307335, + "learning_rate": 0.0009937236862216391, + "loss": 0.6722, + "step": 5730 + }, + { + "epoch": 0.29115000444213174, + "grad_norm": 0.051361069870512616, + "learning_rate": 0.0009936886481002878, + "loss": 0.6756, + "step": 5735 + }, + { + "epoch": 0.2914038405401632, + "grad_norm": 0.03396960893164104, + "learning_rate": 0.0009936535130703169, + "loss": 0.6785, + "step": 5740 + }, + { + "epoch": 0.2916576766381947, + "grad_norm": 0.03936833237945123, + "learning_rate": 0.0009936182811386232, + "loss": 0.6588, + "step": 5745 + }, + { + "epoch": 0.29191151273622623, + "grad_norm": 0.05198925092916733, + "learning_rate": 0.0009935829523121224, + "loss": 0.6675, + "step": 5750 + }, + { + "epoch": 0.2921653488342577, + "grad_norm": 0.06085607860536531, + "learning_rate": 0.0009935475265977498, + "loss": 0.6286, + "step": 5755 + }, + { + "epoch": 0.29241918493228924, + "grad_norm": 0.059359860916179744, + "learning_rate": 0.0009935120040024587, + "loss": 0.6369, + "step": 5760 + }, + { + "epoch": 0.2926730210303207, + "grad_norm": 0.039118870920788806, + "learning_rate": 0.0009934763845332228, + "loss": 0.6544, + "step": 5765 + }, + { + "epoch": 0.2929268571283522, + "grad_norm": 0.04229042882872178, + "learning_rate": 0.0009934406681970332, + "loss": 0.6304, + "step": 5770 + }, + { + "epoch": 0.29318069322638374, + "grad_norm": 0.026802917169532255, + "learning_rate": 0.0009934048550009015, + "loss": 0.6143, + "step": 5775 + }, + { + "epoch": 0.2934345293244152, + "grad_norm": 0.03201916028903979, + "learning_rate": 0.0009933689449518573, + "loss": 0.6586, + "step": 5780 + }, + { + "epoch": 0.29368836542244675, + "grad_norm": 0.03780618180915201, + "learning_rate": 0.0009933329380569494, + "loss": 0.6688, + "step": 5785 + }, + { + "epoch": 0.29394220152047823, + "grad_norm": 0.04436936096005898, + "learning_rate": 0.0009932968343232462, + "loss": 0.6441, + "step": 5790 + }, + { + "epoch": 0.2941960376185097, + "grad_norm": 0.05967133531857773, + "learning_rate": 0.0009932606337578346, + "loss": 0.6466, + "step": 5795 + }, + { + "epoch": 0.29444987371654124, + "grad_norm": 0.05591640331892786, + "learning_rate": 0.0009932243363678203, + "loss": 0.6458, + "step": 5800 + }, + { + "epoch": 0.2947037098145727, + "grad_norm": 0.0350539399039161, + "learning_rate": 0.0009931879421603285, + "loss": 0.6694, + "step": 5805 + }, + { + "epoch": 0.29495754591260426, + "grad_norm": 0.047049178898087726, + "learning_rate": 0.0009931514511425032, + "loss": 0.628, + "step": 5810 + }, + { + "epoch": 0.29521138201063574, + "grad_norm": 0.06870970965553135, + "learning_rate": 0.0009931148633215074, + "loss": 0.6383, + "step": 5815 + }, + { + "epoch": 0.2954652181086672, + "grad_norm": 0.04607237701564132, + "learning_rate": 0.000993078178704523, + "loss": 0.6395, + "step": 5820 + }, + { + "epoch": 0.29571905420669875, + "grad_norm": 0.04293642734855891, + "learning_rate": 0.0009930413972987513, + "loss": 0.6509, + "step": 5825 + }, + { + "epoch": 0.29597289030473023, + "grad_norm": 0.04186066507422814, + "learning_rate": 0.000993004519111412, + "loss": 0.689, + "step": 5830 + }, + { + "epoch": 0.29622672640276176, + "grad_norm": 0.04758901402636238, + "learning_rate": 0.0009929675441497441, + "loss": 0.6446, + "step": 5835 + }, + { + "epoch": 0.29648056250079324, + "grad_norm": 0.02860243925691624, + "learning_rate": 0.000992930472421006, + "loss": 0.6412, + "step": 5840 + }, + { + "epoch": 0.2967343985988247, + "grad_norm": 0.14274718702473935, + "learning_rate": 0.0009928933039324741, + "loss": 0.6036, + "step": 5845 + }, + { + "epoch": 0.29698823469685626, + "grad_norm": 0.048711259308299386, + "learning_rate": 0.0009928560386914447, + "loss": 0.6508, + "step": 5850 + }, + { + "epoch": 0.29724207079488774, + "grad_norm": 0.05737639269415528, + "learning_rate": 0.000992818676705233, + "loss": 0.6338, + "step": 5855 + }, + { + "epoch": 0.2974959068929192, + "grad_norm": 0.08608374496462033, + "learning_rate": 0.0009927812179811727, + "loss": 0.7236, + "step": 5860 + }, + { + "epoch": 0.29774974299095075, + "grad_norm": 0.0849381389952358, + "learning_rate": 0.0009927436625266166, + "loss": 0.6686, + "step": 5865 + }, + { + "epoch": 0.29800357908898223, + "grad_norm": 1.0620336080619193, + "learning_rate": 0.0009927060103489369, + "loss": 0.6706, + "step": 5870 + }, + { + "epoch": 0.29825741518701376, + "grad_norm": 13.801221772105595, + "learning_rate": 0.0009926682614555247, + "loss": 2.1038, + "step": 5875 + }, + { + "epoch": 0.29851125128504524, + "grad_norm": 0.529463438355754, + "learning_rate": 0.0009926304158537895, + "loss": 1.9905, + "step": 5880 + }, + { + "epoch": 0.2987650873830767, + "grad_norm": 0.5693873876849637, + "learning_rate": 0.0009925924735511603, + "loss": 0.7879, + "step": 5885 + }, + { + "epoch": 0.29901892348110826, + "grad_norm": 0.15207649725469286, + "learning_rate": 0.0009925544345550854, + "loss": 0.7494, + "step": 5890 + }, + { + "epoch": 0.29927275957913974, + "grad_norm": 0.053777419398167665, + "learning_rate": 0.0009925162988730313, + "loss": 0.7054, + "step": 5895 + }, + { + "epoch": 0.29952659567717127, + "grad_norm": 0.11506938387205977, + "learning_rate": 0.0009924780665124839, + "loss": 0.6893, + "step": 5900 + }, + { + "epoch": 0.29978043177520275, + "grad_norm": 0.07887804598963974, + "learning_rate": 0.000992439737480948, + "loss": 0.687, + "step": 5905 + }, + { + "epoch": 0.30003426787323423, + "grad_norm": 0.06054971527019416, + "learning_rate": 0.0009924013117859475, + "loss": 0.7238, + "step": 5910 + }, + { + "epoch": 0.30028810397126576, + "grad_norm": 0.04536799993236666, + "learning_rate": 0.0009923627894350248, + "loss": 0.7173, + "step": 5915 + }, + { + "epoch": 0.30054194006929724, + "grad_norm": 0.05390447300870516, + "learning_rate": 0.0009923241704357423, + "loss": 0.7083, + "step": 5920 + }, + { + "epoch": 0.3007957761673288, + "grad_norm": 0.05672780707635087, + "learning_rate": 0.0009922854547956802, + "loss": 0.6606, + "step": 5925 + }, + { + "epoch": 0.30104961226536026, + "grad_norm": 0.03885747550342492, + "learning_rate": 0.0009922466425224383, + "loss": 0.6993, + "step": 5930 + }, + { + "epoch": 0.30130344836339173, + "grad_norm": 0.07191125789862535, + "learning_rate": 0.0009922077336236353, + "loss": 0.6375, + "step": 5935 + }, + { + "epoch": 0.30155728446142327, + "grad_norm": 0.10081257590820915, + "learning_rate": 0.000992168728106909, + "loss": 0.6541, + "step": 5940 + }, + { + "epoch": 0.30181112055945475, + "grad_norm": 0.03547952948787697, + "learning_rate": 0.0009921296259799155, + "loss": 0.6846, + "step": 5945 + }, + { + "epoch": 0.3020649566574863, + "grad_norm": 0.04217686067301573, + "learning_rate": 0.000992090427250331, + "loss": 0.6567, + "step": 5950 + }, + { + "epoch": 0.30231879275551776, + "grad_norm": 0.05110904105032828, + "learning_rate": 0.0009920511319258495, + "loss": 0.6976, + "step": 5955 + }, + { + "epoch": 0.30257262885354924, + "grad_norm": 0.03701759809254075, + "learning_rate": 0.0009920117400141848, + "loss": 0.67, + "step": 5960 + }, + { + "epoch": 0.3028264649515808, + "grad_norm": 0.7757113792101427, + "learning_rate": 0.0009919722515230691, + "loss": 0.6825, + "step": 5965 + }, + { + "epoch": 0.30308030104961226, + "grad_norm": 0.10831424664913561, + "learning_rate": 0.0009919326664602538, + "loss": 0.6896, + "step": 5970 + }, + { + "epoch": 0.3033341371476438, + "grad_norm": 0.09797456779700615, + "learning_rate": 0.0009918929848335095, + "loss": 0.7677, + "step": 5975 + }, + { + "epoch": 0.30358797324567527, + "grad_norm": 0.05159857874021976, + "learning_rate": 0.0009918532066506252, + "loss": 0.7122, + "step": 5980 + }, + { + "epoch": 0.30384180934370675, + "grad_norm": 0.05813781521982605, + "learning_rate": 0.0009918133319194093, + "loss": 0.6823, + "step": 5985 + }, + { + "epoch": 0.3040956454417383, + "grad_norm": 0.03671398334295547, + "learning_rate": 0.000991773360647689, + "loss": 0.6991, + "step": 5990 + }, + { + "epoch": 0.30434948153976976, + "grad_norm": 0.07402367959365846, + "learning_rate": 0.0009917332928433106, + "loss": 0.6946, + "step": 5995 + }, + { + "epoch": 0.3046033176378013, + "grad_norm": 0.0653670605879878, + "learning_rate": 0.000991693128514139, + "loss": 0.7273, + "step": 6000 + }, + { + "epoch": 0.3048571537358328, + "grad_norm": 0.03813871957965758, + "learning_rate": 0.0009916528676680585, + "loss": 0.6652, + "step": 6005 + }, + { + "epoch": 0.30511098983386425, + "grad_norm": 0.04376402030900822, + "learning_rate": 0.0009916125103129718, + "loss": 0.6455, + "step": 6010 + }, + { + "epoch": 0.3053648259318958, + "grad_norm": 0.04963286911499542, + "learning_rate": 0.000991572056456801, + "loss": 0.7049, + "step": 6015 + }, + { + "epoch": 0.30561866202992727, + "grad_norm": 0.030884815020089395, + "learning_rate": 0.000991531506107487, + "loss": 0.6823, + "step": 6020 + }, + { + "epoch": 0.3058724981279588, + "grad_norm": 0.03715644515895416, + "learning_rate": 0.0009914908592729896, + "loss": 0.7246, + "step": 6025 + }, + { + "epoch": 0.3061263342259903, + "grad_norm": 0.05308010746588163, + "learning_rate": 0.0009914501159612877, + "loss": 0.6453, + "step": 6030 + }, + { + "epoch": 0.30638017032402176, + "grad_norm": 0.042657404727221966, + "learning_rate": 0.0009914092761803789, + "loss": 0.6524, + "step": 6035 + }, + { + "epoch": 0.3066340064220533, + "grad_norm": 0.06017465873203829, + "learning_rate": 0.0009913683399382796, + "loss": 0.7004, + "step": 6040 + }, + { + "epoch": 0.3068878425200848, + "grad_norm": 0.038922670583128045, + "learning_rate": 0.0009913273072430257, + "loss": 0.6804, + "step": 6045 + }, + { + "epoch": 0.3071416786181163, + "grad_norm": 0.0648567542128592, + "learning_rate": 0.0009912861781026718, + "loss": 0.6558, + "step": 6050 + }, + { + "epoch": 0.3073955147161478, + "grad_norm": 0.03494754440461928, + "learning_rate": 0.0009912449525252911, + "loss": 0.666, + "step": 6055 + }, + { + "epoch": 0.30764935081417927, + "grad_norm": 0.03092884139080322, + "learning_rate": 0.000991203630518976, + "loss": 0.6688, + "step": 6060 + }, + { + "epoch": 0.3079031869122108, + "grad_norm": 0.05872150090284288, + "learning_rate": 0.0009911622120918379, + "loss": 0.6385, + "step": 6065 + }, + { + "epoch": 0.3081570230102423, + "grad_norm": 0.08520802622712559, + "learning_rate": 0.0009911206972520068, + "loss": 0.6635, + "step": 6070 + }, + { + "epoch": 0.30841085910827376, + "grad_norm": 0.06879085230717281, + "learning_rate": 0.0009910790860076324, + "loss": 0.7711, + "step": 6075 + }, + { + "epoch": 0.3086646952063053, + "grad_norm": 0.06496604434818472, + "learning_rate": 0.0009910373783668823, + "loss": 0.6546, + "step": 6080 + }, + { + "epoch": 0.3089185313043368, + "grad_norm": 0.03788866020152313, + "learning_rate": 0.0009909955743379435, + "loss": 0.6574, + "step": 6085 + }, + { + "epoch": 0.3091723674023683, + "grad_norm": 0.036029343391252786, + "learning_rate": 0.0009909536739290221, + "loss": 0.6614, + "step": 6090 + }, + { + "epoch": 0.3094262035003998, + "grad_norm": 0.0357204003060438, + "learning_rate": 0.0009909116771483427, + "loss": 0.6273, + "step": 6095 + }, + { + "epoch": 0.30968003959843127, + "grad_norm": 0.036306841185922825, + "learning_rate": 0.0009908695840041496, + "loss": 0.6725, + "step": 6100 + }, + { + "epoch": 0.3099338756964628, + "grad_norm": 0.03660198406181924, + "learning_rate": 0.000990827394504705, + "loss": 0.647, + "step": 6105 + }, + { + "epoch": 0.3101877117944943, + "grad_norm": 0.04095269101364938, + "learning_rate": 0.0009907851086582906, + "loss": 0.6755, + "step": 6110 + }, + { + "epoch": 0.3104415478925258, + "grad_norm": 0.03247697430503874, + "learning_rate": 0.0009907427264732069, + "loss": 0.6813, + "step": 6115 + }, + { + "epoch": 0.3106953839905573, + "grad_norm": 0.03162996739245832, + "learning_rate": 0.0009907002479577734, + "loss": 0.6643, + "step": 6120 + }, + { + "epoch": 0.3109492200885888, + "grad_norm": 0.5455514035154928, + "learning_rate": 0.0009906576731203282, + "loss": 0.6433, + "step": 6125 + }, + { + "epoch": 0.3112030561866203, + "grad_norm": 0.044476465750440224, + "learning_rate": 0.0009906150019692288, + "loss": 0.6653, + "step": 6130 + }, + { + "epoch": 0.3114568922846518, + "grad_norm": 0.05927549810439251, + "learning_rate": 0.000990572234512851, + "loss": 0.6589, + "step": 6135 + }, + { + "epoch": 0.3117107283826833, + "grad_norm": 0.056665905206855084, + "learning_rate": 0.0009905293707595903, + "loss": 0.649, + "step": 6140 + }, + { + "epoch": 0.3119645644807148, + "grad_norm": 0.03331713795791355, + "learning_rate": 0.0009904864107178602, + "loss": 0.64, + "step": 6145 + }, + { + "epoch": 0.3122184005787463, + "grad_norm": 0.04406210853179684, + "learning_rate": 0.000990443354396094, + "loss": 0.6702, + "step": 6150 + }, + { + "epoch": 0.3124722366767778, + "grad_norm": 0.057693446080025364, + "learning_rate": 0.000990400201802743, + "loss": 0.7062, + "step": 6155 + }, + { + "epoch": 0.3127260727748093, + "grad_norm": 0.035021125314635915, + "learning_rate": 0.0009903569529462778, + "loss": 0.697, + "step": 6160 + }, + { + "epoch": 0.31297990887284083, + "grad_norm": 0.055650402580445305, + "learning_rate": 0.0009903136078351885, + "loss": 0.648, + "step": 6165 + }, + { + "epoch": 0.3132337449708723, + "grad_norm": 0.06196313978890203, + "learning_rate": 0.0009902701664779828, + "loss": 0.6454, + "step": 6170 + }, + { + "epoch": 0.3134875810689038, + "grad_norm": 0.038941666671227525, + "learning_rate": 0.0009902266288831887, + "loss": 0.676, + "step": 6175 + }, + { + "epoch": 0.3137414171669353, + "grad_norm": 0.0415527424909999, + "learning_rate": 0.000990182995059352, + "loss": 0.6999, + "step": 6180 + }, + { + "epoch": 0.3139952532649668, + "grad_norm": 0.032478914330314065, + "learning_rate": 0.0009901392650150378, + "loss": 0.6384, + "step": 6185 + }, + { + "epoch": 0.31424908936299834, + "grad_norm": 0.03685841261439452, + "learning_rate": 0.0009900954387588303, + "loss": 0.6149, + "step": 6190 + }, + { + "epoch": 0.3145029254610298, + "grad_norm": 0.0576119132892154, + "learning_rate": 0.0009900515162993325, + "loss": 0.6754, + "step": 6195 + }, + { + "epoch": 0.3147567615590613, + "grad_norm": 0.03227069972941047, + "learning_rate": 0.0009900074976451655, + "loss": 0.6563, + "step": 6200 + }, + { + "epoch": 0.31501059765709283, + "grad_norm": 0.03374111959001576, + "learning_rate": 0.0009899633828049706, + "loss": 0.6457, + "step": 6205 + }, + { + "epoch": 0.3152644337551243, + "grad_norm": 0.036155101851612435, + "learning_rate": 0.0009899191717874071, + "loss": 0.646, + "step": 6210 + }, + { + "epoch": 0.31551826985315584, + "grad_norm": 0.031824458816045474, + "learning_rate": 0.0009898748646011534, + "loss": 0.6559, + "step": 6215 + }, + { + "epoch": 0.3157721059511873, + "grad_norm": 0.03276354958142416, + "learning_rate": 0.0009898304612549068, + "loss": 0.6378, + "step": 6220 + }, + { + "epoch": 0.3160259420492188, + "grad_norm": 0.029188354303673416, + "learning_rate": 0.0009897859617573833, + "loss": 0.6496, + "step": 6225 + }, + { + "epoch": 0.31627977814725033, + "grad_norm": 0.03357230960317099, + "learning_rate": 0.0009897413661173182, + "loss": 0.6655, + "step": 6230 + }, + { + "epoch": 0.3165336142452818, + "grad_norm": 0.03530965710230692, + "learning_rate": 0.0009896966743434654, + "loss": 0.6709, + "step": 6235 + }, + { + "epoch": 0.31678745034331335, + "grad_norm": 0.03547112168077008, + "learning_rate": 0.0009896518864445974, + "loss": 0.6459, + "step": 6240 + }, + { + "epoch": 0.3170412864413448, + "grad_norm": 0.05701195512800427, + "learning_rate": 0.0009896070024295058, + "loss": 0.6933, + "step": 6245 + }, + { + "epoch": 0.3172951225393763, + "grad_norm": 0.034050322178401336, + "learning_rate": 0.0009895620223070013, + "loss": 0.6286, + "step": 6250 + }, + { + "epoch": 0.31754895863740784, + "grad_norm": 0.10844739490541815, + "learning_rate": 0.0009895169460859136, + "loss": 0.6691, + "step": 6255 + }, + { + "epoch": 0.3178027947354393, + "grad_norm": 0.04811906636847503, + "learning_rate": 0.0009894717737750905, + "loss": 0.6406, + "step": 6260 + }, + { + "epoch": 0.31805663083347085, + "grad_norm": 0.03651016020761419, + "learning_rate": 0.000989426505383399, + "loss": 0.6404, + "step": 6265 + }, + { + "epoch": 0.31831046693150233, + "grad_norm": 0.03909160423336249, + "learning_rate": 0.0009893811409197254, + "loss": 0.6718, + "step": 6270 + }, + { + "epoch": 0.3185643030295338, + "grad_norm": 0.03433471167278222, + "learning_rate": 0.0009893356803929742, + "loss": 0.7035, + "step": 6275 + }, + { + "epoch": 0.31881813912756535, + "grad_norm": 0.0315707692611077, + "learning_rate": 0.0009892901238120694, + "loss": 0.6278, + "step": 6280 + }, + { + "epoch": 0.3190719752255968, + "grad_norm": 0.03181406843213265, + "learning_rate": 0.0009892444711859536, + "loss": 0.6195, + "step": 6285 + }, + { + "epoch": 0.3193258113236283, + "grad_norm": 0.03252733678185362, + "learning_rate": 0.0009891987225235876, + "loss": 0.6591, + "step": 6290 + }, + { + "epoch": 0.31957964742165984, + "grad_norm": 0.028275724026086066, + "learning_rate": 0.0009891528778339523, + "loss": 0.632, + "step": 6295 + }, + { + "epoch": 0.3198334835196913, + "grad_norm": 0.0332639388947232, + "learning_rate": 0.0009891069371260463, + "loss": 0.6706, + "step": 6300 + }, + { + "epoch": 0.32008731961772285, + "grad_norm": 0.04023616497430709, + "learning_rate": 0.0009890609004088878, + "loss": 0.6397, + "step": 6305 + }, + { + "epoch": 0.32034115571575433, + "grad_norm": 0.04189562916683901, + "learning_rate": 0.0009890147676915133, + "loss": 0.6589, + "step": 6310 + }, + { + "epoch": 0.3205949918137858, + "grad_norm": 0.04615410615909303, + "learning_rate": 0.0009889685389829787, + "loss": 0.654, + "step": 6315 + }, + { + "epoch": 0.32084882791181735, + "grad_norm": 0.0876708805444767, + "learning_rate": 0.0009889222142923585, + "loss": 0.6873, + "step": 6320 + }, + { + "epoch": 0.3211026640098488, + "grad_norm": 0.0731800370414708, + "learning_rate": 0.0009888757936287458, + "loss": 0.6661, + "step": 6325 + }, + { + "epoch": 0.32135650010788036, + "grad_norm": 0.04489055013050584, + "learning_rate": 0.0009888292770012528, + "loss": 0.6437, + "step": 6330 + }, + { + "epoch": 0.32161033620591184, + "grad_norm": 0.05950573079091027, + "learning_rate": 0.0009887826644190106, + "loss": 0.6107, + "step": 6335 + }, + { + "epoch": 0.3218641723039433, + "grad_norm": 0.03311301201808374, + "learning_rate": 0.0009887359558911689, + "loss": 0.6396, + "step": 6340 + }, + { + "epoch": 0.32211800840197485, + "grad_norm": 0.04257944673946886, + "learning_rate": 0.0009886891514268963, + "loss": 0.6497, + "step": 6345 + }, + { + "epoch": 0.32237184450000633, + "grad_norm": 0.03870234251780305, + "learning_rate": 0.0009886422510353805, + "loss": 0.657, + "step": 6350 + }, + { + "epoch": 0.32262568059803787, + "grad_norm": 0.03016914715973346, + "learning_rate": 0.0009885952547258278, + "loss": 0.6503, + "step": 6355 + }, + { + "epoch": 0.32287951669606935, + "grad_norm": 0.039652301246605876, + "learning_rate": 0.000988548162507463, + "loss": 0.7024, + "step": 6360 + }, + { + "epoch": 0.3231333527941008, + "grad_norm": 0.032827930330652684, + "learning_rate": 0.0009885009743895302, + "loss": 0.6377, + "step": 6365 + }, + { + "epoch": 0.32338718889213236, + "grad_norm": 0.04068567499221553, + "learning_rate": 0.0009884536903812923, + "loss": 0.6727, + "step": 6370 + }, + { + "epoch": 0.32364102499016384, + "grad_norm": 0.0345449057811016, + "learning_rate": 0.000988406310492031, + "loss": 0.6875, + "step": 6375 + }, + { + "epoch": 0.3238948610881954, + "grad_norm": 0.04902874284849529, + "learning_rate": 0.0009883588347310466, + "loss": 0.6455, + "step": 6380 + }, + { + "epoch": 0.32414869718622685, + "grad_norm": 0.03422848521805073, + "learning_rate": 0.0009883112631076585, + "loss": 0.6338, + "step": 6385 + }, + { + "epoch": 0.32440253328425833, + "grad_norm": 0.03165939758398714, + "learning_rate": 0.0009882635956312046, + "loss": 0.6472, + "step": 6390 + }, + { + "epoch": 0.32465636938228987, + "grad_norm": 0.027078597446684824, + "learning_rate": 0.0009882158323110417, + "loss": 0.6291, + "step": 6395 + }, + { + "epoch": 0.32491020548032135, + "grad_norm": 0.03356428796023044, + "learning_rate": 0.0009881679731565457, + "loss": 0.649, + "step": 6400 + }, + { + "epoch": 0.3251640415783529, + "grad_norm": 0.0644450340702738, + "learning_rate": 0.000988120018177111, + "loss": 0.6616, + "step": 6405 + }, + { + "epoch": 0.32541787767638436, + "grad_norm": 0.049328556792104926, + "learning_rate": 0.0009880719673821513, + "loss": 0.6334, + "step": 6410 + }, + { + "epoch": 0.32567171377441584, + "grad_norm": 0.03340548046894897, + "learning_rate": 0.000988023820781098, + "loss": 0.6107, + "step": 6415 + }, + { + "epoch": 0.3259255498724474, + "grad_norm": 0.031492806873797255, + "learning_rate": 0.000987975578383403, + "loss": 0.6654, + "step": 6420 + }, + { + "epoch": 0.32617938597047885, + "grad_norm": 0.03786542334456132, + "learning_rate": 0.0009879272401985349, + "loss": 0.6533, + "step": 6425 + }, + { + "epoch": 0.3264332220685104, + "grad_norm": 0.03333849837492178, + "learning_rate": 0.0009878788062359831, + "loss": 0.6664, + "step": 6430 + }, + { + "epoch": 0.32668705816654187, + "grad_norm": 0.03610249374246746, + "learning_rate": 0.0009878302765052548, + "loss": 0.633, + "step": 6435 + }, + { + "epoch": 0.32694089426457335, + "grad_norm": 0.031209587596926817, + "learning_rate": 0.0009877816510158756, + "loss": 0.6613, + "step": 6440 + }, + { + "epoch": 0.3271947303626049, + "grad_norm": 0.03483245394564131, + "learning_rate": 0.0009877329297773914, + "loss": 0.6727, + "step": 6445 + }, + { + "epoch": 0.32744856646063636, + "grad_norm": 0.034854234123111616, + "learning_rate": 0.000987684112799365, + "loss": 0.6448, + "step": 6450 + }, + { + "epoch": 0.3277024025586679, + "grad_norm": 0.03027463862874961, + "learning_rate": 0.0009876352000913796, + "loss": 0.6376, + "step": 6455 + }, + { + "epoch": 0.3279562386566994, + "grad_norm": 0.029904982015815273, + "learning_rate": 0.000987586191663036, + "loss": 0.6271, + "step": 6460 + }, + { + "epoch": 0.32821007475473085, + "grad_norm": 0.04606450117610927, + "learning_rate": 0.0009875370875239548, + "loss": 0.6324, + "step": 6465 + }, + { + "epoch": 0.3284639108527624, + "grad_norm": 0.047345984810625526, + "learning_rate": 0.0009874878876837746, + "loss": 0.6559, + "step": 6470 + }, + { + "epoch": 0.32871774695079387, + "grad_norm": 0.027399081554610526, + "learning_rate": 0.0009874385921521533, + "loss": 0.6782, + "step": 6475 + }, + { + "epoch": 0.3289715830488254, + "grad_norm": 0.03402265966097058, + "learning_rate": 0.000987389200938767, + "loss": 0.6826, + "step": 6480 + }, + { + "epoch": 0.3292254191468569, + "grad_norm": 0.07145987939489644, + "learning_rate": 0.0009873397140533111, + "loss": 0.648, + "step": 6485 + }, + { + "epoch": 0.32947925524488836, + "grad_norm": 0.030640240717414162, + "learning_rate": 0.0009872901315054999, + "loss": 0.6225, + "step": 6490 + }, + { + "epoch": 0.3297330913429199, + "grad_norm": 0.029273085435207902, + "learning_rate": 0.000987240453305066, + "loss": 0.668, + "step": 6495 + }, + { + "epoch": 0.32998692744095137, + "grad_norm": 0.027915653420681048, + "learning_rate": 0.0009871906794617607, + "loss": 0.675, + "step": 6500 + }, + { + "epoch": 0.33024076353898285, + "grad_norm": 0.028085088656119427, + "learning_rate": 0.0009871408099853547, + "loss": 0.6468, + "step": 6505 + }, + { + "epoch": 0.3304945996370144, + "grad_norm": 0.03469790217987671, + "learning_rate": 0.0009870908448856373, + "loss": 0.6291, + "step": 6510 + }, + { + "epoch": 0.33074843573504586, + "grad_norm": 0.03940795336268566, + "learning_rate": 0.000987040784172416, + "loss": 0.6471, + "step": 6515 + }, + { + "epoch": 0.3310022718330774, + "grad_norm": 0.0372777835424837, + "learning_rate": 0.0009869906278555177, + "loss": 0.6434, + "step": 6520 + }, + { + "epoch": 0.3312561079311089, + "grad_norm": 0.05751236855268058, + "learning_rate": 0.0009869403759447876, + "loss": 0.6963, + "step": 6525 + }, + { + "epoch": 0.33150994402914036, + "grad_norm": 0.028415506939899773, + "learning_rate": 0.0009868900284500904, + "loss": 0.6116, + "step": 6530 + }, + { + "epoch": 0.3317637801271719, + "grad_norm": 0.06720606915783799, + "learning_rate": 0.0009868395853813085, + "loss": 0.665, + "step": 6535 + }, + { + "epoch": 0.33201761622520337, + "grad_norm": 0.037437450250349114, + "learning_rate": 0.000986789046748344, + "loss": 0.6683, + "step": 6540 + }, + { + "epoch": 0.3322714523232349, + "grad_norm": 0.03050364304716209, + "learning_rate": 0.000986738412561117, + "loss": 0.6796, + "step": 6545 + }, + { + "epoch": 0.3325252884212664, + "grad_norm": 0.03574775300175517, + "learning_rate": 0.0009866876828295672, + "loss": 0.6177, + "step": 6550 + }, + { + "epoch": 0.33277912451929786, + "grad_norm": 0.02999643436973219, + "learning_rate": 0.0009866368575636522, + "loss": 0.6632, + "step": 6555 + }, + { + "epoch": 0.3330329606173294, + "grad_norm": 0.034667486256933826, + "learning_rate": 0.0009865859367733489, + "loss": 0.6685, + "step": 6560 + }, + { + "epoch": 0.3332867967153609, + "grad_norm": 0.035749048566894245, + "learning_rate": 0.0009865349204686532, + "loss": 0.624, + "step": 6565 + }, + { + "epoch": 0.3335406328133924, + "grad_norm": 0.02721431745155214, + "learning_rate": 0.0009864838086595783, + "loss": 0.6438, + "step": 6570 + }, + { + "epoch": 0.3337944689114239, + "grad_norm": 0.030303232949427904, + "learning_rate": 0.0009864326013561584, + "loss": 0.6504, + "step": 6575 + }, + { + "epoch": 0.33404830500945537, + "grad_norm": 0.033151379828375675, + "learning_rate": 0.0009863812985684446, + "loss": 0.6616, + "step": 6580 + }, + { + "epoch": 0.3343021411074869, + "grad_norm": 0.03492354659989374, + "learning_rate": 0.0009863299003065073, + "loss": 0.6649, + "step": 6585 + }, + { + "epoch": 0.3345559772055184, + "grad_norm": 0.029289412856450096, + "learning_rate": 0.000986278406580436, + "loss": 0.671, + "step": 6590 + }, + { + "epoch": 0.3348098133035499, + "grad_norm": 0.029758417773996532, + "learning_rate": 0.0009862268174003386, + "loss": 0.6431, + "step": 6595 + }, + { + "epoch": 0.3350636494015814, + "grad_norm": 0.02917170623260094, + "learning_rate": 0.0009861751327763415, + "loss": 0.6586, + "step": 6600 + }, + { + "epoch": 0.3353174854996129, + "grad_norm": 0.039238413473097804, + "learning_rate": 0.0009861233527185907, + "loss": 0.6523, + "step": 6605 + }, + { + "epoch": 0.3355713215976444, + "grad_norm": 0.05769974377090198, + "learning_rate": 0.00098607147723725, + "loss": 0.6534, + "step": 6610 + }, + { + "epoch": 0.3358251576956759, + "grad_norm": 0.05207362585582225, + "learning_rate": 0.000986019506342502, + "loss": 0.628, + "step": 6615 + }, + { + "epoch": 0.3360789937937074, + "grad_norm": 0.042481555414880016, + "learning_rate": 0.0009859674400445491, + "loss": 0.6352, + "step": 6620 + }, + { + "epoch": 0.3363328298917389, + "grad_norm": 0.038592274715997285, + "learning_rate": 0.0009859152783536112, + "loss": 0.655, + "step": 6625 + }, + { + "epoch": 0.3365866659897704, + "grad_norm": 0.03810235969474428, + "learning_rate": 0.0009858630212799273, + "loss": 0.6078, + "step": 6630 + }, + { + "epoch": 0.3368405020878019, + "grad_norm": 0.04141260814892809, + "learning_rate": 0.0009858106688337552, + "loss": 0.6309, + "step": 6635 + }, + { + "epoch": 0.3370943381858334, + "grad_norm": 0.050224588657700506, + "learning_rate": 0.0009857582210253718, + "loss": 0.6598, + "step": 6640 + }, + { + "epoch": 0.33734817428386493, + "grad_norm": 0.044838404082403416, + "learning_rate": 0.000985705677865072, + "loss": 0.643, + "step": 6645 + }, + { + "epoch": 0.3376020103818964, + "grad_norm": 0.039078603300489616, + "learning_rate": 0.0009856530393631698, + "loss": 0.6293, + "step": 6650 + }, + { + "epoch": 0.3378558464799279, + "grad_norm": 0.049346205757841856, + "learning_rate": 0.0009856003055299979, + "loss": 0.67, + "step": 6655 + }, + { + "epoch": 0.3381096825779594, + "grad_norm": 0.0467859227211263, + "learning_rate": 0.0009855474763759075, + "loss": 0.6425, + "step": 6660 + }, + { + "epoch": 0.3383635186759909, + "grad_norm": 0.03718081935719444, + "learning_rate": 0.0009854945519112692, + "loss": 0.65, + "step": 6665 + }, + { + "epoch": 0.33861735477402244, + "grad_norm": 0.04145755563324682, + "learning_rate": 0.0009854415321464715, + "loss": 0.6424, + "step": 6670 + }, + { + "epoch": 0.3388711908720539, + "grad_norm": 0.03672619994306692, + "learning_rate": 0.0009853884170919218, + "loss": 0.66, + "step": 6675 + }, + { + "epoch": 0.3391250269700854, + "grad_norm": 0.03752204942770816, + "learning_rate": 0.0009853352067580466, + "loss": 0.6683, + "step": 6680 + }, + { + "epoch": 0.33937886306811693, + "grad_norm": 0.03222882623511126, + "learning_rate": 0.0009852819011552908, + "loss": 0.6438, + "step": 6685 + }, + { + "epoch": 0.3396326991661484, + "grad_norm": 0.02965590085193827, + "learning_rate": 0.0009852285002941174, + "loss": 0.6642, + "step": 6690 + }, + { + "epoch": 0.33988653526417995, + "grad_norm": 0.03879854510223968, + "learning_rate": 0.0009851750041850098, + "loss": 0.6477, + "step": 6695 + }, + { + "epoch": 0.3401403713622114, + "grad_norm": 0.14912016060061054, + "learning_rate": 0.000985121412838468, + "loss": 0.6301, + "step": 6700 + }, + { + "epoch": 0.3403942074602429, + "grad_norm": 0.031220805777869454, + "learning_rate": 0.0009850677262650124, + "loss": 0.6785, + "step": 6705 + }, + { + "epoch": 0.34064804355827444, + "grad_norm": 0.047945073583954875, + "learning_rate": 0.000985013944475181, + "loss": 0.6329, + "step": 6710 + }, + { + "epoch": 0.3409018796563059, + "grad_norm": 0.03125211749366002, + "learning_rate": 0.0009849600674795313, + "loss": 0.6694, + "step": 6715 + }, + { + "epoch": 0.34115571575433745, + "grad_norm": 0.030016984119769507, + "learning_rate": 0.0009849060952886385, + "loss": 0.6489, + "step": 6720 + }, + { + "epoch": 0.34140955185236893, + "grad_norm": 0.0321881850801395, + "learning_rate": 0.0009848520279130979, + "loss": 0.6557, + "step": 6725 + }, + { + "epoch": 0.3416633879504004, + "grad_norm": 0.027489966034472072, + "learning_rate": 0.0009847978653635219, + "loss": 0.6161, + "step": 6730 + }, + { + "epoch": 0.34191722404843194, + "grad_norm": 0.03167397292683289, + "learning_rate": 0.0009847436076505425, + "loss": 0.6661, + "step": 6735 + }, + { + "epoch": 0.3421710601464634, + "grad_norm": 0.027164902072622888, + "learning_rate": 0.0009846892547848106, + "loss": 0.6416, + "step": 6740 + }, + { + "epoch": 0.3424248962444949, + "grad_norm": 0.032186132345030184, + "learning_rate": 0.000984634806776995, + "loss": 0.6011, + "step": 6745 + }, + { + "epoch": 0.34267873234252644, + "grad_norm": 0.05422054145392441, + "learning_rate": 0.0009845802636377834, + "loss": 0.619, + "step": 6750 + }, + { + "epoch": 0.3429325684405579, + "grad_norm": 0.03126640652473011, + "learning_rate": 0.000984525625377883, + "loss": 0.663, + "step": 6755 + }, + { + "epoch": 0.34318640453858945, + "grad_norm": 0.04178980876855967, + "learning_rate": 0.0009844708920080185, + "loss": 0.6764, + "step": 6760 + }, + { + "epoch": 0.34344024063662093, + "grad_norm": 0.0479771178058185, + "learning_rate": 0.000984416063538934, + "loss": 0.636, + "step": 6765 + }, + { + "epoch": 0.3436940767346524, + "grad_norm": 0.048210015731811606, + "learning_rate": 0.0009843611399813921, + "loss": 0.6826, + "step": 6770 + }, + { + "epoch": 0.34394791283268394, + "grad_norm": 0.040035034533658136, + "learning_rate": 0.0009843061213461739, + "loss": 0.6551, + "step": 6775 + }, + { + "epoch": 0.3442017489307154, + "grad_norm": 0.04450679708387797, + "learning_rate": 0.0009842510076440792, + "loss": 0.6789, + "step": 6780 + }, + { + "epoch": 0.34445558502874696, + "grad_norm": 0.03469053565219521, + "learning_rate": 0.0009841957988859268, + "loss": 0.636, + "step": 6785 + }, + { + "epoch": 0.34470942112677844, + "grad_norm": 0.03854111003127534, + "learning_rate": 0.0009841404950825536, + "loss": 0.6504, + "step": 6790 + }, + { + "epoch": 0.3449632572248099, + "grad_norm": 0.0422532957102432, + "learning_rate": 0.0009840850962448157, + "loss": 0.7046, + "step": 6795 + }, + { + "epoch": 0.34521709332284145, + "grad_norm": 0.042849883678541305, + "learning_rate": 0.0009840296023835877, + "loss": 0.6982, + "step": 6800 + }, + { + "epoch": 0.34547092942087293, + "grad_norm": 0.11970895618204253, + "learning_rate": 0.0009839740135097624, + "loss": 0.7033, + "step": 6805 + }, + { + "epoch": 0.34572476551890446, + "grad_norm": 0.28594499591001454, + "learning_rate": 0.0009839183296342518, + "loss": 0.6618, + "step": 6810 + }, + { + "epoch": 0.34597860161693594, + "grad_norm": 0.09723775531984895, + "learning_rate": 0.0009838625507679866, + "loss": 0.7034, + "step": 6815 + }, + { + "epoch": 0.3462324377149674, + "grad_norm": 0.12532852308987766, + "learning_rate": 0.0009838066769219155, + "loss": 0.7629, + "step": 6820 + }, + { + "epoch": 0.34648627381299896, + "grad_norm": 0.0753799546279816, + "learning_rate": 0.0009837507081070064, + "loss": 0.7012, + "step": 6825 + }, + { + "epoch": 0.34674010991103044, + "grad_norm": 0.23690674591283684, + "learning_rate": 0.000983694644334246, + "loss": 0.7184, + "step": 6830 + }, + { + "epoch": 0.34699394600906197, + "grad_norm": 0.06495848284102379, + "learning_rate": 0.000983638485614639, + "loss": 0.6394, + "step": 6835 + }, + { + "epoch": 0.34724778210709345, + "grad_norm": 0.10113306645676379, + "learning_rate": 0.0009835822319592092, + "loss": 0.7055, + "step": 6840 + }, + { + "epoch": 0.34750161820512493, + "grad_norm": 13.507304609333943, + "learning_rate": 0.0009835258833789987, + "loss": 0.7508, + "step": 6845 + }, + { + "epoch": 0.34775545430315646, + "grad_norm": 0.08682835501016006, + "learning_rate": 0.0009834694398850687, + "loss": 0.6614, + "step": 6850 + }, + { + "epoch": 0.34800929040118794, + "grad_norm": 0.0777945624030565, + "learning_rate": 0.000983412901488499, + "loss": 0.6523, + "step": 6855 + }, + { + "epoch": 0.3482631264992195, + "grad_norm": 0.04009386117281962, + "learning_rate": 0.0009833562682003871, + "loss": 0.6425, + "step": 6860 + }, + { + "epoch": 0.34851696259725096, + "grad_norm": 0.07938149186312117, + "learning_rate": 0.0009832995400318506, + "loss": 0.6724, + "step": 6865 + }, + { + "epoch": 0.34877079869528244, + "grad_norm": 0.0464733613481905, + "learning_rate": 0.0009832427169940243, + "loss": 0.6723, + "step": 6870 + }, + { + "epoch": 0.34902463479331397, + "grad_norm": 0.030541854449360313, + "learning_rate": 0.0009831857990980628, + "loss": 0.6328, + "step": 6875 + }, + { + "epoch": 0.34927847089134545, + "grad_norm": 0.08684468777696315, + "learning_rate": 0.0009831287863551386, + "loss": 0.6537, + "step": 6880 + }, + { + "epoch": 0.349532306989377, + "grad_norm": 0.032667710885275855, + "learning_rate": 0.000983071678776443, + "loss": 0.6233, + "step": 6885 + }, + { + "epoch": 0.34978614308740846, + "grad_norm": 0.06215345815334673, + "learning_rate": 0.0009830144763731856, + "loss": 0.6523, + "step": 6890 + }, + { + "epoch": 0.35003997918543994, + "grad_norm": 0.050425409257167854, + "learning_rate": 0.0009829571791565956, + "loss": 0.6414, + "step": 6895 + }, + { + "epoch": 0.3502938152834715, + "grad_norm": 0.039909942468084936, + "learning_rate": 0.0009828997871379197, + "loss": 0.6715, + "step": 6900 + }, + { + "epoch": 0.35054765138150296, + "grad_norm": 0.03305269203844265, + "learning_rate": 0.0009828423003284239, + "loss": 0.68, + "step": 6905 + }, + { + "epoch": 0.3508014874795345, + "grad_norm": 0.06428144126234532, + "learning_rate": 0.0009827847187393924, + "loss": 0.6522, + "step": 6910 + }, + { + "epoch": 0.35105532357756597, + "grad_norm": 0.040717882474944, + "learning_rate": 0.0009827270423821283, + "loss": 0.6798, + "step": 6915 + }, + { + "epoch": 0.35130915967559745, + "grad_norm": 0.03774641806829989, + "learning_rate": 0.000982669271267953, + "loss": 0.6249, + "step": 6920 + }, + { + "epoch": 0.351562995773629, + "grad_norm": 0.22254368278616823, + "learning_rate": 0.000982611405408207, + "loss": 0.6375, + "step": 6925 + }, + { + "epoch": 0.35181683187166046, + "grad_norm": 0.0492079469548121, + "learning_rate": 0.0009825534448142487, + "loss": 0.6596, + "step": 6930 + }, + { + "epoch": 0.352070667969692, + "grad_norm": 0.030183273462930794, + "learning_rate": 0.0009824953894974559, + "loss": 0.6492, + "step": 6935 + }, + { + "epoch": 0.3523245040677235, + "grad_norm": 0.03485104672445065, + "learning_rate": 0.0009824372394692242, + "loss": 0.6405, + "step": 6940 + }, + { + "epoch": 0.35257834016575496, + "grad_norm": 0.04394880912256667, + "learning_rate": 0.0009823789947409685, + "loss": 0.6408, + "step": 6945 + }, + { + "epoch": 0.3528321762637865, + "grad_norm": 0.062030496326437375, + "learning_rate": 0.0009823206553241214, + "loss": 0.6799, + "step": 6950 + }, + { + "epoch": 0.35308601236181797, + "grad_norm": 0.034400042065388416, + "learning_rate": 0.0009822622212301354, + "loss": 0.684, + "step": 6955 + }, + { + "epoch": 0.35333984845984945, + "grad_norm": 0.04472477328842319, + "learning_rate": 0.0009822036924704803, + "loss": 0.6524, + "step": 6960 + }, + { + "epoch": 0.353593684557881, + "grad_norm": 0.0724030385807204, + "learning_rate": 0.000982145069056645, + "loss": 0.668, + "step": 6965 + }, + { + "epoch": 0.35384752065591246, + "grad_norm": 0.19364536723665782, + "learning_rate": 0.000982086351000137, + "loss": 0.6629, + "step": 6970 + }, + { + "epoch": 0.354101356753944, + "grad_norm": 0.04733731040455226, + "learning_rate": 0.0009820275383124826, + "loss": 0.6729, + "step": 6975 + }, + { + "epoch": 0.3543551928519755, + "grad_norm": 0.041046331221462486, + "learning_rate": 0.0009819686310052263, + "loss": 0.6494, + "step": 6980 + }, + { + "epoch": 0.35460902895000695, + "grad_norm": 0.030904028187576038, + "learning_rate": 0.0009819096290899312, + "loss": 0.6761, + "step": 6985 + }, + { + "epoch": 0.3548628650480385, + "grad_norm": 0.06764767017877209, + "learning_rate": 0.0009818505325781793, + "loss": 0.6716, + "step": 6990 + }, + { + "epoch": 0.35511670114606997, + "grad_norm": 0.028029672458931103, + "learning_rate": 0.000981791341481571, + "loss": 0.6701, + "step": 6995 + }, + { + "epoch": 0.3553705372441015, + "grad_norm": 0.05118116772844429, + "learning_rate": 0.0009817320558117247, + "loss": 0.663, + "step": 7000 + }, + { + "epoch": 0.355624373342133, + "grad_norm": 0.07241085260585647, + "learning_rate": 0.0009816726755802784, + "loss": 0.6521, + "step": 7005 + }, + { + "epoch": 0.35587820944016446, + "grad_norm": 0.041960569266684886, + "learning_rate": 0.000981613200798888, + "loss": 0.6904, + "step": 7010 + }, + { + "epoch": 0.356132045538196, + "grad_norm": 0.052753495247033405, + "learning_rate": 0.000981553631479228, + "loss": 0.6356, + "step": 7015 + }, + { + "epoch": 0.3563858816362275, + "grad_norm": 0.032282399891573395, + "learning_rate": 0.0009814939676329917, + "loss": 0.6411, + "step": 7020 + }, + { + "epoch": 0.356639717734259, + "grad_norm": 0.03581374152863383, + "learning_rate": 0.0009814342092718908, + "loss": 0.6605, + "step": 7025 + }, + { + "epoch": 0.3568935538322905, + "grad_norm": 0.03356418157006434, + "learning_rate": 0.0009813743564076557, + "loss": 0.6459, + "step": 7030 + }, + { + "epoch": 0.35714738993032197, + "grad_norm": 0.027321655354040797, + "learning_rate": 0.0009813144090520347, + "loss": 0.6455, + "step": 7035 + }, + { + "epoch": 0.3574012260283535, + "grad_norm": 0.036750526110344274, + "learning_rate": 0.0009812543672167958, + "loss": 0.6281, + "step": 7040 + }, + { + "epoch": 0.357655062126385, + "grad_norm": 0.03514246272392273, + "learning_rate": 0.0009811942309137242, + "loss": 0.676, + "step": 7045 + }, + { + "epoch": 0.3579088982244165, + "grad_norm": 0.061078619376600155, + "learning_rate": 0.0009811340001546253, + "loss": 0.66, + "step": 7050 + }, + { + "epoch": 0.358162734322448, + "grad_norm": 0.03588660012791982, + "learning_rate": 0.0009810736749513212, + "loss": 0.6456, + "step": 7055 + }, + { + "epoch": 0.3584165704204795, + "grad_norm": 0.032510848869803814, + "learning_rate": 0.000981013255315654, + "loss": 0.6658, + "step": 7060 + }, + { + "epoch": 0.358670406518511, + "grad_norm": 0.03332754665414955, + "learning_rate": 0.0009809527412594837, + "loss": 0.6503, + "step": 7065 + }, + { + "epoch": 0.3589242426165425, + "grad_norm": 0.03149052779731926, + "learning_rate": 0.0009808921327946886, + "loss": 0.6563, + "step": 7070 + }, + { + "epoch": 0.359178078714574, + "grad_norm": 0.04408836704626032, + "learning_rate": 0.000980831429933166, + "loss": 0.6587, + "step": 7075 + }, + { + "epoch": 0.3594319148126055, + "grad_norm": 0.029092555752435204, + "learning_rate": 0.0009807706326868317, + "loss": 0.6508, + "step": 7080 + }, + { + "epoch": 0.359685750910637, + "grad_norm": 0.0310181753692322, + "learning_rate": 0.00098070974106762, + "loss": 0.6438, + "step": 7085 + }, + { + "epoch": 0.3599395870086685, + "grad_norm": 0.04993998965712036, + "learning_rate": 0.0009806487550874832, + "loss": 0.635, + "step": 7090 + }, + { + "epoch": 0.3601934231067, + "grad_norm": 0.04501979297683468, + "learning_rate": 0.0009805876747583928, + "loss": 0.6353, + "step": 7095 + }, + { + "epoch": 0.36044725920473153, + "grad_norm": 0.04933885261852858, + "learning_rate": 0.0009805265000923384, + "loss": 0.6906, + "step": 7100 + }, + { + "epoch": 0.360701095302763, + "grad_norm": 0.025031721543288105, + "learning_rate": 0.0009804652311013286, + "loss": 0.62, + "step": 7105 + }, + { + "epoch": 0.3609549314007945, + "grad_norm": 0.03681142313849968, + "learning_rate": 0.00098040386779739, + "loss": 0.6725, + "step": 7110 + }, + { + "epoch": 0.361208767498826, + "grad_norm": 0.02848773898790337, + "learning_rate": 0.0009803424101925678, + "loss": 0.6265, + "step": 7115 + }, + { + "epoch": 0.3614626035968575, + "grad_norm": 0.0629595006387732, + "learning_rate": 0.000980280858298926, + "loss": 0.6621, + "step": 7120 + }, + { + "epoch": 0.36171643969488904, + "grad_norm": 0.031817556084941155, + "learning_rate": 0.000980219212128547, + "loss": 0.6269, + "step": 7125 + }, + { + "epoch": 0.3619702757929205, + "grad_norm": 0.03482414094775657, + "learning_rate": 0.0009801574716935314, + "loss": 0.6279, + "step": 7130 + }, + { + "epoch": 0.362224111890952, + "grad_norm": 0.02962568942750712, + "learning_rate": 0.0009800956370059986, + "loss": 0.6858, + "step": 7135 + }, + { + "epoch": 0.36247794798898353, + "grad_norm": 0.0303541560939534, + "learning_rate": 0.0009800337080780866, + "loss": 0.6156, + "step": 7140 + }, + { + "epoch": 0.362731784087015, + "grad_norm": 0.033517980729787206, + "learning_rate": 0.0009799716849219515, + "loss": 0.6849, + "step": 7145 + }, + { + "epoch": 0.36298562018504654, + "grad_norm": 0.03930435599887139, + "learning_rate": 0.0009799095675497684, + "loss": 0.6508, + "step": 7150 + }, + { + "epoch": 0.363239456283078, + "grad_norm": 0.03507509033957748, + "learning_rate": 0.0009798473559737304, + "loss": 0.6221, + "step": 7155 + }, + { + "epoch": 0.3634932923811095, + "grad_norm": 0.0345726788751751, + "learning_rate": 0.0009797850502060495, + "loss": 0.6244, + "step": 7160 + }, + { + "epoch": 0.36374712847914104, + "grad_norm": 0.031654784953979505, + "learning_rate": 0.0009797226502589558, + "loss": 0.6224, + "step": 7165 + }, + { + "epoch": 0.3640009645771725, + "grad_norm": 0.03475111940127676, + "learning_rate": 0.0009796601561446983, + "loss": 0.604, + "step": 7170 + }, + { + "epoch": 0.364254800675204, + "grad_norm": 0.03356801085531049, + "learning_rate": 0.0009795975678755441, + "loss": 0.6165, + "step": 7175 + }, + { + "epoch": 0.36450863677323553, + "grad_norm": 0.030202402409465263, + "learning_rate": 0.0009795348854637793, + "loss": 0.6357, + "step": 7180 + }, + { + "epoch": 0.364762472871267, + "grad_norm": 0.042891378351094925, + "learning_rate": 0.0009794721089217077, + "loss": 0.6639, + "step": 7185 + }, + { + "epoch": 0.36501630896929854, + "grad_norm": 0.026781555972239687, + "learning_rate": 0.0009794092382616525, + "loss": 0.6193, + "step": 7190 + }, + { + "epoch": 0.36527014506733, + "grad_norm": 0.058555352910862106, + "learning_rate": 0.0009793462734959545, + "loss": 0.6684, + "step": 7195 + }, + { + "epoch": 0.3655239811653615, + "grad_norm": 0.04710681738488992, + "learning_rate": 0.0009792832146369734, + "loss": 0.6433, + "step": 7200 + }, + { + "epoch": 0.36577781726339303, + "grad_norm": 0.05005107749394292, + "learning_rate": 0.0009792200616970876, + "loss": 0.7054, + "step": 7205 + }, + { + "epoch": 0.3660316533614245, + "grad_norm": 0.03323686626122536, + "learning_rate": 0.0009791568146886936, + "loss": 0.6797, + "step": 7210 + }, + { + "epoch": 0.36628548945945605, + "grad_norm": 0.03185618160464647, + "learning_rate": 0.0009790934736242064, + "loss": 0.6476, + "step": 7215 + }, + { + "epoch": 0.3665393255574875, + "grad_norm": 0.03692950551425256, + "learning_rate": 0.0009790300385160594, + "loss": 0.6653, + "step": 7220 + }, + { + "epoch": 0.366793161655519, + "grad_norm": 0.03377555225120197, + "learning_rate": 0.0009789665093767048, + "loss": 0.6571, + "step": 7225 + }, + { + "epoch": 0.36704699775355054, + "grad_norm": 0.029454638958564077, + "learning_rate": 0.000978902886218613, + "loss": 0.6743, + "step": 7230 + }, + { + "epoch": 0.367300833851582, + "grad_norm": 0.05946350290356885, + "learning_rate": 0.000978839169054273, + "loss": 0.6702, + "step": 7235 + }, + { + "epoch": 0.36755466994961355, + "grad_norm": 0.05244370262839264, + "learning_rate": 0.0009787753578961922, + "loss": 0.6083, + "step": 7240 + }, + { + "epoch": 0.36780850604764503, + "grad_norm": 0.03264974043837502, + "learning_rate": 0.0009787114527568962, + "loss": 0.6166, + "step": 7245 + }, + { + "epoch": 0.3680623421456765, + "grad_norm": 0.03513830130194355, + "learning_rate": 0.0009786474536489292, + "loss": 0.6634, + "step": 7250 + }, + { + "epoch": 0.36831617824370805, + "grad_norm": 0.029479205631305596, + "learning_rate": 0.0009785833605848542, + "loss": 0.636, + "step": 7255 + }, + { + "epoch": 0.3685700143417395, + "grad_norm": 0.032292807860328114, + "learning_rate": 0.0009785191735772521, + "loss": 0.6858, + "step": 7260 + }, + { + "epoch": 0.36882385043977106, + "grad_norm": 0.04406918705243669, + "learning_rate": 0.0009784548926387226, + "loss": 0.6267, + "step": 7265 + }, + { + "epoch": 0.36907768653780254, + "grad_norm": 0.03484394221767657, + "learning_rate": 0.000978390517781884, + "loss": 0.6394, + "step": 7270 + }, + { + "epoch": 0.369331522635834, + "grad_norm": 0.03455567149269723, + "learning_rate": 0.0009783260490193722, + "loss": 0.657, + "step": 7275 + }, + { + "epoch": 0.36958535873386555, + "grad_norm": 0.0553757152489157, + "learning_rate": 0.0009782614863638424, + "loss": 0.6519, + "step": 7280 + }, + { + "epoch": 0.36983919483189703, + "grad_norm": 0.03087746692822628, + "learning_rate": 0.000978196829827968, + "loss": 0.6106, + "step": 7285 + }, + { + "epoch": 0.37009303092992857, + "grad_norm": 0.028942872631375818, + "learning_rate": 0.0009781320794244408, + "loss": 0.689, + "step": 7290 + }, + { + "epoch": 0.37034686702796005, + "grad_norm": 0.05560542668705997, + "learning_rate": 0.0009780672351659707, + "loss": 0.6009, + "step": 7295 + }, + { + "epoch": 0.3706007031259915, + "grad_norm": 0.03597874219905048, + "learning_rate": 0.0009780022970652864, + "loss": 0.675, + "step": 7300 + }, + { + "epoch": 0.37085453922402306, + "grad_norm": 0.041750636143700906, + "learning_rate": 0.000977937265135135, + "loss": 0.6347, + "step": 7305 + }, + { + "epoch": 0.37110837532205454, + "grad_norm": 0.03053212587846297, + "learning_rate": 0.000977872139388282, + "loss": 0.6547, + "step": 7310 + }, + { + "epoch": 0.3713622114200861, + "grad_norm": 0.04623962660097538, + "learning_rate": 0.0009778069198375112, + "loss": 0.6296, + "step": 7315 + }, + { + "epoch": 0.37161604751811755, + "grad_norm": 0.053173942447247664, + "learning_rate": 0.0009777416064956248, + "loss": 0.6557, + "step": 7320 + }, + { + "epoch": 0.37186988361614903, + "grad_norm": 0.04322723325958535, + "learning_rate": 0.0009776761993754435, + "loss": 0.6506, + "step": 7325 + }, + { + "epoch": 0.37212371971418057, + "grad_norm": 0.054474373793746526, + "learning_rate": 0.0009776106984898066, + "loss": 0.6337, + "step": 7330 + }, + { + "epoch": 0.37237755581221205, + "grad_norm": 0.07566751344663698, + "learning_rate": 0.0009775451038515712, + "loss": 0.5778, + "step": 7335 + }, + { + "epoch": 0.3726313919102436, + "grad_norm": 0.05812108518347267, + "learning_rate": 0.0009774794154736135, + "loss": 0.6431, + "step": 7340 + }, + { + "epoch": 0.37288522800827506, + "grad_norm": 0.0569408117979321, + "learning_rate": 0.0009774136333688278, + "loss": 0.6382, + "step": 7345 + }, + { + "epoch": 0.37313906410630654, + "grad_norm": 0.04610218065404068, + "learning_rate": 0.0009773477575501265, + "loss": 0.6282, + "step": 7350 + }, + { + "epoch": 0.3733929002043381, + "grad_norm": 0.02735821778929375, + "learning_rate": 0.0009772817880304412, + "loss": 0.639, + "step": 7355 + }, + { + "epoch": 0.37364673630236955, + "grad_norm": 0.02996563286462766, + "learning_rate": 0.0009772157248227212, + "loss": 0.643, + "step": 7360 + }, + { + "epoch": 0.3739005724004011, + "grad_norm": 0.03578755871655436, + "learning_rate": 0.000977149567939934, + "loss": 0.6244, + "step": 7365 + }, + { + "epoch": 0.37415440849843257, + "grad_norm": 0.03305618805759151, + "learning_rate": 0.0009770833173950663, + "loss": 0.6467, + "step": 7370 + }, + { + "epoch": 0.37440824459646405, + "grad_norm": 0.030303389029491416, + "learning_rate": 0.0009770169732011224, + "loss": 0.6439, + "step": 7375 + }, + { + "epoch": 0.3746620806944956, + "grad_norm": 0.028992948847689715, + "learning_rate": 0.000976950535371126, + "loss": 0.6306, + "step": 7380 + }, + { + "epoch": 0.37491591679252706, + "grad_norm": 0.03888889527619476, + "learning_rate": 0.0009768840039181177, + "loss": 0.6581, + "step": 7385 + }, + { + "epoch": 0.37516975289055854, + "grad_norm": 0.031228572216811263, + "learning_rate": 0.0009768173788551576, + "loss": 0.6081, + "step": 7390 + }, + { + "epoch": 0.3754235889885901, + "grad_norm": 0.03181486445785271, + "learning_rate": 0.000976750660195324, + "loss": 0.6339, + "step": 7395 + }, + { + "epoch": 0.37567742508662155, + "grad_norm": 0.031669584990155854, + "learning_rate": 0.0009766838479517133, + "loss": 0.626, + "step": 7400 + }, + { + "epoch": 0.3759312611846531, + "grad_norm": 0.03222078035458589, + "learning_rate": 0.0009766169421374406, + "loss": 0.6635, + "step": 7405 + }, + { + "epoch": 0.37618509728268457, + "grad_norm": 0.02957011725240359, + "learning_rate": 0.000976549942765639, + "loss": 0.6494, + "step": 7410 + }, + { + "epoch": 0.37643893338071605, + "grad_norm": 0.03590678349168216, + "learning_rate": 0.0009764828498494602, + "loss": 0.6354, + "step": 7415 + }, + { + "epoch": 0.3766927694787476, + "grad_norm": 0.03220030735871836, + "learning_rate": 0.0009764156634020742, + "loss": 0.6464, + "step": 7420 + }, + { + "epoch": 0.37694660557677906, + "grad_norm": 0.03607797476983635, + "learning_rate": 0.0009763483834366693, + "loss": 0.6277, + "step": 7425 + }, + { + "epoch": 0.3772004416748106, + "grad_norm": 0.11467843343234524, + "learning_rate": 0.0009762810099664523, + "loss": 0.6575, + "step": 7430 + }, + { + "epoch": 0.3774542777728421, + "grad_norm": 0.03779371870962273, + "learning_rate": 0.0009762135430046483, + "loss": 0.6564, + "step": 7435 + }, + { + "epoch": 0.37770811387087355, + "grad_norm": 0.03333634417865055, + "learning_rate": 0.0009761459825645006, + "loss": 0.6516, + "step": 7440 + }, + { + "epoch": 0.3779619499689051, + "grad_norm": 0.03352151545779738, + "learning_rate": 0.0009760783286592711, + "loss": 0.628, + "step": 7445 + }, + { + "epoch": 0.37821578606693657, + "grad_norm": 0.031390578882334645, + "learning_rate": 0.0009760105813022399, + "loss": 0.6631, + "step": 7450 + }, + { + "epoch": 0.3784696221649681, + "grad_norm": 0.03126775917347245, + "learning_rate": 0.0009759427405067054, + "loss": 0.6296, + "step": 7455 + }, + { + "epoch": 0.3787234582629996, + "grad_norm": 0.03189164977628853, + "learning_rate": 0.0009758748062859844, + "loss": 0.614, + "step": 7460 + }, + { + "epoch": 0.37897729436103106, + "grad_norm": 0.03924473622722979, + "learning_rate": 0.0009758067786534123, + "loss": 0.648, + "step": 7465 + }, + { + "epoch": 0.3792311304590626, + "grad_norm": 0.030124031962455097, + "learning_rate": 0.0009757386576223423, + "loss": 0.6831, + "step": 7470 + }, + { + "epoch": 0.37948496655709407, + "grad_norm": 0.026069578510561218, + "learning_rate": 0.0009756704432061463, + "loss": 0.6449, + "step": 7475 + }, + { + "epoch": 0.3797388026551256, + "grad_norm": 0.04496395917283666, + "learning_rate": 0.0009756021354182145, + "loss": 0.6236, + "step": 7480 + }, + { + "epoch": 0.3799926387531571, + "grad_norm": 0.03492290304715105, + "learning_rate": 0.0009755337342719552, + "loss": 0.6452, + "step": 7485 + }, + { + "epoch": 0.38024647485118857, + "grad_norm": 0.03119933363654312, + "learning_rate": 0.0009754652397807955, + "loss": 0.697, + "step": 7490 + }, + { + "epoch": 0.3805003109492201, + "grad_norm": 0.025025130639123577, + "learning_rate": 0.0009753966519581803, + "loss": 0.5946, + "step": 7495 + }, + { + "epoch": 0.3807541470472516, + "grad_norm": 0.028756961098509868, + "learning_rate": 0.0009753279708175731, + "loss": 0.6274, + "step": 7500 + }, + { + "epoch": 0.3810079831452831, + "grad_norm": 0.04267829613214311, + "learning_rate": 0.0009752591963724558, + "loss": 0.6514, + "step": 7505 + }, + { + "epoch": 0.3812618192433146, + "grad_norm": 0.027804227185939587, + "learning_rate": 0.0009751903286363283, + "loss": 0.6459, + "step": 7510 + }, + { + "epoch": 0.38151565534134607, + "grad_norm": 0.0368399353284197, + "learning_rate": 0.0009751213676227091, + "loss": 0.6143, + "step": 7515 + }, + { + "epoch": 0.3817694914393776, + "grad_norm": 0.03566837281270354, + "learning_rate": 0.0009750523133451348, + "loss": 0.6591, + "step": 7520 + }, + { + "epoch": 0.3820233275374091, + "grad_norm": 0.04104675754438289, + "learning_rate": 0.0009749831658171605, + "loss": 0.6285, + "step": 7525 + }, + { + "epoch": 0.3822771636354406, + "grad_norm": 0.03463924815280617, + "learning_rate": 0.0009749139250523596, + "loss": 0.6361, + "step": 7530 + }, + { + "epoch": 0.3825309997334721, + "grad_norm": 0.0371233837488764, + "learning_rate": 0.0009748445910643233, + "loss": 0.6529, + "step": 7535 + }, + { + "epoch": 0.3827848358315036, + "grad_norm": 0.030735573426834636, + "learning_rate": 0.000974775163866662, + "loss": 0.6463, + "step": 7540 + }, + { + "epoch": 0.3830386719295351, + "grad_norm": 0.028788082474888697, + "learning_rate": 0.0009747056434730037, + "loss": 0.7047, + "step": 7545 + }, + { + "epoch": 0.3832925080275666, + "grad_norm": 0.34285998838897536, + "learning_rate": 0.0009746360298969951, + "loss": 0.6438, + "step": 7550 + }, + { + "epoch": 0.3835463441255981, + "grad_norm": 0.04150800275275346, + "learning_rate": 0.0009745663231523008, + "loss": 0.6424, + "step": 7555 + }, + { + "epoch": 0.3838001802236296, + "grad_norm": 0.08071417524310523, + "learning_rate": 0.0009744965232526037, + "loss": 0.6581, + "step": 7560 + }, + { + "epoch": 0.3840540163216611, + "grad_norm": 0.03292538278471775, + "learning_rate": 0.0009744266302116056, + "loss": 0.6429, + "step": 7565 + }, + { + "epoch": 0.3843078524196926, + "grad_norm": 0.0553091431056109, + "learning_rate": 0.0009743566440430258, + "loss": 0.6247, + "step": 7570 + }, + { + "epoch": 0.3845616885177241, + "grad_norm": 0.0576869905147013, + "learning_rate": 0.0009742865647606025, + "loss": 0.6362, + "step": 7575 + }, + { + "epoch": 0.38481552461575563, + "grad_norm": 0.054333812412449035, + "learning_rate": 0.0009742163923780918, + "loss": 0.6644, + "step": 7580 + }, + { + "epoch": 0.3850693607137871, + "grad_norm": 0.0458876312027763, + "learning_rate": 0.0009741461269092682, + "loss": 0.6174, + "step": 7585 + }, + { + "epoch": 0.3853231968118186, + "grad_norm": 0.026346604004534237, + "learning_rate": 0.0009740757683679244, + "loss": 0.6396, + "step": 7590 + }, + { + "epoch": 0.3855770329098501, + "grad_norm": 0.026408511013485066, + "learning_rate": 0.0009740053167678715, + "loss": 0.641, + "step": 7595 + }, + { + "epoch": 0.3858308690078816, + "grad_norm": 0.027408625330890064, + "learning_rate": 0.0009739347721229388, + "loss": 0.6739, + "step": 7600 + }, + { + "epoch": 0.38608470510591314, + "grad_norm": 0.0261075942022835, + "learning_rate": 0.0009738641344469737, + "loss": 0.6311, + "step": 7605 + }, + { + "epoch": 0.3863385412039446, + "grad_norm": 0.035196730638145504, + "learning_rate": 0.0009737934037538422, + "loss": 0.6283, + "step": 7610 + }, + { + "epoch": 0.3865923773019761, + "grad_norm": 0.029985199254207513, + "learning_rate": 0.0009737225800574285, + "loss": 0.5949, + "step": 7615 + }, + { + "epoch": 0.38684621340000763, + "grad_norm": 0.0314143972278088, + "learning_rate": 0.0009736516633716348, + "loss": 0.6309, + "step": 7620 + }, + { + "epoch": 0.3871000494980391, + "grad_norm": 0.040375384651748156, + "learning_rate": 0.0009735806537103815, + "loss": 0.6346, + "step": 7625 + }, + { + "epoch": 0.3873538855960706, + "grad_norm": 0.030875166168935275, + "learning_rate": 0.0009735095510876077, + "loss": 0.6188, + "step": 7630 + }, + { + "epoch": 0.3876077216941021, + "grad_norm": 0.035657526104679045, + "learning_rate": 0.0009734383555172705, + "loss": 0.6334, + "step": 7635 + }, + { + "epoch": 0.3878615577921336, + "grad_norm": 0.03726651149611401, + "learning_rate": 0.000973367067013345, + "loss": 0.6194, + "step": 7640 + }, + { + "epoch": 0.38811539389016514, + "grad_norm": 0.0549306418678483, + "learning_rate": 0.000973295685589825, + "loss": 0.6289, + "step": 7645 + }, + { + "epoch": 0.3883692299881966, + "grad_norm": 0.029283307487707237, + "learning_rate": 0.0009732242112607222, + "loss": 0.6263, + "step": 7650 + }, + { + "epoch": 0.3886230660862281, + "grad_norm": 0.03836427004065547, + "learning_rate": 0.0009731526440400667, + "loss": 0.671, + "step": 7655 + }, + { + "epoch": 0.38887690218425963, + "grad_norm": 0.03320166265686439, + "learning_rate": 0.0009730809839419069, + "loss": 0.679, + "step": 7660 + }, + { + "epoch": 0.3891307382822911, + "grad_norm": 0.03903157209354159, + "learning_rate": 0.0009730092309803091, + "loss": 0.6721, + "step": 7665 + }, + { + "epoch": 0.38938457438032265, + "grad_norm": 0.02937314090643266, + "learning_rate": 0.0009729373851693581, + "loss": 0.619, + "step": 7670 + }, + { + "epoch": 0.3896384104783541, + "grad_norm": 0.0567425226530842, + "learning_rate": 0.000972865446523157, + "loss": 0.6287, + "step": 7675 + }, + { + "epoch": 0.3898922465763856, + "grad_norm": 0.03706531779273381, + "learning_rate": 0.000972793415055827, + "loss": 0.6683, + "step": 7680 + }, + { + "epoch": 0.39014608267441714, + "grad_norm": 0.034876406305835016, + "learning_rate": 0.0009727212907815072, + "loss": 0.6199, + "step": 7685 + }, + { + "epoch": 0.3903999187724486, + "grad_norm": 0.04177776494427368, + "learning_rate": 0.0009726490737143557, + "loss": 0.6593, + "step": 7690 + }, + { + "epoch": 0.39065375487048015, + "grad_norm": 0.029493444763321025, + "learning_rate": 0.0009725767638685481, + "loss": 0.6818, + "step": 7695 + }, + { + "epoch": 0.39090759096851163, + "grad_norm": 0.05797901330208343, + "learning_rate": 0.0009725043612582785, + "loss": 0.6558, + "step": 7700 + }, + { + "epoch": 0.3911614270665431, + "grad_norm": 0.03060632068082371, + "learning_rate": 0.0009724318658977591, + "loss": 0.6483, + "step": 7705 + }, + { + "epoch": 0.39141526316457464, + "grad_norm": 0.029533842342163073, + "learning_rate": 0.0009723592778012205, + "loss": 0.5882, + "step": 7710 + }, + { + "epoch": 0.3916690992626061, + "grad_norm": 0.04016752549164152, + "learning_rate": 0.0009722865969829111, + "loss": 0.6289, + "step": 7715 + }, + { + "epoch": 0.39192293536063766, + "grad_norm": 0.02740885040789266, + "learning_rate": 0.0009722138234570983, + "loss": 0.6119, + "step": 7720 + }, + { + "epoch": 0.39217677145866914, + "grad_norm": 0.027711495610382927, + "learning_rate": 0.0009721409572380666, + "loss": 0.6439, + "step": 7725 + }, + { + "epoch": 0.3924306075567006, + "grad_norm": 0.03504746630633313, + "learning_rate": 0.0009720679983401197, + "loss": 0.6287, + "step": 7730 + }, + { + "epoch": 0.39268444365473215, + "grad_norm": 0.030714782507982694, + "learning_rate": 0.0009719949467775791, + "loss": 0.6305, + "step": 7735 + }, + { + "epoch": 0.39293827975276363, + "grad_norm": 0.05519161310942375, + "learning_rate": 0.000971921802564784, + "loss": 0.6733, + "step": 7740 + }, + { + "epoch": 0.39319211585079517, + "grad_norm": 0.035905400393446016, + "learning_rate": 0.0009718485657160927, + "loss": 0.6309, + "step": 7745 + }, + { + "epoch": 0.39344595194882664, + "grad_norm": 0.05482351027401504, + "learning_rate": 0.000971775236245881, + "loss": 0.6257, + "step": 7750 + }, + { + "epoch": 0.3936997880468581, + "grad_norm": 0.033754877354996, + "learning_rate": 0.0009717018141685432, + "loss": 0.6465, + "step": 7755 + }, + { + "epoch": 0.39395362414488966, + "grad_norm": 0.03394880260163253, + "learning_rate": 0.0009716282994984915, + "loss": 0.6321, + "step": 7760 + }, + { + "epoch": 0.39420746024292114, + "grad_norm": 0.06103387015592127, + "learning_rate": 0.0009715546922501568, + "loss": 0.5975, + "step": 7765 + }, + { + "epoch": 0.39446129634095267, + "grad_norm": 0.05594462999211319, + "learning_rate": 0.0009714809924379875, + "loss": 0.6332, + "step": 7770 + }, + { + "epoch": 0.39471513243898415, + "grad_norm": 0.028505003978790574, + "learning_rate": 0.0009714072000764508, + "loss": 0.6577, + "step": 7775 + }, + { + "epoch": 0.39496896853701563, + "grad_norm": 0.028286990335348833, + "learning_rate": 0.0009713333151800315, + "loss": 0.6279, + "step": 7780 + }, + { + "epoch": 0.39522280463504716, + "grad_norm": 0.030106626846128356, + "learning_rate": 0.0009712593377632331, + "loss": 0.6379, + "step": 7785 + }, + { + "epoch": 0.39547664073307864, + "grad_norm": 0.03861700452164129, + "learning_rate": 0.0009711852678405768, + "loss": 0.6545, + "step": 7790 + }, + { + "epoch": 0.3957304768311102, + "grad_norm": 0.02838597618173486, + "learning_rate": 0.0009711111054266022, + "loss": 0.6671, + "step": 7795 + }, + { + "epoch": 0.39598431292914166, + "grad_norm": 0.034442366907157136, + "learning_rate": 0.000971036850535867, + "loss": 0.7508, + "step": 7800 + }, + { + "epoch": 0.39623814902717314, + "grad_norm": 0.11425516985484442, + "learning_rate": 0.0009709625031829473, + "loss": 0.66, + "step": 7805 + }, + { + "epoch": 0.39649198512520467, + "grad_norm": 0.04883647009890089, + "learning_rate": 0.0009708880633824366, + "loss": 0.644, + "step": 7810 + }, + { + "epoch": 0.39674582122323615, + "grad_norm": 0.039289664485130486, + "learning_rate": 0.0009708135311489475, + "loss": 0.6813, + "step": 7815 + }, + { + "epoch": 0.3969996573212677, + "grad_norm": 0.0375152643384516, + "learning_rate": 0.0009707389064971102, + "loss": 0.6693, + "step": 7820 + }, + { + "epoch": 0.39725349341929916, + "grad_norm": 0.03153109087788052, + "learning_rate": 0.0009706641894415731, + "loss": 0.6824, + "step": 7825 + }, + { + "epoch": 0.39750732951733064, + "grad_norm": 0.031082795917225698, + "learning_rate": 0.0009705893799970029, + "loss": 0.6108, + "step": 7830 + }, + { + "epoch": 0.3977611656153622, + "grad_norm": 0.04178387644138036, + "learning_rate": 0.0009705144781780842, + "loss": 0.6963, + "step": 7835 + }, + { + "epoch": 0.39801500171339366, + "grad_norm": 0.06903963849411496, + "learning_rate": 0.0009704394839995198, + "loss": 0.6825, + "step": 7840 + }, + { + "epoch": 0.39826883781142514, + "grad_norm": 0.03484218684833334, + "learning_rate": 0.0009703643974760307, + "loss": 0.6589, + "step": 7845 + }, + { + "epoch": 0.39852267390945667, + "grad_norm": 0.045063431620059595, + "learning_rate": 0.0009702892186223564, + "loss": 0.5665, + "step": 7850 + }, + { + "epoch": 0.39877651000748815, + "grad_norm": 0.029449955491035615, + "learning_rate": 0.0009702139474532536, + "loss": 0.6465, + "step": 7855 + }, + { + "epoch": 0.3990303461055197, + "grad_norm": 0.03916288270704511, + "learning_rate": 0.0009701385839834979, + "loss": 0.6582, + "step": 7860 + }, + { + "epoch": 0.39928418220355116, + "grad_norm": 0.04166533905758244, + "learning_rate": 0.0009700631282278827, + "loss": 0.6625, + "step": 7865 + }, + { + "epoch": 0.39953801830158264, + "grad_norm": 0.024806646516510142, + "learning_rate": 0.0009699875802012197, + "loss": 0.6456, + "step": 7870 + }, + { + "epoch": 0.3997918543996142, + "grad_norm": 0.0349710848699053, + "learning_rate": 0.0009699119399183385, + "loss": 0.6463, + "step": 7875 + }, + { + "epoch": 0.40004569049764566, + "grad_norm": 0.031334987561882265, + "learning_rate": 0.0009698362073940869, + "loss": 0.6219, + "step": 7880 + }, + { + "epoch": 0.4002995265956772, + "grad_norm": 0.031472418909322404, + "learning_rate": 0.0009697603826433308, + "loss": 0.6228, + "step": 7885 + }, + { + "epoch": 0.40055336269370867, + "grad_norm": 0.037854618230179916, + "learning_rate": 0.0009696844656809545, + "loss": 0.6558, + "step": 7890 + }, + { + "epoch": 0.40080719879174015, + "grad_norm": 0.11265157750996117, + "learning_rate": 0.0009696084565218597, + "loss": 0.6325, + "step": 7895 + }, + { + "epoch": 0.4010610348897717, + "grad_norm": 0.031122545622557275, + "learning_rate": 0.0009695323551809669, + "loss": 0.6229, + "step": 7900 + }, + { + "epoch": 0.40131487098780316, + "grad_norm": 0.04038263274235042, + "learning_rate": 0.0009694561616732143, + "loss": 0.6507, + "step": 7905 + }, + { + "epoch": 0.4015687070858347, + "grad_norm": 0.029612453037692527, + "learning_rate": 0.0009693798760135584, + "loss": 0.6059, + "step": 7910 + }, + { + "epoch": 0.4018225431838662, + "grad_norm": 0.032308274222676875, + "learning_rate": 0.0009693034982169735, + "loss": 0.6363, + "step": 7915 + }, + { + "epoch": 0.40207637928189766, + "grad_norm": 0.05317177280278271, + "learning_rate": 0.0009692270282984525, + "loss": 0.6503, + "step": 7920 + }, + { + "epoch": 0.4023302153799292, + "grad_norm": 0.04291373669655286, + "learning_rate": 0.0009691504662730058, + "loss": 0.6311, + "step": 7925 + }, + { + "epoch": 0.40258405147796067, + "grad_norm": 0.03662642484263608, + "learning_rate": 0.0009690738121556621, + "loss": 0.6753, + "step": 7930 + }, + { + "epoch": 0.4028378875759922, + "grad_norm": 0.035393766402028314, + "learning_rate": 0.0009689970659614684, + "loss": 0.6479, + "step": 7935 + }, + { + "epoch": 0.4030917236740237, + "grad_norm": 0.03128196520109021, + "learning_rate": 0.0009689202277054896, + "loss": 0.671, + "step": 7940 + }, + { + "epoch": 0.40334555977205516, + "grad_norm": 0.03677958165657406, + "learning_rate": 0.0009688432974028085, + "loss": 0.6932, + "step": 7945 + }, + { + "epoch": 0.4035993958700867, + "grad_norm": 0.02727568027543021, + "learning_rate": 0.0009687662750685265, + "loss": 0.6504, + "step": 7950 + }, + { + "epoch": 0.4038532319681182, + "grad_norm": 0.028131562831269996, + "learning_rate": 0.0009686891607177621, + "loss": 0.6585, + "step": 7955 + }, + { + "epoch": 0.4041070680661497, + "grad_norm": 0.027514481928846146, + "learning_rate": 0.0009686119543656531, + "loss": 0.6182, + "step": 7960 + }, + { + "epoch": 0.4043609041641812, + "grad_norm": 0.07056505685894615, + "learning_rate": 0.0009685346560273542, + "loss": 0.636, + "step": 7965 + }, + { + "epoch": 0.40461474026221267, + "grad_norm": 0.04403662030554125, + "learning_rate": 0.000968457265718039, + "loss": 0.6695, + "step": 7970 + }, + { + "epoch": 0.4048685763602442, + "grad_norm": 0.027718390866363754, + "learning_rate": 0.0009683797834528987, + "loss": 0.6283, + "step": 7975 + }, + { + "epoch": 0.4051224124582757, + "grad_norm": 0.040775149461867354, + "learning_rate": 0.0009683022092471427, + "loss": 0.6609, + "step": 7980 + }, + { + "epoch": 0.4053762485563072, + "grad_norm": 0.03695097130504616, + "learning_rate": 0.0009682245431159984, + "loss": 0.6104, + "step": 7985 + }, + { + "epoch": 0.4056300846543387, + "grad_norm": 0.046398150204463785, + "learning_rate": 0.0009681467850747114, + "loss": 0.6298, + "step": 7990 + }, + { + "epoch": 0.4058839207523702, + "grad_norm": 0.04641307748919207, + "learning_rate": 0.0009680689351385453, + "loss": 0.6244, + "step": 7995 + }, + { + "epoch": 0.4061377568504017, + "grad_norm": 0.03193976715382944, + "learning_rate": 0.0009679909933227811, + "loss": 0.6378, + "step": 8000 + }, + { + "epoch": 0.4063915929484332, + "grad_norm": 0.028276413703941978, + "learning_rate": 0.0009679129596427189, + "loss": 0.6194, + "step": 8005 + }, + { + "epoch": 0.4066454290464647, + "grad_norm": 0.02733137371225709, + "learning_rate": 0.0009678348341136764, + "loss": 0.6586, + "step": 8010 + }, + { + "epoch": 0.4068992651444962, + "grad_norm": 0.029814450966989852, + "learning_rate": 0.000967756616750989, + "loss": 0.6222, + "step": 8015 + }, + { + "epoch": 0.4071531012425277, + "grad_norm": 0.03102914660521326, + "learning_rate": 0.0009676783075700103, + "loss": 0.5845, + "step": 8020 + }, + { + "epoch": 0.4074069373405592, + "grad_norm": 0.031168821244497057, + "learning_rate": 0.0009675999065861121, + "loss": 0.6355, + "step": 8025 + }, + { + "epoch": 0.4076607734385907, + "grad_norm": 0.05486797052015813, + "learning_rate": 0.0009675214138146844, + "loss": 0.6322, + "step": 8030 + }, + { + "epoch": 0.40791460953662223, + "grad_norm": 0.04707616172534586, + "learning_rate": 0.0009674428292711346, + "loss": 0.6169, + "step": 8035 + }, + { + "epoch": 0.4081684456346537, + "grad_norm": 0.02787363667264948, + "learning_rate": 0.0009673641529708884, + "loss": 0.6481, + "step": 8040 + }, + { + "epoch": 0.4084222817326852, + "grad_norm": 0.027658440071354547, + "learning_rate": 0.0009672853849293899, + "loss": 0.6315, + "step": 8045 + }, + { + "epoch": 0.4086761178307167, + "grad_norm": 0.04557845624399485, + "learning_rate": 0.0009672065251621005, + "loss": 0.6724, + "step": 8050 + }, + { + "epoch": 0.4089299539287482, + "grad_norm": 0.028701244900625106, + "learning_rate": 0.0009671275736845002, + "loss": 0.6497, + "step": 8055 + }, + { + "epoch": 0.4091837900267797, + "grad_norm": 0.027420431774163943, + "learning_rate": 0.0009670485305120868, + "loss": 0.617, + "step": 8060 + }, + { + "epoch": 0.4094376261248112, + "grad_norm": 0.04387568253300268, + "learning_rate": 0.0009669693956603761, + "loss": 0.6354, + "step": 8065 + }, + { + "epoch": 0.4096914622228427, + "grad_norm": 0.0356837346671153, + "learning_rate": 0.0009668901691449017, + "loss": 0.6622, + "step": 8070 + }, + { + "epoch": 0.40994529832087423, + "grad_norm": 0.0663989720981148, + "learning_rate": 0.0009668108509812155, + "loss": 0.6492, + "step": 8075 + }, + { + "epoch": 0.4101991344189057, + "grad_norm": 0.06052011549737725, + "learning_rate": 0.0009667314411848873, + "loss": 0.5987, + "step": 8080 + }, + { + "epoch": 0.4104529705169372, + "grad_norm": 0.036195040168482975, + "learning_rate": 0.0009666519397715048, + "loss": 0.654, + "step": 8085 + }, + { + "epoch": 0.4107068066149687, + "grad_norm": 0.040727444865344034, + "learning_rate": 0.0009665723467566736, + "loss": 0.6036, + "step": 8090 + }, + { + "epoch": 0.4109606427130002, + "grad_norm": 0.1897729919545745, + "learning_rate": 0.0009664926621560175, + "loss": 0.6395, + "step": 8095 + }, + { + "epoch": 0.41121447881103174, + "grad_norm": 0.0569334892002028, + "learning_rate": 0.0009664128859851784, + "loss": 0.6519, + "step": 8100 + }, + { + "epoch": 0.4114683149090632, + "grad_norm": 0.06785163170844886, + "learning_rate": 0.0009663330182598155, + "loss": 0.6455, + "step": 8105 + }, + { + "epoch": 0.4117221510070947, + "grad_norm": 0.040654887306683485, + "learning_rate": 0.0009662530589956069, + "loss": 0.6278, + "step": 8110 + }, + { + "epoch": 0.41197598710512623, + "grad_norm": 0.03066395095140888, + "learning_rate": 0.0009661730082082481, + "loss": 0.5972, + "step": 8115 + }, + { + "epoch": 0.4122298232031577, + "grad_norm": 0.0319600496844395, + "learning_rate": 0.0009660928659134525, + "loss": 0.6551, + "step": 8120 + }, + { + "epoch": 0.41248365930118924, + "grad_norm": 0.03366629328090797, + "learning_rate": 0.0009660126321269516, + "loss": 0.6494, + "step": 8125 + }, + { + "epoch": 0.4127374953992207, + "grad_norm": 0.045751657644504846, + "learning_rate": 0.0009659323068644952, + "loss": 0.6166, + "step": 8130 + }, + { + "epoch": 0.4129913314972522, + "grad_norm": 0.03206675763665257, + "learning_rate": 0.0009658518901418505, + "loss": 0.6429, + "step": 8135 + }, + { + "epoch": 0.41324516759528374, + "grad_norm": 0.026550019766652787, + "learning_rate": 0.0009657713819748028, + "loss": 0.6498, + "step": 8140 + }, + { + "epoch": 0.4134990036933152, + "grad_norm": 0.045396032327730144, + "learning_rate": 0.0009656907823791559, + "loss": 0.6428, + "step": 8145 + }, + { + "epoch": 0.41375283979134675, + "grad_norm": 0.036710167826551975, + "learning_rate": 0.0009656100913707306, + "loss": 0.6255, + "step": 8150 + }, + { + "epoch": 0.41400667588937823, + "grad_norm": 0.031135118412712215, + "learning_rate": 0.0009655293089653665, + "loss": 0.6103, + "step": 8155 + }, + { + "epoch": 0.4142605119874097, + "grad_norm": 0.03347596053431247, + "learning_rate": 0.0009654484351789206, + "loss": 0.627, + "step": 8160 + }, + { + "epoch": 0.41451434808544124, + "grad_norm": 0.030646522747677672, + "learning_rate": 0.000965367470027268, + "loss": 0.6277, + "step": 8165 + }, + { + "epoch": 0.4147681841834727, + "grad_norm": 0.05286996241213806, + "learning_rate": 0.0009652864135263018, + "loss": 0.633, + "step": 8170 + }, + { + "epoch": 0.41502202028150426, + "grad_norm": 0.028068076907754813, + "learning_rate": 0.0009652052656919331, + "loss": 0.6432, + "step": 8175 + }, + { + "epoch": 0.41527585637953573, + "grad_norm": 0.03619239399249029, + "learning_rate": 0.0009651240265400907, + "loss": 0.6117, + "step": 8180 + }, + { + "epoch": 0.4155296924775672, + "grad_norm": 0.03319322210171537, + "learning_rate": 0.0009650426960867215, + "loss": 0.6499, + "step": 8185 + }, + { + "epoch": 0.41578352857559875, + "grad_norm": 0.04854675850168715, + "learning_rate": 0.00096496127434779, + "loss": 0.6564, + "step": 8190 + }, + { + "epoch": 0.41603736467363023, + "grad_norm": 0.029353149898544276, + "learning_rate": 0.0009648797613392794, + "loss": 0.6103, + "step": 8195 + }, + { + "epoch": 0.41629120077166176, + "grad_norm": 0.027116416725451065, + "learning_rate": 0.0009647981570771898, + "loss": 0.6166, + "step": 8200 + }, + { + "epoch": 0.41654503686969324, + "grad_norm": 0.05618152750846531, + "learning_rate": 0.00096471646157754, + "loss": 0.6162, + "step": 8205 + }, + { + "epoch": 0.4167988729677247, + "grad_norm": 0.02729960033208377, + "learning_rate": 0.0009646346748563663, + "loss": 0.6004, + "step": 8210 + }, + { + "epoch": 0.41705270906575626, + "grad_norm": 0.08612473698169335, + "learning_rate": 0.0009645527969297231, + "loss": 0.6483, + "step": 8215 + }, + { + "epoch": 0.41730654516378773, + "grad_norm": 0.029936015816733892, + "learning_rate": 0.0009644708278136826, + "loss": 0.6317, + "step": 8220 + }, + { + "epoch": 0.41756038126181927, + "grad_norm": 0.0726201142485528, + "learning_rate": 0.0009643887675243348, + "loss": 0.6244, + "step": 8225 + }, + { + "epoch": 0.41781421735985075, + "grad_norm": 0.057917723581260475, + "learning_rate": 0.0009643066160777879, + "loss": 0.647, + "step": 8230 + }, + { + "epoch": 0.4180680534578822, + "grad_norm": 0.13072452439749868, + "learning_rate": 0.0009642243734901678, + "loss": 0.6838, + "step": 8235 + }, + { + "epoch": 0.41832188955591376, + "grad_norm": 0.05469152409284166, + "learning_rate": 0.0009641420397776181, + "loss": 0.6624, + "step": 8240 + }, + { + "epoch": 0.41857572565394524, + "grad_norm": 0.033211019095030915, + "learning_rate": 0.0009640596149563008, + "loss": 0.6251, + "step": 8245 + }, + { + "epoch": 0.4188295617519768, + "grad_norm": 0.039537117087901134, + "learning_rate": 0.0009639770990423954, + "loss": 0.6764, + "step": 8250 + }, + { + "epoch": 0.41908339785000825, + "grad_norm": 0.043759885870117206, + "learning_rate": 0.0009638944920520992, + "loss": 0.6291, + "step": 8255 + }, + { + "epoch": 0.41933723394803973, + "grad_norm": 0.06431426896668371, + "learning_rate": 0.0009638117940016278, + "loss": 0.6548, + "step": 8260 + }, + { + "epoch": 0.41959107004607127, + "grad_norm": 0.027159981833348182, + "learning_rate": 0.000963729004907214, + "loss": 0.6055, + "step": 8265 + }, + { + "epoch": 0.41984490614410275, + "grad_norm": 0.10032739292221377, + "learning_rate": 0.0009636461247851094, + "loss": 0.6578, + "step": 8270 + }, + { + "epoch": 0.4200987422421342, + "grad_norm": 0.06844856797298811, + "learning_rate": 0.0009635631536515825, + "loss": 0.6316, + "step": 8275 + }, + { + "epoch": 0.42035257834016576, + "grad_norm": 0.03539995161128657, + "learning_rate": 0.0009634800915229205, + "loss": 0.6597, + "step": 8280 + }, + { + "epoch": 0.42060641443819724, + "grad_norm": 0.033253904264618245, + "learning_rate": 0.0009633969384154279, + "loss": 0.6066, + "step": 8285 + }, + { + "epoch": 0.4208602505362288, + "grad_norm": 0.042401610577462875, + "learning_rate": 0.0009633136943454271, + "loss": 0.6235, + "step": 8290 + }, + { + "epoch": 0.42111408663426025, + "grad_norm": 0.06416678518804314, + "learning_rate": 0.0009632303593292589, + "loss": 0.6112, + "step": 8295 + }, + { + "epoch": 0.42136792273229173, + "grad_norm": 0.055313941008177936, + "learning_rate": 0.0009631469333832809, + "loss": 0.6632, + "step": 8300 + }, + { + "epoch": 0.42162175883032327, + "grad_norm": 0.11795660265352328, + "learning_rate": 0.0009630634165238699, + "loss": 0.6342, + "step": 8305 + }, + { + "epoch": 0.42187559492835475, + "grad_norm": 0.0657921582096661, + "learning_rate": 0.0009629798087674194, + "loss": 0.6253, + "step": 8310 + }, + { + "epoch": 0.4221294310263863, + "grad_norm": 0.03717901771126588, + "learning_rate": 0.0009628961101303412, + "loss": 0.6518, + "step": 8315 + }, + { + "epoch": 0.42238326712441776, + "grad_norm": 0.04495023963968202, + "learning_rate": 0.0009628123206290654, + "loss": 0.6451, + "step": 8320 + }, + { + "epoch": 0.42263710322244924, + "grad_norm": 0.0418969458155804, + "learning_rate": 0.0009627284402800388, + "loss": 0.6534, + "step": 8325 + }, + { + "epoch": 0.4228909393204808, + "grad_norm": 0.030384984685908507, + "learning_rate": 0.0009626444690997272, + "loss": 0.5976, + "step": 8330 + }, + { + "epoch": 0.42314477541851225, + "grad_norm": 0.04470751844633285, + "learning_rate": 0.0009625604071046133, + "loss": 0.6242, + "step": 8335 + }, + { + "epoch": 0.4233986115165438, + "grad_norm": 0.025004516406755696, + "learning_rate": 0.0009624762543111985, + "loss": 0.6245, + "step": 8340 + }, + { + "epoch": 0.42365244761457527, + "grad_norm": 0.04181782200662666, + "learning_rate": 0.0009623920107360011, + "loss": 0.6792, + "step": 8345 + }, + { + "epoch": 0.42390628371260675, + "grad_norm": 0.03102456315861995, + "learning_rate": 0.0009623076763955581, + "loss": 0.6725, + "step": 8350 + }, + { + "epoch": 0.4241601198106383, + "grad_norm": 0.057488518244734055, + "learning_rate": 0.0009622232513064237, + "loss": 0.6335, + "step": 8355 + }, + { + "epoch": 0.42441395590866976, + "grad_norm": 0.05907000801077856, + "learning_rate": 0.00096213873548517, + "loss": 0.6626, + "step": 8360 + }, + { + "epoch": 0.4246677920067013, + "grad_norm": 0.07423416923574946, + "learning_rate": 0.0009620541289483875, + "loss": 0.6819, + "step": 8365 + }, + { + "epoch": 0.4249216281047328, + "grad_norm": 0.04354747716701456, + "learning_rate": 0.0009619694317126837, + "loss": 0.6638, + "step": 8370 + }, + { + "epoch": 0.42517546420276425, + "grad_norm": 0.03257416270726558, + "learning_rate": 0.0009618846437946842, + "loss": 0.6575, + "step": 8375 + }, + { + "epoch": 0.4254293003007958, + "grad_norm": 0.1385958109330031, + "learning_rate": 0.0009617997652110326, + "loss": 0.6893, + "step": 8380 + }, + { + "epoch": 0.42568313639882727, + "grad_norm": 0.11178651527717962, + "learning_rate": 0.00096171479597839, + "loss": 0.7221, + "step": 8385 + }, + { + "epoch": 0.4259369724968588, + "grad_norm": 0.07142680791086048, + "learning_rate": 0.0009616297361134355, + "loss": 0.6673, + "step": 8390 + }, + { + "epoch": 0.4261908085948903, + "grad_norm": 0.07599589867371198, + "learning_rate": 0.000961544585632866, + "loss": 0.6966, + "step": 8395 + }, + { + "epoch": 0.42644464469292176, + "grad_norm": 0.0324535534315102, + "learning_rate": 0.0009614593445533961, + "loss": 0.6887, + "step": 8400 + }, + { + "epoch": 0.4266984807909533, + "grad_norm": 0.03664906295093519, + "learning_rate": 0.0009613740128917581, + "loss": 0.6472, + "step": 8405 + }, + { + "epoch": 0.4269523168889848, + "grad_norm": 0.06479920979013709, + "learning_rate": 0.0009612885906647023, + "loss": 0.6738, + "step": 8410 + }, + { + "epoch": 0.4272061529870163, + "grad_norm": 0.04275229522059579, + "learning_rate": 0.0009612030778889966, + "loss": 0.6488, + "step": 8415 + }, + { + "epoch": 0.4274599890850478, + "grad_norm": 0.029470064994178823, + "learning_rate": 0.0009611174745814266, + "loss": 0.6667, + "step": 8420 + }, + { + "epoch": 0.42771382518307927, + "grad_norm": 0.03351287696319307, + "learning_rate": 0.000961031780758796, + "loss": 0.6559, + "step": 8425 + }, + { + "epoch": 0.4279676612811108, + "grad_norm": 0.05384940363959001, + "learning_rate": 0.000960945996437926, + "loss": 0.6732, + "step": 8430 + }, + { + "epoch": 0.4282214973791423, + "grad_norm": 0.0320326929365238, + "learning_rate": 0.0009608601216356557, + "loss": 0.7019, + "step": 8435 + }, + { + "epoch": 0.4284753334771738, + "grad_norm": 0.02719381758672677, + "learning_rate": 0.0009607741563688417, + "loss": 0.6465, + "step": 8440 + }, + { + "epoch": 0.4287291695752053, + "grad_norm": 0.03282220675318019, + "learning_rate": 0.0009606881006543589, + "loss": 0.6366, + "step": 8445 + }, + { + "epoch": 0.4289830056732368, + "grad_norm": 0.03248468141086955, + "learning_rate": 0.0009606019545090992, + "loss": 0.6066, + "step": 8450 + }, + { + "epoch": 0.4292368417712683, + "grad_norm": 0.03936893710205544, + "learning_rate": 0.0009605157179499728, + "loss": 0.6604, + "step": 8455 + }, + { + "epoch": 0.4294906778692998, + "grad_norm": 0.03206017279382604, + "learning_rate": 0.0009604293909939077, + "loss": 0.6396, + "step": 8460 + }, + { + "epoch": 0.4297445139673313, + "grad_norm": 0.056617557142407925, + "learning_rate": 0.0009603429736578493, + "loss": 0.6345, + "step": 8465 + }, + { + "epoch": 0.4299983500653628, + "grad_norm": 0.04111342531938289, + "learning_rate": 0.0009602564659587608, + "loss": 0.6756, + "step": 8470 + }, + { + "epoch": 0.4302521861633943, + "grad_norm": 0.03727263305415869, + "learning_rate": 0.0009601698679136233, + "loss": 0.6321, + "step": 8475 + }, + { + "epoch": 0.4305060222614258, + "grad_norm": 0.03320811173723645, + "learning_rate": 0.0009600831795394358, + "loss": 0.6278, + "step": 8480 + }, + { + "epoch": 0.4307598583594573, + "grad_norm": 0.04118865787603138, + "learning_rate": 0.0009599964008532144, + "loss": 0.6521, + "step": 8485 + }, + { + "epoch": 0.43101369445748877, + "grad_norm": 0.027675504471304815, + "learning_rate": 0.0009599095318719935, + "loss": 0.6282, + "step": 8490 + }, + { + "epoch": 0.4312675305555203, + "grad_norm": 0.04646301540154575, + "learning_rate": 0.0009598225726128251, + "loss": 0.6408, + "step": 8495 + }, + { + "epoch": 0.4315213666535518, + "grad_norm": 0.030455384549113017, + "learning_rate": 0.0009597355230927789, + "loss": 0.6444, + "step": 8500 + }, + { + "epoch": 0.4317752027515833, + "grad_norm": 0.031905572716127935, + "learning_rate": 0.0009596483833289422, + "loss": 0.632, + "step": 8505 + }, + { + "epoch": 0.4320290388496148, + "grad_norm": 0.03178070686854921, + "learning_rate": 0.0009595611533384201, + "loss": 0.6341, + "step": 8510 + }, + { + "epoch": 0.4322828749476463, + "grad_norm": 0.037639770941935886, + "learning_rate": 0.0009594738331383355, + "loss": 0.6236, + "step": 8515 + }, + { + "epoch": 0.4325367110456778, + "grad_norm": 0.06449433125052263, + "learning_rate": 0.0009593864227458287, + "loss": 0.6534, + "step": 8520 + }, + { + "epoch": 0.4327905471437093, + "grad_norm": 0.028739688486159997, + "learning_rate": 0.0009592989221780581, + "loss": 0.6303, + "step": 8525 + }, + { + "epoch": 0.4330443832417408, + "grad_norm": 0.07881302169372612, + "learning_rate": 0.0009592113314521996, + "loss": 0.6427, + "step": 8530 + }, + { + "epoch": 0.4332982193397723, + "grad_norm": 0.06386066146903421, + "learning_rate": 0.0009591236505854468, + "loss": 0.6411, + "step": 8535 + }, + { + "epoch": 0.4335520554378038, + "grad_norm": 0.08823347046423324, + "learning_rate": 0.0009590358795950112, + "loss": 0.6628, + "step": 8540 + }, + { + "epoch": 0.4338058915358353, + "grad_norm": 0.04402973321402076, + "learning_rate": 0.0009589480184981214, + "loss": 0.6312, + "step": 8545 + }, + { + "epoch": 0.4340597276338668, + "grad_norm": 0.06702143934000367, + "learning_rate": 0.0009588600673120245, + "loss": 0.6386, + "step": 8550 + }, + { + "epoch": 0.43431356373189833, + "grad_norm": 0.0303696868506802, + "learning_rate": 0.0009587720260539847, + "loss": 0.6389, + "step": 8555 + }, + { + "epoch": 0.4345673998299298, + "grad_norm": 0.028798244571663335, + "learning_rate": 0.000958683894741284, + "loss": 0.6517, + "step": 8560 + }, + { + "epoch": 0.4348212359279613, + "grad_norm": 0.026950156282351694, + "learning_rate": 0.0009585956733912224, + "loss": 0.6254, + "step": 8565 + }, + { + "epoch": 0.4350750720259928, + "grad_norm": 0.0355083939976014, + "learning_rate": 0.0009585073620211169, + "loss": 0.6603, + "step": 8570 + }, + { + "epoch": 0.4353289081240243, + "grad_norm": 0.03066199950726959, + "learning_rate": 0.0009584189606483029, + "loss": 0.6305, + "step": 8575 + }, + { + "epoch": 0.43558274422205584, + "grad_norm": 0.030869429351402574, + "learning_rate": 0.0009583304692901331, + "loss": 0.6479, + "step": 8580 + }, + { + "epoch": 0.4358365803200873, + "grad_norm": 0.03223477546507825, + "learning_rate": 0.0009582418879639778, + "loss": 0.6709, + "step": 8585 + }, + { + "epoch": 0.4360904164181188, + "grad_norm": 0.0480603195178494, + "learning_rate": 0.0009581532166872252, + "loss": 0.6198, + "step": 8590 + }, + { + "epoch": 0.43634425251615033, + "grad_norm": 0.02653848497460224, + "learning_rate": 0.0009580644554772809, + "loss": 0.6247, + "step": 8595 + }, + { + "epoch": 0.4365980886141818, + "grad_norm": 0.03447545219420309, + "learning_rate": 0.0009579756043515684, + "loss": 0.6677, + "step": 8600 + }, + { + "epoch": 0.43685192471221335, + "grad_norm": 0.0340747508415548, + "learning_rate": 0.0009578866633275287, + "loss": 0.695, + "step": 8605 + }, + { + "epoch": 0.4371057608102448, + "grad_norm": 0.034686685618524345, + "learning_rate": 0.0009577976324226205, + "loss": 0.6303, + "step": 8610 + }, + { + "epoch": 0.4373595969082763, + "grad_norm": 0.04739152588761622, + "learning_rate": 0.0009577085116543201, + "loss": 0.6718, + "step": 8615 + }, + { + "epoch": 0.43761343300630784, + "grad_norm": 0.06752904402187106, + "learning_rate": 0.0009576193010401213, + "loss": 0.6227, + "step": 8620 + }, + { + "epoch": 0.4378672691043393, + "grad_norm": 0.03265339538465412, + "learning_rate": 0.0009575300005975361, + "loss": 0.6396, + "step": 8625 + }, + { + "epoch": 0.43812110520237085, + "grad_norm": 0.04449318891895552, + "learning_rate": 0.0009574406103440931, + "loss": 0.6716, + "step": 8630 + }, + { + "epoch": 0.43837494130040233, + "grad_norm": 0.04632596776822006, + "learning_rate": 0.0009573511302973399, + "loss": 0.6233, + "step": 8635 + }, + { + "epoch": 0.4386287773984338, + "grad_norm": 0.03465904010797608, + "learning_rate": 0.0009572615604748405, + "loss": 0.6524, + "step": 8640 + }, + { + "epoch": 0.43888261349646535, + "grad_norm": 0.0870434657255882, + "learning_rate": 0.000957171900894177, + "loss": 0.6214, + "step": 8645 + }, + { + "epoch": 0.4391364495944968, + "grad_norm": 0.04534670041292767, + "learning_rate": 0.0009570821515729496, + "loss": 0.6523, + "step": 8650 + }, + { + "epoch": 0.43939028569252836, + "grad_norm": 0.028417736922154647, + "learning_rate": 0.0009569923125287749, + "loss": 0.6387, + "step": 8655 + }, + { + "epoch": 0.43964412179055984, + "grad_norm": 0.03085472383605309, + "learning_rate": 0.0009569023837792885, + "loss": 0.6569, + "step": 8660 + }, + { + "epoch": 0.4398979578885913, + "grad_norm": 0.029597971880738125, + "learning_rate": 0.0009568123653421427, + "loss": 0.6517, + "step": 8665 + }, + { + "epoch": 0.44015179398662285, + "grad_norm": 0.031124321107765782, + "learning_rate": 0.0009567222572350078, + "loss": 0.6481, + "step": 8670 + }, + { + "epoch": 0.44040563008465433, + "grad_norm": 0.04250290000689362, + "learning_rate": 0.0009566320594755713, + "loss": 0.6301, + "step": 8675 + }, + { + "epoch": 0.44065946618268587, + "grad_norm": 0.03803602491108363, + "learning_rate": 0.0009565417720815389, + "loss": 0.6506, + "step": 8680 + }, + { + "epoch": 0.44091330228071735, + "grad_norm": 0.10059999444471293, + "learning_rate": 0.0009564513950706333, + "loss": 0.7053, + "step": 8685 + }, + { + "epoch": 0.4411671383787488, + "grad_norm": 0.028992308416219137, + "learning_rate": 0.0009563609284605951, + "loss": 0.6764, + "step": 8690 + }, + { + "epoch": 0.44142097447678036, + "grad_norm": 0.028072857577109308, + "learning_rate": 0.0009562703722691828, + "loss": 0.6359, + "step": 8695 + }, + { + "epoch": 0.44167481057481184, + "grad_norm": 0.04504031556014271, + "learning_rate": 0.0009561797265141717, + "loss": 0.6518, + "step": 8700 + }, + { + "epoch": 0.4419286466728434, + "grad_norm": 0.028108033251197, + "learning_rate": 0.0009560889912133552, + "loss": 0.6291, + "step": 8705 + }, + { + "epoch": 0.44218248277087485, + "grad_norm": 0.03109459111595203, + "learning_rate": 0.0009559981663845443, + "loss": 0.6163, + "step": 8710 + }, + { + "epoch": 0.44243631886890633, + "grad_norm": 0.04169194643254887, + "learning_rate": 0.0009559072520455672, + "loss": 0.6475, + "step": 8715 + }, + { + "epoch": 0.44269015496693787, + "grad_norm": 0.20480559958951622, + "learning_rate": 0.0009558162482142703, + "loss": 0.5948, + "step": 8720 + }, + { + "epoch": 0.44294399106496934, + "grad_norm": 0.056634271165419775, + "learning_rate": 0.000955725154908517, + "loss": 0.6407, + "step": 8725 + }, + { + "epoch": 0.4431978271630008, + "grad_norm": 0.04431366313723485, + "learning_rate": 0.0009556339721461885, + "loss": 0.6483, + "step": 8730 + }, + { + "epoch": 0.44345166326103236, + "grad_norm": 0.030702014731428928, + "learning_rate": 0.0009555426999451835, + "loss": 0.6127, + "step": 8735 + }, + { + "epoch": 0.44370549935906384, + "grad_norm": 0.04349749497078023, + "learning_rate": 0.0009554513383234184, + "loss": 0.6401, + "step": 8740 + }, + { + "epoch": 0.44395933545709537, + "grad_norm": 0.028774456041381447, + "learning_rate": 0.0009553598872988268, + "loss": 0.6411, + "step": 8745 + }, + { + "epoch": 0.44421317155512685, + "grad_norm": 0.04729637621631858, + "learning_rate": 0.0009552683468893601, + "loss": 0.6229, + "step": 8750 + }, + { + "epoch": 0.44446700765315833, + "grad_norm": 0.03315699465561875, + "learning_rate": 0.0009551767171129874, + "loss": 0.6576, + "step": 8755 + }, + { + "epoch": 0.44472084375118986, + "grad_norm": 0.07162055236767001, + "learning_rate": 0.0009550849979876952, + "loss": 0.6478, + "step": 8760 + }, + { + "epoch": 0.44497467984922134, + "grad_norm": 0.03618826906366848, + "learning_rate": 0.0009549931895314874, + "loss": 0.6262, + "step": 8765 + }, + { + "epoch": 0.4452285159472529, + "grad_norm": 0.05135770096791484, + "learning_rate": 0.0009549012917623854, + "loss": 0.6681, + "step": 8770 + }, + { + "epoch": 0.44548235204528436, + "grad_norm": 0.02358653749589321, + "learning_rate": 0.0009548093046984285, + "loss": 0.5986, + "step": 8775 + }, + { + "epoch": 0.44573618814331584, + "grad_norm": 0.04897232885513862, + "learning_rate": 0.0009547172283576733, + "loss": 0.6277, + "step": 8780 + }, + { + "epoch": 0.44599002424134737, + "grad_norm": 0.029813286658092467, + "learning_rate": 0.0009546250627581936, + "loss": 0.6221, + "step": 8785 + }, + { + "epoch": 0.44624386033937885, + "grad_norm": 0.04724784641629323, + "learning_rate": 0.0009545328079180815, + "loss": 0.644, + "step": 8790 + }, + { + "epoch": 0.4464976964374104, + "grad_norm": 0.04105666493239327, + "learning_rate": 0.0009544404638554459, + "loss": 0.6266, + "step": 8795 + }, + { + "epoch": 0.44675153253544186, + "grad_norm": 0.036873167174765825, + "learning_rate": 0.0009543480305884136, + "loss": 0.6148, + "step": 8800 + }, + { + "epoch": 0.44700536863347334, + "grad_norm": 0.024048534299153178, + "learning_rate": 0.0009542555081351286, + "loss": 0.6191, + "step": 8805 + }, + { + "epoch": 0.4472592047315049, + "grad_norm": 0.02625656937587616, + "learning_rate": 0.0009541628965137528, + "loss": 0.6347, + "step": 8810 + }, + { + "epoch": 0.44751304082953636, + "grad_norm": 0.060240180235133486, + "learning_rate": 0.0009540701957424653, + "loss": 0.5963, + "step": 8815 + }, + { + "epoch": 0.4477668769275679, + "grad_norm": 0.05265523726241905, + "learning_rate": 0.0009539774058394628, + "loss": 0.6619, + "step": 8820 + }, + { + "epoch": 0.44802071302559937, + "grad_norm": 0.06653130957206357, + "learning_rate": 0.0009538845268229596, + "loss": 0.643, + "step": 8825 + }, + { + "epoch": 0.44827454912363085, + "grad_norm": 0.027089718279573065, + "learning_rate": 0.0009537915587111872, + "loss": 0.6249, + "step": 8830 + }, + { + "epoch": 0.4485283852216624, + "grad_norm": 0.041501163456008426, + "learning_rate": 0.0009536985015223949, + "loss": 0.669, + "step": 8835 + }, + { + "epoch": 0.44878222131969386, + "grad_norm": 0.04886005734739699, + "learning_rate": 0.0009536053552748494, + "loss": 0.5982, + "step": 8840 + }, + { + "epoch": 0.4490360574177254, + "grad_norm": 0.07228127391608218, + "learning_rate": 0.0009535121199868348, + "loss": 0.6427, + "step": 8845 + }, + { + "epoch": 0.4492898935157569, + "grad_norm": 0.0560495813187393, + "learning_rate": 0.0009534187956766526, + "loss": 0.6376, + "step": 8850 + }, + { + "epoch": 0.44954372961378836, + "grad_norm": 0.024625434128465788, + "learning_rate": 0.000953325382362622, + "loss": 0.6456, + "step": 8855 + }, + { + "epoch": 0.4497975657118199, + "grad_norm": 0.02767775635007945, + "learning_rate": 0.0009532318800630797, + "loss": 0.6126, + "step": 8860 + }, + { + "epoch": 0.45005140180985137, + "grad_norm": 0.03132593059311349, + "learning_rate": 0.0009531382887963796, + "loss": 0.626, + "step": 8865 + }, + { + "epoch": 0.4503052379078829, + "grad_norm": 0.03409858571498195, + "learning_rate": 0.0009530446085808932, + "loss": 0.6232, + "step": 8870 + }, + { + "epoch": 0.4505590740059144, + "grad_norm": 0.03060723735055533, + "learning_rate": 0.0009529508394350093, + "loss": 0.6515, + "step": 8875 + }, + { + "epoch": 0.45081291010394586, + "grad_norm": 0.02791574567598346, + "learning_rate": 0.0009528569813771346, + "loss": 0.643, + "step": 8880 + }, + { + "epoch": 0.4510667462019774, + "grad_norm": 0.025463994979441774, + "learning_rate": 0.0009527630344256929, + "loss": 0.6072, + "step": 8885 + }, + { + "epoch": 0.4513205823000089, + "grad_norm": 0.03198977951851349, + "learning_rate": 0.0009526689985991255, + "loss": 0.6013, + "step": 8890 + }, + { + "epoch": 0.4515744183980404, + "grad_norm": 0.03573354091801804, + "learning_rate": 0.000952574873915891, + "loss": 0.6139, + "step": 8895 + }, + { + "epoch": 0.4518282544960719, + "grad_norm": 0.037344124835746804, + "learning_rate": 0.0009524806603944658, + "loss": 0.6095, + "step": 8900 + }, + { + "epoch": 0.45208209059410337, + "grad_norm": 0.0338181770188993, + "learning_rate": 0.0009523863580533434, + "loss": 0.63, + "step": 8905 + }, + { + "epoch": 0.4523359266921349, + "grad_norm": 0.047741999628023585, + "learning_rate": 0.000952291966911035, + "loss": 0.6191, + "step": 8910 + }, + { + "epoch": 0.4525897627901664, + "grad_norm": 0.027889977098360996, + "learning_rate": 0.0009521974869860691, + "loss": 0.6289, + "step": 8915 + }, + { + "epoch": 0.4528435988881979, + "grad_norm": 0.04542033140089097, + "learning_rate": 0.0009521029182969915, + "loss": 0.6292, + "step": 8920 + }, + { + "epoch": 0.4530974349862294, + "grad_norm": 0.060630645721840855, + "learning_rate": 0.000952008260862366, + "loss": 0.6073, + "step": 8925 + }, + { + "epoch": 0.4533512710842609, + "grad_norm": 0.042297603142427635, + "learning_rate": 0.0009519135147007726, + "loss": 0.6255, + "step": 8930 + }, + { + "epoch": 0.4536051071822924, + "grad_norm": 0.03012511251936056, + "learning_rate": 0.0009518186798308104, + "loss": 0.6177, + "step": 8935 + }, + { + "epoch": 0.4538589432803239, + "grad_norm": 0.030825815994073906, + "learning_rate": 0.0009517237562710943, + "loss": 0.6154, + "step": 8940 + }, + { + "epoch": 0.45411277937835537, + "grad_norm": 0.027418304719546968, + "learning_rate": 0.0009516287440402576, + "loss": 0.6451, + "step": 8945 + }, + { + "epoch": 0.4543666154763869, + "grad_norm": 0.03437809223733027, + "learning_rate": 0.0009515336431569508, + "loss": 0.6372, + "step": 8950 + }, + { + "epoch": 0.4546204515744184, + "grad_norm": 0.02706173332906627, + "learning_rate": 0.0009514384536398416, + "loss": 0.5825, + "step": 8955 + }, + { + "epoch": 0.4548742876724499, + "grad_norm": 0.026630978850147032, + "learning_rate": 0.0009513431755076152, + "loss": 0.6007, + "step": 8960 + }, + { + "epoch": 0.4551281237704814, + "grad_norm": 0.027776987645599445, + "learning_rate": 0.0009512478087789745, + "loss": 0.6777, + "step": 8965 + }, + { + "epoch": 0.4553819598685129, + "grad_norm": 0.03127848050806932, + "learning_rate": 0.0009511523534726391, + "loss": 0.6137, + "step": 8970 + }, + { + "epoch": 0.4556357959665444, + "grad_norm": 0.0316814909688181, + "learning_rate": 0.0009510568096073466, + "loss": 0.6259, + "step": 8975 + }, + { + "epoch": 0.4558896320645759, + "grad_norm": 0.03288592896635044, + "learning_rate": 0.0009509611772018519, + "loss": 0.594, + "step": 8980 + }, + { + "epoch": 0.4561434681626074, + "grad_norm": 0.030504149235616482, + "learning_rate": 0.0009508654562749271, + "loss": 0.6559, + "step": 8985 + }, + { + "epoch": 0.4563973042606389, + "grad_norm": 0.02918212945710069, + "learning_rate": 0.0009507696468453615, + "loss": 0.6376, + "step": 8990 + }, + { + "epoch": 0.4566511403586704, + "grad_norm": 0.02867657062417966, + "learning_rate": 0.0009506737489319623, + "loss": 0.6153, + "step": 8995 + }, + { + "epoch": 0.4569049764567019, + "grad_norm": 0.04312848043233246, + "learning_rate": 0.0009505777625535538, + "loss": 0.6249, + "step": 9000 + }, + { + "epoch": 0.4571588125547334, + "grad_norm": 0.025378226871406697, + "learning_rate": 0.0009504816877289775, + "loss": 0.625, + "step": 9005 + }, + { + "epoch": 0.45741264865276493, + "grad_norm": 0.034824462280793105, + "learning_rate": 0.0009503855244770923, + "loss": 0.5977, + "step": 9010 + }, + { + "epoch": 0.4576664847507964, + "grad_norm": 0.06002542817312333, + "learning_rate": 0.0009502892728167749, + "loss": 0.6148, + "step": 9015 + }, + { + "epoch": 0.4579203208488279, + "grad_norm": 0.04708946523260842, + "learning_rate": 0.0009501929327669188, + "loss": 0.6292, + "step": 9020 + }, + { + "epoch": 0.4581741569468594, + "grad_norm": 0.030847513835845684, + "learning_rate": 0.0009500965043464349, + "loss": 0.5828, + "step": 9025 + }, + { + "epoch": 0.4584279930448909, + "grad_norm": 0.030701249013652274, + "learning_rate": 0.000949999987574252, + "loss": 0.6038, + "step": 9030 + }, + { + "epoch": 0.45868182914292244, + "grad_norm": 0.027439149183266075, + "learning_rate": 0.0009499033824693158, + "loss": 0.6028, + "step": 9035 + }, + { + "epoch": 0.4589356652409539, + "grad_norm": 0.050799419978763395, + "learning_rate": 0.000949806689050589, + "loss": 0.633, + "step": 9040 + }, + { + "epoch": 0.4591895013389854, + "grad_norm": 0.03760838688025009, + "learning_rate": 0.0009497099073370526, + "loss": 0.5974, + "step": 9045 + }, + { + "epoch": 0.45944333743701693, + "grad_norm": 0.03301722262990158, + "learning_rate": 0.0009496130373477039, + "loss": 0.614, + "step": 9050 + }, + { + "epoch": 0.4596971735350484, + "grad_norm": 0.03076280298803993, + "learning_rate": 0.0009495160791015583, + "loss": 0.6149, + "step": 9055 + }, + { + "epoch": 0.45995100963307994, + "grad_norm": 0.02435240521738851, + "learning_rate": 0.0009494190326176479, + "loss": 0.5875, + "step": 9060 + }, + { + "epoch": 0.4602048457311114, + "grad_norm": 0.036608354215028616, + "learning_rate": 0.0009493218979150229, + "loss": 0.6492, + "step": 9065 + }, + { + "epoch": 0.4604586818291429, + "grad_norm": 0.031052494072487117, + "learning_rate": 0.00094922467501275, + "loss": 0.6289, + "step": 9070 + }, + { + "epoch": 0.46071251792717444, + "grad_norm": 0.05646399870150115, + "learning_rate": 0.0009491273639299136, + "loss": 0.6317, + "step": 9075 + }, + { + "epoch": 0.4609663540252059, + "grad_norm": 0.026516267771592012, + "learning_rate": 0.0009490299646856156, + "loss": 0.6345, + "step": 9080 + }, + { + "epoch": 0.46122019012323745, + "grad_norm": 0.026314766475744072, + "learning_rate": 0.0009489324772989747, + "loss": 0.5714, + "step": 9085 + }, + { + "epoch": 0.46147402622126893, + "grad_norm": 0.04954065910053027, + "learning_rate": 0.0009488349017891275, + "loss": 0.5893, + "step": 9090 + }, + { + "epoch": 0.4617278623193004, + "grad_norm": 0.030999818544982013, + "learning_rate": 0.0009487372381752273, + "loss": 0.6209, + "step": 9095 + }, + { + "epoch": 0.46198169841733194, + "grad_norm": 0.05472893804971776, + "learning_rate": 0.0009486394864764452, + "loss": 0.6358, + "step": 9100 + }, + { + "epoch": 0.4622355345153634, + "grad_norm": 0.03583598972027129, + "learning_rate": 0.000948541646711969, + "loss": 0.6119, + "step": 9105 + }, + { + "epoch": 0.46248937061339496, + "grad_norm": 0.03444527417330176, + "learning_rate": 0.0009484437189010047, + "loss": 0.6398, + "step": 9110 + }, + { + "epoch": 0.46274320671142644, + "grad_norm": 0.03009633415385528, + "learning_rate": 0.0009483457030627746, + "loss": 0.6811, + "step": 9115 + }, + { + "epoch": 0.4629970428094579, + "grad_norm": 0.02932226956272446, + "learning_rate": 0.000948247599216519, + "loss": 0.65, + "step": 9120 + }, + { + "epoch": 0.46325087890748945, + "grad_norm": 0.040643168448799956, + "learning_rate": 0.0009481494073814951, + "loss": 0.6312, + "step": 9125 + }, + { + "epoch": 0.46350471500552093, + "grad_norm": 0.035491703470452655, + "learning_rate": 0.0009480511275769773, + "loss": 0.6314, + "step": 9130 + }, + { + "epoch": 0.46375855110355246, + "grad_norm": 0.030658119230129795, + "learning_rate": 0.0009479527598222577, + "loss": 0.6384, + "step": 9135 + }, + { + "epoch": 0.46401238720158394, + "grad_norm": 0.031569879525829354, + "learning_rate": 0.0009478543041366452, + "loss": 0.6376, + "step": 9140 + }, + { + "epoch": 0.4642662232996154, + "grad_norm": 0.03790647935672875, + "learning_rate": 0.0009477557605394664, + "loss": 0.6153, + "step": 9145 + }, + { + "epoch": 0.46452005939764696, + "grad_norm": 0.030688459561337894, + "learning_rate": 0.0009476571290500647, + "loss": 0.6538, + "step": 9150 + }, + { + "epoch": 0.46477389549567844, + "grad_norm": 0.03266933170402601, + "learning_rate": 0.000947558409687801, + "loss": 0.642, + "step": 9155 + }, + { + "epoch": 0.4650277315937099, + "grad_norm": 0.02719276820532039, + "learning_rate": 0.0009474596024720534, + "loss": 0.6071, + "step": 9160 + }, + { + "epoch": 0.46528156769174145, + "grad_norm": 0.03990484440945978, + "learning_rate": 0.0009473607074222172, + "loss": 0.6301, + "step": 9165 + }, + { + "epoch": 0.46553540378977293, + "grad_norm": 0.030262431248659433, + "learning_rate": 0.0009472617245577053, + "loss": 0.6482, + "step": 9170 + }, + { + "epoch": 0.46578923988780446, + "grad_norm": 0.02733900220464855, + "learning_rate": 0.0009471626538979474, + "loss": 0.6199, + "step": 9175 + }, + { + "epoch": 0.46604307598583594, + "grad_norm": 0.04504748091932876, + "learning_rate": 0.0009470634954623905, + "loss": 0.596, + "step": 9180 + }, + { + "epoch": 0.4662969120838674, + "grad_norm": 0.027370854127821743, + "learning_rate": 0.0009469642492704989, + "loss": 0.6347, + "step": 9185 + }, + { + "epoch": 0.46655074818189896, + "grad_norm": 0.05378088594277385, + "learning_rate": 0.0009468649153417542, + "loss": 0.6133, + "step": 9190 + }, + { + "epoch": 0.46680458427993043, + "grad_norm": 0.03391320932942569, + "learning_rate": 0.000946765493695655, + "loss": 0.6205, + "step": 9195 + }, + { + "epoch": 0.46705842037796197, + "grad_norm": 0.04140473437665591, + "learning_rate": 0.0009466659843517176, + "loss": 0.6276, + "step": 9200 + }, + { + "epoch": 0.46731225647599345, + "grad_norm": 0.02569082334485342, + "learning_rate": 0.0009465663873294747, + "loss": 0.6059, + "step": 9205 + }, + { + "epoch": 0.4675660925740249, + "grad_norm": 0.02578745353589086, + "learning_rate": 0.0009464667026484774, + "loss": 0.6109, + "step": 9210 + }, + { + "epoch": 0.46781992867205646, + "grad_norm": 0.04347854360585143, + "learning_rate": 0.0009463669303282927, + "loss": 0.6338, + "step": 9215 + }, + { + "epoch": 0.46807376477008794, + "grad_norm": 0.03554446657138465, + "learning_rate": 0.0009462670703885054, + "loss": 0.615, + "step": 9220 + }, + { + "epoch": 0.4683276008681195, + "grad_norm": 0.035194592885897366, + "learning_rate": 0.0009461671228487181, + "loss": 0.6292, + "step": 9225 + }, + { + "epoch": 0.46858143696615095, + "grad_norm": 0.048381105428334986, + "learning_rate": 0.0009460670877285493, + "loss": 0.6208, + "step": 9230 + }, + { + "epoch": 0.46883527306418243, + "grad_norm": 0.029440014096268563, + "learning_rate": 0.0009459669650476359, + "loss": 0.6429, + "step": 9235 + }, + { + "epoch": 0.46908910916221397, + "grad_norm": 0.032081756608336384, + "learning_rate": 0.0009458667548256312, + "loss": 0.6007, + "step": 9240 + }, + { + "epoch": 0.46934294526024545, + "grad_norm": 0.029846212941765773, + "learning_rate": 0.0009457664570822061, + "loss": 0.6394, + "step": 9245 + }, + { + "epoch": 0.469596781358277, + "grad_norm": 0.029316590912338028, + "learning_rate": 0.0009456660718370484, + "loss": 0.6067, + "step": 9250 + }, + { + "epoch": 0.46985061745630846, + "grad_norm": 0.0239561828979098, + "learning_rate": 0.0009455655991098635, + "loss": 0.6099, + "step": 9255 + }, + { + "epoch": 0.47010445355433994, + "grad_norm": 0.031644371771562194, + "learning_rate": 0.0009454650389203735, + "loss": 0.6135, + "step": 9260 + }, + { + "epoch": 0.4703582896523715, + "grad_norm": 0.02960336906370349, + "learning_rate": 0.0009453643912883179, + "loss": 0.5989, + "step": 9265 + }, + { + "epoch": 0.47061212575040295, + "grad_norm": 0.026335689130472083, + "learning_rate": 0.0009452636562334532, + "loss": 0.6412, + "step": 9270 + }, + { + "epoch": 0.4708659618484345, + "grad_norm": 0.02648170596691074, + "learning_rate": 0.0009451628337755533, + "loss": 0.5987, + "step": 9275 + }, + { + "epoch": 0.47111979794646597, + "grad_norm": 0.09832497965370343, + "learning_rate": 0.0009450619239344094, + "loss": 0.6353, + "step": 9280 + }, + { + "epoch": 0.47137363404449745, + "grad_norm": 0.04865339323397357, + "learning_rate": 0.0009449609267298292, + "loss": 0.6184, + "step": 9285 + }, + { + "epoch": 0.471627470142529, + "grad_norm": 0.0675079877924424, + "learning_rate": 0.000944859842181638, + "loss": 0.6434, + "step": 9290 + }, + { + "epoch": 0.47188130624056046, + "grad_norm": 0.049608513841766726, + "learning_rate": 0.0009447586703096784, + "loss": 0.6152, + "step": 9295 + }, + { + "epoch": 0.472135142338592, + "grad_norm": 0.027169536333829045, + "learning_rate": 0.0009446574111338097, + "loss": 0.6311, + "step": 9300 + }, + { + "epoch": 0.4723889784366235, + "grad_norm": 0.03714479241574352, + "learning_rate": 0.0009445560646739088, + "loss": 0.6124, + "step": 9305 + }, + { + "epoch": 0.47264281453465495, + "grad_norm": 0.02908329857098156, + "learning_rate": 0.0009444546309498693, + "loss": 0.5914, + "step": 9310 + }, + { + "epoch": 0.4728966506326865, + "grad_norm": 0.040933687722797846, + "learning_rate": 0.0009443531099816025, + "loss": 0.6233, + "step": 9315 + }, + { + "epoch": 0.47315048673071797, + "grad_norm": 0.039379070667730956, + "learning_rate": 0.0009442515017890361, + "loss": 0.6072, + "step": 9320 + }, + { + "epoch": 0.4734043228287495, + "grad_norm": 0.029435204711081575, + "learning_rate": 0.0009441498063921152, + "loss": 0.6187, + "step": 9325 + }, + { + "epoch": 0.473658158926781, + "grad_norm": 0.03446056203368723, + "learning_rate": 0.0009440480238108025, + "loss": 0.6397, + "step": 9330 + }, + { + "epoch": 0.47391199502481246, + "grad_norm": 0.033665244711908196, + "learning_rate": 0.000943946154065077, + "loss": 0.6008, + "step": 9335 + }, + { + "epoch": 0.474165831122844, + "grad_norm": 0.027286057953673223, + "learning_rate": 0.0009438441971749354, + "loss": 0.6321, + "step": 9340 + }, + { + "epoch": 0.4744196672208755, + "grad_norm": 0.027252383518020805, + "learning_rate": 0.0009437421531603916, + "loss": 0.6052, + "step": 9345 + }, + { + "epoch": 0.474673503318907, + "grad_norm": 0.0323130206125324, + "learning_rate": 0.0009436400220414758, + "loss": 0.5934, + "step": 9350 + }, + { + "epoch": 0.4749273394169385, + "grad_norm": 0.05540094012390889, + "learning_rate": 0.0009435378038382363, + "loss": 0.6281, + "step": 9355 + }, + { + "epoch": 0.47518117551496997, + "grad_norm": 0.026971796001700972, + "learning_rate": 0.0009434354985707376, + "loss": 0.5914, + "step": 9360 + }, + { + "epoch": 0.4754350116130015, + "grad_norm": 0.028590512194596497, + "learning_rate": 0.0009433331062590621, + "loss": 0.5702, + "step": 9365 + }, + { + "epoch": 0.475688847711033, + "grad_norm": 0.03025843533876914, + "learning_rate": 0.0009432306269233087, + "loss": 0.6067, + "step": 9370 + }, + { + "epoch": 0.47594268380906446, + "grad_norm": 0.038050502244553336, + "learning_rate": 0.0009431280605835937, + "loss": 0.5976, + "step": 9375 + }, + { + "epoch": 0.476196519907096, + "grad_norm": 0.02875268521706395, + "learning_rate": 0.0009430254072600501, + "loss": 0.6181, + "step": 9380 + }, + { + "epoch": 0.4764503560051275, + "grad_norm": 0.03480214322443527, + "learning_rate": 0.0009429226669728285, + "loss": 0.5914, + "step": 9385 + }, + { + "epoch": 0.476704192103159, + "grad_norm": 0.02601422654788008, + "learning_rate": 0.0009428198397420964, + "loss": 0.5903, + "step": 9390 + }, + { + "epoch": 0.4769580282011905, + "grad_norm": 0.025876539373789892, + "learning_rate": 0.0009427169255880379, + "loss": 0.6328, + "step": 9395 + }, + { + "epoch": 0.47721186429922197, + "grad_norm": 0.04738818931522865, + "learning_rate": 0.0009426139245308548, + "loss": 0.5819, + "step": 9400 + }, + { + "epoch": 0.4774657003972535, + "grad_norm": 0.035112867568037956, + "learning_rate": 0.0009425108365907658, + "loss": 0.6039, + "step": 9405 + }, + { + "epoch": 0.477719536495285, + "grad_norm": 0.03811522461096082, + "learning_rate": 0.0009424076617880059, + "loss": 0.5912, + "step": 9410 + }, + { + "epoch": 0.4779733725933165, + "grad_norm": 0.03389803328245173, + "learning_rate": 0.0009423044001428287, + "loss": 0.5831, + "step": 9415 + }, + { + "epoch": 0.478227208691348, + "grad_norm": 0.03133776562544537, + "learning_rate": 0.0009422010516755034, + "loss": 0.6577, + "step": 9420 + }, + { + "epoch": 0.4784810447893795, + "grad_norm": 0.0269014540427202, + "learning_rate": 0.0009420976164063169, + "loss": 0.6213, + "step": 9425 + }, + { + "epoch": 0.478734880887411, + "grad_norm": 0.02669607689703728, + "learning_rate": 0.0009419940943555731, + "loss": 0.6164, + "step": 9430 + }, + { + "epoch": 0.4789887169854425, + "grad_norm": 0.026824612081548814, + "learning_rate": 0.0009418904855435927, + "loss": 0.6229, + "step": 9435 + }, + { + "epoch": 0.479242553083474, + "grad_norm": 0.046154821680137924, + "learning_rate": 0.0009417867899907138, + "loss": 0.5931, + "step": 9440 + }, + { + "epoch": 0.4794963891815055, + "grad_norm": 0.036033799620862, + "learning_rate": 0.0009416830077172911, + "loss": 0.6269, + "step": 9445 + }, + { + "epoch": 0.479750225279537, + "grad_norm": 0.06210323273721612, + "learning_rate": 0.0009415791387436968, + "loss": 0.6021, + "step": 9450 + }, + { + "epoch": 0.4800040613775685, + "grad_norm": 0.027549653199184822, + "learning_rate": 0.0009414751830903195, + "loss": 0.6554, + "step": 9455 + }, + { + "epoch": 0.4802578974756, + "grad_norm": 0.03550040518296244, + "learning_rate": 0.0009413711407775655, + "loss": 0.6116, + "step": 9460 + }, + { + "epoch": 0.4805117335736315, + "grad_norm": 0.025622868890069674, + "learning_rate": 0.0009412670118258578, + "loss": 0.6054, + "step": 9465 + }, + { + "epoch": 0.480765569671663, + "grad_norm": 0.04189228398723917, + "learning_rate": 0.0009411627962556359, + "loss": 0.6122, + "step": 9470 + }, + { + "epoch": 0.4810194057696945, + "grad_norm": 0.02570476986142007, + "learning_rate": 0.0009410584940873574, + "loss": 0.6176, + "step": 9475 + }, + { + "epoch": 0.481273241867726, + "grad_norm": 0.07031584927871945, + "learning_rate": 0.0009409541053414963, + "loss": 0.5885, + "step": 9480 + }, + { + "epoch": 0.4815270779657575, + "grad_norm": 0.03149142845263447, + "learning_rate": 0.000940849630038543, + "loss": 0.6325, + "step": 9485 + }, + { + "epoch": 0.48178091406378903, + "grad_norm": 0.049358378222708374, + "learning_rate": 0.0009407450681990061, + "loss": 0.6283, + "step": 9490 + }, + { + "epoch": 0.4820347501618205, + "grad_norm": 0.025008692305935783, + "learning_rate": 0.0009406404198434102, + "loss": 0.6001, + "step": 9495 + }, + { + "epoch": 0.482288586259852, + "grad_norm": 0.030799405008239217, + "learning_rate": 0.0009405356849922972, + "loss": 0.638, + "step": 9500 + }, + { + "epoch": 0.4825424223578835, + "grad_norm": 0.024246543364429107, + "learning_rate": 0.0009404308636662264, + "loss": 0.6356, + "step": 9505 + }, + { + "epoch": 0.482796258455915, + "grad_norm": 0.03302243412784637, + "learning_rate": 0.0009403259558857734, + "loss": 0.6112, + "step": 9510 + }, + { + "epoch": 0.48305009455394654, + "grad_norm": 0.023532175082063685, + "learning_rate": 0.0009402209616715311, + "loss": 0.585, + "step": 9515 + }, + { + "epoch": 0.483303930651978, + "grad_norm": 0.04255227744929531, + "learning_rate": 0.0009401158810441095, + "loss": 0.6327, + "step": 9520 + }, + { + "epoch": 0.4835577667500095, + "grad_norm": 0.024840322869465154, + "learning_rate": 0.0009400107140241354, + "loss": 0.6208, + "step": 9525 + }, + { + "epoch": 0.48381160284804103, + "grad_norm": 0.024921227435088424, + "learning_rate": 0.0009399054606322524, + "loss": 0.6054, + "step": 9530 + }, + { + "epoch": 0.4840654389460725, + "grad_norm": 0.031344853057544725, + "learning_rate": 0.0009398001208891212, + "loss": 0.5989, + "step": 9535 + }, + { + "epoch": 0.48431927504410405, + "grad_norm": 0.14707366575213146, + "learning_rate": 0.0009396946948154194, + "loss": 0.6113, + "step": 9540 + }, + { + "epoch": 0.4845731111421355, + "grad_norm": 0.02572297244175927, + "learning_rate": 0.0009395891824318421, + "loss": 0.6063, + "step": 9545 + }, + { + "epoch": 0.484826947240167, + "grad_norm": 0.024708458838109962, + "learning_rate": 0.0009394835837591004, + "loss": 0.6199, + "step": 9550 + }, + { + "epoch": 0.48508078333819854, + "grad_norm": 0.035794765917293574, + "learning_rate": 0.0009393778988179229, + "loss": 0.635, + "step": 9555 + }, + { + "epoch": 0.48533461943623, + "grad_norm": 0.0554244840891106, + "learning_rate": 0.0009392721276290549, + "loss": 0.6169, + "step": 9560 + }, + { + "epoch": 0.48558845553426155, + "grad_norm": 0.029279116423008678, + "learning_rate": 0.0009391662702132591, + "loss": 0.637, + "step": 9565 + }, + { + "epoch": 0.48584229163229303, + "grad_norm": 0.028687460122576846, + "learning_rate": 0.0009390603265913145, + "loss": 0.6328, + "step": 9570 + }, + { + "epoch": 0.4860961277303245, + "grad_norm": 0.03505406613441969, + "learning_rate": 0.0009389542967840173, + "loss": 0.5973, + "step": 9575 + }, + { + "epoch": 0.48634996382835605, + "grad_norm": 0.041803466393629654, + "learning_rate": 0.0009388481808121807, + "loss": 0.599, + "step": 9580 + }, + { + "epoch": 0.4866037999263875, + "grad_norm": 0.023902502916385335, + "learning_rate": 0.0009387419786966348, + "loss": 0.5804, + "step": 9585 + }, + { + "epoch": 0.486857636024419, + "grad_norm": 0.04180433344415987, + "learning_rate": 0.0009386356904582265, + "loss": 0.6429, + "step": 9590 + }, + { + "epoch": 0.48711147212245054, + "grad_norm": 0.037702560242762466, + "learning_rate": 0.0009385293161178197, + "loss": 0.6352, + "step": 9595 + }, + { + "epoch": 0.487365308220482, + "grad_norm": 0.040454587767540365, + "learning_rate": 0.0009384228556962949, + "loss": 0.617, + "step": 9600 + }, + { + "epoch": 0.48761914431851355, + "grad_norm": 0.051660525518630485, + "learning_rate": 0.0009383163092145501, + "loss": 0.6255, + "step": 9605 + }, + { + "epoch": 0.48787298041654503, + "grad_norm": 0.03034861191132385, + "learning_rate": 0.0009382096766934996, + "loss": 0.6528, + "step": 9610 + }, + { + "epoch": 0.4881268165145765, + "grad_norm": 0.03757127293588768, + "learning_rate": 0.000938102958154075, + "loss": 0.59, + "step": 9615 + }, + { + "epoch": 0.48838065261260805, + "grad_norm": 0.07147380234071418, + "learning_rate": 0.0009379961536172244, + "loss": 0.6392, + "step": 9620 + }, + { + "epoch": 0.4886344887106395, + "grad_norm": 0.026977337654578798, + "learning_rate": 0.0009378892631039132, + "loss": 0.6504, + "step": 9625 + }, + { + "epoch": 0.48888832480867106, + "grad_norm": 0.04626336090648292, + "learning_rate": 0.0009377822866351235, + "loss": 0.651, + "step": 9630 + }, + { + "epoch": 0.48914216090670254, + "grad_norm": 0.06597352026364534, + "learning_rate": 0.000937675224231854, + "loss": 0.6051, + "step": 9635 + }, + { + "epoch": 0.489395997004734, + "grad_norm": 0.04524993145754299, + "learning_rate": 0.0009375680759151206, + "loss": 0.6247, + "step": 9640 + }, + { + "epoch": 0.48964983310276555, + "grad_norm": 0.034280234478550185, + "learning_rate": 0.0009374608417059562, + "loss": 0.612, + "step": 9645 + }, + { + "epoch": 0.48990366920079703, + "grad_norm": 0.02631497433619995, + "learning_rate": 0.0009373535216254101, + "loss": 0.612, + "step": 9650 + }, + { + "epoch": 0.49015750529882857, + "grad_norm": 0.03832792024810497, + "learning_rate": 0.0009372461156945489, + "loss": 0.6249, + "step": 9655 + }, + { + "epoch": 0.49041134139686005, + "grad_norm": 0.033364201876492666, + "learning_rate": 0.0009371386239344557, + "loss": 0.637, + "step": 9660 + }, + { + "epoch": 0.4906651774948915, + "grad_norm": 0.03219096028559519, + "learning_rate": 0.0009370310463662306, + "loss": 0.6425, + "step": 9665 + }, + { + "epoch": 0.49091901359292306, + "grad_norm": 0.024519457484542897, + "learning_rate": 0.0009369233830109905, + "loss": 0.5807, + "step": 9670 + }, + { + "epoch": 0.49117284969095454, + "grad_norm": 0.0341270179716214, + "learning_rate": 0.0009368156338898694, + "loss": 0.6251, + "step": 9675 + }, + { + "epoch": 0.4914266857889861, + "grad_norm": 0.024071390232175503, + "learning_rate": 0.0009367077990240176, + "loss": 0.5962, + "step": 9680 + }, + { + "epoch": 0.49168052188701755, + "grad_norm": 0.02743756336479237, + "learning_rate": 0.0009365998784346028, + "loss": 0.6005, + "step": 9685 + }, + { + "epoch": 0.49193435798504903, + "grad_norm": 0.04122709532915945, + "learning_rate": 0.0009364918721428093, + "loss": 0.5867, + "step": 9690 + }, + { + "epoch": 0.49218819408308057, + "grad_norm": 0.02721674730288296, + "learning_rate": 0.0009363837801698379, + "loss": 0.62, + "step": 9695 + }, + { + "epoch": 0.49244203018111204, + "grad_norm": 0.04100687725132822, + "learning_rate": 0.0009362756025369067, + "loss": 0.6184, + "step": 9700 + }, + { + "epoch": 0.4926958662791436, + "grad_norm": 0.023552301564775294, + "learning_rate": 0.0009361673392652505, + "loss": 0.5772, + "step": 9705 + }, + { + "epoch": 0.49294970237717506, + "grad_norm": 0.043765850357777386, + "learning_rate": 0.0009360589903761208, + "loss": 0.5763, + "step": 9710 + }, + { + "epoch": 0.49320353847520654, + "grad_norm": 0.0377192420219829, + "learning_rate": 0.0009359505558907857, + "loss": 0.5907, + "step": 9715 + }, + { + "epoch": 0.4934573745732381, + "grad_norm": 0.04173445599728607, + "learning_rate": 0.0009358420358305307, + "loss": 0.6209, + "step": 9720 + }, + { + "epoch": 0.49371121067126955, + "grad_norm": 0.02795345786794999, + "learning_rate": 0.0009357334302166577, + "loss": 0.6139, + "step": 9725 + }, + { + "epoch": 0.4939650467693011, + "grad_norm": 0.030534586657452248, + "learning_rate": 0.0009356247390704853, + "loss": 0.6334, + "step": 9730 + }, + { + "epoch": 0.49421888286733257, + "grad_norm": 0.024191524057541644, + "learning_rate": 0.0009355159624133489, + "loss": 0.5692, + "step": 9735 + }, + { + "epoch": 0.49447271896536404, + "grad_norm": 0.03323609638954307, + "learning_rate": 0.0009354071002666011, + "loss": 0.6106, + "step": 9740 + }, + { + "epoch": 0.4947265550633956, + "grad_norm": 0.024949282216645566, + "learning_rate": 0.000935298152651611, + "loss": 0.5978, + "step": 9745 + }, + { + "epoch": 0.49498039116142706, + "grad_norm": 0.06663669858744471, + "learning_rate": 0.0009351891195897644, + "loss": 0.623, + "step": 9750 + }, + { + "epoch": 0.4952342272594586, + "grad_norm": 0.032099026906883724, + "learning_rate": 0.0009350800011024636, + "loss": 0.6189, + "step": 9755 + }, + { + "epoch": 0.49548806335749007, + "grad_norm": 0.0663218764773123, + "learning_rate": 0.0009349707972111285, + "loss": 0.6074, + "step": 9760 + }, + { + "epoch": 0.49574189945552155, + "grad_norm": 0.025197965203499328, + "learning_rate": 0.0009348615079371952, + "loss": 0.5815, + "step": 9765 + }, + { + "epoch": 0.4959957355535531, + "grad_norm": 0.04923718941099546, + "learning_rate": 0.0009347521333021165, + "loss": 0.6104, + "step": 9770 + }, + { + "epoch": 0.49624957165158456, + "grad_norm": 0.024233669045321673, + "learning_rate": 0.000934642673327362, + "loss": 0.5966, + "step": 9775 + }, + { + "epoch": 0.4965034077496161, + "grad_norm": 0.02630064833029989, + "learning_rate": 0.0009345331280344184, + "loss": 0.6308, + "step": 9780 + }, + { + "epoch": 0.4967572438476476, + "grad_norm": 0.03733420897200894, + "learning_rate": 0.0009344234974447888, + "loss": 0.5984, + "step": 9785 + }, + { + "epoch": 0.49701107994567906, + "grad_norm": 0.03288604787671546, + "learning_rate": 0.0009343137815799931, + "loss": 0.6278, + "step": 9790 + }, + { + "epoch": 0.4972649160437106, + "grad_norm": 0.02870697067749721, + "learning_rate": 0.000934203980461568, + "loss": 0.5907, + "step": 9795 + }, + { + "epoch": 0.49751875214174207, + "grad_norm": 0.029210706904460447, + "learning_rate": 0.0009340940941110669, + "loss": 0.623, + "step": 9800 + }, + { + "epoch": 0.4977725882397736, + "grad_norm": 0.022650840666329053, + "learning_rate": 0.00093398412255006, + "loss": 0.585, + "step": 9805 + }, + { + "epoch": 0.4980264243378051, + "grad_norm": 0.033968761190204845, + "learning_rate": 0.000933874065800134, + "loss": 0.6272, + "step": 9810 + }, + { + "epoch": 0.49828026043583656, + "grad_norm": 0.03225151989127355, + "learning_rate": 0.0009337639238828927, + "loss": 0.5957, + "step": 9815 + }, + { + "epoch": 0.4985340965338681, + "grad_norm": 0.03766789630190195, + "learning_rate": 0.0009336536968199562, + "loss": 0.5961, + "step": 9820 + }, + { + "epoch": 0.4987879326318996, + "grad_norm": 0.024291519990984625, + "learning_rate": 0.0009335433846329618, + "loss": 0.5848, + "step": 9825 + }, + { + "epoch": 0.49904176872993106, + "grad_norm": 1.1081797676644811, + "learning_rate": 0.000933432987343563, + "loss": 0.6314, + "step": 9830 + }, + { + "epoch": 0.4992956048279626, + "grad_norm": 0.06216658843899505, + "learning_rate": 0.0009333225049734303, + "loss": 0.6137, + "step": 9835 + }, + { + "epoch": 0.49954944092599407, + "grad_norm": 0.06842641946547982, + "learning_rate": 0.0009332119375442509, + "loss": 0.6494, + "step": 9840 + }, + { + "epoch": 0.4998032770240256, + "grad_norm": 0.07771247073389582, + "learning_rate": 0.0009331012850777286, + "loss": 0.6461, + "step": 9845 + }, + { + "epoch": 0.5000571131220571, + "grad_norm": 0.09586623316765225, + "learning_rate": 0.0009329905475955838, + "loss": 0.6127, + "step": 9850 + }, + { + "epoch": 0.5003109492200886, + "grad_norm": 0.0549907807273626, + "learning_rate": 0.0009328797251195539, + "loss": 0.6336, + "step": 9855 + }, + { + "epoch": 0.5005647853181201, + "grad_norm": 0.03707374666368335, + "learning_rate": 0.0009327688176713927, + "loss": 0.6643, + "step": 9860 + }, + { + "epoch": 0.5008186214161516, + "grad_norm": 0.06451159664108401, + "learning_rate": 0.0009326578252728708, + "loss": 0.6221, + "step": 9865 + }, + { + "epoch": 0.5010724575141831, + "grad_norm": 0.07176924497990926, + "learning_rate": 0.0009325467479457754, + "loss": 0.6478, + "step": 9870 + }, + { + "epoch": 0.5013262936122146, + "grad_norm": 0.05337950431785672, + "learning_rate": 0.0009324355857119106, + "loss": 0.6161, + "step": 9875 + }, + { + "epoch": 0.5015801297102461, + "grad_norm": 0.040267440416152635, + "learning_rate": 0.0009323243385930968, + "loss": 0.6131, + "step": 9880 + }, + { + "epoch": 0.5018339658082775, + "grad_norm": 0.03271936843258226, + "learning_rate": 0.0009322130066111713, + "loss": 0.662, + "step": 9885 + }, + { + "epoch": 0.5020878019063091, + "grad_norm": 0.029561308985711136, + "learning_rate": 0.0009321015897879883, + "loss": 0.6276, + "step": 9890 + }, + { + "epoch": 0.5023416380043406, + "grad_norm": 0.053789724310795484, + "learning_rate": 0.0009319900881454179, + "loss": 0.6598, + "step": 9895 + }, + { + "epoch": 0.5025954741023722, + "grad_norm": 0.040676726302224304, + "learning_rate": 0.0009318785017053475, + "loss": 0.6075, + "step": 9900 + }, + { + "epoch": 0.5028493102004036, + "grad_norm": 0.04707911068377355, + "learning_rate": 0.0009317668304896811, + "loss": 0.6384, + "step": 9905 + }, + { + "epoch": 0.5031031462984351, + "grad_norm": 0.02729704500716682, + "learning_rate": 0.000931655074520339, + "loss": 0.6042, + "step": 9910 + }, + { + "epoch": 0.5033569823964666, + "grad_norm": 0.041817519847404235, + "learning_rate": 0.0009315432338192584, + "loss": 0.6407, + "step": 9915 + }, + { + "epoch": 0.5036108184944981, + "grad_norm": 0.06693004074647073, + "learning_rate": 0.0009314313084083933, + "loss": 0.6267, + "step": 9920 + }, + { + "epoch": 0.5038646545925296, + "grad_norm": 0.037734517481965976, + "learning_rate": 0.0009313192983097137, + "loss": 0.6235, + "step": 9925 + }, + { + "epoch": 0.5041184906905611, + "grad_norm": 0.04441041670706489, + "learning_rate": 0.0009312072035452069, + "loss": 0.6722, + "step": 9930 + }, + { + "epoch": 0.5043723267885926, + "grad_norm": 0.06966465373395124, + "learning_rate": 0.0009310950241368765, + "loss": 0.6524, + "step": 9935 + }, + { + "epoch": 0.5046261628866241, + "grad_norm": 0.11851335463883052, + "learning_rate": 0.0009309827601067428, + "loss": 0.6386, + "step": 9940 + }, + { + "epoch": 0.5048799989846556, + "grad_norm": 0.044597706923391586, + "learning_rate": 0.0009308704114768425, + "loss": 0.6495, + "step": 9945 + }, + { + "epoch": 0.505133835082687, + "grad_norm": 0.048673205080514974, + "learning_rate": 0.0009307579782692291, + "loss": 0.6183, + "step": 9950 + }, + { + "epoch": 0.5053876711807186, + "grad_norm": 0.03281635254658095, + "learning_rate": 0.0009306454605059729, + "loss": 0.6426, + "step": 9955 + }, + { + "epoch": 0.5056415072787501, + "grad_norm": 0.046125277826759015, + "learning_rate": 0.0009305328582091603, + "loss": 0.6343, + "step": 9960 + }, + { + "epoch": 0.5058953433767817, + "grad_norm": 0.03821293529358294, + "learning_rate": 0.0009304201714008948, + "loss": 0.6326, + "step": 9965 + }, + { + "epoch": 0.5061491794748131, + "grad_norm": 0.03994101386720053, + "learning_rate": 0.0009303074001032961, + "loss": 0.6243, + "step": 9970 + }, + { + "epoch": 0.5064030155728446, + "grad_norm": 0.04022586000406283, + "learning_rate": 0.0009301945443385007, + "loss": 0.6491, + "step": 9975 + }, + { + "epoch": 0.5066568516708762, + "grad_norm": 0.045502011916775865, + "learning_rate": 0.0009300816041286617, + "loss": 0.6329, + "step": 9980 + }, + { + "epoch": 0.5069106877689076, + "grad_norm": 0.031196383196199537, + "learning_rate": 0.0009299685794959485, + "loss": 0.6071, + "step": 9985 + }, + { + "epoch": 0.5071645238669391, + "grad_norm": 0.05198326476515164, + "learning_rate": 0.0009298554704625474, + "loss": 0.6269, + "step": 9990 + }, + { + "epoch": 0.5074183599649706, + "grad_norm": 0.044150229220093255, + "learning_rate": 0.0009297422770506613, + "loss": 0.6137, + "step": 9995 + }, + { + "epoch": 0.5076721960630021, + "grad_norm": 0.03554285265944353, + "learning_rate": 0.0009296289992825091, + "loss": 0.6603, + "step": 10000 + }, + { + "epoch": 0.5079260321610336, + "grad_norm": 0.02972651584300317, + "learning_rate": 0.0009295156371803271, + "loss": 0.6034, + "step": 10005 + }, + { + "epoch": 0.5081798682590651, + "grad_norm": 0.04646869581333915, + "learning_rate": 0.0009294021907663674, + "loss": 0.6213, + "step": 10010 + }, + { + "epoch": 0.5084337043570967, + "grad_norm": 0.04166905651002559, + "learning_rate": 0.0009292886600628991, + "loss": 0.6103, + "step": 10015 + }, + { + "epoch": 0.5086875404551281, + "grad_norm": 0.05922446812891187, + "learning_rate": 0.0009291750450922078, + "loss": 0.6225, + "step": 10020 + }, + { + "epoch": 0.5089413765531596, + "grad_norm": 0.024191658630679954, + "learning_rate": 0.0009290613458765953, + "loss": 0.6064, + "step": 10025 + }, + { + "epoch": 0.5091952126511912, + "grad_norm": 0.06362952277601344, + "learning_rate": 0.0009289475624383804, + "loss": 0.6077, + "step": 10030 + }, + { + "epoch": 0.5094490487492226, + "grad_norm": 0.040791426024977054, + "learning_rate": 0.0009288336947998981, + "loss": 0.6121, + "step": 10035 + }, + { + "epoch": 0.5097028848472541, + "grad_norm": 0.07317831667654075, + "learning_rate": 0.0009287197429835002, + "loss": 0.6198, + "step": 10040 + }, + { + "epoch": 0.5099567209452857, + "grad_norm": 0.031549492144105, + "learning_rate": 0.0009286057070115545, + "loss": 0.6359, + "step": 10045 + }, + { + "epoch": 0.5102105570433171, + "grad_norm": 0.047125852190035775, + "learning_rate": 0.0009284915869064463, + "loss": 0.6185, + "step": 10050 + }, + { + "epoch": 0.5104643931413486, + "grad_norm": 0.03542009861223921, + "learning_rate": 0.0009283773826905764, + "loss": 0.6187, + "step": 10055 + }, + { + "epoch": 0.5107182292393802, + "grad_norm": 0.026666894145488004, + "learning_rate": 0.0009282630943863625, + "loss": 0.6106, + "step": 10060 + }, + { + "epoch": 0.5109720653374117, + "grad_norm": 0.05545034501844204, + "learning_rate": 0.0009281487220162388, + "loss": 0.6283, + "step": 10065 + }, + { + "epoch": 0.5112259014354431, + "grad_norm": 0.02963889637887912, + "learning_rate": 0.0009280342656026564, + "loss": 0.6136, + "step": 10070 + }, + { + "epoch": 0.5114797375334746, + "grad_norm": 0.05901358817696739, + "learning_rate": 0.0009279197251680822, + "loss": 0.6145, + "step": 10075 + }, + { + "epoch": 0.5117335736315062, + "grad_norm": 0.040264054246851005, + "learning_rate": 0.000927805100735, + "loss": 0.6421, + "step": 10080 + }, + { + "epoch": 0.5119874097295376, + "grad_norm": 0.0649111774735333, + "learning_rate": 0.0009276903923259099, + "loss": 0.6036, + "step": 10085 + }, + { + "epoch": 0.5122412458275691, + "grad_norm": 0.037817792224739064, + "learning_rate": 0.0009275755999633286, + "loss": 0.6248, + "step": 10090 + }, + { + "epoch": 0.5124950819256007, + "grad_norm": 0.03397505282281072, + "learning_rate": 0.0009274607236697895, + "loss": 0.612, + "step": 10095 + }, + { + "epoch": 0.5127489180236321, + "grad_norm": 0.05208015537649028, + "learning_rate": 0.000927345763467842, + "loss": 0.6096, + "step": 10100 + }, + { + "epoch": 0.5130027541216636, + "grad_norm": 0.08839766979888053, + "learning_rate": 0.0009272307193800524, + "loss": 0.6074, + "step": 10105 + }, + { + "epoch": 0.5132565902196952, + "grad_norm": 0.03841799364281357, + "learning_rate": 0.000927115591429003, + "loss": 0.6152, + "step": 10110 + }, + { + "epoch": 0.5135104263177267, + "grad_norm": 0.03045633762227496, + "learning_rate": 0.0009270003796372933, + "loss": 0.6044, + "step": 10115 + }, + { + "epoch": 0.5137642624157581, + "grad_norm": 0.02569055323506306, + "learning_rate": 0.0009268850840275382, + "loss": 0.6355, + "step": 10120 + }, + { + "epoch": 0.5140180985137897, + "grad_norm": 0.026408037229659734, + "learning_rate": 0.0009267697046223702, + "loss": 0.6462, + "step": 10125 + }, + { + "epoch": 0.5142719346118212, + "grad_norm": 0.025909580463186872, + "learning_rate": 0.0009266542414444374, + "loss": 0.5864, + "step": 10130 + }, + { + "epoch": 0.5145257707098526, + "grad_norm": 0.03722734324648191, + "learning_rate": 0.0009265386945164049, + "loss": 0.6161, + "step": 10135 + }, + { + "epoch": 0.5147796068078841, + "grad_norm": 0.03566709272968282, + "learning_rate": 0.0009264230638609535, + "loss": 0.6085, + "step": 10140 + }, + { + "epoch": 0.5150334429059157, + "grad_norm": 0.06277602947724216, + "learning_rate": 0.0009263073495007814, + "loss": 0.6338, + "step": 10145 + }, + { + "epoch": 0.5152872790039471, + "grad_norm": 0.058049067673817435, + "learning_rate": 0.0009261915514586026, + "loss": 0.6555, + "step": 10150 + }, + { + "epoch": 0.5155411151019786, + "grad_norm": 0.03795674743934634, + "learning_rate": 0.0009260756697571477, + "loss": 0.6693, + "step": 10155 + }, + { + "epoch": 0.5157949512000102, + "grad_norm": 0.04777864042916011, + "learning_rate": 0.0009259597044191636, + "loss": 0.6382, + "step": 10160 + }, + { + "epoch": 0.5160487872980416, + "grad_norm": 0.03739573996244197, + "learning_rate": 0.0009258436554674137, + "loss": 0.638, + "step": 10165 + }, + { + "epoch": 0.5163026233960731, + "grad_norm": 0.028436191862654784, + "learning_rate": 0.000925727522924678, + "loss": 0.6003, + "step": 10170 + }, + { + "epoch": 0.5165564594941047, + "grad_norm": 0.03204307790523904, + "learning_rate": 0.0009256113068137526, + "loss": 0.6642, + "step": 10175 + }, + { + "epoch": 0.5168102955921362, + "grad_norm": 0.055050382059251654, + "learning_rate": 0.0009254950071574502, + "loss": 0.6275, + "step": 10180 + }, + { + "epoch": 0.5170641316901676, + "grad_norm": 0.026441208032810616, + "learning_rate": 0.0009253786239785999, + "loss": 0.6071, + "step": 10185 + }, + { + "epoch": 0.5173179677881992, + "grad_norm": 0.035768658322057015, + "learning_rate": 0.0009252621573000472, + "loss": 0.6499, + "step": 10190 + }, + { + "epoch": 0.5175718038862307, + "grad_norm": 0.024019701385016478, + "learning_rate": 0.0009251456071446536, + "loss": 0.5909, + "step": 10195 + }, + { + "epoch": 0.5178256399842621, + "grad_norm": 0.0261249278412097, + "learning_rate": 0.0009250289735352975, + "loss": 0.6388, + "step": 10200 + }, + { + "epoch": 0.5180794760822937, + "grad_norm": 0.05688851509224552, + "learning_rate": 0.0009249122564948736, + "loss": 0.6392, + "step": 10205 + }, + { + "epoch": 0.5183333121803252, + "grad_norm": 0.02773345047952887, + "learning_rate": 0.0009247954560462928, + "loss": 0.6311, + "step": 10210 + }, + { + "epoch": 0.5185871482783566, + "grad_norm": 0.0270974270992603, + "learning_rate": 0.0009246785722124823, + "loss": 0.6285, + "step": 10215 + }, + { + "epoch": 0.5188409843763881, + "grad_norm": 0.03094525704725652, + "learning_rate": 0.0009245616050163861, + "loss": 0.6084, + "step": 10220 + }, + { + "epoch": 0.5190948204744197, + "grad_norm": 0.0302889231805603, + "learning_rate": 0.000924444554480964, + "loss": 0.6384, + "step": 10225 + }, + { + "epoch": 0.5193486565724512, + "grad_norm": 0.053230977077894856, + "learning_rate": 0.0009243274206291926, + "loss": 0.5947, + "step": 10230 + }, + { + "epoch": 0.5196024926704826, + "grad_norm": 0.06540135430875998, + "learning_rate": 0.0009242102034840647, + "loss": 0.6397, + "step": 10235 + }, + { + "epoch": 0.5198563287685142, + "grad_norm": 0.026623481542029606, + "learning_rate": 0.0009240929030685893, + "loss": 0.6642, + "step": 10240 + }, + { + "epoch": 0.5201101648665457, + "grad_norm": 0.034876098813892115, + "learning_rate": 0.0009239755194057921, + "loss": 0.6457, + "step": 10245 + }, + { + "epoch": 0.5203640009645771, + "grad_norm": 0.03378616222313645, + "learning_rate": 0.0009238580525187146, + "loss": 0.5793, + "step": 10250 + }, + { + "epoch": 0.5206178370626087, + "grad_norm": 0.1052756526973527, + "learning_rate": 0.0009237405024304153, + "loss": 0.6458, + "step": 10255 + }, + { + "epoch": 0.5208716731606402, + "grad_norm": 0.03839663563882219, + "learning_rate": 0.0009236228691639686, + "loss": 0.665, + "step": 10260 + }, + { + "epoch": 0.5211255092586716, + "grad_norm": 0.033097916077724686, + "learning_rate": 0.0009235051527424652, + "loss": 0.6032, + "step": 10265 + }, + { + "epoch": 0.5213793453567032, + "grad_norm": 0.04114159903818375, + "learning_rate": 0.0009233873531890123, + "loss": 0.6592, + "step": 10270 + }, + { + "epoch": 0.5216331814547347, + "grad_norm": 0.059483637355810894, + "learning_rate": 0.0009232694705267335, + "loss": 0.6168, + "step": 10275 + }, + { + "epoch": 0.5218870175527662, + "grad_norm": 0.03912192748754646, + "learning_rate": 0.0009231515047787686, + "loss": 0.6632, + "step": 10280 + }, + { + "epoch": 0.5221408536507977, + "grad_norm": 0.03455355947863813, + "learning_rate": 0.0009230334559682734, + "loss": 0.6174, + "step": 10285 + }, + { + "epoch": 0.5223946897488292, + "grad_norm": 0.06201615106125605, + "learning_rate": 0.0009229153241184204, + "loss": 0.6291, + "step": 10290 + }, + { + "epoch": 0.5226485258468607, + "grad_norm": 0.03866401919091889, + "learning_rate": 0.0009227971092523983, + "loss": 0.6441, + "step": 10295 + }, + { + "epoch": 0.5229023619448921, + "grad_norm": 0.04274992026640168, + "learning_rate": 0.0009226788113934123, + "loss": 0.6226, + "step": 10300 + }, + { + "epoch": 0.5231561980429237, + "grad_norm": 0.050672917058466604, + "learning_rate": 0.0009225604305646835, + "loss": 0.6336, + "step": 10305 + }, + { + "epoch": 0.5234100341409552, + "grad_norm": 0.03842833678462949, + "learning_rate": 0.0009224419667894495, + "loss": 0.6085, + "step": 10310 + }, + { + "epoch": 0.5236638702389866, + "grad_norm": 0.038015665579822645, + "learning_rate": 0.000922323420090964, + "loss": 0.6214, + "step": 10315 + }, + { + "epoch": 0.5239177063370182, + "grad_norm": 0.03004579581769112, + "learning_rate": 0.0009222047904924975, + "loss": 0.5919, + "step": 10320 + }, + { + "epoch": 0.5241715424350497, + "grad_norm": 0.03190679818928855, + "learning_rate": 0.000922086078017336, + "loss": 0.6159, + "step": 10325 + }, + { + "epoch": 0.5244253785330812, + "grad_norm": 0.029075752083056672, + "learning_rate": 0.0009219672826887824, + "loss": 0.5941, + "step": 10330 + }, + { + "epoch": 0.5246792146311127, + "grad_norm": 0.034715107108026645, + "learning_rate": 0.0009218484045301554, + "loss": 0.6209, + "step": 10335 + }, + { + "epoch": 0.5249330507291442, + "grad_norm": 0.02962263191628107, + "learning_rate": 0.0009217294435647905, + "loss": 0.6439, + "step": 10340 + }, + { + "epoch": 0.5251868868271757, + "grad_norm": 0.0357418067321709, + "learning_rate": 0.0009216103998160389, + "loss": 0.6227, + "step": 10345 + }, + { + "epoch": 0.5254407229252072, + "grad_norm": 0.05508757844826437, + "learning_rate": 0.0009214912733072685, + "loss": 0.618, + "step": 10350 + }, + { + "epoch": 0.5256945590232387, + "grad_norm": 0.04548692629351668, + "learning_rate": 0.0009213720640618631, + "loss": 0.6661, + "step": 10355 + }, + { + "epoch": 0.5259483951212702, + "grad_norm": 0.03260375865114965, + "learning_rate": 0.0009212527721032226, + "loss": 0.6416, + "step": 10360 + }, + { + "epoch": 0.5262022312193017, + "grad_norm": 0.029374475221709984, + "learning_rate": 0.000921133397454764, + "loss": 0.5965, + "step": 10365 + }, + { + "epoch": 0.5264560673173332, + "grad_norm": 0.025766929716711365, + "learning_rate": 0.0009210139401399197, + "loss": 0.64, + "step": 10370 + }, + { + "epoch": 0.5267099034153647, + "grad_norm": 0.028038442454314576, + "learning_rate": 0.0009208944001821384, + "loss": 0.6088, + "step": 10375 + }, + { + "epoch": 0.5269637395133961, + "grad_norm": 0.052489144282257295, + "learning_rate": 0.0009207747776048855, + "loss": 0.6408, + "step": 10380 + }, + { + "epoch": 0.5272175756114277, + "grad_norm": 0.03444955545077113, + "learning_rate": 0.000920655072431642, + "loss": 0.5995, + "step": 10385 + }, + { + "epoch": 0.5274714117094592, + "grad_norm": 0.03088222308172369, + "learning_rate": 0.0009205352846859056, + "loss": 0.6194, + "step": 10390 + }, + { + "epoch": 0.5277252478074907, + "grad_norm": 0.027998757741213697, + "learning_rate": 0.0009204154143911903, + "loss": 0.63, + "step": 10395 + }, + { + "epoch": 0.5279790839055222, + "grad_norm": 0.04158454624606828, + "learning_rate": 0.0009202954615710256, + "loss": 0.5983, + "step": 10400 + }, + { + "epoch": 0.5282329200035537, + "grad_norm": 0.0453153646691045, + "learning_rate": 0.0009201754262489575, + "loss": 0.6328, + "step": 10405 + }, + { + "epoch": 0.5284867561015852, + "grad_norm": 0.04473664568912869, + "learning_rate": 0.0009200553084485491, + "loss": 0.6301, + "step": 10410 + }, + { + "epoch": 0.5287405921996167, + "grad_norm": 0.0382431324349534, + "learning_rate": 0.0009199351081933781, + "loss": 0.6, + "step": 10415 + }, + { + "epoch": 0.5289944282976482, + "grad_norm": 0.039893358245096724, + "learning_rate": 0.0009198148255070398, + "loss": 0.5886, + "step": 10420 + }, + { + "epoch": 0.5292482643956797, + "grad_norm": 0.058211501320811744, + "learning_rate": 0.0009196944604131448, + "loss": 0.5985, + "step": 10425 + }, + { + "epoch": 0.5295021004937112, + "grad_norm": 0.03851407689786308, + "learning_rate": 0.0009195740129353202, + "loss": 0.5908, + "step": 10430 + }, + { + "epoch": 0.5297559365917427, + "grad_norm": 0.038851954860420396, + "learning_rate": 0.0009194534830972092, + "loss": 0.6391, + "step": 10435 + }, + { + "epoch": 0.5300097726897742, + "grad_norm": 0.026937948392526747, + "learning_rate": 0.0009193328709224714, + "loss": 0.6123, + "step": 10440 + }, + { + "epoch": 0.5302636087878058, + "grad_norm": 0.036109446414125626, + "learning_rate": 0.0009192121764347822, + "loss": 0.5954, + "step": 10445 + }, + { + "epoch": 0.5305174448858372, + "grad_norm": 0.03316804876731099, + "learning_rate": 0.0009190913996578334, + "loss": 0.6145, + "step": 10450 + }, + { + "epoch": 0.5307712809838687, + "grad_norm": 0.06849212460462101, + "learning_rate": 0.000918970540615333, + "loss": 0.613, + "step": 10455 + }, + { + "epoch": 0.5310251170819003, + "grad_norm": 0.03323906997049166, + "learning_rate": 0.0009188495993310046, + "loss": 0.6341, + "step": 10460 + }, + { + "epoch": 0.5312789531799317, + "grad_norm": 0.03941007227631917, + "learning_rate": 0.0009187285758285889, + "loss": 0.5978, + "step": 10465 + }, + { + "epoch": 0.5315327892779632, + "grad_norm": 0.02635949534355945, + "learning_rate": 0.0009186074701318419, + "loss": 0.5906, + "step": 10470 + }, + { + "epoch": 0.5317866253759947, + "grad_norm": 0.04078016260737411, + "learning_rate": 0.0009184862822645359, + "loss": 0.6263, + "step": 10475 + }, + { + "epoch": 0.5320404614740262, + "grad_norm": 0.057867925606058655, + "learning_rate": 0.0009183650122504598, + "loss": 0.6272, + "step": 10480 + }, + { + "epoch": 0.5322942975720577, + "grad_norm": 0.037243395752825356, + "learning_rate": 0.0009182436601134184, + "loss": 0.6025, + "step": 10485 + }, + { + "epoch": 0.5325481336700892, + "grad_norm": 0.04821834594916435, + "learning_rate": 0.0009181222258772319, + "loss": 0.6018, + "step": 10490 + }, + { + "epoch": 0.5328019697681208, + "grad_norm": 0.028085738317345087, + "learning_rate": 0.0009180007095657379, + "loss": 0.5925, + "step": 10495 + }, + { + "epoch": 0.5330558058661522, + "grad_norm": 0.036071988511367975, + "learning_rate": 0.0009178791112027891, + "loss": 0.6027, + "step": 10500 + }, + { + "epoch": 0.5333096419641837, + "grad_norm": 0.02739881003703629, + "learning_rate": 0.0009177574308122547, + "loss": 0.6641, + "step": 10505 + }, + { + "epoch": 0.5335634780622153, + "grad_norm": 0.02822235924487654, + "learning_rate": 0.00091763566841802, + "loss": 0.6287, + "step": 10510 + }, + { + "epoch": 0.5338173141602467, + "grad_norm": 0.024121670310468826, + "learning_rate": 0.0009175138240439864, + "loss": 0.5854, + "step": 10515 + }, + { + "epoch": 0.5340711502582782, + "grad_norm": 0.025342117126993798, + "learning_rate": 0.0009173918977140713, + "loss": 0.5713, + "step": 10520 + }, + { + "epoch": 0.5343249863563098, + "grad_norm": 0.027321368031964118, + "learning_rate": 0.0009172698894522082, + "loss": 0.6106, + "step": 10525 + }, + { + "epoch": 0.5345788224543412, + "grad_norm": 0.03343234624185458, + "learning_rate": 0.0009171477992823467, + "loss": 0.6268, + "step": 10530 + }, + { + "epoch": 0.5348326585523727, + "grad_norm": 0.04732716404477403, + "learning_rate": 0.0009170256272284525, + "loss": 0.5807, + "step": 10535 + }, + { + "epoch": 0.5350864946504043, + "grad_norm": 0.02840525748237608, + "learning_rate": 0.0009169033733145074, + "loss": 0.6045, + "step": 10540 + }, + { + "epoch": 0.5353403307484358, + "grad_norm": 0.027475221513180086, + "learning_rate": 0.0009167810375645091, + "loss": 0.6323, + "step": 10545 + }, + { + "epoch": 0.5355941668464672, + "grad_norm": 0.025750551058873007, + "learning_rate": 0.0009166586200024717, + "loss": 0.5838, + "step": 10550 + }, + { + "epoch": 0.5358480029444987, + "grad_norm": 0.03765393114821948, + "learning_rate": 0.000916536120652425, + "loss": 0.6147, + "step": 10555 + }, + { + "epoch": 0.5361018390425303, + "grad_norm": 0.03715730591665516, + "learning_rate": 0.0009164135395384151, + "loss": 0.6005, + "step": 10560 + }, + { + "epoch": 0.5363556751405617, + "grad_norm": 0.03645041602808113, + "learning_rate": 0.0009162908766845041, + "loss": 0.5842, + "step": 10565 + }, + { + "epoch": 0.5366095112385932, + "grad_norm": 0.03100263105448387, + "learning_rate": 0.00091616813211477, + "loss": 0.5594, + "step": 10570 + }, + { + "epoch": 0.5368633473366248, + "grad_norm": 0.04226102686913694, + "learning_rate": 0.0009160453058533071, + "loss": 0.6151, + "step": 10575 + }, + { + "epoch": 0.5371171834346562, + "grad_norm": 0.02589182283128981, + "learning_rate": 0.0009159223979242253, + "loss": 0.614, + "step": 10580 + }, + { + "epoch": 0.5373710195326877, + "grad_norm": 0.028467877248238536, + "learning_rate": 0.0009157994083516511, + "loss": 0.6492, + "step": 10585 + }, + { + "epoch": 0.5376248556307193, + "grad_norm": 0.03135996868398204, + "learning_rate": 0.0009156763371597266, + "loss": 0.6065, + "step": 10590 + }, + { + "epoch": 0.5378786917287507, + "grad_norm": 0.040987082480228504, + "learning_rate": 0.0009155531843726101, + "loss": 0.6084, + "step": 10595 + }, + { + "epoch": 0.5381325278267822, + "grad_norm": 0.029712117528617986, + "learning_rate": 0.0009154299500144758, + "loss": 0.6086, + "step": 10600 + }, + { + "epoch": 0.5383863639248138, + "grad_norm": 0.05504579201351947, + "learning_rate": 0.0009153066341095142, + "loss": 0.6395, + "step": 10605 + }, + { + "epoch": 0.5386402000228453, + "grad_norm": 0.030518737256477527, + "learning_rate": 0.0009151832366819314, + "loss": 0.6192, + "step": 10610 + }, + { + "epoch": 0.5388940361208767, + "grad_norm": 0.029879557984338494, + "learning_rate": 0.0009150597577559496, + "loss": 0.668, + "step": 10615 + }, + { + "epoch": 0.5391478722189083, + "grad_norm": 0.03153018920722057, + "learning_rate": 0.0009149361973558075, + "loss": 0.5801, + "step": 10620 + }, + { + "epoch": 0.5394017083169398, + "grad_norm": 0.030512110696911215, + "learning_rate": 0.000914812555505759, + "loss": 0.5942, + "step": 10625 + }, + { + "epoch": 0.5396555444149712, + "grad_norm": 0.041125863080099585, + "learning_rate": 0.0009146888322300745, + "loss": 0.6002, + "step": 10630 + }, + { + "epoch": 0.5399093805130027, + "grad_norm": 0.054165606681991184, + "learning_rate": 0.0009145650275530404, + "loss": 0.5951, + "step": 10635 + }, + { + "epoch": 0.5401632166110343, + "grad_norm": 0.029366439611697406, + "learning_rate": 0.0009144411414989587, + "loss": 0.5834, + "step": 10640 + }, + { + "epoch": 0.5404170527090657, + "grad_norm": 0.04321665819488211, + "learning_rate": 0.0009143171740921479, + "loss": 0.61, + "step": 10645 + }, + { + "epoch": 0.5406708888070972, + "grad_norm": 0.02466718128736207, + "learning_rate": 0.0009141931253569418, + "loss": 0.6155, + "step": 10650 + }, + { + "epoch": 0.5409247249051288, + "grad_norm": 0.025734399782198193, + "learning_rate": 0.000914068995317691, + "loss": 0.6249, + "step": 10655 + }, + { + "epoch": 0.5411785610031603, + "grad_norm": 0.030956416519900224, + "learning_rate": 0.0009139447839987613, + "loss": 0.588, + "step": 10660 + }, + { + "epoch": 0.5414323971011917, + "grad_norm": 0.03328165539133208, + "learning_rate": 0.0009138204914245347, + "loss": 0.5812, + "step": 10665 + }, + { + "epoch": 0.5416862331992233, + "grad_norm": 0.03418976866258275, + "learning_rate": 0.0009136961176194094, + "loss": 0.6126, + "step": 10670 + }, + { + "epoch": 0.5419400692972548, + "grad_norm": 0.060922163184233144, + "learning_rate": 0.0009135716626077994, + "loss": 0.5858, + "step": 10675 + }, + { + "epoch": 0.5421939053952862, + "grad_norm": 0.0316654793096513, + "learning_rate": 0.0009134471264141345, + "loss": 0.6172, + "step": 10680 + }, + { + "epoch": 0.5424477414933178, + "grad_norm": 0.033052227554561796, + "learning_rate": 0.0009133225090628605, + "loss": 0.6176, + "step": 10685 + }, + { + "epoch": 0.5427015775913493, + "grad_norm": 0.034867711839285526, + "learning_rate": 0.0009131978105784394, + "loss": 0.6441, + "step": 10690 + }, + { + "epoch": 0.5429554136893807, + "grad_norm": 0.03242440613542117, + "learning_rate": 0.0009130730309853483, + "loss": 0.5871, + "step": 10695 + }, + { + "epoch": 0.5432092497874123, + "grad_norm": 0.02846877585722155, + "learning_rate": 0.0009129481703080816, + "loss": 0.636, + "step": 10700 + }, + { + "epoch": 0.5434630858854438, + "grad_norm": 0.03968673666286646, + "learning_rate": 0.0009128232285711482, + "loss": 0.6109, + "step": 10705 + }, + { + "epoch": 0.5437169219834753, + "grad_norm": 0.025092489100120463, + "learning_rate": 0.0009126982057990738, + "loss": 0.6156, + "step": 10710 + }, + { + "epoch": 0.5439707580815067, + "grad_norm": 0.03529191550732015, + "learning_rate": 0.0009125731020163998, + "loss": 0.5723, + "step": 10715 + }, + { + "epoch": 0.5442245941795383, + "grad_norm": 0.027050926910876526, + "learning_rate": 0.0009124479172476833, + "loss": 0.6045, + "step": 10720 + }, + { + "epoch": 0.5444784302775698, + "grad_norm": 0.07622157764619372, + "learning_rate": 0.0009123226515174976, + "loss": 0.5689, + "step": 10725 + }, + { + "epoch": 0.5447322663756012, + "grad_norm": 0.02751158017548396, + "learning_rate": 0.0009121973048504316, + "loss": 0.6213, + "step": 10730 + }, + { + "epoch": 0.5449861024736328, + "grad_norm": 0.049460936685531046, + "learning_rate": 0.0009120718772710903, + "loss": 0.6061, + "step": 10735 + }, + { + "epoch": 0.5452399385716643, + "grad_norm": 0.03544395471579363, + "learning_rate": 0.0009119463688040945, + "loss": 0.5933, + "step": 10740 + }, + { + "epoch": 0.5454937746696957, + "grad_norm": 0.03400989240869563, + "learning_rate": 0.0009118207794740809, + "loss": 0.5727, + "step": 10745 + }, + { + "epoch": 0.5457476107677273, + "grad_norm": 0.03151062837134726, + "learning_rate": 0.000911695109305702, + "loss": 0.5955, + "step": 10750 + }, + { + "epoch": 0.5460014468657588, + "grad_norm": 0.02463583148102344, + "learning_rate": 0.0009115693583236263, + "loss": 0.6051, + "step": 10755 + }, + { + "epoch": 0.5462552829637903, + "grad_norm": 0.033094296247287235, + "learning_rate": 0.0009114435265525381, + "loss": 0.5763, + "step": 10760 + }, + { + "epoch": 0.5465091190618218, + "grad_norm": 0.026074781871230777, + "learning_rate": 0.0009113176140171373, + "loss": 0.6216, + "step": 10765 + }, + { + "epoch": 0.5467629551598533, + "grad_norm": 0.03141881910628044, + "learning_rate": 0.0009111916207421402, + "loss": 0.606, + "step": 10770 + }, + { + "epoch": 0.5470167912578848, + "grad_norm": 0.029052049913770146, + "learning_rate": 0.0009110655467522786, + "loss": 0.6244, + "step": 10775 + }, + { + "epoch": 0.5472706273559163, + "grad_norm": 0.0251807877680953, + "learning_rate": 0.0009109393920723001, + "loss": 0.5854, + "step": 10780 + }, + { + "epoch": 0.5475244634539478, + "grad_norm": 0.03612511248761835, + "learning_rate": 0.0009108131567269684, + "loss": 0.601, + "step": 10785 + }, + { + "epoch": 0.5477782995519793, + "grad_norm": 0.05249632040568085, + "learning_rate": 0.0009106868407410627, + "loss": 0.6207, + "step": 10790 + }, + { + "epoch": 0.5480321356500107, + "grad_norm": 0.030858769777672986, + "learning_rate": 0.0009105604441393782, + "loss": 0.624, + "step": 10795 + }, + { + "epoch": 0.5482859717480423, + "grad_norm": 0.029487600818791027, + "learning_rate": 0.0009104339669467261, + "loss": 0.5926, + "step": 10800 + }, + { + "epoch": 0.5485398078460738, + "grad_norm": 0.03067140315629251, + "learning_rate": 0.0009103074091879331, + "loss": 0.623, + "step": 10805 + }, + { + "epoch": 0.5487936439441052, + "grad_norm": 0.02640097178132819, + "learning_rate": 0.0009101807708878418, + "loss": 0.6049, + "step": 10810 + }, + { + "epoch": 0.5490474800421368, + "grad_norm": 0.049525814735870075, + "learning_rate": 0.0009100540520713108, + "loss": 0.6012, + "step": 10815 + }, + { + "epoch": 0.5493013161401683, + "grad_norm": 0.04668719700489464, + "learning_rate": 0.0009099272527632142, + "loss": 0.6075, + "step": 10820 + }, + { + "epoch": 0.5495551522381998, + "grad_norm": 0.04879877503113339, + "learning_rate": 0.0009098003729884423, + "loss": 0.6007, + "step": 10825 + }, + { + "epoch": 0.5498089883362313, + "grad_norm": 0.035478008135277204, + "learning_rate": 0.0009096734127719007, + "loss": 0.6009, + "step": 10830 + }, + { + "epoch": 0.5500628244342628, + "grad_norm": 0.06071680497508226, + "learning_rate": 0.0009095463721385113, + "loss": 0.6103, + "step": 10835 + }, + { + "epoch": 0.5503166605322943, + "grad_norm": 0.06356999878241335, + "learning_rate": 0.0009094192511132116, + "loss": 0.6008, + "step": 10840 + }, + { + "epoch": 0.5505704966303258, + "grad_norm": 0.07522895375448929, + "learning_rate": 0.0009092920497209545, + "loss": 0.59, + "step": 10845 + }, + { + "epoch": 0.5508243327283573, + "grad_norm": 0.06152876042967086, + "learning_rate": 0.0009091647679867092, + "loss": 0.6016, + "step": 10850 + }, + { + "epoch": 0.5510781688263888, + "grad_norm": 1.435812648824866, + "learning_rate": 0.0009090374059354605, + "loss": 0.9246, + "step": 10855 + }, + { + "epoch": 0.5513320049244202, + "grad_norm": 0.16453277042657283, + "learning_rate": 0.0009089099635922089, + "loss": 0.7979, + "step": 10860 + }, + { + "epoch": 0.5515858410224518, + "grad_norm": 0.09400053941343282, + "learning_rate": 0.0009087824409819706, + "loss": 0.7172, + "step": 10865 + }, + { + "epoch": 0.5518396771204833, + "grad_norm": 0.037709382524332286, + "learning_rate": 0.0009086548381297778, + "loss": 0.7196, + "step": 10870 + }, + { + "epoch": 0.5520935132185149, + "grad_norm": 0.058763207110847336, + "learning_rate": 0.0009085271550606782, + "loss": 0.6644, + "step": 10875 + }, + { + "epoch": 0.5523473493165463, + "grad_norm": 0.03877532647328554, + "learning_rate": 0.0009083993917997354, + "loss": 0.6212, + "step": 10880 + }, + { + "epoch": 0.5526011854145778, + "grad_norm": 0.032910769522172735, + "learning_rate": 0.0009082715483720287, + "loss": 0.6176, + "step": 10885 + }, + { + "epoch": 0.5528550215126093, + "grad_norm": 0.06379896202591075, + "learning_rate": 0.000908143624802653, + "loss": 0.6868, + "step": 10890 + }, + { + "epoch": 0.5531088576106408, + "grad_norm": 0.03960368139013127, + "learning_rate": 0.0009080156211167192, + "loss": 0.595, + "step": 10895 + }, + { + "epoch": 0.5533626937086723, + "grad_norm": 0.03265424433724443, + "learning_rate": 0.0009078875373393538, + "loss": 0.649, + "step": 10900 + }, + { + "epoch": 0.5536165298067038, + "grad_norm": 0.03415971908723185, + "learning_rate": 0.0009077593734956988, + "loss": 0.6206, + "step": 10905 + }, + { + "epoch": 0.5538703659047353, + "grad_norm": 0.036488130956054673, + "learning_rate": 0.0009076311296109125, + "loss": 0.67, + "step": 10910 + }, + { + "epoch": 0.5541242020027668, + "grad_norm": 0.035320885747738756, + "learning_rate": 0.0009075028057101682, + "loss": 0.6229, + "step": 10915 + }, + { + "epoch": 0.5543780381007983, + "grad_norm": 0.027348385875276045, + "learning_rate": 0.0009073744018186554, + "loss": 0.6252, + "step": 10920 + }, + { + "epoch": 0.5546318741988299, + "grad_norm": 0.05990522832658699, + "learning_rate": 0.0009072459179615789, + "loss": 0.6792, + "step": 10925 + }, + { + "epoch": 0.5548857102968613, + "grad_norm": 0.025462128094323643, + "learning_rate": 0.0009071173541641598, + "loss": 0.5896, + "step": 10930 + }, + { + "epoch": 0.5551395463948928, + "grad_norm": 0.02807969122992142, + "learning_rate": 0.0009069887104516344, + "loss": 0.6309, + "step": 10935 + }, + { + "epoch": 0.5553933824929244, + "grad_norm": 0.036603755788089144, + "learning_rate": 0.0009068599868492549, + "loss": 0.6168, + "step": 10940 + }, + { + "epoch": 0.5556472185909558, + "grad_norm": 0.027661489675158593, + "learning_rate": 0.0009067311833822887, + "loss": 0.6306, + "step": 10945 + }, + { + "epoch": 0.5559010546889873, + "grad_norm": 0.03077694227440884, + "learning_rate": 0.0009066023000760198, + "loss": 0.6059, + "step": 10950 + }, + { + "epoch": 0.5561548907870189, + "grad_norm": 0.03144955523682315, + "learning_rate": 0.0009064733369557469, + "loss": 0.5914, + "step": 10955 + }, + { + "epoch": 0.5564087268850503, + "grad_norm": 0.02757118827722829, + "learning_rate": 0.0009063442940467852, + "loss": 0.6138, + "step": 10960 + }, + { + "epoch": 0.5566625629830818, + "grad_norm": 0.028124044394521205, + "learning_rate": 0.0009062151713744649, + "loss": 0.5849, + "step": 10965 + }, + { + "epoch": 0.5569163990811133, + "grad_norm": 0.025072368259489837, + "learning_rate": 0.0009060859689641323, + "loss": 0.6349, + "step": 10970 + }, + { + "epoch": 0.5571702351791449, + "grad_norm": 0.031444917396144835, + "learning_rate": 0.0009059566868411492, + "loss": 0.6198, + "step": 10975 + }, + { + "epoch": 0.5574240712771763, + "grad_norm": 0.04501045513342346, + "learning_rate": 0.0009058273250308929, + "loss": 0.6399, + "step": 10980 + }, + { + "epoch": 0.5576779073752078, + "grad_norm": 0.03025025685706769, + "learning_rate": 0.0009056978835587566, + "loss": 0.6388, + "step": 10985 + }, + { + "epoch": 0.5579317434732394, + "grad_norm": 0.03855276480190827, + "learning_rate": 0.0009055683624501489, + "loss": 0.6172, + "step": 10990 + }, + { + "epoch": 0.5581855795712708, + "grad_norm": 0.04172515052455696, + "learning_rate": 0.0009054387617304945, + "loss": 0.6024, + "step": 10995 + }, + { + "epoch": 0.5584394156693023, + "grad_norm": 0.02429228351814789, + "learning_rate": 0.0009053090814252327, + "loss": 0.5964, + "step": 11000 + }, + { + "epoch": 0.5586932517673339, + "grad_norm": 0.04151154242191302, + "learning_rate": 0.0009051793215598197, + "loss": 0.6044, + "step": 11005 + }, + { + "epoch": 0.5589470878653653, + "grad_norm": 0.024858734225047054, + "learning_rate": 0.0009050494821597264, + "loss": 0.6301, + "step": 11010 + }, + { + "epoch": 0.5592009239633968, + "grad_norm": 0.028072967963923535, + "learning_rate": 0.0009049195632504399, + "loss": 0.626, + "step": 11015 + }, + { + "epoch": 0.5594547600614284, + "grad_norm": 0.03193777274190929, + "learning_rate": 0.0009047895648574623, + "loss": 0.6255, + "step": 11020 + }, + { + "epoch": 0.5597085961594598, + "grad_norm": 0.026480125682836954, + "learning_rate": 0.0009046594870063118, + "loss": 0.6033, + "step": 11025 + }, + { + "epoch": 0.5599624322574913, + "grad_norm": 0.026097311141357767, + "learning_rate": 0.0009045293297225221, + "loss": 0.5809, + "step": 11030 + }, + { + "epoch": 0.5602162683555229, + "grad_norm": 0.0308704453992081, + "learning_rate": 0.0009043990930316424, + "loss": 0.6089, + "step": 11035 + }, + { + "epoch": 0.5604701044535544, + "grad_norm": 0.025288526434383552, + "learning_rate": 0.0009042687769592375, + "loss": 0.6248, + "step": 11040 + }, + { + "epoch": 0.5607239405515858, + "grad_norm": 0.024553453597092455, + "learning_rate": 0.0009041383815308877, + "loss": 0.598, + "step": 11045 + }, + { + "epoch": 0.5609777766496173, + "grad_norm": 0.030367718805119067, + "learning_rate": 0.0009040079067721889, + "loss": 0.5852, + "step": 11050 + }, + { + "epoch": 0.5612316127476489, + "grad_norm": 0.03317092988088747, + "learning_rate": 0.0009038773527087529, + "loss": 0.5776, + "step": 11055 + }, + { + "epoch": 0.5614854488456803, + "grad_norm": 0.05184744896110897, + "learning_rate": 0.0009037467193662068, + "loss": 0.6059, + "step": 11060 + }, + { + "epoch": 0.5617392849437118, + "grad_norm": 0.025577618854582287, + "learning_rate": 0.0009036160067701931, + "loss": 0.5846, + "step": 11065 + }, + { + "epoch": 0.5619931210417434, + "grad_norm": 0.03938538897439106, + "learning_rate": 0.00090348521494637, + "loss": 0.6368, + "step": 11070 + }, + { + "epoch": 0.5622469571397748, + "grad_norm": 0.038820715798665424, + "learning_rate": 0.0009033543439204114, + "loss": 0.609, + "step": 11075 + }, + { + "epoch": 0.5625007932378063, + "grad_norm": 0.029533488100105525, + "learning_rate": 0.0009032233937180067, + "loss": 0.5961, + "step": 11080 + }, + { + "epoch": 0.5627546293358379, + "grad_norm": 0.02379671510332352, + "learning_rate": 0.0009030923643648607, + "loss": 0.5998, + "step": 11085 + }, + { + "epoch": 0.5630084654338694, + "grad_norm": 0.023448978175139953, + "learning_rate": 0.0009029612558866938, + "loss": 0.5834, + "step": 11090 + }, + { + "epoch": 0.5632623015319008, + "grad_norm": 0.024265093856533426, + "learning_rate": 0.0009028300683092418, + "loss": 0.5921, + "step": 11095 + }, + { + "epoch": 0.5635161376299324, + "grad_norm": 0.024632811713908073, + "learning_rate": 0.0009026988016582564, + "loss": 0.6196, + "step": 11100 + }, + { + "epoch": 0.5637699737279639, + "grad_norm": 0.022302125548208863, + "learning_rate": 0.0009025674559595045, + "loss": 0.6236, + "step": 11105 + }, + { + "epoch": 0.5640238098259953, + "grad_norm": 0.036192999100420786, + "learning_rate": 0.0009024360312387687, + "loss": 0.609, + "step": 11110 + }, + { + "epoch": 0.5642776459240268, + "grad_norm": 0.0306562971571317, + "learning_rate": 0.0009023045275218467, + "loss": 0.5926, + "step": 11115 + }, + { + "epoch": 0.5645314820220584, + "grad_norm": 0.032739588825804515, + "learning_rate": 0.0009021729448345524, + "loss": 0.6067, + "step": 11120 + }, + { + "epoch": 0.5647853181200898, + "grad_norm": 0.02788797841269012, + "learning_rate": 0.0009020412832027146, + "loss": 0.6026, + "step": 11125 + }, + { + "epoch": 0.5650391542181213, + "grad_norm": 0.025558068319595086, + "learning_rate": 0.0009019095426521779, + "loss": 0.6021, + "step": 11130 + }, + { + "epoch": 0.5652929903161529, + "grad_norm": 0.025476402210346454, + "learning_rate": 0.0009017777232088023, + "loss": 0.6236, + "step": 11135 + }, + { + "epoch": 0.5655468264141844, + "grad_norm": 0.024392907820231798, + "learning_rate": 0.0009016458248984632, + "loss": 0.6497, + "step": 11140 + }, + { + "epoch": 0.5658006625122158, + "grad_norm": 0.02799976269471266, + "learning_rate": 0.0009015138477470516, + "loss": 0.601, + "step": 11145 + }, + { + "epoch": 0.5660544986102474, + "grad_norm": 0.027785695512840384, + "learning_rate": 0.0009013817917804743, + "loss": 0.6225, + "step": 11150 + }, + { + "epoch": 0.5663083347082789, + "grad_norm": 0.031205524445284426, + "learning_rate": 0.0009012496570246529, + "loss": 0.6179, + "step": 11155 + }, + { + "epoch": 0.5665621708063103, + "grad_norm": 0.025209600845023012, + "learning_rate": 0.0009011174435055247, + "loss": 0.5718, + "step": 11160 + }, + { + "epoch": 0.5668160069043419, + "grad_norm": 0.03060213292836761, + "learning_rate": 0.0009009851512490428, + "loss": 0.5973, + "step": 11165 + }, + { + "epoch": 0.5670698430023734, + "grad_norm": 0.03631452482840258, + "learning_rate": 0.0009008527802811754, + "loss": 0.5809, + "step": 11170 + }, + { + "epoch": 0.5673236791004048, + "grad_norm": 0.03908183938816148, + "learning_rate": 0.0009007203306279064, + "loss": 0.6072, + "step": 11175 + }, + { + "epoch": 0.5675775151984364, + "grad_norm": 0.035964383599072204, + "learning_rate": 0.0009005878023152348, + "loss": 0.6082, + "step": 11180 + }, + { + "epoch": 0.5678313512964679, + "grad_norm": 0.0306440737992479, + "learning_rate": 0.0009004551953691754, + "loss": 0.6095, + "step": 11185 + }, + { + "epoch": 0.5680851873944994, + "grad_norm": 0.0513744832338725, + "learning_rate": 0.000900322509815758, + "loss": 0.6013, + "step": 11190 + }, + { + "epoch": 0.5683390234925308, + "grad_norm": 0.025114635513695662, + "learning_rate": 0.0009001897456810286, + "loss": 0.6058, + "step": 11195 + }, + { + "epoch": 0.5685928595905624, + "grad_norm": 0.04276993258659403, + "learning_rate": 0.0009000569029910477, + "loss": 0.6343, + "step": 11200 + }, + { + "epoch": 0.5688466956885939, + "grad_norm": 0.03646375136050398, + "learning_rate": 0.0008999239817718918, + "loss": 0.6169, + "step": 11205 + }, + { + "epoch": 0.5691005317866253, + "grad_norm": 0.024107862227893833, + "learning_rate": 0.0008997909820496528, + "loss": 0.605, + "step": 11210 + }, + { + "epoch": 0.5693543678846569, + "grad_norm": 0.02312217074080101, + "learning_rate": 0.0008996579038504376, + "loss": 0.6016, + "step": 11215 + }, + { + "epoch": 0.5696082039826884, + "grad_norm": 0.026924470535448695, + "learning_rate": 0.0008995247472003691, + "loss": 0.6151, + "step": 11220 + }, + { + "epoch": 0.5698620400807198, + "grad_norm": 0.025057655726728318, + "learning_rate": 0.0008993915121255852, + "loss": 0.6265, + "step": 11225 + }, + { + "epoch": 0.5701158761787514, + "grad_norm": 0.025445367684530214, + "learning_rate": 0.0008992581986522392, + "loss": 0.6009, + "step": 11230 + }, + { + "epoch": 0.5703697122767829, + "grad_norm": 0.0382438609341841, + "learning_rate": 0.0008991248068064999, + "loss": 0.5777, + "step": 11235 + }, + { + "epoch": 0.5706235483748144, + "grad_norm": 0.03626958625239848, + "learning_rate": 0.0008989913366145515, + "loss": 0.5829, + "step": 11240 + }, + { + "epoch": 0.5708773844728459, + "grad_norm": 0.024880449607419988, + "learning_rate": 0.0008988577881025935, + "loss": 0.5972, + "step": 11245 + }, + { + "epoch": 0.5711312205708774, + "grad_norm": 0.02888281590301525, + "learning_rate": 0.0008987241612968406, + "loss": 0.615, + "step": 11250 + }, + { + "epoch": 0.5713850566689089, + "grad_norm": 0.03757091988973743, + "learning_rate": 0.0008985904562235234, + "loss": 0.5978, + "step": 11255 + }, + { + "epoch": 0.5716388927669404, + "grad_norm": 0.03374674197558035, + "learning_rate": 0.0008984566729088874, + "loss": 0.581, + "step": 11260 + }, + { + "epoch": 0.5718927288649719, + "grad_norm": 0.031246163529604482, + "learning_rate": 0.0008983228113791937, + "loss": 0.6174, + "step": 11265 + }, + { + "epoch": 0.5721465649630034, + "grad_norm": 0.03523115249240424, + "learning_rate": 0.0008981888716607184, + "loss": 0.5909, + "step": 11270 + }, + { + "epoch": 0.5724004010610348, + "grad_norm": 0.058253114080335025, + "learning_rate": 0.0008980548537797535, + "loss": 0.5974, + "step": 11275 + }, + { + "epoch": 0.5726542371590664, + "grad_norm": 0.03323643646387071, + "learning_rate": 0.0008979207577626058, + "loss": 0.6457, + "step": 11280 + }, + { + "epoch": 0.5729080732570979, + "grad_norm": 0.02636690022689109, + "learning_rate": 0.0008977865836355979, + "loss": 0.5843, + "step": 11285 + }, + { + "epoch": 0.5731619093551293, + "grad_norm": 0.02518146447116509, + "learning_rate": 0.0008976523314250672, + "loss": 0.5996, + "step": 11290 + }, + { + "epoch": 0.5734157454531609, + "grad_norm": 0.03063657652619607, + "learning_rate": 0.0008975180011573669, + "loss": 0.5909, + "step": 11295 + }, + { + "epoch": 0.5736695815511924, + "grad_norm": 0.033555485623473526, + "learning_rate": 0.0008973835928588656, + "loss": 0.5984, + "step": 11300 + }, + { + "epoch": 0.5739234176492239, + "grad_norm": 0.026858437506102287, + "learning_rate": 0.0008972491065559467, + "loss": 0.5745, + "step": 11305 + }, + { + "epoch": 0.5741772537472554, + "grad_norm": 0.041900500650273065, + "learning_rate": 0.0008971145422750094, + "loss": 0.5525, + "step": 11310 + }, + { + "epoch": 0.5744310898452869, + "grad_norm": 0.023692636987519583, + "learning_rate": 0.0008969799000424676, + "loss": 0.6412, + "step": 11315 + }, + { + "epoch": 0.5746849259433184, + "grad_norm": 0.03520820653367125, + "learning_rate": 0.0008968451798847513, + "loss": 0.5895, + "step": 11320 + }, + { + "epoch": 0.5749387620413499, + "grad_norm": 0.045950374099341315, + "learning_rate": 0.0008967103818283051, + "loss": 0.6144, + "step": 11325 + }, + { + "epoch": 0.5751925981393814, + "grad_norm": 0.03013110218086917, + "learning_rate": 0.0008965755058995896, + "loss": 0.6167, + "step": 11330 + }, + { + "epoch": 0.5754464342374129, + "grad_norm": 0.028677449123263064, + "learning_rate": 0.0008964405521250798, + "loss": 0.5907, + "step": 11335 + }, + { + "epoch": 0.5757002703354444, + "grad_norm": 0.06897665640762995, + "learning_rate": 0.0008963055205312667, + "loss": 0.6008, + "step": 11340 + }, + { + "epoch": 0.5759541064334759, + "grad_norm": 0.039108751736186075, + "learning_rate": 0.0008961704111446564, + "loss": 0.5963, + "step": 11345 + }, + { + "epoch": 0.5762079425315074, + "grad_norm": 0.02497304268795272, + "learning_rate": 0.00089603522399177, + "loss": 0.5648, + "step": 11350 + }, + { + "epoch": 0.576461778629539, + "grad_norm": 0.02409915928784757, + "learning_rate": 0.0008958999590991441, + "loss": 0.6111, + "step": 11355 + }, + { + "epoch": 0.5767156147275704, + "grad_norm": 0.023423833712250722, + "learning_rate": 0.0008957646164933307, + "loss": 0.6034, + "step": 11360 + }, + { + "epoch": 0.5769694508256019, + "grad_norm": 0.02939021201285094, + "learning_rate": 0.0008956291962008967, + "loss": 0.601, + "step": 11365 + }, + { + "epoch": 0.5772232869236334, + "grad_norm": 0.025113894696930674, + "learning_rate": 0.0008954936982484245, + "loss": 0.5741, + "step": 11370 + }, + { + "epoch": 0.5774771230216649, + "grad_norm": 0.03523771168659597, + "learning_rate": 0.0008953581226625116, + "loss": 0.5955, + "step": 11375 + }, + { + "epoch": 0.5777309591196964, + "grad_norm": 0.033018941463141716, + "learning_rate": 0.000895222469469771, + "loss": 0.6561, + "step": 11380 + }, + { + "epoch": 0.5779847952177279, + "grad_norm": 0.03870222645684432, + "learning_rate": 0.0008950867386968305, + "loss": 0.5742, + "step": 11385 + }, + { + "epoch": 0.5782386313157594, + "grad_norm": 0.033035287220069445, + "learning_rate": 0.0008949509303703336, + "loss": 0.6268, + "step": 11390 + }, + { + "epoch": 0.5784924674137909, + "grad_norm": 0.02535187944282503, + "learning_rate": 0.0008948150445169386, + "loss": 0.6145, + "step": 11395 + }, + { + "epoch": 0.5787463035118224, + "grad_norm": 0.03673161990221083, + "learning_rate": 0.0008946790811633193, + "loss": 0.5821, + "step": 11400 + }, + { + "epoch": 0.579000139609854, + "grad_norm": 0.027797183940164883, + "learning_rate": 0.0008945430403361647, + "loss": 0.6329, + "step": 11405 + }, + { + "epoch": 0.5792539757078854, + "grad_norm": 0.024400895985974023, + "learning_rate": 0.0008944069220621788, + "loss": 0.5537, + "step": 11410 + }, + { + "epoch": 0.5795078118059169, + "grad_norm": 0.029540130582463044, + "learning_rate": 0.000894270726368081, + "loss": 0.6305, + "step": 11415 + }, + { + "epoch": 0.5797616479039485, + "grad_norm": 0.03403617081620501, + "learning_rate": 0.0008941344532806057, + "loss": 0.6018, + "step": 11420 + }, + { + "epoch": 0.5800154840019799, + "grad_norm": 0.036771472648533886, + "learning_rate": 0.000893998102826503, + "loss": 0.6164, + "step": 11425 + }, + { + "epoch": 0.5802693201000114, + "grad_norm": 0.050072623532946885, + "learning_rate": 0.0008938616750325375, + "loss": 0.5806, + "step": 11430 + }, + { + "epoch": 0.580523156198043, + "grad_norm": 0.031054510529734854, + "learning_rate": 0.0008937251699254893, + "loss": 0.6024, + "step": 11435 + }, + { + "epoch": 0.5807769922960744, + "grad_norm": 0.02607242206947156, + "learning_rate": 0.0008935885875321539, + "loss": 0.5976, + "step": 11440 + }, + { + "epoch": 0.5810308283941059, + "grad_norm": 0.025700401652453922, + "learning_rate": 0.0008934519278793416, + "loss": 0.6159, + "step": 11445 + }, + { + "epoch": 0.5812846644921374, + "grad_norm": 0.03334981900638615, + "learning_rate": 0.0008933151909938778, + "loss": 0.5664, + "step": 11450 + }, + { + "epoch": 0.581538500590169, + "grad_norm": 0.03470397493711404, + "learning_rate": 0.0008931783769026036, + "loss": 0.6065, + "step": 11455 + }, + { + "epoch": 0.5817923366882004, + "grad_norm": 0.040190824236687406, + "learning_rate": 0.0008930414856323747, + "loss": 0.5918, + "step": 11460 + }, + { + "epoch": 0.5820461727862319, + "grad_norm": 0.02671898679248395, + "learning_rate": 0.0008929045172100624, + "loss": 0.5995, + "step": 11465 + }, + { + "epoch": 0.5823000088842635, + "grad_norm": 0.06746663315212366, + "learning_rate": 0.0008927674716625527, + "loss": 0.6212, + "step": 11470 + }, + { + "epoch": 0.5825538449822949, + "grad_norm": 0.03231819490055282, + "learning_rate": 0.0008926303490167471, + "loss": 0.6091, + "step": 11475 + }, + { + "epoch": 0.5828076810803264, + "grad_norm": 0.028477552470271787, + "learning_rate": 0.0008924931492995619, + "loss": 0.6107, + "step": 11480 + }, + { + "epoch": 0.583061517178358, + "grad_norm": 0.030545895434276424, + "learning_rate": 0.000892355872537929, + "loss": 0.6016, + "step": 11485 + }, + { + "epoch": 0.5833153532763894, + "grad_norm": 0.02764360721101484, + "learning_rate": 0.0008922185187587949, + "loss": 0.6014, + "step": 11490 + }, + { + "epoch": 0.5835691893744209, + "grad_norm": 0.051972230411543044, + "learning_rate": 0.0008920810879891217, + "loss": 0.6184, + "step": 11495 + }, + { + "epoch": 0.5838230254724525, + "grad_norm": 0.049601905584870654, + "learning_rate": 0.0008919435802558862, + "loss": 0.5848, + "step": 11500 + }, + { + "epoch": 0.5840768615704839, + "grad_norm": 0.5732510853690319, + "learning_rate": 0.0008918059955860803, + "loss": 0.6587, + "step": 11505 + }, + { + "epoch": 0.5843306976685154, + "grad_norm": 0.042150081750240605, + "learning_rate": 0.0008916683340067116, + "loss": 0.625, + "step": 11510 + }, + { + "epoch": 0.584584533766547, + "grad_norm": 0.05774925217438834, + "learning_rate": 0.0008915305955448021, + "loss": 0.6199, + "step": 11515 + }, + { + "epoch": 0.5848383698645785, + "grad_norm": 0.04160871680886203, + "learning_rate": 0.0008913927802273894, + "loss": 0.6081, + "step": 11520 + }, + { + "epoch": 0.5850922059626099, + "grad_norm": 0.03323206015318204, + "learning_rate": 0.0008912548880815256, + "loss": 0.6115, + "step": 11525 + }, + { + "epoch": 0.5853460420606414, + "grad_norm": 0.04576230402616482, + "learning_rate": 0.0008911169191342785, + "loss": 0.6077, + "step": 11530 + }, + { + "epoch": 0.585599878158673, + "grad_norm": 0.09158330505750234, + "learning_rate": 0.0008909788734127307, + "loss": 0.6539, + "step": 11535 + }, + { + "epoch": 0.5858537142567044, + "grad_norm": 0.02861387432236888, + "learning_rate": 0.00089084075094398, + "loss": 0.6305, + "step": 11540 + }, + { + "epoch": 0.5861075503547359, + "grad_norm": 0.037516416748120646, + "learning_rate": 0.0008907025517551388, + "loss": 0.6258, + "step": 11545 + }, + { + "epoch": 0.5863613864527675, + "grad_norm": 0.04086135649565531, + "learning_rate": 0.0008905642758733352, + "loss": 0.599, + "step": 11550 + }, + { + "epoch": 0.5866152225507989, + "grad_norm": 0.03266087385037055, + "learning_rate": 0.000890425923325712, + "loss": 0.6342, + "step": 11555 + }, + { + "epoch": 0.5868690586488304, + "grad_norm": 0.04351600531336264, + "learning_rate": 0.0008902874941394271, + "loss": 0.623, + "step": 11560 + }, + { + "epoch": 0.587122894746862, + "grad_norm": 0.04031763253939243, + "learning_rate": 0.0008901489883416535, + "loss": 0.6061, + "step": 11565 + }, + { + "epoch": 0.5873767308448935, + "grad_norm": 0.03024842628718133, + "learning_rate": 0.0008900104059595791, + "loss": 0.5995, + "step": 11570 + }, + { + "epoch": 0.5876305669429249, + "grad_norm": 0.03277584675848276, + "learning_rate": 0.000889871747020407, + "loss": 0.5808, + "step": 11575 + }, + { + "epoch": 0.5878844030409565, + "grad_norm": 0.025589433458913628, + "learning_rate": 0.0008897330115513553, + "loss": 0.6386, + "step": 11580 + }, + { + "epoch": 0.588138239138988, + "grad_norm": 0.04730997143858022, + "learning_rate": 0.0008895941995796569, + "loss": 0.6175, + "step": 11585 + }, + { + "epoch": 0.5883920752370194, + "grad_norm": 0.02716519389485637, + "learning_rate": 0.0008894553111325601, + "loss": 0.6319, + "step": 11590 + }, + { + "epoch": 0.588645911335051, + "grad_norm": 0.029548377642727466, + "learning_rate": 0.0008893163462373279, + "loss": 0.6307, + "step": 11595 + }, + { + "epoch": 0.5888997474330825, + "grad_norm": 0.029441752118062915, + "learning_rate": 0.0008891773049212387, + "loss": 0.604, + "step": 11600 + }, + { + "epoch": 0.5891535835311139, + "grad_norm": 0.03547180982249062, + "learning_rate": 0.000889038187211585, + "loss": 0.615, + "step": 11605 + }, + { + "epoch": 0.5894074196291454, + "grad_norm": 0.025520246542906348, + "learning_rate": 0.0008888989931356754, + "loss": 0.5925, + "step": 11610 + }, + { + "epoch": 0.589661255727177, + "grad_norm": 0.032232116965678355, + "learning_rate": 0.0008887597227208331, + "loss": 0.618, + "step": 11615 + }, + { + "epoch": 0.5899150918252085, + "grad_norm": 0.026107740044775674, + "learning_rate": 0.0008886203759943957, + "loss": 0.6311, + "step": 11620 + }, + { + "epoch": 0.5901689279232399, + "grad_norm": 0.028672705224940687, + "learning_rate": 0.0008884809529837167, + "loss": 0.5692, + "step": 11625 + }, + { + "epoch": 0.5904227640212715, + "grad_norm": 0.0258655607946437, + "learning_rate": 0.0008883414537161638, + "loss": 0.6251, + "step": 11630 + }, + { + "epoch": 0.590676600119303, + "grad_norm": 0.028780253182444794, + "learning_rate": 0.0008882018782191204, + "loss": 0.6016, + "step": 11635 + }, + { + "epoch": 0.5909304362173344, + "grad_norm": 0.03331921506050773, + "learning_rate": 0.0008880622265199841, + "loss": 0.6073, + "step": 11640 + }, + { + "epoch": 0.591184272315366, + "grad_norm": 0.4723036335382741, + "learning_rate": 0.0008879224986461681, + "loss": 0.668, + "step": 11645 + }, + { + "epoch": 0.5914381084133975, + "grad_norm": 0.06636323581751623, + "learning_rate": 0.0008877826946251002, + "loss": 0.6493, + "step": 11650 + }, + { + "epoch": 0.5916919445114289, + "grad_norm": 0.06678436419997426, + "learning_rate": 0.0008876428144842231, + "loss": 0.5844, + "step": 11655 + }, + { + "epoch": 0.5919457806094605, + "grad_norm": 0.048242208081622634, + "learning_rate": 0.0008875028582509948, + "loss": 0.606, + "step": 11660 + }, + { + "epoch": 0.592199616707492, + "grad_norm": 0.06276218448082216, + "learning_rate": 0.0008873628259528878, + "loss": 0.6185, + "step": 11665 + }, + { + "epoch": 0.5924534528055235, + "grad_norm": 0.02996611743977883, + "learning_rate": 0.0008872227176173899, + "loss": 0.6132, + "step": 11670 + }, + { + "epoch": 0.592707288903555, + "grad_norm": 0.025393102232066908, + "learning_rate": 0.0008870825332720036, + "loss": 0.6025, + "step": 11675 + }, + { + "epoch": 0.5929611250015865, + "grad_norm": 0.04284571854613707, + "learning_rate": 0.0008869422729442465, + "loss": 0.6017, + "step": 11680 + }, + { + "epoch": 0.593214961099618, + "grad_norm": 0.03802645828166949, + "learning_rate": 0.0008868019366616508, + "loss": 0.6635, + "step": 11685 + }, + { + "epoch": 0.5934687971976494, + "grad_norm": 0.02382312826786996, + "learning_rate": 0.0008866615244517639, + "loss": 0.569, + "step": 11690 + }, + { + "epoch": 0.593722633295681, + "grad_norm": 0.027194057612111737, + "learning_rate": 0.000886521036342148, + "loss": 0.6323, + "step": 11695 + }, + { + "epoch": 0.5939764693937125, + "grad_norm": 0.030713511992266334, + "learning_rate": 0.0008863804723603803, + "loss": 0.5997, + "step": 11700 + }, + { + "epoch": 0.5942303054917439, + "grad_norm": 0.027258233011688147, + "learning_rate": 0.0008862398325340526, + "loss": 0.6317, + "step": 11705 + }, + { + "epoch": 0.5944841415897755, + "grad_norm": 0.03926742498370449, + "learning_rate": 0.0008860991168907721, + "loss": 0.6199, + "step": 11710 + }, + { + "epoch": 0.594737977687807, + "grad_norm": 0.043600047694201435, + "learning_rate": 0.0008859583254581605, + "loss": 0.6552, + "step": 11715 + }, + { + "epoch": 0.5949918137858384, + "grad_norm": 0.030006140812709456, + "learning_rate": 0.0008858174582638543, + "loss": 0.608, + "step": 11720 + }, + { + "epoch": 0.59524564988387, + "grad_norm": 0.022938563538545016, + "learning_rate": 0.0008856765153355051, + "loss": 0.6107, + "step": 11725 + }, + { + "epoch": 0.5954994859819015, + "grad_norm": 0.029591425378169715, + "learning_rate": 0.0008855354967007793, + "loss": 0.5701, + "step": 11730 + }, + { + "epoch": 0.595753322079933, + "grad_norm": 0.037546799230365954, + "learning_rate": 0.0008853944023873581, + "loss": 0.6099, + "step": 11735 + }, + { + "epoch": 0.5960071581779645, + "grad_norm": 0.030940988076782326, + "learning_rate": 0.0008852532324229379, + "loss": 0.6363, + "step": 11740 + }, + { + "epoch": 0.596260994275996, + "grad_norm": 0.02823182925476383, + "learning_rate": 0.0008851119868352292, + "loss": 0.6331, + "step": 11745 + }, + { + "epoch": 0.5965148303740275, + "grad_norm": 0.03156718715990183, + "learning_rate": 0.000884970665651958, + "loss": 0.6566, + "step": 11750 + }, + { + "epoch": 0.596768666472059, + "grad_norm": 0.041051536536051314, + "learning_rate": 0.0008848292689008653, + "loss": 0.644, + "step": 11755 + }, + { + "epoch": 0.5970225025700905, + "grad_norm": 0.023005223765102525, + "learning_rate": 0.0008846877966097059, + "loss": 0.592, + "step": 11760 + }, + { + "epoch": 0.597276338668122, + "grad_norm": 0.027109360668611315, + "learning_rate": 0.0008845462488062506, + "loss": 0.5873, + "step": 11765 + }, + { + "epoch": 0.5975301747661534, + "grad_norm": 0.027879918337945408, + "learning_rate": 0.0008844046255182844, + "loss": 0.6062, + "step": 11770 + }, + { + "epoch": 0.597784010864185, + "grad_norm": 0.03759374089636192, + "learning_rate": 0.0008842629267736072, + "loss": 0.5949, + "step": 11775 + }, + { + "epoch": 0.5980378469622165, + "grad_norm": 0.02369923860074637, + "learning_rate": 0.0008841211526000339, + "loss": 0.6322, + "step": 11780 + }, + { + "epoch": 0.598291683060248, + "grad_norm": 0.028051536267109798, + "learning_rate": 0.0008839793030253937, + "loss": 0.6232, + "step": 11785 + }, + { + "epoch": 0.5985455191582795, + "grad_norm": 0.056018315371266905, + "learning_rate": 0.0008838373780775315, + "loss": 0.6123, + "step": 11790 + }, + { + "epoch": 0.598799355256311, + "grad_norm": 0.03552958483190999, + "learning_rate": 0.000883695377784306, + "loss": 0.5976, + "step": 11795 + }, + { + "epoch": 0.5990531913543425, + "grad_norm": 0.032759803712457444, + "learning_rate": 0.0008835533021735914, + "loss": 0.5939, + "step": 11800 + }, + { + "epoch": 0.599307027452374, + "grad_norm": 0.049806073645126195, + "learning_rate": 0.0008834111512732763, + "loss": 0.6214, + "step": 11805 + }, + { + "epoch": 0.5995608635504055, + "grad_norm": 0.02802494107222027, + "learning_rate": 0.0008832689251112645, + "loss": 0.6295, + "step": 11810 + }, + { + "epoch": 0.599814699648437, + "grad_norm": 0.030855803033759243, + "learning_rate": 0.0008831266237154738, + "loss": 0.6435, + "step": 11815 + }, + { + "epoch": 0.6000685357464685, + "grad_norm": 0.024861923925470025, + "learning_rate": 0.0008829842471138376, + "loss": 0.6032, + "step": 11820 + }, + { + "epoch": 0.6003223718445, + "grad_norm": 0.02762869776919164, + "learning_rate": 0.0008828417953343035, + "loss": 0.5834, + "step": 11825 + }, + { + "epoch": 0.6005762079425315, + "grad_norm": 0.029202677438373507, + "learning_rate": 0.0008826992684048344, + "loss": 0.5823, + "step": 11830 + }, + { + "epoch": 0.6008300440405631, + "grad_norm": 0.04246057047785471, + "learning_rate": 0.0008825566663534074, + "loss": 0.5808, + "step": 11835 + }, + { + "epoch": 0.6010838801385945, + "grad_norm": 0.02677918683034048, + "learning_rate": 0.0008824139892080145, + "loss": 0.589, + "step": 11840 + }, + { + "epoch": 0.601337716236626, + "grad_norm": 0.022623912880773735, + "learning_rate": 0.0008822712369966628, + "loss": 0.5985, + "step": 11845 + }, + { + "epoch": 0.6015915523346576, + "grad_norm": 0.03863218744236265, + "learning_rate": 0.0008821284097473734, + "loss": 0.5964, + "step": 11850 + }, + { + "epoch": 0.601845388432689, + "grad_norm": 0.026457485001171663, + "learning_rate": 0.000881985507488183, + "loss": 0.592, + "step": 11855 + }, + { + "epoch": 0.6020992245307205, + "grad_norm": 0.03158510312459021, + "learning_rate": 0.0008818425302471424, + "loss": 0.5976, + "step": 11860 + }, + { + "epoch": 0.602353060628752, + "grad_norm": 0.044049060218755355, + "learning_rate": 0.0008816994780523175, + "loss": 0.5742, + "step": 11865 + }, + { + "epoch": 0.6026068967267835, + "grad_norm": 0.02599413568481521, + "learning_rate": 0.0008815563509317883, + "loss": 0.5854, + "step": 11870 + }, + { + "epoch": 0.602860732824815, + "grad_norm": 0.03864798470702778, + "learning_rate": 0.0008814131489136506, + "loss": 0.5732, + "step": 11875 + }, + { + "epoch": 0.6031145689228465, + "grad_norm": 0.032289876389169754, + "learning_rate": 0.0008812698720260135, + "loss": 0.621, + "step": 11880 + }, + { + "epoch": 0.6033684050208781, + "grad_norm": 0.05318024428744845, + "learning_rate": 0.000881126520297002, + "loss": 0.5706, + "step": 11885 + }, + { + "epoch": 0.6036222411189095, + "grad_norm": 0.03086598470372497, + "learning_rate": 0.0008809830937547554, + "loss": 0.601, + "step": 11890 + }, + { + "epoch": 0.603876077216941, + "grad_norm": 0.025731078946828065, + "learning_rate": 0.0008808395924274274, + "loss": 0.5996, + "step": 11895 + }, + { + "epoch": 0.6041299133149726, + "grad_norm": 0.025339932657364943, + "learning_rate": 0.0008806960163431866, + "loss": 0.601, + "step": 11900 + }, + { + "epoch": 0.604383749413004, + "grad_norm": 0.037850633222621166, + "learning_rate": 0.0008805523655302164, + "loss": 0.5848, + "step": 11905 + }, + { + "epoch": 0.6046375855110355, + "grad_norm": 0.026694596427904598, + "learning_rate": 0.0008804086400167146, + "loss": 0.6027, + "step": 11910 + }, + { + "epoch": 0.6048914216090671, + "grad_norm": 0.028977129692472393, + "learning_rate": 0.0008802648398308939, + "loss": 0.5947, + "step": 11915 + }, + { + "epoch": 0.6051452577070985, + "grad_norm": 0.02467551449885606, + "learning_rate": 0.0008801209650009813, + "loss": 0.6287, + "step": 11920 + }, + { + "epoch": 0.60539909380513, + "grad_norm": 0.024282792825939505, + "learning_rate": 0.0008799770155552192, + "loss": 0.5401, + "step": 11925 + }, + { + "epoch": 0.6056529299031616, + "grad_norm": 0.04118279824961832, + "learning_rate": 0.0008798329915218638, + "loss": 0.6076, + "step": 11930 + }, + { + "epoch": 0.605906766001193, + "grad_norm": 0.025423020492523492, + "learning_rate": 0.0008796888929291864, + "loss": 0.5958, + "step": 11935 + }, + { + "epoch": 0.6061606020992245, + "grad_norm": 0.02499168872123397, + "learning_rate": 0.0008795447198054729, + "loss": 0.6043, + "step": 11940 + }, + { + "epoch": 0.606414438197256, + "grad_norm": 0.042162291687962206, + "learning_rate": 0.0008794004721790235, + "loss": 0.589, + "step": 11945 + }, + { + "epoch": 0.6066682742952876, + "grad_norm": 0.03291178818373513, + "learning_rate": 0.0008792561500781535, + "loss": 0.5671, + "step": 11950 + }, + { + "epoch": 0.606922110393319, + "grad_norm": 0.0376953507530463, + "learning_rate": 0.0008791117535311928, + "loss": 0.6097, + "step": 11955 + }, + { + "epoch": 0.6071759464913505, + "grad_norm": 0.025349852157537867, + "learning_rate": 0.0008789672825664854, + "loss": 0.5582, + "step": 11960 + }, + { + "epoch": 0.6074297825893821, + "grad_norm": 0.02634738699215234, + "learning_rate": 0.0008788227372123902, + "loss": 0.6388, + "step": 11965 + }, + { + "epoch": 0.6076836186874135, + "grad_norm": 0.030461346907874626, + "learning_rate": 0.0008786781174972811, + "loss": 0.6106, + "step": 11970 + }, + { + "epoch": 0.607937454785445, + "grad_norm": 0.033633731377443056, + "learning_rate": 0.0008785334234495459, + "loss": 0.5998, + "step": 11975 + }, + { + "epoch": 0.6081912908834766, + "grad_norm": 0.022923564848087382, + "learning_rate": 0.0008783886550975872, + "loss": 0.547, + "step": 11980 + }, + { + "epoch": 0.608445126981508, + "grad_norm": 0.029198962706889067, + "learning_rate": 0.0008782438124698229, + "loss": 0.618, + "step": 11985 + }, + { + "epoch": 0.6086989630795395, + "grad_norm": 0.026047548738704816, + "learning_rate": 0.0008780988955946843, + "loss": 0.5686, + "step": 11990 + }, + { + "epoch": 0.6089527991775711, + "grad_norm": 0.02660176396313372, + "learning_rate": 0.0008779539045006182, + "loss": 0.5884, + "step": 11995 + }, + { + "epoch": 0.6092066352756026, + "grad_norm": 0.030589674203168663, + "learning_rate": 0.0008778088392160853, + "loss": 0.6039, + "step": 12000 + }, + { + "epoch": 0.609460471373634, + "grad_norm": 0.050693411471426746, + "learning_rate": 0.0008776636997695615, + "loss": 0.6013, + "step": 12005 + }, + { + "epoch": 0.6097143074716656, + "grad_norm": 0.026446497681540608, + "learning_rate": 0.0008775184861895369, + "loss": 0.5416, + "step": 12010 + }, + { + "epoch": 0.6099681435696971, + "grad_norm": 0.1932080655888927, + "learning_rate": 0.0008773731985045162, + "loss": 0.5818, + "step": 12015 + }, + { + "epoch": 0.6102219796677285, + "grad_norm": 0.027856228756609126, + "learning_rate": 0.0008772278367430185, + "loss": 0.5952, + "step": 12020 + }, + { + "epoch": 0.61047581576576, + "grad_norm": 0.04117569443549469, + "learning_rate": 0.0008770824009335775, + "loss": 0.5738, + "step": 12025 + }, + { + "epoch": 0.6107296518637916, + "grad_norm": 0.025174168899307972, + "learning_rate": 0.000876936891104742, + "loss": 0.5825, + "step": 12030 + }, + { + "epoch": 0.610983487961823, + "grad_norm": 0.024070740808714138, + "learning_rate": 0.0008767913072850743, + "loss": 0.5766, + "step": 12035 + }, + { + "epoch": 0.6112373240598545, + "grad_norm": 0.022925340728893576, + "learning_rate": 0.0008766456495031521, + "loss": 0.6117, + "step": 12040 + }, + { + "epoch": 0.6114911601578861, + "grad_norm": 0.02650268728322704, + "learning_rate": 0.0008764999177875673, + "loss": 0.594, + "step": 12045 + }, + { + "epoch": 0.6117449962559176, + "grad_norm": 0.04585848245773159, + "learning_rate": 0.0008763541121669263, + "loss": 0.5841, + "step": 12050 + }, + { + "epoch": 0.611998832353949, + "grad_norm": 0.028232685184612846, + "learning_rate": 0.0008762082326698498, + "loss": 0.5985, + "step": 12055 + }, + { + "epoch": 0.6122526684519806, + "grad_norm": 0.02453819199214054, + "learning_rate": 0.0008760622793249735, + "loss": 0.6275, + "step": 12060 + }, + { + "epoch": 0.6125065045500121, + "grad_norm": 0.03043290217761071, + "learning_rate": 0.0008759162521609472, + "loss": 0.5899, + "step": 12065 + }, + { + "epoch": 0.6127603406480435, + "grad_norm": 0.035932514886264776, + "learning_rate": 0.0008757701512064351, + "loss": 0.5917, + "step": 12070 + }, + { + "epoch": 0.6130141767460751, + "grad_norm": 0.024922837264232393, + "learning_rate": 0.0008756239764901165, + "loss": 0.5963, + "step": 12075 + }, + { + "epoch": 0.6132680128441066, + "grad_norm": 0.021912894422967358, + "learning_rate": 0.0008754777280406845, + "loss": 0.5423, + "step": 12080 + }, + { + "epoch": 0.613521848942138, + "grad_norm": 0.03153730727693658, + "learning_rate": 0.0008753314058868469, + "loss": 0.6127, + "step": 12085 + }, + { + "epoch": 0.6137756850401695, + "grad_norm": 0.026200893072720284, + "learning_rate": 0.0008751850100573262, + "loss": 0.5789, + "step": 12090 + }, + { + "epoch": 0.6140295211382011, + "grad_norm": 0.03394058282414533, + "learning_rate": 0.000875038540580859, + "loss": 0.5715, + "step": 12095 + }, + { + "epoch": 0.6142833572362326, + "grad_norm": 0.027581562059680105, + "learning_rate": 0.0008748919974861967, + "loss": 0.5693, + "step": 12100 + }, + { + "epoch": 0.614537193334264, + "grad_norm": 0.029272850241735242, + "learning_rate": 0.0008747453808021047, + "loss": 0.6127, + "step": 12105 + }, + { + "epoch": 0.6147910294322956, + "grad_norm": 0.02771381768749453, + "learning_rate": 0.0008745986905573634, + "loss": 0.5814, + "step": 12110 + }, + { + "epoch": 0.6150448655303271, + "grad_norm": 0.02582305425809761, + "learning_rate": 0.0008744519267807673, + "loss": 0.5818, + "step": 12115 + }, + { + "epoch": 0.6152987016283585, + "grad_norm": 0.03650039105604249, + "learning_rate": 0.0008743050895011253, + "loss": 0.5948, + "step": 12120 + }, + { + "epoch": 0.6155525377263901, + "grad_norm": 0.025855707462884535, + "learning_rate": 0.000874158178747261, + "loss": 0.5788, + "step": 12125 + }, + { + "epoch": 0.6158063738244216, + "grad_norm": 0.02817299125660346, + "learning_rate": 0.000874011194548012, + "loss": 0.5825, + "step": 12130 + }, + { + "epoch": 0.616060209922453, + "grad_norm": 0.05520364073190341, + "learning_rate": 0.0008738641369322308, + "loss": 0.5901, + "step": 12135 + }, + { + "epoch": 0.6163140460204846, + "grad_norm": 0.04862377362129934, + "learning_rate": 0.0008737170059287838, + "loss": 0.5905, + "step": 12140 + }, + { + "epoch": 0.6165678821185161, + "grad_norm": 0.043586867841226354, + "learning_rate": 0.0008735698015665525, + "loss": 0.6009, + "step": 12145 + }, + { + "epoch": 0.6168217182165475, + "grad_norm": 0.02855284608566154, + "learning_rate": 0.000873422523874432, + "loss": 0.5848, + "step": 12150 + }, + { + "epoch": 0.6170755543145791, + "grad_norm": 0.02574982692090748, + "learning_rate": 0.0008732751728813324, + "loss": 0.6078, + "step": 12155 + }, + { + "epoch": 0.6173293904126106, + "grad_norm": 0.028740949383117314, + "learning_rate": 0.0008731277486161777, + "loss": 0.5622, + "step": 12160 + }, + { + "epoch": 0.6175832265106421, + "grad_norm": 0.028127163339311326, + "learning_rate": 0.000872980251107907, + "loss": 0.5664, + "step": 12165 + }, + { + "epoch": 0.6178370626086735, + "grad_norm": 0.024587054800826992, + "learning_rate": 0.0008728326803854728, + "loss": 0.6062, + "step": 12170 + }, + { + "epoch": 0.6180908987067051, + "grad_norm": 0.03673435237469503, + "learning_rate": 0.0008726850364778429, + "loss": 0.6079, + "step": 12175 + }, + { + "epoch": 0.6183447348047366, + "grad_norm": 0.022463713399365034, + "learning_rate": 0.000872537319413999, + "loss": 0.5885, + "step": 12180 + }, + { + "epoch": 0.618598570902768, + "grad_norm": 0.04006474303404941, + "learning_rate": 0.000872389529222937, + "loss": 0.5838, + "step": 12185 + }, + { + "epoch": 0.6188524070007996, + "grad_norm": 0.03323874643198722, + "learning_rate": 0.0008722416659336676, + "loss": 0.5907, + "step": 12190 + }, + { + "epoch": 0.6191062430988311, + "grad_norm": 0.0325269083515918, + "learning_rate": 0.0008720937295752153, + "loss": 0.5848, + "step": 12195 + }, + { + "epoch": 0.6193600791968625, + "grad_norm": 0.023584804219643197, + "learning_rate": 0.0008719457201766199, + "loss": 0.5965, + "step": 12200 + }, + { + "epoch": 0.6196139152948941, + "grad_norm": 0.02567919434786954, + "learning_rate": 0.0008717976377669343, + "loss": 0.5644, + "step": 12205 + }, + { + "epoch": 0.6198677513929256, + "grad_norm": 0.023457406627368636, + "learning_rate": 0.0008716494823752265, + "loss": 0.5846, + "step": 12210 + }, + { + "epoch": 0.6201215874909571, + "grad_norm": 0.10110762567310781, + "learning_rate": 0.0008715012540305789, + "loss": 0.5958, + "step": 12215 + }, + { + "epoch": 0.6203754235889886, + "grad_norm": 0.027499055613954763, + "learning_rate": 0.0008713529527620876, + "loss": 0.6264, + "step": 12220 + }, + { + "epoch": 0.6206292596870201, + "grad_norm": 0.04044664204628187, + "learning_rate": 0.0008712045785988638, + "loss": 0.5648, + "step": 12225 + }, + { + "epoch": 0.6208830957850516, + "grad_norm": 0.026521946530749428, + "learning_rate": 0.0008710561315700323, + "loss": 0.6316, + "step": 12230 + }, + { + "epoch": 0.621136931883083, + "grad_norm": 0.024722241475712593, + "learning_rate": 0.0008709076117047326, + "loss": 0.5668, + "step": 12235 + }, + { + "epoch": 0.6213907679811146, + "grad_norm": 0.03259920434042201, + "learning_rate": 0.0008707590190321186, + "loss": 0.5987, + "step": 12240 + }, + { + "epoch": 0.6216446040791461, + "grad_norm": 3.80266301123272, + "learning_rate": 0.000870610353581358, + "loss": 0.6304, + "step": 12245 + }, + { + "epoch": 0.6218984401771775, + "grad_norm": 0.08694604714084496, + "learning_rate": 0.0008704616153816332, + "loss": 0.649, + "step": 12250 + }, + { + "epoch": 0.6221522762752091, + "grad_norm": 0.05774492909835282, + "learning_rate": 0.0008703128044621409, + "loss": 0.6147, + "step": 12255 + }, + { + "epoch": 0.6224061123732406, + "grad_norm": 0.04804931614993646, + "learning_rate": 0.0008701639208520917, + "loss": 0.6145, + "step": 12260 + }, + { + "epoch": 0.6226599484712722, + "grad_norm": 0.08384982616634495, + "learning_rate": 0.000870014964580711, + "loss": 0.6363, + "step": 12265 + }, + { + "epoch": 0.6229137845693036, + "grad_norm": 0.03400226647956488, + "learning_rate": 0.000869865935677238, + "loss": 0.5997, + "step": 12270 + }, + { + "epoch": 0.6231676206673351, + "grad_norm": 0.026650165047009896, + "learning_rate": 0.0008697168341709263, + "loss": 0.6205, + "step": 12275 + }, + { + "epoch": 0.6234214567653666, + "grad_norm": 0.03254784519557384, + "learning_rate": 0.0008695676600910437, + "loss": 0.6218, + "step": 12280 + }, + { + "epoch": 0.6236752928633981, + "grad_norm": 0.025852394090405486, + "learning_rate": 0.0008694184134668726, + "loss": 0.6043, + "step": 12285 + }, + { + "epoch": 0.6239291289614296, + "grad_norm": 0.03673450921472964, + "learning_rate": 0.0008692690943277092, + "loss": 0.6277, + "step": 12290 + }, + { + "epoch": 0.6241829650594611, + "grad_norm": 0.026035740947944315, + "learning_rate": 0.0008691197027028641, + "loss": 0.6135, + "step": 12295 + }, + { + "epoch": 0.6244368011574926, + "grad_norm": 0.035313678241679185, + "learning_rate": 0.0008689702386216622, + "loss": 0.584, + "step": 12300 + }, + { + "epoch": 0.6246906372555241, + "grad_norm": 0.03306444173549, + "learning_rate": 0.0008688207021134424, + "loss": 0.6576, + "step": 12305 + }, + { + "epoch": 0.6249444733535556, + "grad_norm": 0.03609474653140698, + "learning_rate": 0.0008686710932075582, + "loss": 0.5882, + "step": 12310 + }, + { + "epoch": 0.6251983094515872, + "grad_norm": 0.02644393403575191, + "learning_rate": 0.000868521411933377, + "loss": 0.5805, + "step": 12315 + }, + { + "epoch": 0.6254521455496186, + "grad_norm": 0.03558589982698361, + "learning_rate": 0.0008683716583202803, + "loss": 0.597, + "step": 12320 + }, + { + "epoch": 0.6257059816476501, + "grad_norm": 0.027207972337934594, + "learning_rate": 0.0008682218323976643, + "loss": 0.6125, + "step": 12325 + }, + { + "epoch": 0.6259598177456817, + "grad_norm": 0.030538062894411674, + "learning_rate": 0.0008680719341949388, + "loss": 0.6047, + "step": 12330 + }, + { + "epoch": 0.6262136538437131, + "grad_norm": 0.02839935399711346, + "learning_rate": 0.0008679219637415281, + "loss": 0.5826, + "step": 12335 + }, + { + "epoch": 0.6264674899417446, + "grad_norm": 0.03696443623177008, + "learning_rate": 0.0008677719210668708, + "loss": 0.6189, + "step": 12340 + }, + { + "epoch": 0.6267213260397761, + "grad_norm": 0.037688218408350786, + "learning_rate": 0.0008676218062004196, + "loss": 0.6191, + "step": 12345 + }, + { + "epoch": 0.6269751621378076, + "grad_norm": 0.030327644773068856, + "learning_rate": 0.0008674716191716412, + "loss": 0.5768, + "step": 12350 + }, + { + "epoch": 0.6272289982358391, + "grad_norm": 0.026282274839364978, + "learning_rate": 0.0008673213600100165, + "loss": 0.589, + "step": 12355 + }, + { + "epoch": 0.6274828343338706, + "grad_norm": 0.023814603743172128, + "learning_rate": 0.0008671710287450406, + "loss": 0.5615, + "step": 12360 + }, + { + "epoch": 0.6277366704319021, + "grad_norm": 0.027581917581802243, + "learning_rate": 0.0008670206254062227, + "loss": 0.6108, + "step": 12365 + }, + { + "epoch": 0.6279905065299336, + "grad_norm": 0.026215277115865347, + "learning_rate": 0.0008668701500230865, + "loss": 0.6145, + "step": 12370 + }, + { + "epoch": 0.6282443426279651, + "grad_norm": 0.025711494856042162, + "learning_rate": 0.0008667196026251694, + "loss": 0.6259, + "step": 12375 + }, + { + "epoch": 0.6284981787259967, + "grad_norm": 0.02851492204909293, + "learning_rate": 0.0008665689832420231, + "loss": 0.6542, + "step": 12380 + }, + { + "epoch": 0.6287520148240281, + "grad_norm": 0.0445871823188339, + "learning_rate": 0.0008664182919032135, + "loss": 0.5829, + "step": 12385 + }, + { + "epoch": 0.6290058509220596, + "grad_norm": 0.03817207065267871, + "learning_rate": 0.0008662675286383206, + "loss": 0.5721, + "step": 12390 + }, + { + "epoch": 0.6292596870200912, + "grad_norm": 0.025461696058893753, + "learning_rate": 0.0008661166934769384, + "loss": 0.6207, + "step": 12395 + }, + { + "epoch": 0.6295135231181226, + "grad_norm": 0.03291254373796645, + "learning_rate": 0.000865965786448675, + "loss": 0.6254, + "step": 12400 + }, + { + "epoch": 0.6297673592161541, + "grad_norm": 0.02456833220774285, + "learning_rate": 0.0008658148075831529, + "loss": 0.6136, + "step": 12405 + }, + { + "epoch": 0.6300211953141857, + "grad_norm": 0.03660349536181771, + "learning_rate": 0.0008656637569100083, + "loss": 0.6312, + "step": 12410 + }, + { + "epoch": 0.6302750314122171, + "grad_norm": 0.028629048846541345, + "learning_rate": 0.0008655126344588917, + "loss": 0.6128, + "step": 12415 + }, + { + "epoch": 0.6305288675102486, + "grad_norm": 0.027152018036820328, + "learning_rate": 0.0008653614402594679, + "loss": 0.5857, + "step": 12420 + }, + { + "epoch": 0.6307827036082801, + "grad_norm": 0.02680519867482763, + "learning_rate": 0.0008652101743414154, + "loss": 0.6, + "step": 12425 + }, + { + "epoch": 0.6310365397063117, + "grad_norm": 0.038568306378071626, + "learning_rate": 0.000865058836734427, + "loss": 0.5764, + "step": 12430 + }, + { + "epoch": 0.6312903758043431, + "grad_norm": 0.022332649415363656, + "learning_rate": 0.0008649074274682094, + "loss": 0.5723, + "step": 12435 + }, + { + "epoch": 0.6315442119023746, + "grad_norm": 0.03745046556172155, + "learning_rate": 0.0008647559465724837, + "loss": 0.6402, + "step": 12440 + }, + { + "epoch": 0.6317980480004062, + "grad_norm": 0.043267157791484344, + "learning_rate": 0.0008646043940769846, + "loss": 0.614, + "step": 12445 + }, + { + "epoch": 0.6320518840984376, + "grad_norm": 0.026176498937215194, + "learning_rate": 0.0008644527700114613, + "loss": 0.6109, + "step": 12450 + }, + { + "epoch": 0.6323057201964691, + "grad_norm": 0.026927373965425307, + "learning_rate": 0.0008643010744056768, + "loss": 0.5921, + "step": 12455 + }, + { + "epoch": 0.6325595562945007, + "grad_norm": 0.030985586351536417, + "learning_rate": 0.0008641493072894081, + "loss": 0.6037, + "step": 12460 + }, + { + "epoch": 0.6328133923925321, + "grad_norm": 0.034088985294575574, + "learning_rate": 0.0008639974686924463, + "loss": 0.5987, + "step": 12465 + }, + { + "epoch": 0.6330672284905636, + "grad_norm": 0.04071949159100916, + "learning_rate": 0.0008638455586445967, + "loss": 0.598, + "step": 12470 + }, + { + "epoch": 0.6333210645885952, + "grad_norm": 0.04119075917419428, + "learning_rate": 0.0008636935771756787, + "loss": 0.6045, + "step": 12475 + }, + { + "epoch": 0.6335749006866267, + "grad_norm": 0.024601400128880478, + "learning_rate": 0.000863541524315525, + "loss": 0.5895, + "step": 12480 + }, + { + "epoch": 0.6338287367846581, + "grad_norm": 0.025669915239589995, + "learning_rate": 0.000863389400093983, + "loss": 0.6138, + "step": 12485 + }, + { + "epoch": 0.6340825728826897, + "grad_norm": 0.07547210741704416, + "learning_rate": 0.0008632372045409141, + "loss": 0.5917, + "step": 12490 + }, + { + "epoch": 0.6343364089807212, + "grad_norm": 0.08126800386669199, + "learning_rate": 0.0008630849376861933, + "loss": 0.5978, + "step": 12495 + }, + { + "epoch": 0.6345902450787526, + "grad_norm": 0.039597508909703365, + "learning_rate": 0.0008629325995597101, + "loss": 0.6147, + "step": 12500 + }, + { + "epoch": 0.6348440811767841, + "grad_norm": 0.03847450277611692, + "learning_rate": 0.0008627801901913675, + "loss": 0.6161, + "step": 12505 + }, + { + "epoch": 0.6350979172748157, + "grad_norm": 0.031096240484264088, + "learning_rate": 0.0008626277096110826, + "loss": 0.6256, + "step": 12510 + }, + { + "epoch": 0.6353517533728471, + "grad_norm": 0.0351603079410334, + "learning_rate": 0.0008624751578487868, + "loss": 0.5906, + "step": 12515 + }, + { + "epoch": 0.6356055894708786, + "grad_norm": 0.05304450618895726, + "learning_rate": 0.0008623225349344252, + "loss": 0.5935, + "step": 12520 + }, + { + "epoch": 0.6358594255689102, + "grad_norm": 0.042723117144634414, + "learning_rate": 0.000862169840897957, + "loss": 0.6222, + "step": 12525 + }, + { + "epoch": 0.6361132616669417, + "grad_norm": 0.06453506596782073, + "learning_rate": 0.0008620170757693551, + "loss": 0.5989, + "step": 12530 + }, + { + "epoch": 0.6363670977649731, + "grad_norm": 0.03504784561459624, + "learning_rate": 0.0008618642395786065, + "loss": 0.5673, + "step": 12535 + }, + { + "epoch": 0.6366209338630047, + "grad_norm": 0.0351102465977865, + "learning_rate": 0.0008617113323557124, + "loss": 0.6076, + "step": 12540 + }, + { + "epoch": 0.6368747699610362, + "grad_norm": 0.030573876028541808, + "learning_rate": 0.0008615583541306875, + "loss": 0.5974, + "step": 12545 + }, + { + "epoch": 0.6371286060590676, + "grad_norm": 0.03746793184981241, + "learning_rate": 0.0008614053049335608, + "loss": 0.5956, + "step": 12550 + }, + { + "epoch": 0.6373824421570992, + "grad_norm": 0.03858770491598553, + "learning_rate": 0.0008612521847943751, + "loss": 0.5991, + "step": 12555 + }, + { + "epoch": 0.6376362782551307, + "grad_norm": 0.026289258814351405, + "learning_rate": 0.0008610989937431872, + "loss": 0.5822, + "step": 12560 + }, + { + "epoch": 0.6378901143531621, + "grad_norm": 0.03593649422727849, + "learning_rate": 0.0008609457318100674, + "loss": 0.6193, + "step": 12565 + }, + { + "epoch": 0.6381439504511937, + "grad_norm": 0.02527123338083977, + "learning_rate": 0.0008607923990251005, + "loss": 0.5848, + "step": 12570 + }, + { + "epoch": 0.6383977865492252, + "grad_norm": 0.03742296667921461, + "learning_rate": 0.0008606389954183851, + "loss": 0.625, + "step": 12575 + }, + { + "epoch": 0.6386516226472566, + "grad_norm": 0.025365542502550003, + "learning_rate": 0.0008604855210200333, + "loss": 0.5827, + "step": 12580 + }, + { + "epoch": 0.6389054587452881, + "grad_norm": 0.0405102841258219, + "learning_rate": 0.0008603319758601715, + "loss": 0.6005, + "step": 12585 + }, + { + "epoch": 0.6391592948433197, + "grad_norm": 0.034192188660412175, + "learning_rate": 0.0008601783599689399, + "loss": 0.5751, + "step": 12590 + }, + { + "epoch": 0.6394131309413512, + "grad_norm": 0.027215532759491887, + "learning_rate": 0.0008600246733764923, + "loss": 0.5862, + "step": 12595 + }, + { + "epoch": 0.6396669670393826, + "grad_norm": 0.03061750468523527, + "learning_rate": 0.0008598709161129969, + "loss": 0.5811, + "step": 12600 + }, + { + "epoch": 0.6399208031374142, + "grad_norm": 0.028928109299113807, + "learning_rate": 0.0008597170882086351, + "loss": 0.5806, + "step": 12605 + }, + { + "epoch": 0.6401746392354457, + "grad_norm": 0.027021955515260244, + "learning_rate": 0.000859563189693603, + "loss": 0.6105, + "step": 12610 + }, + { + "epoch": 0.6404284753334771, + "grad_norm": 0.027332261017979507, + "learning_rate": 0.0008594092205981099, + "loss": 0.5754, + "step": 12615 + }, + { + "epoch": 0.6406823114315087, + "grad_norm": 0.02712378032650469, + "learning_rate": 0.0008592551809523791, + "loss": 0.6216, + "step": 12620 + }, + { + "epoch": 0.6409361475295402, + "grad_norm": 0.02824436218143356, + "learning_rate": 0.0008591010707866478, + "loss": 0.6344, + "step": 12625 + }, + { + "epoch": 0.6411899836275716, + "grad_norm": 0.023321393807142935, + "learning_rate": 0.0008589468901311672, + "loss": 0.6034, + "step": 12630 + }, + { + "epoch": 0.6414438197256032, + "grad_norm": 0.03151905452662985, + "learning_rate": 0.0008587926390162022, + "loss": 0.587, + "step": 12635 + }, + { + "epoch": 0.6416976558236347, + "grad_norm": 0.030624964870169982, + "learning_rate": 0.0008586383174720315, + "loss": 0.6196, + "step": 12640 + }, + { + "epoch": 0.6419514919216662, + "grad_norm": 0.025288630292415442, + "learning_rate": 0.0008584839255289475, + "loss": 0.6114, + "step": 12645 + }, + { + "epoch": 0.6422053280196977, + "grad_norm": 0.028822411703530795, + "learning_rate": 0.0008583294632172567, + "loss": 0.598, + "step": 12650 + }, + { + "epoch": 0.6424591641177292, + "grad_norm": 0.026442735674073018, + "learning_rate": 0.0008581749305672792, + "loss": 0.5951, + "step": 12655 + }, + { + "epoch": 0.6427130002157607, + "grad_norm": 0.026576792989007662, + "learning_rate": 0.0008580203276093492, + "loss": 0.5872, + "step": 12660 + }, + { + "epoch": 0.6429668363137921, + "grad_norm": 0.029042645916427186, + "learning_rate": 0.0008578656543738141, + "loss": 0.5867, + "step": 12665 + }, + { + "epoch": 0.6432206724118237, + "grad_norm": 0.04209495966527208, + "learning_rate": 0.0008577109108910359, + "loss": 0.5837, + "step": 12670 + }, + { + "epoch": 0.6434745085098552, + "grad_norm": 0.043317431826600195, + "learning_rate": 0.0008575560971913898, + "loss": 0.5904, + "step": 12675 + }, + { + "epoch": 0.6437283446078866, + "grad_norm": 0.038439775882813054, + "learning_rate": 0.0008574012133052649, + "loss": 0.5627, + "step": 12680 + }, + { + "epoch": 0.6439821807059182, + "grad_norm": 0.48049526028798634, + "learning_rate": 0.0008572462592630641, + "loss": 0.5769, + "step": 12685 + }, + { + "epoch": 0.6442360168039497, + "grad_norm": 0.033053890270663924, + "learning_rate": 0.0008570912350952044, + "loss": 0.5878, + "step": 12690 + }, + { + "epoch": 0.6444898529019812, + "grad_norm": 0.07863611845071104, + "learning_rate": 0.0008569361408321159, + "loss": 0.6133, + "step": 12695 + }, + { + "epoch": 0.6447436890000127, + "grad_norm": 0.029398204077650845, + "learning_rate": 0.000856780976504243, + "loss": 0.5799, + "step": 12700 + }, + { + "epoch": 0.6449975250980442, + "grad_norm": 0.02869897907225985, + "learning_rate": 0.0008566257421420439, + "loss": 0.6008, + "step": 12705 + }, + { + "epoch": 0.6452513611960757, + "grad_norm": 0.026097172563309384, + "learning_rate": 0.0008564704377759897, + "loss": 0.5939, + "step": 12710 + }, + { + "epoch": 0.6455051972941072, + "grad_norm": 0.02502019281825702, + "learning_rate": 0.0008563150634365666, + "loss": 0.598, + "step": 12715 + }, + { + "epoch": 0.6457590333921387, + "grad_norm": 0.02483685696837895, + "learning_rate": 0.0008561596191542733, + "loss": 0.5801, + "step": 12720 + }, + { + "epoch": 0.6460128694901702, + "grad_norm": 0.03535549265065644, + "learning_rate": 0.000856004104959623, + "loss": 0.5946, + "step": 12725 + }, + { + "epoch": 0.6462667055882017, + "grad_norm": 0.027879010371972223, + "learning_rate": 0.0008558485208831424, + "loss": 0.612, + "step": 12730 + }, + { + "epoch": 0.6465205416862332, + "grad_norm": 0.04735218553478145, + "learning_rate": 0.0008556928669553717, + "loss": 0.5938, + "step": 12735 + }, + { + "epoch": 0.6467743777842647, + "grad_norm": 0.0279629249002624, + "learning_rate": 0.000855537143206865, + "loss": 0.5837, + "step": 12740 + }, + { + "epoch": 0.6470282138822963, + "grad_norm": 0.02533240032359082, + "learning_rate": 0.00085538134966819, + "loss": 0.6252, + "step": 12745 + }, + { + "epoch": 0.6472820499803277, + "grad_norm": 0.025603639669077066, + "learning_rate": 0.0008552254863699286, + "loss": 0.5819, + "step": 12750 + }, + { + "epoch": 0.6475358860783592, + "grad_norm": 0.04145899859904642, + "learning_rate": 0.0008550695533426756, + "loss": 0.597, + "step": 12755 + }, + { + "epoch": 0.6477897221763907, + "grad_norm": 0.029237034218711128, + "learning_rate": 0.00085491355061704, + "loss": 0.6067, + "step": 12760 + }, + { + "epoch": 0.6480435582744222, + "grad_norm": 0.030474815495890747, + "learning_rate": 0.0008547574782236444, + "loss": 0.5969, + "step": 12765 + }, + { + "epoch": 0.6482973943724537, + "grad_norm": 0.033389985535876285, + "learning_rate": 0.0008546013361931251, + "loss": 0.5902, + "step": 12770 + }, + { + "epoch": 0.6485512304704852, + "grad_norm": 0.03244401680509467, + "learning_rate": 0.0008544451245561318, + "loss": 0.5714, + "step": 12775 + }, + { + "epoch": 0.6488050665685167, + "grad_norm": 0.047562688287312124, + "learning_rate": 0.0008542888433433283, + "loss": 0.5706, + "step": 12780 + }, + { + "epoch": 0.6490589026665482, + "grad_norm": 0.028307045275590848, + "learning_rate": 0.0008541324925853915, + "loss": 0.5689, + "step": 12785 + }, + { + "epoch": 0.6493127387645797, + "grad_norm": 0.04704027854528433, + "learning_rate": 0.0008539760723130125, + "loss": 0.5661, + "step": 12790 + }, + { + "epoch": 0.6495665748626112, + "grad_norm": 0.04648606005162825, + "learning_rate": 0.0008538195825568958, + "loss": 0.6028, + "step": 12795 + }, + { + "epoch": 0.6498204109606427, + "grad_norm": 0.025943070142767706, + "learning_rate": 0.0008536630233477594, + "loss": 0.5877, + "step": 12800 + }, + { + "epoch": 0.6500742470586742, + "grad_norm": 0.04012620623020576, + "learning_rate": 0.0008535063947163355, + "loss": 0.5976, + "step": 12805 + }, + { + "epoch": 0.6503280831567058, + "grad_norm": 0.03544811219416931, + "learning_rate": 0.0008533496966933691, + "loss": 0.5855, + "step": 12810 + }, + { + "epoch": 0.6505819192547372, + "grad_norm": 0.0281669036894832, + "learning_rate": 0.0008531929293096194, + "loss": 0.6111, + "step": 12815 + }, + { + "epoch": 0.6508357553527687, + "grad_norm": 0.023763951065239725, + "learning_rate": 0.0008530360925958591, + "loss": 0.5776, + "step": 12820 + }, + { + "epoch": 0.6510895914508003, + "grad_norm": 0.24418838214511798, + "learning_rate": 0.0008528791865828742, + "loss": 0.6009, + "step": 12825 + }, + { + "epoch": 0.6513434275488317, + "grad_norm": 0.029087961946777385, + "learning_rate": 0.000852722211301465, + "loss": 0.6235, + "step": 12830 + }, + { + "epoch": 0.6515972636468632, + "grad_norm": 0.034137672479786794, + "learning_rate": 0.0008525651667824447, + "loss": 0.5799, + "step": 12835 + }, + { + "epoch": 0.6518510997448947, + "grad_norm": 0.028312942222624603, + "learning_rate": 0.0008524080530566405, + "loss": 0.6404, + "step": 12840 + }, + { + "epoch": 0.6521049358429262, + "grad_norm": 0.02843554231766899, + "learning_rate": 0.0008522508701548927, + "loss": 0.5799, + "step": 12845 + }, + { + "epoch": 0.6523587719409577, + "grad_norm": 0.027322928571429136, + "learning_rate": 0.0008520936181080561, + "loss": 0.5999, + "step": 12850 + }, + { + "epoch": 0.6526126080389892, + "grad_norm": 0.0303740372452228, + "learning_rate": 0.0008519362969469979, + "loss": 0.5929, + "step": 12855 + }, + { + "epoch": 0.6528664441370208, + "grad_norm": 0.027100993271736346, + "learning_rate": 0.0008517789067025997, + "loss": 0.6328, + "step": 12860 + }, + { + "epoch": 0.6531202802350522, + "grad_norm": 0.028849366128399865, + "learning_rate": 0.0008516214474057565, + "loss": 0.5698, + "step": 12865 + }, + { + "epoch": 0.6533741163330837, + "grad_norm": 0.029545255854450884, + "learning_rate": 0.0008514639190873767, + "loss": 0.5783, + "step": 12870 + }, + { + "epoch": 0.6536279524311153, + "grad_norm": 0.024473560866559003, + "learning_rate": 0.0008513063217783824, + "loss": 0.6031, + "step": 12875 + }, + { + "epoch": 0.6538817885291467, + "grad_norm": 0.02666332020016646, + "learning_rate": 0.000851148655509709, + "loss": 0.6249, + "step": 12880 + }, + { + "epoch": 0.6541356246271782, + "grad_norm": 0.03803144464344789, + "learning_rate": 0.0008509909203123057, + "loss": 0.6052, + "step": 12885 + }, + { + "epoch": 0.6543894607252098, + "grad_norm": 0.029430501206934002, + "learning_rate": 0.0008508331162171353, + "loss": 0.6082, + "step": 12890 + }, + { + "epoch": 0.6546432968232412, + "grad_norm": 0.038636878659898226, + "learning_rate": 0.0008506752432551736, + "loss": 0.5922, + "step": 12895 + }, + { + "epoch": 0.6548971329212727, + "grad_norm": 0.03645227270336851, + "learning_rate": 0.0008505173014574104, + "loss": 0.5961, + "step": 12900 + }, + { + "epoch": 0.6551509690193043, + "grad_norm": 0.0339469915664989, + "learning_rate": 0.0008503592908548492, + "loss": 0.6087, + "step": 12905 + }, + { + "epoch": 0.6554048051173358, + "grad_norm": 0.04584178211309174, + "learning_rate": 0.0008502012114785062, + "loss": 0.6057, + "step": 12910 + }, + { + "epoch": 0.6556586412153672, + "grad_norm": 0.027847814793399383, + "learning_rate": 0.0008500430633594121, + "loss": 0.5957, + "step": 12915 + }, + { + "epoch": 0.6559124773133987, + "grad_norm": 0.04088969160713702, + "learning_rate": 0.0008498848465286101, + "loss": 0.6299, + "step": 12920 + }, + { + "epoch": 0.6561663134114303, + "grad_norm": 0.024361703944890165, + "learning_rate": 0.0008497265610171576, + "loss": 0.5643, + "step": 12925 + }, + { + "epoch": 0.6564201495094617, + "grad_norm": 0.08374406284028113, + "learning_rate": 0.0008495682068561254, + "loss": 0.5758, + "step": 12930 + }, + { + "epoch": 0.6566739856074932, + "grad_norm": 0.023934163458897068, + "learning_rate": 0.0008494097840765975, + "loss": 0.5817, + "step": 12935 + }, + { + "epoch": 0.6569278217055248, + "grad_norm": 0.027116947138030544, + "learning_rate": 0.0008492512927096714, + "loss": 0.6075, + "step": 12940 + }, + { + "epoch": 0.6571816578035562, + "grad_norm": 0.027883742219997405, + "learning_rate": 0.0008490927327864581, + "loss": 0.587, + "step": 12945 + }, + { + "epoch": 0.6574354939015877, + "grad_norm": 0.02716053289665079, + "learning_rate": 0.0008489341043380825, + "loss": 0.657, + "step": 12950 + }, + { + "epoch": 0.6576893299996193, + "grad_norm": 0.03758581951547631, + "learning_rate": 0.0008487754073956823, + "loss": 0.5958, + "step": 12955 + }, + { + "epoch": 0.6579431660976508, + "grad_norm": 0.028060630011589125, + "learning_rate": 0.0008486166419904089, + "loss": 0.5604, + "step": 12960 + }, + { + "epoch": 0.6581970021956822, + "grad_norm": 0.026078088164496267, + "learning_rate": 0.0008484578081534274, + "loss": 0.6172, + "step": 12965 + }, + { + "epoch": 0.6584508382937138, + "grad_norm": 0.028071334962187697, + "learning_rate": 0.0008482989059159158, + "loss": 0.5903, + "step": 12970 + }, + { + "epoch": 0.6587046743917453, + "grad_norm": 0.03378636851534933, + "learning_rate": 0.0008481399353090659, + "loss": 0.6051, + "step": 12975 + }, + { + "epoch": 0.6589585104897767, + "grad_norm": 0.026105241851777608, + "learning_rate": 0.0008479808963640828, + "loss": 0.6411, + "step": 12980 + }, + { + "epoch": 0.6592123465878083, + "grad_norm": 0.030533873522670462, + "learning_rate": 0.0008478217891121853, + "loss": 0.5837, + "step": 12985 + }, + { + "epoch": 0.6594661826858398, + "grad_norm": 0.03300525899669726, + "learning_rate": 0.0008476626135846051, + "loss": 0.5938, + "step": 12990 + }, + { + "epoch": 0.6597200187838712, + "grad_norm": 0.028329077912125737, + "learning_rate": 0.0008475033698125876, + "loss": 0.6348, + "step": 12995 + }, + { + "epoch": 0.6599738548819027, + "grad_norm": 0.04110076364540404, + "learning_rate": 0.0008473440578273916, + "loss": 0.5867, + "step": 13000 + }, + { + "epoch": 0.6602276909799343, + "grad_norm": 0.05757425924457839, + "learning_rate": 0.0008471846776602894, + "loss": 0.5384, + "step": 13005 + }, + { + "epoch": 0.6604815270779657, + "grad_norm": 0.05844590144036168, + "learning_rate": 0.0008470252293425662, + "loss": 0.5848, + "step": 13010 + }, + { + "epoch": 0.6607353631759972, + "grad_norm": 0.06415338224333281, + "learning_rate": 0.0008468657129055213, + "loss": 0.5522, + "step": 13015 + }, + { + "epoch": 0.6609891992740288, + "grad_norm": 0.030473738178567217, + "learning_rate": 0.0008467061283804665, + "loss": 0.5945, + "step": 13020 + }, + { + "epoch": 0.6612430353720603, + "grad_norm": 0.02536912248622989, + "learning_rate": 0.000846546475798728, + "loss": 0.5816, + "step": 13025 + }, + { + "epoch": 0.6614968714700917, + "grad_norm": 0.0461113719066347, + "learning_rate": 0.0008463867551916443, + "loss": 0.6512, + "step": 13030 + }, + { + "epoch": 0.6617507075681233, + "grad_norm": 0.024517762602526454, + "learning_rate": 0.0008462269665905682, + "loss": 0.597, + "step": 13035 + }, + { + "epoch": 0.6620045436661548, + "grad_norm": 1.033031229487397, + "learning_rate": 0.0008460671100268649, + "loss": 0.6271, + "step": 13040 + }, + { + "epoch": 0.6622583797641862, + "grad_norm": 0.0693341169836654, + "learning_rate": 0.0008459071855319141, + "loss": 0.6248, + "step": 13045 + }, + { + "epoch": 0.6625122158622178, + "grad_norm": 0.03717279346920806, + "learning_rate": 0.0008457471931371074, + "loss": 0.573, + "step": 13050 + }, + { + "epoch": 0.6627660519602493, + "grad_norm": 0.03262397839797902, + "learning_rate": 0.0008455871328738512, + "loss": 0.5841, + "step": 13055 + }, + { + "epoch": 0.6630198880582807, + "grad_norm": 0.05355080175870677, + "learning_rate": 0.0008454270047735643, + "loss": 0.5727, + "step": 13060 + }, + { + "epoch": 0.6632737241563122, + "grad_norm": 0.03628283883616998, + "learning_rate": 0.0008452668088676789, + "loss": 0.6012, + "step": 13065 + }, + { + "epoch": 0.6635275602543438, + "grad_norm": 0.06228849912821036, + "learning_rate": 0.0008451065451876408, + "loss": 0.5836, + "step": 13070 + }, + { + "epoch": 0.6637813963523753, + "grad_norm": 0.03209553307639597, + "learning_rate": 0.0008449462137649087, + "loss": 0.5907, + "step": 13075 + }, + { + "epoch": 0.6640352324504067, + "grad_norm": 0.029984865563620306, + "learning_rate": 0.0008447858146309554, + "loss": 0.5891, + "step": 13080 + }, + { + "epoch": 0.6642890685484383, + "grad_norm": 0.03128245374162066, + "learning_rate": 0.000844625347817266, + "loss": 0.5829, + "step": 13085 + }, + { + "epoch": 0.6645429046464698, + "grad_norm": 0.03488425051563897, + "learning_rate": 0.0008444648133553394, + "loss": 0.6055, + "step": 13090 + }, + { + "epoch": 0.6647967407445012, + "grad_norm": 0.026243668933171885, + "learning_rate": 0.0008443042112766879, + "loss": 0.5931, + "step": 13095 + }, + { + "epoch": 0.6650505768425328, + "grad_norm": 0.0776128900108717, + "learning_rate": 0.0008441435416128367, + "loss": 0.7144, + "step": 13100 + }, + { + "epoch": 0.6653044129405643, + "grad_norm": 0.06729227309686386, + "learning_rate": 0.0008439828043953246, + "loss": 0.6272, + "step": 13105 + }, + { + "epoch": 0.6655582490385957, + "grad_norm": 0.06770135653654577, + "learning_rate": 0.0008438219996557033, + "loss": 0.6059, + "step": 13110 + }, + { + "epoch": 0.6658120851366273, + "grad_norm": 0.059522461982278785, + "learning_rate": 0.0008436611274255382, + "loss": 0.5964, + "step": 13115 + }, + { + "epoch": 0.6660659212346588, + "grad_norm": 0.038971286412593786, + "learning_rate": 0.0008435001877364076, + "loss": 0.6201, + "step": 13120 + }, + { + "epoch": 0.6663197573326903, + "grad_norm": 0.03964531176980359, + "learning_rate": 0.0008433391806199033, + "loss": 0.6378, + "step": 13125 + }, + { + "epoch": 0.6665735934307218, + "grad_norm": 0.03537229559330627, + "learning_rate": 0.0008431781061076298, + "loss": 0.6107, + "step": 13130 + }, + { + "epoch": 0.6668274295287533, + "grad_norm": 0.03125614020555413, + "learning_rate": 0.0008430169642312058, + "loss": 0.6444, + "step": 13135 + }, + { + "epoch": 0.6670812656267848, + "grad_norm": 0.02696637588263313, + "learning_rate": 0.0008428557550222622, + "loss": 0.6245, + "step": 13140 + }, + { + "epoch": 0.6673351017248162, + "grad_norm": 0.03589221798809491, + "learning_rate": 0.0008426944785124437, + "loss": 0.5984, + "step": 13145 + }, + { + "epoch": 0.6675889378228478, + "grad_norm": 0.05479122023801771, + "learning_rate": 0.000842533134733408, + "loss": 0.5568, + "step": 13150 + }, + { + "epoch": 0.6678427739208793, + "grad_norm": 0.02850027631836941, + "learning_rate": 0.0008423717237168263, + "loss": 0.5844, + "step": 13155 + }, + { + "epoch": 0.6680966100189107, + "grad_norm": 0.043303911644560475, + "learning_rate": 0.0008422102454943827, + "loss": 0.6056, + "step": 13160 + }, + { + "epoch": 0.6683504461169423, + "grad_norm": 0.0485597197702503, + "learning_rate": 0.0008420487000977743, + "loss": 0.5952, + "step": 13165 + }, + { + "epoch": 0.6686042822149738, + "grad_norm": 0.04377645965991001, + "learning_rate": 0.0008418870875587121, + "loss": 0.6004, + "step": 13170 + }, + { + "epoch": 0.6688581183130053, + "grad_norm": 0.04194534237527523, + "learning_rate": 0.0008417254079089194, + "loss": 0.6054, + "step": 13175 + }, + { + "epoch": 0.6691119544110368, + "grad_norm": 0.03712390306106914, + "learning_rate": 0.0008415636611801334, + "loss": 0.6166, + "step": 13180 + }, + { + "epoch": 0.6693657905090683, + "grad_norm": 0.03756186192265224, + "learning_rate": 0.0008414018474041041, + "loss": 0.5976, + "step": 13185 + }, + { + "epoch": 0.6696196266070998, + "grad_norm": 0.039758177882621364, + "learning_rate": 0.0008412399666125945, + "loss": 0.5797, + "step": 13190 + }, + { + "epoch": 0.6698734627051313, + "grad_norm": 0.034160759225817725, + "learning_rate": 0.0008410780188373814, + "loss": 0.603, + "step": 13195 + }, + { + "epoch": 0.6701272988031628, + "grad_norm": 0.045688219761594664, + "learning_rate": 0.0008409160041102543, + "loss": 0.5717, + "step": 13200 + }, + { + "epoch": 0.6703811349011943, + "grad_norm": 0.025679114000337882, + "learning_rate": 0.0008407539224630157, + "loss": 0.5667, + "step": 13205 + }, + { + "epoch": 0.6706349709992258, + "grad_norm": 0.036873394000505495, + "learning_rate": 0.0008405917739274813, + "loss": 0.6186, + "step": 13210 + }, + { + "epoch": 0.6708888070972573, + "grad_norm": 0.04153117906728298, + "learning_rate": 0.0008404295585354802, + "loss": 0.5834, + "step": 13215 + }, + { + "epoch": 0.6711426431952888, + "grad_norm": 0.02339181698231598, + "learning_rate": 0.0008402672763188545, + "loss": 0.5755, + "step": 13220 + }, + { + "epoch": 0.6713964792933204, + "grad_norm": 0.030301352786187497, + "learning_rate": 0.0008401049273094594, + "loss": 0.5768, + "step": 13225 + }, + { + "epoch": 0.6716503153913518, + "grad_norm": 0.030870240027105245, + "learning_rate": 0.0008399425115391632, + "loss": 0.5899, + "step": 13230 + }, + { + "epoch": 0.6719041514893833, + "grad_norm": 0.04099103986806237, + "learning_rate": 0.0008397800290398473, + "loss": 0.5572, + "step": 13235 + }, + { + "epoch": 0.6721579875874149, + "grad_norm": 0.025526012832099508, + "learning_rate": 0.0008396174798434062, + "loss": 0.5796, + "step": 13240 + }, + { + "epoch": 0.6724118236854463, + "grad_norm": 0.02582782722174359, + "learning_rate": 0.0008394548639817474, + "loss": 0.5748, + "step": 13245 + }, + { + "epoch": 0.6726656597834778, + "grad_norm": 0.044686293141161536, + "learning_rate": 0.0008392921814867916, + "loss": 0.6161, + "step": 13250 + }, + { + "epoch": 0.6729194958815093, + "grad_norm": 0.03157479001428999, + "learning_rate": 0.0008391294323904726, + "loss": 0.579, + "step": 13255 + }, + { + "epoch": 0.6731733319795408, + "grad_norm": 0.03230925419893887, + "learning_rate": 0.0008389666167247374, + "loss": 0.6105, + "step": 13260 + }, + { + "epoch": 0.6734271680775723, + "grad_norm": 0.0227012216152469, + "learning_rate": 0.0008388037345215457, + "loss": 0.5985, + "step": 13265 + }, + { + "epoch": 0.6736810041756038, + "grad_norm": 0.027786730311156502, + "learning_rate": 0.0008386407858128706, + "loss": 0.609, + "step": 13270 + }, + { + "epoch": 0.6739348402736353, + "grad_norm": 0.024074341825161803, + "learning_rate": 0.0008384777706306979, + "loss": 0.5953, + "step": 13275 + }, + { + "epoch": 0.6741886763716668, + "grad_norm": 0.024751468872537288, + "learning_rate": 0.0008383146890070269, + "loss": 0.5925, + "step": 13280 + }, + { + "epoch": 0.6744425124696983, + "grad_norm": 0.028042950086599638, + "learning_rate": 0.0008381515409738696, + "loss": 0.5829, + "step": 13285 + }, + { + "epoch": 0.6746963485677299, + "grad_norm": 0.026500810254070112, + "learning_rate": 0.0008379883265632512, + "loss": 0.6014, + "step": 13290 + }, + { + "epoch": 0.6749501846657613, + "grad_norm": 0.031677519675672304, + "learning_rate": 0.0008378250458072099, + "loss": 0.5688, + "step": 13295 + }, + { + "epoch": 0.6752040207637928, + "grad_norm": 0.03226776792346643, + "learning_rate": 0.0008376616987377968, + "loss": 0.637, + "step": 13300 + }, + { + "epoch": 0.6754578568618244, + "grad_norm": 0.03935591032487197, + "learning_rate": 0.0008374982853870761, + "loss": 0.6372, + "step": 13305 + }, + { + "epoch": 0.6757116929598558, + "grad_norm": 0.04985731682272338, + "learning_rate": 0.000837334805787125, + "loss": 0.5976, + "step": 13310 + }, + { + "epoch": 0.6759655290578873, + "grad_norm": 0.03831220440900208, + "learning_rate": 0.0008371712599700338, + "loss": 0.6055, + "step": 13315 + }, + { + "epoch": 0.6762193651559189, + "grad_norm": 0.02840577329842378, + "learning_rate": 0.0008370076479679059, + "loss": 0.5921, + "step": 13320 + }, + { + "epoch": 0.6764732012539503, + "grad_norm": 0.02670171769669835, + "learning_rate": 0.0008368439698128574, + "loss": 0.5863, + "step": 13325 + }, + { + "epoch": 0.6767270373519818, + "grad_norm": 0.027414460086001645, + "learning_rate": 0.0008366802255370174, + "loss": 0.5851, + "step": 13330 + }, + { + "epoch": 0.6769808734500133, + "grad_norm": 0.028303785071122464, + "learning_rate": 0.000836516415172528, + "loss": 0.5752, + "step": 13335 + }, + { + "epoch": 0.6772347095480449, + "grad_norm": 0.025538256488275953, + "learning_rate": 0.0008363525387515446, + "loss": 0.5939, + "step": 13340 + }, + { + "epoch": 0.6774885456460763, + "grad_norm": 0.03068030408097493, + "learning_rate": 0.0008361885963062353, + "loss": 0.5596, + "step": 13345 + }, + { + "epoch": 0.6777423817441078, + "grad_norm": 0.02823000001525365, + "learning_rate": 0.000836024587868781, + "loss": 0.6162, + "step": 13350 + }, + { + "epoch": 0.6779962178421394, + "grad_norm": 0.025476619812650033, + "learning_rate": 0.0008358605134713759, + "loss": 0.5924, + "step": 13355 + }, + { + "epoch": 0.6782500539401708, + "grad_norm": 0.04780163681995968, + "learning_rate": 0.0008356963731462271, + "loss": 0.5633, + "step": 13360 + }, + { + "epoch": 0.6785038900382023, + "grad_norm": 0.02825831061440577, + "learning_rate": 0.0008355321669255542, + "loss": 0.5918, + "step": 13365 + }, + { + "epoch": 0.6787577261362339, + "grad_norm": 0.026437801730974143, + "learning_rate": 0.0008353678948415901, + "loss": 0.5642, + "step": 13370 + }, + { + "epoch": 0.6790115622342653, + "grad_norm": 0.025454718983034218, + "learning_rate": 0.0008352035569265809, + "loss": 0.5691, + "step": 13375 + }, + { + "epoch": 0.6792653983322968, + "grad_norm": 0.023242855883304124, + "learning_rate": 0.0008350391532127851, + "loss": 0.6205, + "step": 13380 + }, + { + "epoch": 0.6795192344303284, + "grad_norm": 0.03623503812266734, + "learning_rate": 0.0008348746837324743, + "loss": 0.5955, + "step": 13385 + }, + { + "epoch": 0.6797730705283599, + "grad_norm": 0.029114104299875758, + "learning_rate": 0.0008347101485179332, + "loss": 0.5573, + "step": 13390 + }, + { + "epoch": 0.6800269066263913, + "grad_norm": 0.024202571333690956, + "learning_rate": 0.0008345455476014592, + "loss": 0.5487, + "step": 13395 + }, + { + "epoch": 0.6802807427244228, + "grad_norm": 0.02802981312295006, + "learning_rate": 0.0008343808810153624, + "loss": 0.5798, + "step": 13400 + }, + { + "epoch": 0.6805345788224544, + "grad_norm": 0.024805503868118448, + "learning_rate": 0.0008342161487919664, + "loss": 0.5874, + "step": 13405 + }, + { + "epoch": 0.6807884149204858, + "grad_norm": 0.023290389364338096, + "learning_rate": 0.000834051350963607, + "loss": 0.5926, + "step": 13410 + }, + { + "epoch": 0.6810422510185173, + "grad_norm": 0.028882137299632987, + "learning_rate": 0.0008338864875626333, + "loss": 0.5975, + "step": 13415 + }, + { + "epoch": 0.6812960871165489, + "grad_norm": 0.0288092686879103, + "learning_rate": 0.0008337215586214073, + "loss": 0.6053, + "step": 13420 + }, + { + "epoch": 0.6815499232145803, + "grad_norm": 0.025146916551842938, + "learning_rate": 0.0008335565641723035, + "loss": 0.5884, + "step": 13425 + }, + { + "epoch": 0.6818037593126118, + "grad_norm": 0.024739281591221643, + "learning_rate": 0.0008333915042477096, + "loss": 0.5652, + "step": 13430 + }, + { + "epoch": 0.6820575954106434, + "grad_norm": 0.024056449982588327, + "learning_rate": 0.000833226378880026, + "loss": 0.6189, + "step": 13435 + }, + { + "epoch": 0.6823114315086749, + "grad_norm": 0.026316533874474936, + "learning_rate": 0.000833061188101666, + "loss": 0.6197, + "step": 13440 + }, + { + "epoch": 0.6825652676067063, + "grad_norm": 0.022435368988669158, + "learning_rate": 0.000832895931945056, + "loss": 0.5676, + "step": 13445 + }, + { + "epoch": 0.6828191037047379, + "grad_norm": 0.03713138580934235, + "learning_rate": 0.0008327306104426345, + "loss": 0.587, + "step": 13450 + }, + { + "epoch": 0.6830729398027694, + "grad_norm": 0.028633714641747837, + "learning_rate": 0.0008325652236268536, + "loss": 0.599, + "step": 13455 + }, + { + "epoch": 0.6833267759008008, + "grad_norm": 0.040045692037592666, + "learning_rate": 0.0008323997715301777, + "loss": 0.5729, + "step": 13460 + }, + { + "epoch": 0.6835806119988324, + "grad_norm": 0.03628423966164721, + "learning_rate": 0.0008322342541850844, + "loss": 0.5689, + "step": 13465 + }, + { + "epoch": 0.6838344480968639, + "grad_norm": 0.030711278133780332, + "learning_rate": 0.0008320686716240637, + "loss": 0.5646, + "step": 13470 + }, + { + "epoch": 0.6840882841948953, + "grad_norm": 0.025362649640482234, + "learning_rate": 0.000831903023879619, + "loss": 0.5774, + "step": 13475 + }, + { + "epoch": 0.6843421202929268, + "grad_norm": 0.03407568648626278, + "learning_rate": 0.0008317373109842658, + "loss": 0.573, + "step": 13480 + }, + { + "epoch": 0.6845959563909584, + "grad_norm": 0.02491716289782007, + "learning_rate": 0.0008315715329705329, + "loss": 0.5727, + "step": 13485 + }, + { + "epoch": 0.6848497924889898, + "grad_norm": 0.0317932126461997, + "learning_rate": 0.0008314056898709615, + "loss": 0.6018, + "step": 13490 + }, + { + "epoch": 0.6851036285870213, + "grad_norm": 0.024107080698907192, + "learning_rate": 0.0008312397817181059, + "loss": 0.6016, + "step": 13495 + }, + { + "epoch": 0.6853574646850529, + "grad_norm": 0.03632719720582319, + "learning_rate": 0.0008310738085445332, + "loss": 0.5991, + "step": 13500 + }, + { + "epoch": 0.6856113007830844, + "grad_norm": 0.05715841618806044, + "learning_rate": 0.0008309077703828228, + "loss": 0.6122, + "step": 13505 + }, + { + "epoch": 0.6858651368811158, + "grad_norm": 0.03704079824369565, + "learning_rate": 0.0008307416672655674, + "loss": 0.6023, + "step": 13510 + }, + { + "epoch": 0.6861189729791474, + "grad_norm": 0.24267969330580477, + "learning_rate": 0.000830575499225372, + "loss": 0.5998, + "step": 13515 + }, + { + "epoch": 0.6863728090771789, + "grad_norm": 0.048764967617313686, + "learning_rate": 0.0008304092662948548, + "loss": 0.608, + "step": 13520 + }, + { + "epoch": 0.6866266451752103, + "grad_norm": 0.0516560962912755, + "learning_rate": 0.0008302429685066462, + "loss": 0.5713, + "step": 13525 + }, + { + "epoch": 0.6868804812732419, + "grad_norm": 0.03554128227511413, + "learning_rate": 0.0008300766058933899, + "loss": 0.5681, + "step": 13530 + }, + { + "epoch": 0.6871343173712734, + "grad_norm": 0.028215204615185188, + "learning_rate": 0.0008299101784877421, + "loss": 0.5954, + "step": 13535 + }, + { + "epoch": 0.6873881534693048, + "grad_norm": 0.0321511775141223, + "learning_rate": 0.0008297436863223715, + "loss": 0.5873, + "step": 13540 + }, + { + "epoch": 0.6876419895673364, + "grad_norm": 0.029157356552754574, + "learning_rate": 0.0008295771294299596, + "loss": 0.5775, + "step": 13545 + }, + { + "epoch": 0.6878958256653679, + "grad_norm": 0.028937031419344138, + "learning_rate": 0.0008294105078432007, + "loss": 0.5808, + "step": 13550 + }, + { + "epoch": 0.6881496617633994, + "grad_norm": 0.037942050160010676, + "learning_rate": 0.000829243821594802, + "loss": 0.6233, + "step": 13555 + }, + { + "epoch": 0.6884034978614308, + "grad_norm": 0.031543194063541696, + "learning_rate": 0.0008290770707174831, + "loss": 0.5977, + "step": 13560 + }, + { + "epoch": 0.6886573339594624, + "grad_norm": 0.02238872742391851, + "learning_rate": 0.0008289102552439762, + "loss": 0.5518, + "step": 13565 + }, + { + "epoch": 0.6889111700574939, + "grad_norm": 0.027982976773836193, + "learning_rate": 0.0008287433752070265, + "loss": 0.5747, + "step": 13570 + }, + { + "epoch": 0.6891650061555253, + "grad_norm": 0.03491184560447035, + "learning_rate": 0.0008285764306393917, + "loss": 0.5675, + "step": 13575 + }, + { + "epoch": 0.6894188422535569, + "grad_norm": 0.05242850032688674, + "learning_rate": 0.0008284094215738422, + "loss": 0.5764, + "step": 13580 + }, + { + "epoch": 0.6896726783515884, + "grad_norm": 0.05106024303010987, + "learning_rate": 0.000828242348043161, + "loss": 0.6068, + "step": 13585 + }, + { + "epoch": 0.6899265144496198, + "grad_norm": 0.030655123821702948, + "learning_rate": 0.0008280752100801439, + "loss": 0.5834, + "step": 13590 + }, + { + "epoch": 0.6901803505476514, + "grad_norm": 0.030080411514861318, + "learning_rate": 0.0008279080077175992, + "loss": 0.589, + "step": 13595 + }, + { + "epoch": 0.6904341866456829, + "grad_norm": 0.07602111036419369, + "learning_rate": 0.0008277407409883476, + "loss": 0.5973, + "step": 13600 + }, + { + "epoch": 0.6906880227437144, + "grad_norm": 0.026230560906281026, + "learning_rate": 0.0008275734099252233, + "loss": 0.5867, + "step": 13605 + }, + { + "epoch": 0.6909418588417459, + "grad_norm": 0.03576027679760639, + "learning_rate": 0.0008274060145610719, + "loss": 0.5555, + "step": 13610 + }, + { + "epoch": 0.6911956949397774, + "grad_norm": 0.03286724650115783, + "learning_rate": 0.0008272385549287529, + "loss": 0.6358, + "step": 13615 + }, + { + "epoch": 0.6914495310378089, + "grad_norm": 0.023355654899445015, + "learning_rate": 0.0008270710310611374, + "loss": 0.5994, + "step": 13620 + }, + { + "epoch": 0.6917033671358404, + "grad_norm": 0.026604280616454975, + "learning_rate": 0.0008269034429911095, + "loss": 0.6027, + "step": 13625 + }, + { + "epoch": 0.6919572032338719, + "grad_norm": 0.025639930478733225, + "learning_rate": 0.0008267357907515661, + "loss": 0.5732, + "step": 13630 + }, + { + "epoch": 0.6922110393319034, + "grad_norm": 0.0332874158069147, + "learning_rate": 0.0008265680743754165, + "loss": 0.6007, + "step": 13635 + }, + { + "epoch": 0.6924648754299348, + "grad_norm": 0.029848135095345608, + "learning_rate": 0.0008264002938955823, + "loss": 0.568, + "step": 13640 + }, + { + "epoch": 0.6927187115279664, + "grad_norm": 0.02669194226166377, + "learning_rate": 0.0008262324493449982, + "loss": 0.5983, + "step": 13645 + }, + { + "epoch": 0.6929725476259979, + "grad_norm": 0.027289337864795267, + "learning_rate": 0.0008260645407566114, + "loss": 0.6212, + "step": 13650 + }, + { + "epoch": 0.6932263837240294, + "grad_norm": 0.028021835675969425, + "learning_rate": 0.0008258965681633813, + "loss": 0.5927, + "step": 13655 + }, + { + "epoch": 0.6934802198220609, + "grad_norm": 0.025781289683893413, + "learning_rate": 0.0008257285315982799, + "loss": 0.5623, + "step": 13660 + }, + { + "epoch": 0.6937340559200924, + "grad_norm": 0.03639235414298802, + "learning_rate": 0.0008255604310942922, + "loss": 0.5608, + "step": 13665 + }, + { + "epoch": 0.6939878920181239, + "grad_norm": 0.025081206584032647, + "learning_rate": 0.0008253922666844155, + "loss": 0.5641, + "step": 13670 + }, + { + "epoch": 0.6942417281161554, + "grad_norm": 0.02503169555093532, + "learning_rate": 0.0008252240384016596, + "loss": 0.5815, + "step": 13675 + }, + { + "epoch": 0.6944955642141869, + "grad_norm": 0.025522493920881646, + "learning_rate": 0.0008250557462790469, + "loss": 0.6103, + "step": 13680 + }, + { + "epoch": 0.6947494003122184, + "grad_norm": 0.029829670493091157, + "learning_rate": 0.0008248873903496123, + "loss": 0.561, + "step": 13685 + }, + { + "epoch": 0.6950032364102499, + "grad_norm": 0.03274242306959272, + "learning_rate": 0.000824718970646403, + "loss": 0.5965, + "step": 13690 + }, + { + "epoch": 0.6952570725082814, + "grad_norm": 0.053602824065729755, + "learning_rate": 0.0008245504872024793, + "loss": 0.5778, + "step": 13695 + }, + { + "epoch": 0.6955109086063129, + "grad_norm": 0.02930268258189034, + "learning_rate": 0.0008243819400509133, + "loss": 0.5291, + "step": 13700 + }, + { + "epoch": 0.6957647447043444, + "grad_norm": 0.024987980789305964, + "learning_rate": 0.0008242133292247902, + "loss": 0.5828, + "step": 13705 + }, + { + "epoch": 0.6960185808023759, + "grad_norm": 0.02829975573628627, + "learning_rate": 0.0008240446547572076, + "loss": 0.5881, + "step": 13710 + }, + { + "epoch": 0.6962724169004074, + "grad_norm": 0.03443116573593433, + "learning_rate": 0.0008238759166812751, + "loss": 0.5919, + "step": 13715 + }, + { + "epoch": 0.696526252998439, + "grad_norm": 0.031587204673142105, + "learning_rate": 0.0008237071150301154, + "loss": 0.6065, + "step": 13720 + }, + { + "epoch": 0.6967800890964704, + "grad_norm": 0.023508246283948264, + "learning_rate": 0.0008235382498368634, + "loss": 0.5946, + "step": 13725 + }, + { + "epoch": 0.6970339251945019, + "grad_norm": 0.025009199129867415, + "learning_rate": 0.0008233693211346663, + "loss": 0.5563, + "step": 13730 + }, + { + "epoch": 0.6972877612925334, + "grad_norm": 0.03326099984467122, + "learning_rate": 0.0008232003289566843, + "loss": 0.5873, + "step": 13735 + }, + { + "epoch": 0.6975415973905649, + "grad_norm": 0.034975275494994254, + "learning_rate": 0.0008230312733360894, + "loss": 0.5658, + "step": 13740 + }, + { + "epoch": 0.6977954334885964, + "grad_norm": 0.025872022613802486, + "learning_rate": 0.0008228621543060665, + "loss": 0.5572, + "step": 13745 + }, + { + "epoch": 0.6980492695866279, + "grad_norm": 0.02546372161263482, + "learning_rate": 0.0008226929718998129, + "loss": 0.5905, + "step": 13750 + }, + { + "epoch": 0.6983031056846594, + "grad_norm": 0.03215322261937108, + "learning_rate": 0.0008225237261505381, + "loss": 0.5581, + "step": 13755 + }, + { + "epoch": 0.6985569417826909, + "grad_norm": 0.027702755257204188, + "learning_rate": 0.0008223544170914641, + "loss": 0.6156, + "step": 13760 + }, + { + "epoch": 0.6988107778807224, + "grad_norm": 0.03195843106084198, + "learning_rate": 0.0008221850447558259, + "loss": 0.6007, + "step": 13765 + }, + { + "epoch": 0.699064613978754, + "grad_norm": 0.037527909108500845, + "learning_rate": 0.00082201560917687, + "loss": 0.5738, + "step": 13770 + }, + { + "epoch": 0.6993184500767854, + "grad_norm": 0.05309666088358755, + "learning_rate": 0.000821846110387856, + "loss": 0.585, + "step": 13775 + }, + { + "epoch": 0.6995722861748169, + "grad_norm": 0.03185639071908041, + "learning_rate": 0.0008216765484220554, + "loss": 0.594, + "step": 13780 + }, + { + "epoch": 0.6998261222728485, + "grad_norm": 0.029682191254135553, + "learning_rate": 0.0008215069233127528, + "loss": 0.5832, + "step": 13785 + }, + { + "epoch": 0.7000799583708799, + "grad_norm": 0.030629890653038103, + "learning_rate": 0.0008213372350932444, + "loss": 0.5727, + "step": 13790 + }, + { + "epoch": 0.7003337944689114, + "grad_norm": 0.025415360193154723, + "learning_rate": 0.0008211674837968391, + "loss": 0.5891, + "step": 13795 + }, + { + "epoch": 0.700587630566943, + "grad_norm": 0.04954711837375036, + "learning_rate": 0.0008209976694568586, + "loss": 0.58, + "step": 13800 + }, + { + "epoch": 0.7008414666649744, + "grad_norm": 0.047662707757456846, + "learning_rate": 0.0008208277921066362, + "loss": 0.5835, + "step": 13805 + }, + { + "epoch": 0.7010953027630059, + "grad_norm": 0.049928278726034496, + "learning_rate": 0.0008206578517795185, + "loss": 0.584, + "step": 13810 + }, + { + "epoch": 0.7013491388610374, + "grad_norm": 0.06832687556620186, + "learning_rate": 0.0008204878485088634, + "loss": 0.6406, + "step": 13815 + }, + { + "epoch": 0.701602974959069, + "grad_norm": 0.08083335473139422, + "learning_rate": 0.0008203177823280419, + "loss": 0.6479, + "step": 13820 + }, + { + "epoch": 0.7018568110571004, + "grad_norm": 0.07135721218869323, + "learning_rate": 0.000820147653270437, + "loss": 0.6405, + "step": 13825 + }, + { + "epoch": 0.7021106471551319, + "grad_norm": 0.04104888957953317, + "learning_rate": 0.0008199774613694447, + "loss": 0.5936, + "step": 13830 + }, + { + "epoch": 0.7023644832531635, + "grad_norm": 0.0875218883592723, + "learning_rate": 0.0008198072066584721, + "loss": 0.6216, + "step": 13835 + }, + { + "epoch": 0.7026183193511949, + "grad_norm": 0.07942963393083619, + "learning_rate": 0.0008196368891709399, + "loss": 0.6315, + "step": 13840 + }, + { + "epoch": 0.7028721554492264, + "grad_norm": 0.06110437267505552, + "learning_rate": 0.0008194665089402804, + "loss": 0.5965, + "step": 13845 + }, + { + "epoch": 0.703125991547258, + "grad_norm": 0.08891102687332698, + "learning_rate": 0.0008192960659999383, + "loss": 0.6391, + "step": 13850 + }, + { + "epoch": 0.7033798276452894, + "grad_norm": 0.06448648520699571, + "learning_rate": 0.0008191255603833708, + "loss": 0.5794, + "step": 13855 + }, + { + "epoch": 0.7036336637433209, + "grad_norm": 0.030570919417378353, + "learning_rate": 0.0008189549921240472, + "loss": 0.5855, + "step": 13860 + }, + { + "epoch": 0.7038874998413525, + "grad_norm": 0.03473828499726488, + "learning_rate": 0.0008187843612554493, + "loss": 0.6168, + "step": 13865 + }, + { + "epoch": 0.704141335939384, + "grad_norm": 0.050512426090771084, + "learning_rate": 0.0008186136678110711, + "loss": 0.6157, + "step": 13870 + }, + { + "epoch": 0.7043951720374154, + "grad_norm": 0.03331461557177089, + "learning_rate": 0.000818442911824419, + "loss": 0.6325, + "step": 13875 + }, + { + "epoch": 0.704649008135447, + "grad_norm": 0.03887589726537461, + "learning_rate": 0.0008182720933290111, + "loss": 0.619, + "step": 13880 + }, + { + "epoch": 0.7049028442334785, + "grad_norm": 0.034491361473236455, + "learning_rate": 0.0008181012123583786, + "loss": 0.6265, + "step": 13885 + }, + { + "epoch": 0.7051566803315099, + "grad_norm": 0.04939733683559655, + "learning_rate": 0.0008179302689460646, + "loss": 0.6209, + "step": 13890 + }, + { + "epoch": 0.7054105164295414, + "grad_norm": 0.036958385336953684, + "learning_rate": 0.0008177592631256241, + "loss": 0.5847, + "step": 13895 + }, + { + "epoch": 0.705664352527573, + "grad_norm": 0.0338882190476505, + "learning_rate": 0.0008175881949306252, + "loss": 0.6022, + "step": 13900 + }, + { + "epoch": 0.7059181886256044, + "grad_norm": 0.03164427898151885, + "learning_rate": 0.0008174170643946472, + "loss": 0.6194, + "step": 13905 + }, + { + "epoch": 0.7061720247236359, + "grad_norm": 0.02788808622745641, + "learning_rate": 0.0008172458715512825, + "loss": 0.5735, + "step": 13910 + }, + { + "epoch": 0.7064258608216675, + "grad_norm": 0.05228385785798862, + "learning_rate": 0.0008170746164341352, + "loss": 0.6066, + "step": 13915 + }, + { + "epoch": 0.7066796969196989, + "grad_norm": 0.026836897802726748, + "learning_rate": 0.0008169032990768221, + "loss": 0.6333, + "step": 13920 + }, + { + "epoch": 0.7069335330177304, + "grad_norm": 0.028759260350827046, + "learning_rate": 0.0008167319195129717, + "loss": 0.6147, + "step": 13925 + }, + { + "epoch": 0.707187369115762, + "grad_norm": 0.045590276542868145, + "learning_rate": 0.0008165604777762251, + "loss": 0.6096, + "step": 13930 + }, + { + "epoch": 0.7074412052137935, + "grad_norm": 0.025644349892156408, + "learning_rate": 0.0008163889739002354, + "loss": 0.571, + "step": 13935 + }, + { + "epoch": 0.7076950413118249, + "grad_norm": 0.03515540024760426, + "learning_rate": 0.000816217407918668, + "loss": 0.57, + "step": 13940 + }, + { + "epoch": 0.7079488774098565, + "grad_norm": 0.030510481245704536, + "learning_rate": 0.0008160457798652002, + "loss": 0.5802, + "step": 13945 + }, + { + "epoch": 0.708202713507888, + "grad_norm": 0.026068842912907026, + "learning_rate": 0.0008158740897735221, + "loss": 0.59, + "step": 13950 + }, + { + "epoch": 0.7084565496059194, + "grad_norm": 0.02799394637765536, + "learning_rate": 0.0008157023376773354, + "loss": 0.6148, + "step": 13955 + }, + { + "epoch": 0.708710385703951, + "grad_norm": 0.039004735079580716, + "learning_rate": 0.0008155305236103543, + "loss": 0.6115, + "step": 13960 + }, + { + "epoch": 0.7089642218019825, + "grad_norm": 0.028262356505209356, + "learning_rate": 0.0008153586476063048, + "loss": 0.5613, + "step": 13965 + }, + { + "epoch": 0.7092180579000139, + "grad_norm": 0.02565901929071906, + "learning_rate": 0.0008151867096989256, + "loss": 0.5753, + "step": 13970 + }, + { + "epoch": 0.7094718939980454, + "grad_norm": 0.03604279567945798, + "learning_rate": 0.0008150147099219669, + "loss": 0.6221, + "step": 13975 + }, + { + "epoch": 0.709725730096077, + "grad_norm": 0.02599493314932738, + "learning_rate": 0.0008148426483091919, + "loss": 0.6047, + "step": 13980 + }, + { + "epoch": 0.7099795661941085, + "grad_norm": 0.03222079110872602, + "learning_rate": 0.000814670524894375, + "loss": 0.6341, + "step": 13985 + }, + { + "epoch": 0.7102334022921399, + "grad_norm": 0.027685751102907306, + "learning_rate": 0.0008144983397113032, + "loss": 0.6027, + "step": 13990 + }, + { + "epoch": 0.7104872383901715, + "grad_norm": 0.023842700636128793, + "learning_rate": 0.000814326092793776, + "loss": 0.5593, + "step": 13995 + }, + { + "epoch": 0.710741074488203, + "grad_norm": 0.04054047646149784, + "learning_rate": 0.0008141537841756043, + "loss": 0.563, + "step": 14000 + }, + { + "epoch": 0.7109949105862344, + "grad_norm": 0.028942162080538656, + "learning_rate": 0.0008139814138906112, + "loss": 0.5957, + "step": 14005 + }, + { + "epoch": 0.711248746684266, + "grad_norm": 0.03050796562803737, + "learning_rate": 0.0008138089819726326, + "loss": 0.5865, + "step": 14010 + }, + { + "epoch": 0.7115025827822975, + "grad_norm": 0.02451193708223736, + "learning_rate": 0.0008136364884555158, + "loss": 0.5557, + "step": 14015 + }, + { + "epoch": 0.7117564188803289, + "grad_norm": 0.04559764332264477, + "learning_rate": 0.0008134639333731202, + "loss": 0.5906, + "step": 14020 + }, + { + "epoch": 0.7120102549783605, + "grad_norm": 0.03944520888736035, + "learning_rate": 0.0008132913167593179, + "loss": 0.5703, + "step": 14025 + }, + { + "epoch": 0.712264091076392, + "grad_norm": 0.02938776092497612, + "learning_rate": 0.0008131186386479925, + "loss": 0.5766, + "step": 14030 + }, + { + "epoch": 0.7125179271744235, + "grad_norm": 0.04170810908228807, + "learning_rate": 0.0008129458990730398, + "loss": 0.5721, + "step": 14035 + }, + { + "epoch": 0.712771763272455, + "grad_norm": 0.04306584561555878, + "learning_rate": 0.0008127730980683677, + "loss": 0.6244, + "step": 14040 + }, + { + "epoch": 0.7130255993704865, + "grad_norm": 0.05080947630039173, + "learning_rate": 0.0008126002356678965, + "loss": 0.6219, + "step": 14045 + }, + { + "epoch": 0.713279435468518, + "grad_norm": 0.04665163730281675, + "learning_rate": 0.0008124273119055577, + "loss": 0.6287, + "step": 14050 + }, + { + "epoch": 0.7135332715665494, + "grad_norm": 0.023905364788607067, + "learning_rate": 0.0008122543268152957, + "loss": 0.6114, + "step": 14055 + }, + { + "epoch": 0.713787107664581, + "grad_norm": 0.031161542177222803, + "learning_rate": 0.0008120812804310667, + "loss": 0.5888, + "step": 14060 + }, + { + "epoch": 0.7140409437626125, + "grad_norm": 0.03335446756811596, + "learning_rate": 0.0008119081727868386, + "loss": 0.6239, + "step": 14065 + }, + { + "epoch": 0.7142947798606439, + "grad_norm": 0.12183613795209156, + "learning_rate": 0.0008117350039165916, + "loss": 0.5827, + "step": 14070 + }, + { + "epoch": 0.7145486159586755, + "grad_norm": 0.031453942182186376, + "learning_rate": 0.0008115617738543182, + "loss": 0.6154, + "step": 14075 + }, + { + "epoch": 0.714802452056707, + "grad_norm": 0.03179438032903415, + "learning_rate": 0.0008113884826340221, + "loss": 0.6307, + "step": 14080 + }, + { + "epoch": 0.7150562881547385, + "grad_norm": 0.03411034419877854, + "learning_rate": 0.0008112151302897198, + "loss": 0.6119, + "step": 14085 + }, + { + "epoch": 0.71531012425277, + "grad_norm": 0.03217809599591784, + "learning_rate": 0.0008110417168554396, + "loss": 0.5976, + "step": 14090 + }, + { + "epoch": 0.7155639603508015, + "grad_norm": 0.039786783196638205, + "learning_rate": 0.0008108682423652213, + "loss": 0.5819, + "step": 14095 + }, + { + "epoch": 0.715817796448833, + "grad_norm": 0.027681937005003987, + "learning_rate": 0.0008106947068531174, + "loss": 0.5556, + "step": 14100 + }, + { + "epoch": 0.7160716325468645, + "grad_norm": 0.046793772542064, + "learning_rate": 0.000810521110353192, + "loss": 0.6361, + "step": 14105 + }, + { + "epoch": 0.716325468644896, + "grad_norm": 0.02471149262461818, + "learning_rate": 0.0008103474528995213, + "loss": 0.5904, + "step": 14110 + }, + { + "epoch": 0.7165793047429275, + "grad_norm": 0.034604562342302796, + "learning_rate": 0.0008101737345261932, + "loss": 0.5659, + "step": 14115 + }, + { + "epoch": 0.716833140840959, + "grad_norm": 0.03117446691261137, + "learning_rate": 0.0008099999552673079, + "loss": 0.6229, + "step": 14120 + }, + { + "epoch": 0.7170869769389905, + "grad_norm": 0.0247974767087348, + "learning_rate": 0.0008098261151569772, + "loss": 0.5935, + "step": 14125 + }, + { + "epoch": 0.717340813037022, + "grad_norm": 0.03509041523808686, + "learning_rate": 0.0008096522142293255, + "loss": 0.6033, + "step": 14130 + }, + { + "epoch": 0.7175946491350534, + "grad_norm": 0.025370999477552076, + "learning_rate": 0.0008094782525184881, + "loss": 0.6065, + "step": 14135 + }, + { + "epoch": 0.717848485233085, + "grad_norm": 0.03293482244748489, + "learning_rate": 0.0008093042300586132, + "loss": 0.5631, + "step": 14140 + }, + { + "epoch": 0.7181023213311165, + "grad_norm": 0.02484308148359049, + "learning_rate": 0.0008091301468838604, + "loss": 0.6092, + "step": 14145 + }, + { + "epoch": 0.718356157429148, + "grad_norm": 0.024837575281261598, + "learning_rate": 0.0008089560030284014, + "loss": 0.5881, + "step": 14150 + }, + { + "epoch": 0.7186099935271795, + "grad_norm": 0.03353516289786398, + "learning_rate": 0.0008087817985264197, + "loss": 0.5782, + "step": 14155 + }, + { + "epoch": 0.718863829625211, + "grad_norm": 0.03056718458106469, + "learning_rate": 0.0008086075334121111, + "loss": 0.5962, + "step": 14160 + }, + { + "epoch": 0.7191176657232425, + "grad_norm": 0.05151128438996498, + "learning_rate": 0.0008084332077196824, + "loss": 0.5671, + "step": 14165 + }, + { + "epoch": 0.719371501821274, + "grad_norm": 0.029141462162182278, + "learning_rate": 0.0008082588214833534, + "loss": 0.6444, + "step": 14170 + }, + { + "epoch": 0.7196253379193055, + "grad_norm": 0.036675084804481534, + "learning_rate": 0.000808084374737355, + "loss": 0.5763, + "step": 14175 + }, + { + "epoch": 0.719879174017337, + "grad_norm": 0.03544199673026975, + "learning_rate": 0.0008079098675159302, + "loss": 0.5726, + "step": 14180 + }, + { + "epoch": 0.7201330101153685, + "grad_norm": 0.026411392839355837, + "learning_rate": 0.0008077352998533339, + "loss": 0.5642, + "step": 14185 + }, + { + "epoch": 0.7203868462134, + "grad_norm": 0.06222643500106606, + "learning_rate": 0.0008075606717838329, + "loss": 0.579, + "step": 14190 + }, + { + "epoch": 0.7206406823114315, + "grad_norm": 0.04757491182106772, + "learning_rate": 0.0008073859833417059, + "loss": 0.5845, + "step": 14195 + }, + { + "epoch": 0.7208945184094631, + "grad_norm": 0.03794874490419018, + "learning_rate": 0.0008072112345612433, + "loss": 0.6024, + "step": 14200 + }, + { + "epoch": 0.7211483545074945, + "grad_norm": 0.02909593167125118, + "learning_rate": 0.0008070364254767475, + "loss": 0.5807, + "step": 14205 + }, + { + "epoch": 0.721402190605526, + "grad_norm": 0.04995516503580864, + "learning_rate": 0.0008068615561225324, + "loss": 0.6365, + "step": 14210 + }, + { + "epoch": 0.7216560267035576, + "grad_norm": 0.029511579035933468, + "learning_rate": 0.0008066866265329242, + "loss": 0.5855, + "step": 14215 + }, + { + "epoch": 0.721909862801589, + "grad_norm": 0.04837520573816359, + "learning_rate": 0.0008065116367422607, + "loss": 0.5877, + "step": 14220 + }, + { + "epoch": 0.7221636988996205, + "grad_norm": 0.029346824274389435, + "learning_rate": 0.0008063365867848916, + "loss": 0.6129, + "step": 14225 + }, + { + "epoch": 0.722417534997652, + "grad_norm": 0.03801220854976456, + "learning_rate": 0.0008061614766951779, + "loss": 0.6121, + "step": 14230 + }, + { + "epoch": 0.7226713710956835, + "grad_norm": 0.035405249909125754, + "learning_rate": 0.0008059863065074934, + "loss": 0.5757, + "step": 14235 + }, + { + "epoch": 0.722925207193715, + "grad_norm": 0.03770160969115643, + "learning_rate": 0.0008058110762562227, + "loss": 0.6527, + "step": 14240 + }, + { + "epoch": 0.7231790432917465, + "grad_norm": 0.031787448281367404, + "learning_rate": 0.0008056357859757631, + "loss": 0.6437, + "step": 14245 + }, + { + "epoch": 0.7234328793897781, + "grad_norm": 0.3335810711614509, + "learning_rate": 0.0008054604357005227, + "loss": 0.5659, + "step": 14250 + }, + { + "epoch": 0.7236867154878095, + "grad_norm": 0.04424485372113206, + "learning_rate": 0.000805285025464922, + "loss": 0.5756, + "step": 14255 + }, + { + "epoch": 0.723940551585841, + "grad_norm": 0.03463932389672562, + "learning_rate": 0.0008051095553033935, + "loss": 0.5662, + "step": 14260 + }, + { + "epoch": 0.7241943876838726, + "grad_norm": 0.059238021172196606, + "learning_rate": 0.0008049340252503808, + "loss": 0.6028, + "step": 14265 + }, + { + "epoch": 0.724448223781904, + "grad_norm": 0.037906766126114544, + "learning_rate": 0.0008047584353403396, + "loss": 0.6386, + "step": 14270 + }, + { + "epoch": 0.7247020598799355, + "grad_norm": 0.03614272842990135, + "learning_rate": 0.0008045827856077373, + "loss": 0.5869, + "step": 14275 + }, + { + "epoch": 0.7249558959779671, + "grad_norm": 0.025607815005150466, + "learning_rate": 0.0008044070760870533, + "loss": 0.5728, + "step": 14280 + }, + { + "epoch": 0.7252097320759985, + "grad_norm": 0.027627783001377804, + "learning_rate": 0.0008042313068127781, + "loss": 0.639, + "step": 14285 + }, + { + "epoch": 0.72546356817403, + "grad_norm": 0.043412145360747785, + "learning_rate": 0.0008040554778194148, + "loss": 0.5497, + "step": 14290 + }, + { + "epoch": 0.7257174042720616, + "grad_norm": 0.02701727522162977, + "learning_rate": 0.0008038795891414774, + "loss": 0.6095, + "step": 14295 + }, + { + "epoch": 0.7259712403700931, + "grad_norm": 0.029754029318448776, + "learning_rate": 0.0008037036408134921, + "loss": 0.6163, + "step": 14300 + }, + { + "epoch": 0.7262250764681245, + "grad_norm": 0.04031215844232491, + "learning_rate": 0.0008035276328699967, + "loss": 0.6099, + "step": 14305 + }, + { + "epoch": 0.726478912566156, + "grad_norm": 0.029676037361304357, + "learning_rate": 0.0008033515653455408, + "loss": 0.5771, + "step": 14310 + }, + { + "epoch": 0.7267327486641876, + "grad_norm": 0.059697029416254835, + "learning_rate": 0.0008031754382746854, + "loss": 0.5749, + "step": 14315 + }, + { + "epoch": 0.726986584762219, + "grad_norm": 0.025871394003377013, + "learning_rate": 0.0008029992516920033, + "loss": 0.5962, + "step": 14320 + }, + { + "epoch": 0.7272404208602505, + "grad_norm": 0.02569159141087064, + "learning_rate": 0.0008028230056320791, + "loss": 0.575, + "step": 14325 + }, + { + "epoch": 0.7274942569582821, + "grad_norm": 0.02738571226532053, + "learning_rate": 0.0008026467001295092, + "loss": 0.5774, + "step": 14330 + }, + { + "epoch": 0.7277480930563135, + "grad_norm": 0.036913640747463713, + "learning_rate": 0.0008024703352189011, + "loss": 0.6074, + "step": 14335 + }, + { + "epoch": 0.728001929154345, + "grad_norm": 0.02570980610387182, + "learning_rate": 0.0008022939109348749, + "loss": 0.5959, + "step": 14340 + }, + { + "epoch": 0.7282557652523766, + "grad_norm": 0.04358525580235644, + "learning_rate": 0.0008021174273120615, + "loss": 0.5795, + "step": 14345 + }, + { + "epoch": 0.728509601350408, + "grad_norm": 0.07617898724384416, + "learning_rate": 0.0008019408843851037, + "loss": 0.7202, + "step": 14350 + }, + { + "epoch": 0.7287634374484395, + "grad_norm": 0.07466034802353338, + "learning_rate": 0.0008017642821886562, + "loss": 0.6215, + "step": 14355 + }, + { + "epoch": 0.7290172735464711, + "grad_norm": 0.06325253244834098, + "learning_rate": 0.0008015876207573848, + "loss": 0.6182, + "step": 14360 + }, + { + "epoch": 0.7292711096445026, + "grad_norm": 0.033647850896565544, + "learning_rate": 0.0008014109001259675, + "loss": 0.62, + "step": 14365 + }, + { + "epoch": 0.729524945742534, + "grad_norm": 0.0311901569505441, + "learning_rate": 0.0008012341203290936, + "loss": 0.5985, + "step": 14370 + }, + { + "epoch": 0.7297787818405655, + "grad_norm": 0.030231478505345966, + "learning_rate": 0.0008010572814014643, + "loss": 0.6101, + "step": 14375 + }, + { + "epoch": 0.7300326179385971, + "grad_norm": 0.03148649478982455, + "learning_rate": 0.0008008803833777919, + "loss": 0.5824, + "step": 14380 + }, + { + "epoch": 0.7302864540366285, + "grad_norm": 0.030003355989516, + "learning_rate": 0.0008007034262928008, + "loss": 0.5957, + "step": 14385 + }, + { + "epoch": 0.73054029013466, + "grad_norm": 0.02917680627459806, + "learning_rate": 0.0008005264101812267, + "loss": 0.5986, + "step": 14390 + }, + { + "epoch": 0.7307941262326916, + "grad_norm": 0.02495332231474039, + "learning_rate": 0.000800349335077817, + "loss": 0.5705, + "step": 14395 + }, + { + "epoch": 0.731047962330723, + "grad_norm": 0.024227094406430567, + "learning_rate": 0.0008001722010173306, + "loss": 0.606, + "step": 14400 + }, + { + "epoch": 0.7313017984287545, + "grad_norm": 0.02497321166367223, + "learning_rate": 0.0007999950080345382, + "loss": 0.598, + "step": 14405 + }, + { + "epoch": 0.7315556345267861, + "grad_norm": 0.03244596407041108, + "learning_rate": 0.0007998177561642218, + "loss": 0.6059, + "step": 14410 + }, + { + "epoch": 0.7318094706248176, + "grad_norm": 0.031337458561177915, + "learning_rate": 0.000799640445441175, + "loss": 0.6066, + "step": 14415 + }, + { + "epoch": 0.732063306722849, + "grad_norm": 0.027382503542512458, + "learning_rate": 0.000799463075900203, + "loss": 0.6036, + "step": 14420 + }, + { + "epoch": 0.7323171428208806, + "grad_norm": 0.11807874331082033, + "learning_rate": 0.0007992856475761228, + "loss": 0.5847, + "step": 14425 + }, + { + "epoch": 0.7325709789189121, + "grad_norm": 0.05405316379164612, + "learning_rate": 0.0007991081605037624, + "loss": 0.5619, + "step": 14430 + }, + { + "epoch": 0.7328248150169435, + "grad_norm": 0.03327425512133745, + "learning_rate": 0.0007989306147179618, + "loss": 0.5953, + "step": 14435 + }, + { + "epoch": 0.733078651114975, + "grad_norm": 0.03368726791670784, + "learning_rate": 0.0007987530102535723, + "loss": 0.5914, + "step": 14440 + }, + { + "epoch": 0.7333324872130066, + "grad_norm": 0.03358231187464119, + "learning_rate": 0.0007985753471454566, + "loss": 0.5838, + "step": 14445 + }, + { + "epoch": 0.733586323311038, + "grad_norm": 0.03261890078454357, + "learning_rate": 0.0007983976254284894, + "loss": 0.5523, + "step": 14450 + }, + { + "epoch": 0.7338401594090695, + "grad_norm": 0.05662683445430045, + "learning_rate": 0.0007982198451375564, + "loss": 0.6053, + "step": 14455 + }, + { + "epoch": 0.7340939955071011, + "grad_norm": 0.030310060759633013, + "learning_rate": 0.0007980420063075551, + "loss": 0.6454, + "step": 14460 + }, + { + "epoch": 0.7343478316051326, + "grad_norm": 0.026572897492377096, + "learning_rate": 0.0007978641089733941, + "loss": 0.5835, + "step": 14465 + }, + { + "epoch": 0.734601667703164, + "grad_norm": 0.03738980925312766, + "learning_rate": 0.0007976861531699942, + "loss": 0.5832, + "step": 14470 + }, + { + "epoch": 0.7348555038011956, + "grad_norm": 0.04967817488154915, + "learning_rate": 0.0007975081389322868, + "loss": 0.6216, + "step": 14475 + }, + { + "epoch": 0.7351093398992271, + "grad_norm": 0.021984750342595057, + "learning_rate": 0.0007973300662952155, + "loss": 0.5908, + "step": 14480 + }, + { + "epoch": 0.7353631759972585, + "grad_norm": 0.030283172609555046, + "learning_rate": 0.0007971519352937349, + "loss": 0.5622, + "step": 14485 + }, + { + "epoch": 0.7356170120952901, + "grad_norm": 0.025554536052177994, + "learning_rate": 0.0007969737459628112, + "loss": 0.5918, + "step": 14490 + }, + { + "epoch": 0.7358708481933216, + "grad_norm": 0.04131262873095309, + "learning_rate": 0.0007967954983374224, + "loss": 0.6124, + "step": 14495 + }, + { + "epoch": 0.736124684291353, + "grad_norm": 0.028079868427916613, + "learning_rate": 0.0007966171924525573, + "loss": 0.5802, + "step": 14500 + }, + { + "epoch": 0.7363785203893846, + "grad_norm": 0.03973060406341919, + "learning_rate": 0.0007964388283432165, + "loss": 0.5826, + "step": 14505 + }, + { + "epoch": 0.7366323564874161, + "grad_norm": 0.02430548491118239, + "learning_rate": 0.0007962604060444121, + "loss": 0.5866, + "step": 14510 + }, + { + "epoch": 0.7368861925854476, + "grad_norm": 0.02552835377781297, + "learning_rate": 0.0007960819255911673, + "loss": 0.5807, + "step": 14515 + }, + { + "epoch": 0.737140028683479, + "grad_norm": 0.030612793762513597, + "learning_rate": 0.0007959033870185173, + "loss": 0.5847, + "step": 14520 + }, + { + "epoch": 0.7373938647815106, + "grad_norm": 0.02107127051595995, + "learning_rate": 0.0007957247903615079, + "loss": 0.5667, + "step": 14525 + }, + { + "epoch": 0.7376477008795421, + "grad_norm": 0.022070086673001976, + "learning_rate": 0.0007955461356551971, + "loss": 0.5777, + "step": 14530 + }, + { + "epoch": 0.7379015369775735, + "grad_norm": 0.02546159830842789, + "learning_rate": 0.0007953674229346537, + "loss": 0.5899, + "step": 14535 + }, + { + "epoch": 0.7381553730756051, + "grad_norm": 0.039042598157433286, + "learning_rate": 0.000795188652234958, + "loss": 0.5747, + "step": 14540 + }, + { + "epoch": 0.7384092091736366, + "grad_norm": 0.2906439126670049, + "learning_rate": 0.0007950098235912021, + "loss": 0.6219, + "step": 14545 + }, + { + "epoch": 0.738663045271668, + "grad_norm": 0.07072338732370455, + "learning_rate": 0.0007948309370384891, + "loss": 0.5922, + "step": 14550 + }, + { + "epoch": 0.7389168813696996, + "grad_norm": 0.47514585488788236, + "learning_rate": 0.0007946519926119335, + "loss": 0.615, + "step": 14555 + }, + { + "epoch": 0.7391707174677311, + "grad_norm": 0.0648487542350592, + "learning_rate": 0.000794472990346661, + "loss": 0.5799, + "step": 14560 + }, + { + "epoch": 0.7394245535657625, + "grad_norm": 0.03408321166656662, + "learning_rate": 0.0007942939302778092, + "loss": 0.5847, + "step": 14565 + }, + { + "epoch": 0.7396783896637941, + "grad_norm": 0.04411672773665418, + "learning_rate": 0.0007941148124405264, + "loss": 0.6344, + "step": 14570 + }, + { + "epoch": 0.7399322257618256, + "grad_norm": 0.029167074032466867, + "learning_rate": 0.0007939356368699727, + "loss": 0.6158, + "step": 14575 + }, + { + "epoch": 0.7401860618598571, + "grad_norm": 0.030168988293000874, + "learning_rate": 0.0007937564036013194, + "loss": 0.5652, + "step": 14580 + }, + { + "epoch": 0.7404398979578886, + "grad_norm": 0.035815596751334416, + "learning_rate": 0.000793577112669749, + "loss": 0.5966, + "step": 14585 + }, + { + "epoch": 0.7406937340559201, + "grad_norm": 0.028868811480219827, + "learning_rate": 0.0007933977641104555, + "loss": 0.6047, + "step": 14590 + }, + { + "epoch": 0.7409475701539516, + "grad_norm": 0.031907618454550465, + "learning_rate": 0.000793218357958644, + "loss": 0.5645, + "step": 14595 + }, + { + "epoch": 0.741201406251983, + "grad_norm": 0.030941483646788018, + "learning_rate": 0.0007930388942495312, + "loss": 0.6008, + "step": 14600 + }, + { + "epoch": 0.7414552423500146, + "grad_norm": 0.026271292559618437, + "learning_rate": 0.0007928593730183447, + "loss": 0.5566, + "step": 14605 + }, + { + "epoch": 0.7417090784480461, + "grad_norm": 0.030445120700402395, + "learning_rate": 0.0007926797943003239, + "loss": 0.5926, + "step": 14610 + }, + { + "epoch": 0.7419629145460775, + "grad_norm": 0.02990094964372429, + "learning_rate": 0.0007925001581307189, + "loss": 0.5997, + "step": 14615 + }, + { + "epoch": 0.7422167506441091, + "grad_norm": 0.02511691268212505, + "learning_rate": 0.0007923204645447916, + "loss": 0.5804, + "step": 14620 + }, + { + "epoch": 0.7424705867421406, + "grad_norm": 0.041391672490102865, + "learning_rate": 0.0007921407135778151, + "loss": 0.5659, + "step": 14625 + }, + { + "epoch": 0.7427244228401721, + "grad_norm": 0.02995128050105831, + "learning_rate": 0.0007919609052650734, + "loss": 0.6081, + "step": 14630 + }, + { + "epoch": 0.7429782589382036, + "grad_norm": 0.03127151350046966, + "learning_rate": 0.0007917810396418618, + "loss": 0.5946, + "step": 14635 + }, + { + "epoch": 0.7432320950362351, + "grad_norm": 0.03606392489536054, + "learning_rate": 0.0007916011167434873, + "loss": 0.5931, + "step": 14640 + }, + { + "epoch": 0.7434859311342666, + "grad_norm": 0.037073684283635416, + "learning_rate": 0.000791421136605268, + "loss": 0.5968, + "step": 14645 + }, + { + "epoch": 0.7437397672322981, + "grad_norm": 0.04600230974128539, + "learning_rate": 0.0007912410992625326, + "loss": 0.6161, + "step": 14650 + }, + { + "epoch": 0.7439936033303296, + "grad_norm": 0.030237177490696085, + "learning_rate": 0.0007910610047506219, + "loss": 0.5662, + "step": 14655 + }, + { + "epoch": 0.7442474394283611, + "grad_norm": 0.04480838241032686, + "learning_rate": 0.0007908808531048876, + "loss": 0.6055, + "step": 14660 + }, + { + "epoch": 0.7445012755263926, + "grad_norm": 0.038523733334650756, + "learning_rate": 0.0007907006443606924, + "loss": 0.6029, + "step": 14665 + }, + { + "epoch": 0.7447551116244241, + "grad_norm": 0.03737018496192438, + "learning_rate": 0.0007905203785534104, + "loss": 0.6037, + "step": 14670 + }, + { + "epoch": 0.7450089477224556, + "grad_norm": 0.09121012906159448, + "learning_rate": 0.000790340055718427, + "loss": 0.5908, + "step": 14675 + }, + { + "epoch": 0.7452627838204872, + "grad_norm": 0.024868732926211504, + "learning_rate": 0.0007901596758911384, + "loss": 0.5974, + "step": 14680 + }, + { + "epoch": 0.7455166199185186, + "grad_norm": 0.029371473458118812, + "learning_rate": 0.0007899792391069527, + "loss": 0.5968, + "step": 14685 + }, + { + "epoch": 0.7457704560165501, + "grad_norm": 0.030092147058257405, + "learning_rate": 0.0007897987454012885, + "loss": 0.5867, + "step": 14690 + }, + { + "epoch": 0.7460242921145817, + "grad_norm": 0.06169934776691142, + "learning_rate": 0.0007896181948095755, + "loss": 0.6084, + "step": 14695 + }, + { + "epoch": 0.7462781282126131, + "grad_norm": 0.03499538883906122, + "learning_rate": 0.0007894375873672555, + "loss": 0.5695, + "step": 14700 + }, + { + "epoch": 0.7465319643106446, + "grad_norm": 0.03834546473926501, + "learning_rate": 0.0007892569231097804, + "loss": 0.6104, + "step": 14705 + }, + { + "epoch": 0.7467858004086761, + "grad_norm": 0.054730939898757484, + "learning_rate": 0.0007890762020726136, + "loss": 0.5776, + "step": 14710 + }, + { + "epoch": 0.7470396365067076, + "grad_norm": 0.024843870205877142, + "learning_rate": 0.0007888954242912303, + "loss": 0.5656, + "step": 14715 + }, + { + "epoch": 0.7472934726047391, + "grad_norm": 0.03349184814575679, + "learning_rate": 0.0007887145898011158, + "loss": 0.5901, + "step": 14720 + }, + { + "epoch": 0.7475473087027706, + "grad_norm": 0.049461573887305814, + "learning_rate": 0.0007885336986377671, + "loss": 0.5508, + "step": 14725 + }, + { + "epoch": 0.7478011448008022, + "grad_norm": 0.03552849919599442, + "learning_rate": 0.0007883527508366923, + "loss": 0.5951, + "step": 14730 + }, + { + "epoch": 0.7480549808988336, + "grad_norm": 0.028690443617235013, + "learning_rate": 0.0007881717464334104, + "loss": 0.5891, + "step": 14735 + }, + { + "epoch": 0.7483088169968651, + "grad_norm": 0.04167848811425972, + "learning_rate": 0.000787990685463452, + "loss": 0.562, + "step": 14740 + }, + { + "epoch": 0.7485626530948967, + "grad_norm": 0.05775833745744054, + "learning_rate": 0.000787809567962358, + "loss": 0.5688, + "step": 14745 + }, + { + "epoch": 0.7488164891929281, + "grad_norm": 0.026872351737841308, + "learning_rate": 0.0007876283939656814, + "loss": 0.5846, + "step": 14750 + }, + { + "epoch": 0.7490703252909596, + "grad_norm": 0.03924993939110344, + "learning_rate": 0.0007874471635089853, + "loss": 0.5622, + "step": 14755 + }, + { + "epoch": 0.7493241613889912, + "grad_norm": 0.04221861220551446, + "learning_rate": 0.0007872658766278444, + "loss": 0.5605, + "step": 14760 + }, + { + "epoch": 0.7495779974870226, + "grad_norm": 0.026884242563953744, + "learning_rate": 0.0007870845333578447, + "loss": 0.5434, + "step": 14765 + }, + { + "epoch": 0.7498318335850541, + "grad_norm": 0.05659339772085351, + "learning_rate": 0.0007869031337345828, + "loss": 0.5545, + "step": 14770 + }, + { + "epoch": 0.7500856696830857, + "grad_norm": 0.04676635774385785, + "learning_rate": 0.0007867216777936665, + "loss": 0.6038, + "step": 14775 + }, + { + "epoch": 0.7503395057811171, + "grad_norm": 0.03555192654015796, + "learning_rate": 0.0007865401655707148, + "loss": 0.628, + "step": 14780 + }, + { + "epoch": 0.7505933418791486, + "grad_norm": 0.02697674684520458, + "learning_rate": 0.0007863585971013574, + "loss": 0.6192, + "step": 14785 + }, + { + "epoch": 0.7508471779771801, + "grad_norm": 0.032422868044355145, + "learning_rate": 0.0007861769724212353, + "loss": 0.5789, + "step": 14790 + }, + { + "epoch": 0.7511010140752117, + "grad_norm": 0.0308503473942792, + "learning_rate": 0.0007859952915660009, + "loss": 0.6233, + "step": 14795 + }, + { + "epoch": 0.7513548501732431, + "grad_norm": 0.024092818055271237, + "learning_rate": 0.000785813554571317, + "loss": 0.5842, + "step": 14800 + }, + { + "epoch": 0.7516086862712746, + "grad_norm": 0.03344296842370167, + "learning_rate": 0.0007856317614728578, + "loss": 0.6261, + "step": 14805 + }, + { + "epoch": 0.7518625223693062, + "grad_norm": 0.037965967888487176, + "learning_rate": 0.0007854499123063081, + "loss": 0.5733, + "step": 14810 + }, + { + "epoch": 0.7521163584673376, + "grad_norm": 0.04191928838742156, + "learning_rate": 0.0007852680071073644, + "loss": 0.6117, + "step": 14815 + }, + { + "epoch": 0.7523701945653691, + "grad_norm": 0.04383694642505963, + "learning_rate": 0.0007850860459117332, + "loss": 0.5831, + "step": 14820 + }, + { + "epoch": 0.7526240306634007, + "grad_norm": 0.037614692321617, + "learning_rate": 0.0007849040287551332, + "loss": 0.5871, + "step": 14825 + }, + { + "epoch": 0.7528778667614321, + "grad_norm": 0.02516403676981507, + "learning_rate": 0.0007847219556732929, + "loss": 0.5927, + "step": 14830 + }, + { + "epoch": 0.7531317028594636, + "grad_norm": 0.041692616391361986, + "learning_rate": 0.0007845398267019528, + "loss": 0.6591, + "step": 14835 + }, + { + "epoch": 0.7533855389574952, + "grad_norm": 0.025964287691236976, + "learning_rate": 0.0007843576418768637, + "loss": 0.6005, + "step": 14840 + }, + { + "epoch": 0.7536393750555267, + "grad_norm": 0.03353359440550333, + "learning_rate": 0.0007841754012337876, + "loss": 0.579, + "step": 14845 + }, + { + "epoch": 0.7538932111535581, + "grad_norm": 0.055412308005210985, + "learning_rate": 0.0007839931048084971, + "loss": 0.6126, + "step": 14850 + }, + { + "epoch": 0.7541470472515897, + "grad_norm": 0.04895600693106303, + "learning_rate": 0.0007838107526367768, + "loss": 0.62, + "step": 14855 + }, + { + "epoch": 0.7544008833496212, + "grad_norm": 0.03477352614622541, + "learning_rate": 0.0007836283447544211, + "loss": 0.585, + "step": 14860 + }, + { + "epoch": 0.7546547194476526, + "grad_norm": 0.03588526651863759, + "learning_rate": 0.0007834458811972356, + "loss": 0.5851, + "step": 14865 + }, + { + "epoch": 0.7549085555456841, + "grad_norm": 0.05624481095210902, + "learning_rate": 0.0007832633620010372, + "loss": 0.6079, + "step": 14870 + }, + { + "epoch": 0.7551623916437157, + "grad_norm": 0.04357936006581149, + "learning_rate": 0.0007830807872016536, + "loss": 0.6187, + "step": 14875 + }, + { + "epoch": 0.7554162277417471, + "grad_norm": 0.040083961110080835, + "learning_rate": 0.000782898156834923, + "loss": 0.6431, + "step": 14880 + }, + { + "epoch": 0.7556700638397786, + "grad_norm": 0.07764594003089959, + "learning_rate": 0.000782715470936695, + "loss": 0.5915, + "step": 14885 + }, + { + "epoch": 0.7559238999378102, + "grad_norm": 0.030985982015469513, + "learning_rate": 0.0007825327295428302, + "loss": 0.5931, + "step": 14890 + }, + { + "epoch": 0.7561777360358417, + "grad_norm": 0.029912753853148843, + "learning_rate": 0.0007823499326891994, + "loss": 0.6124, + "step": 14895 + }, + { + "epoch": 0.7564315721338731, + "grad_norm": 0.028550336668593645, + "learning_rate": 0.000782167080411685, + "loss": 0.5834, + "step": 14900 + }, + { + "epoch": 0.7566854082319047, + "grad_norm": 0.028003935514170488, + "learning_rate": 0.0007819841727461798, + "loss": 0.6129, + "step": 14905 + }, + { + "epoch": 0.7569392443299362, + "grad_norm": 0.030636633612124004, + "learning_rate": 0.0007818012097285876, + "loss": 0.6299, + "step": 14910 + }, + { + "epoch": 0.7571930804279676, + "grad_norm": 0.05006422454029914, + "learning_rate": 0.0007816181913948235, + "loss": 0.6086, + "step": 14915 + }, + { + "epoch": 0.7574469165259992, + "grad_norm": 0.026143934888707424, + "learning_rate": 0.0007814351177808128, + "loss": 0.6139, + "step": 14920 + }, + { + "epoch": 0.7577007526240307, + "grad_norm": 1.2578760148102972, + "learning_rate": 0.000781251988922492, + "loss": 0.8194, + "step": 14925 + }, + { + "epoch": 0.7579545887220621, + "grad_norm": 0.10182912331451577, + "learning_rate": 0.0007810688048558083, + "loss": 0.6552, + "step": 14930 + }, + { + "epoch": 0.7582084248200937, + "grad_norm": 0.1212458030523509, + "learning_rate": 0.00078088556561672, + "loss": 0.6837, + "step": 14935 + }, + { + "epoch": 0.7584622609181252, + "grad_norm": 0.10362361037228977, + "learning_rate": 0.0007807022712411957, + "loss": 0.6756, + "step": 14940 + }, + { + "epoch": 0.7587160970161567, + "grad_norm": 0.04440285687666505, + "learning_rate": 0.0007805189217652158, + "loss": 0.6408, + "step": 14945 + }, + { + "epoch": 0.7589699331141881, + "grad_norm": 0.05277231543250202, + "learning_rate": 0.0007803355172247702, + "loss": 0.6197, + "step": 14950 + }, + { + "epoch": 0.7592237692122197, + "grad_norm": 0.031231568656463315, + "learning_rate": 0.0007801520576558608, + "loss": 0.6298, + "step": 14955 + }, + { + "epoch": 0.7594776053102512, + "grad_norm": 0.0345196918831133, + "learning_rate": 0.0007799685430944995, + "loss": 0.5926, + "step": 14960 + }, + { + "epoch": 0.7597314414082826, + "grad_norm": 0.03315835092152511, + "learning_rate": 0.0007797849735767094, + "loss": 0.5859, + "step": 14965 + }, + { + "epoch": 0.7599852775063142, + "grad_norm": 0.05487099815278874, + "learning_rate": 0.0007796013491385243, + "loss": 0.6047, + "step": 14970 + }, + { + "epoch": 0.7602391136043457, + "grad_norm": 0.0305550620003498, + "learning_rate": 0.0007794176698159887, + "loss": 0.5898, + "step": 14975 + }, + { + "epoch": 0.7604929497023771, + "grad_norm": 0.03492544688949025, + "learning_rate": 0.000779233935645158, + "loss": 0.6285, + "step": 14980 + }, + { + "epoch": 0.7607467858004087, + "grad_norm": 0.03623180099708256, + "learning_rate": 0.0007790501466620983, + "loss": 0.6035, + "step": 14985 + }, + { + "epoch": 0.7610006218984402, + "grad_norm": 0.04180008581977664, + "learning_rate": 0.0007788663029028863, + "loss": 0.5536, + "step": 14990 + }, + { + "epoch": 0.7612544579964716, + "grad_norm": 0.030734490730055696, + "learning_rate": 0.0007786824044036098, + "loss": 0.5731, + "step": 14995 + }, + { + "epoch": 0.7615082940945032, + "grad_norm": 0.055420433682433455, + "learning_rate": 0.0007784984512003671, + "loss": 0.6263, + "step": 15000 + }, + { + "epoch": 0.7617621301925347, + "grad_norm": 0.02941885467547894, + "learning_rate": 0.0007783144433292673, + "loss": 0.6284, + "step": 15005 + }, + { + "epoch": 0.7620159662905662, + "grad_norm": 0.033206110423357985, + "learning_rate": 0.0007781303808264303, + "loss": 0.5358, + "step": 15010 + }, + { + "epoch": 0.7622698023885977, + "grad_norm": 0.02625796801377728, + "learning_rate": 0.0007779462637279865, + "loss": 0.5969, + "step": 15015 + }, + { + "epoch": 0.7625236384866292, + "grad_norm": 0.29298031514322703, + "learning_rate": 0.0007777620920700773, + "loss": 0.5957, + "step": 15020 + }, + { + "epoch": 0.7627774745846607, + "grad_norm": 0.04086600855995801, + "learning_rate": 0.0007775778658888546, + "loss": 0.6322, + "step": 15025 + }, + { + "epoch": 0.7630313106826921, + "grad_norm": 0.039401332495816975, + "learning_rate": 0.000777393585220481, + "loss": 0.5687, + "step": 15030 + }, + { + "epoch": 0.7632851467807237, + "grad_norm": 0.026702482139940326, + "learning_rate": 0.0007772092501011301, + "loss": 0.6218, + "step": 15035 + }, + { + "epoch": 0.7635389828787552, + "grad_norm": 0.050236563570187805, + "learning_rate": 0.0007770248605669858, + "loss": 0.5861, + "step": 15040 + }, + { + "epoch": 0.7637928189767866, + "grad_norm": 0.02874533178510391, + "learning_rate": 0.0007768404166542431, + "loss": 0.5967, + "step": 15045 + }, + { + "epoch": 0.7640466550748182, + "grad_norm": 0.02753978513413307, + "learning_rate": 0.000776655918399107, + "loss": 0.5802, + "step": 15050 + }, + { + "epoch": 0.7643004911728497, + "grad_norm": 0.037996461348599846, + "learning_rate": 0.0007764713658377938, + "loss": 0.5915, + "step": 15055 + }, + { + "epoch": 0.7645543272708812, + "grad_norm": 0.06312266025775402, + "learning_rate": 0.0007762867590065302, + "loss": 0.5816, + "step": 15060 + }, + { + "epoch": 0.7648081633689127, + "grad_norm": 0.04647511323225117, + "learning_rate": 0.0007761020979415537, + "loss": 0.6081, + "step": 15065 + }, + { + "epoch": 0.7650619994669442, + "grad_norm": 0.0350344173940654, + "learning_rate": 0.0007759173826791123, + "loss": 0.5988, + "step": 15070 + }, + { + "epoch": 0.7653158355649757, + "grad_norm": 0.026503015783973544, + "learning_rate": 0.0007757326132554648, + "loss": 0.5924, + "step": 15075 + }, + { + "epoch": 0.7655696716630072, + "grad_norm": 0.03832259438303605, + "learning_rate": 0.0007755477897068803, + "loss": 0.6175, + "step": 15080 + }, + { + "epoch": 0.7658235077610387, + "grad_norm": 0.09757514516462057, + "learning_rate": 0.0007753629120696388, + "loss": 0.6075, + "step": 15085 + }, + { + "epoch": 0.7660773438590702, + "grad_norm": 0.05636880292552427, + "learning_rate": 0.000775177980380031, + "loss": 0.5902, + "step": 15090 + }, + { + "epoch": 0.7663311799571016, + "grad_norm": 0.044508398900761414, + "learning_rate": 0.0007749929946743578, + "loss": 0.6134, + "step": 15095 + }, + { + "epoch": 0.7665850160551332, + "grad_norm": 0.02889076972000296, + "learning_rate": 0.0007748079549889312, + "loss": 0.5796, + "step": 15100 + }, + { + "epoch": 0.7668388521531647, + "grad_norm": 0.038063937210235835, + "learning_rate": 0.0007746228613600735, + "loss": 0.5813, + "step": 15105 + }, + { + "epoch": 0.7670926882511963, + "grad_norm": 0.09708033315517485, + "learning_rate": 0.0007744377138241177, + "loss": 0.5844, + "step": 15110 + }, + { + "epoch": 0.7673465243492277, + "grad_norm": 0.06930206252274214, + "learning_rate": 0.0007742525124174073, + "loss": 0.6186, + "step": 15115 + }, + { + "epoch": 0.7676003604472592, + "grad_norm": 0.03248594459557167, + "learning_rate": 0.0007740672571762963, + "loss": 0.6108, + "step": 15120 + }, + { + "epoch": 0.7678541965452907, + "grad_norm": 0.0358203234252936, + "learning_rate": 0.0007738819481371495, + "loss": 0.5629, + "step": 15125 + }, + { + "epoch": 0.7681080326433222, + "grad_norm": 0.0446450825233111, + "learning_rate": 0.0007736965853363423, + "loss": 0.5974, + "step": 15130 + }, + { + "epoch": 0.7683618687413537, + "grad_norm": 0.03164679167642612, + "learning_rate": 0.0007735111688102602, + "loss": 0.6547, + "step": 15135 + }, + { + "epoch": 0.7686157048393852, + "grad_norm": 0.03064186972892073, + "learning_rate": 0.0007733256985952997, + "loss": 0.6022, + "step": 15140 + }, + { + "epoch": 0.7688695409374167, + "grad_norm": 0.34573913673428947, + "learning_rate": 0.0007731401747278676, + "loss": 0.5827, + "step": 15145 + }, + { + "epoch": 0.7691233770354482, + "grad_norm": 0.13966486620739513, + "learning_rate": 0.0007729545972443812, + "loss": 0.6077, + "step": 15150 + }, + { + "epoch": 0.7693772131334797, + "grad_norm": 0.029192487039169105, + "learning_rate": 0.000772768966181269, + "loss": 0.5599, + "step": 15155 + }, + { + "epoch": 0.7696310492315113, + "grad_norm": 0.03124263927843463, + "learning_rate": 0.0007725832815749686, + "loss": 0.6285, + "step": 15160 + }, + { + "epoch": 0.7698848853295427, + "grad_norm": 0.024326930348444097, + "learning_rate": 0.0007723975434619296, + "loss": 0.5531, + "step": 15165 + }, + { + "epoch": 0.7701387214275742, + "grad_norm": 0.02512413491231233, + "learning_rate": 0.0007722117518786112, + "loss": 0.5537, + "step": 15170 + }, + { + "epoch": 0.7703925575256058, + "grad_norm": 0.031772939647128885, + "learning_rate": 0.0007720259068614836, + "loss": 0.6025, + "step": 15175 + }, + { + "epoch": 0.7706463936236372, + "grad_norm": 0.043947294185195836, + "learning_rate": 0.0007718400084470267, + "loss": 0.6034, + "step": 15180 + }, + { + "epoch": 0.7709002297216687, + "grad_norm": 0.03222419275981874, + "learning_rate": 0.0007716540566717321, + "loss": 0.5981, + "step": 15185 + }, + { + "epoch": 0.7711540658197003, + "grad_norm": 0.03819007335282729, + "learning_rate": 0.0007714680515721008, + "loss": 0.5963, + "step": 15190 + }, + { + "epoch": 0.7714079019177317, + "grad_norm": 0.04093973435288178, + "learning_rate": 0.0007712819931846448, + "loss": 0.613, + "step": 15195 + }, + { + "epoch": 0.7716617380157632, + "grad_norm": 0.03695759978176583, + "learning_rate": 0.0007710958815458866, + "loss": 0.5745, + "step": 15200 + }, + { + "epoch": 0.7719155741137947, + "grad_norm": 0.027960391790393533, + "learning_rate": 0.0007709097166923586, + "loss": 0.5462, + "step": 15205 + }, + { + "epoch": 0.7721694102118263, + "grad_norm": 0.05757494108734914, + "learning_rate": 0.0007707234986606043, + "loss": 0.615, + "step": 15210 + }, + { + "epoch": 0.7724232463098577, + "grad_norm": 0.04085819775177726, + "learning_rate": 0.0007705372274871774, + "loss": 0.61, + "step": 15215 + }, + { + "epoch": 0.7726770824078892, + "grad_norm": 0.04422316744214125, + "learning_rate": 0.0007703509032086417, + "loss": 0.5625, + "step": 15220 + }, + { + "epoch": 0.7729309185059208, + "grad_norm": 0.029609165102383688, + "learning_rate": 0.0007701645258615721, + "loss": 0.5884, + "step": 15225 + }, + { + "epoch": 0.7731847546039522, + "grad_norm": 0.025139781930555237, + "learning_rate": 0.0007699780954825534, + "loss": 0.599, + "step": 15230 + }, + { + "epoch": 0.7734385907019837, + "grad_norm": 0.02759229289936248, + "learning_rate": 0.0007697916121081809, + "loss": 0.5581, + "step": 15235 + }, + { + "epoch": 0.7736924268000153, + "grad_norm": 0.022703912785005373, + "learning_rate": 0.0007696050757750603, + "loss": 0.5604, + "step": 15240 + }, + { + "epoch": 0.7739462628980467, + "grad_norm": 0.026623783300295627, + "learning_rate": 0.000769418486519808, + "loss": 0.5961, + "step": 15245 + }, + { + "epoch": 0.7742000989960782, + "grad_norm": 0.026127263841029254, + "learning_rate": 0.0007692318443790503, + "loss": 0.5585, + "step": 15250 + }, + { + "epoch": 0.7744539350941098, + "grad_norm": 0.02569593152061675, + "learning_rate": 0.0007690451493894241, + "loss": 0.6041, + "step": 15255 + }, + { + "epoch": 0.7747077711921412, + "grad_norm": 0.0256631736374656, + "learning_rate": 0.0007688584015875769, + "loss": 0.6003, + "step": 15260 + }, + { + "epoch": 0.7749616072901727, + "grad_norm": 0.03236587398767528, + "learning_rate": 0.0007686716010101663, + "loss": 0.642, + "step": 15265 + }, + { + "epoch": 0.7752154433882043, + "grad_norm": 0.03721845153105975, + "learning_rate": 0.0007684847476938601, + "loss": 0.5819, + "step": 15270 + }, + { + "epoch": 0.7754692794862358, + "grad_norm": 0.028027879120731618, + "learning_rate": 0.0007682978416753371, + "loss": 0.5795, + "step": 15275 + }, + { + "epoch": 0.7757231155842672, + "grad_norm": 0.02737803847928579, + "learning_rate": 0.0007681108829912857, + "loss": 0.5746, + "step": 15280 + }, + { + "epoch": 0.7759769516822987, + "grad_norm": 0.027303438264093486, + "learning_rate": 0.0007679238716784049, + "loss": 0.8898, + "step": 15285 + }, + { + "epoch": 0.7762307877803303, + "grad_norm": 0.036685326663591804, + "learning_rate": 0.0007677368077734045, + "loss": 0.5854, + "step": 15290 + }, + { + "epoch": 0.7764846238783617, + "grad_norm": 0.1602053406884727, + "learning_rate": 0.0007675496913130038, + "loss": 0.6604, + "step": 15295 + }, + { + "epoch": 0.7767384599763932, + "grad_norm": 0.04847230882404775, + "learning_rate": 0.0007673625223339329, + "loss": 0.623, + "step": 15300 + }, + { + "epoch": 0.7769922960744248, + "grad_norm": 0.05089648219518116, + "learning_rate": 0.0007671753008729323, + "loss": 0.6436, + "step": 15305 + }, + { + "epoch": 0.7772461321724562, + "grad_norm": 0.07145564787304685, + "learning_rate": 0.0007669880269667524, + "loss": 0.5677, + "step": 15310 + }, + { + "epoch": 0.7774999682704877, + "grad_norm": 0.058442914245472245, + "learning_rate": 0.0007668007006521544, + "loss": 0.6102, + "step": 15315 + }, + { + "epoch": 0.7777538043685193, + "grad_norm": 0.04062643001361585, + "learning_rate": 0.0007666133219659094, + "loss": 0.5994, + "step": 15320 + }, + { + "epoch": 0.7780076404665508, + "grad_norm": 0.04022904195831844, + "learning_rate": 0.0007664258909447989, + "loss": 0.6411, + "step": 15325 + }, + { + "epoch": 0.7782614765645822, + "grad_norm": 0.02659845024976935, + "learning_rate": 0.0007662384076256146, + "loss": 0.6142, + "step": 15330 + }, + { + "epoch": 0.7785153126626138, + "grad_norm": 0.04785803634447859, + "learning_rate": 0.0007660508720451585, + "loss": 0.6396, + "step": 15335 + }, + { + "epoch": 0.7787691487606453, + "grad_norm": 0.061095944021831515, + "learning_rate": 0.0007658632842402432, + "loss": 0.5819, + "step": 15340 + }, + { + "epoch": 0.7790229848586767, + "grad_norm": 0.030130348283759538, + "learning_rate": 0.0007656756442476911, + "loss": 0.5868, + "step": 15345 + }, + { + "epoch": 0.7792768209567082, + "grad_norm": 0.028351823717474526, + "learning_rate": 0.0007654879521043347, + "loss": 0.5797, + "step": 15350 + }, + { + "epoch": 0.7795306570547398, + "grad_norm": 0.050575288848887096, + "learning_rate": 0.0007653002078470175, + "loss": 0.6096, + "step": 15355 + }, + { + "epoch": 0.7797844931527712, + "grad_norm": 0.05292190850284731, + "learning_rate": 0.0007651124115125924, + "loss": 0.5891, + "step": 15360 + }, + { + "epoch": 0.7800383292508027, + "grad_norm": 0.03779146555526197, + "learning_rate": 0.0007649245631379232, + "loss": 0.5974, + "step": 15365 + }, + { + "epoch": 0.7802921653488343, + "grad_norm": 0.03100086787490601, + "learning_rate": 0.0007647366627598835, + "loss": 0.566, + "step": 15370 + }, + { + "epoch": 0.7805460014468658, + "grad_norm": 0.030255527750173147, + "learning_rate": 0.0007645487104153568, + "loss": 0.6128, + "step": 15375 + }, + { + "epoch": 0.7807998375448972, + "grad_norm": 0.027165593937075252, + "learning_rate": 0.0007643607061412379, + "loss": 0.5686, + "step": 15380 + }, + { + "epoch": 0.7810536736429288, + "grad_norm": 0.02824615720081996, + "learning_rate": 0.0007641726499744306, + "loss": 0.582, + "step": 15385 + }, + { + "epoch": 0.7813075097409603, + "grad_norm": 0.04126298838740098, + "learning_rate": 0.0007639845419518494, + "loss": 0.6027, + "step": 15390 + }, + { + "epoch": 0.7815613458389917, + "grad_norm": 0.032036059387028706, + "learning_rate": 0.0007637963821104192, + "loss": 0.5775, + "step": 15395 + }, + { + "epoch": 0.7818151819370233, + "grad_norm": 0.028486079119276198, + "learning_rate": 0.0007636081704870749, + "loss": 0.5682, + "step": 15400 + }, + { + "epoch": 0.7820690180350548, + "grad_norm": 0.024905654752730703, + "learning_rate": 0.0007634199071187613, + "loss": 0.5981, + "step": 15405 + }, + { + "epoch": 0.7823228541330862, + "grad_norm": 0.17057789658969538, + "learning_rate": 0.0007632315920424335, + "loss": 0.5801, + "step": 15410 + }, + { + "epoch": 0.7825766902311178, + "grad_norm": 0.03659353032866976, + "learning_rate": 0.000763043225295057, + "loss": 0.5597, + "step": 15415 + }, + { + "epoch": 0.7828305263291493, + "grad_norm": 0.026725388304567464, + "learning_rate": 0.0007628548069136071, + "loss": 0.5931, + "step": 15420 + }, + { + "epoch": 0.7830843624271808, + "grad_norm": 0.037625318866698565, + "learning_rate": 0.0007626663369350695, + "loss": 0.538, + "step": 15425 + }, + { + "epoch": 0.7833381985252122, + "grad_norm": 0.03956094460634958, + "learning_rate": 0.0007624778153964398, + "loss": 0.5758, + "step": 15430 + }, + { + "epoch": 0.7835920346232438, + "grad_norm": 0.02719027610400642, + "learning_rate": 0.0007622892423347241, + "loss": 0.6001, + "step": 15435 + }, + { + "epoch": 0.7838458707212753, + "grad_norm": 0.03100928874647277, + "learning_rate": 0.000762100617786938, + "loss": 0.6138, + "step": 15440 + }, + { + "epoch": 0.7840997068193067, + "grad_norm": 0.027836970066281757, + "learning_rate": 0.0007619119417901077, + "loss": 0.6017, + "step": 15445 + }, + { + "epoch": 0.7843535429173383, + "grad_norm": 0.03469260516526115, + "learning_rate": 0.0007617232143812693, + "loss": 0.5756, + "step": 15450 + }, + { + "epoch": 0.7846073790153698, + "grad_norm": 0.03935777037891353, + "learning_rate": 0.0007615344355974694, + "loss": 0.595, + "step": 15455 + }, + { + "epoch": 0.7848612151134012, + "grad_norm": 0.02501991833734147, + "learning_rate": 0.0007613456054757639, + "loss": 0.6073, + "step": 15460 + }, + { + "epoch": 0.7851150512114328, + "grad_norm": 0.02333012899999052, + "learning_rate": 0.0007611567240532193, + "loss": 0.5868, + "step": 15465 + }, + { + "epoch": 0.7853688873094643, + "grad_norm": 0.03818225366150291, + "learning_rate": 0.0007609677913669124, + "loss": 0.599, + "step": 15470 + }, + { + "epoch": 0.7856227234074957, + "grad_norm": 0.028704075914218637, + "learning_rate": 0.0007607788074539293, + "loss": 0.5973, + "step": 15475 + }, + { + "epoch": 0.7858765595055273, + "grad_norm": 0.021928136458700793, + "learning_rate": 0.0007605897723513669, + "loss": 0.593, + "step": 15480 + }, + { + "epoch": 0.7861303956035588, + "grad_norm": 0.025069154514387224, + "learning_rate": 0.0007604006860963315, + "loss": 0.5762, + "step": 15485 + }, + { + "epoch": 0.7863842317015903, + "grad_norm": 0.029790848879588383, + "learning_rate": 0.0007602115487259403, + "loss": 0.5952, + "step": 15490 + }, + { + "epoch": 0.7866380677996218, + "grad_norm": 0.030994892822406125, + "learning_rate": 0.0007600223602773198, + "loss": 0.6024, + "step": 15495 + }, + { + "epoch": 0.7868919038976533, + "grad_norm": 0.028391213579639876, + "learning_rate": 0.0007598331207876066, + "loss": 0.58, + "step": 15500 + }, + { + "epoch": 0.7871457399956848, + "grad_norm": 0.04047000777595839, + "learning_rate": 0.0007596438302939475, + "loss": 0.5813, + "step": 15505 + }, + { + "epoch": 0.7873995760937162, + "grad_norm": 0.032259318761633765, + "learning_rate": 0.0007594544888334994, + "loss": 0.583, + "step": 15510 + }, + { + "epoch": 0.7876534121917478, + "grad_norm": 0.036383271252755846, + "learning_rate": 0.0007592650964434292, + "loss": 0.6082, + "step": 15515 + }, + { + "epoch": 0.7879072482897793, + "grad_norm": 0.046019554411827555, + "learning_rate": 0.0007590756531609133, + "loss": 0.6063, + "step": 15520 + }, + { + "epoch": 0.7881610843878107, + "grad_norm": 0.027578051274795613, + "learning_rate": 0.0007588861590231388, + "loss": 0.5804, + "step": 15525 + }, + { + "epoch": 0.7884149204858423, + "grad_norm": 0.030164791483657458, + "learning_rate": 0.0007586966140673024, + "loss": 0.5828, + "step": 15530 + }, + { + "epoch": 0.7886687565838738, + "grad_norm": 0.029268754849734627, + "learning_rate": 0.0007585070183306106, + "loss": 0.5624, + "step": 15535 + }, + { + "epoch": 0.7889225926819053, + "grad_norm": 0.026293524029202597, + "learning_rate": 0.0007583173718502803, + "loss": 0.5694, + "step": 15540 + }, + { + "epoch": 0.7891764287799368, + "grad_norm": 0.03225939541940473, + "learning_rate": 0.0007581276746635383, + "loss": 0.5923, + "step": 15545 + }, + { + "epoch": 0.7894302648779683, + "grad_norm": 0.02421244006898213, + "learning_rate": 0.000757937926807621, + "loss": 0.5453, + "step": 15550 + }, + { + "epoch": 0.7896841009759998, + "grad_norm": 0.03888331896268403, + "learning_rate": 0.0007577481283197749, + "loss": 0.6002, + "step": 15555 + }, + { + "epoch": 0.7899379370740313, + "grad_norm": 0.031178413757377548, + "learning_rate": 0.0007575582792372567, + "loss": 0.5885, + "step": 15560 + }, + { + "epoch": 0.7901917731720628, + "grad_norm": 0.02453845415254218, + "learning_rate": 0.0007573683795973328, + "loss": 0.5623, + "step": 15565 + }, + { + "epoch": 0.7904456092700943, + "grad_norm": 0.03542656617397271, + "learning_rate": 0.0007571784294372792, + "loss": 0.6026, + "step": 15570 + }, + { + "epoch": 0.7906994453681258, + "grad_norm": 0.024593970240234127, + "learning_rate": 0.0007569884287943826, + "loss": 0.5946, + "step": 15575 + }, + { + "epoch": 0.7909532814661573, + "grad_norm": 0.034196088349742554, + "learning_rate": 0.000756798377705939, + "loss": 0.6394, + "step": 15580 + }, + { + "epoch": 0.7912071175641888, + "grad_norm": 0.03593770916178315, + "learning_rate": 0.0007566082762092546, + "loss": 0.6134, + "step": 15585 + }, + { + "epoch": 0.7914609536622204, + "grad_norm": 0.02744220052589826, + "learning_rate": 0.0007564181243416453, + "loss": 0.574, + "step": 15590 + }, + { + "epoch": 0.7917147897602518, + "grad_norm": 0.04256423578944382, + "learning_rate": 0.0007562279221404368, + "loss": 0.5861, + "step": 15595 + }, + { + "epoch": 0.7919686258582833, + "grad_norm": 0.036644059764111316, + "learning_rate": 0.0007560376696429651, + "loss": 0.5489, + "step": 15600 + }, + { + "epoch": 0.7922224619563148, + "grad_norm": 0.03260114588100312, + "learning_rate": 0.0007558473668865755, + "loss": 0.5637, + "step": 15605 + }, + { + "epoch": 0.7924762980543463, + "grad_norm": 0.024808409878328188, + "learning_rate": 0.0007556570139086239, + "loss": 0.593, + "step": 15610 + }, + { + "epoch": 0.7927301341523778, + "grad_norm": 0.04023189432619606, + "learning_rate": 0.0007554666107464754, + "loss": 0.5664, + "step": 15615 + }, + { + "epoch": 0.7929839702504093, + "grad_norm": 0.02342950434065168, + "learning_rate": 0.0007552761574375052, + "loss": 0.5895, + "step": 15620 + }, + { + "epoch": 0.7932378063484408, + "grad_norm": 0.03876234729699636, + "learning_rate": 0.0007550856540190985, + "loss": 0.5723, + "step": 15625 + }, + { + "epoch": 0.7934916424464723, + "grad_norm": 0.023809875697390476, + "learning_rate": 0.0007548951005286498, + "loss": 0.5897, + "step": 15630 + }, + { + "epoch": 0.7937454785445038, + "grad_norm": 0.025531257953473226, + "learning_rate": 0.0007547044970035641, + "loss": 0.5728, + "step": 15635 + }, + { + "epoch": 0.7939993146425354, + "grad_norm": 0.023549442711572447, + "learning_rate": 0.0007545138434812559, + "loss": 0.5479, + "step": 15640 + }, + { + "epoch": 0.7942531507405668, + "grad_norm": 0.03393140892826125, + "learning_rate": 0.0007543231399991495, + "loss": 0.591, + "step": 15645 + }, + { + "epoch": 0.7945069868385983, + "grad_norm": 0.026850249760398237, + "learning_rate": 0.0007541323865946789, + "loss": 0.5756, + "step": 15650 + }, + { + "epoch": 0.7947608229366299, + "grad_norm": 0.02644298213479883, + "learning_rate": 0.0007539415833052882, + "loss": 0.6027, + "step": 15655 + }, + { + "epoch": 0.7950146590346613, + "grad_norm": 1.7634760340274238, + "learning_rate": 0.0007537507301684312, + "loss": 0.6009, + "step": 15660 + }, + { + "epoch": 0.7952684951326928, + "grad_norm": 0.05116113095501194, + "learning_rate": 0.0007535598272215712, + "loss": 0.6035, + "step": 15665 + }, + { + "epoch": 0.7955223312307244, + "grad_norm": 0.09123746164702429, + "learning_rate": 0.0007533688745021817, + "loss": 0.5869, + "step": 15670 + }, + { + "epoch": 0.7957761673287558, + "grad_norm": 0.04008705946671749, + "learning_rate": 0.0007531778720477457, + "loss": 0.6197, + "step": 15675 + }, + { + "epoch": 0.7960300034267873, + "grad_norm": 0.2577476743899855, + "learning_rate": 0.000752986819895756, + "loss": 0.6145, + "step": 15680 + }, + { + "epoch": 0.7962838395248188, + "grad_norm": 0.04408392445254025, + "learning_rate": 0.0007527957180837152, + "loss": 0.6326, + "step": 15685 + }, + { + "epoch": 0.7965376756228503, + "grad_norm": 0.03340637676201848, + "learning_rate": 0.0007526045666491355, + "loss": 0.6089, + "step": 15690 + }, + { + "epoch": 0.7967915117208818, + "grad_norm": 0.03220967045008419, + "learning_rate": 0.0007524133656295392, + "loss": 0.5869, + "step": 15695 + }, + { + "epoch": 0.7970453478189133, + "grad_norm": 0.03674143282021481, + "learning_rate": 0.0007522221150624579, + "loss": 0.6527, + "step": 15700 + }, + { + "epoch": 0.7972991839169449, + "grad_norm": 0.02344499359362633, + "learning_rate": 0.0007520308149854336, + "loss": 0.5767, + "step": 15705 + }, + { + "epoch": 0.7975530200149763, + "grad_norm": 0.030359982821470545, + "learning_rate": 0.0007518394654360169, + "loss": 0.5843, + "step": 15710 + }, + { + "epoch": 0.7978068561130078, + "grad_norm": 0.03557247505749754, + "learning_rate": 0.000751648066451769, + "loss": 0.608, + "step": 15715 + }, + { + "epoch": 0.7980606922110394, + "grad_norm": 0.027894981110671378, + "learning_rate": 0.0007514566180702609, + "loss": 0.5726, + "step": 15720 + }, + { + "epoch": 0.7983145283090708, + "grad_norm": 0.03575658449386519, + "learning_rate": 0.0007512651203290723, + "loss": 0.616, + "step": 15725 + }, + { + "epoch": 0.7985683644071023, + "grad_norm": 0.031795758337900826, + "learning_rate": 0.000751073573265794, + "loss": 0.5772, + "step": 15730 + }, + { + "epoch": 0.7988222005051339, + "grad_norm": 0.03329640143386616, + "learning_rate": 0.0007508819769180252, + "loss": 0.5722, + "step": 15735 + }, + { + "epoch": 0.7990760366031653, + "grad_norm": 0.028561079497327932, + "learning_rate": 0.0007506903313233755, + "loss": 0.5843, + "step": 15740 + }, + { + "epoch": 0.7993298727011968, + "grad_norm": 0.03499905510867631, + "learning_rate": 0.0007504986365194639, + "loss": 0.5441, + "step": 15745 + }, + { + "epoch": 0.7995837087992284, + "grad_norm": 0.02584998094099768, + "learning_rate": 0.0007503068925439194, + "loss": 0.5588, + "step": 15750 + }, + { + "epoch": 0.7998375448972599, + "grad_norm": 0.02484740511899366, + "learning_rate": 0.00075011509943438, + "loss": 0.5439, + "step": 15755 + }, + { + "epoch": 0.8000913809952913, + "grad_norm": 0.05001793472695475, + "learning_rate": 0.0007499232572284938, + "loss": 0.5912, + "step": 15760 + }, + { + "epoch": 0.8003452170933228, + "grad_norm": 0.02822375304428738, + "learning_rate": 0.0007497313659639188, + "loss": 0.5858, + "step": 15765 + }, + { + "epoch": 0.8005990531913544, + "grad_norm": 0.025776424090071148, + "learning_rate": 0.0007495394256783219, + "loss": 0.5725, + "step": 15770 + }, + { + "epoch": 0.8008528892893858, + "grad_norm": 0.03319633421805169, + "learning_rate": 0.0007493474364093803, + "loss": 0.5897, + "step": 15775 + }, + { + "epoch": 0.8011067253874173, + "grad_norm": 0.04258494801205897, + "learning_rate": 0.0007491553981947804, + "loss": 0.5431, + "step": 15780 + }, + { + "epoch": 0.8013605614854489, + "grad_norm": 0.028145407612735995, + "learning_rate": 0.0007489633110722183, + "loss": 0.5549, + "step": 15785 + }, + { + "epoch": 0.8016143975834803, + "grad_norm": 0.026623484939927906, + "learning_rate": 0.0007487711750793998, + "loss": 0.595, + "step": 15790 + }, + { + "epoch": 0.8018682336815118, + "grad_norm": 0.022934168464128556, + "learning_rate": 0.0007485789902540403, + "loss": 0.5527, + "step": 15795 + }, + { + "epoch": 0.8021220697795434, + "grad_norm": 0.025252927285032714, + "learning_rate": 0.0007483867566338647, + "loss": 0.5876, + "step": 15800 + }, + { + "epoch": 0.8023759058775749, + "grad_norm": 0.028849244701322482, + "learning_rate": 0.0007481944742566076, + "loss": 0.5715, + "step": 15805 + }, + { + "epoch": 0.8026297419756063, + "grad_norm": 0.026220259487405768, + "learning_rate": 0.0007480021431600128, + "loss": 0.6162, + "step": 15810 + }, + { + "epoch": 0.8028835780736379, + "grad_norm": 0.021106176901375888, + "learning_rate": 0.000747809763381834, + "loss": 0.5713, + "step": 15815 + }, + { + "epoch": 0.8031374141716694, + "grad_norm": 0.02762824758071424, + "learning_rate": 0.0007476173349598345, + "loss": 0.5849, + "step": 15820 + }, + { + "epoch": 0.8033912502697008, + "grad_norm": 0.05745838787319821, + "learning_rate": 0.000747424857931787, + "loss": 0.5462, + "step": 15825 + }, + { + "epoch": 0.8036450863677324, + "grad_norm": 0.040680994894634726, + "learning_rate": 0.0007472323323354739, + "loss": 0.5626, + "step": 15830 + }, + { + "epoch": 0.8038989224657639, + "grad_norm": 0.029285643222152852, + "learning_rate": 0.0007470397582086869, + "loss": 0.598, + "step": 15835 + }, + { + "epoch": 0.8041527585637953, + "grad_norm": 0.03200481493108892, + "learning_rate": 0.0007468471355892275, + "loss": 0.5663, + "step": 15840 + }, + { + "epoch": 0.8044065946618268, + "grad_norm": 0.02381141335581086, + "learning_rate": 0.0007466544645149061, + "loss": 0.5944, + "step": 15845 + }, + { + "epoch": 0.8046604307598584, + "grad_norm": 0.02766719856403892, + "learning_rate": 0.0007464617450235434, + "loss": 0.5815, + "step": 15850 + }, + { + "epoch": 0.8049142668578899, + "grad_norm": 0.04488529656718956, + "learning_rate": 0.0007462689771529695, + "loss": 0.5526, + "step": 15855 + }, + { + "epoch": 0.8051681029559213, + "grad_norm": 0.034731993566618206, + "learning_rate": 0.0007460761609410233, + "loss": 0.6007, + "step": 15860 + }, + { + "epoch": 0.8054219390539529, + "grad_norm": 0.025791063644137125, + "learning_rate": 0.000745883296425554, + "loss": 0.5695, + "step": 15865 + }, + { + "epoch": 0.8056757751519844, + "grad_norm": 0.028080587090770482, + "learning_rate": 0.00074569038364442, + "loss": 0.556, + "step": 15870 + }, + { + "epoch": 0.8059296112500158, + "grad_norm": 0.035411194412573295, + "learning_rate": 0.0007454974226354887, + "loss": 0.5774, + "step": 15875 + }, + { + "epoch": 0.8061834473480474, + "grad_norm": 0.03763595723144201, + "learning_rate": 0.0007453044134366377, + "loss": 0.5604, + "step": 15880 + }, + { + "epoch": 0.8064372834460789, + "grad_norm": 0.026026178191929684, + "learning_rate": 0.0007451113560857537, + "loss": 0.5668, + "step": 15885 + }, + { + "epoch": 0.8066911195441103, + "grad_norm": 0.023998437380902855, + "learning_rate": 0.0007449182506207328, + "loss": 0.5542, + "step": 15890 + }, + { + "epoch": 0.8069449556421419, + "grad_norm": 0.02465158291279762, + "learning_rate": 0.0007447250970794807, + "loss": 0.585, + "step": 15895 + }, + { + "epoch": 0.8071987917401734, + "grad_norm": 0.03811256192460854, + "learning_rate": 0.0007445318954999126, + "loss": 0.5816, + "step": 15900 + }, + { + "epoch": 0.8074526278382048, + "grad_norm": 0.03160589407825792, + "learning_rate": 0.0007443386459199528, + "loss": 0.577, + "step": 15905 + }, + { + "epoch": 0.8077064639362364, + "grad_norm": 0.028906762540522022, + "learning_rate": 0.0007441453483775354, + "loss": 0.6078, + "step": 15910 + }, + { + "epoch": 0.8079603000342679, + "grad_norm": 0.036054008668968705, + "learning_rate": 0.0007439520029106035, + "loss": 0.5942, + "step": 15915 + }, + { + "epoch": 0.8082141361322994, + "grad_norm": 0.026999292449829813, + "learning_rate": 0.0007437586095571102, + "loss": 0.5836, + "step": 15920 + }, + { + "epoch": 0.8084679722303308, + "grad_norm": 0.027971628557511584, + "learning_rate": 0.0007435651683550173, + "loss": 0.5629, + "step": 15925 + }, + { + "epoch": 0.8087218083283624, + "grad_norm": 0.036526997758639046, + "learning_rate": 0.0007433716793422967, + "loss": 0.5892, + "step": 15930 + }, + { + "epoch": 0.8089756444263939, + "grad_norm": 0.03567724658137178, + "learning_rate": 0.0007431781425569289, + "loss": 0.5557, + "step": 15935 + }, + { + "epoch": 0.8092294805244253, + "grad_norm": 0.024802333956108272, + "learning_rate": 0.0007429845580369046, + "loss": 0.5618, + "step": 15940 + }, + { + "epoch": 0.8094833166224569, + "grad_norm": 0.022538774195908166, + "learning_rate": 0.0007427909258202232, + "loss": 0.583, + "step": 15945 + }, + { + "epoch": 0.8097371527204884, + "grad_norm": 0.025056735200451364, + "learning_rate": 0.0007425972459448941, + "loss": 0.5647, + "step": 15950 + }, + { + "epoch": 0.8099909888185198, + "grad_norm": 0.03851297901292128, + "learning_rate": 0.0007424035184489352, + "loss": 0.5914, + "step": 15955 + }, + { + "epoch": 0.8102448249165514, + "grad_norm": 0.04099783815757058, + "learning_rate": 0.0007422097433703748, + "loss": 0.5802, + "step": 15960 + }, + { + "epoch": 0.8104986610145829, + "grad_norm": 0.03222527568690099, + "learning_rate": 0.0007420159207472494, + "loss": 0.5839, + "step": 15965 + }, + { + "epoch": 0.8107524971126144, + "grad_norm": 0.024865699732427023, + "learning_rate": 0.0007418220506176058, + "loss": 0.5913, + "step": 15970 + }, + { + "epoch": 0.8110063332106459, + "grad_norm": 0.035177466344471775, + "learning_rate": 0.0007416281330194996, + "loss": 0.5812, + "step": 15975 + }, + { + "epoch": 0.8112601693086774, + "grad_norm": 0.024089520192005706, + "learning_rate": 0.0007414341679909958, + "loss": 0.609, + "step": 15980 + }, + { + "epoch": 0.8115140054067089, + "grad_norm": 0.023966198926588644, + "learning_rate": 0.0007412401555701689, + "loss": 0.5816, + "step": 15985 + }, + { + "epoch": 0.8117678415047404, + "grad_norm": 0.09247687498068556, + "learning_rate": 0.0007410460957951026, + "loss": 0.5439, + "step": 15990 + }, + { + "epoch": 0.8120216776027719, + "grad_norm": 0.026101986363411846, + "learning_rate": 0.0007408519887038898, + "loss": 0.5656, + "step": 15995 + }, + { + "epoch": 0.8122755137008034, + "grad_norm": 0.024600475072470263, + "learning_rate": 0.0007406578343346327, + "loss": 0.5966, + "step": 16000 + }, + { + "epoch": 0.8125293497988348, + "grad_norm": 0.02581395178412151, + "learning_rate": 0.0007404636327254428, + "loss": 0.5841, + "step": 16005 + }, + { + "epoch": 0.8127831858968664, + "grad_norm": 0.06338067688891946, + "learning_rate": 0.000740269383914441, + "loss": 0.5751, + "step": 16010 + }, + { + "epoch": 0.8130370219948979, + "grad_norm": 0.02366983052130053, + "learning_rate": 0.0007400750879397576, + "loss": 0.5536, + "step": 16015 + }, + { + "epoch": 0.8132908580929294, + "grad_norm": 0.0363662136727588, + "learning_rate": 0.0007398807448395314, + "loss": 0.5709, + "step": 16020 + }, + { + "epoch": 0.8135446941909609, + "grad_norm": 0.04335279844371627, + "learning_rate": 0.0007396863546519113, + "loss": 0.5775, + "step": 16025 + }, + { + "epoch": 0.8137985302889924, + "grad_norm": 0.028807809521302862, + "learning_rate": 0.0007394919174150552, + "loss": 0.587, + "step": 16030 + }, + { + "epoch": 0.8140523663870239, + "grad_norm": 0.03090810771638599, + "learning_rate": 0.0007392974331671301, + "loss": 0.5813, + "step": 16035 + }, + { + "epoch": 0.8143062024850554, + "grad_norm": 0.03332179615924521, + "learning_rate": 0.0007391029019463121, + "loss": 0.5748, + "step": 16040 + }, + { + "epoch": 0.8145600385830869, + "grad_norm": 0.02260131038125526, + "learning_rate": 0.0007389083237907869, + "loss": 0.5357, + "step": 16045 + }, + { + "epoch": 0.8148138746811184, + "grad_norm": 0.03144023339848837, + "learning_rate": 0.0007387136987387493, + "loss": 0.5479, + "step": 16050 + }, + { + "epoch": 0.8150677107791499, + "grad_norm": 0.04852227270793531, + "learning_rate": 0.0007385190268284028, + "loss": 0.562, + "step": 16055 + }, + { + "epoch": 0.8153215468771814, + "grad_norm": 0.0238226778020187, + "learning_rate": 0.000738324308097961, + "loss": 0.5748, + "step": 16060 + }, + { + "epoch": 0.8155753829752129, + "grad_norm": 0.024025342394617455, + "learning_rate": 0.0007381295425856461, + "loss": 0.5779, + "step": 16065 + }, + { + "epoch": 0.8158292190732445, + "grad_norm": 0.025164599314322966, + "learning_rate": 0.0007379347303296895, + "loss": 0.5699, + "step": 16070 + }, + { + "epoch": 0.8160830551712759, + "grad_norm": 0.024454600763767285, + "learning_rate": 0.0007377398713683319, + "loss": 0.5647, + "step": 16075 + }, + { + "epoch": 0.8163368912693074, + "grad_norm": 0.0465643154498082, + "learning_rate": 0.0007375449657398232, + "loss": 0.6121, + "step": 16080 + }, + { + "epoch": 0.816590727367339, + "grad_norm": 0.05708836406856427, + "learning_rate": 0.0007373500134824224, + "loss": 0.552, + "step": 16085 + }, + { + "epoch": 0.8168445634653704, + "grad_norm": 0.03307721737401848, + "learning_rate": 0.0007371550146343976, + "loss": 0.5815, + "step": 16090 + }, + { + "epoch": 0.8170983995634019, + "grad_norm": 0.022676746585353742, + "learning_rate": 0.0007369599692340261, + "loss": 0.5196, + "step": 16095 + }, + { + "epoch": 0.8173522356614334, + "grad_norm": 0.022835698320630423, + "learning_rate": 0.0007367648773195942, + "loss": 0.5688, + "step": 16100 + }, + { + "epoch": 0.8176060717594649, + "grad_norm": 0.024353558374283157, + "learning_rate": 0.000736569738929398, + "loss": 0.5533, + "step": 16105 + }, + { + "epoch": 0.8178599078574964, + "grad_norm": 0.035753016297181556, + "learning_rate": 0.0007363745541017415, + "loss": 0.5759, + "step": 16110 + }, + { + "epoch": 0.8181137439555279, + "grad_norm": 0.04602579365083806, + "learning_rate": 0.0007361793228749387, + "loss": 0.6057, + "step": 16115 + }, + { + "epoch": 0.8183675800535594, + "grad_norm": 0.02654832147555816, + "learning_rate": 0.0007359840452873129, + "loss": 0.5732, + "step": 16120 + }, + { + "epoch": 0.8186214161515909, + "grad_norm": 0.029689830503051053, + "learning_rate": 0.0007357887213771958, + "loss": 0.5735, + "step": 16125 + }, + { + "epoch": 0.8188752522496224, + "grad_norm": 0.02922774898175317, + "learning_rate": 0.0007355933511829286, + "loss": 0.5886, + "step": 16130 + }, + { + "epoch": 0.819129088347654, + "grad_norm": 0.02346950210966065, + "learning_rate": 0.0007353979347428614, + "loss": 0.5436, + "step": 16135 + }, + { + "epoch": 0.8193829244456854, + "grad_norm": 0.4292564062685949, + "learning_rate": 0.0007352024720953536, + "loss": 0.5475, + "step": 16140 + }, + { + "epoch": 0.8196367605437169, + "grad_norm": 0.054772914446169205, + "learning_rate": 0.0007350069632787734, + "loss": 0.5315, + "step": 16145 + }, + { + "epoch": 0.8198905966417485, + "grad_norm": 0.05912804272652365, + "learning_rate": 0.0007348114083314984, + "loss": 0.5795, + "step": 16150 + }, + { + "epoch": 0.8201444327397799, + "grad_norm": 0.030284659576007726, + "learning_rate": 0.0007346158072919149, + "loss": 0.5843, + "step": 16155 + }, + { + "epoch": 0.8203982688378114, + "grad_norm": 0.023151058600271775, + "learning_rate": 0.0007344201601984185, + "loss": 0.5864, + "step": 16160 + }, + { + "epoch": 0.820652104935843, + "grad_norm": 0.03376567799773459, + "learning_rate": 0.0007342244670894136, + "loss": 0.5571, + "step": 16165 + }, + { + "epoch": 0.8209059410338744, + "grad_norm": 0.03959840237334942, + "learning_rate": 0.000734028728003314, + "loss": 0.5696, + "step": 16170 + }, + { + "epoch": 0.8211597771319059, + "grad_norm": 0.02935275476865624, + "learning_rate": 0.000733832942978542, + "loss": 0.5647, + "step": 16175 + }, + { + "epoch": 0.8214136132299374, + "grad_norm": 0.03823963239401788, + "learning_rate": 0.0007336371120535295, + "loss": 0.5742, + "step": 16180 + }, + { + "epoch": 0.821667449327969, + "grad_norm": 0.024101714729295634, + "learning_rate": 0.0007334412352667173, + "loss": 0.6284, + "step": 16185 + }, + { + "epoch": 0.8219212854260004, + "grad_norm": 2.693520215640975, + "learning_rate": 0.0007332453126565545, + "loss": 0.6209, + "step": 16190 + }, + { + "epoch": 0.8221751215240319, + "grad_norm": 0.03279854521465908, + "learning_rate": 0.0007330493442615, + "loss": 0.5537, + "step": 16195 + }, + { + "epoch": 0.8224289576220635, + "grad_norm": 0.044838327703993694, + "learning_rate": 0.0007328533301200216, + "loss": 0.557, + "step": 16200 + }, + { + "epoch": 0.8226827937200949, + "grad_norm": 0.02638836436847605, + "learning_rate": 0.0007326572702705958, + "loss": 0.5701, + "step": 16205 + }, + { + "epoch": 0.8229366298181264, + "grad_norm": 0.04922931478557384, + "learning_rate": 0.0007324611647517078, + "loss": 0.56, + "step": 16210 + }, + { + "epoch": 0.823190465916158, + "grad_norm": 0.03891712292801701, + "learning_rate": 0.0007322650136018527, + "loss": 0.5869, + "step": 16215 + }, + { + "epoch": 0.8234443020141894, + "grad_norm": 0.04138342114685851, + "learning_rate": 0.0007320688168595338, + "loss": 0.6059, + "step": 16220 + }, + { + "epoch": 0.8236981381122209, + "grad_norm": 0.02918011994976219, + "learning_rate": 0.0007318725745632632, + "loss": 0.5551, + "step": 16225 + }, + { + "epoch": 0.8239519742102525, + "grad_norm": 0.0419742382923944, + "learning_rate": 0.0007316762867515627, + "loss": 0.5673, + "step": 16230 + }, + { + "epoch": 0.824205810308284, + "grad_norm": 0.030211128758018017, + "learning_rate": 0.0007314799534629625, + "loss": 0.6303, + "step": 16235 + }, + { + "epoch": 0.8244596464063154, + "grad_norm": 0.02340354696643915, + "learning_rate": 0.0007312835747360018, + "loss": 0.5603, + "step": 16240 + }, + { + "epoch": 0.824713482504347, + "grad_norm": 0.042394083569737895, + "learning_rate": 0.0007310871506092287, + "loss": 0.5427, + "step": 16245 + }, + { + "epoch": 0.8249673186023785, + "grad_norm": 0.04912858556363834, + "learning_rate": 0.0007308906811212004, + "loss": 0.5917, + "step": 16250 + }, + { + "epoch": 0.8252211547004099, + "grad_norm": 0.03971810340456524, + "learning_rate": 0.000730694166310483, + "loss": 0.562, + "step": 16255 + }, + { + "epoch": 0.8254749907984414, + "grad_norm": 0.02785763337910464, + "learning_rate": 0.0007304976062156512, + "loss": 0.5795, + "step": 16260 + }, + { + "epoch": 0.825728826896473, + "grad_norm": 0.033158827041856395, + "learning_rate": 0.0007303010008752886, + "loss": 0.5688, + "step": 16265 + }, + { + "epoch": 0.8259826629945044, + "grad_norm": 0.030706479795822323, + "learning_rate": 0.0007301043503279881, + "loss": 0.5976, + "step": 16270 + }, + { + "epoch": 0.8262364990925359, + "grad_norm": 0.030873986022704285, + "learning_rate": 0.0007299076546123512, + "loss": 0.5715, + "step": 16275 + }, + { + "epoch": 0.8264903351905675, + "grad_norm": 0.0474915666866346, + "learning_rate": 0.0007297109137669882, + "loss": 0.563, + "step": 16280 + }, + { + "epoch": 0.826744171288599, + "grad_norm": 0.03893683462709932, + "learning_rate": 0.0007295141278305185, + "loss": 0.5592, + "step": 16285 + }, + { + "epoch": 0.8269980073866304, + "grad_norm": 0.03111456293981649, + "learning_rate": 0.0007293172968415701, + "loss": 0.5792, + "step": 16290 + }, + { + "epoch": 0.827251843484662, + "grad_norm": 0.024853102754921644, + "learning_rate": 0.0007291204208387798, + "loss": 0.58, + "step": 16295 + }, + { + "epoch": 0.8275056795826935, + "grad_norm": 0.0251707952164745, + "learning_rate": 0.0007289234998607935, + "loss": 0.5397, + "step": 16300 + }, + { + "epoch": 0.8277595156807249, + "grad_norm": 0.03637341065833897, + "learning_rate": 0.000728726533946266, + "loss": 0.5889, + "step": 16305 + }, + { + "epoch": 0.8280133517787565, + "grad_norm": 0.025249035570284573, + "learning_rate": 0.0007285295231338605, + "loss": 0.5608, + "step": 16310 + }, + { + "epoch": 0.828267187876788, + "grad_norm": 0.03480589568385215, + "learning_rate": 0.0007283324674622491, + "loss": 0.5958, + "step": 16315 + }, + { + "epoch": 0.8285210239748194, + "grad_norm": 0.06002094158028185, + "learning_rate": 0.0007281353669701131, + "loss": 0.5949, + "step": 16320 + }, + { + "epoch": 0.828774860072851, + "grad_norm": 0.02645055337203729, + "learning_rate": 0.0007279382216961426, + "loss": 0.5696, + "step": 16325 + }, + { + "epoch": 0.8290286961708825, + "grad_norm": 0.026182888420755522, + "learning_rate": 0.0007277410316790355, + "loss": 0.5815, + "step": 16330 + }, + { + "epoch": 0.8292825322689139, + "grad_norm": 0.026085049179655987, + "learning_rate": 0.0007275437969574999, + "loss": 0.5716, + "step": 16335 + }, + { + "epoch": 0.8295363683669454, + "grad_norm": 0.03239294409527313, + "learning_rate": 0.0007273465175702515, + "loss": 0.5524, + "step": 16340 + }, + { + "epoch": 0.829790204464977, + "grad_norm": 0.027393441053260734, + "learning_rate": 0.0007271491935560155, + "loss": 0.6076, + "step": 16345 + }, + { + "epoch": 0.8300440405630085, + "grad_norm": 0.029966666901580178, + "learning_rate": 0.0007269518249535256, + "loss": 0.5385, + "step": 16350 + }, + { + "epoch": 0.8302978766610399, + "grad_norm": 0.02610780272688068, + "learning_rate": 0.0007267544118015243, + "loss": 0.5672, + "step": 16355 + }, + { + "epoch": 0.8305517127590715, + "grad_norm": 0.03032975964214376, + "learning_rate": 0.0007265569541387628, + "loss": 0.5809, + "step": 16360 + }, + { + "epoch": 0.830805548857103, + "grad_norm": 0.025317306734505777, + "learning_rate": 0.0007263594520040011, + "loss": 0.5645, + "step": 16365 + }, + { + "epoch": 0.8310593849551344, + "grad_norm": 0.02593763038542392, + "learning_rate": 0.0007261619054360078, + "loss": 0.5595, + "step": 16370 + }, + { + "epoch": 0.831313221053166, + "grad_norm": 0.0439291131573673, + "learning_rate": 0.0007259643144735603, + "loss": 0.568, + "step": 16375 + }, + { + "epoch": 0.8315670571511975, + "grad_norm": 0.026930260112695524, + "learning_rate": 0.0007257666791554447, + "loss": 0.5841, + "step": 16380 + }, + { + "epoch": 0.8318208932492289, + "grad_norm": 0.031340436270697436, + "learning_rate": 0.0007255689995204559, + "loss": 0.5776, + "step": 16385 + }, + { + "epoch": 0.8320747293472605, + "grad_norm": 0.035038658960907056, + "learning_rate": 0.0007253712756073973, + "loss": 0.5945, + "step": 16390 + }, + { + "epoch": 0.832328565445292, + "grad_norm": 0.02459703987491012, + "learning_rate": 0.0007251735074550815, + "loss": 0.5377, + "step": 16395 + }, + { + "epoch": 0.8325824015433235, + "grad_norm": 0.04858837100594378, + "learning_rate": 0.000724975695102329, + "loss": 0.5757, + "step": 16400 + }, + { + "epoch": 0.832836237641355, + "grad_norm": 0.025374293791486993, + "learning_rate": 0.0007247778385879695, + "loss": 0.5681, + "step": 16405 + }, + { + "epoch": 0.8330900737393865, + "grad_norm": 0.024727570721873534, + "learning_rate": 0.0007245799379508412, + "loss": 0.5556, + "step": 16410 + }, + { + "epoch": 0.833343909837418, + "grad_norm": 0.04461548506528957, + "learning_rate": 0.000724381993229791, + "loss": 0.5403, + "step": 16415 + }, + { + "epoch": 0.8335977459354494, + "grad_norm": 0.02730610710183291, + "learning_rate": 0.0007241840044636747, + "loss": 0.5994, + "step": 16420 + }, + { + "epoch": 0.833851582033481, + "grad_norm": 0.0335198165281854, + "learning_rate": 0.0007239859716913562, + "loss": 0.5605, + "step": 16425 + }, + { + "epoch": 0.8341054181315125, + "grad_norm": 0.04217088658047506, + "learning_rate": 0.0007237878949517085, + "loss": 0.5743, + "step": 16430 + }, + { + "epoch": 0.8343592542295439, + "grad_norm": 0.024585253772533063, + "learning_rate": 0.0007235897742836131, + "loss": 0.5929, + "step": 16435 + }, + { + "epoch": 0.8346130903275755, + "grad_norm": 0.025207671295824716, + "learning_rate": 0.00072339160972596, + "loss": 0.5635, + "step": 16440 + }, + { + "epoch": 0.834866926425607, + "grad_norm": 0.027558993795189373, + "learning_rate": 0.000723193401317648, + "loss": 0.592, + "step": 16445 + }, + { + "epoch": 0.8351207625236385, + "grad_norm": 0.036827202153997284, + "learning_rate": 0.0007229951490975844, + "loss": 0.5541, + "step": 16450 + }, + { + "epoch": 0.83537459862167, + "grad_norm": 0.026861426162605315, + "learning_rate": 0.000722796853104685, + "loss": 0.552, + "step": 16455 + }, + { + "epoch": 0.8356284347197015, + "grad_norm": 0.04528560408360573, + "learning_rate": 0.0007225985133778745, + "loss": 0.5598, + "step": 16460 + }, + { + "epoch": 0.835882270817733, + "grad_norm": 0.026142557018130994, + "learning_rate": 0.0007224001299560859, + "loss": 0.5774, + "step": 16465 + }, + { + "epoch": 0.8361361069157645, + "grad_norm": 0.03854871080553361, + "learning_rate": 0.000722201702878261, + "loss": 0.5299, + "step": 16470 + }, + { + "epoch": 0.836389943013796, + "grad_norm": 0.03451324677710476, + "learning_rate": 0.0007220032321833498, + "loss": 0.5753, + "step": 16475 + }, + { + "epoch": 0.8366437791118275, + "grad_norm": 0.026185635664542194, + "learning_rate": 0.0007218047179103112, + "loss": 0.5643, + "step": 16480 + }, + { + "epoch": 0.836897615209859, + "grad_norm": 0.026415165159016925, + "learning_rate": 0.0007216061600981128, + "loss": 0.5673, + "step": 16485 + }, + { + "epoch": 0.8371514513078905, + "grad_norm": 0.022148025561463187, + "learning_rate": 0.0007214075587857302, + "loss": 0.5244, + "step": 16490 + }, + { + "epoch": 0.837405287405922, + "grad_norm": 0.0809295820317378, + "learning_rate": 0.0007212089140121481, + "loss": 0.5359, + "step": 16495 + }, + { + "epoch": 0.8376591235039536, + "grad_norm": 0.03396241665558352, + "learning_rate": 0.0007210102258163592, + "loss": 0.5489, + "step": 16500 + }, + { + "epoch": 0.837912959601985, + "grad_norm": 0.04736874657246381, + "learning_rate": 0.0007208114942373651, + "loss": 0.5593, + "step": 16505 + }, + { + "epoch": 0.8381667957000165, + "grad_norm": 0.02415642764938736, + "learning_rate": 0.0007206127193141761, + "loss": 0.5368, + "step": 16510 + }, + { + "epoch": 0.838420631798048, + "grad_norm": 0.025190772006756453, + "learning_rate": 0.0007204139010858103, + "loss": 0.5872, + "step": 16515 + }, + { + "epoch": 0.8386744678960795, + "grad_norm": 0.023720884374368048, + "learning_rate": 0.0007202150395912949, + "loss": 0.5686, + "step": 16520 + }, + { + "epoch": 0.838928303994111, + "grad_norm": 0.02328821938311436, + "learning_rate": 0.0007200161348696655, + "loss": 0.5578, + "step": 16525 + }, + { + "epoch": 0.8391821400921425, + "grad_norm": 0.05346546124165315, + "learning_rate": 0.0007198171869599662, + "loss": 0.5899, + "step": 16530 + }, + { + "epoch": 0.839435976190174, + "grad_norm": 0.04804743967944873, + "learning_rate": 0.0007196181959012491, + "loss": 0.5919, + "step": 16535 + }, + { + "epoch": 0.8396898122882055, + "grad_norm": 0.07197115729997203, + "learning_rate": 0.0007194191617325755, + "loss": 0.5914, + "step": 16540 + }, + { + "epoch": 0.839943648386237, + "grad_norm": 0.09143766754452975, + "learning_rate": 0.0007192200844930147, + "loss": 0.5068, + "step": 16545 + }, + { + "epoch": 0.8401974844842685, + "grad_norm": 0.03931986444195336, + "learning_rate": 0.0007190209642216445, + "loss": 0.5951, + "step": 16550 + }, + { + "epoch": 0.8404513205823, + "grad_norm": 0.05372740389831407, + "learning_rate": 0.0007188218009575514, + "loss": 0.5746, + "step": 16555 + }, + { + "epoch": 0.8407051566803315, + "grad_norm": 0.03388013411522178, + "learning_rate": 0.0007186225947398298, + "loss": 0.5847, + "step": 16560 + }, + { + "epoch": 0.8409589927783631, + "grad_norm": 0.2323506802544606, + "learning_rate": 0.0007184233456075833, + "loss": 1.0376, + "step": 16565 + }, + { + "epoch": 0.8412128288763945, + "grad_norm": 0.1835106341020443, + "learning_rate": 0.0007182240535999232, + "loss": 0.5778, + "step": 16570 + }, + { + "epoch": 0.841466664974426, + "grad_norm": 0.11156210838067974, + "learning_rate": 0.0007180247187559697, + "loss": 0.5769, + "step": 16575 + }, + { + "epoch": 0.8417205010724575, + "grad_norm": 0.07718512327967042, + "learning_rate": 0.0007178253411148513, + "loss": 0.6366, + "step": 16580 + }, + { + "epoch": 0.841974337170489, + "grad_norm": 0.043680931999653776, + "learning_rate": 0.0007176259207157048, + "loss": 0.5654, + "step": 16585 + }, + { + "epoch": 0.8422281732685205, + "grad_norm": 0.05565560445937635, + "learning_rate": 0.0007174264575976752, + "loss": 0.5576, + "step": 16590 + }, + { + "epoch": 0.842482009366552, + "grad_norm": 0.04784086578137076, + "learning_rate": 0.0007172269517999163, + "loss": 0.5756, + "step": 16595 + }, + { + "epoch": 0.8427358454645835, + "grad_norm": 0.029385540515945264, + "learning_rate": 0.00071702740336159, + "loss": 0.5723, + "step": 16600 + }, + { + "epoch": 0.842989681562615, + "grad_norm": 0.028863407001418123, + "learning_rate": 0.0007168278123218667, + "loss": 0.5663, + "step": 16605 + }, + { + "epoch": 0.8432435176606465, + "grad_norm": 0.036227912872782514, + "learning_rate": 0.0007166281787199253, + "loss": 0.5634, + "step": 16610 + }, + { + "epoch": 0.8434973537586781, + "grad_norm": 0.05143219348236945, + "learning_rate": 0.0007164285025949528, + "loss": 0.6063, + "step": 16615 + }, + { + "epoch": 0.8437511898567095, + "grad_norm": 0.028270375163286395, + "learning_rate": 0.0007162287839861445, + "loss": 0.5714, + "step": 16620 + }, + { + "epoch": 0.844005025954741, + "grad_norm": 0.045237364055904485, + "learning_rate": 0.0007160290229327042, + "loss": 0.5614, + "step": 16625 + }, + { + "epoch": 0.8442588620527726, + "grad_norm": 0.04742378757483653, + "learning_rate": 0.000715829219473844, + "loss": 0.5856, + "step": 16630 + }, + { + "epoch": 0.844512698150804, + "grad_norm": 0.028756290264305383, + "learning_rate": 0.0007156293736487844, + "loss": 0.5441, + "step": 16635 + }, + { + "epoch": 0.8447665342488355, + "grad_norm": 0.03771401096316666, + "learning_rate": 0.0007154294854967541, + "loss": 0.5825, + "step": 16640 + }, + { + "epoch": 0.8450203703468671, + "grad_norm": 0.03402827219689952, + "learning_rate": 0.0007152295550569902, + "loss": 0.5564, + "step": 16645 + }, + { + "epoch": 0.8452742064448985, + "grad_norm": 0.03963421220660269, + "learning_rate": 0.0007150295823687379, + "loss": 0.5801, + "step": 16650 + }, + { + "epoch": 0.84552804254293, + "grad_norm": 0.03217463407046853, + "learning_rate": 0.000714829567471251, + "loss": 0.5673, + "step": 16655 + }, + { + "epoch": 0.8457818786409615, + "grad_norm": 0.025969270367900015, + "learning_rate": 0.0007146295104037914, + "loss": 0.5762, + "step": 16660 + }, + { + "epoch": 0.8460357147389931, + "grad_norm": 0.0372684784203102, + "learning_rate": 0.0007144294112056292, + "loss": 0.5858, + "step": 16665 + }, + { + "epoch": 0.8462895508370245, + "grad_norm": 0.03206359149474644, + "learning_rate": 0.000714229269916043, + "loss": 0.5754, + "step": 16670 + }, + { + "epoch": 0.846543386935056, + "grad_norm": 0.08771530469398538, + "learning_rate": 0.0007140290865743194, + "loss": 0.5861, + "step": 16675 + }, + { + "epoch": 0.8467972230330876, + "grad_norm": 0.04534708545965994, + "learning_rate": 0.0007138288612197534, + "loss": 0.5762, + "step": 16680 + }, + { + "epoch": 0.847051059131119, + "grad_norm": 0.03943245508252062, + "learning_rate": 0.0007136285938916484, + "loss": 0.5512, + "step": 16685 + }, + { + "epoch": 0.8473048952291505, + "grad_norm": 0.05354631204112542, + "learning_rate": 0.0007134282846293157, + "loss": 0.5659, + "step": 16690 + }, + { + "epoch": 0.8475587313271821, + "grad_norm": 0.10850672830641753, + "learning_rate": 0.0007132279334720751, + "loss": 0.5914, + "step": 16695 + }, + { + "epoch": 0.8478125674252135, + "grad_norm": 0.0344090734999104, + "learning_rate": 0.0007130275404592547, + "loss": 0.583, + "step": 16700 + }, + { + "epoch": 0.848066403523245, + "grad_norm": 0.026564143833072755, + "learning_rate": 0.0007128271056301902, + "loss": 0.5518, + "step": 16705 + }, + { + "epoch": 0.8483202396212766, + "grad_norm": 0.02413909355518316, + "learning_rate": 0.0007126266290242264, + "loss": 0.5541, + "step": 16710 + }, + { + "epoch": 0.8485740757193081, + "grad_norm": 0.02253771359833544, + "learning_rate": 0.0007124261106807158, + "loss": 0.5439, + "step": 16715 + }, + { + "epoch": 0.8488279118173395, + "grad_norm": 0.025639805853241732, + "learning_rate": 0.0007122255506390188, + "loss": 0.5553, + "step": 16720 + }, + { + "epoch": 0.849081747915371, + "grad_norm": 0.025441727802141368, + "learning_rate": 0.0007120249489385048, + "loss": 0.5888, + "step": 16725 + }, + { + "epoch": 0.8493355840134026, + "grad_norm": 0.20833156756734772, + "learning_rate": 0.0007118243056185505, + "loss": 0.584, + "step": 16730 + }, + { + "epoch": 0.849589420111434, + "grad_norm": 0.02582708018087541, + "learning_rate": 0.0007116236207185414, + "loss": 0.5758, + "step": 16735 + }, + { + "epoch": 0.8498432562094655, + "grad_norm": 0.02244099996933645, + "learning_rate": 0.0007114228942778711, + "loss": 0.5464, + "step": 16740 + }, + { + "epoch": 0.8500970923074971, + "grad_norm": 0.030980292003979164, + "learning_rate": 0.0007112221263359408, + "loss": 0.5484, + "step": 16745 + }, + { + "epoch": 0.8503509284055285, + "grad_norm": 0.03890776257527718, + "learning_rate": 0.0007110213169321606, + "loss": 0.6233, + "step": 16750 + }, + { + "epoch": 0.85060476450356, + "grad_norm": 0.04528676589062287, + "learning_rate": 0.0007108204661059482, + "loss": 0.5863, + "step": 16755 + }, + { + "epoch": 0.8508586006015916, + "grad_norm": 0.028278146245680834, + "learning_rate": 0.0007106195738967296, + "loss": 0.5689, + "step": 16760 + }, + { + "epoch": 0.851112436699623, + "grad_norm": 0.02656249132824659, + "learning_rate": 0.0007104186403439391, + "loss": 0.5627, + "step": 16765 + }, + { + "epoch": 0.8513662727976545, + "grad_norm": 0.032513287659724305, + "learning_rate": 0.0007102176654870189, + "loss": 0.5654, + "step": 16770 + }, + { + "epoch": 0.8516201088956861, + "grad_norm": 0.02776149485485987, + "learning_rate": 0.0007100166493654192, + "loss": 0.5786, + "step": 16775 + }, + { + "epoch": 0.8518739449937176, + "grad_norm": 0.031105339929159002, + "learning_rate": 0.0007098155920185987, + "loss": 0.562, + "step": 16780 + }, + { + "epoch": 0.852127781091749, + "grad_norm": 0.02446590396405293, + "learning_rate": 0.0007096144934860237, + "loss": 0.5624, + "step": 16785 + }, + { + "epoch": 0.8523816171897806, + "grad_norm": 0.03981011051439899, + "learning_rate": 0.0007094133538071691, + "loss": 0.539, + "step": 16790 + }, + { + "epoch": 0.8526354532878121, + "grad_norm": 0.033938471991999015, + "learning_rate": 0.0007092121730215174, + "loss": 0.6041, + "step": 16795 + }, + { + "epoch": 0.8528892893858435, + "grad_norm": 0.028696702111982848, + "learning_rate": 0.0007090109511685595, + "loss": 0.5812, + "step": 16800 + }, + { + "epoch": 0.853143125483875, + "grad_norm": 0.031170929377724053, + "learning_rate": 0.0007088096882877942, + "loss": 0.6141, + "step": 16805 + }, + { + "epoch": 0.8533969615819066, + "grad_norm": 0.0229475127058901, + "learning_rate": 0.0007086083844187284, + "loss": 0.5381, + "step": 16810 + }, + { + "epoch": 0.853650797679938, + "grad_norm": 0.04104758489251979, + "learning_rate": 0.0007084070396008771, + "loss": 0.5961, + "step": 16815 + }, + { + "epoch": 0.8539046337779695, + "grad_norm": 0.02451932040436503, + "learning_rate": 0.0007082056538737633, + "loss": 0.5366, + "step": 16820 + }, + { + "epoch": 0.8541584698760011, + "grad_norm": 0.04468035129714473, + "learning_rate": 0.0007080042272769179, + "loss": 0.5693, + "step": 16825 + }, + { + "epoch": 0.8544123059740326, + "grad_norm": 0.025369526471293296, + "learning_rate": 0.0007078027598498801, + "loss": 0.5659, + "step": 16830 + }, + { + "epoch": 0.854666142072064, + "grad_norm": 0.023020797662670914, + "learning_rate": 0.0007076012516321968, + "loss": 0.5979, + "step": 16835 + }, + { + "epoch": 0.8549199781700956, + "grad_norm": 0.02317984351608055, + "learning_rate": 0.0007073997026634229, + "loss": 0.5815, + "step": 16840 + }, + { + "epoch": 0.8551738142681271, + "grad_norm": 0.024195045783076848, + "learning_rate": 0.000707198112983122, + "loss": 0.5277, + "step": 16845 + }, + { + "epoch": 0.8554276503661585, + "grad_norm": 0.03085842460268176, + "learning_rate": 0.0007069964826308646, + "loss": 0.5641, + "step": 16850 + }, + { + "epoch": 0.8556814864641901, + "grad_norm": 0.023758504611377088, + "learning_rate": 0.00070679481164623, + "loss": 0.5577, + "step": 16855 + }, + { + "epoch": 0.8559353225622216, + "grad_norm": 0.03533884780505401, + "learning_rate": 0.0007065931000688053, + "loss": 0.5214, + "step": 16860 + }, + { + "epoch": 0.856189158660253, + "grad_norm": 0.028013725394752647, + "learning_rate": 0.0007063913479381851, + "loss": 0.5775, + "step": 16865 + }, + { + "epoch": 0.8564429947582846, + "grad_norm": 0.03922126124071241, + "learning_rate": 0.0007061895552939727, + "loss": 0.5676, + "step": 16870 + }, + { + "epoch": 0.8566968308563161, + "grad_norm": 0.02828532099990536, + "learning_rate": 0.0007059877221757789, + "loss": 0.5652, + "step": 16875 + }, + { + "epoch": 0.8569506669543476, + "grad_norm": 0.036809316995682864, + "learning_rate": 0.0007057858486232224, + "loss": 0.536, + "step": 16880 + }, + { + "epoch": 0.857204503052379, + "grad_norm": 0.021695340919237437, + "learning_rate": 0.00070558393467593, + "loss": 0.529, + "step": 16885 + }, + { + "epoch": 0.8574583391504106, + "grad_norm": 0.02740340955824536, + "learning_rate": 0.0007053819803735367, + "loss": 0.5352, + "step": 16890 + }, + { + "epoch": 0.8577121752484421, + "grad_norm": 0.03336334546505141, + "learning_rate": 0.0007051799857556848, + "loss": 0.5372, + "step": 16895 + }, + { + "epoch": 0.8579660113464735, + "grad_norm": 0.028637615728519715, + "learning_rate": 0.0007049779508620248, + "loss": 0.5726, + "step": 16900 + }, + { + "epoch": 0.8582198474445051, + "grad_norm": 0.02540896995489235, + "learning_rate": 0.0007047758757322155, + "loss": 0.5588, + "step": 16905 + }, + { + "epoch": 0.8584736835425366, + "grad_norm": 0.0243774664015233, + "learning_rate": 0.0007045737604059228, + "loss": 0.5855, + "step": 16910 + }, + { + "epoch": 0.858727519640568, + "grad_norm": 0.025712991028124108, + "learning_rate": 0.0007043716049228212, + "loss": 0.5725, + "step": 16915 + }, + { + "epoch": 0.8589813557385996, + "grad_norm": 0.01963408655252367, + "learning_rate": 0.0007041694093225929, + "loss": 0.5553, + "step": 16920 + }, + { + "epoch": 0.8592351918366311, + "grad_norm": 0.024595690577265465, + "learning_rate": 0.0007039671736449275, + "loss": 0.5464, + "step": 16925 + }, + { + "epoch": 0.8594890279346626, + "grad_norm": 0.022243385284112183, + "learning_rate": 0.0007037648979295232, + "loss": 0.5316, + "step": 16930 + }, + { + "epoch": 0.8597428640326941, + "grad_norm": 0.02440380975141963, + "learning_rate": 0.0007035625822160856, + "loss": 0.5465, + "step": 16935 + }, + { + "epoch": 0.8599967001307256, + "grad_norm": 0.03482315185034089, + "learning_rate": 0.0007033602265443284, + "loss": 0.5636, + "step": 16940 + }, + { + "epoch": 0.8602505362287571, + "grad_norm": 0.024590501396839586, + "learning_rate": 0.0007031578309539728, + "loss": 0.5877, + "step": 16945 + }, + { + "epoch": 0.8605043723267886, + "grad_norm": 0.03922772861007277, + "learning_rate": 0.000702955395484748, + "loss": 0.5928, + "step": 16950 + }, + { + "epoch": 0.8607582084248201, + "grad_norm": 0.026668312848002942, + "learning_rate": 0.0007027529201763913, + "loss": 0.5848, + "step": 16955 + }, + { + "epoch": 0.8610120445228516, + "grad_norm": 0.022676881579317083, + "learning_rate": 0.0007025504050686475, + "loss": 0.5759, + "step": 16960 + }, + { + "epoch": 0.861265880620883, + "grad_norm": 0.023945888246282236, + "learning_rate": 0.0007023478502012694, + "loss": 0.5796, + "step": 16965 + }, + { + "epoch": 0.8615197167189146, + "grad_norm": 0.030829524909033925, + "learning_rate": 0.0007021452556140173, + "loss": 0.5673, + "step": 16970 + }, + { + "epoch": 0.8617735528169461, + "grad_norm": 0.023288491755525775, + "learning_rate": 0.0007019426213466597, + "loss": 0.5592, + "step": 16975 + }, + { + "epoch": 0.8620273889149775, + "grad_norm": 0.034313257493583556, + "learning_rate": 0.0007017399474389725, + "loss": 0.5415, + "step": 16980 + }, + { + "epoch": 0.8622812250130091, + "grad_norm": 0.030676916955254648, + "learning_rate": 0.0007015372339307398, + "loss": 0.5374, + "step": 16985 + }, + { + "epoch": 0.8625350611110406, + "grad_norm": 0.9270526277643246, + "learning_rate": 0.000701334480861753, + "loss": 0.5703, + "step": 16990 + }, + { + "epoch": 0.8627888972090721, + "grad_norm": 0.03881534525111206, + "learning_rate": 0.0007011316882718119, + "loss": 0.5436, + "step": 16995 + }, + { + "epoch": 0.8630427333071036, + "grad_norm": 0.028288958072038958, + "learning_rate": 0.0007009288562007232, + "loss": 0.5424, + "step": 17000 + }, + { + "epoch": 0.8632965694051351, + "grad_norm": 0.03263081861380027, + "learning_rate": 0.0007007259846883022, + "loss": 0.5635, + "step": 17005 + }, + { + "epoch": 0.8635504055031666, + "grad_norm": 0.02801944612812956, + "learning_rate": 0.0007005230737743714, + "loss": 0.5607, + "step": 17010 + }, + { + "epoch": 0.8638042416011981, + "grad_norm": 0.0354537026800304, + "learning_rate": 0.0007003201234987612, + "loss": 0.5787, + "step": 17015 + }, + { + "epoch": 0.8640580776992296, + "grad_norm": 0.04224026943667041, + "learning_rate": 0.0007001171339013097, + "loss": 0.5469, + "step": 17020 + }, + { + "epoch": 0.8643119137972611, + "grad_norm": 0.027519369512133282, + "learning_rate": 0.0006999141050218628, + "loss": 0.5826, + "step": 17025 + }, + { + "epoch": 0.8645657498952926, + "grad_norm": 0.026240030212773185, + "learning_rate": 0.0006997110369002742, + "loss": 0.5602, + "step": 17030 + }, + { + "epoch": 0.8648195859933241, + "grad_norm": 0.02599836695677239, + "learning_rate": 0.0006995079295764048, + "loss": 0.5593, + "step": 17035 + }, + { + "epoch": 0.8650734220913556, + "grad_norm": 0.03230402905795593, + "learning_rate": 0.000699304783090124, + "loss": 0.5685, + "step": 17040 + }, + { + "epoch": 0.8653272581893872, + "grad_norm": 0.028811832838592445, + "learning_rate": 0.0006991015974813081, + "loss": 0.5859, + "step": 17045 + }, + { + "epoch": 0.8655810942874186, + "grad_norm": 0.034518109548492715, + "learning_rate": 0.0006988983727898414, + "loss": 0.5502, + "step": 17050 + }, + { + "epoch": 0.8658349303854501, + "grad_norm": 0.03883667058154074, + "learning_rate": 0.0006986951090556161, + "loss": 0.5473, + "step": 17055 + }, + { + "epoch": 0.8660887664834817, + "grad_norm": 0.025695035028394198, + "learning_rate": 0.0006984918063185319, + "loss": 0.5815, + "step": 17060 + }, + { + "epoch": 0.8663426025815131, + "grad_norm": 0.034846551058862335, + "learning_rate": 0.0006982884646184959, + "loss": 0.543, + "step": 17065 + }, + { + "epoch": 0.8665964386795446, + "grad_norm": 0.03999095610243723, + "learning_rate": 0.0006980850839954232, + "loss": 0.5682, + "step": 17070 + }, + { + "epoch": 0.8668502747775761, + "grad_norm": 0.029998948571798918, + "learning_rate": 0.0006978816644892364, + "loss": 0.5651, + "step": 17075 + }, + { + "epoch": 0.8671041108756076, + "grad_norm": 0.03870411499824972, + "learning_rate": 0.0006976782061398657, + "loss": 0.5865, + "step": 17080 + }, + { + "epoch": 0.8673579469736391, + "grad_norm": 0.02581985137998219, + "learning_rate": 0.0006974747089872488, + "loss": 0.5883, + "step": 17085 + }, + { + "epoch": 0.8676117830716706, + "grad_norm": 0.03341194341814896, + "learning_rate": 0.0006972711730713315, + "loss": 0.573, + "step": 17090 + }, + { + "epoch": 0.8678656191697022, + "grad_norm": 0.036010523707030416, + "learning_rate": 0.0006970675984320667, + "loss": 0.5535, + "step": 17095 + }, + { + "epoch": 0.8681194552677336, + "grad_norm": 0.027649651226152285, + "learning_rate": 0.000696863985109415, + "loss": 0.5654, + "step": 17100 + }, + { + "epoch": 0.8683732913657651, + "grad_norm": 0.03162081059227697, + "learning_rate": 0.0006966603331433447, + "loss": 0.6022, + "step": 17105 + }, + { + "epoch": 0.8686271274637967, + "grad_norm": 0.032299427855624986, + "learning_rate": 0.0006964566425738321, + "loss": 0.5425, + "step": 17110 + }, + { + "epoch": 0.8688809635618281, + "grad_norm": 0.021742217941501947, + "learning_rate": 0.0006962529134408599, + "loss": 0.5897, + "step": 17115 + }, + { + "epoch": 0.8691347996598596, + "grad_norm": 0.0346527041544656, + "learning_rate": 0.0006960491457844198, + "loss": 0.5532, + "step": 17120 + }, + { + "epoch": 0.8693886357578912, + "grad_norm": 0.022954764473995297, + "learning_rate": 0.00069584533964451, + "loss": 0.5665, + "step": 17125 + }, + { + "epoch": 0.8696424718559226, + "grad_norm": 0.02307425717340246, + "learning_rate": 0.0006956414950611366, + "loss": 0.5429, + "step": 17130 + }, + { + "epoch": 0.8698963079539541, + "grad_norm": 0.023508308014224755, + "learning_rate": 0.0006954376120743136, + "loss": 0.5923, + "step": 17135 + }, + { + "epoch": 0.8701501440519857, + "grad_norm": 0.025046949091913972, + "learning_rate": 0.0006952336907240616, + "loss": 0.5354, + "step": 17140 + }, + { + "epoch": 0.8704039801500172, + "grad_norm": 0.02807435788273573, + "learning_rate": 0.00069502973105041, + "loss": 0.5499, + "step": 17145 + }, + { + "epoch": 0.8706578162480486, + "grad_norm": 0.023394478814900033, + "learning_rate": 0.0006948257330933948, + "loss": 0.5804, + "step": 17150 + }, + { + "epoch": 0.8709116523460801, + "grad_norm": 0.02484051188427158, + "learning_rate": 0.0006946216968930598, + "loss": 0.5795, + "step": 17155 + }, + { + "epoch": 0.8711654884441117, + "grad_norm": 0.02775584758767845, + "learning_rate": 0.0006944176224894563, + "loss": 0.5821, + "step": 17160 + }, + { + "epoch": 0.8714193245421431, + "grad_norm": 0.03912060374962221, + "learning_rate": 0.000694213509922643, + "loss": 0.583, + "step": 17165 + }, + { + "epoch": 0.8716731606401746, + "grad_norm": 0.0362056815020368, + "learning_rate": 0.0006940093592326861, + "loss": 0.571, + "step": 17170 + }, + { + "epoch": 0.8719269967382062, + "grad_norm": 0.028946737163921484, + "learning_rate": 0.0006938051704596598, + "loss": 0.5959, + "step": 17175 + }, + { + "epoch": 0.8721808328362376, + "grad_norm": 0.02620608291978458, + "learning_rate": 0.0006936009436436448, + "loss": 0.5515, + "step": 17180 + }, + { + "epoch": 0.8724346689342691, + "grad_norm": 0.02801972461379925, + "learning_rate": 0.0006933966788247302, + "loss": 0.565, + "step": 17185 + }, + { + "epoch": 0.8726885050323007, + "grad_norm": 0.042778133049176235, + "learning_rate": 0.000693192376043012, + "loss": 0.5829, + "step": 17190 + }, + { + "epoch": 0.8729423411303322, + "grad_norm": 0.03858970271483881, + "learning_rate": 0.0006929880353385938, + "loss": 0.5507, + "step": 17195 + }, + { + "epoch": 0.8731961772283636, + "grad_norm": 0.031470436939788314, + "learning_rate": 0.0006927836567515866, + "loss": 0.5978, + "step": 17200 + }, + { + "epoch": 0.8734500133263952, + "grad_norm": 0.07394640988697473, + "learning_rate": 0.0006925792403221091, + "loss": 0.5749, + "step": 17205 + }, + { + "epoch": 0.8737038494244267, + "grad_norm": 0.03603952523691002, + "learning_rate": 0.0006923747860902871, + "loss": 0.5866, + "step": 17210 + }, + { + "epoch": 0.8739576855224581, + "grad_norm": 0.043584509249294856, + "learning_rate": 0.000692170294096254, + "loss": 0.5899, + "step": 17215 + }, + { + "epoch": 0.8742115216204897, + "grad_norm": 0.024373073949978077, + "learning_rate": 0.0006919657643801504, + "loss": 0.506, + "step": 17220 + }, + { + "epoch": 0.8744653577185212, + "grad_norm": 0.029580565745340112, + "learning_rate": 0.0006917611969821248, + "loss": 0.5343, + "step": 17225 + }, + { + "epoch": 0.8747191938165526, + "grad_norm": 0.0257140032071113, + "learning_rate": 0.0006915565919423324, + "loss": 0.5875, + "step": 17230 + }, + { + "epoch": 0.8749730299145841, + "grad_norm": 0.03390362964198948, + "learning_rate": 0.0006913519493009363, + "loss": 0.5813, + "step": 17235 + }, + { + "epoch": 0.8752268660126157, + "grad_norm": 0.05042303052722009, + "learning_rate": 0.0006911472690981069, + "loss": 0.606, + "step": 17240 + }, + { + "epoch": 0.8754807021106471, + "grad_norm": 0.05272668324721124, + "learning_rate": 0.0006909425513740217, + "loss": 0.5965, + "step": 17245 + }, + { + "epoch": 0.8757345382086786, + "grad_norm": 0.0488542916140239, + "learning_rate": 0.000690737796168866, + "loss": 0.5871, + "step": 17250 + }, + { + "epoch": 0.8759883743067102, + "grad_norm": 0.030433512897338318, + "learning_rate": 0.0006905330035228321, + "loss": 0.5384, + "step": 17255 + }, + { + "epoch": 0.8762422104047417, + "grad_norm": 0.026478083460412936, + "learning_rate": 0.0006903281734761197, + "loss": 0.5365, + "step": 17260 + }, + { + "epoch": 0.8764960465027731, + "grad_norm": 0.029126871537591283, + "learning_rate": 0.000690123306068936, + "loss": 0.5554, + "step": 17265 + }, + { + "epoch": 0.8767498826008047, + "grad_norm": 0.032584410814189266, + "learning_rate": 0.0006899184013414955, + "loss": 0.5551, + "step": 17270 + }, + { + "epoch": 0.8770037186988362, + "grad_norm": 0.03105939707552323, + "learning_rate": 0.00068971345933402, + "loss": 0.5244, + "step": 17275 + }, + { + "epoch": 0.8772575547968676, + "grad_norm": 0.04428702286739715, + "learning_rate": 0.0006895084800867386, + "loss": 0.5524, + "step": 17280 + }, + { + "epoch": 0.8775113908948992, + "grad_norm": 0.026501029034102612, + "learning_rate": 0.0006893034636398875, + "loss": 0.5697, + "step": 17285 + }, + { + "epoch": 0.8777652269929307, + "grad_norm": 0.02789104304834869, + "learning_rate": 0.0006890984100337105, + "loss": 0.5451, + "step": 17290 + }, + { + "epoch": 0.8780190630909621, + "grad_norm": 0.030381112646398638, + "learning_rate": 0.0006888933193084588, + "loss": 0.5728, + "step": 17295 + }, + { + "epoch": 0.8782728991889936, + "grad_norm": 0.03320203500452083, + "learning_rate": 0.0006886881915043905, + "loss": 0.5478, + "step": 17300 + }, + { + "epoch": 0.8785267352870252, + "grad_norm": 0.04308556659587886, + "learning_rate": 0.0006884830266617711, + "loss": 0.5476, + "step": 17305 + }, + { + "epoch": 0.8787805713850567, + "grad_norm": 0.04744726865498166, + "learning_rate": 0.0006882778248208737, + "loss": 0.5283, + "step": 17310 + }, + { + "epoch": 0.8790344074830881, + "grad_norm": 0.04730302242145846, + "learning_rate": 0.000688072586021978, + "loss": 0.5921, + "step": 17315 + }, + { + "epoch": 0.8792882435811197, + "grad_norm": 0.02844248850997349, + "learning_rate": 0.0006878673103053717, + "loss": 0.5359, + "step": 17320 + }, + { + "epoch": 0.8795420796791512, + "grad_norm": 0.02665626078133169, + "learning_rate": 0.0006876619977113492, + "loss": 0.5729, + "step": 17325 + }, + { + "epoch": 0.8797959157771826, + "grad_norm": 0.05159803146684941, + "learning_rate": 0.0006874566482802125, + "loss": 0.5683, + "step": 17330 + }, + { + "epoch": 0.8800497518752142, + "grad_norm": 0.028811741517915785, + "learning_rate": 0.0006872512620522707, + "loss": 0.577, + "step": 17335 + }, + { + "epoch": 0.8803035879732457, + "grad_norm": 0.032289986764814115, + "learning_rate": 0.0006870458390678397, + "loss": 0.5473, + "step": 17340 + }, + { + "epoch": 0.8805574240712771, + "grad_norm": 0.026018555624320636, + "learning_rate": 0.0006868403793672435, + "loss": 0.5738, + "step": 17345 + }, + { + "epoch": 0.8808112601693087, + "grad_norm": 0.02320427721745887, + "learning_rate": 0.0006866348829908125, + "loss": 0.5572, + "step": 17350 + }, + { + "epoch": 0.8810650962673402, + "grad_norm": 0.028063747667248035, + "learning_rate": 0.0006864293499788849, + "loss": 0.5831, + "step": 17355 + }, + { + "epoch": 0.8813189323653717, + "grad_norm": 0.026346028747411987, + "learning_rate": 0.0006862237803718054, + "loss": 0.5708, + "step": 17360 + }, + { + "epoch": 0.8815727684634032, + "grad_norm": 0.023583638229848918, + "learning_rate": 0.0006860181742099266, + "loss": 0.5457, + "step": 17365 + }, + { + "epoch": 0.8818266045614347, + "grad_norm": 0.03017396130009387, + "learning_rate": 0.0006858125315336079, + "loss": 0.5551, + "step": 17370 + }, + { + "epoch": 0.8820804406594662, + "grad_norm": 0.04360521104064648, + "learning_rate": 0.0006856068523832158, + "loss": 0.5653, + "step": 17375 + }, + { + "epoch": 0.8823342767574976, + "grad_norm": 0.02249120515299714, + "learning_rate": 0.0006854011367991243, + "loss": 0.521, + "step": 17380 + }, + { + "epoch": 0.8825881128555292, + "grad_norm": 0.03244681552967337, + "learning_rate": 0.0006851953848217142, + "loss": 0.5652, + "step": 17385 + }, + { + "epoch": 0.8828419489535607, + "grad_norm": 0.026484829488799315, + "learning_rate": 0.0006849895964913737, + "loss": 0.5456, + "step": 17390 + }, + { + "epoch": 0.8830957850515921, + "grad_norm": 0.027157693421820065, + "learning_rate": 0.0006847837718484977, + "loss": 0.5214, + "step": 17395 + }, + { + "epoch": 0.8833496211496237, + "grad_norm": 0.07609582956202243, + "learning_rate": 0.0006845779109334891, + "loss": 0.545, + "step": 17400 + }, + { + "epoch": 0.8836034572476552, + "grad_norm": 0.03239068638126852, + "learning_rate": 0.0006843720137867569, + "loss": 0.5713, + "step": 17405 + }, + { + "epoch": 0.8838572933456867, + "grad_norm": 0.025149606981124856, + "learning_rate": 0.0006841660804487179, + "loss": 0.5704, + "step": 17410 + }, + { + "epoch": 0.8841111294437182, + "grad_norm": 0.023833536556687185, + "learning_rate": 0.0006839601109597957, + "loss": 0.5377, + "step": 17415 + }, + { + "epoch": 0.8843649655417497, + "grad_norm": 0.028450371295118047, + "learning_rate": 0.0006837541053604213, + "loss": 0.5916, + "step": 17420 + }, + { + "epoch": 0.8846188016397812, + "grad_norm": 0.033176344085271356, + "learning_rate": 0.0006835480636910321, + "loss": 0.5335, + "step": 17425 + }, + { + "epoch": 0.8848726377378127, + "grad_norm": 0.02880029477938549, + "learning_rate": 0.0006833419859920736, + "loss": 0.5554, + "step": 17430 + }, + { + "epoch": 0.8851264738358442, + "grad_norm": 0.029522099650815575, + "learning_rate": 0.0006831358723039976, + "loss": 0.5704, + "step": 17435 + }, + { + "epoch": 0.8853803099338757, + "grad_norm": 0.04615263077662782, + "learning_rate": 0.000682929722667263, + "loss": 0.5362, + "step": 17440 + }, + { + "epoch": 0.8856341460319072, + "grad_norm": 0.48427634793861496, + "learning_rate": 0.0006827235371223362, + "loss": 0.5819, + "step": 17445 + }, + { + "epoch": 0.8858879821299387, + "grad_norm": 0.030739394979893288, + "learning_rate": 0.0006825173157096903, + "loss": 0.5927, + "step": 17450 + }, + { + "epoch": 0.8861418182279702, + "grad_norm": 0.03176037900151184, + "learning_rate": 0.0006823110584698055, + "loss": 0.5528, + "step": 17455 + }, + { + "epoch": 0.8863956543260016, + "grad_norm": 0.026725514935062814, + "learning_rate": 0.0006821047654431691, + "loss": 0.5695, + "step": 17460 + }, + { + "epoch": 0.8866494904240332, + "grad_norm": 0.02741254348251184, + "learning_rate": 0.0006818984366702754, + "loss": 0.5361, + "step": 17465 + }, + { + "epoch": 0.8869033265220647, + "grad_norm": 0.06467389710456385, + "learning_rate": 0.0006816920721916259, + "loss": 0.5637, + "step": 17470 + }, + { + "epoch": 0.8871571626200963, + "grad_norm": 0.05652090655931264, + "learning_rate": 0.0006814856720477285, + "loss": 0.5662, + "step": 17475 + }, + { + "epoch": 0.8874109987181277, + "grad_norm": 0.025971100514922223, + "learning_rate": 0.0006812792362790987, + "loss": 0.5662, + "step": 17480 + }, + { + "epoch": 0.8876648348161592, + "grad_norm": 0.037312238893479364, + "learning_rate": 0.0006810727649262591, + "loss": 0.5216, + "step": 17485 + }, + { + "epoch": 0.8879186709141907, + "grad_norm": 0.03529026683625101, + "learning_rate": 0.0006808662580297385, + "loss": 0.5754, + "step": 17490 + }, + { + "epoch": 0.8881725070122222, + "grad_norm": 0.052177166996155586, + "learning_rate": 0.0006806597156300736, + "loss": 0.5862, + "step": 17495 + }, + { + "epoch": 0.8884263431102537, + "grad_norm": 0.034917141903712645, + "learning_rate": 0.0006804531377678074, + "loss": 0.5504, + "step": 17500 + }, + { + "epoch": 0.8886801792082852, + "grad_norm": 0.035789909497893145, + "learning_rate": 0.0006802465244834901, + "loss": 0.523, + "step": 17505 + }, + { + "epoch": 0.8889340153063167, + "grad_norm": 0.02633496325094234, + "learning_rate": 0.000680039875817679, + "loss": 0.5773, + "step": 17510 + }, + { + "epoch": 0.8891878514043482, + "grad_norm": 0.1027046920453322, + "learning_rate": 0.0006798331918109381, + "loss": 0.5328, + "step": 17515 + }, + { + "epoch": 0.8894416875023797, + "grad_norm": 0.03431649346909258, + "learning_rate": 0.0006796264725038387, + "loss": 0.584, + "step": 17520 + }, + { + "epoch": 0.8896955236004113, + "grad_norm": 0.02609942274925054, + "learning_rate": 0.0006794197179369584, + "loss": 0.5586, + "step": 17525 + }, + { + "epoch": 0.8899493596984427, + "grad_norm": 0.03430169189866086, + "learning_rate": 0.0006792129281508821, + "loss": 0.546, + "step": 17530 + }, + { + "epoch": 0.8902031957964742, + "grad_norm": 0.024928043771357686, + "learning_rate": 0.0006790061031862018, + "loss": 0.5435, + "step": 17535 + }, + { + "epoch": 0.8904570318945058, + "grad_norm": 0.028688456933202128, + "learning_rate": 0.0006787992430835161, + "loss": 0.5732, + "step": 17540 + }, + { + "epoch": 0.8907108679925372, + "grad_norm": 0.042830556579080276, + "learning_rate": 0.0006785923478834308, + "loss": 0.5484, + "step": 17545 + }, + { + "epoch": 0.8909647040905687, + "grad_norm": 0.035751869994615854, + "learning_rate": 0.0006783854176265582, + "loss": 0.5575, + "step": 17550 + }, + { + "epoch": 0.8912185401886003, + "grad_norm": 0.028210300266072946, + "learning_rate": 0.0006781784523535177, + "loss": 0.5915, + "step": 17555 + }, + { + "epoch": 0.8914723762866317, + "grad_norm": 0.03715659125320336, + "learning_rate": 0.0006779714521049356, + "loss": 0.5359, + "step": 17560 + }, + { + "epoch": 0.8917262123846632, + "grad_norm": 0.03583841060325269, + "learning_rate": 0.000677764416921445, + "loss": 0.5624, + "step": 17565 + }, + { + "epoch": 0.8919800484826947, + "grad_norm": 0.03003577944086306, + "learning_rate": 0.000677557346843686, + "loss": 0.5508, + "step": 17570 + }, + { + "epoch": 0.8922338845807263, + "grad_norm": 0.03580088688841066, + "learning_rate": 0.0006773502419123051, + "loss": 0.5862, + "step": 17575 + }, + { + "epoch": 0.8924877206787577, + "grad_norm": 0.03560540588373299, + "learning_rate": 0.0006771431021679561, + "loss": 0.5623, + "step": 17580 + }, + { + "epoch": 0.8927415567767892, + "grad_norm": 0.029606401551905418, + "learning_rate": 0.0006769359276512998, + "loss": 0.5346, + "step": 17585 + }, + { + "epoch": 0.8929953928748208, + "grad_norm": 0.034532543727339556, + "learning_rate": 0.0006767287184030031, + "loss": 0.5903, + "step": 17590 + }, + { + "epoch": 0.8932492289728522, + "grad_norm": 0.029377320096539063, + "learning_rate": 0.0006765214744637402, + "loss": 0.5723, + "step": 17595 + }, + { + "epoch": 0.8935030650708837, + "grad_norm": 0.030743555347065143, + "learning_rate": 0.0006763141958741924, + "loss": 0.5641, + "step": 17600 + }, + { + "epoch": 0.8937569011689153, + "grad_norm": 0.0409722689803858, + "learning_rate": 0.0006761068826750472, + "loss": 0.5476, + "step": 17605 + }, + { + "epoch": 0.8940107372669467, + "grad_norm": 0.029101547286446497, + "learning_rate": 0.0006758995349069992, + "loss": 0.5678, + "step": 17610 + }, + { + "epoch": 0.8942645733649782, + "grad_norm": 0.026075517265298196, + "learning_rate": 0.0006756921526107495, + "loss": 0.5715, + "step": 17615 + }, + { + "epoch": 0.8945184094630098, + "grad_norm": 0.04138837555118283, + "learning_rate": 0.0006754847358270066, + "loss": 0.5776, + "step": 17620 + }, + { + "epoch": 0.8947722455610413, + "grad_norm": 0.02499298797537694, + "learning_rate": 0.0006752772845964852, + "loss": 0.5813, + "step": 17625 + }, + { + "epoch": 0.8950260816590727, + "grad_norm": 0.02920872117024721, + "learning_rate": 0.0006750697989599068, + "loss": 0.567, + "step": 17630 + }, + { + "epoch": 0.8952799177571042, + "grad_norm": 0.024311481377263307, + "learning_rate": 0.0006748622789580001, + "loss": 0.5472, + "step": 17635 + }, + { + "epoch": 0.8955337538551358, + "grad_norm": 0.022553193192463565, + "learning_rate": 0.0006746547246315, + "loss": 0.5839, + "step": 17640 + }, + { + "epoch": 0.8957875899531672, + "grad_norm": 0.025203410801119673, + "learning_rate": 0.0006744471360211484, + "loss": 0.5593, + "step": 17645 + }, + { + "epoch": 0.8960414260511987, + "grad_norm": 0.06256849888992963, + "learning_rate": 0.0006742395131676942, + "loss": 0.5406, + "step": 17650 + }, + { + "epoch": 0.8962952621492303, + "grad_norm": 0.02521288112975126, + "learning_rate": 0.0006740318561118922, + "loss": 0.5682, + "step": 17655 + }, + { + "epoch": 0.8965490982472617, + "grad_norm": 0.3098613025846971, + "learning_rate": 0.0006738241648945049, + "loss": 0.5492, + "step": 17660 + }, + { + "epoch": 0.8968029343452932, + "grad_norm": 0.037730912321007094, + "learning_rate": 0.0006736164395563009, + "loss": 0.59, + "step": 17665 + }, + { + "epoch": 0.8970567704433248, + "grad_norm": 0.034261701601160224, + "learning_rate": 0.0006734086801380556, + "loss": 0.549, + "step": 17670 + }, + { + "epoch": 0.8973106065413562, + "grad_norm": 0.023592660379858885, + "learning_rate": 0.0006732008866805512, + "loss": 0.5566, + "step": 17675 + }, + { + "epoch": 0.8975644426393877, + "grad_norm": 0.03082538341771991, + "learning_rate": 0.0006729930592245764, + "loss": 0.5699, + "step": 17680 + }, + { + "epoch": 0.8978182787374193, + "grad_norm": 0.028133490847564162, + "learning_rate": 0.000672785197810927, + "loss": 0.5445, + "step": 17685 + }, + { + "epoch": 0.8980721148354508, + "grad_norm": 0.03384650095501219, + "learning_rate": 0.0006725773024804047, + "loss": 0.572, + "step": 17690 + }, + { + "epoch": 0.8983259509334822, + "grad_norm": 0.029532460122143937, + "learning_rate": 0.0006723693732738188, + "loss": 0.5905, + "step": 17695 + }, + { + "epoch": 0.8985797870315138, + "grad_norm": 0.04184269802969518, + "learning_rate": 0.0006721614102319845, + "loss": 0.595, + "step": 17700 + }, + { + "epoch": 0.8988336231295453, + "grad_norm": 0.024257335369379907, + "learning_rate": 0.0006719534133957237, + "loss": 0.5904, + "step": 17705 + }, + { + "epoch": 0.8990874592275767, + "grad_norm": 0.8407754555100807, + "learning_rate": 0.0006717453828058655, + "loss": 0.5799, + "step": 17710 + }, + { + "epoch": 0.8993412953256082, + "grad_norm": 0.03889746059406645, + "learning_rate": 0.0006715373185032452, + "loss": 0.5636, + "step": 17715 + }, + { + "epoch": 0.8995951314236398, + "grad_norm": 0.03812093829559977, + "learning_rate": 0.0006713292205287047, + "loss": 0.5268, + "step": 17720 + }, + { + "epoch": 0.8998489675216712, + "grad_norm": 0.031502538038959534, + "learning_rate": 0.0006711210889230926, + "loss": 0.5409, + "step": 17725 + }, + { + "epoch": 0.9001028036197027, + "grad_norm": 0.04019619925278292, + "learning_rate": 0.0006709129237272642, + "loss": 0.5921, + "step": 17730 + }, + { + "epoch": 0.9003566397177343, + "grad_norm": 0.029122633192066108, + "learning_rate": 0.0006707047249820813, + "loss": 0.5613, + "step": 17735 + }, + { + "epoch": 0.9006104758157658, + "grad_norm": 0.04486417536535739, + "learning_rate": 0.0006704964927284119, + "loss": 0.5838, + "step": 17740 + }, + { + "epoch": 0.9008643119137972, + "grad_norm": 0.035238015488419025, + "learning_rate": 0.0006702882270071313, + "loss": 0.56, + "step": 17745 + }, + { + "epoch": 0.9011181480118288, + "grad_norm": 0.047574854581445294, + "learning_rate": 0.0006700799278591212, + "loss": 0.5946, + "step": 17750 + }, + { + "epoch": 0.9013719841098603, + "grad_norm": 0.029209760126982294, + "learning_rate": 0.0006698715953252693, + "loss": 0.5785, + "step": 17755 + }, + { + "epoch": 0.9016258202078917, + "grad_norm": 0.04635207472974123, + "learning_rate": 0.0006696632294464704, + "loss": 0.6096, + "step": 17760 + }, + { + "epoch": 0.9018796563059233, + "grad_norm": 0.04669566058976855, + "learning_rate": 0.0006694548302636256, + "loss": 0.5996, + "step": 17765 + }, + { + "epoch": 0.9021334924039548, + "grad_norm": 0.047565163368266106, + "learning_rate": 0.0006692463978176428, + "loss": 0.5887, + "step": 17770 + }, + { + "epoch": 0.9023873285019862, + "grad_norm": 0.04629943424684873, + "learning_rate": 0.0006690379321494361, + "loss": 0.5573, + "step": 17775 + }, + { + "epoch": 0.9026411646000178, + "grad_norm": 0.02436241562205948, + "learning_rate": 0.0006688294332999263, + "loss": 0.5669, + "step": 17780 + }, + { + "epoch": 0.9028950006980493, + "grad_norm": 0.04739393205079893, + "learning_rate": 0.0006686209013100407, + "loss": 0.5763, + "step": 17785 + }, + { + "epoch": 0.9031488367960808, + "grad_norm": 0.06024300726679382, + "learning_rate": 0.0006684123362207131, + "loss": 0.5468, + "step": 17790 + }, + { + "epoch": 0.9034026728941122, + "grad_norm": 0.024550757216584715, + "learning_rate": 0.0006682037380728839, + "loss": 0.5404, + "step": 17795 + }, + { + "epoch": 0.9036565089921438, + "grad_norm": 0.029738050538604267, + "learning_rate": 0.0006679951069074995, + "loss": 0.5511, + "step": 17800 + }, + { + "epoch": 0.9039103450901753, + "grad_norm": 0.031231955912328697, + "learning_rate": 0.0006677864427655135, + "loss": 0.5778, + "step": 17805 + }, + { + "epoch": 0.9041641811882067, + "grad_norm": 0.12238924641223849, + "learning_rate": 0.0006675777456878855, + "loss": 0.5493, + "step": 17810 + }, + { + "epoch": 0.9044180172862383, + "grad_norm": 0.04489807461665631, + "learning_rate": 0.0006673690157155818, + "loss": 0.5639, + "step": 17815 + }, + { + "epoch": 0.9046718533842698, + "grad_norm": 0.040984958114977495, + "learning_rate": 0.000667160252889575, + "loss": 0.5693, + "step": 17820 + }, + { + "epoch": 0.9049256894823012, + "grad_norm": 0.0299007750606263, + "learning_rate": 0.0006669514572508441, + "loss": 0.5359, + "step": 17825 + }, + { + "epoch": 0.9051795255803328, + "grad_norm": 0.02649079224122146, + "learning_rate": 0.0006667426288403749, + "loss": 0.5571, + "step": 17830 + }, + { + "epoch": 0.9054333616783643, + "grad_norm": 0.0491849427073363, + "learning_rate": 0.000666533767699159, + "loss": 0.5594, + "step": 17835 + }, + { + "epoch": 0.9056871977763958, + "grad_norm": 0.028577497869547455, + "learning_rate": 0.0006663248738681951, + "loss": 0.5776, + "step": 17840 + }, + { + "epoch": 0.9059410338744273, + "grad_norm": 0.029544273344263332, + "learning_rate": 0.0006661159473884879, + "loss": 0.5156, + "step": 17845 + }, + { + "epoch": 0.9061948699724588, + "grad_norm": 0.07785921945262643, + "learning_rate": 0.0006659069883010487, + "loss": 0.6146, + "step": 17850 + }, + { + "epoch": 0.9064487060704903, + "grad_norm": 0.04916615480146852, + "learning_rate": 0.0006656979966468949, + "loss": 0.621, + "step": 17855 + }, + { + "epoch": 0.9067025421685218, + "grad_norm": 0.04266402533079821, + "learning_rate": 0.0006654889724670509, + "loss": 0.5852, + "step": 17860 + }, + { + "epoch": 0.9069563782665533, + "grad_norm": 0.03478646050112459, + "learning_rate": 0.0006652799158025466, + "loss": 0.5586, + "step": 17865 + }, + { + "epoch": 0.9072102143645848, + "grad_norm": 0.03269211204236239, + "learning_rate": 0.0006650708266944194, + "loss": 0.5417, + "step": 17870 + }, + { + "epoch": 0.9074640504626162, + "grad_norm": 0.05926828968618208, + "learning_rate": 0.000664861705183712, + "loss": 0.5573, + "step": 17875 + }, + { + "epoch": 0.9077178865606478, + "grad_norm": 1.137998555724827, + "learning_rate": 0.0006646525513114741, + "loss": 0.7911, + "step": 17880 + }, + { + "epoch": 0.9079717226586793, + "grad_norm": 0.15186991344137019, + "learning_rate": 0.0006644433651187613, + "loss": 0.6537, + "step": 17885 + }, + { + "epoch": 0.9082255587567107, + "grad_norm": 0.10837397788584424, + "learning_rate": 0.0006642341466466363, + "loss": 0.5914, + "step": 17890 + }, + { + "epoch": 0.9084793948547423, + "grad_norm": 0.04567223182586979, + "learning_rate": 0.0006640248959361671, + "loss": 0.5919, + "step": 17895 + }, + { + "epoch": 0.9087332309527738, + "grad_norm": 0.0424493699036947, + "learning_rate": 0.000663815613028429, + "loss": 0.5887, + "step": 17900 + }, + { + "epoch": 0.9089870670508053, + "grad_norm": 0.02914392670442529, + "learning_rate": 0.0006636062979645029, + "loss": 0.6148, + "step": 17905 + }, + { + "epoch": 0.9092409031488368, + "grad_norm": 0.03378335007634043, + "learning_rate": 0.0006633969507854764, + "loss": 0.5855, + "step": 17910 + }, + { + "epoch": 0.9094947392468683, + "grad_norm": 0.045172687759939864, + "learning_rate": 0.0006631875715324433, + "loss": 0.5607, + "step": 17915 + }, + { + "epoch": 0.9097485753448998, + "grad_norm": 0.03534790871616725, + "learning_rate": 0.0006629781602465039, + "loss": 0.5642, + "step": 17920 + }, + { + "epoch": 0.9100024114429313, + "grad_norm": 0.031709338585968895, + "learning_rate": 0.0006627687169687643, + "loss": 0.5685, + "step": 17925 + }, + { + "epoch": 0.9102562475409628, + "grad_norm": 0.02405831853389006, + "learning_rate": 0.0006625592417403372, + "loss": 0.5837, + "step": 17930 + }, + { + "epoch": 0.9105100836389943, + "grad_norm": 0.02887059553974142, + "learning_rate": 0.0006623497346023419, + "loss": 0.6114, + "step": 17935 + }, + { + "epoch": 0.9107639197370258, + "grad_norm": 0.026405583798025718, + "learning_rate": 0.0006621401955959029, + "loss": 0.5811, + "step": 17940 + }, + { + "epoch": 0.9110177558350573, + "grad_norm": 0.030545139437636752, + "learning_rate": 0.0006619306247621525, + "loss": 0.5621, + "step": 17945 + }, + { + "epoch": 0.9112715919330888, + "grad_norm": 0.03844677188791095, + "learning_rate": 0.0006617210221422278, + "loss": 0.5567, + "step": 17950 + }, + { + "epoch": 0.9115254280311204, + "grad_norm": 0.025348216932646248, + "learning_rate": 0.0006615113877772729, + "loss": 0.5636, + "step": 17955 + }, + { + "epoch": 0.9117792641291518, + "grad_norm": 0.02747141569387044, + "learning_rate": 0.0006613017217084382, + "loss": 0.58, + "step": 17960 + }, + { + "epoch": 0.9120331002271833, + "grad_norm": 0.02571958303044386, + "learning_rate": 0.00066109202397688, + "loss": 0.569, + "step": 17965 + }, + { + "epoch": 0.9122869363252148, + "grad_norm": 0.02567934522052075, + "learning_rate": 0.0006608822946237607, + "loss": 0.5563, + "step": 17970 + }, + { + "epoch": 0.9125407724232463, + "grad_norm": 0.02982421448410097, + "learning_rate": 0.0006606725336902493, + "loss": 0.5524, + "step": 17975 + }, + { + "epoch": 0.9127946085212778, + "grad_norm": 0.02603891319299408, + "learning_rate": 0.0006604627412175209, + "loss": 0.5962, + "step": 17980 + }, + { + "epoch": 0.9130484446193093, + "grad_norm": 0.030676874285685652, + "learning_rate": 0.0006602529172467564, + "loss": 0.5556, + "step": 17985 + }, + { + "epoch": 0.9133022807173408, + "grad_norm": 0.028691123229749194, + "learning_rate": 0.0006600430618191436, + "loss": 0.559, + "step": 17990 + }, + { + "epoch": 0.9135561168153723, + "grad_norm": 0.022843921568002558, + "learning_rate": 0.0006598331749758759, + "loss": 0.5935, + "step": 17995 + }, + { + "epoch": 0.9138099529134038, + "grad_norm": 0.026576251107490675, + "learning_rate": 0.0006596232567581531, + "loss": 0.5514, + "step": 18000 + }, + { + "epoch": 0.9140637890114354, + "grad_norm": 0.029295378037257274, + "learning_rate": 0.0006594133072071809, + "loss": 0.5623, + "step": 18005 + }, + { + "epoch": 0.9143176251094668, + "grad_norm": 0.026342637202697045, + "learning_rate": 0.0006592033263641715, + "loss": 0.5377, + "step": 18010 + }, + { + "epoch": 0.9145714612074983, + "grad_norm": 0.0268948168872539, + "learning_rate": 0.000658993314270343, + "loss": 0.5734, + "step": 18015 + }, + { + "epoch": 0.9148252973055299, + "grad_norm": 0.03228314048721056, + "learning_rate": 0.00065878327096692, + "loss": 0.5695, + "step": 18020 + }, + { + "epoch": 0.9150791334035613, + "grad_norm": 0.03776376655159827, + "learning_rate": 0.0006585731964951327, + "loss": 0.5676, + "step": 18025 + }, + { + "epoch": 0.9153329695015928, + "grad_norm": 0.02713281422594516, + "learning_rate": 0.0006583630908962178, + "loss": 0.6121, + "step": 18030 + }, + { + "epoch": 0.9155868055996244, + "grad_norm": 0.0313761277274463, + "learning_rate": 0.0006581529542114178, + "loss": 0.5656, + "step": 18035 + }, + { + "epoch": 0.9158406416976558, + "grad_norm": 0.023551377190705478, + "learning_rate": 0.0006579427864819817, + "loss": 0.5859, + "step": 18040 + }, + { + "epoch": 0.9160944777956873, + "grad_norm": 0.03213179076154911, + "learning_rate": 0.0006577325877491641, + "loss": 0.5385, + "step": 18045 + }, + { + "epoch": 0.9163483138937188, + "grad_norm": 0.03309679333832961, + "learning_rate": 0.0006575223580542263, + "loss": 0.5686, + "step": 18050 + }, + { + "epoch": 0.9166021499917504, + "grad_norm": 0.02292030032279284, + "learning_rate": 0.0006573120974384351, + "loss": 0.569, + "step": 18055 + }, + { + "epoch": 0.9168559860897818, + "grad_norm": 0.02196534616948093, + "learning_rate": 0.0006571018059430638, + "loss": 0.5583, + "step": 18060 + }, + { + "epoch": 0.9171098221878133, + "grad_norm": 0.03751186679215072, + "learning_rate": 0.0006568914836093913, + "loss": 0.5632, + "step": 18065 + }, + { + "epoch": 0.9173636582858449, + "grad_norm": 0.039073154648674925, + "learning_rate": 0.000656681130478703, + "loss": 0.5875, + "step": 18070 + }, + { + "epoch": 0.9176174943838763, + "grad_norm": 0.03016782826811619, + "learning_rate": 0.0006564707465922901, + "loss": 0.5562, + "step": 18075 + }, + { + "epoch": 0.9178713304819078, + "grad_norm": 0.44620351828974875, + "learning_rate": 0.0006562603319914502, + "loss": 0.5836, + "step": 18080 + }, + { + "epoch": 0.9181251665799394, + "grad_norm": 0.03733778783819997, + "learning_rate": 0.0006560498867174862, + "loss": 0.5544, + "step": 18085 + }, + { + "epoch": 0.9183790026779708, + "grad_norm": 0.04185835413154692, + "learning_rate": 0.0006558394108117078, + "loss": 0.5843, + "step": 18090 + }, + { + "epoch": 0.9186328387760023, + "grad_norm": 0.02584084106316554, + "learning_rate": 0.00065562890431543, + "loss": 0.545, + "step": 18095 + }, + { + "epoch": 0.9188866748740339, + "grad_norm": 0.026170167620165836, + "learning_rate": 0.0006554183672699747, + "loss": 0.5684, + "step": 18100 + }, + { + "epoch": 0.9191405109720653, + "grad_norm": 0.028617521912761606, + "learning_rate": 0.0006552077997166686, + "loss": 0.5778, + "step": 18105 + }, + { + "epoch": 0.9193943470700968, + "grad_norm": 0.024764375605303438, + "learning_rate": 0.0006549972016968457, + "loss": 0.5668, + "step": 18110 + }, + { + "epoch": 0.9196481831681284, + "grad_norm": 0.04224569793867106, + "learning_rate": 0.0006547865732518451, + "loss": 0.5563, + "step": 18115 + }, + { + "epoch": 0.9199020192661599, + "grad_norm": 0.024423393777903805, + "learning_rate": 0.0006545759144230122, + "loss": 0.5272, + "step": 18120 + }, + { + "epoch": 0.9201558553641913, + "grad_norm": 0.03562323884403274, + "learning_rate": 0.0006543652252516978, + "loss": 0.5497, + "step": 18125 + }, + { + "epoch": 0.9204096914622228, + "grad_norm": 0.03532107692857155, + "learning_rate": 0.0006541545057792597, + "loss": 0.562, + "step": 18130 + }, + { + "epoch": 0.9206635275602544, + "grad_norm": 0.02233461039237326, + "learning_rate": 0.0006539437560470609, + "loss": 0.5778, + "step": 18135 + }, + { + "epoch": 0.9209173636582858, + "grad_norm": 0.036379888044198186, + "learning_rate": 0.0006537329760964705, + "loss": 0.5385, + "step": 18140 + }, + { + "epoch": 0.9211711997563173, + "grad_norm": 0.031306097214398455, + "learning_rate": 0.0006535221659688636, + "loss": 0.5766, + "step": 18145 + }, + { + "epoch": 0.9214250358543489, + "grad_norm": 0.032940875660410714, + "learning_rate": 0.0006533113257056212, + "loss": 0.5745, + "step": 18150 + }, + { + "epoch": 0.9216788719523803, + "grad_norm": 0.028269439007305754, + "learning_rate": 0.0006531004553481299, + "loss": 0.5593, + "step": 18155 + }, + { + "epoch": 0.9219327080504118, + "grad_norm": 0.025325938543204793, + "learning_rate": 0.0006528895549377829, + "loss": 0.5545, + "step": 18160 + }, + { + "epoch": 0.9221865441484434, + "grad_norm": 0.025608019654732625, + "learning_rate": 0.0006526786245159785, + "loss": 0.5645, + "step": 18165 + }, + { + "epoch": 0.9224403802464749, + "grad_norm": 0.03073301426221954, + "learning_rate": 0.0006524676641241216, + "loss": 0.5729, + "step": 18170 + }, + { + "epoch": 0.9226942163445063, + "grad_norm": 0.02655926848711976, + "learning_rate": 0.0006522566738036227, + "loss": 0.5605, + "step": 18175 + }, + { + "epoch": 0.9229480524425379, + "grad_norm": 0.030923379907562828, + "learning_rate": 0.0006520456535958981, + "loss": 0.5438, + "step": 18180 + }, + { + "epoch": 0.9232018885405694, + "grad_norm": 0.02498118812708288, + "learning_rate": 0.0006518346035423697, + "loss": 0.5618, + "step": 18185 + }, + { + "epoch": 0.9234557246386008, + "grad_norm": 0.03195096400109142, + "learning_rate": 0.0006516235236844661, + "loss": 0.5771, + "step": 18190 + }, + { + "epoch": 0.9237095607366324, + "grad_norm": 0.025673518361398005, + "learning_rate": 0.0006514124140636206, + "loss": 0.5714, + "step": 18195 + }, + { + "epoch": 0.9239633968346639, + "grad_norm": 0.023420267058336542, + "learning_rate": 0.0006512012747212736, + "loss": 0.5543, + "step": 18200 + }, + { + "epoch": 0.9242172329326953, + "grad_norm": 0.035974375956175315, + "learning_rate": 0.0006509901056988703, + "loss": 0.5399, + "step": 18205 + }, + { + "epoch": 0.9244710690307268, + "grad_norm": 0.026974281528685886, + "learning_rate": 0.0006507789070378623, + "loss": 0.5803, + "step": 18210 + }, + { + "epoch": 0.9247249051287584, + "grad_norm": 0.03386779705440503, + "learning_rate": 0.0006505676787797068, + "loss": 0.5573, + "step": 18215 + }, + { + "epoch": 0.9249787412267899, + "grad_norm": 0.02994069343757094, + "learning_rate": 0.0006503564209658668, + "loss": 0.5631, + "step": 18220 + }, + { + "epoch": 0.9252325773248213, + "grad_norm": 0.025204798762410694, + "learning_rate": 0.0006501451336378111, + "loss": 0.5778, + "step": 18225 + }, + { + "epoch": 0.9254864134228529, + "grad_norm": 0.02844201792241884, + "learning_rate": 0.0006499338168370145, + "loss": 0.5494, + "step": 18230 + }, + { + "epoch": 0.9257402495208844, + "grad_norm": 0.027272927584755857, + "learning_rate": 0.0006497224706049574, + "loss": 0.5513, + "step": 18235 + }, + { + "epoch": 0.9259940856189158, + "grad_norm": 0.022625222413103843, + "learning_rate": 0.000649511094983126, + "loss": 0.5454, + "step": 18240 + }, + { + "epoch": 0.9262479217169474, + "grad_norm": 0.025965109273752868, + "learning_rate": 0.0006492996900130122, + "loss": 0.5521, + "step": 18245 + }, + { + "epoch": 0.9265017578149789, + "grad_norm": 0.023734960990954648, + "learning_rate": 0.0006490882557361138, + "loss": 0.569, + "step": 18250 + }, + { + "epoch": 0.9267555939130103, + "grad_norm": 0.029694319744587402, + "learning_rate": 0.0006488767921939344, + "loss": 0.5544, + "step": 18255 + }, + { + "epoch": 0.9270094300110419, + "grad_norm": 0.027522631414110266, + "learning_rate": 0.0006486652994279832, + "loss": 0.5191, + "step": 18260 + }, + { + "epoch": 0.9272632661090734, + "grad_norm": 0.0213560260774621, + "learning_rate": 0.000648453777479775, + "loss": 0.5602, + "step": 18265 + }, + { + "epoch": 0.9275171022071049, + "grad_norm": 0.027648563843928633, + "learning_rate": 0.0006482422263908305, + "loss": 0.5757, + "step": 18270 + }, + { + "epoch": 0.9277709383051364, + "grad_norm": 0.024346877510486684, + "learning_rate": 0.0006480306462026765, + "loss": 0.5502, + "step": 18275 + }, + { + "epoch": 0.9280247744031679, + "grad_norm": 0.025987941627693693, + "learning_rate": 0.0006478190369568447, + "loss": 0.572, + "step": 18280 + }, + { + "epoch": 0.9282786105011994, + "grad_norm": 0.03607494520553649, + "learning_rate": 0.0006476073986948731, + "loss": 0.5474, + "step": 18285 + }, + { + "epoch": 0.9285324465992308, + "grad_norm": 0.04133198513293862, + "learning_rate": 0.0006473957314583053, + "loss": 0.5646, + "step": 18290 + }, + { + "epoch": 0.9287862826972624, + "grad_norm": 0.02356878697506822, + "learning_rate": 0.0006471840352886906, + "loss": 0.5555, + "step": 18295 + }, + { + "epoch": 0.9290401187952939, + "grad_norm": 0.026719448119602498, + "learning_rate": 0.0006469723102275835, + "loss": 0.5478, + "step": 18300 + }, + { + "epoch": 0.9292939548933253, + "grad_norm": 0.023824789207588595, + "learning_rate": 0.000646760556316545, + "loss": 0.5686, + "step": 18305 + }, + { + "epoch": 0.9295477909913569, + "grad_norm": 0.02504122878294649, + "learning_rate": 0.0006465487735971414, + "loss": 0.5822, + "step": 18310 + }, + { + "epoch": 0.9298016270893884, + "grad_norm": 0.025465279303564035, + "learning_rate": 0.000646336962110944, + "loss": 0.5391, + "step": 18315 + }, + { + "epoch": 0.9300554631874198, + "grad_norm": 0.024053916899900416, + "learning_rate": 0.0006461251218995309, + "loss": 0.5812, + "step": 18320 + }, + { + "epoch": 0.9303092992854514, + "grad_norm": 0.023988031306641153, + "learning_rate": 0.0006459132530044851, + "loss": 0.5653, + "step": 18325 + }, + { + "epoch": 0.9305631353834829, + "grad_norm": 0.033028824504018735, + "learning_rate": 0.0006457013554673954, + "loss": 0.529, + "step": 18330 + }, + { + "epoch": 0.9308169714815144, + "grad_norm": 0.031328475615483485, + "learning_rate": 0.0006454894293298563, + "loss": 0.6023, + "step": 18335 + }, + { + "epoch": 0.9310708075795459, + "grad_norm": 0.025458288041452327, + "learning_rate": 0.0006452774746334677, + "loss": 0.5946, + "step": 18340 + }, + { + "epoch": 0.9313246436775774, + "grad_norm": 0.029766324738899293, + "learning_rate": 0.0006450654914198354, + "loss": 0.5466, + "step": 18345 + }, + { + "epoch": 0.9315784797756089, + "grad_norm": 0.02517080042746789, + "learning_rate": 0.0006448534797305704, + "loss": 0.5717, + "step": 18350 + }, + { + "epoch": 0.9318323158736403, + "grad_norm": 0.02380537403364461, + "learning_rate": 0.0006446414396072899, + "loss": 0.5402, + "step": 18355 + }, + { + "epoch": 0.9320861519716719, + "grad_norm": 0.03236454209338435, + "learning_rate": 0.0006444293710916161, + "loss": 0.5468, + "step": 18360 + }, + { + "epoch": 0.9323399880697034, + "grad_norm": 0.07009893831871468, + "learning_rate": 0.000644217274225177, + "loss": 0.5848, + "step": 18365 + }, + { + "epoch": 0.9325938241677348, + "grad_norm": 0.03355070794386776, + "learning_rate": 0.000644005149049606, + "loss": 0.5628, + "step": 18370 + }, + { + "epoch": 0.9328476602657664, + "grad_norm": 0.04417035285010824, + "learning_rate": 0.0006437929956065426, + "loss": 0.5699, + "step": 18375 + }, + { + "epoch": 0.9331014963637979, + "grad_norm": 0.025686169691077572, + "learning_rate": 0.0006435808139376313, + "loss": 0.5644, + "step": 18380 + }, + { + "epoch": 0.9333553324618294, + "grad_norm": 0.02456795859417348, + "learning_rate": 0.0006433686040845222, + "loss": 0.5688, + "step": 18385 + }, + { + "epoch": 0.9336091685598609, + "grad_norm": 0.027858399120178644, + "learning_rate": 0.0006431563660888711, + "loss": 0.5418, + "step": 18390 + }, + { + "epoch": 0.9338630046578924, + "grad_norm": 0.031235035810057876, + "learning_rate": 0.0006429440999923392, + "loss": 0.5544, + "step": 18395 + }, + { + "epoch": 0.9341168407559239, + "grad_norm": 0.03109567390073356, + "learning_rate": 0.0006427318058365934, + "loss": 0.5845, + "step": 18400 + }, + { + "epoch": 0.9343706768539554, + "grad_norm": 0.023673410214207512, + "learning_rate": 0.0006425194836633058, + "loss": 0.586, + "step": 18405 + }, + { + "epoch": 0.9346245129519869, + "grad_norm": 0.025651225561305648, + "learning_rate": 0.0006423071335141543, + "loss": 0.5602, + "step": 18410 + }, + { + "epoch": 0.9348783490500184, + "grad_norm": 0.024917392040912964, + "learning_rate": 0.0006420947554308223, + "loss": 0.5642, + "step": 18415 + }, + { + "epoch": 0.9351321851480499, + "grad_norm": 0.027718757843249613, + "learning_rate": 0.0006418823494549983, + "loss": 0.5735, + "step": 18420 + }, + { + "epoch": 0.9353860212460814, + "grad_norm": 0.026445735170235473, + "learning_rate": 0.0006416699156283768, + "loss": 0.5166, + "step": 18425 + }, + { + "epoch": 0.9356398573441129, + "grad_norm": 0.024653454471051918, + "learning_rate": 0.0006414574539926574, + "loss": 0.5693, + "step": 18430 + }, + { + "epoch": 0.9358936934421445, + "grad_norm": 0.035138012803531134, + "learning_rate": 0.0006412449645895452, + "loss": 0.5398, + "step": 18435 + }, + { + "epoch": 0.9361475295401759, + "grad_norm": 0.03579404088252254, + "learning_rate": 0.0006410324474607507, + "loss": 0.5867, + "step": 18440 + }, + { + "epoch": 0.9364013656382074, + "grad_norm": 0.05203695866136957, + "learning_rate": 0.0006408199026479901, + "loss": 0.5798, + "step": 18445 + }, + { + "epoch": 0.936655201736239, + "grad_norm": 0.03418141205359679, + "learning_rate": 0.000640607330192985, + "loss": 0.589, + "step": 18450 + }, + { + "epoch": 0.9369090378342704, + "grad_norm": 0.029127603885871836, + "learning_rate": 0.0006403947301374622, + "loss": 0.5181, + "step": 18455 + }, + { + "epoch": 0.9371628739323019, + "grad_norm": 0.025330901763078596, + "learning_rate": 0.000640182102523154, + "loss": 0.5358, + "step": 18460 + }, + { + "epoch": 0.9374167100303334, + "grad_norm": 0.02939613956496011, + "learning_rate": 0.0006399694473917981, + "loss": 0.5448, + "step": 18465 + }, + { + "epoch": 0.9376705461283649, + "grad_norm": 0.05802483149330994, + "learning_rate": 0.0006397567647851377, + "loss": 0.6088, + "step": 18470 + }, + { + "epoch": 0.9379243822263964, + "grad_norm": 0.05969020572466202, + "learning_rate": 0.0006395440547449214, + "loss": 0.5606, + "step": 18475 + }, + { + "epoch": 0.9381782183244279, + "grad_norm": 0.03090071852022251, + "learning_rate": 0.000639331317312903, + "loss": 0.5658, + "step": 18480 + }, + { + "epoch": 0.9384320544224595, + "grad_norm": 0.026204359425882364, + "learning_rate": 0.0006391185525308419, + "loss": 0.5685, + "step": 18485 + }, + { + "epoch": 0.9386858905204909, + "grad_norm": 0.03498061994333065, + "learning_rate": 0.0006389057604405027, + "loss": 0.5711, + "step": 18490 + }, + { + "epoch": 0.9389397266185224, + "grad_norm": 0.02717442339063079, + "learning_rate": 0.0006386929410836555, + "loss": 0.5629, + "step": 18495 + }, + { + "epoch": 0.939193562716554, + "grad_norm": 0.02682126084664184, + "learning_rate": 0.0006384800945020755, + "loss": 0.5369, + "step": 18500 + }, + { + "epoch": 0.9394473988145854, + "grad_norm": 0.021417798921295972, + "learning_rate": 0.0006382672207375438, + "loss": 0.516, + "step": 18505 + }, + { + "epoch": 0.9397012349126169, + "grad_norm": 0.0335394781975359, + "learning_rate": 0.000638054319831846, + "loss": 0.5536, + "step": 18510 + }, + { + "epoch": 0.9399550710106485, + "grad_norm": 0.033542368843596015, + "learning_rate": 0.0006378413918267737, + "loss": 0.5476, + "step": 18515 + }, + { + "epoch": 0.9402089071086799, + "grad_norm": 0.03314207140461258, + "learning_rate": 0.0006376284367641237, + "loss": 0.5405, + "step": 18520 + }, + { + "epoch": 0.9404627432067114, + "grad_norm": 0.025257880645441717, + "learning_rate": 0.0006374154546856978, + "loss": 0.5727, + "step": 18525 + }, + { + "epoch": 0.940716579304743, + "grad_norm": 0.03153823246454771, + "learning_rate": 0.0006372024456333034, + "loss": 0.576, + "step": 18530 + }, + { + "epoch": 0.9409704154027744, + "grad_norm": 0.02414021611137436, + "learning_rate": 0.0006369894096487533, + "loss": 0.5608, + "step": 18535 + }, + { + "epoch": 0.9412242515008059, + "grad_norm": 0.05820198130239536, + "learning_rate": 0.0006367763467738652, + "loss": 0.5498, + "step": 18540 + }, + { + "epoch": 0.9414780875988374, + "grad_norm": 0.024189243271371962, + "learning_rate": 0.0006365632570504622, + "loss": 0.5532, + "step": 18545 + }, + { + "epoch": 0.941731923696869, + "grad_norm": 0.025398202281715243, + "learning_rate": 0.000636350140520373, + "loss": 0.5623, + "step": 18550 + }, + { + "epoch": 0.9419857597949004, + "grad_norm": 0.03514525492016054, + "learning_rate": 0.0006361369972254313, + "loss": 0.5561, + "step": 18555 + }, + { + "epoch": 0.9422395958929319, + "grad_norm": 0.027198150257000973, + "learning_rate": 0.0006359238272074757, + "loss": 0.5701, + "step": 18560 + }, + { + "epoch": 0.9424934319909635, + "grad_norm": 0.03242882219511253, + "learning_rate": 0.0006357106305083509, + "loss": 0.5913, + "step": 18565 + }, + { + "epoch": 0.9427472680889949, + "grad_norm": 0.030898225159171074, + "learning_rate": 0.000635497407169906, + "loss": 0.5652, + "step": 18570 + }, + { + "epoch": 0.9430011041870264, + "grad_norm": 0.023609900849047016, + "learning_rate": 0.0006352841572339957, + "loss": 0.5138, + "step": 18575 + }, + { + "epoch": 0.943254940285058, + "grad_norm": 0.022870274202999735, + "learning_rate": 0.0006350708807424803, + "loss": 0.5139, + "step": 18580 + }, + { + "epoch": 0.9435087763830894, + "grad_norm": 0.029704150434133162, + "learning_rate": 0.0006348575777372244, + "loss": 0.5581, + "step": 18585 + }, + { + "epoch": 0.9437626124811209, + "grad_norm": 0.023261609889708712, + "learning_rate": 0.0006346442482600986, + "loss": 0.5341, + "step": 18590 + }, + { + "epoch": 0.9440164485791525, + "grad_norm": 0.03251302144724121, + "learning_rate": 0.0006344308923529784, + "loss": 0.5377, + "step": 18595 + }, + { + "epoch": 0.944270284677184, + "grad_norm": 0.04594564704845458, + "learning_rate": 0.0006342175100577443, + "loss": 0.5423, + "step": 18600 + }, + { + "epoch": 0.9445241207752154, + "grad_norm": 0.03186429074273563, + "learning_rate": 0.0006340041014162822, + "loss": 0.5286, + "step": 18605 + }, + { + "epoch": 0.944777956873247, + "grad_norm": 0.025609007700680392, + "learning_rate": 0.0006337906664704836, + "loss": 0.5456, + "step": 18610 + }, + { + "epoch": 0.9450317929712785, + "grad_norm": 0.024349448360170327, + "learning_rate": 0.0006335772052622441, + "loss": 0.5522, + "step": 18615 + }, + { + "epoch": 0.9452856290693099, + "grad_norm": 0.03254833741297758, + "learning_rate": 0.0006333637178334655, + "loss": 0.5441, + "step": 18620 + }, + { + "epoch": 0.9455394651673414, + "grad_norm": 0.05909130904483525, + "learning_rate": 0.0006331502042260541, + "loss": 0.5259, + "step": 18625 + }, + { + "epoch": 0.945793301265373, + "grad_norm": 0.05134549162232977, + "learning_rate": 0.0006329366644819217, + "loss": 0.5649, + "step": 18630 + }, + { + "epoch": 0.9460471373634044, + "grad_norm": 0.037053648813884996, + "learning_rate": 0.0006327230986429849, + "loss": 0.5742, + "step": 18635 + }, + { + "epoch": 0.9463009734614359, + "grad_norm": 0.03129953084498823, + "learning_rate": 0.0006325095067511658, + "loss": 0.5378, + "step": 18640 + }, + { + "epoch": 0.9465548095594675, + "grad_norm": 0.03654618370634232, + "learning_rate": 0.0006322958888483914, + "loss": 0.5617, + "step": 18645 + }, + { + "epoch": 0.946808645657499, + "grad_norm": 0.02422795139105715, + "learning_rate": 0.0006320822449765937, + "loss": 0.5494, + "step": 18650 + }, + { + "epoch": 0.9470624817555304, + "grad_norm": 0.02329083288512449, + "learning_rate": 0.00063186857517771, + "loss": 0.5506, + "step": 18655 + }, + { + "epoch": 0.947316317853562, + "grad_norm": 0.03354540198025808, + "learning_rate": 0.0006316548794936827, + "loss": 0.5735, + "step": 18660 + }, + { + "epoch": 0.9475701539515935, + "grad_norm": 0.027348158991933078, + "learning_rate": 0.0006314411579664591, + "loss": 0.5586, + "step": 18665 + }, + { + "epoch": 0.9478239900496249, + "grad_norm": 0.023645544124025912, + "learning_rate": 0.0006312274106379916, + "loss": 0.5575, + "step": 18670 + }, + { + "epoch": 0.9480778261476565, + "grad_norm": 0.04599714324805517, + "learning_rate": 0.0006310136375502379, + "loss": 0.5205, + "step": 18675 + }, + { + "epoch": 0.948331662245688, + "grad_norm": 0.02201034707176053, + "learning_rate": 0.0006307998387451604, + "loss": 0.5386, + "step": 18680 + }, + { + "epoch": 0.9485854983437194, + "grad_norm": 0.02399542893990445, + "learning_rate": 0.0006305860142647269, + "loss": 0.5139, + "step": 18685 + }, + { + "epoch": 0.948839334441751, + "grad_norm": 0.04397570627326147, + "learning_rate": 0.0006303721641509101, + "loss": 0.5376, + "step": 18690 + }, + { + "epoch": 0.9490931705397825, + "grad_norm": 0.025883089049502404, + "learning_rate": 0.0006301582884456877, + "loss": 0.5458, + "step": 18695 + }, + { + "epoch": 0.949347006637814, + "grad_norm": 0.026996067831657474, + "learning_rate": 0.0006299443871910423, + "loss": 0.5854, + "step": 18700 + }, + { + "epoch": 0.9496008427358454, + "grad_norm": 0.02942829816892566, + "learning_rate": 0.0006297304604289618, + "loss": 0.5797, + "step": 18705 + }, + { + "epoch": 0.949854678833877, + "grad_norm": 0.024736740711198107, + "learning_rate": 0.0006295165082014387, + "loss": 0.5299, + "step": 18710 + }, + { + "epoch": 0.9501085149319085, + "grad_norm": 0.02630491688826239, + "learning_rate": 0.0006293025305504712, + "loss": 0.5528, + "step": 18715 + }, + { + "epoch": 0.9503623510299399, + "grad_norm": 0.028244133702288142, + "learning_rate": 0.0006290885275180615, + "loss": 0.5188, + "step": 18720 + }, + { + "epoch": 0.9506161871279715, + "grad_norm": 0.02421009162431851, + "learning_rate": 0.0006288744991462177, + "loss": 0.5593, + "step": 18725 + }, + { + "epoch": 0.950870023226003, + "grad_norm": 0.023551000998892246, + "learning_rate": 0.0006286604454769526, + "loss": 0.5521, + "step": 18730 + }, + { + "epoch": 0.9511238593240344, + "grad_norm": 0.024350962137287317, + "learning_rate": 0.0006284463665522835, + "loss": 0.59, + "step": 18735 + }, + { + "epoch": 0.951377695422066, + "grad_norm": 0.08344993588030145, + "learning_rate": 0.0006282322624142332, + "loss": 0.571, + "step": 18740 + }, + { + "epoch": 0.9516315315200975, + "grad_norm": 0.031834745293134896, + "learning_rate": 0.0006280181331048293, + "loss": 0.5855, + "step": 18745 + }, + { + "epoch": 0.9518853676181289, + "grad_norm": 0.025834530453184186, + "learning_rate": 0.0006278039786661042, + "loss": 0.5814, + "step": 18750 + }, + { + "epoch": 0.9521392037161605, + "grad_norm": 0.027296230433010688, + "learning_rate": 0.0006275897991400956, + "loss": 0.5759, + "step": 18755 + }, + { + "epoch": 0.952393039814192, + "grad_norm": 0.06183118735204768, + "learning_rate": 0.0006273755945688458, + "loss": 0.5715, + "step": 18760 + }, + { + "epoch": 0.9526468759122235, + "grad_norm": 0.026071457227966246, + "learning_rate": 0.0006271613649944019, + "loss": 0.5506, + "step": 18765 + }, + { + "epoch": 0.952900712010255, + "grad_norm": 0.05151300719134269, + "learning_rate": 0.000626947110458816, + "loss": 0.5903, + "step": 18770 + }, + { + "epoch": 0.9531545481082865, + "grad_norm": 0.029346199524142243, + "learning_rate": 0.0006267328310041457, + "loss": 0.5632, + "step": 18775 + }, + { + "epoch": 0.953408384206318, + "grad_norm": 0.026410770850243227, + "learning_rate": 0.0006265185266724526, + "loss": 0.5699, + "step": 18780 + }, + { + "epoch": 0.9536622203043494, + "grad_norm": 0.049041169261654256, + "learning_rate": 0.0006263041975058035, + "loss": 0.5605, + "step": 18785 + }, + { + "epoch": 0.953916056402381, + "grad_norm": 0.0363002561476258, + "learning_rate": 0.0006260898435462705, + "loss": 0.5506, + "step": 18790 + }, + { + "epoch": 0.9541698925004125, + "grad_norm": 0.022941890322864124, + "learning_rate": 0.0006258754648359301, + "loss": 0.5471, + "step": 18795 + }, + { + "epoch": 0.9544237285984439, + "grad_norm": 0.025645292089913767, + "learning_rate": 0.0006256610614168634, + "loss": 0.5604, + "step": 18800 + }, + { + "epoch": 0.9546775646964755, + "grad_norm": 0.022494488082290532, + "learning_rate": 0.0006254466333311573, + "loss": 0.5778, + "step": 18805 + }, + { + "epoch": 0.954931400794507, + "grad_norm": 0.02606808639822887, + "learning_rate": 0.0006252321806209024, + "loss": 0.6058, + "step": 18810 + }, + { + "epoch": 0.9551852368925385, + "grad_norm": 0.025745633241197222, + "learning_rate": 0.0006250177033281952, + "loss": 0.5883, + "step": 18815 + }, + { + "epoch": 0.95543907299057, + "grad_norm": 0.033669145164535265, + "learning_rate": 0.0006248032014951363, + "loss": 0.5292, + "step": 18820 + }, + { + "epoch": 0.9556929090886015, + "grad_norm": 0.03772741877989379, + "learning_rate": 0.0006245886751638312, + "loss": 0.5183, + "step": 18825 + }, + { + "epoch": 0.955946745186633, + "grad_norm": 0.03475199734255405, + "learning_rate": 0.0006243741243763906, + "loss": 0.5497, + "step": 18830 + }, + { + "epoch": 0.9562005812846645, + "grad_norm": 0.028007500754168836, + "learning_rate": 0.0006241595491749297, + "loss": 0.5348, + "step": 18835 + }, + { + "epoch": 0.956454417382696, + "grad_norm": 0.03964976547186352, + "learning_rate": 0.0006239449496015684, + "loss": 0.5696, + "step": 18840 + }, + { + "epoch": 0.9567082534807275, + "grad_norm": 0.029844026908865874, + "learning_rate": 0.0006237303256984315, + "loss": 0.5383, + "step": 18845 + }, + { + "epoch": 0.956962089578759, + "grad_norm": 0.03293271336871704, + "learning_rate": 0.0006235156775076488, + "loss": 0.5653, + "step": 18850 + }, + { + "epoch": 0.9572159256767905, + "grad_norm": 0.023764813247282635, + "learning_rate": 0.0006233010050713546, + "loss": 0.5474, + "step": 18855 + }, + { + "epoch": 0.957469761774822, + "grad_norm": 0.02729058413216523, + "learning_rate": 0.0006230863084316879, + "loss": 0.546, + "step": 18860 + }, + { + "epoch": 0.9577235978728535, + "grad_norm": 0.03048111721240167, + "learning_rate": 0.0006228715876307928, + "loss": 0.5302, + "step": 18865 + }, + { + "epoch": 0.957977433970885, + "grad_norm": 0.024821623443136005, + "learning_rate": 0.0006226568427108177, + "loss": 0.5267, + "step": 18870 + }, + { + "epoch": 0.9582312700689165, + "grad_norm": 0.02171005251341012, + "learning_rate": 0.0006224420737139161, + "loss": 0.5686, + "step": 18875 + }, + { + "epoch": 0.958485106166948, + "grad_norm": 0.02438019118654564, + "learning_rate": 0.0006222272806822463, + "loss": 0.5566, + "step": 18880 + }, + { + "epoch": 0.9587389422649795, + "grad_norm": 0.0223542750232972, + "learning_rate": 0.0006220124636579704, + "loss": 0.5438, + "step": 18885 + }, + { + "epoch": 0.958992778363011, + "grad_norm": 0.03015303343884345, + "learning_rate": 0.0006217976226832565, + "loss": 0.5771, + "step": 18890 + }, + { + "epoch": 0.9592466144610425, + "grad_norm": 0.025825991972978846, + "learning_rate": 0.0006215827578002768, + "loss": 0.5591, + "step": 18895 + }, + { + "epoch": 0.959500450559074, + "grad_norm": 0.03547031836750829, + "learning_rate": 0.0006213678690512081, + "loss": 0.5608, + "step": 18900 + }, + { + "epoch": 0.9597542866571055, + "grad_norm": 0.020995458669697365, + "learning_rate": 0.0006211529564782319, + "loss": 0.5428, + "step": 18905 + }, + { + "epoch": 0.960008122755137, + "grad_norm": 0.02476217559421539, + "learning_rate": 0.0006209380201235345, + "loss": 0.5592, + "step": 18910 + }, + { + "epoch": 0.9602619588531686, + "grad_norm": 0.023830491201295642, + "learning_rate": 0.000620723060029307, + "loss": 0.5601, + "step": 18915 + }, + { + "epoch": 0.9605157949512, + "grad_norm": 0.024232475162824736, + "learning_rate": 0.0006205080762377446, + "loss": 0.5588, + "step": 18920 + }, + { + "epoch": 0.9607696310492315, + "grad_norm": 0.02838429347591899, + "learning_rate": 0.000620293068791048, + "loss": 0.5365, + "step": 18925 + }, + { + "epoch": 0.961023467147263, + "grad_norm": 0.022820427870362616, + "learning_rate": 0.0006200780377314219, + "loss": 0.5594, + "step": 18930 + }, + { + "epoch": 0.9612773032452945, + "grad_norm": 0.03755731589801296, + "learning_rate": 0.0006198629831010758, + "loss": 0.5745, + "step": 18935 + }, + { + "epoch": 0.961531139343326, + "grad_norm": 0.024014657854385738, + "learning_rate": 0.0006196479049422239, + "loss": 0.5418, + "step": 18940 + }, + { + "epoch": 0.9617849754413575, + "grad_norm": 0.027051758292076112, + "learning_rate": 0.0006194328032970848, + "loss": 0.5438, + "step": 18945 + }, + { + "epoch": 0.962038811539389, + "grad_norm": 0.023230927228254993, + "learning_rate": 0.0006192176782078822, + "loss": 0.5285, + "step": 18950 + }, + { + "epoch": 0.9622926476374205, + "grad_norm": 0.02453226053942355, + "learning_rate": 0.0006190025297168437, + "loss": 0.5408, + "step": 18955 + }, + { + "epoch": 0.962546483735452, + "grad_norm": 0.28109523504650163, + "learning_rate": 0.0006187873578662024, + "loss": 0.5413, + "step": 18960 + }, + { + "epoch": 0.9628003198334835, + "grad_norm": 0.03123756702410563, + "learning_rate": 0.0006185721626981949, + "loss": 0.5555, + "step": 18965 + }, + { + "epoch": 0.963054155931515, + "grad_norm": 0.031041861048312636, + "learning_rate": 0.0006183569442550633, + "loss": 0.6098, + "step": 18970 + }, + { + "epoch": 0.9633079920295465, + "grad_norm": 0.031099393750686002, + "learning_rate": 0.0006181417025790536, + "loss": 0.5361, + "step": 18975 + }, + { + "epoch": 0.9635618281275781, + "grad_norm": 0.02595781815514382, + "learning_rate": 0.000617926437712417, + "loss": 0.5539, + "step": 18980 + }, + { + "epoch": 0.9638156642256095, + "grad_norm": 0.03639463677804149, + "learning_rate": 0.0006177111496974087, + "loss": 0.5472, + "step": 18985 + }, + { + "epoch": 0.964069500323641, + "grad_norm": 0.02606363135734158, + "learning_rate": 0.0006174958385762888, + "loss": 0.5701, + "step": 18990 + }, + { + "epoch": 0.9643233364216726, + "grad_norm": 0.04038231790604605, + "learning_rate": 0.0006172805043913218, + "loss": 0.5584, + "step": 18995 + }, + { + "epoch": 0.964577172519704, + "grad_norm": 0.03184750465530715, + "learning_rate": 0.0006170651471847766, + "loss": 0.5571, + "step": 19000 + }, + { + "epoch": 0.9648310086177355, + "grad_norm": 0.03705308485544799, + "learning_rate": 0.0006168497669989268, + "loss": 0.5572, + "step": 19005 + }, + { + "epoch": 0.965084844715767, + "grad_norm": 0.03454894666757974, + "learning_rate": 0.0006166343638760504, + "loss": 0.6049, + "step": 19010 + }, + { + "epoch": 0.9653386808137985, + "grad_norm": 0.03088369982680049, + "learning_rate": 0.0006164189378584301, + "loss": 0.5598, + "step": 19015 + }, + { + "epoch": 0.96559251691183, + "grad_norm": 0.028758495609164273, + "learning_rate": 0.0006162034889883529, + "loss": 0.5474, + "step": 19020 + }, + { + "epoch": 0.9658463530098615, + "grad_norm": 0.02750682584809894, + "learning_rate": 0.0006159880173081103, + "loss": 0.542, + "step": 19025 + }, + { + "epoch": 0.9661001891078931, + "grad_norm": 0.03970699514356098, + "learning_rate": 0.0006157725228599982, + "loss": 0.5545, + "step": 19030 + }, + { + "epoch": 0.9663540252059245, + "grad_norm": 0.022540600879443772, + "learning_rate": 0.0006155570056863175, + "loss": 0.5211, + "step": 19035 + }, + { + "epoch": 0.966607861303956, + "grad_norm": 0.03517528202934328, + "learning_rate": 0.0006153414658293725, + "loss": 0.5706, + "step": 19040 + }, + { + "epoch": 0.9668616974019876, + "grad_norm": 0.025523416998452415, + "learning_rate": 0.0006151259033314733, + "loss": 0.5416, + "step": 19045 + }, + { + "epoch": 0.967115533500019, + "grad_norm": 0.024970335802673438, + "learning_rate": 0.0006149103182349333, + "loss": 0.5711, + "step": 19050 + }, + { + "epoch": 0.9673693695980505, + "grad_norm": 0.04734193158993932, + "learning_rate": 0.0006146947105820709, + "loss": 0.5787, + "step": 19055 + }, + { + "epoch": 0.9676232056960821, + "grad_norm": 0.025774746375266132, + "learning_rate": 0.0006144790804152088, + "loss": 0.5752, + "step": 19060 + }, + { + "epoch": 0.9678770417941135, + "grad_norm": 0.035809770586720045, + "learning_rate": 0.0006142634277766741, + "loss": 0.5544, + "step": 19065 + }, + { + "epoch": 0.968130877892145, + "grad_norm": 0.022096408564194946, + "learning_rate": 0.0006140477527087983, + "loss": 0.5594, + "step": 19070 + }, + { + "epoch": 0.9683847139901766, + "grad_norm": 0.02285174641983169, + "learning_rate": 0.0006138320552539175, + "loss": 0.5824, + "step": 19075 + }, + { + "epoch": 0.9686385500882081, + "grad_norm": 0.022639632770351623, + "learning_rate": 0.000613616335454372, + "loss": 0.5582, + "step": 19080 + }, + { + "epoch": 0.9688923861862395, + "grad_norm": 0.02407222130765984, + "learning_rate": 0.0006134005933525062, + "loss": 0.5672, + "step": 19085 + }, + { + "epoch": 0.969146222284271, + "grad_norm": 0.02688787209576168, + "learning_rate": 0.0006131848289906696, + "loss": 0.5335, + "step": 19090 + }, + { + "epoch": 0.9694000583823026, + "grad_norm": 0.02409004068147683, + "learning_rate": 0.0006129690424112156, + "loss": 0.5962, + "step": 19095 + }, + { + "epoch": 0.969653894480334, + "grad_norm": 0.024141956964642438, + "learning_rate": 0.0006127532336565018, + "loss": 0.5666, + "step": 19100 + }, + { + "epoch": 0.9699077305783655, + "grad_norm": 0.022675312599314844, + "learning_rate": 0.0006125374027688905, + "loss": 0.551, + "step": 19105 + }, + { + "epoch": 0.9701615666763971, + "grad_norm": 0.037996729698158885, + "learning_rate": 0.0006123215497907484, + "loss": 0.5652, + "step": 19110 + }, + { + "epoch": 0.9704154027744285, + "grad_norm": 0.0272987646077441, + "learning_rate": 0.0006121056747644461, + "loss": 0.565, + "step": 19115 + }, + { + "epoch": 0.97066923887246, + "grad_norm": 0.04267144316368038, + "learning_rate": 0.000611889777732359, + "loss": 0.5429, + "step": 19120 + }, + { + "epoch": 0.9709230749704916, + "grad_norm": 0.02925397740944498, + "learning_rate": 0.0006116738587368665, + "loss": 0.5578, + "step": 19125 + }, + { + "epoch": 0.9711769110685231, + "grad_norm": 0.023290731955223867, + "learning_rate": 0.0006114579178203524, + "loss": 0.5429, + "step": 19130 + }, + { + "epoch": 0.9714307471665545, + "grad_norm": 0.03164983053501458, + "learning_rate": 0.000611241955025205, + "loss": 0.5405, + "step": 19135 + }, + { + "epoch": 0.9716845832645861, + "grad_norm": 0.037520719870187604, + "learning_rate": 0.0006110259703938165, + "loss": 0.5724, + "step": 19140 + }, + { + "epoch": 0.9719384193626176, + "grad_norm": 0.034312355122149724, + "learning_rate": 0.0006108099639685837, + "loss": 0.5474, + "step": 19145 + }, + { + "epoch": 0.972192255460649, + "grad_norm": 0.02562062543260294, + "learning_rate": 0.0006105939357919076, + "loss": 0.5461, + "step": 19150 + }, + { + "epoch": 0.9724460915586806, + "grad_norm": 0.02354699369047809, + "learning_rate": 0.0006103778859061935, + "loss": 0.545, + "step": 19155 + }, + { + "epoch": 0.9726999276567121, + "grad_norm": 0.021930383911286492, + "learning_rate": 0.0006101618143538508, + "loss": 0.5573, + "step": 19160 + }, + { + "epoch": 0.9729537637547435, + "grad_norm": 0.02583749368715139, + "learning_rate": 0.0006099457211772933, + "loss": 0.5544, + "step": 19165 + }, + { + "epoch": 0.973207599852775, + "grad_norm": 0.022556631905887248, + "learning_rate": 0.0006097296064189391, + "loss": 0.5559, + "step": 19170 + }, + { + "epoch": 0.9734614359508066, + "grad_norm": 0.03226579384353296, + "learning_rate": 0.0006095134701212102, + "loss": 0.5902, + "step": 19175 + }, + { + "epoch": 0.973715272048838, + "grad_norm": 0.023654661735220267, + "learning_rate": 0.0006092973123265334, + "loss": 0.5331, + "step": 19180 + }, + { + "epoch": 0.9739691081468695, + "grad_norm": 0.021164960310687785, + "learning_rate": 0.0006090811330773392, + "loss": 0.5349, + "step": 19185 + }, + { + "epoch": 0.9742229442449011, + "grad_norm": 0.026505686211264395, + "learning_rate": 0.0006088649324160626, + "loss": 0.4964, + "step": 19190 + }, + { + "epoch": 0.9744767803429326, + "grad_norm": 0.15964885508324672, + "learning_rate": 0.0006086487103851426, + "loss": 0.5849, + "step": 19195 + }, + { + "epoch": 0.974730616440964, + "grad_norm": 0.031195162048445326, + "learning_rate": 0.0006084324670270227, + "loss": 0.5735, + "step": 19200 + }, + { + "epoch": 0.9749844525389956, + "grad_norm": 0.024371095692914808, + "learning_rate": 0.0006082162023841502, + "loss": 0.5566, + "step": 19205 + }, + { + "epoch": 0.9752382886370271, + "grad_norm": 0.027331834493339167, + "learning_rate": 0.0006079999164989769, + "loss": 0.551, + "step": 19210 + }, + { + "epoch": 0.9754921247350585, + "grad_norm": 0.027798770270491248, + "learning_rate": 0.0006077836094139586, + "loss": 0.5576, + "step": 19215 + }, + { + "epoch": 0.9757459608330901, + "grad_norm": 0.025968021890026095, + "learning_rate": 0.0006075672811715553, + "loss": 0.5549, + "step": 19220 + }, + { + "epoch": 0.9759997969311216, + "grad_norm": 0.02489623198625103, + "learning_rate": 0.0006073509318142308, + "loss": 0.5526, + "step": 19225 + }, + { + "epoch": 0.976253633029153, + "grad_norm": 0.02608892933733737, + "learning_rate": 0.0006071345613844541, + "loss": 0.5696, + "step": 19230 + }, + { + "epoch": 0.9765074691271846, + "grad_norm": 0.023901773570461967, + "learning_rate": 0.0006069181699246973, + "loss": 0.5763, + "step": 19235 + }, + { + "epoch": 0.9767613052252161, + "grad_norm": 0.04188933752859617, + "learning_rate": 0.0006067017574774369, + "loss": 0.5685, + "step": 19240 + }, + { + "epoch": 0.9770151413232476, + "grad_norm": 0.04061459638720143, + "learning_rate": 0.0006064853240851536, + "loss": 0.5394, + "step": 19245 + }, + { + "epoch": 0.977268977421279, + "grad_norm": 0.037889258081175194, + "learning_rate": 0.0006062688697903322, + "loss": 0.5385, + "step": 19250 + }, + { + "epoch": 0.9775228135193106, + "grad_norm": 0.03360689695404566, + "learning_rate": 0.0006060523946354615, + "loss": 0.5318, + "step": 19255 + }, + { + "epoch": 0.9777766496173421, + "grad_norm": 0.03162668844088351, + "learning_rate": 0.0006058358986630347, + "loss": 0.5254, + "step": 19260 + }, + { + "epoch": 0.9780304857153735, + "grad_norm": 0.028559717544297737, + "learning_rate": 0.0006056193819155488, + "loss": 0.5786, + "step": 19265 + }, + { + "epoch": 0.9782843218134051, + "grad_norm": 0.02644336830634299, + "learning_rate": 0.0006054028444355051, + "loss": 0.5469, + "step": 19270 + }, + { + "epoch": 0.9785381579114366, + "grad_norm": 0.03308577275301962, + "learning_rate": 0.0006051862862654085, + "loss": 0.5332, + "step": 19275 + }, + { + "epoch": 0.978791994009468, + "grad_norm": 0.022000448921150856, + "learning_rate": 0.0006049697074477686, + "loss": 0.5501, + "step": 19280 + }, + { + "epoch": 0.9790458301074996, + "grad_norm": 0.0247774682959936, + "learning_rate": 0.0006047531080250985, + "loss": 0.5404, + "step": 19285 + }, + { + "epoch": 0.9792996662055311, + "grad_norm": 0.022885746957311163, + "learning_rate": 0.0006045364880399158, + "loss": 0.5675, + "step": 19290 + }, + { + "epoch": 0.9795535023035626, + "grad_norm": 0.024643646796613586, + "learning_rate": 0.0006043198475347418, + "loss": 0.5897, + "step": 19295 + }, + { + "epoch": 0.9798073384015941, + "grad_norm": 0.03123615100865316, + "learning_rate": 0.0006041031865521019, + "loss": 0.5386, + "step": 19300 + }, + { + "epoch": 0.9800611744996256, + "grad_norm": 0.023897535561443067, + "learning_rate": 0.0006038865051345257, + "loss": 0.5787, + "step": 19305 + }, + { + "epoch": 0.9803150105976571, + "grad_norm": 0.0238186011366476, + "learning_rate": 0.0006036698033245466, + "loss": 0.5415, + "step": 19310 + }, + { + "epoch": 0.9805688466956886, + "grad_norm": 0.024067063625727737, + "learning_rate": 0.000603453081164702, + "loss": 0.5225, + "step": 19315 + }, + { + "epoch": 0.9808226827937201, + "grad_norm": 0.02230127040642843, + "learning_rate": 0.0006032363386975337, + "loss": 0.5599, + "step": 19320 + }, + { + "epoch": 0.9810765188917516, + "grad_norm": 0.023928492267361032, + "learning_rate": 0.0006030195759655867, + "loss": 0.5454, + "step": 19325 + }, + { + "epoch": 0.981330354989783, + "grad_norm": 0.022074494679136896, + "learning_rate": 0.0006028027930114109, + "loss": 0.545, + "step": 19330 + }, + { + "epoch": 0.9815841910878146, + "grad_norm": 0.036818146764293826, + "learning_rate": 0.0006025859898775596, + "loss": 0.5431, + "step": 19335 + }, + { + "epoch": 0.9818380271858461, + "grad_norm": 0.021267108908214902, + "learning_rate": 0.0006023691666065899, + "loss": 0.5694, + "step": 19340 + }, + { + "epoch": 0.9820918632838777, + "grad_norm": 0.18195885280117177, + "learning_rate": 0.0006021523232410633, + "loss": 0.53, + "step": 19345 + }, + { + "epoch": 0.9823456993819091, + "grad_norm": 0.02933469861631167, + "learning_rate": 0.0006019354598235451, + "loss": 0.5383, + "step": 19350 + }, + { + "epoch": 0.9825995354799406, + "grad_norm": 0.02677555173548487, + "learning_rate": 0.0006017185763966044, + "loss": 0.5268, + "step": 19355 + }, + { + "epoch": 0.9828533715779721, + "grad_norm": 0.03238477638416001, + "learning_rate": 0.0006015016730028147, + "loss": 0.5301, + "step": 19360 + }, + { + "epoch": 0.9831072076760036, + "grad_norm": 0.026772761845334205, + "learning_rate": 0.0006012847496847525, + "loss": 0.5573, + "step": 19365 + }, + { + "epoch": 0.9833610437740351, + "grad_norm": 0.033966700490202034, + "learning_rate": 0.0006010678064849993, + "loss": 0.5473, + "step": 19370 + }, + { + "epoch": 0.9836148798720666, + "grad_norm": 0.023419696424668214, + "learning_rate": 0.0006008508434461394, + "loss": 0.5467, + "step": 19375 + }, + { + "epoch": 0.9838687159700981, + "grad_norm": 0.023940801308642033, + "learning_rate": 0.0006006338606107621, + "loss": 0.5717, + "step": 19380 + }, + { + "epoch": 0.9841225520681296, + "grad_norm": 0.025782454053669163, + "learning_rate": 0.0006004168580214598, + "loss": 0.52, + "step": 19385 + }, + { + "epoch": 0.9843763881661611, + "grad_norm": 0.0347230672413288, + "learning_rate": 0.000600199835720829, + "loss": 0.5136, + "step": 19390 + }, + { + "epoch": 0.9846302242641927, + "grad_norm": 0.03694921277593407, + "learning_rate": 0.0005999827937514701, + "loss": 0.557, + "step": 19395 + }, + { + "epoch": 0.9848840603622241, + "grad_norm": 0.042215904453139365, + "learning_rate": 0.0005997657321559875, + "loss": 0.5597, + "step": 19400 + }, + { + "epoch": 0.9851378964602556, + "grad_norm": 0.029186970807708833, + "learning_rate": 0.0005995486509769892, + "loss": 0.543, + "step": 19405 + }, + { + "epoch": 0.9853917325582872, + "grad_norm": 0.030825680890777987, + "learning_rate": 0.0005993315502570871, + "loss": 0.5283, + "step": 19410 + }, + { + "epoch": 0.9856455686563186, + "grad_norm": 0.03084685303258577, + "learning_rate": 0.000599114430038897, + "loss": 0.5378, + "step": 19415 + }, + { + "epoch": 0.9858994047543501, + "grad_norm": 0.03624341566980718, + "learning_rate": 0.0005988972903650388, + "loss": 0.5581, + "step": 19420 + }, + { + "epoch": 0.9861532408523817, + "grad_norm": 0.03238937387426097, + "learning_rate": 0.0005986801312781356, + "loss": 0.5855, + "step": 19425 + }, + { + "epoch": 0.9864070769504131, + "grad_norm": 0.029275031406698588, + "learning_rate": 0.0005984629528208147, + "loss": 0.5407, + "step": 19430 + }, + { + "epoch": 0.9866609130484446, + "grad_norm": 0.025894922300801874, + "learning_rate": 0.000598245755035707, + "loss": 0.5429, + "step": 19435 + }, + { + "epoch": 0.9869147491464761, + "grad_norm": 0.03331858196787023, + "learning_rate": 0.0005980285379654478, + "loss": 0.5862, + "step": 19440 + }, + { + "epoch": 0.9871685852445076, + "grad_norm": 0.02236486818578095, + "learning_rate": 0.0005978113016526753, + "loss": 0.5557, + "step": 19445 + }, + { + "epoch": 0.9874224213425391, + "grad_norm": 0.028767154063198953, + "learning_rate": 0.0005975940461400322, + "loss": 0.5677, + "step": 19450 + }, + { + "epoch": 0.9876762574405706, + "grad_norm": 0.02765546851820107, + "learning_rate": 0.0005973767714701646, + "loss": 0.5399, + "step": 19455 + }, + { + "epoch": 0.9879300935386022, + "grad_norm": 0.025484293553854787, + "learning_rate": 0.0005971594776857224, + "loss": 0.5593, + "step": 19460 + }, + { + "epoch": 0.9881839296366336, + "grad_norm": 0.031096244994143988, + "learning_rate": 0.000596942164829359, + "loss": 0.5332, + "step": 19465 + }, + { + "epoch": 0.9884377657346651, + "grad_norm": 0.044264139567922633, + "learning_rate": 0.0005967248329437322, + "loss": 0.5601, + "step": 19470 + }, + { + "epoch": 0.9886916018326967, + "grad_norm": 0.038014080072740736, + "learning_rate": 0.0005965074820715031, + "loss": 0.5252, + "step": 19475 + }, + { + "epoch": 0.9889454379307281, + "grad_norm": 0.02925907334151629, + "learning_rate": 0.0005962901122553366, + "loss": 0.5219, + "step": 19480 + }, + { + "epoch": 0.9891992740287596, + "grad_norm": 0.03246914140061126, + "learning_rate": 0.000596072723537901, + "loss": 0.5414, + "step": 19485 + }, + { + "epoch": 0.9894531101267912, + "grad_norm": 0.08010941576386262, + "learning_rate": 0.0005958553159618693, + "loss": 0.5208, + "step": 19490 + }, + { + "epoch": 0.9897069462248226, + "grad_norm": 0.0337186100394352, + "learning_rate": 0.0005956378895699169, + "loss": 0.5167, + "step": 19495 + }, + { + "epoch": 0.9899607823228541, + "grad_norm": 0.02503702827550537, + "learning_rate": 0.0005954204444047237, + "loss": 0.577, + "step": 19500 + }, + { + "epoch": 0.9902146184208857, + "grad_norm": 0.027330468479088564, + "learning_rate": 0.000595202980508973, + "loss": 0.5403, + "step": 19505 + }, + { + "epoch": 0.9904684545189172, + "grad_norm": 0.024841582567018985, + "learning_rate": 0.0005949854979253521, + "loss": 0.5139, + "step": 19510 + }, + { + "epoch": 0.9907222906169486, + "grad_norm": 0.02841062368445544, + "learning_rate": 0.0005947679966965517, + "loss": 0.5688, + "step": 19515 + }, + { + "epoch": 0.9909761267149801, + "grad_norm": 0.023717779713039483, + "learning_rate": 0.0005945504768652664, + "loss": 0.5672, + "step": 19520 + }, + { + "epoch": 0.9912299628130117, + "grad_norm": 0.02633602181222713, + "learning_rate": 0.0005943329384741937, + "loss": 0.5766, + "step": 19525 + }, + { + "epoch": 0.9914837989110431, + "grad_norm": 0.04290875603513479, + "learning_rate": 0.0005941153815660357, + "loss": 0.5472, + "step": 19530 + }, + { + "epoch": 0.9917376350090746, + "grad_norm": 0.024582207250698523, + "learning_rate": 0.0005938978061834977, + "loss": 0.5855, + "step": 19535 + }, + { + "epoch": 0.9919914711071062, + "grad_norm": 0.023721707310345872, + "learning_rate": 0.0005936802123692885, + "loss": 0.5748, + "step": 19540 + }, + { + "epoch": 0.9922453072051376, + "grad_norm": 0.026918040386621857, + "learning_rate": 0.0005934626001661209, + "loss": 0.5301, + "step": 19545 + }, + { + "epoch": 0.9924991433031691, + "grad_norm": 0.021708866914701308, + "learning_rate": 0.000593244969616711, + "loss": 0.5495, + "step": 19550 + }, + { + "epoch": 0.9927529794012007, + "grad_norm": 0.026495248356829797, + "learning_rate": 0.0005930273207637783, + "loss": 0.5222, + "step": 19555 + }, + { + "epoch": 0.9930068154992322, + "grad_norm": 0.02555187322005583, + "learning_rate": 0.0005928096536500467, + "loss": 0.54, + "step": 19560 + }, + { + "epoch": 0.9932606515972636, + "grad_norm": 0.02168086852198851, + "learning_rate": 0.0005925919683182429, + "loss": 0.5276, + "step": 19565 + }, + { + "epoch": 0.9935144876952952, + "grad_norm": 0.02499980089901408, + "learning_rate": 0.0005923742648110974, + "loss": 0.5073, + "step": 19570 + }, + { + "epoch": 0.9937683237933267, + "grad_norm": 0.02692412234182429, + "learning_rate": 0.0005921565431713445, + "loss": 0.5752, + "step": 19575 + }, + { + "epoch": 0.9940221598913581, + "grad_norm": 0.024546898354117743, + "learning_rate": 0.0005919388034417218, + "loss": 0.5328, + "step": 19580 + }, + { + "epoch": 0.9942759959893896, + "grad_norm": 0.024371485803514188, + "learning_rate": 0.0005917210456649703, + "loss": 0.5451, + "step": 19585 + }, + { + "epoch": 0.9945298320874212, + "grad_norm": 0.021869432790112914, + "learning_rate": 0.0005915032698838351, + "loss": 0.5394, + "step": 19590 + }, + { + "epoch": 0.9947836681854526, + "grad_norm": 0.03548043935373576, + "learning_rate": 0.0005912854761410642, + "loss": 0.5672, + "step": 19595 + }, + { + "epoch": 0.9950375042834841, + "grad_norm": 0.03662482896138081, + "learning_rate": 0.0005910676644794098, + "loss": 0.5641, + "step": 19600 + }, + { + "epoch": 0.9952913403815157, + "grad_norm": 0.0378468309188474, + "learning_rate": 0.0005908498349416269, + "loss": 0.5613, + "step": 19605 + }, + { + "epoch": 0.9955451764795472, + "grad_norm": 0.03627088854294031, + "learning_rate": 0.0005906319875704744, + "loss": 0.5532, + "step": 19610 + }, + { + "epoch": 0.9957990125775786, + "grad_norm": 0.03649215551829472, + "learning_rate": 0.0005904141224087147, + "loss": 0.4949, + "step": 19615 + }, + { + "epoch": 0.9960528486756102, + "grad_norm": 0.03763331065484329, + "learning_rate": 0.0005901962394991139, + "loss": 0.5862, + "step": 19620 + }, + { + "epoch": 0.9963066847736417, + "grad_norm": 0.0383387603823414, + "learning_rate": 0.0005899783388844408, + "loss": 0.5556, + "step": 19625 + }, + { + "epoch": 0.9965605208716731, + "grad_norm": 0.023484212925512286, + "learning_rate": 0.0005897604206074687, + "loss": 0.5708, + "step": 19630 + }, + { + "epoch": 0.9968143569697047, + "grad_norm": 0.03442449480611108, + "learning_rate": 0.0005895424847109736, + "loss": 0.5424, + "step": 19635 + }, + { + "epoch": 0.9970681930677362, + "grad_norm": 0.025468803257999176, + "learning_rate": 0.0005893245312377353, + "loss": 0.5184, + "step": 19640 + }, + { + "epoch": 0.9973220291657676, + "grad_norm": 0.023711176900071695, + "learning_rate": 0.0005891065602305369, + "loss": 0.5628, + "step": 19645 + }, + { + "epoch": 0.9975758652637992, + "grad_norm": 0.02757286076818459, + "learning_rate": 0.0005888885717321653, + "loss": 0.5549, + "step": 19650 + }, + { + "epoch": 0.9978297013618307, + "grad_norm": 0.022946177140901372, + "learning_rate": 0.0005886705657854101, + "loss": 0.531, + "step": 19655 + }, + { + "epoch": 0.9980835374598621, + "grad_norm": 0.03239722642028069, + "learning_rate": 0.0005884525424330652, + "loss": 0.547, + "step": 19660 + }, + { + "epoch": 0.9983373735578936, + "grad_norm": 0.02284766198581876, + "learning_rate": 0.0005882345017179274, + "loss": 0.5415, + "step": 19665 + }, + { + "epoch": 0.9985912096559252, + "grad_norm": 0.024423645649003435, + "learning_rate": 0.0005880164436827968, + "loss": 0.5319, + "step": 19670 + }, + { + "epoch": 0.9988450457539567, + "grad_norm": 0.023330277892660952, + "learning_rate": 0.0005877983683704772, + "loss": 0.5493, + "step": 19675 + }, + { + "epoch": 0.9990988818519881, + "grad_norm": 0.023557950454847, + "learning_rate": 0.0005875802758237758, + "loss": 0.5102, + "step": 19680 + }, + { + "epoch": 0.9993527179500197, + "grad_norm": 0.023129582403367616, + "learning_rate": 0.0005873621660855031, + "loss": 0.5691, + "step": 19685 + }, + { + "epoch": 0.9996065540480512, + "grad_norm": 0.03468112252443195, + "learning_rate": 0.0005871440391984729, + "loss": 0.5328, + "step": 19690 + }, + { + "epoch": 0.9998603901460826, + "grad_norm": 0.027312657946593693, + "learning_rate": 0.0005869258952055023, + "loss": 0.5544, + "step": 19695 + }, + { + "epoch": 1.0001269180490158, + "grad_norm": 0.026262955674306445, + "learning_rate": 0.000586707734149412, + "loss": 0.5426, + "step": 19700 + }, + { + "epoch": 1.0003807541470473, + "grad_norm": 0.022474094556517613, + "learning_rate": 0.0005864895560730257, + "loss": 0.4976, + "step": 19705 + }, + { + "epoch": 1.0006345902450788, + "grad_norm": 0.030681716691258087, + "learning_rate": 0.000586271361019171, + "loss": 0.484, + "step": 19710 + }, + { + "epoch": 1.0008884263431101, + "grad_norm": 0.06523945944943467, + "learning_rate": 0.0005860531490306784, + "loss": 0.5035, + "step": 19715 + }, + { + "epoch": 1.0011422624411417, + "grad_norm": 0.02136637932462246, + "learning_rate": 0.0005858349201503819, + "loss": 0.4933, + "step": 19720 + }, + { + "epoch": 1.0013960985391732, + "grad_norm": 0.023171987689390824, + "learning_rate": 0.0005856166744211185, + "loss": 0.5241, + "step": 19725 + }, + { + "epoch": 1.0016499346372048, + "grad_norm": 0.024915720736357203, + "learning_rate": 0.000585398411885729, + "loss": 0.5251, + "step": 19730 + }, + { + "epoch": 1.0019037707352363, + "grad_norm": 0.026668874934590553, + "learning_rate": 0.0005851801325870569, + "loss": 0.5185, + "step": 19735 + }, + { + "epoch": 1.0021576068332678, + "grad_norm": 0.03835627675960685, + "learning_rate": 0.0005849618365679497, + "loss": 0.5184, + "step": 19740 + }, + { + "epoch": 1.0024114429312994, + "grad_norm": 0.034939337784261996, + "learning_rate": 0.0005847435238712578, + "loss": 0.492, + "step": 19745 + }, + { + "epoch": 1.0026652790293307, + "grad_norm": 0.02797155118461246, + "learning_rate": 0.0005845251945398347, + "loss": 0.5061, + "step": 19750 + }, + { + "epoch": 1.0029191151273622, + "grad_norm": 0.02625721377041572, + "learning_rate": 0.0005843068486165374, + "loss": 0.5162, + "step": 19755 + }, + { + "epoch": 1.0031729512253937, + "grad_norm": 0.033958806367861846, + "learning_rate": 0.0005840884861442262, + "loss": 0.5117, + "step": 19760 + }, + { + "epoch": 1.0034267873234253, + "grad_norm": 0.03202782443206157, + "learning_rate": 0.0005838701071657643, + "loss": 0.5007, + "step": 19765 + }, + { + "epoch": 1.0036806234214568, + "grad_norm": 0.05036650421537131, + "learning_rate": 0.0005836517117240188, + "loss": 0.5151, + "step": 19770 + }, + { + "epoch": 1.0039344595194883, + "grad_norm": 0.04653612155409561, + "learning_rate": 0.0005834332998618596, + "loss": 0.4854, + "step": 19775 + }, + { + "epoch": 1.0041882956175199, + "grad_norm": 0.02804671100615471, + "learning_rate": 0.0005832148716221595, + "loss": 0.4987, + "step": 19780 + }, + { + "epoch": 1.0044421317155512, + "grad_norm": 0.022529443828552497, + "learning_rate": 0.0005829964270477953, + "loss": 0.5166, + "step": 19785 + }, + { + "epoch": 1.0046959678135827, + "grad_norm": 0.026684379922705167, + "learning_rate": 0.0005827779661816461, + "loss": 0.4885, + "step": 19790 + }, + { + "epoch": 1.0049498039116143, + "grad_norm": 0.060963234819393114, + "learning_rate": 0.000582559489066595, + "loss": 0.4791, + "step": 19795 + }, + { + "epoch": 1.0052036400096458, + "grad_norm": 0.03024468169105826, + "learning_rate": 0.0005823409957455281, + "loss": 0.5277, + "step": 19800 + }, + { + "epoch": 1.0054574761076773, + "grad_norm": 0.026438898745094152, + "learning_rate": 0.0005821224862613343, + "loss": 0.5012, + "step": 19805 + }, + { + "epoch": 1.0057113122057089, + "grad_norm": 0.03800242164729915, + "learning_rate": 0.000581903960656906, + "loss": 0.5191, + "step": 19810 + }, + { + "epoch": 1.0059651483037402, + "grad_norm": 0.02349650729472903, + "learning_rate": 0.0005816854189751386, + "loss": 0.5025, + "step": 19815 + }, + { + "epoch": 1.0062189844017717, + "grad_norm": 0.027214316740470396, + "learning_rate": 0.0005814668612589309, + "loss": 0.5251, + "step": 19820 + }, + { + "epoch": 1.0064728204998032, + "grad_norm": 0.025006007655423122, + "learning_rate": 0.0005812482875511845, + "loss": 0.5188, + "step": 19825 + }, + { + "epoch": 1.0067266565978348, + "grad_norm": 0.023683757750602448, + "learning_rate": 0.0005810296978948045, + "loss": 0.5285, + "step": 19830 + }, + { + "epoch": 1.0069804926958663, + "grad_norm": 0.02813093510207219, + "learning_rate": 0.0005808110923326989, + "loss": 0.5355, + "step": 19835 + }, + { + "epoch": 1.0072343287938978, + "grad_norm": 0.036465732354810876, + "learning_rate": 0.000580592470907779, + "loss": 0.5106, + "step": 19840 + }, + { + "epoch": 1.0074881648919294, + "grad_norm": 0.027979504250805996, + "learning_rate": 0.0005803738336629588, + "loss": 0.5217, + "step": 19845 + }, + { + "epoch": 1.0077420009899607, + "grad_norm": 0.02444979840392901, + "learning_rate": 0.0005801551806411561, + "loss": 0.5038, + "step": 19850 + }, + { + "epoch": 1.0079958370879922, + "grad_norm": 0.02488255109410723, + "learning_rate": 0.000579936511885291, + "loss": 0.5093, + "step": 19855 + }, + { + "epoch": 1.0082496731860238, + "grad_norm": 0.02550049459565845, + "learning_rate": 0.0005797178274382873, + "loss": 0.5269, + "step": 19860 + }, + { + "epoch": 1.0085035092840553, + "grad_norm": 0.02148889517361041, + "learning_rate": 0.0005794991273430716, + "loss": 0.5236, + "step": 19865 + }, + { + "epoch": 1.0087573453820868, + "grad_norm": 0.029497009648340886, + "learning_rate": 0.0005792804116425736, + "loss": 0.5042, + "step": 19870 + }, + { + "epoch": 1.0090111814801184, + "grad_norm": 0.02775516992605051, + "learning_rate": 0.0005790616803797263, + "loss": 0.5132, + "step": 19875 + }, + { + "epoch": 1.0092650175781497, + "grad_norm": 0.021211655705574328, + "learning_rate": 0.0005788429335974653, + "loss": 0.4801, + "step": 19880 + }, + { + "epoch": 1.0095188536761812, + "grad_norm": 0.02517030398746431, + "learning_rate": 0.0005786241713387297, + "loss": 0.5146, + "step": 19885 + }, + { + "epoch": 1.0097726897742128, + "grad_norm": 0.09462270152737755, + "learning_rate": 0.0005784053936464613, + "loss": 0.5127, + "step": 19890 + }, + { + "epoch": 1.0100265258722443, + "grad_norm": 0.026408205454532502, + "learning_rate": 0.0005781866005636052, + "loss": 0.5132, + "step": 19895 + }, + { + "epoch": 1.0102803619702758, + "grad_norm": 0.048718416202262686, + "learning_rate": 0.0005779677921331093, + "loss": 0.5499, + "step": 19900 + }, + { + "epoch": 1.0105341980683074, + "grad_norm": 0.05158533517321516, + "learning_rate": 0.0005777489683979247, + "loss": 0.4807, + "step": 19905 + }, + { + "epoch": 1.010788034166339, + "grad_norm": 0.042414622078804524, + "learning_rate": 0.0005775301294010052, + "loss": 0.5528, + "step": 19910 + }, + { + "epoch": 1.0110418702643702, + "grad_norm": 0.04213150411449631, + "learning_rate": 0.000577311275185308, + "loss": 0.5266, + "step": 19915 + }, + { + "epoch": 1.0112957063624017, + "grad_norm": 0.03233858503499294, + "learning_rate": 0.000577092405793793, + "loss": 0.4932, + "step": 19920 + }, + { + "epoch": 1.0115495424604333, + "grad_norm": 0.039425677298710396, + "learning_rate": 0.0005768735212694232, + "loss": 0.5242, + "step": 19925 + }, + { + "epoch": 1.0118033785584648, + "grad_norm": 0.04737514302447274, + "learning_rate": 0.0005766546216551646, + "loss": 0.4966, + "step": 19930 + }, + { + "epoch": 1.0120572146564963, + "grad_norm": 0.03637605851463593, + "learning_rate": 0.0005764357069939861, + "loss": 0.4907, + "step": 19935 + }, + { + "epoch": 1.0123110507545279, + "grad_norm": 0.03694503903527369, + "learning_rate": 0.0005762167773288594, + "loss": 0.5324, + "step": 19940 + }, + { + "epoch": 1.0125648868525594, + "grad_norm": 0.02639160420478131, + "learning_rate": 0.0005759978327027594, + "loss": 0.5153, + "step": 19945 + }, + { + "epoch": 1.0128187229505907, + "grad_norm": 0.028504997239272276, + "learning_rate": 0.000575778873158664, + "loss": 0.5113, + "step": 19950 + }, + { + "epoch": 1.0130725590486223, + "grad_norm": 0.024034023043991923, + "learning_rate": 0.0005755598987395535, + "loss": 0.5544, + "step": 19955 + }, + { + "epoch": 1.0133263951466538, + "grad_norm": 0.024483302437720976, + "learning_rate": 0.0005753409094884118, + "loss": 0.512, + "step": 19960 + }, + { + "epoch": 1.0135802312446853, + "grad_norm": 0.03202531364702948, + "learning_rate": 0.0005751219054482252, + "loss": 0.5313, + "step": 19965 + }, + { + "epoch": 1.0138340673427169, + "grad_norm": 0.04629752856614847, + "learning_rate": 0.0005749028866619833, + "loss": 0.5335, + "step": 19970 + }, + { + "epoch": 1.0140879034407484, + "grad_norm": 0.03603069481589699, + "learning_rate": 0.0005746838531726783, + "loss": 0.4915, + "step": 19975 + }, + { + "epoch": 1.0143417395387797, + "grad_norm": 0.036047586573516865, + "learning_rate": 0.0005744648050233053, + "loss": 0.5061, + "step": 19980 + }, + { + "epoch": 1.0145955756368112, + "grad_norm": 0.02500217530038146, + "learning_rate": 0.0005742457422568626, + "loss": 0.5021, + "step": 19985 + }, + { + "epoch": 1.0148494117348428, + "grad_norm": 0.030208741553390497, + "learning_rate": 0.0005740266649163507, + "loss": 0.5167, + "step": 19990 + }, + { + "epoch": 1.0151032478328743, + "grad_norm": 0.027545548141998097, + "learning_rate": 0.0005738075730447738, + "loss": 0.523, + "step": 19995 + }, + { + "epoch": 1.0153570839309058, + "grad_norm": 0.08854306261500218, + "learning_rate": 0.0005735884666851383, + "loss": 0.5208, + "step": 20000 + }, + { + "epoch": 1.0156109200289374, + "grad_norm": 0.030820917059042843, + "learning_rate": 0.0005733693458804537, + "loss": 0.5101, + "step": 20005 + }, + { + "epoch": 1.015864756126969, + "grad_norm": 0.034348864701267225, + "learning_rate": 0.0005731502106737326, + "loss": 0.5029, + "step": 20010 + }, + { + "epoch": 1.0161185922250002, + "grad_norm": 0.05872350374420824, + "learning_rate": 0.0005729310611079899, + "loss": 0.522, + "step": 20015 + }, + { + "epoch": 1.0163724283230318, + "grad_norm": 0.156969240858159, + "learning_rate": 0.0005727118972262437, + "loss": 0.5268, + "step": 20020 + }, + { + "epoch": 1.0166262644210633, + "grad_norm": 0.028641596405852787, + "learning_rate": 0.0005724927190715144, + "loss": 0.5322, + "step": 20025 + }, + { + "epoch": 1.0168801005190948, + "grad_norm": 0.03521162865341286, + "learning_rate": 0.0005722735266868261, + "loss": 0.4916, + "step": 20030 + }, + { + "epoch": 1.0171339366171264, + "grad_norm": 0.10045881918602356, + "learning_rate": 0.0005720543201152048, + "loss": 0.5197, + "step": 20035 + }, + { + "epoch": 1.017387772715158, + "grad_norm": 0.029867425574080526, + "learning_rate": 0.0005718350993996798, + "loss": 0.517, + "step": 20040 + }, + { + "epoch": 1.0176416088131894, + "grad_norm": 0.025642792051341005, + "learning_rate": 0.0005716158645832831, + "loss": 0.5213, + "step": 20045 + }, + { + "epoch": 1.0178954449112207, + "grad_norm": 0.02164896016678895, + "learning_rate": 0.0005713966157090493, + "loss": 0.5081, + "step": 20050 + }, + { + "epoch": 1.0181492810092523, + "grad_norm": 0.029307046081771176, + "learning_rate": 0.000571177352820016, + "loss": 0.56, + "step": 20055 + }, + { + "epoch": 1.0184031171072838, + "grad_norm": 0.03348711512912175, + "learning_rate": 0.0005709580759592232, + "loss": 0.4986, + "step": 20060 + }, + { + "epoch": 1.0186569532053154, + "grad_norm": 0.022750518622437432, + "learning_rate": 0.000570738785169714, + "loss": 0.5318, + "step": 20065 + }, + { + "epoch": 1.0189107893033469, + "grad_norm": 0.03273089769737566, + "learning_rate": 0.0005705194804945339, + "loss": 0.5205, + "step": 20070 + }, + { + "epoch": 1.0191646254013784, + "grad_norm": 0.03101997693290262, + "learning_rate": 0.0005703001619767317, + "loss": 0.5585, + "step": 20075 + }, + { + "epoch": 1.0194184614994097, + "grad_norm": 0.48293463443735013, + "learning_rate": 0.0005700808296593581, + "loss": 0.5232, + "step": 20080 + }, + { + "epoch": 1.0196722975974413, + "grad_norm": 0.026869468991904833, + "learning_rate": 0.0005698614835854672, + "loss": 0.5329, + "step": 20085 + }, + { + "epoch": 1.0199261336954728, + "grad_norm": 0.03175236206705197, + "learning_rate": 0.0005696421237981155, + "loss": 0.5293, + "step": 20090 + }, + { + "epoch": 1.0201799697935043, + "grad_norm": 0.02839497971854906, + "learning_rate": 0.0005694227503403623, + "loss": 0.5012, + "step": 20095 + }, + { + "epoch": 1.0204338058915359, + "grad_norm": 0.03500525520582677, + "learning_rate": 0.0005692033632552691, + "loss": 0.5219, + "step": 20100 + }, + { + "epoch": 1.0206876419895674, + "grad_norm": 0.034416581373221365, + "learning_rate": 0.000568983962585901, + "loss": 0.4977, + "step": 20105 + }, + { + "epoch": 1.020941478087599, + "grad_norm": 0.025912971110602936, + "learning_rate": 0.0005687645483753252, + "loss": 0.5208, + "step": 20110 + }, + { + "epoch": 1.0211953141856303, + "grad_norm": 0.033142018106894794, + "learning_rate": 0.0005685451206666113, + "loss": 0.4926, + "step": 20115 + }, + { + "epoch": 1.0214491502836618, + "grad_norm": 0.03866721854730293, + "learning_rate": 0.0005683256795028321, + "loss": 0.4984, + "step": 20120 + }, + { + "epoch": 1.0217029863816933, + "grad_norm": 0.027988000692318403, + "learning_rate": 0.0005681062249270627, + "loss": 0.4899, + "step": 20125 + }, + { + "epoch": 1.0219568224797249, + "grad_norm": 0.022355333164415802, + "learning_rate": 0.000567886756982381, + "loss": 0.5059, + "step": 20130 + }, + { + "epoch": 1.0222106585777564, + "grad_norm": 0.0440650062378424, + "learning_rate": 0.0005676672757118675, + "loss": 0.5015, + "step": 20135 + }, + { + "epoch": 1.022464494675788, + "grad_norm": 0.023375514087650193, + "learning_rate": 0.0005674477811586053, + "loss": 0.4984, + "step": 20140 + }, + { + "epoch": 1.0227183307738192, + "grad_norm": 0.03157563540299216, + "learning_rate": 0.0005672282733656799, + "loss": 0.5046, + "step": 20145 + }, + { + "epoch": 1.0229721668718508, + "grad_norm": 0.02116231894172818, + "learning_rate": 0.0005670087523761797, + "loss": 0.5165, + "step": 20150 + }, + { + "epoch": 1.0232260029698823, + "grad_norm": 0.02699273830998635, + "learning_rate": 0.0005667892182331958, + "loss": 0.5343, + "step": 20155 + }, + { + "epoch": 1.0234798390679138, + "grad_norm": 0.025520103162852272, + "learning_rate": 0.0005665696709798211, + "loss": 0.5015, + "step": 20160 + }, + { + "epoch": 1.0237336751659454, + "grad_norm": 0.02320429503242182, + "learning_rate": 0.0005663501106591522, + "loss": 0.5069, + "step": 20165 + }, + { + "epoch": 1.023987511263977, + "grad_norm": 0.03088412096065196, + "learning_rate": 0.0005661305373142874, + "loss": 0.5269, + "step": 20170 + }, + { + "epoch": 1.0242413473620084, + "grad_norm": 0.028956706789680344, + "learning_rate": 0.0005659109509883279, + "loss": 0.5156, + "step": 20175 + }, + { + "epoch": 1.0244951834600398, + "grad_norm": 0.028907980726983182, + "learning_rate": 0.0005656913517243775, + "loss": 0.5126, + "step": 20180 + }, + { + "epoch": 1.0247490195580713, + "grad_norm": 0.031564123214603855, + "learning_rate": 0.0005654717395655423, + "loss": 0.5212, + "step": 20185 + }, + { + "epoch": 1.0250028556561028, + "grad_norm": 0.023659437611885405, + "learning_rate": 0.0005652521145549312, + "loss": 0.5182, + "step": 20190 + }, + { + "epoch": 1.0252566917541344, + "grad_norm": 0.024467789569289182, + "learning_rate": 0.0005650324767356553, + "loss": 0.5317, + "step": 20195 + }, + { + "epoch": 1.025510527852166, + "grad_norm": 0.0259776185535513, + "learning_rate": 0.0005648128261508287, + "loss": 0.5111, + "step": 20200 + }, + { + "epoch": 1.0257643639501974, + "grad_norm": 0.037490439119855785, + "learning_rate": 0.0005645931628435674, + "loss": 0.4861, + "step": 20205 + }, + { + "epoch": 1.026018200048229, + "grad_norm": 0.033783262227514106, + "learning_rate": 0.0005643734868569904, + "loss": 0.5077, + "step": 20210 + }, + { + "epoch": 1.0262720361462603, + "grad_norm": 0.02114752691351358, + "learning_rate": 0.0005641537982342189, + "loss": 0.5203, + "step": 20215 + }, + { + "epoch": 1.0265258722442918, + "grad_norm": 0.0438818125159392, + "learning_rate": 0.0005639340970183767, + "loss": 0.5161, + "step": 20220 + }, + { + "epoch": 1.0267797083423233, + "grad_norm": 0.025324346639667195, + "learning_rate": 0.0005637143832525902, + "loss": 0.5301, + "step": 20225 + }, + { + "epoch": 1.0270335444403549, + "grad_norm": 0.02505419634462526, + "learning_rate": 0.000563494656979988, + "loss": 0.5142, + "step": 20230 + }, + { + "epoch": 1.0272873805383864, + "grad_norm": 0.0260932413609203, + "learning_rate": 0.0005632749182437013, + "loss": 0.476, + "step": 20235 + }, + { + "epoch": 1.027541216636418, + "grad_norm": 0.02657813325380469, + "learning_rate": 0.0005630551670868638, + "loss": 0.5121, + "step": 20240 + }, + { + "epoch": 1.0277950527344493, + "grad_norm": 0.02517945560861088, + "learning_rate": 0.0005628354035526113, + "loss": 0.5014, + "step": 20245 + }, + { + "epoch": 1.0280488888324808, + "grad_norm": 0.02457715419344653, + "learning_rate": 0.0005626156276840824, + "loss": 0.5445, + "step": 20250 + }, + { + "epoch": 1.0283027249305123, + "grad_norm": 0.023965938720396072, + "learning_rate": 0.0005623958395244182, + "loss": 0.5173, + "step": 20255 + }, + { + "epoch": 1.0285565610285439, + "grad_norm": 0.025808791859439455, + "learning_rate": 0.0005621760391167618, + "loss": 0.512, + "step": 20260 + }, + { + "epoch": 1.0288103971265754, + "grad_norm": 0.02616645418239067, + "learning_rate": 0.0005619562265042589, + "loss": 0.5024, + "step": 20265 + }, + { + "epoch": 1.029064233224607, + "grad_norm": 0.026011495515384515, + "learning_rate": 0.0005617364017300579, + "loss": 0.5688, + "step": 20270 + }, + { + "epoch": 1.0293180693226385, + "grad_norm": 0.023645680045192928, + "learning_rate": 0.0005615165648373091, + "loss": 0.5107, + "step": 20275 + }, + { + "epoch": 1.0295719054206698, + "grad_norm": 0.030625762897844983, + "learning_rate": 0.0005612967158691652, + "loss": 0.5425, + "step": 20280 + }, + { + "epoch": 1.0298257415187013, + "grad_norm": 0.02374291305295429, + "learning_rate": 0.0005610768548687818, + "loss": 0.4875, + "step": 20285 + }, + { + "epoch": 1.0300795776167329, + "grad_norm": 0.030056754991982286, + "learning_rate": 0.0005608569818793163, + "loss": 0.5138, + "step": 20290 + }, + { + "epoch": 1.0303334137147644, + "grad_norm": 0.021415600184617786, + "learning_rate": 0.0005606370969439288, + "loss": 0.5262, + "step": 20295 + }, + { + "epoch": 1.030587249812796, + "grad_norm": 0.035674997734029885, + "learning_rate": 0.0005604172001057817, + "loss": 0.4913, + "step": 20300 + }, + { + "epoch": 1.0308410859108275, + "grad_norm": 0.02322983861959061, + "learning_rate": 0.0005601972914080394, + "loss": 0.5258, + "step": 20305 + }, + { + "epoch": 1.0310949220088588, + "grad_norm": 0.02417219163699135, + "learning_rate": 0.000559977370893869, + "loss": 0.523, + "step": 20310 + }, + { + "epoch": 1.0313487581068903, + "grad_norm": 0.028159975076131768, + "learning_rate": 0.0005597574386064398, + "loss": 0.4937, + "step": 20315 + }, + { + "epoch": 1.0316025942049218, + "grad_norm": 0.025480520142062726, + "learning_rate": 0.0005595374945889235, + "loss": 0.5297, + "step": 20320 + }, + { + "epoch": 1.0318564303029534, + "grad_norm": 0.02648031129017469, + "learning_rate": 0.0005593175388844939, + "loss": 0.5191, + "step": 20325 + }, + { + "epoch": 1.032110266400985, + "grad_norm": 0.027996109985128536, + "learning_rate": 0.0005590975715363271, + "loss": 0.5182, + "step": 20330 + }, + { + "epoch": 1.0323641024990164, + "grad_norm": 0.02208647698714924, + "learning_rate": 0.0005588775925876019, + "loss": 0.4733, + "step": 20335 + }, + { + "epoch": 1.032617938597048, + "grad_norm": 0.03617341314263012, + "learning_rate": 0.0005586576020814986, + "loss": 0.5569, + "step": 20340 + }, + { + "epoch": 1.0328717746950793, + "grad_norm": 0.02362640361558171, + "learning_rate": 0.0005584376000612008, + "loss": 0.5051, + "step": 20345 + }, + { + "epoch": 1.0331256107931108, + "grad_norm": 0.024439837400368546, + "learning_rate": 0.0005582175865698935, + "loss": 0.5007, + "step": 20350 + }, + { + "epoch": 1.0333794468911424, + "grad_norm": 0.02551816319500186, + "learning_rate": 0.0005579975616507642, + "loss": 0.5351, + "step": 20355 + }, + { + "epoch": 1.033633282989174, + "grad_norm": 0.03547530832085492, + "learning_rate": 0.0005577775253470028, + "loss": 0.5283, + "step": 20360 + }, + { + "epoch": 1.0338871190872054, + "grad_norm": 0.024034616664719018, + "learning_rate": 0.0005575574777018014, + "loss": 0.5191, + "step": 20365 + }, + { + "epoch": 1.034140955185237, + "grad_norm": 0.023421751953444182, + "learning_rate": 0.000557337418758354, + "loss": 0.5522, + "step": 20370 + }, + { + "epoch": 1.0343947912832685, + "grad_norm": 0.02493069566308207, + "learning_rate": 0.0005571173485598575, + "loss": 0.5506, + "step": 20375 + }, + { + "epoch": 1.0346486273812998, + "grad_norm": 0.02674149513349264, + "learning_rate": 0.0005568972671495102, + "loss": 0.5057, + "step": 20380 + }, + { + "epoch": 1.0349024634793313, + "grad_norm": 0.025106714581499427, + "learning_rate": 0.000556677174570513, + "loss": 0.5033, + "step": 20385 + }, + { + "epoch": 1.0351562995773629, + "grad_norm": 0.025943628794495237, + "learning_rate": 0.0005564570708660692, + "loss": 0.4955, + "step": 20390 + }, + { + "epoch": 1.0354101356753944, + "grad_norm": 0.026309101778550982, + "learning_rate": 0.000556236956079384, + "loss": 0.4825, + "step": 20395 + }, + { + "epoch": 1.035663971773426, + "grad_norm": 0.026006362089191562, + "learning_rate": 0.0005560168302536645, + "loss": 0.5453, + "step": 20400 + }, + { + "epoch": 1.0359178078714575, + "grad_norm": 0.026998637894890546, + "learning_rate": 0.0005557966934321208, + "loss": 0.5181, + "step": 20405 + }, + { + "epoch": 1.0361716439694888, + "grad_norm": 0.023293334910978493, + "learning_rate": 0.0005555765456579645, + "loss": 0.4928, + "step": 20410 + }, + { + "epoch": 1.0364254800675203, + "grad_norm": 0.02235857148830006, + "learning_rate": 0.0005553563869744092, + "loss": 0.5101, + "step": 20415 + }, + { + "epoch": 1.0366793161655519, + "grad_norm": 0.031120430714844608, + "learning_rate": 0.0005551362174246714, + "loss": 0.4839, + "step": 20420 + }, + { + "epoch": 1.0369331522635834, + "grad_norm": 0.03754523909785098, + "learning_rate": 0.000554916037051969, + "loss": 0.5065, + "step": 20425 + }, + { + "epoch": 1.037186988361615, + "grad_norm": 0.02326842333764262, + "learning_rate": 0.0005546958458995225, + "loss": 0.5045, + "step": 20430 + }, + { + "epoch": 1.0374408244596465, + "grad_norm": 0.02252685117622399, + "learning_rate": 0.0005544756440105541, + "loss": 0.4895, + "step": 20435 + }, + { + "epoch": 1.037694660557678, + "grad_norm": 0.022048155029177167, + "learning_rate": 0.0005542554314282885, + "loss": 0.4994, + "step": 20440 + }, + { + "epoch": 1.0379484966557093, + "grad_norm": 0.034805475396144946, + "learning_rate": 0.0005540352081959524, + "loss": 0.5204, + "step": 20445 + }, + { + "epoch": 1.0382023327537409, + "grad_norm": 0.03443533236482359, + "learning_rate": 0.0005538149743567742, + "loss": 0.4981, + "step": 20450 + }, + { + "epoch": 1.0384561688517724, + "grad_norm": 0.02382977310407126, + "learning_rate": 0.000553594729953985, + "loss": 0.51, + "step": 20455 + }, + { + "epoch": 1.038710004949804, + "grad_norm": 0.025391666920891502, + "learning_rate": 0.0005533744750308173, + "loss": 0.5266, + "step": 20460 + }, + { + "epoch": 1.0389638410478355, + "grad_norm": 0.03609117313209483, + "learning_rate": 0.0005531542096305067, + "loss": 0.4907, + "step": 20465 + }, + { + "epoch": 1.039217677145867, + "grad_norm": 0.023507381495373715, + "learning_rate": 0.0005529339337962898, + "loss": 0.4975, + "step": 20470 + }, + { + "epoch": 1.0394715132438983, + "grad_norm": 0.07606685865235592, + "learning_rate": 0.0005527136475714055, + "loss": 0.4953, + "step": 20475 + }, + { + "epoch": 1.0397253493419298, + "grad_norm": 0.025963486371282662, + "learning_rate": 0.0005524933509990953, + "loss": 0.5241, + "step": 20480 + }, + { + "epoch": 1.0399791854399614, + "grad_norm": 0.03372063969736153, + "learning_rate": 0.0005522730441226019, + "loss": 0.4818, + "step": 20485 + }, + { + "epoch": 1.040233021537993, + "grad_norm": 0.024045885674214548, + "learning_rate": 0.0005520527269851707, + "loss": 0.5318, + "step": 20490 + }, + { + "epoch": 1.0404868576360244, + "grad_norm": 0.025422330712362343, + "learning_rate": 0.0005518323996300486, + "loss": 0.4953, + "step": 20495 + }, + { + "epoch": 1.040740693734056, + "grad_norm": 0.033795313139793635, + "learning_rate": 0.0005516120621004852, + "loss": 0.5, + "step": 20500 + }, + { + "epoch": 1.0409945298320875, + "grad_norm": 0.023641788988281797, + "learning_rate": 0.0005513917144397313, + "loss": 0.5393, + "step": 20505 + }, + { + "epoch": 1.0412483659301188, + "grad_norm": 0.029211469214469557, + "learning_rate": 0.0005511713566910401, + "loss": 0.5019, + "step": 20510 + }, + { + "epoch": 1.0415022020281504, + "grad_norm": 0.03256182907446245, + "learning_rate": 0.0005509509888976668, + "loss": 0.5106, + "step": 20515 + }, + { + "epoch": 1.041756038126182, + "grad_norm": 0.03264231316359048, + "learning_rate": 0.0005507306111028683, + "loss": 0.5352, + "step": 20520 + }, + { + "epoch": 1.0420098742242134, + "grad_norm": 0.04894371601933567, + "learning_rate": 0.000550510223349904, + "loss": 0.5144, + "step": 20525 + }, + { + "epoch": 1.042263710322245, + "grad_norm": 0.0439869944668601, + "learning_rate": 0.0005502898256820349, + "loss": 0.5165, + "step": 20530 + }, + { + "epoch": 1.0425175464202765, + "grad_norm": 0.03490709301718278, + "learning_rate": 0.0005500694181425237, + "loss": 0.5228, + "step": 20535 + }, + { + "epoch": 1.042771382518308, + "grad_norm": 0.02637599113409724, + "learning_rate": 0.0005498490007746354, + "loss": 0.4868, + "step": 20540 + }, + { + "epoch": 1.0430252186163393, + "grad_norm": 0.030905442645324806, + "learning_rate": 0.0005496285736216369, + "loss": 0.5007, + "step": 20545 + }, + { + "epoch": 1.0432790547143709, + "grad_norm": 0.022912410310244463, + "learning_rate": 0.0005494081367267968, + "loss": 0.531, + "step": 20550 + }, + { + "epoch": 1.0435328908124024, + "grad_norm": 0.03822182605381993, + "learning_rate": 0.0005491876901333859, + "loss": 0.5184, + "step": 20555 + }, + { + "epoch": 1.043786726910434, + "grad_norm": 0.03854079290096544, + "learning_rate": 0.0005489672338846767, + "loss": 0.5451, + "step": 20560 + }, + { + "epoch": 1.0440405630084655, + "grad_norm": 0.023025560697022475, + "learning_rate": 0.0005487467680239437, + "loss": 0.486, + "step": 20565 + }, + { + "epoch": 1.044294399106497, + "grad_norm": 0.02820566558878494, + "learning_rate": 0.0005485262925944633, + "loss": 0.5208, + "step": 20570 + }, + { + "epoch": 1.0445482352045283, + "grad_norm": 0.023098213177233765, + "learning_rate": 0.0005483058076395136, + "loss": 0.5211, + "step": 20575 + }, + { + "epoch": 1.0448020713025599, + "grad_norm": 0.032862139607394404, + "learning_rate": 0.0005480853132023746, + "loss": 0.4847, + "step": 20580 + }, + { + "epoch": 1.0450559074005914, + "grad_norm": 0.02296236963012799, + "learning_rate": 0.0005478648093263286, + "loss": 0.4886, + "step": 20585 + }, + { + "epoch": 1.045309743498623, + "grad_norm": 0.022572602535311893, + "learning_rate": 0.0005476442960546592, + "loss": 0.5119, + "step": 20590 + }, + { + "epoch": 1.0455635795966545, + "grad_norm": 0.030771735483987164, + "learning_rate": 0.0005474237734306522, + "loss": 0.4762, + "step": 20595 + }, + { + "epoch": 1.045817415694686, + "grad_norm": 0.03547547036096252, + "learning_rate": 0.0005472032414975949, + "loss": 0.5127, + "step": 20600 + }, + { + "epoch": 1.0460712517927175, + "grad_norm": 0.0254360744170134, + "learning_rate": 0.0005469827002987767, + "loss": 0.4929, + "step": 20605 + }, + { + "epoch": 1.0463250878907489, + "grad_norm": 0.025518311798158718, + "learning_rate": 0.0005467621498774886, + "loss": 0.5245, + "step": 20610 + }, + { + "epoch": 1.0465789239887804, + "grad_norm": 0.026490108041312813, + "learning_rate": 0.0005465415902770238, + "loss": 0.4944, + "step": 20615 + }, + { + "epoch": 1.046832760086812, + "grad_norm": 0.02526869174684408, + "learning_rate": 0.0005463210215406769, + "loss": 0.5045, + "step": 20620 + }, + { + "epoch": 1.0470865961848435, + "grad_norm": 0.02074411912466121, + "learning_rate": 0.0005461004437117445, + "loss": 0.51, + "step": 20625 + }, + { + "epoch": 1.047340432282875, + "grad_norm": 0.027093812806925615, + "learning_rate": 0.0005458798568335249, + "loss": 0.5166, + "step": 20630 + }, + { + "epoch": 1.0475942683809065, + "grad_norm": 0.02436102653069909, + "learning_rate": 0.0005456592609493182, + "loss": 0.5132, + "step": 20635 + }, + { + "epoch": 1.047848104478938, + "grad_norm": 0.023097199436276732, + "learning_rate": 0.0005454386561024263, + "loss": 0.4994, + "step": 20640 + }, + { + "epoch": 1.0481019405769694, + "grad_norm": 0.026331858968549688, + "learning_rate": 0.0005452180423361528, + "loss": 0.5153, + "step": 20645 + }, + { + "epoch": 1.048355776675001, + "grad_norm": 0.02645976286908623, + "learning_rate": 0.0005449974196938031, + "loss": 0.5188, + "step": 20650 + }, + { + "epoch": 1.0486096127730324, + "grad_norm": 0.025310276627568677, + "learning_rate": 0.0005447767882186844, + "loss": 0.53, + "step": 20655 + }, + { + "epoch": 1.048863448871064, + "grad_norm": 0.0261930070884005, + "learning_rate": 0.0005445561479541053, + "loss": 0.4882, + "step": 20660 + }, + { + "epoch": 1.0491172849690955, + "grad_norm": 0.025129244942486483, + "learning_rate": 0.0005443354989433766, + "loss": 0.4951, + "step": 20665 + }, + { + "epoch": 1.049371121067127, + "grad_norm": 0.020547190474118154, + "learning_rate": 0.0005441148412298106, + "loss": 0.4623, + "step": 20670 + }, + { + "epoch": 1.0496249571651584, + "grad_norm": 0.02180036590225857, + "learning_rate": 0.0005438941748567212, + "loss": 0.5372, + "step": 20675 + }, + { + "epoch": 1.04987879326319, + "grad_norm": 0.026634325082656105, + "learning_rate": 0.0005436734998674242, + "loss": 0.526, + "step": 20680 + }, + { + "epoch": 1.0501326293612214, + "grad_norm": 0.022973149324837107, + "learning_rate": 0.0005434528163052371, + "loss": 0.4898, + "step": 20685 + }, + { + "epoch": 1.050386465459253, + "grad_norm": 0.024359590869779388, + "learning_rate": 0.0005432321242134787, + "loss": 0.504, + "step": 20690 + }, + { + "epoch": 1.0506403015572845, + "grad_norm": 0.024548501089468676, + "learning_rate": 0.0005430114236354701, + "loss": 0.5368, + "step": 20695 + }, + { + "epoch": 1.050894137655316, + "grad_norm": 0.021875608126201377, + "learning_rate": 0.0005427907146145333, + "loss": 0.5023, + "step": 20700 + }, + { + "epoch": 1.0511479737533476, + "grad_norm": 0.03438836236481197, + "learning_rate": 0.0005425699971939927, + "loss": 0.5339, + "step": 20705 + }, + { + "epoch": 1.0514018098513789, + "grad_norm": 0.023250977742150812, + "learning_rate": 0.000542349271417174, + "loss": 0.5067, + "step": 20710 + }, + { + "epoch": 1.0516556459494104, + "grad_norm": 0.030063734343917937, + "learning_rate": 0.0005421285373274045, + "loss": 0.4875, + "step": 20715 + }, + { + "epoch": 1.051909482047442, + "grad_norm": 0.028285057645613915, + "learning_rate": 0.0005419077949680132, + "loss": 0.5006, + "step": 20720 + }, + { + "epoch": 1.0521633181454735, + "grad_norm": 0.023333102184669698, + "learning_rate": 0.0005416870443823308, + "loss": 0.4778, + "step": 20725 + }, + { + "epoch": 1.052417154243505, + "grad_norm": 0.02557383529258429, + "learning_rate": 0.0005414662856136894, + "loss": 0.53, + "step": 20730 + }, + { + "epoch": 1.0526709903415365, + "grad_norm": 0.023805131180192746, + "learning_rate": 0.0005412455187054229, + "loss": 0.5292, + "step": 20735 + }, + { + "epoch": 1.0529248264395679, + "grad_norm": 0.049484796752778205, + "learning_rate": 0.0005410247437008668, + "loss": 0.5113, + "step": 20740 + }, + { + "epoch": 1.0531786625375994, + "grad_norm": 0.04103309197810282, + "learning_rate": 0.0005408039606433582, + "loss": 0.5546, + "step": 20745 + }, + { + "epoch": 1.053432498635631, + "grad_norm": 0.03609048454627881, + "learning_rate": 0.0005405831695762355, + "loss": 0.5268, + "step": 20750 + }, + { + "epoch": 1.0536863347336625, + "grad_norm": 0.02838028673259011, + "learning_rate": 0.0005403623705428391, + "loss": 0.5169, + "step": 20755 + }, + { + "epoch": 1.053940170831694, + "grad_norm": 0.02412838334841983, + "learning_rate": 0.0005401415635865106, + "loss": 0.4983, + "step": 20760 + }, + { + "epoch": 1.0541940069297255, + "grad_norm": 0.023534830774553976, + "learning_rate": 0.0005399207487505934, + "loss": 0.4908, + "step": 20765 + }, + { + "epoch": 1.054447843027757, + "grad_norm": 0.08793074530330729, + "learning_rate": 0.0005396999260784323, + "loss": 0.4971, + "step": 20770 + }, + { + "epoch": 1.0547016791257884, + "grad_norm": 0.02146817209464667, + "learning_rate": 0.0005394790956133736, + "loss": 0.487, + "step": 20775 + }, + { + "epoch": 1.05495551522382, + "grad_norm": 0.02956735657834776, + "learning_rate": 0.0005392582573987654, + "loss": 0.4974, + "step": 20780 + }, + { + "epoch": 1.0552093513218515, + "grad_norm": 0.03392596129392696, + "learning_rate": 0.0005390374114779571, + "loss": 0.5112, + "step": 20785 + }, + { + "epoch": 1.055463187419883, + "grad_norm": 0.024893466438836578, + "learning_rate": 0.0005388165578942993, + "loss": 0.4961, + "step": 20790 + }, + { + "epoch": 1.0557170235179145, + "grad_norm": 0.02466824162848842, + "learning_rate": 0.0005385956966911451, + "loss": 0.5279, + "step": 20795 + }, + { + "epoch": 1.055970859615946, + "grad_norm": 0.021932982882764882, + "learning_rate": 0.000538374827911848, + "loss": 0.5209, + "step": 20800 + }, + { + "epoch": 1.0562246957139776, + "grad_norm": 0.031454975273976886, + "learning_rate": 0.0005381539515997636, + "loss": 0.5206, + "step": 20805 + }, + { + "epoch": 1.056478531812009, + "grad_norm": 0.02097302093594751, + "learning_rate": 0.0005379330677982487, + "loss": 0.4813, + "step": 20810 + }, + { + "epoch": 1.0567323679100404, + "grad_norm": 0.027013738993206558, + "learning_rate": 0.0005377121765506619, + "loss": 0.5156, + "step": 20815 + }, + { + "epoch": 1.056986204008072, + "grad_norm": 0.035362483004186006, + "learning_rate": 0.0005374912779003626, + "loss": 0.5105, + "step": 20820 + }, + { + "epoch": 1.0572400401061035, + "grad_norm": 0.028929030452595348, + "learning_rate": 0.0005372703718907127, + "loss": 0.5047, + "step": 20825 + }, + { + "epoch": 1.057493876204135, + "grad_norm": 0.024817950996213613, + "learning_rate": 0.0005370494585650746, + "loss": 0.4983, + "step": 20830 + }, + { + "epoch": 1.0577477123021666, + "grad_norm": 0.022644591412287526, + "learning_rate": 0.0005368285379668125, + "loss": 0.5294, + "step": 20835 + }, + { + "epoch": 1.0580015484001979, + "grad_norm": 0.02520849939725291, + "learning_rate": 0.0005366076101392922, + "loss": 0.4923, + "step": 20840 + }, + { + "epoch": 1.0582553844982294, + "grad_norm": 0.023884449834670873, + "learning_rate": 0.0005363866751258805, + "loss": 0.5227, + "step": 20845 + }, + { + "epoch": 1.058509220596261, + "grad_norm": 0.0246942336074402, + "learning_rate": 0.0005361657329699457, + "loss": 0.5014, + "step": 20850 + }, + { + "epoch": 1.0587630566942925, + "grad_norm": 0.024562310682568606, + "learning_rate": 0.0005359447837148582, + "loss": 0.5134, + "step": 20855 + }, + { + "epoch": 1.059016892792324, + "grad_norm": 0.023069233139811948, + "learning_rate": 0.0005357238274039888, + "loss": 0.5302, + "step": 20860 + }, + { + "epoch": 1.0592707288903556, + "grad_norm": 0.03147995452519133, + "learning_rate": 0.0005355028640807103, + "loss": 0.4765, + "step": 20865 + }, + { + "epoch": 1.059524564988387, + "grad_norm": 0.023156418652504413, + "learning_rate": 0.0005352818937883966, + "loss": 0.5326, + "step": 20870 + }, + { + "epoch": 1.0597784010864184, + "grad_norm": 0.023516340548810546, + "learning_rate": 0.0005350609165704231, + "loss": 0.5014, + "step": 20875 + }, + { + "epoch": 1.06003223718445, + "grad_norm": 0.02173782389595552, + "learning_rate": 0.0005348399324701665, + "loss": 0.4892, + "step": 20880 + }, + { + "epoch": 1.0602860732824815, + "grad_norm": 0.02497605503933453, + "learning_rate": 0.0005346189415310049, + "loss": 0.5238, + "step": 20885 + }, + { + "epoch": 1.060539909380513, + "grad_norm": 0.026640052657665187, + "learning_rate": 0.0005343979437963178, + "loss": 0.5475, + "step": 20890 + }, + { + "epoch": 1.0607937454785445, + "grad_norm": 0.02611282353111445, + "learning_rate": 0.0005341769393094857, + "loss": 0.5454, + "step": 20895 + }, + { + "epoch": 1.061047581576576, + "grad_norm": 0.05663032774630538, + "learning_rate": 0.000533955928113891, + "loss": 0.5268, + "step": 20900 + }, + { + "epoch": 1.0613014176746076, + "grad_norm": 0.027192717640165086, + "learning_rate": 0.000533734910252917, + "loss": 0.5328, + "step": 20905 + }, + { + "epoch": 1.061555253772639, + "grad_norm": 0.0479761926852491, + "learning_rate": 0.0005335138857699482, + "loss": 0.5321, + "step": 20910 + }, + { + "epoch": 1.0618090898706705, + "grad_norm": 0.03449047232824588, + "learning_rate": 0.0005332928547083707, + "loss": 0.5135, + "step": 20915 + }, + { + "epoch": 1.062062925968702, + "grad_norm": 0.03882056462374773, + "learning_rate": 0.0005330718171115721, + "loss": 0.5159, + "step": 20920 + }, + { + "epoch": 1.0623167620667335, + "grad_norm": 0.029049562658363927, + "learning_rate": 0.0005328507730229407, + "loss": 0.5109, + "step": 20925 + }, + { + "epoch": 1.062570598164765, + "grad_norm": 0.03658465415118424, + "learning_rate": 0.0005326297224858661, + "loss": 0.5055, + "step": 20930 + }, + { + "epoch": 1.0628244342627966, + "grad_norm": 0.026304675203290016, + "learning_rate": 0.00053240866554374, + "loss": 0.4788, + "step": 20935 + }, + { + "epoch": 1.063078270360828, + "grad_norm": 0.02473561068343538, + "learning_rate": 0.0005321876022399542, + "loss": 0.4786, + "step": 20940 + }, + { + "epoch": 1.0633321064588594, + "grad_norm": 0.027156090763984255, + "learning_rate": 0.0005319665326179028, + "loss": 0.5033, + "step": 20945 + }, + { + "epoch": 1.063585942556891, + "grad_norm": 0.02473336770832404, + "learning_rate": 0.0005317454567209804, + "loss": 0.4906, + "step": 20950 + }, + { + "epoch": 1.0638397786549225, + "grad_norm": 0.03156233005770823, + "learning_rate": 0.0005315243745925833, + "loss": 0.5012, + "step": 20955 + }, + { + "epoch": 1.064093614752954, + "grad_norm": 0.02170352188784017, + "learning_rate": 0.0005313032862761085, + "loss": 0.5181, + "step": 20960 + }, + { + "epoch": 1.0643474508509856, + "grad_norm": 0.022630967619430002, + "learning_rate": 0.0005310821918149548, + "loss": 0.4965, + "step": 20965 + }, + { + "epoch": 1.0646012869490171, + "grad_norm": 0.021957567917221844, + "learning_rate": 0.0005308610912525218, + "loss": 0.5313, + "step": 20970 + }, + { + "epoch": 1.0648551230470484, + "grad_norm": 0.0830362281234276, + "learning_rate": 0.0005306399846322106, + "loss": 0.5142, + "step": 20975 + }, + { + "epoch": 1.06510895914508, + "grad_norm": 0.021880705238877092, + "learning_rate": 0.000530418871997423, + "loss": 0.4935, + "step": 20980 + }, + { + "epoch": 1.0653627952431115, + "grad_norm": 0.025755954372723704, + "learning_rate": 0.0005301977533915627, + "loss": 0.5006, + "step": 20985 + }, + { + "epoch": 1.065616631341143, + "grad_norm": 0.02290313209246396, + "learning_rate": 0.000529976628858034, + "loss": 0.4983, + "step": 20990 + }, + { + "epoch": 1.0658704674391746, + "grad_norm": 0.02118947492338632, + "learning_rate": 0.0005297554984402426, + "loss": 0.5367, + "step": 20995 + }, + { + "epoch": 1.066124303537206, + "grad_norm": 0.025500946021106714, + "learning_rate": 0.0005295343621815952, + "loss": 0.5239, + "step": 21000 + }, + { + "epoch": 1.0663781396352374, + "grad_norm": 0.02849283742078081, + "learning_rate": 0.0005293132201254996, + "loss": 0.5033, + "step": 21005 + }, + { + "epoch": 1.066631975733269, + "grad_norm": 0.023519558376559214, + "learning_rate": 0.0005290920723153653, + "loss": 0.5283, + "step": 21010 + }, + { + "epoch": 1.0668858118313005, + "grad_norm": 0.023929624757609843, + "learning_rate": 0.0005288709187946022, + "loss": 0.5176, + "step": 21015 + }, + { + "epoch": 1.067139647929332, + "grad_norm": 0.024787372939198683, + "learning_rate": 0.0005286497596066218, + "loss": 0.5717, + "step": 21020 + }, + { + "epoch": 1.0673934840273636, + "grad_norm": 0.02105047392838496, + "learning_rate": 0.0005284285947948364, + "loss": 0.4807, + "step": 21025 + }, + { + "epoch": 1.067647320125395, + "grad_norm": 0.021809780590215072, + "learning_rate": 0.0005282074244026597, + "loss": 0.4876, + "step": 21030 + }, + { + "epoch": 1.0679011562234266, + "grad_norm": 0.02979096066481348, + "learning_rate": 0.0005279862484735059, + "loss": 0.4964, + "step": 21035 + }, + { + "epoch": 1.068154992321458, + "grad_norm": 0.02698075454085126, + "learning_rate": 0.0005277650670507915, + "loss": 0.5425, + "step": 21040 + }, + { + "epoch": 1.0684088284194895, + "grad_norm": 0.02305290180830383, + "learning_rate": 0.0005275438801779327, + "loss": 0.4809, + "step": 21045 + }, + { + "epoch": 1.068662664517521, + "grad_norm": 0.024541759228573837, + "learning_rate": 0.0005273226878983476, + "loss": 0.5089, + "step": 21050 + }, + { + "epoch": 1.0689165006155525, + "grad_norm": 0.022014145757905085, + "learning_rate": 0.0005271014902554552, + "loss": 0.4971, + "step": 21055 + }, + { + "epoch": 1.069170336713584, + "grad_norm": 0.024792727012130904, + "learning_rate": 0.0005268802872926755, + "loss": 0.5472, + "step": 21060 + }, + { + "epoch": 1.0694241728116156, + "grad_norm": 0.02829315368981998, + "learning_rate": 0.0005266590790534292, + "loss": 0.5336, + "step": 21065 + }, + { + "epoch": 1.069678008909647, + "grad_norm": 0.022298962580040806, + "learning_rate": 0.0005264378655811388, + "loss": 0.4895, + "step": 21070 + }, + { + "epoch": 1.0699318450076785, + "grad_norm": 0.029443771995413214, + "learning_rate": 0.0005262166469192273, + "loss": 0.5068, + "step": 21075 + }, + { + "epoch": 1.07018568110571, + "grad_norm": 0.035247934270422096, + "learning_rate": 0.0005259954231111186, + "loss": 0.5181, + "step": 21080 + }, + { + "epoch": 1.0704395172037415, + "grad_norm": 0.02766821768801607, + "learning_rate": 0.000525774194200238, + "loss": 0.5312, + "step": 21085 + }, + { + "epoch": 1.070693353301773, + "grad_norm": 0.034962832983413614, + "learning_rate": 0.0005255529602300118, + "loss": 0.5087, + "step": 21090 + }, + { + "epoch": 1.0709471893998046, + "grad_norm": 0.024917411843302463, + "learning_rate": 0.0005253317212438668, + "loss": 0.5247, + "step": 21095 + }, + { + "epoch": 1.0712010254978361, + "grad_norm": 0.022976680538935538, + "learning_rate": 0.0005251104772852312, + "loss": 0.527, + "step": 21100 + }, + { + "epoch": 1.0714548615958674, + "grad_norm": 0.023641409157934393, + "learning_rate": 0.0005248892283975341, + "loss": 0.5004, + "step": 21105 + }, + { + "epoch": 1.071708697693899, + "grad_norm": 0.026029533102603888, + "learning_rate": 0.0005246679746242058, + "loss": 0.5154, + "step": 21110 + }, + { + "epoch": 1.0719625337919305, + "grad_norm": 0.03545812118043111, + "learning_rate": 0.000524446716008677, + "loss": 0.5032, + "step": 21115 + }, + { + "epoch": 1.072216369889962, + "grad_norm": 0.02325775970323228, + "learning_rate": 0.0005242254525943799, + "loss": 0.5166, + "step": 21120 + }, + { + "epoch": 1.0724702059879936, + "grad_norm": 0.027063279281270988, + "learning_rate": 0.000524004184424747, + "loss": 0.5139, + "step": 21125 + }, + { + "epoch": 1.0727240420860251, + "grad_norm": 0.022406878965903397, + "learning_rate": 0.0005237829115432124, + "loss": 0.5163, + "step": 21130 + }, + { + "epoch": 1.0729778781840567, + "grad_norm": 0.020476946022531705, + "learning_rate": 0.000523561633993211, + "loss": 0.4938, + "step": 21135 + }, + { + "epoch": 1.073231714282088, + "grad_norm": 0.02946149615897215, + "learning_rate": 0.0005233403518181784, + "loss": 0.4777, + "step": 21140 + }, + { + "epoch": 1.0734855503801195, + "grad_norm": 0.022659371359572644, + "learning_rate": 0.000523119065061551, + "loss": 0.52, + "step": 21145 + }, + { + "epoch": 1.073739386478151, + "grad_norm": 0.024333860488001735, + "learning_rate": 0.0005228977737667665, + "loss": 0.5307, + "step": 21150 + }, + { + "epoch": 1.0739932225761826, + "grad_norm": 0.024349542450303005, + "learning_rate": 0.0005226764779772632, + "loss": 0.5321, + "step": 21155 + }, + { + "epoch": 1.074247058674214, + "grad_norm": 0.0349413047957909, + "learning_rate": 0.0005224551777364803, + "loss": 0.5386, + "step": 21160 + }, + { + "epoch": 1.0745008947722456, + "grad_norm": 0.024434904354044745, + "learning_rate": 0.0005222338730878581, + "loss": 0.5224, + "step": 21165 + }, + { + "epoch": 1.0747547308702772, + "grad_norm": 0.022587237289784082, + "learning_rate": 0.0005220125640748375, + "loss": 0.5014, + "step": 21170 + }, + { + "epoch": 1.0750085669683085, + "grad_norm": 0.028695622755737175, + "learning_rate": 0.0005217912507408602, + "loss": 0.5129, + "step": 21175 + }, + { + "epoch": 1.07526240306634, + "grad_norm": 0.04548078606712318, + "learning_rate": 0.0005215699331293692, + "loss": 0.4981, + "step": 21180 + }, + { + "epoch": 1.0755162391643716, + "grad_norm": 0.03372321027470343, + "learning_rate": 0.0005213486112838076, + "loss": 0.4986, + "step": 21185 + }, + { + "epoch": 1.075770075262403, + "grad_norm": 0.03485560152221166, + "learning_rate": 0.0005211272852476204, + "loss": 0.5012, + "step": 21190 + }, + { + "epoch": 1.0760239113604346, + "grad_norm": 0.025328513759871395, + "learning_rate": 0.0005209059550642523, + "loss": 0.536, + "step": 21195 + }, + { + "epoch": 1.0762777474584662, + "grad_norm": 0.0382721168074841, + "learning_rate": 0.0005206846207771496, + "loss": 0.5569, + "step": 21200 + }, + { + "epoch": 1.0765315835564975, + "grad_norm": 0.02520714240837497, + "learning_rate": 0.0005204632824297589, + "loss": 0.5249, + "step": 21205 + }, + { + "epoch": 1.076785419654529, + "grad_norm": 0.028982430714109, + "learning_rate": 0.0005202419400655281, + "loss": 0.5233, + "step": 21210 + }, + { + "epoch": 1.0770392557525605, + "grad_norm": 0.02574048454288774, + "learning_rate": 0.0005200205937279052, + "loss": 0.4889, + "step": 21215 + }, + { + "epoch": 1.077293091850592, + "grad_norm": 0.03233762725205312, + "learning_rate": 0.0005197992434603397, + "loss": 0.5201, + "step": 21220 + }, + { + "epoch": 1.0775469279486236, + "grad_norm": 0.026051642045672115, + "learning_rate": 0.0005195778893062814, + "loss": 0.5596, + "step": 21225 + }, + { + "epoch": 1.0778007640466551, + "grad_norm": 0.034440397145426654, + "learning_rate": 0.000519356531309181, + "loss": 0.508, + "step": 21230 + }, + { + "epoch": 1.0780546001446867, + "grad_norm": 0.024779579773170867, + "learning_rate": 0.0005191351695124902, + "loss": 0.511, + "step": 21235 + }, + { + "epoch": 1.078308436242718, + "grad_norm": 0.027199827628791112, + "learning_rate": 0.000518913803959661, + "loss": 0.5391, + "step": 21240 + }, + { + "epoch": 1.0785622723407495, + "grad_norm": 0.039295388227057915, + "learning_rate": 0.0005186924346941463, + "loss": 0.5159, + "step": 21245 + }, + { + "epoch": 1.078816108438781, + "grad_norm": 0.03140488277705228, + "learning_rate": 0.0005184710617593998, + "loss": 0.5333, + "step": 21250 + }, + { + "epoch": 1.0790699445368126, + "grad_norm": 0.037347453599965476, + "learning_rate": 0.0005182496851988763, + "loss": 0.4879, + "step": 21255 + }, + { + "epoch": 1.0793237806348441, + "grad_norm": 0.030428369817123643, + "learning_rate": 0.0005180283050560304, + "loss": 0.5088, + "step": 21260 + }, + { + "epoch": 1.0795776167328757, + "grad_norm": 0.02612433843324173, + "learning_rate": 0.0005178069213743182, + "loss": 0.4955, + "step": 21265 + }, + { + "epoch": 1.079831452830907, + "grad_norm": 0.028085153442456857, + "learning_rate": 0.0005175855341971961, + "loss": 0.5297, + "step": 21270 + }, + { + "epoch": 1.0800852889289385, + "grad_norm": 0.030147412276925384, + "learning_rate": 0.0005173641435681212, + "loss": 0.4905, + "step": 21275 + }, + { + "epoch": 1.08033912502697, + "grad_norm": 0.029195226257636587, + "learning_rate": 0.0005171427495305517, + "loss": 0.5101, + "step": 21280 + }, + { + "epoch": 1.0805929611250016, + "grad_norm": 0.02573350843933712, + "learning_rate": 0.000516921352127946, + "loss": 0.5075, + "step": 21285 + }, + { + "epoch": 1.0808467972230331, + "grad_norm": 0.03039462615229446, + "learning_rate": 0.0005166999514037631, + "loss": 0.5506, + "step": 21290 + }, + { + "epoch": 1.0811006333210647, + "grad_norm": 0.03422447727787612, + "learning_rate": 0.0005164785474014631, + "loss": 0.4838, + "step": 21295 + }, + { + "epoch": 1.0813544694190962, + "grad_norm": 0.031835929504930464, + "learning_rate": 0.0005162571401645065, + "loss": 0.5303, + "step": 21300 + }, + { + "epoch": 1.0816083055171275, + "grad_norm": 0.024796482258768755, + "learning_rate": 0.0005160357297363541, + "loss": 0.5407, + "step": 21305 + }, + { + "epoch": 1.081862141615159, + "grad_norm": 0.024946672122012212, + "learning_rate": 0.0005158143161604682, + "loss": 0.5406, + "step": 21310 + }, + { + "epoch": 1.0821159777131906, + "grad_norm": 0.028919849891573573, + "learning_rate": 0.0005155928994803108, + "loss": 0.5164, + "step": 21315 + }, + { + "epoch": 1.082369813811222, + "grad_norm": 0.02634430322655897, + "learning_rate": 0.0005153714797393451, + "loss": 0.5272, + "step": 21320 + }, + { + "epoch": 1.0826236499092536, + "grad_norm": 0.034896141494776906, + "learning_rate": 0.0005151500569810345, + "loss": 0.5127, + "step": 21325 + }, + { + "epoch": 1.0828774860072852, + "grad_norm": 0.022623036190050343, + "learning_rate": 0.0005149286312488432, + "loss": 0.5492, + "step": 21330 + }, + { + "epoch": 1.0831313221053165, + "grad_norm": 0.02338043777628199, + "learning_rate": 0.0005147072025862362, + "loss": 0.4818, + "step": 21335 + }, + { + "epoch": 1.083385158203348, + "grad_norm": 0.02630639246261155, + "learning_rate": 0.0005144857710366785, + "loss": 0.5142, + "step": 21340 + }, + { + "epoch": 1.0836389943013796, + "grad_norm": 0.02480597381903302, + "learning_rate": 0.0005142643366436362, + "loss": 0.5441, + "step": 21345 + }, + { + "epoch": 1.083892830399411, + "grad_norm": 0.02501822087153717, + "learning_rate": 0.0005140428994505759, + "loss": 0.5228, + "step": 21350 + }, + { + "epoch": 1.0841466664974426, + "grad_norm": 0.027223628649433902, + "learning_rate": 0.0005138214595009643, + "loss": 0.5148, + "step": 21355 + }, + { + "epoch": 1.0844005025954742, + "grad_norm": 0.022696406307624702, + "learning_rate": 0.0005136000168382693, + "loss": 0.5075, + "step": 21360 + }, + { + "epoch": 1.0846543386935057, + "grad_norm": 0.032940782729276794, + "learning_rate": 0.0005133785715059586, + "loss": 0.5322, + "step": 21365 + }, + { + "epoch": 1.084908174791537, + "grad_norm": 0.022402759329344542, + "learning_rate": 0.0005131571235475012, + "loss": 0.5077, + "step": 21370 + }, + { + "epoch": 1.0851620108895685, + "grad_norm": 0.038649939505366236, + "learning_rate": 0.000512935673006366, + "loss": 0.5236, + "step": 21375 + }, + { + "epoch": 1.0854158469876, + "grad_norm": 0.022969477729405774, + "learning_rate": 0.0005127142199260228, + "loss": 0.4826, + "step": 21380 + }, + { + "epoch": 1.0856696830856316, + "grad_norm": 0.02759754028356563, + "learning_rate": 0.0005124927643499415, + "loss": 0.5351, + "step": 21385 + }, + { + "epoch": 1.0859235191836631, + "grad_norm": 0.026556927936903868, + "learning_rate": 0.000512271306321593, + "loss": 0.5354, + "step": 21390 + }, + { + "epoch": 1.0861773552816947, + "grad_norm": 0.024696980583765446, + "learning_rate": 0.000512049845884448, + "loss": 0.5162, + "step": 21395 + }, + { + "epoch": 1.0864311913797262, + "grad_norm": 0.026456834753896295, + "learning_rate": 0.0005118283830819786, + "loss": 0.4936, + "step": 21400 + }, + { + "epoch": 1.0866850274777575, + "grad_norm": 0.023930744190106705, + "learning_rate": 0.0005116069179576565, + "loss": 0.4917, + "step": 21405 + }, + { + "epoch": 1.086938863575789, + "grad_norm": 0.02349838967686964, + "learning_rate": 0.0005113854505549543, + "loss": 0.5321, + "step": 21410 + }, + { + "epoch": 1.0871926996738206, + "grad_norm": 0.032693372410154034, + "learning_rate": 0.000511163980917345, + "loss": 0.471, + "step": 21415 + }, + { + "epoch": 1.0874465357718521, + "grad_norm": 0.03223543121863213, + "learning_rate": 0.0005109425090883019, + "loss": 0.5017, + "step": 21420 + }, + { + "epoch": 1.0877003718698837, + "grad_norm": 0.027354722210189156, + "learning_rate": 0.0005107210351112986, + "loss": 0.5257, + "step": 21425 + }, + { + "epoch": 1.0879542079679152, + "grad_norm": 0.020761117911752558, + "learning_rate": 0.0005104995590298098, + "loss": 0.5032, + "step": 21430 + }, + { + "epoch": 1.0882080440659467, + "grad_norm": 0.025848466019290892, + "learning_rate": 0.0005102780808873098, + "loss": 0.4959, + "step": 21435 + }, + { + "epoch": 1.088461880163978, + "grad_norm": 0.026496761790380605, + "learning_rate": 0.000510056600727274, + "loss": 0.4579, + "step": 21440 + }, + { + "epoch": 1.0887157162620096, + "grad_norm": 0.024422392773643196, + "learning_rate": 0.0005098351185931775, + "loss": 0.503, + "step": 21445 + }, + { + "epoch": 1.0889695523600411, + "grad_norm": 0.04356332488795266, + "learning_rate": 0.0005096136345284963, + "loss": 0.5012, + "step": 21450 + }, + { + "epoch": 1.0892233884580726, + "grad_norm": 0.025205409841538344, + "learning_rate": 0.0005093921485767066, + "loss": 0.5125, + "step": 21455 + }, + { + "epoch": 1.0894772245561042, + "grad_norm": 0.03256381052307056, + "learning_rate": 0.0005091706607812848, + "loss": 0.5021, + "step": 21460 + }, + { + "epoch": 1.0897310606541357, + "grad_norm": 0.022789084607815834, + "learning_rate": 0.0005089491711857083, + "loss": 0.5027, + "step": 21465 + }, + { + "epoch": 1.089984896752167, + "grad_norm": 0.022777957966430507, + "learning_rate": 0.0005087276798334539, + "loss": 0.4724, + "step": 21470 + }, + { + "epoch": 1.0902387328501986, + "grad_norm": 0.024179892808939247, + "learning_rate": 0.0005085061867679995, + "loss": 0.5076, + "step": 21475 + }, + { + "epoch": 1.09049256894823, + "grad_norm": 0.023288334670357742, + "learning_rate": 0.0005082846920328232, + "loss": 0.5211, + "step": 21480 + }, + { + "epoch": 1.0907464050462616, + "grad_norm": 0.024409120224983153, + "learning_rate": 0.0005080631956714029, + "loss": 0.5234, + "step": 21485 + }, + { + "epoch": 1.0910002411442932, + "grad_norm": 0.02599437175288467, + "learning_rate": 0.0005078416977272178, + "loss": 0.5029, + "step": 21490 + }, + { + "epoch": 1.0912540772423247, + "grad_norm": 0.022222357794203767, + "learning_rate": 0.0005076201982437464, + "loss": 0.5228, + "step": 21495 + }, + { + "epoch": 1.0915079133403562, + "grad_norm": 0.02880385866020692, + "learning_rate": 0.0005073986972644681, + "loss": 0.5447, + "step": 21500 + }, + { + "epoch": 1.0917617494383876, + "grad_norm": 0.020822683076055126, + "learning_rate": 0.0005071771948328624, + "loss": 0.4931, + "step": 21505 + }, + { + "epoch": 1.092015585536419, + "grad_norm": 0.02398468484355743, + "learning_rate": 0.0005069556909924092, + "loss": 0.482, + "step": 21510 + }, + { + "epoch": 1.0922694216344506, + "grad_norm": 0.02807227467217408, + "learning_rate": 0.0005067341857865885, + "loss": 0.5237, + "step": 21515 + }, + { + "epoch": 1.0925232577324822, + "grad_norm": 0.02290810606351165, + "learning_rate": 0.0005065126792588807, + "loss": 0.5166, + "step": 21520 + }, + { + "epoch": 1.0927770938305137, + "grad_norm": 0.028239546173554175, + "learning_rate": 0.0005062911714527664, + "loss": 0.5152, + "step": 21525 + }, + { + "epoch": 1.0930309299285452, + "grad_norm": 0.026605118843009996, + "learning_rate": 0.0005060696624117266, + "loss": 0.4891, + "step": 21530 + }, + { + "epoch": 1.0932847660265765, + "grad_norm": 0.03451731867184281, + "learning_rate": 0.0005058481521792424, + "loss": 0.5054, + "step": 21535 + }, + { + "epoch": 1.093538602124608, + "grad_norm": 0.022870995542147783, + "learning_rate": 0.000505626640798795, + "loss": 0.5094, + "step": 21540 + }, + { + "epoch": 1.0937924382226396, + "grad_norm": 0.023384178570215613, + "learning_rate": 0.000505405128313866, + "loss": 0.4954, + "step": 21545 + }, + { + "epoch": 1.0940462743206711, + "grad_norm": 0.021332217474057745, + "learning_rate": 0.0005051836147679374, + "loss": 0.4802, + "step": 21550 + }, + { + "epoch": 1.0943001104187027, + "grad_norm": 0.025123000830193358, + "learning_rate": 0.000504962100204491, + "loss": 0.4802, + "step": 21555 + }, + { + "epoch": 1.0945539465167342, + "grad_norm": 0.025473977289128598, + "learning_rate": 0.0005047405846670091, + "loss": 0.4966, + "step": 21560 + }, + { + "epoch": 1.0948077826147657, + "grad_norm": 0.026371420615893433, + "learning_rate": 0.0005045190681989742, + "loss": 0.4988, + "step": 21565 + }, + { + "epoch": 1.095061618712797, + "grad_norm": 0.022460113412766883, + "learning_rate": 0.0005042975508438687, + "loss": 0.4925, + "step": 21570 + }, + { + "epoch": 1.0953154548108286, + "grad_norm": 0.02307578944045605, + "learning_rate": 0.0005040760326451752, + "loss": 0.4906, + "step": 21575 + }, + { + "epoch": 1.0955692909088601, + "grad_norm": 0.03059351913144231, + "learning_rate": 0.000503854513646377, + "loss": 0.4782, + "step": 21580 + }, + { + "epoch": 1.0958231270068917, + "grad_norm": 0.024544268108570917, + "learning_rate": 0.000503632993890957, + "loss": 0.4977, + "step": 21585 + }, + { + "epoch": 1.0960769631049232, + "grad_norm": 0.021615409644089055, + "learning_rate": 0.0005034114734223983, + "loss": 0.5147, + "step": 21590 + }, + { + "epoch": 1.0963307992029547, + "grad_norm": 0.025081101527570617, + "learning_rate": 0.0005031899522841845, + "loss": 0.5216, + "step": 21595 + }, + { + "epoch": 1.096584635300986, + "grad_norm": 0.02738150628716518, + "learning_rate": 0.0005029684305197989, + "loss": 0.4893, + "step": 21600 + }, + { + "epoch": 1.0968384713990176, + "grad_norm": 0.030632362820872255, + "learning_rate": 0.000502746908172725, + "loss": 0.512, + "step": 21605 + }, + { + "epoch": 1.0970923074970491, + "grad_norm": 0.03751097192664053, + "learning_rate": 0.000502525385286447, + "loss": 0.4916, + "step": 21610 + }, + { + "epoch": 1.0973461435950806, + "grad_norm": 0.03045939530777982, + "learning_rate": 0.0005023038619044485, + "loss": 0.4898, + "step": 21615 + }, + { + "epoch": 1.0975999796931122, + "grad_norm": 0.025961661067633108, + "learning_rate": 0.0005020823380702133, + "loss": 0.5124, + "step": 21620 + }, + { + "epoch": 1.0978538157911437, + "grad_norm": 0.026346768663115368, + "learning_rate": 0.0005018608138272255, + "loss": 0.4997, + "step": 21625 + }, + { + "epoch": 1.0981076518891753, + "grad_norm": 0.035779285163878446, + "learning_rate": 0.0005016392892189692, + "loss": 0.5007, + "step": 21630 + }, + { + "epoch": 1.0983614879872066, + "grad_norm": 0.022963332458905212, + "learning_rate": 0.0005014177642889286, + "loss": 0.5089, + "step": 21635 + }, + { + "epoch": 1.098615324085238, + "grad_norm": 0.027026555042418753, + "learning_rate": 0.000501196239080588, + "loss": 0.4791, + "step": 21640 + }, + { + "epoch": 1.0988691601832696, + "grad_norm": 0.026351077220473095, + "learning_rate": 0.0005009747136374317, + "loss": 0.5035, + "step": 21645 + }, + { + "epoch": 1.0991229962813012, + "grad_norm": 0.02173089121604654, + "learning_rate": 0.0005007531880029438, + "loss": 0.4921, + "step": 21650 + }, + { + "epoch": 1.0993768323793327, + "grad_norm": 0.02864481532801961, + "learning_rate": 0.000500531662220609, + "loss": 0.5183, + "step": 21655 + }, + { + "epoch": 1.0996306684773642, + "grad_norm": 0.02446699653140137, + "learning_rate": 0.0005003101363339114, + "loss": 0.5084, + "step": 21660 + }, + { + "epoch": 1.0998845045753955, + "grad_norm": 0.023667829604516112, + "learning_rate": 0.0005000886103863355, + "loss": 0.4801, + "step": 21665 + }, + { + "epoch": 1.100138340673427, + "grad_norm": 0.023439234864149394, + "learning_rate": 0.0004998670844213661, + "loss": 0.5036, + "step": 21670 + }, + { + "epoch": 1.1003921767714586, + "grad_norm": 0.027040266970354027, + "learning_rate": 0.0004996455584824873, + "loss": 0.5186, + "step": 21675 + }, + { + "epoch": 1.1006460128694902, + "grad_norm": 0.0317481536525307, + "learning_rate": 0.0004994240326131837, + "loss": 0.5298, + "step": 21680 + }, + { + "epoch": 1.1008998489675217, + "grad_norm": 0.023747077663502984, + "learning_rate": 0.0004992025068569395, + "loss": 0.5139, + "step": 21685 + }, + { + "epoch": 1.1011536850655532, + "grad_norm": 0.035914229145339024, + "learning_rate": 0.0004989809812572392, + "loss": 0.553, + "step": 21690 + }, + { + "epoch": 1.1014075211635848, + "grad_norm": 0.022286343442528934, + "learning_rate": 0.0004987594558575673, + "loss": 0.511, + "step": 21695 + }, + { + "epoch": 1.1016613572616163, + "grad_norm": 0.032836479339666325, + "learning_rate": 0.0004985379307014079, + "loss": 0.5277, + "step": 21700 + }, + { + "epoch": 1.1019151933596476, + "grad_norm": 0.021909939697584042, + "learning_rate": 0.0004983164058322455, + "loss": 0.5027, + "step": 21705 + }, + { + "epoch": 1.1021690294576791, + "grad_norm": 0.036886073050320635, + "learning_rate": 0.000498094881293564, + "loss": 0.5319, + "step": 21710 + }, + { + "epoch": 1.1024228655557107, + "grad_norm": 0.04010799722360534, + "learning_rate": 0.000497873357128848, + "loss": 0.493, + "step": 21715 + }, + { + "epoch": 1.1026767016537422, + "grad_norm": 0.03364367343902673, + "learning_rate": 0.0004976518333815814, + "loss": 0.5138, + "step": 21720 + }, + { + "epoch": 1.1029305377517737, + "grad_norm": 0.03065112641730614, + "learning_rate": 0.0004974303100952483, + "loss": 0.4981, + "step": 21725 + }, + { + "epoch": 1.1031843738498053, + "grad_norm": 0.02896054754547121, + "learning_rate": 0.0004972087873133323, + "loss": 0.5215, + "step": 21730 + }, + { + "epoch": 1.1034382099478366, + "grad_norm": 0.03710991307969362, + "learning_rate": 0.0004969872650793176, + "loss": 0.4901, + "step": 21735 + }, + { + "epoch": 1.1036920460458681, + "grad_norm": 0.022960913289772747, + "learning_rate": 0.0004967657434366877, + "loss": 0.4889, + "step": 21740 + }, + { + "epoch": 1.1039458821438997, + "grad_norm": 0.025188988881946872, + "learning_rate": 0.0004965442224289262, + "loss": 0.4756, + "step": 21745 + }, + { + "epoch": 1.1041997182419312, + "grad_norm": 0.022061989690903828, + "learning_rate": 0.0004963227020995167, + "loss": 0.5295, + "step": 21750 + }, + { + "epoch": 1.1044535543399627, + "grad_norm": 0.03873982984670393, + "learning_rate": 0.0004961011824919422, + "loss": 0.5255, + "step": 21755 + }, + { + "epoch": 1.1047073904379943, + "grad_norm": 0.024110509410144455, + "learning_rate": 0.0004958796636496864, + "loss": 0.4762, + "step": 21760 + }, + { + "epoch": 1.1049612265360258, + "grad_norm": 0.024542539175151357, + "learning_rate": 0.0004956581456162319, + "loss": 0.4792, + "step": 21765 + }, + { + "epoch": 1.105215062634057, + "grad_norm": 0.022363840834245956, + "learning_rate": 0.0004954366284350617, + "loss": 0.5118, + "step": 21770 + }, + { + "epoch": 1.1054688987320886, + "grad_norm": 0.025982163556048574, + "learning_rate": 0.0004952151121496587, + "loss": 0.5072, + "step": 21775 + }, + { + "epoch": 1.1057227348301202, + "grad_norm": 0.02270591702590939, + "learning_rate": 0.0004949935968035054, + "loss": 0.504, + "step": 21780 + }, + { + "epoch": 1.1059765709281517, + "grad_norm": 0.022406489019998924, + "learning_rate": 0.000494772082440084, + "loss": 0.4982, + "step": 21785 + }, + { + "epoch": 1.1062304070261832, + "grad_norm": 0.025696915164668793, + "learning_rate": 0.0004945505691028769, + "loss": 0.5271, + "step": 21790 + }, + { + "epoch": 1.1064842431242148, + "grad_norm": 0.02202726327814844, + "learning_rate": 0.0004943290568353657, + "loss": 0.4841, + "step": 21795 + }, + { + "epoch": 1.106738079222246, + "grad_norm": 0.027349694310620433, + "learning_rate": 0.0004941075456810324, + "loss": 0.4977, + "step": 21800 + }, + { + "epoch": 1.1069919153202776, + "grad_norm": 0.021841600344543598, + "learning_rate": 0.0004938860356833585, + "loss": 0.4939, + "step": 21805 + }, + { + "epoch": 1.1072457514183092, + "grad_norm": 0.024165588262214474, + "learning_rate": 0.0004936645268858253, + "loss": 0.4974, + "step": 21810 + }, + { + "epoch": 1.1074995875163407, + "grad_norm": 0.020070193682354385, + "learning_rate": 0.000493443019331914, + "loss": 0.4816, + "step": 21815 + }, + { + "epoch": 1.1077534236143722, + "grad_norm": 0.024862032516753653, + "learning_rate": 0.0004932215130651052, + "loss": 0.5147, + "step": 21820 + }, + { + "epoch": 1.1080072597124038, + "grad_norm": 0.025675685849875715, + "learning_rate": 0.0004930000081288797, + "loss": 0.4855, + "step": 21825 + }, + { + "epoch": 1.1082610958104353, + "grad_norm": 0.02461817640839028, + "learning_rate": 0.0004927785045667173, + "loss": 0.4864, + "step": 21830 + }, + { + "epoch": 1.1085149319084666, + "grad_norm": 0.023549777823439446, + "learning_rate": 0.0004925570024220987, + "loss": 0.5247, + "step": 21835 + }, + { + "epoch": 1.1087687680064982, + "grad_norm": 0.022915635775272027, + "learning_rate": 0.0004923355017385035, + "loss": 0.4903, + "step": 21840 + }, + { + "epoch": 1.1090226041045297, + "grad_norm": 0.028502210315459423, + "learning_rate": 0.000492114002559411, + "loss": 0.4815, + "step": 21845 + }, + { + "epoch": 1.1092764402025612, + "grad_norm": 0.03661671337568482, + "learning_rate": 0.0004918925049283005, + "loss": 0.5259, + "step": 21850 + }, + { + "epoch": 1.1095302763005928, + "grad_norm": 0.034776988326917536, + "learning_rate": 0.0004916710088886508, + "loss": 0.5042, + "step": 21855 + }, + { + "epoch": 1.1097841123986243, + "grad_norm": 0.022641065605251254, + "learning_rate": 0.0004914495144839406, + "loss": 0.4759, + "step": 21860 + }, + { + "epoch": 1.1100379484966556, + "grad_norm": 0.02470161392259303, + "learning_rate": 0.0004912280217576481, + "loss": 0.484, + "step": 21865 + }, + { + "epoch": 1.1102917845946871, + "grad_norm": 0.02216524988218757, + "learning_rate": 0.0004910065307532511, + "loss": 0.514, + "step": 21870 + }, + { + "epoch": 1.1105456206927187, + "grad_norm": 0.022165983415213607, + "learning_rate": 0.0004907850415142273, + "loss": 0.4873, + "step": 21875 + }, + { + "epoch": 1.1107994567907502, + "grad_norm": 0.02581011869369852, + "learning_rate": 0.0004905635540840539, + "loss": 0.4913, + "step": 21880 + }, + { + "epoch": 1.1110532928887817, + "grad_norm": 0.02854559216993533, + "learning_rate": 0.0004903420685062077, + "loss": 0.4985, + "step": 21885 + }, + { + "epoch": 1.1113071289868133, + "grad_norm": 0.031286700118217534, + "learning_rate": 0.0004901205848241654, + "loss": 0.5251, + "step": 21890 + }, + { + "epoch": 1.1115609650848448, + "grad_norm": 0.027160788459885017, + "learning_rate": 0.0004898991030814028, + "loss": 0.5148, + "step": 21895 + }, + { + "epoch": 1.1118148011828761, + "grad_norm": 0.024176529505431532, + "learning_rate": 0.000489677623321396, + "loss": 0.4936, + "step": 21900 + }, + { + "epoch": 1.1120686372809077, + "grad_norm": 0.039337612553737564, + "learning_rate": 0.0004894561455876204, + "loss": 0.5253, + "step": 21905 + }, + { + "epoch": 1.1123224733789392, + "grad_norm": 0.022635785746774068, + "learning_rate": 0.0004892346699235507, + "loss": 0.5176, + "step": 21910 + }, + { + "epoch": 1.1125763094769707, + "grad_norm": 0.02559706784297847, + "learning_rate": 0.0004890131963726617, + "loss": 0.5059, + "step": 21915 + }, + { + "epoch": 1.1128301455750023, + "grad_norm": 0.021313731704569797, + "learning_rate": 0.0004887917249784275, + "loss": 0.4962, + "step": 21920 + }, + { + "epoch": 1.1130839816730338, + "grad_norm": 0.03607627354015021, + "learning_rate": 0.0004885702557843217, + "loss": 0.4859, + "step": 21925 + }, + { + "epoch": 1.113337817771065, + "grad_norm": 0.02559178946483111, + "learning_rate": 0.0004883487888338177, + "loss": 0.4888, + "step": 21930 + }, + { + "epoch": 1.1135916538690966, + "grad_norm": 0.023591231457929233, + "learning_rate": 0.0004881273241703884, + "loss": 0.5408, + "step": 21935 + }, + { + "epoch": 1.1138454899671282, + "grad_norm": 0.032232207206550165, + "learning_rate": 0.00048790586183750605, + "loss": 0.4845, + "step": 21940 + }, + { + "epoch": 1.1140993260651597, + "grad_norm": 0.023369247501599504, + "learning_rate": 0.0004876844018786428, + "loss": 0.5135, + "step": 21945 + }, + { + "epoch": 1.1143531621631912, + "grad_norm": 0.021929243467260426, + "learning_rate": 0.00048746294433727003, + "loss": 0.5041, + "step": 21950 + }, + { + "epoch": 1.1146069982612228, + "grad_norm": 0.024271700794895327, + "learning_rate": 0.0004872414892568585, + "loss": 0.5053, + "step": 21955 + }, + { + "epoch": 1.1148608343592543, + "grad_norm": 0.02341400981114356, + "learning_rate": 0.00048702003668087926, + "loss": 0.5172, + "step": 21960 + }, + { + "epoch": 1.1151146704572856, + "grad_norm": 0.024390827336790148, + "learning_rate": 0.00048679858665280206, + "loss": 0.4964, + "step": 21965 + }, + { + "epoch": 1.1153685065553172, + "grad_norm": 0.023466277161900626, + "learning_rate": 0.00048657713921609647, + "loss": 0.5098, + "step": 21970 + }, + { + "epoch": 1.1156223426533487, + "grad_norm": 0.03522806209015746, + "learning_rate": 0.0004863556944142316, + "loss": 0.5102, + "step": 21975 + }, + { + "epoch": 1.1158761787513802, + "grad_norm": 0.023062324780772376, + "learning_rate": 0.00048613425229067575, + "loss": 0.4558, + "step": 21980 + }, + { + "epoch": 1.1161300148494118, + "grad_norm": 0.029175018799990165, + "learning_rate": 0.0004859128128888971, + "loss": 0.5498, + "step": 21985 + }, + { + "epoch": 1.1163838509474433, + "grad_norm": 0.023244545510022245, + "learning_rate": 0.000485691376252363, + "loss": 0.5382, + "step": 21990 + }, + { + "epoch": 1.1166376870454748, + "grad_norm": 0.02540046969404091, + "learning_rate": 0.0004854699424245404, + "loss": 0.5167, + "step": 21995 + }, + { + "epoch": 1.1168915231435061, + "grad_norm": 0.023122619586228372, + "learning_rate": 0.00048524851144889563, + "loss": 0.4975, + "step": 22000 + }, + { + "epoch": 1.1171453592415377, + "grad_norm": 0.028102268868484732, + "learning_rate": 0.0004850270833688945, + "loss": 0.5306, + "step": 22005 + }, + { + "epoch": 1.1173991953395692, + "grad_norm": 0.028295243268877415, + "learning_rate": 0.0004848056582280022, + "loss": 0.485, + "step": 22010 + }, + { + "epoch": 1.1176530314376008, + "grad_norm": 0.023662648309147206, + "learning_rate": 0.00048458423606968337, + "loss": 0.5316, + "step": 22015 + }, + { + "epoch": 1.1179068675356323, + "grad_norm": 0.023956513141186277, + "learning_rate": 0.0004843628169374022, + "loss": 0.4989, + "step": 22020 + }, + { + "epoch": 1.1181607036336638, + "grad_norm": 0.03533645288614336, + "learning_rate": 0.0004841414008746221, + "loss": 0.4866, + "step": 22025 + }, + { + "epoch": 1.1184145397316954, + "grad_norm": 0.02362689256504165, + "learning_rate": 0.0004839199879248059, + "loss": 0.5038, + "step": 22030 + }, + { + "epoch": 1.1186683758297267, + "grad_norm": 0.022341925551004328, + "learning_rate": 0.00048369857813141586, + "loss": 0.4776, + "step": 22035 + }, + { + "epoch": 1.1189222119277582, + "grad_norm": 0.02777412452937675, + "learning_rate": 0.00048347717153791365, + "loss": 0.4765, + "step": 22040 + }, + { + "epoch": 1.1191760480257897, + "grad_norm": 0.03839285711516218, + "learning_rate": 0.0004832557681877603, + "loss": 0.4991, + "step": 22045 + }, + { + "epoch": 1.1194298841238213, + "grad_norm": 0.02786618691587262, + "learning_rate": 0.0004830343681244161, + "loss": 0.5314, + "step": 22050 + }, + { + "epoch": 1.1196837202218528, + "grad_norm": 0.02177076732255751, + "learning_rate": 0.0004828129713913409, + "loss": 0.5031, + "step": 22055 + }, + { + "epoch": 1.1199375563198843, + "grad_norm": 0.021374169537151152, + "learning_rate": 0.0004825915780319937, + "loss": 0.4887, + "step": 22060 + }, + { + "epoch": 1.1201913924179157, + "grad_norm": 0.021242873714302467, + "learning_rate": 0.00048237018808983286, + "loss": 0.5098, + "step": 22065 + }, + { + "epoch": 1.1204452285159472, + "grad_norm": 0.03862878033938459, + "learning_rate": 0.0004821488016083162, + "loss": 0.4902, + "step": 22070 + }, + { + "epoch": 1.1206990646139787, + "grad_norm": 0.022914131678326958, + "learning_rate": 0.0004819274186309005, + "loss": 0.5181, + "step": 22075 + }, + { + "epoch": 1.1209529007120103, + "grad_norm": 0.02444901284526625, + "learning_rate": 0.0004817060392010427, + "loss": 0.519, + "step": 22080 + }, + { + "epoch": 1.1212067368100418, + "grad_norm": 0.027511001579787645, + "learning_rate": 0.0004814846633621981, + "loss": 0.5124, + "step": 22085 + }, + { + "epoch": 1.1214605729080733, + "grad_norm": 0.02170627891570555, + "learning_rate": 0.0004812632911578218, + "loss": 0.4825, + "step": 22090 + }, + { + "epoch": 1.1217144090061049, + "grad_norm": 0.023975365604404518, + "learning_rate": 0.000481041922631368, + "loss": 0.5226, + "step": 22095 + }, + { + "epoch": 1.1219682451041362, + "grad_norm": 0.023405836814547467, + "learning_rate": 0.00048082055782629017, + "loss": 0.5045, + "step": 22100 + }, + { + "epoch": 1.1222220812021677, + "grad_norm": 0.023988948357005786, + "learning_rate": 0.00048059919678604125, + "loss": 0.5046, + "step": 22105 + }, + { + "epoch": 1.1224759173001992, + "grad_norm": 0.019265243232751396, + "learning_rate": 0.0004803778395540733, + "loss": 0.4932, + "step": 22110 + }, + { + "epoch": 1.1227297533982308, + "grad_norm": 0.020015777068000878, + "learning_rate": 0.0004801564861738375, + "loss": 0.4692, + "step": 22115 + }, + { + "epoch": 1.1229835894962623, + "grad_norm": 0.02271364249987384, + "learning_rate": 0.00047993513668878455, + "loss": 0.5071, + "step": 22120 + }, + { + "epoch": 1.1232374255942938, + "grad_norm": 0.0283767253917993, + "learning_rate": 0.0004797137911423642, + "loss": 0.49, + "step": 22125 + }, + { + "epoch": 1.1234912616923252, + "grad_norm": 0.03057189959396885, + "learning_rate": 0.00047949244957802545, + "loss": 0.5132, + "step": 22130 + }, + { + "epoch": 1.1237450977903567, + "grad_norm": 0.02307975684077437, + "learning_rate": 0.0004792711120392165, + "loss": 0.5052, + "step": 22135 + }, + { + "epoch": 1.1239989338883882, + "grad_norm": 0.022609530542469808, + "learning_rate": 0.00047904977856938496, + "loss": 0.4825, + "step": 22140 + }, + { + "epoch": 1.1242527699864198, + "grad_norm": 0.02667533885777748, + "learning_rate": 0.0004788284492119775, + "loss": 0.4987, + "step": 22145 + }, + { + "epoch": 1.1245066060844513, + "grad_norm": 0.03401959306286431, + "learning_rate": 0.00047860712401043976, + "loss": 0.4834, + "step": 22150 + }, + { + "epoch": 1.1247604421824828, + "grad_norm": 0.03470866794755956, + "learning_rate": 0.00047838580300821695, + "loss": 0.4963, + "step": 22155 + }, + { + "epoch": 1.1250142782805144, + "grad_norm": 0.023970283863950843, + "learning_rate": 0.0004781644862487532, + "loss": 0.4791, + "step": 22160 + }, + { + "epoch": 1.1252681143785457, + "grad_norm": 0.02312660422996864, + "learning_rate": 0.000477943173775492, + "loss": 0.5058, + "step": 22165 + }, + { + "epoch": 1.1255219504765772, + "grad_norm": 0.02435017898171767, + "learning_rate": 0.00047772186563187566, + "loss": 0.4919, + "step": 22170 + }, + { + "epoch": 1.1257757865746088, + "grad_norm": 0.023843442848662726, + "learning_rate": 0.00047750056186134603, + "loss": 0.5119, + "step": 22175 + }, + { + "epoch": 1.1260296226726403, + "grad_norm": 0.027676021986730217, + "learning_rate": 0.00047727926250734393, + "loss": 0.5182, + "step": 22180 + }, + { + "epoch": 1.1262834587706718, + "grad_norm": 0.02668274381778642, + "learning_rate": 0.00047705796761330927, + "loss": 0.5088, + "step": 22185 + }, + { + "epoch": 1.1265372948687034, + "grad_norm": 0.04197766800019646, + "learning_rate": 0.00047683667722268116, + "loss": 0.5212, + "step": 22190 + }, + { + "epoch": 1.1267911309667347, + "grad_norm": 0.04081434517250474, + "learning_rate": 0.0004766153913788976, + "loss": 0.4979, + "step": 22195 + }, + { + "epoch": 1.1270449670647662, + "grad_norm": 0.03853608127797696, + "learning_rate": 0.00047639411012539626, + "loss": 0.5104, + "step": 22200 + }, + { + "epoch": 1.1272988031627977, + "grad_norm": 0.02809744226799178, + "learning_rate": 0.0004761728335056134, + "loss": 0.4911, + "step": 22205 + }, + { + "epoch": 1.1275526392608293, + "grad_norm": 0.027375956086551304, + "learning_rate": 0.00047595156156298455, + "loss": 0.4629, + "step": 22210 + }, + { + "epoch": 1.1278064753588608, + "grad_norm": 0.026405679471089256, + "learning_rate": 0.0004757302943409442, + "loss": 0.5249, + "step": 22215 + }, + { + "epoch": 1.1280603114568923, + "grad_norm": 0.02422688158994748, + "learning_rate": 0.000475509031882926, + "loss": 0.4936, + "step": 22220 + }, + { + "epoch": 1.1283141475549239, + "grad_norm": 0.02310122351518651, + "learning_rate": 0.00047528777423236276, + "loss": 0.4977, + "step": 22225 + }, + { + "epoch": 1.1285679836529554, + "grad_norm": 0.027179185579231067, + "learning_rate": 0.00047506652143268615, + "loss": 0.527, + "step": 22230 + }, + { + "epoch": 1.1288218197509867, + "grad_norm": 0.02555209948452092, + "learning_rate": 0.0004748452735273271, + "loss": 0.4867, + "step": 22235 + }, + { + "epoch": 1.1290756558490183, + "grad_norm": 0.024286447301691477, + "learning_rate": 0.0004746240305597154, + "loss": 0.4958, + "step": 22240 + }, + { + "epoch": 1.1293294919470498, + "grad_norm": 0.028126452190403196, + "learning_rate": 0.0004744027925732799, + "loss": 0.5077, + "step": 22245 + }, + { + "epoch": 1.1295833280450813, + "grad_norm": 0.031834226850280145, + "learning_rate": 0.0004741815596114486, + "loss": 0.4713, + "step": 22250 + }, + { + "epoch": 1.1298371641431129, + "grad_norm": 0.025659978434002053, + "learning_rate": 0.00047396033171764825, + "loss": 0.4642, + "step": 22255 + }, + { + "epoch": 1.1300910002411442, + "grad_norm": 0.029700890203732126, + "learning_rate": 0.00047373910893530504, + "loss": 0.5396, + "step": 22260 + }, + { + "epoch": 1.1303448363391757, + "grad_norm": 0.022579372378051864, + "learning_rate": 0.00047351789130784384, + "loss": 0.4994, + "step": 22265 + }, + { + "epoch": 1.1305986724372072, + "grad_norm": 0.029980851132857454, + "learning_rate": 0.00047329667887868846, + "loss": 0.4837, + "step": 22270 + }, + { + "epoch": 1.1308525085352388, + "grad_norm": 0.038482466915576954, + "learning_rate": 0.00047307547169126183, + "loss": 0.4999, + "step": 22275 + }, + { + "epoch": 1.1311063446332703, + "grad_norm": 0.028758075959962347, + "learning_rate": 0.0004728542697889859, + "loss": 0.4943, + "step": 22280 + }, + { + "epoch": 1.1313601807313018, + "grad_norm": 0.02302067520972011, + "learning_rate": 0.00047263307321528136, + "loss": 0.5058, + "step": 22285 + }, + { + "epoch": 1.1316140168293334, + "grad_norm": 0.02338759212969084, + "learning_rate": 0.0004724118820135681, + "loss": 0.5417, + "step": 22290 + }, + { + "epoch": 1.131867852927365, + "grad_norm": 0.024240682457677416, + "learning_rate": 0.00047219069622726485, + "loss": 0.5429, + "step": 22295 + }, + { + "epoch": 1.1321216890253962, + "grad_norm": 0.03240119123904312, + "learning_rate": 0.0004719695158997892, + "loss": 0.4872, + "step": 22300 + }, + { + "epoch": 1.1323755251234278, + "grad_norm": 0.027587688170413006, + "learning_rate": 0.00047174834107455784, + "loss": 0.5008, + "step": 22305 + }, + { + "epoch": 1.1326293612214593, + "grad_norm": 0.026389922450704612, + "learning_rate": 0.00047152717179498624, + "loss": 0.5294, + "step": 22310 + }, + { + "epoch": 1.1328831973194908, + "grad_norm": 0.031230602354335214, + "learning_rate": 0.00047130600810448855, + "loss": 0.4712, + "step": 22315 + }, + { + "epoch": 1.1331370334175224, + "grad_norm": 0.02765524085794462, + "learning_rate": 0.0004710848500464786, + "loss": 0.526, + "step": 22320 + }, + { + "epoch": 1.133390869515554, + "grad_norm": 0.023377797728668624, + "learning_rate": 0.0004708636976643684, + "loss": 0.4667, + "step": 22325 + }, + { + "epoch": 1.1336447056135852, + "grad_norm": 0.024350148694776152, + "learning_rate": 0.00047064255100156904, + "loss": 0.4631, + "step": 22330 + }, + { + "epoch": 1.1338985417116167, + "grad_norm": 0.023281128309858613, + "learning_rate": 0.00047042141010149053, + "loss": 0.5129, + "step": 22335 + }, + { + "epoch": 1.1341523778096483, + "grad_norm": 0.030603931112854497, + "learning_rate": 0.0004702002750075417, + "loss": 0.517, + "step": 22340 + }, + { + "epoch": 1.1344062139076798, + "grad_norm": 0.024716062309770172, + "learning_rate": 0.0004699791457631303, + "loss": 0.4932, + "step": 22345 + }, + { + "epoch": 1.1346600500057114, + "grad_norm": 0.10774252187329189, + "learning_rate": 0.00046975802241166283, + "loss": 0.5326, + "step": 22350 + }, + { + "epoch": 1.1349138861037429, + "grad_norm": 0.029932485775150907, + "learning_rate": 0.00046953690499654477, + "loss": 0.5271, + "step": 22355 + }, + { + "epoch": 1.1351677222017744, + "grad_norm": 0.02306648216314524, + "learning_rate": 0.0004693157935611803, + "loss": 0.4831, + "step": 22360 + }, + { + "epoch": 1.1354215582998057, + "grad_norm": 0.024003978824830057, + "learning_rate": 0.0004690946881489726, + "loss": 0.4986, + "step": 22365 + }, + { + "epoch": 1.1356753943978373, + "grad_norm": 0.021905055146049835, + "learning_rate": 0.00046887358880332345, + "loss": 0.5525, + "step": 22370 + }, + { + "epoch": 1.1359292304958688, + "grad_norm": 0.022848413649515294, + "learning_rate": 0.00046865249556763344, + "loss": 0.529, + "step": 22375 + }, + { + "epoch": 1.1361830665939003, + "grad_norm": 0.03605177144637833, + "learning_rate": 0.0004684314084853024, + "loss": 0.4814, + "step": 22380 + }, + { + "epoch": 1.1364369026919319, + "grad_norm": 0.056644899234727326, + "learning_rate": 0.0004682103275997284, + "loss": 0.546, + "step": 22385 + }, + { + "epoch": 1.1366907387899634, + "grad_norm": 0.032741353664718975, + "learning_rate": 0.00046798925295430863, + "loss": 0.492, + "step": 22390 + }, + { + "epoch": 1.1369445748879947, + "grad_norm": 0.024068036984486157, + "learning_rate": 0.00046776818459243874, + "loss": 0.4783, + "step": 22395 + }, + { + "epoch": 1.1371984109860263, + "grad_norm": 0.026300065257889048, + "learning_rate": 0.0004675471225575136, + "loss": 0.5042, + "step": 22400 + }, + { + "epoch": 1.1374522470840578, + "grad_norm": 0.026588922808316026, + "learning_rate": 0.00046732606689292637, + "loss": 0.4825, + "step": 22405 + }, + { + "epoch": 1.1377060831820893, + "grad_norm": 0.03082021206138233, + "learning_rate": 0.00046710501764206933, + "loss": 0.4886, + "step": 22410 + }, + { + "epoch": 1.1379599192801209, + "grad_norm": 0.02702663439746158, + "learning_rate": 0.0004668839748483332, + "loss": 0.4956, + "step": 22415 + }, + { + "epoch": 1.1382137553781524, + "grad_norm": 0.02770867084856874, + "learning_rate": 0.0004666629385551078, + "loss": 0.5162, + "step": 22420 + }, + { + "epoch": 1.138467591476184, + "grad_norm": 0.02593191401749543, + "learning_rate": 0.0004664419088057812, + "loss": 0.5222, + "step": 22425 + }, + { + "epoch": 1.1387214275742152, + "grad_norm": 0.025067133006483912, + "learning_rate": 0.0004662208856437405, + "loss": 0.5045, + "step": 22430 + }, + { + "epoch": 1.1389752636722468, + "grad_norm": 0.02386909656610933, + "learning_rate": 0.00046599986911237135, + "loss": 0.4937, + "step": 22435 + }, + { + "epoch": 1.1392290997702783, + "grad_norm": 0.02308227237038335, + "learning_rate": 0.00046577885925505857, + "loss": 0.5298, + "step": 22440 + }, + { + "epoch": 1.1394829358683098, + "grad_norm": 0.022608086605042318, + "learning_rate": 0.00046555785611518505, + "loss": 0.5098, + "step": 22445 + }, + { + "epoch": 1.1397367719663414, + "grad_norm": 0.022743886474204174, + "learning_rate": 0.0004653368597361326, + "loss": 0.5065, + "step": 22450 + }, + { + "epoch": 1.139990608064373, + "grad_norm": 0.021721050273651527, + "learning_rate": 0.00046511587016128173, + "loss": 0.5221, + "step": 22455 + }, + { + "epoch": 1.1402444441624042, + "grad_norm": 0.028647441808002567, + "learning_rate": 0.0004648948874340115, + "loss": 0.5102, + "step": 22460 + }, + { + "epoch": 1.1404982802604358, + "grad_norm": 0.023629836872766076, + "learning_rate": 0.0004646739115976999, + "loss": 0.4838, + "step": 22465 + }, + { + "epoch": 1.1407521163584673, + "grad_norm": 0.022987645202448132, + "learning_rate": 0.00046445294269572326, + "loss": 0.5101, + "step": 22470 + }, + { + "epoch": 1.1410059524564988, + "grad_norm": 0.028359397619188362, + "learning_rate": 0.0004642319807714567, + "loss": 0.49, + "step": 22475 + }, + { + "epoch": 1.1412597885545304, + "grad_norm": 0.024714621127131544, + "learning_rate": 0.0004640110258682739, + "loss": 0.5394, + "step": 22480 + }, + { + "epoch": 1.141513624652562, + "grad_norm": 0.023929726037479372, + "learning_rate": 0.0004637900780295472, + "loss": 0.4957, + "step": 22485 + }, + { + "epoch": 1.1417674607505934, + "grad_norm": 0.02145690677992983, + "learning_rate": 0.0004635691372986477, + "loss": 0.455, + "step": 22490 + }, + { + "epoch": 1.142021296848625, + "grad_norm": 0.020947420613049612, + "learning_rate": 0.0004633482037189447, + "loss": 0.4911, + "step": 22495 + }, + { + "epoch": 1.1422751329466563, + "grad_norm": 0.02298839539671349, + "learning_rate": 0.00046312727733380666, + "loss": 0.5216, + "step": 22500 + }, + { + "epoch": 1.1425289690446878, + "grad_norm": 0.021966101109555183, + "learning_rate": 0.0004629063581866002, + "loss": 0.5083, + "step": 22505 + }, + { + "epoch": 1.1427828051427193, + "grad_norm": 0.03686984297075965, + "learning_rate": 0.00046268544632069064, + "loss": 0.5224, + "step": 22510 + }, + { + "epoch": 1.1430366412407509, + "grad_norm": 0.030756815032274516, + "learning_rate": 0.00046246454177944194, + "loss": 0.508, + "step": 22515 + }, + { + "epoch": 1.1432904773387824, + "grad_norm": 0.05181483793136802, + "learning_rate": 0.0004622436446062164, + "loss": 0.489, + "step": 22520 + }, + { + "epoch": 1.1435443134368137, + "grad_norm": 0.03958359517287309, + "learning_rate": 0.0004620227548443752, + "loss": 0.5243, + "step": 22525 + }, + { + "epoch": 1.1437981495348453, + "grad_norm": 0.03427906955795205, + "learning_rate": 0.0004618018725372778, + "loss": 0.4834, + "step": 22530 + }, + { + "epoch": 1.1440519856328768, + "grad_norm": 0.023909407473577712, + "learning_rate": 0.0004615809977282823, + "loss": 0.4969, + "step": 22535 + }, + { + "epoch": 1.1443058217309083, + "grad_norm": 0.026640110354467325, + "learning_rate": 0.0004613601304607454, + "loss": 0.5131, + "step": 22540 + }, + { + "epoch": 1.1445596578289399, + "grad_norm": 0.0237982378487044, + "learning_rate": 0.0004611392707780222, + "loss": 0.4965, + "step": 22545 + }, + { + "epoch": 1.1448134939269714, + "grad_norm": 0.03386310576456612, + "learning_rate": 0.00046091841872346627, + "loss": 0.5167, + "step": 22550 + }, + { + "epoch": 1.145067330025003, + "grad_norm": 0.02459001107447409, + "learning_rate": 0.00046069757434042975, + "loss": 0.5098, + "step": 22555 + }, + { + "epoch": 1.1453211661230345, + "grad_norm": 0.02750293868217048, + "learning_rate": 0.0004604767376722635, + "loss": 0.5183, + "step": 22560 + }, + { + "epoch": 1.1455750022210658, + "grad_norm": 0.02465764415596952, + "learning_rate": 0.0004602559087623166, + "loss": 0.5346, + "step": 22565 + }, + { + "epoch": 1.1458288383190973, + "grad_norm": 0.021840825134174797, + "learning_rate": 0.0004600350876539366, + "loss": 0.4734, + "step": 22570 + }, + { + "epoch": 1.1460826744171289, + "grad_norm": 0.022144355795008893, + "learning_rate": 0.00045981427439046956, + "loss": 0.4902, + "step": 22575 + }, + { + "epoch": 1.1463365105151604, + "grad_norm": 0.02333513320332411, + "learning_rate": 0.00045959346901526006, + "loss": 0.4993, + "step": 22580 + }, + { + "epoch": 1.146590346613192, + "grad_norm": 0.02599869001595599, + "learning_rate": 0.0004593726715716511, + "loss": 0.5261, + "step": 22585 + }, + { + "epoch": 1.1468441827112235, + "grad_norm": 0.03686029455414443, + "learning_rate": 0.00045915188210298406, + "loss": 0.4647, + "step": 22590 + }, + { + "epoch": 1.1470980188092548, + "grad_norm": 0.027385432815099645, + "learning_rate": 0.00045893110065259893, + "loss": 0.5352, + "step": 22595 + }, + { + "epoch": 1.1473518549072863, + "grad_norm": 0.02888582352407677, + "learning_rate": 0.0004587103272638339, + "loss": 0.4924, + "step": 22600 + }, + { + "epoch": 1.1476056910053178, + "grad_norm": 0.023625257936494698, + "learning_rate": 0.0004584895619800257, + "loss": 0.505, + "step": 22605 + }, + { + "epoch": 1.1478595271033494, + "grad_norm": 0.0277603791896047, + "learning_rate": 0.00045826880484450946, + "loss": 0.5021, + "step": 22610 + }, + { + "epoch": 1.148113363201381, + "grad_norm": 0.028467365276233746, + "learning_rate": 0.0004580480559006186, + "loss": 0.497, + "step": 22615 + }, + { + "epoch": 1.1483671992994124, + "grad_norm": 0.03687524663675464, + "learning_rate": 0.0004578273151916853, + "loss": 0.4686, + "step": 22620 + }, + { + "epoch": 1.148621035397444, + "grad_norm": 0.033659979229860255, + "learning_rate": 0.0004576065827610397, + "loss": 0.489, + "step": 22625 + }, + { + "epoch": 1.1488748714954753, + "grad_norm": 0.025235412390316816, + "learning_rate": 0.0004573858586520105, + "loss": 0.5098, + "step": 22630 + }, + { + "epoch": 1.1491287075935068, + "grad_norm": 0.03243044981273508, + "learning_rate": 0.0004571651429079247, + "loss": 0.4934, + "step": 22635 + }, + { + "epoch": 1.1493825436915384, + "grad_norm": 0.022833415951073927, + "learning_rate": 0.00045694443557210777, + "loss": 0.5142, + "step": 22640 + }, + { + "epoch": 1.14963637978957, + "grad_norm": 0.03068941635429747, + "learning_rate": 0.00045672373668788336, + "loss": 0.466, + "step": 22645 + }, + { + "epoch": 1.1498902158876014, + "grad_norm": 0.0246266009056399, + "learning_rate": 0.0004565030462985737, + "loss": 0.5178, + "step": 22650 + }, + { + "epoch": 1.150144051985633, + "grad_norm": 0.030534870475012552, + "learning_rate": 0.00045628236444749905, + "loss": 0.4653, + "step": 22655 + }, + { + "epoch": 1.1503978880836643, + "grad_norm": 0.024906564205494433, + "learning_rate": 0.0004560616911779783, + "loss": 0.5041, + "step": 22660 + }, + { + "epoch": 1.1506517241816958, + "grad_norm": 0.03444194262921391, + "learning_rate": 0.00045584102653332845, + "loss": 0.4884, + "step": 22665 + }, + { + "epoch": 1.1509055602797273, + "grad_norm": 0.036461759923969016, + "learning_rate": 0.0004556203705568648, + "loss": 0.5029, + "step": 22670 + }, + { + "epoch": 1.1511593963777589, + "grad_norm": 0.04594830004230979, + "learning_rate": 0.0004553997232919009, + "loss": 0.5145, + "step": 22675 + }, + { + "epoch": 1.1514132324757904, + "grad_norm": 0.03261588655559848, + "learning_rate": 0.00045517908478174917, + "loss": 0.5217, + "step": 22680 + }, + { + "epoch": 1.151667068573822, + "grad_norm": 0.0242254240936562, + "learning_rate": 0.0004549584550697196, + "loss": 0.4833, + "step": 22685 + }, + { + "epoch": 1.1519209046718535, + "grad_norm": 0.023948731317067408, + "learning_rate": 0.00045473783419912057, + "loss": 0.473, + "step": 22690 + }, + { + "epoch": 1.1521747407698848, + "grad_norm": 0.02369769333893661, + "learning_rate": 0.000454517222213259, + "loss": 0.4996, + "step": 22695 + }, + { + "epoch": 1.1524285768679163, + "grad_norm": 0.0235533696148796, + "learning_rate": 0.00045429661915543995, + "loss": 0.5215, + "step": 22700 + }, + { + "epoch": 1.1526824129659479, + "grad_norm": 0.029693427304754598, + "learning_rate": 0.0004540760250689666, + "loss": 0.5202, + "step": 22705 + }, + { + "epoch": 1.1529362490639794, + "grad_norm": 0.026223364679563748, + "learning_rate": 0.0004538554399971406, + "loss": 0.499, + "step": 22710 + }, + { + "epoch": 1.153190085162011, + "grad_norm": 0.022238545679295847, + "learning_rate": 0.00045363486398326147, + "loss": 0.4906, + "step": 22715 + }, + { + "epoch": 1.1534439212600425, + "grad_norm": 0.02977765503263147, + "learning_rate": 0.0004534142970706274, + "loss": 0.529, + "step": 22720 + }, + { + "epoch": 1.1536977573580738, + "grad_norm": 0.022706195045235984, + "learning_rate": 0.0004531937393025344, + "loss": 0.5039, + "step": 22725 + }, + { + "epoch": 1.1539515934561053, + "grad_norm": 0.02434660390615717, + "learning_rate": 0.000452973190722277, + "loss": 0.5186, + "step": 22730 + }, + { + "epoch": 1.1542054295541369, + "grad_norm": 0.0213880961035262, + "learning_rate": 0.00045275265137314754, + "loss": 0.5202, + "step": 22735 + }, + { + "epoch": 1.1544592656521684, + "grad_norm": 0.024719044730757197, + "learning_rate": 0.0004525321212984372, + "loss": 0.5122, + "step": 22740 + }, + { + "epoch": 1.1547131017502, + "grad_norm": 0.024745026751628207, + "learning_rate": 0.00045231160054143467, + "loss": 0.5046, + "step": 22745 + }, + { + "epoch": 1.1549669378482315, + "grad_norm": 0.025339355379831775, + "learning_rate": 0.00045209108914542716, + "loss": 0.5085, + "step": 22750 + }, + { + "epoch": 1.155220773946263, + "grad_norm": 0.03885787120756748, + "learning_rate": 0.0004518705871537, + "loss": 0.5161, + "step": 22755 + }, + { + "epoch": 1.1554746100442943, + "grad_norm": 0.022406997994527122, + "learning_rate": 0.0004516500946095365, + "loss": 0.4808, + "step": 22760 + }, + { + "epoch": 1.1557284461423258, + "grad_norm": 0.026005826539297415, + "learning_rate": 0.0004514296115562183, + "loss": 0.5135, + "step": 22765 + }, + { + "epoch": 1.1559822822403574, + "grad_norm": 0.03539270632977643, + "learning_rate": 0.0004512091380370251, + "loss": 0.4971, + "step": 22770 + }, + { + "epoch": 1.156236118338389, + "grad_norm": 0.029564361875585962, + "learning_rate": 0.00045098867409523486, + "loss": 0.4924, + "step": 22775 + }, + { + "epoch": 1.1564899544364204, + "grad_norm": 0.02358096736186789, + "learning_rate": 0.0004507682197741235, + "loss": 0.4856, + "step": 22780 + }, + { + "epoch": 1.156743790534452, + "grad_norm": 0.028230367180303174, + "learning_rate": 0.000450547775116965, + "loss": 0.4659, + "step": 22785 + }, + { + "epoch": 1.1569976266324833, + "grad_norm": 0.02700559394625303, + "learning_rate": 0.00045032734016703163, + "loss": 0.4845, + "step": 22790 + }, + { + "epoch": 1.1572514627305148, + "grad_norm": 0.026220476689667822, + "learning_rate": 0.0004501069149675937, + "loss": 0.5081, + "step": 22795 + }, + { + "epoch": 1.1575052988285464, + "grad_norm": 0.023325253274810016, + "learning_rate": 0.00044988649956191943, + "loss": 0.5124, + "step": 22800 + }, + { + "epoch": 1.157759134926578, + "grad_norm": 0.02964466344942591, + "learning_rate": 0.00044966609399327544, + "loss": 0.5034, + "step": 22805 + }, + { + "epoch": 1.1580129710246094, + "grad_norm": 0.025629931037714163, + "learning_rate": 0.0004494456983049263, + "loss": 0.5159, + "step": 22810 + }, + { + "epoch": 1.158266807122641, + "grad_norm": 0.02818068589305215, + "learning_rate": 0.0004492253125401344, + "loss": 0.517, + "step": 22815 + }, + { + "epoch": 1.1585206432206725, + "grad_norm": 0.025937834093816534, + "learning_rate": 0.00044900493674216043, + "loss": 0.535, + "step": 22820 + }, + { + "epoch": 1.158774479318704, + "grad_norm": 0.021906620341182587, + "learning_rate": 0.00044878457095426307, + "loss": 0.4881, + "step": 22825 + }, + { + "epoch": 1.1590283154167353, + "grad_norm": 0.023326406428330577, + "learning_rate": 0.000448564215219699, + "loss": 0.4898, + "step": 22830 + }, + { + "epoch": 1.1592821515147669, + "grad_norm": 0.025849041107935036, + "learning_rate": 0.00044834386958172295, + "loss": 0.5198, + "step": 22835 + }, + { + "epoch": 1.1595359876127984, + "grad_norm": 0.02594641672975575, + "learning_rate": 0.00044812353408358777, + "loss": 0.4972, + "step": 22840 + }, + { + "epoch": 1.15978982371083, + "grad_norm": 0.0330275583093027, + "learning_rate": 0.0004479032087685441, + "loss": 0.5119, + "step": 22845 + }, + { + "epoch": 1.1600436598088615, + "grad_norm": 0.023604960673479395, + "learning_rate": 0.00044768289367984077, + "loss": 0.5103, + "step": 22850 + }, + { + "epoch": 1.1602974959068928, + "grad_norm": 0.04128228371869332, + "learning_rate": 0.0004474625888607245, + "loss": 0.5197, + "step": 22855 + }, + { + "epoch": 1.1605513320049243, + "grad_norm": 0.020903884654111818, + "learning_rate": 0.00044724229435443973, + "loss": 0.5284, + "step": 22860 + }, + { + "epoch": 1.1608051681029559, + "grad_norm": 0.031129567663155357, + "learning_rate": 0.0004470220102042298, + "loss": 0.5068, + "step": 22865 + }, + { + "epoch": 1.1610590042009874, + "grad_norm": 0.020499848777966484, + "learning_rate": 0.00044680173645333504, + "loss": 0.5014, + "step": 22870 + }, + { + "epoch": 1.161312840299019, + "grad_norm": 0.024298947458801685, + "learning_rate": 0.0004465814731449941, + "loss": 0.4989, + "step": 22875 + }, + { + "epoch": 1.1615666763970505, + "grad_norm": 0.02238268280787287, + "learning_rate": 0.0004463612203224436, + "loss": 0.5114, + "step": 22880 + }, + { + "epoch": 1.161820512495082, + "grad_norm": 0.030230746480821854, + "learning_rate": 0.0004461409780289181, + "loss": 0.4876, + "step": 22885 + }, + { + "epoch": 1.1620743485931135, + "grad_norm": 0.024028109056989183, + "learning_rate": 0.0004459207463076499, + "loss": 0.4815, + "step": 22890 + }, + { + "epoch": 1.1623281846911449, + "grad_norm": 0.021016127843322022, + "learning_rate": 0.00044570052520186956, + "loss": 0.5067, + "step": 22895 + }, + { + "epoch": 1.1625820207891764, + "grad_norm": 0.029561997978537945, + "learning_rate": 0.00044548031475480533, + "loss": 0.5074, + "step": 22900 + }, + { + "epoch": 1.162835856887208, + "grad_norm": 0.022169964271249176, + "learning_rate": 0.0004452601150096834, + "loss": 0.522, + "step": 22905 + }, + { + "epoch": 1.1630896929852395, + "grad_norm": 0.024800692528682368, + "learning_rate": 0.000445039926009728, + "loss": 0.5108, + "step": 22910 + }, + { + "epoch": 1.163343529083271, + "grad_norm": 0.02591690066399758, + "learning_rate": 0.00044481974779816096, + "loss": 0.5044, + "step": 22915 + }, + { + "epoch": 1.1635973651813025, + "grad_norm": 0.025486400367960977, + "learning_rate": 0.00044459958041820217, + "loss": 0.5242, + "step": 22920 + }, + { + "epoch": 1.1638512012793338, + "grad_norm": 2.4946821079686567, + "learning_rate": 0.0004443794239130696, + "loss": 0.7984, + "step": 22925 + }, + { + "epoch": 1.1641050373773654, + "grad_norm": 0.06100607110479418, + "learning_rate": 0.00044415927832597865, + "loss": 0.4905, + "step": 22930 + }, + { + "epoch": 1.164358873475397, + "grad_norm": 0.025761932513384978, + "learning_rate": 0.00044393914370014295, + "loss": 0.4968, + "step": 22935 + }, + { + "epoch": 1.1646127095734284, + "grad_norm": 0.02625663236193525, + "learning_rate": 0.00044371902007877374, + "loss": 0.4951, + "step": 22940 + }, + { + "epoch": 1.16486654567146, + "grad_norm": 0.02683158448012171, + "learning_rate": 0.0004434989075050802, + "loss": 0.5164, + "step": 22945 + }, + { + "epoch": 1.1651203817694915, + "grad_norm": 0.03126770701983838, + "learning_rate": 0.0004432788060222694, + "loss": 0.4774, + "step": 22950 + }, + { + "epoch": 1.165374217867523, + "grad_norm": 0.02778926944239494, + "learning_rate": 0.00044305871567354606, + "loss": 0.5266, + "step": 22955 + }, + { + "epoch": 1.1656280539655544, + "grad_norm": 0.02199391640791797, + "learning_rate": 0.0004428386365021129, + "loss": 0.5053, + "step": 22960 + }, + { + "epoch": 1.165881890063586, + "grad_norm": 0.02185784711754856, + "learning_rate": 0.0004426185685511703, + "loss": 0.5215, + "step": 22965 + }, + { + "epoch": 1.1661357261616174, + "grad_norm": 0.03374512683832721, + "learning_rate": 0.00044239851186391653, + "loss": 0.4848, + "step": 22970 + }, + { + "epoch": 1.166389562259649, + "grad_norm": 0.029868509213157268, + "learning_rate": 0.00044217846648354764, + "loss": 0.4479, + "step": 22975 + }, + { + "epoch": 1.1666433983576805, + "grad_norm": 0.04777615390254651, + "learning_rate": 0.00044195843245325723, + "loss": 0.4859, + "step": 22980 + }, + { + "epoch": 1.166897234455712, + "grad_norm": 0.023512758989838855, + "learning_rate": 0.0004417384098162373, + "loss": 0.5342, + "step": 22985 + }, + { + "epoch": 1.1671510705537433, + "grad_norm": 0.02229369139136557, + "learning_rate": 0.00044151839861567694, + "loss": 0.5134, + "step": 22990 + }, + { + "epoch": 1.1674049066517749, + "grad_norm": 0.026894096234455038, + "learning_rate": 0.0004412983988947633, + "loss": 0.4969, + "step": 22995 + }, + { + "epoch": 1.1676587427498064, + "grad_norm": 0.02589303820825699, + "learning_rate": 0.0004410784106966812, + "loss": 0.4886, + "step": 23000 + }, + { + "epoch": 1.167912578847838, + "grad_norm": 0.02124381698964111, + "learning_rate": 0.0004408584340646132, + "loss": 0.523, + "step": 23005 + }, + { + "epoch": 1.1681664149458695, + "grad_norm": 0.03791510606931018, + "learning_rate": 0.0004406384690417397, + "loss": 0.5113, + "step": 23010 + }, + { + "epoch": 1.168420251043901, + "grad_norm": 0.02862399836284424, + "learning_rate": 0.0004404185156712387, + "loss": 0.5179, + "step": 23015 + }, + { + "epoch": 1.1686740871419325, + "grad_norm": 0.030684358273063143, + "learning_rate": 0.00044019857399628593, + "loss": 0.4732, + "step": 23020 + }, + { + "epoch": 1.1689279232399639, + "grad_norm": 0.023354500795241345, + "learning_rate": 0.0004399786440600549, + "loss": 0.5008, + "step": 23025 + }, + { + "epoch": 1.1691817593379954, + "grad_norm": 0.024691152727817047, + "learning_rate": 0.0004397587259057166, + "loss": 0.4704, + "step": 23030 + }, + { + "epoch": 1.169435595436027, + "grad_norm": 0.023111558411901535, + "learning_rate": 0.0004395388195764401, + "loss": 0.4923, + "step": 23035 + }, + { + "epoch": 1.1696894315340585, + "grad_norm": 0.023376429033597424, + "learning_rate": 0.00043931892511539164, + "loss": 0.4986, + "step": 23040 + }, + { + "epoch": 1.16994326763209, + "grad_norm": 0.021932488535556502, + "learning_rate": 0.0004390990425657357, + "loss": 0.5079, + "step": 23045 + }, + { + "epoch": 1.1701971037301215, + "grad_norm": 0.02295870523021244, + "learning_rate": 0.00043887917197063395, + "loss": 0.4881, + "step": 23050 + }, + { + "epoch": 1.1704509398281528, + "grad_norm": 0.025540059947805397, + "learning_rate": 0.00043865931337324596, + "loss": 0.4948, + "step": 23055 + }, + { + "epoch": 1.1707047759261844, + "grad_norm": 0.022156403341379288, + "learning_rate": 0.0004384394668167288, + "loss": 0.4972, + "step": 23060 + }, + { + "epoch": 1.170958612024216, + "grad_norm": 0.02786034852874224, + "learning_rate": 0.00043821963234423736, + "loss": 0.5261, + "step": 23065 + }, + { + "epoch": 1.1712124481222475, + "grad_norm": 0.04119583380859592, + "learning_rate": 0.00043799980999892395, + "loss": 0.5146, + "step": 23070 + }, + { + "epoch": 1.171466284220279, + "grad_norm": 0.023409857934518236, + "learning_rate": 0.00043777999982393866, + "loss": 0.5204, + "step": 23075 + }, + { + "epoch": 1.1717201203183105, + "grad_norm": 0.02598832215798176, + "learning_rate": 0.00043756020186242915, + "loss": 0.4982, + "step": 23080 + }, + { + "epoch": 1.171973956416342, + "grad_norm": 0.03531536654582589, + "learning_rate": 0.0004373404161575406, + "loss": 0.4819, + "step": 23085 + }, + { + "epoch": 1.1722277925143736, + "grad_norm": 0.022966689799931606, + "learning_rate": 0.00043712064275241584, + "loss": 0.5127, + "step": 23090 + }, + { + "epoch": 1.172481628612405, + "grad_norm": 0.022269854028272786, + "learning_rate": 0.00043690088169019535, + "loss": 0.4744, + "step": 23095 + }, + { + "epoch": 1.1727354647104364, + "grad_norm": 0.025940748821848634, + "learning_rate": 0.0004366811330140169, + "loss": 0.4777, + "step": 23100 + }, + { + "epoch": 1.172989300808468, + "grad_norm": 0.022624731878350762, + "learning_rate": 0.0004364613967670165, + "loss": 0.4943, + "step": 23105 + }, + { + "epoch": 1.1732431369064995, + "grad_norm": 0.023056585507083997, + "learning_rate": 0.0004362416729923271, + "loss": 0.495, + "step": 23110 + }, + { + "epoch": 1.173496973004531, + "grad_norm": 0.03515992882856805, + "learning_rate": 0.0004360219617330792, + "loss": 0.5044, + "step": 23115 + }, + { + "epoch": 1.1737508091025624, + "grad_norm": 0.02197279853949111, + "learning_rate": 0.00043580226303240125, + "loss": 0.5008, + "step": 23120 + }, + { + "epoch": 1.1740046452005939, + "grad_norm": 0.026489459382601607, + "learning_rate": 0.0004355825769334189, + "loss": 0.5203, + "step": 23125 + }, + { + "epoch": 1.1742584812986254, + "grad_norm": 0.028795879529102696, + "learning_rate": 0.00043536290347925545, + "loss": 0.4819, + "step": 23130 + }, + { + "epoch": 1.174512317396657, + "grad_norm": 0.023530562218747813, + "learning_rate": 0.0004351432427130316, + "loss": 0.5285, + "step": 23135 + }, + { + "epoch": 1.1747661534946885, + "grad_norm": 0.026020705705209802, + "learning_rate": 0.0004349235946778659, + "loss": 0.5238, + "step": 23140 + }, + { + "epoch": 1.17501998959272, + "grad_norm": 0.0214632854435188, + "learning_rate": 0.000434703959416874, + "loss": 0.4631, + "step": 23145 + }, + { + "epoch": 1.1752738256907516, + "grad_norm": 0.025379100473350413, + "learning_rate": 0.0004344843369731692, + "loss": 0.479, + "step": 23150 + }, + { + "epoch": 1.175527661788783, + "grad_norm": 0.02437051797462676, + "learning_rate": 0.00043426472738986233, + "loss": 0.512, + "step": 23155 + }, + { + "epoch": 1.1757814978868144, + "grad_norm": 0.02897066685669499, + "learning_rate": 0.00043404513071006157, + "loss": 0.5283, + "step": 23160 + }, + { + "epoch": 1.176035333984846, + "grad_norm": 0.02546901345942979, + "learning_rate": 0.0004338255469768728, + "loss": 0.4924, + "step": 23165 + }, + { + "epoch": 1.1762891700828775, + "grad_norm": 0.02368301627174018, + "learning_rate": 0.0004336059762333992, + "loss": 0.5317, + "step": 23170 + }, + { + "epoch": 1.176543006180909, + "grad_norm": 0.022864082081725436, + "learning_rate": 0.0004333864185227413, + "loss": 0.5158, + "step": 23175 + }, + { + "epoch": 1.1767968422789405, + "grad_norm": 0.02715490428058028, + "learning_rate": 0.0004331668738879973, + "loss": 0.5156, + "step": 23180 + }, + { + "epoch": 1.177050678376972, + "grad_norm": 0.021324658368549614, + "learning_rate": 0.00043294734237226263, + "loss": 0.5044, + "step": 23185 + }, + { + "epoch": 1.1773045144750034, + "grad_norm": 0.024522981714582275, + "learning_rate": 0.0004327278240186303, + "loss": 0.5064, + "step": 23190 + }, + { + "epoch": 1.177558350573035, + "grad_norm": 0.03465334835103066, + "learning_rate": 0.0004325083188701906, + "loss": 0.4927, + "step": 23195 + }, + { + "epoch": 1.1778121866710665, + "grad_norm": 0.02331481482855284, + "learning_rate": 0.0004322888269700313, + "loss": 0.5376, + "step": 23200 + }, + { + "epoch": 1.178066022769098, + "grad_norm": 0.020402232394778678, + "learning_rate": 0.00043206934836123763, + "loss": 0.4963, + "step": 23205 + }, + { + "epoch": 1.1783198588671295, + "grad_norm": 0.029288417733953877, + "learning_rate": 0.0004318498830868921, + "loss": 0.4816, + "step": 23210 + }, + { + "epoch": 1.178573694965161, + "grad_norm": 0.026993170435865486, + "learning_rate": 0.0004316304311900746, + "loss": 0.4947, + "step": 23215 + }, + { + "epoch": 1.1788275310631926, + "grad_norm": 0.029826322623069505, + "learning_rate": 0.00043141099271386236, + "loss": 0.5017, + "step": 23220 + }, + { + "epoch": 1.179081367161224, + "grad_norm": 0.02839469569776015, + "learning_rate": 0.0004311915677013304, + "loss": 0.5048, + "step": 23225 + }, + { + "epoch": 1.1793352032592554, + "grad_norm": 0.03104613930338224, + "learning_rate": 0.00043097215619555053, + "loss": 0.4949, + "step": 23230 + }, + { + "epoch": 1.179589039357287, + "grad_norm": 0.023927607690992232, + "learning_rate": 0.00043075275823959217, + "loss": 0.4748, + "step": 23235 + }, + { + "epoch": 1.1798428754553185, + "grad_norm": 0.027181678664027754, + "learning_rate": 0.000430533373876522, + "loss": 0.5051, + "step": 23240 + }, + { + "epoch": 1.18009671155335, + "grad_norm": 0.025312092368077646, + "learning_rate": 0.0004303140031494042, + "loss": 0.5043, + "step": 23245 + }, + { + "epoch": 1.1803505476513816, + "grad_norm": 0.024109005993253833, + "learning_rate": 0.0004300946461012999, + "loss": 0.4829, + "step": 23250 + }, + { + "epoch": 1.180604383749413, + "grad_norm": 0.031076290791344705, + "learning_rate": 0.0004298753027752681, + "loss": 0.5013, + "step": 23255 + }, + { + "epoch": 1.1808582198474444, + "grad_norm": 0.02301069082386406, + "learning_rate": 0.00042965597321436454, + "loss": 0.496, + "step": 23260 + }, + { + "epoch": 1.181112055945476, + "grad_norm": 0.02834869538619908, + "learning_rate": 0.00042943665746164274, + "loss": 0.4945, + "step": 23265 + }, + { + "epoch": 1.1813658920435075, + "grad_norm": 0.040647134876522086, + "learning_rate": 0.0004292173555601531, + "loss": 0.5038, + "step": 23270 + }, + { + "epoch": 1.181619728141539, + "grad_norm": 0.047047098373209854, + "learning_rate": 0.00042899806755294364, + "loss": 0.5186, + "step": 23275 + }, + { + "epoch": 1.1818735642395706, + "grad_norm": 0.023835906470185797, + "learning_rate": 0.00042877879348305925, + "loss": 0.4914, + "step": 23280 + }, + { + "epoch": 1.182127400337602, + "grad_norm": 0.3198338312283238, + "learning_rate": 0.0004285595333935427, + "loss": 0.4655, + "step": 23285 + }, + { + "epoch": 1.1823812364356334, + "grad_norm": 0.08480024872059884, + "learning_rate": 0.0004283402873274334, + "loss": 0.5015, + "step": 23290 + }, + { + "epoch": 1.182635072533665, + "grad_norm": 0.044275755637726996, + "learning_rate": 0.0004281210553277684, + "loss": 0.5064, + "step": 23295 + }, + { + "epoch": 1.1828889086316965, + "grad_norm": 0.03453100961430329, + "learning_rate": 0.0004279018374375817, + "loss": 0.518, + "step": 23300 + }, + { + "epoch": 1.183142744729728, + "grad_norm": 0.02331481023981138, + "learning_rate": 0.00042768263369990486, + "loss": 0.5057, + "step": 23305 + }, + { + "epoch": 1.1833965808277596, + "grad_norm": 0.02423433591461226, + "learning_rate": 0.00042746344415776634, + "loss": 0.5355, + "step": 23310 + }, + { + "epoch": 1.183650416925791, + "grad_norm": 0.020158746070223478, + "learning_rate": 0.00042724426885419197, + "loss": 0.5056, + "step": 23315 + }, + { + "epoch": 1.1839042530238224, + "grad_norm": 0.02346254177392359, + "learning_rate": 0.0004270251078322048, + "loss": 0.4555, + "step": 23320 + }, + { + "epoch": 1.184158089121854, + "grad_norm": 0.02215680364485662, + "learning_rate": 0.000426805961134825, + "loss": 0.4967, + "step": 23325 + }, + { + "epoch": 1.1844119252198855, + "grad_norm": 0.025983453386552117, + "learning_rate": 0.00042658682880507005, + "loss": 0.488, + "step": 23330 + }, + { + "epoch": 1.184665761317917, + "grad_norm": 0.02558287853502543, + "learning_rate": 0.0004263677108859545, + "loss": 0.5222, + "step": 23335 + }, + { + "epoch": 1.1849195974159485, + "grad_norm": 0.029862575005858382, + "learning_rate": 0.0004261486074204899, + "loss": 0.5155, + "step": 23340 + }, + { + "epoch": 1.18517343351398, + "grad_norm": 0.03365037560913944, + "learning_rate": 0.0004259295184516855, + "loss": 0.5005, + "step": 23345 + }, + { + "epoch": 1.1854272696120116, + "grad_norm": 0.02324079187444754, + "learning_rate": 0.00042571044402254734, + "loss": 0.5356, + "step": 23350 + }, + { + "epoch": 1.1856811057100431, + "grad_norm": 0.023311446376579104, + "learning_rate": 0.00042549138417607855, + "loss": 0.5226, + "step": 23355 + }, + { + "epoch": 1.1859349418080745, + "grad_norm": 0.02264754449055334, + "learning_rate": 0.0004252723389552794, + "loss": 0.4857, + "step": 23360 + }, + { + "epoch": 1.186188777906106, + "grad_norm": 0.024792949347000763, + "learning_rate": 0.0004250533084031474, + "loss": 0.4847, + "step": 23365 + }, + { + "epoch": 1.1864426140041375, + "grad_norm": 0.02365423995133714, + "learning_rate": 0.0004248342925626773, + "loss": 0.5237, + "step": 23370 + }, + { + "epoch": 1.186696450102169, + "grad_norm": 0.025122984233723788, + "learning_rate": 0.0004246152914768607, + "loss": 0.4653, + "step": 23375 + }, + { + "epoch": 1.1869502862002006, + "grad_norm": 0.037495988695979386, + "learning_rate": 0.00042439630518868645, + "loss": 0.4927, + "step": 23380 + }, + { + "epoch": 1.187204122298232, + "grad_norm": 0.02744207164178393, + "learning_rate": 0.00042417733374114044, + "loss": 0.4939, + "step": 23385 + }, + { + "epoch": 1.1874579583962634, + "grad_norm": 0.023497982340652368, + "learning_rate": 0.00042395837717720564, + "loss": 0.517, + "step": 23390 + }, + { + "epoch": 1.187711794494295, + "grad_norm": 0.022057752647203964, + "learning_rate": 0.0004237394355398622, + "loss": 0.4763, + "step": 23395 + }, + { + "epoch": 1.1879656305923265, + "grad_norm": 0.022675614809165457, + "learning_rate": 0.0004235205088720872, + "loss": 0.4862, + "step": 23400 + }, + { + "epoch": 1.188219466690358, + "grad_norm": 0.027088398303537622, + "learning_rate": 0.000423301597216855, + "loss": 0.4862, + "step": 23405 + }, + { + "epoch": 1.1884733027883896, + "grad_norm": 0.021433012116146354, + "learning_rate": 0.0004230827006171367, + "loss": 0.4712, + "step": 23410 + }, + { + "epoch": 1.1887271388864211, + "grad_norm": 0.026295761373166058, + "learning_rate": 0.00042286381911590075, + "loss": 0.5153, + "step": 23415 + }, + { + "epoch": 1.1889809749844527, + "grad_norm": 0.0245055560686366, + "learning_rate": 0.0004226449527561124, + "loss": 0.4915, + "step": 23420 + }, + { + "epoch": 1.189234811082484, + "grad_norm": 0.023358170637321662, + "learning_rate": 0.0004224261015807341, + "loss": 0.5051, + "step": 23425 + }, + { + "epoch": 1.1894886471805155, + "grad_norm": 0.02494520282712566, + "learning_rate": 0.00042220726563272514, + "loss": 0.5168, + "step": 23430 + }, + { + "epoch": 1.189742483278547, + "grad_norm": 0.026188639690865594, + "learning_rate": 0.0004219884449550421, + "loss": 0.5007, + "step": 23435 + }, + { + "epoch": 1.1899963193765786, + "grad_norm": 0.02181777095566158, + "learning_rate": 0.0004217696395906381, + "loss": 0.4987, + "step": 23440 + }, + { + "epoch": 1.19025015547461, + "grad_norm": 0.022429924845303997, + "learning_rate": 0.00042155084958246387, + "loss": 0.5191, + "step": 23445 + }, + { + "epoch": 1.1905039915726416, + "grad_norm": 0.022466417610871727, + "learning_rate": 0.0004213320749734665, + "loss": 0.507, + "step": 23450 + }, + { + "epoch": 1.190757827670673, + "grad_norm": 0.03174964407796664, + "learning_rate": 0.0004211133158065906, + "loss": 0.487, + "step": 23455 + }, + { + "epoch": 1.1910116637687045, + "grad_norm": 0.02782479444320585, + "learning_rate": 0.0004208945721247772, + "loss": 0.4833, + "step": 23460 + }, + { + "epoch": 1.191265499866736, + "grad_norm": 0.023164893847604912, + "learning_rate": 0.0004206758439709649, + "loss": 0.5069, + "step": 23465 + }, + { + "epoch": 1.1915193359647676, + "grad_norm": 0.02156991148966579, + "learning_rate": 0.00042045713138808894, + "loss": 0.5061, + "step": 23470 + }, + { + "epoch": 1.191773172062799, + "grad_norm": 0.03206441910648273, + "learning_rate": 0.0004202384344190814, + "loss": 0.4991, + "step": 23475 + }, + { + "epoch": 1.1920270081608306, + "grad_norm": 0.033785505523490716, + "learning_rate": 0.00042001975310687134, + "loss": 0.5094, + "step": 23480 + }, + { + "epoch": 1.1922808442588622, + "grad_norm": 0.029304316918651336, + "learning_rate": 0.0004198010874943849, + "loss": 0.4911, + "step": 23485 + }, + { + "epoch": 1.1925346803568935, + "grad_norm": 0.024414079665053167, + "learning_rate": 0.0004195824376245451, + "loss": 0.5014, + "step": 23490 + }, + { + "epoch": 1.192788516454925, + "grad_norm": 0.02755089300926629, + "learning_rate": 0.0004193638035402717, + "loss": 0.4836, + "step": 23495 + }, + { + "epoch": 1.1930423525529565, + "grad_norm": 0.04448377045587385, + "learning_rate": 0.0004191451852844816, + "loss": 0.5055, + "step": 23500 + }, + { + "epoch": 1.193296188650988, + "grad_norm": 0.02563765603314885, + "learning_rate": 0.00041892658290008835, + "loss": 0.4757, + "step": 23505 + }, + { + "epoch": 1.1935500247490196, + "grad_norm": 0.029729997745895555, + "learning_rate": 0.00041870799643000257, + "loss": 0.5031, + "step": 23510 + }, + { + "epoch": 1.1938038608470511, + "grad_norm": 0.022197332496704934, + "learning_rate": 0.00041848942591713167, + "loss": 0.493, + "step": 23515 + }, + { + "epoch": 1.1940576969450825, + "grad_norm": 0.02508559588342066, + "learning_rate": 0.0004182708714043799, + "loss": 0.493, + "step": 23520 + }, + { + "epoch": 1.194311533043114, + "grad_norm": 0.02801379075039443, + "learning_rate": 0.0004180523329346486, + "loss": 0.4848, + "step": 23525 + }, + { + "epoch": 1.1945653691411455, + "grad_norm": 0.02487825457446648, + "learning_rate": 0.00041783381055083565, + "loss": 0.5065, + "step": 23530 + }, + { + "epoch": 1.194819205239177, + "grad_norm": 0.024301005036306705, + "learning_rate": 0.0004176153042958359, + "loss": 0.4799, + "step": 23535 + }, + { + "epoch": 1.1950730413372086, + "grad_norm": 0.028592227759142063, + "learning_rate": 0.0004173968142125411, + "loss": 0.4706, + "step": 23540 + }, + { + "epoch": 1.1953268774352401, + "grad_norm": 0.020208642081283146, + "learning_rate": 0.00041717834034383974, + "loss": 0.4838, + "step": 23545 + }, + { + "epoch": 1.1955807135332717, + "grad_norm": 0.025013814826272913, + "learning_rate": 0.0004169598827326171, + "loss": 0.5043, + "step": 23550 + }, + { + "epoch": 1.195834549631303, + "grad_norm": 0.02413307493483816, + "learning_rate": 0.0004167414414217554, + "loss": 0.5207, + "step": 23555 + }, + { + "epoch": 1.1960883857293345, + "grad_norm": 0.02524654414343768, + "learning_rate": 0.0004165230164541335, + "loss": 0.465, + "step": 23560 + }, + { + "epoch": 1.196342221827366, + "grad_norm": 0.03571503975031509, + "learning_rate": 0.00041630460787262717, + "loss": 0.4715, + "step": 23565 + }, + { + "epoch": 1.1965960579253976, + "grad_norm": 0.05081085862061478, + "learning_rate": 0.00041608621572010896, + "loss": 0.5229, + "step": 23570 + }, + { + "epoch": 1.1968498940234291, + "grad_norm": 0.04184691240326009, + "learning_rate": 0.0004158678400394481, + "loss": 0.5048, + "step": 23575 + }, + { + "epoch": 1.1971037301214607, + "grad_norm": 0.026686408386705083, + "learning_rate": 0.00041564948087351053, + "loss": 0.5156, + "step": 23580 + }, + { + "epoch": 1.197357566219492, + "grad_norm": 0.03417816047008303, + "learning_rate": 0.0004154311382651593, + "loss": 0.4824, + "step": 23585 + }, + { + "epoch": 1.1976114023175235, + "grad_norm": 0.026200514928376224, + "learning_rate": 0.000415212812257254, + "loss": 0.4909, + "step": 23590 + }, + { + "epoch": 1.197865238415555, + "grad_norm": 0.033241782938311475, + "learning_rate": 0.0004149945028926507, + "loss": 0.4919, + "step": 23595 + }, + { + "epoch": 1.1981190745135866, + "grad_norm": 0.023753551533967514, + "learning_rate": 0.0004147762102142027, + "loss": 0.4899, + "step": 23600 + }, + { + "epoch": 1.198372910611618, + "grad_norm": 0.026945495744205454, + "learning_rate": 0.0004145579342647595, + "loss": 0.4666, + "step": 23605 + }, + { + "epoch": 1.1986267467096496, + "grad_norm": 0.02038326341118833, + "learning_rate": 0.0004143396750871678, + "loss": 0.4999, + "step": 23610 + }, + { + "epoch": 1.1988805828076812, + "grad_norm": 0.022956204878543746, + "learning_rate": 0.0004141214327242707, + "loss": 0.491, + "step": 23615 + }, + { + "epoch": 1.1991344189057125, + "grad_norm": 0.0219273779464646, + "learning_rate": 0.000413903207218908, + "loss": 0.4867, + "step": 23620 + }, + { + "epoch": 1.199388255003744, + "grad_norm": 0.021082208350651496, + "learning_rate": 0.0004136849986139164, + "loss": 0.4782, + "step": 23625 + }, + { + "epoch": 1.1996420911017756, + "grad_norm": 0.021461829412799976, + "learning_rate": 0.0004134668069521291, + "loss": 0.5114, + "step": 23630 + }, + { + "epoch": 1.199895927199807, + "grad_norm": 0.027611384568723554, + "learning_rate": 0.00041324863227637607, + "loss": 0.5122, + "step": 23635 + }, + { + "epoch": 1.2001497632978386, + "grad_norm": 0.0313518897326471, + "learning_rate": 0.0004130304746294839, + "loss": 0.5, + "step": 23640 + }, + { + "epoch": 1.2004035993958702, + "grad_norm": 0.021179468678818012, + "learning_rate": 0.0004128123340542757, + "loss": 0.4911, + "step": 23645 + }, + { + "epoch": 1.2006574354939015, + "grad_norm": 0.021479315942391494, + "learning_rate": 0.0004125942105935717, + "loss": 0.5172, + "step": 23650 + }, + { + "epoch": 1.200911271591933, + "grad_norm": 0.030395801642170088, + "learning_rate": 0.00041237610429018824, + "loss": 0.4812, + "step": 23655 + }, + { + "epoch": 1.2011651076899645, + "grad_norm": 0.026149647662465924, + "learning_rate": 0.0004121580151869385, + "loss": 0.5146, + "step": 23660 + }, + { + "epoch": 1.201418943787996, + "grad_norm": 0.025141949645227077, + "learning_rate": 0.0004119399433266323, + "loss": 0.5049, + "step": 23665 + }, + { + "epoch": 1.2016727798860276, + "grad_norm": 0.022368500516788497, + "learning_rate": 0.0004117218887520761, + "loss": 0.4685, + "step": 23670 + }, + { + "epoch": 1.2019266159840591, + "grad_norm": 0.028886687584022533, + "learning_rate": 0.00041150385150607287, + "loss": 0.5113, + "step": 23675 + }, + { + "epoch": 1.2021804520820907, + "grad_norm": 0.026390424071676334, + "learning_rate": 0.0004112858316314223, + "loss": 0.486, + "step": 23680 + }, + { + "epoch": 1.2024342881801222, + "grad_norm": 0.027944997664867154, + "learning_rate": 0.00041106782917092055, + "loss": 0.5285, + "step": 23685 + }, + { + "epoch": 1.2026881242781535, + "grad_norm": 0.024740823096402697, + "learning_rate": 0.00041084984416736044, + "loss": 0.5107, + "step": 23690 + }, + { + "epoch": 1.202941960376185, + "grad_norm": 0.02106534018159003, + "learning_rate": 0.0004106318766635313, + "loss": 0.4754, + "step": 23695 + }, + { + "epoch": 1.2031957964742166, + "grad_norm": 0.0238222430975329, + "learning_rate": 0.00041041392670221913, + "loss": 0.4942, + "step": 23700 + }, + { + "epoch": 1.2034496325722481, + "grad_norm": 0.0237081263610808, + "learning_rate": 0.00041019599432620614, + "loss": 0.4864, + "step": 23705 + }, + { + "epoch": 1.2037034686702797, + "grad_norm": 0.02344590182148663, + "learning_rate": 0.00040997807957827184, + "loss": 0.4607, + "step": 23710 + }, + { + "epoch": 1.2039573047683112, + "grad_norm": 0.022849121360239078, + "learning_rate": 0.0004097601825011916, + "loss": 0.4918, + "step": 23715 + }, + { + "epoch": 1.2042111408663425, + "grad_norm": 0.02180468331845955, + "learning_rate": 0.00040954230313773745, + "loss": 0.4798, + "step": 23720 + }, + { + "epoch": 1.204464976964374, + "grad_norm": 0.0243468829191348, + "learning_rate": 0.0004093244415306781, + "loss": 0.5206, + "step": 23725 + }, + { + "epoch": 1.2047188130624056, + "grad_norm": 0.0376925453970005, + "learning_rate": 0.00040910659772277867, + "loss": 0.4733, + "step": 23730 + }, + { + "epoch": 1.2049726491604371, + "grad_norm": 0.02194738892506034, + "learning_rate": 0.0004088887717568009, + "loss": 0.4802, + "step": 23735 + }, + { + "epoch": 1.2052264852584686, + "grad_norm": 0.02392526501857356, + "learning_rate": 0.0004086709636755029, + "loss": 0.5137, + "step": 23740 + }, + { + "epoch": 1.2054803213565002, + "grad_norm": 0.023675767647900945, + "learning_rate": 0.0004084531735216392, + "loss": 0.4837, + "step": 23745 + }, + { + "epoch": 1.2057341574545317, + "grad_norm": 0.029907048109683313, + "learning_rate": 0.000408235401337961, + "loss": 0.4991, + "step": 23750 + }, + { + "epoch": 1.205987993552563, + "grad_norm": 0.023611085556861953, + "learning_rate": 0.00040801764716721586, + "loss": 0.5074, + "step": 23755 + }, + { + "epoch": 1.2062418296505946, + "grad_norm": 0.02342929941369781, + "learning_rate": 0.00040779991105214787, + "loss": 0.4753, + "step": 23760 + }, + { + "epoch": 1.206495665748626, + "grad_norm": 0.024184197724051767, + "learning_rate": 0.00040758219303549734, + "loss": 0.5053, + "step": 23765 + }, + { + "epoch": 1.2067495018466576, + "grad_norm": 0.022748064850076327, + "learning_rate": 0.00040736449316000156, + "loss": 0.5263, + "step": 23770 + }, + { + "epoch": 1.2070033379446892, + "grad_norm": 0.04285865957318062, + "learning_rate": 0.00040714681146839394, + "loss": 0.4945, + "step": 23775 + }, + { + "epoch": 1.2072571740427207, + "grad_norm": 0.027550877084688605, + "learning_rate": 0.00040692914800340407, + "loss": 0.519, + "step": 23780 + }, + { + "epoch": 1.207511010140752, + "grad_norm": 0.028024709655043955, + "learning_rate": 0.00040671150280775835, + "loss": 0.5254, + "step": 23785 + }, + { + "epoch": 1.2077648462387836, + "grad_norm": 0.027347216039832773, + "learning_rate": 0.0004064938759241794, + "loss": 0.5175, + "step": 23790 + }, + { + "epoch": 1.208018682336815, + "grad_norm": 0.03246702500036394, + "learning_rate": 0.0004062762673953863, + "loss": 0.4976, + "step": 23795 + }, + { + "epoch": 1.2082725184348466, + "grad_norm": 0.024018195972550192, + "learning_rate": 0.00040605867726409446, + "loss": 0.5024, + "step": 23800 + }, + { + "epoch": 1.2085263545328782, + "grad_norm": 0.036249081128423837, + "learning_rate": 0.00040584110557301576, + "loss": 0.5149, + "step": 23805 + }, + { + "epoch": 1.2087801906309097, + "grad_norm": 0.025423232304757903, + "learning_rate": 0.0004056235523648586, + "loss": 0.4963, + "step": 23810 + }, + { + "epoch": 1.2090340267289412, + "grad_norm": 0.02314566298487677, + "learning_rate": 0.0004054060176823273, + "loss": 0.4801, + "step": 23815 + }, + { + "epoch": 1.2092878628269725, + "grad_norm": 0.031246175124657095, + "learning_rate": 0.00040518850156812315, + "loss": 0.5157, + "step": 23820 + }, + { + "epoch": 1.209541698925004, + "grad_norm": 0.025447196021980784, + "learning_rate": 0.0004049710040649431, + "loss": 0.4978, + "step": 23825 + }, + { + "epoch": 1.2097955350230356, + "grad_norm": 0.19408857583730024, + "learning_rate": 0.0004047535252154812, + "loss": 0.4927, + "step": 23830 + }, + { + "epoch": 1.2100493711210671, + "grad_norm": 0.037859047454809586, + "learning_rate": 0.0004045360650624272, + "loss": 0.492, + "step": 23835 + }, + { + "epoch": 1.2103032072190987, + "grad_norm": 0.02407784644881004, + "learning_rate": 0.0004043186236484677, + "loss": 0.5306, + "step": 23840 + }, + { + "epoch": 1.2105570433171302, + "grad_norm": 0.021170736417903336, + "learning_rate": 0.0004041012010162852, + "loss": 0.4789, + "step": 23845 + }, + { + "epoch": 1.2108108794151615, + "grad_norm": 0.02333015002979291, + "learning_rate": 0.0004038837972085586, + "loss": 0.5165, + "step": 23850 + }, + { + "epoch": 1.211064715513193, + "grad_norm": 0.022900385191847467, + "learning_rate": 0.0004036664122679633, + "loss": 0.529, + "step": 23855 + }, + { + "epoch": 1.2113185516112246, + "grad_norm": 0.02900664488594827, + "learning_rate": 0.00040344904623717094, + "loss": 0.5186, + "step": 23860 + }, + { + "epoch": 1.2115723877092561, + "grad_norm": 0.02318704654191729, + "learning_rate": 0.00040323169915884924, + "loss": 0.5114, + "step": 23865 + }, + { + "epoch": 1.2118262238072877, + "grad_norm": 0.02241334253331378, + "learning_rate": 0.0004030143710756624, + "loss": 0.5244, + "step": 23870 + }, + { + "epoch": 1.2120800599053192, + "grad_norm": 0.02019981834426705, + "learning_rate": 0.0004027970620302709, + "loss": 0.5171, + "step": 23875 + }, + { + "epoch": 1.2123338960033507, + "grad_norm": 0.021530806359524663, + "learning_rate": 0.0004025797720653313, + "loss": 0.5028, + "step": 23880 + }, + { + "epoch": 1.212587732101382, + "grad_norm": 0.023565880923201048, + "learning_rate": 0.00040236250122349643, + "loss": 0.5048, + "step": 23885 + }, + { + "epoch": 1.2128415681994136, + "grad_norm": 0.02799458782575493, + "learning_rate": 0.0004021452495474159, + "loss": 0.5116, + "step": 23890 + }, + { + "epoch": 1.2130954042974451, + "grad_norm": 0.026146402918077717, + "learning_rate": 0.0004019280170797349, + "loss": 0.5023, + "step": 23895 + }, + { + "epoch": 1.2133492403954766, + "grad_norm": 0.04633882230495636, + "learning_rate": 0.000401710803863095, + "loss": 0.499, + "step": 23900 + }, + { + "epoch": 1.2136030764935082, + "grad_norm": 0.03278896093851392, + "learning_rate": 0.0004014936099401341, + "loss": 0.484, + "step": 23905 + }, + { + "epoch": 1.2138569125915397, + "grad_norm": 0.023279990625039652, + "learning_rate": 0.0004012764353534864, + "loss": 0.5017, + "step": 23910 + }, + { + "epoch": 1.214110748689571, + "grad_norm": 0.025580791250068846, + "learning_rate": 0.00040105928014578206, + "loss": 0.4841, + "step": 23915 + }, + { + "epoch": 1.2143645847876026, + "grad_norm": 0.0231176720102259, + "learning_rate": 0.00040084214435964766, + "loss": 0.4992, + "step": 23920 + }, + { + "epoch": 1.214618420885634, + "grad_norm": 0.02248737002851678, + "learning_rate": 0.0004006250280377058, + "loss": 0.5068, + "step": 23925 + }, + { + "epoch": 1.2148722569836656, + "grad_norm": 0.022927316960127005, + "learning_rate": 0.0004004079312225754, + "loss": 0.492, + "step": 23930 + }, + { + "epoch": 1.2151260930816972, + "grad_norm": 0.02313720027070532, + "learning_rate": 0.00040019085395687134, + "loss": 0.4949, + "step": 23935 + }, + { + "epoch": 1.2153799291797287, + "grad_norm": 0.030106027729234503, + "learning_rate": 0.00039997379628320493, + "loss": 0.5044, + "step": 23940 + }, + { + "epoch": 1.2156337652777602, + "grad_norm": 0.023137200469528597, + "learning_rate": 0.0003997567582441834, + "loss": 0.5162, + "step": 23945 + }, + { + "epoch": 1.2158876013757918, + "grad_norm": 0.02729799923558458, + "learning_rate": 0.00039953973988241035, + "loss": 0.5144, + "step": 23950 + }, + { + "epoch": 1.216141437473823, + "grad_norm": 0.02110940033197545, + "learning_rate": 0.00039932274124048546, + "loss": 0.4959, + "step": 23955 + }, + { + "epoch": 1.2163952735718546, + "grad_norm": 0.023832077215208247, + "learning_rate": 0.00039910576236100437, + "loss": 0.5058, + "step": 23960 + }, + { + "epoch": 1.2166491096698862, + "grad_norm": 0.021490387702712672, + "learning_rate": 0.000398888803286559, + "loss": 0.5041, + "step": 23965 + }, + { + "epoch": 1.2169029457679177, + "grad_norm": 0.022133880712670376, + "learning_rate": 0.0003986718640597372, + "loss": 0.5059, + "step": 23970 + }, + { + "epoch": 1.2171567818659492, + "grad_norm": 0.021425175563689818, + "learning_rate": 0.0003984549447231232, + "loss": 0.4609, + "step": 23975 + }, + { + "epoch": 1.2174106179639805, + "grad_norm": 0.019327536501010368, + "learning_rate": 0.0003982380453192972, + "loss": 0.4943, + "step": 23980 + }, + { + "epoch": 1.217664454062012, + "grad_norm": 0.023998177431719198, + "learning_rate": 0.0003980211658908354, + "loss": 0.4973, + "step": 23985 + }, + { + "epoch": 1.2179182901600436, + "grad_norm": 0.033915972076828955, + "learning_rate": 0.0003978043064803101, + "loss": 0.5149, + "step": 23990 + }, + { + "epoch": 1.2181721262580751, + "grad_norm": 0.0237613621554742, + "learning_rate": 0.0003975874671302899, + "loss": 0.5135, + "step": 23995 + }, + { + "epoch": 1.2184259623561067, + "grad_norm": 0.026750732728818505, + "learning_rate": 0.00039737064788333907, + "loss": 0.503, + "step": 24000 + }, + { + "epoch": 1.2186797984541382, + "grad_norm": 0.025308308981860574, + "learning_rate": 0.0003971538487820181, + "loss": 0.5002, + "step": 24005 + }, + { + "epoch": 1.2189336345521697, + "grad_norm": 0.02222643637338913, + "learning_rate": 0.0003969370698688839, + "loss": 0.5231, + "step": 24010 + }, + { + "epoch": 1.2191874706502013, + "grad_norm": 0.031118899097702987, + "learning_rate": 0.0003967203111864889, + "loss": 0.5099, + "step": 24015 + }, + { + "epoch": 1.2194413067482326, + "grad_norm": 0.022092906382930166, + "learning_rate": 0.0003965035727773818, + "loss": 0.4864, + "step": 24020 + }, + { + "epoch": 1.2196951428462641, + "grad_norm": 0.028023218544810078, + "learning_rate": 0.0003962868546841072, + "loss": 0.4551, + "step": 24025 + }, + { + "epoch": 1.2199489789442957, + "grad_norm": 0.023384820051804464, + "learning_rate": 0.0003960701569492058, + "loss": 0.4975, + "step": 24030 + }, + { + "epoch": 1.2202028150423272, + "grad_norm": 0.022821055406039938, + "learning_rate": 0.00039585347961521434, + "loss": 0.5172, + "step": 24035 + }, + { + "epoch": 1.2204566511403587, + "grad_norm": 0.030261713548462982, + "learning_rate": 0.0003956368227246654, + "loss": 0.5088, + "step": 24040 + }, + { + "epoch": 1.2207104872383903, + "grad_norm": 0.023111376142268682, + "learning_rate": 0.00039542018632008773, + "loss": 0.5049, + "step": 24045 + }, + { + "epoch": 1.2209643233364216, + "grad_norm": 0.021954597042835224, + "learning_rate": 0.00039520357044400595, + "loss": 0.4923, + "step": 24050 + }, + { + "epoch": 1.221218159434453, + "grad_norm": 0.022076500459662276, + "learning_rate": 0.0003949869751389407, + "loss": 0.4696, + "step": 24055 + }, + { + "epoch": 1.2214719955324846, + "grad_norm": 0.023319825211927294, + "learning_rate": 0.0003947704004474085, + "loss": 0.5188, + "step": 24060 + }, + { + "epoch": 1.2217258316305162, + "grad_norm": 0.02700682357188388, + "learning_rate": 0.0003945538464119218, + "loss": 0.5128, + "step": 24065 + }, + { + "epoch": 1.2219796677285477, + "grad_norm": 0.025886961687645718, + "learning_rate": 0.00039433731307498925, + "loss": 0.5348, + "step": 24070 + }, + { + "epoch": 1.2222335038265792, + "grad_norm": 0.02644977307390592, + "learning_rate": 0.00039412080047911526, + "loss": 0.5141, + "step": 24075 + }, + { + "epoch": 1.2224873399246108, + "grad_norm": 0.028225811453520037, + "learning_rate": 0.00039390430866680017, + "loss": 0.5302, + "step": 24080 + }, + { + "epoch": 1.222741176022642, + "grad_norm": 0.035161610050816194, + "learning_rate": 0.00039368783768054005, + "loss": 0.4871, + "step": 24085 + }, + { + "epoch": 1.2229950121206736, + "grad_norm": 0.08376585705403677, + "learning_rate": 0.00039347138756282737, + "loss": 0.5101, + "step": 24090 + }, + { + "epoch": 1.2232488482187052, + "grad_norm": 0.028243847099455724, + "learning_rate": 0.0003932549583561499, + "loss": 0.5145, + "step": 24095 + }, + { + "epoch": 1.2235026843167367, + "grad_norm": 0.02241171828134657, + "learning_rate": 0.00039303855010299187, + "loss": 0.4779, + "step": 24100 + }, + { + "epoch": 1.2237565204147682, + "grad_norm": 0.027565864630121506, + "learning_rate": 0.00039282216284583304, + "loss": 0.5191, + "step": 24105 + }, + { + "epoch": 1.2240103565127998, + "grad_norm": 0.04141754255679934, + "learning_rate": 0.00039260579662714915, + "loss": 0.5127, + "step": 24110 + }, + { + "epoch": 1.224264192610831, + "grad_norm": 0.03073490220186529, + "learning_rate": 0.0003923894514894118, + "loss": 0.5125, + "step": 24115 + }, + { + "epoch": 1.2245180287088626, + "grad_norm": 0.02681137516278315, + "learning_rate": 0.00039217312747508843, + "loss": 0.4995, + "step": 24120 + }, + { + "epoch": 1.2247718648068942, + "grad_norm": 0.03856684515614741, + "learning_rate": 0.00039195682462664225, + "loss": 0.4739, + "step": 24125 + }, + { + "epoch": 1.2250257009049257, + "grad_norm": 0.039563497611539114, + "learning_rate": 0.0003917405429865327, + "loss": 0.4841, + "step": 24130 + }, + { + "epoch": 1.2252795370029572, + "grad_norm": 0.02634194065105358, + "learning_rate": 0.0003915242825972148, + "loss": 0.5017, + "step": 24135 + }, + { + "epoch": 1.2255333731009888, + "grad_norm": 0.022677125872718785, + "learning_rate": 0.0003913080435011392, + "loss": 0.5316, + "step": 24140 + }, + { + "epoch": 1.2257872091990203, + "grad_norm": 0.021362302216391706, + "learning_rate": 0.00039109182574075256, + "loss": 0.4939, + "step": 24145 + }, + { + "epoch": 1.2260410452970516, + "grad_norm": 0.02490355388687261, + "learning_rate": 0.00039087562935849745, + "loss": 0.4987, + "step": 24150 + }, + { + "epoch": 1.2262948813950831, + "grad_norm": 0.02658190177665588, + "learning_rate": 0.00039065945439681213, + "loss": 0.4917, + "step": 24155 + }, + { + "epoch": 1.2265487174931147, + "grad_norm": 0.020375606787208123, + "learning_rate": 0.0003904433008981306, + "loss": 0.4781, + "step": 24160 + }, + { + "epoch": 1.2268025535911462, + "grad_norm": 0.02515928592237166, + "learning_rate": 0.00039022716890488275, + "loss": 0.4924, + "step": 24165 + }, + { + "epoch": 1.2270563896891777, + "grad_norm": 0.036589589288339636, + "learning_rate": 0.0003900110584594942, + "loss": 0.5066, + "step": 24170 + }, + { + "epoch": 1.2273102257872093, + "grad_norm": 0.023900416252566864, + "learning_rate": 0.00038979496960438637, + "loss": 0.4742, + "step": 24175 + }, + { + "epoch": 1.2275640618852406, + "grad_norm": 0.024765774151169644, + "learning_rate": 0.0003895789023819764, + "loss": 0.5312, + "step": 24180 + }, + { + "epoch": 1.2278178979832721, + "grad_norm": 0.023371819709462876, + "learning_rate": 0.0003893628568346771, + "loss": 0.495, + "step": 24185 + }, + { + "epoch": 1.2280717340813037, + "grad_norm": 0.023492189347015212, + "learning_rate": 0.0003891468330048974, + "loss": 0.4349, + "step": 24190 + }, + { + "epoch": 1.2283255701793352, + "grad_norm": 0.022915433000463895, + "learning_rate": 0.00038893083093504154, + "loss": 0.4776, + "step": 24195 + }, + { + "epoch": 1.2285794062773667, + "grad_norm": 0.03768217141505213, + "learning_rate": 0.00038871485066750965, + "loss": 0.4771, + "step": 24200 + }, + { + "epoch": 1.2288332423753983, + "grad_norm": 0.08372265623632144, + "learning_rate": 0.00038849889224469765, + "loss": 0.4923, + "step": 24205 + }, + { + "epoch": 1.2290870784734298, + "grad_norm": 0.023183814231157947, + "learning_rate": 0.000388282955708997, + "loss": 0.4936, + "step": 24210 + }, + { + "epoch": 1.2293409145714613, + "grad_norm": 0.021976570825354085, + "learning_rate": 0.0003880670411027951, + "loss": 0.4798, + "step": 24215 + }, + { + "epoch": 1.2295947506694926, + "grad_norm": 0.02966175812308913, + "learning_rate": 0.0003878511484684747, + "loss": 0.4887, + "step": 24220 + }, + { + "epoch": 1.2298485867675242, + "grad_norm": 0.02807429193190882, + "learning_rate": 0.00038763527784841463, + "loss": 0.5163, + "step": 24225 + }, + { + "epoch": 1.2301024228655557, + "grad_norm": 0.023104952777379555, + "learning_rate": 0.00038741942928498913, + "loss": 0.5479, + "step": 24230 + }, + { + "epoch": 1.2303562589635872, + "grad_norm": 0.029930387788839295, + "learning_rate": 0.0003872036028205683, + "loss": 0.4894, + "step": 24235 + }, + { + "epoch": 1.2306100950616188, + "grad_norm": 0.024336392055221638, + "learning_rate": 0.00038698779849751766, + "loss": 0.4459, + "step": 24240 + }, + { + "epoch": 1.23086393115965, + "grad_norm": 0.022263929857281015, + "learning_rate": 0.0003867720163581983, + "loss": 0.4906, + "step": 24245 + }, + { + "epoch": 1.2311177672576816, + "grad_norm": 0.03133076104283715, + "learning_rate": 0.0003865562564449678, + "loss": 0.4989, + "step": 24250 + }, + { + "epoch": 1.2313716033557132, + "grad_norm": 0.03260231148423326, + "learning_rate": 0.0003863405188001783, + "loss": 0.5352, + "step": 24255 + }, + { + "epoch": 1.2316254394537447, + "grad_norm": 0.04700860380299025, + "learning_rate": 0.00038612480346617825, + "loss": 0.477, + "step": 24260 + }, + { + "epoch": 1.2318792755517762, + "grad_norm": 0.02740921040477875, + "learning_rate": 0.00038590911048531136, + "loss": 0.5073, + "step": 24265 + }, + { + "epoch": 1.2321331116498078, + "grad_norm": 0.028178131413544947, + "learning_rate": 0.00038569343989991705, + "loss": 0.5108, + "step": 24270 + }, + { + "epoch": 1.2323869477478393, + "grad_norm": 0.03757846827473028, + "learning_rate": 0.0003854777917523305, + "loss": 0.5107, + "step": 24275 + }, + { + "epoch": 1.2326407838458708, + "grad_norm": 0.023537367550062548, + "learning_rate": 0.00038526216608488227, + "loss": 0.5113, + "step": 24280 + }, + { + "epoch": 1.2328946199439021, + "grad_norm": 0.023626230913656128, + "learning_rate": 0.0003850465629398987, + "loss": 0.5004, + "step": 24285 + }, + { + "epoch": 1.2331484560419337, + "grad_norm": 0.0223229940793588, + "learning_rate": 0.00038483098235970147, + "loss": 0.474, + "step": 24290 + }, + { + "epoch": 1.2334022921399652, + "grad_norm": 0.024562899395510082, + "learning_rate": 0.00038461542438660815, + "loss": 0.5167, + "step": 24295 + }, + { + "epoch": 1.2336561282379968, + "grad_norm": 0.028102102596075113, + "learning_rate": 0.00038439988906293157, + "loss": 0.4756, + "step": 24300 + }, + { + "epoch": 1.2339099643360283, + "grad_norm": 0.025740036122312678, + "learning_rate": 0.00038418437643098006, + "loss": 0.4896, + "step": 24305 + }, + { + "epoch": 1.2341638004340598, + "grad_norm": 0.026595707267021335, + "learning_rate": 0.0003839688865330581, + "loss": 0.4887, + "step": 24310 + }, + { + "epoch": 1.2344176365320911, + "grad_norm": 0.024095548845127494, + "learning_rate": 0.00038375341941146505, + "loss": 0.4901, + "step": 24315 + }, + { + "epoch": 1.2346714726301227, + "grad_norm": 0.021876912693680587, + "learning_rate": 0.0003835379751084961, + "loss": 0.4839, + "step": 24320 + }, + { + "epoch": 1.2349253087281542, + "grad_norm": 0.02794133991499325, + "learning_rate": 0.00038332255366644175, + "loss": 0.4859, + "step": 24325 + }, + { + "epoch": 1.2351791448261857, + "grad_norm": 0.03176212918526051, + "learning_rate": 0.0003831071551275883, + "loss": 0.5001, + "step": 24330 + }, + { + "epoch": 1.2354329809242173, + "grad_norm": 0.02008532610763912, + "learning_rate": 0.0003828917795342173, + "loss": 0.474, + "step": 24335 + }, + { + "epoch": 1.2356868170222488, + "grad_norm": 0.02404321499467447, + "learning_rate": 0.000382676426928606, + "loss": 0.523, + "step": 24340 + }, + { + "epoch": 1.2359406531202803, + "grad_norm": 0.020304618971407788, + "learning_rate": 0.00038246109735302696, + "loss": 0.4927, + "step": 24345 + }, + { + "epoch": 1.2361944892183117, + "grad_norm": 0.021943643137327405, + "learning_rate": 0.0003822457908497484, + "loss": 0.5214, + "step": 24350 + }, + { + "epoch": 1.2364483253163432, + "grad_norm": 0.026610455716482826, + "learning_rate": 0.00038203050746103386, + "loss": 0.5162, + "step": 24355 + }, + { + "epoch": 1.2367021614143747, + "grad_norm": 0.028298758995324663, + "learning_rate": 0.00038181524722914235, + "loss": 0.509, + "step": 24360 + }, + { + "epoch": 1.2369559975124063, + "grad_norm": 0.037423844884409904, + "learning_rate": 0.0003816000101963282, + "loss": 0.5026, + "step": 24365 + }, + { + "epoch": 1.2372098336104378, + "grad_norm": 0.022982766278687995, + "learning_rate": 0.00038138479640484183, + "loss": 0.5009, + "step": 24370 + }, + { + "epoch": 1.2374636697084693, + "grad_norm": 0.024787564495660528, + "learning_rate": 0.00038116960589692844, + "loss": 0.4921, + "step": 24375 + }, + { + "epoch": 1.2377175058065006, + "grad_norm": 0.02088814143474573, + "learning_rate": 0.00038095443871482876, + "loss": 0.4885, + "step": 24380 + }, + { + "epoch": 1.2379713419045322, + "grad_norm": 0.021715700250562936, + "learning_rate": 0.0003807392949007791, + "loss": 0.4914, + "step": 24385 + }, + { + "epoch": 1.2382251780025637, + "grad_norm": 0.027943308695689863, + "learning_rate": 0.00038052417449701106, + "loss": 0.4809, + "step": 24390 + }, + { + "epoch": 1.2384790141005952, + "grad_norm": 0.024457173191587765, + "learning_rate": 0.00038030907754575173, + "loss": 0.4905, + "step": 24395 + }, + { + "epoch": 1.2387328501986268, + "grad_norm": 0.02298993158514161, + "learning_rate": 0.0003800940040892236, + "loss": 0.5076, + "step": 24400 + }, + { + "epoch": 1.2389866862966583, + "grad_norm": 0.02407909122160034, + "learning_rate": 0.00037987895416964455, + "loss": 0.505, + "step": 24405 + }, + { + "epoch": 1.2392405223946898, + "grad_norm": 0.023168723950299244, + "learning_rate": 0.0003796639278292277, + "loss": 0.4801, + "step": 24410 + }, + { + "epoch": 1.2394943584927212, + "grad_norm": 0.022695039167580793, + "learning_rate": 0.0003794489251101817, + "loss": 0.4998, + "step": 24415 + }, + { + "epoch": 1.2397481945907527, + "grad_norm": 0.025740573541342248, + "learning_rate": 0.00037923394605471057, + "loss": 0.4891, + "step": 24420 + }, + { + "epoch": 1.2400020306887842, + "grad_norm": 0.022190743593717716, + "learning_rate": 0.00037901899070501337, + "loss": 0.5166, + "step": 24425 + }, + { + "epoch": 1.2402558667868158, + "grad_norm": 0.02273570131152975, + "learning_rate": 0.00037880405910328515, + "loss": 0.4837, + "step": 24430 + }, + { + "epoch": 1.2405097028848473, + "grad_norm": 0.02247267893276436, + "learning_rate": 0.0003785891512917157, + "loss": 0.503, + "step": 24435 + }, + { + "epoch": 1.2407635389828788, + "grad_norm": 0.027397988839106935, + "learning_rate": 0.00037837426731249035, + "loss": 0.5138, + "step": 24440 + }, + { + "epoch": 1.2410173750809101, + "grad_norm": 0.0360747255061236, + "learning_rate": 0.0003781594072077899, + "loss": 0.5247, + "step": 24445 + }, + { + "epoch": 1.2412712111789417, + "grad_norm": 0.020839949347156835, + "learning_rate": 0.00037794457101979, + "loss": 0.4719, + "step": 24450 + }, + { + "epoch": 1.2415250472769732, + "grad_norm": 0.023805826469806165, + "learning_rate": 0.00037772975879066224, + "loss": 0.528, + "step": 24455 + }, + { + "epoch": 1.2417788833750047, + "grad_norm": 0.02229363835470009, + "learning_rate": 0.00037751497056257304, + "loss": 0.5116, + "step": 24460 + }, + { + "epoch": 1.2420327194730363, + "grad_norm": 0.02478864260181994, + "learning_rate": 0.0003773002063776843, + "loss": 0.4765, + "step": 24465 + }, + { + "epoch": 1.2422865555710678, + "grad_norm": 0.026718793626477343, + "learning_rate": 0.00037708546627815317, + "loss": 0.4911, + "step": 24470 + }, + { + "epoch": 1.2425403916690994, + "grad_norm": 0.022347714227584022, + "learning_rate": 0.000376870750306132, + "loss": 0.5098, + "step": 24475 + }, + { + "epoch": 1.2427942277671309, + "grad_norm": 0.02195899584253101, + "learning_rate": 0.0003766560585037685, + "loss": 0.4948, + "step": 24480 + }, + { + "epoch": 1.2430480638651622, + "grad_norm": 0.024116784723832947, + "learning_rate": 0.0003764413909132054, + "loss": 0.4916, + "step": 24485 + }, + { + "epoch": 1.2433018999631937, + "grad_norm": 0.025348617292129703, + "learning_rate": 0.00037622674757658127, + "loss": 0.4983, + "step": 24490 + }, + { + "epoch": 1.2435557360612253, + "grad_norm": 0.028723847683448087, + "learning_rate": 0.0003760121285360293, + "loss": 0.4974, + "step": 24495 + }, + { + "epoch": 1.2438095721592568, + "grad_norm": 0.02549204124081865, + "learning_rate": 0.00037579753383367825, + "loss": 0.4797, + "step": 24500 + }, + { + "epoch": 1.2440634082572883, + "grad_norm": 0.02572550068222192, + "learning_rate": 0.0003755829635116519, + "loss": 0.4911, + "step": 24505 + }, + { + "epoch": 1.2443172443553197, + "grad_norm": 0.02151255214938373, + "learning_rate": 0.0003753684176120693, + "loss": 0.4818, + "step": 24510 + }, + { + "epoch": 1.2445710804533512, + "grad_norm": 0.03485262446331745, + "learning_rate": 0.0003751538961770448, + "loss": 0.4964, + "step": 24515 + }, + { + "epoch": 1.2448249165513827, + "grad_norm": 0.020421404595873636, + "learning_rate": 0.0003749393992486879, + "loss": 0.4928, + "step": 24520 + }, + { + "epoch": 1.2450787526494143, + "grad_norm": 0.02213502791269541, + "learning_rate": 0.0003747249268691033, + "loss": 0.5011, + "step": 24525 + }, + { + "epoch": 1.2453325887474458, + "grad_norm": 0.039753961527774426, + "learning_rate": 0.0003745104790803907, + "loss": 0.5099, + "step": 24530 + }, + { + "epoch": 1.2455864248454773, + "grad_norm": 0.020760234136978934, + "learning_rate": 0.0003742960559246453, + "loss": 0.471, + "step": 24535 + }, + { + "epoch": 1.2458402609435089, + "grad_norm": 0.027835500455752622, + "learning_rate": 0.0003740816574439572, + "loss": 0.4985, + "step": 24540 + }, + { + "epoch": 1.2460940970415404, + "grad_norm": 0.024653367801313206, + "learning_rate": 0.00037386728368041185, + "loss": 0.5224, + "step": 24545 + }, + { + "epoch": 1.2463479331395717, + "grad_norm": 0.025495982916413043, + "learning_rate": 0.00037365293467608954, + "loss": 0.5011, + "step": 24550 + }, + { + "epoch": 1.2466017692376032, + "grad_norm": 0.04123159279622747, + "learning_rate": 0.00037343861047306617, + "loss": 0.4972, + "step": 24555 + }, + { + "epoch": 1.2468556053356348, + "grad_norm": 0.021840485245885177, + "learning_rate": 0.00037322431111341245, + "loss": 0.484, + "step": 24560 + }, + { + "epoch": 1.2471094414336663, + "grad_norm": 0.02574861374734803, + "learning_rate": 0.0003730100366391942, + "loss": 0.4747, + "step": 24565 + }, + { + "epoch": 1.2473632775316978, + "grad_norm": 0.029726658779212437, + "learning_rate": 0.0003727957870924724, + "loss": 0.468, + "step": 24570 + }, + { + "epoch": 1.2476171136297294, + "grad_norm": 0.023386237965567654, + "learning_rate": 0.0003725815625153033, + "loss": 0.461, + "step": 24575 + }, + { + "epoch": 1.2478709497277607, + "grad_norm": 0.02511675497455319, + "learning_rate": 0.00037236736294973805, + "loss": 0.4921, + "step": 24580 + }, + { + "epoch": 1.2481247858257922, + "grad_norm": 0.022973041358262655, + "learning_rate": 0.00037215318843782287, + "loss": 0.4862, + "step": 24585 + }, + { + "epoch": 1.2483786219238238, + "grad_norm": 0.025350959019371976, + "learning_rate": 0.0003719390390215993, + "loss": 0.4988, + "step": 24590 + }, + { + "epoch": 1.2486324580218553, + "grad_norm": 0.022332478902111395, + "learning_rate": 0.0003717249147431037, + "loss": 0.4785, + "step": 24595 + }, + { + "epoch": 1.2488862941198868, + "grad_norm": 0.02605623024379265, + "learning_rate": 0.0003715108156443676, + "loss": 0.5072, + "step": 24600 + }, + { + "epoch": 1.2491401302179184, + "grad_norm": 0.023303658340822323, + "learning_rate": 0.0003712967417674177, + "loss": 0.5011, + "step": 24605 + }, + { + "epoch": 1.24939396631595, + "grad_norm": 0.02406384974114823, + "learning_rate": 0.0003710826931542753, + "loss": 0.512, + "step": 24610 + }, + { + "epoch": 1.2496478024139812, + "grad_norm": 0.02759010527581803, + "learning_rate": 0.0003708686698469575, + "loss": 0.4894, + "step": 24615 + }, + { + "epoch": 1.2499016385120127, + "grad_norm": 0.03146887342339219, + "learning_rate": 0.00037065467188747593, + "loss": 0.4801, + "step": 24620 + }, + { + "epoch": 1.2501554746100443, + "grad_norm": 0.030508984982450908, + "learning_rate": 0.0003704406993178371, + "loss": 0.4932, + "step": 24625 + }, + { + "epoch": 1.2504093107080758, + "grad_norm": 0.02545476005935238, + "learning_rate": 0.000370226752180043, + "loss": 0.5114, + "step": 24630 + }, + { + "epoch": 1.2506631468061074, + "grad_norm": 0.02230948334682719, + "learning_rate": 0.0003700128305160901, + "loss": 0.5132, + "step": 24635 + }, + { + "epoch": 1.2509169829041387, + "grad_norm": 0.02206263568624184, + "learning_rate": 0.00036979893436797054, + "loss": 0.5015, + "step": 24640 + }, + { + "epoch": 1.2511708190021702, + "grad_norm": 0.02398238667525352, + "learning_rate": 0.0003695850637776707, + "loss": 0.4873, + "step": 24645 + }, + { + "epoch": 1.2514246551002017, + "grad_norm": 0.02334469006495267, + "learning_rate": 0.0003693712187871725, + "loss": 0.4907, + "step": 24650 + }, + { + "epoch": 1.2516784911982333, + "grad_norm": 0.02105351477888063, + "learning_rate": 0.0003691573994384526, + "loss": 0.5158, + "step": 24655 + }, + { + "epoch": 1.2519323272962648, + "grad_norm": 0.023611719484646673, + "learning_rate": 0.00036894360577348275, + "loss": 0.4912, + "step": 24660 + }, + { + "epoch": 1.2521861633942963, + "grad_norm": 0.026957081159899983, + "learning_rate": 0.00036872983783422944, + "loss": 0.5186, + "step": 24665 + }, + { + "epoch": 1.2524399994923279, + "grad_norm": 0.022112456631055228, + "learning_rate": 0.0003685160956626542, + "loss": 0.4708, + "step": 24670 + }, + { + "epoch": 1.2526938355903594, + "grad_norm": 0.03960707777333683, + "learning_rate": 0.0003683023793007138, + "loss": 0.4818, + "step": 24675 + }, + { + "epoch": 1.252947671688391, + "grad_norm": 0.024900205832922344, + "learning_rate": 0.0003680886887903596, + "loss": 0.4882, + "step": 24680 + }, + { + "epoch": 1.2532015077864223, + "grad_norm": 0.02600602146903172, + "learning_rate": 0.0003678750241735379, + "loss": 0.482, + "step": 24685 + }, + { + "epoch": 1.2534553438844538, + "grad_norm": 0.020762831712885167, + "learning_rate": 0.00036766138549219007, + "loss": 0.4721, + "step": 24690 + }, + { + "epoch": 1.2537091799824853, + "grad_norm": 0.02365987982582443, + "learning_rate": 0.00036744777278825225, + "loss": 0.4996, + "step": 24695 + }, + { + "epoch": 1.2539630160805169, + "grad_norm": 0.021839292335870203, + "learning_rate": 0.0003672341861036557, + "loss": 0.5116, + "step": 24700 + }, + { + "epoch": 1.2542168521785484, + "grad_norm": 0.02473930369877982, + "learning_rate": 0.00036702062548032624, + "loss": 0.4747, + "step": 24705 + }, + { + "epoch": 1.2544706882765797, + "grad_norm": 0.021843985621647143, + "learning_rate": 0.00036680709096018483, + "loss": 0.4981, + "step": 24710 + }, + { + "epoch": 1.2547245243746112, + "grad_norm": 0.024828258327636316, + "learning_rate": 0.0003665935825851473, + "loss": 0.5257, + "step": 24715 + }, + { + "epoch": 1.2549783604726428, + "grad_norm": 0.02156185794599099, + "learning_rate": 0.0003663801003971241, + "loss": 0.505, + "step": 24720 + }, + { + "epoch": 1.2552321965706743, + "grad_norm": 0.02121222455282886, + "learning_rate": 0.0003661666444380209, + "loss": 0.4864, + "step": 24725 + }, + { + "epoch": 1.2554860326687058, + "grad_norm": 0.028321758337109544, + "learning_rate": 0.00036595321474973777, + "loss": 0.4814, + "step": 24730 + }, + { + "epoch": 1.2557398687667374, + "grad_norm": 0.02973621886010056, + "learning_rate": 0.0003657398113741703, + "loss": 0.4777, + "step": 24735 + }, + { + "epoch": 1.255993704864769, + "grad_norm": 0.03303744106278796, + "learning_rate": 0.0003655264343532083, + "loss": 0.4917, + "step": 24740 + }, + { + "epoch": 1.2562475409628004, + "grad_norm": 0.024275455013697354, + "learning_rate": 0.0003653130837287366, + "loss": 0.4928, + "step": 24745 + }, + { + "epoch": 1.2565013770608318, + "grad_norm": 0.021864363759526228, + "learning_rate": 0.00036509975954263486, + "loss": 0.5008, + "step": 24750 + }, + { + "epoch": 1.2567552131588633, + "grad_norm": 0.023011380155086403, + "learning_rate": 0.00036488646183677767, + "loss": 0.4985, + "step": 24755 + }, + { + "epoch": 1.2570090492568948, + "grad_norm": 0.020669161817962644, + "learning_rate": 0.00036467319065303414, + "loss": 0.5056, + "step": 24760 + }, + { + "epoch": 1.2572628853549264, + "grad_norm": 0.02725897387201023, + "learning_rate": 0.00036445994603326835, + "loss": 0.5112, + "step": 24765 + }, + { + "epoch": 1.257516721452958, + "grad_norm": 0.024508242607115118, + "learning_rate": 0.00036424672801933946, + "loss": 0.5077, + "step": 24770 + }, + { + "epoch": 1.2577705575509892, + "grad_norm": 0.025773782621984185, + "learning_rate": 0.0003640335366531007, + "loss": 0.4975, + "step": 24775 + }, + { + "epoch": 1.2580243936490207, + "grad_norm": 0.023553740478245766, + "learning_rate": 0.00036382037197640063, + "loss": 0.4949, + "step": 24780 + }, + { + "epoch": 1.2582782297470523, + "grad_norm": 0.022708195915233667, + "learning_rate": 0.00036360723403108233, + "loss": 0.5042, + "step": 24785 + }, + { + "epoch": 1.2585320658450838, + "grad_norm": 0.021576770915417216, + "learning_rate": 0.00036339412285898363, + "loss": 0.4956, + "step": 24790 + }, + { + "epoch": 1.2587859019431153, + "grad_norm": 0.024661142091304572, + "learning_rate": 0.0003631810385019376, + "loss": 0.5243, + "step": 24795 + }, + { + "epoch": 1.2590397380411469, + "grad_norm": 0.023783896612178973, + "learning_rate": 0.0003629679810017714, + "loss": 0.5104, + "step": 24800 + }, + { + "epoch": 1.2592935741391784, + "grad_norm": 0.028819292594759488, + "learning_rate": 0.0003627549504003072, + "loss": 0.4754, + "step": 24805 + }, + { + "epoch": 1.25954741023721, + "grad_norm": 0.02512338445035506, + "learning_rate": 0.00036254194673936174, + "loss": 0.4788, + "step": 24810 + }, + { + "epoch": 1.2598012463352413, + "grad_norm": 0.02170026035535352, + "learning_rate": 0.0003623289700607466, + "loss": 0.5096, + "step": 24815 + }, + { + "epoch": 1.2600550824332728, + "grad_norm": 0.026629983889606926, + "learning_rate": 0.00036211602040626815, + "loss": 0.4805, + "step": 24820 + }, + { + "epoch": 1.2603089185313043, + "grad_norm": 0.03586703668787359, + "learning_rate": 0.00036190309781772723, + "loss": 0.4917, + "step": 24825 + }, + { + "epoch": 1.2605627546293359, + "grad_norm": 0.021805993713398496, + "learning_rate": 0.00036169020233691953, + "loss": 0.4935, + "step": 24830 + }, + { + "epoch": 1.2608165907273674, + "grad_norm": 0.02066388286099757, + "learning_rate": 0.0003614773340056353, + "loss": 0.4849, + "step": 24835 + }, + { + "epoch": 1.2610704268253987, + "grad_norm": 0.02456481899175869, + "learning_rate": 0.00036126449286565966, + "loss": 0.4749, + "step": 24840 + }, + { + "epoch": 1.2613242629234303, + "grad_norm": 0.022104636358690293, + "learning_rate": 0.0003610516789587722, + "loss": 0.5236, + "step": 24845 + }, + { + "epoch": 1.2615780990214618, + "grad_norm": 0.023314937158047715, + "learning_rate": 0.000360838892326747, + "loss": 0.478, + "step": 24850 + }, + { + "epoch": 1.2618319351194933, + "grad_norm": 0.02278136876188265, + "learning_rate": 0.00036062613301135357, + "loss": 0.5083, + "step": 24855 + }, + { + "epoch": 1.2620857712175249, + "grad_norm": 0.022721574613654438, + "learning_rate": 0.00036041340105435506, + "loss": 0.5178, + "step": 24860 + }, + { + "epoch": 1.2623396073155564, + "grad_norm": 0.021038721362959267, + "learning_rate": 0.00036020069649750976, + "loss": 0.4987, + "step": 24865 + }, + { + "epoch": 1.262593443413588, + "grad_norm": 0.024677584467370603, + "learning_rate": 0.00035998801938257063, + "loss": 0.4939, + "step": 24870 + }, + { + "epoch": 1.2628472795116195, + "grad_norm": 0.021066193323710593, + "learning_rate": 0.000359775369751285, + "loss": 0.4553, + "step": 24875 + }, + { + "epoch": 1.2631011156096508, + "grad_norm": 0.020772474815759088, + "learning_rate": 0.00035956274764539504, + "loss": 0.4793, + "step": 24880 + }, + { + "epoch": 1.2633549517076823, + "grad_norm": 0.02074020180323779, + "learning_rate": 0.0003593501531066373, + "loss": 0.4897, + "step": 24885 + }, + { + "epoch": 1.2636087878057138, + "grad_norm": 0.022634506709179467, + "learning_rate": 0.00035913758617674315, + "loss": 0.4656, + "step": 24890 + }, + { + "epoch": 1.2638626239037454, + "grad_norm": 0.02442246115751448, + "learning_rate": 0.0003589250468974383, + "loss": 0.4923, + "step": 24895 + }, + { + "epoch": 1.264116460001777, + "grad_norm": 0.02182147534232203, + "learning_rate": 0.00035871253531044323, + "loss": 0.4827, + "step": 24900 + }, + { + "epoch": 1.2643702960998082, + "grad_norm": 0.03540133910293839, + "learning_rate": 0.00035850005145747287, + "loss": 0.4997, + "step": 24905 + }, + { + "epoch": 1.2646241321978398, + "grad_norm": 0.031068992552075844, + "learning_rate": 0.00035828759538023653, + "loss": 0.4974, + "step": 24910 + }, + { + "epoch": 1.2648779682958713, + "grad_norm": 0.024541408824232794, + "learning_rate": 0.00035807516712043876, + "loss": 0.521, + "step": 24915 + }, + { + "epoch": 1.2651318043939028, + "grad_norm": 0.027145012369650882, + "learning_rate": 0.00035786276671977786, + "loss": 0.4929, + "step": 24920 + }, + { + "epoch": 1.2653856404919344, + "grad_norm": 0.03763414082526663, + "learning_rate": 0.000357650394219947, + "loss": 0.4915, + "step": 24925 + }, + { + "epoch": 1.265639476589966, + "grad_norm": 0.04014144903363009, + "learning_rate": 0.0003574380496626339, + "loss": 0.4897, + "step": 24930 + }, + { + "epoch": 1.2658933126879974, + "grad_norm": 0.029764740803608244, + "learning_rate": 0.00035722573308952064, + "loss": 0.4696, + "step": 24935 + }, + { + "epoch": 1.266147148786029, + "grad_norm": 0.030962115180249043, + "learning_rate": 0.000357013444542284, + "loss": 0.5014, + "step": 24940 + }, + { + "epoch": 1.2664009848840605, + "grad_norm": 0.027834184566246624, + "learning_rate": 0.00035680118406259515, + "loss": 0.4928, + "step": 24945 + }, + { + "epoch": 1.2666548209820918, + "grad_norm": 0.021043989423317384, + "learning_rate": 0.00035658895169211966, + "loss": 0.4762, + "step": 24950 + }, + { + "epoch": 1.2669086570801233, + "grad_norm": 0.02449083248247568, + "learning_rate": 0.00035637674747251785, + "loss": 0.491, + "step": 24955 + }, + { + "epoch": 1.2671624931781549, + "grad_norm": 0.02271799388618845, + "learning_rate": 0.00035616457144544425, + "loss": 0.4848, + "step": 24960 + }, + { + "epoch": 1.2674163292761864, + "grad_norm": 0.02138761722620517, + "learning_rate": 0.0003559524236525479, + "loss": 0.478, + "step": 24965 + }, + { + "epoch": 1.267670165374218, + "grad_norm": 0.02379623979828549, + "learning_rate": 0.0003557403041354724, + "loss": 0.4819, + "step": 24970 + }, + { + "epoch": 1.2679240014722493, + "grad_norm": 0.02449724126117815, + "learning_rate": 0.0003555282129358558, + "loss": 0.4804, + "step": 24975 + }, + { + "epoch": 1.2681778375702808, + "grad_norm": 0.021883903501979027, + "learning_rate": 0.0003553161500953306, + "loss": 0.4858, + "step": 24980 + }, + { + "epoch": 1.2684316736683123, + "grad_norm": 0.0209418839992429, + "learning_rate": 0.0003551041156555236, + "loss": 0.4907, + "step": 24985 + }, + { + "epoch": 1.2686855097663439, + "grad_norm": 0.023379422469475157, + "learning_rate": 0.000354892109658056, + "loss": 0.4686, + "step": 24990 + }, + { + "epoch": 1.2689393458643754, + "grad_norm": 0.03050915873944768, + "learning_rate": 0.00035468013214454375, + "loss": 0.4872, + "step": 24995 + }, + { + "epoch": 1.269193181962407, + "grad_norm": 0.019469878265224637, + "learning_rate": 0.0003544681831565968, + "loss": 0.467, + "step": 25000 + }, + { + "epoch": 1.2694470180604385, + "grad_norm": 0.02706167777407716, + "learning_rate": 0.0003542562627358197, + "loss": 0.5358, + "step": 25005 + }, + { + "epoch": 1.26970085415847, + "grad_norm": 0.02607408126692583, + "learning_rate": 0.0003540443709238114, + "loss": 0.484, + "step": 25010 + }, + { + "epoch": 1.2699546902565013, + "grad_norm": 0.026898207511883893, + "learning_rate": 0.00035383250776216526, + "loss": 0.4652, + "step": 25015 + }, + { + "epoch": 1.2702085263545329, + "grad_norm": 0.02822647163163459, + "learning_rate": 0.00035362067329246884, + "loss": 0.5043, + "step": 25020 + }, + { + "epoch": 1.2704623624525644, + "grad_norm": 0.024912999653158442, + "learning_rate": 0.0003534088675563043, + "loss": 0.4968, + "step": 25025 + }, + { + "epoch": 1.270716198550596, + "grad_norm": 0.04104540138730313, + "learning_rate": 0.0003531970905952478, + "loss": 0.48, + "step": 25030 + }, + { + "epoch": 1.2709700346486275, + "grad_norm": 0.04967344080591417, + "learning_rate": 0.00035298534245087055, + "loss": 0.4968, + "step": 25035 + }, + { + "epoch": 1.2712238707466588, + "grad_norm": 0.032041261058398655, + "learning_rate": 0.0003527736231647374, + "loss": 0.4833, + "step": 25040 + }, + { + "epoch": 1.2714777068446903, + "grad_norm": 0.026420537505594786, + "learning_rate": 0.0003525619327784078, + "loss": 0.5085, + "step": 25045 + }, + { + "epoch": 1.2717315429427218, + "grad_norm": 0.02728321137774862, + "learning_rate": 0.00035235027133343546, + "loss": 0.4869, + "step": 25050 + }, + { + "epoch": 1.2719853790407534, + "grad_norm": 0.022824173369318995, + "learning_rate": 0.0003521386388713686, + "loss": 0.5129, + "step": 25055 + }, + { + "epoch": 1.272239215138785, + "grad_norm": 0.027880785233754136, + "learning_rate": 0.0003519270354337495, + "loss": 0.48, + "step": 25060 + }, + { + "epoch": 1.2724930512368164, + "grad_norm": 0.02194127139960059, + "learning_rate": 0.0003517154610621149, + "loss": 0.4868, + "step": 25065 + }, + { + "epoch": 1.272746887334848, + "grad_norm": 0.023243161860758137, + "learning_rate": 0.0003515039157979959, + "loss": 0.4901, + "step": 25070 + }, + { + "epoch": 1.2730007234328795, + "grad_norm": 0.0375654084513337, + "learning_rate": 0.0003512923996829176, + "loss": 0.4824, + "step": 25075 + }, + { + "epoch": 1.2732545595309108, + "grad_norm": 0.02148030589323779, + "learning_rate": 0.0003510809127583997, + "loss": 0.4985, + "step": 25080 + }, + { + "epoch": 1.2735083956289424, + "grad_norm": 0.02366581081516137, + "learning_rate": 0.0003508694550659559, + "loss": 0.4889, + "step": 25085 + }, + { + "epoch": 1.273762231726974, + "grad_norm": 0.026078232012127944, + "learning_rate": 0.00035065802664709426, + "loss": 0.5308, + "step": 25090 + }, + { + "epoch": 1.2740160678250054, + "grad_norm": 0.025296448194318342, + "learning_rate": 0.00035044662754331736, + "loss": 0.4917, + "step": 25095 + }, + { + "epoch": 1.274269903923037, + "grad_norm": 0.032298444356296446, + "learning_rate": 0.00035023525779612165, + "loss": 0.4935, + "step": 25100 + }, + { + "epoch": 1.2745237400210683, + "grad_norm": 0.02559527128726925, + "learning_rate": 0.0003500239174469979, + "loss": 0.502, + "step": 25105 + }, + { + "epoch": 1.2747775761190998, + "grad_norm": 0.024979128795998007, + "learning_rate": 0.0003498126065374313, + "loss": 0.4913, + "step": 25110 + }, + { + "epoch": 1.2750314122171313, + "grad_norm": 0.025909934373053247, + "learning_rate": 0.00034960132510890096, + "loss": 0.4648, + "step": 25115 + }, + { + "epoch": 1.2752852483151629, + "grad_norm": 0.02298436546813651, + "learning_rate": 0.0003493900732028806, + "loss": 0.5013, + "step": 25120 + }, + { + "epoch": 1.2755390844131944, + "grad_norm": 0.02056212672636294, + "learning_rate": 0.0003491788508608377, + "loss": 0.4907, + "step": 25125 + }, + { + "epoch": 1.275792920511226, + "grad_norm": 0.021393425271120866, + "learning_rate": 0.00034896765812423425, + "loss": 0.4743, + "step": 25130 + }, + { + "epoch": 1.2760467566092575, + "grad_norm": 0.02799046607865763, + "learning_rate": 0.00034875649503452626, + "loss": 0.5046, + "step": 25135 + }, + { + "epoch": 1.276300592707289, + "grad_norm": 0.021557706427491538, + "learning_rate": 0.0003485453616331641, + "loss": 0.445, + "step": 25140 + }, + { + "epoch": 1.2765544288053203, + "grad_norm": 0.035754348141408966, + "learning_rate": 0.00034833425796159214, + "loss": 0.4767, + "step": 25145 + }, + { + "epoch": 1.2768082649033519, + "grad_norm": 0.022260297154022193, + "learning_rate": 0.00034812318406124876, + "loss": 0.4998, + "step": 25150 + }, + { + "epoch": 1.2770621010013834, + "grad_norm": 0.020517704282148764, + "learning_rate": 0.0003479121399735672, + "loss": 0.4705, + "step": 25155 + }, + { + "epoch": 1.277315937099415, + "grad_norm": 0.023421817797892777, + "learning_rate": 0.00034770112573997405, + "loss": 0.486, + "step": 25160 + }, + { + "epoch": 1.2775697731974465, + "grad_norm": 0.03614632807366431, + "learning_rate": 0.0003474901414018904, + "loss": 0.5111, + "step": 25165 + }, + { + "epoch": 1.2778236092954778, + "grad_norm": 0.05816272517086025, + "learning_rate": 0.00034727918700073145, + "loss": 0.4814, + "step": 25170 + }, + { + "epoch": 1.2780774453935093, + "grad_norm": 0.025050111858038723, + "learning_rate": 0.0003470682625779065, + "loss": 0.5197, + "step": 25175 + }, + { + "epoch": 1.2783312814915408, + "grad_norm": 0.03165688938324348, + "learning_rate": 0.0003468573681748188, + "loss": 0.4827, + "step": 25180 + }, + { + "epoch": 1.2785851175895724, + "grad_norm": 0.019624471813982844, + "learning_rate": 0.00034664650383286615, + "loss": 0.4922, + "step": 25185 + }, + { + "epoch": 1.278838953687604, + "grad_norm": 0.021835974079306434, + "learning_rate": 0.00034643566959343997, + "loss": 0.5077, + "step": 25190 + }, + { + "epoch": 1.2790927897856355, + "grad_norm": 0.021070204790334, + "learning_rate": 0.0003462248654979261, + "loss": 0.4665, + "step": 25195 + }, + { + "epoch": 1.279346625883667, + "grad_norm": 0.04029683650164615, + "learning_rate": 0.0003460140915877041, + "loss": 0.4625, + "step": 25200 + }, + { + "epoch": 1.2796004619816985, + "grad_norm": 0.02203877571856326, + "learning_rate": 0.00034580334790414814, + "loss": 0.4589, + "step": 25205 + }, + { + "epoch": 1.2798542980797298, + "grad_norm": 0.03135309516087329, + "learning_rate": 0.0003455926344886259, + "loss": 0.4735, + "step": 25210 + }, + { + "epoch": 1.2801081341777614, + "grad_norm": 0.02096327734711526, + "learning_rate": 0.0003453819513824995, + "loss": 0.4874, + "step": 25215 + }, + { + "epoch": 1.280361970275793, + "grad_norm": 0.022733042961183666, + "learning_rate": 0.00034517129862712506, + "loss": 0.4795, + "step": 25220 + }, + { + "epoch": 1.2806158063738244, + "grad_norm": 0.02232352087148914, + "learning_rate": 0.00034496067626385254, + "loss": 0.4843, + "step": 25225 + }, + { + "epoch": 1.280869642471856, + "grad_norm": 0.021657090158753636, + "learning_rate": 0.000344750084334026, + "loss": 0.4864, + "step": 25230 + }, + { + "epoch": 1.2811234785698873, + "grad_norm": 0.02829516889946878, + "learning_rate": 0.00034453952287898375, + "loss": 0.4628, + "step": 25235 + }, + { + "epoch": 1.2813773146679188, + "grad_norm": 0.02189023360857141, + "learning_rate": 0.0003443289919400579, + "loss": 0.4704, + "step": 25240 + }, + { + "epoch": 1.2816311507659504, + "grad_norm": 0.02205365383036312, + "learning_rate": 0.0003441184915585746, + "loss": 0.4757, + "step": 25245 + }, + { + "epoch": 1.281884986863982, + "grad_norm": 0.03272625160720233, + "learning_rate": 0.000343908021775854, + "loss": 0.462, + "step": 25250 + }, + { + "epoch": 1.2821388229620134, + "grad_norm": 0.02110097107472652, + "learning_rate": 0.00034369758263321025, + "loss": 0.4615, + "step": 25255 + }, + { + "epoch": 1.282392659060045, + "grad_norm": 0.02123030495394618, + "learning_rate": 0.0003434871741719516, + "loss": 0.4795, + "step": 25260 + }, + { + "epoch": 1.2826464951580765, + "grad_norm": 0.02373251352589659, + "learning_rate": 0.0003432767964333802, + "loss": 0.4727, + "step": 25265 + }, + { + "epoch": 1.282900331256108, + "grad_norm": 0.03378365804202472, + "learning_rate": 0.00034306644945879174, + "loss": 0.4789, + "step": 25270 + }, + { + "epoch": 1.2831541673541396, + "grad_norm": 0.023436130695674143, + "learning_rate": 0.0003428561332894769, + "loss": 0.4851, + "step": 25275 + }, + { + "epoch": 1.2834080034521709, + "grad_norm": 0.025875198122374644, + "learning_rate": 0.0003426458479667194, + "loss": 0.4709, + "step": 25280 + }, + { + "epoch": 1.2836618395502024, + "grad_norm": 0.030081526490696734, + "learning_rate": 0.00034243559353179726, + "loss": 0.4867, + "step": 25285 + }, + { + "epoch": 1.283915675648234, + "grad_norm": 0.021009383912915302, + "learning_rate": 0.00034222537002598233, + "loss": 0.4919, + "step": 25290 + }, + { + "epoch": 1.2841695117462655, + "grad_norm": 0.020722639706099787, + "learning_rate": 0.00034201517749054037, + "loss": 0.4786, + "step": 25295 + }, + { + "epoch": 1.284423347844297, + "grad_norm": 0.02511803417211432, + "learning_rate": 0.0003418050159667313, + "loss": 0.4895, + "step": 25300 + }, + { + "epoch": 1.2846771839423283, + "grad_norm": 0.023199730249595922, + "learning_rate": 0.00034159488549580865, + "loss": 0.4975, + "step": 25305 + }, + { + "epoch": 1.2849310200403599, + "grad_norm": 0.022681672373907304, + "learning_rate": 0.00034138478611902, + "loss": 0.4618, + "step": 25310 + }, + { + "epoch": 1.2851848561383914, + "grad_norm": 0.025437230351334297, + "learning_rate": 0.0003411747178776068, + "loss": 0.6043, + "step": 25315 + }, + { + "epoch": 1.285438692236423, + "grad_norm": 0.02517031394643481, + "learning_rate": 0.00034096468081280443, + "loss": 0.4689, + "step": 25320 + }, + { + "epoch": 1.2856925283344545, + "grad_norm": 0.0255164084090977, + "learning_rate": 0.00034075467496584214, + "loss": 0.4664, + "step": 25325 + }, + { + "epoch": 1.285946364432486, + "grad_norm": 0.024254877277534864, + "learning_rate": 0.00034054470037794284, + "loss": 0.4927, + "step": 25330 + }, + { + "epoch": 1.2862002005305175, + "grad_norm": 0.02399484634796993, + "learning_rate": 0.0003403347570903238, + "loss": 0.4992, + "step": 25335 + }, + { + "epoch": 1.286454036628549, + "grad_norm": 0.022617445238938865, + "learning_rate": 0.0003401248451441957, + "loss": 0.4784, + "step": 25340 + }, + { + "epoch": 1.2867078727265804, + "grad_norm": 0.02702146861655969, + "learning_rate": 0.0003399149645807632, + "loss": 0.4879, + "step": 25345 + }, + { + "epoch": 1.286961708824612, + "grad_norm": 0.021698144351784757, + "learning_rate": 0.00033970511544122476, + "loss": 0.5057, + "step": 25350 + }, + { + "epoch": 1.2872155449226435, + "grad_norm": 0.02835164720141961, + "learning_rate": 0.0003394952977667728, + "loss": 0.4854, + "step": 25355 + }, + { + "epoch": 1.287469381020675, + "grad_norm": 0.023058090063842972, + "learning_rate": 0.0003392855115985935, + "loss": 0.5309, + "step": 25360 + }, + { + "epoch": 1.2877232171187065, + "grad_norm": 0.043240874010317586, + "learning_rate": 0.00033907575697786677, + "loss": 0.4942, + "step": 25365 + }, + { + "epoch": 1.2879770532167378, + "grad_norm": 0.030301182776510747, + "learning_rate": 0.0003388660339457664, + "loss": 0.4814, + "step": 25370 + }, + { + "epoch": 1.2882308893147694, + "grad_norm": 0.037066055185106375, + "learning_rate": 0.00033865634254345996, + "loss": 0.4831, + "step": 25375 + }, + { + "epoch": 1.288484725412801, + "grad_norm": 0.031338071085445056, + "learning_rate": 0.0003384466828121089, + "loss": 0.508, + "step": 25380 + }, + { + "epoch": 1.2887385615108324, + "grad_norm": 0.02436415302207937, + "learning_rate": 0.0003382370547928683, + "loss": 0.4708, + "step": 25385 + }, + { + "epoch": 1.288992397608864, + "grad_norm": 0.028312170689583957, + "learning_rate": 0.000338027458526887, + "loss": 0.4962, + "step": 25390 + }, + { + "epoch": 1.2892462337068955, + "grad_norm": 0.02167851043165668, + "learning_rate": 0.00033781789405530794, + "loss": 0.4667, + "step": 25395 + }, + { + "epoch": 1.289500069804927, + "grad_norm": 0.023856335129568706, + "learning_rate": 0.00033760836141926754, + "loss": 0.4736, + "step": 25400 + }, + { + "epoch": 1.2897539059029586, + "grad_norm": 0.028628411965523105, + "learning_rate": 0.000337398860659896, + "loss": 0.5157, + "step": 25405 + }, + { + "epoch": 1.2900077420009899, + "grad_norm": 0.021708828812962416, + "learning_rate": 0.0003371893918183171, + "loss": 0.4964, + "step": 25410 + }, + { + "epoch": 1.2902615780990214, + "grad_norm": 0.02198643320874413, + "learning_rate": 0.0003369799549356487, + "loss": 0.4799, + "step": 25415 + }, + { + "epoch": 1.290515414197053, + "grad_norm": 0.02453087273669778, + "learning_rate": 0.00033677055005300224, + "loss": 0.4635, + "step": 25420 + }, + { + "epoch": 1.2907692502950845, + "grad_norm": 0.026346186269380012, + "learning_rate": 0.0003365611772114827, + "loss": 0.5175, + "step": 25425 + }, + { + "epoch": 1.291023086393116, + "grad_norm": 0.029201729373243494, + "learning_rate": 0.000336351836452189, + "loss": 0.5042, + "step": 25430 + }, + { + "epoch": 1.2912769224911473, + "grad_norm": 0.020057883724721514, + "learning_rate": 0.00033614252781621374, + "loss": 0.4929, + "step": 25435 + }, + { + "epoch": 1.2915307585891789, + "grad_norm": 0.023365071609751025, + "learning_rate": 0.0003359332513446431, + "loss": 0.4701, + "step": 25440 + }, + { + "epoch": 1.2917845946872104, + "grad_norm": 0.029195703341591574, + "learning_rate": 0.000335724007078557, + "loss": 0.4679, + "step": 25445 + }, + { + "epoch": 1.292038430785242, + "grad_norm": 0.024061801809005346, + "learning_rate": 0.0003355147950590291, + "loss": 0.4735, + "step": 25450 + }, + { + "epoch": 1.2922922668832735, + "grad_norm": 0.020717453665263782, + "learning_rate": 0.00033530561532712653, + "loss": 0.5058, + "step": 25455 + }, + { + "epoch": 1.292546102981305, + "grad_norm": 0.02955683642722223, + "learning_rate": 0.00033509646792391045, + "loss": 0.4869, + "step": 25460 + }, + { + "epoch": 1.2927999390793365, + "grad_norm": 0.022595682726649052, + "learning_rate": 0.0003348873528904353, + "loss": 0.4827, + "step": 25465 + }, + { + "epoch": 1.293053775177368, + "grad_norm": 0.029808289348049386, + "learning_rate": 0.0003346782702677494, + "loss": 0.496, + "step": 25470 + }, + { + "epoch": 1.2933076112753994, + "grad_norm": 0.02274550903922533, + "learning_rate": 0.0003344692200968946, + "loss": 0.4972, + "step": 25475 + }, + { + "epoch": 1.293561447373431, + "grad_norm": 0.041851383405792156, + "learning_rate": 0.00033426020241890636, + "loss": 0.4798, + "step": 25480 + }, + { + "epoch": 1.2938152834714625, + "grad_norm": 0.033587120320584446, + "learning_rate": 0.00033405121727481384, + "loss": 0.4889, + "step": 25485 + }, + { + "epoch": 1.294069119569494, + "grad_norm": 0.037859152680054645, + "learning_rate": 0.00033384226470563983, + "loss": 0.492, + "step": 25490 + }, + { + "epoch": 1.2943229556675255, + "grad_norm": 0.028727700779199155, + "learning_rate": 0.0003336333447524006, + "loss": 0.5041, + "step": 25495 + }, + { + "epoch": 1.2945767917655568, + "grad_norm": 0.025080840121736163, + "learning_rate": 0.0003334244574561061, + "loss": 0.5099, + "step": 25500 + }, + { + "epoch": 1.2948306278635884, + "grad_norm": 0.02222522508564205, + "learning_rate": 0.0003332156028577599, + "loss": 0.5108, + "step": 25505 + }, + { + "epoch": 1.29508446396162, + "grad_norm": 0.037804332629990035, + "learning_rate": 0.00033300678099835914, + "loss": 0.472, + "step": 25510 + }, + { + "epoch": 1.2953383000596514, + "grad_norm": 0.022133420759234763, + "learning_rate": 0.00033279799191889426, + "loss": 0.4965, + "step": 25515 + }, + { + "epoch": 1.295592136157683, + "grad_norm": 0.027447234503632068, + "learning_rate": 0.00033258923566034995, + "loss": 0.4852, + "step": 25520 + }, + { + "epoch": 1.2958459722557145, + "grad_norm": 0.026292192328524953, + "learning_rate": 0.0003323805122637038, + "loss": 0.4646, + "step": 25525 + }, + { + "epoch": 1.296099808353746, + "grad_norm": 0.02694952032736674, + "learning_rate": 0.0003321718217699271, + "loss": 0.4964, + "step": 25530 + }, + { + "epoch": 1.2963536444517776, + "grad_norm": 0.0217904749033225, + "learning_rate": 0.00033196316421998495, + "loss": 0.5027, + "step": 25535 + }, + { + "epoch": 1.2966074805498091, + "grad_norm": 0.021851950907082737, + "learning_rate": 0.0003317545396548356, + "loss": 0.4986, + "step": 25540 + }, + { + "epoch": 1.2968613166478404, + "grad_norm": 0.023510352499926397, + "learning_rate": 0.00033154594811543104, + "loss": 0.4833, + "step": 25545 + }, + { + "epoch": 1.297115152745872, + "grad_norm": 0.024076269555153962, + "learning_rate": 0.00033133738964271687, + "loss": 0.4734, + "step": 25550 + }, + { + "epoch": 1.2973689888439035, + "grad_norm": 0.022988360192355953, + "learning_rate": 0.00033112886427763197, + "loss": 0.5029, + "step": 25555 + }, + { + "epoch": 1.297622824941935, + "grad_norm": 0.02164944305735075, + "learning_rate": 0.0003309203720611088, + "loss": 0.5109, + "step": 25560 + }, + { + "epoch": 1.2978766610399666, + "grad_norm": 0.038596290229903046, + "learning_rate": 0.00033071191303407345, + "loss": 0.4938, + "step": 25565 + }, + { + "epoch": 1.2981304971379979, + "grad_norm": 0.024119758300361713, + "learning_rate": 0.00033050348723744527, + "loss": 0.4897, + "step": 25570 + }, + { + "epoch": 1.2983843332360294, + "grad_norm": 0.02587506880218007, + "learning_rate": 0.00033029509471213726, + "loss": 0.505, + "step": 25575 + }, + { + "epoch": 1.298638169334061, + "grad_norm": 0.02275936565570734, + "learning_rate": 0.00033008673549905586, + "loss": 0.5145, + "step": 25580 + }, + { + "epoch": 1.2988920054320925, + "grad_norm": 0.023072141573578203, + "learning_rate": 0.000329878409639101, + "loss": 0.4862, + "step": 25585 + }, + { + "epoch": 1.299145841530124, + "grad_norm": 0.021274802226077427, + "learning_rate": 0.00032967011717316587, + "loss": 0.4965, + "step": 25590 + }, + { + "epoch": 1.2993996776281556, + "grad_norm": 0.030502737103929357, + "learning_rate": 0.00032946185814213734, + "loss": 0.5204, + "step": 25595 + }, + { + "epoch": 1.299653513726187, + "grad_norm": 0.020252746392252694, + "learning_rate": 0.00032925363258689557, + "loss": 0.479, + "step": 25600 + }, + { + "epoch": 1.2999073498242186, + "grad_norm": 0.021662085892109512, + "learning_rate": 0.0003290454405483142, + "loss": 0.4786, + "step": 25605 + }, + { + "epoch": 1.30016118592225, + "grad_norm": 0.030512365258254626, + "learning_rate": 0.00032883728206726035, + "loss": 0.4816, + "step": 25610 + }, + { + "epoch": 1.3004150220202815, + "grad_norm": 0.02419501135859568, + "learning_rate": 0.00032862915718459443, + "loss": 0.4659, + "step": 25615 + }, + { + "epoch": 1.300668858118313, + "grad_norm": 0.02205521149147181, + "learning_rate": 0.0003284210659411703, + "loss": 0.4919, + "step": 25620 + }, + { + "epoch": 1.3009226942163445, + "grad_norm": 0.03676879072186463, + "learning_rate": 0.0003282130083778352, + "loss": 0.4887, + "step": 25625 + }, + { + "epoch": 1.301176530314376, + "grad_norm": 0.02352083834323318, + "learning_rate": 0.0003280049845354299, + "loss": 0.4994, + "step": 25630 + }, + { + "epoch": 1.3014303664124074, + "grad_norm": 0.02443911005181168, + "learning_rate": 0.00032779699445478826, + "loss": 0.4826, + "step": 25635 + }, + { + "epoch": 1.301684202510439, + "grad_norm": 0.02197560170718096, + "learning_rate": 0.000327589038176738, + "loss": 0.4615, + "step": 25640 + }, + { + "epoch": 1.3019380386084705, + "grad_norm": 0.022748104689422538, + "learning_rate": 0.00032738111574209973, + "loss": 0.4801, + "step": 25645 + }, + { + "epoch": 1.302191874706502, + "grad_norm": 0.02023037758904707, + "learning_rate": 0.0003271732271916876, + "loss": 0.4661, + "step": 25650 + }, + { + "epoch": 1.3024457108045335, + "grad_norm": 0.029780248485063505, + "learning_rate": 0.0003269653725663091, + "loss": 0.5154, + "step": 25655 + }, + { + "epoch": 1.302699546902565, + "grad_norm": 0.022291598091070952, + "learning_rate": 0.000326757551906765, + "loss": 0.5173, + "step": 25660 + }, + { + "epoch": 1.3029533830005966, + "grad_norm": 0.022819230833246173, + "learning_rate": 0.00032654976525384947, + "loss": 0.4833, + "step": 25665 + }, + { + "epoch": 1.3032072190986281, + "grad_norm": 0.02715749696309478, + "learning_rate": 0.0003263420126483501, + "loss": 0.4879, + "step": 25670 + }, + { + "epoch": 1.3034610551966594, + "grad_norm": 0.023854247107745894, + "learning_rate": 0.0003261342941310476, + "loss": 0.473, + "step": 25675 + }, + { + "epoch": 1.303714891294691, + "grad_norm": 0.022155878634300286, + "learning_rate": 0.00032592660974271615, + "loss": 0.505, + "step": 25680 + }, + { + "epoch": 1.3039687273927225, + "grad_norm": 0.02398677758854186, + "learning_rate": 0.000325718959524123, + "loss": 0.5156, + "step": 25685 + }, + { + "epoch": 1.304222563490754, + "grad_norm": 0.03172998468913407, + "learning_rate": 0.000325511343516029, + "loss": 0.488, + "step": 25690 + }, + { + "epoch": 1.3044763995887856, + "grad_norm": 0.03394625530167979, + "learning_rate": 0.00032530376175918794, + "loss": 0.4604, + "step": 25695 + }, + { + "epoch": 1.304730235686817, + "grad_norm": 0.02947534576412719, + "learning_rate": 0.00032509621429434744, + "loss": 0.4747, + "step": 25700 + }, + { + "epoch": 1.3049840717848484, + "grad_norm": 0.03537345478451157, + "learning_rate": 0.0003248887011622478, + "loss": 0.4933, + "step": 25705 + }, + { + "epoch": 1.30523790788288, + "grad_norm": 0.022484576203938268, + "learning_rate": 0.00032468122240362287, + "loss": 0.5131, + "step": 25710 + }, + { + "epoch": 1.3054917439809115, + "grad_norm": 0.03394847342947618, + "learning_rate": 0.00032447377805919957, + "loss": 0.4996, + "step": 25715 + }, + { + "epoch": 1.305745580078943, + "grad_norm": 0.022258649273689368, + "learning_rate": 0.00032426636816969837, + "loss": 0.4864, + "step": 25720 + }, + { + "epoch": 1.3059994161769746, + "grad_norm": 0.028081809394511537, + "learning_rate": 0.0003240589927758327, + "loss": 0.4942, + "step": 25725 + }, + { + "epoch": 1.306253252275006, + "grad_norm": 0.02227139522287654, + "learning_rate": 0.0003238516519183093, + "loss": 0.5094, + "step": 25730 + }, + { + "epoch": 1.3065070883730376, + "grad_norm": 0.03070685559989823, + "learning_rate": 0.0003236443456378282, + "loss": 0.4944, + "step": 25735 + }, + { + "epoch": 1.306760924471069, + "grad_norm": 0.04102837340472807, + "learning_rate": 0.0003234370739750826, + "loss": 0.458, + "step": 25740 + }, + { + "epoch": 1.3070147605691005, + "grad_norm": 0.021808995007741704, + "learning_rate": 0.00032322983697075883, + "loss": 0.4717, + "step": 25745 + }, + { + "epoch": 1.307268596667132, + "grad_norm": 0.023349246400971222, + "learning_rate": 0.0003230226346655365, + "loss": 0.4855, + "step": 25750 + }, + { + "epoch": 1.3075224327651636, + "grad_norm": 0.023103164390614456, + "learning_rate": 0.0003228154671000882, + "loss": 0.4876, + "step": 25755 + }, + { + "epoch": 1.307776268863195, + "grad_norm": 0.024319382145632857, + "learning_rate": 0.0003226083343150803, + "loss": 0.4753, + "step": 25760 + }, + { + "epoch": 1.3080301049612264, + "grad_norm": 0.02350220796453745, + "learning_rate": 0.0003224012363511717, + "loss": 0.4565, + "step": 25765 + }, + { + "epoch": 1.308283941059258, + "grad_norm": 0.023084276590446533, + "learning_rate": 0.0003221941732490148, + "loss": 0.4742, + "step": 25770 + }, + { + "epoch": 1.3085377771572895, + "grad_norm": 0.023561783980489238, + "learning_rate": 0.00032198714504925487, + "loss": 0.4818, + "step": 25775 + }, + { + "epoch": 1.308791613255321, + "grad_norm": 0.024018962472135568, + "learning_rate": 0.0003217801517925307, + "loss": 0.454, + "step": 25780 + }, + { + "epoch": 1.3090454493533525, + "grad_norm": 0.02133114101515424, + "learning_rate": 0.0003215731935194739, + "loss": 0.4503, + "step": 25785 + }, + { + "epoch": 1.309299285451384, + "grad_norm": 0.026035923942215537, + "learning_rate": 0.0003213662702707094, + "loss": 0.5116, + "step": 25790 + }, + { + "epoch": 1.3095531215494156, + "grad_norm": 0.022735126112242227, + "learning_rate": 0.00032115938208685527, + "loss": 0.4965, + "step": 25795 + }, + { + "epoch": 1.3098069576474471, + "grad_norm": 0.023075574740689275, + "learning_rate": 0.0003209525290085226, + "loss": 0.4858, + "step": 25800 + }, + { + "epoch": 1.3100607937454787, + "grad_norm": 0.02157939323367077, + "learning_rate": 0.00032074571107631544, + "loss": 0.465, + "step": 25805 + }, + { + "epoch": 1.31031462984351, + "grad_norm": 0.02922405968847422, + "learning_rate": 0.0003205389283308313, + "loss": 0.4925, + "step": 25810 + }, + { + "epoch": 1.3105684659415415, + "grad_norm": 0.022879840515332046, + "learning_rate": 0.0003203321808126604, + "loss": 0.4703, + "step": 25815 + }, + { + "epoch": 1.310822302039573, + "grad_norm": 0.023144556467176263, + "learning_rate": 0.0003201254685623866, + "loss": 0.4869, + "step": 25820 + }, + { + "epoch": 1.3110761381376046, + "grad_norm": 0.02520401264527816, + "learning_rate": 0.00031991879162058623, + "loss": 0.4813, + "step": 25825 + }, + { + "epoch": 1.3113299742356361, + "grad_norm": 0.02170783835021678, + "learning_rate": 0.00031971215002782907, + "loss": 0.4802, + "step": 25830 + }, + { + "epoch": 1.3115838103336674, + "grad_norm": 0.028491353202148535, + "learning_rate": 0.00031950554382467766, + "loss": 0.4757, + "step": 25835 + }, + { + "epoch": 1.311837646431699, + "grad_norm": 0.02918056571262906, + "learning_rate": 0.000319298973051688, + "loss": 0.4895, + "step": 25840 + }, + { + "epoch": 1.3120914825297305, + "grad_norm": 0.03246670924753988, + "learning_rate": 0.00031909243774940865, + "loss": 0.5045, + "step": 25845 + }, + { + "epoch": 1.312345318627762, + "grad_norm": 0.04517489478201621, + "learning_rate": 0.0003188859379583816, + "loss": 0.49, + "step": 25850 + }, + { + "epoch": 1.3125991547257936, + "grad_norm": 0.025048916079119958, + "learning_rate": 0.0003186794737191418, + "loss": 0.4813, + "step": 25855 + }, + { + "epoch": 1.3128529908238251, + "grad_norm": 0.02702368574587235, + "learning_rate": 0.000318473045072217, + "loss": 0.4809, + "step": 25860 + }, + { + "epoch": 1.3131068269218567, + "grad_norm": 0.02451499381789933, + "learning_rate": 0.00031826665205812824, + "loss": 0.4777, + "step": 25865 + }, + { + "epoch": 1.3133606630198882, + "grad_norm": 0.027600609156731175, + "learning_rate": 0.00031806029471738933, + "loss": 0.4788, + "step": 25870 + }, + { + "epoch": 1.3136144991179195, + "grad_norm": 0.03131877820295245, + "learning_rate": 0.000317853973090507, + "loss": 0.4761, + "step": 25875 + }, + { + "epoch": 1.313868335215951, + "grad_norm": 0.027123485696514793, + "learning_rate": 0.00031764768721798163, + "loss": 0.4727, + "step": 25880 + }, + { + "epoch": 1.3141221713139826, + "grad_norm": 0.02108556651131552, + "learning_rate": 0.00031744143714030606, + "loss": 0.4948, + "step": 25885 + }, + { + "epoch": 1.314376007412014, + "grad_norm": 0.02128012590382479, + "learning_rate": 0.00031723522289796573, + "loss": 0.4942, + "step": 25890 + }, + { + "epoch": 1.3146298435100456, + "grad_norm": 0.021709980768361257, + "learning_rate": 0.00031702904453143976, + "loss": 0.4826, + "step": 25895 + }, + { + "epoch": 1.314883679608077, + "grad_norm": 0.023018801114703516, + "learning_rate": 0.0003168229020811999, + "loss": 0.4924, + "step": 25900 + }, + { + "epoch": 1.3151375157061085, + "grad_norm": 0.024999798230040154, + "learning_rate": 0.00031661679558771076, + "loss": 0.4843, + "step": 25905 + }, + { + "epoch": 1.31539135180414, + "grad_norm": 0.02466258211924113, + "learning_rate": 0.0003164107250914302, + "loss": 0.5157, + "step": 25910 + }, + { + "epoch": 1.3156451879021716, + "grad_norm": 0.03808085877482315, + "learning_rate": 0.0003162046906328087, + "loss": 0.4982, + "step": 25915 + }, + { + "epoch": 1.315899024000203, + "grad_norm": 0.022915850153156146, + "learning_rate": 0.0003159986922522899, + "loss": 0.4911, + "step": 25920 + }, + { + "epoch": 1.3161528600982346, + "grad_norm": 0.029411931313225212, + "learning_rate": 0.0003157927299903102, + "loss": 0.5142, + "step": 25925 + }, + { + "epoch": 1.3164066961962662, + "grad_norm": 0.022508290380115467, + "learning_rate": 0.0003155868038872989, + "loss": 0.4781, + "step": 25930 + }, + { + "epoch": 1.3166605322942977, + "grad_norm": 0.02367130521025921, + "learning_rate": 0.0003153809139836781, + "loss": 0.5034, + "step": 25935 + }, + { + "epoch": 1.316914368392329, + "grad_norm": 0.02370837844837251, + "learning_rate": 0.0003151750603198634, + "loss": 0.4825, + "step": 25940 + }, + { + "epoch": 1.3171682044903605, + "grad_norm": 0.022145644887770835, + "learning_rate": 0.0003149692429362627, + "loss": 0.4711, + "step": 25945 + }, + { + "epoch": 1.317422040588392, + "grad_norm": 0.030152956850519705, + "learning_rate": 0.00031476346187327684, + "loss": 0.4648, + "step": 25950 + }, + { + "epoch": 1.3176758766864236, + "grad_norm": 0.027631145783320436, + "learning_rate": 0.0003145577171712997, + "loss": 0.4992, + "step": 25955 + }, + { + "epoch": 1.3179297127844551, + "grad_norm": 0.028341925435365112, + "learning_rate": 0.00031435200887071786, + "loss": 0.4761, + "step": 25960 + }, + { + "epoch": 1.3181835488824865, + "grad_norm": 0.026884867627421965, + "learning_rate": 0.0003141463370119108, + "loss": 0.4643, + "step": 25965 + }, + { + "epoch": 1.318437384980518, + "grad_norm": 0.022874705393107205, + "learning_rate": 0.00031394070163525095, + "loss": 0.4786, + "step": 25970 + }, + { + "epoch": 1.3186912210785495, + "grad_norm": 0.027134145494403333, + "learning_rate": 0.0003137351027811035, + "loss": 0.503, + "step": 25975 + }, + { + "epoch": 1.318945057176581, + "grad_norm": 0.024479146918958583, + "learning_rate": 0.0003135295404898265, + "loss": 0.4983, + "step": 25980 + }, + { + "epoch": 1.3191988932746126, + "grad_norm": 0.02172267954854436, + "learning_rate": 0.00031332401480177073, + "loss": 0.4721, + "step": 25985 + }, + { + "epoch": 1.3194527293726441, + "grad_norm": 0.025255869240494836, + "learning_rate": 0.0003131185257572799, + "loss": 0.4665, + "step": 25990 + }, + { + "epoch": 1.3197065654706757, + "grad_norm": 0.023915003608357185, + "learning_rate": 0.0003129130733966904, + "loss": 0.5186, + "step": 25995 + }, + { + "epoch": 1.3199604015687072, + "grad_norm": 0.038739561505479345, + "learning_rate": 0.00031270765776033173, + "loss": 0.4665, + "step": 26000 + }, + { + "epoch": 1.3202142376667385, + "grad_norm": 0.02087696169338501, + "learning_rate": 0.00031250227888852576, + "loss": 0.4838, + "step": 26005 + }, + { + "epoch": 1.32046807376477, + "grad_norm": 0.024862811416681813, + "learning_rate": 0.0003122969368215874, + "loss": 0.4861, + "step": 26010 + }, + { + "epoch": 1.3207219098628016, + "grad_norm": 0.019972495995757008, + "learning_rate": 0.0003120916315998243, + "loss": 0.4681, + "step": 26015 + }, + { + "epoch": 1.3209757459608331, + "grad_norm": 0.026081632082320026, + "learning_rate": 0.0003118863632635368, + "loss": 0.4591, + "step": 26020 + }, + { + "epoch": 1.3212295820588646, + "grad_norm": 0.03877882882655485, + "learning_rate": 0.00031168113185301815, + "loss": 0.5011, + "step": 26025 + }, + { + "epoch": 1.321483418156896, + "grad_norm": 0.035565096811897436, + "learning_rate": 0.00031147593740855407, + "loss": 0.4884, + "step": 26030 + }, + { + "epoch": 1.3217372542549275, + "grad_norm": 0.032673275953771695, + "learning_rate": 0.00031127077997042336, + "loss": 0.4688, + "step": 26035 + }, + { + "epoch": 1.321991090352959, + "grad_norm": 0.02225124249672015, + "learning_rate": 0.0003110656595788973, + "loss": 0.5164, + "step": 26040 + }, + { + "epoch": 1.3222449264509906, + "grad_norm": 0.027980878312804384, + "learning_rate": 0.0003108605762742401, + "loss": 0.4872, + "step": 26045 + }, + { + "epoch": 1.322498762549022, + "grad_norm": 0.025276827695637313, + "learning_rate": 0.00031065553009670857, + "loss": 0.4903, + "step": 26050 + }, + { + "epoch": 1.3227525986470536, + "grad_norm": 0.028706939574934476, + "learning_rate": 0.00031045052108655193, + "loss": 0.5026, + "step": 26055 + }, + { + "epoch": 1.3230064347450852, + "grad_norm": 0.027941893356288494, + "learning_rate": 0.0003102455492840129, + "loss": 0.4932, + "step": 26060 + }, + { + "epoch": 1.3232602708431167, + "grad_norm": 0.026634652222519675, + "learning_rate": 0.00031004061472932634, + "loss": 0.5158, + "step": 26065 + }, + { + "epoch": 1.3235141069411482, + "grad_norm": 0.03025644762717795, + "learning_rate": 0.00030983571746271977, + "loss": 0.4733, + "step": 26070 + }, + { + "epoch": 1.3237679430391796, + "grad_norm": 0.02398881393021072, + "learning_rate": 0.0003096308575244135, + "loss": 0.4784, + "step": 26075 + }, + { + "epoch": 1.324021779137211, + "grad_norm": 0.023325566374430225, + "learning_rate": 0.00030942603495462054, + "loss": 0.501, + "step": 26080 + }, + { + "epoch": 1.3242756152352426, + "grad_norm": 0.02552948646532719, + "learning_rate": 0.0003092212497935465, + "loss": 0.4894, + "step": 26085 + }, + { + "epoch": 1.3245294513332742, + "grad_norm": 0.03612792972246537, + "learning_rate": 0.0003090165020813897, + "loss": 0.4688, + "step": 26090 + }, + { + "epoch": 1.3247832874313055, + "grad_norm": 0.028878202348219067, + "learning_rate": 0.00030881179185834114, + "loss": 0.5304, + "step": 26095 + }, + { + "epoch": 1.325037123529337, + "grad_norm": 0.020583251018650484, + "learning_rate": 0.0003086071191645844, + "loss": 0.4639, + "step": 26100 + }, + { + "epoch": 1.3252909596273685, + "grad_norm": 0.023744717612926514, + "learning_rate": 0.00030840248404029563, + "loss": 0.4727, + "step": 26105 + }, + { + "epoch": 1.3255447957254, + "grad_norm": 0.024384417847231648, + "learning_rate": 0.00030819788652564377, + "loss": 0.4774, + "step": 26110 + }, + { + "epoch": 1.3257986318234316, + "grad_norm": 0.029828723552213644, + "learning_rate": 0.00030799332666079016, + "loss": 0.4778, + "step": 26115 + }, + { + "epoch": 1.3260524679214631, + "grad_norm": 0.0277189472799926, + "learning_rate": 0.0003077888044858891, + "loss": 0.4987, + "step": 26120 + }, + { + "epoch": 1.3263063040194947, + "grad_norm": 0.021616997214167058, + "learning_rate": 0.00030758432004108723, + "loss": 0.491, + "step": 26125 + }, + { + "epoch": 1.3265601401175262, + "grad_norm": 0.03097966201237373, + "learning_rate": 0.0003073798733665237, + "loss": 0.4743, + "step": 26130 + }, + { + "epoch": 1.3268139762155577, + "grad_norm": 0.03448839917044529, + "learning_rate": 0.00030717546450233045, + "loss": 0.5102, + "step": 26135 + }, + { + "epoch": 1.327067812313589, + "grad_norm": 0.027797346027596648, + "learning_rate": 0.0003069710934886319, + "loss": 0.4805, + "step": 26140 + }, + { + "epoch": 1.3273216484116206, + "grad_norm": 0.023865639312762033, + "learning_rate": 0.0003067667603655451, + "loss": 0.4738, + "step": 26145 + }, + { + "epoch": 1.3275754845096521, + "grad_norm": 0.022994781293125738, + "learning_rate": 0.0003065624651731795, + "loss": 0.4604, + "step": 26150 + }, + { + "epoch": 1.3278293206076837, + "grad_norm": 0.02452841527289796, + "learning_rate": 0.00030635820795163737, + "loss": 0.495, + "step": 26155 + }, + { + "epoch": 1.3280831567057152, + "grad_norm": 0.025166790867288878, + "learning_rate": 0.0003061539887410133, + "loss": 0.4653, + "step": 26160 + }, + { + "epoch": 1.3283369928037465, + "grad_norm": 0.025844039701185498, + "learning_rate": 0.0003059498075813946, + "loss": 0.474, + "step": 26165 + }, + { + "epoch": 1.328590828901778, + "grad_norm": 0.022922263461315293, + "learning_rate": 0.0003057456645128609, + "loss": 0.5229, + "step": 26170 + }, + { + "epoch": 1.3288446649998096, + "grad_norm": 0.028735220280624686, + "learning_rate": 0.00030554155957548425, + "loss": 0.4895, + "step": 26175 + }, + { + "epoch": 1.3290985010978411, + "grad_norm": 0.02399502885581283, + "learning_rate": 0.00030533749280933, + "loss": 0.4898, + "step": 26180 + }, + { + "epoch": 1.3293523371958726, + "grad_norm": 0.027489214240756413, + "learning_rate": 0.0003051334642544551, + "loss": 0.4868, + "step": 26185 + }, + { + "epoch": 1.3296061732939042, + "grad_norm": 0.022807932982948913, + "learning_rate": 0.0003049294739509093, + "loss": 0.4645, + "step": 26190 + }, + { + "epoch": 1.3298600093919357, + "grad_norm": 0.02364717870718042, + "learning_rate": 0.00030472552193873506, + "loss": 0.4874, + "step": 26195 + }, + { + "epoch": 1.3301138454899673, + "grad_norm": 0.02421526146734168, + "learning_rate": 0.0003045216082579669, + "loss": 0.4753, + "step": 26200 + }, + { + "epoch": 1.3303676815879986, + "grad_norm": 0.023753864278555786, + "learning_rate": 0.0003043177329486323, + "loss": 0.4974, + "step": 26205 + }, + { + "epoch": 1.33062151768603, + "grad_norm": 0.02405794761573703, + "learning_rate": 0.0003041138960507508, + "loss": 0.5051, + "step": 26210 + }, + { + "epoch": 1.3308753537840616, + "grad_norm": 0.022815391635180565, + "learning_rate": 0.0003039100976043346, + "loss": 0.491, + "step": 26215 + }, + { + "epoch": 1.3311291898820932, + "grad_norm": 0.022343572537838263, + "learning_rate": 0.0003037063376493884, + "loss": 0.4743, + "step": 26220 + }, + { + "epoch": 1.3313830259801247, + "grad_norm": 0.022723276658171112, + "learning_rate": 0.00030350261622590926, + "loss": 0.4531, + "step": 26225 + }, + { + "epoch": 1.331636862078156, + "grad_norm": 0.023454158002978998, + "learning_rate": 0.0003032989333738865, + "loss": 0.4808, + "step": 26230 + }, + { + "epoch": 1.3318906981761875, + "grad_norm": 0.023075690824087954, + "learning_rate": 0.0003030952891333021, + "loss": 0.4672, + "step": 26235 + }, + { + "epoch": 1.332144534274219, + "grad_norm": 0.024106778177103005, + "learning_rate": 0.00030289168354413065, + "loss": 0.4942, + "step": 26240 + }, + { + "epoch": 1.3323983703722506, + "grad_norm": 0.025151670342148585, + "learning_rate": 0.00030268811664633865, + "loss": 0.4927, + "step": 26245 + }, + { + "epoch": 1.3326522064702822, + "grad_norm": 0.030957786516116338, + "learning_rate": 0.0003024845884798855, + "loss": 0.4773, + "step": 26250 + }, + { + "epoch": 1.3329060425683137, + "grad_norm": 0.025925112474195305, + "learning_rate": 0.00030228109908472247, + "loss": 0.4611, + "step": 26255 + }, + { + "epoch": 1.3331598786663452, + "grad_norm": 0.019884408833520717, + "learning_rate": 0.00030207764850079374, + "loss": 0.4644, + "step": 26260 + }, + { + "epoch": 1.3334137147643768, + "grad_norm": 0.025171919728835243, + "learning_rate": 0.00030187423676803556, + "loss": 0.4628, + "step": 26265 + }, + { + "epoch": 1.333667550862408, + "grad_norm": 0.02155136741752636, + "learning_rate": 0.00030167086392637665, + "loss": 0.4931, + "step": 26270 + }, + { + "epoch": 1.3339213869604396, + "grad_norm": 0.03331276638766642, + "learning_rate": 0.0003014675300157381, + "loss": 0.4617, + "step": 26275 + }, + { + "epoch": 1.3341752230584711, + "grad_norm": 0.026091056559288128, + "learning_rate": 0.00030126423507603327, + "loss": 0.4691, + "step": 26280 + }, + { + "epoch": 1.3344290591565027, + "grad_norm": 0.021480786402548066, + "learning_rate": 0.00030106097914716804, + "loss": 0.4798, + "step": 26285 + }, + { + "epoch": 1.3346828952545342, + "grad_norm": 0.026115011721068254, + "learning_rate": 0.0003008577622690405, + "loss": 0.4778, + "step": 26290 + }, + { + "epoch": 1.3349367313525655, + "grad_norm": 0.029188603624534938, + "learning_rate": 0.00030065458448154094, + "loss": 0.4956, + "step": 26295 + }, + { + "epoch": 1.335190567450597, + "grad_norm": 0.02343770485301329, + "learning_rate": 0.0003004514458245525, + "loss": 0.5235, + "step": 26300 + }, + { + "epoch": 1.3354444035486286, + "grad_norm": 0.023629932549987128, + "learning_rate": 0.00030024834633795005, + "loss": 0.4652, + "step": 26305 + }, + { + "epoch": 1.3356982396466601, + "grad_norm": 0.02116981769281469, + "learning_rate": 0.0003000452860616011, + "loss": 0.4936, + "step": 26310 + }, + { + "epoch": 1.3359520757446917, + "grad_norm": 0.019859957952761565, + "learning_rate": 0.00029984226503536527, + "loss": 0.4681, + "step": 26315 + }, + { + "epoch": 1.3362059118427232, + "grad_norm": 0.025886391465037177, + "learning_rate": 0.0002996392832990946, + "loss": 0.4803, + "step": 26320 + }, + { + "epoch": 1.3364597479407547, + "grad_norm": 0.02594128632014247, + "learning_rate": 0.00029943634089263355, + "loss": 0.4873, + "step": 26325 + }, + { + "epoch": 1.3367135840387863, + "grad_norm": 0.03408346447284297, + "learning_rate": 0.0002992334378558185, + "loss": 0.48, + "step": 26330 + }, + { + "epoch": 1.3369674201368176, + "grad_norm": 0.03841155039452, + "learning_rate": 0.00029903057422847834, + "loss": 0.4702, + "step": 26335 + }, + { + "epoch": 1.337221256234849, + "grad_norm": 0.022888290658989296, + "learning_rate": 0.0002988277500504343, + "loss": 0.4388, + "step": 26340 + }, + { + "epoch": 1.3374750923328806, + "grad_norm": 0.03122320066551133, + "learning_rate": 0.00029862496536149966, + "loss": 0.4969, + "step": 26345 + }, + { + "epoch": 1.3377289284309122, + "grad_norm": 0.03769099264555599, + "learning_rate": 0.00029842222020148, + "loss": 0.489, + "step": 26350 + }, + { + "epoch": 1.3379827645289437, + "grad_norm": 0.029431216873870947, + "learning_rate": 0.0002982195146101734, + "loss": 0.5069, + "step": 26355 + }, + { + "epoch": 1.338236600626975, + "grad_norm": 0.024178396636187073, + "learning_rate": 0.00029801684862736956, + "loss": 0.4598, + "step": 26360 + }, + { + "epoch": 1.3384904367250066, + "grad_norm": 0.027229237482082266, + "learning_rate": 0.0002978142222928512, + "loss": 0.4881, + "step": 26365 + }, + { + "epoch": 1.338744272823038, + "grad_norm": 0.02518085523341514, + "learning_rate": 0.0002976116356463927, + "loss": 0.4734, + "step": 26370 + }, + { + "epoch": 1.3389981089210696, + "grad_norm": 0.026449318799886906, + "learning_rate": 0.00029740908872776087, + "loss": 0.4857, + "step": 26375 + }, + { + "epoch": 1.3392519450191012, + "grad_norm": 0.02736453220212371, + "learning_rate": 0.00029720658157671447, + "loss": 0.4952, + "step": 26380 + }, + { + "epoch": 1.3395057811171327, + "grad_norm": 0.02011376143672255, + "learning_rate": 0.0002970041142330049, + "loss": 0.475, + "step": 26385 + }, + { + "epoch": 1.3397596172151642, + "grad_norm": 0.022297115519452224, + "learning_rate": 0.0002968016867363753, + "loss": 0.4881, + "step": 26390 + }, + { + "epoch": 1.3400134533131958, + "grad_norm": 0.028308862230847293, + "learning_rate": 0.00029659929912656123, + "loss": 0.4707, + "step": 26395 + }, + { + "epoch": 1.3402672894112273, + "grad_norm": 0.02418795190807018, + "learning_rate": 0.0002963969514432904, + "loss": 0.472, + "step": 26400 + }, + { + "epoch": 1.3405211255092586, + "grad_norm": 0.03161928209152755, + "learning_rate": 0.0002961946437262827, + "loss": 0.4715, + "step": 26405 + }, + { + "epoch": 1.3407749616072901, + "grad_norm": 0.03500499761857729, + "learning_rate": 0.00029599237601525, + "loss": 0.4636, + "step": 26410 + }, + { + "epoch": 1.3410287977053217, + "grad_norm": 0.03348168345879801, + "learning_rate": 0.00029579014834989653, + "loss": 0.4979, + "step": 26415 + }, + { + "epoch": 1.3412826338033532, + "grad_norm": 0.029165985289788657, + "learning_rate": 0.00029558796076991836, + "loss": 0.5169, + "step": 26420 + }, + { + "epoch": 1.3415364699013848, + "grad_norm": 0.026140837166966736, + "learning_rate": 0.00029538581331500427, + "loss": 0.4786, + "step": 26425 + }, + { + "epoch": 1.341790305999416, + "grad_norm": 0.031152081282308442, + "learning_rate": 0.0002951837060248346, + "loss": 0.4906, + "step": 26430 + }, + { + "epoch": 1.3420441420974476, + "grad_norm": 0.036089699561631966, + "learning_rate": 0.000294981638939082, + "loss": 0.4527, + "step": 26435 + }, + { + "epoch": 1.3422979781954791, + "grad_norm": 0.029379508239414574, + "learning_rate": 0.0002947796120974113, + "loss": 0.4576, + "step": 26440 + }, + { + "epoch": 1.3425518142935107, + "grad_norm": 0.026574119568228798, + "learning_rate": 0.0002945776255394793, + "loss": 0.487, + "step": 26445 + }, + { + "epoch": 1.3428056503915422, + "grad_norm": 0.02274745690514462, + "learning_rate": 0.00029437567930493493, + "loss": 0.4661, + "step": 26450 + }, + { + "epoch": 1.3430594864895737, + "grad_norm": 0.021614065812495135, + "learning_rate": 0.0002941737734334193, + "loss": 0.496, + "step": 26455 + }, + { + "epoch": 1.3433133225876053, + "grad_norm": 0.022482586999432463, + "learning_rate": 0.00029397190796456553, + "loss": 0.4844, + "step": 26460 + }, + { + "epoch": 1.3435671586856368, + "grad_norm": 0.022116336028821203, + "learning_rate": 0.00029377008293799865, + "loss": 0.52, + "step": 26465 + }, + { + "epoch": 1.3438209947836681, + "grad_norm": 0.028349507842694705, + "learning_rate": 0.00029356829839333615, + "loss": 0.4871, + "step": 26470 + }, + { + "epoch": 1.3440748308816997, + "grad_norm": 0.022089539699478508, + "learning_rate": 0.0002933665543701871, + "loss": 0.4943, + "step": 26475 + }, + { + "epoch": 1.3443286669797312, + "grad_norm": 0.026039145829479762, + "learning_rate": 0.0002931648509081529, + "loss": 0.502, + "step": 26480 + }, + { + "epoch": 1.3445825030777627, + "grad_norm": 0.028120685619677774, + "learning_rate": 0.0002929631880468271, + "loss": 0.4956, + "step": 26485 + }, + { + "epoch": 1.3448363391757943, + "grad_norm": 0.02097286192345862, + "learning_rate": 0.000292761565825795, + "loss": 0.4729, + "step": 26490 + }, + { + "epoch": 1.3450901752738256, + "grad_norm": 0.027620567755231737, + "learning_rate": 0.000292559984284634, + "loss": 0.5011, + "step": 26495 + }, + { + "epoch": 1.345344011371857, + "grad_norm": 0.022184671610469525, + "learning_rate": 0.0002923584434629136, + "loss": 0.4848, + "step": 26500 + }, + { + "epoch": 1.3455978474698886, + "grad_norm": 0.027060057039042365, + "learning_rate": 0.0002921569434001952, + "loss": 0.446, + "step": 26505 + }, + { + "epoch": 1.3458516835679202, + "grad_norm": 0.022283856953366245, + "learning_rate": 0.00029195548413603236, + "loss": 0.4637, + "step": 26510 + }, + { + "epoch": 1.3461055196659517, + "grad_norm": 0.02142821011081133, + "learning_rate": 0.0002917540657099703, + "loss": 0.4999, + "step": 26515 + }, + { + "epoch": 1.3463593557639832, + "grad_norm": 0.03140709448224681, + "learning_rate": 0.0002915526881615469, + "loss": 0.5199, + "step": 26520 + }, + { + "epoch": 1.3466131918620148, + "grad_norm": 0.02193762571699132, + "learning_rate": 0.000291351351530291, + "loss": 0.5037, + "step": 26525 + }, + { + "epoch": 1.3468670279600463, + "grad_norm": 0.024860792301576635, + "learning_rate": 0.0002911500558557245, + "loss": 0.4918, + "step": 26530 + }, + { + "epoch": 1.3471208640580776, + "grad_norm": 0.02411891556809714, + "learning_rate": 0.0002909488011773603, + "loss": 0.48, + "step": 26535 + }, + { + "epoch": 1.3473747001561092, + "grad_norm": 0.02334451935870803, + "learning_rate": 0.000290747587534704, + "loss": 0.5067, + "step": 26540 + }, + { + "epoch": 1.3476285362541407, + "grad_norm": 0.033294686217091356, + "learning_rate": 0.00029054641496725276, + "loss": 0.533, + "step": 26545 + }, + { + "epoch": 1.3478823723521722, + "grad_norm": 0.021364746646164178, + "learning_rate": 0.00029034528351449564, + "loss": 0.4972, + "step": 26550 + }, + { + "epoch": 1.3481362084502038, + "grad_norm": 0.022844057569169195, + "learning_rate": 0.00029014419321591396, + "loss": 0.4839, + "step": 26555 + }, + { + "epoch": 1.348390044548235, + "grad_norm": 0.021739041832296423, + "learning_rate": 0.00028994314411098044, + "loss": 0.4683, + "step": 26560 + }, + { + "epoch": 1.3486438806462666, + "grad_norm": 0.022308989962987445, + "learning_rate": 0.00028974213623916037, + "loss": 0.4752, + "step": 26565 + }, + { + "epoch": 1.3488977167442981, + "grad_norm": 0.022250254963704218, + "learning_rate": 0.0002895411696399102, + "loss": 0.5067, + "step": 26570 + }, + { + "epoch": 1.3491515528423297, + "grad_norm": 0.02323169518050859, + "learning_rate": 0.000289340244352679, + "loss": 0.4824, + "step": 26575 + }, + { + "epoch": 1.3494053889403612, + "grad_norm": 0.021304787629742503, + "learning_rate": 0.00028913936041690715, + "loss": 0.4832, + "step": 26580 + }, + { + "epoch": 1.3496592250383928, + "grad_norm": 0.021154944633414424, + "learning_rate": 0.00028893851787202746, + "loss": 0.496, + "step": 26585 + }, + { + "epoch": 1.3499130611364243, + "grad_norm": 0.023742114405543396, + "learning_rate": 0.00028873771675746394, + "loss": 0.4846, + "step": 26590 + }, + { + "epoch": 1.3501668972344558, + "grad_norm": 0.02312754487239046, + "learning_rate": 0.0002885369571126333, + "loss": 0.4775, + "step": 26595 + }, + { + "epoch": 1.3504207333324871, + "grad_norm": 0.026159799755686802, + "learning_rate": 0.000288336238976943, + "loss": 0.5369, + "step": 26600 + }, + { + "epoch": 1.3506745694305187, + "grad_norm": 0.022936903925025648, + "learning_rate": 0.00028813556238979377, + "loss": 0.5077, + "step": 26605 + }, + { + "epoch": 1.3509284055285502, + "grad_norm": 0.023814084119461076, + "learning_rate": 0.000287934927390577, + "loss": 0.4698, + "step": 26610 + }, + { + "epoch": 1.3511822416265817, + "grad_norm": 0.02246864872428122, + "learning_rate": 0.0002877343340186765, + "loss": 0.482, + "step": 26615 + }, + { + "epoch": 1.3514360777246133, + "grad_norm": 0.02288321097342424, + "learning_rate": 0.0002875337823134675, + "loss": 0.5256, + "step": 26620 + }, + { + "epoch": 1.3516899138226446, + "grad_norm": 0.0224047370327657, + "learning_rate": 0.0002873332723143177, + "loss": 0.5193, + "step": 26625 + }, + { + "epoch": 1.3519437499206761, + "grad_norm": 0.025203167907080704, + "learning_rate": 0.00028713280406058575, + "loss": 0.484, + "step": 26630 + }, + { + "epoch": 1.3521975860187077, + "grad_norm": 0.020720560042865592, + "learning_rate": 0.00028693237759162295, + "loss": 0.4852, + "step": 26635 + }, + { + "epoch": 1.3524514221167392, + "grad_norm": 0.021481598559433032, + "learning_rate": 0.0002867319929467717, + "loss": 0.5115, + "step": 26640 + }, + { + "epoch": 1.3527052582147707, + "grad_norm": 0.022428252435808805, + "learning_rate": 0.0002865316501653669, + "loss": 0.5024, + "step": 26645 + }, + { + "epoch": 1.3529590943128023, + "grad_norm": 0.02233749023643697, + "learning_rate": 0.0002863313492867344, + "loss": 0.476, + "step": 26650 + }, + { + "epoch": 1.3532129304108338, + "grad_norm": 0.03464327071026982, + "learning_rate": 0.0002861310903501926, + "loss": 0.5206, + "step": 26655 + }, + { + "epoch": 1.3534667665088653, + "grad_norm": 0.02542767368623182, + "learning_rate": 0.0002859308733950511, + "loss": 0.4785, + "step": 26660 + }, + { + "epoch": 1.3537206026068969, + "grad_norm": 0.021672839236436318, + "learning_rate": 0.0002857306984606115, + "loss": 0.4777, + "step": 26665 + }, + { + "epoch": 1.3539744387049282, + "grad_norm": 0.021627780540600057, + "learning_rate": 0.0002855305655861675, + "loss": 0.4673, + "step": 26670 + }, + { + "epoch": 1.3542282748029597, + "grad_norm": 0.02290542274140012, + "learning_rate": 0.0002853304748110037, + "loss": 0.4879, + "step": 26675 + }, + { + "epoch": 1.3544821109009912, + "grad_norm": 0.024948536996439105, + "learning_rate": 0.00028513042617439734, + "loss": 0.4777, + "step": 26680 + }, + { + "epoch": 1.3547359469990228, + "grad_norm": 0.029359162803752524, + "learning_rate": 0.0002849304197156166, + "loss": 0.5148, + "step": 26685 + }, + { + "epoch": 1.3549897830970543, + "grad_norm": 0.023302687409291374, + "learning_rate": 0.00028473045547392205, + "loss": 0.482, + "step": 26690 + }, + { + "epoch": 1.3552436191950856, + "grad_norm": 0.025299148598432575, + "learning_rate": 0.0002845305334885654, + "loss": 0.51, + "step": 26695 + }, + { + "epoch": 1.3554974552931172, + "grad_norm": 0.02516023607604002, + "learning_rate": 0.0002843306537987906, + "loss": 0.4369, + "step": 26700 + }, + { + "epoch": 1.3557512913911487, + "grad_norm": 0.022672487670563547, + "learning_rate": 0.00028413081644383285, + "loss": 0.4883, + "step": 26705 + }, + { + "epoch": 1.3560051274891802, + "grad_norm": 0.02464631035571871, + "learning_rate": 0.0002839310214629194, + "loss": 0.5134, + "step": 26710 + }, + { + "epoch": 1.3562589635872118, + "grad_norm": 0.021789322301810896, + "learning_rate": 0.00028373126889526875, + "loss": 0.4862, + "step": 26715 + }, + { + "epoch": 1.3565127996852433, + "grad_norm": 0.021183030059470056, + "learning_rate": 0.0002835315587800914, + "loss": 0.4785, + "step": 26720 + }, + { + "epoch": 1.3567666357832748, + "grad_norm": 0.02477154579365883, + "learning_rate": 0.00028333189115658966, + "loss": 0.4824, + "step": 26725 + }, + { + "epoch": 1.3570204718813064, + "grad_norm": 0.023028212328728488, + "learning_rate": 0.0002831322660639573, + "loss": 0.5141, + "step": 26730 + }, + { + "epoch": 1.3572743079793377, + "grad_norm": 0.03275446295412662, + "learning_rate": 0.0002829326835413794, + "loss": 0.4805, + "step": 26735 + }, + { + "epoch": 1.3575281440773692, + "grad_norm": 0.02501122701564825, + "learning_rate": 0.00028273314362803337, + "loss": 0.4855, + "step": 26740 + }, + { + "epoch": 1.3577819801754007, + "grad_norm": 0.02328137101092054, + "learning_rate": 0.0002825336463630875, + "loss": 0.4999, + "step": 26745 + }, + { + "epoch": 1.3580358162734323, + "grad_norm": 0.026605008466514972, + "learning_rate": 0.0002823341917857027, + "loss": 0.4587, + "step": 26750 + }, + { + "epoch": 1.3582896523714638, + "grad_norm": 0.023150456810902106, + "learning_rate": 0.0002821347799350302, + "loss": 0.4723, + "step": 26755 + }, + { + "epoch": 1.3585434884694951, + "grad_norm": 0.02251804410714866, + "learning_rate": 0.00028193541085021423, + "loss": 0.501, + "step": 26760 + }, + { + "epoch": 1.3587973245675267, + "grad_norm": 0.029335610284536826, + "learning_rate": 0.00028173608457038936, + "loss": 0.5047, + "step": 26765 + }, + { + "epoch": 1.3590511606655582, + "grad_norm": 0.022188907618175026, + "learning_rate": 0.0002815368011346828, + "loss": 0.4884, + "step": 26770 + }, + { + "epoch": 1.3593049967635897, + "grad_norm": 0.021239459035322466, + "learning_rate": 0.00028133756058221253, + "loss": 0.4655, + "step": 26775 + }, + { + "epoch": 1.3595588328616213, + "grad_norm": 0.022946805788639644, + "learning_rate": 0.0002811383629520887, + "loss": 0.4871, + "step": 26780 + }, + { + "epoch": 1.3598126689596528, + "grad_norm": 0.023284296559063916, + "learning_rate": 0.0002809392082834129, + "loss": 0.4965, + "step": 26785 + }, + { + "epoch": 1.3600665050576843, + "grad_norm": 0.028648125257779124, + "learning_rate": 0.0002807400966152778, + "loss": 0.4915, + "step": 26790 + }, + { + "epoch": 1.3603203411557159, + "grad_norm": 0.02302269054921378, + "learning_rate": 0.0002805410279867686, + "loss": 0.459, + "step": 26795 + }, + { + "epoch": 1.3605741772537472, + "grad_norm": 0.030405240018795764, + "learning_rate": 0.0002803420024369609, + "loss": 0.4316, + "step": 26800 + }, + { + "epoch": 1.3608280133517787, + "grad_norm": 0.030800198523568283, + "learning_rate": 0.00028014302000492285, + "loss": 0.4752, + "step": 26805 + }, + { + "epoch": 1.3610818494498103, + "grad_norm": 0.02475384060074233, + "learning_rate": 0.00027994408072971346, + "loss": 0.4718, + "step": 26810 + }, + { + "epoch": 1.3613356855478418, + "grad_norm": 0.021700406170621297, + "learning_rate": 0.0002797451846503837, + "loss": 0.4708, + "step": 26815 + }, + { + "epoch": 1.3615895216458733, + "grad_norm": 0.021235969952080012, + "learning_rate": 0.00027954633180597564, + "loss": 0.495, + "step": 26820 + }, + { + "epoch": 1.3618433577439046, + "grad_norm": 0.024028737162551318, + "learning_rate": 0.00027934752223552343, + "loss": 0.4563, + "step": 26825 + }, + { + "epoch": 1.3620971938419362, + "grad_norm": 0.024465335502800833, + "learning_rate": 0.0002791487559780521, + "loss": 0.4787, + "step": 26830 + }, + { + "epoch": 1.3623510299399677, + "grad_norm": 0.022491243446900795, + "learning_rate": 0.00027895003307257867, + "loss": 0.4787, + "step": 26835 + }, + { + "epoch": 1.3626048660379992, + "grad_norm": 0.021133211476665054, + "learning_rate": 0.000278751353558111, + "loss": 0.4931, + "step": 26840 + }, + { + "epoch": 1.3628587021360308, + "grad_norm": 0.02049442999020091, + "learning_rate": 0.00027855271747364966, + "loss": 0.4676, + "step": 26845 + }, + { + "epoch": 1.3631125382340623, + "grad_norm": 0.08243543665279841, + "learning_rate": 0.00027835412485818534, + "loss": 0.4618, + "step": 26850 + }, + { + "epoch": 1.3633663743320938, + "grad_norm": 0.03544850817167854, + "learning_rate": 0.00027815557575070117, + "loss": 0.4583, + "step": 26855 + }, + { + "epoch": 1.3636202104301254, + "grad_norm": 0.020365646726838955, + "learning_rate": 0.0002779570701901709, + "loss": 0.4487, + "step": 26860 + }, + { + "epoch": 1.3638740465281567, + "grad_norm": 0.02299060460650672, + "learning_rate": 0.0002777586082155607, + "loss": 0.5215, + "step": 26865 + }, + { + "epoch": 1.3641278826261882, + "grad_norm": 0.02293172060847605, + "learning_rate": 0.00027756018986582715, + "loss": 0.4658, + "step": 26870 + }, + { + "epoch": 1.3643817187242198, + "grad_norm": 0.024224482941596973, + "learning_rate": 0.00027736181517991923, + "loss": 0.4582, + "step": 26875 + }, + { + "epoch": 1.3646355548222513, + "grad_norm": 0.02013654142332293, + "learning_rate": 0.0002771634841967767, + "loss": 0.5103, + "step": 26880 + }, + { + "epoch": 1.3648893909202828, + "grad_norm": 0.022624447257192006, + "learning_rate": 0.00027696519695533074, + "loss": 0.5027, + "step": 26885 + }, + { + "epoch": 1.3651432270183141, + "grad_norm": 0.023533731523000142, + "learning_rate": 0.00027676695349450456, + "loss": 0.4481, + "step": 26890 + }, + { + "epoch": 1.3653970631163457, + "grad_norm": 0.023179531374566843, + "learning_rate": 0.0002765687538532119, + "loss": 0.4483, + "step": 26895 + }, + { + "epoch": 1.3656508992143772, + "grad_norm": 0.024281101734940795, + "learning_rate": 0.0002763705980703586, + "loss": 0.4812, + "step": 26900 + }, + { + "epoch": 1.3659047353124087, + "grad_norm": 0.026283033797865986, + "learning_rate": 0.0002761724861848417, + "loss": 0.491, + "step": 26905 + }, + { + "epoch": 1.3661585714104403, + "grad_norm": 0.02323522667080857, + "learning_rate": 0.0002759744182355498, + "loss": 0.4649, + "step": 26910 + }, + { + "epoch": 1.3664124075084718, + "grad_norm": 0.021854167824604177, + "learning_rate": 0.00027577639426136204, + "loss": 0.5056, + "step": 26915 + }, + { + "epoch": 1.3666662436065034, + "grad_norm": 0.023141909554007044, + "learning_rate": 0.00027557841430115015, + "loss": 0.4835, + "step": 26920 + }, + { + "epoch": 1.3669200797045349, + "grad_norm": 0.025826171421759428, + "learning_rate": 0.0002753804783937762, + "loss": 0.5056, + "step": 26925 + }, + { + "epoch": 1.3671739158025664, + "grad_norm": 0.022488613024276922, + "learning_rate": 0.0002751825865780943, + "loss": 0.4655, + "step": 26930 + }, + { + "epoch": 1.3674277519005977, + "grad_norm": 0.023927541727324465, + "learning_rate": 0.0002749847388929493, + "loss": 0.4865, + "step": 26935 + }, + { + "epoch": 1.3676815879986293, + "grad_norm": 0.022655438670871445, + "learning_rate": 0.0002747869353771781, + "loss": 0.4848, + "step": 26940 + }, + { + "epoch": 1.3679354240966608, + "grad_norm": 0.022064396845660302, + "learning_rate": 0.0002745891760696082, + "loss": 0.5048, + "step": 26945 + }, + { + "epoch": 1.3681892601946923, + "grad_norm": 0.026679064333024728, + "learning_rate": 0.0002743914610090591, + "loss": 0.4827, + "step": 26950 + }, + { + "epoch": 1.3684430962927239, + "grad_norm": 0.027571108833935626, + "learning_rate": 0.0002741937902343409, + "loss": 0.4845, + "step": 26955 + }, + { + "epoch": 1.3686969323907552, + "grad_norm": 0.025110754378475025, + "learning_rate": 0.0002739961637842555, + "loss": 0.4623, + "step": 26960 + }, + { + "epoch": 1.3689507684887867, + "grad_norm": 0.0203989644652357, + "learning_rate": 0.0002737985816975963, + "loss": 0.5092, + "step": 26965 + }, + { + "epoch": 1.3692046045868183, + "grad_norm": 0.02200896596387883, + "learning_rate": 0.00027360104401314735, + "loss": 0.4924, + "step": 26970 + }, + { + "epoch": 1.3694584406848498, + "grad_norm": 0.027141315110277984, + "learning_rate": 0.0002734035507696845, + "loss": 0.4874, + "step": 26975 + }, + { + "epoch": 1.3697122767828813, + "grad_norm": 0.02848772652258403, + "learning_rate": 0.0002732061020059745, + "loss": 0.5233, + "step": 26980 + }, + { + "epoch": 1.3699661128809129, + "grad_norm": 0.022711189059726297, + "learning_rate": 0.00027300869776077574, + "loss": 0.5153, + "step": 26985 + }, + { + "epoch": 1.3702199489789444, + "grad_norm": 0.023955919803751433, + "learning_rate": 0.0002728113380728375, + "loss": 0.4977, + "step": 26990 + }, + { + "epoch": 1.370473785076976, + "grad_norm": 0.028659928908923184, + "learning_rate": 0.0002726140229809008, + "loss": 0.515, + "step": 26995 + }, + { + "epoch": 1.3707276211750072, + "grad_norm": 0.02508581003225146, + "learning_rate": 0.00027241675252369715, + "loss": 0.4582, + "step": 27000 + }, + { + "epoch": 1.3709814572730388, + "grad_norm": 0.021436214718563537, + "learning_rate": 0.0002722195267399502, + "loss": 0.4612, + "step": 27005 + }, + { + "epoch": 1.3712352933710703, + "grad_norm": 0.02201171442218941, + "learning_rate": 0.00027202234566837415, + "loss": 0.4995, + "step": 27010 + }, + { + "epoch": 1.3714891294691018, + "grad_norm": 0.02526187332817724, + "learning_rate": 0.0002718252093476748, + "loss": 0.4808, + "step": 27015 + }, + { + "epoch": 1.3717429655671334, + "grad_norm": 0.020953262767424676, + "learning_rate": 0.0002716281178165486, + "loss": 0.4999, + "step": 27020 + }, + { + "epoch": 1.3719968016651647, + "grad_norm": 0.022858740436538003, + "learning_rate": 0.00027143107111368437, + "loss": 0.5066, + "step": 27025 + }, + { + "epoch": 1.3722506377631962, + "grad_norm": 0.030228302005709733, + "learning_rate": 0.00027123406927776085, + "loss": 0.4618, + "step": 27030 + }, + { + "epoch": 1.3725044738612278, + "grad_norm": 0.021247447506756632, + "learning_rate": 0.0002710371123474488, + "loss": 0.4838, + "step": 27035 + }, + { + "epoch": 1.3727583099592593, + "grad_norm": 0.029048074402705838, + "learning_rate": 0.00027084020036140965, + "loss": 0.4537, + "step": 27040 + }, + { + "epoch": 1.3730121460572908, + "grad_norm": 0.020897232940356406, + "learning_rate": 0.00027064333335829647, + "loss": 0.4661, + "step": 27045 + }, + { + "epoch": 1.3732659821553224, + "grad_norm": 0.024458604155040357, + "learning_rate": 0.00027044651137675304, + "loss": 0.4854, + "step": 27050 + }, + { + "epoch": 1.373519818253354, + "grad_norm": 0.021452148822171557, + "learning_rate": 0.00027024973445541475, + "loss": 0.4756, + "step": 27055 + }, + { + "epoch": 1.3737736543513854, + "grad_norm": 0.02638487068742648, + "learning_rate": 0.00027005300263290764, + "loss": 0.5049, + "step": 27060 + }, + { + "epoch": 1.3740274904494167, + "grad_norm": 0.030588138064835756, + "learning_rate": 0.00026985631594784966, + "loss": 0.4904, + "step": 27065 + }, + { + "epoch": 1.3742813265474483, + "grad_norm": 0.021448138541180863, + "learning_rate": 0.0002696596744388488, + "loss": 0.4687, + "step": 27070 + }, + { + "epoch": 1.3745351626454798, + "grad_norm": 0.0220331732837399, + "learning_rate": 0.0002694630781445054, + "loss": 0.5052, + "step": 27075 + }, + { + "epoch": 1.3747889987435113, + "grad_norm": 0.021848289466681034, + "learning_rate": 0.0002692665271034099, + "loss": 0.4823, + "step": 27080 + }, + { + "epoch": 1.3750428348415429, + "grad_norm": 0.024123987652464134, + "learning_rate": 0.00026907002135414447, + "loss": 0.4641, + "step": 27085 + }, + { + "epoch": 1.3752966709395742, + "grad_norm": 0.020535646841589324, + "learning_rate": 0.00026887356093528237, + "loss": 0.4648, + "step": 27090 + }, + { + "epoch": 1.3755505070376057, + "grad_norm": 0.02584026244252551, + "learning_rate": 0.00026867714588538747, + "loss": 0.5047, + "step": 27095 + }, + { + "epoch": 1.3758043431356373, + "grad_norm": 0.02367328497332991, + "learning_rate": 0.00026848077624301537, + "loss": 0.4565, + "step": 27100 + }, + { + "epoch": 1.3760581792336688, + "grad_norm": 0.031745274615003184, + "learning_rate": 0.00026828445204671216, + "loss": 0.4953, + "step": 27105 + }, + { + "epoch": 1.3763120153317003, + "grad_norm": 0.029581043724073622, + "learning_rate": 0.0002680881733350156, + "loss": 0.4612, + "step": 27110 + }, + { + "epoch": 1.3765658514297319, + "grad_norm": 0.025514285969248228, + "learning_rate": 0.0002678919401464539, + "loss": 0.4811, + "step": 27115 + }, + { + "epoch": 1.3768196875277634, + "grad_norm": 0.023296903129213998, + "learning_rate": 0.00026769575251954703, + "loss": 0.5023, + "step": 27120 + }, + { + "epoch": 1.377073523625795, + "grad_norm": 0.02227739230704745, + "learning_rate": 0.00026749961049280527, + "loss": 0.4709, + "step": 27125 + }, + { + "epoch": 1.3773273597238263, + "grad_norm": 0.023504231198353757, + "learning_rate": 0.0002673035141047306, + "loss": 0.4861, + "step": 27130 + }, + { + "epoch": 1.3775811958218578, + "grad_norm": 0.02229075980390175, + "learning_rate": 0.0002671074633938156, + "loss": 0.4808, + "step": 27135 + }, + { + "epoch": 1.3778350319198893, + "grad_norm": 0.024181315546383808, + "learning_rate": 0.00026691145839854405, + "loss": 0.4894, + "step": 27140 + }, + { + "epoch": 1.3780888680179209, + "grad_norm": 0.02532811062819618, + "learning_rate": 0.00026671549915739076, + "loss": 0.472, + "step": 27145 + }, + { + "epoch": 1.3783427041159524, + "grad_norm": 0.021472103926306933, + "learning_rate": 0.0002665195857088218, + "loss": 0.4636, + "step": 27150 + }, + { + "epoch": 1.3785965402139837, + "grad_norm": 0.028858062964877782, + "learning_rate": 0.0002663237180912936, + "loss": 0.4687, + "step": 27155 + }, + { + "epoch": 1.3788503763120152, + "grad_norm": 0.022973674462700357, + "learning_rate": 0.0002661278963432544, + "loss": 0.4748, + "step": 27160 + }, + { + "epoch": 1.3791042124100468, + "grad_norm": 0.024592376476516513, + "learning_rate": 0.00026593212050314265, + "loss": 0.4917, + "step": 27165 + }, + { + "epoch": 1.3793580485080783, + "grad_norm": 0.026905170435774242, + "learning_rate": 0.0002657363906093886, + "loss": 0.461, + "step": 27170 + }, + { + "epoch": 1.3796118846061098, + "grad_norm": 0.03238266587023234, + "learning_rate": 0.0002655407067004125, + "loss": 0.4955, + "step": 27175 + }, + { + "epoch": 1.3798657207041414, + "grad_norm": 0.03437683326100972, + "learning_rate": 0.00026534506881462674, + "loss": 0.4659, + "step": 27180 + }, + { + "epoch": 1.380119556802173, + "grad_norm": 0.030372754755464044, + "learning_rate": 0.0002651494769904335, + "loss": 0.4952, + "step": 27185 + }, + { + "epoch": 1.3803733929002044, + "grad_norm": 0.029874387863824136, + "learning_rate": 0.00026495393126622685, + "loss": 0.4846, + "step": 27190 + }, + { + "epoch": 1.3806272289982358, + "grad_norm": 0.0267554731125212, + "learning_rate": 0.00026475843168039117, + "loss": 0.4723, + "step": 27195 + }, + { + "epoch": 1.3808810650962673, + "grad_norm": 0.020690270070420125, + "learning_rate": 0.0002645629782713022, + "loss": 0.4799, + "step": 27200 + }, + { + "epoch": 1.3811349011942988, + "grad_norm": 0.02838109794666923, + "learning_rate": 0.00026436757107732665, + "loss": 0.4916, + "step": 27205 + }, + { + "epoch": 1.3813887372923304, + "grad_norm": 0.02358589235650878, + "learning_rate": 0.0002641722101368217, + "loss": 0.4953, + "step": 27210 + }, + { + "epoch": 1.381642573390362, + "grad_norm": 0.02842708954146116, + "learning_rate": 0.000263976895488136, + "loss": 0.5203, + "step": 27215 + }, + { + "epoch": 1.3818964094883932, + "grad_norm": 0.0220954764308833, + "learning_rate": 0.0002637816271696084, + "loss": 0.4997, + "step": 27220 + }, + { + "epoch": 1.3821502455864247, + "grad_norm": 0.031197550414260204, + "learning_rate": 0.0002635864052195696, + "loss": 0.5016, + "step": 27225 + }, + { + "epoch": 1.3824040816844563, + "grad_norm": 0.02454468885248768, + "learning_rate": 0.00026339122967634026, + "loss": 0.499, + "step": 27230 + }, + { + "epoch": 1.3826579177824878, + "grad_norm": 0.05778464299883465, + "learning_rate": 0.0002631961005782328, + "loss": 0.4341, + "step": 27235 + }, + { + "epoch": 1.3829117538805193, + "grad_norm": 0.023244866350785674, + "learning_rate": 0.00026300101796354966, + "loss": 0.4571, + "step": 27240 + }, + { + "epoch": 1.3831655899785509, + "grad_norm": 0.06533569452053184, + "learning_rate": 0.0002628059818705849, + "loss": 0.471, + "step": 27245 + }, + { + "epoch": 1.3834194260765824, + "grad_norm": 0.026721629818407002, + "learning_rate": 0.00026261099233762286, + "loss": 0.448, + "step": 27250 + }, + { + "epoch": 1.383673262174614, + "grad_norm": 0.023742554699451898, + "learning_rate": 0.0002624160494029394, + "loss": 0.4868, + "step": 27255 + }, + { + "epoch": 1.3839270982726455, + "grad_norm": 0.028301790837454648, + "learning_rate": 0.0002622211531048004, + "loss": 0.4682, + "step": 27260 + }, + { + "epoch": 1.3841809343706768, + "grad_norm": 0.025666697236186767, + "learning_rate": 0.0002620263034814632, + "loss": 0.4725, + "step": 27265 + }, + { + "epoch": 1.3844347704687083, + "grad_norm": 0.02634404140327674, + "learning_rate": 0.00026183150057117595, + "loss": 0.4929, + "step": 27270 + }, + { + "epoch": 1.3846886065667399, + "grad_norm": 0.025227814246512378, + "learning_rate": 0.0002616367444121775, + "loss": 0.4846, + "step": 27275 + }, + { + "epoch": 1.3849424426647714, + "grad_norm": 0.028927076349626928, + "learning_rate": 0.0002614420350426973, + "loss": 0.4822, + "step": 27280 + }, + { + "epoch": 1.385196278762803, + "grad_norm": 0.023294755705656967, + "learning_rate": 0.00026124737250095596, + "loss": 0.4622, + "step": 27285 + }, + { + "epoch": 1.3854501148608342, + "grad_norm": 0.023578850146889208, + "learning_rate": 0.0002610527568251647, + "loss": 0.457, + "step": 27290 + }, + { + "epoch": 1.3857039509588658, + "grad_norm": 0.021005770334938684, + "learning_rate": 0.0002608581880535258, + "loss": 0.4816, + "step": 27295 + }, + { + "epoch": 1.3859577870568973, + "grad_norm": 0.022374150226942276, + "learning_rate": 0.00026066366622423177, + "loss": 0.4843, + "step": 27300 + }, + { + "epoch": 1.3862116231549289, + "grad_norm": 0.027829807992719183, + "learning_rate": 0.0002604691913754668, + "loss": 0.4921, + "step": 27305 + }, + { + "epoch": 1.3864654592529604, + "grad_norm": 0.021996056034670426, + "learning_rate": 0.0002602747635454047, + "loss": 0.5174, + "step": 27310 + }, + { + "epoch": 1.386719295350992, + "grad_norm": 0.021135081168996664, + "learning_rate": 0.00026008038277221127, + "loss": 0.4732, + "step": 27315 + }, + { + "epoch": 1.3869731314490235, + "grad_norm": 0.040553027428796726, + "learning_rate": 0.0002598860490940419, + "loss": 0.4869, + "step": 27320 + }, + { + "epoch": 1.387226967547055, + "grad_norm": 0.02726713196600684, + "learning_rate": 0.0002596917625490438, + "loss": 0.4646, + "step": 27325 + }, + { + "epoch": 1.3874808036450863, + "grad_norm": 0.030567319881294505, + "learning_rate": 0.0002594975231753544, + "loss": 0.4909, + "step": 27330 + }, + { + "epoch": 1.3877346397431178, + "grad_norm": 0.02274111194508241, + "learning_rate": 0.00025930333101110173, + "loss": 0.476, + "step": 27335 + }, + { + "epoch": 1.3879884758411494, + "grad_norm": 0.02044291002475312, + "learning_rate": 0.0002591091860944049, + "loss": 0.5066, + "step": 27340 + }, + { + "epoch": 1.388242311939181, + "grad_norm": 0.028846129971471627, + "learning_rate": 0.00025891508846337337, + "loss": 0.4598, + "step": 27345 + }, + { + "epoch": 1.3884961480372124, + "grad_norm": 0.024004903325607403, + "learning_rate": 0.00025872103815610794, + "loss": 0.4617, + "step": 27350 + }, + { + "epoch": 1.3887499841352438, + "grad_norm": 0.02506170048370503, + "learning_rate": 0.0002585270352106992, + "loss": 0.4712, + "step": 27355 + }, + { + "epoch": 1.3890038202332753, + "grad_norm": 0.02408913428393671, + "learning_rate": 0.0002583330796652294, + "loss": 0.4632, + "step": 27360 + }, + { + "epoch": 1.3892576563313068, + "grad_norm": 0.022166652995046117, + "learning_rate": 0.0002581391715577707, + "loss": 0.4792, + "step": 27365 + }, + { + "epoch": 1.3895114924293384, + "grad_norm": 0.034847220934961225, + "learning_rate": 0.00025794531092638667, + "loss": 0.4824, + "step": 27370 + }, + { + "epoch": 1.38976532852737, + "grad_norm": 0.02419671012650832, + "learning_rate": 0.0002577514978091308, + "loss": 0.5055, + "step": 27375 + }, + { + "epoch": 1.3900191646254014, + "grad_norm": 0.02417967296068721, + "learning_rate": 0.000257557732244048, + "loss": 0.4874, + "step": 27380 + }, + { + "epoch": 1.390273000723433, + "grad_norm": 0.022712157728031215, + "learning_rate": 0.00025736401426917286, + "loss": 0.4702, + "step": 27385 + }, + { + "epoch": 1.3905268368214645, + "grad_norm": 0.027626729533074428, + "learning_rate": 0.0002571703439225322, + "loss": 0.4904, + "step": 27390 + }, + { + "epoch": 1.3907806729194958, + "grad_norm": 0.023010404248220272, + "learning_rate": 0.00025697672124214176, + "loss": 0.4923, + "step": 27395 + }, + { + "epoch": 1.3910345090175273, + "grad_norm": 0.021117714170819323, + "learning_rate": 0.00025678314626600924, + "loss": 0.4616, + "step": 27400 + }, + { + "epoch": 1.3912883451155589, + "grad_norm": 0.023323314097295, + "learning_rate": 0.00025658961903213197, + "loss": 0.4779, + "step": 27405 + }, + { + "epoch": 1.3915421812135904, + "grad_norm": 0.03386308270675858, + "learning_rate": 0.0002563961395784987, + "loss": 0.4701, + "step": 27410 + }, + { + "epoch": 1.391796017311622, + "grad_norm": 0.023072249782764793, + "learning_rate": 0.0002562027079430883, + "loss": 0.4628, + "step": 27415 + }, + { + "epoch": 1.3920498534096533, + "grad_norm": 0.026329411232554476, + "learning_rate": 0.0002560093241638707, + "loss": 0.5117, + "step": 27420 + }, + { + "epoch": 1.3923036895076848, + "grad_norm": 0.02564234053698023, + "learning_rate": 0.00025581598827880575, + "loss": 0.4713, + "step": 27425 + }, + { + "epoch": 1.3925575256057163, + "grad_norm": 0.026068964054194645, + "learning_rate": 0.0002556227003258448, + "loss": 0.4654, + "step": 27430 + }, + { + "epoch": 1.3928113617037479, + "grad_norm": 0.020125692575617058, + "learning_rate": 0.0002554294603429288, + "loss": 0.4917, + "step": 27435 + }, + { + "epoch": 1.3930651978017794, + "grad_norm": 0.02566386436456717, + "learning_rate": 0.0002552362683679903, + "loss": 0.491, + "step": 27440 + }, + { + "epoch": 1.393319033899811, + "grad_norm": 0.022596381640967816, + "learning_rate": 0.0002550431244389515, + "loss": 0.4924, + "step": 27445 + }, + { + "epoch": 1.3935728699978425, + "grad_norm": 0.05194383248908799, + "learning_rate": 0.00025485002859372574, + "loss": 0.472, + "step": 27450 + }, + { + "epoch": 1.393826706095874, + "grad_norm": 0.0222046493642593, + "learning_rate": 0.00025465698087021705, + "loss": 0.4746, + "step": 27455 + }, + { + "epoch": 1.3940805421939053, + "grad_norm": 0.02186007738670582, + "learning_rate": 0.0002544639813063193, + "loss": 0.4409, + "step": 27460 + }, + { + "epoch": 1.3943343782919368, + "grad_norm": 0.023367690449110124, + "learning_rate": 0.0002542710299399177, + "loss": 0.4573, + "step": 27465 + }, + { + "epoch": 1.3945882143899684, + "grad_norm": 0.02272755185104169, + "learning_rate": 0.00025407812680888726, + "loss": 0.4495, + "step": 27470 + }, + { + "epoch": 1.394842050488, + "grad_norm": 0.024020085751695088, + "learning_rate": 0.0002538852719510943, + "loss": 0.4779, + "step": 27475 + }, + { + "epoch": 1.3950958865860315, + "grad_norm": 0.0218924915046459, + "learning_rate": 0.00025369246540439495, + "loss": 0.4737, + "step": 27480 + }, + { + "epoch": 1.3953497226840628, + "grad_norm": 0.02276957102386517, + "learning_rate": 0.00025349970720663653, + "loss": 0.5073, + "step": 27485 + }, + { + "epoch": 1.3956035587820943, + "grad_norm": 0.02384126314470508, + "learning_rate": 0.000253306997395656, + "loss": 0.4972, + "step": 27490 + }, + { + "epoch": 1.3958573948801258, + "grad_norm": 0.03572583730853441, + "learning_rate": 0.00025311433600928184, + "loss": 0.4611, + "step": 27495 + }, + { + "epoch": 1.3961112309781574, + "grad_norm": 0.7840304155881027, + "learning_rate": 0.00025292172308533214, + "loss": 0.4962, + "step": 27500 + }, + { + "epoch": 1.396365067076189, + "grad_norm": 0.0463916296322995, + "learning_rate": 0.000252729158661616, + "loss": 0.4765, + "step": 27505 + }, + { + "epoch": 1.3966189031742204, + "grad_norm": 0.05568946088292396, + "learning_rate": 0.0002525366427759329, + "loss": 0.4964, + "step": 27510 + }, + { + "epoch": 1.396872739272252, + "grad_norm": 0.023057160898420713, + "learning_rate": 0.00025234417546607293, + "loss": 0.4928, + "step": 27515 + }, + { + "epoch": 1.3971265753702835, + "grad_norm": 0.029288129095807625, + "learning_rate": 0.000252151756769816, + "loss": 0.4895, + "step": 27520 + }, + { + "epoch": 1.397380411468315, + "grad_norm": 0.03340361260423104, + "learning_rate": 0.00025195938672493344, + "loss": 0.479, + "step": 27525 + }, + { + "epoch": 1.3976342475663464, + "grad_norm": 0.03739192421428975, + "learning_rate": 0.0002517670653691861, + "loss": 0.4901, + "step": 27530 + }, + { + "epoch": 1.397888083664378, + "grad_norm": 0.023015539091900444, + "learning_rate": 0.0002515747927403261, + "loss": 0.457, + "step": 27535 + }, + { + "epoch": 1.3981419197624094, + "grad_norm": 0.024683773093099548, + "learning_rate": 0.00025138256887609513, + "loss": 0.4845, + "step": 27540 + }, + { + "epoch": 1.398395755860441, + "grad_norm": 0.03275767392537276, + "learning_rate": 0.0002511903938142263, + "loss": 0.4641, + "step": 27545 + }, + { + "epoch": 1.3986495919584725, + "grad_norm": 0.02455035950939294, + "learning_rate": 0.0002509982675924421, + "loss": 0.4641, + "step": 27550 + }, + { + "epoch": 1.3989034280565038, + "grad_norm": 0.021707198486694775, + "learning_rate": 0.00025080619024845643, + "loss": 0.482, + "step": 27555 + }, + { + "epoch": 1.3991572641545353, + "grad_norm": 0.022054949788400096, + "learning_rate": 0.0002506141618199727, + "loss": 0.4771, + "step": 27560 + }, + { + "epoch": 1.3994111002525669, + "grad_norm": 0.023827476345478014, + "learning_rate": 0.0002504221823446853, + "loss": 0.4608, + "step": 27565 + }, + { + "epoch": 1.3996649363505984, + "grad_norm": 0.028504264487398272, + "learning_rate": 0.00025023025186027905, + "loss": 0.4909, + "step": 27570 + }, + { + "epoch": 1.39991877244863, + "grad_norm": 0.029743187667604372, + "learning_rate": 0.0002500383704044286, + "loss": 0.4695, + "step": 27575 + }, + { + "epoch": 1.4001726085466615, + "grad_norm": 0.024573032907380145, + "learning_rate": 0.00024984653801479967, + "loss": 0.4768, + "step": 27580 + }, + { + "epoch": 1.400426444644693, + "grad_norm": 0.04851132365758049, + "learning_rate": 0.0002496547547290476, + "loss": 0.4658, + "step": 27585 + }, + { + "epoch": 1.4006802807427245, + "grad_norm": 0.03611595038168912, + "learning_rate": 0.0002494630205848189, + "loss": 0.4891, + "step": 27590 + }, + { + "epoch": 1.4009341168407559, + "grad_norm": 0.020312909657943287, + "learning_rate": 0.0002492713356197497, + "loss": 0.4731, + "step": 27595 + }, + { + "epoch": 1.4011879529387874, + "grad_norm": 0.02051178256643885, + "learning_rate": 0.0002490796998714671, + "loss": 0.4633, + "step": 27600 + }, + { + "epoch": 1.401441789036819, + "grad_norm": 0.028805213882367668, + "learning_rate": 0.0002488881133775878, + "loss": 0.4689, + "step": 27605 + }, + { + "epoch": 1.4016956251348505, + "grad_norm": 0.03456140699472926, + "learning_rate": 0.00024869657617571984, + "loss": 0.4732, + "step": 27610 + }, + { + "epoch": 1.401949461232882, + "grad_norm": 0.02567267995131493, + "learning_rate": 0.00024850508830346046, + "loss": 0.4893, + "step": 27615 + }, + { + "epoch": 1.4022032973309133, + "grad_norm": 0.027690736913673394, + "learning_rate": 0.0002483136497983983, + "loss": 0.4773, + "step": 27620 + }, + { + "epoch": 1.4024571334289448, + "grad_norm": 0.02059800948274391, + "learning_rate": 0.00024812226069811114, + "loss": 0.4533, + "step": 27625 + }, + { + "epoch": 1.4027109695269764, + "grad_norm": 0.022577872784147853, + "learning_rate": 0.00024793092104016844, + "loss": 0.4989, + "step": 27630 + }, + { + "epoch": 1.402964805625008, + "grad_norm": 0.02564370891867458, + "learning_rate": 0.00024773963086212867, + "loss": 0.483, + "step": 27635 + }, + { + "epoch": 1.4032186417230395, + "grad_norm": 0.022783404442849806, + "learning_rate": 0.0002475483902015416, + "loss": 0.4517, + "step": 27640 + }, + { + "epoch": 1.403472477821071, + "grad_norm": 0.02130850865861562, + "learning_rate": 0.00024735719909594635, + "loss": 0.5042, + "step": 27645 + }, + { + "epoch": 1.4037263139191025, + "grad_norm": 0.025642889265787623, + "learning_rate": 0.00024716605758287315, + "loss": 0.4993, + "step": 27650 + }, + { + "epoch": 1.403980150017134, + "grad_norm": 0.022945742341174302, + "learning_rate": 0.00024697496569984177, + "loss": 0.4853, + "step": 27655 + }, + { + "epoch": 1.4042339861151654, + "grad_norm": 0.022770860368356186, + "learning_rate": 0.000246783923484363, + "loss": 0.467, + "step": 27660 + }, + { + "epoch": 1.404487822213197, + "grad_norm": 0.022660091235407458, + "learning_rate": 0.0002465929309739371, + "loss": 0.4698, + "step": 27665 + }, + { + "epoch": 1.4047416583112284, + "grad_norm": 0.027042451768303048, + "learning_rate": 0.0002464019882060553, + "loss": 0.4554, + "step": 27670 + }, + { + "epoch": 1.40499549440926, + "grad_norm": 0.02246177180890777, + "learning_rate": 0.0002462110952181982, + "loss": 0.4776, + "step": 27675 + }, + { + "epoch": 1.4052493305072915, + "grad_norm": 0.022416923064172367, + "learning_rate": 0.0002460202520478378, + "loss": 0.4547, + "step": 27680 + }, + { + "epoch": 1.4055031666053228, + "grad_norm": 0.02287798343042281, + "learning_rate": 0.0002458294587324351, + "loss": 0.4598, + "step": 27685 + }, + { + "epoch": 1.4057570027033544, + "grad_norm": 0.02035412219965513, + "learning_rate": 0.0002456387153094421, + "loss": 0.4872, + "step": 27690 + }, + { + "epoch": 1.4060108388013859, + "grad_norm": 0.029570301391272168, + "learning_rate": 0.000245448021816301, + "loss": 0.4461, + "step": 27695 + }, + { + "epoch": 1.4062646748994174, + "grad_norm": 0.024728980525368392, + "learning_rate": 0.00024525737829044354, + "loss": 0.4797, + "step": 27700 + }, + { + "epoch": 1.406518510997449, + "grad_norm": 0.022333792784804983, + "learning_rate": 0.0002450667847692925, + "loss": 0.5004, + "step": 27705 + }, + { + "epoch": 1.4067723470954805, + "grad_norm": 0.02145010095522035, + "learning_rate": 0.00024487624129026017, + "loss": 0.4473, + "step": 27710 + }, + { + "epoch": 1.407026183193512, + "grad_norm": 0.02758397602988774, + "learning_rate": 0.00024468574789074946, + "loss": 0.4705, + "step": 27715 + }, + { + "epoch": 1.4072800192915436, + "grad_norm": 0.033109491790319426, + "learning_rate": 0.000244495304608153, + "loss": 0.4633, + "step": 27720 + }, + { + "epoch": 1.4075338553895749, + "grad_norm": 0.029190273291780672, + "learning_rate": 0.0002443049114798543, + "loss": 0.4585, + "step": 27725 + }, + { + "epoch": 1.4077876914876064, + "grad_norm": 0.03253325936543185, + "learning_rate": 0.00024411456854322612, + "loss": 0.4757, + "step": 27730 + }, + { + "epoch": 1.408041527585638, + "grad_norm": 0.020506343909216858, + "learning_rate": 0.0002439242758356322, + "loss": 0.4797, + "step": 27735 + }, + { + "epoch": 1.4082953636836695, + "grad_norm": 0.026935301152816532, + "learning_rate": 0.0002437340333944257, + "loss": 0.4494, + "step": 27740 + }, + { + "epoch": 1.408549199781701, + "grad_norm": 0.029552183804907328, + "learning_rate": 0.00024354384125695045, + "loss": 0.4962, + "step": 27745 + }, + { + "epoch": 1.4088030358797323, + "grad_norm": 0.027966829528624507, + "learning_rate": 0.00024335369946054027, + "loss": 0.4627, + "step": 27750 + }, + { + "epoch": 1.4090568719777639, + "grad_norm": 0.020488294841671674, + "learning_rate": 0.00024316360804251907, + "loss": 0.4783, + "step": 27755 + }, + { + "epoch": 1.4093107080757954, + "grad_norm": 0.02500272619569665, + "learning_rate": 0.0002429735670402007, + "loss": 0.4523, + "step": 27760 + }, + { + "epoch": 1.409564544173827, + "grad_norm": 0.03524132852415086, + "learning_rate": 0.00024278357649088945, + "loss": 0.4832, + "step": 27765 + }, + { + "epoch": 1.4098183802718585, + "grad_norm": 0.029454084316621682, + "learning_rate": 0.00024259363643187922, + "loss": 0.4866, + "step": 27770 + }, + { + "epoch": 1.41007221636989, + "grad_norm": 0.0230214354412747, + "learning_rate": 0.00024240374690045468, + "loss": 0.4832, + "step": 27775 + }, + { + "epoch": 1.4103260524679215, + "grad_norm": 0.022547181126946717, + "learning_rate": 0.00024221390793388977, + "loss": 0.4898, + "step": 27780 + }, + { + "epoch": 1.410579888565953, + "grad_norm": 0.023719925710689885, + "learning_rate": 0.00024202411956944937, + "loss": 0.4859, + "step": 27785 + }, + { + "epoch": 1.4108337246639846, + "grad_norm": 0.026253153450593418, + "learning_rate": 0.00024183438184438761, + "loss": 0.4943, + "step": 27790 + }, + { + "epoch": 1.411087560762016, + "grad_norm": 0.023581771288596627, + "learning_rate": 0.00024164469479594935, + "loss": 0.5134, + "step": 27795 + }, + { + "epoch": 1.4113413968600474, + "grad_norm": 0.024469131177750587, + "learning_rate": 0.00024145505846136895, + "loss": 0.4991, + "step": 27800 + }, + { + "epoch": 1.411595232958079, + "grad_norm": 0.03535425997247597, + "learning_rate": 0.0002412654728778712, + "loss": 0.4964, + "step": 27805 + }, + { + "epoch": 1.4118490690561105, + "grad_norm": 0.024175285871346007, + "learning_rate": 0.00024107593808267102, + "loss": 0.471, + "step": 27810 + }, + { + "epoch": 1.412102905154142, + "grad_norm": 0.023755148541426182, + "learning_rate": 0.00024088645411297273, + "loss": 0.4849, + "step": 27815 + }, + { + "epoch": 1.4123567412521734, + "grad_norm": 0.02516434829571276, + "learning_rate": 0.00024069702100597146, + "loss": 0.467, + "step": 27820 + }, + { + "epoch": 1.412610577350205, + "grad_norm": 0.023488420926309095, + "learning_rate": 0.00024050763879885167, + "loss": 0.482, + "step": 27825 + }, + { + "epoch": 1.4128644134482364, + "grad_norm": 0.022254767215497136, + "learning_rate": 0.00024031830752878854, + "loss": 0.5116, + "step": 27830 + }, + { + "epoch": 1.413118249546268, + "grad_norm": 0.02191194664116013, + "learning_rate": 0.00024012902723294632, + "loss": 0.4826, + "step": 27835 + }, + { + "epoch": 1.4133720856442995, + "grad_norm": 0.03718887288280671, + "learning_rate": 0.00023993979794848037, + "loss": 0.4901, + "step": 27840 + }, + { + "epoch": 1.413625921742331, + "grad_norm": 0.028432289745428645, + "learning_rate": 0.00023975061971253492, + "loss": 0.4806, + "step": 27845 + }, + { + "epoch": 1.4138797578403626, + "grad_norm": 0.023502679374965474, + "learning_rate": 0.00023956149256224512, + "loss": 0.4374, + "step": 27850 + }, + { + "epoch": 1.414133593938394, + "grad_norm": 0.02184921161134165, + "learning_rate": 0.0002393724165347354, + "loss": 0.4712, + "step": 27855 + }, + { + "epoch": 1.4143874300364254, + "grad_norm": 0.02284152025331955, + "learning_rate": 0.0002391833916671207, + "loss": 0.5026, + "step": 27860 + }, + { + "epoch": 1.414641266134457, + "grad_norm": 0.02065393654716191, + "learning_rate": 0.0002389944179965052, + "loss": 0.4614, + "step": 27865 + }, + { + "epoch": 1.4148951022324885, + "grad_norm": 0.021188395785789092, + "learning_rate": 0.00023880549555998416, + "loss": 0.4681, + "step": 27870 + }, + { + "epoch": 1.41514893833052, + "grad_norm": 0.02498643430523538, + "learning_rate": 0.00023861662439464155, + "loss": 0.5116, + "step": 27875 + }, + { + "epoch": 1.4154027744285516, + "grad_norm": 0.03642276120823365, + "learning_rate": 0.00023842780453755231, + "loss": 0.479, + "step": 27880 + }, + { + "epoch": 1.4156566105265829, + "grad_norm": 0.021274402113326943, + "learning_rate": 0.00023823903602578035, + "loss": 0.4518, + "step": 27885 + }, + { + "epoch": 1.4159104466246144, + "grad_norm": 0.02378943254532682, + "learning_rate": 0.0002380503188963804, + "loss": 0.5052, + "step": 27890 + }, + { + "epoch": 1.416164282722646, + "grad_norm": 0.026776271663452332, + "learning_rate": 0.00023786165318639635, + "loss": 0.4953, + "step": 27895 + }, + { + "epoch": 1.4164181188206775, + "grad_norm": 0.022203192350251453, + "learning_rate": 0.00023767303893286262, + "loss": 0.4893, + "step": 27900 + }, + { + "epoch": 1.416671954918709, + "grad_norm": 0.0235156458738656, + "learning_rate": 0.00023748447617280322, + "loss": 0.4461, + "step": 27905 + }, + { + "epoch": 1.4169257910167405, + "grad_norm": 0.021958486852655536, + "learning_rate": 0.00023729596494323173, + "loss": 0.4653, + "step": 27910 + }, + { + "epoch": 1.417179627114772, + "grad_norm": 0.0289039207384805, + "learning_rate": 0.00023710750528115244, + "loss": 0.4816, + "step": 27915 + }, + { + "epoch": 1.4174334632128036, + "grad_norm": 0.024505561379556843, + "learning_rate": 0.00023691909722355864, + "loss": 0.475, + "step": 27920 + }, + { + "epoch": 1.417687299310835, + "grad_norm": 0.021676083599215912, + "learning_rate": 0.00023673074080743405, + "loss": 0.4999, + "step": 27925 + }, + { + "epoch": 1.4179411354088665, + "grad_norm": 0.027958626700585656, + "learning_rate": 0.00023654243606975213, + "loss": 0.512, + "step": 27930 + }, + { + "epoch": 1.418194971506898, + "grad_norm": 0.024897726465202892, + "learning_rate": 0.0002363541830474763, + "loss": 0.5286, + "step": 27935 + }, + { + "epoch": 1.4184488076049295, + "grad_norm": 0.02113190762244004, + "learning_rate": 0.00023616598177755938, + "loss": 0.4781, + "step": 27940 + }, + { + "epoch": 1.418702643702961, + "grad_norm": 0.020658914881807294, + "learning_rate": 0.0002359778322969447, + "loss": 0.476, + "step": 27945 + }, + { + "epoch": 1.4189564798009924, + "grad_norm": 0.02016987097475399, + "learning_rate": 0.00023578973464256464, + "loss": 0.5082, + "step": 27950 + }, + { + "epoch": 1.419210315899024, + "grad_norm": 0.025221457362809504, + "learning_rate": 0.0002356016888513423, + "loss": 0.4956, + "step": 27955 + }, + { + "epoch": 1.4194641519970554, + "grad_norm": 0.027710307385371332, + "learning_rate": 0.00023541369496018967, + "loss": 0.4782, + "step": 27960 + }, + { + "epoch": 1.419717988095087, + "grad_norm": 0.019837189877263843, + "learning_rate": 0.0002352257530060094, + "loss": 0.4673, + "step": 27965 + }, + { + "epoch": 1.4199718241931185, + "grad_norm": 0.022705298855318596, + "learning_rate": 0.00023503786302569318, + "loss": 0.4746, + "step": 27970 + }, + { + "epoch": 1.42022566029115, + "grad_norm": 0.030143315594230304, + "learning_rate": 0.0002348500250561233, + "loss": 0.4531, + "step": 27975 + }, + { + "epoch": 1.4204794963891816, + "grad_norm": 0.02996052140446814, + "learning_rate": 0.00023466223913417105, + "loss": 0.4657, + "step": 27980 + }, + { + "epoch": 1.4207333324872131, + "grad_norm": 0.02310648055769472, + "learning_rate": 0.00023447450529669796, + "loss": 0.4884, + "step": 27985 + }, + { + "epoch": 1.4209871685852444, + "grad_norm": 0.029314484029678897, + "learning_rate": 0.00023428682358055553, + "loss": 0.4989, + "step": 27990 + }, + { + "epoch": 1.421241004683276, + "grad_norm": 0.0252418080632575, + "learning_rate": 0.00023409919402258433, + "loss": 0.4721, + "step": 27995 + }, + { + "epoch": 1.4214948407813075, + "grad_norm": 0.023141843733552995, + "learning_rate": 0.00023391161665961546, + "loss": 0.4946, + "step": 28000 + }, + { + "epoch": 1.421748676879339, + "grad_norm": 0.02792804961488345, + "learning_rate": 0.00023372409152846912, + "loss": 0.4681, + "step": 28005 + }, + { + "epoch": 1.4220025129773706, + "grad_norm": 0.029553149663017995, + "learning_rate": 0.00023353661866595582, + "loss": 0.5152, + "step": 28010 + }, + { + "epoch": 1.4222563490754019, + "grad_norm": 0.020892682323818024, + "learning_rate": 0.00023334919810887527, + "loss": 0.4211, + "step": 28015 + }, + { + "epoch": 1.4225101851734334, + "grad_norm": 0.02707479709867003, + "learning_rate": 0.0002331618298940176, + "loss": 0.4819, + "step": 28020 + }, + { + "epoch": 1.422764021271465, + "grad_norm": 0.02176754481264695, + "learning_rate": 0.00023297451405816173, + "loss": 0.4723, + "step": 28025 + }, + { + "epoch": 1.4230178573694965, + "grad_norm": 0.02889767627713131, + "learning_rate": 0.00023278725063807733, + "loss": 0.4705, + "step": 28030 + }, + { + "epoch": 1.423271693467528, + "grad_norm": 0.020395619422563103, + "learning_rate": 0.0002326000396705228, + "loss": 0.4619, + "step": 28035 + }, + { + "epoch": 1.4235255295655596, + "grad_norm": 0.02300164782732942, + "learning_rate": 0.0002324128811922472, + "loss": 0.4976, + "step": 28040 + }, + { + "epoch": 1.423779365663591, + "grad_norm": 0.026291959890614048, + "learning_rate": 0.00023222577523998816, + "loss": 0.473, + "step": 28045 + }, + { + "epoch": 1.4240332017616226, + "grad_norm": 0.020485251522475535, + "learning_rate": 0.00023203872185047442, + "loss": 0.4657, + "step": 28050 + }, + { + "epoch": 1.4242870378596542, + "grad_norm": 0.025828189579312467, + "learning_rate": 0.00023185172106042308, + "loss": 0.4625, + "step": 28055 + }, + { + "epoch": 1.4245408739576855, + "grad_norm": 0.02218487881694467, + "learning_rate": 0.00023166477290654185, + "loss": 0.4819, + "step": 28060 + }, + { + "epoch": 1.424794710055717, + "grad_norm": 0.0286179264415249, + "learning_rate": 0.00023147787742552734, + "loss": 0.4737, + "step": 28065 + }, + { + "epoch": 1.4250485461537485, + "grad_norm": 0.0228564657619892, + "learning_rate": 0.00023129103465406654, + "loss": 0.4672, + "step": 28070 + }, + { + "epoch": 1.42530238225178, + "grad_norm": 0.029588254076403645, + "learning_rate": 0.00023110424462883538, + "loss": 0.49, + "step": 28075 + }, + { + "epoch": 1.4255562183498114, + "grad_norm": 0.0214582359967556, + "learning_rate": 0.00023091750738650024, + "loss": 0.4618, + "step": 28080 + }, + { + "epoch": 1.425810054447843, + "grad_norm": 0.032133974034998646, + "learning_rate": 0.00023073082296371628, + "loss": 0.4364, + "step": 28085 + }, + { + "epoch": 1.4260638905458745, + "grad_norm": 0.027651903639121147, + "learning_rate": 0.0002305441913971291, + "loss": 0.4692, + "step": 28090 + }, + { + "epoch": 1.426317726643906, + "grad_norm": 0.019039201847567744, + "learning_rate": 0.0002303576127233732, + "loss": 0.4859, + "step": 28095 + }, + { + "epoch": 1.4265715627419375, + "grad_norm": 0.02386583789317677, + "learning_rate": 0.0002301710869790734, + "loss": 0.4875, + "step": 28100 + }, + { + "epoch": 1.426825398839969, + "grad_norm": 0.025025245204414376, + "learning_rate": 0.00022998461420084342, + "loss": 0.5166, + "step": 28105 + }, + { + "epoch": 1.4270792349380006, + "grad_norm": 0.023420261634063733, + "learning_rate": 0.00022979819442528715, + "loss": 0.4759, + "step": 28110 + }, + { + "epoch": 1.4273330710360321, + "grad_norm": 0.02333034761205301, + "learning_rate": 0.00022961182768899797, + "loss": 0.48, + "step": 28115 + }, + { + "epoch": 1.4275869071340637, + "grad_norm": 0.02627854902400825, + "learning_rate": 0.00022942551402855839, + "loss": 0.4807, + "step": 28120 + }, + { + "epoch": 1.427840743232095, + "grad_norm": 0.021412030460517, + "learning_rate": 0.0002292392534805412, + "loss": 0.5039, + "step": 28125 + }, + { + "epoch": 1.4280945793301265, + "grad_norm": 0.022170572716743882, + "learning_rate": 0.0002290530460815082, + "loss": 0.4968, + "step": 28130 + }, + { + "epoch": 1.428348415428158, + "grad_norm": 0.025786423164189262, + "learning_rate": 0.00022886689186801113, + "loss": 0.4907, + "step": 28135 + }, + { + "epoch": 1.4286022515261896, + "grad_norm": 0.021201177325464542, + "learning_rate": 0.00022868079087659087, + "loss": 0.4856, + "step": 28140 + }, + { + "epoch": 1.4288560876242211, + "grad_norm": 0.02309415988605629, + "learning_rate": 0.0002284947431437785, + "loss": 0.4825, + "step": 28145 + }, + { + "epoch": 1.4291099237222524, + "grad_norm": 0.024799580659098996, + "learning_rate": 0.00022830874870609385, + "loss": 0.4728, + "step": 28150 + }, + { + "epoch": 1.429363759820284, + "grad_norm": 0.02544415478890754, + "learning_rate": 0.00022812280760004718, + "loss": 0.4449, + "step": 28155 + }, + { + "epoch": 1.4296175959183155, + "grad_norm": 0.022858736494680027, + "learning_rate": 0.00022793691986213726, + "loss": 0.4719, + "step": 28160 + }, + { + "epoch": 1.429871432016347, + "grad_norm": 0.02868557828847498, + "learning_rate": 0.00022775108552885336, + "loss": 0.4756, + "step": 28165 + }, + { + "epoch": 1.4301252681143786, + "grad_norm": 0.022645438963720982, + "learning_rate": 0.00022756530463667336, + "loss": 0.4726, + "step": 28170 + }, + { + "epoch": 1.43037910421241, + "grad_norm": 0.02751085893232087, + "learning_rate": 0.00022737957722206576, + "loss": 0.4698, + "step": 28175 + }, + { + "epoch": 1.4306329403104416, + "grad_norm": 0.02151280270290485, + "learning_rate": 0.00022719390332148743, + "loss": 0.4608, + "step": 28180 + }, + { + "epoch": 1.4308867764084732, + "grad_norm": 0.02606728512274907, + "learning_rate": 0.0002270082829713856, + "loss": 0.4721, + "step": 28185 + }, + { + "epoch": 1.4311406125065045, + "grad_norm": 0.023997247097793137, + "learning_rate": 0.00022682271620819622, + "loss": 0.4877, + "step": 28190 + }, + { + "epoch": 1.431394448604536, + "grad_norm": 0.022872074241439312, + "learning_rate": 0.00022663720306834544, + "loss": 0.4929, + "step": 28195 + }, + { + "epoch": 1.4316482847025676, + "grad_norm": 0.029600629396223117, + "learning_rate": 0.00022645174358824834, + "loss": 0.4875, + "step": 28200 + }, + { + "epoch": 1.431902120800599, + "grad_norm": 0.024623593274030287, + "learning_rate": 0.00022626633780430995, + "loss": 0.4856, + "step": 28205 + }, + { + "epoch": 1.4321559568986306, + "grad_norm": 0.02466052196985095, + "learning_rate": 0.00022608098575292412, + "loss": 0.4738, + "step": 28210 + }, + { + "epoch": 1.432409792996662, + "grad_norm": 0.02014319185333175, + "learning_rate": 0.00022589568747047496, + "loss": 0.4535, + "step": 28215 + }, + { + "epoch": 1.4326636290946935, + "grad_norm": 0.07392864104661222, + "learning_rate": 0.00022571044299333522, + "loss": 0.5205, + "step": 28220 + }, + { + "epoch": 1.432917465192725, + "grad_norm": 0.02808956874894487, + "learning_rate": 0.0002255252523578678, + "loss": 0.4641, + "step": 28225 + }, + { + "epoch": 1.4331713012907565, + "grad_norm": 0.023026002461065096, + "learning_rate": 0.0002253401156004244, + "loss": 0.4719, + "step": 28230 + }, + { + "epoch": 1.433425137388788, + "grad_norm": 0.025239834221944403, + "learning_rate": 0.00022515503275734655, + "loss": 0.4812, + "step": 28235 + }, + { + "epoch": 1.4336789734868196, + "grad_norm": 0.023740320525211498, + "learning_rate": 0.0002249700038649653, + "loss": 0.4871, + "step": 28240 + }, + { + "epoch": 1.4339328095848511, + "grad_norm": 0.022974485649644116, + "learning_rate": 0.00022478502895960056, + "loss": 0.5321, + "step": 28245 + }, + { + "epoch": 1.4341866456828827, + "grad_norm": 0.038073746528805776, + "learning_rate": 0.00022460010807756232, + "loss": 0.4879, + "step": 28250 + }, + { + "epoch": 1.434440481780914, + "grad_norm": 0.022813896482576042, + "learning_rate": 0.00022441524125514924, + "loss": 0.4661, + "step": 28255 + }, + { + "epoch": 1.4346943178789455, + "grad_norm": 0.023168119706696216, + "learning_rate": 0.0002242304285286501, + "loss": 0.4958, + "step": 28260 + }, + { + "epoch": 1.434948153976977, + "grad_norm": 0.02529122063862638, + "learning_rate": 0.0002240456699343425, + "loss": 0.477, + "step": 28265 + }, + { + "epoch": 1.4352019900750086, + "grad_norm": 0.022195490060621263, + "learning_rate": 0.00022386096550849384, + "loss": 0.4525, + "step": 28270 + }, + { + "epoch": 1.4354558261730401, + "grad_norm": 0.022086437502397378, + "learning_rate": 0.00022367631528736037, + "loss": 0.4679, + "step": 28275 + }, + { + "epoch": 1.4357096622710714, + "grad_norm": 0.02864867259943996, + "learning_rate": 0.00022349171930718836, + "loss": 0.4855, + "step": 28280 + }, + { + "epoch": 1.435963498369103, + "grad_norm": 0.0224610763647068, + "learning_rate": 0.0002233071776042127, + "loss": 0.4817, + "step": 28285 + }, + { + "epoch": 1.4362173344671345, + "grad_norm": 0.02349424018618673, + "learning_rate": 0.00022312269021465826, + "loss": 0.4559, + "step": 28290 + }, + { + "epoch": 1.436471170565166, + "grad_norm": 0.02339458889206688, + "learning_rate": 0.00022293825717473891, + "loss": 0.4929, + "step": 28295 + }, + { + "epoch": 1.4367250066631976, + "grad_norm": 0.031316045398563565, + "learning_rate": 0.0002227538785206582, + "loss": 0.5114, + "step": 28300 + }, + { + "epoch": 1.4369788427612291, + "grad_norm": 0.029663272075113593, + "learning_rate": 0.0002225695542886083, + "loss": 0.4813, + "step": 28305 + }, + { + "epoch": 1.4372326788592606, + "grad_norm": 0.01994990696285228, + "learning_rate": 0.00022238528451477152, + "loss": 0.4663, + "step": 28310 + }, + { + "epoch": 1.4374865149572922, + "grad_norm": 0.02117337091557514, + "learning_rate": 0.0002222010692353188, + "loss": 0.4757, + "step": 28315 + }, + { + "epoch": 1.4377403510553235, + "grad_norm": 0.027257286084898684, + "learning_rate": 0.00022201690848641092, + "loss": 0.465, + "step": 28320 + }, + { + "epoch": 1.437994187153355, + "grad_norm": 0.02068920674069367, + "learning_rate": 0.00022183280230419746, + "loss": 0.454, + "step": 28325 + }, + { + "epoch": 1.4382480232513866, + "grad_norm": 0.026482096977822, + "learning_rate": 0.00022164875072481788, + "loss": 0.4992, + "step": 28330 + }, + { + "epoch": 1.438501859349418, + "grad_norm": 0.025250420834823727, + "learning_rate": 0.00022146475378440018, + "loss": 0.4845, + "step": 28335 + }, + { + "epoch": 1.4387556954474496, + "grad_norm": 0.023124139028515872, + "learning_rate": 0.00022128081151906248, + "loss": 0.4981, + "step": 28340 + }, + { + "epoch": 1.439009531545481, + "grad_norm": 0.0250480716309079, + "learning_rate": 0.00022109692396491128, + "loss": 0.4932, + "step": 28345 + }, + { + "epoch": 1.4392633676435125, + "grad_norm": 0.025172649290414614, + "learning_rate": 0.00022091309115804305, + "loss": 0.4734, + "step": 28350 + }, + { + "epoch": 1.439517203741544, + "grad_norm": 0.023861858060389464, + "learning_rate": 0.0002207293131345434, + "loss": 0.4958, + "step": 28355 + }, + { + "epoch": 1.4397710398395756, + "grad_norm": 0.023276629706840232, + "learning_rate": 0.00022054558993048667, + "loss": 0.4775, + "step": 28360 + }, + { + "epoch": 1.440024875937607, + "grad_norm": 0.021929215795727805, + "learning_rate": 0.00022036192158193717, + "loss": 0.5146, + "step": 28365 + }, + { + "epoch": 1.4402787120356386, + "grad_norm": 0.022629914839638336, + "learning_rate": 0.00022017830812494778, + "loss": 0.4748, + "step": 28370 + }, + { + "epoch": 1.4405325481336702, + "grad_norm": 0.024040761607839467, + "learning_rate": 0.0002199947495955612, + "loss": 0.5049, + "step": 28375 + }, + { + "epoch": 1.4407863842317017, + "grad_norm": 0.02291771186966908, + "learning_rate": 0.00021981124602980868, + "loss": 0.4945, + "step": 28380 + }, + { + "epoch": 1.4410402203297332, + "grad_norm": 0.02136166148441168, + "learning_rate": 0.00021962779746371148, + "loss": 0.4428, + "step": 28385 + }, + { + "epoch": 1.4412940564277645, + "grad_norm": 0.02612556885350365, + "learning_rate": 0.0002194444039332792, + "loss": 0.4812, + "step": 28390 + }, + { + "epoch": 1.441547892525796, + "grad_norm": 0.03403548088384833, + "learning_rate": 0.00021926106547451153, + "loss": 0.5052, + "step": 28395 + }, + { + "epoch": 1.4418017286238276, + "grad_norm": 0.02777524166007245, + "learning_rate": 0.00021907778212339646, + "loss": 0.4847, + "step": 28400 + }, + { + "epoch": 1.4420555647218591, + "grad_norm": 0.02317959016446424, + "learning_rate": 0.00021889455391591197, + "loss": 0.4857, + "step": 28405 + }, + { + "epoch": 1.4423094008198907, + "grad_norm": 0.03177895171595999, + "learning_rate": 0.00021871138088802434, + "loss": 0.4558, + "step": 28410 + }, + { + "epoch": 1.442563236917922, + "grad_norm": 0.02126964198718204, + "learning_rate": 0.00021852826307569017, + "loss": 0.4506, + "step": 28415 + }, + { + "epoch": 1.4428170730159535, + "grad_norm": 0.024740968782442203, + "learning_rate": 0.00021834520051485412, + "loss": 0.4973, + "step": 28420 + }, + { + "epoch": 1.443070909113985, + "grad_norm": 0.027489399167317248, + "learning_rate": 0.00021816219324145082, + "loss": 0.4751, + "step": 28425 + }, + { + "epoch": 1.4433247452120166, + "grad_norm": 0.025374524663172544, + "learning_rate": 0.00021797924129140323, + "loss": 0.4764, + "step": 28430 + }, + { + "epoch": 1.4435785813100481, + "grad_norm": 0.02277464171649709, + "learning_rate": 0.00021779634470062433, + "loss": 0.4912, + "step": 28435 + }, + { + "epoch": 1.4438324174080797, + "grad_norm": 0.021400550142160534, + "learning_rate": 0.0002176135035050154, + "loss": 0.4559, + "step": 28440 + }, + { + "epoch": 1.4440862535061112, + "grad_norm": 0.03045404966662296, + "learning_rate": 0.00021743071774046768, + "loss": 0.5034, + "step": 28445 + }, + { + "epoch": 1.4443400896041427, + "grad_norm": 0.02407917586260191, + "learning_rate": 0.00021724798744286072, + "loss": 0.4785, + "step": 28450 + }, + { + "epoch": 1.444593925702174, + "grad_norm": 0.025205748983629185, + "learning_rate": 0.00021706531264806394, + "loss": 0.4874, + "step": 28455 + }, + { + "epoch": 1.4448477618002056, + "grad_norm": 0.02348829524564938, + "learning_rate": 0.00021688269339193513, + "loss": 0.4709, + "step": 28460 + }, + { + "epoch": 1.4451015978982371, + "grad_norm": 0.0232211398020149, + "learning_rate": 0.00021670012971032184, + "loss": 0.4774, + "step": 28465 + }, + { + "epoch": 1.4453554339962686, + "grad_norm": 0.024930973074259177, + "learning_rate": 0.00021651762163906008, + "loss": 0.4685, + "step": 28470 + }, + { + "epoch": 1.4456092700943002, + "grad_norm": 0.03092756571772325, + "learning_rate": 0.0002163351692139755, + "loss": 0.4734, + "step": 28475 + }, + { + "epoch": 1.4458631061923315, + "grad_norm": 0.022477760693391274, + "learning_rate": 0.00021615277247088278, + "loss": 0.4869, + "step": 28480 + }, + { + "epoch": 1.446116942290363, + "grad_norm": 0.03361966594571212, + "learning_rate": 0.00021597043144558505, + "loss": 0.4451, + "step": 28485 + }, + { + "epoch": 1.4463707783883946, + "grad_norm": 0.030322892044589482, + "learning_rate": 0.00021578814617387537, + "loss": 0.4828, + "step": 28490 + }, + { + "epoch": 1.446624614486426, + "grad_norm": 0.024900585317585298, + "learning_rate": 0.00021560591669153505, + "loss": 0.4799, + "step": 28495 + }, + { + "epoch": 1.4468784505844576, + "grad_norm": 0.022462012440346523, + "learning_rate": 0.00021542374303433522, + "loss": 0.4528, + "step": 28500 + }, + { + "epoch": 1.4471322866824892, + "grad_norm": 0.02385434946419265, + "learning_rate": 0.00021524162523803525, + "loss": 0.4762, + "step": 28505 + }, + { + "epoch": 1.4473861227805207, + "grad_norm": 0.0237899246854406, + "learning_rate": 0.00021505956333838432, + "loss": 0.5128, + "step": 28510 + }, + { + "epoch": 1.4476399588785522, + "grad_norm": 0.022093078514918426, + "learning_rate": 0.00021487755737111997, + "loss": 0.5007, + "step": 28515 + }, + { + "epoch": 1.4478937949765835, + "grad_norm": 0.03720387345715551, + "learning_rate": 0.00021469560737196936, + "loss": 0.4737, + "step": 28520 + }, + { + "epoch": 1.448147631074615, + "grad_norm": 0.02307666838695736, + "learning_rate": 0.00021451371337664803, + "loss": 0.4847, + "step": 28525 + }, + { + "epoch": 1.4484014671726466, + "grad_norm": 0.027202707719082012, + "learning_rate": 0.00021433187542086102, + "loss": 0.5096, + "step": 28530 + }, + { + "epoch": 1.4486553032706782, + "grad_norm": 0.02281167822321718, + "learning_rate": 0.0002141500935403023, + "loss": 0.4936, + "step": 28535 + }, + { + "epoch": 1.4489091393687097, + "grad_norm": 0.022981780525392213, + "learning_rate": 0.0002139683677706548, + "loss": 0.4721, + "step": 28540 + }, + { + "epoch": 1.449162975466741, + "grad_norm": 0.023971573526569936, + "learning_rate": 0.00021378669814759016, + "loss": 0.4788, + "step": 28545 + }, + { + "epoch": 1.4494168115647725, + "grad_norm": 0.0351687277665943, + "learning_rate": 0.00021360508470676947, + "loss": 0.4275, + "step": 28550 + }, + { + "epoch": 1.449670647662804, + "grad_norm": 0.030238615571759643, + "learning_rate": 0.00021342352748384224, + "loss": 0.4425, + "step": 28555 + }, + { + "epoch": 1.4499244837608356, + "grad_norm": 0.019177171938210256, + "learning_rate": 0.00021324202651444758, + "loss": 0.4624, + "step": 28560 + }, + { + "epoch": 1.4501783198588671, + "grad_norm": 0.023734475246235107, + "learning_rate": 0.00021306058183421289, + "loss": 0.4531, + "step": 28565 + }, + { + "epoch": 1.4504321559568987, + "grad_norm": 0.022972344810526544, + "learning_rate": 0.00021287919347875517, + "loss": 0.5048, + "step": 28570 + }, + { + "epoch": 1.4506859920549302, + "grad_norm": 0.022124511197820666, + "learning_rate": 0.00021269786148367975, + "loss": 0.4901, + "step": 28575 + }, + { + "epoch": 1.4509398281529617, + "grad_norm": 0.020604414986246704, + "learning_rate": 0.00021251658588458151, + "loss": 0.498, + "step": 28580 + }, + { + "epoch": 1.451193664250993, + "grad_norm": 0.03208564684499312, + "learning_rate": 0.00021233536671704363, + "loss": 0.4814, + "step": 28585 + }, + { + "epoch": 1.4514475003490246, + "grad_norm": 0.026818436879605508, + "learning_rate": 0.00021215420401663864, + "loss": 0.494, + "step": 28590 + }, + { + "epoch": 1.4517013364470561, + "grad_norm": 0.02278810185462568, + "learning_rate": 0.0002119730978189281, + "loss": 0.4588, + "step": 28595 + }, + { + "epoch": 1.4519551725450877, + "grad_norm": 0.03576004008501719, + "learning_rate": 0.0002117920481594619, + "loss": 0.4828, + "step": 28600 + }, + { + "epoch": 1.4522090086431192, + "grad_norm": 0.02300363802637254, + "learning_rate": 0.00021161105507377958, + "loss": 0.4625, + "step": 28605 + }, + { + "epoch": 1.4524628447411505, + "grad_norm": 0.021251968272071712, + "learning_rate": 0.00021143011859740875, + "loss": 0.4577, + "step": 28610 + }, + { + "epoch": 1.452716680839182, + "grad_norm": 0.02134933721672599, + "learning_rate": 0.00021124923876586672, + "loss": 0.4804, + "step": 28615 + }, + { + "epoch": 1.4529705169372136, + "grad_norm": 0.022574435169212356, + "learning_rate": 0.0002110684156146589, + "loss": 0.4659, + "step": 28620 + }, + { + "epoch": 1.453224353035245, + "grad_norm": 0.021844543840398207, + "learning_rate": 0.00021088764917928044, + "loss": 0.4765, + "step": 28625 + }, + { + "epoch": 1.4534781891332766, + "grad_norm": 0.022297227696848144, + "learning_rate": 0.0002107069394952144, + "loss": 0.4838, + "step": 28630 + }, + { + "epoch": 1.4537320252313082, + "grad_norm": 0.025232582647418608, + "learning_rate": 0.00021052628659793367, + "loss": 0.4793, + "step": 28635 + }, + { + "epoch": 1.4539858613293397, + "grad_norm": 0.02270811870174028, + "learning_rate": 0.00021034569052289908, + "loss": 0.4736, + "step": 28640 + }, + { + "epoch": 1.4542396974273712, + "grad_norm": 0.03638109948966619, + "learning_rate": 0.00021016515130556113, + "loss": 0.4767, + "step": 28645 + }, + { + "epoch": 1.4544935335254028, + "grad_norm": 0.020506178116723225, + "learning_rate": 0.0002099846689813582, + "loss": 0.4831, + "step": 28650 + }, + { + "epoch": 1.454747369623434, + "grad_norm": 0.02980987722615396, + "learning_rate": 0.0002098042435857188, + "loss": 0.4477, + "step": 28655 + }, + { + "epoch": 1.4550012057214656, + "grad_norm": 0.02633014121866591, + "learning_rate": 0.000209623875154059, + "loss": 0.4758, + "step": 28660 + }, + { + "epoch": 1.4552550418194972, + "grad_norm": 0.024442989213594755, + "learning_rate": 0.00020944356372178458, + "loss": 0.4776, + "step": 28665 + }, + { + "epoch": 1.4555088779175287, + "grad_norm": 0.022043581830940656, + "learning_rate": 0.00020926330932428944, + "loss": 0.4482, + "step": 28670 + }, + { + "epoch": 1.4557627140155602, + "grad_norm": 0.023858740881236372, + "learning_rate": 0.00020908311199695695, + "loss": 0.4835, + "step": 28675 + }, + { + "epoch": 1.4560165501135915, + "grad_norm": 0.027411770365613423, + "learning_rate": 0.0002089029717751586, + "loss": 0.4671, + "step": 28680 + }, + { + "epoch": 1.456270386211623, + "grad_norm": 0.023697876508040217, + "learning_rate": 0.00020872288869425536, + "loss": 0.4911, + "step": 28685 + }, + { + "epoch": 1.4565242223096546, + "grad_norm": 0.026627596602070615, + "learning_rate": 0.0002085428627895963, + "loss": 0.4721, + "step": 28690 + }, + { + "epoch": 1.4567780584076861, + "grad_norm": 0.021374243564867154, + "learning_rate": 0.00020836289409651993, + "loss": 0.4851, + "step": 28695 + }, + { + "epoch": 1.4570318945057177, + "grad_norm": 0.026025357905320266, + "learning_rate": 0.0002081829826503529, + "loss": 0.4711, + "step": 28700 + }, + { + "epoch": 1.4572857306037492, + "grad_norm": 0.022910628027651588, + "learning_rate": 0.0002080031284864113, + "loss": 0.467, + "step": 28705 + }, + { + "epoch": 1.4575395667017808, + "grad_norm": 0.020149333711585025, + "learning_rate": 0.00020782333163999917, + "loss": 0.4638, + "step": 28710 + }, + { + "epoch": 1.4577934027998123, + "grad_norm": 0.022348782139786152, + "learning_rate": 0.00020764359214640998, + "loss": 0.4672, + "step": 28715 + }, + { + "epoch": 1.4580472388978436, + "grad_norm": 0.0314334704344233, + "learning_rate": 0.0002074639100409258, + "loss": 0.4774, + "step": 28720 + }, + { + "epoch": 1.4583010749958751, + "grad_norm": 0.022750058450229413, + "learning_rate": 0.0002072842853588171, + "loss": 0.4761, + "step": 28725 + }, + { + "epoch": 1.4585549110939067, + "grad_norm": 0.026687684346631817, + "learning_rate": 0.00020710471813534354, + "loss": 0.4796, + "step": 28730 + }, + { + "epoch": 1.4588087471919382, + "grad_norm": 0.025046655726691035, + "learning_rate": 0.00020692520840575297, + "loss": 0.48, + "step": 28735 + }, + { + "epoch": 1.4590625832899697, + "grad_norm": 0.024616034157493274, + "learning_rate": 0.00020674575620528262, + "loss": 0.4789, + "step": 28740 + }, + { + "epoch": 1.459316419388001, + "grad_norm": 0.022143594499575536, + "learning_rate": 0.0002065663615691577, + "loss": 0.5053, + "step": 28745 + }, + { + "epoch": 1.4595702554860326, + "grad_norm": 0.021547465775443694, + "learning_rate": 0.00020638702453259285, + "loss": 0.4676, + "step": 28750 + }, + { + "epoch": 1.4598240915840641, + "grad_norm": 0.02362010469124488, + "learning_rate": 0.0002062077451307906, + "loss": 0.4553, + "step": 28755 + }, + { + "epoch": 1.4600779276820957, + "grad_norm": 0.021056652145822894, + "learning_rate": 0.00020602852339894306, + "loss": 0.4686, + "step": 28760 + }, + { + "epoch": 1.4603317637801272, + "grad_norm": 0.02196518330009373, + "learning_rate": 0.00020584935937223016, + "loss": 0.4884, + "step": 28765 + }, + { + "epoch": 1.4605855998781587, + "grad_norm": 0.021922777074083848, + "learning_rate": 0.0002056702530858211, + "loss": 0.4752, + "step": 28770 + }, + { + "epoch": 1.4608394359761903, + "grad_norm": 0.02529934917068532, + "learning_rate": 0.00020549120457487354, + "loss": 0.4553, + "step": 28775 + }, + { + "epoch": 1.4610932720742218, + "grad_norm": 0.02498486840017278, + "learning_rate": 0.00020531221387453392, + "loss": 0.4434, + "step": 28780 + }, + { + "epoch": 1.461347108172253, + "grad_norm": 0.021722508591940946, + "learning_rate": 0.000205133281019937, + "loss": 0.4787, + "step": 28785 + }, + { + "epoch": 1.4616009442702846, + "grad_norm": 0.023233582767727436, + "learning_rate": 0.0002049544060462067, + "loss": 0.4614, + "step": 28790 + }, + { + "epoch": 1.4618547803683162, + "grad_norm": 0.02278293695138392, + "learning_rate": 0.00020477558898845488, + "loss": 0.5037, + "step": 28795 + }, + { + "epoch": 1.4621086164663477, + "grad_norm": 0.023402783149172365, + "learning_rate": 0.00020459682988178285, + "loss": 0.4573, + "step": 28800 + }, + { + "epoch": 1.4623624525643792, + "grad_norm": 0.031169336608011022, + "learning_rate": 0.0002044181287612798, + "loss": 0.4791, + "step": 28805 + }, + { + "epoch": 1.4626162886624106, + "grad_norm": 0.028362314190744413, + "learning_rate": 0.00020423948566202415, + "loss": 0.495, + "step": 28810 + }, + { + "epoch": 1.462870124760442, + "grad_norm": 0.02179210935015138, + "learning_rate": 0.00020406090061908234, + "loss": 0.4638, + "step": 28815 + }, + { + "epoch": 1.4631239608584736, + "grad_norm": 0.024277811689262054, + "learning_rate": 0.00020388237366751006, + "loss": 0.4549, + "step": 28820 + }, + { + "epoch": 1.4633777969565052, + "grad_norm": 0.020361262646131383, + "learning_rate": 0.00020370390484235096, + "loss": 0.479, + "step": 28825 + }, + { + "epoch": 1.4636316330545367, + "grad_norm": 0.02705830030106512, + "learning_rate": 0.00020352549417863768, + "loss": 0.4854, + "step": 28830 + }, + { + "epoch": 1.4638854691525682, + "grad_norm": 0.03431989841271053, + "learning_rate": 0.00020334714171139158, + "loss": 0.4722, + "step": 28835 + }, + { + "epoch": 1.4641393052505998, + "grad_norm": 0.02690359771060965, + "learning_rate": 0.00020316884747562192, + "loss": 0.4755, + "step": 28840 + }, + { + "epoch": 1.4643931413486313, + "grad_norm": 0.02164943326089008, + "learning_rate": 0.0002029906115063274, + "loss": 0.468, + "step": 28845 + }, + { + "epoch": 1.4646469774466626, + "grad_norm": 0.024397789571063064, + "learning_rate": 0.0002028124338384945, + "loss": 0.4625, + "step": 28850 + }, + { + "epoch": 1.4649008135446941, + "grad_norm": 0.022519450507155134, + "learning_rate": 0.00020263431450709895, + "loss": 0.4975, + "step": 28855 + }, + { + "epoch": 1.4651546496427257, + "grad_norm": 0.0217210003632832, + "learning_rate": 0.00020245625354710435, + "loss": 0.4629, + "step": 28860 + }, + { + "epoch": 1.4654084857407572, + "grad_norm": 0.025787902107206176, + "learning_rate": 0.00020227825099346347, + "loss": 0.4741, + "step": 28865 + }, + { + "epoch": 1.4656623218387888, + "grad_norm": 0.022185672758679997, + "learning_rate": 0.00020210030688111701, + "loss": 0.4508, + "step": 28870 + }, + { + "epoch": 1.46591615793682, + "grad_norm": 0.023178483612238884, + "learning_rate": 0.00020192242124499488, + "loss": 0.4874, + "step": 28875 + }, + { + "epoch": 1.4661699940348516, + "grad_norm": 0.02582945990064946, + "learning_rate": 0.00020174459412001473, + "loss": 0.4582, + "step": 28880 + }, + { + "epoch": 1.4664238301328831, + "grad_norm": 0.023143750413606654, + "learning_rate": 0.00020156682554108357, + "loss": 0.4656, + "step": 28885 + }, + { + "epoch": 1.4666776662309147, + "grad_norm": 0.02269378720031511, + "learning_rate": 0.0002013891155430959, + "loss": 0.4668, + "step": 28890 + }, + { + "epoch": 1.4669315023289462, + "grad_norm": 0.03448609380058457, + "learning_rate": 0.00020121146416093605, + "loss": 0.4811, + "step": 28895 + }, + { + "epoch": 1.4671853384269777, + "grad_norm": 0.021699709617265108, + "learning_rate": 0.00020103387142947555, + "loss": 0.5225, + "step": 28900 + }, + { + "epoch": 1.4674391745250093, + "grad_norm": 0.026406847147590853, + "learning_rate": 0.00020085633738357533, + "loss": 0.4825, + "step": 28905 + }, + { + "epoch": 1.4676930106230408, + "grad_norm": 0.033192215599298, + "learning_rate": 0.00020067886205808405, + "loss": 0.4979, + "step": 28910 + }, + { + "epoch": 1.4679468467210723, + "grad_norm": 0.02394413280206084, + "learning_rate": 0.0002005014454878396, + "loss": 0.459, + "step": 28915 + }, + { + "epoch": 1.4682006828191037, + "grad_norm": 0.0245933526818191, + "learning_rate": 0.0002003240877076677, + "loss": 0.4576, + "step": 28920 + }, + { + "epoch": 1.4684545189171352, + "grad_norm": 0.02516217005778102, + "learning_rate": 0.00020014678875238302, + "loss": 0.5001, + "step": 28925 + }, + { + "epoch": 1.4687083550151667, + "grad_norm": 0.02361500803054123, + "learning_rate": 0.00019996954865678817, + "loss": 0.4896, + "step": 28930 + }, + { + "epoch": 1.4689621911131983, + "grad_norm": 0.024603894061880013, + "learning_rate": 0.00019979236745567487, + "loss": 0.5019, + "step": 28935 + }, + { + "epoch": 1.4692160272112298, + "grad_norm": 0.020585521169676338, + "learning_rate": 0.00019961524518382267, + "loss": 0.456, + "step": 28940 + }, + { + "epoch": 1.469469863309261, + "grad_norm": 0.02522878829527217, + "learning_rate": 0.00019943818187599966, + "loss": 0.4565, + "step": 28945 + }, + { + "epoch": 1.4697236994072926, + "grad_norm": 0.02118075848457831, + "learning_rate": 0.00019926117756696265, + "loss": 0.4868, + "step": 28950 + }, + { + "epoch": 1.4699775355053242, + "grad_norm": 0.022435152582389803, + "learning_rate": 0.00019908423229145672, + "loss": 0.4959, + "step": 28955 + }, + { + "epoch": 1.4702313716033557, + "grad_norm": 0.02197759398001768, + "learning_rate": 0.00019890734608421552, + "loss": 0.4627, + "step": 28960 + }, + { + "epoch": 1.4704852077013872, + "grad_norm": 0.04773874949155717, + "learning_rate": 0.00019873051897996053, + "loss": 0.4562, + "step": 28965 + }, + { + "epoch": 1.4707390437994188, + "grad_norm": 0.03353710032706133, + "learning_rate": 0.0001985537510134024, + "loss": 0.4665, + "step": 28970 + }, + { + "epoch": 1.4709928798974503, + "grad_norm": 0.026180926097628836, + "learning_rate": 0.00019837704221923946, + "loss": 0.4479, + "step": 28975 + }, + { + "epoch": 1.4712467159954818, + "grad_norm": 0.03126605626141384, + "learning_rate": 0.00019820039263215917, + "loss": 0.4602, + "step": 28980 + }, + { + "epoch": 1.4715005520935132, + "grad_norm": 0.023705023219665274, + "learning_rate": 0.00019802380228683646, + "loss": 0.4881, + "step": 28985 + }, + { + "epoch": 1.4717543881915447, + "grad_norm": 0.023534377244730398, + "learning_rate": 0.00019784727121793566, + "loss": 0.4828, + "step": 28990 + }, + { + "epoch": 1.4720082242895762, + "grad_norm": 0.021276660406363043, + "learning_rate": 0.00019767079946010852, + "loss": 0.4731, + "step": 28995 + }, + { + "epoch": 1.4722620603876078, + "grad_norm": 0.0275960081210604, + "learning_rate": 0.00019749438704799588, + "loss": 0.4718, + "step": 29000 + }, + { + "epoch": 1.4725158964856393, + "grad_norm": 0.021170380103597952, + "learning_rate": 0.0001973180340162263, + "loss": 0.4525, + "step": 29005 + }, + { + "epoch": 1.4727697325836706, + "grad_norm": 0.024756352788350977, + "learning_rate": 0.00019714174039941736, + "loss": 0.5201, + "step": 29010 + }, + { + "epoch": 1.4730235686817021, + "grad_norm": 0.02683451441512577, + "learning_rate": 0.00019696550623217403, + "loss": 0.4786, + "step": 29015 + }, + { + "epoch": 1.4732774047797337, + "grad_norm": 0.02575373794125539, + "learning_rate": 0.00019678933154909095, + "loss": 0.4784, + "step": 29020 + }, + { + "epoch": 1.4735312408777652, + "grad_norm": 0.020364829491525765, + "learning_rate": 0.00019661321638475004, + "loss": 0.4341, + "step": 29025 + }, + { + "epoch": 1.4737850769757967, + "grad_norm": 0.021298525215463972, + "learning_rate": 0.00019643716077372153, + "loss": 0.4904, + "step": 29030 + }, + { + "epoch": 1.4740389130738283, + "grad_norm": 0.021611695522008065, + "learning_rate": 0.0001962611647505647, + "loss": 0.4968, + "step": 29035 + }, + { + "epoch": 1.4742927491718598, + "grad_norm": 0.021550734290455672, + "learning_rate": 0.00019608522834982633, + "loss": 0.4822, + "step": 29040 + }, + { + "epoch": 1.4745465852698914, + "grad_norm": 0.021212859093263894, + "learning_rate": 0.00019590935160604218, + "loss": 0.4758, + "step": 29045 + }, + { + "epoch": 1.4748004213679227, + "grad_norm": 0.02522241453937803, + "learning_rate": 0.0001957335345537356, + "loss": 0.4638, + "step": 29050 + }, + { + "epoch": 1.4750542574659542, + "grad_norm": 0.02216798656322059, + "learning_rate": 0.00019555777722741902, + "loss": 0.4727, + "step": 29055 + }, + { + "epoch": 1.4753080935639857, + "grad_norm": 0.024334896417849243, + "learning_rate": 0.00019538207966159234, + "loss": 0.4876, + "step": 29060 + }, + { + "epoch": 1.4755619296620173, + "grad_norm": 0.020644818633358916, + "learning_rate": 0.00019520644189074444, + "loss": 0.4633, + "step": 29065 + }, + { + "epoch": 1.4758157657600488, + "grad_norm": 0.02651778575033576, + "learning_rate": 0.00019503086394935182, + "loss": 0.4603, + "step": 29070 + }, + { + "epoch": 1.4760696018580801, + "grad_norm": 0.03491940275430201, + "learning_rate": 0.00019485534587187977, + "loss": 0.4425, + "step": 29075 + }, + { + "epoch": 1.4763234379561117, + "grad_norm": 0.023186579903332387, + "learning_rate": 0.00019467988769278154, + "loss": 0.4549, + "step": 29080 + }, + { + "epoch": 1.4765772740541432, + "grad_norm": 0.025307596693011927, + "learning_rate": 0.00019450448944649895, + "loss": 0.4795, + "step": 29085 + }, + { + "epoch": 1.4768311101521747, + "grad_norm": 0.021614617095183405, + "learning_rate": 0.00019432915116746136, + "loss": 0.4892, + "step": 29090 + }, + { + "epoch": 1.4770849462502063, + "grad_norm": 0.02464047963920047, + "learning_rate": 0.0001941538728900872, + "loss": 0.4628, + "step": 29095 + }, + { + "epoch": 1.4773387823482378, + "grad_norm": 0.029044426055459413, + "learning_rate": 0.00019397865464878235, + "loss": 0.4777, + "step": 29100 + }, + { + "epoch": 1.4775926184462693, + "grad_norm": 0.03081511255172388, + "learning_rate": 0.00019380349647794165, + "loss": 0.4915, + "step": 29105 + }, + { + "epoch": 1.4778464545443009, + "grad_norm": 0.02441428752542893, + "learning_rate": 0.00019362839841194747, + "loss": 0.4716, + "step": 29110 + }, + { + "epoch": 1.4781002906423322, + "grad_norm": 0.022749416876083776, + "learning_rate": 0.00019345336048517094, + "loss": 0.4796, + "step": 29115 + }, + { + "epoch": 1.4783541267403637, + "grad_norm": 0.023954531450951794, + "learning_rate": 0.00019327838273197078, + "loss": 0.4952, + "step": 29120 + }, + { + "epoch": 1.4786079628383952, + "grad_norm": 0.02345877737054963, + "learning_rate": 0.0001931034651866947, + "loss": 0.477, + "step": 29125 + }, + { + "epoch": 1.4788617989364268, + "grad_norm": 0.02651510050871646, + "learning_rate": 0.00019292860788367773, + "loss": 0.4987, + "step": 29130 + }, + { + "epoch": 1.4791156350344583, + "grad_norm": 0.02263999113571259, + "learning_rate": 0.00019275381085724364, + "loss": 0.455, + "step": 29135 + }, + { + "epoch": 1.4793694711324896, + "grad_norm": 0.020869618792912294, + "learning_rate": 0.00019257907414170445, + "loss": 0.4501, + "step": 29140 + }, + { + "epoch": 1.4796233072305212, + "grad_norm": 0.02387312884738094, + "learning_rate": 0.00019240439777135976, + "loss": 0.4511, + "step": 29145 + }, + { + "epoch": 1.4798771433285527, + "grad_norm": 0.05515921100199077, + "learning_rate": 0.00019222978178049793, + "loss": 0.4692, + "step": 29150 + }, + { + "epoch": 1.4801309794265842, + "grad_norm": 0.022481925501552334, + "learning_rate": 0.00019205522620339494, + "loss": 0.4822, + "step": 29155 + }, + { + "epoch": 1.4803848155246158, + "grad_norm": 0.029596722532962007, + "learning_rate": 0.00019188073107431546, + "loss": 0.4648, + "step": 29160 + }, + { + "epoch": 1.4806386516226473, + "grad_norm": 0.022443147646602133, + "learning_rate": 0.00019170629642751175, + "loss": 0.4588, + "step": 29165 + }, + { + "epoch": 1.4808924877206788, + "grad_norm": 0.019695512999081097, + "learning_rate": 0.00019153192229722478, + "loss": 0.47, + "step": 29170 + }, + { + "epoch": 1.4811463238187104, + "grad_norm": 0.023069499823679177, + "learning_rate": 0.00019135760871768294, + "loss": 0.4826, + "step": 29175 + }, + { + "epoch": 1.4814001599167417, + "grad_norm": 0.020513414900472458, + "learning_rate": 0.00019118335572310347, + "loss": 0.4897, + "step": 29180 + }, + { + "epoch": 1.4816539960147732, + "grad_norm": 0.02458185146292343, + "learning_rate": 0.00019100916334769107, + "loss": 0.4503, + "step": 29185 + }, + { + "epoch": 1.4819078321128047, + "grad_norm": 0.020161767636643235, + "learning_rate": 0.00019083503162563908, + "loss": 0.4888, + "step": 29190 + }, + { + "epoch": 1.4821616682108363, + "grad_norm": 0.021529664850411454, + "learning_rate": 0.0001906609605911283, + "loss": 0.4678, + "step": 29195 + }, + { + "epoch": 1.4824155043088678, + "grad_norm": 0.022419770519300376, + "learning_rate": 0.00019048695027832862, + "loss": 0.4569, + "step": 29200 + }, + { + "epoch": 1.4826693404068991, + "grad_norm": 0.02111471529503639, + "learning_rate": 0.00019031300072139685, + "loss": 0.4865, + "step": 29205 + }, + { + "epoch": 1.4829231765049307, + "grad_norm": 0.022270474614644514, + "learning_rate": 0.00019013911195447887, + "loss": 0.4909, + "step": 29210 + }, + { + "epoch": 1.4831770126029622, + "grad_norm": 0.0246869876351486, + "learning_rate": 0.0001899652840117077, + "loss": 0.4963, + "step": 29215 + }, + { + "epoch": 1.4834308487009937, + "grad_norm": 0.02576348095397173, + "learning_rate": 0.0001897915169272053, + "loss": 0.4913, + "step": 29220 + }, + { + "epoch": 1.4836846847990253, + "grad_norm": 0.024597752477760727, + "learning_rate": 0.000189617810735081, + "loss": 0.5088, + "step": 29225 + }, + { + "epoch": 1.4839385208970568, + "grad_norm": 0.02344812114499017, + "learning_rate": 0.0001894441654694327, + "loss": 0.4355, + "step": 29230 + }, + { + "epoch": 1.4841923569950883, + "grad_norm": 0.021559453697928024, + "learning_rate": 0.00018927058116434588, + "loss": 0.4764, + "step": 29235 + }, + { + "epoch": 1.4844461930931199, + "grad_norm": 0.02036859797483484, + "learning_rate": 0.00018909705785389452, + "loss": 0.4684, + "step": 29240 + }, + { + "epoch": 1.4847000291911514, + "grad_norm": 0.02205258982868522, + "learning_rate": 0.00018892359557214, + "loss": 0.4725, + "step": 29245 + }, + { + "epoch": 1.4849538652891827, + "grad_norm": 0.020563901133827663, + "learning_rate": 0.00018875019435313255, + "loss": 0.4731, + "step": 29250 + }, + { + "epoch": 1.4852077013872143, + "grad_norm": 0.028685312406026363, + "learning_rate": 0.0001885768542309096, + "loss": 0.4728, + "step": 29255 + }, + { + "epoch": 1.4854615374852458, + "grad_norm": 0.02454713847924293, + "learning_rate": 0.0001884035752394971, + "loss": 0.4763, + "step": 29260 + }, + { + "epoch": 1.4857153735832773, + "grad_norm": 0.02365382394464072, + "learning_rate": 0.000188230357412909, + "loss": 0.4331, + "step": 29265 + }, + { + "epoch": 1.4859692096813089, + "grad_norm": 0.021080550215192822, + "learning_rate": 0.00018805720078514677, + "loss": 0.4587, + "step": 29270 + }, + { + "epoch": 1.4862230457793402, + "grad_norm": 0.022858405035791375, + "learning_rate": 0.0001878841053902005, + "loss": 0.4602, + "step": 29275 + }, + { + "epoch": 1.4864768818773717, + "grad_norm": 0.026334553802682143, + "learning_rate": 0.00018771107126204771, + "loss": 0.4749, + "step": 29280 + }, + { + "epoch": 1.4867307179754032, + "grad_norm": 0.024924349203167863, + "learning_rate": 0.00018753809843465442, + "loss": 0.4775, + "step": 29285 + }, + { + "epoch": 1.4869845540734348, + "grad_norm": 0.031850516565785435, + "learning_rate": 0.00018736518694197396, + "loss": 0.4589, + "step": 29290 + }, + { + "epoch": 1.4872383901714663, + "grad_norm": 0.028274802301810807, + "learning_rate": 0.0001871923368179484, + "loss": 0.4538, + "step": 29295 + }, + { + "epoch": 1.4874922262694978, + "grad_norm": 0.021991555650211984, + "learning_rate": 0.000187019548096507, + "loss": 0.4906, + "step": 29300 + }, + { + "epoch": 1.4877460623675294, + "grad_norm": 0.027104083276996482, + "learning_rate": 0.00018684682081156762, + "loss": 0.4922, + "step": 29305 + }, + { + "epoch": 1.487999898465561, + "grad_norm": 0.02050509919684562, + "learning_rate": 0.00018667415499703545, + "loss": 0.4614, + "step": 29310 + }, + { + "epoch": 1.4882537345635922, + "grad_norm": 0.021216527675499326, + "learning_rate": 0.00018650155068680407, + "loss": 0.4525, + "step": 29315 + }, + { + "epoch": 1.4885075706616238, + "grad_norm": 0.022730076917583136, + "learning_rate": 0.00018632900791475492, + "loss": 0.4685, + "step": 29320 + }, + { + "epoch": 1.4887614067596553, + "grad_norm": 0.023575223517873986, + "learning_rate": 0.0001861565267147574, + "loss": 0.4376, + "step": 29325 + }, + { + "epoch": 1.4890152428576868, + "grad_norm": 0.02277744320446659, + "learning_rate": 0.0001859841071206684, + "loss": 0.4398, + "step": 29330 + }, + { + "epoch": 1.4892690789557184, + "grad_norm": 0.021608319308460473, + "learning_rate": 0.0001858117491663333, + "loss": 0.489, + "step": 29335 + }, + { + "epoch": 1.4895229150537497, + "grad_norm": 0.020073745176878363, + "learning_rate": 0.0001856394528855848, + "loss": 0.4597, + "step": 29340 + }, + { + "epoch": 1.4897767511517812, + "grad_norm": 0.022010323279037017, + "learning_rate": 0.00018546721831224424, + "loss": 0.4762, + "step": 29345 + }, + { + "epoch": 1.4900305872498127, + "grad_norm": 0.02437069862496661, + "learning_rate": 0.00018529504548011995, + "loss": 0.4831, + "step": 29350 + }, + { + "epoch": 1.4902844233478443, + "grad_norm": 0.021983892566894717, + "learning_rate": 0.00018512293442300893, + "loss": 0.4737, + "step": 29355 + }, + { + "epoch": 1.4905382594458758, + "grad_norm": 0.03613783598950718, + "learning_rate": 0.00018495088517469545, + "loss": 0.4716, + "step": 29360 + }, + { + "epoch": 1.4907920955439073, + "grad_norm": 0.02504114958866562, + "learning_rate": 0.00018477889776895225, + "loss": 0.453, + "step": 29365 + }, + { + "epoch": 1.4910459316419389, + "grad_norm": 0.023647519356946433, + "learning_rate": 0.0001846069722395392, + "loss": 0.4599, + "step": 29370 + }, + { + "epoch": 1.4912997677399704, + "grad_norm": 0.02647181372921064, + "learning_rate": 0.00018443510862020467, + "loss": 0.4836, + "step": 29375 + }, + { + "epoch": 1.4915536038380017, + "grad_norm": 0.0222009899568302, + "learning_rate": 0.0001842633069446848, + "loss": 0.4592, + "step": 29380 + }, + { + "epoch": 1.4918074399360333, + "grad_norm": 0.02282816694246597, + "learning_rate": 0.00018409156724670295, + "loss": 0.5007, + "step": 29385 + }, + { + "epoch": 1.4920612760340648, + "grad_norm": 0.02159121780004676, + "learning_rate": 0.00018391988955997126, + "loss": 0.4567, + "step": 29390 + }, + { + "epoch": 1.4923151121320963, + "grad_norm": 0.022260535759933305, + "learning_rate": 0.00018374827391818877, + "loss": 0.4663, + "step": 29395 + }, + { + "epoch": 1.4925689482301279, + "grad_norm": 0.022850427122847246, + "learning_rate": 0.00018357672035504313, + "loss": 0.4874, + "step": 29400 + }, + { + "epoch": 1.4928227843281592, + "grad_norm": 0.019789375321462733, + "learning_rate": 0.00018340522890420907, + "loss": 0.4172, + "step": 29405 + }, + { + "epoch": 1.4930766204261907, + "grad_norm": 0.025613999903597914, + "learning_rate": 0.00018323379959934993, + "loss": 0.4852, + "step": 29410 + }, + { + "epoch": 1.4933304565242222, + "grad_norm": 0.027238755553218558, + "learning_rate": 0.0001830624324741161, + "loss": 0.4733, + "step": 29415 + }, + { + "epoch": 1.4935842926222538, + "grad_norm": 0.025748721906667212, + "learning_rate": 0.00018289112756214633, + "loss": 0.4633, + "step": 29420 + }, + { + "epoch": 1.4938381287202853, + "grad_norm": 0.022524685723581257, + "learning_rate": 0.0001827198848970666, + "loss": 0.4573, + "step": 29425 + }, + { + "epoch": 1.4940919648183169, + "grad_norm": 0.02722240287350484, + "learning_rate": 0.00018254870451249138, + "loss": 0.4754, + "step": 29430 + }, + { + "epoch": 1.4943458009163484, + "grad_norm": 0.024092764906284942, + "learning_rate": 0.000182377586442022, + "loss": 0.4882, + "step": 29435 + }, + { + "epoch": 1.49459963701438, + "grad_norm": 0.023881198609627088, + "learning_rate": 0.00018220653071924876, + "loss": 0.4624, + "step": 29440 + }, + { + "epoch": 1.4948534731124112, + "grad_norm": 0.027775639716056417, + "learning_rate": 0.0001820355373777486, + "loss": 0.438, + "step": 29445 + }, + { + "epoch": 1.4951073092104428, + "grad_norm": 0.028210032309733676, + "learning_rate": 0.0001818646064510868, + "loss": 0.485, + "step": 29450 + }, + { + "epoch": 1.4953611453084743, + "grad_norm": 0.024018687894484327, + "learning_rate": 0.00018169373797281618, + "loss": 0.4624, + "step": 29455 + }, + { + "epoch": 1.4956149814065058, + "grad_norm": 0.02242820966339362, + "learning_rate": 0.0001815229319764775, + "loss": 0.4483, + "step": 29460 + }, + { + "epoch": 1.4958688175045374, + "grad_norm": 0.024756677687856363, + "learning_rate": 0.00018135218849559887, + "loss": 0.4973, + "step": 29465 + }, + { + "epoch": 1.4961226536025687, + "grad_norm": 0.026600698958258437, + "learning_rate": 0.00018118150756369673, + "loss": 0.4575, + "step": 29470 + }, + { + "epoch": 1.4963764897006002, + "grad_norm": 0.021303694946705722, + "learning_rate": 0.00018101088921427456, + "loss": 0.4561, + "step": 29475 + }, + { + "epoch": 1.4966303257986318, + "grad_norm": 0.02617735975797147, + "learning_rate": 0.00018084033348082418, + "loss": 0.4593, + "step": 29480 + }, + { + "epoch": 1.4968841618966633, + "grad_norm": 0.021397946565801608, + "learning_rate": 0.00018066984039682456, + "loss": 0.4405, + "step": 29485 + }, + { + "epoch": 1.4971379979946948, + "grad_norm": 0.024482557930807926, + "learning_rate": 0.00018049940999574288, + "loss": 0.4788, + "step": 29490 + }, + { + "epoch": 1.4973918340927264, + "grad_norm": 0.025781729735713393, + "learning_rate": 0.00018032904231103354, + "loss": 0.4797, + "step": 29495 + }, + { + "epoch": 1.497645670190758, + "grad_norm": 0.01949959677459056, + "learning_rate": 0.00018015873737613897, + "loss": 0.4558, + "step": 29500 + }, + { + "epoch": 1.4978995062887894, + "grad_norm": 0.022386748122245725, + "learning_rate": 0.0001799884952244894, + "loss": 0.4935, + "step": 29505 + }, + { + "epoch": 1.498153342386821, + "grad_norm": 0.021221757126598573, + "learning_rate": 0.00017981831588950216, + "loss": 0.4462, + "step": 29510 + }, + { + "epoch": 1.4984071784848523, + "grad_norm": 0.022333608494739168, + "learning_rate": 0.00017964819940458293, + "loss": 0.4621, + "step": 29515 + }, + { + "epoch": 1.4986610145828838, + "grad_norm": 0.024666056084257013, + "learning_rate": 0.00017947814580312438, + "loss": 0.4549, + "step": 29520 + }, + { + "epoch": 1.4989148506809153, + "grad_norm": 0.02090549438169576, + "learning_rate": 0.00017930815511850757, + "loss": 0.4689, + "step": 29525 + }, + { + "epoch": 1.4991686867789469, + "grad_norm": 0.03565612070546386, + "learning_rate": 0.00017913822738410042, + "loss": 0.4928, + "step": 29530 + }, + { + "epoch": 1.4994225228769784, + "grad_norm": 0.023113435548243647, + "learning_rate": 0.00017896836263325928, + "loss": 0.4889, + "step": 29535 + }, + { + "epoch": 1.4996763589750097, + "grad_norm": 0.023728923010015324, + "learning_rate": 0.0001787985608993274, + "loss": 0.4956, + "step": 29540 + }, + { + "epoch": 1.4999301950730413, + "grad_norm": 0.025393929885262224, + "learning_rate": 0.00017862882221563635, + "loss": 0.4454, + "step": 29545 + }, + { + "epoch": 1.5001840311710728, + "grad_norm": 0.028049319465082344, + "learning_rate": 0.00017845914661550466, + "loss": 0.4871, + "step": 29550 + }, + { + "epoch": 1.5004378672691043, + "grad_norm": 0.031110725136696264, + "learning_rate": 0.00017828953413223897, + "loss": 0.4707, + "step": 29555 + }, + { + "epoch": 1.5006917033671359, + "grad_norm": 0.023067267396166907, + "learning_rate": 0.00017811998479913337, + "loss": 0.4852, + "step": 29560 + }, + { + "epoch": 1.5009455394651674, + "grad_norm": 0.022599329066948542, + "learning_rate": 0.0001779504986494697, + "loss": 0.4629, + "step": 29565 + }, + { + "epoch": 1.501199375563199, + "grad_norm": 0.024939817004019073, + "learning_rate": 0.00017778107571651692, + "loss": 0.486, + "step": 29570 + }, + { + "epoch": 1.5014532116612305, + "grad_norm": 0.0293017112366802, + "learning_rate": 0.00017761171603353226, + "loss": 0.4563, + "step": 29575 + }, + { + "epoch": 1.501707047759262, + "grad_norm": 0.029238115277750512, + "learning_rate": 0.00017744241963375986, + "loss": 0.4879, + "step": 29580 + }, + { + "epoch": 1.5019608838572933, + "grad_norm": 0.02734864802953103, + "learning_rate": 0.00017727318655043196, + "loss": 0.4643, + "step": 29585 + }, + { + "epoch": 1.5022147199553249, + "grad_norm": 0.023307569346119706, + "learning_rate": 0.00017710401681676803, + "loss": 0.4675, + "step": 29590 + }, + { + "epoch": 1.5024685560533564, + "grad_norm": 0.03538931937461218, + "learning_rate": 0.00017693491046597544, + "loss": 0.4816, + "step": 29595 + }, + { + "epoch": 1.5027223921513877, + "grad_norm": 0.02593067797767141, + "learning_rate": 0.0001767658675312486, + "loss": 0.4854, + "step": 29600 + }, + { + "epoch": 1.5029762282494192, + "grad_norm": 0.038262040752877625, + "learning_rate": 0.00017659688804577022, + "loss": 0.4869, + "step": 29605 + }, + { + "epoch": 1.5032300643474508, + "grad_norm": 0.03034656256994557, + "learning_rate": 0.00017642797204270972, + "loss": 0.4745, + "step": 29610 + }, + { + "epoch": 1.5034839004454823, + "grad_norm": 0.02925654062689158, + "learning_rate": 0.00017625911955522467, + "loss": 0.4796, + "step": 29615 + }, + { + "epoch": 1.5037377365435138, + "grad_norm": 0.35865233957359255, + "learning_rate": 0.00017609033061646013, + "loss": 0.4897, + "step": 29620 + }, + { + "epoch": 1.5039915726415454, + "grad_norm": 0.024975856991066987, + "learning_rate": 0.0001759216052595482, + "loss": 0.4771, + "step": 29625 + }, + { + "epoch": 1.504245408739577, + "grad_norm": 0.027727973964852774, + "learning_rate": 0.00017575294351760912, + "loss": 0.4449, + "step": 29630 + }, + { + "epoch": 1.5044992448376084, + "grad_norm": 0.03183878239195353, + "learning_rate": 0.00017558434542375002, + "loss": 0.4803, + "step": 29635 + }, + { + "epoch": 1.50475308093564, + "grad_norm": 0.021762613260108758, + "learning_rate": 0.0001754158110110663, + "loss": 0.4505, + "step": 29640 + }, + { + "epoch": 1.5050069170336715, + "grad_norm": 0.02265057874561824, + "learning_rate": 0.00017524734031263995, + "loss": 0.4783, + "step": 29645 + }, + { + "epoch": 1.5052607531317028, + "grad_norm": 0.024018928533084835, + "learning_rate": 0.00017507893336154136, + "loss": 0.4687, + "step": 29650 + }, + { + "epoch": 1.5055145892297344, + "grad_norm": 0.025100700637922822, + "learning_rate": 0.00017491059019082757, + "loss": 0.4687, + "step": 29655 + }, + { + "epoch": 1.505768425327766, + "grad_norm": 0.025226901808710973, + "learning_rate": 0.00017474231083354386, + "loss": 0.4664, + "step": 29660 + }, + { + "epoch": 1.5060222614257972, + "grad_norm": 0.019733370421100523, + "learning_rate": 0.00017457409532272233, + "loss": 0.4593, + "step": 29665 + }, + { + "epoch": 1.5062760975238287, + "grad_norm": 0.036127873751405706, + "learning_rate": 0.00017440594369138318, + "loss": 0.4612, + "step": 29670 + }, + { + "epoch": 1.5065299336218603, + "grad_norm": 0.027988146551036864, + "learning_rate": 0.00017423785597253322, + "loss": 0.4712, + "step": 29675 + }, + { + "epoch": 1.5067837697198918, + "grad_norm": 0.022722614037556717, + "learning_rate": 0.00017406983219916784, + "loss": 0.4818, + "step": 29680 + }, + { + "epoch": 1.5070376058179233, + "grad_norm": 0.021733007504925076, + "learning_rate": 0.00017390187240426885, + "loss": 0.4636, + "step": 29685 + }, + { + "epoch": 1.5072914419159549, + "grad_norm": 0.022966860647516654, + "learning_rate": 0.00017373397662080625, + "loss": 0.4507, + "step": 29690 + }, + { + "epoch": 1.5075452780139864, + "grad_norm": 0.02313533592545971, + "learning_rate": 0.0001735661448817368, + "loss": 0.4839, + "step": 29695 + }, + { + "epoch": 1.507799114112018, + "grad_norm": 0.02643057262905096, + "learning_rate": 0.0001733983772200053, + "loss": 0.4706, + "step": 29700 + }, + { + "epoch": 1.5080529502100495, + "grad_norm": 0.021347818597730134, + "learning_rate": 0.00017323067366854344, + "loss": 0.481, + "step": 29705 + }, + { + "epoch": 1.508306786308081, + "grad_norm": 0.02075350561553384, + "learning_rate": 0.00017306303426027094, + "loss": 0.5027, + "step": 29710 + }, + { + "epoch": 1.5085606224061123, + "grad_norm": 0.044968810989294526, + "learning_rate": 0.00017289545902809416, + "loss": 0.4437, + "step": 29715 + }, + { + "epoch": 1.5088144585041439, + "grad_norm": 0.02712313510234019, + "learning_rate": 0.00017272794800490772, + "loss": 0.4846, + "step": 29720 + }, + { + "epoch": 1.5090682946021754, + "grad_norm": 0.030525162317820355, + "learning_rate": 0.00017256050122359278, + "loss": 0.4566, + "step": 29725 + }, + { + "epoch": 1.5093221307002067, + "grad_norm": 0.023955850748629902, + "learning_rate": 0.00017239311871701868, + "loss": 0.453, + "step": 29730 + }, + { + "epoch": 1.5095759667982382, + "grad_norm": 0.02040802810169976, + "learning_rate": 0.00017222580051804147, + "loss": 0.4381, + "step": 29735 + }, + { + "epoch": 1.5098298028962698, + "grad_norm": 0.022363301802030486, + "learning_rate": 0.000172058546659505, + "loss": 0.4707, + "step": 29740 + }, + { + "epoch": 1.5100836389943013, + "grad_norm": 0.020755527638949292, + "learning_rate": 0.00017189135717424054, + "loss": 0.4495, + "step": 29745 + }, + { + "epoch": 1.5103374750923328, + "grad_norm": 0.022821381482327964, + "learning_rate": 0.0001717242320950662, + "loss": 0.4443, + "step": 29750 + }, + { + "epoch": 1.5105913111903644, + "grad_norm": 0.02519971954586667, + "learning_rate": 0.00017155717145478822, + "loss": 0.4724, + "step": 29755 + }, + { + "epoch": 1.510845147288396, + "grad_norm": 0.031004863317514574, + "learning_rate": 0.00017139017528619932, + "loss": 0.4501, + "step": 29760 + }, + { + "epoch": 1.5110989833864275, + "grad_norm": 0.022610866791681877, + "learning_rate": 0.0001712232436220804, + "loss": 0.4789, + "step": 29765 + }, + { + "epoch": 1.511352819484459, + "grad_norm": 0.027520059296275306, + "learning_rate": 0.000171056376495199, + "loss": 0.4893, + "step": 29770 + }, + { + "epoch": 1.5116066555824905, + "grad_norm": 0.02082044707695213, + "learning_rate": 0.00017088957393831066, + "loss": 0.4471, + "step": 29775 + }, + { + "epoch": 1.5118604916805218, + "grad_norm": 0.02144250442393683, + "learning_rate": 0.0001707228359841575, + "loss": 0.4981, + "step": 29780 + }, + { + "epoch": 1.5121143277785534, + "grad_norm": 0.021818336086735435, + "learning_rate": 0.0001705561626654697, + "loss": 0.4511, + "step": 29785 + }, + { + "epoch": 1.512368163876585, + "grad_norm": 0.023425182235900163, + "learning_rate": 0.00017038955401496404, + "loss": 0.4725, + "step": 29790 + }, + { + "epoch": 1.5126219999746164, + "grad_norm": 0.0227877965017971, + "learning_rate": 0.00017022301006534512, + "loss": 0.4842, + "step": 29795 + }, + { + "epoch": 1.5128758360726478, + "grad_norm": 0.02442465002826424, + "learning_rate": 0.00017005653084930483, + "loss": 0.485, + "step": 29800 + }, + { + "epoch": 1.5131296721706793, + "grad_norm": 0.020723345058742938, + "learning_rate": 0.00016989011639952222, + "loss": 0.4655, + "step": 29805 + }, + { + "epoch": 1.5133835082687108, + "grad_norm": 0.020479347433989627, + "learning_rate": 0.00016972376674866336, + "loss": 0.473, + "step": 29810 + }, + { + "epoch": 1.5136373443667424, + "grad_norm": 0.02383837108874991, + "learning_rate": 0.00016955748192938215, + "loss": 0.4885, + "step": 29815 + }, + { + "epoch": 1.513891180464774, + "grad_norm": 0.02189974689590864, + "learning_rate": 0.00016939126197431916, + "loss": 0.4831, + "step": 29820 + }, + { + "epoch": 1.5141450165628054, + "grad_norm": 0.022326421553039454, + "learning_rate": 0.00016922510691610288, + "loss": 0.4914, + "step": 29825 + }, + { + "epoch": 1.514398852660837, + "grad_norm": 0.025686926161214048, + "learning_rate": 0.00016905901678734836, + "loss": 0.4565, + "step": 29830 + }, + { + "epoch": 1.5146526887588685, + "grad_norm": 0.02510185893652824, + "learning_rate": 0.00016889299162065863, + "loss": 0.4412, + "step": 29835 + }, + { + "epoch": 1.5149065248569, + "grad_norm": 0.019743756304140575, + "learning_rate": 0.00016872703144862322, + "loss": 0.4503, + "step": 29840 + }, + { + "epoch": 1.5151603609549316, + "grad_norm": 0.023099741532777593, + "learning_rate": 0.0001685611363038197, + "loss": 0.4646, + "step": 29845 + }, + { + "epoch": 1.5154141970529629, + "grad_norm": 0.02167102495604313, + "learning_rate": 0.000168395306218812, + "loss": 0.4297, + "step": 29850 + }, + { + "epoch": 1.5156680331509944, + "grad_norm": 0.021199600428507007, + "learning_rate": 0.00016822954122615202, + "loss": 0.5018, + "step": 29855 + }, + { + "epoch": 1.515921869249026, + "grad_norm": 0.03525431444628682, + "learning_rate": 0.0001680638413583787, + "loss": 0.4627, + "step": 29860 + }, + { + "epoch": 1.5161757053470573, + "grad_norm": 0.021847874025035, + "learning_rate": 0.00016789820664801785, + "loss": 0.4601, + "step": 29865 + }, + { + "epoch": 1.5164295414450888, + "grad_norm": 0.023339409719603202, + "learning_rate": 0.00016773263712758298, + "loss": 0.4853, + "step": 29870 + }, + { + "epoch": 1.5166833775431203, + "grad_norm": 0.028528438177320366, + "learning_rate": 0.00016756713282957425, + "loss": 0.4764, + "step": 29875 + }, + { + "epoch": 1.5169372136411519, + "grad_norm": 0.02968109811483413, + "learning_rate": 0.00016740169378647967, + "loss": 0.4622, + "step": 29880 + }, + { + "epoch": 1.5171910497391834, + "grad_norm": 0.020421612487212134, + "learning_rate": 0.00016723632003077382, + "loss": 0.4608, + "step": 29885 + }, + { + "epoch": 1.517444885837215, + "grad_norm": 0.022429618618365205, + "learning_rate": 0.000167071011594919, + "loss": 0.4365, + "step": 29890 + }, + { + "epoch": 1.5176987219352465, + "grad_norm": 0.022662076262398696, + "learning_rate": 0.00016690576851136407, + "loss": 0.4606, + "step": 29895 + }, + { + "epoch": 1.517952558033278, + "grad_norm": 0.024997243449074575, + "learning_rate": 0.00016674059081254588, + "loss": 0.494, + "step": 29900 + }, + { + "epoch": 1.5182063941313095, + "grad_norm": 0.02074484540640106, + "learning_rate": 0.00016657547853088755, + "loss": 0.4665, + "step": 29905 + }, + { + "epoch": 1.518460230229341, + "grad_norm": 0.02155076942870292, + "learning_rate": 0.00016641043169880016, + "loss": 0.4733, + "step": 29910 + }, + { + "epoch": 1.5187140663273724, + "grad_norm": 0.021665837981174028, + "learning_rate": 0.00016624545034868126, + "loss": 0.4754, + "step": 29915 + }, + { + "epoch": 1.518967902425404, + "grad_norm": 0.021946502878706562, + "learning_rate": 0.00016608053451291606, + "loss": 0.4603, + "step": 29920 + }, + { + "epoch": 1.5192217385234354, + "grad_norm": 0.021175079987024562, + "learning_rate": 0.0001659156842238766, + "loss": 0.4527, + "step": 29925 + }, + { + "epoch": 1.5194755746214668, + "grad_norm": 0.0237496786913426, + "learning_rate": 0.00016575089951392246, + "loss": 0.4646, + "step": 29930 + }, + { + "epoch": 1.5197294107194983, + "grad_norm": 0.023737688121588916, + "learning_rate": 0.0001655861804153997, + "loss": 0.4924, + "step": 29935 + }, + { + "epoch": 1.5199832468175298, + "grad_norm": 0.021122130099155718, + "learning_rate": 0.00016542152696064216, + "loss": 0.4394, + "step": 29940 + }, + { + "epoch": 1.5202370829155614, + "grad_norm": 0.02227799640020005, + "learning_rate": 0.00016525693918197017, + "loss": 0.4538, + "step": 29945 + }, + { + "epoch": 1.520490919013593, + "grad_norm": 0.02210238996254023, + "learning_rate": 0.00016509241711169182, + "loss": 0.4354, + "step": 29950 + }, + { + "epoch": 1.5207447551116244, + "grad_norm": 0.023706052350875976, + "learning_rate": 0.00016492796078210165, + "loss": 0.4673, + "step": 29955 + }, + { + "epoch": 1.520998591209656, + "grad_norm": 0.024032972742291287, + "learning_rate": 0.00016476357022548194, + "loss": 0.4675, + "step": 29960 + }, + { + "epoch": 1.5212524273076875, + "grad_norm": 0.022993782010873133, + "learning_rate": 0.0001645992454741016, + "loss": 0.467, + "step": 29965 + }, + { + "epoch": 1.521506263405719, + "grad_norm": 0.02190020611794009, + "learning_rate": 0.0001644349865602165, + "loss": 0.4702, + "step": 29970 + }, + { + "epoch": 1.5217600995037506, + "grad_norm": 0.028275273809187658, + "learning_rate": 0.00016427079351607031, + "loss": 0.4871, + "step": 29975 + }, + { + "epoch": 1.5220139356017819, + "grad_norm": 0.021445956898613706, + "learning_rate": 0.00016410666637389272, + "loss": 0.4798, + "step": 29980 + }, + { + "epoch": 1.5222677716998134, + "grad_norm": 0.02438123499531666, + "learning_rate": 0.00016394260516590175, + "loss": 0.4827, + "step": 29985 + }, + { + "epoch": 1.522521607797845, + "grad_norm": 0.022720865331639653, + "learning_rate": 0.00016377860992430128, + "loss": 0.4744, + "step": 29990 + }, + { + "epoch": 1.5227754438958763, + "grad_norm": 0.022386692990421366, + "learning_rate": 0.00016361468068128314, + "loss": 0.487, + "step": 29995 + }, + { + "epoch": 1.5230292799939078, + "grad_norm": 0.022169321428591356, + "learning_rate": 0.00016345081746902546, + "loss": 0.455, + "step": 30000 + }, + { + "epoch": 1.5232831160919393, + "grad_norm": 0.023182632494306774, + "learning_rate": 0.0001632870203196941, + "loss": 0.4439, + "step": 30005 + }, + { + "epoch": 1.5235369521899709, + "grad_norm": 0.02371609367353566, + "learning_rate": 0.00016312328926544134, + "loss": 0.4749, + "step": 30010 + }, + { + "epoch": 1.5237907882880024, + "grad_norm": 0.03987679265817027, + "learning_rate": 0.00016295962433840705, + "loss": 0.4789, + "step": 30015 + }, + { + "epoch": 1.524044624386034, + "grad_norm": 0.02301138402117863, + "learning_rate": 0.0001627960255707175, + "loss": 0.4964, + "step": 30020 + }, + { + "epoch": 1.5242984604840655, + "grad_norm": 0.03265353278900402, + "learning_rate": 0.0001626324929944867, + "loss": 0.4898, + "step": 30025 + }, + { + "epoch": 1.524552296582097, + "grad_norm": 0.022592094290072112, + "learning_rate": 0.00016246902664181483, + "loss": 0.4581, + "step": 30030 + }, + { + "epoch": 1.5248061326801285, + "grad_norm": 0.022149041415701717, + "learning_rate": 0.00016230562654478997, + "loss": 0.501, + "step": 30035 + }, + { + "epoch": 1.52505996877816, + "grad_norm": 0.027689045530166045, + "learning_rate": 0.00016214229273548626, + "loss": 0.4852, + "step": 30040 + }, + { + "epoch": 1.5253138048761914, + "grad_norm": 0.021429952552435644, + "learning_rate": 0.00016197902524596586, + "loss": 0.4657, + "step": 30045 + }, + { + "epoch": 1.525567640974223, + "grad_norm": 0.020301150588238986, + "learning_rate": 0.0001618158241082771, + "loss": 0.4545, + "step": 30050 + }, + { + "epoch": 1.5258214770722545, + "grad_norm": 0.031778032501045626, + "learning_rate": 0.00016165268935445544, + "loss": 0.4439, + "step": 30055 + }, + { + "epoch": 1.526075313170286, + "grad_norm": 0.035429628853980294, + "learning_rate": 0.00016148962101652364, + "loss": 0.4669, + "step": 30060 + }, + { + "epoch": 1.5263291492683173, + "grad_norm": 0.02204480950917167, + "learning_rate": 0.00016132661912649093, + "loss": 0.4762, + "step": 30065 + }, + { + "epoch": 1.5265829853663488, + "grad_norm": 0.10463008791075984, + "learning_rate": 0.0001611636837163541, + "loss": 0.4746, + "step": 30070 + }, + { + "epoch": 1.5268368214643804, + "grad_norm": 0.022253586897096424, + "learning_rate": 0.0001610008148180962, + "loss": 0.4731, + "step": 30075 + }, + { + "epoch": 1.527090657562412, + "grad_norm": 0.02649746556621996, + "learning_rate": 0.0001608380124636879, + "loss": 0.4716, + "step": 30080 + }, + { + "epoch": 1.5273444936604434, + "grad_norm": 0.02103303584052773, + "learning_rate": 0.00016067527668508624, + "loss": 0.4537, + "step": 30085 + }, + { + "epoch": 1.527598329758475, + "grad_norm": 0.022990234812475127, + "learning_rate": 0.00016051260751423575, + "loss": 0.4653, + "step": 30090 + }, + { + "epoch": 1.5278521658565065, + "grad_norm": 0.023544638654335974, + "learning_rate": 0.00016035000498306712, + "loss": 0.4698, + "step": 30095 + }, + { + "epoch": 1.528106001954538, + "grad_norm": 0.02200543924407069, + "learning_rate": 0.00016018746912349873, + "loss": 0.4672, + "step": 30100 + }, + { + "epoch": 1.5283598380525696, + "grad_norm": 0.024788258547969233, + "learning_rate": 0.00016002499996743553, + "loss": 0.4542, + "step": 30105 + }, + { + "epoch": 1.5286136741506011, + "grad_norm": 0.020208891949950916, + "learning_rate": 0.00015986259754676956, + "loss": 0.4441, + "step": 30110 + }, + { + "epoch": 1.5288675102486324, + "grad_norm": 0.020897749391169118, + "learning_rate": 0.00015970026189337922, + "loss": 0.4426, + "step": 30115 + }, + { + "epoch": 1.529121346346664, + "grad_norm": 0.029396390617214736, + "learning_rate": 0.00015953799303913057, + "loss": 0.4663, + "step": 30120 + }, + { + "epoch": 1.5293751824446955, + "grad_norm": 0.022145565505644067, + "learning_rate": 0.0001593757910158759, + "loss": 0.461, + "step": 30125 + }, + { + "epoch": 1.5296290185427268, + "grad_norm": 0.025670850634170336, + "learning_rate": 0.00015921365585545483, + "loss": 0.4842, + "step": 30130 + }, + { + "epoch": 1.5298828546407583, + "grad_norm": 0.02368956140515389, + "learning_rate": 0.00015905158758969351, + "loss": 0.47, + "step": 30135 + }, + { + "epoch": 1.5301366907387899, + "grad_norm": 0.029291043930741913, + "learning_rate": 0.0001588895862504054, + "loss": 0.4698, + "step": 30140 + }, + { + "epoch": 1.5303905268368214, + "grad_norm": 0.025337071026864683, + "learning_rate": 0.00015872765186939025, + "loss": 0.4856, + "step": 30145 + }, + { + "epoch": 1.530644362934853, + "grad_norm": 0.026591156717338086, + "learning_rate": 0.00015856578447843523, + "loss": 0.4747, + "step": 30150 + }, + { + "epoch": 1.5308981990328845, + "grad_norm": 0.022063128074845867, + "learning_rate": 0.0001584039841093139, + "loss": 0.4591, + "step": 30155 + }, + { + "epoch": 1.531152035130916, + "grad_norm": 0.021335741188654174, + "learning_rate": 0.00015824225079378684, + "loss": 0.4508, + "step": 30160 + }, + { + "epoch": 1.5314058712289476, + "grad_norm": 0.0223720420156148, + "learning_rate": 0.00015808058456360185, + "loss": 0.476, + "step": 30165 + }, + { + "epoch": 1.531659707326979, + "grad_norm": 0.020840588861711914, + "learning_rate": 0.00015791898545049277, + "loss": 0.4812, + "step": 30170 + }, + { + "epoch": 1.5319135434250106, + "grad_norm": 0.022298891763324967, + "learning_rate": 0.0001577574534861811, + "loss": 0.4683, + "step": 30175 + }, + { + "epoch": 1.532167379523042, + "grad_norm": 0.023587454451723652, + "learning_rate": 0.00015759598870237435, + "loss": 0.4414, + "step": 30180 + }, + { + "epoch": 1.5324212156210735, + "grad_norm": 0.021926001890396517, + "learning_rate": 0.00015743459113076757, + "loss": 0.4848, + "step": 30185 + }, + { + "epoch": 1.532675051719105, + "grad_norm": 0.02060956296914585, + "learning_rate": 0.0001572732608030421, + "loss": 0.4925, + "step": 30190 + }, + { + "epoch": 1.5329288878171363, + "grad_norm": 0.023321532056538852, + "learning_rate": 0.0001571119977508665, + "loss": 0.4832, + "step": 30195 + }, + { + "epoch": 1.5331827239151679, + "grad_norm": 0.020455538805839175, + "learning_rate": 0.00015695080200589555, + "loss": 0.46, + "step": 30200 + }, + { + "epoch": 1.5334365600131994, + "grad_norm": 0.027072650225216113, + "learning_rate": 0.0001567896735997716, + "loss": 0.5015, + "step": 30205 + }, + { + "epoch": 1.533690396111231, + "grad_norm": 0.025146547764616688, + "learning_rate": 0.00015662861256412293, + "loss": 0.4592, + "step": 30210 + }, + { + "epoch": 1.5339442322092625, + "grad_norm": 0.036222624314755626, + "learning_rate": 0.0001564676189305654, + "loss": 0.4494, + "step": 30215 + }, + { + "epoch": 1.534198068307294, + "grad_norm": 0.021401020225457784, + "learning_rate": 0.00015630669273070075, + "loss": 0.4658, + "step": 30220 + }, + { + "epoch": 1.5344519044053255, + "grad_norm": 0.030566491748286028, + "learning_rate": 0.00015614583399611864, + "loss": 0.4616, + "step": 30225 + }, + { + "epoch": 1.534705740503357, + "grad_norm": 0.02095090812110531, + "learning_rate": 0.00015598504275839443, + "loss": 0.4697, + "step": 30230 + }, + { + "epoch": 1.5349595766013886, + "grad_norm": 0.022932148457458463, + "learning_rate": 0.00015582431904909082, + "loss": 0.4771, + "step": 30235 + }, + { + "epoch": 1.5352134126994201, + "grad_norm": 0.03407518278595503, + "learning_rate": 0.00015566366289975682, + "loss": 0.4951, + "step": 30240 + }, + { + "epoch": 1.5354672487974514, + "grad_norm": 0.025253916128979353, + "learning_rate": 0.00015550307434192878, + "loss": 0.4665, + "step": 30245 + }, + { + "epoch": 1.535721084895483, + "grad_norm": 0.02181284888478828, + "learning_rate": 0.00015534255340712906, + "loss": 0.4712, + "step": 30250 + }, + { + "epoch": 1.5359749209935145, + "grad_norm": 0.026511357983416556, + "learning_rate": 0.00015518210012686746, + "loss": 0.457, + "step": 30255 + }, + { + "epoch": 1.5362287570915458, + "grad_norm": 0.02225675314670062, + "learning_rate": 0.00015502171453263985, + "loss": 0.4655, + "step": 30260 + }, + { + "epoch": 1.5364825931895774, + "grad_norm": 0.02579060282785478, + "learning_rate": 0.0001548613966559294, + "loss": 0.4537, + "step": 30265 + }, + { + "epoch": 1.536736429287609, + "grad_norm": 0.022733734170467795, + "learning_rate": 0.00015470114652820548, + "loss": 0.4893, + "step": 30270 + }, + { + "epoch": 1.5369902653856404, + "grad_norm": 0.033369886202445666, + "learning_rate": 0.0001545409641809246, + "loss": 0.4329, + "step": 30275 + }, + { + "epoch": 1.537244101483672, + "grad_norm": 0.02399529836464429, + "learning_rate": 0.00015438084964552952, + "loss": 0.4608, + "step": 30280 + }, + { + "epoch": 1.5374979375817035, + "grad_norm": 0.026731534341703136, + "learning_rate": 0.0001542208029534501, + "loss": 0.4834, + "step": 30285 + }, + { + "epoch": 1.537751773679735, + "grad_norm": 0.02160422482777416, + "learning_rate": 0.00015406082413610273, + "loss": 0.4872, + "step": 30290 + }, + { + "epoch": 1.5380056097777666, + "grad_norm": 0.0316642181786694, + "learning_rate": 0.0001539009132248903, + "loss": 0.4739, + "step": 30295 + }, + { + "epoch": 1.538259445875798, + "grad_norm": 0.021390203234012684, + "learning_rate": 0.0001537410702512027, + "loss": 0.485, + "step": 30300 + }, + { + "epoch": 1.5385132819738296, + "grad_norm": 0.021910472645039536, + "learning_rate": 0.00015358129524641612, + "loss": 0.4836, + "step": 30305 + }, + { + "epoch": 1.538767118071861, + "grad_norm": 0.026086465805670835, + "learning_rate": 0.00015342158824189383, + "loss": 0.4645, + "step": 30310 + }, + { + "epoch": 1.5390209541698925, + "grad_norm": 0.02206117837334602, + "learning_rate": 0.00015326194926898524, + "loss": 0.4429, + "step": 30315 + }, + { + "epoch": 1.539274790267924, + "grad_norm": 0.04439473835947078, + "learning_rate": 0.00015310237835902696, + "loss": 0.4955, + "step": 30320 + }, + { + "epoch": 1.5395286263659556, + "grad_norm": 0.022276905328733724, + "learning_rate": 0.0001529428755433417, + "loss": 0.4728, + "step": 30325 + }, + { + "epoch": 1.5397824624639869, + "grad_norm": 0.022985726029175126, + "learning_rate": 0.00015278344085323936, + "loss": 0.4855, + "step": 30330 + }, + { + "epoch": 1.5400362985620184, + "grad_norm": 0.02518413121712412, + "learning_rate": 0.00015262407432001585, + "loss": 0.4615, + "step": 30335 + }, + { + "epoch": 1.54029013466005, + "grad_norm": 0.02425031078779939, + "learning_rate": 0.00015246477597495418, + "loss": 0.4743, + "step": 30340 + }, + { + "epoch": 1.5405439707580815, + "grad_norm": 0.02735453587465954, + "learning_rate": 0.00015230554584932382, + "loss": 0.5191, + "step": 30345 + }, + { + "epoch": 1.540797806856113, + "grad_norm": 0.019919301710259858, + "learning_rate": 0.00015214638397438108, + "loss": 0.4306, + "step": 30350 + }, + { + "epoch": 1.5410516429541445, + "grad_norm": 0.0254003448653861, + "learning_rate": 0.00015198729038136822, + "loss": 0.4859, + "step": 30355 + }, + { + "epoch": 1.541305479052176, + "grad_norm": 0.025573678483990125, + "learning_rate": 0.00015182826510151486, + "loss": 0.4723, + "step": 30360 + }, + { + "epoch": 1.5415593151502076, + "grad_norm": 0.027652411805370227, + "learning_rate": 0.00015166930816603658, + "loss": 0.455, + "step": 30365 + }, + { + "epoch": 1.5418131512482391, + "grad_norm": 0.020242159971136965, + "learning_rate": 0.00015151041960613615, + "loss": 0.4484, + "step": 30370 + }, + { + "epoch": 1.5420669873462707, + "grad_norm": 0.020441851093310592, + "learning_rate": 0.0001513515994530023, + "loss": 0.4777, + "step": 30375 + }, + { + "epoch": 1.542320823444302, + "grad_norm": 0.02264328559702477, + "learning_rate": 0.00015119284773781088, + "loss": 0.4692, + "step": 30380 + }, + { + "epoch": 1.5425746595423335, + "grad_norm": 0.026801259654149444, + "learning_rate": 0.00015103416449172385, + "loss": 0.4879, + "step": 30385 + }, + { + "epoch": 1.542828495640365, + "grad_norm": 0.029775650566010867, + "learning_rate": 0.0001508755497458902, + "loss": 0.4733, + "step": 30390 + }, + { + "epoch": 1.5430823317383964, + "grad_norm": 0.023702035657112406, + "learning_rate": 0.00015071700353144486, + "loss": 0.4844, + "step": 30395 + }, + { + "epoch": 1.543336167836428, + "grad_norm": 0.02932424299604861, + "learning_rate": 0.00015055852587950985, + "loss": 0.4498, + "step": 30400 + }, + { + "epoch": 1.5435900039344594, + "grad_norm": 0.024319591793915026, + "learning_rate": 0.0001504001168211937, + "loss": 0.485, + "step": 30405 + }, + { + "epoch": 1.543843840032491, + "grad_norm": 0.02366358061890231, + "learning_rate": 0.00015024177638759106, + "loss": 0.4566, + "step": 30410 + }, + { + "epoch": 1.5440976761305225, + "grad_norm": 0.024118758618433097, + "learning_rate": 0.00015008350460978358, + "loss": 0.4397, + "step": 30415 + }, + { + "epoch": 1.544351512228554, + "grad_norm": 0.020666341449150978, + "learning_rate": 0.00014992530151883898, + "loss": 0.4599, + "step": 30420 + }, + { + "epoch": 1.5446053483265856, + "grad_norm": 0.02778511686784231, + "learning_rate": 0.000149767167145812, + "loss": 0.454, + "step": 30425 + }, + { + "epoch": 1.5448591844246171, + "grad_norm": 0.02389674933505474, + "learning_rate": 0.0001496091015217434, + "loss": 0.463, + "step": 30430 + }, + { + "epoch": 1.5451130205226487, + "grad_norm": 0.027551341882567568, + "learning_rate": 0.00014945110467766087, + "loss": 0.4622, + "step": 30435 + }, + { + "epoch": 1.5453668566206802, + "grad_norm": 0.02501687597581702, + "learning_rate": 0.0001492931766445782, + "loss": 0.4764, + "step": 30440 + }, + { + "epoch": 1.5456206927187115, + "grad_norm": 0.0233086153289092, + "learning_rate": 0.0001491353174534961, + "loss": 0.4686, + "step": 30445 + }, + { + "epoch": 1.545874528816743, + "grad_norm": 0.02184154260928588, + "learning_rate": 0.0001489775271354013, + "loss": 0.4702, + "step": 30450 + }, + { + "epoch": 1.5461283649147746, + "grad_norm": 0.024678224162576597, + "learning_rate": 0.00014881980572126752, + "loss": 0.483, + "step": 30455 + }, + { + "epoch": 1.5463822010128059, + "grad_norm": 0.023783148168890343, + "learning_rate": 0.00014866215324205423, + "loss": 0.4683, + "step": 30460 + }, + { + "epoch": 1.5466360371108374, + "grad_norm": 0.022198329720484546, + "learning_rate": 0.00014850456972870845, + "loss": 0.458, + "step": 30465 + }, + { + "epoch": 1.546889873208869, + "grad_norm": 0.02239881749730953, + "learning_rate": 0.00014834705521216262, + "loss": 0.4715, + "step": 30470 + }, + { + "epoch": 1.5471437093069005, + "grad_norm": 0.022857676745027398, + "learning_rate": 0.0001481896097233363, + "loss": 0.4553, + "step": 30475 + }, + { + "epoch": 1.547397545404932, + "grad_norm": 0.021961872351366773, + "learning_rate": 0.00014803223329313493, + "loss": 0.458, + "step": 30480 + }, + { + "epoch": 1.5476513815029636, + "grad_norm": 0.025052913119013042, + "learning_rate": 0.00014787492595245107, + "loss": 0.4809, + "step": 30485 + }, + { + "epoch": 1.547905217600995, + "grad_norm": 0.023440120967819812, + "learning_rate": 0.00014771768773216298, + "loss": 0.4511, + "step": 30490 + }, + { + "epoch": 1.5481590536990266, + "grad_norm": 0.021273780361796304, + "learning_rate": 0.00014756051866313618, + "loss": 0.4716, + "step": 30495 + }, + { + "epoch": 1.5484128897970582, + "grad_norm": 0.17776653515516164, + "learning_rate": 0.00014740341877622181, + "loss": 0.4469, + "step": 30500 + }, + { + "epoch": 1.5486667258950897, + "grad_norm": 0.028505550422634855, + "learning_rate": 0.0001472463881022581, + "loss": 0.4391, + "step": 30505 + }, + { + "epoch": 1.548920561993121, + "grad_norm": 0.021992503155247888, + "learning_rate": 0.00014708942667206903, + "loss": 0.4806, + "step": 30510 + }, + { + "epoch": 1.5491743980911525, + "grad_norm": 0.022741796670375448, + "learning_rate": 0.0001469325345164657, + "loss": 0.4632, + "step": 30515 + }, + { + "epoch": 1.549428234189184, + "grad_norm": 0.020106646227831544, + "learning_rate": 0.00014677571166624498, + "loss": 0.4321, + "step": 30520 + }, + { + "epoch": 1.5496820702872154, + "grad_norm": 0.02404819367108056, + "learning_rate": 0.0001466189581521905, + "loss": 0.5009, + "step": 30525 + }, + { + "epoch": 1.549935906385247, + "grad_norm": 0.020931172832830407, + "learning_rate": 0.00014646227400507238, + "loss": 0.4553, + "step": 30530 + }, + { + "epoch": 1.5501897424832785, + "grad_norm": 0.021373194488072105, + "learning_rate": 0.00014630565925564666, + "loss": 0.4716, + "step": 30535 + }, + { + "epoch": 1.55044357858131, + "grad_norm": 0.0197633540678443, + "learning_rate": 0.0001461491139346563, + "loss": 0.4438, + "step": 30540 + }, + { + "epoch": 1.5506974146793415, + "grad_norm": 0.02187526961268043, + "learning_rate": 0.00014599263807283004, + "loss": 0.475, + "step": 30545 + }, + { + "epoch": 1.550951250777373, + "grad_norm": 0.026271464926687128, + "learning_rate": 0.00014583623170088368, + "loss": 0.4536, + "step": 30550 + }, + { + "epoch": 1.5512050868754046, + "grad_norm": 0.02494946006654002, + "learning_rate": 0.00014567989484951866, + "loss": 0.4895, + "step": 30555 + }, + { + "epoch": 1.5514589229734361, + "grad_norm": 0.021477409162667542, + "learning_rate": 0.00014552362754942345, + "loss": 0.4711, + "step": 30560 + }, + { + "epoch": 1.5517127590714677, + "grad_norm": 0.02423951069240377, + "learning_rate": 0.00014536742983127222, + "loss": 0.4594, + "step": 30565 + }, + { + "epoch": 1.5519665951694992, + "grad_norm": 0.022676565651745723, + "learning_rate": 0.0001452113017257261, + "loss": 0.4663, + "step": 30570 + }, + { + "epoch": 1.5522204312675305, + "grad_norm": 0.02291640098589712, + "learning_rate": 0.000145055243263432, + "loss": 0.4895, + "step": 30575 + }, + { + "epoch": 1.552474267365562, + "grad_norm": 0.028475726341260384, + "learning_rate": 0.0001448992544750235, + "loss": 0.4935, + "step": 30580 + }, + { + "epoch": 1.5527281034635936, + "grad_norm": 0.027950998531424066, + "learning_rate": 0.0001447433353911205, + "loss": 0.4825, + "step": 30585 + }, + { + "epoch": 1.5529819395616251, + "grad_norm": 0.03974053562564052, + "learning_rate": 0.00014458748604232924, + "loss": 0.4758, + "step": 30590 + }, + { + "epoch": 1.5532357756596564, + "grad_norm": 0.022027661241062403, + "learning_rate": 0.00014443170645924192, + "loss": 0.4816, + "step": 30595 + }, + { + "epoch": 1.553489611757688, + "grad_norm": 0.02238767269393634, + "learning_rate": 0.0001442759966724375, + "loss": 0.4531, + "step": 30600 + }, + { + "epoch": 1.5537434478557195, + "grad_norm": 0.026046907786261127, + "learning_rate": 0.0001441203567124808, + "loss": 0.459, + "step": 30605 + }, + { + "epoch": 1.553997283953751, + "grad_norm": 0.023211596978539077, + "learning_rate": 0.00014396478660992353, + "loss": 0.4597, + "step": 30610 + }, + { + "epoch": 1.5542511200517826, + "grad_norm": 0.02283234851673069, + "learning_rate": 0.00014380928639530282, + "loss": 0.4729, + "step": 30615 + }, + { + "epoch": 1.554504956149814, + "grad_norm": 0.022351336504833237, + "learning_rate": 0.00014365385609914312, + "loss": 0.4719, + "step": 30620 + }, + { + "epoch": 1.5547587922478456, + "grad_norm": 0.022992716838049543, + "learning_rate": 0.00014349849575195423, + "loss": 0.444, + "step": 30625 + }, + { + "epoch": 1.5550126283458772, + "grad_norm": 0.024021395143192816, + "learning_rate": 0.00014334320538423285, + "loss": 0.4633, + "step": 30630 + }, + { + "epoch": 1.5552664644439087, + "grad_norm": 0.023643090769376898, + "learning_rate": 0.00014318798502646146, + "loss": 0.4528, + "step": 30635 + }, + { + "epoch": 1.55552030054194, + "grad_norm": 0.01987202638142388, + "learning_rate": 0.00014303283470910923, + "loss": 0.4648, + "step": 30640 + }, + { + "epoch": 1.5557741366399715, + "grad_norm": 0.022557897730457903, + "learning_rate": 0.00014287775446263147, + "loss": 0.4736, + "step": 30645 + }, + { + "epoch": 1.556027972738003, + "grad_norm": 0.045642258711147084, + "learning_rate": 0.0001427227443174694, + "loss": 0.4524, + "step": 30650 + }, + { + "epoch": 1.5562818088360346, + "grad_norm": 0.02528878503814024, + "learning_rate": 0.00014256780430405103, + "loss": 0.4454, + "step": 30655 + }, + { + "epoch": 1.556535644934066, + "grad_norm": 0.026569740501743554, + "learning_rate": 0.00014241293445279, + "loss": 0.4783, + "step": 30660 + }, + { + "epoch": 1.5567894810320975, + "grad_norm": 0.021501409027796143, + "learning_rate": 0.00014225813479408684, + "loss": 0.4639, + "step": 30665 + }, + { + "epoch": 1.557043317130129, + "grad_norm": 0.02015158186890378, + "learning_rate": 0.0001421034053583276, + "loss": 0.452, + "step": 30670 + }, + { + "epoch": 1.5572971532281605, + "grad_norm": 0.02290562338028314, + "learning_rate": 0.00014194874617588522, + "loss": 0.4564, + "step": 30675 + }, + { + "epoch": 1.557550989326192, + "grad_norm": 0.021262535192917962, + "learning_rate": 0.0001417941572771182, + "loss": 0.4404, + "step": 30680 + }, + { + "epoch": 1.5578048254242236, + "grad_norm": 0.023855140132822632, + "learning_rate": 0.0001416396386923719, + "loss": 0.4657, + "step": 30685 + }, + { + "epoch": 1.5580586615222551, + "grad_norm": 0.026001147130948372, + "learning_rate": 0.00014148519045197722, + "loss": 0.4983, + "step": 30690 + }, + { + "epoch": 1.5583124976202867, + "grad_norm": 0.025029246692496336, + "learning_rate": 0.00014133081258625192, + "loss": 0.4728, + "step": 30695 + }, + { + "epoch": 1.5585663337183182, + "grad_norm": 0.02861011211966918, + "learning_rate": 0.00014117650512549912, + "loss": 0.4667, + "step": 30700 + }, + { + "epoch": 1.5588201698163497, + "grad_norm": 0.020989857959052866, + "learning_rate": 0.00014102226810000919, + "loss": 0.4785, + "step": 30705 + }, + { + "epoch": 1.559074005914381, + "grad_norm": 0.02520491934250732, + "learning_rate": 0.0001408681015400577, + "loss": 0.4715, + "step": 30710 + }, + { + "epoch": 1.5593278420124126, + "grad_norm": 0.02884493504793044, + "learning_rate": 0.000140714005475907, + "loss": 0.4626, + "step": 30715 + }, + { + "epoch": 1.5595816781104441, + "grad_norm": 0.023290080100767288, + "learning_rate": 0.00014055997993780512, + "loss": 0.4761, + "step": 30720 + }, + { + "epoch": 1.5598355142084754, + "grad_norm": 0.024842802813832925, + "learning_rate": 0.0001404060249559868, + "loss": 0.4583, + "step": 30725 + }, + { + "epoch": 1.560089350306507, + "grad_norm": 0.02281559318997292, + "learning_rate": 0.00014025214056067237, + "loss": 0.4806, + "step": 30730 + }, + { + "epoch": 1.5603431864045385, + "grad_norm": 0.033399438534170926, + "learning_rate": 0.00014009832678206887, + "loss": 0.4709, + "step": 30735 + }, + { + "epoch": 1.56059702250257, + "grad_norm": 0.028247300578566486, + "learning_rate": 0.00013994458365036879, + "loss": 0.4727, + "step": 30740 + }, + { + "epoch": 1.5608508586006016, + "grad_norm": 0.02286113664149876, + "learning_rate": 0.0001397909111957515, + "loss": 0.4916, + "step": 30745 + }, + { + "epoch": 1.561104694698633, + "grad_norm": 0.02910635163937602, + "learning_rate": 0.00013963730944838181, + "loss": 0.4586, + "step": 30750 + }, + { + "epoch": 1.5613585307966646, + "grad_norm": 0.028491477035486, + "learning_rate": 0.00013948377843841137, + "loss": 0.4695, + "step": 30755 + }, + { + "epoch": 1.5616123668946962, + "grad_norm": 0.0224890085130081, + "learning_rate": 0.00013933031819597714, + "loss": 0.4666, + "step": 30760 + }, + { + "epoch": 1.5618662029927277, + "grad_norm": 0.019158984174170664, + "learning_rate": 0.00013917692875120276, + "loss": 0.4534, + "step": 30765 + }, + { + "epoch": 1.5621200390907592, + "grad_norm": 0.023081415384837618, + "learning_rate": 0.00013902361013419807, + "loss": 0.4889, + "step": 30770 + }, + { + "epoch": 1.5623738751887906, + "grad_norm": 0.023710781179949143, + "learning_rate": 0.0001388703623750583, + "loss": 0.4508, + "step": 30775 + }, + { + "epoch": 1.562627711286822, + "grad_norm": 0.02229210338649228, + "learning_rate": 0.00013871718550386564, + "loss": 0.4581, + "step": 30780 + }, + { + "epoch": 1.5628815473848536, + "grad_norm": 0.02255486124550216, + "learning_rate": 0.00013856407955068755, + "loss": 0.468, + "step": 30785 + }, + { + "epoch": 1.563135383482885, + "grad_norm": 0.028951039267342597, + "learning_rate": 0.0001384110445455784, + "loss": 0.4481, + "step": 30790 + }, + { + "epoch": 1.5633892195809165, + "grad_norm": 0.03206682793554021, + "learning_rate": 0.00013825808051857774, + "loss": 0.4628, + "step": 30795 + }, + { + "epoch": 1.563643055678948, + "grad_norm": 0.03315618731467156, + "learning_rate": 0.00013810518749971207, + "loss": 0.4913, + "step": 30800 + }, + { + "epoch": 1.5638968917769795, + "grad_norm": 0.020986799281081246, + "learning_rate": 0.00013795236551899316, + "loss": 0.4788, + "step": 30805 + }, + { + "epoch": 1.564150727875011, + "grad_norm": 0.027439058381662006, + "learning_rate": 0.0001377996146064195, + "loss": 0.5025, + "step": 30810 + }, + { + "epoch": 1.5644045639730426, + "grad_norm": 0.03575554930996507, + "learning_rate": 0.00013764693479197503, + "loss": 0.4559, + "step": 30815 + }, + { + "epoch": 1.5646584000710742, + "grad_norm": 0.022347257231482485, + "learning_rate": 0.00013749432610563045, + "loss": 0.464, + "step": 30820 + }, + { + "epoch": 1.5649122361691057, + "grad_norm": 0.022141752765691802, + "learning_rate": 0.00013734178857734147, + "loss": 0.4646, + "step": 30825 + }, + { + "epoch": 1.5651660722671372, + "grad_norm": 0.0205841238198598, + "learning_rate": 0.0001371893222370511, + "loss": 0.4786, + "step": 30830 + }, + { + "epoch": 1.5654199083651688, + "grad_norm": 0.02330750400737154, + "learning_rate": 0.00013703692711468734, + "loss": 0.479, + "step": 30835 + }, + { + "epoch": 1.5656737444632, + "grad_norm": 0.02083094398512009, + "learning_rate": 0.00013688460324016484, + "loss": 0.4422, + "step": 30840 + }, + { + "epoch": 1.5659275805612316, + "grad_norm": 0.02046271028364385, + "learning_rate": 0.00013673235064338375, + "loss": 0.4481, + "step": 30845 + }, + { + "epoch": 1.5661814166592631, + "grad_norm": 0.020193980043060795, + "learning_rate": 0.00013658016935423067, + "loss": 0.4385, + "step": 30850 + }, + { + "epoch": 1.5664352527572944, + "grad_norm": 0.023275239337087233, + "learning_rate": 0.0001364280594025779, + "loss": 0.4721, + "step": 30855 + }, + { + "epoch": 1.566689088855326, + "grad_norm": 0.021162190591902365, + "learning_rate": 0.00013627602081828412, + "loss": 0.4614, + "step": 30860 + }, + { + "epoch": 1.5669429249533575, + "grad_norm": 0.020039658211474635, + "learning_rate": 0.00013612405363119334, + "loss": 0.4461, + "step": 30865 + }, + { + "epoch": 1.567196761051389, + "grad_norm": 0.023617453564502285, + "learning_rate": 0.00013597215787113638, + "loss": 0.4713, + "step": 30870 + }, + { + "epoch": 1.5674505971494206, + "grad_norm": 0.025261328277977876, + "learning_rate": 0.00013582033356792923, + "loss": 0.464, + "step": 30875 + }, + { + "epoch": 1.5677044332474521, + "grad_norm": 0.022210536243253937, + "learning_rate": 0.00013566858075137462, + "loss": 0.4461, + "step": 30880 + }, + { + "epoch": 1.5679582693454837, + "grad_norm": 0.02139787157750851, + "learning_rate": 0.00013551689945126056, + "loss": 0.4619, + "step": 30885 + }, + { + "epoch": 1.5682121054435152, + "grad_norm": 0.02347373800866568, + "learning_rate": 0.0001353652896973614, + "loss": 0.4681, + "step": 30890 + }, + { + "epoch": 1.5684659415415467, + "grad_norm": 0.027527125241354457, + "learning_rate": 0.00013521375151943766, + "loss": 0.4738, + "step": 30895 + }, + { + "epoch": 1.5687197776395783, + "grad_norm": 0.023524626568918166, + "learning_rate": 0.0001350622849472351, + "loss": 0.4821, + "step": 30900 + }, + { + "epoch": 1.5689736137376096, + "grad_norm": 0.02466639732929348, + "learning_rate": 0.00013491089001048628, + "loss": 0.4721, + "step": 30905 + }, + { + "epoch": 1.569227449835641, + "grad_norm": 0.027029307313648266, + "learning_rate": 0.00013475956673890887, + "loss": 0.4678, + "step": 30910 + }, + { + "epoch": 1.5694812859336726, + "grad_norm": 0.03282760997469555, + "learning_rate": 0.0001346083151622072, + "loss": 0.4394, + "step": 30915 + }, + { + "epoch": 1.5697351220317042, + "grad_norm": 0.02789891853127172, + "learning_rate": 0.00013445713531007092, + "loss": 0.4628, + "step": 30920 + }, + { + "epoch": 1.5699889581297355, + "grad_norm": 0.02426468888804551, + "learning_rate": 0.00013430602721217617, + "loss": 0.48, + "step": 30925 + }, + { + "epoch": 1.570242794227767, + "grad_norm": 0.021681960512083976, + "learning_rate": 0.0001341549908981844, + "loss": 0.4885, + "step": 30930 + }, + { + "epoch": 1.5704966303257986, + "grad_norm": 0.022832994313996206, + "learning_rate": 0.00013400402639774362, + "loss": 0.4431, + "step": 30935 + }, + { + "epoch": 1.57075046642383, + "grad_norm": 0.024825658736222323, + "learning_rate": 0.00013385313374048708, + "loss": 0.4614, + "step": 30940 + }, + { + "epoch": 1.5710043025218616, + "grad_norm": 0.021382123283392958, + "learning_rate": 0.0001337023129560344, + "loss": 0.4473, + "step": 30945 + }, + { + "epoch": 1.5712581386198932, + "grad_norm": 0.022778600002653724, + "learning_rate": 0.000133551564073991, + "loss": 0.4876, + "step": 30950 + }, + { + "epoch": 1.5715119747179247, + "grad_norm": 0.022378373834901225, + "learning_rate": 0.0001334008871239482, + "loss": 0.4689, + "step": 30955 + }, + { + "epoch": 1.5717658108159562, + "grad_norm": 0.022126622701371928, + "learning_rate": 0.0001332502821354829, + "loss": 0.4686, + "step": 30960 + }, + { + "epoch": 1.5720196469139878, + "grad_norm": 0.02328246824343477, + "learning_rate": 0.00013309974913815843, + "loss": 0.49, + "step": 30965 + }, + { + "epoch": 1.5722734830120193, + "grad_norm": 0.022513981649263024, + "learning_rate": 0.0001329492881615233, + "loss": 0.4663, + "step": 30970 + }, + { + "epoch": 1.5725273191100506, + "grad_norm": 0.02648229761719871, + "learning_rate": 0.00013279889923511256, + "loss": 0.4824, + "step": 30975 + }, + { + "epoch": 1.5727811552080821, + "grad_norm": 0.031697592758533163, + "learning_rate": 0.00013264858238844652, + "loss": 0.4597, + "step": 30980 + }, + { + "epoch": 1.5730349913061137, + "grad_norm": 0.023692598750396664, + "learning_rate": 0.0001324983376510319, + "loss": 0.4792, + "step": 30985 + }, + { + "epoch": 1.573288827404145, + "grad_norm": 0.02253078704005018, + "learning_rate": 0.0001323481650523608, + "loss": 0.4632, + "step": 30990 + }, + { + "epoch": 1.5735426635021765, + "grad_norm": 0.023678013208962723, + "learning_rate": 0.00013219806462191154, + "loss": 0.4776, + "step": 30995 + }, + { + "epoch": 1.573796499600208, + "grad_norm": 0.022573287303128707, + "learning_rate": 0.00013204803638914791, + "loss": 0.5042, + "step": 31000 + }, + { + "epoch": 1.5740503356982396, + "grad_norm": 0.022353046325857568, + "learning_rate": 0.00013189808038351953, + "loss": 0.468, + "step": 31005 + }, + { + "epoch": 1.5743041717962711, + "grad_norm": 0.019324948043265216, + "learning_rate": 0.00013174819663446254, + "loss": 0.4637, + "step": 31010 + }, + { + "epoch": 1.5745580078943027, + "grad_norm": 0.02228322129344453, + "learning_rate": 0.00013159838517139795, + "loss": 0.4464, + "step": 31015 + }, + { + "epoch": 1.5748118439923342, + "grad_norm": 0.020602212019198058, + "learning_rate": 0.00013144864602373325, + "loss": 0.4768, + "step": 31020 + }, + { + "epoch": 1.5750656800903657, + "grad_norm": 0.021221567403693256, + "learning_rate": 0.0001312989792208612, + "loss": 0.4386, + "step": 31025 + }, + { + "epoch": 1.5753195161883973, + "grad_norm": 0.022718156061845098, + "learning_rate": 0.00013114938479216105, + "loss": 0.4555, + "step": 31030 + }, + { + "epoch": 1.5755733522864288, + "grad_norm": 0.026040312386550257, + "learning_rate": 0.000130999862766997, + "loss": 0.4637, + "step": 31035 + }, + { + "epoch": 1.5758271883844601, + "grad_norm": 0.025384578710037565, + "learning_rate": 0.00013085041317471984, + "loss": 0.4709, + "step": 31040 + }, + { + "epoch": 1.5760810244824917, + "grad_norm": 0.019114058763165305, + "learning_rate": 0.00013070103604466548, + "loss": 0.43, + "step": 31045 + }, + { + "epoch": 1.5763348605805232, + "grad_norm": 0.02267175025832458, + "learning_rate": 0.00013055173140615623, + "loss": 0.4853, + "step": 31050 + }, + { + "epoch": 1.5765886966785545, + "grad_norm": 0.02231164473017753, + "learning_rate": 0.00013040249928849952, + "loss": 0.4755, + "step": 31055 + }, + { + "epoch": 1.576842532776586, + "grad_norm": 0.025173835816743373, + "learning_rate": 0.00013025333972098912, + "loss": 0.4666, + "step": 31060 + }, + { + "epoch": 1.5770963688746176, + "grad_norm": 0.022930909710894763, + "learning_rate": 0.00013010425273290394, + "loss": 0.4614, + "step": 31065 + }, + { + "epoch": 1.577350204972649, + "grad_norm": 0.019310788047425583, + "learning_rate": 0.00012995523835350958, + "loss": 0.4661, + "step": 31070 + }, + { + "epoch": 1.5776040410706806, + "grad_norm": 0.033091726187777254, + "learning_rate": 0.0001298062966120564, + "loss": 0.4882, + "step": 31075 + }, + { + "epoch": 1.5778578771687122, + "grad_norm": 0.02326961130506038, + "learning_rate": 0.00012965742753778115, + "loss": 0.4549, + "step": 31080 + }, + { + "epoch": 1.5781117132667437, + "grad_norm": 0.03266197407987793, + "learning_rate": 0.00012950863115990602, + "loss": 0.4458, + "step": 31085 + }, + { + "epoch": 1.5783655493647752, + "grad_norm": 0.021876108652836432, + "learning_rate": 0.00012935990750763876, + "loss": 0.4695, + "step": 31090 + }, + { + "epoch": 1.5786193854628068, + "grad_norm": 0.026393027361750386, + "learning_rate": 0.00012921125661017347, + "loss": 0.4486, + "step": 31095 + }, + { + "epoch": 1.5788732215608383, + "grad_norm": 0.02184839617602591, + "learning_rate": 0.0001290626784966892, + "loss": 0.4432, + "step": 31100 + }, + { + "epoch": 1.5791270576588696, + "grad_norm": 0.029331145194564957, + "learning_rate": 0.00012891417319635146, + "loss": 0.4636, + "step": 31105 + }, + { + "epoch": 1.5793808937569012, + "grad_norm": 0.02774728403803806, + "learning_rate": 0.0001287657407383107, + "loss": 0.4598, + "step": 31110 + }, + { + "epoch": 1.5796347298549327, + "grad_norm": 0.02128441451715541, + "learning_rate": 0.0001286173811517039, + "loss": 0.461, + "step": 31115 + }, + { + "epoch": 1.579888565952964, + "grad_norm": 0.026163707221929008, + "learning_rate": 0.00012846909446565297, + "loss": 0.4585, + "step": 31120 + }, + { + "epoch": 1.5801424020509955, + "grad_norm": 0.022788567101665227, + "learning_rate": 0.00012832088070926595, + "loss": 0.4514, + "step": 31125 + }, + { + "epoch": 1.580396238149027, + "grad_norm": 0.023031836651392787, + "learning_rate": 0.00012817273991163648, + "loss": 0.4727, + "step": 31130 + }, + { + "epoch": 1.5806500742470586, + "grad_norm": 0.022560904451555855, + "learning_rate": 0.00012802467210184398, + "loss": 0.4728, + "step": 31135 + }, + { + "epoch": 1.5809039103450901, + "grad_norm": 0.024809637462144523, + "learning_rate": 0.00012787667730895325, + "loss": 0.4804, + "step": 31140 + }, + { + "epoch": 1.5811577464431217, + "grad_norm": 0.027218959388466347, + "learning_rate": 0.00012772875556201507, + "loss": 0.4705, + "step": 31145 + }, + { + "epoch": 1.5814115825411532, + "grad_norm": 0.022561455510123646, + "learning_rate": 0.0001275809068900655, + "loss": 0.45, + "step": 31150 + }, + { + "epoch": 1.5816654186391848, + "grad_norm": 0.022204515235270998, + "learning_rate": 0.00012743313132212685, + "loss": 0.4892, + "step": 31155 + }, + { + "epoch": 1.5819192547372163, + "grad_norm": 0.022075257403621297, + "learning_rate": 0.00012728542888720633, + "loss": 0.4829, + "step": 31160 + }, + { + "epoch": 1.5821730908352478, + "grad_norm": 0.028454740968265037, + "learning_rate": 0.0001271377996142976, + "loss": 0.4713, + "step": 31165 + }, + { + "epoch": 1.5824269269332791, + "grad_norm": 0.02252943808074146, + "learning_rate": 0.00012699024353237921, + "loss": 0.4869, + "step": 31170 + }, + { + "epoch": 1.5826807630313107, + "grad_norm": 0.02494152110893949, + "learning_rate": 0.0001268427606704159, + "loss": 0.46, + "step": 31175 + }, + { + "epoch": 1.5829345991293422, + "grad_norm": 0.02655046766353095, + "learning_rate": 0.00012669535105735763, + "loss": 0.4563, + "step": 31180 + }, + { + "epoch": 1.5831884352273737, + "grad_norm": 0.023971988910522334, + "learning_rate": 0.0001265480147221403, + "loss": 0.4794, + "step": 31185 + }, + { + "epoch": 1.583442271325405, + "grad_norm": 0.02428090384726735, + "learning_rate": 0.00012640075169368536, + "loss": 0.4667, + "step": 31190 + }, + { + "epoch": 1.5836961074234366, + "grad_norm": 0.02616066326968095, + "learning_rate": 0.0001262535620008996, + "loss": 0.4711, + "step": 31195 + }, + { + "epoch": 1.5839499435214681, + "grad_norm": 0.029430410330922516, + "learning_rate": 0.00012610644567267592, + "loss": 0.4481, + "step": 31200 + }, + { + "epoch": 1.5842037796194997, + "grad_norm": 0.023433715884341892, + "learning_rate": 0.0001259594027378922, + "loss": 0.4922, + "step": 31205 + }, + { + "epoch": 1.5844576157175312, + "grad_norm": 0.022517731868072016, + "learning_rate": 0.00012581243322541252, + "loss": 0.5017, + "step": 31210 + }, + { + "epoch": 1.5847114518155627, + "grad_norm": 0.019566396686039718, + "learning_rate": 0.000125665537164086, + "loss": 0.4659, + "step": 31215 + }, + { + "epoch": 1.5849652879135943, + "grad_norm": 0.02130331252828402, + "learning_rate": 0.00012551871458274787, + "loss": 0.4998, + "step": 31220 + }, + { + "epoch": 1.5852191240116258, + "grad_norm": 0.021977116905423498, + "learning_rate": 0.0001253719655102184, + "loss": 0.4663, + "step": 31225 + }, + { + "epoch": 1.5854729601096573, + "grad_norm": 0.025769773014901368, + "learning_rate": 0.0001252252899753039, + "loss": 0.4722, + "step": 31230 + }, + { + "epoch": 1.5857267962076889, + "grad_norm": 0.028231473801800827, + "learning_rate": 0.00012507868800679594, + "loss": 0.4624, + "step": 31235 + }, + { + "epoch": 1.5859806323057202, + "grad_norm": 0.022650562195746557, + "learning_rate": 0.00012493215963347188, + "loss": 0.4796, + "step": 31240 + }, + { + "epoch": 1.5862344684037517, + "grad_norm": 0.029622948175144603, + "learning_rate": 0.00012478570488409413, + "loss": 0.4543, + "step": 31245 + }, + { + "epoch": 1.5864883045017832, + "grad_norm": 0.021229634657799406, + "learning_rate": 0.00012463932378741166, + "loss": 0.4534, + "step": 31250 + }, + { + "epoch": 1.5867421405998146, + "grad_norm": 0.02759828595048276, + "learning_rate": 0.00012449301637215782, + "loss": 0.4761, + "step": 31255 + }, + { + "epoch": 1.586995976697846, + "grad_norm": 0.022925858589321915, + "learning_rate": 0.0001243467826670524, + "loss": 0.4405, + "step": 31260 + }, + { + "epoch": 1.5872498127958776, + "grad_norm": 0.023105250881910747, + "learning_rate": 0.00012420062270079995, + "loss": 0.4712, + "step": 31265 + }, + { + "epoch": 1.5875036488939092, + "grad_norm": 0.021868594752397733, + "learning_rate": 0.00012405453650209136, + "loss": 0.4677, + "step": 31270 + }, + { + "epoch": 1.5877574849919407, + "grad_norm": 0.03077895247333048, + "learning_rate": 0.00012390852409960223, + "loss": 0.4539, + "step": 31275 + }, + { + "epoch": 1.5880113210899722, + "grad_norm": 0.02329117852026229, + "learning_rate": 0.00012376258552199444, + "loss": 0.4749, + "step": 31280 + }, + { + "epoch": 1.5882651571880038, + "grad_norm": 0.021664079596118983, + "learning_rate": 0.00012361672079791469, + "loss": 0.4691, + "step": 31285 + }, + { + "epoch": 1.5885189932860353, + "grad_norm": 0.025983796005873985, + "learning_rate": 0.00012347092995599574, + "loss": 0.4481, + "step": 31290 + }, + { + "epoch": 1.5887728293840668, + "grad_norm": 0.027325008699016377, + "learning_rate": 0.00012332521302485533, + "loss": 0.4636, + "step": 31295 + }, + { + "epoch": 1.5890266654820984, + "grad_norm": 0.020269269921609292, + "learning_rate": 0.00012317957003309726, + "loss": 0.425, + "step": 31300 + }, + { + "epoch": 1.5892805015801297, + "grad_norm": 0.019317983071545985, + "learning_rate": 0.00012303400100931029, + "loss": 0.4336, + "step": 31305 + }, + { + "epoch": 1.5895343376781612, + "grad_norm": 0.02161940503170865, + "learning_rate": 0.00012288850598206902, + "loss": 0.4696, + "step": 31310 + }, + { + "epoch": 1.5897881737761927, + "grad_norm": 0.026885333685387532, + "learning_rate": 0.00012274308497993346, + "loss": 0.4598, + "step": 31315 + }, + { + "epoch": 1.590042009874224, + "grad_norm": 0.023792733517593017, + "learning_rate": 0.0001225977380314488, + "loss": 0.4553, + "step": 31320 + }, + { + "epoch": 1.5902958459722556, + "grad_norm": 0.02653728675273625, + "learning_rate": 0.00012245246516514626, + "loss": 0.4675, + "step": 31325 + }, + { + "epoch": 1.5905496820702871, + "grad_norm": 0.021186203668840244, + "learning_rate": 0.00012230726640954183, + "loss": 0.4436, + "step": 31330 + }, + { + "epoch": 1.5908035181683187, + "grad_norm": 0.025039321378471157, + "learning_rate": 0.0001221621417931375, + "loss": 0.4697, + "step": 31335 + }, + { + "epoch": 1.5910573542663502, + "grad_norm": 0.024695574034159236, + "learning_rate": 0.00012201709134442041, + "loss": 0.4479, + "step": 31340 + }, + { + "epoch": 1.5913111903643817, + "grad_norm": 0.020718266810303008, + "learning_rate": 0.00012187211509186341, + "loss": 0.479, + "step": 31345 + }, + { + "epoch": 1.5915650264624133, + "grad_norm": 0.02115176797212826, + "learning_rate": 0.00012172721306392437, + "loss": 0.4714, + "step": 31350 + }, + { + "epoch": 1.5918188625604448, + "grad_norm": 0.02176260766701053, + "learning_rate": 0.00012158238528904707, + "loss": 0.4647, + "step": 31355 + }, + { + "epoch": 1.5920726986584763, + "grad_norm": 0.02570121404643556, + "learning_rate": 0.00012143763179566026, + "loss": 0.456, + "step": 31360 + }, + { + "epoch": 1.5923265347565079, + "grad_norm": 0.02737621105346301, + "learning_rate": 0.00012129295261217843, + "loss": 0.483, + "step": 31365 + }, + { + "epoch": 1.5925803708545392, + "grad_norm": 0.026854830654727595, + "learning_rate": 0.0001211483477670014, + "loss": 0.457, + "step": 31370 + }, + { + "epoch": 1.5928342069525707, + "grad_norm": 0.02062145184869095, + "learning_rate": 0.0001210038172885145, + "loss": 0.4471, + "step": 31375 + }, + { + "epoch": 1.5930880430506023, + "grad_norm": 0.02532668196949466, + "learning_rate": 0.00012085936120508811, + "loss": 0.4475, + "step": 31380 + }, + { + "epoch": 1.5933418791486336, + "grad_norm": 0.020702904030837858, + "learning_rate": 0.00012071497954507843, + "loss": 0.4668, + "step": 31385 + }, + { + "epoch": 1.593595715246665, + "grad_norm": 0.031141316623061212, + "learning_rate": 0.00012057067233682667, + "loss": 0.4714, + "step": 31390 + }, + { + "epoch": 1.5938495513446966, + "grad_norm": 0.028963886369253525, + "learning_rate": 0.00012042643960865985, + "loss": 0.4677, + "step": 31395 + }, + { + "epoch": 1.5941033874427282, + "grad_norm": 0.02100153262498024, + "learning_rate": 0.00012028228138888986, + "loss": 0.4844, + "step": 31400 + }, + { + "epoch": 1.5943572235407597, + "grad_norm": 0.021293170094263958, + "learning_rate": 0.00012013819770581458, + "loss": 0.4344, + "step": 31405 + }, + { + "epoch": 1.5946110596387912, + "grad_norm": 0.022079676785804837, + "learning_rate": 0.00011999418858771649, + "loss": 0.472, + "step": 31410 + }, + { + "epoch": 1.5948648957368228, + "grad_norm": 0.020399139507724554, + "learning_rate": 0.00011985025406286432, + "loss": 0.4412, + "step": 31415 + }, + { + "epoch": 1.5951187318348543, + "grad_norm": 0.020737921370516353, + "learning_rate": 0.00011970639415951129, + "loss": 0.4782, + "step": 31420 + }, + { + "epoch": 1.5953725679328858, + "grad_norm": 0.021206624778793808, + "learning_rate": 0.00011956260890589655, + "loss": 0.459, + "step": 31425 + }, + { + "epoch": 1.5956264040309174, + "grad_norm": 0.0245333591767397, + "learning_rate": 0.00011941889833024461, + "loss": 0.4675, + "step": 31430 + }, + { + "epoch": 1.5958802401289487, + "grad_norm": 0.025273714859497986, + "learning_rate": 0.0001192752624607648, + "loss": 0.4753, + "step": 31435 + }, + { + "epoch": 1.5961340762269802, + "grad_norm": 0.02473632336996924, + "learning_rate": 0.00011913170132565248, + "loss": 0.4207, + "step": 31440 + }, + { + "epoch": 1.5963879123250118, + "grad_norm": 0.024196910552343232, + "learning_rate": 0.00011898821495308764, + "loss": 0.4838, + "step": 31445 + }, + { + "epoch": 1.5966417484230433, + "grad_norm": 0.024546317457647595, + "learning_rate": 0.00011884480337123621, + "loss": 0.4766, + "step": 31450 + }, + { + "epoch": 1.5968955845210746, + "grad_norm": 0.021580443853018175, + "learning_rate": 0.00011870146660824899, + "loss": 0.4618, + "step": 31455 + }, + { + "epoch": 1.5971494206191061, + "grad_norm": 0.021030168327500064, + "learning_rate": 0.00011855820469226242, + "loss": 0.4697, + "step": 31460 + }, + { + "epoch": 1.5974032567171377, + "grad_norm": 0.02542968053225913, + "learning_rate": 0.00011841501765139795, + "loss": 0.4583, + "step": 31465 + }, + { + "epoch": 1.5976570928151692, + "grad_norm": 0.0245457635447329, + "learning_rate": 0.00011827190551376265, + "loss": 0.469, + "step": 31470 + }, + { + "epoch": 1.5979109289132007, + "grad_norm": 0.022086900914002925, + "learning_rate": 0.00011812886830744846, + "loss": 0.404, + "step": 31475 + }, + { + "epoch": 1.5981647650112323, + "grad_norm": 0.024647282768808704, + "learning_rate": 0.00011798590606053322, + "loss": 0.4778, + "step": 31480 + }, + { + "epoch": 1.5984186011092638, + "grad_norm": 0.02520833814662268, + "learning_rate": 0.00011784301880107917, + "loss": 0.4682, + "step": 31485 + }, + { + "epoch": 1.5986724372072953, + "grad_norm": 0.030999706879264646, + "learning_rate": 0.00011770020655713509, + "loss": 0.4543, + "step": 31490 + }, + { + "epoch": 1.5989262733053269, + "grad_norm": 0.028404831004490545, + "learning_rate": 0.00011755746935673372, + "loss": 0.4685, + "step": 31495 + }, + { + "epoch": 1.5991801094033582, + "grad_norm": 0.02067687233055602, + "learning_rate": 0.00011741480722789405, + "loss": 0.4439, + "step": 31500 + }, + { + "epoch": 1.5994339455013897, + "grad_norm": 0.03005348926340442, + "learning_rate": 0.00011727222019861966, + "loss": 0.4475, + "step": 31505 + }, + { + "epoch": 1.5996877815994213, + "grad_norm": 0.025221995139169794, + "learning_rate": 0.0001171297082968999, + "loss": 0.4601, + "step": 31510 + }, + { + "epoch": 1.5999416176974528, + "grad_norm": 0.024617531922327757, + "learning_rate": 0.00011698727155070888, + "loss": 0.485, + "step": 31515 + }, + { + "epoch": 1.6001954537954841, + "grad_norm": 0.02522929219903767, + "learning_rate": 0.0001168449099880065, + "loss": 0.4684, + "step": 31520 + }, + { + "epoch": 1.6004492898935156, + "grad_norm": 0.02188538172758521, + "learning_rate": 0.0001167026236367374, + "loss": 0.4667, + "step": 31525 + }, + { + "epoch": 1.6007031259915472, + "grad_norm": 0.03015421394199565, + "learning_rate": 0.00011656041252483185, + "loss": 0.4609, + "step": 31530 + }, + { + "epoch": 1.6009569620895787, + "grad_norm": 0.02771728039572498, + "learning_rate": 0.00011641827668020504, + "loss": 0.4537, + "step": 31535 + }, + { + "epoch": 1.6012107981876103, + "grad_norm": 0.024448943269020997, + "learning_rate": 0.00011627621613075772, + "loss": 0.4735, + "step": 31540 + }, + { + "epoch": 1.6014646342856418, + "grad_norm": 0.024047651615317647, + "learning_rate": 0.00011613423090437536, + "loss": 0.4797, + "step": 31545 + }, + { + "epoch": 1.6017184703836733, + "grad_norm": 0.022908547869978178, + "learning_rate": 0.0001159923210289292, + "loss": 0.4419, + "step": 31550 + }, + { + "epoch": 1.6019723064817049, + "grad_norm": 0.027921713893488884, + "learning_rate": 0.00011585048653227548, + "loss": 0.4529, + "step": 31555 + }, + { + "epoch": 1.6022261425797364, + "grad_norm": 0.022455295711872506, + "learning_rate": 0.00011570872744225541, + "loss": 0.4689, + "step": 31560 + }, + { + "epoch": 1.602479978677768, + "grad_norm": 0.029028863638509785, + "learning_rate": 0.0001155670437866958, + "loss": 0.4693, + "step": 31565 + }, + { + "epoch": 1.6027338147757992, + "grad_norm": 0.026146904467255937, + "learning_rate": 0.00011542543559340817, + "loss": 0.4802, + "step": 31570 + }, + { + "epoch": 1.6029876508738308, + "grad_norm": 0.021361160108741662, + "learning_rate": 0.0001152839028901898, + "loss": 0.4735, + "step": 31575 + }, + { + "epoch": 1.6032414869718623, + "grad_norm": 0.022332608927233596, + "learning_rate": 0.00011514244570482263, + "loss": 0.4405, + "step": 31580 + }, + { + "epoch": 1.6034953230698936, + "grad_norm": 0.023861647792640446, + "learning_rate": 0.00011500106406507416, + "loss": 0.4384, + "step": 31585 + }, + { + "epoch": 1.6037491591679252, + "grad_norm": 0.03455619002140877, + "learning_rate": 0.00011485975799869675, + "loss": 0.4781, + "step": 31590 + }, + { + "epoch": 1.6040029952659567, + "grad_norm": 0.032908225955371004, + "learning_rate": 0.00011471852753342826, + "loss": 0.4617, + "step": 31595 + }, + { + "epoch": 1.6042568313639882, + "grad_norm": 0.0248333675995091, + "learning_rate": 0.00011457737269699125, + "loss": 0.4736, + "step": 31600 + }, + { + "epoch": 1.6045106674620198, + "grad_norm": 0.023654932219431005, + "learning_rate": 0.00011443629351709394, + "loss": 0.4893, + "step": 31605 + }, + { + "epoch": 1.6047645035600513, + "grad_norm": 0.019152490721215262, + "learning_rate": 0.00011429529002142941, + "loss": 0.4619, + "step": 31610 + }, + { + "epoch": 1.6050183396580828, + "grad_norm": 0.024857396949531627, + "learning_rate": 0.00011415436223767606, + "loss": 0.4452, + "step": 31615 + }, + { + "epoch": 1.6052721757561144, + "grad_norm": 0.02790240391915016, + "learning_rate": 0.00011401351019349704, + "loss": 0.4561, + "step": 31620 + }, + { + "epoch": 1.605526011854146, + "grad_norm": 0.024084282543615547, + "learning_rate": 0.00011387273391654118, + "loss": 0.4774, + "step": 31625 + }, + { + "epoch": 1.6057798479521774, + "grad_norm": 0.02920146578824623, + "learning_rate": 0.00011373203343444194, + "loss": 0.4789, + "step": 31630 + }, + { + "epoch": 1.6060336840502087, + "grad_norm": 0.029127523345097956, + "learning_rate": 0.00011359140877481833, + "loss": 0.464, + "step": 31635 + }, + { + "epoch": 1.6062875201482403, + "grad_norm": 0.022090138726677838, + "learning_rate": 0.00011345085996527405, + "loss": 0.4806, + "step": 31640 + }, + { + "epoch": 1.6065413562462718, + "grad_norm": 0.026141716192201573, + "learning_rate": 0.00011331038703339836, + "loss": 0.4589, + "step": 31645 + }, + { + "epoch": 1.6067951923443031, + "grad_norm": 0.02162959028748172, + "learning_rate": 0.00011316999000676514, + "loss": 0.4543, + "step": 31650 + }, + { + "epoch": 1.6070490284423347, + "grad_norm": 0.023063059924878233, + "learning_rate": 0.00011302966891293392, + "loss": 0.47, + "step": 31655 + }, + { + "epoch": 1.6073028645403662, + "grad_norm": 0.027292009363410075, + "learning_rate": 0.00011288942377944872, + "loss": 0.4909, + "step": 31660 + }, + { + "epoch": 1.6075567006383977, + "grad_norm": 0.04681074491676968, + "learning_rate": 0.00011274925463383912, + "loss": 0.4656, + "step": 31665 + }, + { + "epoch": 1.6078105367364293, + "grad_norm": 0.01979398122113408, + "learning_rate": 0.00011260916150361977, + "loss": 0.4488, + "step": 31670 + }, + { + "epoch": 1.6080643728344608, + "grad_norm": 0.020250869022994698, + "learning_rate": 0.00011246914441628992, + "loss": 0.4782, + "step": 31675 + }, + { + "epoch": 1.6083182089324923, + "grad_norm": 0.019872795940748027, + "learning_rate": 0.00011232920339933461, + "loss": 0.4777, + "step": 31680 + }, + { + "epoch": 1.6085720450305239, + "grad_norm": 0.02316643741156545, + "learning_rate": 0.00011218933848022317, + "loss": 0.493, + "step": 31685 + }, + { + "epoch": 1.6088258811285554, + "grad_norm": 0.02234673533736629, + "learning_rate": 0.00011204954968641074, + "loss": 0.4629, + "step": 31690 + }, + { + "epoch": 1.609079717226587, + "grad_norm": 0.022362390983495437, + "learning_rate": 0.00011190983704533685, + "loss": 0.4572, + "step": 31695 + }, + { + "epoch": 1.6093335533246182, + "grad_norm": 0.021690554415178736, + "learning_rate": 0.00011177020058442672, + "loss": 0.4685, + "step": 31700 + }, + { + "epoch": 1.6095873894226498, + "grad_norm": 0.02776519862681794, + "learning_rate": 0.00011163064033108994, + "loss": 0.4662, + "step": 31705 + }, + { + "epoch": 1.6098412255206813, + "grad_norm": 0.022607748514862483, + "learning_rate": 0.00011149115631272183, + "loss": 0.4853, + "step": 31710 + }, + { + "epoch": 1.6100950616187126, + "grad_norm": 0.041421241284070646, + "learning_rate": 0.00011135174855670205, + "loss": 0.458, + "step": 31715 + }, + { + "epoch": 1.6103488977167442, + "grad_norm": 0.022255670798694553, + "learning_rate": 0.00011121241709039604, + "loss": 0.4626, + "step": 31720 + }, + { + "epoch": 1.6106027338147757, + "grad_norm": 0.02393315473991221, + "learning_rate": 0.00011107316194115352, + "loss": 0.482, + "step": 31725 + }, + { + "epoch": 1.6108565699128072, + "grad_norm": 0.022345117574356848, + "learning_rate": 0.00011093398313630975, + "loss": 0.4787, + "step": 31730 + }, + { + "epoch": 1.6111104060108388, + "grad_norm": 0.021278319102440692, + "learning_rate": 0.00011079488070318477, + "loss": 0.4639, + "step": 31735 + }, + { + "epoch": 1.6113642421088703, + "grad_norm": 0.02130312138742558, + "learning_rate": 0.00011065585466908395, + "loss": 0.4675, + "step": 31740 + }, + { + "epoch": 1.6116180782069018, + "grad_norm": 0.020984294776622912, + "learning_rate": 0.00011051690506129702, + "loss": 0.4446, + "step": 31745 + }, + { + "epoch": 1.6118719143049334, + "grad_norm": 0.023292694738733775, + "learning_rate": 0.00011037803190709945, + "loss": 0.4559, + "step": 31750 + }, + { + "epoch": 1.612125750402965, + "grad_norm": 0.021942067180995492, + "learning_rate": 0.00011023923523375102, + "loss": 0.4573, + "step": 31755 + }, + { + "epoch": 1.6123795865009964, + "grad_norm": 0.02815465580690162, + "learning_rate": 0.00011010051506849711, + "loss": 0.4792, + "step": 31760 + }, + { + "epoch": 1.6126334225990278, + "grad_norm": 0.021597418410303274, + "learning_rate": 0.0001099618714385675, + "loss": 0.4643, + "step": 31765 + }, + { + "epoch": 1.6128872586970593, + "grad_norm": 0.02364408717712364, + "learning_rate": 0.0001098233043711776, + "loss": 0.4706, + "step": 31770 + }, + { + "epoch": 1.6131410947950908, + "grad_norm": 0.027433611435712636, + "learning_rate": 0.00010968481389352708, + "loss": 0.4131, + "step": 31775 + }, + { + "epoch": 1.6133949308931224, + "grad_norm": 0.02152935633696659, + "learning_rate": 0.00010954640003280125, + "loss": 0.44, + "step": 31780 + }, + { + "epoch": 1.6136487669911537, + "grad_norm": 0.026076111440461344, + "learning_rate": 0.00010940806281616977, + "loss": 0.4446, + "step": 31785 + }, + { + "epoch": 1.6139026030891852, + "grad_norm": 0.023558985051178333, + "learning_rate": 0.00010926980227078765, + "loss": 0.4795, + "step": 31790 + }, + { + "epoch": 1.6141564391872167, + "grad_norm": 0.025471211297723347, + "learning_rate": 0.00010913161842379493, + "loss": 0.442, + "step": 31795 + }, + { + "epoch": 1.6144102752852483, + "grad_norm": 0.020061384357153312, + "learning_rate": 0.00010899351130231611, + "loss": 0.4324, + "step": 31800 + }, + { + "epoch": 1.6146641113832798, + "grad_norm": 0.032744538736248614, + "learning_rate": 0.00010885548093346126, + "loss": 0.468, + "step": 31805 + }, + { + "epoch": 1.6149179474813113, + "grad_norm": 0.02071627920287155, + "learning_rate": 0.00010871752734432466, + "loss": 0.4439, + "step": 31810 + }, + { + "epoch": 1.6151717835793429, + "grad_norm": 0.031839601183315996, + "learning_rate": 0.00010857965056198633, + "loss": 0.4447, + "step": 31815 + }, + { + "epoch": 1.6154256196773744, + "grad_norm": 0.024140957539309883, + "learning_rate": 0.00010844185061351036, + "loss": 0.4768, + "step": 31820 + }, + { + "epoch": 1.615679455775406, + "grad_norm": 0.020597217507985967, + "learning_rate": 0.00010830412752594659, + "loss": 0.4638, + "step": 31825 + }, + { + "epoch": 1.6159332918734375, + "grad_norm": 0.02099597835732562, + "learning_rate": 0.00010816648132632912, + "loss": 0.4407, + "step": 31830 + }, + { + "epoch": 1.6161871279714688, + "grad_norm": 0.022374131730080137, + "learning_rate": 0.00010802891204167736, + "loss": 0.4437, + "step": 31835 + }, + { + "epoch": 1.6164409640695003, + "grad_norm": 0.022866381042039967, + "learning_rate": 0.0001078914196989953, + "loss": 0.4499, + "step": 31840 + }, + { + "epoch": 1.6166948001675319, + "grad_norm": 0.023954437468624835, + "learning_rate": 0.00010775400432527228, + "loss": 0.462, + "step": 31845 + }, + { + "epoch": 1.6169486362655632, + "grad_norm": 0.023454406216757356, + "learning_rate": 0.00010761666594748176, + "loss": 0.4746, + "step": 31850 + }, + { + "epoch": 1.6172024723635947, + "grad_norm": 0.023827431315221397, + "learning_rate": 0.00010747940459258321, + "loss": 0.4376, + "step": 31855 + }, + { + "epoch": 1.6174563084616262, + "grad_norm": 0.025096747648666724, + "learning_rate": 0.00010734222028751989, + "loss": 0.4879, + "step": 31860 + }, + { + "epoch": 1.6177101445596578, + "grad_norm": 0.021230019197699266, + "learning_rate": 0.00010720511305922065, + "loss": 0.4286, + "step": 31865 + }, + { + "epoch": 1.6179639806576893, + "grad_norm": 0.024734269364125625, + "learning_rate": 0.00010706808293459875, + "loss": 0.4792, + "step": 31870 + }, + { + "epoch": 1.6182178167557209, + "grad_norm": 0.02274730432603367, + "learning_rate": 0.00010693112994055277, + "loss": 0.4631, + "step": 31875 + }, + { + "epoch": 1.6184716528537524, + "grad_norm": 0.023273183427376207, + "learning_rate": 0.00010679425410396559, + "loss": 0.4611, + "step": 31880 + }, + { + "epoch": 1.618725488951784, + "grad_norm": 0.020775817214814158, + "learning_rate": 0.00010665745545170557, + "loss": 0.4473, + "step": 31885 + }, + { + "epoch": 1.6189793250498155, + "grad_norm": 0.02042396438999102, + "learning_rate": 0.00010652073401062529, + "loss": 0.4245, + "step": 31890 + }, + { + "epoch": 1.619233161147847, + "grad_norm": 0.02282973946779475, + "learning_rate": 0.00010638408980756281, + "loss": 0.4685, + "step": 31895 + }, + { + "epoch": 1.6194869972458783, + "grad_norm": 0.020201540964168187, + "learning_rate": 0.00010624752286934037, + "loss": 0.4285, + "step": 31900 + }, + { + "epoch": 1.6197408333439098, + "grad_norm": 0.02332813243361064, + "learning_rate": 0.00010611103322276571, + "loss": 0.47, + "step": 31905 + }, + { + "epoch": 1.6199946694419414, + "grad_norm": 0.0210065909715921, + "learning_rate": 0.00010597462089463078, + "loss": 0.4695, + "step": 31910 + }, + { + "epoch": 1.6202485055399727, + "grad_norm": 0.02246075990254701, + "learning_rate": 0.00010583828591171273, + "loss": 0.4382, + "step": 31915 + }, + { + "epoch": 1.6205023416380042, + "grad_norm": 0.0283987877904691, + "learning_rate": 0.00010570202830077363, + "loss": 0.4513, + "step": 31920 + }, + { + "epoch": 1.6207561777360358, + "grad_norm": 0.023181166904655805, + "learning_rate": 0.0001055658480885599, + "loss": 0.454, + "step": 31925 + }, + { + "epoch": 1.6210100138340673, + "grad_norm": 0.02263188362017243, + "learning_rate": 0.00010542974530180327, + "loss": 0.4693, + "step": 31930 + }, + { + "epoch": 1.6212638499320988, + "grad_norm": 0.020696164165394288, + "learning_rate": 0.00010529371996721976, + "loss": 0.4531, + "step": 31935 + }, + { + "epoch": 1.6215176860301304, + "grad_norm": 0.022616240854955654, + "learning_rate": 0.00010515777211151079, + "loss": 0.4457, + "step": 31940 + }, + { + "epoch": 1.621771522128162, + "grad_norm": 0.023467651643862778, + "learning_rate": 0.00010502190176136195, + "loss": 0.4472, + "step": 31945 + }, + { + "epoch": 1.6220253582261934, + "grad_norm": 0.024225187265112066, + "learning_rate": 0.00010488610894344414, + "loss": 0.4586, + "step": 31950 + }, + { + "epoch": 1.622279194324225, + "grad_norm": 0.02003294507179616, + "learning_rate": 0.00010475039368441258, + "loss": 0.4476, + "step": 31955 + }, + { + "epoch": 1.6225330304222565, + "grad_norm": 0.02746040189534053, + "learning_rate": 0.0001046147560109078, + "loss": 0.4355, + "step": 31960 + }, + { + "epoch": 1.6227868665202878, + "grad_norm": 0.023121259899165414, + "learning_rate": 0.00010447919594955452, + "loss": 0.4772, + "step": 31965 + }, + { + "epoch": 1.6230407026183193, + "grad_norm": 0.021015795475823218, + "learning_rate": 0.00010434371352696259, + "loss": 0.4599, + "step": 31970 + }, + { + "epoch": 1.6232945387163509, + "grad_norm": 0.020515866606887798, + "learning_rate": 0.00010420830876972653, + "loss": 0.4425, + "step": 31975 + }, + { + "epoch": 1.6235483748143822, + "grad_norm": 0.02337407247961481, + "learning_rate": 0.0001040729817044258, + "loss": 0.4713, + "step": 31980 + }, + { + "epoch": 1.6238022109124137, + "grad_norm": 0.02089505650167657, + "learning_rate": 0.00010393773235762416, + "loss": 0.4621, + "step": 31985 + }, + { + "epoch": 1.6240560470104453, + "grad_norm": 0.023288518470522646, + "learning_rate": 0.00010380256075587063, + "loss": 0.4926, + "step": 31990 + }, + { + "epoch": 1.6243098831084768, + "grad_norm": 0.020663246760806137, + "learning_rate": 0.00010366746692569845, + "loss": 0.4301, + "step": 31995 + }, + { + "epoch": 1.6245637192065083, + "grad_norm": 0.021804964895316997, + "learning_rate": 0.00010353245089362612, + "loss": 0.4608, + "step": 32000 + }, + { + "epoch": 1.6248175553045399, + "grad_norm": 0.02226540859695977, + "learning_rate": 0.00010339751268615639, + "loss": 0.4374, + "step": 32005 + }, + { + "epoch": 1.6250713914025714, + "grad_norm": 0.02024366952881605, + "learning_rate": 0.00010326265232977717, + "loss": 0.4543, + "step": 32010 + }, + { + "epoch": 1.625325227500603, + "grad_norm": 0.022002203748430427, + "learning_rate": 0.00010312786985096067, + "loss": 0.4619, + "step": 32015 + }, + { + "epoch": 1.6255790635986345, + "grad_norm": 0.020301191207565898, + "learning_rate": 0.00010299316527616426, + "loss": 0.4779, + "step": 32020 + }, + { + "epoch": 1.625832899696666, + "grad_norm": 0.021523853309415912, + "learning_rate": 0.00010285853863182948, + "loss": 0.4441, + "step": 32025 + }, + { + "epoch": 1.6260867357946973, + "grad_norm": 0.020392254627019237, + "learning_rate": 0.00010272398994438303, + "loss": 0.4482, + "step": 32030 + }, + { + "epoch": 1.6263405718927288, + "grad_norm": 0.028102974962649765, + "learning_rate": 0.00010258951924023625, + "loss": 0.4494, + "step": 32035 + }, + { + "epoch": 1.6265944079907604, + "grad_norm": 0.028653290930459173, + "learning_rate": 0.00010245512654578487, + "loss": 0.6634, + "step": 32040 + }, + { + "epoch": 1.626848244088792, + "grad_norm": 0.03496966873027593, + "learning_rate": 0.00010232081188740971, + "loss": 0.4563, + "step": 32045 + }, + { + "epoch": 1.6271020801868232, + "grad_norm": 0.0444008021284147, + "learning_rate": 0.0001021865752914758, + "loss": 0.4782, + "step": 32050 + }, + { + "epoch": 1.6273559162848548, + "grad_norm": 0.028458235576396235, + "learning_rate": 0.00010205241678433341, + "loss": 0.4633, + "step": 32055 + }, + { + "epoch": 1.6276097523828863, + "grad_norm": 0.033779463143654556, + "learning_rate": 0.00010191833639231695, + "loss": 0.473, + "step": 32060 + }, + { + "epoch": 1.6278635884809178, + "grad_norm": 0.02289380406880714, + "learning_rate": 0.00010178433414174593, + "loss": 0.4981, + "step": 32065 + }, + { + "epoch": 1.6281174245789494, + "grad_norm": 0.023008365122818897, + "learning_rate": 0.00010165041005892412, + "loss": 0.4632, + "step": 32070 + }, + { + "epoch": 1.628371260676981, + "grad_norm": 0.024659825812692805, + "learning_rate": 0.00010151656417014033, + "loss": 0.4615, + "step": 32075 + }, + { + "epoch": 1.6286250967750124, + "grad_norm": 0.023409476356552144, + "learning_rate": 0.00010138279650166765, + "loss": 0.5097, + "step": 32080 + }, + { + "epoch": 1.628878932873044, + "grad_norm": 0.021110774308924826, + "learning_rate": 0.00010124910707976426, + "loss": 0.4515, + "step": 32085 + }, + { + "epoch": 1.6291327689710755, + "grad_norm": 0.02965948111333057, + "learning_rate": 0.00010111549593067226, + "loss": 0.4821, + "step": 32090 + }, + { + "epoch": 1.629386605069107, + "grad_norm": 0.02391127207510598, + "learning_rate": 0.00010098196308061953, + "loss": 0.4448, + "step": 32095 + }, + { + "epoch": 1.6296404411671384, + "grad_norm": 0.02681195362674087, + "learning_rate": 0.00010084850855581734, + "loss": 0.433, + "step": 32100 + }, + { + "epoch": 1.6298942772651699, + "grad_norm": 0.021012222244902096, + "learning_rate": 0.00010071513238246255, + "loss": 0.4676, + "step": 32105 + }, + { + "epoch": 1.6301481133632014, + "grad_norm": 0.02751151777371785, + "learning_rate": 0.00010058183458673587, + "loss": 0.4638, + "step": 32110 + }, + { + "epoch": 1.6304019494612327, + "grad_norm": 0.02956936482420226, + "learning_rate": 0.0001004486151948033, + "loss": 0.473, + "step": 32115 + }, + { + "epoch": 1.6306557855592643, + "grad_norm": 0.034062325657136044, + "learning_rate": 0.00010031547423281501, + "loss": 0.4819, + "step": 32120 + }, + { + "epoch": 1.6309096216572958, + "grad_norm": 0.024178056660624028, + "learning_rate": 0.00010018241172690578, + "loss": 0.4669, + "step": 32125 + }, + { + "epoch": 1.6311634577553273, + "grad_norm": 0.021402418062271347, + "learning_rate": 0.00010004942770319536, + "loss": 0.4613, + "step": 32130 + }, + { + "epoch": 1.6314172938533589, + "grad_norm": 0.02076067212479062, + "learning_rate": 9.991652218778762e-05, + "loss": 0.4463, + "step": 32135 + }, + { + "epoch": 1.6316711299513904, + "grad_norm": 0.028173969272268517, + "learning_rate": 9.97836952067715e-05, + "loss": 0.4744, + "step": 32140 + }, + { + "epoch": 1.631924966049422, + "grad_norm": 0.022830320689664698, + "learning_rate": 9.965094678621994e-05, + "loss": 0.4921, + "step": 32145 + }, + { + "epoch": 1.6321788021474535, + "grad_norm": 0.022425906549558932, + "learning_rate": 9.951827695219107e-05, + "loss": 0.4392, + "step": 32150 + }, + { + "epoch": 1.632432638245485, + "grad_norm": 0.029061636424618946, + "learning_rate": 9.938568573072715e-05, + "loss": 0.4671, + "step": 32155 + }, + { + "epoch": 1.6326864743435165, + "grad_norm": 0.028166002445137558, + "learning_rate": 9.925317314785548e-05, + "loss": 0.4338, + "step": 32160 + }, + { + "epoch": 1.6329403104415479, + "grad_norm": 0.02300375901450975, + "learning_rate": 9.91207392295872e-05, + "loss": 0.4764, + "step": 32165 + }, + { + "epoch": 1.6331941465395794, + "grad_norm": 0.022728515736743903, + "learning_rate": 9.898838400191879e-05, + "loss": 0.4725, + "step": 32170 + }, + { + "epoch": 1.633447982637611, + "grad_norm": 0.02413443910577726, + "learning_rate": 9.885610749083063e-05, + "loss": 0.468, + "step": 32175 + }, + { + "epoch": 1.6337018187356422, + "grad_norm": 0.02197648542631456, + "learning_rate": 9.872390972228823e-05, + "loss": 0.4635, + "step": 32180 + }, + { + "epoch": 1.6339556548336738, + "grad_norm": 0.02750430405496458, + "learning_rate": 9.8591790722241e-05, + "loss": 0.4992, + "step": 32185 + }, + { + "epoch": 1.6342094909317053, + "grad_norm": 0.022925967015699954, + "learning_rate": 9.84597505166236e-05, + "loss": 0.4749, + "step": 32190 + }, + { + "epoch": 1.6344633270297368, + "grad_norm": 0.023864232586490966, + "learning_rate": 9.832778913135454e-05, + "loss": 0.4432, + "step": 32195 + }, + { + "epoch": 1.6347171631277684, + "grad_norm": 0.023429582390276676, + "learning_rate": 9.819590659233746e-05, + "loss": 0.4689, + "step": 32200 + }, + { + "epoch": 1.6349709992258, + "grad_norm": 0.023005759239922634, + "learning_rate": 9.806410292546003e-05, + "loss": 0.4647, + "step": 32205 + }, + { + "epoch": 1.6352248353238314, + "grad_norm": 0.033322292665807295, + "learning_rate": 9.793237815659473e-05, + "loss": 0.4787, + "step": 32210 + }, + { + "epoch": 1.635478671421863, + "grad_norm": 0.02676499418010649, + "learning_rate": 9.780073231159864e-05, + "loss": 0.4879, + "step": 32215 + }, + { + "epoch": 1.6357325075198945, + "grad_norm": 0.023068752325228373, + "learning_rate": 9.766916541631288e-05, + "loss": 0.4579, + "step": 32220 + }, + { + "epoch": 1.635986343617926, + "grad_norm": 0.02443617843704049, + "learning_rate": 9.753767749656361e-05, + "loss": 0.4236, + "step": 32225 + }, + { + "epoch": 1.6362401797159574, + "grad_norm": 0.022098127052591947, + "learning_rate": 9.740626857816109e-05, + "loss": 0.4596, + "step": 32230 + }, + { + "epoch": 1.636494015813989, + "grad_norm": 0.025000807984976732, + "learning_rate": 9.727493868690046e-05, + "loss": 0.4547, + "step": 32235 + }, + { + "epoch": 1.6367478519120204, + "grad_norm": 0.02155154892913524, + "learning_rate": 9.714368784856081e-05, + "loss": 0.4733, + "step": 32240 + }, + { + "epoch": 1.6370016880100517, + "grad_norm": 0.02097638201730721, + "learning_rate": 9.701251608890638e-05, + "loss": 0.4939, + "step": 32245 + }, + { + "epoch": 1.6372555241080833, + "grad_norm": 0.024844539480766835, + "learning_rate": 9.688142343368517e-05, + "loss": 0.4623, + "step": 32250 + }, + { + "epoch": 1.6375093602061148, + "grad_norm": 0.022892093225189144, + "learning_rate": 9.675040990863032e-05, + "loss": 0.4643, + "step": 32255 + }, + { + "epoch": 1.6377631963041464, + "grad_norm": 0.021736869039684987, + "learning_rate": 9.661947553945893e-05, + "loss": 0.4592, + "step": 32260 + }, + { + "epoch": 1.6380170324021779, + "grad_norm": 0.030318571487825602, + "learning_rate": 9.648862035187289e-05, + "loss": 0.4798, + "step": 32265 + }, + { + "epoch": 1.6382708685002094, + "grad_norm": 0.023079937986626196, + "learning_rate": 9.635784437155815e-05, + "loss": 0.4786, + "step": 32270 + }, + { + "epoch": 1.638524704598241, + "grad_norm": 0.020531984789138997, + "learning_rate": 9.622714762418588e-05, + "loss": 0.4466, + "step": 32275 + }, + { + "epoch": 1.6387785406962725, + "grad_norm": 0.02520449611053328, + "learning_rate": 9.609653013541076e-05, + "loss": 0.4799, + "step": 32280 + }, + { + "epoch": 1.639032376794304, + "grad_norm": 0.02276790959983237, + "learning_rate": 9.596599193087263e-05, + "loss": 0.4371, + "step": 32285 + }, + { + "epoch": 1.6392862128923356, + "grad_norm": 0.023814420038553278, + "learning_rate": 9.583553303619524e-05, + "loss": 0.4874, + "step": 32290 + }, + { + "epoch": 1.6395400489903669, + "grad_norm": 0.032769714816805816, + "learning_rate": 9.570515347698727e-05, + "loss": 0.4549, + "step": 32295 + }, + { + "epoch": 1.6397938850883984, + "grad_norm": 0.02497036630007797, + "learning_rate": 9.557485327884136e-05, + "loss": 0.4573, + "step": 32300 + }, + { + "epoch": 1.64004772118643, + "grad_norm": 0.020083465356179893, + "learning_rate": 9.544463246733503e-05, + "loss": 0.4504, + "step": 32305 + }, + { + "epoch": 1.6403015572844615, + "grad_norm": 0.022338564551145575, + "learning_rate": 9.531449106802964e-05, + "loss": 0.4702, + "step": 32310 + }, + { + "epoch": 1.6405553933824928, + "grad_norm": 0.021589374085527493, + "learning_rate": 9.518442910647168e-05, + "loss": 0.4527, + "step": 32315 + }, + { + "epoch": 1.6408092294805243, + "grad_norm": 0.02577068006041862, + "learning_rate": 9.50544466081913e-05, + "loss": 0.4341, + "step": 32320 + }, + { + "epoch": 1.6410630655785559, + "grad_norm": 0.024543726750310888, + "learning_rate": 9.492454359870379e-05, + "loss": 0.4649, + "step": 32325 + }, + { + "epoch": 1.6413169016765874, + "grad_norm": 0.029648235764047823, + "learning_rate": 9.479472010350803e-05, + "loss": 0.4673, + "step": 32330 + }, + { + "epoch": 1.641570737774619, + "grad_norm": 0.021346922529075836, + "learning_rate": 9.466497614808806e-05, + "loss": 0.4419, + "step": 32335 + }, + { + "epoch": 1.6418245738726505, + "grad_norm": 0.02170707567899142, + "learning_rate": 9.453531175791191e-05, + "loss": 0.4657, + "step": 32340 + }, + { + "epoch": 1.642078409970682, + "grad_norm": 0.025382331522061787, + "learning_rate": 9.440572695843192e-05, + "loss": 0.4543, + "step": 32345 + }, + { + "epoch": 1.6423322460687135, + "grad_norm": 0.01955537085234093, + "learning_rate": 9.427622177508521e-05, + "loss": 0.4287, + "step": 32350 + }, + { + "epoch": 1.642586082166745, + "grad_norm": 0.021772913466288676, + "learning_rate": 9.414679623329264e-05, + "loss": 0.46, + "step": 32355 + }, + { + "epoch": 1.6428399182647766, + "grad_norm": 0.02173314708203356, + "learning_rate": 9.40174503584601e-05, + "loss": 0.4652, + "step": 32360 + }, + { + "epoch": 1.643093754362808, + "grad_norm": 0.02028675485464349, + "learning_rate": 9.388818417597733e-05, + "loss": 0.4648, + "step": 32365 + }, + { + "epoch": 1.6433475904608394, + "grad_norm": 0.024655003878005108, + "learning_rate": 9.375899771121888e-05, + "loss": 0.4126, + "step": 32370 + }, + { + "epoch": 1.643601426558871, + "grad_norm": 0.022638057624033838, + "learning_rate": 9.362989098954306e-05, + "loss": 0.4586, + "step": 32375 + }, + { + "epoch": 1.6438552626569023, + "grad_norm": 0.020410137630603085, + "learning_rate": 9.350086403629326e-05, + "loss": 0.4371, + "step": 32380 + }, + { + "epoch": 1.6441090987549338, + "grad_norm": 0.02664816125540432, + "learning_rate": 9.337191687679648e-05, + "loss": 0.4442, + "step": 32385 + }, + { + "epoch": 1.6443629348529654, + "grad_norm": 0.0234300596390381, + "learning_rate": 9.324304953636458e-05, + "loss": 0.464, + "step": 32390 + }, + { + "epoch": 1.644616770950997, + "grad_norm": 0.022576297788024068, + "learning_rate": 9.311426204029355e-05, + "loss": 0.4434, + "step": 32395 + }, + { + "epoch": 1.6448706070490284, + "grad_norm": 0.022495995573104423, + "learning_rate": 9.298555441386392e-05, + "loss": 0.495, + "step": 32400 + }, + { + "epoch": 1.64512444314706, + "grad_norm": 0.020474348549872002, + "learning_rate": 9.285692668233997e-05, + "loss": 0.4833, + "step": 32405 + }, + { + "epoch": 1.6453782792450915, + "grad_norm": 0.024333096423502235, + "learning_rate": 9.272837887097108e-05, + "loss": 0.4754, + "step": 32410 + }, + { + "epoch": 1.645632115343123, + "grad_norm": 0.020618548983006132, + "learning_rate": 9.259991100499021e-05, + "loss": 0.4721, + "step": 32415 + }, + { + "epoch": 1.6458859514411546, + "grad_norm": 0.020652302792662478, + "learning_rate": 9.247152310961527e-05, + "loss": 0.4418, + "step": 32420 + }, + { + "epoch": 1.646139787539186, + "grad_norm": 0.02297733570270565, + "learning_rate": 9.234321521004786e-05, + "loss": 0.4864, + "step": 32425 + }, + { + "epoch": 1.6463936236372174, + "grad_norm": 0.019973132791394135, + "learning_rate": 9.221498733147443e-05, + "loss": 0.455, + "step": 32430 + }, + { + "epoch": 1.646647459735249, + "grad_norm": 0.0213492601346557, + "learning_rate": 9.208683949906526e-05, + "loss": 0.4294, + "step": 32435 + }, + { + "epoch": 1.6469012958332805, + "grad_norm": 0.020742705287425788, + "learning_rate": 9.195877173797534e-05, + "loss": 0.4499, + "step": 32440 + }, + { + "epoch": 1.6471551319313118, + "grad_norm": 0.01942087181042954, + "learning_rate": 9.18307840733435e-05, + "loss": 0.456, + "step": 32445 + }, + { + "epoch": 1.6474089680293433, + "grad_norm": 0.031308669135702176, + "learning_rate": 9.170287653029325e-05, + "loss": 0.4647, + "step": 32450 + }, + { + "epoch": 1.6476628041273749, + "grad_norm": 0.02130109350045376, + "learning_rate": 9.157504913393228e-05, + "loss": 0.4667, + "step": 32455 + }, + { + "epoch": 1.6479166402254064, + "grad_norm": 0.027077864769477565, + "learning_rate": 9.14473019093522e-05, + "loss": 0.4551, + "step": 32460 + }, + { + "epoch": 1.648170476323438, + "grad_norm": 0.023508906357244588, + "learning_rate": 9.131963488162942e-05, + "loss": 0.4568, + "step": 32465 + }, + { + "epoch": 1.6484243124214695, + "grad_norm": 0.021416094495555104, + "learning_rate": 9.119204807582415e-05, + "loss": 0.4501, + "step": 32470 + }, + { + "epoch": 1.648678148519501, + "grad_norm": 0.026119351851691987, + "learning_rate": 9.106454151698118e-05, + "loss": 0.4582, + "step": 32475 + }, + { + "epoch": 1.6489319846175325, + "grad_norm": 0.0208804764024078, + "learning_rate": 9.093711523012933e-05, + "loss": 0.4448, + "step": 32480 + }, + { + "epoch": 1.649185820715564, + "grad_norm": 0.024577116363089128, + "learning_rate": 9.080976924028177e-05, + "loss": 0.4357, + "step": 32485 + }, + { + "epoch": 1.6494396568135956, + "grad_norm": 0.022644134972793314, + "learning_rate": 9.068250357243585e-05, + "loss": 0.459, + "step": 32490 + }, + { + "epoch": 1.649693492911627, + "grad_norm": 0.024600197563246825, + "learning_rate": 9.055531825157332e-05, + "loss": 0.4454, + "step": 32495 + }, + { + "epoch": 1.6499473290096585, + "grad_norm": 0.021916792555752632, + "learning_rate": 9.042821330265976e-05, + "loss": 0.4652, + "step": 32500 + }, + { + "epoch": 1.65020116510769, + "grad_norm": 0.022504308117136322, + "learning_rate": 9.030118875064553e-05, + "loss": 0.464, + "step": 32505 + }, + { + "epoch": 1.6504550012057213, + "grad_norm": 0.023410283807235067, + "learning_rate": 9.017424462046453e-05, + "loss": 0.4625, + "step": 32510 + }, + { + "epoch": 1.6507088373037528, + "grad_norm": 0.023827231525166772, + "learning_rate": 9.00473809370358e-05, + "loss": 0.4476, + "step": 32515 + }, + { + "epoch": 1.6509626734017844, + "grad_norm": 0.024423379173870094, + "learning_rate": 8.992059772526163e-05, + "loss": 0.4765, + "step": 32520 + }, + { + "epoch": 1.651216509499816, + "grad_norm": 0.022352780343270956, + "learning_rate": 8.979389501002916e-05, + "loss": 0.4578, + "step": 32525 + }, + { + "epoch": 1.6514703455978474, + "grad_norm": 0.021988070837923635, + "learning_rate": 8.966727281620929e-05, + "loss": 0.4696, + "step": 32530 + }, + { + "epoch": 1.651724181695879, + "grad_norm": 0.020573849908693122, + "learning_rate": 8.954073116865757e-05, + "loss": 0.4873, + "step": 32535 + }, + { + "epoch": 1.6519780177939105, + "grad_norm": 0.023883137882499617, + "learning_rate": 8.941427009221325e-05, + "loss": 0.4686, + "step": 32540 + }, + { + "epoch": 1.652231853891942, + "grad_norm": 0.027468298536276997, + "learning_rate": 8.928788961170025e-05, + "loss": 0.4925, + "step": 32545 + }, + { + "epoch": 1.6524856899899736, + "grad_norm": 0.027088296741393195, + "learning_rate": 8.916158975192618e-05, + "loss": 0.4534, + "step": 32550 + }, + { + "epoch": 1.6527395260880051, + "grad_norm": 0.02137783275444411, + "learning_rate": 8.903537053768329e-05, + "loss": 0.4541, + "step": 32555 + }, + { + "epoch": 1.6529933621860364, + "grad_norm": 0.02449402913442447, + "learning_rate": 8.890923199374756e-05, + "loss": 0.4558, + "step": 32560 + }, + { + "epoch": 1.653247198284068, + "grad_norm": 0.022228205017642978, + "learning_rate": 8.878317414487964e-05, + "loss": 0.4577, + "step": 32565 + }, + { + "epoch": 1.6535010343820995, + "grad_norm": 0.023535911694491006, + "learning_rate": 8.865719701582376e-05, + "loss": 0.4531, + "step": 32570 + }, + { + "epoch": 1.653754870480131, + "grad_norm": 0.021119661452584808, + "learning_rate": 8.85313006313087e-05, + "loss": 0.4574, + "step": 32575 + }, + { + "epoch": 1.6540087065781623, + "grad_norm": 0.022839968865042536, + "learning_rate": 8.84054850160475e-05, + "loss": 0.4713, + "step": 32580 + }, + { + "epoch": 1.6542625426761939, + "grad_norm": 0.022416424195115817, + "learning_rate": 8.827975019473688e-05, + "loss": 0.4826, + "step": 32585 + }, + { + "epoch": 1.6545163787742254, + "grad_norm": 0.021476787913188244, + "learning_rate": 8.815409619205811e-05, + "loss": 0.4419, + "step": 32590 + }, + { + "epoch": 1.654770214872257, + "grad_norm": 0.028305345188139118, + "learning_rate": 8.802852303267634e-05, + "loss": 0.465, + "step": 32595 + }, + { + "epoch": 1.6550240509702885, + "grad_norm": 0.04040707057128688, + "learning_rate": 8.790303074124106e-05, + "loss": 0.5121, + "step": 32600 + }, + { + "epoch": 1.65527788706832, + "grad_norm": 0.04087985900665181, + "learning_rate": 8.77776193423856e-05, + "loss": 0.4659, + "step": 32605 + }, + { + "epoch": 1.6555317231663516, + "grad_norm": 0.028685293599005325, + "learning_rate": 8.765228886072785e-05, + "loss": 0.4659, + "step": 32610 + }, + { + "epoch": 1.655785559264383, + "grad_norm": 0.022967724296628123, + "learning_rate": 8.75270393208693e-05, + "loss": 0.4558, + "step": 32615 + }, + { + "epoch": 1.6560393953624146, + "grad_norm": 0.02385629905389606, + "learning_rate": 8.740187074739609e-05, + "loss": 0.4281, + "step": 32620 + }, + { + "epoch": 1.656293231460446, + "grad_norm": 0.024276473538365772, + "learning_rate": 8.727678316487786e-05, + "loss": 0.4542, + "step": 32625 + }, + { + "epoch": 1.6565470675584775, + "grad_norm": 0.02582996339953851, + "learning_rate": 8.7151776597869e-05, + "loss": 0.4921, + "step": 32630 + }, + { + "epoch": 1.656800903656509, + "grad_norm": 0.019009514698150008, + "learning_rate": 8.702685107090725e-05, + "loss": 0.4833, + "step": 32635 + }, + { + "epoch": 1.6570547397545405, + "grad_norm": 0.02262802830227219, + "learning_rate": 8.690200660851539e-05, + "loss": 0.4611, + "step": 32640 + }, + { + "epoch": 1.6573085758525719, + "grad_norm": 0.022327220330587837, + "learning_rate": 8.677724323519937e-05, + "loss": 0.4822, + "step": 32645 + }, + { + "epoch": 1.6575624119506034, + "grad_norm": 0.026247061121182514, + "learning_rate": 8.665256097544994e-05, + "loss": 0.483, + "step": 32650 + }, + { + "epoch": 1.657816248048635, + "grad_norm": 0.021890433127245045, + "learning_rate": 8.65279598537413e-05, + "loss": 0.4744, + "step": 32655 + }, + { + "epoch": 1.6580700841466665, + "grad_norm": 0.023375317295645848, + "learning_rate": 8.640343989453225e-05, + "loss": 0.4488, + "step": 32660 + }, + { + "epoch": 1.658323920244698, + "grad_norm": 0.025314102237468532, + "learning_rate": 8.627900112226522e-05, + "loss": 0.4704, + "step": 32665 + }, + { + "epoch": 1.6585777563427295, + "grad_norm": 0.020441167533399623, + "learning_rate": 8.61546435613672e-05, + "loss": 0.4608, + "step": 32670 + }, + { + "epoch": 1.658831592440761, + "grad_norm": 0.022593790331337692, + "learning_rate": 8.603036723624868e-05, + "loss": 0.4543, + "step": 32675 + }, + { + "epoch": 1.6590854285387926, + "grad_norm": 0.023141528922392886, + "learning_rate": 8.590617217130469e-05, + "loss": 0.4867, + "step": 32680 + }, + { + "epoch": 1.6593392646368241, + "grad_norm": 0.019506697724557195, + "learning_rate": 8.578205839091397e-05, + "loss": 0.4562, + "step": 32685 + }, + { + "epoch": 1.6595931007348557, + "grad_norm": 0.021711834234721967, + "learning_rate": 8.565802591943955e-05, + "loss": 0.4485, + "step": 32690 + }, + { + "epoch": 1.659846936832887, + "grad_norm": 0.021652695537899707, + "learning_rate": 8.55340747812282e-05, + "loss": 0.4609, + "step": 32695 + }, + { + "epoch": 1.6601007729309185, + "grad_norm": 0.021747456684810037, + "learning_rate": 8.541020500061109e-05, + "loss": 0.4705, + "step": 32700 + }, + { + "epoch": 1.66035460902895, + "grad_norm": 0.022086611000210216, + "learning_rate": 8.528641660190323e-05, + "loss": 0.478, + "step": 32705 + }, + { + "epoch": 1.6606084451269814, + "grad_norm": 0.022136451230823195, + "learning_rate": 8.516270960940353e-05, + "loss": 0.4541, + "step": 32710 + }, + { + "epoch": 1.660862281225013, + "grad_norm": 0.025866562335245584, + "learning_rate": 8.50390840473953e-05, + "loss": 0.4446, + "step": 32715 + }, + { + "epoch": 1.6611161173230444, + "grad_norm": 0.0228248045581071, + "learning_rate": 8.491553994014528e-05, + "loss": 0.4186, + "step": 32720 + }, + { + "epoch": 1.661369953421076, + "grad_norm": 0.021908437534819313, + "learning_rate": 8.479207731190491e-05, + "loss": 0.444, + "step": 32725 + }, + { + "epoch": 1.6616237895191075, + "grad_norm": 0.022820980730176514, + "learning_rate": 8.466869618690898e-05, + "loss": 0.4496, + "step": 32730 + }, + { + "epoch": 1.661877625617139, + "grad_norm": 0.02083612493092523, + "learning_rate": 8.454539658937688e-05, + "loss": 0.4318, + "step": 32735 + }, + { + "epoch": 1.6621314617151706, + "grad_norm": 0.02223274504726005, + "learning_rate": 8.442217854351142e-05, + "loss": 0.4465, + "step": 32740 + }, + { + "epoch": 1.662385297813202, + "grad_norm": 0.027608070343403793, + "learning_rate": 8.429904207349997e-05, + "loss": 0.4895, + "step": 32745 + }, + { + "epoch": 1.6626391339112336, + "grad_norm": 0.023017127806108626, + "learning_rate": 8.417598720351333e-05, + "loss": 0.4566, + "step": 32750 + }, + { + "epoch": 1.6628929700092652, + "grad_norm": 0.024694217741001077, + "learning_rate": 8.40530139577067e-05, + "loss": 0.4202, + "step": 32755 + }, + { + "epoch": 1.6631468061072965, + "grad_norm": 0.021747446327509346, + "learning_rate": 8.393012236021908e-05, + "loss": 0.442, + "step": 32760 + }, + { + "epoch": 1.663400642205328, + "grad_norm": 0.02285407342505394, + "learning_rate": 8.380731243517365e-05, + "loss": 0.4849, + "step": 32765 + }, + { + "epoch": 1.6636544783033596, + "grad_norm": 0.01880950444315991, + "learning_rate": 8.368458420667707e-05, + "loss": 0.4462, + "step": 32770 + }, + { + "epoch": 1.6639083144013909, + "grad_norm": 0.02321735100250034, + "learning_rate": 8.356193769882064e-05, + "loss": 0.4659, + "step": 32775 + }, + { + "epoch": 1.6641621504994224, + "grad_norm": 0.021688726089099976, + "learning_rate": 8.343937293567888e-05, + "loss": 0.4518, + "step": 32780 + }, + { + "epoch": 1.664415986597454, + "grad_norm": 0.022255121531889792, + "learning_rate": 8.331688994131098e-05, + "loss": 0.4709, + "step": 32785 + }, + { + "epoch": 1.6646698226954855, + "grad_norm": 0.0234424879295081, + "learning_rate": 8.319448873975948e-05, + "loss": 0.4485, + "step": 32790 + }, + { + "epoch": 1.664923658793517, + "grad_norm": 0.01971346035102544, + "learning_rate": 8.307216935505135e-05, + "loss": 0.4471, + "step": 32795 + }, + { + "epoch": 1.6651774948915485, + "grad_norm": 0.027744043981377947, + "learning_rate": 8.294993181119703e-05, + "loss": 0.4878, + "step": 32800 + }, + { + "epoch": 1.66543133098958, + "grad_norm": 0.02538544426279172, + "learning_rate": 8.282777613219139e-05, + "loss": 0.4614, + "step": 32805 + }, + { + "epoch": 1.6656851670876116, + "grad_norm": 0.024257509182234863, + "learning_rate": 8.270570234201274e-05, + "loss": 0.456, + "step": 32810 + }, + { + "epoch": 1.6659390031856431, + "grad_norm": 0.022358059693099067, + "learning_rate": 8.25837104646237e-05, + "loss": 0.4957, + "step": 32815 + }, + { + "epoch": 1.6661928392836747, + "grad_norm": 0.021859775564087938, + "learning_rate": 8.246180052397078e-05, + "loss": 0.4784, + "step": 32820 + }, + { + "epoch": 1.666446675381706, + "grad_norm": 0.020567147747974097, + "learning_rate": 8.233997254398401e-05, + "loss": 0.4608, + "step": 32825 + }, + { + "epoch": 1.6667005114797375, + "grad_norm": 0.023002497460944293, + "learning_rate": 8.221822654857786e-05, + "loss": 0.441, + "step": 32830 + }, + { + "epoch": 1.666954347577769, + "grad_norm": 0.024727893994143192, + "learning_rate": 8.209656256165027e-05, + "loss": 0.4698, + "step": 32835 + }, + { + "epoch": 1.6672081836758004, + "grad_norm": 0.020590000782565322, + "learning_rate": 8.197498060708347e-05, + "loss": 0.4686, + "step": 32840 + }, + { + "epoch": 1.667462019773832, + "grad_norm": 0.026579843381741665, + "learning_rate": 8.185348070874316e-05, + "loss": 0.4612, + "step": 32845 + }, + { + "epoch": 1.6677158558718634, + "grad_norm": 0.01927314772150972, + "learning_rate": 8.173206289047947e-05, + "loss": 0.4627, + "step": 32850 + }, + { + "epoch": 1.667969691969895, + "grad_norm": 0.028426534307149153, + "learning_rate": 8.161072717612578e-05, + "loss": 0.4565, + "step": 32855 + }, + { + "epoch": 1.6682235280679265, + "grad_norm": 0.028414912943784405, + "learning_rate": 8.148947358949992e-05, + "loss": 0.4608, + "step": 32860 + }, + { + "epoch": 1.668477364165958, + "grad_norm": 0.02128169255167679, + "learning_rate": 8.136830215440322e-05, + "loss": 0.4669, + "step": 32865 + }, + { + "epoch": 1.6687312002639896, + "grad_norm": 0.020873791873976224, + "learning_rate": 8.124721289462122e-05, + "loss": 0.4489, + "step": 32870 + }, + { + "epoch": 1.6689850363620211, + "grad_norm": 0.03135989480369304, + "learning_rate": 8.112620583392272e-05, + "loss": 0.4529, + "step": 32875 + }, + { + "epoch": 1.6692388724600526, + "grad_norm": 0.02000361275840586, + "learning_rate": 8.100528099606135e-05, + "loss": 0.4683, + "step": 32880 + }, + { + "epoch": 1.6694927085580842, + "grad_norm": 0.03133371354490263, + "learning_rate": 8.088443840477371e-05, + "loss": 0.4594, + "step": 32885 + }, + { + "epoch": 1.6697465446561155, + "grad_norm": 0.020707782157897564, + "learning_rate": 8.076367808378083e-05, + "loss": 0.4547, + "step": 32890 + }, + { + "epoch": 1.670000380754147, + "grad_norm": 0.022147898036397162, + "learning_rate": 8.064300005678705e-05, + "loss": 0.4866, + "step": 32895 + }, + { + "epoch": 1.6702542168521786, + "grad_norm": 0.02198563076624078, + "learning_rate": 8.052240434748114e-05, + "loss": 0.4789, + "step": 32900 + }, + { + "epoch": 1.67050805295021, + "grad_norm": 0.39720820061969236, + "learning_rate": 8.04018909795352e-05, + "loss": 0.438, + "step": 32905 + }, + { + "epoch": 1.6707618890482414, + "grad_norm": 0.022207069306431813, + "learning_rate": 8.028145997660569e-05, + "loss": 0.4757, + "step": 32910 + }, + { + "epoch": 1.671015725146273, + "grad_norm": 0.024892721760228, + "learning_rate": 8.016111136233229e-05, + "loss": 0.4694, + "step": 32915 + }, + { + "epoch": 1.6712695612443045, + "grad_norm": 0.026240771643922845, + "learning_rate": 8.00408451603391e-05, + "loss": 0.483, + "step": 32920 + }, + { + "epoch": 1.671523397342336, + "grad_norm": 0.027364163639244488, + "learning_rate": 7.992066139423359e-05, + "loss": 0.4495, + "step": 32925 + }, + { + "epoch": 1.6717772334403675, + "grad_norm": 0.022658845760252785, + "learning_rate": 7.980056008760744e-05, + "loss": 0.4604, + "step": 32930 + }, + { + "epoch": 1.672031069538399, + "grad_norm": 0.020613403322029124, + "learning_rate": 7.968054126403568e-05, + "loss": 0.4436, + "step": 32935 + }, + { + "epoch": 1.6722849056364306, + "grad_norm": 0.020569339169032044, + "learning_rate": 7.956060494707757e-05, + "loss": 0.4611, + "step": 32940 + }, + { + "epoch": 1.6725387417344622, + "grad_norm": 0.02209932289337912, + "learning_rate": 7.944075116027604e-05, + "loss": 0.4665, + "step": 32945 + }, + { + "epoch": 1.6727925778324937, + "grad_norm": 0.022726524271639435, + "learning_rate": 7.93209799271577e-05, + "loss": 0.4564, + "step": 32950 + }, + { + "epoch": 1.6730464139305252, + "grad_norm": 0.021952171929792236, + "learning_rate": 7.920129127123316e-05, + "loss": 0.4485, + "step": 32955 + }, + { + "epoch": 1.6733002500285565, + "grad_norm": 0.02051579303808435, + "learning_rate": 7.908168521599646e-05, + "loss": 0.4584, + "step": 32960 + }, + { + "epoch": 1.673554086126588, + "grad_norm": 0.02294166980769807, + "learning_rate": 7.896216178492599e-05, + "loss": 0.4563, + "step": 32965 + }, + { + "epoch": 1.6738079222246196, + "grad_norm": 0.02490011470275378, + "learning_rate": 7.884272100148332e-05, + "loss": 0.486, + "step": 32970 + }, + { + "epoch": 1.674061758322651, + "grad_norm": 0.02500987816795483, + "learning_rate": 7.872336288911436e-05, + "loss": 0.4521, + "step": 32975 + }, + { + "epoch": 1.6743155944206825, + "grad_norm": 0.025048858782017916, + "learning_rate": 7.86040874712482e-05, + "loss": 0.4468, + "step": 32980 + }, + { + "epoch": 1.674569430518714, + "grad_norm": 0.028012775399430874, + "learning_rate": 7.848489477129828e-05, + "loss": 0.4734, + "step": 32985 + }, + { + "epoch": 1.6748232666167455, + "grad_norm": 0.02640398235527484, + "learning_rate": 7.836578481266132e-05, + "loss": 0.4504, + "step": 32990 + }, + { + "epoch": 1.675077102714777, + "grad_norm": 0.023020229066731457, + "learning_rate": 7.824675761871814e-05, + "loss": 0.4716, + "step": 32995 + }, + { + "epoch": 1.6753309388128086, + "grad_norm": 0.02729357114542021, + "learning_rate": 7.812781321283319e-05, + "loss": 0.4414, + "step": 33000 + }, + { + "epoch": 1.6755847749108401, + "grad_norm": 0.02331912928699373, + "learning_rate": 7.800895161835469e-05, + "loss": 0.5048, + "step": 33005 + }, + { + "epoch": 1.6758386110088717, + "grad_norm": 0.020302890548499142, + "learning_rate": 7.789017285861439e-05, + "loss": 0.4686, + "step": 33010 + }, + { + "epoch": 1.6760924471069032, + "grad_norm": 0.030570223530975774, + "learning_rate": 7.777147695692827e-05, + "loss": 0.4768, + "step": 33015 + }, + { + "epoch": 1.6763462832049347, + "grad_norm": 0.021681113676065226, + "learning_rate": 7.765286393659543e-05, + "loss": 0.4306, + "step": 33020 + }, + { + "epoch": 1.676600119302966, + "grad_norm": 0.02236935979091485, + "learning_rate": 7.75343338208993e-05, + "loss": 0.4734, + "step": 33025 + }, + { + "epoch": 1.6768539554009976, + "grad_norm": 0.025693461163144155, + "learning_rate": 7.741588663310644e-05, + "loss": 0.4494, + "step": 33030 + }, + { + "epoch": 1.677107791499029, + "grad_norm": 0.02791486982571019, + "learning_rate": 7.729752239646776e-05, + "loss": 0.4523, + "step": 33035 + }, + { + "epoch": 1.6773616275970604, + "grad_norm": 0.02226158895252821, + "learning_rate": 7.717924113421732e-05, + "loss": 0.4467, + "step": 33040 + }, + { + "epoch": 1.677615463695092, + "grad_norm": 0.02192720169085694, + "learning_rate": 7.706104286957333e-05, + "loss": 0.4695, + "step": 33045 + }, + { + "epoch": 1.6778692997931235, + "grad_norm": 0.022241780707175234, + "learning_rate": 7.694292762573729e-05, + "loss": 0.4432, + "step": 33050 + }, + { + "epoch": 1.678123135891155, + "grad_norm": 0.025844348165324875, + "learning_rate": 7.682489542589483e-05, + "loss": 0.5004, + "step": 33055 + }, + { + "epoch": 1.6783769719891866, + "grad_norm": 0.03152479859049114, + "learning_rate": 7.670694629321511e-05, + "loss": 0.4711, + "step": 33060 + }, + { + "epoch": 1.678630808087218, + "grad_norm": 0.020936664773053695, + "learning_rate": 7.658908025085076e-05, + "loss": 0.4632, + "step": 33065 + }, + { + "epoch": 1.6788846441852496, + "grad_norm": 0.021671379054153835, + "learning_rate": 7.647129732193859e-05, + "loss": 0.4327, + "step": 33070 + }, + { + "epoch": 1.6791384802832812, + "grad_norm": 0.023902127255322248, + "learning_rate": 7.635359752959841e-05, + "loss": 0.4704, + "step": 33075 + }, + { + "epoch": 1.6793923163813127, + "grad_norm": 0.024510636050162397, + "learning_rate": 7.623598089693446e-05, + "loss": 0.4771, + "step": 33080 + }, + { + "epoch": 1.6796461524793442, + "grad_norm": 0.023726057600865935, + "learning_rate": 7.611844744703406e-05, + "loss": 0.471, + "step": 33085 + }, + { + "epoch": 1.6798999885773755, + "grad_norm": 0.022957493744582655, + "learning_rate": 7.600099720296866e-05, + "loss": 0.4646, + "step": 33090 + }, + { + "epoch": 1.680153824675407, + "grad_norm": 0.022157047872468763, + "learning_rate": 7.588363018779288e-05, + "loss": 0.4636, + "step": 33095 + }, + { + "epoch": 1.6804076607734386, + "grad_norm": 0.020795019876852835, + "learning_rate": 7.576634642454555e-05, + "loss": 0.4648, + "step": 33100 + }, + { + "epoch": 1.68066149687147, + "grad_norm": 0.02546400520836241, + "learning_rate": 7.564914593624866e-05, + "loss": 0.4771, + "step": 33105 + }, + { + "epoch": 1.6809153329695015, + "grad_norm": 0.035868249174773666, + "learning_rate": 7.553202874590825e-05, + "loss": 0.4593, + "step": 33110 + }, + { + "epoch": 1.681169169067533, + "grad_norm": 0.029253511552575565, + "learning_rate": 7.54149948765136e-05, + "loss": 0.4585, + "step": 33115 + }, + { + "epoch": 1.6814230051655645, + "grad_norm": 0.021167662879100695, + "learning_rate": 7.529804435103831e-05, + "loss": 0.4654, + "step": 33120 + }, + { + "epoch": 1.681676841263596, + "grad_norm": 0.02498414369849199, + "learning_rate": 7.518117719243878e-05, + "loss": 0.4542, + "step": 33125 + }, + { + "epoch": 1.6819306773616276, + "grad_norm": 0.022195501835238156, + "learning_rate": 7.506439342365573e-05, + "loss": 0.4739, + "step": 33130 + }, + { + "epoch": 1.6821845134596591, + "grad_norm": 0.021987713178080383, + "learning_rate": 7.494769306761296e-05, + "loss": 0.4555, + "step": 33135 + }, + { + "epoch": 1.6824383495576907, + "grad_norm": 0.030374528516733935, + "learning_rate": 7.483107614721846e-05, + "loss": 0.4736, + "step": 33140 + }, + { + "epoch": 1.6826921856557222, + "grad_norm": 0.02149051980457975, + "learning_rate": 7.471454268536338e-05, + "loss": 0.45, + "step": 33145 + }, + { + "epoch": 1.6829460217537537, + "grad_norm": 0.022265453269349957, + "learning_rate": 7.459809270492252e-05, + "loss": 0.4573, + "step": 33150 + }, + { + "epoch": 1.683199857851785, + "grad_norm": 0.03210156404072531, + "learning_rate": 7.448172622875477e-05, + "loss": 0.462, + "step": 33155 + }, + { + "epoch": 1.6834536939498166, + "grad_norm": 0.027726727483721063, + "learning_rate": 7.436544327970191e-05, + "loss": 0.4343, + "step": 33160 + }, + { + "epoch": 1.6837075300478481, + "grad_norm": 0.026983110267163848, + "learning_rate": 7.424924388059007e-05, + "loss": 0.4746, + "step": 33165 + }, + { + "epoch": 1.6839613661458797, + "grad_norm": 0.022811847390947975, + "learning_rate": 7.413312805422834e-05, + "loss": 0.475, + "step": 33170 + }, + { + "epoch": 1.684215202243911, + "grad_norm": 0.022500674142276494, + "learning_rate": 7.40170958234097e-05, + "loss": 0.4579, + "step": 33175 + }, + { + "epoch": 1.6844690383419425, + "grad_norm": 0.021612375802658696, + "learning_rate": 7.390114721091084e-05, + "loss": 0.4414, + "step": 33180 + }, + { + "epoch": 1.684722874439974, + "grad_norm": 0.0248775111800712, + "learning_rate": 7.378528223949194e-05, + "loss": 0.5039, + "step": 33185 + }, + { + "epoch": 1.6849767105380056, + "grad_norm": 0.02388192498972222, + "learning_rate": 7.366950093189651e-05, + "loss": 0.4526, + "step": 33190 + }, + { + "epoch": 1.685230546636037, + "grad_norm": 0.030723879806664092, + "learning_rate": 7.355380331085205e-05, + "loss": 0.4692, + "step": 33195 + }, + { + "epoch": 1.6854843827340686, + "grad_norm": 0.018952594935604992, + "learning_rate": 7.343818939906915e-05, + "loss": 0.4533, + "step": 33200 + }, + { + "epoch": 1.6857382188321002, + "grad_norm": 0.021896935877159233, + "learning_rate": 7.332265921924258e-05, + "loss": 0.4908, + "step": 33205 + }, + { + "epoch": 1.6859920549301317, + "grad_norm": 0.024915232475315307, + "learning_rate": 7.320721279405002e-05, + "loss": 0.4696, + "step": 33210 + }, + { + "epoch": 1.6862458910281632, + "grad_norm": 0.023173557196539376, + "learning_rate": 7.309185014615333e-05, + "loss": 0.4438, + "step": 33215 + }, + { + "epoch": 1.6864997271261948, + "grad_norm": 0.023554891519709124, + "learning_rate": 7.29765712981973e-05, + "loss": 0.4373, + "step": 33220 + }, + { + "epoch": 1.686753563224226, + "grad_norm": 0.019733917611659817, + "learning_rate": 7.286137627281092e-05, + "loss": 0.4432, + "step": 33225 + }, + { + "epoch": 1.6870073993222576, + "grad_norm": 0.02160655258996359, + "learning_rate": 7.274626509260612e-05, + "loss": 0.4774, + "step": 33230 + }, + { + "epoch": 1.6872612354202892, + "grad_norm": 0.02405968481850796, + "learning_rate": 7.263123778017877e-05, + "loss": 0.4548, + "step": 33235 + }, + { + "epoch": 1.6875150715183205, + "grad_norm": 0.024128327278038663, + "learning_rate": 7.251629435810825e-05, + "loss": 0.4642, + "step": 33240 + }, + { + "epoch": 1.687768907616352, + "grad_norm": 0.024693827881499066, + "learning_rate": 7.240143484895718e-05, + "loss": 0.4463, + "step": 33245 + }, + { + "epoch": 1.6880227437143835, + "grad_norm": 0.03310653970465261, + "learning_rate": 7.228665927527217e-05, + "loss": 0.4786, + "step": 33250 + }, + { + "epoch": 1.688276579812415, + "grad_norm": 0.024219301603149616, + "learning_rate": 7.217196765958278e-05, + "loss": 0.4417, + "step": 33255 + }, + { + "epoch": 1.6885304159104466, + "grad_norm": 0.020412058664254072, + "learning_rate": 7.205736002440272e-05, + "loss": 0.4324, + "step": 33260 + }, + { + "epoch": 1.6887842520084781, + "grad_norm": 0.020652073502858727, + "learning_rate": 7.19428363922286e-05, + "loss": 0.4505, + "step": 33265 + }, + { + "epoch": 1.6890380881065097, + "grad_norm": 0.020752974694944956, + "learning_rate": 7.18283967855411e-05, + "loss": 0.4729, + "step": 33270 + }, + { + "epoch": 1.6892919242045412, + "grad_norm": 0.026444187207730747, + "learning_rate": 7.171404122680391e-05, + "loss": 0.4503, + "step": 33275 + }, + { + "epoch": 1.6895457603025728, + "grad_norm": 0.024097349847725474, + "learning_rate": 7.159976973846466e-05, + "loss": 0.4576, + "step": 33280 + }, + { + "epoch": 1.6897995964006043, + "grad_norm": 0.02500842430792348, + "learning_rate": 7.14855823429541e-05, + "loss": 0.4684, + "step": 33285 + }, + { + "epoch": 1.6900534324986356, + "grad_norm": 0.022174452172228635, + "learning_rate": 7.137147906268682e-05, + "loss": 0.4605, + "step": 33290 + }, + { + "epoch": 1.6903072685966671, + "grad_norm": 0.02242173750304481, + "learning_rate": 7.125745992006044e-05, + "loss": 0.4677, + "step": 33295 + }, + { + "epoch": 1.6905611046946987, + "grad_norm": 0.023083145288370695, + "learning_rate": 7.114352493745674e-05, + "loss": 0.4695, + "step": 33300 + }, + { + "epoch": 1.69081494079273, + "grad_norm": 0.02381655314266417, + "learning_rate": 7.102967413724027e-05, + "loss": 0.4685, + "step": 33305 + }, + { + "epoch": 1.6910687768907615, + "grad_norm": 0.024080193683219564, + "learning_rate": 7.091590754175963e-05, + "loss": 0.445, + "step": 33310 + }, + { + "epoch": 1.691322612988793, + "grad_norm": 0.021690243710150815, + "learning_rate": 7.080222517334639e-05, + "loss": 0.4125, + "step": 33315 + }, + { + "epoch": 1.6915764490868246, + "grad_norm": 0.021918256209021103, + "learning_rate": 7.068862705431601e-05, + "loss": 0.4648, + "step": 33320 + }, + { + "epoch": 1.6918302851848561, + "grad_norm": 0.022452404320154225, + "learning_rate": 7.057511320696708e-05, + "loss": 0.4451, + "step": 33325 + }, + { + "epoch": 1.6920841212828877, + "grad_norm": 0.020322901723341114, + "learning_rate": 7.046168365358202e-05, + "loss": 0.4244, + "step": 33330 + }, + { + "epoch": 1.6923379573809192, + "grad_norm": 0.026597028609778042, + "learning_rate": 7.034833841642624e-05, + "loss": 0.4772, + "step": 33335 + }, + { + "epoch": 1.6925917934789507, + "grad_norm": 0.02102494226253916, + "learning_rate": 7.023507751774905e-05, + "loss": 0.4323, + "step": 33340 + }, + { + "epoch": 1.6928456295769823, + "grad_norm": 0.021714702487951786, + "learning_rate": 7.012190097978282e-05, + "loss": 0.449, + "step": 33345 + }, + { + "epoch": 1.6930994656750138, + "grad_norm": 0.02158282445558592, + "learning_rate": 7.000880882474375e-05, + "loss": 0.447, + "step": 33350 + }, + { + "epoch": 1.693353301773045, + "grad_norm": 0.02078501840382419, + "learning_rate": 6.989580107483102e-05, + "loss": 0.452, + "step": 33355 + }, + { + "epoch": 1.6936071378710766, + "grad_norm": 0.022296558827429345, + "learning_rate": 6.978287775222758e-05, + "loss": 0.4745, + "step": 33360 + }, + { + "epoch": 1.6938609739691082, + "grad_norm": 0.022424807973195095, + "learning_rate": 6.967003887909989e-05, + "loss": 0.4309, + "step": 33365 + }, + { + "epoch": 1.6941148100671395, + "grad_norm": 0.022647524848265772, + "learning_rate": 6.95572844775974e-05, + "loss": 0.4513, + "step": 33370 + }, + { + "epoch": 1.694368646165171, + "grad_norm": 0.02604844884193192, + "learning_rate": 6.944461456985346e-05, + "loss": 0.4496, + "step": 33375 + }, + { + "epoch": 1.6946224822632026, + "grad_norm": 0.02231782508277886, + "learning_rate": 6.933202917798443e-05, + "loss": 0.4345, + "step": 33380 + }, + { + "epoch": 1.694876318361234, + "grad_norm": 0.027331402316067194, + "learning_rate": 6.92195283240904e-05, + "loss": 0.452, + "step": 33385 + }, + { + "epoch": 1.6951301544592656, + "grad_norm": 0.02244181112322746, + "learning_rate": 6.910711203025455e-05, + "loss": 0.4548, + "step": 33390 + }, + { + "epoch": 1.6953839905572972, + "grad_norm": 0.031539164754729246, + "learning_rate": 6.89947803185439e-05, + "loss": 0.4876, + "step": 33395 + }, + { + "epoch": 1.6956378266553287, + "grad_norm": 0.029750827597486304, + "learning_rate": 6.888253321100829e-05, + "loss": 0.4631, + "step": 33400 + }, + { + "epoch": 1.6958916627533602, + "grad_norm": 0.02054220426689229, + "learning_rate": 6.877037072968157e-05, + "loss": 0.433, + "step": 33405 + }, + { + "epoch": 1.6961454988513918, + "grad_norm": 0.022921757508354648, + "learning_rate": 6.865829289658044e-05, + "loss": 0.4793, + "step": 33410 + }, + { + "epoch": 1.6963993349494233, + "grad_norm": 0.02272022241679228, + "learning_rate": 6.85462997337053e-05, + "loss": 0.4618, + "step": 33415 + }, + { + "epoch": 1.6966531710474546, + "grad_norm": 0.021335577126739804, + "learning_rate": 6.843439126303985e-05, + "loss": 0.4481, + "step": 33420 + }, + { + "epoch": 1.6969070071454861, + "grad_norm": 0.024313607070824157, + "learning_rate": 6.83225675065513e-05, + "loss": 0.4399, + "step": 33425 + }, + { + "epoch": 1.6971608432435177, + "grad_norm": 0.02095148113976031, + "learning_rate": 6.821082848618988e-05, + "loss": 0.4599, + "step": 33430 + }, + { + "epoch": 1.6974146793415492, + "grad_norm": 0.023138372142731362, + "learning_rate": 6.809917422388961e-05, + "loss": 0.4756, + "step": 33435 + }, + { + "epoch": 1.6976685154395805, + "grad_norm": 0.022979590715020475, + "learning_rate": 6.798760474156745e-05, + "loss": 0.4845, + "step": 33440 + }, + { + "epoch": 1.697922351537612, + "grad_norm": 0.02503552519860586, + "learning_rate": 6.787612006112409e-05, + "loss": 0.4802, + "step": 33445 + }, + { + "epoch": 1.6981761876356436, + "grad_norm": 0.02551716298674583, + "learning_rate": 6.77647202044433e-05, + "loss": 0.4388, + "step": 33450 + }, + { + "epoch": 1.6984300237336751, + "grad_norm": 0.02430410214676648, + "learning_rate": 6.765340519339252e-05, + "loss": 0.4553, + "step": 33455 + }, + { + "epoch": 1.6986838598317067, + "grad_norm": 0.02781257765021097, + "learning_rate": 6.754217504982202e-05, + "loss": 0.4474, + "step": 33460 + }, + { + "epoch": 1.6989376959297382, + "grad_norm": 0.021901755276533954, + "learning_rate": 6.743102979556604e-05, + "loss": 0.4726, + "step": 33465 + }, + { + "epoch": 1.6991915320277697, + "grad_norm": 0.02314556208967541, + "learning_rate": 6.731996945244162e-05, + "loss": 0.435, + "step": 33470 + }, + { + "epoch": 1.6994453681258013, + "grad_norm": 0.0216630417947033, + "learning_rate": 6.720899404224934e-05, + "loss": 0.4521, + "step": 33475 + }, + { + "epoch": 1.6996992042238328, + "grad_norm": 0.02078810198958909, + "learning_rate": 6.709810358677337e-05, + "loss": 0.4387, + "step": 33480 + }, + { + "epoch": 1.6999530403218641, + "grad_norm": 0.021858124288052016, + "learning_rate": 6.698729810778065e-05, + "loss": 0.4292, + "step": 33485 + }, + { + "epoch": 1.7002068764198957, + "grad_norm": 0.02220940183114807, + "learning_rate": 6.687657762702203e-05, + "loss": 0.4541, + "step": 33490 + }, + { + "epoch": 1.7004607125179272, + "grad_norm": 0.021898318644871732, + "learning_rate": 6.67659421662311e-05, + "loss": 0.4709, + "step": 33495 + }, + { + "epoch": 1.7007145486159587, + "grad_norm": 0.019784763924519774, + "learning_rate": 6.665539174712532e-05, + "loss": 0.4626, + "step": 33500 + }, + { + "epoch": 1.70096838471399, + "grad_norm": 0.022919389047645698, + "learning_rate": 6.654492639140492e-05, + "loss": 0.4507, + "step": 33505 + }, + { + "epoch": 1.7012222208120216, + "grad_norm": 0.02271121221914657, + "learning_rate": 6.643454612075395e-05, + "loss": 0.4652, + "step": 33510 + }, + { + "epoch": 1.701476056910053, + "grad_norm": 0.022198266733550638, + "learning_rate": 6.632425095683925e-05, + "loss": 0.4125, + "step": 33515 + }, + { + "epoch": 1.7017298930080846, + "grad_norm": 0.023454135381851583, + "learning_rate": 6.62140409213115e-05, + "loss": 0.4537, + "step": 33520 + }, + { + "epoch": 1.7019837291061162, + "grad_norm": 0.02311814200716394, + "learning_rate": 6.610391603580412e-05, + "loss": 0.4415, + "step": 33525 + }, + { + "epoch": 1.7022375652041477, + "grad_norm": 0.02338916064340492, + "learning_rate": 6.599387632193426e-05, + "loss": 0.468, + "step": 33530 + }, + { + "epoch": 1.7024914013021792, + "grad_norm": 0.022293680542931123, + "learning_rate": 6.588392180130198e-05, + "loss": 0.4832, + "step": 33535 + }, + { + "epoch": 1.7027452374002108, + "grad_norm": 0.02228526819113989, + "learning_rate": 6.577405249549096e-05, + "loss": 0.4406, + "step": 33540 + }, + { + "epoch": 1.7029990734982423, + "grad_norm": 0.02669841047779239, + "learning_rate": 6.566426842606793e-05, + "loss": 0.4683, + "step": 33545 + }, + { + "epoch": 1.7032529095962738, + "grad_norm": 0.020687449495207438, + "learning_rate": 6.555456961458311e-05, + "loss": 0.4788, + "step": 33550 + }, + { + "epoch": 1.7035067456943052, + "grad_norm": 0.02316737056094181, + "learning_rate": 6.544495608256957e-05, + "loss": 0.4479, + "step": 33555 + }, + { + "epoch": 1.7037605817923367, + "grad_norm": 0.02015658685414917, + "learning_rate": 6.533542785154412e-05, + "loss": 0.4675, + "step": 33560 + }, + { + "epoch": 1.7040144178903682, + "grad_norm": 0.024630709903064985, + "learning_rate": 6.522598494300647e-05, + "loss": 0.4612, + "step": 33565 + }, + { + "epoch": 1.7042682539883995, + "grad_norm": 0.021211217998291603, + "learning_rate": 6.511662737843981e-05, + "loss": 0.4552, + "step": 33570 + }, + { + "epoch": 1.704522090086431, + "grad_norm": 0.022897383797772494, + "learning_rate": 6.500735517931033e-05, + "loss": 0.4553, + "step": 33575 + }, + { + "epoch": 1.7047759261844626, + "grad_norm": 0.021403658562759068, + "learning_rate": 6.489816836706786e-05, + "loss": 0.4417, + "step": 33580 + }, + { + "epoch": 1.7050297622824941, + "grad_norm": 0.024079482108871682, + "learning_rate": 6.478906696314496e-05, + "loss": 0.4456, + "step": 33585 + }, + { + "epoch": 1.7052835983805257, + "grad_norm": 0.02474012090426629, + "learning_rate": 6.468005098895797e-05, + "loss": 0.4528, + "step": 33590 + }, + { + "epoch": 1.7055374344785572, + "grad_norm": 0.025738626621193236, + "learning_rate": 6.457112046590585e-05, + "loss": 0.5034, + "step": 33595 + }, + { + "epoch": 1.7057912705765887, + "grad_norm": 0.0219847722352076, + "learning_rate": 6.446227541537136e-05, + "loss": 0.4519, + "step": 33600 + }, + { + "epoch": 1.7060451066746203, + "grad_norm": 0.023838238873203427, + "learning_rate": 6.43535158587203e-05, + "loss": 0.4685, + "step": 33605 + }, + { + "epoch": 1.7062989427726518, + "grad_norm": 0.02133132469936017, + "learning_rate": 6.424484181730134e-05, + "loss": 0.4578, + "step": 33610 + }, + { + "epoch": 1.7065527788706834, + "grad_norm": 0.020615296500899355, + "learning_rate": 6.413625331244698e-05, + "loss": 0.4472, + "step": 33615 + }, + { + "epoch": 1.7068066149687147, + "grad_norm": 0.02205130832162417, + "learning_rate": 6.402775036547231e-05, + "loss": 0.4635, + "step": 33620 + }, + { + "epoch": 1.7070604510667462, + "grad_norm": 0.022030504089429213, + "learning_rate": 6.391933299767622e-05, + "loss": 0.4566, + "step": 33625 + }, + { + "epoch": 1.7073142871647777, + "grad_norm": 0.026096405850323693, + "learning_rate": 6.381100123034017e-05, + "loss": 0.4668, + "step": 33630 + }, + { + "epoch": 1.707568123262809, + "grad_norm": 0.021626998107609114, + "learning_rate": 6.370275508472945e-05, + "loss": 0.4256, + "step": 33635 + }, + { + "epoch": 1.7078219593608406, + "grad_norm": 0.020387795966162997, + "learning_rate": 6.359459458209194e-05, + "loss": 0.4587, + "step": 33640 + }, + { + "epoch": 1.7080757954588721, + "grad_norm": 0.03128743884752836, + "learning_rate": 6.348651974365932e-05, + "loss": 0.4348, + "step": 33645 + }, + { + "epoch": 1.7083296315569036, + "grad_norm": 0.022169939171472575, + "learning_rate": 6.337853059064586e-05, + "loss": 0.4487, + "step": 33650 + }, + { + "epoch": 1.7085834676549352, + "grad_norm": 0.021740231589529504, + "learning_rate": 6.327062714424946e-05, + "loss": 0.4785, + "step": 33655 + }, + { + "epoch": 1.7088373037529667, + "grad_norm": 0.023768265517867246, + "learning_rate": 6.31628094256509e-05, + "loss": 0.4787, + "step": 33660 + }, + { + "epoch": 1.7090911398509983, + "grad_norm": 0.024960047338923673, + "learning_rate": 6.305507745601446e-05, + "loss": 0.4519, + "step": 33665 + }, + { + "epoch": 1.7093449759490298, + "grad_norm": 0.02652925712341185, + "learning_rate": 6.294743125648722e-05, + "loss": 0.432, + "step": 33670 + }, + { + "epoch": 1.7095988120470613, + "grad_norm": 0.021458655669676602, + "learning_rate": 6.28398708481997e-05, + "loss": 0.4734, + "step": 33675 + }, + { + "epoch": 1.7098526481450929, + "grad_norm": 0.02287235529136257, + "learning_rate": 6.273239625226534e-05, + "loss": 0.4538, + "step": 33680 + }, + { + "epoch": 1.7101064842431242, + "grad_norm": 0.02228112960841006, + "learning_rate": 6.262500748978106e-05, + "loss": 0.4456, + "step": 33685 + }, + { + "epoch": 1.7103603203411557, + "grad_norm": 0.021753237335259395, + "learning_rate": 6.251770458182654e-05, + "loss": 0.4251, + "step": 33690 + }, + { + "epoch": 1.7106141564391872, + "grad_norm": 0.02618127618582834, + "learning_rate": 6.241048754946493e-05, + "loss": 0.4458, + "step": 33695 + }, + { + "epoch": 1.7108679925372186, + "grad_norm": 0.022092349146015322, + "learning_rate": 6.23033564137423e-05, + "loss": 0.4643, + "step": 33700 + }, + { + "epoch": 1.71112182863525, + "grad_norm": 0.02659549264775889, + "learning_rate": 6.219631119568814e-05, + "loss": 0.4926, + "step": 33705 + }, + { + "epoch": 1.7113756647332816, + "grad_norm": 0.021634033709659647, + "learning_rate": 6.208935191631465e-05, + "loss": 0.4538, + "step": 33710 + }, + { + "epoch": 1.7116295008313132, + "grad_norm": 0.020836889426777406, + "learning_rate": 6.19824785966176e-05, + "loss": 0.4703, + "step": 33715 + }, + { + "epoch": 1.7118833369293447, + "grad_norm": 0.01971938523489474, + "learning_rate": 6.187569125757553e-05, + "loss": 0.4523, + "step": 33720 + }, + { + "epoch": 1.7121371730273762, + "grad_norm": 0.021843066134417412, + "learning_rate": 6.176898992015034e-05, + "loss": 0.4278, + "step": 33725 + }, + { + "epoch": 1.7123910091254078, + "grad_norm": 0.020570604910276645, + "learning_rate": 6.166237460528706e-05, + "loss": 0.4728, + "step": 33730 + }, + { + "epoch": 1.7126448452234393, + "grad_norm": 0.019151004234894124, + "learning_rate": 6.155584533391356e-05, + "loss": 0.4478, + "step": 33735 + }, + { + "epoch": 1.7128986813214708, + "grad_norm": 0.020268446809525022, + "learning_rate": 6.144940212694122e-05, + "loss": 0.4503, + "step": 33740 + }, + { + "epoch": 1.7131525174195024, + "grad_norm": 0.024437006226734212, + "learning_rate": 6.134304500526411e-05, + "loss": 0.4756, + "step": 33745 + }, + { + "epoch": 1.7134063535175337, + "grad_norm": 0.02201855727722671, + "learning_rate": 6.123677398975974e-05, + "loss": 0.4507, + "step": 33750 + }, + { + "epoch": 1.7136601896155652, + "grad_norm": 0.021612750422246748, + "learning_rate": 6.11305891012885e-05, + "loss": 0.4466, + "step": 33755 + }, + { + "epoch": 1.7139140257135967, + "grad_norm": 0.02317623568444332, + "learning_rate": 6.1024490360694016e-05, + "loss": 0.4361, + "step": 33760 + }, + { + "epoch": 1.7141678618116283, + "grad_norm": 0.022813604709884243, + "learning_rate": 6.091847778880283e-05, + "loss": 0.4513, + "step": 33765 + }, + { + "epoch": 1.7144216979096596, + "grad_norm": 0.02326852728426279, + "learning_rate": 6.081255140642483e-05, + "loss": 0.4761, + "step": 33770 + }, + { + "epoch": 1.7146755340076911, + "grad_norm": 0.02386406329427358, + "learning_rate": 6.0706711234352674e-05, + "loss": 0.4465, + "step": 33775 + }, + { + "epoch": 1.7149293701057227, + "grad_norm": 0.022526987576066877, + "learning_rate": 6.06009572933624e-05, + "loss": 0.4818, + "step": 33780 + }, + { + "epoch": 1.7151832062037542, + "grad_norm": 0.024915335276817413, + "learning_rate": 6.0495289604212853e-05, + "loss": 0.4499, + "step": 33785 + }, + { + "epoch": 1.7154370423017857, + "grad_norm": 0.02046552164736613, + "learning_rate": 6.038970818764633e-05, + "loss": 0.4566, + "step": 33790 + }, + { + "epoch": 1.7156908783998173, + "grad_norm": 0.02288153652110328, + "learning_rate": 6.0284213064387586e-05, + "loss": 0.4612, + "step": 33795 + }, + { + "epoch": 1.7159447144978488, + "grad_norm": 0.02442365282755549, + "learning_rate": 6.0178804255145106e-05, + "loss": 0.4732, + "step": 33800 + }, + { + "epoch": 1.7161985505958803, + "grad_norm": 0.025400195295964864, + "learning_rate": 6.007348178060984e-05, + "loss": 0.4789, + "step": 33805 + }, + { + "epoch": 1.7164523866939119, + "grad_norm": 0.021023893448474346, + "learning_rate": 5.996824566145631e-05, + "loss": 0.454, + "step": 33810 + }, + { + "epoch": 1.7167062227919434, + "grad_norm": 0.02088473207737844, + "learning_rate": 5.98630959183416e-05, + "loss": 0.4452, + "step": 33815 + }, + { + "epoch": 1.7169600588899747, + "grad_norm": 0.021319092582090367, + "learning_rate": 5.975803257190632e-05, + "loss": 0.452, + "step": 33820 + }, + { + "epoch": 1.7172138949880063, + "grad_norm": 0.02336522341312864, + "learning_rate": 5.965305564277368e-05, + "loss": 0.4185, + "step": 33825 + }, + { + "epoch": 1.7174677310860378, + "grad_norm": 0.02176603784713292, + "learning_rate": 5.954816515155026e-05, + "loss": 0.4783, + "step": 33830 + }, + { + "epoch": 1.717721567184069, + "grad_norm": 0.021689798405552758, + "learning_rate": 5.944336111882542e-05, + "loss": 0.4443, + "step": 33835 + }, + { + "epoch": 1.7179754032821006, + "grad_norm": 0.021388359751042172, + "learning_rate": 5.933864356517177e-05, + "loss": 0.4269, + "step": 33840 + }, + { + "epoch": 1.7182292393801322, + "grad_norm": 0.021256453398072323, + "learning_rate": 5.923401251114485e-05, + "loss": 0.4655, + "step": 33845 + }, + { + "epoch": 1.7184830754781637, + "grad_norm": 0.02151794556558625, + "learning_rate": 5.9129467977283135e-05, + "loss": 0.4529, + "step": 33850 + }, + { + "epoch": 1.7187369115761952, + "grad_norm": 0.029363161811987813, + "learning_rate": 5.902500998410831e-05, + "loss": 0.4275, + "step": 33855 + }, + { + "epoch": 1.7189907476742268, + "grad_norm": 0.02440315247691098, + "learning_rate": 5.892063855212476e-05, + "loss": 0.4658, + "step": 33860 + }, + { + "epoch": 1.7192445837722583, + "grad_norm": 0.018975284982618558, + "learning_rate": 5.881635370182037e-05, + "loss": 0.4563, + "step": 33865 + }, + { + "epoch": 1.7194984198702898, + "grad_norm": 0.021764078280474785, + "learning_rate": 5.8712155453665426e-05, + "loss": 0.4476, + "step": 33870 + }, + { + "epoch": 1.7197522559683214, + "grad_norm": 0.020822416739688775, + "learning_rate": 5.8608043828113744e-05, + "loss": 0.45, + "step": 33875 + }, + { + "epoch": 1.720006092066353, + "grad_norm": 0.022089565418501236, + "learning_rate": 5.8504018845601804e-05, + "loss": 0.4551, + "step": 33880 + }, + { + "epoch": 1.7202599281643842, + "grad_norm": 0.02359465062305091, + "learning_rate": 5.840008052654927e-05, + "loss": 0.4634, + "step": 33885 + }, + { + "epoch": 1.7205137642624158, + "grad_norm": 0.029544919593911603, + "learning_rate": 5.8296228891358604e-05, + "loss": 0.4532, + "step": 33890 + }, + { + "epoch": 1.7207676003604473, + "grad_norm": 0.026839978594084928, + "learning_rate": 5.81924639604155e-05, + "loss": 0.4486, + "step": 33895 + }, + { + "epoch": 1.7210214364584786, + "grad_norm": 0.02088653909861625, + "learning_rate": 5.808878575408827e-05, + "loss": 0.4447, + "step": 33900 + }, + { + "epoch": 1.7212752725565101, + "grad_norm": 0.022797571162386343, + "learning_rate": 5.798519429272875e-05, + "loss": 0.4544, + "step": 33905 + }, + { + "epoch": 1.7215291086545417, + "grad_norm": 0.023109589614580234, + "learning_rate": 5.7881689596671226e-05, + "loss": 0.4803, + "step": 33910 + }, + { + "epoch": 1.7217829447525732, + "grad_norm": 0.020498287608990536, + "learning_rate": 5.777827168623323e-05, + "loss": 0.4638, + "step": 33915 + }, + { + "epoch": 1.7220367808506047, + "grad_norm": 0.021809212367703504, + "learning_rate": 5.767494058171507e-05, + "loss": 0.4412, + "step": 33920 + }, + { + "epoch": 1.7222906169486363, + "grad_norm": 0.01961469054428933, + "learning_rate": 5.757169630340031e-05, + "loss": 0.4349, + "step": 33925 + }, + { + "epoch": 1.7225444530466678, + "grad_norm": 0.021210553875649015, + "learning_rate": 5.7468538871555064e-05, + "loss": 0.4451, + "step": 33930 + }, + { + "epoch": 1.7227982891446993, + "grad_norm": 0.023689032069760745, + "learning_rate": 5.736546830642886e-05, + "loss": 0.4785, + "step": 33935 + }, + { + "epoch": 1.7230521252427309, + "grad_norm": 0.022017968208859488, + "learning_rate": 5.726248462825373e-05, + "loss": 0.4336, + "step": 33940 + }, + { + "epoch": 1.7233059613407624, + "grad_norm": 0.021437511540021992, + "learning_rate": 5.715958785724501e-05, + "loss": 0.4562, + "step": 33945 + }, + { + "epoch": 1.7235597974387937, + "grad_norm": 0.021235793273451938, + "learning_rate": 5.705677801360065e-05, + "loss": 0.4432, + "step": 33950 + }, + { + "epoch": 1.7238136335368253, + "grad_norm": 0.02204282859521283, + "learning_rate": 5.69540551175019e-05, + "loss": 0.4808, + "step": 33955 + }, + { + "epoch": 1.7240674696348568, + "grad_norm": 0.022958015134989514, + "learning_rate": 5.6851419189112575e-05, + "loss": 0.4472, + "step": 33960 + }, + { + "epoch": 1.724321305732888, + "grad_norm": 0.022808401192192258, + "learning_rate": 5.6748870248579666e-05, + "loss": 0.4495, + "step": 33965 + }, + { + "epoch": 1.7245751418309196, + "grad_norm": 0.02509495051395949, + "learning_rate": 5.6646408316033185e-05, + "loss": 0.4662, + "step": 33970 + }, + { + "epoch": 1.7248289779289512, + "grad_norm": 0.022427454109016656, + "learning_rate": 5.654403341158565e-05, + "loss": 0.4887, + "step": 33975 + }, + { + "epoch": 1.7250828140269827, + "grad_norm": 0.02867990044631863, + "learning_rate": 5.644174555533288e-05, + "loss": 0.4325, + "step": 33980 + }, + { + "epoch": 1.7253366501250142, + "grad_norm": 0.02527782595636069, + "learning_rate": 5.633954476735337e-05, + "loss": 0.455, + "step": 33985 + }, + { + "epoch": 1.7255904862230458, + "grad_norm": 0.02978500394864718, + "learning_rate": 5.623743106770879e-05, + "loss": 0.4693, + "step": 33990 + }, + { + "epoch": 1.7258443223210773, + "grad_norm": 0.021989421251067075, + "learning_rate": 5.6135404476443384e-05, + "loss": 0.4501, + "step": 33995 + }, + { + "epoch": 1.7260981584191089, + "grad_norm": 0.02414972943152398, + "learning_rate": 5.603346501358458e-05, + "loss": 0.5015, + "step": 34000 + }, + { + "epoch": 1.7263519945171404, + "grad_norm": 0.020959704894745596, + "learning_rate": 5.593161269914249e-05, + "loss": 0.4601, + "step": 34005 + }, + { + "epoch": 1.726605830615172, + "grad_norm": 0.01883199147720315, + "learning_rate": 5.5829847553110326e-05, + "loss": 0.4297, + "step": 34010 + }, + { + "epoch": 1.7268596667132032, + "grad_norm": 0.02135197011443074, + "learning_rate": 5.572816959546389e-05, + "loss": 0.4358, + "step": 34015 + }, + { + "epoch": 1.7271135028112348, + "grad_norm": 0.02107503468536846, + "learning_rate": 5.562657884616223e-05, + "loss": 0.4669, + "step": 34020 + }, + { + "epoch": 1.7273673389092663, + "grad_norm": 0.023771596018301224, + "learning_rate": 5.5525075325147054e-05, + "loss": 0.4587, + "step": 34025 + }, + { + "epoch": 1.7276211750072978, + "grad_norm": 0.021587351694255002, + "learning_rate": 5.542365905234309e-05, + "loss": 0.4481, + "step": 34030 + }, + { + "epoch": 1.7278750111053292, + "grad_norm": 0.020770474747904666, + "learning_rate": 5.532233004765763e-05, + "loss": 0.4439, + "step": 34035 + }, + { + "epoch": 1.7281288472033607, + "grad_norm": 0.01967096579673539, + "learning_rate": 5.5221088330981274e-05, + "loss": 0.4629, + "step": 34040 + }, + { + "epoch": 1.7283826833013922, + "grad_norm": 0.02081313356039374, + "learning_rate": 5.5119933922187115e-05, + "loss": 0.4447, + "step": 34045 + }, + { + "epoch": 1.7286365193994238, + "grad_norm": 0.030103240651800995, + "learning_rate": 5.501886684113139e-05, + "loss": 0.448, + "step": 34050 + }, + { + "epoch": 1.7288903554974553, + "grad_norm": 0.021992967904753014, + "learning_rate": 5.491788710765289e-05, + "loss": 0.4657, + "step": 34055 + }, + { + "epoch": 1.7291441915954868, + "grad_norm": 0.024640038006979705, + "learning_rate": 5.481699474157364e-05, + "loss": 0.4678, + "step": 34060 + }, + { + "epoch": 1.7293980276935184, + "grad_norm": 0.02259317629920808, + "learning_rate": 5.4716189762698044e-05, + "loss": 0.4526, + "step": 34065 + }, + { + "epoch": 1.72965186379155, + "grad_norm": 0.021047781618106692, + "learning_rate": 5.461547219081392e-05, + "loss": 0.4498, + "step": 34070 + }, + { + "epoch": 1.7299056998895814, + "grad_norm": 0.024835964339130236, + "learning_rate": 5.4514842045691346e-05, + "loss": 0.4491, + "step": 34075 + }, + { + "epoch": 1.730159535987613, + "grad_norm": 0.023902080014017398, + "learning_rate": 5.441429934708369e-05, + "loss": 0.4293, + "step": 34080 + }, + { + "epoch": 1.7304133720856443, + "grad_norm": 0.022346544604734297, + "learning_rate": 5.431384411472701e-05, + "loss": 0.475, + "step": 34085 + }, + { + "epoch": 1.7306672081836758, + "grad_norm": 0.022715026835179515, + "learning_rate": 5.421347636834001e-05, + "loss": 0.4446, + "step": 34090 + }, + { + "epoch": 1.7309210442817073, + "grad_norm": 0.022933158827406614, + "learning_rate": 5.411319612762455e-05, + "loss": 0.4259, + "step": 34095 + }, + { + "epoch": 1.7311748803797387, + "grad_norm": 0.024421520760028334, + "learning_rate": 5.4013003412265004e-05, + "loss": 0.4691, + "step": 34100 + }, + { + "epoch": 1.7314287164777702, + "grad_norm": 0.02420532978107086, + "learning_rate": 5.3912898241928796e-05, + "loss": 0.4405, + "step": 34105 + }, + { + "epoch": 1.7316825525758017, + "grad_norm": 0.02170211562927459, + "learning_rate": 5.3812880636265935e-05, + "loss": 0.4729, + "step": 34110 + }, + { + "epoch": 1.7319363886738333, + "grad_norm": 0.02370624185573575, + "learning_rate": 5.371295061490961e-05, + "loss": 0.4741, + "step": 34115 + }, + { + "epoch": 1.7321902247718648, + "grad_norm": 0.021244344362853385, + "learning_rate": 5.3613108197475335e-05, + "loss": 0.4529, + "step": 34120 + }, + { + "epoch": 1.7324440608698963, + "grad_norm": 0.02100785970567331, + "learning_rate": 5.3513353403561895e-05, + "loss": 0.462, + "step": 34125 + }, + { + "epoch": 1.7326978969679279, + "grad_norm": 0.019808061436518504, + "learning_rate": 5.3413686252750445e-05, + "loss": 0.4525, + "step": 34130 + }, + { + "epoch": 1.7329517330659594, + "grad_norm": 0.02187199410993585, + "learning_rate": 5.3314106764605354e-05, + "loss": 0.4645, + "step": 34135 + }, + { + "epoch": 1.733205569163991, + "grad_norm": 0.024006944977098717, + "learning_rate": 5.32146149586733e-05, + "loss": 0.4742, + "step": 34140 + }, + { + "epoch": 1.7334594052620225, + "grad_norm": 0.019976476111784883, + "learning_rate": 5.3115210854484394e-05, + "loss": 0.4409, + "step": 34145 + }, + { + "epoch": 1.7337132413600538, + "grad_norm": 0.026385558195068118, + "learning_rate": 5.301589447155092e-05, + "loss": 0.4456, + "step": 34150 + }, + { + "epoch": 1.7339670774580853, + "grad_norm": 0.021897238374891215, + "learning_rate": 5.2916665829368324e-05, + "loss": 0.4438, + "step": 34155 + }, + { + "epoch": 1.7342209135561168, + "grad_norm": 0.024811420195424226, + "learning_rate": 5.281752494741454e-05, + "loss": 0.4627, + "step": 34160 + }, + { + "epoch": 1.7344747496541482, + "grad_norm": 0.024672823583302583, + "learning_rate": 5.2718471845150604e-05, + "loss": 0.4759, + "step": 34165 + }, + { + "epoch": 1.7347285857521797, + "grad_norm": 0.02181462463091039, + "learning_rate": 5.261950654201997e-05, + "loss": 0.4559, + "step": 34170 + }, + { + "epoch": 1.7349824218502112, + "grad_norm": 0.024684395145582377, + "learning_rate": 5.252062905744926e-05, + "loss": 0.4422, + "step": 34175 + }, + { + "epoch": 1.7352362579482428, + "grad_norm": 0.02230800553146116, + "learning_rate": 5.2421839410847436e-05, + "loss": 0.4637, + "step": 34180 + }, + { + "epoch": 1.7354900940462743, + "grad_norm": 0.023147228626355657, + "learning_rate": 5.2323137621606345e-05, + "loss": 0.456, + "step": 34185 + }, + { + "epoch": 1.7357439301443058, + "grad_norm": 0.02281138844618608, + "learning_rate": 5.2224523709100914e-05, + "loss": 0.4666, + "step": 34190 + }, + { + "epoch": 1.7359977662423374, + "grad_norm": 0.022744297929036067, + "learning_rate": 5.212599769268833e-05, + "loss": 0.4577, + "step": 34195 + }, + { + "epoch": 1.736251602340369, + "grad_norm": 0.026435757976267977, + "learning_rate": 5.202755959170885e-05, + "loss": 0.4792, + "step": 34200 + }, + { + "epoch": 1.7365054384384004, + "grad_norm": 0.02327427177805453, + "learning_rate": 5.1929209425485346e-05, + "loss": 0.454, + "step": 34205 + }, + { + "epoch": 1.736759274536432, + "grad_norm": 0.019262280027333455, + "learning_rate": 5.1830947213323656e-05, + "loss": 0.4344, + "step": 34210 + }, + { + "epoch": 1.7370131106344633, + "grad_norm": 0.02020587242748104, + "learning_rate": 5.17327729745119e-05, + "loss": 0.4687, + "step": 34215 + }, + { + "epoch": 1.7372669467324948, + "grad_norm": 0.022404415922131816, + "learning_rate": 5.163468672832139e-05, + "loss": 0.444, + "step": 34220 + }, + { + "epoch": 1.7375207828305264, + "grad_norm": 0.022425367350743124, + "learning_rate": 5.1536688494005835e-05, + "loss": 0.4682, + "step": 34225 + }, + { + "epoch": 1.7377746189285577, + "grad_norm": 0.022763870580745405, + "learning_rate": 5.14387782908019e-05, + "loss": 0.4601, + "step": 34230 + }, + { + "epoch": 1.7380284550265892, + "grad_norm": 0.023042118858273286, + "learning_rate": 5.134095613792872e-05, + "loss": 0.4637, + "step": 34235 + }, + { + "epoch": 1.7382822911246207, + "grad_norm": 0.02604482240900869, + "learning_rate": 5.124322205458848e-05, + "loss": 0.4382, + "step": 34240 + }, + { + "epoch": 1.7385361272226523, + "grad_norm": 0.021731629118018583, + "learning_rate": 5.1145576059965726e-05, + "loss": 0.4654, + "step": 34245 + }, + { + "epoch": 1.7387899633206838, + "grad_norm": 0.019413019416783566, + "learning_rate": 5.1048018173228015e-05, + "loss": 0.4529, + "step": 34250 + }, + { + "epoch": 1.7390437994187153, + "grad_norm": 0.02664943630561449, + "learning_rate": 5.0950548413525365e-05, + "loss": 0.4562, + "step": 34255 + }, + { + "epoch": 1.7392976355167469, + "grad_norm": 0.02087198766602968, + "learning_rate": 5.085316679999064e-05, + "loss": 0.4422, + "step": 34260 + }, + { + "epoch": 1.7395514716147784, + "grad_norm": 0.021876909463953986, + "learning_rate": 5.075587335173948e-05, + "loss": 0.4453, + "step": 34265 + }, + { + "epoch": 1.73980530771281, + "grad_norm": 0.02240544876246522, + "learning_rate": 5.06586680878699e-05, + "loss": 0.4302, + "step": 34270 + }, + { + "epoch": 1.7400591438108415, + "grad_norm": 0.020994222583535893, + "learning_rate": 5.056155102746302e-05, + "loss": 0.4592, + "step": 34275 + }, + { + "epoch": 1.7403129799088728, + "grad_norm": 0.022701990879272254, + "learning_rate": 5.0464522189582194e-05, + "loss": 0.4851, + "step": 34280 + }, + { + "epoch": 1.7405668160069043, + "grad_norm": 0.027194326586476105, + "learning_rate": 5.036758159327398e-05, + "loss": 0.4818, + "step": 34285 + }, + { + "epoch": 1.7408206521049359, + "grad_norm": 0.02078698225293263, + "learning_rate": 5.027072925756709e-05, + "loss": 0.4662, + "step": 34290 + }, + { + "epoch": 1.7410744882029674, + "grad_norm": 0.02420724997035512, + "learning_rate": 5.017396520147333e-05, + "loss": 0.4687, + "step": 34295 + }, + { + "epoch": 1.7413283243009987, + "grad_norm": 0.031193300288096932, + "learning_rate": 5.007728944398682e-05, + "loss": 0.4567, + "step": 34300 + }, + { + "epoch": 1.7415821603990302, + "grad_norm": 0.020794031282449752, + "learning_rate": 4.9980702004084724e-05, + "loss": 0.4529, + "step": 34305 + }, + { + "epoch": 1.7418359964970618, + "grad_norm": 0.022257297830772214, + "learning_rate": 4.9884202900726486e-05, + "loss": 0.4428, + "step": 34310 + }, + { + "epoch": 1.7420898325950933, + "grad_norm": 0.019932049025810045, + "learning_rate": 4.978779215285456e-05, + "loss": 0.4386, + "step": 34315 + }, + { + "epoch": 1.7423436686931248, + "grad_norm": 0.026391941731528173, + "learning_rate": 4.9691469779393706e-05, + "loss": 0.464, + "step": 34320 + }, + { + "epoch": 1.7425975047911564, + "grad_norm": 0.022393195253757603, + "learning_rate": 4.959523579925179e-05, + "loss": 0.4803, + "step": 34325 + }, + { + "epoch": 1.742851340889188, + "grad_norm": 0.029090252728671346, + "learning_rate": 4.949909023131888e-05, + "loss": 0.4668, + "step": 34330 + }, + { + "epoch": 1.7431051769872195, + "grad_norm": 0.024897631086516752, + "learning_rate": 4.940303309446798e-05, + "loss": 0.4674, + "step": 34335 + }, + { + "epoch": 1.743359013085251, + "grad_norm": 0.020804078169891622, + "learning_rate": 4.9307064407554445e-05, + "loss": 0.4664, + "step": 34340 + }, + { + "epoch": 1.7436128491832825, + "grad_norm": 0.021704515629220097, + "learning_rate": 4.921118418941667e-05, + "loss": 0.4632, + "step": 34345 + }, + { + "epoch": 1.7438666852813138, + "grad_norm": 0.024256706115496882, + "learning_rate": 4.911539245887525e-05, + "loss": 0.4219, + "step": 34350 + }, + { + "epoch": 1.7441205213793454, + "grad_norm": 0.02298081741126669, + "learning_rate": 4.901968923473382e-05, + "loss": 0.4591, + "step": 34355 + }, + { + "epoch": 1.744374357477377, + "grad_norm": 0.03464863168116285, + "learning_rate": 4.8924074535778294e-05, + "loss": 0.446, + "step": 34360 + }, + { + "epoch": 1.7446281935754082, + "grad_norm": 0.02157577331468465, + "learning_rate": 4.882854838077755e-05, + "loss": 0.4657, + "step": 34365 + }, + { + "epoch": 1.7448820296734397, + "grad_norm": 0.019964492489978992, + "learning_rate": 4.873311078848264e-05, + "loss": 0.4643, + "step": 34370 + }, + { + "epoch": 1.7451358657714713, + "grad_norm": 0.022047232259986137, + "learning_rate": 4.863776177762769e-05, + "loss": 0.4639, + "step": 34375 + }, + { + "epoch": 1.7453897018695028, + "grad_norm": 0.03244136132931666, + "learning_rate": 4.854250136692912e-05, + "loss": 0.4822, + "step": 34380 + }, + { + "epoch": 1.7456435379675344, + "grad_norm": 0.026966628720723723, + "learning_rate": 4.844732957508607e-05, + "loss": 0.4612, + "step": 34385 + }, + { + "epoch": 1.7458973740655659, + "grad_norm": 0.021335951919062863, + "learning_rate": 4.8352246420780456e-05, + "loss": 0.44, + "step": 34390 + }, + { + "epoch": 1.7461512101635974, + "grad_norm": 0.021305484071550175, + "learning_rate": 4.825725192267638e-05, + "loss": 0.4573, + "step": 34395 + }, + { + "epoch": 1.746405046261629, + "grad_norm": 0.021326881921091657, + "learning_rate": 4.816234609942105e-05, + "loss": 0.4524, + "step": 34400 + }, + { + "epoch": 1.7466588823596605, + "grad_norm": 0.02188831813845589, + "learning_rate": 4.806752896964373e-05, + "loss": 0.4707, + "step": 34405 + }, + { + "epoch": 1.746912718457692, + "grad_norm": 0.02064699692483605, + "learning_rate": 4.79728005519568e-05, + "loss": 0.4565, + "step": 34410 + }, + { + "epoch": 1.7471665545557233, + "grad_norm": 0.021389953513019002, + "learning_rate": 4.787816086495478e-05, + "loss": 0.47, + "step": 34415 + }, + { + "epoch": 1.7474203906537549, + "grad_norm": 0.019822070124622355, + "learning_rate": 4.7783609927215145e-05, + "loss": 0.4244, + "step": 34420 + }, + { + "epoch": 1.7476742267517864, + "grad_norm": 0.01981125932792789, + "learning_rate": 4.7689147757297605e-05, + "loss": 0.4369, + "step": 34425 + }, + { + "epoch": 1.7479280628498177, + "grad_norm": 0.028737375632746403, + "learning_rate": 4.7594774373744766e-05, + "loss": 0.4647, + "step": 34430 + }, + { + "epoch": 1.7481818989478493, + "grad_norm": 0.031205219363603577, + "learning_rate": 4.750048979508148e-05, + "loss": 0.4611, + "step": 34435 + }, + { + "epoch": 1.7484357350458808, + "grad_norm": 0.022939345811528095, + "learning_rate": 4.7406294039815553e-05, + "loss": 0.4711, + "step": 34440 + }, + { + "epoch": 1.7486895711439123, + "grad_norm": 0.027141259222797177, + "learning_rate": 4.731218712643681e-05, + "loss": 0.4422, + "step": 34445 + }, + { + "epoch": 1.7489434072419439, + "grad_norm": 0.027268634228652108, + "learning_rate": 4.721816907341836e-05, + "loss": 0.4768, + "step": 34450 + }, + { + "epoch": 1.7491972433399754, + "grad_norm": 0.023701231135006304, + "learning_rate": 4.712423989921527e-05, + "loss": 0.4631, + "step": 34455 + }, + { + "epoch": 1.749451079438007, + "grad_norm": 0.02347054862041871, + "learning_rate": 4.703039962226541e-05, + "loss": 0.4619, + "step": 34460 + }, + { + "epoch": 1.7497049155360385, + "grad_norm": 0.028567040630409428, + "learning_rate": 4.693664826098909e-05, + "loss": 0.4311, + "step": 34465 + }, + { + "epoch": 1.74995875163407, + "grad_norm": 0.02148438393708741, + "learning_rate": 4.684298583378943e-05, + "loss": 0.4438, + "step": 34470 + }, + { + "epoch": 1.7502125877321015, + "grad_norm": 0.02216116188881408, + "learning_rate": 4.674941235905161e-05, + "loss": 0.4676, + "step": 34475 + }, + { + "epoch": 1.7504664238301328, + "grad_norm": 0.022665906180228006, + "learning_rate": 4.6655927855143886e-05, + "loss": 0.4453, + "step": 34480 + }, + { + "epoch": 1.7507202599281644, + "grad_norm": 0.022100912241708407, + "learning_rate": 4.656253234041663e-05, + "loss": 0.4854, + "step": 34485 + }, + { + "epoch": 1.750974096026196, + "grad_norm": 0.019646671652917273, + "learning_rate": 4.646922583320307e-05, + "loss": 0.4738, + "step": 34490 + }, + { + "epoch": 1.7512279321242272, + "grad_norm": 0.02278953025792618, + "learning_rate": 4.637600835181866e-05, + "loss": 0.469, + "step": 34495 + }, + { + "epoch": 1.7514817682222588, + "grad_norm": 0.022089585019886025, + "learning_rate": 4.6282879914561646e-05, + "loss": 0.4497, + "step": 34500 + }, + { + "epoch": 1.7517356043202903, + "grad_norm": 0.023445723973391685, + "learning_rate": 4.6189840539712534e-05, + "loss": 0.4551, + "step": 34505 + }, + { + "epoch": 1.7519894404183218, + "grad_norm": 0.02715037290748338, + "learning_rate": 4.609689024553459e-05, + "loss": 0.462, + "step": 34510 + }, + { + "epoch": 1.7522432765163534, + "grad_norm": 0.02174226702680522, + "learning_rate": 4.600402905027357e-05, + "loss": 0.4662, + "step": 34515 + }, + { + "epoch": 1.752497112614385, + "grad_norm": 0.021279501901226617, + "learning_rate": 4.5911256972157476e-05, + "loss": 0.4485, + "step": 34520 + }, + { + "epoch": 1.7527509487124164, + "grad_norm": 0.023627373642052033, + "learning_rate": 4.581857402939721e-05, + "loss": 0.4522, + "step": 34525 + }, + { + "epoch": 1.753004784810448, + "grad_norm": 0.0215796445898145, + "learning_rate": 4.572598024018571e-05, + "loss": 0.4708, + "step": 34530 + }, + { + "epoch": 1.7532586209084795, + "grad_norm": 0.022638555616672124, + "learning_rate": 4.563347562269898e-05, + "loss": 0.4391, + "step": 34535 + }, + { + "epoch": 1.753512457006511, + "grad_norm": 0.02165756905100213, + "learning_rate": 4.5541060195094965e-05, + "loss": 0.4557, + "step": 34540 + }, + { + "epoch": 1.7537662931045424, + "grad_norm": 0.021506773610349895, + "learning_rate": 4.5448733975514524e-05, + "loss": 0.4493, + "step": 34545 + }, + { + "epoch": 1.7540201292025739, + "grad_norm": 0.021106012563365185, + "learning_rate": 4.535649698208066e-05, + "loss": 0.4566, + "step": 34550 + }, + { + "epoch": 1.7542739653006054, + "grad_norm": 0.02090198796312075, + "learning_rate": 4.526434923289924e-05, + "loss": 0.4457, + "step": 34555 + }, + { + "epoch": 1.754527801398637, + "grad_norm": 0.02136853005412373, + "learning_rate": 4.517229074605822e-05, + "loss": 0.4747, + "step": 34560 + }, + { + "epoch": 1.7547816374966683, + "grad_norm": 0.02157527210244312, + "learning_rate": 4.508032153962832e-05, + "loss": 0.4704, + "step": 34565 + }, + { + "epoch": 1.7550354735946998, + "grad_norm": 0.023144728143769867, + "learning_rate": 4.49884416316626e-05, + "loss": 0.4704, + "step": 34570 + }, + { + "epoch": 1.7552893096927313, + "grad_norm": 0.020386305284952208, + "learning_rate": 4.489665104019675e-05, + "loss": 0.4557, + "step": 34575 + }, + { + "epoch": 1.7555431457907629, + "grad_norm": 0.022166335060348437, + "learning_rate": 4.4804949783248564e-05, + "loss": 0.46, + "step": 34580 + }, + { + "epoch": 1.7557969818887944, + "grad_norm": 0.02094210104633068, + "learning_rate": 4.471333787881881e-05, + "loss": 0.4561, + "step": 34585 + }, + { + "epoch": 1.756050817986826, + "grad_norm": 0.02140439011369098, + "learning_rate": 4.4621815344890235e-05, + "loss": 0.4253, + "step": 34590 + }, + { + "epoch": 1.7563046540848575, + "grad_norm": 0.023342207225477016, + "learning_rate": 4.453038219942845e-05, + "loss": 0.4902, + "step": 34595 + }, + { + "epoch": 1.756558490182889, + "grad_norm": 0.019041730139629988, + "learning_rate": 4.443903846038111e-05, + "loss": 0.4525, + "step": 34600 + }, + { + "epoch": 1.7568123262809205, + "grad_norm": 0.024800524118674366, + "learning_rate": 4.4347784145678695e-05, + "loss": 0.4606, + "step": 34605 + }, + { + "epoch": 1.7570661623789519, + "grad_norm": 0.02916453322026752, + "learning_rate": 4.425661927323388e-05, + "loss": 0.4643, + "step": 34610 + }, + { + "epoch": 1.7573199984769834, + "grad_norm": 0.024905988994358454, + "learning_rate": 4.416554386094196e-05, + "loss": 0.437, + "step": 34615 + }, + { + "epoch": 1.757573834575015, + "grad_norm": 0.030406574388976055, + "learning_rate": 4.407455792668047e-05, + "loss": 0.4883, + "step": 34620 + }, + { + "epoch": 1.7578276706730465, + "grad_norm": 0.020939906014040045, + "learning_rate": 4.3983661488309565e-05, + "loss": 0.4282, + "step": 34625 + }, + { + "epoch": 1.7580815067710778, + "grad_norm": 0.022365655581909887, + "learning_rate": 4.389285456367181e-05, + "loss": 0.478, + "step": 34630 + }, + { + "epoch": 1.7583353428691093, + "grad_norm": 0.023054617223912062, + "learning_rate": 4.380213717059206e-05, + "loss": 0.4749, + "step": 34635 + }, + { + "epoch": 1.7585891789671408, + "grad_norm": 0.021279493374970884, + "learning_rate": 4.371150932687784e-05, + "loss": 0.4535, + "step": 34640 + }, + { + "epoch": 1.7588430150651724, + "grad_norm": 0.021608983124179647, + "learning_rate": 4.3620971050318706e-05, + "loss": 0.452, + "step": 34645 + }, + { + "epoch": 1.759096851163204, + "grad_norm": 0.02125252639391023, + "learning_rate": 4.3530522358687045e-05, + "loss": 0.4427, + "step": 34650 + }, + { + "epoch": 1.7593506872612354, + "grad_norm": 0.021716546978771272, + "learning_rate": 4.3440163269737374e-05, + "loss": 0.4551, + "step": 34655 + }, + { + "epoch": 1.759604523359267, + "grad_norm": 0.022723743581024202, + "learning_rate": 4.334989380120691e-05, + "loss": 0.4373, + "step": 34660 + }, + { + "epoch": 1.7598583594572985, + "grad_norm": 0.023783058996820775, + "learning_rate": 4.3259713970814904e-05, + "loss": 0.471, + "step": 34665 + }, + { + "epoch": 1.76011219555533, + "grad_norm": 0.022842675520249336, + "learning_rate": 4.316962379626333e-05, + "loss": 0.4575, + "step": 34670 + }, + { + "epoch": 1.7603660316533616, + "grad_norm": 0.024246326137658224, + "learning_rate": 4.3079623295236345e-05, + "loss": 0.4028, + "step": 34675 + }, + { + "epoch": 1.760619867751393, + "grad_norm": 0.025535918723743688, + "learning_rate": 4.298971248540068e-05, + "loss": 0.4636, + "step": 34680 + }, + { + "epoch": 1.7608737038494244, + "grad_norm": 0.022390889197142872, + "learning_rate": 4.2899891384405196e-05, + "loss": 0.4283, + "step": 34685 + }, + { + "epoch": 1.761127539947456, + "grad_norm": 0.025131303601364605, + "learning_rate": 4.281016000988169e-05, + "loss": 0.4524, + "step": 34690 + }, + { + "epoch": 1.7613813760454873, + "grad_norm": 0.02186743401814803, + "learning_rate": 4.2720518379443684e-05, + "loss": 0.4497, + "step": 34695 + }, + { + "epoch": 1.7616352121435188, + "grad_norm": 0.02114715992999132, + "learning_rate": 4.263096651068754e-05, + "loss": 0.4662, + "step": 34700 + }, + { + "epoch": 1.7618890482415503, + "grad_norm": 0.028404232206161735, + "learning_rate": 4.254150442119164e-05, + "loss": 0.4742, + "step": 34705 + }, + { + "epoch": 1.7621428843395819, + "grad_norm": 0.02630860958767224, + "learning_rate": 4.2452132128517226e-05, + "loss": 0.4296, + "step": 34710 + }, + { + "epoch": 1.7623967204376134, + "grad_norm": 0.024673175731050732, + "learning_rate": 4.236284965020737e-05, + "loss": 0.4308, + "step": 34715 + }, + { + "epoch": 1.762650556535645, + "grad_norm": 0.02180317292962621, + "learning_rate": 4.227365700378799e-05, + "loss": 0.4469, + "step": 34720 + }, + { + "epoch": 1.7629043926336765, + "grad_norm": 0.0214905144095388, + "learning_rate": 4.2184554206767034e-05, + "loss": 0.4723, + "step": 34725 + }, + { + "epoch": 1.763158228731708, + "grad_norm": 0.021720539302091973, + "learning_rate": 4.209554127663495e-05, + "loss": 0.4465, + "step": 34730 + }, + { + "epoch": 1.7634120648297396, + "grad_norm": 0.02368170581782568, + "learning_rate": 4.200661823086454e-05, + "loss": 0.4699, + "step": 34735 + }, + { + "epoch": 1.763665900927771, + "grad_norm": 0.022332063931247034, + "learning_rate": 4.191778508691102e-05, + "loss": 0.4846, + "step": 34740 + }, + { + "epoch": 1.7639197370258024, + "grad_norm": 0.021873430868819217, + "learning_rate": 4.182904186221176e-05, + "loss": 0.4475, + "step": 34745 + }, + { + "epoch": 1.764173573123834, + "grad_norm": 0.018510023864084377, + "learning_rate": 4.174038857418666e-05, + "loss": 0.4087, + "step": 34750 + }, + { + "epoch": 1.7644274092218655, + "grad_norm": 0.020083902539849405, + "learning_rate": 4.165182524023803e-05, + "loss": 0.4578, + "step": 34755 + }, + { + "epoch": 1.7646812453198968, + "grad_norm": 0.01956729272999206, + "learning_rate": 4.156335187775029e-05, + "loss": 0.432, + "step": 34760 + }, + { + "epoch": 1.7649350814179283, + "grad_norm": 0.033417341393678385, + "learning_rate": 4.1474968504090385e-05, + "loss": 0.4514, + "step": 34765 + }, + { + "epoch": 1.7651889175159599, + "grad_norm": 0.027037751978040633, + "learning_rate": 4.1386675136607434e-05, + "loss": 0.4497, + "step": 34770 + }, + { + "epoch": 1.7654427536139914, + "grad_norm": 0.02221486375997609, + "learning_rate": 4.129847179263318e-05, + "loss": 0.4261, + "step": 34775 + }, + { + "epoch": 1.765696589712023, + "grad_norm": 0.02085106880362318, + "learning_rate": 4.121035848948124e-05, + "loss": 0.4639, + "step": 34780 + }, + { + "epoch": 1.7659504258100545, + "grad_norm": 0.023019324461147312, + "learning_rate": 4.112233524444803e-05, + "loss": 0.4712, + "step": 34785 + }, + { + "epoch": 1.766204261908086, + "grad_norm": 0.019887284736623528, + "learning_rate": 4.103440207481196e-05, + "loss": 0.4421, + "step": 34790 + }, + { + "epoch": 1.7664580980061175, + "grad_norm": 0.0244559527736879, + "learning_rate": 4.094655899783395e-05, + "loss": 0.4429, + "step": 34795 + }, + { + "epoch": 1.766711934104149, + "grad_norm": 0.023135446169896694, + "learning_rate": 4.085880603075703e-05, + "loss": 0.435, + "step": 34800 + }, + { + "epoch": 1.7669657702021806, + "grad_norm": 0.020742048909337243, + "learning_rate": 4.077114319080671e-05, + "loss": 0.4307, + "step": 34805 + }, + { + "epoch": 1.767219606300212, + "grad_norm": 0.02392298449389485, + "learning_rate": 4.068357049519089e-05, + "loss": 0.4433, + "step": 34810 + }, + { + "epoch": 1.7674734423982434, + "grad_norm": 0.023622721434002313, + "learning_rate": 4.0596087961099595e-05, + "loss": 0.4809, + "step": 34815 + }, + { + "epoch": 1.767727278496275, + "grad_norm": 0.02007693135153689, + "learning_rate": 4.0508695605705136e-05, + "loss": 0.443, + "step": 34820 + }, + { + "epoch": 1.7679811145943063, + "grad_norm": 0.020604137444585325, + "learning_rate": 4.042139344616236e-05, + "loss": 0.45, + "step": 34825 + }, + { + "epoch": 1.7682349506923378, + "grad_norm": 0.021241405566437443, + "learning_rate": 4.033418149960799e-05, + "loss": 0.4537, + "step": 34830 + }, + { + "epoch": 1.7684887867903694, + "grad_norm": 0.02142638288944903, + "learning_rate": 4.0247059783161565e-05, + "loss": 0.4272, + "step": 34835 + }, + { + "epoch": 1.768742622888401, + "grad_norm": 0.022520249913372568, + "learning_rate": 4.0160028313924456e-05, + "loss": 0.4789, + "step": 34840 + }, + { + "epoch": 1.7689964589864324, + "grad_norm": 0.021175421917149957, + "learning_rate": 4.007308710898061e-05, + "loss": 0.4452, + "step": 34845 + }, + { + "epoch": 1.769250295084464, + "grad_norm": 0.020501338142536445, + "learning_rate": 3.998623618539604e-05, + "loss": 0.4764, + "step": 34850 + }, + { + "epoch": 1.7695041311824955, + "grad_norm": 0.019904516522167997, + "learning_rate": 3.9899475560219336e-05, + "loss": 0.4533, + "step": 34855 + }, + { + "epoch": 1.769757967280527, + "grad_norm": 0.023716865857408845, + "learning_rate": 3.981280525048098e-05, + "loss": 0.4717, + "step": 34860 + }, + { + "epoch": 1.7700118033785586, + "grad_norm": 0.020968802334100606, + "learning_rate": 3.972622527319397e-05, + "loss": 0.4447, + "step": 34865 + }, + { + "epoch": 1.77026563947659, + "grad_norm": 0.0220213821940423, + "learning_rate": 3.963973564535361e-05, + "loss": 0.4418, + "step": 34870 + }, + { + "epoch": 1.7705194755746214, + "grad_norm": 0.020738385170455637, + "learning_rate": 3.955333638393732e-05, + "loss": 0.434, + "step": 34875 + }, + { + "epoch": 1.770773311672653, + "grad_norm": 0.023096359415643572, + "learning_rate": 3.9467027505904916e-05, + "loss": 0.4299, + "step": 34880 + }, + { + "epoch": 1.7710271477706845, + "grad_norm": 0.0199432213339235, + "learning_rate": 3.938080902819824e-05, + "loss": 0.4624, + "step": 34885 + }, + { + "epoch": 1.771280983868716, + "grad_norm": 0.026960951902493344, + "learning_rate": 3.929468096774175e-05, + "loss": 0.4648, + "step": 34890 + }, + { + "epoch": 1.7715348199667473, + "grad_norm": 0.025299529980017853, + "learning_rate": 3.92086433414417e-05, + "loss": 0.4428, + "step": 34895 + }, + { + "epoch": 1.7717886560647789, + "grad_norm": 0.02174184002171596, + "learning_rate": 3.9122696166187186e-05, + "loss": 0.4534, + "step": 34900 + }, + { + "epoch": 1.7720424921628104, + "grad_norm": 0.024345802063285356, + "learning_rate": 3.903683945884884e-05, + "loss": 0.4454, + "step": 34905 + }, + { + "epoch": 1.772296328260842, + "grad_norm": 0.023452333877164416, + "learning_rate": 3.895107323628022e-05, + "loss": 0.4538, + "step": 34910 + }, + { + "epoch": 1.7725501643588735, + "grad_norm": 0.019713235902259953, + "learning_rate": 3.8865397515316645e-05, + "loss": 0.442, + "step": 34915 + }, + { + "epoch": 1.772804000456905, + "grad_norm": 0.01862175117732225, + "learning_rate": 3.8779812312775885e-05, + "loss": 0.4352, + "step": 34920 + }, + { + "epoch": 1.7730578365549365, + "grad_norm": 0.02407320867951667, + "learning_rate": 3.869431764545772e-05, + "loss": 0.4435, + "step": 34925 + }, + { + "epoch": 1.773311672652968, + "grad_norm": 0.02675629662340058, + "learning_rate": 3.860891353014462e-05, + "loss": 0.4357, + "step": 34930 + }, + { + "epoch": 1.7735655087509996, + "grad_norm": 0.027161122772953582, + "learning_rate": 3.8523599983600776e-05, + "loss": 0.4311, + "step": 34935 + }, + { + "epoch": 1.7738193448490311, + "grad_norm": 0.023763895197796703, + "learning_rate": 3.843837702257291e-05, + "loss": 0.4361, + "step": 34940 + }, + { + "epoch": 1.7740731809470625, + "grad_norm": 0.02520329920417153, + "learning_rate": 3.835324466378981e-05, + "loss": 0.4608, + "step": 34945 + }, + { + "epoch": 1.774327017045094, + "grad_norm": 0.026335133730435882, + "learning_rate": 3.82682029239626e-05, + "loss": 0.4425, + "step": 34950 + }, + { + "epoch": 1.7745808531431255, + "grad_norm": 0.02077693291797904, + "learning_rate": 3.8183251819784436e-05, + "loss": 0.4415, + "step": 34955 + }, + { + "epoch": 1.7748346892411568, + "grad_norm": 0.021994750743455038, + "learning_rate": 3.8098391367930976e-05, + "loss": 0.4816, + "step": 34960 + }, + { + "epoch": 1.7750885253391884, + "grad_norm": 0.022292288456564727, + "learning_rate": 3.8013621585059665e-05, + "loss": 0.4741, + "step": 34965 + }, + { + "epoch": 1.77534236143722, + "grad_norm": 0.022199822217541125, + "learning_rate": 3.7928942487810594e-05, + "loss": 0.4602, + "step": 34970 + }, + { + "epoch": 1.7755961975352514, + "grad_norm": 0.026501730743718647, + "learning_rate": 3.7844354092805735e-05, + "loss": 0.451, + "step": 34975 + }, + { + "epoch": 1.775850033633283, + "grad_norm": 0.02266914903426037, + "learning_rate": 3.775985641664942e-05, + "loss": 0.4661, + "step": 34980 + }, + { + "epoch": 1.7761038697313145, + "grad_norm": 0.01942723439600507, + "learning_rate": 3.767544947592805e-05, + "loss": 0.4289, + "step": 34985 + }, + { + "epoch": 1.776357705829346, + "grad_norm": 0.021825648976358702, + "learning_rate": 3.759113328721036e-05, + "loss": 0.433, + "step": 34990 + }, + { + "epoch": 1.7766115419273776, + "grad_norm": 0.023665285889932587, + "learning_rate": 3.750690786704725e-05, + "loss": 0.4506, + "step": 34995 + }, + { + "epoch": 1.7768653780254091, + "grad_norm": 0.022461997989559473, + "learning_rate": 3.742277323197158e-05, + "loss": 0.4552, + "step": 35000 + }, + { + "epoch": 1.7771192141234406, + "grad_norm": 0.024581088277719675, + "learning_rate": 3.733872939849875e-05, + "loss": 0.4634, + "step": 35005 + }, + { + "epoch": 1.777373050221472, + "grad_norm": 0.026586649366082245, + "learning_rate": 3.725477638312591e-05, + "loss": 0.4461, + "step": 35010 + }, + { + "epoch": 1.7776268863195035, + "grad_norm": 0.029002282206786262, + "learning_rate": 3.717091420233293e-05, + "loss": 0.4392, + "step": 35015 + }, + { + "epoch": 1.777880722417535, + "grad_norm": 0.025326290885756917, + "learning_rate": 3.708714287258125e-05, + "loss": 0.4314, + "step": 35020 + }, + { + "epoch": 1.7781345585155663, + "grad_norm": 0.025316121879115406, + "learning_rate": 3.700346241031494e-05, + "loss": 0.4434, + "step": 35025 + }, + { + "epoch": 1.7783883946135979, + "grad_norm": 0.02063467888087572, + "learning_rate": 3.691987283195991e-05, + "loss": 0.4665, + "step": 35030 + }, + { + "epoch": 1.7786422307116294, + "grad_norm": 0.022082244394205426, + "learning_rate": 3.68363741539246e-05, + "loss": 0.4657, + "step": 35035 + }, + { + "epoch": 1.778896066809661, + "grad_norm": 0.024010484572527937, + "learning_rate": 3.675296639259912e-05, + "loss": 0.4751, + "step": 35040 + }, + { + "epoch": 1.7791499029076925, + "grad_norm": 0.020949059566116365, + "learning_rate": 3.66696495643562e-05, + "loss": 0.4485, + "step": 35045 + }, + { + "epoch": 1.779403739005724, + "grad_norm": 0.02321244261448484, + "learning_rate": 3.6586423685550374e-05, + "loss": 0.4597, + "step": 35050 + }, + { + "epoch": 1.7796575751037556, + "grad_norm": 0.02673174516429816, + "learning_rate": 3.6503288772518626e-05, + "loss": 0.4677, + "step": 35055 + }, + { + "epoch": 1.779911411201787, + "grad_norm": 0.020598141046026204, + "learning_rate": 3.64202448415798e-05, + "loss": 0.4393, + "step": 35060 + }, + { + "epoch": 1.7801652472998186, + "grad_norm": 0.02209046479234219, + "learning_rate": 3.6337291909035065e-05, + "loss": 0.4413, + "step": 35065 + }, + { + "epoch": 1.7804190833978502, + "grad_norm": 0.027347973082832013, + "learning_rate": 3.625442999116763e-05, + "loss": 0.4273, + "step": 35070 + }, + { + "epoch": 1.7806729194958815, + "grad_norm": 0.02196378431429979, + "learning_rate": 3.6171659104242914e-05, + "loss": 0.4579, + "step": 35075 + }, + { + "epoch": 1.780926755593913, + "grad_norm": 0.03402444971932555, + "learning_rate": 3.608897926450838e-05, + "loss": 0.4532, + "step": 35080 + }, + { + "epoch": 1.7811805916919445, + "grad_norm": 0.025631046835950563, + "learning_rate": 3.600639048819371e-05, + "loss": 0.4314, + "step": 35085 + }, + { + "epoch": 1.7814344277899758, + "grad_norm": 0.023888969294360688, + "learning_rate": 3.592389279151065e-05, + "loss": 0.4677, + "step": 35090 + }, + { + "epoch": 1.7816882638880074, + "grad_norm": 0.020867066218234958, + "learning_rate": 3.584148619065314e-05, + "loss": 0.4444, + "step": 35095 + }, + { + "epoch": 1.781942099986039, + "grad_norm": 0.023266323128579264, + "learning_rate": 3.575917070179702e-05, + "loss": 0.4331, + "step": 35100 + }, + { + "epoch": 1.7821959360840705, + "grad_norm": 0.02512759163952341, + "learning_rate": 3.567694634110058e-05, + "loss": 0.4822, + "step": 35105 + }, + { + "epoch": 1.782449772182102, + "grad_norm": 0.021294948101753024, + "learning_rate": 3.559481312470403e-05, + "loss": 0.4744, + "step": 35110 + }, + { + "epoch": 1.7827036082801335, + "grad_norm": 0.021426087304758333, + "learning_rate": 3.551277106872963e-05, + "loss": 0.4677, + "step": 35115 + }, + { + "epoch": 1.782957444378165, + "grad_norm": 0.02346702339510721, + "learning_rate": 3.5430820189281954e-05, + "loss": 0.4692, + "step": 35120 + }, + { + "epoch": 1.7832112804761966, + "grad_norm": 0.020925318483005197, + "learning_rate": 3.53489605024474e-05, + "loss": 0.4804, + "step": 35125 + }, + { + "epoch": 1.7834651165742281, + "grad_norm": 0.024026448655141502, + "learning_rate": 3.526719202429474e-05, + "loss": 0.4871, + "step": 35130 + }, + { + "epoch": 1.7837189526722597, + "grad_norm": 0.02135052995935449, + "learning_rate": 3.518551477087462e-05, + "loss": 0.4451, + "step": 35135 + }, + { + "epoch": 1.783972788770291, + "grad_norm": 0.03284714110137373, + "learning_rate": 3.5103928758219995e-05, + "loss": 0.4731, + "step": 35140 + }, + { + "epoch": 1.7842266248683225, + "grad_norm": 0.021457290842253285, + "learning_rate": 3.5022434002345615e-05, + "loss": 0.4752, + "step": 35145 + }, + { + "epoch": 1.784480460966354, + "grad_norm": 0.030062390522041568, + "learning_rate": 3.4941030519248685e-05, + "loss": 0.4536, + "step": 35150 + }, + { + "epoch": 1.7847342970643856, + "grad_norm": 0.02160896181203855, + "learning_rate": 3.485971832490814e-05, + "loss": 0.4567, + "step": 35155 + }, + { + "epoch": 1.784988133162417, + "grad_norm": 0.022948769188133194, + "learning_rate": 3.477849743528533e-05, + "loss": 0.45, + "step": 35160 + }, + { + "epoch": 1.7852419692604484, + "grad_norm": 0.023052274795903466, + "learning_rate": 3.469736786632327e-05, + "loss": 0.4513, + "step": 35165 + }, + { + "epoch": 1.78549580535848, + "grad_norm": 0.02387145233098405, + "learning_rate": 3.461632963394756e-05, + "loss": 0.4534, + "step": 35170 + }, + { + "epoch": 1.7857496414565115, + "grad_norm": 0.02398398740621941, + "learning_rate": 3.453538275406542e-05, + "loss": 0.454, + "step": 35175 + }, + { + "epoch": 1.786003477554543, + "grad_norm": 0.024153894469605582, + "learning_rate": 3.445452724256648e-05, + "loss": 0.4466, + "step": 35180 + }, + { + "epoch": 1.7862573136525746, + "grad_norm": 0.02474750346228495, + "learning_rate": 3.437376311532209e-05, + "loss": 0.4586, + "step": 35185 + }, + { + "epoch": 1.786511149750606, + "grad_norm": 0.021820142241289246, + "learning_rate": 3.4293090388185955e-05, + "loss": 0.4259, + "step": 35190 + }, + { + "epoch": 1.7867649858486376, + "grad_norm": 0.021702335087422333, + "learning_rate": 3.421250907699369e-05, + "loss": 0.4538, + "step": 35195 + }, + { + "epoch": 1.7870188219466692, + "grad_norm": 0.025281488724625104, + "learning_rate": 3.413201919756304e-05, + "loss": 0.4311, + "step": 35200 + }, + { + "epoch": 1.7872726580447007, + "grad_norm": 0.022103622641024265, + "learning_rate": 3.4051620765693734e-05, + "loss": 0.4543, + "step": 35205 + }, + { + "epoch": 1.787526494142732, + "grad_norm": 0.020184392908854817, + "learning_rate": 3.3971313797167555e-05, + "loss": 0.4551, + "step": 35210 + }, + { + "epoch": 1.7877803302407635, + "grad_norm": 0.024327540101643993, + "learning_rate": 3.389109830774845e-05, + "loss": 0.462, + "step": 35215 + }, + { + "epoch": 1.788034166338795, + "grad_norm": 0.018786966111353623, + "learning_rate": 3.38109743131822e-05, + "loss": 0.4489, + "step": 35220 + }, + { + "epoch": 1.7882880024368264, + "grad_norm": 0.024298565666928316, + "learning_rate": 3.373094182919678e-05, + "loss": 0.4377, + "step": 35225 + }, + { + "epoch": 1.788541838534858, + "grad_norm": 0.025256385511008395, + "learning_rate": 3.3651000871502245e-05, + "loss": 0.4503, + "step": 35230 + }, + { + "epoch": 1.7887956746328895, + "grad_norm": 0.02722898777164149, + "learning_rate": 3.357115145579059e-05, + "loss": 0.4697, + "step": 35235 + }, + { + "epoch": 1.789049510730921, + "grad_norm": 0.022525515289120692, + "learning_rate": 3.3491393597735786e-05, + "loss": 0.4664, + "step": 35240 + }, + { + "epoch": 1.7893033468289525, + "grad_norm": 0.02377488171334522, + "learning_rate": 3.341172731299402e-05, + "loss": 0.4654, + "step": 35245 + }, + { + "epoch": 1.789557182926984, + "grad_norm": 0.020576451658282147, + "learning_rate": 3.3332152617203237e-05, + "loss": 0.4412, + "step": 35250 + }, + { + "epoch": 1.7898110190250156, + "grad_norm": 0.019862554906292372, + "learning_rate": 3.325266952598366e-05, + "loss": 0.4613, + "step": 35255 + }, + { + "epoch": 1.7900648551230471, + "grad_norm": 0.025920687548760694, + "learning_rate": 3.317327805493736e-05, + "loss": 0.4409, + "step": 35260 + }, + { + "epoch": 1.7903186912210787, + "grad_norm": 0.025050850533418417, + "learning_rate": 3.3093978219648605e-05, + "loss": 0.487, + "step": 35265 + }, + { + "epoch": 1.7905725273191102, + "grad_norm": 0.021263072671365613, + "learning_rate": 3.3014770035683315e-05, + "loss": 0.449, + "step": 35270 + }, + { + "epoch": 1.7908263634171415, + "grad_norm": 0.021022969296222715, + "learning_rate": 3.293565351858996e-05, + "loss": 0.4434, + "step": 35275 + }, + { + "epoch": 1.791080199515173, + "grad_norm": 0.02133416611869445, + "learning_rate": 3.285662868389849e-05, + "loss": 0.4612, + "step": 35280 + }, + { + "epoch": 1.7913340356132046, + "grad_norm": 0.021363627371597606, + "learning_rate": 3.2777695547121236e-05, + "loss": 0.4463, + "step": 35285 + }, + { + "epoch": 1.791587871711236, + "grad_norm": 0.023294959319734358, + "learning_rate": 3.269885412375223e-05, + "loss": 0.4843, + "step": 35290 + }, + { + "epoch": 1.7918417078092674, + "grad_norm": 0.020389213955970452, + "learning_rate": 3.262010442926772e-05, + "loss": 0.4228, + "step": 35295 + }, + { + "epoch": 1.792095543907299, + "grad_norm": 0.020902326598321523, + "learning_rate": 3.254144647912599e-05, + "loss": 0.4467, + "step": 35300 + }, + { + "epoch": 1.7923493800053305, + "grad_norm": 0.029630242897729194, + "learning_rate": 3.246288028876704e-05, + "loss": 0.4303, + "step": 35305 + }, + { + "epoch": 1.792603216103362, + "grad_norm": 0.02027047342735473, + "learning_rate": 3.2384405873613134e-05, + "loss": 0.4566, + "step": 35310 + }, + { + "epoch": 1.7928570522013936, + "grad_norm": 0.02183801483079023, + "learning_rate": 3.2306023249068285e-05, + "loss": 0.4414, + "step": 35315 + }, + { + "epoch": 1.793110888299425, + "grad_norm": 0.022104767365956915, + "learning_rate": 3.22277324305188e-05, + "loss": 0.4563, + "step": 35320 + }, + { + "epoch": 1.7933647243974566, + "grad_norm": 0.02103106803838072, + "learning_rate": 3.214953343333255e-05, + "loss": 0.4399, + "step": 35325 + }, + { + "epoch": 1.7936185604954882, + "grad_norm": 0.025480846033686194, + "learning_rate": 3.20714262728598e-05, + "loss": 0.4418, + "step": 35330 + }, + { + "epoch": 1.7938723965935197, + "grad_norm": 0.026910726059110684, + "learning_rate": 3.1993410964432424e-05, + "loss": 0.4787, + "step": 35335 + }, + { + "epoch": 1.794126232691551, + "grad_norm": 0.03201969008252627, + "learning_rate": 3.1915487523364596e-05, + "loss": 0.4484, + "step": 35340 + }, + { + "epoch": 1.7943800687895826, + "grad_norm": 0.030531459636693972, + "learning_rate": 3.18376559649522e-05, + "loss": 0.4485, + "step": 35345 + }, + { + "epoch": 1.794633904887614, + "grad_norm": 0.02274451247134649, + "learning_rate": 3.175991630447322e-05, + "loss": 0.4718, + "step": 35350 + }, + { + "epoch": 1.7948877409856454, + "grad_norm": 0.02435117079584466, + "learning_rate": 3.1682268557187535e-05, + "loss": 0.436, + "step": 35355 + }, + { + "epoch": 1.795141577083677, + "grad_norm": 0.024073899530536718, + "learning_rate": 3.160471273833709e-05, + "loss": 0.4625, + "step": 35360 + }, + { + "epoch": 1.7953954131817085, + "grad_norm": 0.023077987822385543, + "learning_rate": 3.152724886314562e-05, + "loss": 0.4367, + "step": 35365 + }, + { + "epoch": 1.79564924927974, + "grad_norm": 0.024014102288831315, + "learning_rate": 3.1449876946819e-05, + "loss": 0.4771, + "step": 35370 + }, + { + "epoch": 1.7959030853777715, + "grad_norm": 0.022994443638095826, + "learning_rate": 3.137259700454481e-05, + "loss": 0.4728, + "step": 35375 + }, + { + "epoch": 1.796156921475803, + "grad_norm": 0.019915951884140817, + "learning_rate": 3.129540905149281e-05, + "loss": 0.4655, + "step": 35380 + }, + { + "epoch": 1.7964107575738346, + "grad_norm": 0.022563204784642146, + "learning_rate": 3.121831310281459e-05, + "loss": 0.4764, + "step": 35385 + }, + { + "epoch": 1.7966645936718662, + "grad_norm": 0.02124227375276603, + "learning_rate": 3.114130917364372e-05, + "loss": 0.4419, + "step": 35390 + }, + { + "epoch": 1.7969184297698977, + "grad_norm": 0.023126356296767688, + "learning_rate": 3.10643972790956e-05, + "loss": 0.4736, + "step": 35395 + }, + { + "epoch": 1.7971722658679292, + "grad_norm": 0.026053218593494146, + "learning_rate": 3.098757743426778e-05, + "loss": 0.4739, + "step": 35400 + }, + { + "epoch": 1.7974261019659605, + "grad_norm": 0.023364066512373553, + "learning_rate": 3.0910849654239456e-05, + "loss": 0.4504, + "step": 35405 + }, + { + "epoch": 1.797679938063992, + "grad_norm": 0.027082071431975455, + "learning_rate": 3.0834213954072046e-05, + "loss": 0.4452, + "step": 35410 + }, + { + "epoch": 1.7979337741620236, + "grad_norm": 0.02380009827362938, + "learning_rate": 3.0757670348808774e-05, + "loss": 0.4494, + "step": 35415 + }, + { + "epoch": 1.7981876102600551, + "grad_norm": 0.033607420854026, + "learning_rate": 3.0681218853474636e-05, + "loss": 0.4235, + "step": 35420 + }, + { + "epoch": 1.7984414463580864, + "grad_norm": 0.02321159683766678, + "learning_rate": 3.0604859483076785e-05, + "loss": 0.4543, + "step": 35425 + }, + { + "epoch": 1.798695282456118, + "grad_norm": 0.03605534174130652, + "learning_rate": 3.0528592252604126e-05, + "loss": 0.4683, + "step": 35430 + }, + { + "epoch": 1.7989491185541495, + "grad_norm": 0.024039033525210644, + "learning_rate": 3.045241717702757e-05, + "loss": 0.4711, + "step": 35435 + }, + { + "epoch": 1.799202954652181, + "grad_norm": 0.022549638573687806, + "learning_rate": 3.0376334271299878e-05, + "loss": 0.467, + "step": 35440 + }, + { + "epoch": 1.7994567907502126, + "grad_norm": 0.021025752959848737, + "learning_rate": 3.0300343550355767e-05, + "loss": 0.4471, + "step": 35445 + }, + { + "epoch": 1.7997106268482441, + "grad_norm": 0.02042717498913915, + "learning_rate": 3.0224445029111812e-05, + "loss": 0.4593, + "step": 35450 + }, + { + "epoch": 1.7999644629462757, + "grad_norm": 0.01959098368966413, + "learning_rate": 3.0148638722466593e-05, + "loss": 0.4566, + "step": 35455 + }, + { + "epoch": 1.8002182990443072, + "grad_norm": 0.0388203089794261, + "learning_rate": 3.007292464530037e-05, + "loss": 0.4778, + "step": 35460 + }, + { + "epoch": 1.8004721351423387, + "grad_norm": 0.022497742943539102, + "learning_rate": 2.9997302812475592e-05, + "loss": 0.4565, + "step": 35465 + }, + { + "epoch": 1.80072597124037, + "grad_norm": 0.023761047149701668, + "learning_rate": 2.9921773238836215e-05, + "loss": 0.4653, + "step": 35470 + }, + { + "epoch": 1.8009798073384016, + "grad_norm": 0.021599826773256652, + "learning_rate": 2.9846335939208602e-05, + "loss": 0.437, + "step": 35475 + }, + { + "epoch": 1.801233643436433, + "grad_norm": 0.02511013717130302, + "learning_rate": 2.9770990928400575e-05, + "loss": 0.4416, + "step": 35480 + }, + { + "epoch": 1.8014874795344646, + "grad_norm": 0.02204277337871591, + "learning_rate": 2.969573822120203e-05, + "loss": 0.4594, + "step": 35485 + }, + { + "epoch": 1.801741315632496, + "grad_norm": 0.022231649077976815, + "learning_rate": 2.9620577832384643e-05, + "loss": 0.4725, + "step": 35490 + }, + { + "epoch": 1.8019951517305275, + "grad_norm": 0.02225484721568446, + "learning_rate": 2.9545509776702062e-05, + "loss": 0.448, + "step": 35495 + }, + { + "epoch": 1.802248987828559, + "grad_norm": 0.022235212450463348, + "learning_rate": 2.947053406888972e-05, + "loss": 0.4285, + "step": 35500 + }, + { + "epoch": 1.8025028239265906, + "grad_norm": 0.020744906938457715, + "learning_rate": 2.939565072366507e-05, + "loss": 0.4364, + "step": 35505 + }, + { + "epoch": 1.802756660024622, + "grad_norm": 0.02218934079642219, + "learning_rate": 2.9320859755727238e-05, + "loss": 0.4372, + "step": 35510 + }, + { + "epoch": 1.8030104961226536, + "grad_norm": 0.021558249002831046, + "learning_rate": 2.9246161179757425e-05, + "loss": 0.4534, + "step": 35515 + }, + { + "epoch": 1.8032643322206852, + "grad_norm": 0.02400375345216303, + "learning_rate": 2.9171555010418404e-05, + "loss": 0.43, + "step": 35520 + }, + { + "epoch": 1.8035181683187167, + "grad_norm": 0.0224200032122597, + "learning_rate": 2.909704126235524e-05, + "loss": 0.4536, + "step": 35525 + }, + { + "epoch": 1.8037720044167482, + "grad_norm": 0.023048378811341624, + "learning_rate": 2.9022619950194395e-05, + "loss": 0.4554, + "step": 35530 + }, + { + "epoch": 1.8040258405147798, + "grad_norm": 0.026029749704731063, + "learning_rate": 2.8948291088544522e-05, + "loss": 0.4645, + "step": 35535 + }, + { + "epoch": 1.804279676612811, + "grad_norm": 0.019985537420418396, + "learning_rate": 2.8874054691996054e-05, + "loss": 0.445, + "step": 35540 + }, + { + "epoch": 1.8045335127108426, + "grad_norm": 0.022666606334347597, + "learning_rate": 2.8799910775121008e-05, + "loss": 0.4507, + "step": 35545 + }, + { + "epoch": 1.8047873488088741, + "grad_norm": 0.020489511769417657, + "learning_rate": 2.8725859352473737e-05, + "loss": 0.4697, + "step": 35550 + }, + { + "epoch": 1.8050411849069055, + "grad_norm": 0.02213113768181123, + "learning_rate": 2.865190043858995e-05, + "loss": 0.4638, + "step": 35555 + }, + { + "epoch": 1.805295021004937, + "grad_norm": 0.028290545411715884, + "learning_rate": 2.8578034047987587e-05, + "loss": 0.4496, + "step": 35560 + }, + { + "epoch": 1.8055488571029685, + "grad_norm": 0.02059738323425534, + "learning_rate": 2.8504260195166055e-05, + "loss": 0.4598, + "step": 35565 + }, + { + "epoch": 1.805802693201, + "grad_norm": 0.02560738205998867, + "learning_rate": 2.8430578894606985e-05, + "loss": 0.4699, + "step": 35570 + }, + { + "epoch": 1.8060565292990316, + "grad_norm": 0.02077551935432808, + "learning_rate": 2.8356990160773534e-05, + "loss": 0.4605, + "step": 35575 + }, + { + "epoch": 1.8063103653970631, + "grad_norm": 0.02698118825215807, + "learning_rate": 2.8283494008110867e-05, + "loss": 0.4439, + "step": 35580 + }, + { + "epoch": 1.8065642014950947, + "grad_norm": 0.021234911775022253, + "learning_rate": 2.821009045104578e-05, + "loss": 0.4599, + "step": 35585 + }, + { + "epoch": 1.8068180375931262, + "grad_norm": 0.023435049130171297, + "learning_rate": 2.8136779503987186e-05, + "loss": 0.4547, + "step": 35590 + }, + { + "epoch": 1.8070718736911577, + "grad_norm": 0.022261106315884807, + "learning_rate": 2.8063561181325526e-05, + "loss": 0.4291, + "step": 35595 + }, + { + "epoch": 1.8073257097891893, + "grad_norm": 0.025676143757082985, + "learning_rate": 2.7990435497433408e-05, + "loss": 0.4872, + "step": 35600 + }, + { + "epoch": 1.8075795458872206, + "grad_norm": 0.02382218211439765, + "learning_rate": 2.79174024666648e-05, + "loss": 0.4365, + "step": 35605 + }, + { + "epoch": 1.8078333819852521, + "grad_norm": 0.022263485578407526, + "learning_rate": 2.7844462103355838e-05, + "loss": 0.4421, + "step": 35610 + }, + { + "epoch": 1.8080872180832837, + "grad_norm": 0.025275647713049482, + "learning_rate": 2.7771614421824297e-05, + "loss": 0.471, + "step": 35615 + }, + { + "epoch": 1.808341054181315, + "grad_norm": 0.019588771701611427, + "learning_rate": 2.769885943636996e-05, + "loss": 0.4582, + "step": 35620 + }, + { + "epoch": 1.8085948902793465, + "grad_norm": 0.020818530567243063, + "learning_rate": 2.7626197161274014e-05, + "loss": 0.4502, + "step": 35625 + }, + { + "epoch": 1.808848726377378, + "grad_norm": 0.022388154883143465, + "learning_rate": 2.7553627610799938e-05, + "loss": 0.4305, + "step": 35630 + }, + { + "epoch": 1.8091025624754096, + "grad_norm": 0.0278131432647123, + "learning_rate": 2.748115079919261e-05, + "loss": 0.4423, + "step": 35635 + }, + { + "epoch": 1.809356398573441, + "grad_norm": 0.0214926109026721, + "learning_rate": 2.7408766740678994e-05, + "loss": 0.4498, + "step": 35640 + }, + { + "epoch": 1.8096102346714726, + "grad_norm": 0.020383587231959203, + "learning_rate": 2.73364754494676e-05, + "loss": 0.4468, + "step": 35645 + }, + { + "epoch": 1.8098640707695042, + "grad_norm": 0.023441844532656696, + "learning_rate": 2.7264276939748923e-05, + "loss": 0.4376, + "step": 35650 + }, + { + "epoch": 1.8101179068675357, + "grad_norm": 0.021868685315593866, + "learning_rate": 2.7192171225695172e-05, + "loss": 0.4606, + "step": 35655 + }, + { + "epoch": 1.8103717429655672, + "grad_norm": 0.023969847865051304, + "learning_rate": 2.712015832146031e-05, + "loss": 0.4675, + "step": 35660 + }, + { + "epoch": 1.8106255790635988, + "grad_norm": 0.022850546221913993, + "learning_rate": 2.7048238241180133e-05, + "loss": 0.4327, + "step": 35665 + }, + { + "epoch": 1.81087941516163, + "grad_norm": 0.02192908510076702, + "learning_rate": 2.6976410998972134e-05, + "loss": 0.4755, + "step": 35670 + }, + { + "epoch": 1.8111332512596616, + "grad_norm": 0.023714661280836886, + "learning_rate": 2.690467660893575e-05, + "loss": 0.4509, + "step": 35675 + }, + { + "epoch": 1.8113870873576932, + "grad_norm": 0.023176233911916783, + "learning_rate": 2.6833035085152003e-05, + "loss": 0.4615, + "step": 35680 + }, + { + "epoch": 1.8116409234557245, + "grad_norm": 0.023676313879505324, + "learning_rate": 2.6761486441683802e-05, + "loss": 0.4545, + "step": 35685 + }, + { + "epoch": 1.811894759553756, + "grad_norm": 0.03016670003145498, + "learning_rate": 2.669003069257575e-05, + "loss": 0.4299, + "step": 35690 + }, + { + "epoch": 1.8121485956517875, + "grad_norm": 0.022836140384253037, + "learning_rate": 2.661866785185435e-05, + "loss": 0.4544, + "step": 35695 + }, + { + "epoch": 1.812402431749819, + "grad_norm": 0.022515164600397174, + "learning_rate": 2.6547397933527562e-05, + "loss": 0.4482, + "step": 35700 + }, + { + "epoch": 1.8126562678478506, + "grad_norm": 0.023784234188635057, + "learning_rate": 2.6476220951585582e-05, + "loss": 0.4566, + "step": 35705 + }, + { + "epoch": 1.8129101039458821, + "grad_norm": 0.021185807136934295, + "learning_rate": 2.640513691999985e-05, + "loss": 0.454, + "step": 35710 + }, + { + "epoch": 1.8131639400439137, + "grad_norm": 0.019955400348529193, + "learning_rate": 2.6334145852724035e-05, + "loss": 0.4142, + "step": 35715 + }, + { + "epoch": 1.8134177761419452, + "grad_norm": 0.032252200455131774, + "learning_rate": 2.6263247763693153e-05, + "loss": 0.4394, + "step": 35720 + }, + { + "epoch": 1.8136716122399767, + "grad_norm": 0.025647649030433494, + "learning_rate": 2.61924426668243e-05, + "loss": 0.4871, + "step": 35725 + }, + { + "epoch": 1.8139254483380083, + "grad_norm": 0.02352222162068727, + "learning_rate": 2.6121730576015967e-05, + "loss": 0.4726, + "step": 35730 + }, + { + "epoch": 1.8141792844360396, + "grad_norm": 0.023536332582882832, + "learning_rate": 2.605111150514883e-05, + "loss": 0.4894, + "step": 35735 + }, + { + "epoch": 1.8144331205340711, + "grad_norm": 0.020218263139566366, + "learning_rate": 2.5980585468084795e-05, + "loss": 0.4571, + "step": 35740 + }, + { + "epoch": 1.8146869566321027, + "grad_norm": 0.022759174498022658, + "learning_rate": 2.5910152478668015e-05, + "loss": 0.4687, + "step": 35745 + }, + { + "epoch": 1.8149407927301342, + "grad_norm": 0.021047033486352878, + "learning_rate": 2.5839812550723928e-05, + "loss": 0.4586, + "step": 35750 + }, + { + "epoch": 1.8151946288281655, + "grad_norm": 0.023654540393796105, + "learning_rate": 2.5769565698060047e-05, + "loss": 0.4613, + "step": 35755 + }, + { + "epoch": 1.815448464926197, + "grad_norm": 0.021804276959855963, + "learning_rate": 2.56994119344654e-05, + "loss": 0.4454, + "step": 35760 + }, + { + "epoch": 1.8157023010242286, + "grad_norm": 0.02108201902634222, + "learning_rate": 2.562935127371091e-05, + "loss": 0.4555, + "step": 35765 + }, + { + "epoch": 1.8159561371222601, + "grad_norm": 0.02148487710666091, + "learning_rate": 2.5559383729549025e-05, + "loss": 0.4611, + "step": 35770 + }, + { + "epoch": 1.8162099732202917, + "grad_norm": 0.029051791837414652, + "learning_rate": 2.5489509315714087e-05, + "loss": 0.4538, + "step": 35775 + }, + { + "epoch": 1.8164638093183232, + "grad_norm": 0.026776408987169115, + "learning_rate": 2.5419728045922186e-05, + "loss": 0.4421, + "step": 35780 + }, + { + "epoch": 1.8167176454163547, + "grad_norm": 0.02023071442340706, + "learning_rate": 2.5350039933870805e-05, + "loss": 0.4225, + "step": 35785 + }, + { + "epoch": 1.8169714815143863, + "grad_norm": 0.02079097271610684, + "learning_rate": 2.5280444993239616e-05, + "loss": 0.4471, + "step": 35790 + }, + { + "epoch": 1.8172253176124178, + "grad_norm": 0.02335976147958995, + "learning_rate": 2.5210943237689575e-05, + "loss": 0.4645, + "step": 35795 + }, + { + "epoch": 1.8174791537104493, + "grad_norm": 0.025662477366910293, + "learning_rate": 2.514153468086372e-05, + "loss": 0.4523, + "step": 35800 + }, + { + "epoch": 1.8177329898084806, + "grad_norm": 0.02096423059117386, + "learning_rate": 2.507221933638637e-05, + "loss": 0.4512, + "step": 35805 + }, + { + "epoch": 1.8179868259065122, + "grad_norm": 0.021587834721081906, + "learning_rate": 2.5002997217863975e-05, + "loss": 0.4739, + "step": 35810 + }, + { + "epoch": 1.8182406620045437, + "grad_norm": 0.02346278616646674, + "learning_rate": 2.4933868338884392e-05, + "loss": 0.4391, + "step": 35815 + }, + { + "epoch": 1.818494498102575, + "grad_norm": 0.02239295958313847, + "learning_rate": 2.4864832713017316e-05, + "loss": 0.4868, + "step": 35820 + }, + { + "epoch": 1.8187483342006066, + "grad_norm": 0.022180419791008956, + "learning_rate": 2.479589035381402e-05, + "loss": 0.4144, + "step": 35825 + }, + { + "epoch": 1.819002170298638, + "grad_norm": 0.02094838835196554, + "learning_rate": 2.472704127480768e-05, + "loss": 0.4537, + "step": 35830 + }, + { + "epoch": 1.8192560063966696, + "grad_norm": 0.02255615727366666, + "learning_rate": 2.4658285489512876e-05, + "loss": 0.4575, + "step": 35835 + }, + { + "epoch": 1.8195098424947012, + "grad_norm": 0.024813587982619647, + "learning_rate": 2.45896230114262e-05, + "loss": 0.4705, + "step": 35840 + }, + { + "epoch": 1.8197636785927327, + "grad_norm": 0.024876925814751656, + "learning_rate": 2.4521053854025587e-05, + "loss": 0.443, + "step": 35845 + }, + { + "epoch": 1.8200175146907642, + "grad_norm": 0.022425131114226664, + "learning_rate": 2.4452578030771e-05, + "loss": 0.4437, + "step": 35850 + }, + { + "epoch": 1.8202713507887958, + "grad_norm": 0.02983896975643497, + "learning_rate": 2.4384195555103685e-05, + "loss": 0.4525, + "step": 35855 + }, + { + "epoch": 1.8205251868868273, + "grad_norm": 0.022587211482833724, + "learning_rate": 2.4315906440446956e-05, + "loss": 0.4341, + "step": 35860 + }, + { + "epoch": 1.8207790229848588, + "grad_norm": 0.024206758618488585, + "learning_rate": 2.4247710700205484e-05, + "loss": 0.4649, + "step": 35865 + }, + { + "epoch": 1.8210328590828901, + "grad_norm": 0.024478336072376698, + "learning_rate": 2.4179608347765948e-05, + "loss": 0.5033, + "step": 35870 + }, + { + "epoch": 1.8212866951809217, + "grad_norm": 0.02070813235705416, + "learning_rate": 2.4111599396496263e-05, + "loss": 0.4639, + "step": 35875 + }, + { + "epoch": 1.8215405312789532, + "grad_norm": 0.021774866578754614, + "learning_rate": 2.404368385974648e-05, + "loss": 0.4698, + "step": 35880 + }, + { + "epoch": 1.8217943673769845, + "grad_norm": 0.0212623746339756, + "learning_rate": 2.3975861750847872e-05, + "loss": 0.4637, + "step": 35885 + }, + { + "epoch": 1.822048203475016, + "grad_norm": 0.02128933031223815, + "learning_rate": 2.3908133083113627e-05, + "loss": 0.4293, + "step": 35890 + }, + { + "epoch": 1.8223020395730476, + "grad_norm": 0.021642236447562767, + "learning_rate": 2.3840497869838718e-05, + "loss": 0.4228, + "step": 35895 + }, + { + "epoch": 1.8225558756710791, + "grad_norm": 0.020495462348286645, + "learning_rate": 2.3772956124299416e-05, + "loss": 0.4498, + "step": 35900 + }, + { + "epoch": 1.8228097117691107, + "grad_norm": 0.019689106987887187, + "learning_rate": 2.3705507859753896e-05, + "loss": 0.4314, + "step": 35905 + }, + { + "epoch": 1.8230635478671422, + "grad_norm": 0.0217185094138313, + "learning_rate": 2.3638153089441893e-05, + "loss": 0.4535, + "step": 35910 + }, + { + "epoch": 1.8233173839651737, + "grad_norm": 0.02237290814529761, + "learning_rate": 2.357089182658484e-05, + "loss": 0.4633, + "step": 35915 + }, + { + "epoch": 1.8235712200632053, + "grad_norm": 0.02187816074803507, + "learning_rate": 2.350372408438578e-05, + "loss": 0.4592, + "step": 35920 + }, + { + "epoch": 1.8238250561612368, + "grad_norm": 0.022184120130896715, + "learning_rate": 2.343664987602939e-05, + "loss": 0.4646, + "step": 35925 + }, + { + "epoch": 1.8240788922592683, + "grad_norm": 0.019843601924149395, + "learning_rate": 2.3369669214681977e-05, + "loss": 0.4541, + "step": 35930 + }, + { + "epoch": 1.8243327283572996, + "grad_norm": 0.022400869249771683, + "learning_rate": 2.3302782113491628e-05, + "loss": 0.4757, + "step": 35935 + }, + { + "epoch": 1.8245865644553312, + "grad_norm": 0.02151836218094333, + "learning_rate": 2.3235988585587784e-05, + "loss": 0.4753, + "step": 35940 + }, + { + "epoch": 1.8248404005533627, + "grad_norm": 0.02301572497170209, + "learning_rate": 2.31692886440818e-05, + "loss": 0.4432, + "step": 35945 + }, + { + "epoch": 1.825094236651394, + "grad_norm": 0.019813493528431883, + "learning_rate": 2.3102682302066412e-05, + "loss": 0.4371, + "step": 35950 + }, + { + "epoch": 1.8253480727494256, + "grad_norm": 0.01992449587850107, + "learning_rate": 2.303616957261634e-05, + "loss": 0.4603, + "step": 35955 + }, + { + "epoch": 1.825601908847457, + "grad_norm": 0.023016076431343672, + "learning_rate": 2.2969750468787466e-05, + "loss": 0.4326, + "step": 35960 + }, + { + "epoch": 1.8258557449454886, + "grad_norm": 0.022191758185042577, + "learning_rate": 2.290342500361775e-05, + "loss": 0.4524, + "step": 35965 + }, + { + "epoch": 1.8261095810435202, + "grad_norm": 0.025349303799634692, + "learning_rate": 2.2837193190126282e-05, + "loss": 0.4524, + "step": 35970 + }, + { + "epoch": 1.8263634171415517, + "grad_norm": 0.020969407434920425, + "learning_rate": 2.2771055041314327e-05, + "loss": 0.4363, + "step": 35975 + }, + { + "epoch": 1.8266172532395832, + "grad_norm": 0.023006042793577052, + "learning_rate": 2.270501057016422e-05, + "loss": 0.4669, + "step": 35980 + }, + { + "epoch": 1.8268710893376148, + "grad_norm": 0.028873927723596555, + "learning_rate": 2.263905978964037e-05, + "loss": 0.4695, + "step": 35985 + }, + { + "epoch": 1.8271249254356463, + "grad_norm": 0.024116668482079474, + "learning_rate": 2.2573202712688367e-05, + "loss": 0.4777, + "step": 35990 + }, + { + "epoch": 1.8273787615336778, + "grad_norm": 0.024030296769852598, + "learning_rate": 2.250743935223587e-05, + "loss": 0.4963, + "step": 35995 + }, + { + "epoch": 1.8276325976317092, + "grad_norm": 0.026746408895009968, + "learning_rate": 2.2441769721191662e-05, + "loss": 0.4449, + "step": 36000 + }, + { + "epoch": 1.8278864337297407, + "grad_norm": 0.023888483474343483, + "learning_rate": 2.23761938324466e-05, + "loss": 0.4392, + "step": 36005 + }, + { + "epoch": 1.8281402698277722, + "grad_norm": 0.035099541692213014, + "learning_rate": 2.2310711698872665e-05, + "loss": 0.4504, + "step": 36010 + }, + { + "epoch": 1.8283941059258038, + "grad_norm": 0.02112342338748428, + "learning_rate": 2.224532333332385e-05, + "loss": 0.4618, + "step": 36015 + }, + { + "epoch": 1.828647942023835, + "grad_norm": 0.020712955873280085, + "learning_rate": 2.2180028748635506e-05, + "loss": 0.4715, + "step": 36020 + }, + { + "epoch": 1.8289017781218666, + "grad_norm": 0.02460324591191274, + "learning_rate": 2.2114827957624595e-05, + "loss": 0.4728, + "step": 36025 + }, + { + "epoch": 1.8291556142198981, + "grad_norm": 0.02098638010680304, + "learning_rate": 2.2049720973089825e-05, + "loss": 0.4343, + "step": 36030 + }, + { + "epoch": 1.8294094503179297, + "grad_norm": 0.021605128746199135, + "learning_rate": 2.19847078078112e-05, + "loss": 0.475, + "step": 36035 + }, + { + "epoch": 1.8296632864159612, + "grad_norm": 0.020744240541715135, + "learning_rate": 2.1919788474550673e-05, + "loss": 0.4333, + "step": 36040 + }, + { + "epoch": 1.8299171225139927, + "grad_norm": 0.021445317479955133, + "learning_rate": 2.185496298605144e-05, + "loss": 0.4572, + "step": 36045 + }, + { + "epoch": 1.8301709586120243, + "grad_norm": 0.025771996958530503, + "learning_rate": 2.1790231355038493e-05, + "loss": 0.4197, + "step": 36050 + }, + { + "epoch": 1.8304247947100558, + "grad_norm": 0.022223270474904383, + "learning_rate": 2.172559359421822e-05, + "loss": 0.4581, + "step": 36055 + }, + { + "epoch": 1.8306786308080873, + "grad_norm": 0.020030488071074923, + "learning_rate": 2.166104971627886e-05, + "loss": 0.4306, + "step": 36060 + }, + { + "epoch": 1.8309324669061189, + "grad_norm": 0.022278981780526303, + "learning_rate": 2.1596599733889888e-05, + "loss": 0.4406, + "step": 36065 + }, + { + "epoch": 1.8311863030041502, + "grad_norm": 0.0221757803883364, + "learning_rate": 2.1532243659702634e-05, + "loss": 0.4834, + "step": 36070 + }, + { + "epoch": 1.8314401391021817, + "grad_norm": 0.023275974115444426, + "learning_rate": 2.146798150634982e-05, + "loss": 0.4563, + "step": 36075 + }, + { + "epoch": 1.8316939752002133, + "grad_norm": 0.023221039443217652, + "learning_rate": 2.140381328644586e-05, + "loss": 0.4463, + "step": 36080 + }, + { + "epoch": 1.8319478112982446, + "grad_norm": 0.020569710112120266, + "learning_rate": 2.133973901258651e-05, + "loss": 0.4703, + "step": 36085 + }, + { + "epoch": 1.8322016473962761, + "grad_norm": 0.021785715748550184, + "learning_rate": 2.1275758697349434e-05, + "loss": 0.4624, + "step": 36090 + }, + { + "epoch": 1.8324554834943076, + "grad_norm": 0.02146143080616713, + "learning_rate": 2.1211872353293417e-05, + "loss": 0.4476, + "step": 36095 + }, + { + "epoch": 1.8327093195923392, + "grad_norm": 0.020350789547040154, + "learning_rate": 2.11480799929592e-05, + "loss": 0.4511, + "step": 36100 + }, + { + "epoch": 1.8329631556903707, + "grad_norm": 0.02933647592875351, + "learning_rate": 2.1084381628868833e-05, + "loss": 0.4251, + "step": 36105 + }, + { + "epoch": 1.8332169917884023, + "grad_norm": 0.02113714142733342, + "learning_rate": 2.1020777273526025e-05, + "loss": 0.4418, + "step": 36110 + }, + { + "epoch": 1.8334708278864338, + "grad_norm": 0.021965707443001067, + "learning_rate": 2.0957266939415965e-05, + "loss": 0.4415, + "step": 36115 + }, + { + "epoch": 1.8337246639844653, + "grad_norm": 0.02268716397256464, + "learning_rate": 2.0893850639005453e-05, + "loss": 0.457, + "step": 36120 + }, + { + "epoch": 1.8339785000824969, + "grad_norm": 0.025935687476320603, + "learning_rate": 2.0830528384742697e-05, + "loss": 0.4502, + "step": 36125 + }, + { + "epoch": 1.8342323361805284, + "grad_norm": 0.02355894973776656, + "learning_rate": 2.076730018905759e-05, + "loss": 0.4344, + "step": 36130 + }, + { + "epoch": 1.8344861722785597, + "grad_norm": 0.022212368167510682, + "learning_rate": 2.0704166064361596e-05, + "loss": 0.4486, + "step": 36135 + }, + { + "epoch": 1.8347400083765912, + "grad_norm": 0.02141121626813041, + "learning_rate": 2.0641126023047518e-05, + "loss": 0.4587, + "step": 36140 + }, + { + "epoch": 1.8349938444746228, + "grad_norm": 0.02163591724705281, + "learning_rate": 2.0578180077489905e-05, + "loss": 0.4409, + "step": 36145 + }, + { + "epoch": 1.835247680572654, + "grad_norm": 0.023066084860677204, + "learning_rate": 2.0515328240044594e-05, + "loss": 0.4499, + "step": 36150 + }, + { + "epoch": 1.8355015166706856, + "grad_norm": 0.022242330077506877, + "learning_rate": 2.0452570523049217e-05, + "loss": 0.4362, + "step": 36155 + }, + { + "epoch": 1.8357553527687172, + "grad_norm": 0.020051623391778836, + "learning_rate": 2.03899069388227e-05, + "loss": 0.4427, + "step": 36160 + }, + { + "epoch": 1.8360091888667487, + "grad_norm": 0.029608579217612622, + "learning_rate": 2.03273374996657e-05, + "loss": 0.4502, + "step": 36165 + }, + { + "epoch": 1.8362630249647802, + "grad_norm": 0.023737953896778508, + "learning_rate": 2.026486221786017e-05, + "loss": 0.4767, + "step": 36170 + }, + { + "epoch": 1.8365168610628118, + "grad_norm": 0.023375619205875876, + "learning_rate": 2.02024811056698e-05, + "loss": 0.4497, + "step": 36175 + }, + { + "epoch": 1.8367706971608433, + "grad_norm": 0.027714733020031652, + "learning_rate": 2.0140194175339575e-05, + "loss": 0.4693, + "step": 36180 + }, + { + "epoch": 1.8370245332588748, + "grad_norm": 0.02427169464452961, + "learning_rate": 2.0078001439096218e-05, + "loss": 0.4364, + "step": 36185 + }, + { + "epoch": 1.8372783693569064, + "grad_norm": 0.020026240969129284, + "learning_rate": 2.001590290914779e-05, + "loss": 0.4272, + "step": 36190 + }, + { + "epoch": 1.837532205454938, + "grad_norm": 0.022495330500840523, + "learning_rate": 1.9953898597683927e-05, + "loss": 0.4843, + "step": 36195 + }, + { + "epoch": 1.8377860415529692, + "grad_norm": 0.020360142174855975, + "learning_rate": 1.989198851687579e-05, + "loss": 0.4429, + "step": 36200 + }, + { + "epoch": 1.8380398776510007, + "grad_norm": 0.023413422223116345, + "learning_rate": 1.9830172678876103e-05, + "loss": 0.468, + "step": 36205 + }, + { + "epoch": 1.8382937137490323, + "grad_norm": 0.02298430069711076, + "learning_rate": 1.9768451095818818e-05, + "loss": 0.4567, + "step": 36210 + }, + { + "epoch": 1.8385475498470636, + "grad_norm": 0.02280937325321792, + "learning_rate": 1.9706823779819692e-05, + "loss": 0.442, + "step": 36215 + }, + { + "epoch": 1.8388013859450951, + "grad_norm": 0.021396894421407745, + "learning_rate": 1.964529074297583e-05, + "loss": 0.4653, + "step": 36220 + }, + { + "epoch": 1.8390552220431267, + "grad_norm": 0.020024815824637394, + "learning_rate": 1.9583851997365954e-05, + "loss": 0.4385, + "step": 36225 + }, + { + "epoch": 1.8393090581411582, + "grad_norm": 0.02174390016299535, + "learning_rate": 1.952250755505003e-05, + "loss": 0.4606, + "step": 36230 + }, + { + "epoch": 1.8395628942391897, + "grad_norm": 0.022477525249405196, + "learning_rate": 1.9461257428069755e-05, + "loss": 0.4434, + "step": 36235 + }, + { + "epoch": 1.8398167303372213, + "grad_norm": 0.0245485389384089, + "learning_rate": 1.9400101628448242e-05, + "loss": 0.4237, + "step": 36240 + }, + { + "epoch": 1.8400705664352528, + "grad_norm": 0.021224620960195745, + "learning_rate": 1.9339040168189937e-05, + "loss": 0.4724, + "step": 36245 + }, + { + "epoch": 1.8403244025332843, + "grad_norm": 0.023242450571638745, + "learning_rate": 1.927807305928109e-05, + "loss": 0.4638, + "step": 36250 + }, + { + "epoch": 1.8405782386313159, + "grad_norm": 0.020915924336955997, + "learning_rate": 1.921720031368901e-05, + "loss": 0.4405, + "step": 36255 + }, + { + "epoch": 1.8408320747293474, + "grad_norm": 0.02248775188519345, + "learning_rate": 1.9156421943362924e-05, + "loss": 0.4786, + "step": 36260 + }, + { + "epoch": 1.8410859108273787, + "grad_norm": 0.02148666149767411, + "learning_rate": 1.9095737960233228e-05, + "loss": 0.4436, + "step": 36265 + }, + { + "epoch": 1.8413397469254102, + "grad_norm": 0.02416587138910249, + "learning_rate": 1.903514837621201e-05, + "loss": 0.4714, + "step": 36270 + }, + { + "epoch": 1.8415935830234418, + "grad_norm": 0.023157173410389604, + "learning_rate": 1.897465320319247e-05, + "loss": 0.4338, + "step": 36275 + }, + { + "epoch": 1.8418474191214733, + "grad_norm": 0.029233910498072987, + "learning_rate": 1.891425245304973e-05, + "loss": 0.4753, + "step": 36280 + }, + { + "epoch": 1.8421012552195046, + "grad_norm": 0.02283659947545247, + "learning_rate": 1.8853946137639966e-05, + "loss": 0.4239, + "step": 36285 + }, + { + "epoch": 1.8423550913175362, + "grad_norm": 0.019978195457050724, + "learning_rate": 1.879373426880121e-05, + "loss": 0.4456, + "step": 36290 + }, + { + "epoch": 1.8426089274155677, + "grad_norm": 0.021687663942728044, + "learning_rate": 1.8733616858352564e-05, + "loss": 0.4522, + "step": 36295 + }, + { + "epoch": 1.8428627635135992, + "grad_norm": 0.019528780287524306, + "learning_rate": 1.8673593918094923e-05, + "loss": 0.4439, + "step": 36300 + }, + { + "epoch": 1.8431165996116308, + "grad_norm": 0.02155234027892195, + "learning_rate": 1.8613665459810357e-05, + "loss": 0.4574, + "step": 36305 + }, + { + "epoch": 1.8433704357096623, + "grad_norm": 0.0261456620147658, + "learning_rate": 1.8553831495262685e-05, + "loss": 0.4648, + "step": 36310 + }, + { + "epoch": 1.8436242718076938, + "grad_norm": 0.026088651944962685, + "learning_rate": 1.849409203619673e-05, + "loss": 0.4534, + "step": 36315 + }, + { + "epoch": 1.8438781079057254, + "grad_norm": 0.02423314397669894, + "learning_rate": 1.8434447094339446e-05, + "loss": 0.4652, + "step": 36320 + }, + { + "epoch": 1.844131944003757, + "grad_norm": 0.023517519839947224, + "learning_rate": 1.837489668139858e-05, + "loss": 0.4425, + "step": 36325 + }, + { + "epoch": 1.8443857801017884, + "grad_norm": 0.02513005496856768, + "learning_rate": 1.8315440809063554e-05, + "loss": 0.4472, + "step": 36330 + }, + { + "epoch": 1.8446396161998198, + "grad_norm": 0.021229880550278774, + "learning_rate": 1.8256079489005485e-05, + "loss": 0.4364, + "step": 36335 + }, + { + "epoch": 1.8448934522978513, + "grad_norm": 0.01841576739055947, + "learning_rate": 1.8196812732876434e-05, + "loss": 0.4649, + "step": 36340 + }, + { + "epoch": 1.8451472883958828, + "grad_norm": 0.022976094393190677, + "learning_rate": 1.8137640552310374e-05, + "loss": 0.4356, + "step": 36345 + }, + { + "epoch": 1.8454011244939141, + "grad_norm": 0.02206953417030893, + "learning_rate": 1.807856295892235e-05, + "loss": 0.4636, + "step": 36350 + }, + { + "epoch": 1.8456549605919457, + "grad_norm": 0.02132833882982846, + "learning_rate": 1.801957996430914e-05, + "loss": 0.4605, + "step": 36355 + }, + { + "epoch": 1.8459087966899772, + "grad_norm": 0.025669102961975235, + "learning_rate": 1.7960691580048705e-05, + "loss": 0.4491, + "step": 36360 + }, + { + "epoch": 1.8461626327880087, + "grad_norm": 0.0256888308875704, + "learning_rate": 1.7901897817700685e-05, + "loss": 0.4626, + "step": 36365 + }, + { + "epoch": 1.8464164688860403, + "grad_norm": 0.022264542254564815, + "learning_rate": 1.7843198688805793e-05, + "loss": 0.4803, + "step": 36370 + }, + { + "epoch": 1.8466703049840718, + "grad_norm": 0.023955069464813527, + "learning_rate": 1.7784594204886485e-05, + "loss": 0.4728, + "step": 36375 + }, + { + "epoch": 1.8469241410821033, + "grad_norm": 0.028377687030451647, + "learning_rate": 1.772608437744655e-05, + "loss": 0.4523, + "step": 36380 + }, + { + "epoch": 1.8471779771801349, + "grad_norm": 0.02256742111449494, + "learning_rate": 1.7667669217971195e-05, + "loss": 0.4608, + "step": 36385 + }, + { + "epoch": 1.8474318132781664, + "grad_norm": 0.020948702040254757, + "learning_rate": 1.7609348737926968e-05, + "loss": 0.4561, + "step": 36390 + }, + { + "epoch": 1.847685649376198, + "grad_norm": 0.02215357030539792, + "learning_rate": 1.7551122948761932e-05, + "loss": 0.4575, + "step": 36395 + }, + { + "epoch": 1.8479394854742293, + "grad_norm": 0.021140043797980356, + "learning_rate": 1.7492991861905394e-05, + "loss": 0.4541, + "step": 36400 + }, + { + "epoch": 1.8481933215722608, + "grad_norm": 0.02126277301347097, + "learning_rate": 1.7434955488768445e-05, + "loss": 0.4592, + "step": 36405 + }, + { + "epoch": 1.8484471576702923, + "grad_norm": 0.020846498654345385, + "learning_rate": 1.7377013840743083e-05, + "loss": 0.4505, + "step": 36410 + }, + { + "epoch": 1.8487009937683236, + "grad_norm": 0.024918017001830127, + "learning_rate": 1.73191669292031e-05, + "loss": 0.4629, + "step": 36415 + }, + { + "epoch": 1.8489548298663552, + "grad_norm": 0.02250400370693143, + "learning_rate": 1.726141476550347e-05, + "loss": 0.4666, + "step": 36420 + }, + { + "epoch": 1.8492086659643867, + "grad_norm": 0.019638524615499656, + "learning_rate": 1.720375736098079e-05, + "loss": 0.4416, + "step": 36425 + }, + { + "epoch": 1.8494625020624182, + "grad_norm": 0.022038205053431632, + "learning_rate": 1.7146194726952778e-05, + "loss": 0.4335, + "step": 36430 + }, + { + "epoch": 1.8497163381604498, + "grad_norm": 0.022686280035144835, + "learning_rate": 1.708872687471874e-05, + "loss": 0.4578, + "step": 36435 + }, + { + "epoch": 1.8499701742584813, + "grad_norm": 0.025248657754128545, + "learning_rate": 1.7031353815559425e-05, + "loss": 0.4814, + "step": 36440 + }, + { + "epoch": 1.8502240103565128, + "grad_norm": 0.021988387262798826, + "learning_rate": 1.697407556073671e-05, + "loss": 0.4446, + "step": 36445 + }, + { + "epoch": 1.8504778464545444, + "grad_norm": 0.021771081888713024, + "learning_rate": 1.6916892121494166e-05, + "loss": 0.4603, + "step": 36450 + }, + { + "epoch": 1.850731682552576, + "grad_norm": 0.019874477923159228, + "learning_rate": 1.6859803509056527e-05, + "loss": 0.4329, + "step": 36455 + }, + { + "epoch": 1.8509855186506075, + "grad_norm": 0.02171798205791076, + "learning_rate": 1.680280973463011e-05, + "loss": 0.449, + "step": 36460 + }, + { + "epoch": 1.8512393547486388, + "grad_norm": 0.02199931724655234, + "learning_rate": 1.674591080940241e-05, + "loss": 0.4585, + "step": 36465 + }, + { + "epoch": 1.8514931908466703, + "grad_norm": 0.02577647787558014, + "learning_rate": 1.6689106744542437e-05, + "loss": 0.4542, + "step": 36470 + }, + { + "epoch": 1.8517470269447018, + "grad_norm": 0.023435984326753145, + "learning_rate": 1.6632397551200496e-05, + "loss": 0.4436, + "step": 36475 + }, + { + "epoch": 1.8520008630427331, + "grad_norm": 0.020355449487897765, + "learning_rate": 1.6575783240508458e-05, + "loss": 0.4355, + "step": 36480 + }, + { + "epoch": 1.8522546991407647, + "grad_norm": 0.0207668042498274, + "learning_rate": 1.6519263823579213e-05, + "loss": 0.4557, + "step": 36485 + }, + { + "epoch": 1.8525085352387962, + "grad_norm": 0.0236812491259185, + "learning_rate": 1.6462839311507494e-05, + "loss": 0.4519, + "step": 36490 + }, + { + "epoch": 1.8527623713368278, + "grad_norm": 0.026595024576517964, + "learning_rate": 1.640650971536889e-05, + "loss": 0.4644, + "step": 36495 + }, + { + "epoch": 1.8530162074348593, + "grad_norm": 0.022762821590263934, + "learning_rate": 1.635027504622083e-05, + "loss": 0.4323, + "step": 36500 + }, + { + "epoch": 1.8532700435328908, + "grad_norm": 0.025364089662246653, + "learning_rate": 1.6294135315101765e-05, + "loss": 0.4326, + "step": 36505 + }, + { + "epoch": 1.8535238796309224, + "grad_norm": 0.021616350106393015, + "learning_rate": 1.6238090533031825e-05, + "loss": 0.4613, + "step": 36510 + }, + { + "epoch": 1.853777715728954, + "grad_norm": 0.022173841898150175, + "learning_rate": 1.6182140711012095e-05, + "loss": 0.4623, + "step": 36515 + }, + { + "epoch": 1.8540315518269854, + "grad_norm": 0.021041461117188336, + "learning_rate": 1.6126285860025403e-05, + "loss": 0.4418, + "step": 36520 + }, + { + "epoch": 1.854285387925017, + "grad_norm": 0.03205910168306328, + "learning_rate": 1.6070525991035646e-05, + "loss": 0.4613, + "step": 36525 + }, + { + "epoch": 1.8545392240230483, + "grad_norm": 0.02175375824381657, + "learning_rate": 1.6014861114988343e-05, + "loss": 0.4306, + "step": 36530 + }, + { + "epoch": 1.8547930601210798, + "grad_norm": 0.024123083015002662, + "learning_rate": 1.5959291242810146e-05, + "loss": 0.4818, + "step": 36535 + }, + { + "epoch": 1.8550468962191113, + "grad_norm": 0.022298076921305057, + "learning_rate": 1.590381638540922e-05, + "loss": 0.4215, + "step": 36540 + }, + { + "epoch": 1.8553007323171429, + "grad_norm": 0.026828035039684898, + "learning_rate": 1.5848436553674905e-05, + "loss": 0.456, + "step": 36545 + }, + { + "epoch": 1.8555545684151742, + "grad_norm": 0.02438509358051607, + "learning_rate": 1.5793151758478064e-05, + "loss": 0.4759, + "step": 36550 + }, + { + "epoch": 1.8558084045132057, + "grad_norm": 0.022150780639142196, + "learning_rate": 1.5737962010670738e-05, + "loss": 0.43, + "step": 36555 + }, + { + "epoch": 1.8560622406112373, + "grad_norm": 0.02587280793684847, + "learning_rate": 1.5682867321086482e-05, + "loss": 0.4301, + "step": 36560 + }, + { + "epoch": 1.8563160767092688, + "grad_norm": 0.02196586338846343, + "learning_rate": 1.5627867700540144e-05, + "loss": 0.4392, + "step": 36565 + }, + { + "epoch": 1.8565699128073003, + "grad_norm": 0.02236881612109358, + "learning_rate": 1.557296315982776e-05, + "loss": 0.4533, + "step": 36570 + }, + { + "epoch": 1.8568237489053319, + "grad_norm": 0.021640783844340757, + "learning_rate": 1.5518153709726922e-05, + "loss": 0.4356, + "step": 36575 + }, + { + "epoch": 1.8570775850033634, + "grad_norm": 0.021850601092974337, + "learning_rate": 1.5463439360996367e-05, + "loss": 0.4601, + "step": 36580 + }, + { + "epoch": 1.857331421101395, + "grad_norm": 0.022863672714919226, + "learning_rate": 1.5408820124376277e-05, + "loss": 0.4623, + "step": 36585 + }, + { + "epoch": 1.8575852571994265, + "grad_norm": 0.02052988156416512, + "learning_rate": 1.535429601058813e-05, + "loss": 0.4182, + "step": 36590 + }, + { + "epoch": 1.8578390932974578, + "grad_norm": 0.01930622632761717, + "learning_rate": 1.5299867030334813e-05, + "loss": 0.4422, + "step": 36595 + }, + { + "epoch": 1.8580929293954893, + "grad_norm": 0.020532303166287503, + "learning_rate": 1.5245533194300387e-05, + "loss": 0.4742, + "step": 36600 + }, + { + "epoch": 1.8583467654935208, + "grad_norm": 0.021453897011547724, + "learning_rate": 1.5191294513150322e-05, + "loss": 0.4438, + "step": 36605 + }, + { + "epoch": 1.8586006015915524, + "grad_norm": 0.021567947200895245, + "learning_rate": 1.5137150997531379e-05, + "loss": 0.4645, + "step": 36610 + }, + { + "epoch": 1.8588544376895837, + "grad_norm": 0.02120114038014877, + "learning_rate": 1.5083102658071667e-05, + "loss": 0.4417, + "step": 36615 + }, + { + "epoch": 1.8591082737876152, + "grad_norm": 0.02211469006670524, + "learning_rate": 1.5029149505380647e-05, + "loss": 0.4773, + "step": 36620 + }, + { + "epoch": 1.8593621098856468, + "grad_norm": 0.023182166431144506, + "learning_rate": 1.4975291550049063e-05, + "loss": 0.4537, + "step": 36625 + }, + { + "epoch": 1.8596159459836783, + "grad_norm": 0.019178819226592782, + "learning_rate": 1.492152880264891e-05, + "loss": 0.4491, + "step": 36630 + }, + { + "epoch": 1.8598697820817098, + "grad_norm": 0.026391292018557753, + "learning_rate": 1.4867861273733629e-05, + "loss": 0.4437, + "step": 36635 + }, + { + "epoch": 1.8601236181797414, + "grad_norm": 0.02545968689149914, + "learning_rate": 1.4814288973837742e-05, + "loss": 0.4414, + "step": 36640 + }, + { + "epoch": 1.860377454277773, + "grad_norm": 0.02262304039898802, + "learning_rate": 1.4760811913477389e-05, + "loss": 0.4643, + "step": 36645 + }, + { + "epoch": 1.8606312903758044, + "grad_norm": 0.022296739978009437, + "learning_rate": 1.4707430103149732e-05, + "loss": 0.4651, + "step": 36650 + }, + { + "epoch": 1.860885126473836, + "grad_norm": 0.025642319286203506, + "learning_rate": 1.4654143553333387e-05, + "loss": 0.4729, + "step": 36655 + }, + { + "epoch": 1.8611389625718675, + "grad_norm": 0.02273311192918183, + "learning_rate": 1.4600952274488265e-05, + "loss": 0.4587, + "step": 36660 + }, + { + "epoch": 1.8613927986698988, + "grad_norm": 0.02116679503489843, + "learning_rate": 1.4547856277055571e-05, + "loss": 0.4557, + "step": 36665 + }, + { + "epoch": 1.8616466347679304, + "grad_norm": 0.021316010404225146, + "learning_rate": 1.4494855571457633e-05, + "loss": 0.4697, + "step": 36670 + }, + { + "epoch": 1.8619004708659619, + "grad_norm": 0.02350879271004433, + "learning_rate": 1.4441950168098406e-05, + "loss": 0.4464, + "step": 36675 + }, + { + "epoch": 1.8621543069639932, + "grad_norm": 0.023258720770063683, + "learning_rate": 1.4389140077362916e-05, + "loss": 0.4375, + "step": 36680 + }, + { + "epoch": 1.8624081430620247, + "grad_norm": 0.021273891527511624, + "learning_rate": 1.433642530961743e-05, + "loss": 0.4552, + "step": 36685 + }, + { + "epoch": 1.8626619791600563, + "grad_norm": 0.02136026789458103, + "learning_rate": 1.4283805875209721e-05, + "loss": 0.4587, + "step": 36690 + }, + { + "epoch": 1.8629158152580878, + "grad_norm": 0.02214731165919245, + "learning_rate": 1.4231281784468587e-05, + "loss": 0.4264, + "step": 36695 + }, + { + "epoch": 1.8631696513561193, + "grad_norm": 0.02350780328623469, + "learning_rate": 1.4178853047704388e-05, + "loss": 0.4499, + "step": 36700 + }, + { + "epoch": 1.8634234874541509, + "grad_norm": 0.03141213231106275, + "learning_rate": 1.412651967520845e-05, + "loss": 0.4694, + "step": 36705 + }, + { + "epoch": 1.8636773235521824, + "grad_norm": 0.023866885124020652, + "learning_rate": 1.4074281677253719e-05, + "loss": 0.4456, + "step": 36710 + }, + { + "epoch": 1.863931159650214, + "grad_norm": 0.020431070665186554, + "learning_rate": 1.4022139064094164e-05, + "loss": 0.4463, + "step": 36715 + }, + { + "epoch": 1.8641849957482455, + "grad_norm": 0.021642398766930707, + "learning_rate": 1.3970091845965205e-05, + "loss": 0.4466, + "step": 36720 + }, + { + "epoch": 1.864438831846277, + "grad_norm": 0.02411974345336584, + "learning_rate": 1.3918140033083338e-05, + "loss": 0.4958, + "step": 36725 + }, + { + "epoch": 1.8646926679443083, + "grad_norm": 0.024224467721648102, + "learning_rate": 1.3866283635646515e-05, + "loss": 0.4691, + "step": 36730 + }, + { + "epoch": 1.8649465040423399, + "grad_norm": 0.02057402120977364, + "learning_rate": 1.3814522663833761e-05, + "loss": 0.4473, + "step": 36735 + }, + { + "epoch": 1.8652003401403714, + "grad_norm": 0.021182543447772897, + "learning_rate": 1.3762857127805727e-05, + "loss": 0.4408, + "step": 36740 + }, + { + "epoch": 1.8654541762384027, + "grad_norm": 0.025082497648913876, + "learning_rate": 1.3711287037703913e-05, + "loss": 0.4422, + "step": 36745 + }, + { + "epoch": 1.8657080123364342, + "grad_norm": 0.023947880929159032, + "learning_rate": 1.3659812403651439e-05, + "loss": 0.464, + "step": 36750 + }, + { + "epoch": 1.8659618484344658, + "grad_norm": 0.0255745124388733, + "learning_rate": 1.3608433235752282e-05, + "loss": 0.4467, + "step": 36755 + }, + { + "epoch": 1.8662156845324973, + "grad_norm": 0.02307802412282208, + "learning_rate": 1.355714954409215e-05, + "loss": 0.4477, + "step": 36760 + }, + { + "epoch": 1.8664695206305288, + "grad_norm": 0.02612322796667025, + "learning_rate": 1.3505961338737604e-05, + "loss": 0.4463, + "step": 36765 + }, + { + "epoch": 1.8667233567285604, + "grad_norm": 0.029078431865484707, + "learning_rate": 1.3454868629736771e-05, + "loss": 0.4497, + "step": 36770 + }, + { + "epoch": 1.866977192826592, + "grad_norm": 0.025822399311491818, + "learning_rate": 1.3403871427118798e-05, + "loss": 0.4527, + "step": 36775 + }, + { + "epoch": 1.8672310289246234, + "grad_norm": 0.022291834000361617, + "learning_rate": 1.3352969740894228e-05, + "loss": 0.4711, + "step": 36780 + }, + { + "epoch": 1.867484865022655, + "grad_norm": 0.02647393100605352, + "learning_rate": 1.3302163581054793e-05, + "loss": 0.4574, + "step": 36785 + }, + { + "epoch": 1.8677387011206865, + "grad_norm": 0.023446096336786612, + "learning_rate": 1.3251452957573517e-05, + "loss": 0.4738, + "step": 36790 + }, + { + "epoch": 1.8679925372187178, + "grad_norm": 0.02127291076715947, + "learning_rate": 1.3200837880404548e-05, + "loss": 0.4877, + "step": 36795 + }, + { + "epoch": 1.8682463733167494, + "grad_norm": 0.024906256232495056, + "learning_rate": 1.3150318359483437e-05, + "loss": 0.447, + "step": 36800 + }, + { + "epoch": 1.868500209414781, + "grad_norm": 0.02057976417447868, + "learning_rate": 1.3099894404726976e-05, + "loss": 0.4736, + "step": 36805 + }, + { + "epoch": 1.8687540455128122, + "grad_norm": 0.02492191020600024, + "learning_rate": 1.3049566026033022e-05, + "loss": 0.4711, + "step": 36810 + }, + { + "epoch": 1.8690078816108437, + "grad_norm": 0.02420601274081891, + "learning_rate": 1.2999333233280896e-05, + "loss": 0.4365, + "step": 36815 + }, + { + "epoch": 1.8692617177088753, + "grad_norm": 0.020254390572854473, + "learning_rate": 1.294919603633088e-05, + "loss": 0.439, + "step": 36820 + }, + { + "epoch": 1.8695155538069068, + "grad_norm": 0.02474683239453854, + "learning_rate": 1.2899154445024874e-05, + "loss": 0.4541, + "step": 36825 + }, + { + "epoch": 1.8697693899049384, + "grad_norm": 0.03001458417928382, + "learning_rate": 1.2849208469185636e-05, + "loss": 0.4811, + "step": 36830 + }, + { + "epoch": 1.8700232260029699, + "grad_norm": 0.021337265901394033, + "learning_rate": 1.2799358118617377e-05, + "loss": 0.4643, + "step": 36835 + }, + { + "epoch": 1.8702770621010014, + "grad_norm": 0.02208717085882638, + "learning_rate": 1.2749603403105437e-05, + "loss": 0.4623, + "step": 36840 + }, + { + "epoch": 1.870530898199033, + "grad_norm": 0.02100059123832777, + "learning_rate": 1.2699944332416502e-05, + "loss": 0.4391, + "step": 36845 + }, + { + "epoch": 1.8707847342970645, + "grad_norm": 0.023998206914377488, + "learning_rate": 1.2650380916298222e-05, + "loss": 0.4945, + "step": 36850 + }, + { + "epoch": 1.871038570395096, + "grad_norm": 0.022644141112716502, + "learning_rate": 1.2600913164479811e-05, + "loss": 0.443, + "step": 36855 + }, + { + "epoch": 1.8712924064931273, + "grad_norm": 0.019765022307224444, + "learning_rate": 1.2551541086671447e-05, + "loss": 0.4424, + "step": 36860 + }, + { + "epoch": 1.8715462425911589, + "grad_norm": 0.021694636824496228, + "learning_rate": 1.2502264692564768e-05, + "loss": 0.4575, + "step": 36865 + }, + { + "epoch": 1.8718000786891904, + "grad_norm": 0.020689782081024555, + "learning_rate": 1.2453083991832258e-05, + "loss": 0.4464, + "step": 36870 + }, + { + "epoch": 1.872053914787222, + "grad_norm": 0.02173020264000965, + "learning_rate": 1.2403998994128085e-05, + "loss": 0.4299, + "step": 36875 + }, + { + "epoch": 1.8723077508852533, + "grad_norm": 0.023926802677357032, + "learning_rate": 1.2355009709087205e-05, + "loss": 0.4544, + "step": 36880 + }, + { + "epoch": 1.8725615869832848, + "grad_norm": 0.024662532571645124, + "learning_rate": 1.2306116146326096e-05, + "loss": 0.4503, + "step": 36885 + }, + { + "epoch": 1.8728154230813163, + "grad_norm": 0.022866472003658402, + "learning_rate": 1.225731831544219e-05, + "loss": 0.4684, + "step": 36890 + }, + { + "epoch": 1.8730692591793479, + "grad_norm": 0.02737229931132882, + "learning_rate": 1.220861622601438e-05, + "loss": 0.4691, + "step": 36895 + }, + { + "epoch": 1.8733230952773794, + "grad_norm": 0.02045519299338699, + "learning_rate": 1.2160009887602575e-05, + "loss": 0.439, + "step": 36900 + }, + { + "epoch": 1.873576931375411, + "grad_norm": 0.02105021426485451, + "learning_rate": 1.2111499309747975e-05, + "loss": 0.4512, + "step": 36905 + }, + { + "epoch": 1.8738307674734425, + "grad_norm": 0.023234909473193354, + "learning_rate": 1.2063084501972966e-05, + "loss": 0.4605, + "step": 36910 + }, + { + "epoch": 1.874084603571474, + "grad_norm": 0.02495377423784143, + "learning_rate": 1.2014765473781053e-05, + "loss": 0.4709, + "step": 36915 + }, + { + "epoch": 1.8743384396695055, + "grad_norm": 0.01982317350160172, + "learning_rate": 1.1966542234657208e-05, + "loss": 0.4205, + "step": 36920 + }, + { + "epoch": 1.874592275767537, + "grad_norm": 0.023080934785440023, + "learning_rate": 1.1918414794067244e-05, + "loss": 0.4355, + "step": 36925 + }, + { + "epoch": 1.8748461118655684, + "grad_norm": 0.022797451618634154, + "learning_rate": 1.1870383161458497e-05, + "loss": 0.4465, + "step": 36930 + }, + { + "epoch": 1.8750999479636, + "grad_norm": 0.0241812112755731, + "learning_rate": 1.182244734625909e-05, + "loss": 0.4426, + "step": 36935 + }, + { + "epoch": 1.8753537840616314, + "grad_norm": 0.023434936541055627, + "learning_rate": 1.1774607357878886e-05, + "loss": 0.4484, + "step": 36940 + }, + { + "epoch": 1.8756076201596628, + "grad_norm": 0.02410828643170135, + "learning_rate": 1.1726863205708372e-05, + "loss": 0.4408, + "step": 36945 + }, + { + "epoch": 1.8758614562576943, + "grad_norm": 0.022411527527885312, + "learning_rate": 1.1679214899119605e-05, + "loss": 0.4561, + "step": 36950 + }, + { + "epoch": 1.8761152923557258, + "grad_norm": 0.021883107784838412, + "learning_rate": 1.1631662447465719e-05, + "loss": 0.4449, + "step": 36955 + }, + { + "epoch": 1.8763691284537574, + "grad_norm": 0.02541344155482163, + "learning_rate": 1.1584205860081021e-05, + "loss": 0.4598, + "step": 36960 + }, + { + "epoch": 1.876622964551789, + "grad_norm": 0.02445820739790313, + "learning_rate": 1.153684514628095e-05, + "loss": 0.4213, + "step": 36965 + }, + { + "epoch": 1.8768768006498204, + "grad_norm": 0.022274003744240302, + "learning_rate": 1.1489580315362292e-05, + "loss": 0.4674, + "step": 36970 + }, + { + "epoch": 1.877130636747852, + "grad_norm": 0.02302479732283661, + "learning_rate": 1.1442411376602679e-05, + "loss": 0.4447, + "step": 36975 + }, + { + "epoch": 1.8773844728458835, + "grad_norm": 0.023099210723607017, + "learning_rate": 1.139533833926143e-05, + "loss": 0.4334, + "step": 36980 + }, + { + "epoch": 1.877638308943915, + "grad_norm": 0.02557626233760811, + "learning_rate": 1.1348361212578484e-05, + "loss": 0.4435, + "step": 36985 + }, + { + "epoch": 1.8778921450419466, + "grad_norm": 0.02197010518349448, + "learning_rate": 1.1301480005775412e-05, + "loss": 0.4739, + "step": 36990 + }, + { + "epoch": 1.8781459811399779, + "grad_norm": 0.025263565398403336, + "learning_rate": 1.1254694728054626e-05, + "loss": 0.4556, + "step": 36995 + }, + { + "epoch": 1.8783998172380094, + "grad_norm": 0.021885371029893786, + "learning_rate": 1.1208005388599951e-05, + "loss": 0.4652, + "step": 37000 + }, + { + "epoch": 1.878653653336041, + "grad_norm": 0.023486472540742757, + "learning_rate": 1.1161411996576165e-05, + "loss": 0.4667, + "step": 37005 + }, + { + "epoch": 1.8789074894340723, + "grad_norm": 0.022717357060794766, + "learning_rate": 1.1114914561129396e-05, + "loss": 0.4579, + "step": 37010 + }, + { + "epoch": 1.8791613255321038, + "grad_norm": 0.021029880026985255, + "learning_rate": 1.106851309138679e-05, + "loss": 0.4903, + "step": 37015 + }, + { + "epoch": 1.8794151616301353, + "grad_norm": 0.020524349147354494, + "learning_rate": 1.1022207596456835e-05, + "loss": 0.4325, + "step": 37020 + }, + { + "epoch": 1.8796689977281669, + "grad_norm": 0.026662156059636153, + "learning_rate": 1.0975998085428984e-05, + "loss": 0.4224, + "step": 37025 + }, + { + "epoch": 1.8799228338261984, + "grad_norm": 0.02148292116153404, + "learning_rate": 1.0929884567373927e-05, + "loss": 0.4643, + "step": 37030 + }, + { + "epoch": 1.88017666992423, + "grad_norm": 0.030321978223338382, + "learning_rate": 1.0883867051343533e-05, + "loss": 0.4631, + "step": 37035 + }, + { + "epoch": 1.8804305060222615, + "grad_norm": 0.02090326478040996, + "learning_rate": 1.0837945546370798e-05, + "loss": 0.4325, + "step": 37040 + }, + { + "epoch": 1.880684342120293, + "grad_norm": 0.02145454006132866, + "learning_rate": 1.0792120061469956e-05, + "loss": 0.4412, + "step": 37045 + }, + { + "epoch": 1.8809381782183245, + "grad_norm": 0.022566197322495922, + "learning_rate": 1.0746390605636259e-05, + "loss": 0.463, + "step": 37050 + }, + { + "epoch": 1.881192014316356, + "grad_norm": 0.027247127025454444, + "learning_rate": 1.0700757187846188e-05, + "loss": 0.4749, + "step": 37055 + }, + { + "epoch": 1.8814458504143874, + "grad_norm": 0.02154384889356073, + "learning_rate": 1.065521981705736e-05, + "loss": 0.429, + "step": 37060 + }, + { + "epoch": 1.881699686512419, + "grad_norm": 0.026084199584670894, + "learning_rate": 1.0609778502208512e-05, + "loss": 0.4497, + "step": 37065 + }, + { + "epoch": 1.8819535226104505, + "grad_norm": 0.020525637238970808, + "learning_rate": 1.0564433252219507e-05, + "loss": 0.4537, + "step": 37070 + }, + { + "epoch": 1.8822073587084818, + "grad_norm": 0.020483461060641535, + "learning_rate": 1.0519184075991505e-05, + "loss": 0.4505, + "step": 37075 + }, + { + "epoch": 1.8824611948065133, + "grad_norm": 0.02326068817822505, + "learning_rate": 1.0474030982406624e-05, + "loss": 0.4732, + "step": 37080 + }, + { + "epoch": 1.8827150309045448, + "grad_norm": 0.019163506578638276, + "learning_rate": 1.0428973980328216e-05, + "loss": 0.441, + "step": 37085 + }, + { + "epoch": 1.8829688670025764, + "grad_norm": 0.018667241598925556, + "learning_rate": 1.038401307860065e-05, + "loss": 0.435, + "step": 37090 + }, + { + "epoch": 1.883222703100608, + "grad_norm": 0.02123912140480303, + "learning_rate": 1.0339148286049705e-05, + "loss": 0.4478, + "step": 37095 + }, + { + "epoch": 1.8834765391986394, + "grad_norm": 0.021748725396268195, + "learning_rate": 1.0294379611481885e-05, + "loss": 0.4382, + "step": 37100 + }, + { + "epoch": 1.883730375296671, + "grad_norm": 0.021313373453137146, + "learning_rate": 1.0249707063685277e-05, + "loss": 0.4929, + "step": 37105 + }, + { + "epoch": 1.8839842113947025, + "grad_norm": 0.020814925335928592, + "learning_rate": 1.0205130651428806e-05, + "loss": 0.4589, + "step": 37110 + }, + { + "epoch": 1.884238047492734, + "grad_norm": 0.019883791376370075, + "learning_rate": 1.0160650383462588e-05, + "loss": 0.4311, + "step": 37115 + }, + { + "epoch": 1.8844918835907656, + "grad_norm": 0.025265388503289, + "learning_rate": 1.0116266268517805e-05, + "loss": 0.4132, + "step": 37120 + }, + { + "epoch": 1.884745719688797, + "grad_norm": 0.024819525773685097, + "learning_rate": 1.0071978315306984e-05, + "loss": 0.4526, + "step": 37125 + }, + { + "epoch": 1.8849995557868284, + "grad_norm": 0.02126190229536664, + "learning_rate": 1.0027786532523508e-05, + "loss": 0.4589, + "step": 37130 + }, + { + "epoch": 1.88525339188486, + "grad_norm": 0.022454913487583175, + "learning_rate": 9.983690928842105e-06, + "loss": 0.4721, + "step": 37135 + }, + { + "epoch": 1.8855072279828915, + "grad_norm": 0.03277420429629867, + "learning_rate": 9.939691512918404e-06, + "loss": 0.4367, + "step": 37140 + }, + { + "epoch": 1.8857610640809228, + "grad_norm": 0.022196508671986213, + "learning_rate": 9.895788293389385e-06, + "loss": 0.4626, + "step": 37145 + }, + { + "epoch": 1.8860149001789543, + "grad_norm": 0.022168660914226923, + "learning_rate": 9.851981278872878e-06, + "loss": 0.4689, + "step": 37150 + }, + { + "epoch": 1.8862687362769859, + "grad_norm": 0.02563443384862334, + "learning_rate": 9.808270477968173e-06, + "loss": 0.4455, + "step": 37155 + }, + { + "epoch": 1.8865225723750174, + "grad_norm": 0.021376673277626792, + "learning_rate": 9.764655899255347e-06, + "loss": 0.4679, + "step": 37160 + }, + { + "epoch": 1.886776408473049, + "grad_norm": 0.026018940765089024, + "learning_rate": 9.721137551295778e-06, + "loss": 0.4601, + "step": 37165 + }, + { + "epoch": 1.8870302445710805, + "grad_norm": 0.023643143077169092, + "learning_rate": 9.677715442631962e-06, + "loss": 0.4454, + "step": 37170 + }, + { + "epoch": 1.887284080669112, + "grad_norm": 0.025955682817119883, + "learning_rate": 9.63438958178725e-06, + "loss": 0.47, + "step": 37175 + }, + { + "epoch": 1.8875379167671436, + "grad_norm": 0.02096935393978991, + "learning_rate": 9.591159977266506e-06, + "loss": 0.4425, + "step": 37180 + }, + { + "epoch": 1.887791752865175, + "grad_norm": 0.018616468472140536, + "learning_rate": 9.54802663755533e-06, + "loss": 0.4415, + "step": 37185 + }, + { + "epoch": 1.8880455889632066, + "grad_norm": 0.02545228535890788, + "learning_rate": 9.504989571120726e-06, + "loss": 0.4677, + "step": 37190 + }, + { + "epoch": 1.888299425061238, + "grad_norm": 0.022999000833790844, + "learning_rate": 9.462048786410492e-06, + "loss": 0.4482, + "step": 37195 + }, + { + "epoch": 1.8885532611592695, + "grad_norm": 0.02212084208825238, + "learning_rate": 9.419204291853834e-06, + "loss": 0.4339, + "step": 37200 + }, + { + "epoch": 1.888807097257301, + "grad_norm": 0.024284937751072763, + "learning_rate": 9.376456095860798e-06, + "loss": 0.4601, + "step": 37205 + }, + { + "epoch": 1.8890609333553323, + "grad_norm": 0.02382615118490866, + "learning_rate": 9.333804206822726e-06, + "loss": 0.4606, + "step": 37210 + }, + { + "epoch": 1.8893147694533639, + "grad_norm": 0.021486485098027707, + "learning_rate": 9.291248633111927e-06, + "loss": 0.436, + "step": 37215 + }, + { + "epoch": 1.8895686055513954, + "grad_norm": 0.02341752435747304, + "learning_rate": 9.248789383081879e-06, + "loss": 0.4725, + "step": 37220 + }, + { + "epoch": 1.889822441649427, + "grad_norm": 0.027777059793671188, + "learning_rate": 9.206426465067031e-06, + "loss": 0.4456, + "step": 37225 + }, + { + "epoch": 1.8900762777474585, + "grad_norm": 0.021359684770520977, + "learning_rate": 9.164159887383172e-06, + "loss": 0.4423, + "step": 37230 + }, + { + "epoch": 1.89033011384549, + "grad_norm": 0.02349495122091391, + "learning_rate": 9.12198965832689e-06, + "loss": 0.46, + "step": 37235 + }, + { + "epoch": 1.8905839499435215, + "grad_norm": 0.022308573567443774, + "learning_rate": 9.079915786176063e-06, + "loss": 0.4566, + "step": 37240 + }, + { + "epoch": 1.890837786041553, + "grad_norm": 0.020770175767240735, + "learning_rate": 9.037938279189528e-06, + "loss": 0.4211, + "step": 37245 + }, + { + "epoch": 1.8910916221395846, + "grad_norm": 0.0222815680616627, + "learning_rate": 8.996057145607306e-06, + "loss": 0.4519, + "step": 37250 + }, + { + "epoch": 1.8913454582376161, + "grad_norm": 0.02139157051540678, + "learning_rate": 8.95427239365043e-06, + "loss": 0.4423, + "step": 37255 + }, + { + "epoch": 1.8915992943356474, + "grad_norm": 0.021039178876140388, + "learning_rate": 8.912584031521065e-06, + "loss": 0.4599, + "step": 37260 + }, + { + "epoch": 1.891853130433679, + "grad_norm": 0.031078033789477582, + "learning_rate": 8.870992067402384e-06, + "loss": 0.4544, + "step": 37265 + }, + { + "epoch": 1.8921069665317105, + "grad_norm": 0.02651545626840905, + "learning_rate": 8.82949650945869e-06, + "loss": 0.4506, + "step": 37270 + }, + { + "epoch": 1.8923608026297418, + "grad_norm": 0.018840248290589404, + "learning_rate": 8.788097365835358e-06, + "loss": 0.4068, + "step": 37275 + }, + { + "epoch": 1.8926146387277734, + "grad_norm": 0.02098145796800753, + "learning_rate": 8.746794644658828e-06, + "loss": 0.4254, + "step": 37280 + }, + { + "epoch": 1.892868474825805, + "grad_norm": 0.023778470835364184, + "learning_rate": 8.705588354036676e-06, + "loss": 0.4727, + "step": 37285 + }, + { + "epoch": 1.8931223109238364, + "grad_norm": 0.019800277404283806, + "learning_rate": 8.664478502057427e-06, + "loss": 0.4189, + "step": 37290 + }, + { + "epoch": 1.893376147021868, + "grad_norm": 0.022580776759673696, + "learning_rate": 8.623465096790794e-06, + "loss": 0.4688, + "step": 37295 + }, + { + "epoch": 1.8936299831198995, + "grad_norm": 0.024504692780699923, + "learning_rate": 8.582548146287395e-06, + "loss": 0.4415, + "step": 37300 + }, + { + "epoch": 1.893883819217931, + "grad_norm": 0.023773581217463372, + "learning_rate": 8.541727658579191e-06, + "loss": 0.4546, + "step": 37305 + }, + { + "epoch": 1.8941376553159626, + "grad_norm": 0.02457580274560056, + "learning_rate": 8.501003641678885e-06, + "loss": 0.4493, + "step": 37310 + }, + { + "epoch": 1.894391491413994, + "grad_norm": 0.023069551301224907, + "learning_rate": 8.460376103580526e-06, + "loss": 0.4494, + "step": 37315 + }, + { + "epoch": 1.8946453275120256, + "grad_norm": 0.02501756577031343, + "learning_rate": 8.419845052258956e-06, + "loss": 0.4785, + "step": 37320 + }, + { + "epoch": 1.894899163610057, + "grad_norm": 0.025329116897176897, + "learning_rate": 8.37941049567037e-06, + "loss": 0.4603, + "step": 37325 + }, + { + "epoch": 1.8951529997080885, + "grad_norm": 0.023975660111064893, + "learning_rate": 8.339072441751749e-06, + "loss": 0.4487, + "step": 37330 + }, + { + "epoch": 1.89540683580612, + "grad_norm": 0.022681786439959096, + "learning_rate": 8.298830898421316e-06, + "loss": 0.4756, + "step": 37335 + }, + { + "epoch": 1.8956606719041513, + "grad_norm": 0.019839744977402773, + "learning_rate": 8.258685873578198e-06, + "loss": 0.4743, + "step": 37340 + }, + { + "epoch": 1.8959145080021829, + "grad_norm": 0.029110595946668913, + "learning_rate": 8.218637375102866e-06, + "loss": 0.4219, + "step": 37345 + }, + { + "epoch": 1.8961683441002144, + "grad_norm": 0.02468212485542408, + "learning_rate": 8.178685410856424e-06, + "loss": 0.4636, + "step": 37350 + }, + { + "epoch": 1.896422180198246, + "grad_norm": 0.023021032923387984, + "learning_rate": 8.138829988681318e-06, + "loss": 0.4565, + "step": 37355 + }, + { + "epoch": 1.8966760162962775, + "grad_norm": 0.021806816479458407, + "learning_rate": 8.09907111640107e-06, + "loss": 0.4547, + "step": 37360 + }, + { + "epoch": 1.896929852394309, + "grad_norm": 0.028433865062280395, + "learning_rate": 8.059408801819934e-06, + "loss": 0.4506, + "step": 37365 + }, + { + "epoch": 1.8971836884923405, + "grad_norm": 0.024983454984375437, + "learning_rate": 8.01984305272363e-06, + "loss": 0.4492, + "step": 37370 + }, + { + "epoch": 1.897437524590372, + "grad_norm": 0.022957101384112662, + "learning_rate": 7.98037387687861e-06, + "loss": 0.4389, + "step": 37375 + }, + { + "epoch": 1.8976913606884036, + "grad_norm": 0.018439604142735946, + "learning_rate": 7.941001282032512e-06, + "loss": 0.4117, + "step": 37380 + }, + { + "epoch": 1.8979451967864351, + "grad_norm": 0.027207353586784774, + "learning_rate": 7.90172527591393e-06, + "loss": 0.4506, + "step": 37385 + }, + { + "epoch": 1.8981990328844665, + "grad_norm": 0.027055180572066115, + "learning_rate": 7.862545866232585e-06, + "loss": 0.4689, + "step": 37390 + }, + { + "epoch": 1.898452868982498, + "grad_norm": 0.026896364230939135, + "learning_rate": 7.823463060679215e-06, + "loss": 0.4432, + "step": 37395 + }, + { + "epoch": 1.8987067050805295, + "grad_norm": 0.02423512351795487, + "learning_rate": 7.784476866925571e-06, + "loss": 0.4585, + "step": 37400 + }, + { + "epoch": 1.898960541178561, + "grad_norm": 0.02515565929962076, + "learning_rate": 7.745587292624423e-06, + "loss": 0.4585, + "step": 37405 + }, + { + "epoch": 1.8992143772765924, + "grad_norm": 0.020797485192393404, + "learning_rate": 7.706794345409662e-06, + "loss": 0.4633, + "step": 37410 + }, + { + "epoch": 1.899468213374624, + "grad_norm": 0.021221061620204006, + "learning_rate": 7.668098032896086e-06, + "loss": 0.4474, + "step": 37415 + }, + { + "epoch": 1.8997220494726554, + "grad_norm": 0.02165854297022104, + "learning_rate": 7.629498362679621e-06, + "loss": 0.4516, + "step": 37420 + }, + { + "epoch": 1.899975885570687, + "grad_norm": 0.021157573658216487, + "learning_rate": 7.590995342337148e-06, + "loss": 0.451, + "step": 37425 + }, + { + "epoch": 1.9002297216687185, + "grad_norm": 0.022283635644036758, + "learning_rate": 7.552588979426733e-06, + "loss": 0.4755, + "step": 37430 + }, + { + "epoch": 1.90048355776675, + "grad_norm": 0.024413711078674752, + "learning_rate": 7.514279281487179e-06, + "loss": 0.4585, + "step": 37435 + }, + { + "epoch": 1.9007373938647816, + "grad_norm": 0.022093763977813735, + "learning_rate": 7.476066256038638e-06, + "loss": 0.4473, + "step": 37440 + }, + { + "epoch": 1.9009912299628131, + "grad_norm": 0.021626286158038993, + "learning_rate": 7.437949910581998e-06, + "loss": 0.4354, + "step": 37445 + }, + { + "epoch": 1.9012450660608446, + "grad_norm": 0.0273165842843997, + "learning_rate": 7.399930252599496e-06, + "loss": 0.4737, + "step": 37450 + }, + { + "epoch": 1.901498902158876, + "grad_norm": 0.023472801346320262, + "learning_rate": 7.362007289553996e-06, + "loss": 0.4606, + "step": 37455 + }, + { + "epoch": 1.9017527382569075, + "grad_norm": 0.025848419421606466, + "learning_rate": 7.324181028889709e-06, + "loss": 0.4439, + "step": 37460 + }, + { + "epoch": 1.902006574354939, + "grad_norm": 0.022098946912295272, + "learning_rate": 7.286451478031753e-06, + "loss": 0.4525, + "step": 37465 + }, + { + "epoch": 1.9022604104529706, + "grad_norm": 0.020132077745634438, + "learning_rate": 7.2488186443862015e-06, + "loss": 0.4505, + "step": 37470 + }, + { + "epoch": 1.9025142465510019, + "grad_norm": 0.021696897376979055, + "learning_rate": 7.211282535340202e-06, + "loss": 0.4805, + "step": 37475 + }, + { + "epoch": 1.9027680826490334, + "grad_norm": 0.027185595143177976, + "learning_rate": 7.173843158261861e-06, + "loss": 0.4553, + "step": 37480 + }, + { + "epoch": 1.903021918747065, + "grad_norm": 0.02056624187081856, + "learning_rate": 7.136500520500466e-06, + "loss": 0.4245, + "step": 37485 + }, + { + "epoch": 1.9032757548450965, + "grad_norm": 0.020926738183475128, + "learning_rate": 7.0992546293860425e-06, + "loss": 0.4228, + "step": 37490 + }, + { + "epoch": 1.903529590943128, + "grad_norm": 0.028740551485723114, + "learning_rate": 7.062105492229909e-06, + "loss": 0.459, + "step": 37495 + }, + { + "epoch": 1.9037834270411595, + "grad_norm": 0.053682388754172104, + "learning_rate": 7.02505311632412e-06, + "loss": 0.4424, + "step": 37500 + }, + { + "epoch": 1.904037263139191, + "grad_norm": 0.02173561656109306, + "learning_rate": 6.988097508942026e-06, + "loss": 0.4822, + "step": 37505 + }, + { + "epoch": 1.9042910992372226, + "grad_norm": 0.020845652238898626, + "learning_rate": 6.951238677337657e-06, + "loss": 0.4551, + "step": 37510 + }, + { + "epoch": 1.9045449353352542, + "grad_norm": 0.022442971026077695, + "learning_rate": 6.914476628746391e-06, + "loss": 0.4574, + "step": 37515 + }, + { + "epoch": 1.9047987714332857, + "grad_norm": 0.019100573728677607, + "learning_rate": 6.8778113703842345e-06, + "loss": 0.441, + "step": 37520 + }, + { + "epoch": 1.905052607531317, + "grad_norm": 0.022661668431873734, + "learning_rate": 6.8412429094485975e-06, + "loss": 0.4653, + "step": 37525 + }, + { + "epoch": 1.9053064436293485, + "grad_norm": 0.022521358187594164, + "learning_rate": 6.80477125311757e-06, + "loss": 0.459, + "step": 37530 + }, + { + "epoch": 1.90556027972738, + "grad_norm": 0.02587584279617253, + "learning_rate": 6.768396408550426e-06, + "loss": 0.434, + "step": 37535 + }, + { + "epoch": 1.9058141158254114, + "grad_norm": 0.02722351063157932, + "learning_rate": 6.732118382887287e-06, + "loss": 0.4388, + "step": 37540 + }, + { + "epoch": 1.906067951923443, + "grad_norm": 0.02166094929641842, + "learning_rate": 6.695937183249401e-06, + "loss": 0.4502, + "step": 37545 + }, + { + "epoch": 1.9063217880214745, + "grad_norm": 0.02157001533468999, + "learning_rate": 6.6598528167389205e-06, + "loss": 0.4463, + "step": 37550 + }, + { + "epoch": 1.906575624119506, + "grad_norm": 0.02241970910672534, + "learning_rate": 6.623865290439068e-06, + "loss": 0.4456, + "step": 37555 + }, + { + "epoch": 1.9068294602175375, + "grad_norm": 0.021738008804449923, + "learning_rate": 6.587974611413972e-06, + "loss": 0.4634, + "step": 37560 + }, + { + "epoch": 1.907083296315569, + "grad_norm": 0.021239611101420534, + "learning_rate": 6.552180786708828e-06, + "loss": 0.4402, + "step": 37565 + }, + { + "epoch": 1.9073371324136006, + "grad_norm": 0.02101083088232794, + "learning_rate": 6.516483823349795e-06, + "loss": 0.4725, + "step": 37570 + }, + { + "epoch": 1.9075909685116321, + "grad_norm": 0.025164203687074636, + "learning_rate": 6.480883728343989e-06, + "loss": 0.4614, + "step": 37575 + }, + { + "epoch": 1.9078448046096637, + "grad_norm": 0.023582969514340343, + "learning_rate": 6.445380508679488e-06, + "loss": 0.4138, + "step": 37580 + }, + { + "epoch": 1.9080986407076952, + "grad_norm": 0.018915533270275835, + "learning_rate": 6.4099741713254945e-06, + "loss": 0.432, + "step": 37585 + }, + { + "epoch": 1.9083524768057265, + "grad_norm": 0.02275092358064681, + "learning_rate": 6.374664723232004e-06, + "loss": 0.4535, + "step": 37590 + }, + { + "epoch": 1.908606312903758, + "grad_norm": 0.022085053146593946, + "learning_rate": 6.33945217133014e-06, + "loss": 0.4422, + "step": 37595 + }, + { + "epoch": 1.9088601490017896, + "grad_norm": 0.020867827787237652, + "learning_rate": 6.304336522531928e-06, + "loss": 0.462, + "step": 37600 + }, + { + "epoch": 1.9091139850998209, + "grad_norm": 0.024270037364154493, + "learning_rate": 6.26931778373041e-06, + "loss": 0.4475, + "step": 37605 + }, + { + "epoch": 1.9093678211978524, + "grad_norm": 0.019787562403379675, + "learning_rate": 6.234395961799588e-06, + "loss": 0.4427, + "step": 37610 + }, + { + "epoch": 1.909621657295884, + "grad_norm": 0.021795151563263483, + "learning_rate": 6.199571063594423e-06, + "loss": 0.4609, + "step": 37615 + }, + { + "epoch": 1.9098754933939155, + "grad_norm": 0.025237377838803695, + "learning_rate": 6.164843095950889e-06, + "loss": 0.4763, + "step": 37620 + }, + { + "epoch": 1.910129329491947, + "grad_norm": 0.0213538500394289, + "learning_rate": 6.13021206568587e-06, + "loss": 0.4668, + "step": 37625 + }, + { + "epoch": 1.9103831655899786, + "grad_norm": 0.019996110614226716, + "learning_rate": 6.095677979597314e-06, + "loss": 0.4321, + "step": 37630 + }, + { + "epoch": 1.91063700168801, + "grad_norm": 0.0239910603152135, + "learning_rate": 6.0612408444640775e-06, + "loss": 0.4418, + "step": 37635 + }, + { + "epoch": 1.9108908377860416, + "grad_norm": 0.01982225962400516, + "learning_rate": 6.026900667045976e-06, + "loss": 0.452, + "step": 37640 + }, + { + "epoch": 1.9111446738840732, + "grad_norm": 0.020978657467764606, + "learning_rate": 5.992657454083839e-06, + "loss": 0.4343, + "step": 37645 + }, + { + "epoch": 1.9113985099821047, + "grad_norm": 0.021612862899188023, + "learning_rate": 5.958511212299455e-06, + "loss": 0.4522, + "step": 37650 + }, + { + "epoch": 1.911652346080136, + "grad_norm": 0.02111789726848896, + "learning_rate": 5.9244619483955206e-06, + "loss": 0.4635, + "step": 37655 + }, + { + "epoch": 1.9119061821781675, + "grad_norm": 0.026163308653677195, + "learning_rate": 5.890509669055799e-06, + "loss": 0.4711, + "step": 37660 + }, + { + "epoch": 1.912160018276199, + "grad_norm": 0.021019925153574526, + "learning_rate": 5.856654380944848e-06, + "loss": 0.4837, + "step": 37665 + }, + { + "epoch": 1.9124138543742304, + "grad_norm": 0.027602896310674144, + "learning_rate": 5.822896090708407e-06, + "loss": 0.4239, + "step": 37670 + }, + { + "epoch": 1.912667690472262, + "grad_norm": 0.021167535487248926, + "learning_rate": 5.789234804972954e-06, + "loss": 0.4552, + "step": 37675 + }, + { + "epoch": 1.9129215265702935, + "grad_norm": 0.023086996155861482, + "learning_rate": 5.755670530346146e-06, + "loss": 0.4514, + "step": 37680 + }, + { + "epoch": 1.913175362668325, + "grad_norm": 0.022681878507942586, + "learning_rate": 5.722203273416326e-06, + "loss": 0.4658, + "step": 37685 + }, + { + "epoch": 1.9134291987663565, + "grad_norm": 0.02359410198455559, + "learning_rate": 5.6888330407531275e-06, + "loss": 0.4964, + "step": 37690 + }, + { + "epoch": 1.913683034864388, + "grad_norm": 0.03200500971381236, + "learning_rate": 5.6555598389068656e-06, + "loss": 0.4776, + "step": 37695 + }, + { + "epoch": 1.9139368709624196, + "grad_norm": 0.022712836996455054, + "learning_rate": 5.622383674408871e-06, + "loss": 0.4376, + "step": 37700 + }, + { + "epoch": 1.9141907070604511, + "grad_norm": 0.025060341466893926, + "learning_rate": 5.589304553771546e-06, + "loss": 0.458, + "step": 37705 + }, + { + "epoch": 1.9144445431584827, + "grad_norm": 0.030278170291182304, + "learning_rate": 5.556322483488086e-06, + "loss": 0.4499, + "step": 37710 + }, + { + "epoch": 1.9146983792565142, + "grad_norm": 0.02740154808328099, + "learning_rate": 5.523437470032755e-06, + "loss": 0.4415, + "step": 37715 + }, + { + "epoch": 1.9149522153545455, + "grad_norm": 0.02153574679548554, + "learning_rate": 5.4906495198607246e-06, + "loss": 0.4241, + "step": 37720 + }, + { + "epoch": 1.915206051452577, + "grad_norm": 0.017912518417006917, + "learning_rate": 5.457958639408067e-06, + "loss": 0.4443, + "step": 37725 + }, + { + "epoch": 1.9154598875506086, + "grad_norm": 0.02149002570361707, + "learning_rate": 5.425364835091817e-06, + "loss": 0.4269, + "step": 37730 + }, + { + "epoch": 1.9157137236486401, + "grad_norm": 0.020655007242153935, + "learning_rate": 5.392868113310023e-06, + "loss": 0.4742, + "step": 37735 + }, + { + "epoch": 1.9159675597466714, + "grad_norm": 0.02053084592337424, + "learning_rate": 5.3604684804416385e-06, + "loss": 0.4562, + "step": 37740 + }, + { + "epoch": 1.916221395844703, + "grad_norm": 0.0241087565783307, + "learning_rate": 5.328165942846519e-06, + "loss": 0.4665, + "step": 37745 + }, + { + "epoch": 1.9164752319427345, + "grad_norm": 0.022566883002394126, + "learning_rate": 5.2959605068654825e-06, + "loss": 0.4356, + "step": 37750 + }, + { + "epoch": 1.916729068040766, + "grad_norm": 0.02242988197114318, + "learning_rate": 5.263852178820305e-06, + "loss": 0.4291, + "step": 37755 + }, + { + "epoch": 1.9169829041387976, + "grad_norm": 0.020849242044697184, + "learning_rate": 5.231840965013668e-06, + "loss": 0.4608, + "step": 37760 + }, + { + "epoch": 1.917236740236829, + "grad_norm": 0.02353852633397079, + "learning_rate": 5.199926871729321e-06, + "loss": 0.476, + "step": 37765 + }, + { + "epoch": 1.9174905763348606, + "grad_norm": 0.022019434675896826, + "learning_rate": 5.1681099052317545e-06, + "loss": 0.4421, + "step": 37770 + }, + { + "epoch": 1.9177444124328922, + "grad_norm": 0.02349710133273863, + "learning_rate": 5.136390071766472e-06, + "loss": 0.4404, + "step": 37775 + }, + { + "epoch": 1.9179982485309237, + "grad_norm": 0.023170632986613243, + "learning_rate": 5.104767377559938e-06, + "loss": 0.4633, + "step": 37780 + }, + { + "epoch": 1.9182520846289552, + "grad_norm": 0.020601169329142837, + "learning_rate": 5.073241828819519e-06, + "loss": 0.4633, + "step": 37785 + }, + { + "epoch": 1.9185059207269866, + "grad_norm": 0.020705639764346858, + "learning_rate": 5.041813431733544e-06, + "loss": 0.441, + "step": 37790 + }, + { + "epoch": 1.918759756825018, + "grad_norm": 0.02556826389734915, + "learning_rate": 5.010482192471244e-06, + "loss": 0.4671, + "step": 37795 + }, + { + "epoch": 1.9190135929230496, + "grad_norm": 0.025036129806181157, + "learning_rate": 4.9792481171828105e-06, + "loss": 0.4469, + "step": 37800 + }, + { + "epoch": 1.919267429021081, + "grad_norm": 0.022980559401410746, + "learning_rate": 4.948111211999284e-06, + "loss": 0.454, + "step": 37805 + }, + { + "epoch": 1.9195212651191125, + "grad_norm": 0.02211175879102414, + "learning_rate": 4.917071483032665e-06, + "loss": 0.4545, + "step": 37810 + }, + { + "epoch": 1.919775101217144, + "grad_norm": 0.021321225074512732, + "learning_rate": 4.886128936375966e-06, + "loss": 0.4651, + "step": 37815 + }, + { + "epoch": 1.9200289373151755, + "grad_norm": 0.021856946004611112, + "learning_rate": 4.855283578103054e-06, + "loss": 0.4445, + "step": 37820 + }, + { + "epoch": 1.920282773413207, + "grad_norm": 0.020187327224539992, + "learning_rate": 4.824535414268638e-06, + "loss": 0.4632, + "step": 37825 + }, + { + "epoch": 1.9205366095112386, + "grad_norm": 0.023155717064296796, + "learning_rate": 4.793884450908559e-06, + "loss": 0.4565, + "step": 37830 + }, + { + "epoch": 1.9207904456092701, + "grad_norm": 0.023520053434749763, + "learning_rate": 4.763330694039281e-06, + "loss": 0.4615, + "step": 37835 + }, + { + "epoch": 1.9210442817073017, + "grad_norm": 0.02152645989454147, + "learning_rate": 4.7328741496585615e-06, + "loss": 0.4672, + "step": 37840 + }, + { + "epoch": 1.9212981178053332, + "grad_norm": 0.020920871384161287, + "learning_rate": 4.7025148237446745e-06, + "loss": 0.462, + "step": 37845 + }, + { + "epoch": 1.9215519539033648, + "grad_norm": 0.02171301262060699, + "learning_rate": 4.672252722257076e-06, + "loss": 0.4595, + "step": 37850 + }, + { + "epoch": 1.921805790001396, + "grad_norm": 0.02813041922076858, + "learning_rate": 4.642087851136123e-06, + "loss": 0.4462, + "step": 37855 + }, + { + "epoch": 1.9220596260994276, + "grad_norm": 0.025961513788743708, + "learning_rate": 4.61202021630297e-06, + "loss": 0.4425, + "step": 37860 + }, + { + "epoch": 1.9223134621974591, + "grad_norm": 0.031507528395826975, + "learning_rate": 4.582049823659673e-06, + "loss": 0.4624, + "step": 37865 + }, + { + "epoch": 1.9225672982954904, + "grad_norm": 0.022048534282361677, + "learning_rate": 4.55217667908947e-06, + "loss": 0.467, + "step": 37870 + }, + { + "epoch": 1.922821134393522, + "grad_norm": 0.02306257762735943, + "learning_rate": 4.522400788456115e-06, + "loss": 0.4674, + "step": 37875 + }, + { + "epoch": 1.9230749704915535, + "grad_norm": 0.02387832694700697, + "learning_rate": 4.492722157604545e-06, + "loss": 0.4673, + "step": 37880 + }, + { + "epoch": 1.923328806589585, + "grad_norm": 0.020366760561243185, + "learning_rate": 4.463140792360487e-06, + "loss": 0.4333, + "step": 37885 + }, + { + "epoch": 1.9235826426876166, + "grad_norm": 0.02130836354004238, + "learning_rate": 4.433656698530741e-06, + "loss": 0.4599, + "step": 37890 + }, + { + "epoch": 1.9238364787856481, + "grad_norm": 0.0206503846500878, + "learning_rate": 4.404269881902734e-06, + "loss": 0.4508, + "step": 37895 + }, + { + "epoch": 1.9240903148836797, + "grad_norm": 0.02300274852597502, + "learning_rate": 4.374980348245072e-06, + "loss": 0.4446, + "step": 37900 + }, + { + "epoch": 1.9243441509817112, + "grad_norm": 0.02138956368703685, + "learning_rate": 4.345788103307047e-06, + "loss": 0.4589, + "step": 37905 + }, + { + "epoch": 1.9245979870797427, + "grad_norm": 0.02227297400993538, + "learning_rate": 4.316693152819018e-06, + "loss": 0.4654, + "step": 37910 + }, + { + "epoch": 1.9248518231777743, + "grad_norm": 0.022468692731550267, + "learning_rate": 4.287695502492139e-06, + "loss": 0.4457, + "step": 37915 + }, + { + "epoch": 1.9251056592758056, + "grad_norm": 0.02069126082899075, + "learning_rate": 4.25879515801858e-06, + "loss": 0.4567, + "step": 37920 + }, + { + "epoch": 1.925359495373837, + "grad_norm": 0.019826909262503828, + "learning_rate": 4.229992125071192e-06, + "loss": 0.4453, + "step": 37925 + }, + { + "epoch": 1.9256133314718686, + "grad_norm": 0.020987878267737438, + "learning_rate": 4.201286409304006e-06, + "loss": 0.4454, + "step": 37930 + }, + { + "epoch": 1.9258671675699, + "grad_norm": 0.020420855142907778, + "learning_rate": 4.172678016351683e-06, + "loss": 0.433, + "step": 37935 + }, + { + "epoch": 1.9261210036679315, + "grad_norm": 0.02284228322904161, + "learning_rate": 4.1441669518300086e-06, + "loss": 0.4631, + "step": 37940 + }, + { + "epoch": 1.926374839765963, + "grad_norm": 0.0222567772359531, + "learning_rate": 4.115753221335561e-06, + "loss": 0.4523, + "step": 37945 + }, + { + "epoch": 1.9266286758639946, + "grad_norm": 0.023189951687417994, + "learning_rate": 4.087436830445768e-06, + "loss": 0.4458, + "step": 37950 + }, + { + "epoch": 1.926882511962026, + "grad_norm": 0.021658902190008947, + "learning_rate": 4.059217784719016e-06, + "loss": 0.4459, + "step": 37955 + }, + { + "epoch": 1.9271363480600576, + "grad_norm": 0.024217930442230858, + "learning_rate": 4.0310960896945415e-06, + "loss": 0.4538, + "step": 37960 + }, + { + "epoch": 1.9273901841580892, + "grad_norm": 0.022206278190921094, + "learning_rate": 4.003071750892595e-06, + "loss": 0.4552, + "step": 37965 + }, + { + "epoch": 1.9276440202561207, + "grad_norm": 0.025828497485879003, + "learning_rate": 3.9751447738140545e-06, + "loss": 0.4645, + "step": 37970 + }, + { + "epoch": 1.9278978563541522, + "grad_norm": 0.019836230031236992, + "learning_rate": 3.9473151639409235e-06, + "loss": 0.4585, + "step": 37975 + }, + { + "epoch": 1.9281516924521838, + "grad_norm": 0.020662066999979745, + "learning_rate": 3.919582926735999e-06, + "loss": 0.454, + "step": 37980 + }, + { + "epoch": 1.928405528550215, + "grad_norm": 0.022723364615094117, + "learning_rate": 3.891948067643036e-06, + "loss": 0.4606, + "step": 37985 + }, + { + "epoch": 1.9286593646482466, + "grad_norm": 0.023952908845065374, + "learning_rate": 3.864410592086587e-06, + "loss": 0.4559, + "step": 37990 + }, + { + "epoch": 1.9289132007462781, + "grad_norm": 0.025670760204192115, + "learning_rate": 3.836970505472104e-06, + "loss": 0.4263, + "step": 37995 + }, + { + "epoch": 1.9291670368443097, + "grad_norm": 0.02416027669647824, + "learning_rate": 3.8096278131859452e-06, + "loss": 0.4594, + "step": 38000 + }, + { + "epoch": 1.929420872942341, + "grad_norm": 0.02400148828175322, + "learning_rate": 3.7823825205953177e-06, + "loss": 0.4519, + "step": 38005 + }, + { + "epoch": 1.9296747090403725, + "grad_norm": 0.020877060418883157, + "learning_rate": 3.755234633048388e-06, + "loss": 0.4399, + "step": 38010 + }, + { + "epoch": 1.929928545138404, + "grad_norm": 0.021079166677847018, + "learning_rate": 3.7281841558741147e-06, + "loss": 0.4526, + "step": 38015 + }, + { + "epoch": 1.9301823812364356, + "grad_norm": 0.02979341790679115, + "learning_rate": 3.7012310943824178e-06, + "loss": 0.4559, + "step": 38020 + }, + { + "epoch": 1.9304362173344671, + "grad_norm": 0.021472840172437824, + "learning_rate": 3.6743754538640093e-06, + "loss": 0.4546, + "step": 38025 + }, + { + "epoch": 1.9306900534324987, + "grad_norm": 0.024965054424825203, + "learning_rate": 3.6476172395905615e-06, + "loss": 0.4608, + "step": 38030 + }, + { + "epoch": 1.9309438895305302, + "grad_norm": 0.030534410696193816, + "learning_rate": 3.6209564568144837e-06, + "loss": 0.4537, + "step": 38035 + }, + { + "epoch": 1.9311977256285617, + "grad_norm": 0.020565392136713392, + "learning_rate": 3.5943931107692563e-06, + "loss": 0.4551, + "step": 38040 + }, + { + "epoch": 1.9314515617265933, + "grad_norm": 0.02990769064293206, + "learning_rate": 3.567927206669097e-06, + "loss": 0.4534, + "step": 38045 + }, + { + "epoch": 1.9317053978246248, + "grad_norm": 0.020744114829942508, + "learning_rate": 3.5415587497090727e-06, + "loss": 0.4606, + "step": 38050 + }, + { + "epoch": 1.9319592339226561, + "grad_norm": 0.02147231946067232, + "learning_rate": 3.515287745065321e-06, + "loss": 0.4542, + "step": 38055 + }, + { + "epoch": 1.9322130700206877, + "grad_norm": 0.02382770440065348, + "learning_rate": 3.4891141978945497e-06, + "loss": 0.4489, + "step": 38060 + }, + { + "epoch": 1.9324669061187192, + "grad_norm": 0.02168451146145817, + "learning_rate": 3.463038113334538e-06, + "loss": 0.4486, + "step": 38065 + }, + { + "epoch": 1.9327207422167505, + "grad_norm": 0.019029522910771698, + "learning_rate": 3.437059496503969e-06, + "loss": 0.4532, + "step": 38070 + }, + { + "epoch": 1.932974578314782, + "grad_norm": 0.02297240339975973, + "learning_rate": 3.4111783525022646e-06, + "loss": 0.4704, + "step": 38075 + }, + { + "epoch": 1.9332284144128136, + "grad_norm": 0.029039053764296118, + "learning_rate": 3.3853946864097486e-06, + "loss": 0.4711, + "step": 38080 + }, + { + "epoch": 1.933482250510845, + "grad_norm": 0.021336084269481418, + "learning_rate": 3.3597085032876505e-06, + "loss": 0.4505, + "step": 38085 + }, + { + "epoch": 1.9337360866088766, + "grad_norm": 0.026153460382982597, + "learning_rate": 3.3341198081780487e-06, + "loss": 0.4829, + "step": 38090 + }, + { + "epoch": 1.9339899227069082, + "grad_norm": 0.026443914139286612, + "learning_rate": 3.3086286061038697e-06, + "loss": 0.46, + "step": 38095 + }, + { + "epoch": 1.9342437588049397, + "grad_norm": 0.022842151943259267, + "learning_rate": 3.283234902068888e-06, + "loss": 0.4416, + "step": 38100 + }, + { + "epoch": 1.9344975949029712, + "grad_norm": 0.020825476467275915, + "learning_rate": 3.2579387010577277e-06, + "loss": 0.4656, + "step": 38105 + }, + { + "epoch": 1.9347514310010028, + "grad_norm": 0.021769918904403884, + "learning_rate": 3.2327400080359725e-06, + "loss": 0.4631, + "step": 38110 + }, + { + "epoch": 1.9350052670990343, + "grad_norm": 0.02082151349347851, + "learning_rate": 3.207638827949999e-06, + "loss": 0.4353, + "step": 38115 + }, + { + "epoch": 1.9352591031970656, + "grad_norm": 0.02513216757499985, + "learning_rate": 3.1826351657270323e-06, + "loss": 0.4638, + "step": 38120 + }, + { + "epoch": 1.9355129392950972, + "grad_norm": 0.022971752053904795, + "learning_rate": 3.1577290262750912e-06, + "loss": 0.455, + "step": 38125 + }, + { + "epoch": 1.9357667753931287, + "grad_norm": 0.023390667458557612, + "learning_rate": 3.1329204144832647e-06, + "loss": 0.4503, + "step": 38130 + }, + { + "epoch": 1.93602061149116, + "grad_norm": 0.027462875293432002, + "learning_rate": 3.108209335221268e-06, + "loss": 0.4698, + "step": 38135 + }, + { + "epoch": 1.9362744475891915, + "grad_norm": 0.022880416789241798, + "learning_rate": 3.0835957933397774e-06, + "loss": 0.4613, + "step": 38140 + }, + { + "epoch": 1.936528283687223, + "grad_norm": 0.021014225306591475, + "learning_rate": 3.0590797936703164e-06, + "loss": 0.5413, + "step": 38145 + }, + { + "epoch": 1.9367821197852546, + "grad_norm": 0.022815293753873547, + "learning_rate": 3.034661341025258e-06, + "loss": 0.4312, + "step": 38150 + }, + { + "epoch": 1.9370359558832861, + "grad_norm": 0.021813853966513634, + "learning_rate": 3.010340440197823e-06, + "loss": 0.455, + "step": 38155 + }, + { + "epoch": 1.9372897919813177, + "grad_norm": 0.027654537630323616, + "learning_rate": 2.986117095962082e-06, + "loss": 0.4457, + "step": 38160 + }, + { + "epoch": 1.9375436280793492, + "grad_norm": 0.020417251873714362, + "learning_rate": 2.961991313072898e-06, + "loss": 0.4311, + "step": 38165 + }, + { + "epoch": 1.9377974641773807, + "grad_norm": 0.02126948120782193, + "learning_rate": 2.9379630962661496e-06, + "loss": 0.4697, + "step": 38170 + }, + { + "epoch": 1.9380513002754123, + "grad_norm": 0.020504217551342806, + "learning_rate": 2.914032450258397e-06, + "loss": 0.4503, + "step": 38175 + }, + { + "epoch": 1.9383051363734438, + "grad_norm": 0.021169339004345845, + "learning_rate": 2.890199379747105e-06, + "loss": 0.4436, + "step": 38180 + }, + { + "epoch": 1.9385589724714751, + "grad_norm": 0.027548188156958732, + "learning_rate": 2.8664638894105867e-06, + "loss": 0.4585, + "step": 38185 + }, + { + "epoch": 1.9388128085695067, + "grad_norm": 0.02785876405422749, + "learning_rate": 2.8428259839079486e-06, + "loss": 0.4681, + "step": 38190 + }, + { + "epoch": 1.9390666446675382, + "grad_norm": 0.025887282870096888, + "learning_rate": 2.819285667879312e-06, + "loss": 0.4748, + "step": 38195 + }, + { + "epoch": 1.9393204807655695, + "grad_norm": 0.021934926759944635, + "learning_rate": 2.7958429459454817e-06, + "loss": 0.4409, + "step": 38200 + }, + { + "epoch": 1.939574316863601, + "grad_norm": 0.02102254390453021, + "learning_rate": 2.7724978227081086e-06, + "loss": 0.443, + "step": 38205 + }, + { + "epoch": 1.9398281529616326, + "grad_norm": 0.025739303931824997, + "learning_rate": 2.7492503027496953e-06, + "loss": 0.4346, + "step": 38210 + }, + { + "epoch": 1.9400819890596641, + "grad_norm": 0.020415985192957675, + "learning_rate": 2.726100390633757e-06, + "loss": 0.4278, + "step": 38215 + }, + { + "epoch": 1.9403358251576956, + "grad_norm": 0.027445653367167453, + "learning_rate": 2.7030480909043254e-06, + "loss": 0.4677, + "step": 38220 + }, + { + "epoch": 1.9405896612557272, + "grad_norm": 0.022194649303696412, + "learning_rate": 2.680093408086559e-06, + "loss": 0.4588, + "step": 38225 + }, + { + "epoch": 1.9408434973537587, + "grad_norm": 0.03334788531458536, + "learning_rate": 2.6572363466863534e-06, + "loss": 0.48, + "step": 38230 + }, + { + "epoch": 1.9410973334517903, + "grad_norm": 0.02165643766225538, + "learning_rate": 2.6344769111903975e-06, + "loss": 0.4869, + "step": 38235 + }, + { + "epoch": 1.9413511695498218, + "grad_norm": 0.02482685913614366, + "learning_rate": 2.6118151060662842e-06, + "loss": 0.4267, + "step": 38240 + }, + { + "epoch": 1.9416050056478533, + "grad_norm": 0.02207024691983292, + "learning_rate": 2.589250935762344e-06, + "loss": 0.4325, + "step": 38245 + }, + { + "epoch": 1.9418588417458846, + "grad_norm": 0.021736842944571975, + "learning_rate": 2.566784404707867e-06, + "loss": 0.444, + "step": 38250 + }, + { + "epoch": 1.9421126778439162, + "grad_norm": 0.033892642112711965, + "learning_rate": 2.5444155173129368e-06, + "loss": 0.4529, + "step": 38255 + }, + { + "epoch": 1.9423665139419477, + "grad_norm": 0.021770668922768112, + "learning_rate": 2.52214427796843e-06, + "loss": 0.4422, + "step": 38260 + }, + { + "epoch": 1.9426203500399792, + "grad_norm": 0.02338794948994561, + "learning_rate": 2.499970691046127e-06, + "loss": 0.4577, + "step": 38265 + }, + { + "epoch": 1.9428741861380106, + "grad_norm": 0.0211563138278504, + "learning_rate": 2.4778947608984915e-06, + "loss": 0.4691, + "step": 38270 + }, + { + "epoch": 1.943128022236042, + "grad_norm": 0.021156732461866418, + "learning_rate": 2.4559164918590005e-06, + "loss": 0.4505, + "step": 38275 + }, + { + "epoch": 1.9433818583340736, + "grad_norm": 0.029251366916451853, + "learning_rate": 2.4340358882418144e-06, + "loss": 0.436, + "step": 38280 + }, + { + "epoch": 1.9436356944321052, + "grad_norm": 0.020800337122009502, + "learning_rate": 2.412252954342109e-06, + "loss": 0.4635, + "step": 38285 + }, + { + "epoch": 1.9438895305301367, + "grad_norm": 0.020582842457094158, + "learning_rate": 2.3905676944356303e-06, + "loss": 0.4379, + "step": 38290 + }, + { + "epoch": 1.9441433666281682, + "grad_norm": 0.021231715330240406, + "learning_rate": 2.36898011277914e-06, + "loss": 0.4536, + "step": 38295 + }, + { + "epoch": 1.9443972027261998, + "grad_norm": 0.019707268959821594, + "learning_rate": 2.3474902136101927e-06, + "loss": 0.4483, + "step": 38300 + }, + { + "epoch": 1.9446510388242313, + "grad_norm": 0.021565038396861153, + "learning_rate": 2.3260980011470258e-06, + "loss": 0.4508, + "step": 38305 + }, + { + "epoch": 1.9449048749222628, + "grad_norm": 0.022307843520456463, + "learning_rate": 2.304803479589057e-06, + "loss": 0.4469, + "step": 38310 + }, + { + "epoch": 1.9451587110202944, + "grad_norm": 0.021662290602322585, + "learning_rate": 2.2836066531161104e-06, + "loss": 0.4584, + "step": 38315 + }, + { + "epoch": 1.9454125471183257, + "grad_norm": 0.022824581912878204, + "learning_rate": 2.2625075258890793e-06, + "loss": 0.4524, + "step": 38320 + }, + { + "epoch": 1.9456663832163572, + "grad_norm": 0.02383320091907116, + "learning_rate": 2.2415061020495954e-06, + "loss": 0.4545, + "step": 38325 + }, + { + "epoch": 1.9459202193143887, + "grad_norm": 0.02259973050640163, + "learning_rate": 2.2206023857201386e-06, + "loss": 0.4395, + "step": 38330 + }, + { + "epoch": 1.94617405541242, + "grad_norm": 0.02451602070891827, + "learning_rate": 2.199796381004038e-06, + "loss": 0.4463, + "step": 38335 + }, + { + "epoch": 1.9464278915104516, + "grad_norm": 0.02350649241970694, + "learning_rate": 2.1790880919853595e-06, + "loss": 0.4571, + "step": 38340 + }, + { + "epoch": 1.9466817276084831, + "grad_norm": 0.024863563292605854, + "learning_rate": 2.1584775227290745e-06, + "loss": 0.4537, + "step": 38345 + }, + { + "epoch": 1.9469355637065147, + "grad_norm": 0.026608746542194056, + "learning_rate": 2.1379646772808903e-06, + "loss": 0.4525, + "step": 38350 + }, + { + "epoch": 1.9471893998045462, + "grad_norm": 0.021420118326157717, + "learning_rate": 2.11754955966742e-06, + "loss": 0.4788, + "step": 38355 + }, + { + "epoch": 1.9474432359025777, + "grad_norm": 0.021686725777661547, + "learning_rate": 2.0972321738960687e-06, + "loss": 0.4555, + "step": 38360 + }, + { + "epoch": 1.9476970720006093, + "grad_norm": 0.020239535964856954, + "learning_rate": 2.0770125239549797e-06, + "loss": 0.4275, + "step": 38365 + }, + { + "epoch": 1.9479509080986408, + "grad_norm": 0.026279491549929514, + "learning_rate": 2.0568906138132002e-06, + "loss": 0.4728, + "step": 38370 + }, + { + "epoch": 1.9482047441966723, + "grad_norm": 0.03912414768353642, + "learning_rate": 2.0368664474205157e-06, + "loss": 0.4767, + "step": 38375 + }, + { + "epoch": 1.9484585802947039, + "grad_norm": 0.02377861368004799, + "learning_rate": 2.01694002870767e-06, + "loss": 0.4395, + "step": 38380 + }, + { + "epoch": 1.9487124163927352, + "grad_norm": 0.022829057653313226, + "learning_rate": 1.997111361586035e-06, + "loss": 0.4665, + "step": 38385 + }, + { + "epoch": 1.9489662524907667, + "grad_norm": 0.02234878999035154, + "learning_rate": 1.9773804499478854e-06, + "loss": 0.4479, + "step": 38390 + }, + { + "epoch": 1.9492200885887982, + "grad_norm": 0.020446858573344908, + "learning_rate": 1.957747297666346e-06, + "loss": 0.4451, + "step": 38395 + }, + { + "epoch": 1.9494739246868296, + "grad_norm": 0.022150719815722406, + "learning_rate": 1.9382119085952777e-06, + "loss": 0.467, + "step": 38400 + }, + { + "epoch": 1.949727760784861, + "grad_norm": 0.02395305631634785, + "learning_rate": 1.9187742865693915e-06, + "loss": 0.4446, + "step": 38405 + }, + { + "epoch": 1.9499815968828926, + "grad_norm": 0.025226111441204044, + "learning_rate": 1.899434435404135e-06, + "loss": 0.4212, + "step": 38410 + }, + { + "epoch": 1.9502354329809242, + "grad_norm": 0.026080819122294808, + "learning_rate": 1.8801923588959157e-06, + "loss": 0.43, + "step": 38415 + }, + { + "epoch": 1.9504892690789557, + "grad_norm": 0.027147258519150435, + "learning_rate": 1.8610480608218239e-06, + "loss": 0.4371, + "step": 38420 + }, + { + "epoch": 1.9507431051769872, + "grad_norm": 0.02461769872993647, + "learning_rate": 1.842001544939742e-06, + "loss": 0.47, + "step": 38425 + }, + { + "epoch": 1.9509969412750188, + "grad_norm": 0.020951902456712867, + "learning_rate": 1.8230528149884573e-06, + "loss": 0.4701, + "step": 38430 + }, + { + "epoch": 1.9512507773730503, + "grad_norm": 0.026972795277747482, + "learning_rate": 1.80420187468755e-06, + "loss": 0.4404, + "step": 38435 + }, + { + "epoch": 1.9515046134710818, + "grad_norm": 0.021207557562487, + "learning_rate": 1.7854487277372822e-06, + "loss": 0.4366, + "step": 38440 + }, + { + "epoch": 1.9517584495691134, + "grad_norm": 0.02498355979793343, + "learning_rate": 1.7667933778188206e-06, + "loss": 0.4516, + "step": 38445 + }, + { + "epoch": 1.9520122856671447, + "grad_norm": 0.02192218125038996, + "learning_rate": 1.7482358285941803e-06, + "loss": 0.4627, + "step": 38450 + }, + { + "epoch": 1.9522661217651762, + "grad_norm": 0.029378815413494033, + "learning_rate": 1.729776083706003e-06, + "loss": 0.4461, + "step": 38455 + }, + { + "epoch": 1.9525199578632078, + "grad_norm": 0.024581468794183266, + "learning_rate": 1.7114141467779454e-06, + "loss": 0.454, + "step": 38460 + }, + { + "epoch": 1.952773793961239, + "grad_norm": 0.025894600364455185, + "learning_rate": 1.693150021414347e-06, + "loss": 0.4724, + "step": 38465 + }, + { + "epoch": 1.9530276300592706, + "grad_norm": 0.02278420673605267, + "learning_rate": 1.6749837112003398e-06, + "loss": 0.4594, + "step": 38470 + }, + { + "epoch": 1.9532814661573021, + "grad_norm": 0.02509311215738138, + "learning_rate": 1.656915219701849e-06, + "loss": 0.4309, + "step": 38475 + }, + { + "epoch": 1.9535353022553337, + "grad_norm": 0.02132393990165603, + "learning_rate": 1.6389445504657041e-06, + "loss": 0.4423, + "step": 38480 + }, + { + "epoch": 1.9537891383533652, + "grad_norm": 0.02171837487424093, + "learning_rate": 1.621071707019417e-06, + "loss": 0.4358, + "step": 38485 + }, + { + "epoch": 1.9540429744513967, + "grad_norm": 0.024981217751365643, + "learning_rate": 1.6032966928713477e-06, + "loss": 0.4675, + "step": 38490 + }, + { + "epoch": 1.9542968105494283, + "grad_norm": 0.020893534776536122, + "learning_rate": 1.5856195115105943e-06, + "loss": 0.4433, + "step": 38495 + }, + { + "epoch": 1.9545506466474598, + "grad_norm": 0.02068830461015465, + "learning_rate": 1.5680401664072141e-06, + "loss": 0.4629, + "step": 38500 + }, + { + "epoch": 1.9548044827454913, + "grad_norm": 0.020953616835626587, + "learning_rate": 1.5505586610118361e-06, + "loss": 0.4689, + "step": 38505 + }, + { + "epoch": 1.9550583188435229, + "grad_norm": 0.02440341051189393, + "learning_rate": 1.5331749987560484e-06, + "loss": 0.461, + "step": 38510 + }, + { + "epoch": 1.9553121549415542, + "grad_norm": 0.029598548396459087, + "learning_rate": 1.5158891830521215e-06, + "loss": 0.431, + "step": 38515 + }, + { + "epoch": 1.9555659910395857, + "grad_norm": 0.02717560220840279, + "learning_rate": 1.4987012172932301e-06, + "loss": 0.4535, + "step": 38520 + }, + { + "epoch": 1.9558198271376173, + "grad_norm": 0.02370839493798561, + "learning_rate": 1.481611104853231e-06, + "loss": 0.4735, + "step": 38525 + }, + { + "epoch": 1.9560736632356488, + "grad_norm": 0.022848184848146453, + "learning_rate": 1.4646188490869405e-06, + "loss": 0.4754, + "step": 38530 + }, + { + "epoch": 1.95632749933368, + "grad_norm": 0.022225861131520797, + "learning_rate": 1.4477244533297463e-06, + "loss": 0.4466, + "step": 38535 + }, + { + "epoch": 1.9565813354317116, + "grad_norm": 0.022152077073412883, + "learning_rate": 1.4309279208979398e-06, + "loss": 0.4698, + "step": 38540 + }, + { + "epoch": 1.9568351715297432, + "grad_norm": 0.02524987360341103, + "learning_rate": 1.414229255088606e-06, + "loss": 0.4294, + "step": 38545 + }, + { + "epoch": 1.9570890076277747, + "grad_norm": 0.023994085753204972, + "learning_rate": 1.3976284591796783e-06, + "loss": 0.4485, + "step": 38550 + }, + { + "epoch": 1.9573428437258062, + "grad_norm": 0.021722894693704954, + "learning_rate": 1.381125536429717e-06, + "loss": 0.4456, + "step": 38555 + }, + { + "epoch": 1.9575966798238378, + "grad_norm": 0.024344062652594294, + "learning_rate": 1.3647204900782417e-06, + "loss": 0.4338, + "step": 38560 + }, + { + "epoch": 1.9578505159218693, + "grad_norm": 0.021929490605915293, + "learning_rate": 1.3484133233454544e-06, + "loss": 0.4643, + "step": 38565 + }, + { + "epoch": 1.9581043520199009, + "grad_norm": 0.02177586782728187, + "learning_rate": 1.3322040394323498e-06, + "loss": 0.4649, + "step": 38570 + }, + { + "epoch": 1.9583581881179324, + "grad_norm": 0.020581067246549605, + "learning_rate": 1.3160926415207163e-06, + "loss": 0.4485, + "step": 38575 + }, + { + "epoch": 1.9586120242159637, + "grad_norm": 0.0215530463340169, + "learning_rate": 1.300079132773191e-06, + "loss": 0.4468, + "step": 38580 + }, + { + "epoch": 1.9588658603139952, + "grad_norm": 0.024185145615631558, + "learning_rate": 1.2841635163330922e-06, + "loss": 0.4886, + "step": 38585 + }, + { + "epoch": 1.9591196964120268, + "grad_norm": 0.02437935153018092, + "learning_rate": 1.268345795324588e-06, + "loss": 0.469, + "step": 38590 + }, + { + "epoch": 1.9593735325100583, + "grad_norm": 0.023546706097560363, + "learning_rate": 1.252625972852639e-06, + "loss": 0.4888, + "step": 38595 + }, + { + "epoch": 1.9596273686080896, + "grad_norm": 0.021139727672287015, + "learning_rate": 1.237004052002999e-06, + "loss": 0.4337, + "step": 38600 + }, + { + "epoch": 1.9598812047061211, + "grad_norm": 0.03423369843513143, + "learning_rate": 1.221480035842104e-06, + "loss": 0.4259, + "step": 38605 + }, + { + "epoch": 1.9601350408041527, + "grad_norm": 0.021383194616096283, + "learning_rate": 1.2060539274172944e-06, + "loss": 0.4357, + "step": 38610 + }, + { + "epoch": 1.9603888769021842, + "grad_norm": 0.020286977515740528, + "learning_rate": 1.1907257297566477e-06, + "loss": 0.4446, + "step": 38615 + }, + { + "epoch": 1.9606427130002158, + "grad_norm": 0.02403465823618508, + "learning_rate": 1.1754954458689238e-06, + "loss": 0.4564, + "step": 38620 + }, + { + "epoch": 1.9608965490982473, + "grad_norm": 0.02086613709755337, + "learning_rate": 1.1603630787438424e-06, + "loss": 0.4625, + "step": 38625 + }, + { + "epoch": 1.9611503851962788, + "grad_norm": 0.02145705596640427, + "learning_rate": 1.1453286313517498e-06, + "loss": 0.4622, + "step": 38630 + }, + { + "epoch": 1.9614042212943104, + "grad_norm": 0.0198366436003919, + "learning_rate": 1.130392106643896e-06, + "loss": 0.4296, + "step": 38635 + }, + { + "epoch": 1.961658057392342, + "grad_norm": 0.021595570641432995, + "learning_rate": 1.1155535075522138e-06, + "loss": 0.4395, + "step": 38640 + }, + { + "epoch": 1.9619118934903734, + "grad_norm": 0.027632249043748877, + "learning_rate": 1.1008128369894288e-06, + "loss": 0.4571, + "step": 38645 + }, + { + "epoch": 1.9621657295884047, + "grad_norm": 0.0211592206862124, + "learning_rate": 1.0861700978490596e-06, + "loss": 0.4516, + "step": 38650 + }, + { + "epoch": 1.9624195656864363, + "grad_norm": 0.020641952024909045, + "learning_rate": 1.0716252930054737e-06, + "loss": 0.4525, + "step": 38655 + }, + { + "epoch": 1.9626734017844678, + "grad_norm": 0.02239625096519365, + "learning_rate": 1.0571784253136652e-06, + "loss": 0.4538, + "step": 38660 + }, + { + "epoch": 1.9629272378824991, + "grad_norm": 0.020935572931792722, + "learning_rate": 1.0428294976094766e-06, + "loss": 0.4726, + "step": 38665 + }, + { + "epoch": 1.9631810739805307, + "grad_norm": 0.023797580850831307, + "learning_rate": 1.0285785127095993e-06, + "loss": 0.4591, + "step": 38670 + }, + { + "epoch": 1.9634349100785622, + "grad_norm": 0.021663436246309797, + "learning_rate": 1.0144254734113511e-06, + "loss": 0.439, + "step": 38675 + }, + { + "epoch": 1.9636887461765937, + "grad_norm": 0.02399748053716495, + "learning_rate": 1.00037038249301e-06, + "loss": 0.467, + "step": 38680 + }, + { + "epoch": 1.9639425822746253, + "grad_norm": 0.02209544242205649, + "learning_rate": 9.864132427134243e-07, + "loss": 0.4573, + "step": 38685 + }, + { + "epoch": 1.9641964183726568, + "grad_norm": 0.02180806139548943, + "learning_rate": 9.725540568122915e-07, + "loss": 0.4393, + "step": 38690 + }, + { + "epoch": 1.9644502544706883, + "grad_norm": 0.025112167834570206, + "learning_rate": 9.587928275102132e-07, + "loss": 0.4367, + "step": 38695 + }, + { + "epoch": 1.9647040905687199, + "grad_norm": 0.019834862643478415, + "learning_rate": 9.451295575083618e-07, + "loss": 0.4682, + "step": 38700 + }, + { + "epoch": 1.9649579266667514, + "grad_norm": 0.019828424235274936, + "learning_rate": 9.315642494888144e-07, + "loss": 0.424, + "step": 38705 + }, + { + "epoch": 1.965211762764783, + "grad_norm": 0.023011146520013428, + "learning_rate": 9.180969061143851e-07, + "loss": 0.4488, + "step": 38710 + }, + { + "epoch": 1.9654655988628142, + "grad_norm": 0.021083449030767323, + "learning_rate": 9.047275300285706e-07, + "loss": 0.4656, + "step": 38715 + }, + { + "epoch": 1.9657194349608458, + "grad_norm": 0.028229164879771167, + "learning_rate": 8.914561238557717e-07, + "loss": 0.4336, + "step": 38720 + }, + { + "epoch": 1.9659732710588773, + "grad_norm": 0.023561467883931243, + "learning_rate": 8.78282690201071e-07, + "loss": 0.4416, + "step": 38725 + }, + { + "epoch": 1.9662271071569086, + "grad_norm": 0.0231890769370734, + "learning_rate": 8.652072316503446e-07, + "loss": 0.4391, + "step": 38730 + }, + { + "epoch": 1.9664809432549402, + "grad_norm": 0.023528700345852093, + "learning_rate": 8.52229750770317e-07, + "loss": 0.4776, + "step": 38735 + }, + { + "epoch": 1.9667347793529717, + "grad_norm": 0.022550152791175105, + "learning_rate": 8.39350250108284e-07, + "loss": 0.4482, + "step": 38740 + }, + { + "epoch": 1.9669886154510032, + "grad_norm": 0.018641810249880507, + "learning_rate": 8.265687321925009e-07, + "loss": 0.4285, + "step": 38745 + }, + { + "epoch": 1.9672424515490348, + "grad_norm": 0.02746049418696759, + "learning_rate": 8.138851995319608e-07, + "loss": 0.4298, + "step": 38750 + }, + { + "epoch": 1.9674962876470663, + "grad_norm": 0.023113644657525075, + "learning_rate": 8.012996546162277e-07, + "loss": 0.4573, + "step": 38755 + }, + { + "epoch": 1.9677501237450978, + "grad_norm": 0.02536365354716488, + "learning_rate": 7.888120999159365e-07, + "loss": 0.4669, + "step": 38760 + }, + { + "epoch": 1.9680039598431294, + "grad_norm": 0.031051578737683758, + "learning_rate": 7.764225378822377e-07, + "loss": 0.4395, + "step": 38765 + }, + { + "epoch": 1.968257795941161, + "grad_norm": 0.022412814259141482, + "learning_rate": 7.641309709471855e-07, + "loss": 0.4616, + "step": 38770 + }, + { + "epoch": 1.9685116320391924, + "grad_norm": 0.029004560100377053, + "learning_rate": 7.51937401523517e-07, + "loss": 0.4527, + "step": 38775 + }, + { + "epoch": 1.9687654681372238, + "grad_norm": 0.03144197827525452, + "learning_rate": 7.398418320048173e-07, + "loss": 0.4587, + "step": 38780 + }, + { + "epoch": 1.9690193042352553, + "grad_norm": 0.020675890106864256, + "learning_rate": 7.278442647653538e-07, + "loss": 0.4294, + "step": 38785 + }, + { + "epoch": 1.9692731403332868, + "grad_norm": 0.022404177027982874, + "learning_rate": 7.159447021601872e-07, + "loss": 0.4414, + "step": 38790 + }, + { + "epoch": 1.9695269764313181, + "grad_norm": 0.025733305949821667, + "learning_rate": 7.041431465251713e-07, + "loss": 0.4477, + "step": 38795 + }, + { + "epoch": 1.9697808125293497, + "grad_norm": 0.027375264236320213, + "learning_rate": 6.924396001768418e-07, + "loss": 0.4665, + "step": 38800 + }, + { + "epoch": 1.9700346486273812, + "grad_norm": 0.025948495378095977, + "learning_rate": 6.808340654125833e-07, + "loss": 0.4424, + "step": 38805 + }, + { + "epoch": 1.9702884847254127, + "grad_norm": 0.023666766635241906, + "learning_rate": 6.693265445105179e-07, + "loss": 0.4416, + "step": 38810 + }, + { + "epoch": 1.9705423208234443, + "grad_norm": 0.02132357302519558, + "learning_rate": 6.579170397294498e-07, + "loss": 0.4253, + "step": 38815 + }, + { + "epoch": 1.9707961569214758, + "grad_norm": 0.030128825836747055, + "learning_rate": 6.466055533090875e-07, + "loss": 0.4195, + "step": 38820 + }, + { + "epoch": 1.9710499930195073, + "grad_norm": 0.02602764759746211, + "learning_rate": 6.35392087469766e-07, + "loss": 0.442, + "step": 38825 + }, + { + "epoch": 1.9713038291175389, + "grad_norm": 0.021881181041248757, + "learning_rate": 6.24276644412669e-07, + "loss": 0.473, + "step": 38830 + }, + { + "epoch": 1.9715576652155704, + "grad_norm": 0.02185523352546431, + "learning_rate": 6.132592263196623e-07, + "loss": 0.43, + "step": 38835 + }, + { + "epoch": 1.971811501313602, + "grad_norm": 0.021680403099536016, + "learning_rate": 6.023398353534604e-07, + "loss": 0.4553, + "step": 38840 + }, + { + "epoch": 1.9720653374116333, + "grad_norm": 0.021974257862993504, + "learning_rate": 5.915184736574597e-07, + "loss": 0.4562, + "step": 38845 + }, + { + "epoch": 1.9723191735096648, + "grad_norm": 0.02096949718491153, + "learning_rate": 5.807951433557946e-07, + "loss": 0.4408, + "step": 38850 + }, + { + "epoch": 1.9725730096076963, + "grad_norm": 0.025939230078918414, + "learning_rate": 5.701698465534477e-07, + "loss": 0.456, + "step": 38855 + }, + { + "epoch": 1.9728268457057279, + "grad_norm": 0.02828624799294119, + "learning_rate": 5.596425853361397e-07, + "loss": 0.4327, + "step": 38860 + }, + { + "epoch": 1.9730806818037592, + "grad_norm": 0.022004046110511963, + "learning_rate": 5.492133617702733e-07, + "loss": 0.4511, + "step": 38865 + }, + { + "epoch": 1.9733345179017907, + "grad_norm": 0.024590538540810992, + "learning_rate": 5.388821779030994e-07, + "loss": 0.4676, + "step": 38870 + }, + { + "epoch": 1.9735883539998222, + "grad_norm": 0.021191903724915806, + "learning_rate": 5.286490357624962e-07, + "loss": 0.4506, + "step": 38875 + }, + { + "epoch": 1.9738421900978538, + "grad_norm": 0.023563631620566922, + "learning_rate": 5.185139373572456e-07, + "loss": 0.4451, + "step": 38880 + }, + { + "epoch": 1.9740960261958853, + "grad_norm": 0.023529886583346264, + "learning_rate": 5.084768846768117e-07, + "loss": 0.4457, + "step": 38885 + }, + { + "epoch": 1.9743498622939168, + "grad_norm": 0.023884990638485995, + "learning_rate": 4.985378796913964e-07, + "loss": 0.4755, + "step": 38890 + }, + { + "epoch": 1.9746036983919484, + "grad_norm": 0.02209835269662074, + "learning_rate": 4.886969243519391e-07, + "loss": 0.4252, + "step": 38895 + }, + { + "epoch": 1.97485753448998, + "grad_norm": 0.019159617450172008, + "learning_rate": 4.789540205902831e-07, + "loss": 0.4405, + "step": 38900 + }, + { + "epoch": 1.9751113705880115, + "grad_norm": 0.02540563968338361, + "learning_rate": 4.6930917031878796e-07, + "loss": 0.4234, + "step": 38905 + }, + { + "epoch": 1.975365206686043, + "grad_norm": 0.025584914989898507, + "learning_rate": 4.597623754307723e-07, + "loss": 0.4422, + "step": 38910 + }, + { + "epoch": 1.9756190427840743, + "grad_norm": 0.02296434045482897, + "learning_rate": 4.5031363780023705e-07, + "loss": 0.4342, + "step": 38915 + }, + { + "epoch": 1.9758728788821058, + "grad_norm": 0.023745961805210634, + "learning_rate": 4.4096295928186534e-07, + "loss": 0.4681, + "step": 38920 + }, + { + "epoch": 1.9761267149801374, + "grad_norm": 0.023731819927084494, + "learning_rate": 4.3171034171113346e-07, + "loss": 0.4512, + "step": 38925 + }, + { + "epoch": 1.9763805510781687, + "grad_norm": 0.022054760692146153, + "learning_rate": 4.225557869043661e-07, + "loss": 0.4677, + "step": 38930 + }, + { + "epoch": 1.9766343871762002, + "grad_norm": 0.023331327072565897, + "learning_rate": 4.134992966584594e-07, + "loss": 0.4595, + "step": 38935 + }, + { + "epoch": 1.9768882232742317, + "grad_norm": 0.022221322234984905, + "learning_rate": 4.0454087275121344e-07, + "loss": 0.4274, + "step": 38940 + }, + { + "epoch": 1.9771420593722633, + "grad_norm": 0.028686436657490943, + "learning_rate": 3.956805169411659e-07, + "loss": 0.4756, + "step": 38945 + }, + { + "epoch": 1.9773958954702948, + "grad_norm": 0.021950488480769904, + "learning_rate": 3.8691823096748126e-07, + "loss": 0.4263, + "step": 38950 + }, + { + "epoch": 1.9776497315683264, + "grad_norm": 0.021704365417338094, + "learning_rate": 3.7825401655017246e-07, + "loss": 0.4912, + "step": 38955 + }, + { + "epoch": 1.9779035676663579, + "grad_norm": 0.029412474606072804, + "learning_rate": 3.6968787538999016e-07, + "loss": 0.446, + "step": 38960 + }, + { + "epoch": 1.9781574037643894, + "grad_norm": 0.021330314142649562, + "learning_rate": 3.6121980916842265e-07, + "loss": 0.4515, + "step": 38965 + }, + { + "epoch": 1.978411239862421, + "grad_norm": 0.02002666201390011, + "learning_rate": 3.528498195476959e-07, + "loss": 0.4289, + "step": 38970 + }, + { + "epoch": 1.9786650759604525, + "grad_norm": 0.022747671492206325, + "learning_rate": 3.445779081708844e-07, + "loss": 0.4598, + "step": 38975 + }, + { + "epoch": 1.9789189120584838, + "grad_norm": 0.023336518609922925, + "learning_rate": 3.3640407666157835e-07, + "loss": 0.4739, + "step": 38980 + }, + { + "epoch": 1.9791727481565153, + "grad_norm": 0.03376020799973892, + "learning_rate": 3.283283266243831e-07, + "loss": 0.4483, + "step": 38985 + }, + { + "epoch": 1.9794265842545469, + "grad_norm": 0.02132524288935295, + "learning_rate": 3.203506596444194e-07, + "loss": 0.463, + "step": 38990 + }, + { + "epoch": 1.9796804203525782, + "grad_norm": 0.020945363773474973, + "learning_rate": 3.1247107728776815e-07, + "loss": 0.46, + "step": 38995 + }, + { + "epoch": 1.9799342564506097, + "grad_norm": 0.019771955108709015, + "learning_rate": 3.046895811011363e-07, + "loss": 0.4515, + "step": 39000 + }, + { + "epoch": 1.9801880925486413, + "grad_norm": 0.02332612793278335, + "learning_rate": 2.970061726119133e-07, + "loss": 0.434, + "step": 39005 + }, + { + "epoch": 1.9804419286466728, + "grad_norm": 0.022591788630783205, + "learning_rate": 2.894208533283371e-07, + "loss": 0.4332, + "step": 39010 + }, + { + "epoch": 1.9806957647447043, + "grad_norm": 0.020037110918590357, + "learning_rate": 2.8193362473943885e-07, + "loss": 0.4296, + "step": 39015 + }, + { + "epoch": 1.9809496008427359, + "grad_norm": 0.023998278576397646, + "learning_rate": 2.7454448831487624e-07, + "loss": 0.4527, + "step": 39020 + }, + { + "epoch": 1.9812034369407674, + "grad_norm": 0.02707422698823243, + "learning_rate": 2.672534455051001e-07, + "loss": 0.4571, + "step": 39025 + }, + { + "epoch": 1.981457273038799, + "grad_norm": 0.025204251574380124, + "learning_rate": 2.60060497741299e-07, + "loss": 0.461, + "step": 39030 + }, + { + "epoch": 1.9817111091368305, + "grad_norm": 0.024377692549952947, + "learning_rate": 2.529656464354546e-07, + "loss": 0.4683, + "step": 39035 + }, + { + "epoch": 1.981964945234862, + "grad_norm": 0.02538043216380801, + "learning_rate": 2.459688929802306e-07, + "loss": 0.4442, + "step": 39040 + }, + { + "epoch": 1.9822187813328933, + "grad_norm": 0.02464209180856968, + "learning_rate": 2.3907023874897295e-07, + "loss": 0.4245, + "step": 39045 + }, + { + "epoch": 1.9824726174309248, + "grad_norm": 0.023932076143716494, + "learning_rate": 2.3226968509598712e-07, + "loss": 0.4507, + "step": 39050 + }, + { + "epoch": 1.9827264535289564, + "grad_norm": 0.020196673491678915, + "learning_rate": 2.2556723335609431e-07, + "loss": 0.4407, + "step": 39055 + }, + { + "epoch": 1.9829802896269877, + "grad_norm": 0.022940306957792315, + "learning_rate": 2.1896288484496428e-07, + "loss": 0.4575, + "step": 39060 + }, + { + "epoch": 1.9832341257250192, + "grad_norm": 0.028907148506462126, + "learning_rate": 2.1245664085906002e-07, + "loss": 0.4506, + "step": 39065 + }, + { + "epoch": 1.9834879618230508, + "grad_norm": 0.023945110409690495, + "learning_rate": 2.0604850267547104e-07, + "loss": 0.4592, + "step": 39070 + }, + { + "epoch": 1.9837417979210823, + "grad_norm": 0.028605852547885884, + "learning_rate": 1.9973847155208003e-07, + "loss": 0.4588, + "step": 39075 + }, + { + "epoch": 1.9839956340191138, + "grad_norm": 0.02150438379068988, + "learning_rate": 1.935265487275073e-07, + "loss": 0.4286, + "step": 39080 + }, + { + "epoch": 1.9842494701171454, + "grad_norm": 0.021743607443387786, + "learning_rate": 1.8741273542116633e-07, + "loss": 0.4728, + "step": 39085 + }, + { + "epoch": 1.984503306215177, + "grad_norm": 0.024759023733061148, + "learning_rate": 1.8139703283315267e-07, + "loss": 0.4691, + "step": 39090 + }, + { + "epoch": 1.9847571423132084, + "grad_norm": 0.021978062996421237, + "learning_rate": 1.7547944214429957e-07, + "loss": 0.4413, + "step": 39095 + }, + { + "epoch": 1.98501097841124, + "grad_norm": 0.02156291605727596, + "learning_rate": 1.6965996451623334e-07, + "loss": 0.4424, + "step": 39100 + }, + { + "epoch": 1.9852648145092715, + "grad_norm": 0.019974654547251697, + "learning_rate": 1.6393860109120695e-07, + "loss": 0.4581, + "step": 39105 + }, + { + "epoch": 1.9855186506073028, + "grad_norm": 0.023303362723310548, + "learning_rate": 1.5831535299243304e-07, + "loss": 0.4222, + "step": 39110 + }, + { + "epoch": 1.9857724867053343, + "grad_norm": 0.021376371336891062, + "learning_rate": 1.5279022132358434e-07, + "loss": 0.4265, + "step": 39115 + }, + { + "epoch": 1.9860263228033659, + "grad_norm": 0.07893795121247377, + "learning_rate": 1.473632071692932e-07, + "loss": 0.4357, + "step": 39120 + }, + { + "epoch": 1.9862801589013974, + "grad_norm": 0.02242228809324414, + "learning_rate": 1.4203431159487413e-07, + "loss": 0.4276, + "step": 39125 + }, + { + "epoch": 1.9865339949994287, + "grad_norm": 0.024029115581471198, + "learning_rate": 1.3680353564632375e-07, + "loss": 0.4291, + "step": 39130 + }, + { + "epoch": 1.9867878310974603, + "grad_norm": 0.02328166827725145, + "learning_rate": 1.3167088035037632e-07, + "loss": 0.4332, + "step": 39135 + }, + { + "epoch": 1.9870416671954918, + "grad_norm": 0.02357094301240424, + "learning_rate": 1.266363467146703e-07, + "loss": 0.4545, + "step": 39140 + }, + { + "epoch": 1.9872955032935233, + "grad_norm": 0.023516549978020867, + "learning_rate": 1.216999357273596e-07, + "loss": 0.4384, + "step": 39145 + }, + { + "epoch": 1.9875493393915549, + "grad_norm": 0.025546553659660538, + "learning_rate": 1.1686164835744695e-07, + "loss": 0.4447, + "step": 39150 + }, + { + "epoch": 1.9878031754895864, + "grad_norm": 0.018763562747325255, + "learning_rate": 1.121214855546726e-07, + "loss": 0.4391, + "step": 39155 + }, + { + "epoch": 1.988057011587618, + "grad_norm": 0.02484040778673898, + "learning_rate": 1.074794482495145e-07, + "loss": 0.4554, + "step": 39160 + }, + { + "epoch": 1.9883108476856495, + "grad_norm": 0.020917177794273913, + "learning_rate": 1.0293553735318817e-07, + "loss": 0.4487, + "step": 39165 + }, + { + "epoch": 1.988564683783681, + "grad_norm": 0.02066839479802247, + "learning_rate": 9.84897537576468e-08, + "loss": 0.4241, + "step": 39170 + }, + { + "epoch": 1.9888185198817125, + "grad_norm": 0.023515586475883914, + "learning_rate": 9.414209833552567e-08, + "loss": 0.4709, + "step": 39175 + }, + { + "epoch": 1.9890723559797439, + "grad_norm": 0.024226686054511816, + "learning_rate": 8.989257194030876e-08, + "loss": 0.4775, + "step": 39180 + }, + { + "epoch": 1.9893261920777754, + "grad_norm": 0.023019658882310785, + "learning_rate": 8.57411754061621e-08, + "loss": 0.4539, + "step": 39185 + }, + { + "epoch": 1.989580028175807, + "grad_norm": 0.02633296041160311, + "learning_rate": 8.168790954793392e-08, + "loss": 0.4702, + "step": 39190 + }, + { + "epoch": 1.9898338642738382, + "grad_norm": 0.02531198180541627, + "learning_rate": 7.773277516126553e-08, + "loss": 0.4541, + "step": 39195 + }, + { + "epoch": 1.9900877003718698, + "grad_norm": 0.025611896685111196, + "learning_rate": 7.38757730225359e-08, + "loss": 0.4324, + "step": 39200 + }, + { + "epoch": 1.9903415364699013, + "grad_norm": 0.02262366655843535, + "learning_rate": 7.01169038888616e-08, + "loss": 0.4531, + "step": 39205 + }, + { + "epoch": 1.9905953725679328, + "grad_norm": 0.022614701857616906, + "learning_rate": 6.64561684981524e-08, + "loss": 0.4504, + "step": 39210 + }, + { + "epoch": 1.9908492086659644, + "grad_norm": 0.024008714004990543, + "learning_rate": 6.289356756888908e-08, + "loss": 0.4487, + "step": 39215 + }, + { + "epoch": 1.991103044763996, + "grad_norm": 0.020603992655022652, + "learning_rate": 5.9429101800401174e-08, + "loss": 0.4555, + "step": 39220 + }, + { + "epoch": 1.9913568808620274, + "grad_norm": 0.020889931974897115, + "learning_rate": 5.606277187286679e-08, + "loss": 0.463, + "step": 39225 + }, + { + "epoch": 1.991610716960059, + "grad_norm": 0.0313564646039055, + "learning_rate": 5.2794578446924145e-08, + "loss": 0.4804, + "step": 39230 + }, + { + "epoch": 1.9918645530580905, + "grad_norm": 0.021498099912898545, + "learning_rate": 4.962452216417113e-08, + "loss": 0.4825, + "step": 39235 + }, + { + "epoch": 1.992118389156122, + "grad_norm": 0.02066023063054367, + "learning_rate": 4.655260364694325e-08, + "loss": 0.4463, + "step": 39240 + }, + { + "epoch": 1.9923722252541534, + "grad_norm": 0.02131649131072014, + "learning_rate": 4.357882349809161e-08, + "loss": 0.4666, + "step": 39245 + }, + { + "epoch": 1.992626061352185, + "grad_norm": 0.02195382258423416, + "learning_rate": 4.0703182301482514e-08, + "loss": 0.4372, + "step": 39250 + }, + { + "epoch": 1.9928798974502164, + "grad_norm": 0.020715639914265123, + "learning_rate": 3.792568062155333e-08, + "loss": 0.4769, + "step": 39255 + }, + { + "epoch": 1.9931337335482477, + "grad_norm": 0.0206383390200722, + "learning_rate": 3.524631900347908e-08, + "loss": 0.4327, + "step": 39260 + }, + { + "epoch": 1.9933875696462793, + "grad_norm": 0.021381354938705657, + "learning_rate": 3.266509797328343e-08, + "loss": 0.4715, + "step": 39265 + }, + { + "epoch": 1.9936414057443108, + "grad_norm": 0.021713933568330897, + "learning_rate": 3.018201803756115e-08, + "loss": 0.4428, + "step": 39270 + }, + { + "epoch": 1.9938952418423423, + "grad_norm": 0.019288162544681065, + "learning_rate": 2.7797079683755666e-08, + "loss": 0.4647, + "step": 39275 + }, + { + "epoch": 1.9941490779403739, + "grad_norm": 0.027710930933847345, + "learning_rate": 2.5510283379992505e-08, + "loss": 0.4898, + "step": 39280 + }, + { + "epoch": 1.9944029140384054, + "grad_norm": 0.021943033021282254, + "learning_rate": 2.3321629575245862e-08, + "loss": 0.4366, + "step": 39285 + }, + { + "epoch": 1.994656750136437, + "grad_norm": 0.02816046395571235, + "learning_rate": 2.1231118699061024e-08, + "loss": 0.4599, + "step": 39290 + }, + { + "epoch": 1.9949105862344685, + "grad_norm": 0.021547960303644455, + "learning_rate": 1.9238751161831936e-08, + "loss": 0.425, + "step": 39295 + }, + { + "epoch": 1.9951644223325, + "grad_norm": 0.020328451114982965, + "learning_rate": 1.7344527354634655e-08, + "loss": 0.4201, + "step": 39300 + }, + { + "epoch": 1.9954182584305316, + "grad_norm": 0.02182918824731763, + "learning_rate": 1.554844764928287e-08, + "loss": 0.4637, + "step": 39305 + }, + { + "epoch": 1.9956720945285629, + "grad_norm": 0.02193159918916923, + "learning_rate": 1.3850512398383419e-08, + "loss": 0.4707, + "step": 39310 + }, + { + "epoch": 1.9959259306265944, + "grad_norm": 0.02149565745718913, + "learning_rate": 1.225072193516974e-08, + "loss": 0.4406, + "step": 39315 + }, + { + "epoch": 1.996179766724626, + "grad_norm": 0.020693611586431505, + "learning_rate": 1.0749076573723927e-08, + "loss": 0.4555, + "step": 39320 + }, + { + "epoch": 1.9964336028226572, + "grad_norm": 0.0239210359312069, + "learning_rate": 9.34557660875468e-09, + "loss": 0.4554, + "step": 39325 + }, + { + "epoch": 1.9966874389206888, + "grad_norm": 0.02080480579062484, + "learning_rate": 8.040222315819357e-09, + "loss": 0.4551, + "step": 39330 + }, + { + "epoch": 1.9969412750187203, + "grad_norm": 0.023148989474884817, + "learning_rate": 6.833013951157429e-09, + "loss": 0.4525, + "step": 39335 + }, + { + "epoch": 1.9971951111167519, + "grad_norm": 0.023818805572660067, + "learning_rate": 5.7239517516904925e-09, + "loss": 0.4586, + "step": 39340 + }, + { + "epoch": 1.9974489472147834, + "grad_norm": 0.029865648276199377, + "learning_rate": 4.713035935188792e-09, + "loss": 0.4767, + "step": 39345 + }, + { + "epoch": 1.997702783312815, + "grad_norm": 0.020425051877142286, + "learning_rate": 3.800266699993671e-09, + "loss": 0.444, + "step": 39350 + }, + { + "epoch": 1.9979566194108465, + "grad_norm": 0.022269852971449254, + "learning_rate": 2.9856442253506366e-09, + "loss": 0.4632, + "step": 39355 + }, + { + "epoch": 1.998210455508878, + "grad_norm": 0.02273454463601086, + "learning_rate": 2.2691686711318048e-09, + "loss": 0.4404, + "step": 39360 + }, + { + "epoch": 1.9984642916069095, + "grad_norm": 0.021280763318387155, + "learning_rate": 1.6508401780024329e-09, + "loss": 0.4621, + "step": 39365 + }, + { + "epoch": 1.998718127704941, + "grad_norm": 0.025913136491658925, + "learning_rate": 1.1306588673098972e-09, + "loss": 0.4594, + "step": 39370 + }, + { + "epoch": 1.9989719638029724, + "grad_norm": 0.02119584869294317, + "learning_rate": 7.08624841194716e-10, + "loss": 0.456, + "step": 39375 + }, + { + "epoch": 1.999225799901004, + "grad_norm": 0.01982643960905116, + "learning_rate": 3.8473818242401594e-10, + "loss": 0.436, + "step": 39380 + }, + { + "epoch": 1.9994796359990354, + "grad_norm": 0.024448883021925015, + "learning_rate": 1.5899895472459848e-10, + "loss": 0.4621, + "step": 39385 + }, + { + "epoch": 1.999733472097067, + "grad_norm": 0.027796596702240602, + "learning_rate": 3.140720228334004e-11, + "loss": 0.4499, + "step": 39390 + }, + { + "epoch": 1.9999365409754921, + "step": 39394, + "total_flos": 3.655598871565828e+18, + "train_loss": 0.5567794406680932, + "train_runtime": 148446.9136, + "train_samples_per_second": 2.123, + "train_steps_per_second": 0.265 + } + ], + "logging_steps": 5, + "max_steps": 39394, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.655598871565828e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}