diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,17601 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7135896458142392, + "eval_steps": 250, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002854358583256957, + "grad_norm": 2.75, + "learning_rate": 8.571428571428571e-06, + "loss": 3.7737, + "step": 1 + }, + { + "epoch": 0.0005708717166513914, + "grad_norm": 3.03125, + "learning_rate": 1.7142857142857142e-05, + "loss": 3.8253, + "step": 2 + }, + { + "epoch": 0.000856307574977087, + "grad_norm": 2.078125, + "learning_rate": 2.571428571428571e-05, + "loss": 3.8136, + "step": 3 + }, + { + "epoch": 0.0011417434333027827, + "grad_norm": 2.53125, + "learning_rate": 3.4285714285714284e-05, + "loss": 3.7592, + "step": 4 + }, + { + "epoch": 0.0014271792916284785, + "grad_norm": 2.515625, + "learning_rate": 4.285714285714285e-05, + "loss": 3.7806, + "step": 5 + }, + { + "epoch": 0.001712615149954174, + "grad_norm": 2.375, + "learning_rate": 5.142857142857142e-05, + "loss": 3.7962, + "step": 6 + }, + { + "epoch": 0.0019980510082798697, + "grad_norm": 2.53125, + "learning_rate": 5.9999999999999995e-05, + "loss": 3.7494, + "step": 7 + }, + { + "epoch": 0.0022834868666055655, + "grad_norm": 2.859375, + "learning_rate": 6.857142857142857e-05, + "loss": 3.7721, + "step": 8 + }, + { + "epoch": 0.0025689227249312612, + "grad_norm": 2.0625, + "learning_rate": 7.714285714285713e-05, + "loss": 3.744, + "step": 9 + }, + { + "epoch": 0.002854358583256957, + "grad_norm": 1.828125, + "learning_rate": 8.57142857142857e-05, + "loss": 3.7373, + "step": 10 + }, + { + "epoch": 0.003139794441582653, + "grad_norm": 2.65625, + "learning_rate": 9.428571428571427e-05, + "loss": 3.7021, + "step": 11 + }, + { + "epoch": 0.003425230299908348, + "grad_norm": 4.375, + "learning_rate": 0.00010285714285714284, + "loss": 3.6838, + "step": 12 + }, + { + "epoch": 0.003710666158234044, + "grad_norm": 2.5, + "learning_rate": 0.00011142857142857142, + "loss": 3.7084, + "step": 13 + }, + { + "epoch": 0.003996102016559739, + "grad_norm": 4.25, + "learning_rate": 0.00011999999999999999, + "loss": 3.6489, + "step": 14 + }, + { + "epoch": 0.004281537874885435, + "grad_norm": 2.84375, + "learning_rate": 0.00012857142857142855, + "loss": 3.6588, + "step": 15 + }, + { + "epoch": 0.004566973733211131, + "grad_norm": 4.4375, + "learning_rate": 0.00013714285714285713, + "loss": 3.6394, + "step": 16 + }, + { + "epoch": 0.004852409591536827, + "grad_norm": 3.203125, + "learning_rate": 0.0001457142857142857, + "loss": 3.5906, + "step": 17 + }, + { + "epoch": 0.0051378454498625225, + "grad_norm": 2.640625, + "learning_rate": 0.00015428571428571425, + "loss": 3.5944, + "step": 18 + }, + { + "epoch": 0.005423281308188218, + "grad_norm": 4.21875, + "learning_rate": 0.00016285714285714284, + "loss": 3.5843, + "step": 19 + }, + { + "epoch": 0.005708717166513914, + "grad_norm": 2.875, + "learning_rate": 0.0001714285714285714, + "loss": 3.5661, + "step": 20 + }, + { + "epoch": 0.00599415302483961, + "grad_norm": 4.375, + "learning_rate": 0.00017999999999999998, + "loss": 3.5976, + "step": 21 + }, + { + "epoch": 0.006279588883165306, + "grad_norm": 3.0, + "learning_rate": 0.00018857142857142854, + "loss": 3.5226, + "step": 22 + }, + { + "epoch": 0.006565024741491001, + "grad_norm": 2.828125, + "learning_rate": 0.00019714285714285713, + "loss": 3.5581, + "step": 23 + }, + { + "epoch": 0.006850460599816696, + "grad_norm": 4.1875, + "learning_rate": 0.0002057142857142857, + "loss": 3.5337, + "step": 24 + }, + { + "epoch": 0.007135896458142392, + "grad_norm": 5.375, + "learning_rate": 0.00021428571428571427, + "loss": 3.502, + "step": 25 + }, + { + "epoch": 0.007421332316468088, + "grad_norm": 2.359375, + "learning_rate": 0.00022285714285714283, + "loss": 3.4848, + "step": 26 + }, + { + "epoch": 0.007706768174793784, + "grad_norm": 7.65625, + "learning_rate": 0.00023142857142857142, + "loss": 3.5451, + "step": 27 + }, + { + "epoch": 0.007992204033119479, + "grad_norm": 4.96875, + "learning_rate": 0.00023999999999999998, + "loss": 3.5235, + "step": 28 + }, + { + "epoch": 0.008277639891445174, + "grad_norm": 6.65625, + "learning_rate": 0.00024857142857142857, + "loss": 3.5061, + "step": 29 + }, + { + "epoch": 0.00856307574977087, + "grad_norm": 4.9375, + "learning_rate": 0.0002571428571428571, + "loss": 3.5228, + "step": 30 + }, + { + "epoch": 0.008848511608096566, + "grad_norm": 7.75, + "learning_rate": 0.0002657142857142857, + "loss": 3.4963, + "step": 31 + }, + { + "epoch": 0.009133947466422262, + "grad_norm": 4.75, + "learning_rate": 0.00027428571428571427, + "loss": 3.5074, + "step": 32 + }, + { + "epoch": 0.009419383324747958, + "grad_norm": 5.3125, + "learning_rate": 0.0002828571428571428, + "loss": 3.4555, + "step": 33 + }, + { + "epoch": 0.009704819183073653, + "grad_norm": 4.40625, + "learning_rate": 0.0002914285714285714, + "loss": 3.4634, + "step": 34 + }, + { + "epoch": 0.00999025504139935, + "grad_norm": 4.8125, + "learning_rate": 0.0003, + "loss": 3.4516, + "step": 35 + }, + { + "epoch": 0.010275690899725045, + "grad_norm": 3.921875, + "learning_rate": 0.00029999993845357924, + "loss": 3.4341, + "step": 36 + }, + { + "epoch": 0.01056112675805074, + "grad_norm": 5.40625, + "learning_rate": 0.0002999997538143675, + "loss": 3.4625, + "step": 37 + }, + { + "epoch": 0.010846562616376437, + "grad_norm": 4.59375, + "learning_rate": 0.0002999994460825163, + "loss": 3.4492, + "step": 38 + }, + { + "epoch": 0.011131998474702132, + "grad_norm": 3.5625, + "learning_rate": 0.0002999990152582781, + "loss": 3.4078, + "step": 39 + }, + { + "epoch": 0.011417434333027828, + "grad_norm": 4.75, + "learning_rate": 0.00029999846134200653, + "loss": 3.4077, + "step": 40 + }, + { + "epoch": 0.011702870191353524, + "grad_norm": 4.03125, + "learning_rate": 0.0002999977843341562, + "loss": 3.4062, + "step": 41 + }, + { + "epoch": 0.01198830604967922, + "grad_norm": 3.859375, + "learning_rate": 0.0002999969842352825, + "loss": 3.3895, + "step": 42 + }, + { + "epoch": 0.012273741908004916, + "grad_norm": 3.25, + "learning_rate": 0.0002999960610460421, + "loss": 3.3762, + "step": 43 + }, + { + "epoch": 0.012559177766330611, + "grad_norm": 4.0625, + "learning_rate": 0.00029999501476719257, + "loss": 3.3807, + "step": 44 + }, + { + "epoch": 0.012844613624656307, + "grad_norm": 3.71875, + "learning_rate": 0.00029999384539959253, + "loss": 3.3432, + "step": 45 + }, + { + "epoch": 0.013130049482982001, + "grad_norm": 3.328125, + "learning_rate": 0.0002999925529442016, + "loss": 3.3543, + "step": 46 + }, + { + "epoch": 0.013415485341307697, + "grad_norm": 5.5625, + "learning_rate": 0.0002999911374020804, + "loss": 3.3339, + "step": 47 + }, + { + "epoch": 0.013700921199633393, + "grad_norm": 2.25, + "learning_rate": 0.00029998959877439044, + "loss": 3.3377, + "step": 48 + }, + { + "epoch": 0.013986357057959089, + "grad_norm": 4.84375, + "learning_rate": 0.0002999879370623944, + "loss": 3.4033, + "step": 49 + }, + { + "epoch": 0.014271792916284784, + "grad_norm": 4.375, + "learning_rate": 0.00029998615226745605, + "loss": 3.3567, + "step": 50 + }, + { + "epoch": 0.01455722877461048, + "grad_norm": 3.15625, + "learning_rate": 0.0002999842443910399, + "loss": 3.3819, + "step": 51 + }, + { + "epoch": 0.014842664632936176, + "grad_norm": 4.5, + "learning_rate": 0.0002999822134347115, + "loss": 3.3586, + "step": 52 + }, + { + "epoch": 0.015128100491261872, + "grad_norm": 3.671875, + "learning_rate": 0.0002999800594001376, + "loss": 3.3414, + "step": 53 + }, + { + "epoch": 0.015413536349587567, + "grad_norm": 2.765625, + "learning_rate": 0.000299977782289086, + "loss": 3.3165, + "step": 54 + }, + { + "epoch": 0.01569897220791326, + "grad_norm": 4.6875, + "learning_rate": 0.00029997538210342503, + "loss": 3.3446, + "step": 55 + }, + { + "epoch": 0.015984408066238957, + "grad_norm": 4.0625, + "learning_rate": 0.0002999728588451245, + "loss": 3.3649, + "step": 56 + }, + { + "epoch": 0.016269843924564653, + "grad_norm": 2.828125, + "learning_rate": 0.000299970212516255, + "loss": 3.3258, + "step": 57 + }, + { + "epoch": 0.01655527978289035, + "grad_norm": 3.84375, + "learning_rate": 0.0002999674431189883, + "loss": 3.3137, + "step": 58 + }, + { + "epoch": 0.016840715641216045, + "grad_norm": 2.53125, + "learning_rate": 0.0002999645506555967, + "loss": 3.31, + "step": 59 + }, + { + "epoch": 0.01712615149954174, + "grad_norm": 3.796875, + "learning_rate": 0.00029996153512845415, + "loss": 3.3022, + "step": 60 + }, + { + "epoch": 0.017411587357867436, + "grad_norm": 3.671875, + "learning_rate": 0.00029995839654003504, + "loss": 3.3119, + "step": 61 + }, + { + "epoch": 0.017697023216193132, + "grad_norm": 2.828125, + "learning_rate": 0.00029995513489291506, + "loss": 3.306, + "step": 62 + }, + { + "epoch": 0.017982459074518828, + "grad_norm": 2.96875, + "learning_rate": 0.0002999517501897707, + "loss": 3.2965, + "step": 63 + }, + { + "epoch": 0.018267894932844524, + "grad_norm": 3.765625, + "learning_rate": 0.0002999482424333796, + "loss": 3.3035, + "step": 64 + }, + { + "epoch": 0.01855333079117022, + "grad_norm": 2.96875, + "learning_rate": 0.00029994461162662024, + "loss": 3.2734, + "step": 65 + }, + { + "epoch": 0.018838766649495915, + "grad_norm": 2.28125, + "learning_rate": 0.0002999408577724721, + "loss": 3.2772, + "step": 66 + }, + { + "epoch": 0.01912420250782161, + "grad_norm": 3.46875, + "learning_rate": 0.0002999369808740157, + "loss": 3.2491, + "step": 67 + }, + { + "epoch": 0.019409638366147307, + "grad_norm": 3.78125, + "learning_rate": 0.00029993298093443246, + "loss": 3.2943, + "step": 68 + }, + { + "epoch": 0.019695074224473003, + "grad_norm": 1.9296875, + "learning_rate": 0.0002999288579570049, + "loss": 3.2525, + "step": 69 + }, + { + "epoch": 0.0199805100827987, + "grad_norm": 4.0, + "learning_rate": 0.00029992461194511624, + "loss": 3.2765, + "step": 70 + }, + { + "epoch": 0.020265945941124394, + "grad_norm": 2.578125, + "learning_rate": 0.000299920242902251, + "loss": 3.2538, + "step": 71 + }, + { + "epoch": 0.02055138179945009, + "grad_norm": 2.84375, + "learning_rate": 0.00029991575083199455, + "loss": 3.2407, + "step": 72 + }, + { + "epoch": 0.020836817657775786, + "grad_norm": 3.203125, + "learning_rate": 0.00029991113573803294, + "loss": 3.2537, + "step": 73 + }, + { + "epoch": 0.02112225351610148, + "grad_norm": 4.34375, + "learning_rate": 0.0002999063976241536, + "loss": 3.2618, + "step": 74 + }, + { + "epoch": 0.021407689374427177, + "grad_norm": 1.5390625, + "learning_rate": 0.00029990153649424463, + "loss": 3.2486, + "step": 75 + }, + { + "epoch": 0.021693125232752873, + "grad_norm": 6.15625, + "learning_rate": 0.0002998965523522951, + "loss": 3.2839, + "step": 76 + }, + { + "epoch": 0.02197856109107857, + "grad_norm": 3.953125, + "learning_rate": 0.0002998914452023953, + "loss": 3.2866, + "step": 77 + }, + { + "epoch": 0.022263996949404265, + "grad_norm": 4.9375, + "learning_rate": 0.00029988621504873606, + "loss": 3.3082, + "step": 78 + }, + { + "epoch": 0.02254943280772996, + "grad_norm": 3.578125, + "learning_rate": 0.0002998808618956094, + "loss": 3.2833, + "step": 79 + }, + { + "epoch": 0.022834868666055656, + "grad_norm": 4.375, + "learning_rate": 0.00029987538574740826, + "loss": 3.2748, + "step": 80 + }, + { + "epoch": 0.023120304524381352, + "grad_norm": 2.921875, + "learning_rate": 0.0002998697866086264, + "loss": 3.2491, + "step": 81 + }, + { + "epoch": 0.023405740382707048, + "grad_norm": 3.5, + "learning_rate": 0.0002998640644838587, + "loss": 3.2526, + "step": 82 + }, + { + "epoch": 0.023691176241032744, + "grad_norm": 3.09375, + "learning_rate": 0.0002998582193778006, + "loss": 3.2262, + "step": 83 + }, + { + "epoch": 0.02397661209935844, + "grad_norm": 2.96875, + "learning_rate": 0.000299852251295249, + "loss": 3.2321, + "step": 84 + }, + { + "epoch": 0.024262047957684135, + "grad_norm": 2.796875, + "learning_rate": 0.0002998461602411013, + "loss": 3.2485, + "step": 85 + }, + { + "epoch": 0.02454748381600983, + "grad_norm": 2.46875, + "learning_rate": 0.00029983994622035585, + "loss": 3.2223, + "step": 86 + }, + { + "epoch": 0.024832919674335527, + "grad_norm": 3.484375, + "learning_rate": 0.0002998336092381121, + "loss": 3.2184, + "step": 87 + }, + { + "epoch": 0.025118355532661223, + "grad_norm": 2.734375, + "learning_rate": 0.0002998271492995702, + "loss": 3.2204, + "step": 88 + }, + { + "epoch": 0.02540379139098692, + "grad_norm": 3.34375, + "learning_rate": 0.00029982056641003147, + "loss": 3.2185, + "step": 89 + }, + { + "epoch": 0.025689227249312614, + "grad_norm": 2.03125, + "learning_rate": 0.00029981386057489776, + "loss": 3.1942, + "step": 90 + }, + { + "epoch": 0.025974663107638307, + "grad_norm": 2.953125, + "learning_rate": 0.00029980703179967213, + "loss": 3.1724, + "step": 91 + }, + { + "epoch": 0.026260098965964002, + "grad_norm": 3.015625, + "learning_rate": 0.00029980008008995834, + "loss": 3.2225, + "step": 92 + }, + { + "epoch": 0.026545534824289698, + "grad_norm": 3.125, + "learning_rate": 0.0002997930054514612, + "loss": 3.2103, + "step": 93 + }, + { + "epoch": 0.026830970682615394, + "grad_norm": 2.3125, + "learning_rate": 0.0002997858078899861, + "loss": 3.1942, + "step": 94 + }, + { + "epoch": 0.02711640654094109, + "grad_norm": 2.234375, + "learning_rate": 0.00029977848741143966, + "loss": 3.1652, + "step": 95 + }, + { + "epoch": 0.027401842399266785, + "grad_norm": 3.234375, + "learning_rate": 0.0002997710440218291, + "loss": 3.186, + "step": 96 + }, + { + "epoch": 0.02768727825759248, + "grad_norm": 2.40625, + "learning_rate": 0.0002997634777272627, + "loss": 3.1928, + "step": 97 + }, + { + "epoch": 0.027972714115918177, + "grad_norm": 2.625, + "learning_rate": 0.0002997557885339494, + "loss": 3.169, + "step": 98 + }, + { + "epoch": 0.028258149974243873, + "grad_norm": 2.015625, + "learning_rate": 0.00029974797644819926, + "loss": 3.174, + "step": 99 + }, + { + "epoch": 0.02854358583256957, + "grad_norm": 3.984375, + "learning_rate": 0.0002997400414764229, + "loss": 3.1859, + "step": 100 + }, + { + "epoch": 0.028829021690895264, + "grad_norm": 2.234375, + "learning_rate": 0.0002997319836251319, + "loss": 3.1975, + "step": 101 + }, + { + "epoch": 0.02911445754922096, + "grad_norm": 2.65625, + "learning_rate": 0.0002997238029009387, + "loss": 3.163, + "step": 102 + }, + { + "epoch": 0.029399893407546656, + "grad_norm": 3.359375, + "learning_rate": 0.0002997154993105566, + "loss": 3.1766, + "step": 103 + }, + { + "epoch": 0.029685329265872352, + "grad_norm": 3.078125, + "learning_rate": 0.00029970707286079966, + "loss": 3.1692, + "step": 104 + }, + { + "epoch": 0.029970765124198048, + "grad_norm": 3.171875, + "learning_rate": 0.00029969852355858276, + "loss": 3.1785, + "step": 105 + }, + { + "epoch": 0.030256200982523743, + "grad_norm": 2.09375, + "learning_rate": 0.00029968985141092165, + "loss": 3.1622, + "step": 106 + }, + { + "epoch": 0.03054163684084944, + "grad_norm": 2.625, + "learning_rate": 0.00029968105642493286, + "loss": 3.1934, + "step": 107 + }, + { + "epoch": 0.030827072699175135, + "grad_norm": 3.25, + "learning_rate": 0.0002996721386078337, + "loss": 3.1503, + "step": 108 + }, + { + "epoch": 0.03111250855750083, + "grad_norm": 2.34375, + "learning_rate": 0.00029966309796694226, + "loss": 3.1415, + "step": 109 + }, + { + "epoch": 0.03139794441582652, + "grad_norm": 2.6875, + "learning_rate": 0.0002996539345096776, + "loss": 3.169, + "step": 110 + }, + { + "epoch": 0.03168338027415222, + "grad_norm": 1.828125, + "learning_rate": 0.0002996446482435593, + "loss": 3.1381, + "step": 111 + }, + { + "epoch": 0.031968816132477915, + "grad_norm": 2.8125, + "learning_rate": 0.0002996352391762079, + "loss": 3.1506, + "step": 112 + }, + { + "epoch": 0.03225425199080361, + "grad_norm": 2.796875, + "learning_rate": 0.0002996257073153446, + "loss": 3.1666, + "step": 113 + }, + { + "epoch": 0.032539687849129306, + "grad_norm": 2.546875, + "learning_rate": 0.00029961605266879153, + "loss": 3.1883, + "step": 114 + }, + { + "epoch": 0.032825123707455, + "grad_norm": 2.703125, + "learning_rate": 0.0002996062752444714, + "loss": 3.1594, + "step": 115 + }, + { + "epoch": 0.0331105595657807, + "grad_norm": 2.15625, + "learning_rate": 0.00029959637505040773, + "loss": 3.1553, + "step": 116 + }, + { + "epoch": 0.033395995424106394, + "grad_norm": 2.8125, + "learning_rate": 0.00029958635209472486, + "loss": 3.125, + "step": 117 + }, + { + "epoch": 0.03368143128243209, + "grad_norm": 2.4375, + "learning_rate": 0.00029957620638564785, + "loss": 3.1074, + "step": 118 + }, + { + "epoch": 0.033966867140757785, + "grad_norm": 2.03125, + "learning_rate": 0.00029956593793150233, + "loss": 3.1193, + "step": 119 + }, + { + "epoch": 0.03425230299908348, + "grad_norm": 2.484375, + "learning_rate": 0.0002995555467407149, + "loss": 3.107, + "step": 120 + }, + { + "epoch": 0.03453773885740918, + "grad_norm": 2.84375, + "learning_rate": 0.0002995450328218127, + "loss": 3.1292, + "step": 121 + }, + { + "epoch": 0.03482317471573487, + "grad_norm": 2.0, + "learning_rate": 0.0002995343961834238, + "loss": 3.1159, + "step": 122 + }, + { + "epoch": 0.03510861057406057, + "grad_norm": 2.390625, + "learning_rate": 0.0002995236368342766, + "loss": 3.1207, + "step": 123 + }, + { + "epoch": 0.035394046432386264, + "grad_norm": 2.109375, + "learning_rate": 0.00029951275478320056, + "loss": 3.1056, + "step": 124 + }, + { + "epoch": 0.03567948229071196, + "grad_norm": 2.984375, + "learning_rate": 0.00029950175003912573, + "loss": 3.1206, + "step": 125 + }, + { + "epoch": 0.035964918149037656, + "grad_norm": 1.484375, + "learning_rate": 0.0002994906226110827, + "loss": 3.1213, + "step": 126 + }, + { + "epoch": 0.03625035400736335, + "grad_norm": 2.96875, + "learning_rate": 0.00029947937250820295, + "loss": 3.1091, + "step": 127 + }, + { + "epoch": 0.03653578986568905, + "grad_norm": 1.8125, + "learning_rate": 0.0002994679997397185, + "loss": 3.1071, + "step": 128 + }, + { + "epoch": 0.03682122572401474, + "grad_norm": 3.15625, + "learning_rate": 0.000299456504314962, + "loss": 3.143, + "step": 129 + }, + { + "epoch": 0.03710666158234044, + "grad_norm": 1.9765625, + "learning_rate": 0.00029944488624336683, + "loss": 3.1106, + "step": 130 + }, + { + "epoch": 0.037392097440666135, + "grad_norm": 3.3125, + "learning_rate": 0.00029943314553446706, + "loss": 3.1163, + "step": 131 + }, + { + "epoch": 0.03767753329899183, + "grad_norm": 2.578125, + "learning_rate": 0.00029942128219789734, + "loss": 3.1173, + "step": 132 + }, + { + "epoch": 0.037962969157317526, + "grad_norm": 2.734375, + "learning_rate": 0.0002994092962433929, + "loss": 3.1289, + "step": 133 + }, + { + "epoch": 0.03824840501564322, + "grad_norm": 2.484375, + "learning_rate": 0.0002993971876807896, + "loss": 3.1056, + "step": 134 + }, + { + "epoch": 0.03853384087396892, + "grad_norm": 2.40625, + "learning_rate": 0.0002993849565200241, + "loss": 3.0896, + "step": 135 + }, + { + "epoch": 0.038819276732294614, + "grad_norm": 2.359375, + "learning_rate": 0.0002993726027711333, + "loss": 3.1087, + "step": 136 + }, + { + "epoch": 0.03910471259062031, + "grad_norm": 2.328125, + "learning_rate": 0.00029936012644425517, + "loss": 3.1059, + "step": 137 + }, + { + "epoch": 0.039390148448946005, + "grad_norm": 2.984375, + "learning_rate": 0.00029934752754962783, + "loss": 3.1265, + "step": 138 + }, + { + "epoch": 0.0396755843072717, + "grad_norm": 2.15625, + "learning_rate": 0.00029933480609759027, + "loss": 3.0987, + "step": 139 + }, + { + "epoch": 0.0399610201655974, + "grad_norm": 2.59375, + "learning_rate": 0.00029932196209858197, + "loss": 3.1122, + "step": 140 + }, + { + "epoch": 0.04024645602392309, + "grad_norm": 2.375, + "learning_rate": 0.0002993089955631429, + "loss": 3.0887, + "step": 141 + }, + { + "epoch": 0.04053189188224879, + "grad_norm": 2.25, + "learning_rate": 0.0002992959065019136, + "loss": 3.0815, + "step": 142 + }, + { + "epoch": 0.040817327740574484, + "grad_norm": 3.0, + "learning_rate": 0.00029928269492563537, + "loss": 3.0889, + "step": 143 + }, + { + "epoch": 0.04110276359890018, + "grad_norm": 1.53125, + "learning_rate": 0.00029926936084514967, + "loss": 3.0793, + "step": 144 + }, + { + "epoch": 0.041388199457225876, + "grad_norm": 2.59375, + "learning_rate": 0.00029925590427139887, + "loss": 3.0804, + "step": 145 + }, + { + "epoch": 0.04167363531555157, + "grad_norm": 1.8984375, + "learning_rate": 0.00029924232521542557, + "loss": 3.0612, + "step": 146 + }, + { + "epoch": 0.04195907117387727, + "grad_norm": 2.71875, + "learning_rate": 0.00029922862368837315, + "loss": 3.0698, + "step": 147 + }, + { + "epoch": 0.04224450703220296, + "grad_norm": 2.859375, + "learning_rate": 0.00029921479970148517, + "loss": 3.088, + "step": 148 + }, + { + "epoch": 0.04252994289052866, + "grad_norm": 1.9609375, + "learning_rate": 0.00029920085326610595, + "loss": 3.0765, + "step": 149 + }, + { + "epoch": 0.042815378748854355, + "grad_norm": 3.515625, + "learning_rate": 0.00029918678439368017, + "loss": 3.0926, + "step": 150 + }, + { + "epoch": 0.04310081460718005, + "grad_norm": 2.453125, + "learning_rate": 0.000299172593095753, + "loss": 3.0821, + "step": 151 + }, + { + "epoch": 0.043386250465505746, + "grad_norm": 5.25, + "learning_rate": 0.00029915827938397017, + "loss": 3.0682, + "step": 152 + }, + { + "epoch": 0.04367168632383144, + "grad_norm": 3.078125, + "learning_rate": 0.0002991438432700777, + "loss": 3.0657, + "step": 153 + }, + { + "epoch": 0.04395712218215714, + "grad_norm": 4.03125, + "learning_rate": 0.0002991292847659222, + "loss": 3.0883, + "step": 154 + }, + { + "epoch": 0.044242558040482834, + "grad_norm": 3.828125, + "learning_rate": 0.0002991146038834505, + "loss": 3.0962, + "step": 155 + }, + { + "epoch": 0.04452799389880853, + "grad_norm": 2.578125, + "learning_rate": 0.0002990998006347102, + "loss": 3.0695, + "step": 156 + }, + { + "epoch": 0.044813429757134225, + "grad_norm": 4.0625, + "learning_rate": 0.0002990848750318491, + "loss": 3.1003, + "step": 157 + }, + { + "epoch": 0.04509886561545992, + "grad_norm": 2.90625, + "learning_rate": 0.00029906982708711533, + "loss": 3.0733, + "step": 158 + }, + { + "epoch": 0.04538430147378562, + "grad_norm": 5.53125, + "learning_rate": 0.0002990546568128576, + "loss": 3.1179, + "step": 159 + }, + { + "epoch": 0.04566973733211131, + "grad_norm": 4.625, + "learning_rate": 0.00029903936422152487, + "loss": 3.1125, + "step": 160 + }, + { + "epoch": 0.04595517319043701, + "grad_norm": 4.90625, + "learning_rate": 0.00029902394932566657, + "loss": 3.0922, + "step": 161 + }, + { + "epoch": 0.046240609048762704, + "grad_norm": 3.34375, + "learning_rate": 0.00029900841213793247, + "loss": 3.048, + "step": 162 + }, + { + "epoch": 0.0465260449070884, + "grad_norm": 9.5, + "learning_rate": 0.00029899275267107264, + "loss": 3.1456, + "step": 163 + }, + { + "epoch": 0.046811480765414096, + "grad_norm": 8.3125, + "learning_rate": 0.00029897697093793753, + "loss": 3.1066, + "step": 164 + }, + { + "epoch": 0.04709691662373979, + "grad_norm": 3.0, + "learning_rate": 0.000298961066951478, + "loss": 3.0876, + "step": 165 + }, + { + "epoch": 0.04738235248206549, + "grad_norm": 6.0625, + "learning_rate": 0.0002989450407247451, + "loss": 3.1259, + "step": 166 + }, + { + "epoch": 0.04766778834039118, + "grad_norm": 5.96875, + "learning_rate": 0.0002989288922708902, + "loss": 3.1248, + "step": 167 + }, + { + "epoch": 0.04795322419871688, + "grad_norm": 3.4375, + "learning_rate": 0.0002989126216031652, + "loss": 3.0802, + "step": 168 + }, + { + "epoch": 0.048238660057042575, + "grad_norm": 3.890625, + "learning_rate": 0.00029889622873492195, + "loss": 3.0777, + "step": 169 + }, + { + "epoch": 0.04852409591536827, + "grad_norm": 2.78125, + "learning_rate": 0.0002988797136796128, + "loss": 3.0904, + "step": 170 + }, + { + "epoch": 0.048809531773693966, + "grad_norm": 3.453125, + "learning_rate": 0.0002988630764507904, + "loss": 3.081, + "step": 171 + }, + { + "epoch": 0.04909496763201966, + "grad_norm": 2.859375, + "learning_rate": 0.0002988463170621074, + "loss": 3.0743, + "step": 172 + }, + { + "epoch": 0.04938040349034536, + "grad_norm": 2.515625, + "learning_rate": 0.00029882943552731703, + "loss": 3.0189, + "step": 173 + }, + { + "epoch": 0.049665839348671054, + "grad_norm": 2.6875, + "learning_rate": 0.0002988124318602725, + "loss": 3.0684, + "step": 174 + }, + { + "epoch": 0.04995127520699675, + "grad_norm": 2.21875, + "learning_rate": 0.0002987953060749274, + "loss": 3.0479, + "step": 175 + }, + { + "epoch": 0.050236711065322445, + "grad_norm": 2.90625, + "learning_rate": 0.0002987780581853355, + "loss": 3.0374, + "step": 176 + }, + { + "epoch": 0.05052214692364814, + "grad_norm": 2.078125, + "learning_rate": 0.0002987606882056507, + "loss": 3.0589, + "step": 177 + }, + { + "epoch": 0.05080758278197384, + "grad_norm": 3.59375, + "learning_rate": 0.00029874319615012714, + "loss": 3.0731, + "step": 178 + }, + { + "epoch": 0.05109301864029953, + "grad_norm": 3.109375, + "learning_rate": 0.00029872558203311914, + "loss": 3.0793, + "step": 179 + }, + { + "epoch": 0.05137845449862523, + "grad_norm": 2.546875, + "learning_rate": 0.0002987078458690811, + "loss": 3.0748, + "step": 180 + }, + { + "epoch": 0.05166389035695092, + "grad_norm": 3.109375, + "learning_rate": 0.0002986899876725678, + "loss": 3.0308, + "step": 181 + }, + { + "epoch": 0.05194932621527661, + "grad_norm": 2.0625, + "learning_rate": 0.00029867200745823384, + "loss": 3.0496, + "step": 182 + }, + { + "epoch": 0.05223476207360231, + "grad_norm": 2.40625, + "learning_rate": 0.0002986539052408343, + "loss": 3.0577, + "step": 183 + }, + { + "epoch": 0.052520197931928005, + "grad_norm": 2.75, + "learning_rate": 0.0002986356810352241, + "loss": 3.0357, + "step": 184 + }, + { + "epoch": 0.0528056337902537, + "grad_norm": 1.546875, + "learning_rate": 0.00029861733485635834, + "loss": 3.023, + "step": 185 + }, + { + "epoch": 0.053091069648579396, + "grad_norm": 2.6875, + "learning_rate": 0.00029859886671929233, + "loss": 3.0768, + "step": 186 + }, + { + "epoch": 0.05337650550690509, + "grad_norm": 1.90625, + "learning_rate": 0.00029858027663918135, + "loss": 3.0272, + "step": 187 + }, + { + "epoch": 0.05366194136523079, + "grad_norm": 2.328125, + "learning_rate": 0.0002985615646312807, + "loss": 3.0348, + "step": 188 + }, + { + "epoch": 0.053947377223556484, + "grad_norm": 2.140625, + "learning_rate": 0.00029854273071094596, + "loss": 3.0245, + "step": 189 + }, + { + "epoch": 0.05423281308188218, + "grad_norm": 1.9375, + "learning_rate": 0.00029852377489363247, + "loss": 3.0558, + "step": 190 + }, + { + "epoch": 0.054518248940207875, + "grad_norm": 2.578125, + "learning_rate": 0.00029850469719489573, + "loss": 3.0611, + "step": 191 + }, + { + "epoch": 0.05480368479853357, + "grad_norm": 1.9453125, + "learning_rate": 0.00029848549763039135, + "loss": 3.0442, + "step": 192 + }, + { + "epoch": 0.05508912065685927, + "grad_norm": 2.796875, + "learning_rate": 0.00029846617621587474, + "loss": 3.06, + "step": 193 + }, + { + "epoch": 0.05537455651518496, + "grad_norm": 1.765625, + "learning_rate": 0.00029844673296720154, + "loss": 3.0144, + "step": 194 + }, + { + "epoch": 0.05565999237351066, + "grad_norm": 2.46875, + "learning_rate": 0.0002984271679003272, + "loss": 3.0423, + "step": 195 + }, + { + "epoch": 0.055945428231836354, + "grad_norm": 1.8671875, + "learning_rate": 0.0002984074810313071, + "loss": 3.0504, + "step": 196 + }, + { + "epoch": 0.05623086409016205, + "grad_norm": 2.203125, + "learning_rate": 0.00029838767237629684, + "loss": 3.0031, + "step": 197 + }, + { + "epoch": 0.056516299948487746, + "grad_norm": 1.8046875, + "learning_rate": 0.0002983677419515516, + "loss": 3.0401, + "step": 198 + }, + { + "epoch": 0.05680173580681344, + "grad_norm": 2.015625, + "learning_rate": 0.00029834768977342677, + "loss": 3.0359, + "step": 199 + }, + { + "epoch": 0.05708717166513914, + "grad_norm": 2.5625, + "learning_rate": 0.0002983275158583775, + "loss": 3.028, + "step": 200 + }, + { + "epoch": 0.05737260752346483, + "grad_norm": 1.9140625, + "learning_rate": 0.0002983072202229589, + "loss": 3.0115, + "step": 201 + }, + { + "epoch": 0.05765804338179053, + "grad_norm": 1.953125, + "learning_rate": 0.000298286802883826, + "loss": 3.0221, + "step": 202 + }, + { + "epoch": 0.057943479240116225, + "grad_norm": 2.109375, + "learning_rate": 0.0002982662638577335, + "loss": 3.0104, + "step": 203 + }, + { + "epoch": 0.05822891509844192, + "grad_norm": 2.21875, + "learning_rate": 0.00029824560316153633, + "loss": 2.9983, + "step": 204 + }, + { + "epoch": 0.058514350956767616, + "grad_norm": 1.8984375, + "learning_rate": 0.00029822482081218887, + "loss": 3.0208, + "step": 205 + }, + { + "epoch": 0.05879978681509331, + "grad_norm": 2.5, + "learning_rate": 0.00029820391682674563, + "loss": 3.0206, + "step": 206 + }, + { + "epoch": 0.05908522267341901, + "grad_norm": 2.109375, + "learning_rate": 0.00029818289122236075, + "loss": 3.0552, + "step": 207 + }, + { + "epoch": 0.059370658531744704, + "grad_norm": 1.7421875, + "learning_rate": 0.00029816174401628827, + "loss": 3.0075, + "step": 208 + }, + { + "epoch": 0.0596560943900704, + "grad_norm": 2.03125, + "learning_rate": 0.00029814047522588194, + "loss": 3.0068, + "step": 209 + }, + { + "epoch": 0.059941530248396095, + "grad_norm": 1.5546875, + "learning_rate": 0.0002981190848685954, + "loss": 2.9909, + "step": 210 + }, + { + "epoch": 0.06022696610672179, + "grad_norm": 2.453125, + "learning_rate": 0.00029809757296198194, + "loss": 2.9962, + "step": 211 + }, + { + "epoch": 0.06051240196504749, + "grad_norm": 1.515625, + "learning_rate": 0.00029807593952369465, + "loss": 3.0294, + "step": 212 + }, + { + "epoch": 0.06079783782337318, + "grad_norm": 2.484375, + "learning_rate": 0.00029805418457148637, + "loss": 2.9857, + "step": 213 + }, + { + "epoch": 0.06108327368169888, + "grad_norm": 2.15625, + "learning_rate": 0.00029803230812320956, + "loss": 3.0202, + "step": 214 + }, + { + "epoch": 0.061368709540024574, + "grad_norm": 2.0625, + "learning_rate": 0.00029801031019681645, + "loss": 2.9734, + "step": 215 + }, + { + "epoch": 0.06165414539835027, + "grad_norm": 1.7109375, + "learning_rate": 0.000297988190810359, + "loss": 2.9859, + "step": 216 + }, + { + "epoch": 0.061939581256675966, + "grad_norm": 3.203125, + "learning_rate": 0.0002979659499819888, + "loss": 3.0128, + "step": 217 + }, + { + "epoch": 0.06222501711500166, + "grad_norm": 1.7734375, + "learning_rate": 0.0002979435877299571, + "loss": 3.0178, + "step": 218 + }, + { + "epoch": 0.06251045297332736, + "grad_norm": 2.75, + "learning_rate": 0.0002979211040726147, + "loss": 2.9779, + "step": 219 + }, + { + "epoch": 0.06279588883165305, + "grad_norm": 2.375, + "learning_rate": 0.00029789849902841223, + "loss": 2.9843, + "step": 220 + }, + { + "epoch": 0.06308132468997875, + "grad_norm": 2.3125, + "learning_rate": 0.0002978757726158998, + "loss": 2.9943, + "step": 221 + }, + { + "epoch": 0.06336676054830444, + "grad_norm": 2.421875, + "learning_rate": 0.0002978529248537271, + "loss": 3.0043, + "step": 222 + }, + { + "epoch": 0.06365219640663014, + "grad_norm": 2.109375, + "learning_rate": 0.00029782995576064337, + "loss": 2.9729, + "step": 223 + }, + { + "epoch": 0.06393763226495583, + "grad_norm": 1.484375, + "learning_rate": 0.00029780686535549756, + "loss": 2.9874, + "step": 224 + }, + { + "epoch": 0.06422306812328153, + "grad_norm": 2.5, + "learning_rate": 0.0002977836536572382, + "loss": 3.0055, + "step": 225 + }, + { + "epoch": 0.06450850398160722, + "grad_norm": 1.8046875, + "learning_rate": 0.00029776032068491303, + "loss": 3.0, + "step": 226 + }, + { + "epoch": 0.06479393983993292, + "grad_norm": 3.03125, + "learning_rate": 0.0002977368664576696, + "loss": 3.0042, + "step": 227 + }, + { + "epoch": 0.06507937569825861, + "grad_norm": 2.453125, + "learning_rate": 0.000297713290994755, + "loss": 2.9981, + "step": 228 + }, + { + "epoch": 0.06536481155658432, + "grad_norm": 2.65625, + "learning_rate": 0.0002976895943155156, + "loss": 2.9803, + "step": 229 + }, + { + "epoch": 0.06565024741491, + "grad_norm": 3.015625, + "learning_rate": 0.00029766577643939744, + "loss": 2.9994, + "step": 230 + }, + { + "epoch": 0.0659356832732357, + "grad_norm": 1.8203125, + "learning_rate": 0.0002976418373859458, + "loss": 2.9842, + "step": 231 + }, + { + "epoch": 0.0662211191315614, + "grad_norm": 5.125, + "learning_rate": 0.00029761777717480554, + "loss": 3.0053, + "step": 232 + }, + { + "epoch": 0.0665065549898871, + "grad_norm": 3.8125, + "learning_rate": 0.00029759359582572103, + "loss": 2.9906, + "step": 233 + }, + { + "epoch": 0.06679199084821279, + "grad_norm": 4.03125, + "learning_rate": 0.00029756929335853584, + "loss": 3.0234, + "step": 234 + }, + { + "epoch": 0.06707742670653849, + "grad_norm": 3.125, + "learning_rate": 0.0002975448697931931, + "loss": 2.9871, + "step": 235 + }, + { + "epoch": 0.06736286256486418, + "grad_norm": 4.03125, + "learning_rate": 0.00029752032514973516, + "loss": 3.0048, + "step": 236 + }, + { + "epoch": 0.06764829842318988, + "grad_norm": 3.453125, + "learning_rate": 0.0002974956594483039, + "loss": 3.0141, + "step": 237 + }, + { + "epoch": 0.06793373428151557, + "grad_norm": 2.90625, + "learning_rate": 0.0002974708727091404, + "loss": 2.9658, + "step": 238 + }, + { + "epoch": 0.06821917013984127, + "grad_norm": 2.6875, + "learning_rate": 0.00029744596495258525, + "loss": 3.002, + "step": 239 + }, + { + "epoch": 0.06850460599816696, + "grad_norm": 2.625, + "learning_rate": 0.0002974209361990781, + "loss": 2.9831, + "step": 240 + }, + { + "epoch": 0.06879004185649266, + "grad_norm": 2.28125, + "learning_rate": 0.0002973957864691581, + "loss": 2.9823, + "step": 241 + }, + { + "epoch": 0.06907547771481835, + "grad_norm": 2.609375, + "learning_rate": 0.00029737051578346345, + "loss": 2.9626, + "step": 242 + }, + { + "epoch": 0.06936091357314406, + "grad_norm": 1.8203125, + "learning_rate": 0.000297345124162732, + "loss": 2.9729, + "step": 243 + }, + { + "epoch": 0.06964634943146975, + "grad_norm": 3.3125, + "learning_rate": 0.00029731961162780037, + "loss": 3.0036, + "step": 244 + }, + { + "epoch": 0.06993178528979545, + "grad_norm": 2.59375, + "learning_rate": 0.0002972939781996047, + "loss": 2.9818, + "step": 245 + }, + { + "epoch": 0.07021722114812114, + "grad_norm": 4.53125, + "learning_rate": 0.00029726822389918034, + "loss": 2.9709, + "step": 246 + }, + { + "epoch": 0.07050265700644684, + "grad_norm": 4.09375, + "learning_rate": 0.0002972423487476617, + "loss": 2.9748, + "step": 247 + }, + { + "epoch": 0.07078809286477253, + "grad_norm": 2.8125, + "learning_rate": 0.0002972163527662824, + "loss": 2.96, + "step": 248 + }, + { + "epoch": 0.07107352872309823, + "grad_norm": 3.75, + "learning_rate": 0.00029719023597637523, + "loss": 2.9929, + "step": 249 + }, + { + "epoch": 0.07135896458142392, + "grad_norm": 2.640625, + "learning_rate": 0.00029716399839937216, + "loss": 2.9467, + "step": 250 + }, + { + "epoch": 0.07135896458142392, + "eval_loss": 2.805173873901367, + "eval_runtime": 5998.7495, + "eval_samples_per_second": 10.717, + "eval_steps_per_second": 10.717, + "step": 250 + }, + { + "epoch": 0.07164440043974962, + "grad_norm": 3.8125, + "learning_rate": 0.00029713764005680427, + "loss": 2.9764, + "step": 251 + }, + { + "epoch": 0.07192983629807531, + "grad_norm": 3.625, + "learning_rate": 0.00029711116097030167, + "loss": 2.9982, + "step": 252 + }, + { + "epoch": 0.07221527215640101, + "grad_norm": 2.359375, + "learning_rate": 0.0002970845611615935, + "loss": 2.9649, + "step": 253 + }, + { + "epoch": 0.0725007080147267, + "grad_norm": 3.265625, + "learning_rate": 0.00029705784065250826, + "loss": 2.9516, + "step": 254 + }, + { + "epoch": 0.0727861438730524, + "grad_norm": 2.875, + "learning_rate": 0.00029703099946497323, + "loss": 2.9788, + "step": 255 + }, + { + "epoch": 0.0730715797313781, + "grad_norm": 2.734375, + "learning_rate": 0.0002970040376210148, + "loss": 2.9737, + "step": 256 + }, + { + "epoch": 0.0733570155897038, + "grad_norm": 2.5625, + "learning_rate": 0.00029697695514275824, + "loss": 2.9806, + "step": 257 + }, + { + "epoch": 0.07364245144802949, + "grad_norm": 2.375, + "learning_rate": 0.00029694975205242816, + "loss": 2.9629, + "step": 258 + }, + { + "epoch": 0.07392788730635519, + "grad_norm": 2.578125, + "learning_rate": 0.00029692242837234777, + "loss": 2.9698, + "step": 259 + }, + { + "epoch": 0.07421332316468088, + "grad_norm": 1.7890625, + "learning_rate": 0.0002968949841249395, + "loss": 2.9449, + "step": 260 + }, + { + "epoch": 0.07449875902300658, + "grad_norm": 3.234375, + "learning_rate": 0.00029686741933272455, + "loss": 2.9724, + "step": 261 + }, + { + "epoch": 0.07478419488133227, + "grad_norm": 2.421875, + "learning_rate": 0.0002968397340183232, + "loss": 2.9606, + "step": 262 + }, + { + "epoch": 0.07506963073965797, + "grad_norm": 4.5, + "learning_rate": 0.00029681192820445445, + "loss": 3.0101, + "step": 263 + }, + { + "epoch": 0.07535506659798366, + "grad_norm": 3.3125, + "learning_rate": 0.00029678400191393626, + "loss": 2.9797, + "step": 264 + }, + { + "epoch": 0.07564050245630936, + "grad_norm": 4.4375, + "learning_rate": 0.0002967559551696856, + "loss": 2.9859, + "step": 265 + }, + { + "epoch": 0.07592593831463505, + "grad_norm": 3.953125, + "learning_rate": 0.00029672778799471797, + "loss": 2.9839, + "step": 266 + }, + { + "epoch": 0.07621137417296076, + "grad_norm": 3.640625, + "learning_rate": 0.0002966995004121481, + "loss": 2.9812, + "step": 267 + }, + { + "epoch": 0.07649681003128644, + "grad_norm": 3.546875, + "learning_rate": 0.00029667109244518923, + "loss": 2.9904, + "step": 268 + }, + { + "epoch": 0.07678224588961215, + "grad_norm": 2.671875, + "learning_rate": 0.0002966425641171534, + "loss": 2.9614, + "step": 269 + }, + { + "epoch": 0.07706768174793784, + "grad_norm": 2.6875, + "learning_rate": 0.00029661391545145156, + "loss": 2.9671, + "step": 270 + }, + { + "epoch": 0.07735311760626354, + "grad_norm": 2.125, + "learning_rate": 0.00029658514647159335, + "loss": 2.9646, + "step": 271 + }, + { + "epoch": 0.07763855346458923, + "grad_norm": 2.640625, + "learning_rate": 0.0002965562572011872, + "loss": 2.9729, + "step": 272 + }, + { + "epoch": 0.07792398932291493, + "grad_norm": 1.78125, + "learning_rate": 0.00029652724766394007, + "loss": 2.9315, + "step": 273 + }, + { + "epoch": 0.07820942518124062, + "grad_norm": 3.015625, + "learning_rate": 0.0002964981178836578, + "loss": 2.9511, + "step": 274 + }, + { + "epoch": 0.07849486103956632, + "grad_norm": 2.171875, + "learning_rate": 0.00029646886788424487, + "loss": 2.9338, + "step": 275 + }, + { + "epoch": 0.07878029689789201, + "grad_norm": 3.296875, + "learning_rate": 0.0002964394976897043, + "loss": 2.936, + "step": 276 + }, + { + "epoch": 0.07906573275621771, + "grad_norm": 2.734375, + "learning_rate": 0.0002964100073241379, + "loss": 2.9335, + "step": 277 + }, + { + "epoch": 0.0793511686145434, + "grad_norm": 2.65625, + "learning_rate": 0.000296380396811746, + "loss": 2.9638, + "step": 278 + }, + { + "epoch": 0.0796366044728691, + "grad_norm": 2.203125, + "learning_rate": 0.00029635066617682754, + "loss": 2.9612, + "step": 279 + }, + { + "epoch": 0.0799220403311948, + "grad_norm": 2.09375, + "learning_rate": 0.00029632081544378003, + "loss": 2.9579, + "step": 280 + }, + { + "epoch": 0.0802074761895205, + "grad_norm": 1.734375, + "learning_rate": 0.00029629084463709957, + "loss": 2.9506, + "step": 281 + }, + { + "epoch": 0.08049291204784619, + "grad_norm": 1.8671875, + "learning_rate": 0.0002962607537813808, + "loss": 2.9479, + "step": 282 + }, + { + "epoch": 0.08077834790617189, + "grad_norm": 1.4609375, + "learning_rate": 0.0002962305429013168, + "loss": 2.9124, + "step": 283 + }, + { + "epoch": 0.08106378376449758, + "grad_norm": 2.03125, + "learning_rate": 0.0002962002120216992, + "loss": 2.9741, + "step": 284 + }, + { + "epoch": 0.08134921962282328, + "grad_norm": 1.625, + "learning_rate": 0.0002961697611674181, + "loss": 2.9481, + "step": 285 + }, + { + "epoch": 0.08163465548114897, + "grad_norm": 1.875, + "learning_rate": 0.00029613919036346203, + "loss": 2.9457, + "step": 286 + }, + { + "epoch": 0.08192009133947467, + "grad_norm": 1.78125, + "learning_rate": 0.00029610849963491797, + "loss": 2.9509, + "step": 287 + }, + { + "epoch": 0.08220552719780036, + "grad_norm": 2.40625, + "learning_rate": 0.0002960776890069714, + "loss": 2.9441, + "step": 288 + }, + { + "epoch": 0.08249096305612605, + "grad_norm": 2.03125, + "learning_rate": 0.0002960467585049059, + "loss": 2.9625, + "step": 289 + }, + { + "epoch": 0.08277639891445175, + "grad_norm": 1.453125, + "learning_rate": 0.0002960157081541039, + "loss": 2.9183, + "step": 290 + }, + { + "epoch": 0.08306183477277744, + "grad_norm": 2.0, + "learning_rate": 0.0002959845379800457, + "loss": 2.9312, + "step": 291 + }, + { + "epoch": 0.08334727063110314, + "grad_norm": 1.734375, + "learning_rate": 0.00029595324800831024, + "loss": 2.9224, + "step": 292 + }, + { + "epoch": 0.08363270648942883, + "grad_norm": 2.1875, + "learning_rate": 0.0002959218382645746, + "loss": 2.9394, + "step": 293 + }, + { + "epoch": 0.08391814234775453, + "grad_norm": 1.96875, + "learning_rate": 0.00029589030877461426, + "loss": 2.9493, + "step": 294 + }, + { + "epoch": 0.08420357820608022, + "grad_norm": 1.5078125, + "learning_rate": 0.00029585865956430283, + "loss": 2.9385, + "step": 295 + }, + { + "epoch": 0.08448901406440593, + "grad_norm": 2.4375, + "learning_rate": 0.00029582689065961237, + "loss": 2.9265, + "step": 296 + }, + { + "epoch": 0.08477444992273162, + "grad_norm": 1.609375, + "learning_rate": 0.00029579500208661296, + "loss": 2.9448, + "step": 297 + }, + { + "epoch": 0.08505988578105732, + "grad_norm": 1.8984375, + "learning_rate": 0.00029576299387147305, + "loss": 2.9555, + "step": 298 + }, + { + "epoch": 0.085345321639383, + "grad_norm": 1.6953125, + "learning_rate": 0.00029573086604045904, + "loss": 2.904, + "step": 299 + }, + { + "epoch": 0.08563075749770871, + "grad_norm": 2.046875, + "learning_rate": 0.0002956986186199358, + "loss": 2.959, + "step": 300 + }, + { + "epoch": 0.0859161933560344, + "grad_norm": 1.171875, + "learning_rate": 0.0002956662516363661, + "loss": 2.9075, + "step": 301 + }, + { + "epoch": 0.0862016292143601, + "grad_norm": 2.40625, + "learning_rate": 0.0002956337651163109, + "loss": 2.9521, + "step": 302 + }, + { + "epoch": 0.08648706507268579, + "grad_norm": 1.703125, + "learning_rate": 0.00029560115908642924, + "loss": 2.9425, + "step": 303 + }, + { + "epoch": 0.08677250093101149, + "grad_norm": 2.59375, + "learning_rate": 0.0002955684335734783, + "loss": 2.9626, + "step": 304 + }, + { + "epoch": 0.08705793678933718, + "grad_norm": 1.8515625, + "learning_rate": 0.00029553558860431317, + "loss": 2.9293, + "step": 305 + }, + { + "epoch": 0.08734337264766288, + "grad_norm": 2.515625, + "learning_rate": 0.0002955026242058872, + "loss": 2.9332, + "step": 306 + }, + { + "epoch": 0.08762880850598857, + "grad_norm": 2.03125, + "learning_rate": 0.0002954695404052514, + "loss": 2.9323, + "step": 307 + }, + { + "epoch": 0.08791424436431428, + "grad_norm": 2.265625, + "learning_rate": 0.0002954363372295551, + "loss": 2.9408, + "step": 308 + }, + { + "epoch": 0.08819968022263996, + "grad_norm": 1.859375, + "learning_rate": 0.0002954030147060454, + "loss": 2.9305, + "step": 309 + }, + { + "epoch": 0.08848511608096567, + "grad_norm": 2.1875, + "learning_rate": 0.0002953695728620675, + "loss": 2.9323, + "step": 310 + }, + { + "epoch": 0.08877055193929136, + "grad_norm": 1.7890625, + "learning_rate": 0.00029533601172506427, + "loss": 2.9138, + "step": 311 + }, + { + "epoch": 0.08905598779761706, + "grad_norm": 2.40625, + "learning_rate": 0.00029530233132257663, + "loss": 2.9394, + "step": 312 + }, + { + "epoch": 0.08934142365594275, + "grad_norm": 1.7578125, + "learning_rate": 0.00029526853168224343, + "loss": 2.8984, + "step": 313 + }, + { + "epoch": 0.08962685951426845, + "grad_norm": 2.359375, + "learning_rate": 0.0002952346128318013, + "loss": 2.9322, + "step": 314 + }, + { + "epoch": 0.08991229537259414, + "grad_norm": 1.9375, + "learning_rate": 0.00029520057479908465, + "loss": 2.9164, + "step": 315 + }, + { + "epoch": 0.09019773123091984, + "grad_norm": 2.234375, + "learning_rate": 0.0002951664176120257, + "loss": 2.9167, + "step": 316 + }, + { + "epoch": 0.09048316708924553, + "grad_norm": 1.9375, + "learning_rate": 0.00029513214129865456, + "loss": 2.9398, + "step": 317 + }, + { + "epoch": 0.09076860294757123, + "grad_norm": 2.21875, + "learning_rate": 0.00029509774588709896, + "loss": 2.9395, + "step": 318 + }, + { + "epoch": 0.09105403880589692, + "grad_norm": 1.828125, + "learning_rate": 0.00029506323140558445, + "loss": 2.9478, + "step": 319 + }, + { + "epoch": 0.09133947466422263, + "grad_norm": 1.9140625, + "learning_rate": 0.0002950285978824343, + "loss": 2.9216, + "step": 320 + }, + { + "epoch": 0.09162491052254831, + "grad_norm": 1.7109375, + "learning_rate": 0.00029499384534606936, + "loss": 2.8959, + "step": 321 + }, + { + "epoch": 0.09191034638087402, + "grad_norm": 1.75, + "learning_rate": 0.00029495897382500827, + "loss": 2.9072, + "step": 322 + }, + { + "epoch": 0.0921957822391997, + "grad_norm": 1.5234375, + "learning_rate": 0.00029492398334786727, + "loss": 2.9121, + "step": 323 + }, + { + "epoch": 0.09248121809752541, + "grad_norm": 2.09375, + "learning_rate": 0.0002948888739433602, + "loss": 2.9344, + "step": 324 + }, + { + "epoch": 0.0927666539558511, + "grad_norm": 1.765625, + "learning_rate": 0.0002948536456402985, + "loss": 2.9211, + "step": 325 + }, + { + "epoch": 0.0930520898141768, + "grad_norm": 1.9296875, + "learning_rate": 0.00029481829846759116, + "loss": 2.9041, + "step": 326 + }, + { + "epoch": 0.09333752567250249, + "grad_norm": 2.265625, + "learning_rate": 0.0002947828324542448, + "loss": 2.9353, + "step": 327 + }, + { + "epoch": 0.09362296153082819, + "grad_norm": 1.1328125, + "learning_rate": 0.0002947472476293634, + "loss": 2.9037, + "step": 328 + }, + { + "epoch": 0.09390839738915388, + "grad_norm": 1.8359375, + "learning_rate": 0.00029471154402214864, + "loss": 2.9166, + "step": 329 + }, + { + "epoch": 0.09419383324747958, + "grad_norm": 2.078125, + "learning_rate": 0.00029467572166189956, + "loss": 2.9074, + "step": 330 + }, + { + "epoch": 0.09447926910580527, + "grad_norm": 2.015625, + "learning_rate": 0.00029463978057801257, + "loss": 2.9137, + "step": 331 + }, + { + "epoch": 0.09476470496413097, + "grad_norm": 1.7734375, + "learning_rate": 0.00029460372079998177, + "loss": 2.8971, + "step": 332 + }, + { + "epoch": 0.09505014082245666, + "grad_norm": 1.296875, + "learning_rate": 0.00029456754235739833, + "loss": 2.8784, + "step": 333 + }, + { + "epoch": 0.09533557668078237, + "grad_norm": 2.203125, + "learning_rate": 0.0002945312452799511, + "loss": 2.9102, + "step": 334 + }, + { + "epoch": 0.09562101253910806, + "grad_norm": 1.34375, + "learning_rate": 0.00029449482959742604, + "loss": 2.9096, + "step": 335 + }, + { + "epoch": 0.09590644839743376, + "grad_norm": 1.3671875, + "learning_rate": 0.0002944582953397067, + "loss": 2.8925, + "step": 336 + }, + { + "epoch": 0.09619188425575945, + "grad_norm": 1.9375, + "learning_rate": 0.0002944216425367736, + "loss": 2.9094, + "step": 337 + }, + { + "epoch": 0.09647732011408515, + "grad_norm": 1.9140625, + "learning_rate": 0.0002943848712187048, + "loss": 2.9133, + "step": 338 + }, + { + "epoch": 0.09676275597241084, + "grad_norm": 1.90625, + "learning_rate": 0.0002943479814156756, + "loss": 2.9073, + "step": 339 + }, + { + "epoch": 0.09704819183073654, + "grad_norm": 1.2734375, + "learning_rate": 0.00029431097315795834, + "loss": 2.8993, + "step": 340 + }, + { + "epoch": 0.09733362768906223, + "grad_norm": 2.296875, + "learning_rate": 0.00029427384647592284, + "loss": 2.8968, + "step": 341 + }, + { + "epoch": 0.09761906354738793, + "grad_norm": 1.4609375, + "learning_rate": 0.0002942366014000359, + "loss": 2.9124, + "step": 342 + }, + { + "epoch": 0.09790449940571362, + "grad_norm": 2.484375, + "learning_rate": 0.0002941992379608615, + "loss": 2.8816, + "step": 343 + }, + { + "epoch": 0.09818993526403932, + "grad_norm": 1.609375, + "learning_rate": 0.00029416175618906084, + "loss": 2.9015, + "step": 344 + }, + { + "epoch": 0.09847537112236501, + "grad_norm": 2.6875, + "learning_rate": 0.00029412415611539214, + "loss": 2.9286, + "step": 345 + }, + { + "epoch": 0.09876080698069072, + "grad_norm": 2.140625, + "learning_rate": 0.00029408643777071073, + "loss": 2.9316, + "step": 346 + }, + { + "epoch": 0.0990462428390164, + "grad_norm": 2.28125, + "learning_rate": 0.00029404860118596905, + "loss": 2.894, + "step": 347 + }, + { + "epoch": 0.09933167869734211, + "grad_norm": 2.09375, + "learning_rate": 0.00029401064639221643, + "loss": 2.8946, + "step": 348 + }, + { + "epoch": 0.0996171145556678, + "grad_norm": 2.21875, + "learning_rate": 0.0002939725734205994, + "loss": 2.9068, + "step": 349 + }, + { + "epoch": 0.0999025504139935, + "grad_norm": 1.84375, + "learning_rate": 0.00029393438230236124, + "loss": 2.8898, + "step": 350 + }, + { + "epoch": 0.10018798627231919, + "grad_norm": 1.7578125, + "learning_rate": 0.0002938960730688424, + "loss": 2.8922, + "step": 351 + }, + { + "epoch": 0.10047342213064489, + "grad_norm": 1.640625, + "learning_rate": 0.00029385764575148014, + "loss": 2.8772, + "step": 352 + }, + { + "epoch": 0.10075885798897058, + "grad_norm": 1.7890625, + "learning_rate": 0.00029381910038180856, + "loss": 2.8961, + "step": 353 + }, + { + "epoch": 0.10104429384729628, + "grad_norm": 1.890625, + "learning_rate": 0.00029378043699145886, + "loss": 2.9052, + "step": 354 + }, + { + "epoch": 0.10132972970562197, + "grad_norm": 1.9375, + "learning_rate": 0.0002937416556121589, + "loss": 2.8703, + "step": 355 + }, + { + "epoch": 0.10161516556394767, + "grad_norm": 1.5625, + "learning_rate": 0.0002937027562757334, + "loss": 2.8967, + "step": 356 + }, + { + "epoch": 0.10190060142227336, + "grad_norm": 1.6640625, + "learning_rate": 0.00029366373901410387, + "loss": 2.913, + "step": 357 + }, + { + "epoch": 0.10218603728059907, + "grad_norm": 1.53125, + "learning_rate": 0.0002936246038592886, + "loss": 2.8944, + "step": 358 + }, + { + "epoch": 0.10247147313892475, + "grad_norm": 2.390625, + "learning_rate": 0.00029358535084340274, + "loss": 2.8808, + "step": 359 + }, + { + "epoch": 0.10275690899725046, + "grad_norm": 1.421875, + "learning_rate": 0.000293545979998658, + "loss": 2.9055, + "step": 360 + }, + { + "epoch": 0.10304234485557615, + "grad_norm": 1.71875, + "learning_rate": 0.0002935064913573628, + "loss": 2.8925, + "step": 361 + }, + { + "epoch": 0.10332778071390183, + "grad_norm": 2.0625, + "learning_rate": 0.0002934668849519223, + "loss": 2.8751, + "step": 362 + }, + { + "epoch": 0.10361321657222754, + "grad_norm": 2.015625, + "learning_rate": 0.00029342716081483825, + "loss": 2.8836, + "step": 363 + }, + { + "epoch": 0.10389865243055323, + "grad_norm": 1.4375, + "learning_rate": 0.0002933873189787091, + "loss": 2.8702, + "step": 364 + }, + { + "epoch": 0.10418408828887893, + "grad_norm": 2.375, + "learning_rate": 0.0002933473594762297, + "loss": 2.8953, + "step": 365 + }, + { + "epoch": 0.10446952414720462, + "grad_norm": 1.3671875, + "learning_rate": 0.00029330728234019173, + "loss": 2.8753, + "step": 366 + }, + { + "epoch": 0.10475496000553032, + "grad_norm": 2.90625, + "learning_rate": 0.0002932670876034831, + "loss": 2.8844, + "step": 367 + }, + { + "epoch": 0.10504039586385601, + "grad_norm": 2.109375, + "learning_rate": 0.00029322677529908844, + "loss": 2.9018, + "step": 368 + }, + { + "epoch": 0.10532583172218171, + "grad_norm": 2.328125, + "learning_rate": 0.0002931863454600888, + "loss": 2.8967, + "step": 369 + }, + { + "epoch": 0.1056112675805074, + "grad_norm": 2.0625, + "learning_rate": 0.0002931457981196616, + "loss": 2.882, + "step": 370 + }, + { + "epoch": 0.1058967034388331, + "grad_norm": 2.1875, + "learning_rate": 0.00029310513331108086, + "loss": 2.8641, + "step": 371 + }, + { + "epoch": 0.10618213929715879, + "grad_norm": 1.78125, + "learning_rate": 0.0002930643510677168, + "loss": 2.8808, + "step": 372 + }, + { + "epoch": 0.1064675751554845, + "grad_norm": 1.703125, + "learning_rate": 0.00029302345142303616, + "loss": 2.8699, + "step": 373 + }, + { + "epoch": 0.10675301101381018, + "grad_norm": 1.8984375, + "learning_rate": 0.0002929824344106019, + "loss": 2.8467, + "step": 374 + }, + { + "epoch": 0.10703844687213589, + "grad_norm": 1.546875, + "learning_rate": 0.0002929413000640735, + "loss": 2.8674, + "step": 375 + }, + { + "epoch": 0.10732388273046158, + "grad_norm": 2.265625, + "learning_rate": 0.0002929000484172064, + "loss": 2.8897, + "step": 376 + }, + { + "epoch": 0.10760931858878728, + "grad_norm": 1.328125, + "learning_rate": 0.00029285867950385255, + "loss": 2.8601, + "step": 377 + }, + { + "epoch": 0.10789475444711297, + "grad_norm": 2.65625, + "learning_rate": 0.00029281719335796013, + "loss": 2.89, + "step": 378 + }, + { + "epoch": 0.10818019030543867, + "grad_norm": 2.046875, + "learning_rate": 0.00029277559001357343, + "loss": 2.9044, + "step": 379 + }, + { + "epoch": 0.10846562616376436, + "grad_norm": 2.109375, + "learning_rate": 0.00029273386950483287, + "loss": 2.8765, + "step": 380 + }, + { + "epoch": 0.10875106202209006, + "grad_norm": 1.7265625, + "learning_rate": 0.00029269203186597513, + "loss": 2.8911, + "step": 381 + }, + { + "epoch": 0.10903649788041575, + "grad_norm": 2.296875, + "learning_rate": 0.00029265007713133304, + "loss": 2.8756, + "step": 382 + }, + { + "epoch": 0.10932193373874145, + "grad_norm": 1.5703125, + "learning_rate": 0.00029260800533533534, + "loss": 2.889, + "step": 383 + }, + { + "epoch": 0.10960736959706714, + "grad_norm": 2.609375, + "learning_rate": 0.000292565816512507, + "loss": 2.8758, + "step": 384 + }, + { + "epoch": 0.10989280545539284, + "grad_norm": 2.125, + "learning_rate": 0.000292523510697469, + "loss": 2.8699, + "step": 385 + }, + { + "epoch": 0.11017824131371853, + "grad_norm": 2.515625, + "learning_rate": 0.0002924810879249382, + "loss": 2.8935, + "step": 386 + }, + { + "epoch": 0.11046367717204424, + "grad_norm": 2.015625, + "learning_rate": 0.00029243854822972763, + "loss": 2.8723, + "step": 387 + }, + { + "epoch": 0.11074911303036993, + "grad_norm": 2.21875, + "learning_rate": 0.0002923958916467461, + "loss": 2.894, + "step": 388 + }, + { + "epoch": 0.11103454888869563, + "grad_norm": 2.0625, + "learning_rate": 0.00029235311821099847, + "loss": 2.8676, + "step": 389 + }, + { + "epoch": 0.11131998474702132, + "grad_norm": 1.75, + "learning_rate": 0.00029231022795758537, + "loss": 2.8786, + "step": 390 + }, + { + "epoch": 0.11160542060534702, + "grad_norm": 1.671875, + "learning_rate": 0.0002922672209217033, + "loss": 2.867, + "step": 391 + }, + { + "epoch": 0.11189085646367271, + "grad_norm": 1.8359375, + "learning_rate": 0.00029222409713864484, + "loss": 2.8938, + "step": 392 + }, + { + "epoch": 0.11217629232199841, + "grad_norm": 1.6640625, + "learning_rate": 0.00029218085664379806, + "loss": 2.8601, + "step": 393 + }, + { + "epoch": 0.1124617281803241, + "grad_norm": 1.5078125, + "learning_rate": 0.0002921374994726469, + "loss": 2.8817, + "step": 394 + }, + { + "epoch": 0.1127471640386498, + "grad_norm": 1.6015625, + "learning_rate": 0.0002920940256607711, + "loss": 2.8482, + "step": 395 + }, + { + "epoch": 0.11303259989697549, + "grad_norm": 1.859375, + "learning_rate": 0.0002920504352438462, + "loss": 2.8996, + "step": 396 + }, + { + "epoch": 0.1133180357553012, + "grad_norm": 2.015625, + "learning_rate": 0.00029200672825764314, + "loss": 2.8592, + "step": 397 + }, + { + "epoch": 0.11360347161362688, + "grad_norm": 1.46875, + "learning_rate": 0.00029196290473802885, + "loss": 2.8327, + "step": 398 + }, + { + "epoch": 0.11388890747195259, + "grad_norm": 1.515625, + "learning_rate": 0.0002919189647209656, + "loss": 2.8438, + "step": 399 + }, + { + "epoch": 0.11417434333027827, + "grad_norm": 1.3359375, + "learning_rate": 0.00029187490824251154, + "loss": 2.884, + "step": 400 + }, + { + "epoch": 0.11445977918860398, + "grad_norm": 1.828125, + "learning_rate": 0.00029183073533882025, + "loss": 2.8601, + "step": 401 + }, + { + "epoch": 0.11474521504692967, + "grad_norm": 1.96875, + "learning_rate": 0.00029178644604614077, + "loss": 2.8788, + "step": 402 + }, + { + "epoch": 0.11503065090525537, + "grad_norm": 1.7890625, + "learning_rate": 0.00029174204040081773, + "loss": 2.8823, + "step": 403 + }, + { + "epoch": 0.11531608676358106, + "grad_norm": 1.7265625, + "learning_rate": 0.0002916975184392912, + "loss": 2.8464, + "step": 404 + }, + { + "epoch": 0.11560152262190676, + "grad_norm": 1.0625, + "learning_rate": 0.0002916528801980969, + "loss": 2.8377, + "step": 405 + }, + { + "epoch": 0.11588695848023245, + "grad_norm": 2.109375, + "learning_rate": 0.00029160812571386575, + "loss": 2.8409, + "step": 406 + }, + { + "epoch": 0.11617239433855815, + "grad_norm": 1.4609375, + "learning_rate": 0.00029156325502332413, + "loss": 2.8581, + "step": 407 + }, + { + "epoch": 0.11645783019688384, + "grad_norm": 1.9296875, + "learning_rate": 0.00029151826816329365, + "loss": 2.865, + "step": 408 + }, + { + "epoch": 0.11674326605520954, + "grad_norm": 1.71875, + "learning_rate": 0.00029147316517069157, + "loss": 2.8527, + "step": 409 + }, + { + "epoch": 0.11702870191353523, + "grad_norm": 1.4375, + "learning_rate": 0.00029142794608253016, + "loss": 2.8494, + "step": 410 + }, + { + "epoch": 0.11731413777186094, + "grad_norm": 3.875, + "learning_rate": 0.0002913826109359171, + "loss": 2.8461, + "step": 411 + }, + { + "epoch": 0.11759957363018662, + "grad_norm": 1.875, + "learning_rate": 0.00029133715976805525, + "loss": 2.8565, + "step": 412 + }, + { + "epoch": 0.11788500948851233, + "grad_norm": 3.125, + "learning_rate": 0.0002912915926162427, + "loss": 2.8667, + "step": 413 + }, + { + "epoch": 0.11817044534683802, + "grad_norm": 2.0625, + "learning_rate": 0.00029124590951787267, + "loss": 2.8504, + "step": 414 + }, + { + "epoch": 0.11845588120516372, + "grad_norm": 3.46875, + "learning_rate": 0.0002912001105104337, + "loss": 2.8719, + "step": 415 + }, + { + "epoch": 0.11874131706348941, + "grad_norm": 2.171875, + "learning_rate": 0.00029115419563150916, + "loss": 2.8702, + "step": 416 + }, + { + "epoch": 0.11902675292181511, + "grad_norm": 4.71875, + "learning_rate": 0.0002911081649187778, + "loss": 2.8971, + "step": 417 + }, + { + "epoch": 0.1193121887801408, + "grad_norm": 3.828125, + "learning_rate": 0.0002910620184100133, + "loss": 2.9119, + "step": 418 + }, + { + "epoch": 0.1195976246384665, + "grad_norm": 4.25, + "learning_rate": 0.0002910157561430842, + "loss": 2.8927, + "step": 419 + }, + { + "epoch": 0.11988306049679219, + "grad_norm": 3.65625, + "learning_rate": 0.0002909693781559544, + "loss": 2.861, + "step": 420 + }, + { + "epoch": 0.1201684963551179, + "grad_norm": 3.59375, + "learning_rate": 0.0002909228844866824, + "loss": 2.8826, + "step": 421 + }, + { + "epoch": 0.12045393221344358, + "grad_norm": 3.265625, + "learning_rate": 0.0002908762751734219, + "loss": 2.8495, + "step": 422 + }, + { + "epoch": 0.12073936807176928, + "grad_norm": 3.484375, + "learning_rate": 0.0002908295502544213, + "loss": 2.8707, + "step": 423 + }, + { + "epoch": 0.12102480393009497, + "grad_norm": 2.828125, + "learning_rate": 0.00029078270976802393, + "loss": 2.8647, + "step": 424 + }, + { + "epoch": 0.12131023978842068, + "grad_norm": 3.515625, + "learning_rate": 0.00029073575375266806, + "loss": 2.8505, + "step": 425 + }, + { + "epoch": 0.12159567564674637, + "grad_norm": 2.5, + "learning_rate": 0.0002906886822468867, + "loss": 2.8821, + "step": 426 + }, + { + "epoch": 0.12188111150507207, + "grad_norm": 5.4375, + "learning_rate": 0.0002906414952893075, + "loss": 2.8788, + "step": 427 + }, + { + "epoch": 0.12216654736339776, + "grad_norm": 4.40625, + "learning_rate": 0.00029059419291865314, + "loss": 2.8715, + "step": 428 + }, + { + "epoch": 0.12245198322172346, + "grad_norm": 3.6875, + "learning_rate": 0.0002905467751737407, + "loss": 2.846, + "step": 429 + }, + { + "epoch": 0.12273741908004915, + "grad_norm": 3.765625, + "learning_rate": 0.00029049924209348214, + "loss": 2.856, + "step": 430 + }, + { + "epoch": 0.12302285493837485, + "grad_norm": 2.90625, + "learning_rate": 0.000290451593716884, + "loss": 2.8646, + "step": 431 + }, + { + "epoch": 0.12330829079670054, + "grad_norm": 2.890625, + "learning_rate": 0.00029040383008304744, + "loss": 2.8408, + "step": 432 + }, + { + "epoch": 0.12359372665502623, + "grad_norm": 2.984375, + "learning_rate": 0.00029035595123116817, + "loss": 2.8501, + "step": 433 + }, + { + "epoch": 0.12387916251335193, + "grad_norm": 2.53125, + "learning_rate": 0.0002903079572005365, + "loss": 2.8384, + "step": 434 + }, + { + "epoch": 0.12416459837167762, + "grad_norm": 3.40625, + "learning_rate": 0.00029025984803053735, + "loss": 2.8436, + "step": 435 + }, + { + "epoch": 0.12445003423000332, + "grad_norm": 2.984375, + "learning_rate": 0.0002902116237606498, + "loss": 2.8543, + "step": 436 + }, + { + "epoch": 0.12473547008832901, + "grad_norm": 3.65625, + "learning_rate": 0.0002901632844304478, + "loss": 2.8469, + "step": 437 + }, + { + "epoch": 0.12502090594665471, + "grad_norm": 3.40625, + "learning_rate": 0.0002901148300795994, + "loss": 2.8636, + "step": 438 + }, + { + "epoch": 0.1253063418049804, + "grad_norm": 3.546875, + "learning_rate": 0.0002900662607478672, + "loss": 2.8424, + "step": 439 + }, + { + "epoch": 0.1255917776633061, + "grad_norm": 3.265625, + "learning_rate": 0.00029001757647510815, + "loss": 2.8493, + "step": 440 + }, + { + "epoch": 0.1258772135216318, + "grad_norm": 2.890625, + "learning_rate": 0.0002899687773012734, + "loss": 2.8214, + "step": 441 + }, + { + "epoch": 0.1261626493799575, + "grad_norm": 2.75, + "learning_rate": 0.0002899198632664086, + "loss": 2.8492, + "step": 442 + }, + { + "epoch": 0.1264480852382832, + "grad_norm": 3.6875, + "learning_rate": 0.0002898708344106533, + "loss": 2.8111, + "step": 443 + }, + { + "epoch": 0.12673352109660888, + "grad_norm": 3.546875, + "learning_rate": 0.0002898216907742418, + "loss": 2.8513, + "step": 444 + }, + { + "epoch": 0.1270189569549346, + "grad_norm": 2.671875, + "learning_rate": 0.0002897724323975021, + "loss": 2.8602, + "step": 445 + }, + { + "epoch": 0.12730439281326028, + "grad_norm": 2.578125, + "learning_rate": 0.0002897230593208567, + "loss": 2.8462, + "step": 446 + }, + { + "epoch": 0.12758982867158597, + "grad_norm": 2.828125, + "learning_rate": 0.00028967357158482196, + "loss": 2.8422, + "step": 447 + }, + { + "epoch": 0.12787526452991166, + "grad_norm": 2.375, + "learning_rate": 0.00028962396923000846, + "loss": 2.8382, + "step": 448 + }, + { + "epoch": 0.12816070038823738, + "grad_norm": 3.6875, + "learning_rate": 0.0002895742522971209, + "loss": 2.8544, + "step": 449 + }, + { + "epoch": 0.12844613624656306, + "grad_norm": 3.640625, + "learning_rate": 0.0002895244208269579, + "loss": 2.8542, + "step": 450 + }, + { + "epoch": 0.12873157210488875, + "grad_norm": 2.40625, + "learning_rate": 0.0002894744748604121, + "loss": 2.8417, + "step": 451 + }, + { + "epoch": 0.12901700796321444, + "grad_norm": 2.375, + "learning_rate": 0.0002894244144384701, + "loss": 2.8588, + "step": 452 + }, + { + "epoch": 0.12930244382154016, + "grad_norm": 2.765625, + "learning_rate": 0.0002893742396022125, + "loss": 2.8388, + "step": 453 + }, + { + "epoch": 0.12958787967986585, + "grad_norm": 2.359375, + "learning_rate": 0.0002893239503928137, + "loss": 2.8559, + "step": 454 + }, + { + "epoch": 0.12987331553819154, + "grad_norm": 3.96875, + "learning_rate": 0.00028927354685154185, + "loss": 2.8341, + "step": 455 + }, + { + "epoch": 0.13015875139651722, + "grad_norm": 3.765625, + "learning_rate": 0.0002892230290197592, + "loss": 2.8267, + "step": 456 + }, + { + "epoch": 0.13044418725484294, + "grad_norm": 2.078125, + "learning_rate": 0.0002891723969389216, + "loss": 2.8497, + "step": 457 + }, + { + "epoch": 0.13072962311316863, + "grad_norm": 2.0, + "learning_rate": 0.0002891216506505787, + "loss": 2.8252, + "step": 458 + }, + { + "epoch": 0.13101505897149432, + "grad_norm": 3.5625, + "learning_rate": 0.0002890707901963738, + "loss": 2.8563, + "step": 459 + }, + { + "epoch": 0.13130049482982, + "grad_norm": 3.171875, + "learning_rate": 0.00028901981561804403, + "loss": 2.861, + "step": 460 + }, + { + "epoch": 0.13158593068814572, + "grad_norm": 2.734375, + "learning_rate": 0.0002889687269574201, + "loss": 2.8336, + "step": 461 + }, + { + "epoch": 0.1318713665464714, + "grad_norm": 2.78125, + "learning_rate": 0.0002889175242564263, + "loss": 2.8575, + "step": 462 + }, + { + "epoch": 0.1321568024047971, + "grad_norm": 2.265625, + "learning_rate": 0.00028886620755708045, + "loss": 2.8301, + "step": 463 + }, + { + "epoch": 0.1324422382631228, + "grad_norm": 2.015625, + "learning_rate": 0.0002888147769014942, + "loss": 2.8299, + "step": 464 + }, + { + "epoch": 0.1327276741214485, + "grad_norm": 3.1875, + "learning_rate": 0.0002887632323318723, + "loss": 2.8261, + "step": 465 + }, + { + "epoch": 0.1330131099797742, + "grad_norm": 2.84375, + "learning_rate": 0.0002887115738905134, + "loss": 2.8398, + "step": 466 + }, + { + "epoch": 0.13329854583809989, + "grad_norm": 2.828125, + "learning_rate": 0.0002886598016198093, + "loss": 2.8414, + "step": 467 + }, + { + "epoch": 0.13358398169642557, + "grad_norm": 2.640625, + "learning_rate": 0.00028860791556224524, + "loss": 2.8286, + "step": 468 + }, + { + "epoch": 0.1338694175547513, + "grad_norm": 2.8125, + "learning_rate": 0.00028855591576040004, + "loss": 2.8641, + "step": 469 + }, + { + "epoch": 0.13415485341307698, + "grad_norm": 2.65625, + "learning_rate": 0.0002885038022569457, + "loss": 2.8478, + "step": 470 + }, + { + "epoch": 0.13444028927140267, + "grad_norm": 2.703125, + "learning_rate": 0.0002884515750946474, + "loss": 2.8215, + "step": 471 + }, + { + "epoch": 0.13472572512972836, + "grad_norm": 2.515625, + "learning_rate": 0.0002883992343163639, + "loss": 2.8004, + "step": 472 + }, + { + "epoch": 0.13501116098805407, + "grad_norm": 2.75, + "learning_rate": 0.00028834677996504696, + "loss": 2.8395, + "step": 473 + }, + { + "epoch": 0.13529659684637976, + "grad_norm": 2.625, + "learning_rate": 0.00028829421208374166, + "loss": 2.8313, + "step": 474 + }, + { + "epoch": 0.13558203270470545, + "grad_norm": 2.640625, + "learning_rate": 0.0002882415307155862, + "loss": 2.841, + "step": 475 + }, + { + "epoch": 0.13586746856303114, + "grad_norm": 2.4375, + "learning_rate": 0.00028818873590381183, + "loss": 2.8614, + "step": 476 + }, + { + "epoch": 0.13615290442135686, + "grad_norm": 2.71875, + "learning_rate": 0.000288135827691743, + "loss": 2.8482, + "step": 477 + }, + { + "epoch": 0.13643834027968255, + "grad_norm": 2.5, + "learning_rate": 0.0002880828061227973, + "loss": 2.8532, + "step": 478 + }, + { + "epoch": 0.13672377613800824, + "grad_norm": 2.75, + "learning_rate": 0.0002880296712404851, + "loss": 2.8337, + "step": 479 + }, + { + "epoch": 0.13700921199633392, + "grad_norm": 2.578125, + "learning_rate": 0.0002879764230884099, + "loss": 2.8183, + "step": 480 + }, + { + "epoch": 0.13729464785465964, + "grad_norm": 2.515625, + "learning_rate": 0.00028792306171026823, + "loss": 2.8161, + "step": 481 + }, + { + "epoch": 0.13758008371298533, + "grad_norm": 2.390625, + "learning_rate": 0.00028786958714984936, + "loss": 2.8174, + "step": 482 + }, + { + "epoch": 0.13786551957131102, + "grad_norm": 2.609375, + "learning_rate": 0.0002878159994510356, + "loss": 2.8075, + "step": 483 + }, + { + "epoch": 0.1381509554296367, + "grad_norm": 2.5, + "learning_rate": 0.00028776229865780205, + "loss": 2.8157, + "step": 484 + }, + { + "epoch": 0.13843639128796242, + "grad_norm": 2.453125, + "learning_rate": 0.0002877084848142165, + "loss": 2.8291, + "step": 485 + }, + { + "epoch": 0.1387218271462881, + "grad_norm": 2.375, + "learning_rate": 0.0002876545579644396, + "loss": 2.8247, + "step": 486 + }, + { + "epoch": 0.1390072630046138, + "grad_norm": 2.671875, + "learning_rate": 0.0002876005181527249, + "loss": 2.8366, + "step": 487 + }, + { + "epoch": 0.1392926988629395, + "grad_norm": 2.546875, + "learning_rate": 0.0002875463654234183, + "loss": 2.8679, + "step": 488 + }, + { + "epoch": 0.1395781347212652, + "grad_norm": 2.5625, + "learning_rate": 0.0002874920998209587, + "loss": 2.8432, + "step": 489 + }, + { + "epoch": 0.1398635705795909, + "grad_norm": 2.40625, + "learning_rate": 0.00028743772138987745, + "loss": 2.8366, + "step": 490 + }, + { + "epoch": 0.14014900643791658, + "grad_norm": 2.546875, + "learning_rate": 0.0002873832301747985, + "loss": 2.8279, + "step": 491 + }, + { + "epoch": 0.14043444229624227, + "grad_norm": 2.28125, + "learning_rate": 0.00028732862622043835, + "loss": 2.7933, + "step": 492 + }, + { + "epoch": 0.140719878154568, + "grad_norm": 2.671875, + "learning_rate": 0.000287273909571606, + "loss": 2.8563, + "step": 493 + }, + { + "epoch": 0.14100531401289368, + "grad_norm": 2.546875, + "learning_rate": 0.00028721908027320314, + "loss": 2.858, + "step": 494 + }, + { + "epoch": 0.14129074987121937, + "grad_norm": 2.390625, + "learning_rate": 0.00028716413837022355, + "loss": 2.7946, + "step": 495 + }, + { + "epoch": 0.14157618572954506, + "grad_norm": 2.15625, + "learning_rate": 0.0002871090839077537, + "loss": 2.7874, + "step": 496 + }, + { + "epoch": 0.14186162158787077, + "grad_norm": 2.546875, + "learning_rate": 0.0002870539169309723, + "loss": 2.8255, + "step": 497 + }, + { + "epoch": 0.14214705744619646, + "grad_norm": 2.453125, + "learning_rate": 0.0002869986374851504, + "loss": 2.8218, + "step": 498 + }, + { + "epoch": 0.14243249330452215, + "grad_norm": 2.46875, + "learning_rate": 0.00028694324561565136, + "loss": 2.8197, + "step": 499 + }, + { + "epoch": 0.14271792916284784, + "grad_norm": 2.296875, + "learning_rate": 0.00028688774136793085, + "loss": 2.8208, + "step": 500 + }, + { + "epoch": 0.14271792916284784, + "eval_loss": 2.6708414554595947, + "eval_runtime": 6008.9725, + "eval_samples_per_second": 10.698, + "eval_steps_per_second": 10.698, + "step": 500 + }, + { + "epoch": 0.14300336502117356, + "grad_norm": 2.34375, + "learning_rate": 0.00028683212478753663, + "loss": 2.8263, + "step": 501 + }, + { + "epoch": 0.14328880087949925, + "grad_norm": 2.046875, + "learning_rate": 0.00028677639592010874, + "loss": 2.8395, + "step": 502 + }, + { + "epoch": 0.14357423673782493, + "grad_norm": 2.828125, + "learning_rate": 0.00028672055481137937, + "loss": 2.815, + "step": 503 + }, + { + "epoch": 0.14385967259615062, + "grad_norm": 2.53125, + "learning_rate": 0.0002866646015071728, + "loss": 2.8157, + "step": 504 + }, + { + "epoch": 0.1441451084544763, + "grad_norm": 2.421875, + "learning_rate": 0.0002866085360534053, + "loss": 2.8449, + "step": 505 + }, + { + "epoch": 0.14443054431280203, + "grad_norm": 2.203125, + "learning_rate": 0.00028655235849608533, + "loss": 2.7893, + "step": 506 + }, + { + "epoch": 0.14471598017112772, + "grad_norm": 2.359375, + "learning_rate": 0.00028649606888131327, + "loss": 2.8099, + "step": 507 + }, + { + "epoch": 0.1450014160294534, + "grad_norm": 1.9453125, + "learning_rate": 0.00028643966725528134, + "loss": 2.8032, + "step": 508 + }, + { + "epoch": 0.1452868518877791, + "grad_norm": 2.921875, + "learning_rate": 0.0002863831536642739, + "loss": 2.8453, + "step": 509 + }, + { + "epoch": 0.1455722877461048, + "grad_norm": 2.59375, + "learning_rate": 0.0002863265281546669, + "loss": 2.7995, + "step": 510 + }, + { + "epoch": 0.1458577236044305, + "grad_norm": 2.1875, + "learning_rate": 0.0002862697907729285, + "loss": 2.8297, + "step": 511 + }, + { + "epoch": 0.1461431594627562, + "grad_norm": 2.015625, + "learning_rate": 0.00028621294156561843, + "loss": 2.7948, + "step": 512 + }, + { + "epoch": 0.14642859532108188, + "grad_norm": 2.5, + "learning_rate": 0.0002861559805793881, + "loss": 2.8182, + "step": 513 + }, + { + "epoch": 0.1467140311794076, + "grad_norm": 2.203125, + "learning_rate": 0.0002860989078609809, + "loss": 2.8126, + "step": 514 + }, + { + "epoch": 0.14699946703773328, + "grad_norm": 2.5625, + "learning_rate": 0.00028604172345723174, + "loss": 2.8018, + "step": 515 + }, + { + "epoch": 0.14728490289605897, + "grad_norm": 2.421875, + "learning_rate": 0.00028598442741506724, + "loss": 2.8455, + "step": 516 + }, + { + "epoch": 0.14757033875438466, + "grad_norm": 2.671875, + "learning_rate": 0.0002859270197815056, + "loss": 2.82, + "step": 517 + }, + { + "epoch": 0.14785577461271038, + "grad_norm": 2.375, + "learning_rate": 0.0002858695006036566, + "loss": 2.8428, + "step": 518 + }, + { + "epoch": 0.14814121047103607, + "grad_norm": 2.828125, + "learning_rate": 0.0002858118699287216, + "loss": 2.8128, + "step": 519 + }, + { + "epoch": 0.14842664632936176, + "grad_norm": 2.53125, + "learning_rate": 0.00028575412780399345, + "loss": 2.8563, + "step": 520 + }, + { + "epoch": 0.14871208218768744, + "grad_norm": 3.28125, + "learning_rate": 0.00028569627427685627, + "loss": 2.8428, + "step": 521 + }, + { + "epoch": 0.14899751804601316, + "grad_norm": 2.5625, + "learning_rate": 0.000285638309394786, + "loss": 2.8274, + "step": 522 + }, + { + "epoch": 0.14928295390433885, + "grad_norm": 3.84375, + "learning_rate": 0.0002855802332053496, + "loss": 2.8169, + "step": 523 + }, + { + "epoch": 0.14956838976266454, + "grad_norm": 3.453125, + "learning_rate": 0.00028552204575620543, + "loss": 2.828, + "step": 524 + }, + { + "epoch": 0.14985382562099023, + "grad_norm": 2.53125, + "learning_rate": 0.0002854637470951033, + "loss": 2.8265, + "step": 525 + }, + { + "epoch": 0.15013926147931594, + "grad_norm": 2.484375, + "learning_rate": 0.00028540533726988414, + "loss": 2.853, + "step": 526 + }, + { + "epoch": 0.15042469733764163, + "grad_norm": 2.328125, + "learning_rate": 0.00028534681632848025, + "loss": 2.8193, + "step": 527 + }, + { + "epoch": 0.15071013319596732, + "grad_norm": 2.015625, + "learning_rate": 0.0002852881843189149, + "loss": 2.8112, + "step": 528 + }, + { + "epoch": 0.150995569054293, + "grad_norm": 2.71875, + "learning_rate": 0.0002852294412893027, + "loss": 2.8376, + "step": 529 + }, + { + "epoch": 0.15128100491261873, + "grad_norm": 2.421875, + "learning_rate": 0.00028517058728784933, + "loss": 2.8126, + "step": 530 + }, + { + "epoch": 0.15156644077094442, + "grad_norm": 2.5625, + "learning_rate": 0.0002851116223628514, + "loss": 2.8375, + "step": 531 + }, + { + "epoch": 0.1518518766292701, + "grad_norm": 2.40625, + "learning_rate": 0.00028505254656269673, + "loss": 2.8186, + "step": 532 + }, + { + "epoch": 0.1521373124875958, + "grad_norm": 2.3125, + "learning_rate": 0.00028499335993586403, + "loss": 2.8437, + "step": 533 + }, + { + "epoch": 0.1524227483459215, + "grad_norm": 1.96875, + "learning_rate": 0.0002849340625309229, + "loss": 2.7927, + "step": 534 + }, + { + "epoch": 0.1527081842042472, + "grad_norm": 2.578125, + "learning_rate": 0.000284874654396534, + "loss": 2.8123, + "step": 535 + }, + { + "epoch": 0.1529936200625729, + "grad_norm": 2.171875, + "learning_rate": 0.0002848151355814487, + "loss": 2.8459, + "step": 536 + }, + { + "epoch": 0.15327905592089858, + "grad_norm": 2.953125, + "learning_rate": 0.0002847555061345093, + "loss": 2.8225, + "step": 537 + }, + { + "epoch": 0.1535644917792243, + "grad_norm": 2.84375, + "learning_rate": 0.0002846957661046488, + "loss": 2.8028, + "step": 538 + }, + { + "epoch": 0.15384992763754998, + "grad_norm": 2.03125, + "learning_rate": 0.0002846359155408911, + "loss": 2.8167, + "step": 539 + }, + { + "epoch": 0.15413536349587567, + "grad_norm": 1.8984375, + "learning_rate": 0.0002845759544923507, + "loss": 2.83, + "step": 540 + }, + { + "epoch": 0.15442079935420136, + "grad_norm": 2.578125, + "learning_rate": 0.00028451588300823266, + "loss": 2.8233, + "step": 541 + }, + { + "epoch": 0.15470623521252708, + "grad_norm": 2.03125, + "learning_rate": 0.0002844557011378328, + "loss": 2.8076, + "step": 542 + }, + { + "epoch": 0.15499167107085277, + "grad_norm": 2.734375, + "learning_rate": 0.00028439540893053766, + "loss": 2.8473, + "step": 543 + }, + { + "epoch": 0.15527710692917845, + "grad_norm": 2.5625, + "learning_rate": 0.000284335006435824, + "loss": 2.8175, + "step": 544 + }, + { + "epoch": 0.15556254278750414, + "grad_norm": 2.28125, + "learning_rate": 0.00028427449370325937, + "loss": 2.8237, + "step": 545 + }, + { + "epoch": 0.15584797864582986, + "grad_norm": 1.9921875, + "learning_rate": 0.0002842138707825015, + "loss": 2.8176, + "step": 546 + }, + { + "epoch": 0.15613341450415555, + "grad_norm": 2.421875, + "learning_rate": 0.0002841531377232989, + "loss": 2.8295, + "step": 547 + }, + { + "epoch": 0.15641885036248124, + "grad_norm": 1.9453125, + "learning_rate": 0.0002840922945754901, + "loss": 2.8035, + "step": 548 + }, + { + "epoch": 0.15670428622080693, + "grad_norm": 2.921875, + "learning_rate": 0.00028403134138900427, + "loss": 2.8217, + "step": 549 + }, + { + "epoch": 0.15698972207913264, + "grad_norm": 2.546875, + "learning_rate": 0.0002839702782138607, + "loss": 2.8093, + "step": 550 + }, + { + "epoch": 0.15727515793745833, + "grad_norm": 2.296875, + "learning_rate": 0.00028390910510016896, + "loss": 2.8026, + "step": 551 + }, + { + "epoch": 0.15756059379578402, + "grad_norm": 2.34375, + "learning_rate": 0.00028384782209812893, + "loss": 2.8124, + "step": 552 + }, + { + "epoch": 0.1578460296541097, + "grad_norm": 1.84375, + "learning_rate": 0.0002837864292580305, + "loss": 2.8342, + "step": 553 + }, + { + "epoch": 0.15813146551243543, + "grad_norm": 1.75, + "learning_rate": 0.00028372492663025393, + "loss": 2.7897, + "step": 554 + }, + { + "epoch": 0.15841690137076112, + "grad_norm": 1.703125, + "learning_rate": 0.0002836633142652693, + "loss": 2.8149, + "step": 555 + }, + { + "epoch": 0.1587023372290868, + "grad_norm": 1.390625, + "learning_rate": 0.00028360159221363704, + "loss": 2.8298, + "step": 556 + }, + { + "epoch": 0.1589877730874125, + "grad_norm": 2.453125, + "learning_rate": 0.00028353976052600727, + "loss": 2.8108, + "step": 557 + }, + { + "epoch": 0.1592732089457382, + "grad_norm": 1.8125, + "learning_rate": 0.0002834778192531204, + "loss": 2.7943, + "step": 558 + }, + { + "epoch": 0.1595586448040639, + "grad_norm": 2.890625, + "learning_rate": 0.00028341576844580647, + "loss": 2.8394, + "step": 559 + }, + { + "epoch": 0.1598440806623896, + "grad_norm": 2.796875, + "learning_rate": 0.00028335360815498565, + "loss": 2.8056, + "step": 560 + }, + { + "epoch": 0.16012951652071528, + "grad_norm": 1.8515625, + "learning_rate": 0.00028329133843166786, + "loss": 2.8123, + "step": 561 + }, + { + "epoch": 0.160414952379041, + "grad_norm": 2.515625, + "learning_rate": 0.0002832289593269527, + "loss": 2.8239, + "step": 562 + }, + { + "epoch": 0.16070038823736668, + "grad_norm": 1.8203125, + "learning_rate": 0.00028316647089202975, + "loss": 2.8298, + "step": 563 + }, + { + "epoch": 0.16098582409569237, + "grad_norm": 2.875, + "learning_rate": 0.0002831038731781782, + "loss": 2.839, + "step": 564 + }, + { + "epoch": 0.16127125995401806, + "grad_norm": 2.65625, + "learning_rate": 0.00028304116623676685, + "loss": 2.8498, + "step": 565 + }, + { + "epoch": 0.16155669581234378, + "grad_norm": 2.21875, + "learning_rate": 0.0002829783501192542, + "loss": 2.8228, + "step": 566 + }, + { + "epoch": 0.16184213167066946, + "grad_norm": 2.109375, + "learning_rate": 0.0002829154248771885, + "loss": 2.8171, + "step": 567 + }, + { + "epoch": 0.16212756752899515, + "grad_norm": 1.9453125, + "learning_rate": 0.00028285239056220724, + "loss": 2.7826, + "step": 568 + }, + { + "epoch": 0.16241300338732084, + "grad_norm": 1.5078125, + "learning_rate": 0.0002827892472260376, + "loss": 2.8087, + "step": 569 + }, + { + "epoch": 0.16269843924564656, + "grad_norm": 2.046875, + "learning_rate": 0.00028272599492049625, + "loss": 2.7997, + "step": 570 + }, + { + "epoch": 0.16298387510397225, + "grad_norm": 1.609375, + "learning_rate": 0.00028266263369748916, + "loss": 2.8093, + "step": 571 + }, + { + "epoch": 0.16326931096229794, + "grad_norm": 1.765625, + "learning_rate": 0.0002825991636090118, + "loss": 2.7765, + "step": 572 + }, + { + "epoch": 0.16355474682062363, + "grad_norm": 1.3984375, + "learning_rate": 0.0002825355847071489, + "loss": 2.8033, + "step": 573 + }, + { + "epoch": 0.16384018267894934, + "grad_norm": 50.75, + "learning_rate": 0.00028247189704407456, + "loss": 2.8378, + "step": 574 + }, + { + "epoch": 0.16412561853727503, + "grad_norm": 4.03125, + "learning_rate": 0.000282408100672052, + "loss": 2.8366, + "step": 575 + }, + { + "epoch": 0.16441105439560072, + "grad_norm": 2.625, + "learning_rate": 0.0002823441956434338, + "loss": 2.8565, + "step": 576 + }, + { + "epoch": 0.1646964902539264, + "grad_norm": 3.15625, + "learning_rate": 0.0002822801820106617, + "loss": 2.8216, + "step": 577 + }, + { + "epoch": 0.1649819261122521, + "grad_norm": 2.84375, + "learning_rate": 0.0002822160598262663, + "loss": 2.8249, + "step": 578 + }, + { + "epoch": 0.16526736197057781, + "grad_norm": 2.25, + "learning_rate": 0.00028215182914286766, + "loss": 2.8343, + "step": 579 + }, + { + "epoch": 0.1655527978289035, + "grad_norm": 2.53125, + "learning_rate": 0.0002820874900131746, + "loss": 2.8027, + "step": 580 + }, + { + "epoch": 0.1658382336872292, + "grad_norm": 2.21875, + "learning_rate": 0.00028202304248998506, + "loss": 2.8204, + "step": 581 + }, + { + "epoch": 0.16612366954555488, + "grad_norm": 2.078125, + "learning_rate": 0.0002819584866261859, + "loss": 2.8122, + "step": 582 + }, + { + "epoch": 0.1664091054038806, + "grad_norm": 1.828125, + "learning_rate": 0.0002818938224747529, + "loss": 2.816, + "step": 583 + }, + { + "epoch": 0.16669454126220629, + "grad_norm": 1.5390625, + "learning_rate": 0.0002818290500887506, + "loss": 2.8286, + "step": 584 + }, + { + "epoch": 0.16697997712053197, + "grad_norm": 2.0625, + "learning_rate": 0.0002817641695213327, + "loss": 2.8046, + "step": 585 + }, + { + "epoch": 0.16726541297885766, + "grad_norm": 1.3359375, + "learning_rate": 0.00028169918082574105, + "loss": 2.8249, + "step": 586 + }, + { + "epoch": 0.16755084883718338, + "grad_norm": 1.9921875, + "learning_rate": 0.0002816340840553069, + "loss": 2.8051, + "step": 587 + }, + { + "epoch": 0.16783628469550907, + "grad_norm": 1.4140625, + "learning_rate": 0.00028156887926344975, + "loss": 2.8328, + "step": 588 + }, + { + "epoch": 0.16812172055383476, + "grad_norm": 2.1875, + "learning_rate": 0.00028150356650367796, + "loss": 2.8087, + "step": 589 + }, + { + "epoch": 0.16840715641216045, + "grad_norm": 1.7421875, + "learning_rate": 0.00028143814582958827, + "loss": 2.7976, + "step": 590 + }, + { + "epoch": 0.16869259227048616, + "grad_norm": 2.671875, + "learning_rate": 0.0002813726172948664, + "loss": 2.8238, + "step": 591 + }, + { + "epoch": 0.16897802812881185, + "grad_norm": 2.0625, + "learning_rate": 0.000281306980953286, + "loss": 2.8243, + "step": 592 + }, + { + "epoch": 0.16926346398713754, + "grad_norm": 2.734375, + "learning_rate": 0.0002812412368587097, + "loss": 2.8078, + "step": 593 + }, + { + "epoch": 0.16954889984546323, + "grad_norm": 2.328125, + "learning_rate": 0.0002811753850650883, + "loss": 2.7899, + "step": 594 + }, + { + "epoch": 0.16983433570378895, + "grad_norm": 3.09375, + "learning_rate": 0.000281109425626461, + "loss": 2.8176, + "step": 595 + }, + { + "epoch": 0.17011977156211464, + "grad_norm": 3.015625, + "learning_rate": 0.00028104335859695543, + "loss": 2.8235, + "step": 596 + }, + { + "epoch": 0.17040520742044032, + "grad_norm": 2.125, + "learning_rate": 0.0002809771840307873, + "loss": 2.7986, + "step": 597 + }, + { + "epoch": 0.170690643278766, + "grad_norm": 1.8984375, + "learning_rate": 0.0002809109019822609, + "loss": 2.7848, + "step": 598 + }, + { + "epoch": 0.17097607913709173, + "grad_norm": 2.203125, + "learning_rate": 0.00028084451250576844, + "loss": 2.7914, + "step": 599 + }, + { + "epoch": 0.17126151499541742, + "grad_norm": 1.625, + "learning_rate": 0.00028077801565579033, + "loss": 2.8036, + "step": 600 + }, + { + "epoch": 0.1715469508537431, + "grad_norm": 2.84375, + "learning_rate": 0.0002807114114868953, + "loss": 2.8006, + "step": 601 + }, + { + "epoch": 0.1718323867120688, + "grad_norm": 2.40625, + "learning_rate": 0.0002806447000537398, + "loss": 2.7898, + "step": 602 + }, + { + "epoch": 0.1721178225703945, + "grad_norm": 2.78125, + "learning_rate": 0.00028057788141106865, + "loss": 2.7905, + "step": 603 + }, + { + "epoch": 0.1724032584287202, + "grad_norm": 2.5625, + "learning_rate": 0.0002805109556137144, + "loss": 2.8129, + "step": 604 + }, + { + "epoch": 0.1726886942870459, + "grad_norm": 2.375, + "learning_rate": 0.0002804439227165977, + "loss": 2.8151, + "step": 605 + }, + { + "epoch": 0.17297413014537158, + "grad_norm": 2.140625, + "learning_rate": 0.00028037678277472697, + "loss": 2.7888, + "step": 606 + }, + { + "epoch": 0.1732595660036973, + "grad_norm": 2.515625, + "learning_rate": 0.0002803095358431985, + "loss": 2.7996, + "step": 607 + }, + { + "epoch": 0.17354500186202299, + "grad_norm": 2.234375, + "learning_rate": 0.00028024218197719643, + "loss": 2.7932, + "step": 608 + }, + { + "epoch": 0.17383043772034867, + "grad_norm": 2.609375, + "learning_rate": 0.0002801747212319926, + "loss": 2.7972, + "step": 609 + }, + { + "epoch": 0.17411587357867436, + "grad_norm": 2.375, + "learning_rate": 0.0002801071536629466, + "loss": 2.8141, + "step": 610 + }, + { + "epoch": 0.17440130943700008, + "grad_norm": 2.5, + "learning_rate": 0.0002800394793255056, + "loss": 2.8014, + "step": 611 + }, + { + "epoch": 0.17468674529532577, + "grad_norm": 2.40625, + "learning_rate": 0.00027997169827520454, + "loss": 2.8036, + "step": 612 + }, + { + "epoch": 0.17497218115365146, + "grad_norm": 2.359375, + "learning_rate": 0.0002799038105676658, + "loss": 2.8235, + "step": 613 + }, + { + "epoch": 0.17525761701197715, + "grad_norm": 2.109375, + "learning_rate": 0.00027983581625859927, + "loss": 2.7849, + "step": 614 + }, + { + "epoch": 0.17554305287030286, + "grad_norm": 2.40625, + "learning_rate": 0.0002797677154038024, + "loss": 2.7964, + "step": 615 + }, + { + "epoch": 0.17582848872862855, + "grad_norm": 2.15625, + "learning_rate": 0.00027969950805916, + "loss": 2.8027, + "step": 616 + }, + { + "epoch": 0.17611392458695424, + "grad_norm": 2.5, + "learning_rate": 0.0002796311942806444, + "loss": 2.783, + "step": 617 + }, + { + "epoch": 0.17639936044527993, + "grad_norm": 2.25, + "learning_rate": 0.00027956277412431507, + "loss": 2.7981, + "step": 618 + }, + { + "epoch": 0.17668479630360565, + "grad_norm": 2.46875, + "learning_rate": 0.00027949424764631896, + "loss": 2.8145, + "step": 619 + }, + { + "epoch": 0.17697023216193133, + "grad_norm": 2.265625, + "learning_rate": 0.0002794256149028902, + "loss": 2.83, + "step": 620 + }, + { + "epoch": 0.17725566802025702, + "grad_norm": 2.375, + "learning_rate": 0.00027935687595035015, + "loss": 2.811, + "step": 621 + }, + { + "epoch": 0.1775411038785827, + "grad_norm": 2.09375, + "learning_rate": 0.00027928803084510716, + "loss": 2.8016, + "step": 622 + }, + { + "epoch": 0.17782653973690843, + "grad_norm": 2.421875, + "learning_rate": 0.000279219079643657, + "loss": 2.7996, + "step": 623 + }, + { + "epoch": 0.17811197559523412, + "grad_norm": 2.203125, + "learning_rate": 0.0002791500224025822, + "loss": 2.817, + "step": 624 + }, + { + "epoch": 0.1783974114535598, + "grad_norm": 2.40625, + "learning_rate": 0.00027908085917855243, + "loss": 2.8096, + "step": 625 + }, + { + "epoch": 0.1786828473118855, + "grad_norm": 2.09375, + "learning_rate": 0.0002790115900283245, + "loss": 2.7852, + "step": 626 + }, + { + "epoch": 0.1789682831702112, + "grad_norm": 2.28125, + "learning_rate": 0.00027894221500874184, + "loss": 2.8088, + "step": 627 + }, + { + "epoch": 0.1792537190285369, + "grad_norm": 2.046875, + "learning_rate": 0.0002788727341767349, + "loss": 2.767, + "step": 628 + }, + { + "epoch": 0.1795391548868626, + "grad_norm": 2.4375, + "learning_rate": 0.0002788031475893211, + "loss": 2.7955, + "step": 629 + }, + { + "epoch": 0.17982459074518828, + "grad_norm": 2.125, + "learning_rate": 0.00027873345530360436, + "loss": 2.8143, + "step": 630 + }, + { + "epoch": 0.180110026603514, + "grad_norm": 2.625, + "learning_rate": 0.00027866365737677564, + "loss": 2.777, + "step": 631 + }, + { + "epoch": 0.18039546246183968, + "grad_norm": 2.234375, + "learning_rate": 0.00027859375386611227, + "loss": 2.8, + "step": 632 + }, + { + "epoch": 0.18068089832016537, + "grad_norm": 2.65625, + "learning_rate": 0.0002785237448289786, + "loss": 2.7796, + "step": 633 + }, + { + "epoch": 0.18096633417849106, + "grad_norm": 2.421875, + "learning_rate": 0.00027845363032282514, + "loss": 2.8042, + "step": 634 + }, + { + "epoch": 0.18125177003681678, + "grad_norm": 2.171875, + "learning_rate": 0.0002783834104051893, + "loss": 2.8206, + "step": 635 + }, + { + "epoch": 0.18153720589514247, + "grad_norm": 2.171875, + "learning_rate": 0.00027831308513369494, + "loss": 2.812, + "step": 636 + }, + { + "epoch": 0.18182264175346816, + "grad_norm": 1.953125, + "learning_rate": 0.00027824265456605224, + "loss": 2.7804, + "step": 637 + }, + { + "epoch": 0.18210807761179384, + "grad_norm": 1.859375, + "learning_rate": 0.00027817211876005786, + "loss": 2.7941, + "step": 638 + }, + { + "epoch": 0.18239351347011956, + "grad_norm": 1.734375, + "learning_rate": 0.0002781014777735948, + "loss": 2.7842, + "step": 639 + }, + { + "epoch": 0.18267894932844525, + "grad_norm": 1.671875, + "learning_rate": 0.00027803073166463244, + "loss": 2.7955, + "step": 640 + }, + { + "epoch": 0.18296438518677094, + "grad_norm": 1.7578125, + "learning_rate": 0.00027795988049122625, + "loss": 2.7597, + "step": 641 + }, + { + "epoch": 0.18324982104509663, + "grad_norm": 1.4453125, + "learning_rate": 0.0002778889243115183, + "loss": 2.811, + "step": 642 + }, + { + "epoch": 0.18353525690342234, + "grad_norm": 1.7734375, + "learning_rate": 0.00027781786318373627, + "loss": 2.7948, + "step": 643 + }, + { + "epoch": 0.18382069276174803, + "grad_norm": 1.4296875, + "learning_rate": 0.0002777466971661945, + "loss": 2.7811, + "step": 644 + }, + { + "epoch": 0.18410612862007372, + "grad_norm": 2.0625, + "learning_rate": 0.00027767542631729306, + "loss": 2.7838, + "step": 645 + }, + { + "epoch": 0.1843915644783994, + "grad_norm": 1.65625, + "learning_rate": 0.0002776040506955182, + "loss": 2.7958, + "step": 646 + }, + { + "epoch": 0.18467700033672513, + "grad_norm": 2.1875, + "learning_rate": 0.0002775325703594421, + "loss": 2.7798, + "step": 647 + }, + { + "epoch": 0.18496243619505082, + "grad_norm": 1.8984375, + "learning_rate": 0.0002774609853677229, + "loss": 2.7891, + "step": 648 + }, + { + "epoch": 0.1852478720533765, + "grad_norm": 2.25, + "learning_rate": 0.0002773892957791045, + "loss": 2.8067, + "step": 649 + }, + { + "epoch": 0.1855333079117022, + "grad_norm": 1.9140625, + "learning_rate": 0.0002773175016524169, + "loss": 2.7842, + "step": 650 + }, + { + "epoch": 0.18581874377002788, + "grad_norm": 2.265625, + "learning_rate": 0.00027724560304657553, + "loss": 2.7706, + "step": 651 + }, + { + "epoch": 0.1861041796283536, + "grad_norm": 2.03125, + "learning_rate": 0.0002771736000205819, + "loss": 2.7912, + "step": 652 + }, + { + "epoch": 0.1863896154866793, + "grad_norm": 2.40625, + "learning_rate": 0.000277101492633523, + "loss": 2.7859, + "step": 653 + }, + { + "epoch": 0.18667505134500498, + "grad_norm": 2.140625, + "learning_rate": 0.0002770292809445715, + "loss": 2.7637, + "step": 654 + }, + { + "epoch": 0.18696048720333067, + "grad_norm": 2.359375, + "learning_rate": 0.0002769569650129857, + "loss": 2.7884, + "step": 655 + }, + { + "epoch": 0.18724592306165638, + "grad_norm": 2.234375, + "learning_rate": 0.00027688454489810946, + "loss": 2.7858, + "step": 656 + }, + { + "epoch": 0.18753135891998207, + "grad_norm": 1.9921875, + "learning_rate": 0.00027681202065937203, + "loss": 2.7677, + "step": 657 + }, + { + "epoch": 0.18781679477830776, + "grad_norm": 1.796875, + "learning_rate": 0.00027673939235628827, + "loss": 2.7883, + "step": 658 + }, + { + "epoch": 0.18810223063663345, + "grad_norm": 2.21875, + "learning_rate": 0.00027666666004845823, + "loss": 2.7624, + "step": 659 + }, + { + "epoch": 0.18838766649495917, + "grad_norm": 1.9609375, + "learning_rate": 0.0002765938237955674, + "loss": 2.8089, + "step": 660 + }, + { + "epoch": 0.18867310235328486, + "grad_norm": 2.328125, + "learning_rate": 0.0002765208836573868, + "loss": 2.7795, + "step": 661 + }, + { + "epoch": 0.18895853821161054, + "grad_norm": 2.140625, + "learning_rate": 0.0002764478396937722, + "loss": 2.7722, + "step": 662 + }, + { + "epoch": 0.18924397406993623, + "grad_norm": 2.171875, + "learning_rate": 0.00027637469196466506, + "loss": 2.7653, + "step": 663 + }, + { + "epoch": 0.18952940992826195, + "grad_norm": 1.984375, + "learning_rate": 0.00027630144053009174, + "loss": 2.7717, + "step": 664 + }, + { + "epoch": 0.18981484578658764, + "grad_norm": 2.15625, + "learning_rate": 0.0002762280854501638, + "loss": 2.762, + "step": 665 + }, + { + "epoch": 0.19010028164491333, + "grad_norm": 2.03125, + "learning_rate": 0.00027615462678507775, + "loss": 2.7989, + "step": 666 + }, + { + "epoch": 0.19038571750323902, + "grad_norm": 2.203125, + "learning_rate": 0.00027608106459511513, + "loss": 2.7851, + "step": 667 + }, + { + "epoch": 0.19067115336156473, + "grad_norm": 2.15625, + "learning_rate": 0.0002760073989406425, + "loss": 2.7428, + "step": 668 + }, + { + "epoch": 0.19095658921989042, + "grad_norm": 1.9921875, + "learning_rate": 0.00027593362988211133, + "loss": 2.7699, + "step": 669 + }, + { + "epoch": 0.1912420250782161, + "grad_norm": 1.875, + "learning_rate": 0.00027585975748005783, + "loss": 2.7797, + "step": 670 + }, + { + "epoch": 0.1915274609365418, + "grad_norm": 2.109375, + "learning_rate": 0.0002757857817951032, + "loss": 2.7656, + "step": 671 + }, + { + "epoch": 0.19181289679486752, + "grad_norm": 2.078125, + "learning_rate": 0.00027571170288795323, + "loss": 2.7674, + "step": 672 + }, + { + "epoch": 0.1920983326531932, + "grad_norm": 1.9765625, + "learning_rate": 0.0002756375208193985, + "loss": 2.7576, + "step": 673 + }, + { + "epoch": 0.1923837685115189, + "grad_norm": 1.8984375, + "learning_rate": 0.0002755632356503141, + "loss": 2.7844, + "step": 674 + }, + { + "epoch": 0.19266920436984458, + "grad_norm": 2.03125, + "learning_rate": 0.00027548884744166, + "loss": 2.7817, + "step": 675 + }, + { + "epoch": 0.1929546402281703, + "grad_norm": 1.8671875, + "learning_rate": 0.0002754143562544805, + "loss": 2.7589, + "step": 676 + }, + { + "epoch": 0.193240076086496, + "grad_norm": 2.125, + "learning_rate": 0.0002753397621499045, + "loss": 2.7841, + "step": 677 + }, + { + "epoch": 0.19352551194482168, + "grad_norm": 1.9453125, + "learning_rate": 0.00027526506518914533, + "loss": 2.7945, + "step": 678 + }, + { + "epoch": 0.19381094780314737, + "grad_norm": 2.140625, + "learning_rate": 0.00027519026543350067, + "loss": 2.7896, + "step": 679 + }, + { + "epoch": 0.19409638366147308, + "grad_norm": 1.9609375, + "learning_rate": 0.0002751153629443528, + "loss": 2.7839, + "step": 680 + }, + { + "epoch": 0.19438181951979877, + "grad_norm": 2.078125, + "learning_rate": 0.0002750403577831679, + "loss": 2.7684, + "step": 681 + }, + { + "epoch": 0.19466725537812446, + "grad_norm": 1.828125, + "learning_rate": 0.00027496525001149676, + "loss": 2.7598, + "step": 682 + }, + { + "epoch": 0.19495269123645015, + "grad_norm": 2.015625, + "learning_rate": 0.00027489003969097416, + "loss": 2.7652, + "step": 683 + }, + { + "epoch": 0.19523812709477587, + "grad_norm": 1.9765625, + "learning_rate": 0.00027481472688331923, + "loss": 2.7909, + "step": 684 + }, + { + "epoch": 0.19552356295310155, + "grad_norm": 1.984375, + "learning_rate": 0.00027473931165033496, + "loss": 2.7535, + "step": 685 + }, + { + "epoch": 0.19580899881142724, + "grad_norm": 1.8671875, + "learning_rate": 0.00027466379405390864, + "loss": 2.763, + "step": 686 + }, + { + "epoch": 0.19609443466975293, + "grad_norm": 1.96875, + "learning_rate": 0.0002745881741560113, + "loss": 2.8034, + "step": 687 + }, + { + "epoch": 0.19637987052807865, + "grad_norm": 1.671875, + "learning_rate": 0.0002745124520186981, + "loss": 2.7538, + "step": 688 + }, + { + "epoch": 0.19666530638640434, + "grad_norm": 2.109375, + "learning_rate": 0.0002744366277041082, + "loss": 2.7494, + "step": 689 + }, + { + "epoch": 0.19695074224473003, + "grad_norm": 1.8125, + "learning_rate": 0.0002743607012744643, + "loss": 2.7578, + "step": 690 + }, + { + "epoch": 0.19723617810305571, + "grad_norm": 1.984375, + "learning_rate": 0.00027428467279207316, + "loss": 2.7845, + "step": 691 + }, + { + "epoch": 0.19752161396138143, + "grad_norm": 1.7890625, + "learning_rate": 0.00027420854231932515, + "loss": 2.7833, + "step": 692 + }, + { + "epoch": 0.19780704981970712, + "grad_norm": 1.8671875, + "learning_rate": 0.0002741323099186944, + "loss": 2.7835, + "step": 693 + }, + { + "epoch": 0.1980924856780328, + "grad_norm": 1.6640625, + "learning_rate": 0.00027405597565273866, + "loss": 2.7663, + "step": 694 + }, + { + "epoch": 0.1983779215363585, + "grad_norm": 2.0, + "learning_rate": 0.00027397953958409923, + "loss": 2.7737, + "step": 695 + }, + { + "epoch": 0.19866335739468421, + "grad_norm": 1.71875, + "learning_rate": 0.00027390300177550106, + "loss": 2.7501, + "step": 696 + }, + { + "epoch": 0.1989487932530099, + "grad_norm": 1.9296875, + "learning_rate": 0.0002738263622897525, + "loss": 2.7862, + "step": 697 + }, + { + "epoch": 0.1992342291113356, + "grad_norm": 1.71875, + "learning_rate": 0.0002737496211897453, + "loss": 2.7629, + "step": 698 + }, + { + "epoch": 0.19951966496966128, + "grad_norm": 1.8515625, + "learning_rate": 0.0002736727785384548, + "loss": 2.7394, + "step": 699 + }, + { + "epoch": 0.199805100827987, + "grad_norm": 1.7421875, + "learning_rate": 0.00027359583439893944, + "loss": 2.7867, + "step": 700 + }, + { + "epoch": 0.2000905366863127, + "grad_norm": 1.765625, + "learning_rate": 0.00027351878883434105, + "loss": 2.7564, + "step": 701 + }, + { + "epoch": 0.20037597254463838, + "grad_norm": 1.53125, + "learning_rate": 0.0002734416419078847, + "loss": 2.7623, + "step": 702 + }, + { + "epoch": 0.20066140840296406, + "grad_norm": 2.0625, + "learning_rate": 0.00027336439368287857, + "loss": 2.7678, + "step": 703 + }, + { + "epoch": 0.20094684426128978, + "grad_norm": 1.9375, + "learning_rate": 0.0002732870442227141, + "loss": 2.7727, + "step": 704 + }, + { + "epoch": 0.20123228011961547, + "grad_norm": 1.828125, + "learning_rate": 0.00027320959359086565, + "loss": 2.7808, + "step": 705 + }, + { + "epoch": 0.20151771597794116, + "grad_norm": 1.703125, + "learning_rate": 0.0002731320418508907, + "loss": 2.7509, + "step": 706 + }, + { + "epoch": 0.20180315183626685, + "grad_norm": 1.8125, + "learning_rate": 0.0002730543890664297, + "loss": 2.7839, + "step": 707 + }, + { + "epoch": 0.20208858769459256, + "grad_norm": 1.546875, + "learning_rate": 0.0002729766353012059, + "loss": 2.7573, + "step": 708 + }, + { + "epoch": 0.20237402355291825, + "grad_norm": 2.296875, + "learning_rate": 0.0002728987806190257, + "loss": 2.7872, + "step": 709 + }, + { + "epoch": 0.20265945941124394, + "grad_norm": 1.9609375, + "learning_rate": 0.00027282082508377795, + "loss": 2.7727, + "step": 710 + }, + { + "epoch": 0.20294489526956963, + "grad_norm": 2.203125, + "learning_rate": 0.0002727427687594345, + "loss": 2.7632, + "step": 711 + }, + { + "epoch": 0.20323033112789535, + "grad_norm": 2.03125, + "learning_rate": 0.00027266461171004985, + "loss": 2.7631, + "step": 712 + }, + { + "epoch": 0.20351576698622104, + "grad_norm": 2.046875, + "learning_rate": 0.00027258635399976115, + "loss": 2.768, + "step": 713 + }, + { + "epoch": 0.20380120284454672, + "grad_norm": 1.7890625, + "learning_rate": 0.00027250799569278816, + "loss": 2.7666, + "step": 714 + }, + { + "epoch": 0.2040866387028724, + "grad_norm": 2.25, + "learning_rate": 0.00027242953685343327, + "loss": 2.7794, + "step": 715 + }, + { + "epoch": 0.20437207456119813, + "grad_norm": 1.890625, + "learning_rate": 0.0002723509775460811, + "loss": 2.7449, + "step": 716 + }, + { + "epoch": 0.20465751041952382, + "grad_norm": 2.140625, + "learning_rate": 0.00027227231783519913, + "loss": 2.7529, + "step": 717 + }, + { + "epoch": 0.2049429462778495, + "grad_norm": 1.8984375, + "learning_rate": 0.0002721935577853368, + "loss": 2.7785, + "step": 718 + }, + { + "epoch": 0.2052283821361752, + "grad_norm": 1.9765625, + "learning_rate": 0.00027211469746112624, + "loss": 2.7653, + "step": 719 + }, + { + "epoch": 0.2055138179945009, + "grad_norm": 1.734375, + "learning_rate": 0.00027203573692728174, + "loss": 2.7664, + "step": 720 + }, + { + "epoch": 0.2057992538528266, + "grad_norm": 2.0625, + "learning_rate": 0.0002719566762485997, + "loss": 2.7677, + "step": 721 + }, + { + "epoch": 0.2060846897111523, + "grad_norm": 1.7578125, + "learning_rate": 0.0002718775154899589, + "loss": 2.7667, + "step": 722 + }, + { + "epoch": 0.20637012556947798, + "grad_norm": 2.078125, + "learning_rate": 0.0002717982547163201, + "loss": 2.7674, + "step": 723 + }, + { + "epoch": 0.20665556142780367, + "grad_norm": 1.84375, + "learning_rate": 0.0002717188939927262, + "loss": 2.7747, + "step": 724 + }, + { + "epoch": 0.20694099728612939, + "grad_norm": 1.984375, + "learning_rate": 0.00027163943338430214, + "loss": 2.7299, + "step": 725 + }, + { + "epoch": 0.20722643314445507, + "grad_norm": 1.8359375, + "learning_rate": 0.0002715598729562548, + "loss": 2.7672, + "step": 726 + }, + { + "epoch": 0.20751186900278076, + "grad_norm": 1.875, + "learning_rate": 0.000271480212773873, + "loss": 2.7847, + "step": 727 + }, + { + "epoch": 0.20779730486110645, + "grad_norm": 1.609375, + "learning_rate": 0.0002714004529025273, + "loss": 2.7886, + "step": 728 + }, + { + "epoch": 0.20808274071943217, + "grad_norm": 1.890625, + "learning_rate": 0.00027132059340767025, + "loss": 2.7586, + "step": 729 + }, + { + "epoch": 0.20836817657775786, + "grad_norm": 1.5859375, + "learning_rate": 0.00027124063435483603, + "loss": 2.779, + "step": 730 + }, + { + "epoch": 0.20865361243608355, + "grad_norm": 2.0625, + "learning_rate": 0.0002711605758096406, + "loss": 2.7593, + "step": 731 + }, + { + "epoch": 0.20893904829440924, + "grad_norm": 1.7109375, + "learning_rate": 0.0002710804178377814, + "loss": 2.7684, + "step": 732 + }, + { + "epoch": 0.20922448415273495, + "grad_norm": 2.03125, + "learning_rate": 0.0002710001605050377, + "loss": 2.7542, + "step": 733 + }, + { + "epoch": 0.20950992001106064, + "grad_norm": 1.7578125, + "learning_rate": 0.00027091980387727014, + "loss": 2.7644, + "step": 734 + }, + { + "epoch": 0.20979535586938633, + "grad_norm": 2.15625, + "learning_rate": 0.00027083934802042084, + "loss": 2.7772, + "step": 735 + }, + { + "epoch": 0.21008079172771202, + "grad_norm": 1.8359375, + "learning_rate": 0.0002707587930005136, + "loss": 2.7419, + "step": 736 + }, + { + "epoch": 0.21036622758603774, + "grad_norm": 2.09375, + "learning_rate": 0.0002706781388836531, + "loss": 2.7889, + "step": 737 + }, + { + "epoch": 0.21065166344436342, + "grad_norm": 1.765625, + "learning_rate": 0.00027059738573602583, + "loss": 2.768, + "step": 738 + }, + { + "epoch": 0.2109370993026891, + "grad_norm": 2.25, + "learning_rate": 0.00027051653362389935, + "loss": 2.8016, + "step": 739 + }, + { + "epoch": 0.2112225351610148, + "grad_norm": 1.8046875, + "learning_rate": 0.0002704355826136224, + "loss": 2.758, + "step": 740 + }, + { + "epoch": 0.21150797101934052, + "grad_norm": 2.203125, + "learning_rate": 0.0002703545327716249, + "loss": 2.7658, + "step": 741 + }, + { + "epoch": 0.2117934068776662, + "grad_norm": 1.859375, + "learning_rate": 0.00027027338416441785, + "loss": 2.7693, + "step": 742 + }, + { + "epoch": 0.2120788427359919, + "grad_norm": 2.40625, + "learning_rate": 0.0002701921368585934, + "loss": 2.7948, + "step": 743 + }, + { + "epoch": 0.21236427859431758, + "grad_norm": 2.0, + "learning_rate": 0.0002701107909208246, + "loss": 2.7832, + "step": 744 + }, + { + "epoch": 0.2126497144526433, + "grad_norm": 2.25, + "learning_rate": 0.00027002934641786545, + "loss": 2.7851, + "step": 745 + }, + { + "epoch": 0.212935150310969, + "grad_norm": 2.0625, + "learning_rate": 0.00026994780341655093, + "loss": 2.7461, + "step": 746 + }, + { + "epoch": 0.21322058616929468, + "grad_norm": 2.015625, + "learning_rate": 0.0002698661619837967, + "loss": 2.7511, + "step": 747 + }, + { + "epoch": 0.21350602202762037, + "grad_norm": 1.8125, + "learning_rate": 0.0002697844221865993, + "loss": 2.7562, + "step": 748 + }, + { + "epoch": 0.21379145788594608, + "grad_norm": 2.078125, + "learning_rate": 0.00026970258409203594, + "loss": 2.729, + "step": 749 + }, + { + "epoch": 0.21407689374427177, + "grad_norm": 1.7421875, + "learning_rate": 0.00026962064776726445, + "loss": 2.7467, + "step": 750 + }, + { + "epoch": 0.21407689374427177, + "eval_loss": 2.6212494373321533, + "eval_runtime": 5936.0633, + "eval_samples_per_second": 10.83, + "eval_steps_per_second": 10.83, + "step": 750 + }, + { + "epoch": 0.21436232960259746, + "grad_norm": 2.0625, + "learning_rate": 0.0002695386132795234, + "loss": 2.7875, + "step": 751 + }, + { + "epoch": 0.21464776546092315, + "grad_norm": 1.8828125, + "learning_rate": 0.0002694564806961319, + "loss": 2.7879, + "step": 752 + }, + { + "epoch": 0.21493320131924887, + "grad_norm": 1.9140625, + "learning_rate": 0.00026937425008448937, + "loss": 2.7634, + "step": 753 + }, + { + "epoch": 0.21521863717757456, + "grad_norm": 1.65625, + "learning_rate": 0.0002692919215120759, + "loss": 2.7563, + "step": 754 + }, + { + "epoch": 0.21550407303590025, + "grad_norm": 1.953125, + "learning_rate": 0.0002692094950464519, + "loss": 2.7836, + "step": 755 + }, + { + "epoch": 0.21578950889422593, + "grad_norm": 1.59375, + "learning_rate": 0.000269126970755258, + "loss": 2.7366, + "step": 756 + }, + { + "epoch": 0.21607494475255165, + "grad_norm": 1.828125, + "learning_rate": 0.00026904434870621524, + "loss": 2.7813, + "step": 757 + }, + { + "epoch": 0.21636038061087734, + "grad_norm": 1.53125, + "learning_rate": 0.00026896162896712476, + "loss": 2.7718, + "step": 758 + }, + { + "epoch": 0.21664581646920303, + "grad_norm": 2.0, + "learning_rate": 0.00026887881160586813, + "loss": 2.7536, + "step": 759 + }, + { + "epoch": 0.21693125232752872, + "grad_norm": 1.6953125, + "learning_rate": 0.0002687958966904067, + "loss": 2.7619, + "step": 760 + }, + { + "epoch": 0.21721668818585443, + "grad_norm": 1.84375, + "learning_rate": 0.00026871288428878206, + "loss": 2.7672, + "step": 761 + }, + { + "epoch": 0.21750212404418012, + "grad_norm": 1.6328125, + "learning_rate": 0.0002686297744691158, + "loss": 2.7571, + "step": 762 + }, + { + "epoch": 0.2177875599025058, + "grad_norm": 2.015625, + "learning_rate": 0.0002685465672996093, + "loss": 2.7652, + "step": 763 + }, + { + "epoch": 0.2180729957608315, + "grad_norm": 1.6953125, + "learning_rate": 0.000268463262848544, + "loss": 2.7748, + "step": 764 + }, + { + "epoch": 0.21835843161915722, + "grad_norm": 1.7109375, + "learning_rate": 0.0002683798611842812, + "loss": 2.7583, + "step": 765 + }, + { + "epoch": 0.2186438674774829, + "grad_norm": 1.59375, + "learning_rate": 0.0002682963623752617, + "loss": 2.7586, + "step": 766 + }, + { + "epoch": 0.2189293033358086, + "grad_norm": 1.828125, + "learning_rate": 0.0002682127664900064, + "loss": 2.7338, + "step": 767 + }, + { + "epoch": 0.21921473919413428, + "grad_norm": 1.40625, + "learning_rate": 0.0002681290735971156, + "loss": 2.752, + "step": 768 + }, + { + "epoch": 0.21950017505246, + "grad_norm": 1.796875, + "learning_rate": 0.0002680452837652691, + "loss": 2.7629, + "step": 769 + }, + { + "epoch": 0.2197856109107857, + "grad_norm": 1.5625, + "learning_rate": 0.0002679613970632267, + "loss": 2.7652, + "step": 770 + }, + { + "epoch": 0.22007104676911138, + "grad_norm": 2.109375, + "learning_rate": 0.0002678774135598272, + "loss": 2.7537, + "step": 771 + }, + { + "epoch": 0.22035648262743707, + "grad_norm": 1.6875, + "learning_rate": 0.00026779333332398923, + "loss": 2.7141, + "step": 772 + }, + { + "epoch": 0.22064191848576278, + "grad_norm": 1.90625, + "learning_rate": 0.0002677091564247105, + "loss": 2.757, + "step": 773 + }, + { + "epoch": 0.22092735434408847, + "grad_norm": 1.6875, + "learning_rate": 0.0002676248829310682, + "loss": 2.7454, + "step": 774 + }, + { + "epoch": 0.22121279020241416, + "grad_norm": 1.8671875, + "learning_rate": 0.0002675405129122188, + "loss": 2.7545, + "step": 775 + }, + { + "epoch": 0.22149822606073985, + "grad_norm": 1.4765625, + "learning_rate": 0.0002674560464373979, + "loss": 2.7331, + "step": 776 + }, + { + "epoch": 0.22178366191906557, + "grad_norm": 1.9609375, + "learning_rate": 0.0002673714835759202, + "loss": 2.7603, + "step": 777 + }, + { + "epoch": 0.22206909777739126, + "grad_norm": 1.515625, + "learning_rate": 0.00026728682439717974, + "loss": 2.7551, + "step": 778 + }, + { + "epoch": 0.22235453363571694, + "grad_norm": 2.234375, + "learning_rate": 0.0002672020689706493, + "loss": 2.7814, + "step": 779 + }, + { + "epoch": 0.22263996949404263, + "grad_norm": 1.765625, + "learning_rate": 0.00026711721736588103, + "loss": 2.7604, + "step": 780 + }, + { + "epoch": 0.22292540535236835, + "grad_norm": 2.0625, + "learning_rate": 0.00026703226965250546, + "loss": 2.7551, + "step": 781 + }, + { + "epoch": 0.22321084121069404, + "grad_norm": 1.890625, + "learning_rate": 0.00026694722590023246, + "loss": 2.7357, + "step": 782 + }, + { + "epoch": 0.22349627706901973, + "grad_norm": 1.875, + "learning_rate": 0.00026686208617885055, + "loss": 2.7532, + "step": 783 + }, + { + "epoch": 0.22378171292734542, + "grad_norm": 1.6875, + "learning_rate": 0.0002667768505582269, + "loss": 2.7388, + "step": 784 + }, + { + "epoch": 0.22406714878567113, + "grad_norm": 1.6484375, + "learning_rate": 0.0002666915191083076, + "loss": 2.7594, + "step": 785 + }, + { + "epoch": 0.22435258464399682, + "grad_norm": 1.4296875, + "learning_rate": 0.00026660609189911724, + "loss": 2.7504, + "step": 786 + }, + { + "epoch": 0.2246380205023225, + "grad_norm": 1.515625, + "learning_rate": 0.00026652056900075885, + "loss": 2.7631, + "step": 787 + }, + { + "epoch": 0.2249234563606482, + "grad_norm": 1.2578125, + "learning_rate": 0.0002664349504834143, + "loss": 2.7534, + "step": 788 + }, + { + "epoch": 0.22520889221897392, + "grad_norm": 1.6875, + "learning_rate": 0.00026634923641734374, + "loss": 2.7584, + "step": 789 + }, + { + "epoch": 0.2254943280772996, + "grad_norm": 1.3125, + "learning_rate": 0.00026626342687288576, + "loss": 2.7519, + "step": 790 + }, + { + "epoch": 0.2257797639356253, + "grad_norm": 2.0625, + "learning_rate": 0.0002661775219204572, + "loss": 2.7477, + "step": 791 + }, + { + "epoch": 0.22606519979395098, + "grad_norm": 1.7578125, + "learning_rate": 0.0002660915216305534, + "loss": 2.7484, + "step": 792 + }, + { + "epoch": 0.22635063565227667, + "grad_norm": 1.734375, + "learning_rate": 0.0002660054260737478, + "loss": 2.7718, + "step": 793 + }, + { + "epoch": 0.2266360715106024, + "grad_norm": 1.6015625, + "learning_rate": 0.000265919235320692, + "loss": 2.7437, + "step": 794 + }, + { + "epoch": 0.22692150736892808, + "grad_norm": 1.6953125, + "learning_rate": 0.00026583294944211583, + "loss": 2.7564, + "step": 795 + }, + { + "epoch": 0.22720694322725377, + "grad_norm": 1.3046875, + "learning_rate": 0.00026574656850882706, + "loss": 2.7322, + "step": 796 + }, + { + "epoch": 0.22749237908557945, + "grad_norm": 1.8828125, + "learning_rate": 0.0002656600925917116, + "loss": 2.7623, + "step": 797 + }, + { + "epoch": 0.22777781494390517, + "grad_norm": 1.46875, + "learning_rate": 0.00026557352176173317, + "loss": 2.7294, + "step": 798 + }, + { + "epoch": 0.22806325080223086, + "grad_norm": 2.1875, + "learning_rate": 0.00026548685608993337, + "loss": 2.7457, + "step": 799 + }, + { + "epoch": 0.22834868666055655, + "grad_norm": 2.015625, + "learning_rate": 0.0002654000956474318, + "loss": 2.7512, + "step": 800 + }, + { + "epoch": 0.22863412251888224, + "grad_norm": 1.6953125, + "learning_rate": 0.0002653132405054257, + "loss": 2.7251, + "step": 801 + }, + { + "epoch": 0.22891955837720795, + "grad_norm": 1.59375, + "learning_rate": 0.00026522629073519, + "loss": 2.7645, + "step": 802 + }, + { + "epoch": 0.22920499423553364, + "grad_norm": 1.6328125, + "learning_rate": 0.00026513924640807733, + "loss": 2.7856, + "step": 803 + }, + { + "epoch": 0.22949043009385933, + "grad_norm": 1.3203125, + "learning_rate": 0.000265052107595518, + "loss": 2.7234, + "step": 804 + }, + { + "epoch": 0.22977586595218502, + "grad_norm": 1.84375, + "learning_rate": 0.00026496487436901964, + "loss": 2.7626, + "step": 805 + }, + { + "epoch": 0.23006130181051074, + "grad_norm": 1.578125, + "learning_rate": 0.00026487754680016765, + "loss": 2.7252, + "step": 806 + }, + { + "epoch": 0.23034673766883643, + "grad_norm": 1.890625, + "learning_rate": 0.0002647901249606245, + "loss": 2.7371, + "step": 807 + }, + { + "epoch": 0.23063217352716212, + "grad_norm": 1.7109375, + "learning_rate": 0.00026470260892213034, + "loss": 2.7533, + "step": 808 + }, + { + "epoch": 0.2309176093854878, + "grad_norm": 1.734375, + "learning_rate": 0.00026461499875650245, + "loss": 2.7512, + "step": 809 + }, + { + "epoch": 0.23120304524381352, + "grad_norm": 1.59375, + "learning_rate": 0.0002645272945356354, + "loss": 2.7423, + "step": 810 + }, + { + "epoch": 0.2314884811021392, + "grad_norm": 1.890625, + "learning_rate": 0.0002644394963315009, + "loss": 2.7495, + "step": 811 + }, + { + "epoch": 0.2317739169604649, + "grad_norm": 1.6484375, + "learning_rate": 0.00026435160421614784, + "loss": 2.7378, + "step": 812 + }, + { + "epoch": 0.2320593528187906, + "grad_norm": 1.859375, + "learning_rate": 0.0002642636182617022, + "loss": 2.7887, + "step": 813 + }, + { + "epoch": 0.2323447886771163, + "grad_norm": 1.5390625, + "learning_rate": 0.0002641755385403669, + "loss": 2.7452, + "step": 814 + }, + { + "epoch": 0.232630224535442, + "grad_norm": 2.0625, + "learning_rate": 0.0002640873651244217, + "loss": 2.7407, + "step": 815 + }, + { + "epoch": 0.23291566039376768, + "grad_norm": 1.671875, + "learning_rate": 0.0002639990980862236, + "loss": 2.7571, + "step": 816 + }, + { + "epoch": 0.23320109625209337, + "grad_norm": 2.15625, + "learning_rate": 0.00026391073749820607, + "loss": 2.7219, + "step": 817 + }, + { + "epoch": 0.2334865321104191, + "grad_norm": 1.953125, + "learning_rate": 0.00026382228343287947, + "loss": 2.7314, + "step": 818 + }, + { + "epoch": 0.23377196796874478, + "grad_norm": 2.15625, + "learning_rate": 0.0002637337359628309, + "loss": 2.7363, + "step": 819 + }, + { + "epoch": 0.23405740382707046, + "grad_norm": 1.953125, + "learning_rate": 0.00026364509516072415, + "loss": 2.7455, + "step": 820 + }, + { + "epoch": 0.23434283968539615, + "grad_norm": 1.921875, + "learning_rate": 0.00026355636109929946, + "loss": 2.7301, + "step": 821 + }, + { + "epoch": 0.23462827554372187, + "grad_norm": 1.7578125, + "learning_rate": 0.0002634675338513738, + "loss": 2.733, + "step": 822 + }, + { + "epoch": 0.23491371140204756, + "grad_norm": 1.71875, + "learning_rate": 0.00026337861348984024, + "loss": 2.7564, + "step": 823 + }, + { + "epoch": 0.23519914726037325, + "grad_norm": 1.4609375, + "learning_rate": 0.00026328960008766884, + "loss": 2.7489, + "step": 824 + }, + { + "epoch": 0.23548458311869894, + "grad_norm": 1.9140625, + "learning_rate": 0.0002632004937179055, + "loss": 2.7493, + "step": 825 + }, + { + "epoch": 0.23577001897702465, + "grad_norm": 1.6328125, + "learning_rate": 0.00026311129445367255, + "loss": 2.7289, + "step": 826 + }, + { + "epoch": 0.23605545483535034, + "grad_norm": 2.15625, + "learning_rate": 0.0002630220023681687, + "loss": 2.7193, + "step": 827 + }, + { + "epoch": 0.23634089069367603, + "grad_norm": 2.0, + "learning_rate": 0.0002629326175346687, + "loss": 2.738, + "step": 828 + }, + { + "epoch": 0.23662632655200172, + "grad_norm": 1.921875, + "learning_rate": 0.0002628431400265235, + "loss": 2.7497, + "step": 829 + }, + { + "epoch": 0.23691176241032744, + "grad_norm": 1.8203125, + "learning_rate": 0.00026275356991715986, + "loss": 2.7239, + "step": 830 + }, + { + "epoch": 0.23719719826865313, + "grad_norm": 1.71875, + "learning_rate": 0.0002626639072800809, + "loss": 2.7372, + "step": 831 + }, + { + "epoch": 0.23748263412697881, + "grad_norm": 1.4921875, + "learning_rate": 0.00026257415218886536, + "loss": 2.7284, + "step": 832 + }, + { + "epoch": 0.2377680699853045, + "grad_norm": 2.046875, + "learning_rate": 0.00026248430471716795, + "loss": 2.7515, + "step": 833 + }, + { + "epoch": 0.23805350584363022, + "grad_norm": 1.8984375, + "learning_rate": 0.0002623943649387194, + "loss": 2.7412, + "step": 834 + }, + { + "epoch": 0.2383389417019559, + "grad_norm": 1.84375, + "learning_rate": 0.0002623043329273257, + "loss": 2.7339, + "step": 835 + }, + { + "epoch": 0.2386243775602816, + "grad_norm": 1.6953125, + "learning_rate": 0.0002622142087568691, + "loss": 2.7482, + "step": 836 + }, + { + "epoch": 0.2389098134186073, + "grad_norm": 1.7890625, + "learning_rate": 0.00026212399250130706, + "loss": 2.7411, + "step": 837 + }, + { + "epoch": 0.239195249276933, + "grad_norm": 1.5234375, + "learning_rate": 0.0002620336842346728, + "loss": 2.7394, + "step": 838 + }, + { + "epoch": 0.2394806851352587, + "grad_norm": 1.9375, + "learning_rate": 0.0002619432840310749, + "loss": 2.6938, + "step": 839 + }, + { + "epoch": 0.23976612099358438, + "grad_norm": 1.7265625, + "learning_rate": 0.00026185279196469757, + "loss": 2.7298, + "step": 840 + }, + { + "epoch": 0.24005155685191007, + "grad_norm": 1.8828125, + "learning_rate": 0.00026176220810980035, + "loss": 2.7237, + "step": 841 + }, + { + "epoch": 0.2403369927102358, + "grad_norm": 1.7890625, + "learning_rate": 0.00026167153254071795, + "loss": 2.742, + "step": 842 + }, + { + "epoch": 0.24062242856856147, + "grad_norm": 1.6015625, + "learning_rate": 0.0002615807653318605, + "loss": 2.7514, + "step": 843 + }, + { + "epoch": 0.24090786442688716, + "grad_norm": 1.4453125, + "learning_rate": 0.0002614899065577133, + "loss": 2.7606, + "step": 844 + }, + { + "epoch": 0.24119330028521285, + "grad_norm": 1.8125, + "learning_rate": 0.0002613989562928369, + "loss": 2.7474, + "step": 845 + }, + { + "epoch": 0.24147873614353857, + "grad_norm": 1.53125, + "learning_rate": 0.00026130791461186656, + "loss": 2.7309, + "step": 846 + }, + { + "epoch": 0.24176417200186426, + "grad_norm": 1.984375, + "learning_rate": 0.000261216781589513, + "loss": 2.726, + "step": 847 + }, + { + "epoch": 0.24204960786018995, + "grad_norm": 1.703125, + "learning_rate": 0.0002611255573005617, + "loss": 2.7471, + "step": 848 + }, + { + "epoch": 0.24233504371851564, + "grad_norm": 1.90625, + "learning_rate": 0.00026103424181987293, + "loss": 2.7328, + "step": 849 + }, + { + "epoch": 0.24262047957684135, + "grad_norm": 1.7578125, + "learning_rate": 0.00026094283522238204, + "loss": 2.755, + "step": 850 + }, + { + "epoch": 0.24290591543516704, + "grad_norm": 1.7890625, + "learning_rate": 0.00026085133758309883, + "loss": 2.7581, + "step": 851 + }, + { + "epoch": 0.24319135129349273, + "grad_norm": 1.59375, + "learning_rate": 0.00026075974897710815, + "loss": 2.7312, + "step": 852 + }, + { + "epoch": 0.24347678715181842, + "grad_norm": 1.9296875, + "learning_rate": 0.0002606680694795693, + "loss": 2.7274, + "step": 853 + }, + { + "epoch": 0.24376222301014414, + "grad_norm": 1.5, + "learning_rate": 0.0002605762991657163, + "loss": 2.7208, + "step": 854 + }, + { + "epoch": 0.24404765886846982, + "grad_norm": 2.296875, + "learning_rate": 0.00026048443811085744, + "loss": 2.7326, + "step": 855 + }, + { + "epoch": 0.2443330947267955, + "grad_norm": 1.9765625, + "learning_rate": 0.00026039248639037575, + "loss": 2.7559, + "step": 856 + }, + { + "epoch": 0.2446185305851212, + "grad_norm": 2.1875, + "learning_rate": 0.00026030044407972854, + "loss": 2.7389, + "step": 857 + }, + { + "epoch": 0.24490396644344692, + "grad_norm": 2.03125, + "learning_rate": 0.00026020831125444745, + "loss": 2.7434, + "step": 858 + }, + { + "epoch": 0.2451894023017726, + "grad_norm": 1.8828125, + "learning_rate": 0.0002601160879901384, + "loss": 2.745, + "step": 859 + }, + { + "epoch": 0.2454748381600983, + "grad_norm": 1.6640625, + "learning_rate": 0.0002600237743624816, + "loss": 2.74, + "step": 860 + }, + { + "epoch": 0.24576027401842399, + "grad_norm": 2.125, + "learning_rate": 0.00025993137044723135, + "loss": 2.736, + "step": 861 + }, + { + "epoch": 0.2460457098767497, + "grad_norm": 1.7734375, + "learning_rate": 0.0002598388763202161, + "loss": 2.7447, + "step": 862 + }, + { + "epoch": 0.2463311457350754, + "grad_norm": 2.1875, + "learning_rate": 0.0002597462920573381, + "loss": 2.7457, + "step": 863 + }, + { + "epoch": 0.24661658159340108, + "grad_norm": 2.0, + "learning_rate": 0.000259653617734574, + "loss": 2.7256, + "step": 864 + }, + { + "epoch": 0.24690201745172677, + "grad_norm": 1.953125, + "learning_rate": 0.00025956085342797395, + "loss": 2.7233, + "step": 865 + }, + { + "epoch": 0.24718745331005246, + "grad_norm": 1.7265625, + "learning_rate": 0.00025946799921366205, + "loss": 2.7471, + "step": 866 + }, + { + "epoch": 0.24747288916837817, + "grad_norm": 1.9296875, + "learning_rate": 0.0002593750551678364, + "loss": 2.7426, + "step": 867 + }, + { + "epoch": 0.24775832502670386, + "grad_norm": 1.4921875, + "learning_rate": 0.00025928202136676855, + "loss": 2.6968, + "step": 868 + }, + { + "epoch": 0.24804376088502955, + "grad_norm": 2.09375, + "learning_rate": 0.0002591888978868038, + "loss": 2.7192, + "step": 869 + }, + { + "epoch": 0.24832919674335524, + "grad_norm": 1.828125, + "learning_rate": 0.000259095684804361, + "loss": 2.7436, + "step": 870 + }, + { + "epoch": 0.24861463260168096, + "grad_norm": 1.9609375, + "learning_rate": 0.0002590023821959326, + "loss": 2.7627, + "step": 871 + }, + { + "epoch": 0.24890006846000665, + "grad_norm": 1.7421875, + "learning_rate": 0.00025890899013808455, + "loss": 2.7603, + "step": 872 + }, + { + "epoch": 0.24918550431833233, + "grad_norm": 1.6640625, + "learning_rate": 0.0002588155087074561, + "loss": 2.7315, + "step": 873 + }, + { + "epoch": 0.24947094017665802, + "grad_norm": 1.515625, + "learning_rate": 0.00025872193798075985, + "loss": 2.7302, + "step": 874 + }, + { + "epoch": 0.24975637603498374, + "grad_norm": 1.453125, + "learning_rate": 0.0002586282780347818, + "loss": 2.7236, + "step": 875 + }, + { + "epoch": 0.25004181189330943, + "grad_norm": 1.25, + "learning_rate": 0.00025853452894638093, + "loss": 2.7152, + "step": 876 + }, + { + "epoch": 0.2503272477516351, + "grad_norm": 1.3515625, + "learning_rate": 0.00025844069079248964, + "loss": 2.7169, + "step": 877 + }, + { + "epoch": 0.2506126836099608, + "grad_norm": 1.125, + "learning_rate": 0.00025834676365011326, + "loss": 2.7202, + "step": 878 + }, + { + "epoch": 0.2508981194682865, + "grad_norm": 1.6484375, + "learning_rate": 0.00025825274759633016, + "loss": 2.7239, + "step": 879 + }, + { + "epoch": 0.2511835553266122, + "grad_norm": 1.234375, + "learning_rate": 0.0002581586427082918, + "loss": 2.7023, + "step": 880 + }, + { + "epoch": 0.25146899118493793, + "grad_norm": 1.90625, + "learning_rate": 0.0002580644490632222, + "loss": 2.7203, + "step": 881 + }, + { + "epoch": 0.2517544270432636, + "grad_norm": 1.5234375, + "learning_rate": 0.0002579701667384187, + "loss": 2.7288, + "step": 882 + }, + { + "epoch": 0.2520398629015893, + "grad_norm": 1.90625, + "learning_rate": 0.00025787579581125107, + "loss": 2.7284, + "step": 883 + }, + { + "epoch": 0.252325298759915, + "grad_norm": 1.7265625, + "learning_rate": 0.00025778133635916183, + "loss": 2.7377, + "step": 884 + }, + { + "epoch": 0.2526107346182407, + "grad_norm": 1.75, + "learning_rate": 0.0002576867884596663, + "loss": 2.7267, + "step": 885 + }, + { + "epoch": 0.2528961704765664, + "grad_norm": 1.5859375, + "learning_rate": 0.00025759215219035213, + "loss": 2.723, + "step": 886 + }, + { + "epoch": 0.25318160633489206, + "grad_norm": 1.7109375, + "learning_rate": 0.00025749742762887977, + "loss": 2.7178, + "step": 887 + }, + { + "epoch": 0.25346704219321775, + "grad_norm": 1.3828125, + "learning_rate": 0.00025740261485298195, + "loss": 2.7387, + "step": 888 + }, + { + "epoch": 0.2537524780515435, + "grad_norm": 1.984375, + "learning_rate": 0.0002573077139404638, + "loss": 2.7513, + "step": 889 + }, + { + "epoch": 0.2540379139098692, + "grad_norm": 1.7265625, + "learning_rate": 0.0002572127249692028, + "loss": 2.7288, + "step": 890 + }, + { + "epoch": 0.2543233497681949, + "grad_norm": 1.734375, + "learning_rate": 0.00025711764801714874, + "loss": 2.7322, + "step": 891 + }, + { + "epoch": 0.25460878562652056, + "grad_norm": 1.6015625, + "learning_rate": 0.00025702248316232355, + "loss": 2.7598, + "step": 892 + }, + { + "epoch": 0.25489422148484625, + "grad_norm": 1.671875, + "learning_rate": 0.0002569272304828213, + "loss": 2.7304, + "step": 893 + }, + { + "epoch": 0.25517965734317194, + "grad_norm": 1.421875, + "learning_rate": 0.00025683189005680827, + "loss": 2.7288, + "step": 894 + }, + { + "epoch": 0.25546509320149763, + "grad_norm": 1.8203125, + "learning_rate": 0.0002567364619625224, + "loss": 2.753, + "step": 895 + }, + { + "epoch": 0.2557505290598233, + "grad_norm": 1.5390625, + "learning_rate": 0.00025664094627827393, + "loss": 2.7233, + "step": 896 + }, + { + "epoch": 0.25603596491814906, + "grad_norm": 1.8046875, + "learning_rate": 0.00025654534308244484, + "loss": 2.731, + "step": 897 + }, + { + "epoch": 0.25632140077647475, + "grad_norm": 1.6328125, + "learning_rate": 0.0002564496524534888, + "loss": 2.7177, + "step": 898 + }, + { + "epoch": 0.25660683663480044, + "grad_norm": 2.015625, + "learning_rate": 0.00025635387446993154, + "loss": 2.7327, + "step": 899 + }, + { + "epoch": 0.25689227249312613, + "grad_norm": 1.7890625, + "learning_rate": 0.0002562580092103702, + "loss": 2.7251, + "step": 900 + }, + { + "epoch": 0.2571777083514518, + "grad_norm": 1.90625, + "learning_rate": 0.00025616205675347355, + "loss": 2.7005, + "step": 901 + }, + { + "epoch": 0.2574631442097775, + "grad_norm": 1.8125, + "learning_rate": 0.00025606601717798207, + "loss": 2.7263, + "step": 902 + }, + { + "epoch": 0.2577485800681032, + "grad_norm": 1.6953125, + "learning_rate": 0.0002559698905627077, + "loss": 2.6863, + "step": 903 + }, + { + "epoch": 0.2580340159264289, + "grad_norm": 1.5703125, + "learning_rate": 0.00025587367698653367, + "loss": 2.718, + "step": 904 + }, + { + "epoch": 0.25831945178475463, + "grad_norm": 1.6640625, + "learning_rate": 0.0002557773765284148, + "loss": 2.7263, + "step": 905 + }, + { + "epoch": 0.2586048876430803, + "grad_norm": 1.546875, + "learning_rate": 0.0002556809892673769, + "loss": 2.7485, + "step": 906 + }, + { + "epoch": 0.258890323501406, + "grad_norm": 1.8359375, + "learning_rate": 0.0002555845152825173, + "loss": 2.6922, + "step": 907 + }, + { + "epoch": 0.2591757593597317, + "grad_norm": 1.71875, + "learning_rate": 0.00025548795465300426, + "loss": 2.7269, + "step": 908 + }, + { + "epoch": 0.2594611952180574, + "grad_norm": 1.6875, + "learning_rate": 0.0002553913074580774, + "loss": 2.7466, + "step": 909 + }, + { + "epoch": 0.25974663107638307, + "grad_norm": 1.6015625, + "learning_rate": 0.00025529457377704713, + "loss": 2.728, + "step": 910 + }, + { + "epoch": 0.26003206693470876, + "grad_norm": 1.703125, + "learning_rate": 0.0002551977536892951, + "loss": 2.7171, + "step": 911 + }, + { + "epoch": 0.26031750279303445, + "grad_norm": 1.4921875, + "learning_rate": 0.0002551008472742735, + "loss": 2.7028, + "step": 912 + }, + { + "epoch": 0.2606029386513602, + "grad_norm": 1.7578125, + "learning_rate": 0.00025500385461150565, + "loss": 2.7107, + "step": 913 + }, + { + "epoch": 0.2608883745096859, + "grad_norm": 1.65625, + "learning_rate": 0.0002549067757805856, + "loss": 2.7452, + "step": 914 + }, + { + "epoch": 0.26117381036801157, + "grad_norm": 1.515625, + "learning_rate": 0.00025480961086117815, + "loss": 2.7045, + "step": 915 + }, + { + "epoch": 0.26145924622633726, + "grad_norm": 1.4453125, + "learning_rate": 0.0002547123599330185, + "loss": 2.72, + "step": 916 + }, + { + "epoch": 0.26174468208466295, + "grad_norm": 1.640625, + "learning_rate": 0.00025461502307591274, + "loss": 2.7136, + "step": 917 + }, + { + "epoch": 0.26203011794298864, + "grad_norm": 1.515625, + "learning_rate": 0.0002545176003697372, + "loss": 2.7097, + "step": 918 + }, + { + "epoch": 0.2623155538013143, + "grad_norm": 1.6328125, + "learning_rate": 0.000254420091894439, + "loss": 2.7218, + "step": 919 + }, + { + "epoch": 0.26260098965964, + "grad_norm": 1.4921875, + "learning_rate": 0.0002543224977300352, + "loss": 2.6923, + "step": 920 + }, + { + "epoch": 0.26288642551796576, + "grad_norm": 1.6015625, + "learning_rate": 0.0002542248179566137, + "loss": 2.735, + "step": 921 + }, + { + "epoch": 0.26317186137629145, + "grad_norm": 1.375, + "learning_rate": 0.0002541270526543321, + "loss": 2.7211, + "step": 922 + }, + { + "epoch": 0.26345729723461714, + "grad_norm": 1.8203125, + "learning_rate": 0.00025402920190341864, + "loss": 2.73, + "step": 923 + }, + { + "epoch": 0.2637427330929428, + "grad_norm": 1.625, + "learning_rate": 0.0002539312657841714, + "loss": 2.7038, + "step": 924 + }, + { + "epoch": 0.2640281689512685, + "grad_norm": 1.65625, + "learning_rate": 0.0002538332443769587, + "loss": 2.7209, + "step": 925 + }, + { + "epoch": 0.2643136048095942, + "grad_norm": 2.875, + "learning_rate": 0.0002537351377622187, + "loss": 2.7053, + "step": 926 + }, + { + "epoch": 0.2645990406679199, + "grad_norm": 0.88671875, + "learning_rate": 0.00025363694602045957, + "loss": 2.7378, + "step": 927 + }, + { + "epoch": 0.2648844765262456, + "grad_norm": 2.1875, + "learning_rate": 0.0002535386692322593, + "loss": 2.7339, + "step": 928 + }, + { + "epoch": 0.2651699123845713, + "grad_norm": 1.875, + "learning_rate": 0.0002534403074782657, + "loss": 2.7474, + "step": 929 + }, + { + "epoch": 0.265455348242897, + "grad_norm": 1.7734375, + "learning_rate": 0.00025334186083919623, + "loss": 2.7283, + "step": 930 + }, + { + "epoch": 0.2657407841012227, + "grad_norm": 1.71875, + "learning_rate": 0.00025324332939583813, + "loss": 2.7195, + "step": 931 + }, + { + "epoch": 0.2660262199595484, + "grad_norm": 1.2578125, + "learning_rate": 0.0002531447132290482, + "loss": 2.7133, + "step": 932 + }, + { + "epoch": 0.2663116558178741, + "grad_norm": 1.484375, + "learning_rate": 0.00025304601241975266, + "loss": 2.737, + "step": 933 + }, + { + "epoch": 0.26659709167619977, + "grad_norm": 1.1171875, + "learning_rate": 0.0002529472270489473, + "loss": 2.7129, + "step": 934 + }, + { + "epoch": 0.26688252753452546, + "grad_norm": 1.828125, + "learning_rate": 0.0002528483571976973, + "loss": 2.7195, + "step": 935 + }, + { + "epoch": 0.26716796339285115, + "grad_norm": 1.28125, + "learning_rate": 0.00025274940294713706, + "loss": 2.694, + "step": 936 + }, + { + "epoch": 0.2674533992511769, + "grad_norm": 2.171875, + "learning_rate": 0.00025265036437847036, + "loss": 2.739, + "step": 937 + }, + { + "epoch": 0.2677388351095026, + "grad_norm": 1.8046875, + "learning_rate": 0.0002525512415729701, + "loss": 2.7, + "step": 938 + }, + { + "epoch": 0.26802427096782827, + "grad_norm": 1.4453125, + "learning_rate": 0.00025245203461197834, + "loss": 2.7329, + "step": 939 + }, + { + "epoch": 0.26830970682615396, + "grad_norm": 1.5390625, + "learning_rate": 0.0002523527435769062, + "loss": 2.7321, + "step": 940 + }, + { + "epoch": 0.26859514268447965, + "grad_norm": 1.25, + "learning_rate": 0.0002522533685492338, + "loss": 2.7274, + "step": 941 + }, + { + "epoch": 0.26888057854280534, + "grad_norm": 1.5703125, + "learning_rate": 0.0002521539096105101, + "loss": 2.719, + "step": 942 + }, + { + "epoch": 0.269166014401131, + "grad_norm": 1.203125, + "learning_rate": 0.00025205436684235313, + "loss": 2.7257, + "step": 943 + }, + { + "epoch": 0.2694514502594567, + "grad_norm": 1.703125, + "learning_rate": 0.0002519547403264494, + "loss": 2.7126, + "step": 944 + }, + { + "epoch": 0.2697368861177824, + "grad_norm": 1.359375, + "learning_rate": 0.00025185503014455443, + "loss": 2.7297, + "step": 945 + }, + { + "epoch": 0.27002232197610815, + "grad_norm": 1.8203125, + "learning_rate": 0.00025175523637849224, + "loss": 2.7324, + "step": 946 + }, + { + "epoch": 0.27030775783443384, + "grad_norm": 1.296875, + "learning_rate": 0.0002516553591101555, + "loss": 2.7367, + "step": 947 + }, + { + "epoch": 0.2705931936927595, + "grad_norm": 1.890625, + "learning_rate": 0.00025155539842150535, + "loss": 2.6977, + "step": 948 + }, + { + "epoch": 0.2708786295510852, + "grad_norm": 1.3125, + "learning_rate": 0.0002514553543945715, + "loss": 2.6864, + "step": 949 + }, + { + "epoch": 0.2711640654094109, + "grad_norm": 2.015625, + "learning_rate": 0.00025135522711145197, + "loss": 2.7111, + "step": 950 + }, + { + "epoch": 0.2714495012677366, + "grad_norm": 1.7265625, + "learning_rate": 0.000251255016654313, + "loss": 2.7124, + "step": 951 + }, + { + "epoch": 0.2717349371260623, + "grad_norm": 1.875, + "learning_rate": 0.0002511547231053893, + "loss": 2.6945, + "step": 952 + }, + { + "epoch": 0.27202037298438797, + "grad_norm": 1.765625, + "learning_rate": 0.00025105434654698356, + "loss": 2.7364, + "step": 953 + }, + { + "epoch": 0.2723058088427137, + "grad_norm": 1.53125, + "learning_rate": 0.00025095388706146676, + "loss": 2.7086, + "step": 954 + }, + { + "epoch": 0.2725912447010394, + "grad_norm": 1.4765625, + "learning_rate": 0.00025085334473127786, + "loss": 2.7037, + "step": 955 + }, + { + "epoch": 0.2728766805593651, + "grad_norm": 1.359375, + "learning_rate": 0.0002507527196389238, + "loss": 2.7295, + "step": 956 + }, + { + "epoch": 0.2731621164176908, + "grad_norm": 1.25, + "learning_rate": 0.0002506520118669794, + "loss": 2.6829, + "step": 957 + }, + { + "epoch": 0.27344755227601647, + "grad_norm": 1.3671875, + "learning_rate": 0.0002505512214980873, + "loss": 2.6869, + "step": 958 + }, + { + "epoch": 0.27373298813434216, + "grad_norm": 1.0390625, + "learning_rate": 0.0002504503486149581, + "loss": 2.6919, + "step": 959 + }, + { + "epoch": 0.27401842399266785, + "grad_norm": 1.65625, + "learning_rate": 0.00025034939330037, + "loss": 2.6851, + "step": 960 + }, + { + "epoch": 0.27430385985099354, + "grad_norm": 1.34375, + "learning_rate": 0.0002502483556371688, + "loss": 2.7326, + "step": 961 + }, + { + "epoch": 0.2745892957093193, + "grad_norm": 1.8515625, + "learning_rate": 0.00025014723570826794, + "loss": 2.7369, + "step": 962 + }, + { + "epoch": 0.27487473156764497, + "grad_norm": 1.6640625, + "learning_rate": 0.00025004603359664833, + "loss": 2.7398, + "step": 963 + }, + { + "epoch": 0.27516016742597066, + "grad_norm": 1.6875, + "learning_rate": 0.0002499447493853583, + "loss": 2.7145, + "step": 964 + }, + { + "epoch": 0.27544560328429635, + "grad_norm": 1.4921875, + "learning_rate": 0.00024984338315751366, + "loss": 2.733, + "step": 965 + }, + { + "epoch": 0.27573103914262204, + "grad_norm": 1.7578125, + "learning_rate": 0.00024974193499629745, + "loss": 2.707, + "step": 966 + }, + { + "epoch": 0.2760164750009477, + "grad_norm": 1.6171875, + "learning_rate": 0.00024964040498496, + "loss": 2.7282, + "step": 967 + }, + { + "epoch": 0.2763019108592734, + "grad_norm": 1.765625, + "learning_rate": 0.00024953879320681853, + "loss": 2.7208, + "step": 968 + }, + { + "epoch": 0.2765873467175991, + "grad_norm": 1.5625, + "learning_rate": 0.00024943709974525793, + "loss": 2.7021, + "step": 969 + }, + { + "epoch": 0.27687278257592485, + "grad_norm": 1.609375, + "learning_rate": 0.00024933532468372955, + "loss": 2.7056, + "step": 970 + }, + { + "epoch": 0.27715821843425054, + "grad_norm": 1.515625, + "learning_rate": 0.00024923346810575193, + "loss": 2.7342, + "step": 971 + }, + { + "epoch": 0.2774436542925762, + "grad_norm": 1.5703125, + "learning_rate": 0.0002491315300949106, + "loss": 2.7258, + "step": 972 + }, + { + "epoch": 0.2777290901509019, + "grad_norm": 1.40625, + "learning_rate": 0.00024902951073485784, + "loss": 2.7053, + "step": 973 + }, + { + "epoch": 0.2780145260092276, + "grad_norm": 1.609375, + "learning_rate": 0.00024892741010931264, + "loss": 2.7111, + "step": 974 + }, + { + "epoch": 0.2782999618675533, + "grad_norm": 1.390625, + "learning_rate": 0.0002488252283020606, + "loss": 2.6961, + "step": 975 + }, + { + "epoch": 0.278585397725879, + "grad_norm": 1.7421875, + "learning_rate": 0.00024872296539695427, + "loss": 2.7148, + "step": 976 + }, + { + "epoch": 0.27887083358420467, + "grad_norm": 1.5078125, + "learning_rate": 0.00024862062147791233, + "loss": 2.7192, + "step": 977 + }, + { + "epoch": 0.2791562694425304, + "grad_norm": 1.671875, + "learning_rate": 0.00024851819662892016, + "loss": 2.725, + "step": 978 + }, + { + "epoch": 0.2794417053008561, + "grad_norm": 1.546875, + "learning_rate": 0.0002484156909340296, + "loss": 2.7303, + "step": 979 + }, + { + "epoch": 0.2797271411591818, + "grad_norm": 1.703125, + "learning_rate": 0.00024831310447735874, + "loss": 2.6735, + "step": 980 + }, + { + "epoch": 0.2800125770175075, + "grad_norm": 1.4921875, + "learning_rate": 0.00024821043734309204, + "loss": 2.6935, + "step": 981 + }, + { + "epoch": 0.28029801287583317, + "grad_norm": 1.7890625, + "learning_rate": 0.0002481076896154799, + "loss": 2.7103, + "step": 982 + }, + { + "epoch": 0.28058344873415886, + "grad_norm": 1.6328125, + "learning_rate": 0.00024800486137883926, + "loss": 2.7239, + "step": 983 + }, + { + "epoch": 0.28086888459248455, + "grad_norm": 1.6875, + "learning_rate": 0.00024790195271755277, + "loss": 2.7289, + "step": 984 + }, + { + "epoch": 0.28115432045081024, + "grad_norm": 1.5078125, + "learning_rate": 0.0002477989637160694, + "loss": 2.7095, + "step": 985 + }, + { + "epoch": 0.281439756309136, + "grad_norm": 1.7578125, + "learning_rate": 0.0002476958944589037, + "loss": 2.6648, + "step": 986 + }, + { + "epoch": 0.28172519216746167, + "grad_norm": 1.546875, + "learning_rate": 0.0002475927450306363, + "loss": 2.666, + "step": 987 + }, + { + "epoch": 0.28201062802578736, + "grad_norm": 1.796875, + "learning_rate": 0.00024748951551591364, + "loss": 2.7152, + "step": 988 + }, + { + "epoch": 0.28229606388411305, + "grad_norm": 1.578125, + "learning_rate": 0.00024738620599944774, + "loss": 2.7102, + "step": 989 + }, + { + "epoch": 0.28258149974243874, + "grad_norm": 1.703125, + "learning_rate": 0.0002472828165660164, + "loss": 2.7055, + "step": 990 + }, + { + "epoch": 0.2828669356007644, + "grad_norm": 1.5625, + "learning_rate": 0.0002471793473004629, + "loss": 2.7004, + "step": 991 + }, + { + "epoch": 0.2831523714590901, + "grad_norm": 1.734375, + "learning_rate": 0.0002470757982876961, + "loss": 2.6998, + "step": 992 + }, + { + "epoch": 0.2834378073174158, + "grad_norm": 1.5546875, + "learning_rate": 0.00024697216961269035, + "loss": 2.7259, + "step": 993 + }, + { + "epoch": 0.28372324317574155, + "grad_norm": 1.6953125, + "learning_rate": 0.0002468684613604852, + "loss": 2.6939, + "step": 994 + }, + { + "epoch": 0.28400867903406724, + "grad_norm": 1.6015625, + "learning_rate": 0.00024676467361618563, + "loss": 2.7005, + "step": 995 + }, + { + "epoch": 0.2842941148923929, + "grad_norm": 1.6015625, + "learning_rate": 0.00024666080646496187, + "loss": 2.7153, + "step": 996 + }, + { + "epoch": 0.2845795507507186, + "grad_norm": 1.4296875, + "learning_rate": 0.0002465568599920493, + "loss": 2.7052, + "step": 997 + }, + { + "epoch": 0.2848649866090443, + "grad_norm": 1.6171875, + "learning_rate": 0.0002464528342827482, + "loss": 2.7191, + "step": 998 + }, + { + "epoch": 0.28515042246737, + "grad_norm": 1.5546875, + "learning_rate": 0.00024634872942242423, + "loss": 2.7117, + "step": 999 + }, + { + "epoch": 0.2854358583256957, + "grad_norm": 1.734375, + "learning_rate": 0.0002462445454965077, + "loss": 2.6923, + "step": 1000 + }, + { + "epoch": 0.2854358583256957, + "eval_loss": 2.571556806564331, + "eval_runtime": 5980.855, + "eval_samples_per_second": 10.749, + "eval_steps_per_second": 10.749, + "step": 1000 + }, + { + "epoch": 0.28572129418402137, + "grad_norm": 1.578125, + "learning_rate": 0.00024614028259049397, + "loss": 2.6922, + "step": 1001 + }, + { + "epoch": 0.2860067300423471, + "grad_norm": 1.5625, + "learning_rate": 0.0002460359407899431, + "loss": 2.7178, + "step": 1002 + }, + { + "epoch": 0.2862921659006728, + "grad_norm": 1.4609375, + "learning_rate": 0.00024593152018048, + "loss": 2.696, + "step": 1003 + }, + { + "epoch": 0.2865776017589985, + "grad_norm": 1.625, + "learning_rate": 0.00024582702084779414, + "loss": 2.6841, + "step": 1004 + }, + { + "epoch": 0.2868630376173242, + "grad_norm": 1.4140625, + "learning_rate": 0.00024572244287763976, + "loss": 2.6869, + "step": 1005 + }, + { + "epoch": 0.28714847347564987, + "grad_norm": 1.5546875, + "learning_rate": 0.0002456177863558354, + "loss": 2.7185, + "step": 1006 + }, + { + "epoch": 0.28743390933397556, + "grad_norm": 1.4140625, + "learning_rate": 0.00024551305136826424, + "loss": 2.69, + "step": 1007 + }, + { + "epoch": 0.28771934519230125, + "grad_norm": 1.6171875, + "learning_rate": 0.00024540823800087386, + "loss": 2.6593, + "step": 1008 + }, + { + "epoch": 0.28800478105062693, + "grad_norm": 1.3984375, + "learning_rate": 0.00024530334633967595, + "loss": 2.6818, + "step": 1009 + }, + { + "epoch": 0.2882902169089526, + "grad_norm": 1.5390625, + "learning_rate": 0.00024519837647074674, + "loss": 2.7043, + "step": 1010 + }, + { + "epoch": 0.28857565276727837, + "grad_norm": 1.40625, + "learning_rate": 0.00024509332848022636, + "loss": 2.7057, + "step": 1011 + }, + { + "epoch": 0.28886108862560406, + "grad_norm": 1.5, + "learning_rate": 0.0002449882024543193, + "loss": 2.6855, + "step": 1012 + }, + { + "epoch": 0.28914652448392975, + "grad_norm": 1.3515625, + "learning_rate": 0.00024488299847929385, + "loss": 2.7012, + "step": 1013 + }, + { + "epoch": 0.28943196034225543, + "grad_norm": 1.5390625, + "learning_rate": 0.0002447777166414825, + "loss": 2.7178, + "step": 1014 + }, + { + "epoch": 0.2897173962005811, + "grad_norm": 1.5625, + "learning_rate": 0.0002446723570272814, + "loss": 2.6926, + "step": 1015 + }, + { + "epoch": 0.2900028320589068, + "grad_norm": 1.21875, + "learning_rate": 0.00024456691972315076, + "loss": 2.6914, + "step": 1016 + }, + { + "epoch": 0.2902882679172325, + "grad_norm": 1.0390625, + "learning_rate": 0.0002444614048156144, + "loss": 2.6794, + "step": 1017 + }, + { + "epoch": 0.2905737037755582, + "grad_norm": 1.4375, + "learning_rate": 0.00024435581239125987, + "loss": 2.7046, + "step": 1018 + }, + { + "epoch": 0.29085913963388393, + "grad_norm": 1.09375, + "learning_rate": 0.0002442501425367382, + "loss": 2.6849, + "step": 1019 + }, + { + "epoch": 0.2911445754922096, + "grad_norm": 1.796875, + "learning_rate": 0.0002441443953387642, + "loss": 2.6808, + "step": 1020 + }, + { + "epoch": 0.2914300113505353, + "grad_norm": 1.65625, + "learning_rate": 0.000244038570884116, + "loss": 2.6968, + "step": 1021 + }, + { + "epoch": 0.291715447208861, + "grad_norm": 1.5, + "learning_rate": 0.00024393266925963505, + "loss": 2.6755, + "step": 1022 + }, + { + "epoch": 0.2920008830671867, + "grad_norm": 1.4765625, + "learning_rate": 0.00024382669055222634, + "loss": 2.7195, + "step": 1023 + }, + { + "epoch": 0.2922863189255124, + "grad_norm": 1.1484375, + "learning_rate": 0.000243720634848858, + "loss": 2.6943, + "step": 1024 + }, + { + "epoch": 0.29257175478383807, + "grad_norm": 1.1640625, + "learning_rate": 0.0002436145022365613, + "loss": 2.7172, + "step": 1025 + }, + { + "epoch": 0.29285719064216376, + "grad_norm": 1.390625, + "learning_rate": 0.00024350829280243074, + "loss": 2.7061, + "step": 1026 + }, + { + "epoch": 0.2931426265004895, + "grad_norm": 1.3359375, + "learning_rate": 0.00024340200663362368, + "loss": 2.6897, + "step": 1027 + }, + { + "epoch": 0.2934280623588152, + "grad_norm": 0.96484375, + "learning_rate": 0.00024329564381736068, + "loss": 2.691, + "step": 1028 + }, + { + "epoch": 0.2937134982171409, + "grad_norm": 0.8828125, + "learning_rate": 0.000243189204440925, + "loss": 2.7367, + "step": 1029 + }, + { + "epoch": 0.29399893407546657, + "grad_norm": 1.171875, + "learning_rate": 0.0002430826885916629, + "loss": 2.6964, + "step": 1030 + }, + { + "epoch": 0.29428436993379226, + "grad_norm": 1.1796875, + "learning_rate": 0.0002429760963569832, + "loss": 2.7204, + "step": 1031 + }, + { + "epoch": 0.29456980579211794, + "grad_norm": 1.90625, + "learning_rate": 0.00024286942782435753, + "loss": 2.7186, + "step": 1032 + }, + { + "epoch": 0.29485524165044363, + "grad_norm": 1.1328125, + "learning_rate": 0.0002427626830813202, + "loss": 2.6901, + "step": 1033 + }, + { + "epoch": 0.2951406775087693, + "grad_norm": 1.2890625, + "learning_rate": 0.0002426558622154679, + "loss": 2.7291, + "step": 1034 + }, + { + "epoch": 0.29542611336709507, + "grad_norm": 1.875, + "learning_rate": 0.0002425489653144598, + "loss": 2.717, + "step": 1035 + }, + { + "epoch": 0.29571154922542076, + "grad_norm": 0.71484375, + "learning_rate": 0.0002424419924660176, + "loss": 2.7074, + "step": 1036 + }, + { + "epoch": 0.29599698508374644, + "grad_norm": 2.03125, + "learning_rate": 0.00024233494375792524, + "loss": 2.7174, + "step": 1037 + }, + { + "epoch": 0.29628242094207213, + "grad_norm": 1.1640625, + "learning_rate": 0.00024222781927802888, + "loss": 2.6859, + "step": 1038 + }, + { + "epoch": 0.2965678568003978, + "grad_norm": 2.421875, + "learning_rate": 0.0002421206191142369, + "loss": 2.6916, + "step": 1039 + }, + { + "epoch": 0.2968532926587235, + "grad_norm": 1.8984375, + "learning_rate": 0.00024201334335451988, + "loss": 2.7098, + "step": 1040 + }, + { + "epoch": 0.2971387285170492, + "grad_norm": 2.09375, + "learning_rate": 0.0002419059920869102, + "loss": 2.7105, + "step": 1041 + }, + { + "epoch": 0.2974241643753749, + "grad_norm": 1.65625, + "learning_rate": 0.0002417985653995024, + "loss": 2.7329, + "step": 1042 + }, + { + "epoch": 0.29770960023370063, + "grad_norm": 2.328125, + "learning_rate": 0.0002416910633804529, + "loss": 2.6864, + "step": 1043 + }, + { + "epoch": 0.2979950360920263, + "grad_norm": 1.6640625, + "learning_rate": 0.00024158348611797985, + "loss": 2.6915, + "step": 1044 + }, + { + "epoch": 0.298280471950352, + "grad_norm": 2.578125, + "learning_rate": 0.0002414758337003632, + "loss": 2.71, + "step": 1045 + }, + { + "epoch": 0.2985659078086777, + "grad_norm": 2.421875, + "learning_rate": 0.00024136810621594454, + "loss": 2.7174, + "step": 1046 + }, + { + "epoch": 0.2988513436670034, + "grad_norm": 1.2578125, + "learning_rate": 0.0002412603037531271, + "loss": 2.7106, + "step": 1047 + }, + { + "epoch": 0.2991367795253291, + "grad_norm": 1.5390625, + "learning_rate": 0.00024115242640037569, + "loss": 2.7032, + "step": 1048 + }, + { + "epoch": 0.29942221538365477, + "grad_norm": 1.2421875, + "learning_rate": 0.0002410444742462164, + "loss": 2.6975, + "step": 1049 + }, + { + "epoch": 0.29970765124198046, + "grad_norm": 1.484375, + "learning_rate": 0.00024093644737923682, + "loss": 2.6909, + "step": 1050 + }, + { + "epoch": 0.2999930871003062, + "grad_norm": 1.1484375, + "learning_rate": 0.00024082834588808592, + "loss": 2.7097, + "step": 1051 + }, + { + "epoch": 0.3002785229586319, + "grad_norm": 1.640625, + "learning_rate": 0.0002407201698614738, + "loss": 2.7031, + "step": 1052 + }, + { + "epoch": 0.3005639588169576, + "grad_norm": 1.2734375, + "learning_rate": 0.0002406119193881718, + "loss": 2.6834, + "step": 1053 + }, + { + "epoch": 0.30084939467528327, + "grad_norm": 1.953125, + "learning_rate": 0.00024050359455701217, + "loss": 2.7092, + "step": 1054 + }, + { + "epoch": 0.30113483053360895, + "grad_norm": 1.7734375, + "learning_rate": 0.00024039519545688846, + "loss": 2.6838, + "step": 1055 + }, + { + "epoch": 0.30142026639193464, + "grad_norm": 1.7265625, + "learning_rate": 0.00024028672217675493, + "loss": 2.7051, + "step": 1056 + }, + { + "epoch": 0.30170570225026033, + "grad_norm": 1.5625, + "learning_rate": 0.00024017817480562686, + "loss": 2.698, + "step": 1057 + }, + { + "epoch": 0.301991138108586, + "grad_norm": 1.59375, + "learning_rate": 0.00024006955343258032, + "loss": 2.6918, + "step": 1058 + }, + { + "epoch": 0.30227657396691177, + "grad_norm": 1.46875, + "learning_rate": 0.00023996085814675198, + "loss": 2.7027, + "step": 1059 + }, + { + "epoch": 0.30256200982523745, + "grad_norm": 1.34375, + "learning_rate": 0.0002398520890373393, + "loss": 2.6585, + "step": 1060 + }, + { + "epoch": 0.30284744568356314, + "grad_norm": 1.3671875, + "learning_rate": 0.00023974324619360028, + "loss": 2.7134, + "step": 1061 + }, + { + "epoch": 0.30313288154188883, + "grad_norm": 1.1328125, + "learning_rate": 0.00023963432970485333, + "loss": 2.7017, + "step": 1062 + }, + { + "epoch": 0.3034183174002145, + "grad_norm": 1.328125, + "learning_rate": 0.0002395253396604775, + "loss": 2.7121, + "step": 1063 + }, + { + "epoch": 0.3037037532585402, + "grad_norm": 1.1015625, + "learning_rate": 0.00023941627614991205, + "loss": 2.6666, + "step": 1064 + }, + { + "epoch": 0.3039891891168659, + "grad_norm": 1.3203125, + "learning_rate": 0.00023930713926265652, + "loss": 2.6927, + "step": 1065 + }, + { + "epoch": 0.3042746249751916, + "grad_norm": 1.0546875, + "learning_rate": 0.00023919792908827072, + "loss": 2.6844, + "step": 1066 + }, + { + "epoch": 0.30456006083351733, + "grad_norm": 1.3125, + "learning_rate": 0.00023908864571637464, + "loss": 2.6666, + "step": 1067 + }, + { + "epoch": 0.304845496691843, + "grad_norm": 1.03125, + "learning_rate": 0.00023897928923664825, + "loss": 2.6676, + "step": 1068 + }, + { + "epoch": 0.3051309325501687, + "grad_norm": 1.3671875, + "learning_rate": 0.00023886985973883157, + "loss": 2.7065, + "step": 1069 + }, + { + "epoch": 0.3054163684084944, + "grad_norm": 1.09375, + "learning_rate": 0.00023876035731272444, + "loss": 2.6579, + "step": 1070 + }, + { + "epoch": 0.3057018042668201, + "grad_norm": 1.65625, + "learning_rate": 0.00023865078204818676, + "loss": 2.6919, + "step": 1071 + }, + { + "epoch": 0.3059872401251458, + "grad_norm": 1.3515625, + "learning_rate": 0.0002385411340351379, + "loss": 2.6779, + "step": 1072 + }, + { + "epoch": 0.30627267598347147, + "grad_norm": 1.59375, + "learning_rate": 0.00023843141336355725, + "loss": 2.6798, + "step": 1073 + }, + { + "epoch": 0.30655811184179715, + "grad_norm": 1.4609375, + "learning_rate": 0.0002383216201234836, + "loss": 2.6775, + "step": 1074 + }, + { + "epoch": 0.3068435477001229, + "grad_norm": 1.4765625, + "learning_rate": 0.00023821175440501535, + "loss": 2.693, + "step": 1075 + }, + { + "epoch": 0.3071289835584486, + "grad_norm": 1.328125, + "learning_rate": 0.00023810181629831042, + "loss": 2.6807, + "step": 1076 + }, + { + "epoch": 0.3074144194167743, + "grad_norm": 1.3125, + "learning_rate": 0.0002379918058935861, + "loss": 2.6583, + "step": 1077 + }, + { + "epoch": 0.30769985527509996, + "grad_norm": 1.1171875, + "learning_rate": 0.00023788172328111903, + "loss": 2.6784, + "step": 1078 + }, + { + "epoch": 0.30798529113342565, + "grad_norm": 1.40625, + "learning_rate": 0.00023777156855124505, + "loss": 2.6992, + "step": 1079 + }, + { + "epoch": 0.30827072699175134, + "grad_norm": 1.0390625, + "learning_rate": 0.00023766134179435921, + "loss": 2.7007, + "step": 1080 + }, + { + "epoch": 0.30855616285007703, + "grad_norm": 1.5390625, + "learning_rate": 0.0002375510431009157, + "loss": 2.698, + "step": 1081 + }, + { + "epoch": 0.3088415987084027, + "grad_norm": 1.2109375, + "learning_rate": 0.00023744067256142775, + "loss": 2.6982, + "step": 1082 + }, + { + "epoch": 0.3091270345667284, + "grad_norm": 1.7421875, + "learning_rate": 0.00023733023026646744, + "loss": 2.732, + "step": 1083 + }, + { + "epoch": 0.30941247042505415, + "grad_norm": 1.53125, + "learning_rate": 0.00023721971630666589, + "loss": 2.7234, + "step": 1084 + }, + { + "epoch": 0.30969790628337984, + "grad_norm": 1.3828125, + "learning_rate": 0.00023710913077271286, + "loss": 2.6996, + "step": 1085 + }, + { + "epoch": 0.30998334214170553, + "grad_norm": 1.3359375, + "learning_rate": 0.00023699847375535698, + "loss": 2.7038, + "step": 1086 + }, + { + "epoch": 0.3102687780000312, + "grad_norm": 1.296875, + "learning_rate": 0.00023688774534540554, + "loss": 2.6705, + "step": 1087 + }, + { + "epoch": 0.3105542138583569, + "grad_norm": 1.1484375, + "learning_rate": 0.0002367769456337243, + "loss": 2.6632, + "step": 1088 + }, + { + "epoch": 0.3108396497166826, + "grad_norm": 1.296875, + "learning_rate": 0.00023666607471123767, + "loss": 2.6572, + "step": 1089 + }, + { + "epoch": 0.3111250855750083, + "grad_norm": 1.09375, + "learning_rate": 0.0002365551326689283, + "loss": 2.68, + "step": 1090 + }, + { + "epoch": 0.311410521433334, + "grad_norm": 1.625, + "learning_rate": 0.0002364441195978375, + "loss": 2.6704, + "step": 1091 + }, + { + "epoch": 0.3116959572916597, + "grad_norm": 1.359375, + "learning_rate": 0.0002363330355890646, + "loss": 2.6514, + "step": 1092 + }, + { + "epoch": 0.3119813931499854, + "grad_norm": 1.4765625, + "learning_rate": 0.00023622188073376728, + "loss": 2.6773, + "step": 1093 + }, + { + "epoch": 0.3122668290083111, + "grad_norm": 1.34375, + "learning_rate": 0.00023611065512316127, + "loss": 2.6896, + "step": 1094 + }, + { + "epoch": 0.3125522648666368, + "grad_norm": 1.3515625, + "learning_rate": 0.00023599935884852045, + "loss": 2.7068, + "step": 1095 + }, + { + "epoch": 0.3128377007249625, + "grad_norm": 1.21875, + "learning_rate": 0.00023588799200117662, + "loss": 2.6837, + "step": 1096 + }, + { + "epoch": 0.31312313658328816, + "grad_norm": 1.3828125, + "learning_rate": 0.00023577655467251963, + "loss": 2.6873, + "step": 1097 + }, + { + "epoch": 0.31340857244161385, + "grad_norm": 1.234375, + "learning_rate": 0.0002356650469539969, + "loss": 2.6891, + "step": 1098 + }, + { + "epoch": 0.31369400829993954, + "grad_norm": 1.296875, + "learning_rate": 0.0002355534689371139, + "loss": 2.6888, + "step": 1099 + }, + { + "epoch": 0.3139794441582653, + "grad_norm": 1.1796875, + "learning_rate": 0.00023544182071343363, + "loss": 2.6745, + "step": 1100 + }, + { + "epoch": 0.314264880016591, + "grad_norm": 1.3046875, + "learning_rate": 0.00023533010237457674, + "loss": 2.6668, + "step": 1101 + }, + { + "epoch": 0.31455031587491666, + "grad_norm": 1.1171875, + "learning_rate": 0.00023521831401222132, + "loss": 2.6679, + "step": 1102 + }, + { + "epoch": 0.31483575173324235, + "grad_norm": 1.578125, + "learning_rate": 0.00023510645571810316, + "loss": 2.693, + "step": 1103 + }, + { + "epoch": 0.31512118759156804, + "grad_norm": 1.34375, + "learning_rate": 0.00023499452758401525, + "loss": 2.6966, + "step": 1104 + }, + { + "epoch": 0.31540662344989373, + "grad_norm": 1.59375, + "learning_rate": 0.00023488252970180792, + "loss": 2.6786, + "step": 1105 + }, + { + "epoch": 0.3156920593082194, + "grad_norm": 1.3984375, + "learning_rate": 0.00023477046216338875, + "loss": 2.6579, + "step": 1106 + }, + { + "epoch": 0.3159774951665451, + "grad_norm": 1.515625, + "learning_rate": 0.0002346583250607225, + "loss": 2.6717, + "step": 1107 + }, + { + "epoch": 0.31626293102487085, + "grad_norm": 1.421875, + "learning_rate": 0.00023454611848583104, + "loss": 2.6939, + "step": 1108 + }, + { + "epoch": 0.31654836688319654, + "grad_norm": 1.390625, + "learning_rate": 0.00023443384253079308, + "loss": 2.658, + "step": 1109 + }, + { + "epoch": 0.31683380274152223, + "grad_norm": 1.21875, + "learning_rate": 0.00023432149728774455, + "loss": 2.6733, + "step": 1110 + }, + { + "epoch": 0.3171192385998479, + "grad_norm": 1.5546875, + "learning_rate": 0.000234209082848878, + "loss": 2.6814, + "step": 1111 + }, + { + "epoch": 0.3174046744581736, + "grad_norm": 1.2421875, + "learning_rate": 0.00023409659930644287, + "loss": 2.67, + "step": 1112 + }, + { + "epoch": 0.3176901103164993, + "grad_norm": 1.8359375, + "learning_rate": 0.00023398404675274522, + "loss": 2.6662, + "step": 1113 + }, + { + "epoch": 0.317975546174825, + "grad_norm": 1.7578125, + "learning_rate": 0.00023387142528014798, + "loss": 2.6935, + "step": 1114 + }, + { + "epoch": 0.3182609820331507, + "grad_norm": 1.296875, + "learning_rate": 0.00023375873498107026, + "loss": 2.6746, + "step": 1115 + }, + { + "epoch": 0.3185464178914764, + "grad_norm": 1.3125, + "learning_rate": 0.00023364597594798802, + "loss": 2.6977, + "step": 1116 + }, + { + "epoch": 0.3188318537498021, + "grad_norm": 1.453125, + "learning_rate": 0.0002335331482734333, + "loss": 2.6889, + "step": 1117 + }, + { + "epoch": 0.3191172896081278, + "grad_norm": 1.09375, + "learning_rate": 0.00023342025204999472, + "loss": 2.6725, + "step": 1118 + }, + { + "epoch": 0.3194027254664535, + "grad_norm": 1.8203125, + "learning_rate": 0.0002333072873703171, + "loss": 2.669, + "step": 1119 + }, + { + "epoch": 0.3196881613247792, + "grad_norm": 1.640625, + "learning_rate": 0.00023319425432710136, + "loss": 2.691, + "step": 1120 + }, + { + "epoch": 0.31997359718310486, + "grad_norm": 1.5859375, + "learning_rate": 0.0002330811530131045, + "loss": 2.6734, + "step": 1121 + }, + { + "epoch": 0.32025903304143055, + "grad_norm": 1.53125, + "learning_rate": 0.0002329679835211397, + "loss": 2.6915, + "step": 1122 + }, + { + "epoch": 0.32054446889975624, + "grad_norm": 1.421875, + "learning_rate": 0.00023285474594407585, + "loss": 2.6766, + "step": 1123 + }, + { + "epoch": 0.320829904758082, + "grad_norm": 1.2890625, + "learning_rate": 0.000232741440374838, + "loss": 2.6737, + "step": 1124 + }, + { + "epoch": 0.3211153406164077, + "grad_norm": 1.484375, + "learning_rate": 0.00023262806690640673, + "loss": 2.6618, + "step": 1125 + }, + { + "epoch": 0.32140077647473336, + "grad_norm": 1.2578125, + "learning_rate": 0.00023251462563181853, + "loss": 2.7, + "step": 1126 + }, + { + "epoch": 0.32168621233305905, + "grad_norm": 1.6484375, + "learning_rate": 0.00023240111664416544, + "loss": 2.6777, + "step": 1127 + }, + { + "epoch": 0.32197164819138474, + "grad_norm": 1.4765625, + "learning_rate": 0.0002322875400365951, + "loss": 2.6749, + "step": 1128 + }, + { + "epoch": 0.32225708404971043, + "grad_norm": 1.5703125, + "learning_rate": 0.00023217389590231058, + "loss": 2.6936, + "step": 1129 + }, + { + "epoch": 0.3225425199080361, + "grad_norm": 1.3359375, + "learning_rate": 0.00023206018433457045, + "loss": 2.6419, + "step": 1130 + }, + { + "epoch": 0.3228279557663618, + "grad_norm": 1.453125, + "learning_rate": 0.00023194640542668855, + "loss": 2.6704, + "step": 1131 + }, + { + "epoch": 0.32311339162468755, + "grad_norm": 1.3125, + "learning_rate": 0.00023183255927203405, + "loss": 2.7011, + "step": 1132 + }, + { + "epoch": 0.32339882748301324, + "grad_norm": 1.40625, + "learning_rate": 0.00023171864596403116, + "loss": 2.683, + "step": 1133 + }, + { + "epoch": 0.32368426334133893, + "grad_norm": 1.1875, + "learning_rate": 0.00023160466559615946, + "loss": 2.7078, + "step": 1134 + }, + { + "epoch": 0.3239696991996646, + "grad_norm": 1.2734375, + "learning_rate": 0.00023149061826195327, + "loss": 2.6919, + "step": 1135 + }, + { + "epoch": 0.3242551350579903, + "grad_norm": 1.09375, + "learning_rate": 0.00023137650405500202, + "loss": 2.6554, + "step": 1136 + }, + { + "epoch": 0.324540570916316, + "grad_norm": 1.3046875, + "learning_rate": 0.00023126232306895, + "loss": 2.6734, + "step": 1137 + }, + { + "epoch": 0.3248260067746417, + "grad_norm": 1.1484375, + "learning_rate": 0.0002311480753974963, + "loss": 2.6794, + "step": 1138 + }, + { + "epoch": 0.3251114426329674, + "grad_norm": 1.296875, + "learning_rate": 0.00023103376113439472, + "loss": 2.6802, + "step": 1139 + }, + { + "epoch": 0.3253968784912931, + "grad_norm": 1.046875, + "learning_rate": 0.0002309193803734537, + "loss": 2.6811, + "step": 1140 + }, + { + "epoch": 0.3256823143496188, + "grad_norm": 1.5, + "learning_rate": 0.00023080493320853628, + "loss": 2.671, + "step": 1141 + }, + { + "epoch": 0.3259677502079445, + "grad_norm": 1.0859375, + "learning_rate": 0.00023069041973355992, + "loss": 2.6759, + "step": 1142 + }, + { + "epoch": 0.3262531860662702, + "grad_norm": 1.5859375, + "learning_rate": 0.00023057584004249662, + "loss": 2.682, + "step": 1143 + }, + { + "epoch": 0.3265386219245959, + "grad_norm": 1.3515625, + "learning_rate": 0.00023046119422937258, + "loss": 2.6591, + "step": 1144 + }, + { + "epoch": 0.32682405778292156, + "grad_norm": 1.6171875, + "learning_rate": 0.00023034648238826836, + "loss": 2.6607, + "step": 1145 + }, + { + "epoch": 0.32710949364124725, + "grad_norm": 1.4140625, + "learning_rate": 0.00023023170461331863, + "loss": 2.6512, + "step": 1146 + }, + { + "epoch": 0.32739492949957294, + "grad_norm": 1.6171875, + "learning_rate": 0.0002301168609987123, + "loss": 2.6913, + "step": 1147 + }, + { + "epoch": 0.3276803653578987, + "grad_norm": 1.46875, + "learning_rate": 0.00023000195163869216, + "loss": 2.6783, + "step": 1148 + }, + { + "epoch": 0.3279658012162244, + "grad_norm": 1.5546875, + "learning_rate": 0.0002298869766275549, + "loss": 2.6467, + "step": 1149 + }, + { + "epoch": 0.32825123707455006, + "grad_norm": 1.40625, + "learning_rate": 0.00022977193605965143, + "loss": 2.7, + "step": 1150 + }, + { + "epoch": 0.32853667293287575, + "grad_norm": 1.4453125, + "learning_rate": 0.000229656830029386, + "loss": 2.6604, + "step": 1151 + }, + { + "epoch": 0.32882210879120144, + "grad_norm": 1.328125, + "learning_rate": 0.0002295416586312169, + "loss": 2.6538, + "step": 1152 + }, + { + "epoch": 0.32910754464952713, + "grad_norm": 1.28125, + "learning_rate": 0.00022942642195965596, + "loss": 2.69, + "step": 1153 + }, + { + "epoch": 0.3293929805078528, + "grad_norm": 1.2109375, + "learning_rate": 0.0002293111201092686, + "loss": 2.6806, + "step": 1154 + }, + { + "epoch": 0.3296784163661785, + "grad_norm": 1.203125, + "learning_rate": 0.00022919575317467358, + "loss": 2.6815, + "step": 1155 + }, + { + "epoch": 0.3299638522245042, + "grad_norm": 1.0390625, + "learning_rate": 0.0002290803212505433, + "loss": 2.6887, + "step": 1156 + }, + { + "epoch": 0.33024928808282994, + "grad_norm": 1.5078125, + "learning_rate": 0.00022896482443160335, + "loss": 2.6799, + "step": 1157 + }, + { + "epoch": 0.33053472394115563, + "grad_norm": 1.34375, + "learning_rate": 0.00022884926281263265, + "loss": 2.6802, + "step": 1158 + }, + { + "epoch": 0.3308201597994813, + "grad_norm": 1.3984375, + "learning_rate": 0.00022873363648846318, + "loss": 2.6585, + "step": 1159 + }, + { + "epoch": 0.331105595657807, + "grad_norm": 1.3203125, + "learning_rate": 0.00022861794555398016, + "loss": 2.6746, + "step": 1160 + }, + { + "epoch": 0.3313910315161327, + "grad_norm": 1.40625, + "learning_rate": 0.0002285021901041217, + "loss": 2.6856, + "step": 1161 + }, + { + "epoch": 0.3316764673744584, + "grad_norm": 1.234375, + "learning_rate": 0.000228386370233879, + "loss": 2.6456, + "step": 1162 + }, + { + "epoch": 0.3319619032327841, + "grad_norm": 1.484375, + "learning_rate": 0.00022827048603829596, + "loss": 2.6973, + "step": 1163 + }, + { + "epoch": 0.33224733909110976, + "grad_norm": 1.3359375, + "learning_rate": 0.0002281545376124694, + "loss": 2.665, + "step": 1164 + }, + { + "epoch": 0.3325327749494355, + "grad_norm": 1.515625, + "learning_rate": 0.00022803852505154867, + "loss": 2.666, + "step": 1165 + }, + { + "epoch": 0.3328182108077612, + "grad_norm": 1.390625, + "learning_rate": 0.00022792244845073608, + "loss": 2.6748, + "step": 1166 + }, + { + "epoch": 0.3331036466660869, + "grad_norm": 1.40625, + "learning_rate": 0.00022780630790528617, + "loss": 2.6593, + "step": 1167 + }, + { + "epoch": 0.33338908252441257, + "grad_norm": 1.296875, + "learning_rate": 0.00022769010351050606, + "loss": 2.6485, + "step": 1168 + }, + { + "epoch": 0.33367451838273826, + "grad_norm": 1.375, + "learning_rate": 0.00022757383536175529, + "loss": 2.6684, + "step": 1169 + }, + { + "epoch": 0.33395995424106395, + "grad_norm": 1.203125, + "learning_rate": 0.00022745750355444573, + "loss": 2.6508, + "step": 1170 + }, + { + "epoch": 0.33424539009938964, + "grad_norm": 1.4453125, + "learning_rate": 0.00022734110818404144, + "loss": 2.6546, + "step": 1171 + }, + { + "epoch": 0.3345308259577153, + "grad_norm": 1.3828125, + "learning_rate": 0.00022722464934605869, + "loss": 2.6864, + "step": 1172 + }, + { + "epoch": 0.33481626181604107, + "grad_norm": 1.5, + "learning_rate": 0.00022710812713606582, + "loss": 2.6611, + "step": 1173 + }, + { + "epoch": 0.33510169767436676, + "grad_norm": 1.328125, + "learning_rate": 0.00022699154164968307, + "loss": 2.6822, + "step": 1174 + }, + { + "epoch": 0.33538713353269245, + "grad_norm": 1.3671875, + "learning_rate": 0.0002268748929825828, + "loss": 2.6522, + "step": 1175 + }, + { + "epoch": 0.33567256939101814, + "grad_norm": 1.25, + "learning_rate": 0.0002267581812304891, + "loss": 2.6546, + "step": 1176 + }, + { + "epoch": 0.3359580052493438, + "grad_norm": 1.390625, + "learning_rate": 0.00022664140648917782, + "loss": 2.6711, + "step": 1177 + }, + { + "epoch": 0.3362434411076695, + "grad_norm": 1.15625, + "learning_rate": 0.00022652456885447652, + "loss": 2.6533, + "step": 1178 + }, + { + "epoch": 0.3365288769659952, + "grad_norm": 1.53125, + "learning_rate": 0.0002264076684222644, + "loss": 2.6659, + "step": 1179 + }, + { + "epoch": 0.3368143128243209, + "grad_norm": 1.359375, + "learning_rate": 0.00022629070528847216, + "loss": 2.6843, + "step": 1180 + }, + { + "epoch": 0.33709974868264664, + "grad_norm": 1.4375, + "learning_rate": 0.00022617367954908194, + "loss": 2.6654, + "step": 1181 + }, + { + "epoch": 0.3373851845409723, + "grad_norm": 1.2109375, + "learning_rate": 0.00022605659130012733, + "loss": 2.6624, + "step": 1182 + }, + { + "epoch": 0.337670620399298, + "grad_norm": 1.3828125, + "learning_rate": 0.00022593944063769314, + "loss": 2.6839, + "step": 1183 + }, + { + "epoch": 0.3379560562576237, + "grad_norm": 1.21875, + "learning_rate": 0.0002258222276579154, + "loss": 2.6787, + "step": 1184 + }, + { + "epoch": 0.3382414921159494, + "grad_norm": 1.375, + "learning_rate": 0.00022570495245698128, + "loss": 2.6928, + "step": 1185 + }, + { + "epoch": 0.3385269279742751, + "grad_norm": 1.2109375, + "learning_rate": 0.00022558761513112913, + "loss": 2.6999, + "step": 1186 + }, + { + "epoch": 0.33881236383260077, + "grad_norm": 1.3984375, + "learning_rate": 0.00022547021577664814, + "loss": 2.6904, + "step": 1187 + }, + { + "epoch": 0.33909779969092646, + "grad_norm": 1.171875, + "learning_rate": 0.00022535275448987832, + "loss": 2.6623, + "step": 1188 + }, + { + "epoch": 0.3393832355492522, + "grad_norm": 1.2734375, + "learning_rate": 0.00022523523136721085, + "loss": 2.6658, + "step": 1189 + }, + { + "epoch": 0.3396686714075779, + "grad_norm": 1.1171875, + "learning_rate": 0.00022511764650508728, + "loss": 2.6547, + "step": 1190 + }, + { + "epoch": 0.3399541072659036, + "grad_norm": 1.3359375, + "learning_rate": 0.000225, + "loss": 2.6677, + "step": 1191 + }, + { + "epoch": 0.34023954312422927, + "grad_norm": 1.171875, + "learning_rate": 0.00022488229194849192, + "loss": 2.6869, + "step": 1192 + }, + { + "epoch": 0.34052497898255496, + "grad_norm": 1.40625, + "learning_rate": 0.00022476452244715663, + "loss": 2.6773, + "step": 1193 + }, + { + "epoch": 0.34081041484088065, + "grad_norm": 1.1875, + "learning_rate": 0.00022464669159263793, + "loss": 2.6669, + "step": 1194 + }, + { + "epoch": 0.34109585069920634, + "grad_norm": 1.3515625, + "learning_rate": 0.00022452879948162998, + "loss": 2.64, + "step": 1195 + }, + { + "epoch": 0.341381286557532, + "grad_norm": 1.203125, + "learning_rate": 0.0002244108462108774, + "loss": 2.6452, + "step": 1196 + }, + { + "epoch": 0.34166672241585777, + "grad_norm": 1.3359375, + "learning_rate": 0.00022429283187717485, + "loss": 2.6339, + "step": 1197 + }, + { + "epoch": 0.34195215827418346, + "grad_norm": 1.1640625, + "learning_rate": 0.00022417475657736705, + "loss": 2.6572, + "step": 1198 + }, + { + "epoch": 0.34223759413250915, + "grad_norm": 1.3125, + "learning_rate": 0.00022405662040834895, + "loss": 2.646, + "step": 1199 + }, + { + "epoch": 0.34252302999083484, + "grad_norm": 1.1796875, + "learning_rate": 0.00022393842346706523, + "loss": 2.6676, + "step": 1200 + }, + { + "epoch": 0.3428084658491605, + "grad_norm": 1.171875, + "learning_rate": 0.00022382016585051058, + "loss": 2.6574, + "step": 1201 + }, + { + "epoch": 0.3430939017074862, + "grad_norm": 1.1171875, + "learning_rate": 0.00022370184765572944, + "loss": 2.6481, + "step": 1202 + }, + { + "epoch": 0.3433793375658119, + "grad_norm": 1.15625, + "learning_rate": 0.00022358346897981596, + "loss": 2.675, + "step": 1203 + }, + { + "epoch": 0.3436647734241376, + "grad_norm": 1.09375, + "learning_rate": 0.0002234650299199139, + "loss": 2.6475, + "step": 1204 + }, + { + "epoch": 0.34395020928246334, + "grad_norm": 1.0234375, + "learning_rate": 0.00022334653057321663, + "loss": 2.6372, + "step": 1205 + }, + { + "epoch": 0.344235645140789, + "grad_norm": 0.90625, + "learning_rate": 0.00022322797103696692, + "loss": 2.657, + "step": 1206 + }, + { + "epoch": 0.3445210809991147, + "grad_norm": 0.98828125, + "learning_rate": 0.00022310935140845706, + "loss": 2.6606, + "step": 1207 + }, + { + "epoch": 0.3448065168574404, + "grad_norm": 0.8515625, + "learning_rate": 0.0002229906717850284, + "loss": 2.6751, + "step": 1208 + }, + { + "epoch": 0.3450919527157661, + "grad_norm": 0.9140625, + "learning_rate": 0.00022287193226407185, + "loss": 2.6703, + "step": 1209 + }, + { + "epoch": 0.3453773885740918, + "grad_norm": 0.81640625, + "learning_rate": 0.00022275313294302726, + "loss": 2.6554, + "step": 1210 + }, + { + "epoch": 0.34566282443241747, + "grad_norm": 0.9375, + "learning_rate": 0.00022263427391938358, + "loss": 2.6401, + "step": 1211 + }, + { + "epoch": 0.34594826029074316, + "grad_norm": 0.78515625, + "learning_rate": 0.00022251535529067877, + "loss": 2.6659, + "step": 1212 + }, + { + "epoch": 0.3462336961490689, + "grad_norm": 0.984375, + "learning_rate": 0.00022239637715449977, + "loss": 2.6972, + "step": 1213 + }, + { + "epoch": 0.3465191320073946, + "grad_norm": 0.82421875, + "learning_rate": 0.0002222773396084822, + "loss": 2.6545, + "step": 1214 + }, + { + "epoch": 0.3468045678657203, + "grad_norm": 0.80859375, + "learning_rate": 0.0002221582427503106, + "loss": 2.6515, + "step": 1215 + }, + { + "epoch": 0.34709000372404597, + "grad_norm": 0.6953125, + "learning_rate": 0.00022203908667771808, + "loss": 2.6517, + "step": 1216 + }, + { + "epoch": 0.34737543958237166, + "grad_norm": 0.73828125, + "learning_rate": 0.00022191987148848636, + "loss": 2.6596, + "step": 1217 + }, + { + "epoch": 0.34766087544069735, + "grad_norm": 0.6640625, + "learning_rate": 0.0002218005972804457, + "loss": 2.6795, + "step": 1218 + }, + { + "epoch": 0.34794631129902304, + "grad_norm": 0.73828125, + "learning_rate": 0.00022168126415147478, + "loss": 2.6416, + "step": 1219 + }, + { + "epoch": 0.3482317471573487, + "grad_norm": 0.71875, + "learning_rate": 0.00022156187219950059, + "loss": 2.6384, + "step": 1220 + }, + { + "epoch": 0.34851718301567447, + "grad_norm": 0.69140625, + "learning_rate": 0.0002214424215224985, + "loss": 2.6574, + "step": 1221 + }, + { + "epoch": 0.34880261887400016, + "grad_norm": 0.77734375, + "learning_rate": 0.0002213229122184919, + "loss": 2.6864, + "step": 1222 + }, + { + "epoch": 0.34908805473232585, + "grad_norm": 0.796875, + "learning_rate": 0.0002212033443855525, + "loss": 2.6457, + "step": 1223 + }, + { + "epoch": 0.34937349059065154, + "grad_norm": 0.7265625, + "learning_rate": 0.0002210837181217998, + "loss": 2.6441, + "step": 1224 + }, + { + "epoch": 0.3496589264489772, + "grad_norm": 0.8203125, + "learning_rate": 0.0002209640335254015, + "loss": 2.6643, + "step": 1225 + }, + { + "epoch": 0.3499443623073029, + "grad_norm": 0.703125, + "learning_rate": 0.00022084429069457297, + "loss": 2.6436, + "step": 1226 + }, + { + "epoch": 0.3502297981656286, + "grad_norm": 0.80859375, + "learning_rate": 0.0002207244897275775, + "loss": 2.6485, + "step": 1227 + }, + { + "epoch": 0.3505152340239543, + "grad_norm": 0.80859375, + "learning_rate": 0.00022060463072272595, + "loss": 2.6534, + "step": 1228 + }, + { + "epoch": 0.35080066988228, + "grad_norm": 0.8203125, + "learning_rate": 0.00022048471377837697, + "loss": 2.6605, + "step": 1229 + }, + { + "epoch": 0.3510861057406057, + "grad_norm": 0.9296875, + "learning_rate": 0.0002203647389929367, + "loss": 2.6603, + "step": 1230 + }, + { + "epoch": 0.3513715415989314, + "grad_norm": 1.2578125, + "learning_rate": 0.00022024470646485862, + "loss": 2.6937, + "step": 1231 + }, + { + "epoch": 0.3516569774572571, + "grad_norm": 0.96484375, + "learning_rate": 0.0002201246162926437, + "loss": 2.6643, + "step": 1232 + }, + { + "epoch": 0.3519424133155828, + "grad_norm": 0.875, + "learning_rate": 0.00022000446857484035, + "loss": 2.6523, + "step": 1233 + }, + { + "epoch": 0.3522278491739085, + "grad_norm": 0.7578125, + "learning_rate": 0.0002198842634100439, + "loss": 2.6739, + "step": 1234 + }, + { + "epoch": 0.35251328503223417, + "grad_norm": 0.59765625, + "learning_rate": 0.00021976400089689712, + "loss": 2.6605, + "step": 1235 + }, + { + "epoch": 0.35279872089055986, + "grad_norm": 0.7109375, + "learning_rate": 0.00021964368113408959, + "loss": 2.6868, + "step": 1236 + }, + { + "epoch": 0.35308415674888555, + "grad_norm": 0.828125, + "learning_rate": 0.00021952330422035803, + "loss": 2.6759, + "step": 1237 + }, + { + "epoch": 0.3533695926072113, + "grad_norm": 0.96875, + "learning_rate": 0.0002194028702544861, + "loss": 2.6735, + "step": 1238 + }, + { + "epoch": 0.353655028465537, + "grad_norm": 0.97265625, + "learning_rate": 0.00021928237933530403, + "loss": 2.661, + "step": 1239 + }, + { + "epoch": 0.35394046432386267, + "grad_norm": 1.0546875, + "learning_rate": 0.00021916183156168908, + "loss": 2.6457, + "step": 1240 + }, + { + "epoch": 0.35422590018218836, + "grad_norm": 0.9453125, + "learning_rate": 0.00021904122703256498, + "loss": 2.6761, + "step": 1241 + }, + { + "epoch": 0.35451133604051405, + "grad_norm": 0.80859375, + "learning_rate": 0.00021892056584690213, + "loss": 2.6441, + "step": 1242 + }, + { + "epoch": 0.35479677189883974, + "grad_norm": 0.94921875, + "learning_rate": 0.00021879984810371734, + "loss": 2.6453, + "step": 1243 + }, + { + "epoch": 0.3550822077571654, + "grad_norm": 1.015625, + "learning_rate": 0.00021867907390207394, + "loss": 2.6208, + "step": 1244 + }, + { + "epoch": 0.3553676436154911, + "grad_norm": 0.98046875, + "learning_rate": 0.00021855824334108143, + "loss": 2.6572, + "step": 1245 + }, + { + "epoch": 0.35565307947381686, + "grad_norm": 0.85546875, + "learning_rate": 0.00021843735651989575, + "loss": 2.6826, + "step": 1246 + }, + { + "epoch": 0.35593851533214255, + "grad_norm": 0.8125, + "learning_rate": 0.00021831641353771885, + "loss": 2.6611, + "step": 1247 + }, + { + "epoch": 0.35622395119046824, + "grad_norm": 0.83203125, + "learning_rate": 0.00021819541449379892, + "loss": 2.6597, + "step": 1248 + }, + { + "epoch": 0.3565093870487939, + "grad_norm": 0.99609375, + "learning_rate": 0.00021807435948742994, + "loss": 2.635, + "step": 1249 + }, + { + "epoch": 0.3567948229071196, + "grad_norm": 0.9296875, + "learning_rate": 0.00021795324861795208, + "loss": 2.6526, + "step": 1250 + }, + { + "epoch": 0.3567948229071196, + "eval_loss": 2.5330393314361572, + "eval_runtime": 5928.9133, + "eval_samples_per_second": 10.843, + "eval_steps_per_second": 10.843, + "step": 1250 + }, + { + "epoch": 0.3570802587654453, + "grad_norm": 0.84375, + "learning_rate": 0.00021783208198475107, + "loss": 2.6512, + "step": 1251 + }, + { + "epoch": 0.357365694623771, + "grad_norm": 0.7890625, + "learning_rate": 0.00021771085968725864, + "loss": 2.6381, + "step": 1252 + }, + { + "epoch": 0.3576511304820967, + "grad_norm": 0.7265625, + "learning_rate": 0.00021758958182495214, + "loss": 2.6498, + "step": 1253 + }, + { + "epoch": 0.3579365663404224, + "grad_norm": 1.171875, + "learning_rate": 0.00021746824849735435, + "loss": 2.6614, + "step": 1254 + }, + { + "epoch": 0.3582220021987481, + "grad_norm": 0.72265625, + "learning_rate": 0.00021734685980403376, + "loss": 2.6483, + "step": 1255 + }, + { + "epoch": 0.3585074380570738, + "grad_norm": 0.89453125, + "learning_rate": 0.0002172254158446043, + "loss": 2.6365, + "step": 1256 + }, + { + "epoch": 0.3587928739153995, + "grad_norm": 0.86328125, + "learning_rate": 0.00021710391671872514, + "loss": 2.6484, + "step": 1257 + }, + { + "epoch": 0.3590783097737252, + "grad_norm": 0.8984375, + "learning_rate": 0.00021698236252610072, + "loss": 2.6372, + "step": 1258 + }, + { + "epoch": 0.35936374563205087, + "grad_norm": 0.80859375, + "learning_rate": 0.00021686075336648075, + "loss": 2.6554, + "step": 1259 + }, + { + "epoch": 0.35964918149037656, + "grad_norm": 0.8359375, + "learning_rate": 0.00021673908933965996, + "loss": 2.6511, + "step": 1260 + }, + { + "epoch": 0.35993461734870225, + "grad_norm": 0.81640625, + "learning_rate": 0.00021661737054547826, + "loss": 2.6473, + "step": 1261 + }, + { + "epoch": 0.360220053207028, + "grad_norm": 0.7890625, + "learning_rate": 0.00021649559708382027, + "loss": 2.6396, + "step": 1262 + }, + { + "epoch": 0.3605054890653537, + "grad_norm": 0.87890625, + "learning_rate": 0.0002163737690546157, + "loss": 2.6517, + "step": 1263 + }, + { + "epoch": 0.36079092492367937, + "grad_norm": 0.9765625, + "learning_rate": 0.00021625188655783893, + "loss": 2.6126, + "step": 1264 + }, + { + "epoch": 0.36107636078200506, + "grad_norm": 0.89453125, + "learning_rate": 0.000216129949693509, + "loss": 2.6551, + "step": 1265 + }, + { + "epoch": 0.36136179664033075, + "grad_norm": 0.8671875, + "learning_rate": 0.0002160079585616896, + "loss": 2.6316, + "step": 1266 + }, + { + "epoch": 0.36164723249865643, + "grad_norm": 0.89453125, + "learning_rate": 0.000215885913262489, + "loss": 2.6376, + "step": 1267 + }, + { + "epoch": 0.3619326683569821, + "grad_norm": 0.78125, + "learning_rate": 0.00021576381389605992, + "loss": 2.6378, + "step": 1268 + }, + { + "epoch": 0.3622181042153078, + "grad_norm": 0.78515625, + "learning_rate": 0.00021564166056259936, + "loss": 2.6742, + "step": 1269 + }, + { + "epoch": 0.36250354007363356, + "grad_norm": 0.9453125, + "learning_rate": 0.00021551945336234867, + "loss": 2.6676, + "step": 1270 + }, + { + "epoch": 0.36278897593195925, + "grad_norm": 0.7578125, + "learning_rate": 0.00021539719239559336, + "loss": 2.6604, + "step": 1271 + }, + { + "epoch": 0.36307441179028493, + "grad_norm": 0.734375, + "learning_rate": 0.00021527487776266317, + "loss": 2.6459, + "step": 1272 + }, + { + "epoch": 0.3633598476486106, + "grad_norm": 0.74609375, + "learning_rate": 0.0002151525095639318, + "loss": 2.6323, + "step": 1273 + }, + { + "epoch": 0.3636452835069363, + "grad_norm": 0.80859375, + "learning_rate": 0.0002150300878998168, + "loss": 2.6476, + "step": 1274 + }, + { + "epoch": 0.363930719365262, + "grad_norm": 0.73046875, + "learning_rate": 0.0002149076128707798, + "loss": 2.6378, + "step": 1275 + }, + { + "epoch": 0.3642161552235877, + "grad_norm": 0.73828125, + "learning_rate": 0.00021478508457732615, + "loss": 2.654, + "step": 1276 + }, + { + "epoch": 0.3645015910819134, + "grad_norm": 0.62109375, + "learning_rate": 0.00021466250312000482, + "loss": 2.6398, + "step": 1277 + }, + { + "epoch": 0.3647870269402391, + "grad_norm": 0.74609375, + "learning_rate": 0.00021453986859940852, + "loss": 2.6306, + "step": 1278 + }, + { + "epoch": 0.3650724627985648, + "grad_norm": 0.84765625, + "learning_rate": 0.00021441718111617344, + "loss": 2.6299, + "step": 1279 + }, + { + "epoch": 0.3653578986568905, + "grad_norm": 0.8125, + "learning_rate": 0.00021429444077097928, + "loss": 2.6466, + "step": 1280 + }, + { + "epoch": 0.3656433345152162, + "grad_norm": 0.75390625, + "learning_rate": 0.00021417164766454903, + "loss": 2.6788, + "step": 1281 + }, + { + "epoch": 0.3659287703735419, + "grad_norm": 0.61328125, + "learning_rate": 0.00021404880189764913, + "loss": 2.6416, + "step": 1282 + }, + { + "epoch": 0.36621420623186757, + "grad_norm": 0.63671875, + "learning_rate": 0.00021392590357108905, + "loss": 2.6469, + "step": 1283 + }, + { + "epoch": 0.36649964209019326, + "grad_norm": 0.65625, + "learning_rate": 0.00021380295278572155, + "loss": 2.6422, + "step": 1284 + }, + { + "epoch": 0.36678507794851894, + "grad_norm": 0.63671875, + "learning_rate": 0.00021367994964244236, + "loss": 2.6202, + "step": 1285 + }, + { + "epoch": 0.3670705138068447, + "grad_norm": 0.640625, + "learning_rate": 0.00021355689424219023, + "loss": 2.6281, + "step": 1286 + }, + { + "epoch": 0.3673559496651704, + "grad_norm": 0.60546875, + "learning_rate": 0.00021343378668594662, + "loss": 2.6181, + "step": 1287 + }, + { + "epoch": 0.36764138552349607, + "grad_norm": 0.62890625, + "learning_rate": 0.00021331062707473605, + "loss": 2.6632, + "step": 1288 + }, + { + "epoch": 0.36792682138182176, + "grad_norm": 0.59765625, + "learning_rate": 0.00021318741550962556, + "loss": 2.6296, + "step": 1289 + }, + { + "epoch": 0.36821225724014744, + "grad_norm": 0.58984375, + "learning_rate": 0.00021306415209172502, + "loss": 2.654, + "step": 1290 + }, + { + "epoch": 0.36849769309847313, + "grad_norm": 0.54296875, + "learning_rate": 0.00021294083692218653, + "loss": 2.6375, + "step": 1291 + }, + { + "epoch": 0.3687831289567988, + "grad_norm": 0.61328125, + "learning_rate": 0.00021281747010220496, + "loss": 2.6488, + "step": 1292 + }, + { + "epoch": 0.3690685648151245, + "grad_norm": 0.62109375, + "learning_rate": 0.0002126940517330175, + "loss": 2.6565, + "step": 1293 + }, + { + "epoch": 0.36935400067345026, + "grad_norm": 0.58984375, + "learning_rate": 0.00021257058191590354, + "loss": 2.6622, + "step": 1294 + }, + { + "epoch": 0.36963943653177594, + "grad_norm": 0.59765625, + "learning_rate": 0.00021244706075218472, + "loss": 2.6498, + "step": 1295 + }, + { + "epoch": 0.36992487239010163, + "grad_norm": 0.72265625, + "learning_rate": 0.00021232348834322495, + "loss": 2.6525, + "step": 1296 + }, + { + "epoch": 0.3702103082484273, + "grad_norm": 0.75, + "learning_rate": 0.00021219986479043001, + "loss": 2.6365, + "step": 1297 + }, + { + "epoch": 0.370495744106753, + "grad_norm": 0.6484375, + "learning_rate": 0.00021207619019524777, + "loss": 2.6502, + "step": 1298 + }, + { + "epoch": 0.3707811799650787, + "grad_norm": 0.5859375, + "learning_rate": 0.00021195246465916792, + "loss": 2.6183, + "step": 1299 + }, + { + "epoch": 0.3710666158234044, + "grad_norm": 0.52734375, + "learning_rate": 0.00021182868828372196, + "loss": 2.6646, + "step": 1300 + }, + { + "epoch": 0.3713520516817301, + "grad_norm": 0.6484375, + "learning_rate": 0.00021170486117048315, + "loss": 2.6203, + "step": 1301 + }, + { + "epoch": 0.37163748754005577, + "grad_norm": 0.62109375, + "learning_rate": 0.0002115809834210664, + "loss": 2.625, + "step": 1302 + }, + { + "epoch": 0.3719229233983815, + "grad_norm": 0.6171875, + "learning_rate": 0.0002114570551371281, + "loss": 2.671, + "step": 1303 + }, + { + "epoch": 0.3722083592567072, + "grad_norm": 0.54296875, + "learning_rate": 0.00021133307642036615, + "loss": 2.6239, + "step": 1304 + }, + { + "epoch": 0.3724937951150329, + "grad_norm": 0.64453125, + "learning_rate": 0.0002112090473725198, + "loss": 2.643, + "step": 1305 + }, + { + "epoch": 0.3727792309733586, + "grad_norm": 0.5390625, + "learning_rate": 0.00021108496809536974, + "loss": 2.627, + "step": 1306 + }, + { + "epoch": 0.37306466683168427, + "grad_norm": 0.56640625, + "learning_rate": 0.00021096083869073765, + "loss": 2.6038, + "step": 1307 + }, + { + "epoch": 0.37335010269000996, + "grad_norm": 0.640625, + "learning_rate": 0.0002108366592604866, + "loss": 2.6223, + "step": 1308 + }, + { + "epoch": 0.37363553854833564, + "grad_norm": 0.7109375, + "learning_rate": 0.00021071242990652043, + "loss": 2.6492, + "step": 1309 + }, + { + "epoch": 0.37392097440666133, + "grad_norm": 0.625, + "learning_rate": 0.00021058815073078422, + "loss": 2.6534, + "step": 1310 + }, + { + "epoch": 0.3742064102649871, + "grad_norm": 0.58984375, + "learning_rate": 0.00021046382183526378, + "loss": 2.6197, + "step": 1311 + }, + { + "epoch": 0.37449184612331277, + "grad_norm": 0.7265625, + "learning_rate": 0.0002103394433219858, + "loss": 2.632, + "step": 1312 + }, + { + "epoch": 0.37477728198163845, + "grad_norm": 0.59375, + "learning_rate": 0.00021021501529301756, + "loss": 2.639, + "step": 1313 + }, + { + "epoch": 0.37506271783996414, + "grad_norm": 0.63671875, + "learning_rate": 0.00021009053785046706, + "loss": 2.6138, + "step": 1314 + }, + { + "epoch": 0.37534815369828983, + "grad_norm": 0.61328125, + "learning_rate": 0.0002099660110964829, + "loss": 2.647, + "step": 1315 + }, + { + "epoch": 0.3756335895566155, + "grad_norm": 0.60546875, + "learning_rate": 0.00020984143513325416, + "loss": 2.6299, + "step": 1316 + }, + { + "epoch": 0.3759190254149412, + "grad_norm": 0.55078125, + "learning_rate": 0.0002097168100630101, + "loss": 2.6422, + "step": 1317 + }, + { + "epoch": 0.3762044612732669, + "grad_norm": 0.62109375, + "learning_rate": 0.0002095921359880204, + "loss": 2.6092, + "step": 1318 + }, + { + "epoch": 0.37648989713159264, + "grad_norm": 0.609375, + "learning_rate": 0.00020946741301059514, + "loss": 2.6118, + "step": 1319 + }, + { + "epoch": 0.37677533298991833, + "grad_norm": 0.64453125, + "learning_rate": 0.0002093426412330842, + "loss": 2.6348, + "step": 1320 + }, + { + "epoch": 0.377060768848244, + "grad_norm": 0.671875, + "learning_rate": 0.00020921782075787777, + "loss": 2.6552, + "step": 1321 + }, + { + "epoch": 0.3773462047065697, + "grad_norm": 0.58203125, + "learning_rate": 0.00020909295168740577, + "loss": 2.6427, + "step": 1322 + }, + { + "epoch": 0.3776316405648954, + "grad_norm": 0.5625, + "learning_rate": 0.00020896803412413824, + "loss": 2.626, + "step": 1323 + }, + { + "epoch": 0.3779170764232211, + "grad_norm": 0.59375, + "learning_rate": 0.00020884306817058482, + "loss": 2.6509, + "step": 1324 + }, + { + "epoch": 0.3782025122815468, + "grad_norm": 0.58984375, + "learning_rate": 0.00020871805392929502, + "loss": 2.6215, + "step": 1325 + }, + { + "epoch": 0.37848794813987247, + "grad_norm": 0.55859375, + "learning_rate": 0.00020859299150285786, + "loss": 2.6605, + "step": 1326 + }, + { + "epoch": 0.3787733839981982, + "grad_norm": 0.625, + "learning_rate": 0.00020846788099390188, + "loss": 2.6488, + "step": 1327 + }, + { + "epoch": 0.3790588198565239, + "grad_norm": 0.5703125, + "learning_rate": 0.00020834272250509523, + "loss": 2.6607, + "step": 1328 + }, + { + "epoch": 0.3793442557148496, + "grad_norm": 0.6015625, + "learning_rate": 0.00020821751613914525, + "loss": 2.6426, + "step": 1329 + }, + { + "epoch": 0.3796296915731753, + "grad_norm": 0.57421875, + "learning_rate": 0.0002080922619987987, + "loss": 2.6458, + "step": 1330 + }, + { + "epoch": 0.37991512743150097, + "grad_norm": 0.57421875, + "learning_rate": 0.00020796696018684152, + "loss": 2.6278, + "step": 1331 + }, + { + "epoch": 0.38020056328982665, + "grad_norm": 0.54296875, + "learning_rate": 0.00020784161080609868, + "loss": 2.6603, + "step": 1332 + }, + { + "epoch": 0.38048599914815234, + "grad_norm": 0.6015625, + "learning_rate": 0.00020771621395943436, + "loss": 2.6395, + "step": 1333 + }, + { + "epoch": 0.38077143500647803, + "grad_norm": 0.55859375, + "learning_rate": 0.00020759076974975144, + "loss": 2.6346, + "step": 1334 + }, + { + "epoch": 0.3810568708648038, + "grad_norm": 0.58203125, + "learning_rate": 0.00020746527827999195, + "loss": 2.6412, + "step": 1335 + }, + { + "epoch": 0.38134230672312946, + "grad_norm": 0.6015625, + "learning_rate": 0.00020733973965313655, + "loss": 2.6311, + "step": 1336 + }, + { + "epoch": 0.38162774258145515, + "grad_norm": 0.57421875, + "learning_rate": 0.0002072141539722046, + "loss": 2.6174, + "step": 1337 + }, + { + "epoch": 0.38191317843978084, + "grad_norm": 0.5625, + "learning_rate": 0.00020708852134025397, + "loss": 2.6192, + "step": 1338 + }, + { + "epoch": 0.38219861429810653, + "grad_norm": 0.5703125, + "learning_rate": 0.0002069628418603814, + "loss": 2.6467, + "step": 1339 + }, + { + "epoch": 0.3824840501564322, + "grad_norm": 0.57421875, + "learning_rate": 0.00020683711563572167, + "loss": 2.6369, + "step": 1340 + }, + { + "epoch": 0.3827694860147579, + "grad_norm": 0.55078125, + "learning_rate": 0.00020671134276944815, + "loss": 2.6372, + "step": 1341 + }, + { + "epoch": 0.3830549218730836, + "grad_norm": 0.53515625, + "learning_rate": 0.0002065855233647725, + "loss": 2.6436, + "step": 1342 + }, + { + "epoch": 0.38334035773140934, + "grad_norm": 0.58203125, + "learning_rate": 0.00020645965752494444, + "loss": 2.6342, + "step": 1343 + }, + { + "epoch": 0.38362579358973503, + "grad_norm": 0.60546875, + "learning_rate": 0.0002063337453532519, + "loss": 2.637, + "step": 1344 + }, + { + "epoch": 0.3839112294480607, + "grad_norm": 0.58203125, + "learning_rate": 0.0002062077869530207, + "loss": 2.6444, + "step": 1345 + }, + { + "epoch": 0.3841966653063864, + "grad_norm": 0.54296875, + "learning_rate": 0.00020608178242761483, + "loss": 2.6339, + "step": 1346 + }, + { + "epoch": 0.3844821011647121, + "grad_norm": 0.5703125, + "learning_rate": 0.00020595573188043594, + "loss": 2.6422, + "step": 1347 + }, + { + "epoch": 0.3847675370230378, + "grad_norm": 0.6484375, + "learning_rate": 0.00020582963541492343, + "loss": 2.6472, + "step": 1348 + }, + { + "epoch": 0.3850529728813635, + "grad_norm": 0.65625, + "learning_rate": 0.00020570349313455452, + "loss": 2.6081, + "step": 1349 + }, + { + "epoch": 0.38533840873968916, + "grad_norm": 0.55078125, + "learning_rate": 0.00020557730514284396, + "loss": 2.6214, + "step": 1350 + }, + { + "epoch": 0.3856238445980149, + "grad_norm": 0.53515625, + "learning_rate": 0.00020545107154334397, + "loss": 2.6263, + "step": 1351 + }, + { + "epoch": 0.3859092804563406, + "grad_norm": 0.6328125, + "learning_rate": 0.0002053247924396442, + "loss": 2.6092, + "step": 1352 + }, + { + "epoch": 0.3861947163146663, + "grad_norm": 0.5625, + "learning_rate": 0.0002051984679353718, + "loss": 2.6329, + "step": 1353 + }, + { + "epoch": 0.386480152172992, + "grad_norm": 0.5546875, + "learning_rate": 0.0002050720981341909, + "loss": 2.6087, + "step": 1354 + }, + { + "epoch": 0.38676558803131766, + "grad_norm": 0.70703125, + "learning_rate": 0.00020494568313980305, + "loss": 2.6249, + "step": 1355 + }, + { + "epoch": 0.38705102388964335, + "grad_norm": 0.7578125, + "learning_rate": 0.00020481922305594678, + "loss": 2.6385, + "step": 1356 + }, + { + "epoch": 0.38733645974796904, + "grad_norm": 0.70703125, + "learning_rate": 0.0002046927179863976, + "loss": 2.632, + "step": 1357 + }, + { + "epoch": 0.38762189560629473, + "grad_norm": 0.54296875, + "learning_rate": 0.00020456616803496796, + "loss": 2.642, + "step": 1358 + }, + { + "epoch": 0.3879073314646205, + "grad_norm": 0.7109375, + "learning_rate": 0.00020443957330550718, + "loss": 2.6268, + "step": 1359 + }, + { + "epoch": 0.38819276732294616, + "grad_norm": 0.6640625, + "learning_rate": 0.0002043129339019013, + "loss": 2.6379, + "step": 1360 + }, + { + "epoch": 0.38847820318127185, + "grad_norm": 0.51953125, + "learning_rate": 0.00020418624992807295, + "loss": 2.6577, + "step": 1361 + }, + { + "epoch": 0.38876363903959754, + "grad_norm": 0.67578125, + "learning_rate": 0.00020405952148798144, + "loss": 2.6331, + "step": 1362 + }, + { + "epoch": 0.38904907489792323, + "grad_norm": 0.55078125, + "learning_rate": 0.00020393274868562254, + "loss": 2.6376, + "step": 1363 + }, + { + "epoch": 0.3893345107562489, + "grad_norm": 0.609375, + "learning_rate": 0.00020380593162502844, + "loss": 2.6041, + "step": 1364 + }, + { + "epoch": 0.3896199466145746, + "grad_norm": 0.6796875, + "learning_rate": 0.00020367907041026755, + "loss": 2.6439, + "step": 1365 + }, + { + "epoch": 0.3899053824729003, + "grad_norm": 0.5625, + "learning_rate": 0.00020355216514544462, + "loss": 2.6405, + "step": 1366 + }, + { + "epoch": 0.39019081833122604, + "grad_norm": 0.56640625, + "learning_rate": 0.0002034252159347005, + "loss": 2.6451, + "step": 1367 + }, + { + "epoch": 0.39047625418955173, + "grad_norm": 0.60546875, + "learning_rate": 0.00020329822288221218, + "loss": 2.637, + "step": 1368 + }, + { + "epoch": 0.3907616900478774, + "grad_norm": 0.578125, + "learning_rate": 0.00020317118609219253, + "loss": 2.5896, + "step": 1369 + }, + { + "epoch": 0.3910471259062031, + "grad_norm": 0.59375, + "learning_rate": 0.00020304410566889027, + "loss": 2.641, + "step": 1370 + }, + { + "epoch": 0.3913325617645288, + "grad_norm": 0.62890625, + "learning_rate": 0.0002029169817165901, + "loss": 2.6245, + "step": 1371 + }, + { + "epoch": 0.3916179976228545, + "grad_norm": 0.56640625, + "learning_rate": 0.0002027898143396123, + "loss": 2.6347, + "step": 1372 + }, + { + "epoch": 0.3919034334811802, + "grad_norm": 0.56640625, + "learning_rate": 0.00020266260364231286, + "loss": 2.6158, + "step": 1373 + }, + { + "epoch": 0.39218886933950586, + "grad_norm": 0.62890625, + "learning_rate": 0.00020253534972908326, + "loss": 2.6349, + "step": 1374 + }, + { + "epoch": 0.39247430519783155, + "grad_norm": 0.7421875, + "learning_rate": 0.00020240805270435044, + "loss": 2.6329, + "step": 1375 + }, + { + "epoch": 0.3927597410561573, + "grad_norm": 0.6875, + "learning_rate": 0.00020228071267257687, + "loss": 2.6633, + "step": 1376 + }, + { + "epoch": 0.393045176914483, + "grad_norm": 0.703125, + "learning_rate": 0.00020215332973826003, + "loss": 2.6117, + "step": 1377 + }, + { + "epoch": 0.3933306127728087, + "grad_norm": 0.91796875, + "learning_rate": 0.00020202590400593285, + "loss": 2.6286, + "step": 1378 + }, + { + "epoch": 0.39361604863113436, + "grad_norm": 0.87109375, + "learning_rate": 0.00020189843558016338, + "loss": 2.6105, + "step": 1379 + }, + { + "epoch": 0.39390148448946005, + "grad_norm": 0.98828125, + "learning_rate": 0.0002017709245655545, + "loss": 2.6128, + "step": 1380 + }, + { + "epoch": 0.39418692034778574, + "grad_norm": 0.82421875, + "learning_rate": 0.00020164337106674417, + "loss": 2.6243, + "step": 1381 + }, + { + "epoch": 0.39447235620611143, + "grad_norm": 0.69140625, + "learning_rate": 0.0002015157751884053, + "loss": 2.6557, + "step": 1382 + }, + { + "epoch": 0.3947577920644371, + "grad_norm": 0.8984375, + "learning_rate": 0.0002013881370352454, + "loss": 2.624, + "step": 1383 + }, + { + "epoch": 0.39504322792276286, + "grad_norm": 0.76171875, + "learning_rate": 0.00020126045671200682, + "loss": 2.6279, + "step": 1384 + }, + { + "epoch": 0.39532866378108855, + "grad_norm": 0.5859375, + "learning_rate": 0.00020113273432346632, + "loss": 2.6363, + "step": 1385 + }, + { + "epoch": 0.39561409963941424, + "grad_norm": 0.79296875, + "learning_rate": 0.00020100496997443553, + "loss": 2.6274, + "step": 1386 + }, + { + "epoch": 0.39589953549773993, + "grad_norm": 0.6484375, + "learning_rate": 0.00020087716376976014, + "loss": 2.6191, + "step": 1387 + }, + { + "epoch": 0.3961849713560656, + "grad_norm": 0.640625, + "learning_rate": 0.00020074931581432035, + "loss": 2.6355, + "step": 1388 + }, + { + "epoch": 0.3964704072143913, + "grad_norm": 0.6875, + "learning_rate": 0.0002006214262130307, + "loss": 2.6386, + "step": 1389 + }, + { + "epoch": 0.396755843072717, + "grad_norm": 0.6171875, + "learning_rate": 0.0002004934950708397, + "loss": 2.6345, + "step": 1390 + }, + { + "epoch": 0.3970412789310427, + "grad_norm": 0.55078125, + "learning_rate": 0.00020036552249273014, + "loss": 2.6081, + "step": 1391 + }, + { + "epoch": 0.39732671478936843, + "grad_norm": 0.65625, + "learning_rate": 0.00020023750858371876, + "loss": 2.6243, + "step": 1392 + }, + { + "epoch": 0.3976121506476941, + "grad_norm": 0.55078125, + "learning_rate": 0.00020010945344885615, + "loss": 2.6405, + "step": 1393 + }, + { + "epoch": 0.3978975865060198, + "grad_norm": 0.61328125, + "learning_rate": 0.0001999813571932268, + "loss": 2.5995, + "step": 1394 + }, + { + "epoch": 0.3981830223643455, + "grad_norm": 0.5859375, + "learning_rate": 0.00019985321992194892, + "loss": 2.6225, + "step": 1395 + }, + { + "epoch": 0.3984684582226712, + "grad_norm": 0.57421875, + "learning_rate": 0.00019972504174017446, + "loss": 2.6077, + "step": 1396 + }, + { + "epoch": 0.3987538940809969, + "grad_norm": 0.59765625, + "learning_rate": 0.00019959682275308869, + "loss": 2.6165, + "step": 1397 + }, + { + "epoch": 0.39903932993932256, + "grad_norm": 0.66796875, + "learning_rate": 0.0001994685630659107, + "loss": 2.601, + "step": 1398 + }, + { + "epoch": 0.39932476579764825, + "grad_norm": 0.6171875, + "learning_rate": 0.00019934026278389274, + "loss": 2.6332, + "step": 1399 + }, + { + "epoch": 0.399610201655974, + "grad_norm": 0.6015625, + "learning_rate": 0.00019921192201232047, + "loss": 2.6224, + "step": 1400 + }, + { + "epoch": 0.3998956375142997, + "grad_norm": 0.58203125, + "learning_rate": 0.0001990835408565127, + "loss": 2.5961, + "step": 1401 + }, + { + "epoch": 0.4001810733726254, + "grad_norm": 0.56640625, + "learning_rate": 0.0001989551194218216, + "loss": 2.6291, + "step": 1402 + }, + { + "epoch": 0.40046650923095106, + "grad_norm": 0.56640625, + "learning_rate": 0.00019882665781363208, + "loss": 2.6164, + "step": 1403 + }, + { + "epoch": 0.40075194508927675, + "grad_norm": 0.6015625, + "learning_rate": 0.00019869815613736224, + "loss": 2.6452, + "step": 1404 + }, + { + "epoch": 0.40103738094760244, + "grad_norm": 0.6015625, + "learning_rate": 0.00019856961449846294, + "loss": 2.6502, + "step": 1405 + }, + { + "epoch": 0.40132281680592813, + "grad_norm": 0.62109375, + "learning_rate": 0.0001984410330024179, + "loss": 2.6174, + "step": 1406 + }, + { + "epoch": 0.4016082526642538, + "grad_norm": 0.56640625, + "learning_rate": 0.0001983124117547436, + "loss": 2.5982, + "step": 1407 + }, + { + "epoch": 0.40189368852257956, + "grad_norm": 0.5625, + "learning_rate": 0.00019818375086098897, + "loss": 2.5949, + "step": 1408 + }, + { + "epoch": 0.40217912438090525, + "grad_norm": 1.015625, + "learning_rate": 0.00019805505042673564, + "loss": 2.6337, + "step": 1409 + }, + { + "epoch": 0.40246456023923094, + "grad_norm": 0.64453125, + "learning_rate": 0.00019792631055759764, + "loss": 2.6204, + "step": 1410 + }, + { + "epoch": 0.40274999609755663, + "grad_norm": 0.61328125, + "learning_rate": 0.00019779753135922126, + "loss": 2.6416, + "step": 1411 + }, + { + "epoch": 0.4030354319558823, + "grad_norm": 0.60546875, + "learning_rate": 0.00019766871293728524, + "loss": 2.6037, + "step": 1412 + }, + { + "epoch": 0.403320867814208, + "grad_norm": 0.63671875, + "learning_rate": 0.00019753985539750036, + "loss": 2.6191, + "step": 1413 + }, + { + "epoch": 0.4036063036725337, + "grad_norm": 0.578125, + "learning_rate": 0.00019741095884560957, + "loss": 2.6103, + "step": 1414 + }, + { + "epoch": 0.4038917395308594, + "grad_norm": 0.57421875, + "learning_rate": 0.00019728202338738785, + "loss": 2.6346, + "step": 1415 + }, + { + "epoch": 0.40417717538918513, + "grad_norm": 0.6796875, + "learning_rate": 0.0001971530491286421, + "loss": 2.6142, + "step": 1416 + }, + { + "epoch": 0.4044626112475108, + "grad_norm": 0.51953125, + "learning_rate": 0.00019702403617521093, + "loss": 2.612, + "step": 1417 + }, + { + "epoch": 0.4047480471058365, + "grad_norm": 0.625, + "learning_rate": 0.00019689498463296487, + "loss": 2.6237, + "step": 1418 + }, + { + "epoch": 0.4050334829641622, + "grad_norm": 0.5625, + "learning_rate": 0.00019676589460780616, + "loss": 2.6104, + "step": 1419 + }, + { + "epoch": 0.4053189188224879, + "grad_norm": 0.5703125, + "learning_rate": 0.00019663676620566836, + "loss": 2.6246, + "step": 1420 + }, + { + "epoch": 0.4056043546808136, + "grad_norm": 0.6328125, + "learning_rate": 0.00019650759953251677, + "loss": 2.6212, + "step": 1421 + }, + { + "epoch": 0.40588979053913926, + "grad_norm": 0.578125, + "learning_rate": 0.00019637839469434804, + "loss": 2.6268, + "step": 1422 + }, + { + "epoch": 0.40617522639746495, + "grad_norm": 0.578125, + "learning_rate": 0.00019624915179719004, + "loss": 2.6045, + "step": 1423 + }, + { + "epoch": 0.4064606622557907, + "grad_norm": 1.125, + "learning_rate": 0.00019611987094710192, + "loss": 2.5961, + "step": 1424 + }, + { + "epoch": 0.4067460981141164, + "grad_norm": 0.5625, + "learning_rate": 0.00019599055225017408, + "loss": 2.5987, + "step": 1425 + }, + { + "epoch": 0.40703153397244207, + "grad_norm": 0.6328125, + "learning_rate": 0.00019586119581252781, + "loss": 2.6394, + "step": 1426 + }, + { + "epoch": 0.40731696983076776, + "grad_norm": 0.5859375, + "learning_rate": 0.00019573180174031556, + "loss": 2.5998, + "step": 1427 + }, + { + "epoch": 0.40760240568909345, + "grad_norm": 0.6015625, + "learning_rate": 0.00019560237013972046, + "loss": 2.6149, + "step": 1428 + }, + { + "epoch": 0.40788784154741914, + "grad_norm": 0.5859375, + "learning_rate": 0.0001954729011169565, + "loss": 2.6389, + "step": 1429 + }, + { + "epoch": 0.4081732774057448, + "grad_norm": 0.59375, + "learning_rate": 0.00019534339477826854, + "loss": 2.6498, + "step": 1430 + }, + { + "epoch": 0.4084587132640705, + "grad_norm": 0.60546875, + "learning_rate": 0.00019521385122993185, + "loss": 2.6256, + "step": 1431 + }, + { + "epoch": 0.40874414912239626, + "grad_norm": 0.66015625, + "learning_rate": 0.00019508427057825237, + "loss": 2.614, + "step": 1432 + }, + { + "epoch": 0.40902958498072195, + "grad_norm": 0.58203125, + "learning_rate": 0.0001949546529295664, + "loss": 2.5803, + "step": 1433 + }, + { + "epoch": 0.40931502083904764, + "grad_norm": 0.625, + "learning_rate": 0.00019482499839024062, + "loss": 2.6267, + "step": 1434 + }, + { + "epoch": 0.4096004566973733, + "grad_norm": 0.58203125, + "learning_rate": 0.00019469530706667205, + "loss": 2.627, + "step": 1435 + }, + { + "epoch": 0.409885892555699, + "grad_norm": 0.5859375, + "learning_rate": 0.0001945655790652878, + "loss": 2.6262, + "step": 1436 + }, + { + "epoch": 0.4101713284140247, + "grad_norm": 0.62890625, + "learning_rate": 0.00019443581449254515, + "loss": 2.6189, + "step": 1437 + }, + { + "epoch": 0.4104567642723504, + "grad_norm": 0.55078125, + "learning_rate": 0.00019430601345493136, + "loss": 2.6023, + "step": 1438 + }, + { + "epoch": 0.4107422001306761, + "grad_norm": 0.58203125, + "learning_rate": 0.0001941761760589637, + "loss": 2.6085, + "step": 1439 + }, + { + "epoch": 0.4110276359890018, + "grad_norm": 0.5625, + "learning_rate": 0.00019404630241118902, + "loss": 2.6117, + "step": 1440 + }, + { + "epoch": 0.4113130718473275, + "grad_norm": 0.58203125, + "learning_rate": 0.00019391639261818428, + "loss": 2.6289, + "step": 1441 + }, + { + "epoch": 0.4115985077056532, + "grad_norm": 0.55859375, + "learning_rate": 0.00019378644678655582, + "loss": 2.6221, + "step": 1442 + }, + { + "epoch": 0.4118839435639789, + "grad_norm": 0.5546875, + "learning_rate": 0.00019365646502293962, + "loss": 2.6028, + "step": 1443 + }, + { + "epoch": 0.4121693794223046, + "grad_norm": 0.5546875, + "learning_rate": 0.00019352644743400124, + "loss": 2.599, + "step": 1444 + }, + { + "epoch": 0.41245481528063027, + "grad_norm": 0.68359375, + "learning_rate": 0.0001933963941264356, + "loss": 2.6002, + "step": 1445 + }, + { + "epoch": 0.41274025113895596, + "grad_norm": 0.53125, + "learning_rate": 0.0001932663052069668, + "loss": 2.6078, + "step": 1446 + }, + { + "epoch": 0.41302568699728165, + "grad_norm": 0.62109375, + "learning_rate": 0.00019313618078234843, + "loss": 2.6375, + "step": 1447 + }, + { + "epoch": 0.41331112285560734, + "grad_norm": 0.625, + "learning_rate": 0.00019300602095936287, + "loss": 2.6145, + "step": 1448 + }, + { + "epoch": 0.4135965587139331, + "grad_norm": 1.578125, + "learning_rate": 0.00019287582584482193, + "loss": 2.6075, + "step": 1449 + }, + { + "epoch": 0.41388199457225877, + "grad_norm": 1.3671875, + "learning_rate": 0.00019274559554556604, + "loss": 2.5988, + "step": 1450 + }, + { + "epoch": 0.41416743043058446, + "grad_norm": 1.0, + "learning_rate": 0.00019261533016846468, + "loss": 2.6142, + "step": 1451 + }, + { + "epoch": 0.41445286628891015, + "grad_norm": 0.65234375, + "learning_rate": 0.00019248502982041613, + "loss": 2.5849, + "step": 1452 + }, + { + "epoch": 0.41473830214723584, + "grad_norm": 0.703125, + "learning_rate": 0.00019235469460834732, + "loss": 2.6181, + "step": 1453 + }, + { + "epoch": 0.4150237380055615, + "grad_norm": 0.61328125, + "learning_rate": 0.00019222432463921374, + "loss": 2.5999, + "step": 1454 + }, + { + "epoch": 0.4153091738638872, + "grad_norm": 0.75, + "learning_rate": 0.0001920939200199995, + "loss": 2.6166, + "step": 1455 + }, + { + "epoch": 0.4155946097222129, + "grad_norm": 0.78515625, + "learning_rate": 0.00019196348085771713, + "loss": 2.6053, + "step": 1456 + }, + { + "epoch": 0.41588004558053865, + "grad_norm": 0.69140625, + "learning_rate": 0.0001918330072594074, + "loss": 2.6113, + "step": 1457 + }, + { + "epoch": 0.41616548143886434, + "grad_norm": 0.6484375, + "learning_rate": 0.00019170249933213947, + "loss": 2.6028, + "step": 1458 + }, + { + "epoch": 0.41645091729719, + "grad_norm": 0.765625, + "learning_rate": 0.00019157195718301067, + "loss": 2.6048, + "step": 1459 + }, + { + "epoch": 0.4167363531555157, + "grad_norm": 0.7421875, + "learning_rate": 0.00019144138091914617, + "loss": 2.6143, + "step": 1460 + }, + { + "epoch": 0.4170217890138414, + "grad_norm": 0.5859375, + "learning_rate": 0.00019131077064769953, + "loss": 2.6159, + "step": 1461 + }, + { + "epoch": 0.4173072248721671, + "grad_norm": 0.76171875, + "learning_rate": 0.00019118012647585192, + "loss": 2.5989, + "step": 1462 + }, + { + "epoch": 0.4175926607304928, + "grad_norm": 0.7421875, + "learning_rate": 0.00019104944851081244, + "loss": 2.6203, + "step": 1463 + }, + { + "epoch": 0.41787809658881847, + "grad_norm": 0.6015625, + "learning_rate": 0.00019091873685981786, + "loss": 2.596, + "step": 1464 + }, + { + "epoch": 0.4181635324471442, + "grad_norm": 0.875, + "learning_rate": 0.00019078799163013273, + "loss": 2.5961, + "step": 1465 + }, + { + "epoch": 0.4184489683054699, + "grad_norm": 0.8125, + "learning_rate": 0.000190657212929049, + "loss": 2.6254, + "step": 1466 + }, + { + "epoch": 0.4187344041637956, + "grad_norm": 0.57421875, + "learning_rate": 0.0001905264008638861, + "loss": 2.616, + "step": 1467 + }, + { + "epoch": 0.4190198400221213, + "grad_norm": 0.671875, + "learning_rate": 0.00019039555554199099, + "loss": 2.635, + "step": 1468 + }, + { + "epoch": 0.41930527588044697, + "grad_norm": 0.609375, + "learning_rate": 0.0001902646770707378, + "loss": 2.5834, + "step": 1469 + }, + { + "epoch": 0.41959071173877266, + "grad_norm": 0.58203125, + "learning_rate": 0.00019013376555752782, + "loss": 2.61, + "step": 1470 + }, + { + "epoch": 0.41987614759709835, + "grad_norm": 0.6015625, + "learning_rate": 0.00019000282110978958, + "loss": 2.6072, + "step": 1471 + }, + { + "epoch": 0.42016158345542404, + "grad_norm": 0.578125, + "learning_rate": 0.00018987184383497855, + "loss": 2.5803, + "step": 1472 + }, + { + "epoch": 0.4204470193137498, + "grad_norm": 0.5546875, + "learning_rate": 0.00018974083384057713, + "loss": 2.639, + "step": 1473 + }, + { + "epoch": 0.42073245517207547, + "grad_norm": 0.64453125, + "learning_rate": 0.00018960979123409466, + "loss": 2.5955, + "step": 1474 + }, + { + "epoch": 0.42101789103040116, + "grad_norm": 0.5234375, + "learning_rate": 0.0001894787161230672, + "loss": 2.6356, + "step": 1475 + }, + { + "epoch": 0.42130332688872685, + "grad_norm": 0.578125, + "learning_rate": 0.0001893476086150574, + "loss": 2.6224, + "step": 1476 + }, + { + "epoch": 0.42158876274705254, + "grad_norm": 0.62109375, + "learning_rate": 0.00018921646881765456, + "loss": 2.6103, + "step": 1477 + }, + { + "epoch": 0.4218741986053782, + "grad_norm": 0.578125, + "learning_rate": 0.0001890852968384746, + "loss": 2.6162, + "step": 1478 + }, + { + "epoch": 0.4221596344637039, + "grad_norm": 0.52734375, + "learning_rate": 0.0001889540927851596, + "loss": 2.628, + "step": 1479 + }, + { + "epoch": 0.4224450703220296, + "grad_norm": 0.65625, + "learning_rate": 0.0001888228567653781, + "loss": 2.6217, + "step": 1480 + }, + { + "epoch": 0.42273050618035535, + "grad_norm": 0.52734375, + "learning_rate": 0.00018869158888682494, + "loss": 2.613, + "step": 1481 + }, + { + "epoch": 0.42301594203868104, + "grad_norm": 0.5703125, + "learning_rate": 0.00018856028925722104, + "loss": 2.608, + "step": 1482 + }, + { + "epoch": 0.4233013778970067, + "grad_norm": 0.57421875, + "learning_rate": 0.00018842895798431327, + "loss": 2.6083, + "step": 1483 + }, + { + "epoch": 0.4235868137553324, + "grad_norm": 0.51171875, + "learning_rate": 0.00018829759517587457, + "loss": 2.6065, + "step": 1484 + }, + { + "epoch": 0.4238722496136581, + "grad_norm": 0.62890625, + "learning_rate": 0.00018816620093970387, + "loss": 2.6158, + "step": 1485 + }, + { + "epoch": 0.4241576854719838, + "grad_norm": 0.625, + "learning_rate": 0.00018803477538362562, + "loss": 2.628, + "step": 1486 + }, + { + "epoch": 0.4244431213303095, + "grad_norm": 0.52734375, + "learning_rate": 0.00018790331861549023, + "loss": 2.6095, + "step": 1487 + }, + { + "epoch": 0.42472855718863517, + "grad_norm": 0.58984375, + "learning_rate": 0.00018777183074317349, + "loss": 2.5987, + "step": 1488 + }, + { + "epoch": 0.4250139930469609, + "grad_norm": 0.5625, + "learning_rate": 0.000187640311874577, + "loss": 2.5805, + "step": 1489 + }, + { + "epoch": 0.4252994289052866, + "grad_norm": 0.515625, + "learning_rate": 0.00018750876211762752, + "loss": 2.6163, + "step": 1490 + }, + { + "epoch": 0.4255848647636123, + "grad_norm": 0.53515625, + "learning_rate": 0.00018737718158027734, + "loss": 2.596, + "step": 1491 + }, + { + "epoch": 0.425870300621938, + "grad_norm": 0.54296875, + "learning_rate": 0.00018724557037050384, + "loss": 2.6397, + "step": 1492 + }, + { + "epoch": 0.42615573648026367, + "grad_norm": 0.53125, + "learning_rate": 0.0001871139285963098, + "loss": 2.6378, + "step": 1493 + }, + { + "epoch": 0.42644117233858936, + "grad_norm": 0.546875, + "learning_rate": 0.00018698225636572285, + "loss": 2.6063, + "step": 1494 + }, + { + "epoch": 0.42672660819691505, + "grad_norm": 0.5234375, + "learning_rate": 0.0001868505537867958, + "loss": 2.6003, + "step": 1495 + }, + { + "epoch": 0.42701204405524074, + "grad_norm": 0.58984375, + "learning_rate": 0.00018671882096760623, + "loss": 2.595, + "step": 1496 + }, + { + "epoch": 0.4272974799135665, + "grad_norm": 0.5546875, + "learning_rate": 0.00018658705801625656, + "loss": 2.5969, + "step": 1497 + }, + { + "epoch": 0.42758291577189217, + "grad_norm": 0.515625, + "learning_rate": 0.00018645526504087402, + "loss": 2.6158, + "step": 1498 + }, + { + "epoch": 0.42786835163021786, + "grad_norm": 0.5546875, + "learning_rate": 0.00018632344214961045, + "loss": 2.6027, + "step": 1499 + }, + { + "epoch": 0.42815378748854355, + "grad_norm": 0.53515625, + "learning_rate": 0.0001861915894506421, + "loss": 2.6258, + "step": 1500 + }, + { + "epoch": 0.42815378748854355, + "eval_loss": 2.498450517654419, + "eval_runtime": 5960.8882, + "eval_samples_per_second": 10.785, + "eval_steps_per_second": 10.785, + "step": 1500 + }, + { + "epoch": 0.42843922334686924, + "grad_norm": 0.578125, + "learning_rate": 0.00018605970705216988, + "loss": 2.5927, + "step": 1501 + }, + { + "epoch": 0.4287246592051949, + "grad_norm": 0.51171875, + "learning_rate": 0.00018592779506241902, + "loss": 2.5965, + "step": 1502 + }, + { + "epoch": 0.4290100950635206, + "grad_norm": 0.5625, + "learning_rate": 0.00018579585358963885, + "loss": 2.6102, + "step": 1503 + }, + { + "epoch": 0.4292955309218463, + "grad_norm": 0.5390625, + "learning_rate": 0.00018566388274210316, + "loss": 2.5903, + "step": 1504 + }, + { + "epoch": 0.42958096678017205, + "grad_norm": 0.515625, + "learning_rate": 0.00018553188262810974, + "loss": 2.6056, + "step": 1505 + }, + { + "epoch": 0.42986640263849774, + "grad_norm": 0.56640625, + "learning_rate": 0.00018539985335598033, + "loss": 2.6157, + "step": 1506 + }, + { + "epoch": 0.4301518384968234, + "grad_norm": 0.53125, + "learning_rate": 0.00018526779503406059, + "loss": 2.5769, + "step": 1507 + }, + { + "epoch": 0.4304372743551491, + "grad_norm": 0.55078125, + "learning_rate": 0.00018513570777072024, + "loss": 2.6171, + "step": 1508 + }, + { + "epoch": 0.4307227102134748, + "grad_norm": 0.52734375, + "learning_rate": 0.0001850035916743525, + "loss": 2.5859, + "step": 1509 + }, + { + "epoch": 0.4310081460718005, + "grad_norm": 0.52734375, + "learning_rate": 0.00018487144685337432, + "loss": 2.5976, + "step": 1510 + }, + { + "epoch": 0.4312935819301262, + "grad_norm": 0.5390625, + "learning_rate": 0.00018473927341622627, + "loss": 2.6144, + "step": 1511 + }, + { + "epoch": 0.43157901778845187, + "grad_norm": 0.53125, + "learning_rate": 0.0001846070714713724, + "loss": 2.6233, + "step": 1512 + }, + { + "epoch": 0.4318644536467776, + "grad_norm": 0.5390625, + "learning_rate": 0.0001844748411273001, + "loss": 2.6009, + "step": 1513 + }, + { + "epoch": 0.4321498895051033, + "grad_norm": 0.578125, + "learning_rate": 0.00018434258249252008, + "loss": 2.6117, + "step": 1514 + }, + { + "epoch": 0.432435325363429, + "grad_norm": 0.50390625, + "learning_rate": 0.00018421029567556633, + "loss": 2.6089, + "step": 1515 + }, + { + "epoch": 0.4327207612217547, + "grad_norm": 0.5390625, + "learning_rate": 0.00018407798078499588, + "loss": 2.5967, + "step": 1516 + }, + { + "epoch": 0.43300619708008037, + "grad_norm": 0.515625, + "learning_rate": 0.0001839456379293889, + "loss": 2.6026, + "step": 1517 + }, + { + "epoch": 0.43329163293840606, + "grad_norm": 0.515625, + "learning_rate": 0.00018381326721734833, + "loss": 2.6104, + "step": 1518 + }, + { + "epoch": 0.43357706879673175, + "grad_norm": 0.51953125, + "learning_rate": 0.00018368086875750013, + "loss": 2.6096, + "step": 1519 + }, + { + "epoch": 0.43386250465505743, + "grad_norm": 0.486328125, + "learning_rate": 0.00018354844265849307, + "loss": 2.6035, + "step": 1520 + }, + { + "epoch": 0.4341479405133831, + "grad_norm": 0.5234375, + "learning_rate": 0.0001834159890289984, + "loss": 2.6119, + "step": 1521 + }, + { + "epoch": 0.43443337637170887, + "grad_norm": 0.4921875, + "learning_rate": 0.00018328350797771018, + "loss": 2.6295, + "step": 1522 + }, + { + "epoch": 0.43471881223003456, + "grad_norm": 0.515625, + "learning_rate": 0.0001831509996133447, + "loss": 2.5938, + "step": 1523 + }, + { + "epoch": 0.43500424808836025, + "grad_norm": 0.50390625, + "learning_rate": 0.000183018464044641, + "loss": 2.6174, + "step": 1524 + }, + { + "epoch": 0.43528968394668593, + "grad_norm": 0.486328125, + "learning_rate": 0.00018288590138036028, + "loss": 2.6166, + "step": 1525 + }, + { + "epoch": 0.4355751198050116, + "grad_norm": 0.50390625, + "learning_rate": 0.00018275331172928587, + "loss": 2.6148, + "step": 1526 + }, + { + "epoch": 0.4358605556633373, + "grad_norm": 0.498046875, + "learning_rate": 0.00018262069520022338, + "loss": 2.5973, + "step": 1527 + }, + { + "epoch": 0.436145991521663, + "grad_norm": 0.51953125, + "learning_rate": 0.00018248805190200048, + "loss": 2.5931, + "step": 1528 + }, + { + "epoch": 0.4364314273799887, + "grad_norm": 0.51171875, + "learning_rate": 0.0001823553819434668, + "loss": 2.5844, + "step": 1529 + }, + { + "epoch": 0.43671686323831443, + "grad_norm": 0.515625, + "learning_rate": 0.00018222268543349374, + "loss": 2.6187, + "step": 1530 + }, + { + "epoch": 0.4370022990966401, + "grad_norm": 0.5234375, + "learning_rate": 0.00018208996248097458, + "loss": 2.5919, + "step": 1531 + }, + { + "epoch": 0.4372877349549658, + "grad_norm": 0.53125, + "learning_rate": 0.00018195721319482438, + "loss": 2.6071, + "step": 1532 + }, + { + "epoch": 0.4375731708132915, + "grad_norm": 0.515625, + "learning_rate": 0.00018182443768397963, + "loss": 2.6021, + "step": 1533 + }, + { + "epoch": 0.4378586066716172, + "grad_norm": 0.5546875, + "learning_rate": 0.00018169163605739845, + "loss": 2.5948, + "step": 1534 + }, + { + "epoch": 0.4381440425299429, + "grad_norm": 0.53515625, + "learning_rate": 0.0001815588084240604, + "loss": 2.6145, + "step": 1535 + }, + { + "epoch": 0.43842947838826857, + "grad_norm": 0.55859375, + "learning_rate": 0.0001814259548929663, + "loss": 2.5996, + "step": 1536 + }, + { + "epoch": 0.43871491424659426, + "grad_norm": 0.55078125, + "learning_rate": 0.0001812930755731383, + "loss": 2.6011, + "step": 1537 + }, + { + "epoch": 0.43900035010492, + "grad_norm": 0.56640625, + "learning_rate": 0.00018116017057361972, + "loss": 2.6185, + "step": 1538 + }, + { + "epoch": 0.4392857859632457, + "grad_norm": 0.609375, + "learning_rate": 0.00018102724000347488, + "loss": 2.5761, + "step": 1539 + }, + { + "epoch": 0.4395712218215714, + "grad_norm": 0.51953125, + "learning_rate": 0.00018089428397178908, + "loss": 2.6193, + "step": 1540 + }, + { + "epoch": 0.43985665767989707, + "grad_norm": 0.494140625, + "learning_rate": 0.0001807613025876687, + "loss": 2.6, + "step": 1541 + }, + { + "epoch": 0.44014209353822276, + "grad_norm": 0.53515625, + "learning_rate": 0.00018062829596024067, + "loss": 2.5964, + "step": 1542 + }, + { + "epoch": 0.44042752939654845, + "grad_norm": 0.5234375, + "learning_rate": 0.0001804952641986527, + "loss": 2.5884, + "step": 1543 + }, + { + "epoch": 0.44071296525487413, + "grad_norm": 0.5, + "learning_rate": 0.00018036220741207332, + "loss": 2.5893, + "step": 1544 + }, + { + "epoch": 0.4409984011131998, + "grad_norm": 0.484375, + "learning_rate": 0.0001802291257096914, + "loss": 2.5842, + "step": 1545 + }, + { + "epoch": 0.44128383697152557, + "grad_norm": 0.498046875, + "learning_rate": 0.00018009601920071624, + "loss": 2.6291, + "step": 1546 + }, + { + "epoch": 0.44156927282985126, + "grad_norm": 0.470703125, + "learning_rate": 0.00017996288799437758, + "loss": 2.6153, + "step": 1547 + }, + { + "epoch": 0.44185470868817694, + "grad_norm": 0.50390625, + "learning_rate": 0.00017982973219992548, + "loss": 2.5752, + "step": 1548 + }, + { + "epoch": 0.44214014454650263, + "grad_norm": 0.482421875, + "learning_rate": 0.00017969655192663007, + "loss": 2.5856, + "step": 1549 + }, + { + "epoch": 0.4424255804048283, + "grad_norm": 0.48828125, + "learning_rate": 0.00017956334728378158, + "loss": 2.5989, + "step": 1550 + }, + { + "epoch": 0.442711016263154, + "grad_norm": 0.5078125, + "learning_rate": 0.00017943011838069021, + "loss": 2.621, + "step": 1551 + }, + { + "epoch": 0.4429964521214797, + "grad_norm": 0.51171875, + "learning_rate": 0.0001792968653266863, + "loss": 2.6003, + "step": 1552 + }, + { + "epoch": 0.4432818879798054, + "grad_norm": 0.52734375, + "learning_rate": 0.00017916358823111972, + "loss": 2.6094, + "step": 1553 + }, + { + "epoch": 0.44356732383813113, + "grad_norm": 0.490234375, + "learning_rate": 0.0001790302872033601, + "loss": 2.6167, + "step": 1554 + }, + { + "epoch": 0.4438527596964568, + "grad_norm": 0.494140625, + "learning_rate": 0.00017889696235279693, + "loss": 2.576, + "step": 1555 + }, + { + "epoch": 0.4441381955547825, + "grad_norm": 0.478515625, + "learning_rate": 0.00017876361378883903, + "loss": 2.5914, + "step": 1556 + }, + { + "epoch": 0.4444236314131082, + "grad_norm": 0.515625, + "learning_rate": 0.00017863024162091478, + "loss": 2.591, + "step": 1557 + }, + { + "epoch": 0.4447090672714339, + "grad_norm": 0.482421875, + "learning_rate": 0.0001784968459584719, + "loss": 2.6002, + "step": 1558 + }, + { + "epoch": 0.4449945031297596, + "grad_norm": 0.482421875, + "learning_rate": 0.00017836342691097742, + "loss": 2.5826, + "step": 1559 + }, + { + "epoch": 0.44527993898808527, + "grad_norm": 0.4921875, + "learning_rate": 0.0001782299845879175, + "loss": 2.5972, + "step": 1560 + }, + { + "epoch": 0.44556537484641096, + "grad_norm": 0.48046875, + "learning_rate": 0.00017809651909879749, + "loss": 2.5984, + "step": 1561 + }, + { + "epoch": 0.4458508107047367, + "grad_norm": 0.5, + "learning_rate": 0.00017796303055314164, + "loss": 2.5803, + "step": 1562 + }, + { + "epoch": 0.4461362465630624, + "grad_norm": 0.515625, + "learning_rate": 0.00017782951906049316, + "loss": 2.6079, + "step": 1563 + }, + { + "epoch": 0.4464216824213881, + "grad_norm": 0.486328125, + "learning_rate": 0.00017769598473041422, + "loss": 2.5998, + "step": 1564 + }, + { + "epoch": 0.44670711827971377, + "grad_norm": 0.5234375, + "learning_rate": 0.00017756242767248557, + "loss": 2.5921, + "step": 1565 + }, + { + "epoch": 0.44699255413803946, + "grad_norm": 0.52734375, + "learning_rate": 0.0001774288479963066, + "loss": 2.5799, + "step": 1566 + }, + { + "epoch": 0.44727798999636514, + "grad_norm": 0.55078125, + "learning_rate": 0.00017729524581149537, + "loss": 2.639, + "step": 1567 + }, + { + "epoch": 0.44756342585469083, + "grad_norm": 0.52734375, + "learning_rate": 0.00017716162122768836, + "loss": 2.613, + "step": 1568 + }, + { + "epoch": 0.4478488617130165, + "grad_norm": 0.5, + "learning_rate": 0.0001770279743545405, + "loss": 2.6075, + "step": 1569 + }, + { + "epoch": 0.44813429757134227, + "grad_norm": 0.515625, + "learning_rate": 0.00017689430530172482, + "loss": 2.5834, + "step": 1570 + }, + { + "epoch": 0.44841973342966795, + "grad_norm": 0.51171875, + "learning_rate": 0.00017676061417893274, + "loss": 2.607, + "step": 1571 + }, + { + "epoch": 0.44870516928799364, + "grad_norm": 0.51171875, + "learning_rate": 0.00017662690109587382, + "loss": 2.5996, + "step": 1572 + }, + { + "epoch": 0.44899060514631933, + "grad_norm": 0.51171875, + "learning_rate": 0.00017649316616227538, + "loss": 2.5941, + "step": 1573 + }, + { + "epoch": 0.449276041004645, + "grad_norm": 0.5234375, + "learning_rate": 0.0001763594094878829, + "loss": 2.5961, + "step": 1574 + }, + { + "epoch": 0.4495614768629707, + "grad_norm": 0.50390625, + "learning_rate": 0.00017622563118245972, + "loss": 2.5923, + "step": 1575 + }, + { + "epoch": 0.4498469127212964, + "grad_norm": 0.53515625, + "learning_rate": 0.00017609183135578675, + "loss": 2.5981, + "step": 1576 + }, + { + "epoch": 0.4501323485796221, + "grad_norm": 0.5234375, + "learning_rate": 0.00017595801011766274, + "loss": 2.6039, + "step": 1577 + }, + { + "epoch": 0.45041778443794783, + "grad_norm": 0.51171875, + "learning_rate": 0.00017582416757790388, + "loss": 2.587, + "step": 1578 + }, + { + "epoch": 0.4507032202962735, + "grad_norm": 0.52734375, + "learning_rate": 0.0001756903038463439, + "loss": 2.5729, + "step": 1579 + }, + { + "epoch": 0.4509886561545992, + "grad_norm": 0.47265625, + "learning_rate": 0.0001755564190328339, + "loss": 2.6028, + "step": 1580 + }, + { + "epoch": 0.4512740920129249, + "grad_norm": 0.53125, + "learning_rate": 0.00017542251324724237, + "loss": 2.5784, + "step": 1581 + }, + { + "epoch": 0.4515595278712506, + "grad_norm": 0.50390625, + "learning_rate": 0.00017528858659945486, + "loss": 2.6228, + "step": 1582 + }, + { + "epoch": 0.4518449637295763, + "grad_norm": 0.51171875, + "learning_rate": 0.00017515463919937413, + "loss": 2.6181, + "step": 1583 + }, + { + "epoch": 0.45213039958790197, + "grad_norm": 0.498046875, + "learning_rate": 0.00017502067115691996, + "loss": 2.5915, + "step": 1584 + }, + { + "epoch": 0.45241583544622765, + "grad_norm": 0.462890625, + "learning_rate": 0.0001748866825820291, + "loss": 2.6104, + "step": 1585 + }, + { + "epoch": 0.45270127130455334, + "grad_norm": 0.49609375, + "learning_rate": 0.00017475267358465504, + "loss": 2.5913, + "step": 1586 + }, + { + "epoch": 0.4529867071628791, + "grad_norm": 0.484375, + "learning_rate": 0.00017461864427476814, + "loss": 2.6017, + "step": 1587 + }, + { + "epoch": 0.4532721430212048, + "grad_norm": 0.55078125, + "learning_rate": 0.0001744845947623554, + "loss": 2.6186, + "step": 1588 + }, + { + "epoch": 0.45355757887953047, + "grad_norm": 0.5078125, + "learning_rate": 0.00017435052515742038, + "loss": 2.5961, + "step": 1589 + }, + { + "epoch": 0.45384301473785615, + "grad_norm": 0.54296875, + "learning_rate": 0.00017421643556998312, + "loss": 2.5929, + "step": 1590 + }, + { + "epoch": 0.45412845059618184, + "grad_norm": 0.50390625, + "learning_rate": 0.0001740823261100801, + "loss": 2.5902, + "step": 1591 + }, + { + "epoch": 0.45441388645450753, + "grad_norm": 0.5390625, + "learning_rate": 0.0001739481968877641, + "loss": 2.5817, + "step": 1592 + }, + { + "epoch": 0.4546993223128332, + "grad_norm": 0.50390625, + "learning_rate": 0.00017381404801310404, + "loss": 2.5856, + "step": 1593 + }, + { + "epoch": 0.4549847581711589, + "grad_norm": 0.515625, + "learning_rate": 0.00017367987959618505, + "loss": 2.5742, + "step": 1594 + }, + { + "epoch": 0.45527019402948465, + "grad_norm": 0.51953125, + "learning_rate": 0.00017354569174710834, + "loss": 2.5916, + "step": 1595 + }, + { + "epoch": 0.45555562988781034, + "grad_norm": 0.515625, + "learning_rate": 0.00017341148457599096, + "loss": 2.5964, + "step": 1596 + }, + { + "epoch": 0.45584106574613603, + "grad_norm": 0.5625, + "learning_rate": 0.00017327725819296576, + "loss": 2.597, + "step": 1597 + }, + { + "epoch": 0.4561265016044617, + "grad_norm": 0.48046875, + "learning_rate": 0.0001731430127081816, + "loss": 2.5921, + "step": 1598 + }, + { + "epoch": 0.4564119374627874, + "grad_norm": 0.5390625, + "learning_rate": 0.00017300874823180282, + "loss": 2.61, + "step": 1599 + }, + { + "epoch": 0.4566973733211131, + "grad_norm": 0.5703125, + "learning_rate": 0.00017287446487400935, + "loss": 2.5985, + "step": 1600 + }, + { + "epoch": 0.4569828091794388, + "grad_norm": 0.494140625, + "learning_rate": 0.00017274016274499665, + "loss": 2.6079, + "step": 1601 + }, + { + "epoch": 0.4572682450377645, + "grad_norm": 0.55078125, + "learning_rate": 0.00017260584195497567, + "loss": 2.5797, + "step": 1602 + }, + { + "epoch": 0.4575536808960902, + "grad_norm": 0.494140625, + "learning_rate": 0.00017247150261417255, + "loss": 2.6106, + "step": 1603 + }, + { + "epoch": 0.4578391167544159, + "grad_norm": 0.53125, + "learning_rate": 0.0001723371448328287, + "loss": 2.5846, + "step": 1604 + }, + { + "epoch": 0.4581245526127416, + "grad_norm": 0.55078125, + "learning_rate": 0.00017220276872120072, + "loss": 2.5763, + "step": 1605 + }, + { + "epoch": 0.4584099884710673, + "grad_norm": 0.52734375, + "learning_rate": 0.00017206837438956004, + "loss": 2.5878, + "step": 1606 + }, + { + "epoch": 0.458695424329393, + "grad_norm": 0.5390625, + "learning_rate": 0.00017193396194819328, + "loss": 2.5931, + "step": 1607 + }, + { + "epoch": 0.45898086018771866, + "grad_norm": 0.5078125, + "learning_rate": 0.00017179953150740193, + "loss": 2.5835, + "step": 1608 + }, + { + "epoch": 0.45926629604604435, + "grad_norm": 0.5625, + "learning_rate": 0.000171665083177502, + "loss": 2.6094, + "step": 1609 + }, + { + "epoch": 0.45955173190437004, + "grad_norm": 0.515625, + "learning_rate": 0.00017153061706882443, + "loss": 2.6024, + "step": 1610 + }, + { + "epoch": 0.4598371677626958, + "grad_norm": 0.53515625, + "learning_rate": 0.0001713961332917146, + "loss": 2.618, + "step": 1611 + }, + { + "epoch": 0.4601226036210215, + "grad_norm": 0.5625, + "learning_rate": 0.00017126163195653254, + "loss": 2.6115, + "step": 1612 + }, + { + "epoch": 0.46040803947934716, + "grad_norm": 0.515625, + "learning_rate": 0.00017112711317365247, + "loss": 2.5529, + "step": 1613 + }, + { + "epoch": 0.46069347533767285, + "grad_norm": 0.52734375, + "learning_rate": 0.00017099257705346314, + "loss": 2.6051, + "step": 1614 + }, + { + "epoch": 0.46097891119599854, + "grad_norm": 0.56640625, + "learning_rate": 0.00017085802370636743, + "loss": 2.6073, + "step": 1615 + }, + { + "epoch": 0.46126434705432423, + "grad_norm": 0.494140625, + "learning_rate": 0.00017072345324278232, + "loss": 2.5969, + "step": 1616 + }, + { + "epoch": 0.4615497829126499, + "grad_norm": 0.54296875, + "learning_rate": 0.00017058886577313892, + "loss": 2.6139, + "step": 1617 + }, + { + "epoch": 0.4618352187709756, + "grad_norm": 0.50390625, + "learning_rate": 0.00017045426140788224, + "loss": 2.5696, + "step": 1618 + }, + { + "epoch": 0.46212065462930135, + "grad_norm": 0.53125, + "learning_rate": 0.00017031964025747117, + "loss": 2.5835, + "step": 1619 + }, + { + "epoch": 0.46240609048762704, + "grad_norm": 0.515625, + "learning_rate": 0.00017018500243237838, + "loss": 2.5731, + "step": 1620 + }, + { + "epoch": 0.46269152634595273, + "grad_norm": 0.482421875, + "learning_rate": 0.00017005034804309027, + "loss": 2.6096, + "step": 1621 + }, + { + "epoch": 0.4629769622042784, + "grad_norm": 0.494140625, + "learning_rate": 0.00016991567720010668, + "loss": 2.6063, + "step": 1622 + }, + { + "epoch": 0.4632623980626041, + "grad_norm": 0.453125, + "learning_rate": 0.00016978099001394112, + "loss": 2.6002, + "step": 1623 + }, + { + "epoch": 0.4635478339209298, + "grad_norm": 0.5234375, + "learning_rate": 0.00016964628659512046, + "loss": 2.5955, + "step": 1624 + }, + { + "epoch": 0.4638332697792555, + "grad_norm": 0.482421875, + "learning_rate": 0.00016951156705418484, + "loss": 2.5975, + "step": 1625 + }, + { + "epoch": 0.4641187056375812, + "grad_norm": 0.609375, + "learning_rate": 0.00016937683150168765, + "loss": 2.5944, + "step": 1626 + }, + { + "epoch": 0.4644041414959069, + "grad_norm": 0.9921875, + "learning_rate": 0.0001692420800481955, + "loss": 2.5734, + "step": 1627 + }, + { + "epoch": 0.4646895773542326, + "grad_norm": 0.73828125, + "learning_rate": 0.000169107312804288, + "loss": 2.6232, + "step": 1628 + }, + { + "epoch": 0.4649750132125583, + "grad_norm": 0.8046875, + "learning_rate": 0.0001689725298805576, + "loss": 2.5985, + "step": 1629 + }, + { + "epoch": 0.465260449070884, + "grad_norm": 1.5234375, + "learning_rate": 0.00016883773138760976, + "loss": 2.578, + "step": 1630 + }, + { + "epoch": 0.4655458849292097, + "grad_norm": 0.83203125, + "learning_rate": 0.00016870291743606273, + "loss": 2.5762, + "step": 1631 + }, + { + "epoch": 0.46583132078753536, + "grad_norm": 0.8359375, + "learning_rate": 0.0001685680881365474, + "loss": 2.5714, + "step": 1632 + }, + { + "epoch": 0.46611675664586105, + "grad_norm": 0.8671875, + "learning_rate": 0.00016843324359970712, + "loss": 2.5721, + "step": 1633 + }, + { + "epoch": 0.46640219250418674, + "grad_norm": 0.7109375, + "learning_rate": 0.00016829838393619796, + "loss": 2.6092, + "step": 1634 + }, + { + "epoch": 0.4666876283625125, + "grad_norm": 0.6875, + "learning_rate": 0.00016816350925668837, + "loss": 2.5973, + "step": 1635 + }, + { + "epoch": 0.4669730642208382, + "grad_norm": 0.85546875, + "learning_rate": 0.000168028619671859, + "loss": 2.6003, + "step": 1636 + }, + { + "epoch": 0.46725850007916386, + "grad_norm": 0.6171875, + "learning_rate": 0.00016789371529240271, + "loss": 2.612, + "step": 1637 + }, + { + "epoch": 0.46754393593748955, + "grad_norm": 0.79296875, + "learning_rate": 0.0001677587962290248, + "loss": 2.5903, + "step": 1638 + }, + { + "epoch": 0.46782937179581524, + "grad_norm": 0.58984375, + "learning_rate": 0.00016762386259244224, + "loss": 2.5791, + "step": 1639 + }, + { + "epoch": 0.46811480765414093, + "grad_norm": 0.65234375, + "learning_rate": 0.0001674889144933842, + "loss": 2.6103, + "step": 1640 + }, + { + "epoch": 0.4684002435124666, + "grad_norm": 0.63671875, + "learning_rate": 0.00016735395204259162, + "loss": 2.5757, + "step": 1641 + }, + { + "epoch": 0.4686856793707923, + "grad_norm": 0.60546875, + "learning_rate": 0.00016721897535081724, + "loss": 2.5925, + "step": 1642 + }, + { + "epoch": 0.46897111522911805, + "grad_norm": 0.66796875, + "learning_rate": 0.00016708398452882552, + "loss": 2.6213, + "step": 1643 + }, + { + "epoch": 0.46925655108744374, + "grad_norm": 0.5546875, + "learning_rate": 0.00016694897968739245, + "loss": 2.5948, + "step": 1644 + }, + { + "epoch": 0.46954198694576943, + "grad_norm": 0.6015625, + "learning_rate": 0.0001668139609373056, + "loss": 2.5849, + "step": 1645 + }, + { + "epoch": 0.4698274228040951, + "grad_norm": 0.62109375, + "learning_rate": 0.00016667892838936389, + "loss": 2.6265, + "step": 1646 + }, + { + "epoch": 0.4701128586624208, + "grad_norm": 0.57421875, + "learning_rate": 0.00016654388215437755, + "loss": 2.6059, + "step": 1647 + }, + { + "epoch": 0.4703982945207465, + "grad_norm": 0.5078125, + "learning_rate": 0.0001664088223431682, + "loss": 2.6298, + "step": 1648 + }, + { + "epoch": 0.4706837303790722, + "grad_norm": 0.5390625, + "learning_rate": 0.0001662737490665683, + "loss": 2.6045, + "step": 1649 + }, + { + "epoch": 0.4709691662373979, + "grad_norm": 0.52734375, + "learning_rate": 0.0001661386624354217, + "loss": 2.6153, + "step": 1650 + }, + { + "epoch": 0.4712546020957236, + "grad_norm": 0.498046875, + "learning_rate": 0.00016600356256058296, + "loss": 2.5974, + "step": 1651 + }, + { + "epoch": 0.4715400379540493, + "grad_norm": 0.515625, + "learning_rate": 0.00016586844955291768, + "loss": 2.5846, + "step": 1652 + }, + { + "epoch": 0.471825473812375, + "grad_norm": 0.515625, + "learning_rate": 0.00016573332352330203, + "loss": 2.5888, + "step": 1653 + }, + { + "epoch": 0.4721109096707007, + "grad_norm": 0.515625, + "learning_rate": 0.00016559818458262304, + "loss": 2.5823, + "step": 1654 + }, + { + "epoch": 0.4723963455290264, + "grad_norm": 0.50390625, + "learning_rate": 0.00016546303284177837, + "loss": 2.5973, + "step": 1655 + }, + { + "epoch": 0.47268178138735206, + "grad_norm": 0.51953125, + "learning_rate": 0.000165327868411676, + "loss": 2.5688, + "step": 1656 + }, + { + "epoch": 0.47296721724567775, + "grad_norm": 0.51171875, + "learning_rate": 0.00016519269140323443, + "loss": 2.584, + "step": 1657 + }, + { + "epoch": 0.47325265310400344, + "grad_norm": 0.51953125, + "learning_rate": 0.00016505750192738253, + "loss": 2.5829, + "step": 1658 + }, + { + "epoch": 0.47353808896232913, + "grad_norm": 0.50390625, + "learning_rate": 0.00016492230009505928, + "loss": 2.5653, + "step": 1659 + }, + { + "epoch": 0.4738235248206549, + "grad_norm": 0.5078125, + "learning_rate": 0.0001647870860172139, + "loss": 2.6081, + "step": 1660 + }, + { + "epoch": 0.47410896067898056, + "grad_norm": 0.49609375, + "learning_rate": 0.00016465185980480562, + "loss": 2.5732, + "step": 1661 + }, + { + "epoch": 0.47439439653730625, + "grad_norm": 0.53515625, + "learning_rate": 0.0001645166215688036, + "loss": 2.5776, + "step": 1662 + }, + { + "epoch": 0.47467983239563194, + "grad_norm": 0.5078125, + "learning_rate": 0.000164381371420187, + "loss": 2.5898, + "step": 1663 + }, + { + "epoch": 0.47496526825395763, + "grad_norm": 0.53515625, + "learning_rate": 0.00016424610946994453, + "loss": 2.6061, + "step": 1664 + }, + { + "epoch": 0.4752507041122833, + "grad_norm": 0.50390625, + "learning_rate": 0.00016411083582907476, + "loss": 2.5932, + "step": 1665 + }, + { + "epoch": 0.475536139970609, + "grad_norm": 0.478515625, + "learning_rate": 0.0001639755506085858, + "loss": 2.5887, + "step": 1666 + }, + { + "epoch": 0.4758215758289347, + "grad_norm": 0.484375, + "learning_rate": 0.0001638402539194953, + "loss": 2.597, + "step": 1667 + }, + { + "epoch": 0.47610701168726044, + "grad_norm": 0.50390625, + "learning_rate": 0.00016370494587283026, + "loss": 2.5624, + "step": 1668 + }, + { + "epoch": 0.47639244754558613, + "grad_norm": 0.44921875, + "learning_rate": 0.00016356962657962693, + "loss": 2.571, + "step": 1669 + }, + { + "epoch": 0.4766778834039118, + "grad_norm": 0.51171875, + "learning_rate": 0.00016343429615093104, + "loss": 2.5971, + "step": 1670 + }, + { + "epoch": 0.4769633192622375, + "grad_norm": 0.462890625, + "learning_rate": 0.00016329895469779725, + "loss": 2.5999, + "step": 1671 + }, + { + "epoch": 0.4772487551205632, + "grad_norm": 0.48046875, + "learning_rate": 0.00016316360233128933, + "loss": 2.5949, + "step": 1672 + }, + { + "epoch": 0.4775341909788889, + "grad_norm": 0.46484375, + "learning_rate": 0.0001630282391624799, + "loss": 2.599, + "step": 1673 + }, + { + "epoch": 0.4778196268372146, + "grad_norm": 0.52734375, + "learning_rate": 0.00016289286530245064, + "loss": 2.5983, + "step": 1674 + }, + { + "epoch": 0.47810506269554026, + "grad_norm": 0.4921875, + "learning_rate": 0.00016275748086229193, + "loss": 2.5857, + "step": 1675 + }, + { + "epoch": 0.478390498553866, + "grad_norm": 0.44140625, + "learning_rate": 0.0001626220859531027, + "loss": 2.5945, + "step": 1676 + }, + { + "epoch": 0.4786759344121917, + "grad_norm": 0.494140625, + "learning_rate": 0.00016248668068599066, + "loss": 2.6017, + "step": 1677 + }, + { + "epoch": 0.4789613702705174, + "grad_norm": 0.46484375, + "learning_rate": 0.0001623512651720719, + "loss": 2.6014, + "step": 1678 + }, + { + "epoch": 0.4792468061288431, + "grad_norm": 0.486328125, + "learning_rate": 0.00016221583952247097, + "loss": 2.5712, + "step": 1679 + }, + { + "epoch": 0.47953224198716876, + "grad_norm": 0.458984375, + "learning_rate": 0.00016208040384832072, + "loss": 2.5989, + "step": 1680 + }, + { + "epoch": 0.47981767784549445, + "grad_norm": 0.48828125, + "learning_rate": 0.00016194495826076224, + "loss": 2.5548, + "step": 1681 + }, + { + "epoch": 0.48010311370382014, + "grad_norm": 0.47265625, + "learning_rate": 0.0001618095028709447, + "loss": 2.5883, + "step": 1682 + }, + { + "epoch": 0.48038854956214583, + "grad_norm": 0.9296875, + "learning_rate": 0.0001616740377900254, + "loss": 2.6151, + "step": 1683 + }, + { + "epoch": 0.4806739854204716, + "grad_norm": 0.50390625, + "learning_rate": 0.00016153856312916957, + "loss": 2.5432, + "step": 1684 + }, + { + "epoch": 0.48095942127879726, + "grad_norm": 0.671875, + "learning_rate": 0.00016140307899955024, + "loss": 2.5735, + "step": 1685 + }, + { + "epoch": 0.48124485713712295, + "grad_norm": 0.671875, + "learning_rate": 0.00016126758551234825, + "loss": 2.5766, + "step": 1686 + }, + { + "epoch": 0.48153029299544864, + "grad_norm": 0.578125, + "learning_rate": 0.0001611320827787522, + "loss": 2.5697, + "step": 1687 + }, + { + "epoch": 0.4818157288537743, + "grad_norm": 0.5859375, + "learning_rate": 0.00016099657090995812, + "loss": 2.5824, + "step": 1688 + }, + { + "epoch": 0.4821011647121, + "grad_norm": 0.50390625, + "learning_rate": 0.0001608610500171696, + "loss": 2.5885, + "step": 1689 + }, + { + "epoch": 0.4823866005704257, + "grad_norm": 0.5078125, + "learning_rate": 0.00016072552021159775, + "loss": 2.5984, + "step": 1690 + }, + { + "epoch": 0.4826720364287514, + "grad_norm": 0.55078125, + "learning_rate": 0.0001605899816044608, + "loss": 2.6025, + "step": 1691 + }, + { + "epoch": 0.48295747228707714, + "grad_norm": 0.5, + "learning_rate": 0.00016045443430698437, + "loss": 2.6107, + "step": 1692 + }, + { + "epoch": 0.4832429081454028, + "grad_norm": 0.52734375, + "learning_rate": 0.00016031887843040104, + "loss": 2.5978, + "step": 1693 + }, + { + "epoch": 0.4835283440037285, + "grad_norm": 0.53515625, + "learning_rate": 0.00016018331408595063, + "loss": 2.5974, + "step": 1694 + }, + { + "epoch": 0.4838137798620542, + "grad_norm": 0.53515625, + "learning_rate": 0.00016004774138487983, + "loss": 2.6113, + "step": 1695 + }, + { + "epoch": 0.4840992157203799, + "grad_norm": 0.51171875, + "learning_rate": 0.00015991216043844208, + "loss": 2.5766, + "step": 1696 + }, + { + "epoch": 0.4843846515787056, + "grad_norm": 0.5, + "learning_rate": 0.00015977657135789764, + "loss": 2.5671, + "step": 1697 + }, + { + "epoch": 0.48467008743703127, + "grad_norm": 0.54296875, + "learning_rate": 0.0001596409742545136, + "loss": 2.6138, + "step": 1698 + }, + { + "epoch": 0.48495552329535696, + "grad_norm": 0.45703125, + "learning_rate": 0.00015950536923956346, + "loss": 2.5962, + "step": 1699 + }, + { + "epoch": 0.4852409591536827, + "grad_norm": 0.50390625, + "learning_rate": 0.00015936975642432725, + "loss": 2.5992, + "step": 1700 + }, + { + "epoch": 0.4855263950120084, + "grad_norm": 0.50390625, + "learning_rate": 0.00015923413592009144, + "loss": 2.5925, + "step": 1701 + }, + { + "epoch": 0.4858118308703341, + "grad_norm": 0.462890625, + "learning_rate": 0.00015909850783814874, + "loss": 2.5949, + "step": 1702 + }, + { + "epoch": 0.48609726672865977, + "grad_norm": 0.515625, + "learning_rate": 0.00015896287228979816, + "loss": 2.5671, + "step": 1703 + }, + { + "epoch": 0.48638270258698546, + "grad_norm": 0.5, + "learning_rate": 0.00015882722938634477, + "loss": 2.5684, + "step": 1704 + }, + { + "epoch": 0.48666813844531115, + "grad_norm": 0.482421875, + "learning_rate": 0.00015869157923909978, + "loss": 2.59, + "step": 1705 + }, + { + "epoch": 0.48695357430363684, + "grad_norm": 0.515625, + "learning_rate": 0.00015855592195938018, + "loss": 2.587, + "step": 1706 + }, + { + "epoch": 0.4872390101619625, + "grad_norm": 0.46875, + "learning_rate": 0.00015842025765850894, + "loss": 2.5942, + "step": 1707 + }, + { + "epoch": 0.48752444602028827, + "grad_norm": 0.48046875, + "learning_rate": 0.00015828458644781478, + "loss": 2.604, + "step": 1708 + }, + { + "epoch": 0.48780988187861396, + "grad_norm": 0.44140625, + "learning_rate": 0.00015814890843863204, + "loss": 2.5862, + "step": 1709 + }, + { + "epoch": 0.48809531773693965, + "grad_norm": 0.486328125, + "learning_rate": 0.00015801322374230068, + "loss": 2.5813, + "step": 1710 + }, + { + "epoch": 0.48838075359526534, + "grad_norm": 0.4453125, + "learning_rate": 0.00015787753247016608, + "loss": 2.5988, + "step": 1711 + }, + { + "epoch": 0.488666189453591, + "grad_norm": 0.470703125, + "learning_rate": 0.00015774183473357914, + "loss": 2.5786, + "step": 1712 + }, + { + "epoch": 0.4889516253119167, + "grad_norm": 0.48828125, + "learning_rate": 0.00015760613064389595, + "loss": 2.5616, + "step": 1713 + }, + { + "epoch": 0.4892370611702424, + "grad_norm": 0.484375, + "learning_rate": 0.00015747042031247785, + "loss": 2.5828, + "step": 1714 + }, + { + "epoch": 0.4895224970285681, + "grad_norm": 0.47265625, + "learning_rate": 0.0001573347038506914, + "loss": 2.565, + "step": 1715 + }, + { + "epoch": 0.48980793288689384, + "grad_norm": 0.46875, + "learning_rate": 0.00015719898136990794, + "loss": 2.5747, + "step": 1716 + }, + { + "epoch": 0.4900933687452195, + "grad_norm": 0.466796875, + "learning_rate": 0.00015706325298150403, + "loss": 2.5779, + "step": 1717 + }, + { + "epoch": 0.4903788046035452, + "grad_norm": 0.4921875, + "learning_rate": 0.00015692751879686095, + "loss": 2.5682, + "step": 1718 + }, + { + "epoch": 0.4906642404618709, + "grad_norm": 0.48828125, + "learning_rate": 0.00015679177892736468, + "loss": 2.5675, + "step": 1719 + }, + { + "epoch": 0.4909496763201966, + "grad_norm": 0.4765625, + "learning_rate": 0.00015665603348440595, + "loss": 2.5824, + "step": 1720 + }, + { + "epoch": 0.4912351121785223, + "grad_norm": 0.52734375, + "learning_rate": 0.0001565202825793801, + "loss": 2.5604, + "step": 1721 + }, + { + "epoch": 0.49152054803684797, + "grad_norm": 0.5, + "learning_rate": 0.0001563845263236868, + "loss": 2.5612, + "step": 1722 + }, + { + "epoch": 0.49180598389517366, + "grad_norm": 0.5234375, + "learning_rate": 0.0001562487648287303, + "loss": 2.6068, + "step": 1723 + }, + { + "epoch": 0.4920914197534994, + "grad_norm": 0.47265625, + "learning_rate": 0.000156112998205919, + "loss": 2.5695, + "step": 1724 + }, + { + "epoch": 0.4923768556118251, + "grad_norm": 0.51953125, + "learning_rate": 0.00015597722656666554, + "loss": 2.5929, + "step": 1725 + }, + { + "epoch": 0.4926622914701508, + "grad_norm": 0.515625, + "learning_rate": 0.00015584145002238677, + "loss": 2.5656, + "step": 1726 + }, + { + "epoch": 0.49294772732847647, + "grad_norm": 0.482421875, + "learning_rate": 0.00015570566868450343, + "loss": 2.5609, + "step": 1727 + }, + { + "epoch": 0.49323316318680216, + "grad_norm": 0.5234375, + "learning_rate": 0.00015556988266444028, + "loss": 2.5954, + "step": 1728 + }, + { + "epoch": 0.49351859904512785, + "grad_norm": 0.48828125, + "learning_rate": 0.0001554340920736259, + "loss": 2.5662, + "step": 1729 + }, + { + "epoch": 0.49380403490345354, + "grad_norm": 0.4921875, + "learning_rate": 0.00015529829702349266, + "loss": 2.6074, + "step": 1730 + }, + { + "epoch": 0.4940894707617792, + "grad_norm": 0.53515625, + "learning_rate": 0.0001551624976254765, + "loss": 2.593, + "step": 1731 + }, + { + "epoch": 0.4943749066201049, + "grad_norm": 0.5, + "learning_rate": 0.00015502669399101695, + "loss": 2.6089, + "step": 1732 + }, + { + "epoch": 0.49466034247843066, + "grad_norm": 0.5, + "learning_rate": 0.00015489088623155716, + "loss": 2.5917, + "step": 1733 + }, + { + "epoch": 0.49494577833675635, + "grad_norm": 0.53515625, + "learning_rate": 0.00015475507445854343, + "loss": 2.566, + "step": 1734 + }, + { + "epoch": 0.49523121419508204, + "grad_norm": 0.5, + "learning_rate": 0.00015461925878342556, + "loss": 2.5928, + "step": 1735 + }, + { + "epoch": 0.4955166500534077, + "grad_norm": 0.55859375, + "learning_rate": 0.00015448343931765635, + "loss": 2.5719, + "step": 1736 + }, + { + "epoch": 0.4958020859117334, + "grad_norm": 0.50390625, + "learning_rate": 0.000154347616172692, + "loss": 2.5568, + "step": 1737 + }, + { + "epoch": 0.4960875217700591, + "grad_norm": 0.49609375, + "learning_rate": 0.00015421178945999143, + "loss": 2.5836, + "step": 1738 + }, + { + "epoch": 0.4963729576283848, + "grad_norm": 0.498046875, + "learning_rate": 0.00015407595929101665, + "loss": 2.5957, + "step": 1739 + }, + { + "epoch": 0.4966583934867105, + "grad_norm": 0.4609375, + "learning_rate": 0.0001539401257772324, + "loss": 2.6004, + "step": 1740 + }, + { + "epoch": 0.4969438293450362, + "grad_norm": 0.51171875, + "learning_rate": 0.0001538042890301064, + "loss": 2.5866, + "step": 1741 + }, + { + "epoch": 0.4972292652033619, + "grad_norm": 0.478515625, + "learning_rate": 0.00015366844916110868, + "loss": 2.5744, + "step": 1742 + }, + { + "epoch": 0.4975147010616876, + "grad_norm": 0.474609375, + "learning_rate": 0.00015353260628171212, + "loss": 2.6165, + "step": 1743 + }, + { + "epoch": 0.4978001369200133, + "grad_norm": 0.5, + "learning_rate": 0.0001533967605033919, + "loss": 2.5778, + "step": 1744 + }, + { + "epoch": 0.498085572778339, + "grad_norm": 0.423828125, + "learning_rate": 0.00015326091193762568, + "loss": 2.5816, + "step": 1745 + }, + { + "epoch": 0.49837100863666467, + "grad_norm": 0.5078125, + "learning_rate": 0.00015312506069589335, + "loss": 2.6123, + "step": 1746 + }, + { + "epoch": 0.49865644449499036, + "grad_norm": 0.458984375, + "learning_rate": 0.00015298920688967702, + "loss": 2.5834, + "step": 1747 + }, + { + "epoch": 0.49894188035331605, + "grad_norm": 0.51953125, + "learning_rate": 0.00015285335063046089, + "loss": 2.5644, + "step": 1748 + }, + { + "epoch": 0.4992273162116418, + "grad_norm": 0.50390625, + "learning_rate": 0.00015271749202973116, + "loss": 2.5766, + "step": 1749 + }, + { + "epoch": 0.4995127520699675, + "grad_norm": 0.52734375, + "learning_rate": 0.000152581631198976, + "loss": 2.5764, + "step": 1750 + }, + { + "epoch": 0.4995127520699675, + "eval_loss": 2.4794108867645264, + "eval_runtime": 6003.2988, + "eval_samples_per_second": 10.708, + "eval_steps_per_second": 10.708, + "step": 1750 + }, + { + "epoch": 0.49979818792829317, + "grad_norm": 0.462890625, + "learning_rate": 0.00015244576824968538, + "loss": 2.5287, + "step": 1751 + }, + { + "epoch": 0.5000836237866189, + "grad_norm": 0.486328125, + "learning_rate": 0.000152309903293351, + "loss": 2.5808, + "step": 1752 + }, + { + "epoch": 0.5003690596449446, + "grad_norm": 0.4609375, + "learning_rate": 0.00015217403644146626, + "loss": 2.6024, + "step": 1753 + }, + { + "epoch": 0.5006544955032702, + "grad_norm": 0.50390625, + "learning_rate": 0.000152038167805526, + "loss": 2.6072, + "step": 1754 + }, + { + "epoch": 0.500939931361596, + "grad_norm": 0.50390625, + "learning_rate": 0.00015190229749702664, + "loss": 2.5662, + "step": 1755 + }, + { + "epoch": 0.5012253672199216, + "grad_norm": 0.56640625, + "learning_rate": 0.00015176642562746587, + "loss": 2.5949, + "step": 1756 + }, + { + "epoch": 0.5015108030782474, + "grad_norm": 0.5546875, + "learning_rate": 0.0001516305523083428, + "loss": 2.5952, + "step": 1757 + }, + { + "epoch": 0.501796238936573, + "grad_norm": 0.58984375, + "learning_rate": 0.00015149467765115764, + "loss": 2.5761, + "step": 1758 + }, + { + "epoch": 0.5020816747948987, + "grad_norm": 0.51953125, + "learning_rate": 0.0001513588017674117, + "loss": 2.5776, + "step": 1759 + }, + { + "epoch": 0.5023671106532244, + "grad_norm": 0.5078125, + "learning_rate": 0.0001512229247686072, + "loss": 2.5913, + "step": 1760 + }, + { + "epoch": 0.5026525465115501, + "grad_norm": 0.498046875, + "learning_rate": 0.00015108704676624756, + "loss": 2.6031, + "step": 1761 + }, + { + "epoch": 0.5029379823698759, + "grad_norm": 0.55859375, + "learning_rate": 0.00015095116787183668, + "loss": 2.5457, + "step": 1762 + }, + { + "epoch": 0.5032234182282015, + "grad_norm": 0.51171875, + "learning_rate": 0.0001508152881968795, + "loss": 2.5609, + "step": 1763 + }, + { + "epoch": 0.5035088540865272, + "grad_norm": 0.498046875, + "learning_rate": 0.00015067940785288135, + "loss": 2.6055, + "step": 1764 + }, + { + "epoch": 0.5037942899448529, + "grad_norm": 0.51171875, + "learning_rate": 0.0001505435269513482, + "loss": 2.597, + "step": 1765 + }, + { + "epoch": 0.5040797258031786, + "grad_norm": 0.458984375, + "learning_rate": 0.00015040764560378658, + "loss": 2.5936, + "step": 1766 + }, + { + "epoch": 0.5043651616615042, + "grad_norm": 0.578125, + "learning_rate": 0.00015027176392170326, + "loss": 2.5551, + "step": 1767 + }, + { + "epoch": 0.50465059751983, + "grad_norm": 0.51953125, + "learning_rate": 0.00015013588201660529, + "loss": 2.5881, + "step": 1768 + }, + { + "epoch": 0.5049360333781557, + "grad_norm": 0.515625, + "learning_rate": 0.00015, + "loss": 2.5998, + "step": 1769 + }, + { + "epoch": 0.5052214692364814, + "grad_norm": 0.451171875, + "learning_rate": 0.0001498641179833947, + "loss": 2.58, + "step": 1770 + }, + { + "epoch": 0.5055069050948071, + "grad_norm": 0.53515625, + "learning_rate": 0.00014972823607829674, + "loss": 2.5808, + "step": 1771 + }, + { + "epoch": 0.5057923409531327, + "grad_norm": 0.455078125, + "learning_rate": 0.00014959235439621343, + "loss": 2.575, + "step": 1772 + }, + { + "epoch": 0.5060777768114585, + "grad_norm": 0.5234375, + "learning_rate": 0.00014945647304865175, + "loss": 2.5957, + "step": 1773 + }, + { + "epoch": 0.5063632126697841, + "grad_norm": 0.5, + "learning_rate": 0.00014932059214711868, + "loss": 2.5831, + "step": 1774 + }, + { + "epoch": 0.5066486485281099, + "grad_norm": 0.59375, + "learning_rate": 0.00014918471180312053, + "loss": 2.5812, + "step": 1775 + }, + { + "epoch": 0.5069340843864355, + "grad_norm": 0.52734375, + "learning_rate": 0.0001490488321281633, + "loss": 2.5925, + "step": 1776 + }, + { + "epoch": 0.5072195202447612, + "grad_norm": 0.494140625, + "learning_rate": 0.00014891295323375244, + "loss": 2.5934, + "step": 1777 + }, + { + "epoch": 0.507504956103087, + "grad_norm": 0.5078125, + "learning_rate": 0.0001487770752313928, + "loss": 2.5923, + "step": 1778 + }, + { + "epoch": 0.5077903919614126, + "grad_norm": 0.466796875, + "learning_rate": 0.00014864119823258836, + "loss": 2.5811, + "step": 1779 + }, + { + "epoch": 0.5080758278197384, + "grad_norm": 0.490234375, + "learning_rate": 0.00014850532234884236, + "loss": 2.5726, + "step": 1780 + }, + { + "epoch": 0.508361263678064, + "grad_norm": 0.53515625, + "learning_rate": 0.00014836944769165716, + "loss": 2.57, + "step": 1781 + }, + { + "epoch": 0.5086466995363897, + "grad_norm": 0.51171875, + "learning_rate": 0.0001482335743725341, + "loss": 2.584, + "step": 1782 + }, + { + "epoch": 0.5089321353947154, + "grad_norm": 0.48828125, + "learning_rate": 0.00014809770250297336, + "loss": 2.5903, + "step": 1783 + }, + { + "epoch": 0.5092175712530411, + "grad_norm": 0.51171875, + "learning_rate": 0.000147961832194474, + "loss": 2.6009, + "step": 1784 + }, + { + "epoch": 0.5095030071113669, + "grad_norm": 0.478515625, + "learning_rate": 0.00014782596355853374, + "loss": 2.6057, + "step": 1785 + }, + { + "epoch": 0.5097884429696925, + "grad_norm": 0.49609375, + "learning_rate": 0.00014769009670664897, + "loss": 2.5661, + "step": 1786 + }, + { + "epoch": 0.5100738788280182, + "grad_norm": 0.447265625, + "learning_rate": 0.0001475542317503146, + "loss": 2.5986, + "step": 1787 + }, + { + "epoch": 0.5103593146863439, + "grad_norm": 0.5234375, + "learning_rate": 0.000147418368801024, + "loss": 2.5837, + "step": 1788 + }, + { + "epoch": 0.5106447505446696, + "grad_norm": 0.474609375, + "learning_rate": 0.0001472825079702688, + "loss": 2.5738, + "step": 1789 + }, + { + "epoch": 0.5109301864029953, + "grad_norm": 0.48828125, + "learning_rate": 0.0001471466493695391, + "loss": 2.5681, + "step": 1790 + }, + { + "epoch": 0.511215622261321, + "grad_norm": 0.466796875, + "learning_rate": 0.00014701079311032298, + "loss": 2.5817, + "step": 1791 + }, + { + "epoch": 0.5115010581196466, + "grad_norm": 0.48828125, + "learning_rate": 0.00014687493930410663, + "loss": 2.5813, + "step": 1792 + }, + { + "epoch": 0.5117864939779724, + "grad_norm": 0.478515625, + "learning_rate": 0.00014673908806237432, + "loss": 2.5893, + "step": 1793 + }, + { + "epoch": 0.5120719298362981, + "grad_norm": 0.498046875, + "learning_rate": 0.0001466032394966081, + "loss": 2.6104, + "step": 1794 + }, + { + "epoch": 0.5123573656946238, + "grad_norm": 0.5078125, + "learning_rate": 0.0001464673937182879, + "loss": 2.6105, + "step": 1795 + }, + { + "epoch": 0.5126428015529495, + "grad_norm": 0.494140625, + "learning_rate": 0.00014633155083889132, + "loss": 2.6015, + "step": 1796 + }, + { + "epoch": 0.5129282374112751, + "grad_norm": 0.5, + "learning_rate": 0.00014619571096989359, + "loss": 2.578, + "step": 1797 + }, + { + "epoch": 0.5132136732696009, + "grad_norm": 0.47265625, + "learning_rate": 0.00014605987422276756, + "loss": 2.5755, + "step": 1798 + }, + { + "epoch": 0.5134991091279265, + "grad_norm": 0.50390625, + "learning_rate": 0.00014592404070898335, + "loss": 2.5822, + "step": 1799 + }, + { + "epoch": 0.5137845449862523, + "grad_norm": 0.474609375, + "learning_rate": 0.00014578821054000854, + "loss": 2.5701, + "step": 1800 + }, + { + "epoch": 0.514069980844578, + "grad_norm": 0.51953125, + "learning_rate": 0.000145652383827308, + "loss": 2.5652, + "step": 1801 + }, + { + "epoch": 0.5143554167029036, + "grad_norm": 0.5078125, + "learning_rate": 0.00014551656068234362, + "loss": 2.5589, + "step": 1802 + }, + { + "epoch": 0.5146408525612294, + "grad_norm": 0.4609375, + "learning_rate": 0.00014538074121657447, + "loss": 2.5928, + "step": 1803 + }, + { + "epoch": 0.514926288419555, + "grad_norm": 0.48046875, + "learning_rate": 0.00014524492554145657, + "loss": 2.5787, + "step": 1804 + }, + { + "epoch": 0.5152117242778808, + "grad_norm": 0.474609375, + "learning_rate": 0.0001451091137684428, + "loss": 2.6031, + "step": 1805 + }, + { + "epoch": 0.5154971601362064, + "grad_norm": 0.478515625, + "learning_rate": 0.00014497330600898297, + "loss": 2.6, + "step": 1806 + }, + { + "epoch": 0.5157825959945321, + "grad_norm": 0.4609375, + "learning_rate": 0.0001448375023745235, + "loss": 2.5984, + "step": 1807 + }, + { + "epoch": 0.5160680318528578, + "grad_norm": 0.45703125, + "learning_rate": 0.00014470170297650734, + "loss": 2.5901, + "step": 1808 + }, + { + "epoch": 0.5163534677111835, + "grad_norm": 0.5078125, + "learning_rate": 0.00014456590792637407, + "loss": 2.555, + "step": 1809 + }, + { + "epoch": 0.5166389035695093, + "grad_norm": 0.4453125, + "learning_rate": 0.0001444301173355597, + "loss": 2.5745, + "step": 1810 + }, + { + "epoch": 0.5169243394278349, + "grad_norm": 0.4765625, + "learning_rate": 0.0001442943313154966, + "loss": 2.5377, + "step": 1811 + }, + { + "epoch": 0.5172097752861606, + "grad_norm": 0.455078125, + "learning_rate": 0.00014415854997761328, + "loss": 2.5617, + "step": 1812 + }, + { + "epoch": 0.5174952111444863, + "grad_norm": 0.46875, + "learning_rate": 0.0001440227734333344, + "loss": 2.5987, + "step": 1813 + }, + { + "epoch": 0.517780647002812, + "grad_norm": 0.44921875, + "learning_rate": 0.000143887001794081, + "loss": 2.5686, + "step": 1814 + }, + { + "epoch": 0.5180660828611376, + "grad_norm": 0.427734375, + "learning_rate": 0.00014375123517126968, + "loss": 2.5911, + "step": 1815 + }, + { + "epoch": 0.5183515187194634, + "grad_norm": 0.43359375, + "learning_rate": 0.00014361547367631317, + "loss": 2.5687, + "step": 1816 + }, + { + "epoch": 0.518636954577789, + "grad_norm": 0.447265625, + "learning_rate": 0.00014347971742061989, + "loss": 2.6098, + "step": 1817 + }, + { + "epoch": 0.5189223904361148, + "grad_norm": 0.474609375, + "learning_rate": 0.00014334396651559405, + "loss": 2.5648, + "step": 1818 + }, + { + "epoch": 0.5192078262944405, + "grad_norm": 0.40625, + "learning_rate": 0.00014320822107263532, + "loss": 2.583, + "step": 1819 + }, + { + "epoch": 0.5194932621527661, + "grad_norm": 0.50390625, + "learning_rate": 0.00014307248120313908, + "loss": 2.5763, + "step": 1820 + }, + { + "epoch": 0.5197786980110919, + "grad_norm": 0.44140625, + "learning_rate": 0.00014293674701849595, + "loss": 2.5835, + "step": 1821 + }, + { + "epoch": 0.5200641338694175, + "grad_norm": 0.478515625, + "learning_rate": 0.00014280101863009203, + "loss": 2.5738, + "step": 1822 + }, + { + "epoch": 0.5203495697277433, + "grad_norm": 0.447265625, + "learning_rate": 0.0001426652961493086, + "loss": 2.5956, + "step": 1823 + }, + { + "epoch": 0.5206350055860689, + "grad_norm": 0.5390625, + "learning_rate": 0.00014252957968752212, + "loss": 2.5553, + "step": 1824 + }, + { + "epoch": 0.5209204414443946, + "grad_norm": 0.484375, + "learning_rate": 0.00014239386935610405, + "loss": 2.5876, + "step": 1825 + }, + { + "epoch": 0.5212058773027204, + "grad_norm": 0.53515625, + "learning_rate": 0.00014225816526642086, + "loss": 2.592, + "step": 1826 + }, + { + "epoch": 0.521491313161046, + "grad_norm": 0.4609375, + "learning_rate": 0.00014212246752983392, + "loss": 2.5715, + "step": 1827 + }, + { + "epoch": 0.5217767490193718, + "grad_norm": 0.4765625, + "learning_rate": 0.00014198677625769937, + "loss": 2.5873, + "step": 1828 + }, + { + "epoch": 0.5220621848776974, + "grad_norm": 0.46875, + "learning_rate": 0.0001418510915613679, + "loss": 2.5964, + "step": 1829 + }, + { + "epoch": 0.5223476207360231, + "grad_norm": 0.470703125, + "learning_rate": 0.0001417154135521852, + "loss": 2.5588, + "step": 1830 + }, + { + "epoch": 0.5226330565943488, + "grad_norm": 0.478515625, + "learning_rate": 0.00014157974234149103, + "loss": 2.5652, + "step": 1831 + }, + { + "epoch": 0.5229184924526745, + "grad_norm": 0.46484375, + "learning_rate": 0.00014144407804061982, + "loss": 2.6088, + "step": 1832 + }, + { + "epoch": 0.5232039283110002, + "grad_norm": 0.494140625, + "learning_rate": 0.00014130842076090023, + "loss": 2.5847, + "step": 1833 + }, + { + "epoch": 0.5234893641693259, + "grad_norm": 0.439453125, + "learning_rate": 0.0001411727706136552, + "loss": 2.5664, + "step": 1834 + }, + { + "epoch": 0.5237748000276516, + "grad_norm": 0.458984375, + "learning_rate": 0.00014103712771020187, + "loss": 2.5667, + "step": 1835 + }, + { + "epoch": 0.5240602358859773, + "grad_norm": 0.447265625, + "learning_rate": 0.00014090149216185123, + "loss": 2.5789, + "step": 1836 + }, + { + "epoch": 0.524345671744303, + "grad_norm": 0.55859375, + "learning_rate": 0.00014076586407990856, + "loss": 2.5775, + "step": 1837 + }, + { + "epoch": 0.5246311076026287, + "grad_norm": 0.48046875, + "learning_rate": 0.00014063024357567275, + "loss": 2.5817, + "step": 1838 + }, + { + "epoch": 0.5249165434609544, + "grad_norm": 0.453125, + "learning_rate": 0.00014049463076043652, + "loss": 2.6099, + "step": 1839 + }, + { + "epoch": 0.52520197931928, + "grad_norm": 0.453125, + "learning_rate": 0.00014035902574548637, + "loss": 2.5589, + "step": 1840 + }, + { + "epoch": 0.5254874151776058, + "grad_norm": 0.44921875, + "learning_rate": 0.00014022342864210234, + "loss": 2.5884, + "step": 1841 + }, + { + "epoch": 0.5257728510359315, + "grad_norm": 0.458984375, + "learning_rate": 0.00014008783956155797, + "loss": 2.606, + "step": 1842 + }, + { + "epoch": 0.5260582868942572, + "grad_norm": 0.474609375, + "learning_rate": 0.0001399522586151202, + "loss": 2.5597, + "step": 1843 + }, + { + "epoch": 0.5263437227525829, + "grad_norm": 0.478515625, + "learning_rate": 0.00013981668591404932, + "loss": 2.5987, + "step": 1844 + }, + { + "epoch": 0.5266291586109085, + "grad_norm": 0.48046875, + "learning_rate": 0.00013968112156959893, + "loss": 2.5708, + "step": 1845 + }, + { + "epoch": 0.5269145944692343, + "grad_norm": 0.43359375, + "learning_rate": 0.00013954556569301563, + "loss": 2.5932, + "step": 1846 + }, + { + "epoch": 0.5272000303275599, + "grad_norm": 0.478515625, + "learning_rate": 0.0001394100183955392, + "loss": 2.6022, + "step": 1847 + }, + { + "epoch": 0.5274854661858857, + "grad_norm": 0.43359375, + "learning_rate": 0.00013927447978840225, + "loss": 2.5497, + "step": 1848 + }, + { + "epoch": 0.5277709020442113, + "grad_norm": 0.515625, + "learning_rate": 0.00013913894998283038, + "loss": 2.5742, + "step": 1849 + }, + { + "epoch": 0.528056337902537, + "grad_norm": 0.486328125, + "learning_rate": 0.00013900342909004188, + "loss": 2.624, + "step": 1850 + }, + { + "epoch": 0.5283417737608628, + "grad_norm": 0.5, + "learning_rate": 0.00013886791722124783, + "loss": 2.5814, + "step": 1851 + }, + { + "epoch": 0.5286272096191884, + "grad_norm": 0.44921875, + "learning_rate": 0.00013873241448765167, + "loss": 2.5622, + "step": 1852 + }, + { + "epoch": 0.5289126454775142, + "grad_norm": 0.474609375, + "learning_rate": 0.00013859692100044973, + "loss": 2.5673, + "step": 1853 + }, + { + "epoch": 0.5291980813358398, + "grad_norm": 0.4765625, + "learning_rate": 0.00013846143687083043, + "loss": 2.5758, + "step": 1854 + }, + { + "epoch": 0.5294835171941655, + "grad_norm": 0.4765625, + "learning_rate": 0.00013832596220997458, + "loss": 2.5934, + "step": 1855 + }, + { + "epoch": 0.5297689530524912, + "grad_norm": 0.455078125, + "learning_rate": 0.0001381904971290553, + "loss": 2.5529, + "step": 1856 + }, + { + "epoch": 0.5300543889108169, + "grad_norm": 0.447265625, + "learning_rate": 0.00013805504173923776, + "loss": 2.5794, + "step": 1857 + }, + { + "epoch": 0.5303398247691427, + "grad_norm": 0.466796875, + "learning_rate": 0.0001379195961516793, + "loss": 2.5519, + "step": 1858 + }, + { + "epoch": 0.5306252606274683, + "grad_norm": 0.482421875, + "learning_rate": 0.00013778416047752903, + "loss": 2.5965, + "step": 1859 + }, + { + "epoch": 0.530910696485794, + "grad_norm": 0.455078125, + "learning_rate": 0.0001376487348279281, + "loss": 2.5725, + "step": 1860 + }, + { + "epoch": 0.5311961323441197, + "grad_norm": 0.484375, + "learning_rate": 0.0001375133193140093, + "loss": 2.5638, + "step": 1861 + }, + { + "epoch": 0.5314815682024454, + "grad_norm": 0.46875, + "learning_rate": 0.00013737791404689728, + "loss": 2.5935, + "step": 1862 + }, + { + "epoch": 0.531767004060771, + "grad_norm": 0.470703125, + "learning_rate": 0.00013724251913770807, + "loss": 2.6033, + "step": 1863 + }, + { + "epoch": 0.5320524399190968, + "grad_norm": 0.44921875, + "learning_rate": 0.00013710713469754934, + "loss": 2.5982, + "step": 1864 + }, + { + "epoch": 0.5323378757774224, + "grad_norm": 0.5078125, + "learning_rate": 0.00013697176083752008, + "loss": 2.5374, + "step": 1865 + }, + { + "epoch": 0.5326233116357482, + "grad_norm": 0.443359375, + "learning_rate": 0.0001368363976687107, + "loss": 2.5623, + "step": 1866 + }, + { + "epoch": 0.5329087474940739, + "grad_norm": 0.494140625, + "learning_rate": 0.00013670104530220275, + "loss": 2.574, + "step": 1867 + }, + { + "epoch": 0.5331941833523995, + "grad_norm": 0.45703125, + "learning_rate": 0.0001365657038490689, + "loss": 2.5917, + "step": 1868 + }, + { + "epoch": 0.5334796192107253, + "grad_norm": 0.490234375, + "learning_rate": 0.000136430373420373, + "loss": 2.5844, + "step": 1869 + }, + { + "epoch": 0.5337650550690509, + "grad_norm": 0.419921875, + "learning_rate": 0.00013629505412716974, + "loss": 2.6019, + "step": 1870 + }, + { + "epoch": 0.5340504909273767, + "grad_norm": 0.478515625, + "learning_rate": 0.0001361597460805047, + "loss": 2.5718, + "step": 1871 + }, + { + "epoch": 0.5343359267857023, + "grad_norm": 0.46484375, + "learning_rate": 0.0001360244493914142, + "loss": 2.5665, + "step": 1872 + }, + { + "epoch": 0.534621362644028, + "grad_norm": 0.45703125, + "learning_rate": 0.0001358891641709252, + "loss": 2.5814, + "step": 1873 + }, + { + "epoch": 0.5349067985023538, + "grad_norm": 0.478515625, + "learning_rate": 0.00013575389053005547, + "loss": 2.5467, + "step": 1874 + }, + { + "epoch": 0.5351922343606794, + "grad_norm": 0.66015625, + "learning_rate": 0.00013561862857981304, + "loss": 2.5697, + "step": 1875 + }, + { + "epoch": 0.5354776702190052, + "grad_norm": 0.55078125, + "learning_rate": 0.00013548337843119634, + "loss": 2.5856, + "step": 1876 + }, + { + "epoch": 0.5357631060773308, + "grad_norm": 0.5625, + "learning_rate": 0.00013534814019519438, + "loss": 2.5662, + "step": 1877 + }, + { + "epoch": 0.5360485419356565, + "grad_norm": 0.5625, + "learning_rate": 0.00013521291398278608, + "loss": 2.5983, + "step": 1878 + }, + { + "epoch": 0.5363339777939822, + "grad_norm": 0.57421875, + "learning_rate": 0.00013507769990494072, + "loss": 2.5893, + "step": 1879 + }, + { + "epoch": 0.5366194136523079, + "grad_norm": 0.671875, + "learning_rate": 0.00013494249807261748, + "loss": 2.5852, + "step": 1880 + }, + { + "epoch": 0.5369048495106336, + "grad_norm": 0.55078125, + "learning_rate": 0.00013480730859676557, + "loss": 2.5667, + "step": 1881 + }, + { + "epoch": 0.5371902853689593, + "grad_norm": 0.82421875, + "learning_rate": 0.00013467213158832402, + "loss": 2.5674, + "step": 1882 + }, + { + "epoch": 0.537475721227285, + "grad_norm": 0.5078125, + "learning_rate": 0.00013453696715822163, + "loss": 2.5955, + "step": 1883 + }, + { + "epoch": 0.5377611570856107, + "grad_norm": 0.67578125, + "learning_rate": 0.0001344018154173769, + "loss": 2.5681, + "step": 1884 + }, + { + "epoch": 0.5380465929439364, + "grad_norm": 0.55859375, + "learning_rate": 0.00013426667647669795, + "loss": 2.6069, + "step": 1885 + }, + { + "epoch": 0.538332028802262, + "grad_norm": 0.609375, + "learning_rate": 0.00013413155044708232, + "loss": 2.5682, + "step": 1886 + }, + { + "epoch": 0.5386174646605878, + "grad_norm": 0.53125, + "learning_rate": 0.00013399643743941701, + "loss": 2.5783, + "step": 1887 + }, + { + "epoch": 0.5389029005189134, + "grad_norm": 0.59375, + "learning_rate": 0.0001338613375645783, + "loss": 2.5545, + "step": 1888 + }, + { + "epoch": 0.5391883363772392, + "grad_norm": 0.57421875, + "learning_rate": 0.00013372625093343167, + "loss": 2.5683, + "step": 1889 + }, + { + "epoch": 0.5394737722355648, + "grad_norm": 0.52734375, + "learning_rate": 0.00013359117765683183, + "loss": 2.5635, + "step": 1890 + }, + { + "epoch": 0.5397592080938906, + "grad_norm": 0.546875, + "learning_rate": 0.00013345611784562245, + "loss": 2.5851, + "step": 1891 + }, + { + "epoch": 0.5400446439522163, + "grad_norm": 0.578125, + "learning_rate": 0.0001333210716106361, + "loss": 2.5822, + "step": 1892 + }, + { + "epoch": 0.5403300798105419, + "grad_norm": 0.46484375, + "learning_rate": 0.00013318603906269436, + "loss": 2.587, + "step": 1893 + }, + { + "epoch": 0.5406155156688677, + "grad_norm": 0.62890625, + "learning_rate": 0.00013305102031260755, + "loss": 2.5887, + "step": 1894 + }, + { + "epoch": 0.5409009515271933, + "grad_norm": 0.443359375, + "learning_rate": 0.00013291601547117448, + "loss": 2.5895, + "step": 1895 + }, + { + "epoch": 0.541186387385519, + "grad_norm": 0.56640625, + "learning_rate": 0.00013278102464918276, + "loss": 2.5535, + "step": 1896 + }, + { + "epoch": 0.5414718232438447, + "grad_norm": 0.447265625, + "learning_rate": 0.00013264604795740838, + "loss": 2.5836, + "step": 1897 + }, + { + "epoch": 0.5417572591021704, + "grad_norm": 0.5390625, + "learning_rate": 0.00013251108550661585, + "loss": 2.5933, + "step": 1898 + }, + { + "epoch": 0.5420426949604962, + "grad_norm": 0.45703125, + "learning_rate": 0.0001323761374075578, + "loss": 2.5745, + "step": 1899 + }, + { + "epoch": 0.5423281308188218, + "grad_norm": 0.490234375, + "learning_rate": 0.0001322412037709752, + "loss": 2.5632, + "step": 1900 + }, + { + "epoch": 0.5426135666771476, + "grad_norm": 0.5, + "learning_rate": 0.00013210628470759726, + "loss": 2.5525, + "step": 1901 + }, + { + "epoch": 0.5428990025354732, + "grad_norm": 0.5078125, + "learning_rate": 0.000131971380328141, + "loss": 2.6075, + "step": 1902 + }, + { + "epoch": 0.5431844383937989, + "grad_norm": 0.447265625, + "learning_rate": 0.0001318364907433116, + "loss": 2.5948, + "step": 1903 + }, + { + "epoch": 0.5434698742521246, + "grad_norm": 0.53125, + "learning_rate": 0.00013170161606380204, + "loss": 2.6039, + "step": 1904 + }, + { + "epoch": 0.5437553101104503, + "grad_norm": 0.453125, + "learning_rate": 0.00013156675640029289, + "loss": 2.5849, + "step": 1905 + }, + { + "epoch": 0.5440407459687759, + "grad_norm": 0.546875, + "learning_rate": 0.00013143191186345266, + "loss": 2.5805, + "step": 1906 + }, + { + "epoch": 0.5443261818271017, + "grad_norm": 0.431640625, + "learning_rate": 0.00013129708256393724, + "loss": 2.5466, + "step": 1907 + }, + { + "epoch": 0.5446116176854274, + "grad_norm": 0.515625, + "learning_rate": 0.00013116226861239019, + "loss": 2.5889, + "step": 1908 + }, + { + "epoch": 0.5448970535437531, + "grad_norm": 0.45703125, + "learning_rate": 0.00013102747011944238, + "loss": 2.5744, + "step": 1909 + }, + { + "epoch": 0.5451824894020788, + "grad_norm": 0.484375, + "learning_rate": 0.000130892687195712, + "loss": 2.5408, + "step": 1910 + }, + { + "epoch": 0.5454679252604044, + "grad_norm": 0.470703125, + "learning_rate": 0.00013075791995180447, + "loss": 2.5915, + "step": 1911 + }, + { + "epoch": 0.5457533611187302, + "grad_norm": 0.439453125, + "learning_rate": 0.00013062316849831232, + "loss": 2.5739, + "step": 1912 + }, + { + "epoch": 0.5460387969770558, + "grad_norm": 0.458984375, + "learning_rate": 0.00013048843294581516, + "loss": 2.5662, + "step": 1913 + }, + { + "epoch": 0.5463242328353816, + "grad_norm": 0.447265625, + "learning_rate": 0.00013035371340487954, + "loss": 2.5486, + "step": 1914 + }, + { + "epoch": 0.5466096686937073, + "grad_norm": 0.47265625, + "learning_rate": 0.00013021900998605885, + "loss": 2.5508, + "step": 1915 + }, + { + "epoch": 0.5468951045520329, + "grad_norm": 0.45703125, + "learning_rate": 0.0001300843227998933, + "loss": 2.5886, + "step": 1916 + }, + { + "epoch": 0.5471805404103587, + "grad_norm": 0.455078125, + "learning_rate": 0.00012994965195690976, + "loss": 2.5568, + "step": 1917 + }, + { + "epoch": 0.5474659762686843, + "grad_norm": 0.443359375, + "learning_rate": 0.0001298149975676216, + "loss": 2.5776, + "step": 1918 + }, + { + "epoch": 0.5477514121270101, + "grad_norm": 0.4296875, + "learning_rate": 0.0001296803597425288, + "loss": 2.5829, + "step": 1919 + }, + { + "epoch": 0.5480368479853357, + "grad_norm": 0.455078125, + "learning_rate": 0.00012954573859211773, + "loss": 2.5828, + "step": 1920 + }, + { + "epoch": 0.5483222838436614, + "grad_norm": 0.408203125, + "learning_rate": 0.00012941113422686108, + "loss": 2.5825, + "step": 1921 + }, + { + "epoch": 0.5486077197019871, + "grad_norm": 0.474609375, + "learning_rate": 0.0001292765467572177, + "loss": 2.5706, + "step": 1922 + }, + { + "epoch": 0.5488931555603128, + "grad_norm": 0.435546875, + "learning_rate": 0.00012914197629363257, + "loss": 2.546, + "step": 1923 + }, + { + "epoch": 0.5491785914186386, + "grad_norm": 0.4609375, + "learning_rate": 0.00012900742294653684, + "loss": 2.6005, + "step": 1924 + }, + { + "epoch": 0.5494640272769642, + "grad_norm": 0.494140625, + "learning_rate": 0.0001288728868263475, + "loss": 2.5664, + "step": 1925 + }, + { + "epoch": 0.5497494631352899, + "grad_norm": 0.439453125, + "learning_rate": 0.00012873836804346746, + "loss": 2.5662, + "step": 1926 + }, + { + "epoch": 0.5500348989936156, + "grad_norm": 0.486328125, + "learning_rate": 0.00012860386670828538, + "loss": 2.5691, + "step": 1927 + }, + { + "epoch": 0.5503203348519413, + "grad_norm": 0.458984375, + "learning_rate": 0.0001284693829311756, + "loss": 2.556, + "step": 1928 + }, + { + "epoch": 0.550605770710267, + "grad_norm": 0.494140625, + "learning_rate": 0.00012833491682249802, + "loss": 2.5723, + "step": 1929 + }, + { + "epoch": 0.5508912065685927, + "grad_norm": 0.439453125, + "learning_rate": 0.0001282004684925981, + "loss": 2.5932, + "step": 1930 + }, + { + "epoch": 0.5511766424269184, + "grad_norm": 0.52734375, + "learning_rate": 0.00012806603805180666, + "loss": 2.5586, + "step": 1931 + }, + { + "epoch": 0.5514620782852441, + "grad_norm": 0.4453125, + "learning_rate": 0.00012793162561043994, + "loss": 2.6137, + "step": 1932 + }, + { + "epoch": 0.5517475141435698, + "grad_norm": 0.474609375, + "learning_rate": 0.0001277972312787993, + "loss": 2.5864, + "step": 1933 + }, + { + "epoch": 0.5520329500018955, + "grad_norm": 0.44140625, + "learning_rate": 0.0001276628551671713, + "loss": 2.5684, + "step": 1934 + }, + { + "epoch": 0.5523183858602212, + "grad_norm": 0.470703125, + "learning_rate": 0.00012752849738582745, + "loss": 2.5812, + "step": 1935 + }, + { + "epoch": 0.5526038217185468, + "grad_norm": 0.44921875, + "learning_rate": 0.0001273941580450243, + "loss": 2.5645, + "step": 1936 + }, + { + "epoch": 0.5528892575768726, + "grad_norm": 0.49609375, + "learning_rate": 0.00012725983725500332, + "loss": 2.5597, + "step": 1937 + }, + { + "epoch": 0.5531746934351982, + "grad_norm": 0.43359375, + "learning_rate": 0.0001271255351259907, + "loss": 2.5787, + "step": 1938 + }, + { + "epoch": 0.553460129293524, + "grad_norm": 0.466796875, + "learning_rate": 0.00012699125176819716, + "loss": 2.5669, + "step": 1939 + }, + { + "epoch": 0.5537455651518497, + "grad_norm": 0.78125, + "learning_rate": 0.00012685698729181837, + "loss": 2.5653, + "step": 1940 + }, + { + "epoch": 0.5540310010101753, + "grad_norm": 0.48828125, + "learning_rate": 0.0001267227418070342, + "loss": 2.5713, + "step": 1941 + }, + { + "epoch": 0.5543164368685011, + "grad_norm": 0.46484375, + "learning_rate": 0.00012658851542400907, + "loss": 2.5643, + "step": 1942 + }, + { + "epoch": 0.5546018727268267, + "grad_norm": 0.431640625, + "learning_rate": 0.00012645430825289163, + "loss": 2.5536, + "step": 1943 + }, + { + "epoch": 0.5548873085851525, + "grad_norm": 0.53515625, + "learning_rate": 0.00012632012040381493, + "loss": 2.5869, + "step": 1944 + }, + { + "epoch": 0.5551727444434781, + "grad_norm": 0.44921875, + "learning_rate": 0.00012618595198689596, + "loss": 2.5626, + "step": 1945 + }, + { + "epoch": 0.5554581803018038, + "grad_norm": 0.484375, + "learning_rate": 0.0001260518031122359, + "loss": 2.5907, + "step": 1946 + }, + { + "epoch": 0.5557436161601295, + "grad_norm": 0.431640625, + "learning_rate": 0.00012591767388991985, + "loss": 2.5852, + "step": 1947 + }, + { + "epoch": 0.5560290520184552, + "grad_norm": 0.458984375, + "learning_rate": 0.00012578356443001683, + "loss": 2.557, + "step": 1948 + }, + { + "epoch": 0.556314487876781, + "grad_norm": 0.453125, + "learning_rate": 0.0001256494748425796, + "loss": 2.581, + "step": 1949 + }, + { + "epoch": 0.5565999237351066, + "grad_norm": 0.451171875, + "learning_rate": 0.00012551540523764458, + "loss": 2.5861, + "step": 1950 + }, + { + "epoch": 0.5568853595934323, + "grad_norm": 0.49609375, + "learning_rate": 0.00012538135572523183, + "loss": 2.5701, + "step": 1951 + }, + { + "epoch": 0.557170795451758, + "grad_norm": 0.482421875, + "learning_rate": 0.00012524732641534496, + "loss": 2.5348, + "step": 1952 + }, + { + "epoch": 0.5574562313100837, + "grad_norm": 0.458984375, + "learning_rate": 0.00012511331741797092, + "loss": 2.5597, + "step": 1953 + }, + { + "epoch": 0.5577416671684093, + "grad_norm": 0.4921875, + "learning_rate": 0.00012497932884308002, + "loss": 2.5808, + "step": 1954 + }, + { + "epoch": 0.5580271030267351, + "grad_norm": 0.439453125, + "learning_rate": 0.00012484536080062581, + "loss": 2.5469, + "step": 1955 + }, + { + "epoch": 0.5583125388850608, + "grad_norm": 0.54296875, + "learning_rate": 0.00012471141340054508, + "loss": 2.5758, + "step": 1956 + }, + { + "epoch": 0.5585979747433865, + "grad_norm": 0.43359375, + "learning_rate": 0.00012457748675275763, + "loss": 2.5819, + "step": 1957 + }, + { + "epoch": 0.5588834106017122, + "grad_norm": 0.494140625, + "learning_rate": 0.00012444358096716607, + "loss": 2.5616, + "step": 1958 + }, + { + "epoch": 0.5591688464600378, + "grad_norm": 0.43359375, + "learning_rate": 0.0001243096961536561, + "loss": 2.5502, + "step": 1959 + }, + { + "epoch": 0.5594542823183636, + "grad_norm": 0.421875, + "learning_rate": 0.00012417583242209612, + "loss": 2.5667, + "step": 1960 + }, + { + "epoch": 0.5597397181766892, + "grad_norm": 0.478515625, + "learning_rate": 0.00012404198988233729, + "loss": 2.5661, + "step": 1961 + }, + { + "epoch": 0.560025154035015, + "grad_norm": 0.447265625, + "learning_rate": 0.00012390816864421325, + "loss": 2.5755, + "step": 1962 + }, + { + "epoch": 0.5603105898933406, + "grad_norm": 0.466796875, + "learning_rate": 0.00012377436881754025, + "loss": 2.5679, + "step": 1963 + }, + { + "epoch": 0.5605960257516663, + "grad_norm": 0.4296875, + "learning_rate": 0.00012364059051211707, + "loss": 2.5471, + "step": 1964 + }, + { + "epoch": 0.5608814616099921, + "grad_norm": 0.455078125, + "learning_rate": 0.00012350683383772462, + "loss": 2.5443, + "step": 1965 + }, + { + "epoch": 0.5611668974683177, + "grad_norm": 0.46875, + "learning_rate": 0.00012337309890412618, + "loss": 2.5963, + "step": 1966 + }, + { + "epoch": 0.5614523333266435, + "grad_norm": 0.443359375, + "learning_rate": 0.00012323938582106724, + "loss": 2.5735, + "step": 1967 + }, + { + "epoch": 0.5617377691849691, + "grad_norm": 0.48046875, + "learning_rate": 0.00012310569469827518, + "loss": 2.5885, + "step": 1968 + }, + { + "epoch": 0.5620232050432948, + "grad_norm": 0.458984375, + "learning_rate": 0.00012297202564545953, + "loss": 2.5558, + "step": 1969 + }, + { + "epoch": 0.5623086409016205, + "grad_norm": 0.419921875, + "learning_rate": 0.0001228383787723116, + "loss": 2.5914, + "step": 1970 + }, + { + "epoch": 0.5625940767599462, + "grad_norm": 0.458984375, + "learning_rate": 0.0001227047541885046, + "loss": 2.5518, + "step": 1971 + }, + { + "epoch": 0.562879512618272, + "grad_norm": 0.431640625, + "learning_rate": 0.00012257115200369338, + "loss": 2.541, + "step": 1972 + }, + { + "epoch": 0.5631649484765976, + "grad_norm": 0.4453125, + "learning_rate": 0.0001224375723275144, + "loss": 2.5672, + "step": 1973 + }, + { + "epoch": 0.5634503843349233, + "grad_norm": 0.4140625, + "learning_rate": 0.00012230401526958578, + "loss": 2.579, + "step": 1974 + }, + { + "epoch": 0.563735820193249, + "grad_norm": 0.431640625, + "learning_rate": 0.0001221704809395068, + "loss": 2.5442, + "step": 1975 + }, + { + "epoch": 0.5640212560515747, + "grad_norm": 0.447265625, + "learning_rate": 0.00012203696944685838, + "loss": 2.582, + "step": 1976 + }, + { + "epoch": 0.5643066919099003, + "grad_norm": 0.41015625, + "learning_rate": 0.00012190348090120253, + "loss": 2.5607, + "step": 1977 + }, + { + "epoch": 0.5645921277682261, + "grad_norm": 0.41796875, + "learning_rate": 0.00012177001541208247, + "loss": 2.5668, + "step": 1978 + }, + { + "epoch": 0.5648775636265517, + "grad_norm": 0.423828125, + "learning_rate": 0.00012163657308902254, + "loss": 2.5663, + "step": 1979 + }, + { + "epoch": 0.5651629994848775, + "grad_norm": 0.40625, + "learning_rate": 0.00012150315404152809, + "loss": 2.575, + "step": 1980 + }, + { + "epoch": 0.5654484353432032, + "grad_norm": 0.458984375, + "learning_rate": 0.00012136975837908521, + "loss": 2.5806, + "step": 1981 + }, + { + "epoch": 0.5657338712015288, + "grad_norm": 0.43359375, + "learning_rate": 0.00012123638621116096, + "loss": 2.5632, + "step": 1982 + }, + { + "epoch": 0.5660193070598546, + "grad_norm": 0.451171875, + "learning_rate": 0.00012110303764720305, + "loss": 2.5993, + "step": 1983 + }, + { + "epoch": 0.5663047429181802, + "grad_norm": 0.42578125, + "learning_rate": 0.00012096971279663991, + "loss": 2.5778, + "step": 1984 + }, + { + "epoch": 0.566590178776506, + "grad_norm": 0.462890625, + "learning_rate": 0.00012083641176888034, + "loss": 2.5656, + "step": 1985 + }, + { + "epoch": 0.5668756146348316, + "grad_norm": 0.419921875, + "learning_rate": 0.00012070313467331368, + "loss": 2.5657, + "step": 1986 + }, + { + "epoch": 0.5671610504931573, + "grad_norm": 0.427734375, + "learning_rate": 0.00012056988161930973, + "loss": 2.5606, + "step": 1987 + }, + { + "epoch": 0.5674464863514831, + "grad_norm": 0.44140625, + "learning_rate": 0.00012043665271621843, + "loss": 2.5621, + "step": 1988 + }, + { + "epoch": 0.5677319222098087, + "grad_norm": 0.455078125, + "learning_rate": 0.00012030344807336993, + "loss": 2.5528, + "step": 1989 + }, + { + "epoch": 0.5680173580681345, + "grad_norm": 0.416015625, + "learning_rate": 0.00012017026780007452, + "loss": 2.5568, + "step": 1990 + }, + { + "epoch": 0.5683027939264601, + "grad_norm": 0.46484375, + "learning_rate": 0.00012003711200562242, + "loss": 2.5495, + "step": 1991 + }, + { + "epoch": 0.5685882297847858, + "grad_norm": 0.412109375, + "learning_rate": 0.00011990398079928378, + "loss": 2.5533, + "step": 1992 + }, + { + "epoch": 0.5688736656431115, + "grad_norm": 0.447265625, + "learning_rate": 0.00011977087429030862, + "loss": 2.55, + "step": 1993 + }, + { + "epoch": 0.5691591015014372, + "grad_norm": 0.50390625, + "learning_rate": 0.00011963779258792664, + "loss": 2.5533, + "step": 1994 + }, + { + "epoch": 0.5694445373597629, + "grad_norm": 0.453125, + "learning_rate": 0.00011950473580134723, + "loss": 2.567, + "step": 1995 + }, + { + "epoch": 0.5697299732180886, + "grad_norm": 0.50390625, + "learning_rate": 0.00011937170403975933, + "loss": 2.5419, + "step": 1996 + }, + { + "epoch": 0.5700154090764143, + "grad_norm": 0.42578125, + "learning_rate": 0.00011923869741233131, + "loss": 2.56, + "step": 1997 + }, + { + "epoch": 0.57030084493474, + "grad_norm": 0.486328125, + "learning_rate": 0.00011910571602821089, + "loss": 2.571, + "step": 1998 + }, + { + "epoch": 0.5705862807930657, + "grad_norm": 0.40625, + "learning_rate": 0.00011897275999652513, + "loss": 2.5794, + "step": 1999 + }, + { + "epoch": 0.5708717166513914, + "grad_norm": 0.455078125, + "learning_rate": 0.00011883982942638028, + "loss": 2.5708, + "step": 2000 + }, + { + "epoch": 0.5708717166513914, + "eval_loss": 2.470252513885498, + "eval_runtime": 5925.0122, + "eval_samples_per_second": 10.85, + "eval_steps_per_second": 10.85, + "step": 2000 + }, + { + "epoch": 0.5711571525097171, + "grad_norm": 0.435546875, + "learning_rate": 0.00011870692442686172, + "loss": 2.5898, + "step": 2001 + }, + { + "epoch": 0.5714425883680427, + "grad_norm": 0.423828125, + "learning_rate": 0.00011857404510703366, + "loss": 2.5845, + "step": 2002 + }, + { + "epoch": 0.5717280242263685, + "grad_norm": 0.5, + "learning_rate": 0.0001184411915759396, + "loss": 2.5365, + "step": 2003 + }, + { + "epoch": 0.5720134600846942, + "grad_norm": 0.4140625, + "learning_rate": 0.00011830836394260153, + "loss": 2.562, + "step": 2004 + }, + { + "epoch": 0.5722988959430199, + "grad_norm": 0.4453125, + "learning_rate": 0.00011817556231602037, + "loss": 2.5718, + "step": 2005 + }, + { + "epoch": 0.5725843318013456, + "grad_norm": 0.416015625, + "learning_rate": 0.00011804278680517561, + "loss": 2.5428, + "step": 2006 + }, + { + "epoch": 0.5728697676596712, + "grad_norm": 0.439453125, + "learning_rate": 0.00011791003751902542, + "loss": 2.5839, + "step": 2007 + }, + { + "epoch": 0.573155203517997, + "grad_norm": 0.4609375, + "learning_rate": 0.00011777731456650629, + "loss": 2.5791, + "step": 2008 + }, + { + "epoch": 0.5734406393763226, + "grad_norm": 0.43359375, + "learning_rate": 0.00011764461805653324, + "loss": 2.5559, + "step": 2009 + }, + { + "epoch": 0.5737260752346484, + "grad_norm": 0.484375, + "learning_rate": 0.00011751194809799949, + "loss": 2.5588, + "step": 2010 + }, + { + "epoch": 0.574011511092974, + "grad_norm": 0.47265625, + "learning_rate": 0.00011737930479977658, + "loss": 2.597, + "step": 2011 + }, + { + "epoch": 0.5742969469512997, + "grad_norm": 0.474609375, + "learning_rate": 0.00011724668827071413, + "loss": 2.5619, + "step": 2012 + }, + { + "epoch": 0.5745823828096255, + "grad_norm": 0.458984375, + "learning_rate": 0.00011711409861963971, + "loss": 2.5595, + "step": 2013 + }, + { + "epoch": 0.5748678186679511, + "grad_norm": 0.478515625, + "learning_rate": 0.00011698153595535897, + "loss": 2.5641, + "step": 2014 + }, + { + "epoch": 0.5751532545262769, + "grad_norm": 0.435546875, + "learning_rate": 0.0001168490003866553, + "loss": 2.5707, + "step": 2015 + }, + { + "epoch": 0.5754386903846025, + "grad_norm": 0.490234375, + "learning_rate": 0.00011671649202228988, + "loss": 2.5486, + "step": 2016 + }, + { + "epoch": 0.5757241262429282, + "grad_norm": 0.453125, + "learning_rate": 0.00011658401097100161, + "loss": 2.5753, + "step": 2017 + }, + { + "epoch": 0.5760095621012539, + "grad_norm": 0.50390625, + "learning_rate": 0.0001164515573415069, + "loss": 2.5995, + "step": 2018 + }, + { + "epoch": 0.5762949979595796, + "grad_norm": 0.4609375, + "learning_rate": 0.00011631913124249981, + "loss": 2.587, + "step": 2019 + }, + { + "epoch": 0.5765804338179052, + "grad_norm": 0.439453125, + "learning_rate": 0.00011618673278265168, + "loss": 2.5885, + "step": 2020 + }, + { + "epoch": 0.576865869676231, + "grad_norm": 0.435546875, + "learning_rate": 0.00011605436207061112, + "loss": 2.5741, + "step": 2021 + }, + { + "epoch": 0.5771513055345567, + "grad_norm": 0.431640625, + "learning_rate": 0.00011592201921500408, + "loss": 2.5782, + "step": 2022 + }, + { + "epoch": 0.5774367413928824, + "grad_norm": 0.42578125, + "learning_rate": 0.00011578970432443364, + "loss": 2.5819, + "step": 2023 + }, + { + "epoch": 0.5777221772512081, + "grad_norm": 0.427734375, + "learning_rate": 0.00011565741750747992, + "loss": 2.5745, + "step": 2024 + }, + { + "epoch": 0.5780076131095337, + "grad_norm": 0.455078125, + "learning_rate": 0.00011552515887269992, + "loss": 2.5694, + "step": 2025 + }, + { + "epoch": 0.5782930489678595, + "grad_norm": 0.416015625, + "learning_rate": 0.00011539292852862757, + "loss": 2.5542, + "step": 2026 + }, + { + "epoch": 0.5785784848261851, + "grad_norm": 0.396484375, + "learning_rate": 0.0001152607265837737, + "loss": 2.5776, + "step": 2027 + }, + { + "epoch": 0.5788639206845109, + "grad_norm": 0.431640625, + "learning_rate": 0.00011512855314662566, + "loss": 2.555, + "step": 2028 + }, + { + "epoch": 0.5791493565428366, + "grad_norm": 0.71484375, + "learning_rate": 0.00011499640832564749, + "loss": 2.5699, + "step": 2029 + }, + { + "epoch": 0.5794347924011622, + "grad_norm": 0.44140625, + "learning_rate": 0.00011486429222927976, + "loss": 2.5698, + "step": 2030 + }, + { + "epoch": 0.579720228259488, + "grad_norm": 0.427734375, + "learning_rate": 0.00011473220496593937, + "loss": 2.546, + "step": 2031 + }, + { + "epoch": 0.5800056641178136, + "grad_norm": 0.439453125, + "learning_rate": 0.0001146001466440197, + "loss": 2.563, + "step": 2032 + }, + { + "epoch": 0.5802910999761394, + "grad_norm": 0.4296875, + "learning_rate": 0.00011446811737189029, + "loss": 2.5682, + "step": 2033 + }, + { + "epoch": 0.580576535834465, + "grad_norm": 0.44921875, + "learning_rate": 0.0001143361172578968, + "loss": 2.5643, + "step": 2034 + }, + { + "epoch": 0.5808619716927907, + "grad_norm": 0.416015625, + "learning_rate": 0.00011420414641036111, + "loss": 2.5385, + "step": 2035 + }, + { + "epoch": 0.5811474075511164, + "grad_norm": 0.453125, + "learning_rate": 0.00011407220493758099, + "loss": 2.5788, + "step": 2036 + }, + { + "epoch": 0.5814328434094421, + "grad_norm": 0.4375, + "learning_rate": 0.00011394029294783011, + "loss": 2.5717, + "step": 2037 + }, + { + "epoch": 0.5817182792677679, + "grad_norm": 0.46484375, + "learning_rate": 0.00011380841054935789, + "loss": 2.595, + "step": 2038 + }, + { + "epoch": 0.5820037151260935, + "grad_norm": 0.484375, + "learning_rate": 0.00011367655785038957, + "loss": 2.5678, + "step": 2039 + }, + { + "epoch": 0.5822891509844192, + "grad_norm": 0.427734375, + "learning_rate": 0.00011354473495912596, + "loss": 2.5785, + "step": 2040 + }, + { + "epoch": 0.5825745868427449, + "grad_norm": 0.4453125, + "learning_rate": 0.00011341294198374341, + "loss": 2.5803, + "step": 2041 + }, + { + "epoch": 0.5828600227010706, + "grad_norm": 0.451171875, + "learning_rate": 0.00011328117903239376, + "loss": 2.5802, + "step": 2042 + }, + { + "epoch": 0.5831454585593963, + "grad_norm": 0.44140625, + "learning_rate": 0.00011314944621320421, + "loss": 2.5512, + "step": 2043 + }, + { + "epoch": 0.583430894417722, + "grad_norm": 0.447265625, + "learning_rate": 0.00011301774363427714, + "loss": 2.5891, + "step": 2044 + }, + { + "epoch": 0.5837163302760477, + "grad_norm": 0.4453125, + "learning_rate": 0.00011288607140369021, + "loss": 2.5855, + "step": 2045 + }, + { + "epoch": 0.5840017661343734, + "grad_norm": 0.451171875, + "learning_rate": 0.00011275442962949613, + "loss": 2.5551, + "step": 2046 + }, + { + "epoch": 0.5842872019926991, + "grad_norm": 0.4296875, + "learning_rate": 0.00011262281841972272, + "loss": 2.5605, + "step": 2047 + }, + { + "epoch": 0.5845726378510248, + "grad_norm": 0.48046875, + "learning_rate": 0.0001124912378823725, + "loss": 2.5974, + "step": 2048 + }, + { + "epoch": 0.5848580737093505, + "grad_norm": 0.482421875, + "learning_rate": 0.00011235968812542298, + "loss": 2.5483, + "step": 2049 + }, + { + "epoch": 0.5851435095676761, + "grad_norm": 0.474609375, + "learning_rate": 0.00011222816925682647, + "loss": 2.5846, + "step": 2050 + }, + { + "epoch": 0.5854289454260019, + "grad_norm": 0.490234375, + "learning_rate": 0.00011209668138450979, + "loss": 2.572, + "step": 2051 + }, + { + "epoch": 0.5857143812843275, + "grad_norm": 0.466796875, + "learning_rate": 0.00011196522461637439, + "loss": 2.5609, + "step": 2052 + }, + { + "epoch": 0.5859998171426533, + "grad_norm": 0.52734375, + "learning_rate": 0.00011183379906029615, + "loss": 2.5499, + "step": 2053 + }, + { + "epoch": 0.586285253000979, + "grad_norm": 0.490234375, + "learning_rate": 0.00011170240482412542, + "loss": 2.5417, + "step": 2054 + }, + { + "epoch": 0.5865706888593046, + "grad_norm": 0.5390625, + "learning_rate": 0.00011157104201568677, + "loss": 2.5613, + "step": 2055 + }, + { + "epoch": 0.5868561247176304, + "grad_norm": 0.4609375, + "learning_rate": 0.000111439710742779, + "loss": 2.5377, + "step": 2056 + }, + { + "epoch": 0.587141560575956, + "grad_norm": 0.5703125, + "learning_rate": 0.00011130841111317501, + "loss": 2.5511, + "step": 2057 + }, + { + "epoch": 0.5874269964342818, + "grad_norm": 0.4296875, + "learning_rate": 0.00011117714323462186, + "loss": 2.581, + "step": 2058 + }, + { + "epoch": 0.5877124322926074, + "grad_norm": 0.4921875, + "learning_rate": 0.0001110459072148404, + "loss": 2.556, + "step": 2059 + }, + { + "epoch": 0.5879978681509331, + "grad_norm": 0.44140625, + "learning_rate": 0.00011091470316152543, + "loss": 2.5631, + "step": 2060 + }, + { + "epoch": 0.5882833040092589, + "grad_norm": 0.4609375, + "learning_rate": 0.00011078353118234542, + "loss": 2.5587, + "step": 2061 + }, + { + "epoch": 0.5885687398675845, + "grad_norm": 0.486328125, + "learning_rate": 0.00011065239138494263, + "loss": 2.5622, + "step": 2062 + }, + { + "epoch": 0.5888541757259103, + "grad_norm": 0.421875, + "learning_rate": 0.0001105212838769328, + "loss": 2.5687, + "step": 2063 + }, + { + "epoch": 0.5891396115842359, + "grad_norm": 0.458984375, + "learning_rate": 0.00011039020876590535, + "loss": 2.5541, + "step": 2064 + }, + { + "epoch": 0.5894250474425616, + "grad_norm": 0.44140625, + "learning_rate": 0.00011025916615942281, + "loss": 2.5607, + "step": 2065 + }, + { + "epoch": 0.5897104833008873, + "grad_norm": 0.423828125, + "learning_rate": 0.00011012815616502145, + "loss": 2.5617, + "step": 2066 + }, + { + "epoch": 0.589995919159213, + "grad_norm": 0.46875, + "learning_rate": 0.00010999717889021042, + "loss": 2.5915, + "step": 2067 + }, + { + "epoch": 0.5902813550175386, + "grad_norm": 0.408203125, + "learning_rate": 0.00010986623444247216, + "loss": 2.5686, + "step": 2068 + }, + { + "epoch": 0.5905667908758644, + "grad_norm": 0.45703125, + "learning_rate": 0.0001097353229292622, + "loss": 2.5715, + "step": 2069 + }, + { + "epoch": 0.5908522267341901, + "grad_norm": 0.44140625, + "learning_rate": 0.00010960444445800901, + "loss": 2.5551, + "step": 2070 + }, + { + "epoch": 0.5911376625925158, + "grad_norm": 0.4140625, + "learning_rate": 0.0001094735991361139, + "loss": 2.5485, + "step": 2071 + }, + { + "epoch": 0.5914230984508415, + "grad_norm": 0.453125, + "learning_rate": 0.00010934278707095103, + "loss": 2.5534, + "step": 2072 + }, + { + "epoch": 0.5917085343091671, + "grad_norm": 0.427734375, + "learning_rate": 0.00010921200836986727, + "loss": 2.56, + "step": 2073 + }, + { + "epoch": 0.5919939701674929, + "grad_norm": 0.435546875, + "learning_rate": 0.00010908126314018212, + "loss": 2.5518, + "step": 2074 + }, + { + "epoch": 0.5922794060258185, + "grad_norm": 0.455078125, + "learning_rate": 0.00010895055148918756, + "loss": 2.587, + "step": 2075 + }, + { + "epoch": 0.5925648418841443, + "grad_norm": 0.419921875, + "learning_rate": 0.00010881987352414806, + "loss": 2.5573, + "step": 2076 + }, + { + "epoch": 0.59285027774247, + "grad_norm": 0.439453125, + "learning_rate": 0.00010868922935230049, + "loss": 2.5569, + "step": 2077 + }, + { + "epoch": 0.5931357136007956, + "grad_norm": 0.462890625, + "learning_rate": 0.00010855861908085383, + "loss": 2.5437, + "step": 2078 + }, + { + "epoch": 0.5934211494591214, + "grad_norm": 0.4296875, + "learning_rate": 0.00010842804281698937, + "loss": 2.554, + "step": 2079 + }, + { + "epoch": 0.593706585317447, + "grad_norm": 0.46875, + "learning_rate": 0.00010829750066786052, + "loss": 2.5834, + "step": 2080 + }, + { + "epoch": 0.5939920211757728, + "grad_norm": 0.4140625, + "learning_rate": 0.00010816699274059255, + "loss": 2.5947, + "step": 2081 + }, + { + "epoch": 0.5942774570340984, + "grad_norm": 0.470703125, + "learning_rate": 0.00010803651914228285, + "loss": 2.557, + "step": 2082 + }, + { + "epoch": 0.5945628928924241, + "grad_norm": 0.400390625, + "learning_rate": 0.00010790607998000048, + "loss": 2.5781, + "step": 2083 + }, + { + "epoch": 0.5948483287507498, + "grad_norm": 0.455078125, + "learning_rate": 0.00010777567536078623, + "loss": 2.57, + "step": 2084 + }, + { + "epoch": 0.5951337646090755, + "grad_norm": 0.42578125, + "learning_rate": 0.0001076453053916527, + "loss": 2.5555, + "step": 2085 + }, + { + "epoch": 0.5954192004674013, + "grad_norm": 0.4296875, + "learning_rate": 0.00010751497017958385, + "loss": 2.6032, + "step": 2086 + }, + { + "epoch": 0.5957046363257269, + "grad_norm": 0.5546875, + "learning_rate": 0.00010738466983153533, + "loss": 2.5711, + "step": 2087 + }, + { + "epoch": 0.5959900721840526, + "grad_norm": 0.439453125, + "learning_rate": 0.000107254404454434, + "loss": 2.5851, + "step": 2088 + }, + { + "epoch": 0.5962755080423783, + "grad_norm": 0.49609375, + "learning_rate": 0.00010712417415517808, + "loss": 2.5805, + "step": 2089 + }, + { + "epoch": 0.596560943900704, + "grad_norm": 0.451171875, + "learning_rate": 0.00010699397904063708, + "loss": 2.5809, + "step": 2090 + }, + { + "epoch": 0.5968463797590297, + "grad_norm": 0.57421875, + "learning_rate": 0.00010686381921765158, + "loss": 2.5796, + "step": 2091 + }, + { + "epoch": 0.5971318156173554, + "grad_norm": 0.462890625, + "learning_rate": 0.00010673369479303315, + "loss": 2.5641, + "step": 2092 + }, + { + "epoch": 0.597417251475681, + "grad_norm": 0.42578125, + "learning_rate": 0.00010660360587356438, + "loss": 2.5651, + "step": 2093 + }, + { + "epoch": 0.5977026873340068, + "grad_norm": 0.44921875, + "learning_rate": 0.00010647355256599877, + "loss": 2.5639, + "step": 2094 + }, + { + "epoch": 0.5979881231923325, + "grad_norm": 0.423828125, + "learning_rate": 0.00010634353497706037, + "loss": 2.5482, + "step": 2095 + }, + { + "epoch": 0.5982735590506582, + "grad_norm": 0.439453125, + "learning_rate": 0.0001062135532134442, + "loss": 2.5762, + "step": 2096 + }, + { + "epoch": 0.5985589949089839, + "grad_norm": 0.419921875, + "learning_rate": 0.0001060836073818157, + "loss": 2.573, + "step": 2097 + }, + { + "epoch": 0.5988444307673095, + "grad_norm": 0.4453125, + "learning_rate": 0.00010595369758881091, + "loss": 2.5582, + "step": 2098 + }, + { + "epoch": 0.5991298666256353, + "grad_norm": 0.455078125, + "learning_rate": 0.00010582382394103628, + "loss": 2.6, + "step": 2099 + }, + { + "epoch": 0.5994153024839609, + "grad_norm": 0.400390625, + "learning_rate": 0.0001056939865450686, + "loss": 2.573, + "step": 2100 + }, + { + "epoch": 0.5997007383422867, + "grad_norm": 0.419921875, + "learning_rate": 0.00010556418550745482, + "loss": 2.5422, + "step": 2101 + }, + { + "epoch": 0.5999861742006124, + "grad_norm": 0.427734375, + "learning_rate": 0.00010543442093471218, + "loss": 2.5682, + "step": 2102 + }, + { + "epoch": 0.600271610058938, + "grad_norm": 0.451171875, + "learning_rate": 0.00010530469293332797, + "loss": 2.563, + "step": 2103 + }, + { + "epoch": 0.6005570459172638, + "grad_norm": 0.41015625, + "learning_rate": 0.00010517500160975935, + "loss": 2.5584, + "step": 2104 + }, + { + "epoch": 0.6008424817755894, + "grad_norm": 0.4296875, + "learning_rate": 0.00010504534707043357, + "loss": 2.5646, + "step": 2105 + }, + { + "epoch": 0.6011279176339152, + "grad_norm": 0.447265625, + "learning_rate": 0.00010491572942174763, + "loss": 2.5812, + "step": 2106 + }, + { + "epoch": 0.6014133534922408, + "grad_norm": 0.46875, + "learning_rate": 0.00010478614877006813, + "loss": 2.5652, + "step": 2107 + }, + { + "epoch": 0.6016987893505665, + "grad_norm": 0.443359375, + "learning_rate": 0.00010465660522173144, + "loss": 2.5468, + "step": 2108 + }, + { + "epoch": 0.6019842252088922, + "grad_norm": 0.4140625, + "learning_rate": 0.00010452709888304347, + "loss": 2.5424, + "step": 2109 + }, + { + "epoch": 0.6022696610672179, + "grad_norm": 0.43359375, + "learning_rate": 0.0001043976298602796, + "loss": 2.579, + "step": 2110 + }, + { + "epoch": 0.6025550969255437, + "grad_norm": 0.45703125, + "learning_rate": 0.00010426819825968449, + "loss": 2.5618, + "step": 2111 + }, + { + "epoch": 0.6028405327838693, + "grad_norm": 0.421875, + "learning_rate": 0.00010413880418747215, + "loss": 2.5656, + "step": 2112 + }, + { + "epoch": 0.603125968642195, + "grad_norm": 0.4609375, + "learning_rate": 0.00010400944774982593, + "loss": 2.5724, + "step": 2113 + }, + { + "epoch": 0.6034114045005207, + "grad_norm": 0.435546875, + "learning_rate": 0.00010388012905289808, + "loss": 2.5452, + "step": 2114 + }, + { + "epoch": 0.6036968403588464, + "grad_norm": 0.41796875, + "learning_rate": 0.00010375084820280998, + "loss": 2.5538, + "step": 2115 + }, + { + "epoch": 0.603982276217172, + "grad_norm": 0.4296875, + "learning_rate": 0.00010362160530565197, + "loss": 2.5399, + "step": 2116 + }, + { + "epoch": 0.6042677120754978, + "grad_norm": 0.42578125, + "learning_rate": 0.00010349240046748324, + "loss": 2.5613, + "step": 2117 + }, + { + "epoch": 0.6045531479338235, + "grad_norm": 0.412109375, + "learning_rate": 0.00010336323379433165, + "loss": 2.5742, + "step": 2118 + }, + { + "epoch": 0.6048385837921492, + "grad_norm": 0.41015625, + "learning_rate": 0.00010323410539219388, + "loss": 2.5627, + "step": 2119 + }, + { + "epoch": 0.6051240196504749, + "grad_norm": 0.412109375, + "learning_rate": 0.00010310501536703507, + "loss": 2.5675, + "step": 2120 + }, + { + "epoch": 0.6054094555088005, + "grad_norm": 0.412109375, + "learning_rate": 0.00010297596382478906, + "loss": 2.5845, + "step": 2121 + }, + { + "epoch": 0.6056948913671263, + "grad_norm": 0.419921875, + "learning_rate": 0.00010284695087135791, + "loss": 2.5579, + "step": 2122 + }, + { + "epoch": 0.6059803272254519, + "grad_norm": 0.423828125, + "learning_rate": 0.00010271797661261215, + "loss": 2.5864, + "step": 2123 + }, + { + "epoch": 0.6062657630837777, + "grad_norm": 0.390625, + "learning_rate": 0.0001025890411543904, + "loss": 2.5851, + "step": 2124 + }, + { + "epoch": 0.6065511989421033, + "grad_norm": 0.412109375, + "learning_rate": 0.00010246014460249964, + "loss": 2.5753, + "step": 2125 + }, + { + "epoch": 0.606836634800429, + "grad_norm": 0.404296875, + "learning_rate": 0.00010233128706271475, + "loss": 2.5756, + "step": 2126 + }, + { + "epoch": 0.6071220706587548, + "grad_norm": 0.380859375, + "learning_rate": 0.00010220246864077875, + "loss": 2.5755, + "step": 2127 + }, + { + "epoch": 0.6074075065170804, + "grad_norm": 0.384765625, + "learning_rate": 0.00010207368944240234, + "loss": 2.5598, + "step": 2128 + }, + { + "epoch": 0.6076929423754062, + "grad_norm": 0.4140625, + "learning_rate": 0.00010194494957326434, + "loss": 2.564, + "step": 2129 + }, + { + "epoch": 0.6079783782337318, + "grad_norm": 0.388671875, + "learning_rate": 0.00010181624913901099, + "loss": 2.5546, + "step": 2130 + }, + { + "epoch": 0.6082638140920575, + "grad_norm": 0.38671875, + "learning_rate": 0.0001016875882452564, + "loss": 2.5709, + "step": 2131 + }, + { + "epoch": 0.6085492499503832, + "grad_norm": 0.42578125, + "learning_rate": 0.00010155896699758206, + "loss": 2.5293, + "step": 2132 + }, + { + "epoch": 0.6088346858087089, + "grad_norm": 0.384765625, + "learning_rate": 0.00010143038550153703, + "loss": 2.5746, + "step": 2133 + }, + { + "epoch": 0.6091201216670347, + "grad_norm": 0.45703125, + "learning_rate": 0.0001013018438626378, + "loss": 2.5632, + "step": 2134 + }, + { + "epoch": 0.6094055575253603, + "grad_norm": 0.408203125, + "learning_rate": 0.00010117334218636793, + "loss": 2.5465, + "step": 2135 + }, + { + "epoch": 0.609690993383686, + "grad_norm": 0.400390625, + "learning_rate": 0.00010104488057817839, + "loss": 2.5461, + "step": 2136 + }, + { + "epoch": 0.6099764292420117, + "grad_norm": 0.408203125, + "learning_rate": 0.00010091645914348724, + "loss": 2.5891, + "step": 2137 + }, + { + "epoch": 0.6102618651003374, + "grad_norm": 0.412109375, + "learning_rate": 0.00010078807798767953, + "loss": 2.5954, + "step": 2138 + }, + { + "epoch": 0.610547300958663, + "grad_norm": 0.4140625, + "learning_rate": 0.00010065973721610727, + "loss": 2.5611, + "step": 2139 + }, + { + "epoch": 0.6108327368169888, + "grad_norm": 0.392578125, + "learning_rate": 0.00010053143693408932, + "loss": 2.5958, + "step": 2140 + }, + { + "epoch": 0.6111181726753144, + "grad_norm": 0.41015625, + "learning_rate": 0.00010040317724691133, + "loss": 2.5734, + "step": 2141 + }, + { + "epoch": 0.6114036085336402, + "grad_norm": 0.40625, + "learning_rate": 0.00010027495825982558, + "loss": 2.5665, + "step": 2142 + }, + { + "epoch": 0.6116890443919659, + "grad_norm": 0.388671875, + "learning_rate": 0.00010014678007805106, + "loss": 2.5597, + "step": 2143 + }, + { + "epoch": 0.6119744802502916, + "grad_norm": 0.4140625, + "learning_rate": 0.00010001864280677316, + "loss": 2.5883, + "step": 2144 + }, + { + "epoch": 0.6122599161086173, + "grad_norm": 0.41015625, + "learning_rate": 9.989054655114383e-05, + "loss": 2.5357, + "step": 2145 + }, + { + "epoch": 0.6125453519669429, + "grad_norm": 0.40625, + "learning_rate": 9.976249141628124e-05, + "loss": 2.5692, + "step": 2146 + }, + { + "epoch": 0.6128307878252687, + "grad_norm": 0.4296875, + "learning_rate": 9.963447750726984e-05, + "loss": 2.5544, + "step": 2147 + }, + { + "epoch": 0.6131162236835943, + "grad_norm": 0.390625, + "learning_rate": 9.95065049291603e-05, + "loss": 2.5472, + "step": 2148 + }, + { + "epoch": 0.61340165954192, + "grad_norm": 0.3984375, + "learning_rate": 9.937857378696932e-05, + "loss": 2.6036, + "step": 2149 + }, + { + "epoch": 0.6136870954002458, + "grad_norm": 0.40234375, + "learning_rate": 9.925068418567967e-05, + "loss": 2.5645, + "step": 2150 + }, + { + "epoch": 0.6139725312585714, + "grad_norm": 0.396484375, + "learning_rate": 9.912283623023988e-05, + "loss": 2.5646, + "step": 2151 + }, + { + "epoch": 0.6142579671168972, + "grad_norm": 0.4140625, + "learning_rate": 9.899503002556442e-05, + "loss": 2.5792, + "step": 2152 + }, + { + "epoch": 0.6145434029752228, + "grad_norm": 0.39453125, + "learning_rate": 9.886726567653362e-05, + "loss": 2.5629, + "step": 2153 + }, + { + "epoch": 0.6148288388335486, + "grad_norm": 0.4375, + "learning_rate": 9.87395432879932e-05, + "loss": 2.5558, + "step": 2154 + }, + { + "epoch": 0.6151142746918742, + "grad_norm": 0.416015625, + "learning_rate": 9.861186296475458e-05, + "loss": 2.5663, + "step": 2155 + }, + { + "epoch": 0.6153997105501999, + "grad_norm": 0.390625, + "learning_rate": 9.84842248115947e-05, + "loss": 2.5347, + "step": 2156 + }, + { + "epoch": 0.6156851464085256, + "grad_norm": 0.3828125, + "learning_rate": 9.835662893325584e-05, + "loss": 2.5608, + "step": 2157 + }, + { + "epoch": 0.6159705822668513, + "grad_norm": 0.3984375, + "learning_rate": 9.822907543444553e-05, + "loss": 2.5695, + "step": 2158 + }, + { + "epoch": 0.616256018125177, + "grad_norm": 0.376953125, + "learning_rate": 9.810156441983665e-05, + "loss": 2.5549, + "step": 2159 + }, + { + "epoch": 0.6165414539835027, + "grad_norm": 0.41015625, + "learning_rate": 9.797409599406709e-05, + "loss": 2.5916, + "step": 2160 + }, + { + "epoch": 0.6168268898418284, + "grad_norm": 0.4140625, + "learning_rate": 9.784667026173993e-05, + "loss": 2.546, + "step": 2161 + }, + { + "epoch": 0.6171123257001541, + "grad_norm": 0.380859375, + "learning_rate": 9.771928732742313e-05, + "loss": 2.5728, + "step": 2162 + }, + { + "epoch": 0.6173977615584798, + "grad_norm": 0.376953125, + "learning_rate": 9.759194729564954e-05, + "loss": 2.5711, + "step": 2163 + }, + { + "epoch": 0.6176831974168054, + "grad_norm": 0.421875, + "learning_rate": 9.746465027091676e-05, + "loss": 2.5335, + "step": 2164 + }, + { + "epoch": 0.6179686332751312, + "grad_norm": 0.376953125, + "learning_rate": 9.733739635768714e-05, + "loss": 2.5583, + "step": 2165 + }, + { + "epoch": 0.6182540691334568, + "grad_norm": 0.404296875, + "learning_rate": 9.721018566038767e-05, + "loss": 2.537, + "step": 2166 + }, + { + "epoch": 0.6185395049917826, + "grad_norm": 0.421875, + "learning_rate": 9.708301828340993e-05, + "loss": 2.5576, + "step": 2167 + }, + { + "epoch": 0.6188249408501083, + "grad_norm": 0.388671875, + "learning_rate": 9.695589433110968e-05, + "loss": 2.5786, + "step": 2168 + }, + { + "epoch": 0.6191103767084339, + "grad_norm": 0.37890625, + "learning_rate": 9.682881390780749e-05, + "loss": 2.584, + "step": 2169 + }, + { + "epoch": 0.6193958125667597, + "grad_norm": 0.41796875, + "learning_rate": 9.67017771177878e-05, + "loss": 2.5681, + "step": 2170 + }, + { + "epoch": 0.6196812484250853, + "grad_norm": 0.392578125, + "learning_rate": 9.657478406529946e-05, + "loss": 2.553, + "step": 2171 + }, + { + "epoch": 0.6199666842834111, + "grad_norm": 0.390625, + "learning_rate": 9.644783485455537e-05, + "loss": 2.5665, + "step": 2172 + }, + { + "epoch": 0.6202521201417367, + "grad_norm": 0.39453125, + "learning_rate": 9.632092958973246e-05, + "loss": 2.5572, + "step": 2173 + }, + { + "epoch": 0.6205375560000624, + "grad_norm": 0.40234375, + "learning_rate": 9.61940683749716e-05, + "loss": 2.5576, + "step": 2174 + }, + { + "epoch": 0.6208229918583882, + "grad_norm": 0.3828125, + "learning_rate": 9.606725131437739e-05, + "loss": 2.5667, + "step": 2175 + }, + { + "epoch": 0.6211084277167138, + "grad_norm": 0.400390625, + "learning_rate": 9.594047851201855e-05, + "loss": 2.5688, + "step": 2176 + }, + { + "epoch": 0.6213938635750396, + "grad_norm": 0.38671875, + "learning_rate": 9.581375007192705e-05, + "loss": 2.5627, + "step": 2177 + }, + { + "epoch": 0.6216792994333652, + "grad_norm": 0.400390625, + "learning_rate": 9.568706609809872e-05, + "loss": 2.5918, + "step": 2178 + }, + { + "epoch": 0.6219647352916909, + "grad_norm": 0.396484375, + "learning_rate": 9.556042669449281e-05, + "loss": 2.5662, + "step": 2179 + }, + { + "epoch": 0.6222501711500166, + "grad_norm": 0.396484375, + "learning_rate": 9.543383196503206e-05, + "loss": 2.5345, + "step": 2180 + }, + { + "epoch": 0.6225356070083423, + "grad_norm": 0.40234375, + "learning_rate": 9.530728201360244e-05, + "loss": 2.5612, + "step": 2181 + }, + { + "epoch": 0.622821042866668, + "grad_norm": 0.390625, + "learning_rate": 9.518077694405322e-05, + "loss": 2.5691, + "step": 2182 + }, + { + "epoch": 0.6231064787249937, + "grad_norm": 0.40234375, + "learning_rate": 9.505431686019692e-05, + "loss": 2.5599, + "step": 2183 + }, + { + "epoch": 0.6233919145833194, + "grad_norm": 0.39453125, + "learning_rate": 9.492790186580906e-05, + "loss": 2.5384, + "step": 2184 + }, + { + "epoch": 0.6236773504416451, + "grad_norm": 0.388671875, + "learning_rate": 9.480153206462817e-05, + "loss": 2.5833, + "step": 2185 + }, + { + "epoch": 0.6239627862999708, + "grad_norm": 0.3828125, + "learning_rate": 9.467520756035575e-05, + "loss": 2.5582, + "step": 2186 + }, + { + "epoch": 0.6242482221582965, + "grad_norm": 0.390625, + "learning_rate": 9.454892845665603e-05, + "loss": 2.5327, + "step": 2187 + }, + { + "epoch": 0.6245336580166222, + "grad_norm": 0.41015625, + "learning_rate": 9.442269485715602e-05, + "loss": 2.5675, + "step": 2188 + }, + { + "epoch": 0.6248190938749478, + "grad_norm": 0.38671875, + "learning_rate": 9.429650686544546e-05, + "loss": 2.5706, + "step": 2189 + }, + { + "epoch": 0.6251045297332736, + "grad_norm": 0.41015625, + "learning_rate": 9.417036458507658e-05, + "loss": 2.5732, + "step": 2190 + }, + { + "epoch": 0.6253899655915993, + "grad_norm": 0.40234375, + "learning_rate": 9.404426811956404e-05, + "loss": 2.57, + "step": 2191 + }, + { + "epoch": 0.625675401449925, + "grad_norm": 0.40234375, + "learning_rate": 9.391821757238511e-05, + "loss": 2.5336, + "step": 2192 + }, + { + "epoch": 0.6259608373082507, + "grad_norm": 0.40625, + "learning_rate": 9.379221304697925e-05, + "loss": 2.5533, + "step": 2193 + }, + { + "epoch": 0.6262462731665763, + "grad_norm": 0.40234375, + "learning_rate": 9.366625464674811e-05, + "loss": 2.5648, + "step": 2194 + }, + { + "epoch": 0.6265317090249021, + "grad_norm": 0.40625, + "learning_rate": 9.354034247505556e-05, + "loss": 2.5672, + "step": 2195 + }, + { + "epoch": 0.6268171448832277, + "grad_norm": 0.40234375, + "learning_rate": 9.341447663522749e-05, + "loss": 2.5789, + "step": 2196 + }, + { + "epoch": 0.6271025807415535, + "grad_norm": 0.384765625, + "learning_rate": 9.328865723055185e-05, + "loss": 2.5557, + "step": 2197 + }, + { + "epoch": 0.6273880165998791, + "grad_norm": 0.431640625, + "learning_rate": 9.316288436427834e-05, + "loss": 2.5479, + "step": 2198 + }, + { + "epoch": 0.6276734524582048, + "grad_norm": 0.40234375, + "learning_rate": 9.30371581396186e-05, + "loss": 2.5853, + "step": 2199 + }, + { + "epoch": 0.6279588883165306, + "grad_norm": 0.380859375, + "learning_rate": 9.291147865974599e-05, + "loss": 2.588, + "step": 2200 + }, + { + "epoch": 0.6282443241748562, + "grad_norm": 0.37890625, + "learning_rate": 9.278584602779541e-05, + "loss": 2.5675, + "step": 2201 + }, + { + "epoch": 0.628529760033182, + "grad_norm": 0.396484375, + "learning_rate": 9.266026034686341e-05, + "loss": 2.59, + "step": 2202 + }, + { + "epoch": 0.6288151958915076, + "grad_norm": 0.44140625, + "learning_rate": 9.253472172000802e-05, + "loss": 2.5578, + "step": 2203 + }, + { + "epoch": 0.6291006317498333, + "grad_norm": 0.40234375, + "learning_rate": 9.240923025024853e-05, + "loss": 2.5348, + "step": 2204 + }, + { + "epoch": 0.629386067608159, + "grad_norm": 0.423828125, + "learning_rate": 9.228378604056568e-05, + "loss": 2.5759, + "step": 2205 + }, + { + "epoch": 0.6296715034664847, + "grad_norm": 0.416015625, + "learning_rate": 9.215838919390132e-05, + "loss": 2.5559, + "step": 2206 + }, + { + "epoch": 0.6299569393248104, + "grad_norm": 0.41015625, + "learning_rate": 9.203303981315847e-05, + "loss": 2.5611, + "step": 2207 + }, + { + "epoch": 0.6302423751831361, + "grad_norm": 0.41015625, + "learning_rate": 9.190773800120126e-05, + "loss": 2.5746, + "step": 2208 + }, + { + "epoch": 0.6305278110414618, + "grad_norm": 0.396484375, + "learning_rate": 9.178248386085474e-05, + "loss": 2.5519, + "step": 2209 + }, + { + "epoch": 0.6308132468997875, + "grad_norm": 0.408203125, + "learning_rate": 9.165727749490477e-05, + "loss": 2.5576, + "step": 2210 + }, + { + "epoch": 0.6310986827581132, + "grad_norm": 0.408203125, + "learning_rate": 9.15321190060981e-05, + "loss": 2.5854, + "step": 2211 + }, + { + "epoch": 0.6313841186164388, + "grad_norm": 0.404296875, + "learning_rate": 9.140700849714216e-05, + "loss": 2.5661, + "step": 2212 + }, + { + "epoch": 0.6316695544747646, + "grad_norm": 0.41015625, + "learning_rate": 9.128194607070498e-05, + "loss": 2.5572, + "step": 2213 + }, + { + "epoch": 0.6319549903330902, + "grad_norm": 0.404296875, + "learning_rate": 9.115693182941518e-05, + "loss": 2.5889, + "step": 2214 + }, + { + "epoch": 0.632240426191416, + "grad_norm": 0.421875, + "learning_rate": 9.103196587586172e-05, + "loss": 2.5474, + "step": 2215 + }, + { + "epoch": 0.6325258620497417, + "grad_norm": 0.412109375, + "learning_rate": 9.090704831259422e-05, + "loss": 2.5664, + "step": 2216 + }, + { + "epoch": 0.6328112979080673, + "grad_norm": 0.376953125, + "learning_rate": 9.078217924212224e-05, + "loss": 2.5648, + "step": 2217 + }, + { + "epoch": 0.6330967337663931, + "grad_norm": 0.412109375, + "learning_rate": 9.065735876691578e-05, + "loss": 2.5675, + "step": 2218 + }, + { + "epoch": 0.6333821696247187, + "grad_norm": 0.39453125, + "learning_rate": 9.053258698940484e-05, + "loss": 2.5783, + "step": 2219 + }, + { + "epoch": 0.6336676054830445, + "grad_norm": 0.4140625, + "learning_rate": 9.040786401197957e-05, + "loss": 2.561, + "step": 2220 + }, + { + "epoch": 0.6339530413413701, + "grad_norm": 0.390625, + "learning_rate": 9.028318993698993e-05, + "loss": 2.5814, + "step": 2221 + }, + { + "epoch": 0.6342384771996958, + "grad_norm": 0.421875, + "learning_rate": 9.015856486674587e-05, + "loss": 2.6124, + "step": 2222 + }, + { + "epoch": 0.6345239130580216, + "grad_norm": 0.458984375, + "learning_rate": 9.003398890351704e-05, + "loss": 2.5395, + "step": 2223 + }, + { + "epoch": 0.6348093489163472, + "grad_norm": 0.400390625, + "learning_rate": 8.99094621495329e-05, + "loss": 2.5417, + "step": 2224 + }, + { + "epoch": 0.635094784774673, + "grad_norm": 0.388671875, + "learning_rate": 8.978498470698244e-05, + "loss": 2.5751, + "step": 2225 + }, + { + "epoch": 0.6353802206329986, + "grad_norm": 0.439453125, + "learning_rate": 8.966055667801422e-05, + "loss": 2.5614, + "step": 2226 + }, + { + "epoch": 0.6356656564913243, + "grad_norm": 0.423828125, + "learning_rate": 8.95361781647362e-05, + "loss": 2.5633, + "step": 2227 + }, + { + "epoch": 0.63595109234965, + "grad_norm": 0.396484375, + "learning_rate": 8.941184926921576e-05, + "loss": 2.5668, + "step": 2228 + }, + { + "epoch": 0.6362365282079757, + "grad_norm": 0.384765625, + "learning_rate": 8.928757009347956e-05, + "loss": 2.5793, + "step": 2229 + }, + { + "epoch": 0.6365219640663013, + "grad_norm": 0.373046875, + "learning_rate": 8.916334073951345e-05, + "loss": 2.5548, + "step": 2230 + }, + { + "epoch": 0.6368073999246271, + "grad_norm": 0.419921875, + "learning_rate": 8.90391613092623e-05, + "loss": 2.5783, + "step": 2231 + }, + { + "epoch": 0.6370928357829528, + "grad_norm": 0.419921875, + "learning_rate": 8.891503190463024e-05, + "loss": 2.5809, + "step": 2232 + }, + { + "epoch": 0.6373782716412785, + "grad_norm": 0.390625, + "learning_rate": 8.879095262748018e-05, + "loss": 2.5614, + "step": 2233 + }, + { + "epoch": 0.6376637074996042, + "grad_norm": 0.41796875, + "learning_rate": 8.866692357963387e-05, + "loss": 2.5739, + "step": 2234 + }, + { + "epoch": 0.6379491433579298, + "grad_norm": 0.416015625, + "learning_rate": 8.854294486287188e-05, + "loss": 2.5764, + "step": 2235 + }, + { + "epoch": 0.6382345792162556, + "grad_norm": 0.4375, + "learning_rate": 8.84190165789336e-05, + "loss": 2.5702, + "step": 2236 + }, + { + "epoch": 0.6385200150745812, + "grad_norm": 0.40625, + "learning_rate": 8.829513882951686e-05, + "loss": 2.5682, + "step": 2237 + }, + { + "epoch": 0.638805450932907, + "grad_norm": 0.423828125, + "learning_rate": 8.8171311716278e-05, + "loss": 2.5557, + "step": 2238 + }, + { + "epoch": 0.6390908867912326, + "grad_norm": 0.42578125, + "learning_rate": 8.804753534083208e-05, + "loss": 2.5917, + "step": 2239 + }, + { + "epoch": 0.6393763226495583, + "grad_norm": 0.390625, + "learning_rate": 8.79238098047522e-05, + "loss": 2.5776, + "step": 2240 + }, + { + "epoch": 0.6396617585078841, + "grad_norm": 0.3984375, + "learning_rate": 8.780013520956996e-05, + "loss": 2.5412, + "step": 2241 + }, + { + "epoch": 0.6399471943662097, + "grad_norm": 0.423828125, + "learning_rate": 8.767651165677502e-05, + "loss": 2.572, + "step": 2242 + }, + { + "epoch": 0.6402326302245355, + "grad_norm": 0.388671875, + "learning_rate": 8.755293924781523e-05, + "loss": 2.5363, + "step": 2243 + }, + { + "epoch": 0.6405180660828611, + "grad_norm": 0.390625, + "learning_rate": 8.742941808409647e-05, + "loss": 2.5623, + "step": 2244 + }, + { + "epoch": 0.6408035019411868, + "grad_norm": 0.404296875, + "learning_rate": 8.730594826698253e-05, + "loss": 2.551, + "step": 2245 + }, + { + "epoch": 0.6410889377995125, + "grad_norm": 0.37109375, + "learning_rate": 8.718252989779496e-05, + "loss": 2.5181, + "step": 2246 + }, + { + "epoch": 0.6413743736578382, + "grad_norm": 0.396484375, + "learning_rate": 8.705916307781344e-05, + "loss": 2.5543, + "step": 2247 + }, + { + "epoch": 0.641659809516164, + "grad_norm": 0.392578125, + "learning_rate": 8.6935847908275e-05, + "loss": 2.5636, + "step": 2248 + }, + { + "epoch": 0.6419452453744896, + "grad_norm": 0.416015625, + "learning_rate": 8.681258449037438e-05, + "loss": 2.5439, + "step": 2249 + }, + { + "epoch": 0.6422306812328153, + "grad_norm": 0.396484375, + "learning_rate": 8.668937292526394e-05, + "loss": 2.5287, + "step": 2250 + }, + { + "epoch": 0.6422306812328153, + "eval_loss": 2.4652860164642334, + "eval_runtime": 6001.1587, + "eval_samples_per_second": 10.712, + "eval_steps_per_second": 10.712, + "step": 2250 + }, + { + "epoch": 0.642516117091141, + "grad_norm": 0.400390625, + "learning_rate": 8.656621331405339e-05, + "loss": 2.5401, + "step": 2251 + }, + { + "epoch": 0.6428015529494667, + "grad_norm": 0.373046875, + "learning_rate": 8.644310575780979e-05, + "loss": 2.5709, + "step": 2252 + }, + { + "epoch": 0.6430869888077924, + "grad_norm": 0.37890625, + "learning_rate": 8.632005035755766e-05, + "loss": 2.6213, + "step": 2253 + }, + { + "epoch": 0.6433724246661181, + "grad_norm": 0.38671875, + "learning_rate": 8.619704721427843e-05, + "loss": 2.5512, + "step": 2254 + }, + { + "epoch": 0.6436578605244437, + "grad_norm": 0.376953125, + "learning_rate": 8.607409642891091e-05, + "loss": 2.563, + "step": 2255 + }, + { + "epoch": 0.6439432963827695, + "grad_norm": 0.39453125, + "learning_rate": 8.595119810235088e-05, + "loss": 2.5438, + "step": 2256 + }, + { + "epoch": 0.6442287322410952, + "grad_norm": 0.38671875, + "learning_rate": 8.582835233545093e-05, + "loss": 2.5563, + "step": 2257 + }, + { + "epoch": 0.6445141680994209, + "grad_norm": 0.38671875, + "learning_rate": 8.570555922902074e-05, + "loss": 2.5278, + "step": 2258 + }, + { + "epoch": 0.6447996039577466, + "grad_norm": 0.388671875, + "learning_rate": 8.558281888382659e-05, + "loss": 2.5753, + "step": 2259 + }, + { + "epoch": 0.6450850398160722, + "grad_norm": 0.380859375, + "learning_rate": 8.546013140059148e-05, + "loss": 2.5751, + "step": 2260 + }, + { + "epoch": 0.645370475674398, + "grad_norm": 0.37890625, + "learning_rate": 8.53374968799952e-05, + "loss": 2.5553, + "step": 2261 + }, + { + "epoch": 0.6456559115327236, + "grad_norm": 0.3828125, + "learning_rate": 8.521491542267386e-05, + "loss": 2.5534, + "step": 2262 + }, + { + "epoch": 0.6459413473910494, + "grad_norm": 0.37890625, + "learning_rate": 8.509238712922014e-05, + "loss": 2.5781, + "step": 2263 + }, + { + "epoch": 0.6462267832493751, + "grad_norm": 0.365234375, + "learning_rate": 8.496991210018319e-05, + "loss": 2.5595, + "step": 2264 + }, + { + "epoch": 0.6465122191077007, + "grad_norm": 0.390625, + "learning_rate": 8.484749043606824e-05, + "loss": 2.5502, + "step": 2265 + }, + { + "epoch": 0.6467976549660265, + "grad_norm": 0.3671875, + "learning_rate": 8.472512223733679e-05, + "loss": 2.5458, + "step": 2266 + }, + { + "epoch": 0.6470830908243521, + "grad_norm": 0.375, + "learning_rate": 8.460280760440664e-05, + "loss": 2.5653, + "step": 2267 + }, + { + "epoch": 0.6473685266826779, + "grad_norm": 0.361328125, + "learning_rate": 8.448054663765135e-05, + "loss": 2.5727, + "step": 2268 + }, + { + "epoch": 0.6476539625410035, + "grad_norm": 0.390625, + "learning_rate": 8.435833943740064e-05, + "loss": 2.5665, + "step": 2269 + }, + { + "epoch": 0.6479393983993292, + "grad_norm": 0.390625, + "learning_rate": 8.423618610394004e-05, + "loss": 2.5411, + "step": 2270 + }, + { + "epoch": 0.6482248342576549, + "grad_norm": 0.375, + "learning_rate": 8.411408673751096e-05, + "loss": 2.5636, + "step": 2271 + }, + { + "epoch": 0.6485102701159806, + "grad_norm": 0.369140625, + "learning_rate": 8.399204143831036e-05, + "loss": 2.5729, + "step": 2272 + }, + { + "epoch": 0.6487957059743064, + "grad_norm": 0.37890625, + "learning_rate": 8.387005030649102e-05, + "loss": 2.5837, + "step": 2273 + }, + { + "epoch": 0.649081141832632, + "grad_norm": 0.375, + "learning_rate": 8.374811344216105e-05, + "loss": 2.5646, + "step": 2274 + }, + { + "epoch": 0.6493665776909577, + "grad_norm": 0.380859375, + "learning_rate": 8.362623094538428e-05, + "loss": 2.5886, + "step": 2275 + }, + { + "epoch": 0.6496520135492834, + "grad_norm": 0.39453125, + "learning_rate": 8.350440291617974e-05, + "loss": 2.5494, + "step": 2276 + }, + { + "epoch": 0.6499374494076091, + "grad_norm": 0.400390625, + "learning_rate": 8.338262945452176e-05, + "loss": 2.5577, + "step": 2277 + }, + { + "epoch": 0.6502228852659347, + "grad_norm": 0.369140625, + "learning_rate": 8.326091066033998e-05, + "loss": 2.5796, + "step": 2278 + }, + { + "epoch": 0.6505083211242605, + "grad_norm": 0.376953125, + "learning_rate": 8.313924663351926e-05, + "loss": 2.574, + "step": 2279 + }, + { + "epoch": 0.6507937569825862, + "grad_norm": 0.38671875, + "learning_rate": 8.301763747389925e-05, + "loss": 2.5544, + "step": 2280 + }, + { + "epoch": 0.6510791928409119, + "grad_norm": 0.36328125, + "learning_rate": 8.289608328127483e-05, + "loss": 2.5358, + "step": 2281 + }, + { + "epoch": 0.6513646286992376, + "grad_norm": 0.38671875, + "learning_rate": 8.277458415539569e-05, + "loss": 2.5567, + "step": 2282 + }, + { + "epoch": 0.6516500645575632, + "grad_norm": 0.375, + "learning_rate": 8.265314019596617e-05, + "loss": 2.5566, + "step": 2283 + }, + { + "epoch": 0.651935500415889, + "grad_norm": 0.369140625, + "learning_rate": 8.253175150264565e-05, + "loss": 2.5591, + "step": 2284 + }, + { + "epoch": 0.6522209362742146, + "grad_norm": 0.375, + "learning_rate": 8.241041817504791e-05, + "loss": 2.5519, + "step": 2285 + }, + { + "epoch": 0.6525063721325404, + "grad_norm": 0.380859375, + "learning_rate": 8.228914031274128e-05, + "loss": 2.5378, + "step": 2286 + }, + { + "epoch": 0.652791807990866, + "grad_norm": 0.392578125, + "learning_rate": 8.21679180152489e-05, + "loss": 2.5576, + "step": 2287 + }, + { + "epoch": 0.6530772438491917, + "grad_norm": 0.361328125, + "learning_rate": 8.204675138204794e-05, + "loss": 2.5636, + "step": 2288 + }, + { + "epoch": 0.6533626797075175, + "grad_norm": 0.37109375, + "learning_rate": 8.192564051257001e-05, + "loss": 2.5682, + "step": 2289 + }, + { + "epoch": 0.6536481155658431, + "grad_norm": 0.376953125, + "learning_rate": 8.180458550620109e-05, + "loss": 2.5616, + "step": 2290 + }, + { + "epoch": 0.6539335514241689, + "grad_norm": 0.3671875, + "learning_rate": 8.168358646228115e-05, + "loss": 2.5503, + "step": 2291 + }, + { + "epoch": 0.6542189872824945, + "grad_norm": 0.3828125, + "learning_rate": 8.156264348010425e-05, + "loss": 2.548, + "step": 2292 + }, + { + "epoch": 0.6545044231408202, + "grad_norm": 0.365234375, + "learning_rate": 8.144175665891858e-05, + "loss": 2.5327, + "step": 2293 + }, + { + "epoch": 0.6547898589991459, + "grad_norm": 0.369140625, + "learning_rate": 8.132092609792608e-05, + "loss": 2.5491, + "step": 2294 + }, + { + "epoch": 0.6550752948574716, + "grad_norm": 0.373046875, + "learning_rate": 8.120015189628259e-05, + "loss": 2.5576, + "step": 2295 + }, + { + "epoch": 0.6553607307157974, + "grad_norm": 0.375, + "learning_rate": 8.107943415309786e-05, + "loss": 2.5687, + "step": 2296 + }, + { + "epoch": 0.655646166574123, + "grad_norm": 0.388671875, + "learning_rate": 8.095877296743497e-05, + "loss": 2.5506, + "step": 2297 + }, + { + "epoch": 0.6559316024324487, + "grad_norm": 0.361328125, + "learning_rate": 8.083816843831091e-05, + "loss": 2.5609, + "step": 2298 + }, + { + "epoch": 0.6562170382907744, + "grad_norm": 0.35546875, + "learning_rate": 8.071762066469598e-05, + "loss": 2.5515, + "step": 2299 + }, + { + "epoch": 0.6565024741491001, + "grad_norm": 0.3671875, + "learning_rate": 8.059712974551392e-05, + "loss": 2.5587, + "step": 2300 + }, + { + "epoch": 0.6567879100074258, + "grad_norm": 0.384765625, + "learning_rate": 8.047669577964197e-05, + "loss": 2.5523, + "step": 2301 + }, + { + "epoch": 0.6570733458657515, + "grad_norm": 0.384765625, + "learning_rate": 8.03563188659104e-05, + "loss": 2.5321, + "step": 2302 + }, + { + "epoch": 0.6573587817240771, + "grad_norm": 0.36328125, + "learning_rate": 8.023599910310287e-05, + "loss": 2.5848, + "step": 2303 + }, + { + "epoch": 0.6576442175824029, + "grad_norm": 0.353515625, + "learning_rate": 8.011573658995606e-05, + "loss": 2.539, + "step": 2304 + }, + { + "epoch": 0.6579296534407286, + "grad_norm": 0.384765625, + "learning_rate": 7.999553142515969e-05, + "loss": 2.5545, + "step": 2305 + }, + { + "epoch": 0.6582150892990543, + "grad_norm": 0.373046875, + "learning_rate": 7.987538370735624e-05, + "loss": 2.5481, + "step": 2306 + }, + { + "epoch": 0.65850052515738, + "grad_norm": 0.373046875, + "learning_rate": 7.975529353514141e-05, + "loss": 2.5889, + "step": 2307 + }, + { + "epoch": 0.6587859610157056, + "grad_norm": 0.37109375, + "learning_rate": 7.963526100706337e-05, + "loss": 2.5113, + "step": 2308 + }, + { + "epoch": 0.6590713968740314, + "grad_norm": 0.361328125, + "learning_rate": 7.951528622162297e-05, + "loss": 2.5789, + "step": 2309 + }, + { + "epoch": 0.659356832732357, + "grad_norm": 0.36328125, + "learning_rate": 7.9395369277274e-05, + "loss": 2.546, + "step": 2310 + }, + { + "epoch": 0.6596422685906828, + "grad_norm": 0.3671875, + "learning_rate": 7.927551027242252e-05, + "loss": 2.5322, + "step": 2311 + }, + { + "epoch": 0.6599277044490084, + "grad_norm": 0.384765625, + "learning_rate": 7.9155709305427e-05, + "loss": 2.5277, + "step": 2312 + }, + { + "epoch": 0.6602131403073341, + "grad_norm": 0.384765625, + "learning_rate": 7.90359664745985e-05, + "loss": 2.5684, + "step": 2313 + }, + { + "epoch": 0.6604985761656599, + "grad_norm": 0.369140625, + "learning_rate": 7.891628187820021e-05, + "loss": 2.5712, + "step": 2314 + }, + { + "epoch": 0.6607840120239855, + "grad_norm": 0.384765625, + "learning_rate": 7.87966556144475e-05, + "loss": 2.5458, + "step": 2315 + }, + { + "epoch": 0.6610694478823113, + "grad_norm": 0.40234375, + "learning_rate": 7.867708778150812e-05, + "loss": 2.572, + "step": 2316 + }, + { + "epoch": 0.6613548837406369, + "grad_norm": 0.376953125, + "learning_rate": 7.855757847750151e-05, + "loss": 2.553, + "step": 2317 + }, + { + "epoch": 0.6616403195989626, + "grad_norm": 0.38671875, + "learning_rate": 7.843812780049935e-05, + "loss": 2.5738, + "step": 2318 + }, + { + "epoch": 0.6619257554572883, + "grad_norm": 0.375, + "learning_rate": 7.831873584852522e-05, + "loss": 2.5652, + "step": 2319 + }, + { + "epoch": 0.662211191315614, + "grad_norm": 0.37890625, + "learning_rate": 7.819940271955425e-05, + "loss": 2.5447, + "step": 2320 + }, + { + "epoch": 0.6624966271739398, + "grad_norm": 0.375, + "learning_rate": 7.808012851151362e-05, + "loss": 2.5698, + "step": 2321 + }, + { + "epoch": 0.6627820630322654, + "grad_norm": 0.3828125, + "learning_rate": 7.796091332228193e-05, + "loss": 2.54, + "step": 2322 + }, + { + "epoch": 0.6630674988905911, + "grad_norm": 0.3515625, + "learning_rate": 7.784175724968939e-05, + "loss": 2.5497, + "step": 2323 + }, + { + "epoch": 0.6633529347489168, + "grad_norm": 0.376953125, + "learning_rate": 7.772266039151781e-05, + "loss": 2.5507, + "step": 2324 + }, + { + "epoch": 0.6636383706072425, + "grad_norm": 3.140625, + "learning_rate": 7.760362284550024e-05, + "loss": 2.5712, + "step": 2325 + }, + { + "epoch": 0.6639238064655681, + "grad_norm": 0.67578125, + "learning_rate": 7.748464470932117e-05, + "loss": 2.5554, + "step": 2326 + }, + { + "epoch": 0.6642092423238939, + "grad_norm": 1.328125, + "learning_rate": 7.73657260806164e-05, + "loss": 2.5577, + "step": 2327 + }, + { + "epoch": 0.6644946781822195, + "grad_norm": 0.38671875, + "learning_rate": 7.724686705697274e-05, + "loss": 2.5744, + "step": 2328 + }, + { + "epoch": 0.6647801140405453, + "grad_norm": 0.431640625, + "learning_rate": 7.712806773592811e-05, + "loss": 2.547, + "step": 2329 + }, + { + "epoch": 0.665065549898871, + "grad_norm": 0.400390625, + "learning_rate": 7.700932821497157e-05, + "loss": 2.558, + "step": 2330 + }, + { + "epoch": 0.6653509857571966, + "grad_norm": 0.39453125, + "learning_rate": 7.689064859154299e-05, + "loss": 2.5383, + "step": 2331 + }, + { + "epoch": 0.6656364216155224, + "grad_norm": 0.3671875, + "learning_rate": 7.677202896303307e-05, + "loss": 2.6, + "step": 2332 + }, + { + "epoch": 0.665921857473848, + "grad_norm": 0.3828125, + "learning_rate": 7.665346942678335e-05, + "loss": 2.5926, + "step": 2333 + }, + { + "epoch": 0.6662072933321738, + "grad_norm": 0.384765625, + "learning_rate": 7.653497008008611e-05, + "loss": 2.5573, + "step": 2334 + }, + { + "epoch": 0.6664927291904994, + "grad_norm": 0.3828125, + "learning_rate": 7.641653102018402e-05, + "loss": 2.5838, + "step": 2335 + }, + { + "epoch": 0.6667781650488251, + "grad_norm": 0.380859375, + "learning_rate": 7.629815234427057e-05, + "loss": 2.5812, + "step": 2336 + }, + { + "epoch": 0.6670636009071509, + "grad_norm": 0.41015625, + "learning_rate": 7.617983414948937e-05, + "loss": 2.5533, + "step": 2337 + }, + { + "epoch": 0.6673490367654765, + "grad_norm": 0.376953125, + "learning_rate": 7.606157653293476e-05, + "loss": 2.5459, + "step": 2338 + }, + { + "epoch": 0.6676344726238023, + "grad_norm": 0.419921875, + "learning_rate": 7.594337959165107e-05, + "loss": 2.5619, + "step": 2339 + }, + { + "epoch": 0.6679199084821279, + "grad_norm": 0.380859375, + "learning_rate": 7.582524342263292e-05, + "loss": 2.5708, + "step": 2340 + }, + { + "epoch": 0.6682053443404536, + "grad_norm": 0.392578125, + "learning_rate": 7.570716812282512e-05, + "loss": 2.5465, + "step": 2341 + }, + { + "epoch": 0.6684907801987793, + "grad_norm": 0.388671875, + "learning_rate": 7.558915378912257e-05, + "loss": 2.5456, + "step": 2342 + }, + { + "epoch": 0.668776216057105, + "grad_norm": 0.3828125, + "learning_rate": 7.547120051836996e-05, + "loss": 2.5814, + "step": 2343 + }, + { + "epoch": 0.6690616519154307, + "grad_norm": 0.3984375, + "learning_rate": 7.535330840736209e-05, + "loss": 2.5684, + "step": 2344 + }, + { + "epoch": 0.6693470877737564, + "grad_norm": 0.357421875, + "learning_rate": 7.523547755284337e-05, + "loss": 2.5622, + "step": 2345 + }, + { + "epoch": 0.6696325236320821, + "grad_norm": 0.392578125, + "learning_rate": 7.511770805150802e-05, + "loss": 2.5668, + "step": 2346 + }, + { + "epoch": 0.6699179594904078, + "grad_norm": 0.390625, + "learning_rate": 7.500000000000002e-05, + "loss": 2.5299, + "step": 2347 + }, + { + "epoch": 0.6702033953487335, + "grad_norm": 0.384765625, + "learning_rate": 7.488235349491278e-05, + "loss": 2.546, + "step": 2348 + }, + { + "epoch": 0.6704888312070592, + "grad_norm": 0.388671875, + "learning_rate": 7.47647686327891e-05, + "loss": 2.5488, + "step": 2349 + }, + { + "epoch": 0.6707742670653849, + "grad_norm": 0.419921875, + "learning_rate": 7.464724551012161e-05, + "loss": 2.5425, + "step": 2350 + }, + { + "epoch": 0.6710597029237105, + "grad_norm": 0.365234375, + "learning_rate": 7.45297842233519e-05, + "loss": 2.5346, + "step": 2351 + }, + { + "epoch": 0.6713451387820363, + "grad_norm": 0.373046875, + "learning_rate": 7.441238486887083e-05, + "loss": 2.5254, + "step": 2352 + }, + { + "epoch": 0.671630574640362, + "grad_norm": 0.380859375, + "learning_rate": 7.42950475430187e-05, + "loss": 2.5561, + "step": 2353 + }, + { + "epoch": 0.6719160104986877, + "grad_norm": 0.376953125, + "learning_rate": 7.417777234208463e-05, + "loss": 2.5601, + "step": 2354 + }, + { + "epoch": 0.6722014463570134, + "grad_norm": 0.3671875, + "learning_rate": 7.406055936230687e-05, + "loss": 2.5617, + "step": 2355 + }, + { + "epoch": 0.672486882215339, + "grad_norm": 0.39453125, + "learning_rate": 7.394340869987267e-05, + "loss": 2.5633, + "step": 2356 + }, + { + "epoch": 0.6727723180736648, + "grad_norm": 0.380859375, + "learning_rate": 7.382632045091803e-05, + "loss": 2.5703, + "step": 2357 + }, + { + "epoch": 0.6730577539319904, + "grad_norm": 0.37109375, + "learning_rate": 7.37092947115278e-05, + "loss": 2.5611, + "step": 2358 + }, + { + "epoch": 0.6733431897903162, + "grad_norm": 0.369140625, + "learning_rate": 7.359233157773557e-05, + "loss": 2.5762, + "step": 2359 + }, + { + "epoch": 0.6736286256486418, + "grad_norm": 0.373046875, + "learning_rate": 7.347543114552343e-05, + "loss": 2.5665, + "step": 2360 + }, + { + "epoch": 0.6739140615069675, + "grad_norm": 0.40234375, + "learning_rate": 7.335859351082217e-05, + "loss": 2.548, + "step": 2361 + }, + { + "epoch": 0.6741994973652933, + "grad_norm": 0.365234375, + "learning_rate": 7.324181876951092e-05, + "loss": 2.5389, + "step": 2362 + }, + { + "epoch": 0.6744849332236189, + "grad_norm": 0.390625, + "learning_rate": 7.312510701741717e-05, + "loss": 2.5481, + "step": 2363 + }, + { + "epoch": 0.6747703690819447, + "grad_norm": 0.3671875, + "learning_rate": 7.300845835031693e-05, + "loss": 2.5571, + "step": 2364 + }, + { + "epoch": 0.6750558049402703, + "grad_norm": 0.3828125, + "learning_rate": 7.28918728639342e-05, + "loss": 2.5809, + "step": 2365 + }, + { + "epoch": 0.675341240798596, + "grad_norm": 0.384765625, + "learning_rate": 7.277535065394127e-05, + "loss": 2.5644, + "step": 2366 + }, + { + "epoch": 0.6756266766569217, + "grad_norm": 0.359375, + "learning_rate": 7.265889181595853e-05, + "loss": 2.5799, + "step": 2367 + }, + { + "epoch": 0.6759121125152474, + "grad_norm": 0.373046875, + "learning_rate": 7.254249644555429e-05, + "loss": 2.5631, + "step": 2368 + }, + { + "epoch": 0.6761975483735732, + "grad_norm": 0.36328125, + "learning_rate": 7.242616463824469e-05, + "loss": 2.5673, + "step": 2369 + }, + { + "epoch": 0.6764829842318988, + "grad_norm": 0.37109375, + "learning_rate": 7.230989648949396e-05, + "loss": 2.5697, + "step": 2370 + }, + { + "epoch": 0.6767684200902245, + "grad_norm": 0.36328125, + "learning_rate": 7.219369209471387e-05, + "loss": 2.569, + "step": 2371 + }, + { + "epoch": 0.6770538559485502, + "grad_norm": 0.357421875, + "learning_rate": 7.207755154926386e-05, + "loss": 2.5493, + "step": 2372 + }, + { + "epoch": 0.6773392918068759, + "grad_norm": 0.357421875, + "learning_rate": 7.196147494845127e-05, + "loss": 2.5515, + "step": 2373 + }, + { + "epoch": 0.6776247276652015, + "grad_norm": 0.396484375, + "learning_rate": 7.184546238753064e-05, + "loss": 2.5449, + "step": 2374 + }, + { + "epoch": 0.6779101635235273, + "grad_norm": 0.36328125, + "learning_rate": 7.172951396170402e-05, + "loss": 2.5657, + "step": 2375 + }, + { + "epoch": 0.6781955993818529, + "grad_norm": 0.376953125, + "learning_rate": 7.1613629766121e-05, + "loss": 2.5615, + "step": 2376 + }, + { + "epoch": 0.6784810352401787, + "grad_norm": 0.39453125, + "learning_rate": 7.149780989587825e-05, + "loss": 2.5787, + "step": 2377 + }, + { + "epoch": 0.6787664710985044, + "grad_norm": 0.359375, + "learning_rate": 7.138205444601985e-05, + "loss": 2.5632, + "step": 2378 + }, + { + "epoch": 0.67905190695683, + "grad_norm": 0.375, + "learning_rate": 7.126636351153684e-05, + "loss": 2.5594, + "step": 2379 + }, + { + "epoch": 0.6793373428151558, + "grad_norm": 0.373046875, + "learning_rate": 7.115073718736735e-05, + "loss": 2.55, + "step": 2380 + }, + { + "epoch": 0.6796227786734814, + "grad_norm": 0.357421875, + "learning_rate": 7.10351755683966e-05, + "loss": 2.5493, + "step": 2381 + }, + { + "epoch": 0.6799082145318072, + "grad_norm": 0.3671875, + "learning_rate": 7.09196787494567e-05, + "loss": 2.54, + "step": 2382 + }, + { + "epoch": 0.6801936503901328, + "grad_norm": 0.35546875, + "learning_rate": 7.08042468253264e-05, + "loss": 2.5681, + "step": 2383 + }, + { + "epoch": 0.6804790862484585, + "grad_norm": 0.375, + "learning_rate": 7.068887989073143e-05, + "loss": 2.5505, + "step": 2384 + }, + { + "epoch": 0.6807645221067842, + "grad_norm": 0.388671875, + "learning_rate": 7.057357804034404e-05, + "loss": 2.5489, + "step": 2385 + }, + { + "epoch": 0.6810499579651099, + "grad_norm": 0.373046875, + "learning_rate": 7.045834136878308e-05, + "loss": 2.5669, + "step": 2386 + }, + { + "epoch": 0.6813353938234357, + "grad_norm": 0.373046875, + "learning_rate": 7.0343169970614e-05, + "loss": 2.5354, + "step": 2387 + }, + { + "epoch": 0.6816208296817613, + "grad_norm": 0.359375, + "learning_rate": 7.022806394034856e-05, + "loss": 2.5571, + "step": 2388 + }, + { + "epoch": 0.681906265540087, + "grad_norm": 0.369140625, + "learning_rate": 7.0113023372445e-05, + "loss": 2.5556, + "step": 2389 + }, + { + "epoch": 0.6821917013984127, + "grad_norm": 0.36328125, + "learning_rate": 6.999804836130784e-05, + "loss": 2.5822, + "step": 2390 + }, + { + "epoch": 0.6824771372567384, + "grad_norm": 0.365234375, + "learning_rate": 6.988313900128769e-05, + "loss": 2.5923, + "step": 2391 + }, + { + "epoch": 0.682762573115064, + "grad_norm": 0.384765625, + "learning_rate": 6.97682953866813e-05, + "loss": 2.5303, + "step": 2392 + }, + { + "epoch": 0.6830480089733898, + "grad_norm": 0.37109375, + "learning_rate": 6.965351761173165e-05, + "loss": 2.5794, + "step": 2393 + }, + { + "epoch": 0.6833334448317155, + "grad_norm": 0.35546875, + "learning_rate": 6.953880577062745e-05, + "loss": 2.582, + "step": 2394 + }, + { + "epoch": 0.6836188806900412, + "grad_norm": 0.37109375, + "learning_rate": 6.94241599575034e-05, + "loss": 2.5485, + "step": 2395 + }, + { + "epoch": 0.6839043165483669, + "grad_norm": 0.361328125, + "learning_rate": 6.930958026644005e-05, + "loss": 2.5524, + "step": 2396 + }, + { + "epoch": 0.6841897524066926, + "grad_norm": 0.36328125, + "learning_rate": 6.919506679146372e-05, + "loss": 2.5754, + "step": 2397 + }, + { + "epoch": 0.6844751882650183, + "grad_norm": 0.357421875, + "learning_rate": 6.908061962654626e-05, + "loss": 2.5647, + "step": 2398 + }, + { + "epoch": 0.6847606241233439, + "grad_norm": 0.373046875, + "learning_rate": 6.896623886560528e-05, + "loss": 2.567, + "step": 2399 + }, + { + "epoch": 0.6850460599816697, + "grad_norm": 0.36328125, + "learning_rate": 6.885192460250366e-05, + "loss": 2.5596, + "step": 2400 + }, + { + "epoch": 0.6853314958399953, + "grad_norm": 0.40234375, + "learning_rate": 6.873767693105e-05, + "loss": 2.5652, + "step": 2401 + }, + { + "epoch": 0.685616931698321, + "grad_norm": 0.369140625, + "learning_rate": 6.8623495944998e-05, + "loss": 2.5612, + "step": 2402 + }, + { + "epoch": 0.6859023675566468, + "grad_norm": 0.37109375, + "learning_rate": 6.850938173804672e-05, + "loss": 2.5595, + "step": 2403 + }, + { + "epoch": 0.6861878034149724, + "grad_norm": 0.380859375, + "learning_rate": 6.839533440384051e-05, + "loss": 2.5805, + "step": 2404 + }, + { + "epoch": 0.6864732392732982, + "grad_norm": 0.353515625, + "learning_rate": 6.82813540359688e-05, + "loss": 2.5742, + "step": 2405 + }, + { + "epoch": 0.6867586751316238, + "grad_norm": 0.365234375, + "learning_rate": 6.816744072796592e-05, + "loss": 2.5801, + "step": 2406 + }, + { + "epoch": 0.6870441109899496, + "grad_norm": 0.365234375, + "learning_rate": 6.805359457331144e-05, + "loss": 2.5545, + "step": 2407 + }, + { + "epoch": 0.6873295468482752, + "grad_norm": 0.369140625, + "learning_rate": 6.793981566542957e-05, + "loss": 2.553, + "step": 2408 + }, + { + "epoch": 0.6876149827066009, + "grad_norm": 0.365234375, + "learning_rate": 6.78261040976894e-05, + "loss": 2.5477, + "step": 2409 + }, + { + "epoch": 0.6879004185649267, + "grad_norm": 0.36328125, + "learning_rate": 6.771245996340491e-05, + "loss": 2.5584, + "step": 2410 + }, + { + "epoch": 0.6881858544232523, + "grad_norm": 0.4453125, + "learning_rate": 6.759888335583458e-05, + "loss": 2.5786, + "step": 2411 + }, + { + "epoch": 0.688471290281578, + "grad_norm": 0.34765625, + "learning_rate": 6.748537436818142e-05, + "loss": 2.5663, + "step": 2412 + }, + { + "epoch": 0.6887567261399037, + "grad_norm": 0.38671875, + "learning_rate": 6.737193309359324e-05, + "loss": 2.5402, + "step": 2413 + }, + { + "epoch": 0.6890421619982294, + "grad_norm": 0.353515625, + "learning_rate": 6.7258559625162e-05, + "loss": 2.5748, + "step": 2414 + }, + { + "epoch": 0.6893275978565551, + "grad_norm": 0.357421875, + "learning_rate": 6.714525405592412e-05, + "loss": 2.5759, + "step": 2415 + }, + { + "epoch": 0.6896130337148808, + "grad_norm": 0.3828125, + "learning_rate": 6.703201647886034e-05, + "loss": 2.5636, + "step": 2416 + }, + { + "epoch": 0.6898984695732064, + "grad_norm": 0.4765625, + "learning_rate": 6.691884698689548e-05, + "loss": 2.5573, + "step": 2417 + }, + { + "epoch": 0.6901839054315322, + "grad_norm": 0.369140625, + "learning_rate": 6.680574567289864e-05, + "loss": 2.5802, + "step": 2418 + }, + { + "epoch": 0.6904693412898579, + "grad_norm": 0.373046875, + "learning_rate": 6.66927126296829e-05, + "loss": 2.5497, + "step": 2419 + }, + { + "epoch": 0.6907547771481836, + "grad_norm": 0.36328125, + "learning_rate": 6.657974795000525e-05, + "loss": 2.5806, + "step": 2420 + }, + { + "epoch": 0.6910402130065093, + "grad_norm": 0.37109375, + "learning_rate": 6.646685172656667e-05, + "loss": 2.5485, + "step": 2421 + }, + { + "epoch": 0.6913256488648349, + "grad_norm": 0.37109375, + "learning_rate": 6.6354024052012e-05, + "loss": 2.5518, + "step": 2422 + }, + { + "epoch": 0.6916110847231607, + "grad_norm": 0.373046875, + "learning_rate": 6.62412650189297e-05, + "loss": 2.5628, + "step": 2423 + }, + { + "epoch": 0.6918965205814863, + "grad_norm": 0.349609375, + "learning_rate": 6.612857471985203e-05, + "loss": 2.5364, + "step": 2424 + }, + { + "epoch": 0.6921819564398121, + "grad_norm": 0.365234375, + "learning_rate": 6.601595324725474e-05, + "loss": 2.5879, + "step": 2425 + }, + { + "epoch": 0.6924673922981378, + "grad_norm": 0.353515625, + "learning_rate": 6.590340069355713e-05, + "loss": 2.5652, + "step": 2426 + }, + { + "epoch": 0.6927528281564634, + "grad_norm": 0.37109375, + "learning_rate": 6.579091715112201e-05, + "loss": 2.544, + "step": 2427 + }, + { + "epoch": 0.6930382640147892, + "grad_norm": 0.384765625, + "learning_rate": 6.567850271225543e-05, + "loss": 2.5717, + "step": 2428 + }, + { + "epoch": 0.6933236998731148, + "grad_norm": 0.37109375, + "learning_rate": 6.556615746920685e-05, + "loss": 2.5632, + "step": 2429 + }, + { + "epoch": 0.6936091357314406, + "grad_norm": 0.3515625, + "learning_rate": 6.545388151416896e-05, + "loss": 2.544, + "step": 2430 + }, + { + "epoch": 0.6938945715897662, + "grad_norm": 0.36328125, + "learning_rate": 6.534167493927748e-05, + "loss": 2.5697, + "step": 2431 + }, + { + "epoch": 0.6941800074480919, + "grad_norm": 0.35546875, + "learning_rate": 6.522953783661121e-05, + "loss": 2.5455, + "step": 2432 + }, + { + "epoch": 0.6944654433064176, + "grad_norm": 0.404296875, + "learning_rate": 6.511747029819207e-05, + "loss": 2.5844, + "step": 2433 + }, + { + "epoch": 0.6947508791647433, + "grad_norm": 0.36328125, + "learning_rate": 6.500547241598478e-05, + "loss": 2.5579, + "step": 2434 + }, + { + "epoch": 0.6950363150230691, + "grad_norm": 0.3828125, + "learning_rate": 6.489354428189683e-05, + "loss": 2.5542, + "step": 2435 + }, + { + "epoch": 0.6953217508813947, + "grad_norm": 0.3671875, + "learning_rate": 6.478168598777864e-05, + "loss": 2.5787, + "step": 2436 + }, + { + "epoch": 0.6956071867397204, + "grad_norm": 0.39453125, + "learning_rate": 6.466989762542332e-05, + "loss": 2.5676, + "step": 2437 + }, + { + "epoch": 0.6958926225980461, + "grad_norm": 0.3671875, + "learning_rate": 6.455817928656636e-05, + "loss": 2.5601, + "step": 2438 + }, + { + "epoch": 0.6961780584563718, + "grad_norm": 0.33984375, + "learning_rate": 6.444653106288612e-05, + "loss": 2.5721, + "step": 2439 + }, + { + "epoch": 0.6964634943146975, + "grad_norm": 0.42578125, + "learning_rate": 6.433495304600306e-05, + "loss": 2.5427, + "step": 2440 + }, + { + "epoch": 0.6967489301730232, + "grad_norm": 0.361328125, + "learning_rate": 6.422344532748039e-05, + "loss": 2.5505, + "step": 2441 + }, + { + "epoch": 0.6970343660313489, + "grad_norm": 0.384765625, + "learning_rate": 6.411200799882338e-05, + "loss": 2.5491, + "step": 2442 + }, + { + "epoch": 0.6973198018896746, + "grad_norm": 0.36328125, + "learning_rate": 6.400064115147955e-05, + "loss": 2.5645, + "step": 2443 + }, + { + "epoch": 0.6976052377480003, + "grad_norm": 0.34765625, + "learning_rate": 6.38893448768387e-05, + "loss": 2.5374, + "step": 2444 + }, + { + "epoch": 0.697890673606326, + "grad_norm": 0.3515625, + "learning_rate": 6.377811926623273e-05, + "loss": 2.5343, + "step": 2445 + }, + { + "epoch": 0.6981761094646517, + "grad_norm": 0.345703125, + "learning_rate": 6.366696441093536e-05, + "loss": 2.6022, + "step": 2446 + }, + { + "epoch": 0.6984615453229773, + "grad_norm": 0.365234375, + "learning_rate": 6.355588040216248e-05, + "loss": 2.5745, + "step": 2447 + }, + { + "epoch": 0.6987469811813031, + "grad_norm": 0.390625, + "learning_rate": 6.344486733107168e-05, + "loss": 2.5623, + "step": 2448 + }, + { + "epoch": 0.6990324170396287, + "grad_norm": 0.353515625, + "learning_rate": 6.333392528876233e-05, + "loss": 2.567, + "step": 2449 + }, + { + "epoch": 0.6993178528979545, + "grad_norm": 0.359375, + "learning_rate": 6.32230543662757e-05, + "loss": 2.5734, + "step": 2450 + }, + { + "epoch": 0.6996032887562802, + "grad_norm": 0.38671875, + "learning_rate": 6.311225465459442e-05, + "loss": 2.5358, + "step": 2451 + }, + { + "epoch": 0.6998887246146058, + "grad_norm": 0.369140625, + "learning_rate": 6.300152624464296e-05, + "loss": 2.5494, + "step": 2452 + }, + { + "epoch": 0.7001741604729316, + "grad_norm": 0.3515625, + "learning_rate": 6.289086922728712e-05, + "loss": 2.5602, + "step": 2453 + }, + { + "epoch": 0.7004595963312572, + "grad_norm": 0.3515625, + "learning_rate": 6.278028369333413e-05, + "loss": 2.5788, + "step": 2454 + }, + { + "epoch": 0.700745032189583, + "grad_norm": 0.392578125, + "learning_rate": 6.266976973353252e-05, + "loss": 2.5591, + "step": 2455 + }, + { + "epoch": 0.7010304680479086, + "grad_norm": 0.3671875, + "learning_rate": 6.255932743857226e-05, + "loss": 2.5517, + "step": 2456 + }, + { + "epoch": 0.7013159039062343, + "grad_norm": 0.353515625, + "learning_rate": 6.244895689908426e-05, + "loss": 2.5502, + "step": 2457 + }, + { + "epoch": 0.70160133976456, + "grad_norm": 0.373046875, + "learning_rate": 6.233865820564079e-05, + "loss": 2.5815, + "step": 2458 + }, + { + "epoch": 0.7018867756228857, + "grad_norm": 0.353515625, + "learning_rate": 6.222843144875492e-05, + "loss": 2.5633, + "step": 2459 + }, + { + "epoch": 0.7021722114812115, + "grad_norm": 0.373046875, + "learning_rate": 6.211827671888098e-05, + "loss": 2.5513, + "step": 2460 + }, + { + "epoch": 0.7024576473395371, + "grad_norm": 0.380859375, + "learning_rate": 6.200819410641385e-05, + "loss": 2.569, + "step": 2461 + }, + { + "epoch": 0.7027430831978628, + "grad_norm": 0.37109375, + "learning_rate": 6.189818370168956e-05, + "loss": 2.559, + "step": 2462 + }, + { + "epoch": 0.7030285190561885, + "grad_norm": 0.369140625, + "learning_rate": 6.17882455949846e-05, + "loss": 2.5625, + "step": 2463 + }, + { + "epoch": 0.7033139549145142, + "grad_norm": 0.359375, + "learning_rate": 6.16783798765164e-05, + "loss": 2.552, + "step": 2464 + }, + { + "epoch": 0.7035993907728398, + "grad_norm": 0.365234375, + "learning_rate": 6.156858663644277e-05, + "loss": 2.5329, + "step": 2465 + }, + { + "epoch": 0.7038848266311656, + "grad_norm": 0.33984375, + "learning_rate": 6.145886596486208e-05, + "loss": 2.5371, + "step": 2466 + }, + { + "epoch": 0.7041702624894913, + "grad_norm": 0.337890625, + "learning_rate": 6.134921795181324e-05, + "loss": 2.561, + "step": 2467 + }, + { + "epoch": 0.704455698347817, + "grad_norm": 0.34765625, + "learning_rate": 6.123964268727554e-05, + "loss": 2.5607, + "step": 2468 + }, + { + "epoch": 0.7047411342061427, + "grad_norm": 0.3515625, + "learning_rate": 6.113014026116841e-05, + "loss": 2.5781, + "step": 2469 + }, + { + "epoch": 0.7050265700644683, + "grad_norm": 0.369140625, + "learning_rate": 6.102071076335173e-05, + "loss": 2.5742, + "step": 2470 + }, + { + "epoch": 0.7053120059227941, + "grad_norm": 0.341796875, + "learning_rate": 6.091135428362536e-05, + "loss": 2.5736, + "step": 2471 + }, + { + "epoch": 0.7055974417811197, + "grad_norm": 0.36328125, + "learning_rate": 6.0802070911729246e-05, + "loss": 2.5795, + "step": 2472 + }, + { + "epoch": 0.7058828776394455, + "grad_norm": 0.357421875, + "learning_rate": 6.06928607373435e-05, + "loss": 2.5563, + "step": 2473 + }, + { + "epoch": 0.7061683134977711, + "grad_norm": 0.357421875, + "learning_rate": 6.058372385008801e-05, + "loss": 2.5287, + "step": 2474 + }, + { + "epoch": 0.7064537493560968, + "grad_norm": 0.34765625, + "learning_rate": 6.047466033952245e-05, + "loss": 2.5752, + "step": 2475 + }, + { + "epoch": 0.7067391852144226, + "grad_norm": 0.34765625, + "learning_rate": 6.036567029514665e-05, + "loss": 2.5511, + "step": 2476 + }, + { + "epoch": 0.7070246210727482, + "grad_norm": 0.357421875, + "learning_rate": 6.025675380639976e-05, + "loss": 2.5685, + "step": 2477 + }, + { + "epoch": 0.707310056931074, + "grad_norm": 0.357421875, + "learning_rate": 6.0147910962660684e-05, + "loss": 2.577, + "step": 2478 + }, + { + "epoch": 0.7075954927893996, + "grad_norm": 0.3671875, + "learning_rate": 6.003914185324802e-05, + "loss": 2.5451, + "step": 2479 + }, + { + "epoch": 0.7078809286477253, + "grad_norm": 0.349609375, + "learning_rate": 5.993044656741965e-05, + "loss": 2.5405, + "step": 2480 + }, + { + "epoch": 0.708166364506051, + "grad_norm": 0.34765625, + "learning_rate": 5.982182519437311e-05, + "loss": 2.5569, + "step": 2481 + }, + { + "epoch": 0.7084518003643767, + "grad_norm": 0.373046875, + "learning_rate": 5.971327782324508e-05, + "loss": 2.5454, + "step": 2482 + }, + { + "epoch": 0.7087372362227025, + "grad_norm": 0.369140625, + "learning_rate": 5.960480454311155e-05, + "loss": 2.5725, + "step": 2483 + }, + { + "epoch": 0.7090226720810281, + "grad_norm": 0.34375, + "learning_rate": 5.949640544298779e-05, + "loss": 2.5612, + "step": 2484 + }, + { + "epoch": 0.7093081079393538, + "grad_norm": 0.3359375, + "learning_rate": 5.938808061182823e-05, + "loss": 2.5581, + "step": 2485 + }, + { + "epoch": 0.7095935437976795, + "grad_norm": 0.34765625, + "learning_rate": 5.927983013852614e-05, + "loss": 2.5476, + "step": 2486 + }, + { + "epoch": 0.7098789796560052, + "grad_norm": 0.359375, + "learning_rate": 5.917165411191405e-05, + "loss": 2.5592, + "step": 2487 + }, + { + "epoch": 0.7101644155143308, + "grad_norm": 0.36328125, + "learning_rate": 5.906355262076317e-05, + "loss": 2.5649, + "step": 2488 + }, + { + "epoch": 0.7104498513726566, + "grad_norm": 0.3515625, + "learning_rate": 5.895552575378361e-05, + "loss": 2.5849, + "step": 2489 + }, + { + "epoch": 0.7107352872309822, + "grad_norm": 0.34765625, + "learning_rate": 5.8847573599624335e-05, + "loss": 2.5812, + "step": 2490 + }, + { + "epoch": 0.711020723089308, + "grad_norm": 0.365234375, + "learning_rate": 5.8739696246872853e-05, + "loss": 2.5425, + "step": 2491 + }, + { + "epoch": 0.7113061589476337, + "grad_norm": 0.353515625, + "learning_rate": 5.863189378405541e-05, + "loss": 2.554, + "step": 2492 + }, + { + "epoch": 0.7115915948059593, + "grad_norm": 0.361328125, + "learning_rate": 5.8524166299636785e-05, + "loss": 2.5374, + "step": 2493 + }, + { + "epoch": 0.7118770306642851, + "grad_norm": 0.353515625, + "learning_rate": 5.841651388202015e-05, + "loss": 2.5079, + "step": 2494 + }, + { + "epoch": 0.7121624665226107, + "grad_norm": 0.380859375, + "learning_rate": 5.8308936619547076e-05, + "loss": 2.5421, + "step": 2495 + }, + { + "epoch": 0.7124479023809365, + "grad_norm": 0.376953125, + "learning_rate": 5.820143460049759e-05, + "loss": 2.5617, + "step": 2496 + }, + { + "epoch": 0.7127333382392621, + "grad_norm": 0.3515625, + "learning_rate": 5.809400791308978e-05, + "loss": 2.5253, + "step": 2497 + }, + { + "epoch": 0.7130187740975878, + "grad_norm": 0.34765625, + "learning_rate": 5.798665664548015e-05, + "loss": 2.5518, + "step": 2498 + }, + { + "epoch": 0.7133042099559136, + "grad_norm": 0.369140625, + "learning_rate": 5.787938088576305e-05, + "loss": 2.5575, + "step": 2499 + }, + { + "epoch": 0.7135896458142392, + "grad_norm": 0.359375, + "learning_rate": 5.777218072197113e-05, + "loss": 2.5604, + "step": 2500 + }, + { + "epoch": 0.7135896458142392, + "eval_loss": 2.4628705978393555, + "eval_runtime": 5982.5105, + "eval_samples_per_second": 10.746, + "eval_steps_per_second": 10.746, + "step": 2500 + } + ], + "logging_steps": 1, + "max_steps": 3503, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 250, + "total_flos": 9.70632734441472e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}