{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.96969696969697, "eval_steps": 500, "global_step": 615, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04040404040404041, "grad_norm": 80.8431859261654, "learning_rate": 4.0650406504065046e-07, "loss": 2.64, "step": 5 }, { "epoch": 0.08080808080808081, "grad_norm": 58.95328571668908, "learning_rate": 8.130081300813009e-07, "loss": 2.5295, "step": 10 }, { "epoch": 0.12121212121212122, "grad_norm": 30.259656389849923, "learning_rate": 1.2195121951219514e-06, "loss": 2.3595, "step": 15 }, { "epoch": 0.16161616161616163, "grad_norm": 19.031626660884115, "learning_rate": 1.6260162601626018e-06, "loss": 2.2046, "step": 20 }, { "epoch": 0.20202020202020202, "grad_norm": 14.285496207217582, "learning_rate": 2.0325203252032523e-06, "loss": 2.0114, "step": 25 }, { "epoch": 0.24242424242424243, "grad_norm": 11.705575031693748, "learning_rate": 2.4390243902439027e-06, "loss": 1.9542, "step": 30 }, { "epoch": 0.2828282828282828, "grad_norm": 10.031255451818282, "learning_rate": 2.845528455284553e-06, "loss": 1.9027, "step": 35 }, { "epoch": 0.32323232323232326, "grad_norm": 9.87064503889936, "learning_rate": 3.2520325203252037e-06, "loss": 1.8376, "step": 40 }, { "epoch": 0.36363636363636365, "grad_norm": 9.463342602070677, "learning_rate": 3.6585365853658537e-06, "loss": 1.8335, "step": 45 }, { "epoch": 0.40404040404040403, "grad_norm": 8.14740265971335, "learning_rate": 4.0650406504065046e-06, "loss": 1.7531, "step": 50 }, { "epoch": 0.4444444444444444, "grad_norm": 8.579660709279775, "learning_rate": 4.471544715447155e-06, "loss": 1.746, "step": 55 }, { "epoch": 0.48484848484848486, "grad_norm": 7.764594425498635, "learning_rate": 4.8780487804878055e-06, "loss": 1.7795, "step": 60 }, { "epoch": 0.5252525252525253, "grad_norm": 7.818967776047545, "learning_rate": 5.2845528455284555e-06, "loss": 1.7845, "step": 65 }, { "epoch": 0.5656565656565656, "grad_norm": 8.571992272728625, "learning_rate": 5.691056910569106e-06, "loss": 1.7176, "step": 70 }, { "epoch": 0.6060606060606061, "grad_norm": 7.157751637340014, "learning_rate": 6.0975609756097564e-06, "loss": 1.6894, "step": 75 }, { "epoch": 0.6464646464646465, "grad_norm": 7.776730945646382, "learning_rate": 6.504065040650407e-06, "loss": 1.6733, "step": 80 }, { "epoch": 0.6868686868686869, "grad_norm": 7.538790813241334, "learning_rate": 6.910569105691057e-06, "loss": 1.6661, "step": 85 }, { "epoch": 0.7272727272727273, "grad_norm": 7.237611731257021, "learning_rate": 7.317073170731707e-06, "loss": 1.6413, "step": 90 }, { "epoch": 0.7676767676767676, "grad_norm": 8.713375291094371, "learning_rate": 7.723577235772358e-06, "loss": 1.6712, "step": 95 }, { "epoch": 0.8080808080808081, "grad_norm": 8.148761177726017, "learning_rate": 8.130081300813009e-06, "loss": 1.6491, "step": 100 }, { "epoch": 0.8484848484848485, "grad_norm": 7.670084788858041, "learning_rate": 8.536585365853658e-06, "loss": 1.6661, "step": 105 }, { "epoch": 0.8888888888888888, "grad_norm": 8.193767782896925, "learning_rate": 8.94308943089431e-06, "loss": 1.6061, "step": 110 }, { "epoch": 0.9292929292929293, "grad_norm": 7.327023272280465, "learning_rate": 9.34959349593496e-06, "loss": 1.6177, "step": 115 }, { "epoch": 0.9696969696969697, "grad_norm": 7.6240834480577995, "learning_rate": 9.756097560975611e-06, "loss": 1.7007, "step": 120 }, { "epoch": 1.0101010101010102, "grad_norm": 7.022808705113093, "learning_rate": 9.959349593495936e-06, "loss": 1.5861, "step": 125 }, { "epoch": 1.0505050505050506, "grad_norm": 7.831382750074179, "learning_rate": 9.857723577235772e-06, "loss": 1.397, "step": 130 }, { "epoch": 1.0909090909090908, "grad_norm": 7.482634887493619, "learning_rate": 9.756097560975611e-06, "loss": 1.3207, "step": 135 }, { "epoch": 1.1313131313131313, "grad_norm": 7.77405709671628, "learning_rate": 9.654471544715448e-06, "loss": 1.4016, "step": 140 }, { "epoch": 1.1717171717171717, "grad_norm": 8.316039462095251, "learning_rate": 9.552845528455286e-06, "loss": 1.4044, "step": 145 }, { "epoch": 1.2121212121212122, "grad_norm": 8.220962058193507, "learning_rate": 9.451219512195122e-06, "loss": 1.4525, "step": 150 }, { "epoch": 1.2525252525252526, "grad_norm": 8.628523413754008, "learning_rate": 9.34959349593496e-06, "loss": 1.3725, "step": 155 }, { "epoch": 1.2929292929292928, "grad_norm": 9.049067713299571, "learning_rate": 9.247967479674797e-06, "loss": 1.3988, "step": 160 }, { "epoch": 1.3333333333333333, "grad_norm": 8.79216448432911, "learning_rate": 9.146341463414635e-06, "loss": 1.3923, "step": 165 }, { "epoch": 1.3737373737373737, "grad_norm": 7.749843189602475, "learning_rate": 9.044715447154472e-06, "loss": 1.3965, "step": 170 }, { "epoch": 1.4141414141414141, "grad_norm": 8.10742211898071, "learning_rate": 8.94308943089431e-06, "loss": 1.3586, "step": 175 }, { "epoch": 1.4545454545454546, "grad_norm": 8.61374938206157, "learning_rate": 8.841463414634148e-06, "loss": 1.3455, "step": 180 }, { "epoch": 1.494949494949495, "grad_norm": 7.806546434692009, "learning_rate": 8.739837398373985e-06, "loss": 1.3164, "step": 185 }, { "epoch": 1.5353535353535355, "grad_norm": 8.055069468513308, "learning_rate": 8.638211382113821e-06, "loss": 1.3719, "step": 190 }, { "epoch": 1.5757575757575757, "grad_norm": 7.286077733666312, "learning_rate": 8.536585365853658e-06, "loss": 1.331, "step": 195 }, { "epoch": 1.6161616161616161, "grad_norm": 7.722391890399128, "learning_rate": 8.434959349593497e-06, "loss": 1.347, "step": 200 }, { "epoch": 1.6565656565656566, "grad_norm": 7.963303799143797, "learning_rate": 8.333333333333334e-06, "loss": 1.2988, "step": 205 }, { "epoch": 1.696969696969697, "grad_norm": 8.363381324799755, "learning_rate": 8.23170731707317e-06, "loss": 1.3731, "step": 210 }, { "epoch": 1.7373737373737375, "grad_norm": 8.650050591837509, "learning_rate": 8.130081300813009e-06, "loss": 1.3554, "step": 215 }, { "epoch": 1.7777777777777777, "grad_norm": 7.821382086934233, "learning_rate": 8.028455284552846e-06, "loss": 1.3257, "step": 220 }, { "epoch": 1.8181818181818183, "grad_norm": 7.593249783363984, "learning_rate": 7.926829268292685e-06, "loss": 1.2994, "step": 225 }, { "epoch": 1.8585858585858586, "grad_norm": 8.265959609765153, "learning_rate": 7.82520325203252e-06, "loss": 1.258, "step": 230 }, { "epoch": 1.898989898989899, "grad_norm": 7.47340446063849, "learning_rate": 7.723577235772358e-06, "loss": 1.3744, "step": 235 }, { "epoch": 1.9393939393939394, "grad_norm": 7.8636893423505505, "learning_rate": 7.621951219512196e-06, "loss": 1.2867, "step": 240 }, { "epoch": 1.9797979797979797, "grad_norm": 8.66108025838036, "learning_rate": 7.520325203252034e-06, "loss": 1.3423, "step": 245 }, { "epoch": 2.0202020202020203, "grad_norm": 7.253266730457967, "learning_rate": 7.41869918699187e-06, "loss": 1.1248, "step": 250 }, { "epoch": 2.0606060606060606, "grad_norm": 7.304801005516647, "learning_rate": 7.317073170731707e-06, "loss": 0.8695, "step": 255 }, { "epoch": 2.101010101010101, "grad_norm": 8.148533408280995, "learning_rate": 7.215447154471545e-06, "loss": 0.8396, "step": 260 }, { "epoch": 2.1414141414141414, "grad_norm": 9.202743834871297, "learning_rate": 7.113821138211383e-06, "loss": 0.8835, "step": 265 }, { "epoch": 2.1818181818181817, "grad_norm": 9.248143695051853, "learning_rate": 7.01219512195122e-06, "loss": 0.769, "step": 270 }, { "epoch": 2.2222222222222223, "grad_norm": 8.877400578549704, "learning_rate": 6.910569105691057e-06, "loss": 0.8883, "step": 275 }, { "epoch": 2.2626262626262625, "grad_norm": 9.858300545714043, "learning_rate": 6.808943089430895e-06, "loss": 0.8892, "step": 280 }, { "epoch": 2.303030303030303, "grad_norm": 8.940708831871842, "learning_rate": 6.707317073170733e-06, "loss": 0.8526, "step": 285 }, { "epoch": 2.3434343434343434, "grad_norm": 9.167915788662723, "learning_rate": 6.60569105691057e-06, "loss": 0.9309, "step": 290 }, { "epoch": 2.3838383838383836, "grad_norm": 8.435170262522817, "learning_rate": 6.504065040650407e-06, "loss": 0.8693, "step": 295 }, { "epoch": 2.4242424242424243, "grad_norm": 9.022959586969035, "learning_rate": 6.402439024390244e-06, "loss": 0.8659, "step": 300 }, { "epoch": 2.4646464646464645, "grad_norm": 8.705530302904208, "learning_rate": 6.300813008130082e-06, "loss": 0.9076, "step": 305 }, { "epoch": 2.505050505050505, "grad_norm": 8.449067406312437, "learning_rate": 6.199186991869919e-06, "loss": 0.8896, "step": 310 }, { "epoch": 2.5454545454545454, "grad_norm": 8.962552587001122, "learning_rate": 6.0975609756097564e-06, "loss": 0.8568, "step": 315 }, { "epoch": 2.5858585858585856, "grad_norm": 9.26680724967832, "learning_rate": 5.995934959349594e-06, "loss": 0.8707, "step": 320 }, { "epoch": 2.6262626262626263, "grad_norm": 9.852323988179384, "learning_rate": 5.894308943089432e-06, "loss": 0.9007, "step": 325 }, { "epoch": 2.6666666666666665, "grad_norm": 9.473031970955077, "learning_rate": 5.792682926829269e-06, "loss": 0.8907, "step": 330 }, { "epoch": 2.707070707070707, "grad_norm": 8.423216825316242, "learning_rate": 5.691056910569106e-06, "loss": 0.8408, "step": 335 }, { "epoch": 2.7474747474747474, "grad_norm": 8.772022355651819, "learning_rate": 5.589430894308944e-06, "loss": 0.8791, "step": 340 }, { "epoch": 2.787878787878788, "grad_norm": 9.456726517429484, "learning_rate": 5.487804878048781e-06, "loss": 0.9048, "step": 345 }, { "epoch": 2.8282828282828283, "grad_norm": 8.932119436113132, "learning_rate": 5.386178861788618e-06, "loss": 0.9458, "step": 350 }, { "epoch": 2.8686868686868685, "grad_norm": 9.076984191036512, "learning_rate": 5.2845528455284555e-06, "loss": 0.887, "step": 355 }, { "epoch": 2.909090909090909, "grad_norm": 9.228482912276068, "learning_rate": 5.182926829268293e-06, "loss": 0.9164, "step": 360 }, { "epoch": 2.9494949494949494, "grad_norm": 9.39115941622314, "learning_rate": 5.081300813008131e-06, "loss": 0.9146, "step": 365 }, { "epoch": 2.98989898989899, "grad_norm": 9.590905474617363, "learning_rate": 4.979674796747968e-06, "loss": 0.8975, "step": 370 }, { "epoch": 3.0303030303030303, "grad_norm": 8.147437990412763, "learning_rate": 4.8780487804878055e-06, "loss": 0.5845, "step": 375 }, { "epoch": 3.0707070707070705, "grad_norm": 8.966178510928351, "learning_rate": 4.776422764227643e-06, "loss": 0.4348, "step": 380 }, { "epoch": 3.111111111111111, "grad_norm": 9.411638909598917, "learning_rate": 4.67479674796748e-06, "loss": 0.4666, "step": 385 }, { "epoch": 3.1515151515151514, "grad_norm": 8.787258663179166, "learning_rate": 4.573170731707318e-06, "loss": 0.4412, "step": 390 }, { "epoch": 3.191919191919192, "grad_norm": 8.448686917354301, "learning_rate": 4.471544715447155e-06, "loss": 0.4644, "step": 395 }, { "epoch": 3.2323232323232323, "grad_norm": 10.794449599661947, "learning_rate": 4.369918699186992e-06, "loss": 0.4697, "step": 400 }, { "epoch": 3.2727272727272725, "grad_norm": 9.903329877706229, "learning_rate": 4.268292682926829e-06, "loss": 0.4451, "step": 405 }, { "epoch": 3.313131313131313, "grad_norm": 10.77521515420911, "learning_rate": 4.166666666666667e-06, "loss": 0.4332, "step": 410 }, { "epoch": 3.3535353535353534, "grad_norm": 9.850095186977901, "learning_rate": 4.0650406504065046e-06, "loss": 0.4571, "step": 415 }, { "epoch": 3.393939393939394, "grad_norm": 9.719974950746256, "learning_rate": 3.963414634146342e-06, "loss": 0.4651, "step": 420 }, { "epoch": 3.4343434343434343, "grad_norm": 8.909903135988007, "learning_rate": 3.861788617886179e-06, "loss": 0.4412, "step": 425 }, { "epoch": 3.474747474747475, "grad_norm": 9.513088350689786, "learning_rate": 3.760162601626017e-06, "loss": 0.4713, "step": 430 }, { "epoch": 3.515151515151515, "grad_norm": 8.770653506187902, "learning_rate": 3.6585365853658537e-06, "loss": 0.4325, "step": 435 }, { "epoch": 3.5555555555555554, "grad_norm": 10.698789680572704, "learning_rate": 3.5569105691056914e-06, "loss": 0.456, "step": 440 }, { "epoch": 3.595959595959596, "grad_norm": 9.719819699090596, "learning_rate": 3.4552845528455287e-06, "loss": 0.4673, "step": 445 }, { "epoch": 3.6363636363636362, "grad_norm": 10.503333896340955, "learning_rate": 3.3536585365853664e-06, "loss": 0.4571, "step": 450 }, { "epoch": 3.676767676767677, "grad_norm": 9.503329386063092, "learning_rate": 3.2520325203252037e-06, "loss": 0.4389, "step": 455 }, { "epoch": 3.717171717171717, "grad_norm": 10.009998037929371, "learning_rate": 3.150406504065041e-06, "loss": 0.4638, "step": 460 }, { "epoch": 3.757575757575758, "grad_norm": 9.165548124514086, "learning_rate": 3.0487804878048782e-06, "loss": 0.4911, "step": 465 }, { "epoch": 3.797979797979798, "grad_norm": 8.906764540289775, "learning_rate": 2.947154471544716e-06, "loss": 0.4568, "step": 470 }, { "epoch": 3.8383838383838382, "grad_norm": 9.034724254956192, "learning_rate": 2.845528455284553e-06, "loss": 0.4336, "step": 475 }, { "epoch": 3.878787878787879, "grad_norm": 11.166876327596686, "learning_rate": 2.7439024390243905e-06, "loss": 0.4499, "step": 480 }, { "epoch": 3.919191919191919, "grad_norm": 9.217158341597267, "learning_rate": 2.6422764227642278e-06, "loss": 0.421, "step": 485 }, { "epoch": 3.9595959595959593, "grad_norm": 10.35588055954636, "learning_rate": 2.5406504065040655e-06, "loss": 0.461, "step": 490 }, { "epoch": 4.0, "grad_norm": 9.702783400257974, "learning_rate": 2.4390243902439027e-06, "loss": 0.4693, "step": 495 }, { "epoch": 4.040404040404041, "grad_norm": 7.785172054818246, "learning_rate": 2.33739837398374e-06, "loss": 0.2154, "step": 500 }, { "epoch": 4.08080808080808, "grad_norm": 8.930594851346704, "learning_rate": 2.2357723577235773e-06, "loss": 0.1967, "step": 505 }, { "epoch": 4.121212121212121, "grad_norm": 10.107028846989817, "learning_rate": 2.1341463414634146e-06, "loss": 0.1844, "step": 510 }, { "epoch": 4.161616161616162, "grad_norm": 7.826371662630265, "learning_rate": 2.0325203252032523e-06, "loss": 0.19, "step": 515 }, { "epoch": 4.202020202020202, "grad_norm": 7.454368070701538, "learning_rate": 1.9308943089430896e-06, "loss": 0.204, "step": 520 }, { "epoch": 4.242424242424242, "grad_norm": 10.555419961914236, "learning_rate": 1.8292682926829268e-06, "loss": 0.1838, "step": 525 }, { "epoch": 4.282828282828283, "grad_norm": 7.50612949709692, "learning_rate": 1.7276422764227643e-06, "loss": 0.2054, "step": 530 }, { "epoch": 4.3232323232323235, "grad_norm": 7.8740842398900055, "learning_rate": 1.6260162601626018e-06, "loss": 0.1933, "step": 535 }, { "epoch": 4.363636363636363, "grad_norm": 7.239232049440118, "learning_rate": 1.5243902439024391e-06, "loss": 0.195, "step": 540 }, { "epoch": 4.404040404040404, "grad_norm": 8.52719981728453, "learning_rate": 1.4227642276422766e-06, "loss": 0.2201, "step": 545 }, { "epoch": 4.444444444444445, "grad_norm": 6.870928125369453, "learning_rate": 1.3211382113821139e-06, "loss": 0.1876, "step": 550 }, { "epoch": 4.484848484848484, "grad_norm": 7.742011223629616, "learning_rate": 1.2195121951219514e-06, "loss": 0.1811, "step": 555 }, { "epoch": 4.525252525252525, "grad_norm": 7.506513525908142, "learning_rate": 1.1178861788617887e-06, "loss": 0.1805, "step": 560 }, { "epoch": 4.565656565656566, "grad_norm": 7.401202667774116, "learning_rate": 1.0162601626016261e-06, "loss": 0.1951, "step": 565 }, { "epoch": 4.606060606060606, "grad_norm": 8.135929523728391, "learning_rate": 9.146341463414634e-07, "loss": 0.1856, "step": 570 }, { "epoch": 4.646464646464646, "grad_norm": 9.307497468880753, "learning_rate": 8.130081300813009e-07, "loss": 0.1807, "step": 575 }, { "epoch": 4.686868686868687, "grad_norm": 8.362839951501785, "learning_rate": 7.113821138211383e-07, "loss": 0.1653, "step": 580 }, { "epoch": 4.7272727272727275, "grad_norm": 9.06145198185612, "learning_rate": 6.097560975609757e-07, "loss": 0.1747, "step": 585 }, { "epoch": 4.767676767676767, "grad_norm": 7.9663701451927516, "learning_rate": 5.081300813008131e-07, "loss": 0.1922, "step": 590 }, { "epoch": 4.808080808080808, "grad_norm": 8.512067102195044, "learning_rate": 4.0650406504065046e-07, "loss": 0.1837, "step": 595 }, { "epoch": 4.848484848484849, "grad_norm": 7.294652883363056, "learning_rate": 3.0487804878048784e-07, "loss": 0.1877, "step": 600 }, { "epoch": 4.888888888888889, "grad_norm": 8.551601154075781, "learning_rate": 2.0325203252032523e-07, "loss": 0.1851, "step": 605 }, { "epoch": 4.929292929292929, "grad_norm": 8.034884715447463, "learning_rate": 1.0162601626016261e-07, "loss": 0.187, "step": 610 }, { "epoch": 4.96969696969697, "grad_norm": 7.727841885701067, "learning_rate": 0.0, "loss": 0.1957, "step": 615 }, { "epoch": 4.96969696969697, "step": 615, "total_flos": 849370300416.0, "train_loss": 0.9517717417662706, "train_runtime": 734.4559, "train_samples_per_second": 26.959, "train_steps_per_second": 0.837 } ], "logging_steps": 5, "max_steps": 615, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 849370300416.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }