{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.96969696969697,
  "eval_steps": 500,
  "global_step": 615,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04040404040404041,
      "grad_norm": 80.8431859261654,
      "learning_rate": 4.0650406504065046e-07,
      "loss": 2.64,
      "step": 5
    },
    {
      "epoch": 0.08080808080808081,
      "grad_norm": 58.95328571668908,
      "learning_rate": 8.130081300813009e-07,
      "loss": 2.5295,
      "step": 10
    },
    {
      "epoch": 0.12121212121212122,
      "grad_norm": 30.259656389849923,
      "learning_rate": 1.2195121951219514e-06,
      "loss": 2.3595,
      "step": 15
    },
    {
      "epoch": 0.16161616161616163,
      "grad_norm": 19.031626660884115,
      "learning_rate": 1.6260162601626018e-06,
      "loss": 2.2046,
      "step": 20
    },
    {
      "epoch": 0.20202020202020202,
      "grad_norm": 14.285496207217582,
      "learning_rate": 2.0325203252032523e-06,
      "loss": 2.0114,
      "step": 25
    },
    {
      "epoch": 0.24242424242424243,
      "grad_norm": 11.705575031693748,
      "learning_rate": 2.4390243902439027e-06,
      "loss": 1.9542,
      "step": 30
    },
    {
      "epoch": 0.2828282828282828,
      "grad_norm": 10.031255451818282,
      "learning_rate": 2.845528455284553e-06,
      "loss": 1.9027,
      "step": 35
    },
    {
      "epoch": 0.32323232323232326,
      "grad_norm": 9.87064503889936,
      "learning_rate": 3.2520325203252037e-06,
      "loss": 1.8376,
      "step": 40
    },
    {
      "epoch": 0.36363636363636365,
      "grad_norm": 9.463342602070677,
      "learning_rate": 3.6585365853658537e-06,
      "loss": 1.8335,
      "step": 45
    },
    {
      "epoch": 0.40404040404040403,
      "grad_norm": 8.14740265971335,
      "learning_rate": 4.0650406504065046e-06,
      "loss": 1.7531,
      "step": 50
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 8.579660709279775,
      "learning_rate": 4.471544715447155e-06,
      "loss": 1.746,
      "step": 55
    },
    {
      "epoch": 0.48484848484848486,
      "grad_norm": 7.764594425498635,
      "learning_rate": 4.8780487804878055e-06,
      "loss": 1.7795,
      "step": 60
    },
    {
      "epoch": 0.5252525252525253,
      "grad_norm": 7.818967776047545,
      "learning_rate": 5.2845528455284555e-06,
      "loss": 1.7845,
      "step": 65
    },
    {
      "epoch": 0.5656565656565656,
      "grad_norm": 8.571992272728625,
      "learning_rate": 5.691056910569106e-06,
      "loss": 1.7176,
      "step": 70
    },
    {
      "epoch": 0.6060606060606061,
      "grad_norm": 7.157751637340014,
      "learning_rate": 6.0975609756097564e-06,
      "loss": 1.6894,
      "step": 75
    },
    {
      "epoch": 0.6464646464646465,
      "grad_norm": 7.776730945646382,
      "learning_rate": 6.504065040650407e-06,
      "loss": 1.6733,
      "step": 80
    },
    {
      "epoch": 0.6868686868686869,
      "grad_norm": 7.538790813241334,
      "learning_rate": 6.910569105691057e-06,
      "loss": 1.6661,
      "step": 85
    },
    {
      "epoch": 0.7272727272727273,
      "grad_norm": 7.237611731257021,
      "learning_rate": 7.317073170731707e-06,
      "loss": 1.6413,
      "step": 90
    },
    {
      "epoch": 0.7676767676767676,
      "grad_norm": 8.713375291094371,
      "learning_rate": 7.723577235772358e-06,
      "loss": 1.6712,
      "step": 95
    },
    {
      "epoch": 0.8080808080808081,
      "grad_norm": 8.148761177726017,
      "learning_rate": 8.130081300813009e-06,
      "loss": 1.6491,
      "step": 100
    },
    {
      "epoch": 0.8484848484848485,
      "grad_norm": 7.670084788858041,
      "learning_rate": 8.536585365853658e-06,
      "loss": 1.6661,
      "step": 105
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 8.193767782896925,
      "learning_rate": 8.94308943089431e-06,
      "loss": 1.6061,
      "step": 110
    },
    {
      "epoch": 0.9292929292929293,
      "grad_norm": 7.327023272280465,
      "learning_rate": 9.34959349593496e-06,
      "loss": 1.6177,
      "step": 115
    },
    {
      "epoch": 0.9696969696969697,
      "grad_norm": 7.6240834480577995,
      "learning_rate": 9.756097560975611e-06,
      "loss": 1.7007,
      "step": 120
    },
    {
      "epoch": 1.0101010101010102,
      "grad_norm": 7.022808705113093,
      "learning_rate": 9.959349593495936e-06,
      "loss": 1.5861,
      "step": 125
    },
    {
      "epoch": 1.0505050505050506,
      "grad_norm": 7.831382750074179,
      "learning_rate": 9.857723577235772e-06,
      "loss": 1.397,
      "step": 130
    },
    {
      "epoch": 1.0909090909090908,
      "grad_norm": 7.482634887493619,
      "learning_rate": 9.756097560975611e-06,
      "loss": 1.3207,
      "step": 135
    },
    {
      "epoch": 1.1313131313131313,
      "grad_norm": 7.77405709671628,
      "learning_rate": 9.654471544715448e-06,
      "loss": 1.4016,
      "step": 140
    },
    {
      "epoch": 1.1717171717171717,
      "grad_norm": 8.316039462095251,
      "learning_rate": 9.552845528455286e-06,
      "loss": 1.4044,
      "step": 145
    },
    {
      "epoch": 1.2121212121212122,
      "grad_norm": 8.220962058193507,
      "learning_rate": 9.451219512195122e-06,
      "loss": 1.4525,
      "step": 150
    },
    {
      "epoch": 1.2525252525252526,
      "grad_norm": 8.628523413754008,
      "learning_rate": 9.34959349593496e-06,
      "loss": 1.3725,
      "step": 155
    },
    {
      "epoch": 1.2929292929292928,
      "grad_norm": 9.049067713299571,
      "learning_rate": 9.247967479674797e-06,
      "loss": 1.3988,
      "step": 160
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 8.79216448432911,
      "learning_rate": 9.146341463414635e-06,
      "loss": 1.3923,
      "step": 165
    },
    {
      "epoch": 1.3737373737373737,
      "grad_norm": 7.749843189602475,
      "learning_rate": 9.044715447154472e-06,
      "loss": 1.3965,
      "step": 170
    },
    {
      "epoch": 1.4141414141414141,
      "grad_norm": 8.10742211898071,
      "learning_rate": 8.94308943089431e-06,
      "loss": 1.3586,
      "step": 175
    },
    {
      "epoch": 1.4545454545454546,
      "grad_norm": 8.61374938206157,
      "learning_rate": 8.841463414634148e-06,
      "loss": 1.3455,
      "step": 180
    },
    {
      "epoch": 1.494949494949495,
      "grad_norm": 7.806546434692009,
      "learning_rate": 8.739837398373985e-06,
      "loss": 1.3164,
      "step": 185
    },
    {
      "epoch": 1.5353535353535355,
      "grad_norm": 8.055069468513308,
      "learning_rate": 8.638211382113821e-06,
      "loss": 1.3719,
      "step": 190
    },
    {
      "epoch": 1.5757575757575757,
      "grad_norm": 7.286077733666312,
      "learning_rate": 8.536585365853658e-06,
      "loss": 1.331,
      "step": 195
    },
    {
      "epoch": 1.6161616161616161,
      "grad_norm": 7.722391890399128,
      "learning_rate": 8.434959349593497e-06,
      "loss": 1.347,
      "step": 200
    },
    {
      "epoch": 1.6565656565656566,
      "grad_norm": 7.963303799143797,
      "learning_rate": 8.333333333333334e-06,
      "loss": 1.2988,
      "step": 205
    },
    {
      "epoch": 1.696969696969697,
      "grad_norm": 8.363381324799755,
      "learning_rate": 8.23170731707317e-06,
      "loss": 1.3731,
      "step": 210
    },
    {
      "epoch": 1.7373737373737375,
      "grad_norm": 8.650050591837509,
      "learning_rate": 8.130081300813009e-06,
      "loss": 1.3554,
      "step": 215
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 7.821382086934233,
      "learning_rate": 8.028455284552846e-06,
      "loss": 1.3257,
      "step": 220
    },
    {
      "epoch": 1.8181818181818183,
      "grad_norm": 7.593249783363984,
      "learning_rate": 7.926829268292685e-06,
      "loss": 1.2994,
      "step": 225
    },
    {
      "epoch": 1.8585858585858586,
      "grad_norm": 8.265959609765153,
      "learning_rate": 7.82520325203252e-06,
      "loss": 1.258,
      "step": 230
    },
    {
      "epoch": 1.898989898989899,
      "grad_norm": 7.47340446063849,
      "learning_rate": 7.723577235772358e-06,
      "loss": 1.3744,
      "step": 235
    },
    {
      "epoch": 1.9393939393939394,
      "grad_norm": 7.8636893423505505,
      "learning_rate": 7.621951219512196e-06,
      "loss": 1.2867,
      "step": 240
    },
    {
      "epoch": 1.9797979797979797,
      "grad_norm": 8.66108025838036,
      "learning_rate": 7.520325203252034e-06,
      "loss": 1.3423,
      "step": 245
    },
    {
      "epoch": 2.0202020202020203,
      "grad_norm": 7.253266730457967,
      "learning_rate": 7.41869918699187e-06,
      "loss": 1.1248,
      "step": 250
    },
    {
      "epoch": 2.0606060606060606,
      "grad_norm": 7.304801005516647,
      "learning_rate": 7.317073170731707e-06,
      "loss": 0.8695,
      "step": 255
    },
    {
      "epoch": 2.101010101010101,
      "grad_norm": 8.148533408280995,
      "learning_rate": 7.215447154471545e-06,
      "loss": 0.8396,
      "step": 260
    },
    {
      "epoch": 2.1414141414141414,
      "grad_norm": 9.202743834871297,
      "learning_rate": 7.113821138211383e-06,
      "loss": 0.8835,
      "step": 265
    },
    {
      "epoch": 2.1818181818181817,
      "grad_norm": 9.248143695051853,
      "learning_rate": 7.01219512195122e-06,
      "loss": 0.769,
      "step": 270
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 8.877400578549704,
      "learning_rate": 6.910569105691057e-06,
      "loss": 0.8883,
      "step": 275
    },
    {
      "epoch": 2.2626262626262625,
      "grad_norm": 9.858300545714043,
      "learning_rate": 6.808943089430895e-06,
      "loss": 0.8892,
      "step": 280
    },
    {
      "epoch": 2.303030303030303,
      "grad_norm": 8.940708831871842,
      "learning_rate": 6.707317073170733e-06,
      "loss": 0.8526,
      "step": 285
    },
    {
      "epoch": 2.3434343434343434,
      "grad_norm": 9.167915788662723,
      "learning_rate": 6.60569105691057e-06,
      "loss": 0.9309,
      "step": 290
    },
    {
      "epoch": 2.3838383838383836,
      "grad_norm": 8.435170262522817,
      "learning_rate": 6.504065040650407e-06,
      "loss": 0.8693,
      "step": 295
    },
    {
      "epoch": 2.4242424242424243,
      "grad_norm": 9.022959586969035,
      "learning_rate": 6.402439024390244e-06,
      "loss": 0.8659,
      "step": 300
    },
    {
      "epoch": 2.4646464646464645,
      "grad_norm": 8.705530302904208,
      "learning_rate": 6.300813008130082e-06,
      "loss": 0.9076,
      "step": 305
    },
    {
      "epoch": 2.505050505050505,
      "grad_norm": 8.449067406312437,
      "learning_rate": 6.199186991869919e-06,
      "loss": 0.8896,
      "step": 310
    },
    {
      "epoch": 2.5454545454545454,
      "grad_norm": 8.962552587001122,
      "learning_rate": 6.0975609756097564e-06,
      "loss": 0.8568,
      "step": 315
    },
    {
      "epoch": 2.5858585858585856,
      "grad_norm": 9.26680724967832,
      "learning_rate": 5.995934959349594e-06,
      "loss": 0.8707,
      "step": 320
    },
    {
      "epoch": 2.6262626262626263,
      "grad_norm": 9.852323988179384,
      "learning_rate": 5.894308943089432e-06,
      "loss": 0.9007,
      "step": 325
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 9.473031970955077,
      "learning_rate": 5.792682926829269e-06,
      "loss": 0.8907,
      "step": 330
    },
    {
      "epoch": 2.707070707070707,
      "grad_norm": 8.423216825316242,
      "learning_rate": 5.691056910569106e-06,
      "loss": 0.8408,
      "step": 335
    },
    {
      "epoch": 2.7474747474747474,
      "grad_norm": 8.772022355651819,
      "learning_rate": 5.589430894308944e-06,
      "loss": 0.8791,
      "step": 340
    },
    {
      "epoch": 2.787878787878788,
      "grad_norm": 9.456726517429484,
      "learning_rate": 5.487804878048781e-06,
      "loss": 0.9048,
      "step": 345
    },
    {
      "epoch": 2.8282828282828283,
      "grad_norm": 8.932119436113132,
      "learning_rate": 5.386178861788618e-06,
      "loss": 0.9458,
      "step": 350
    },
    {
      "epoch": 2.8686868686868685,
      "grad_norm": 9.076984191036512,
      "learning_rate": 5.2845528455284555e-06,
      "loss": 0.887,
      "step": 355
    },
    {
      "epoch": 2.909090909090909,
      "grad_norm": 9.228482912276068,
      "learning_rate": 5.182926829268293e-06,
      "loss": 0.9164,
      "step": 360
    },
    {
      "epoch": 2.9494949494949494,
      "grad_norm": 9.39115941622314,
      "learning_rate": 5.081300813008131e-06,
      "loss": 0.9146,
      "step": 365
    },
    {
      "epoch": 2.98989898989899,
      "grad_norm": 9.590905474617363,
      "learning_rate": 4.979674796747968e-06,
      "loss": 0.8975,
      "step": 370
    },
    {
      "epoch": 3.0303030303030303,
      "grad_norm": 8.147437990412763,
      "learning_rate": 4.8780487804878055e-06,
      "loss": 0.5845,
      "step": 375
    },
    {
      "epoch": 3.0707070707070705,
      "grad_norm": 8.966178510928351,
      "learning_rate": 4.776422764227643e-06,
      "loss": 0.4348,
      "step": 380
    },
    {
      "epoch": 3.111111111111111,
      "grad_norm": 9.411638909598917,
      "learning_rate": 4.67479674796748e-06,
      "loss": 0.4666,
      "step": 385
    },
    {
      "epoch": 3.1515151515151514,
      "grad_norm": 8.787258663179166,
      "learning_rate": 4.573170731707318e-06,
      "loss": 0.4412,
      "step": 390
    },
    {
      "epoch": 3.191919191919192,
      "grad_norm": 8.448686917354301,
      "learning_rate": 4.471544715447155e-06,
      "loss": 0.4644,
      "step": 395
    },
    {
      "epoch": 3.2323232323232323,
      "grad_norm": 10.794449599661947,
      "learning_rate": 4.369918699186992e-06,
      "loss": 0.4697,
      "step": 400
    },
    {
      "epoch": 3.2727272727272725,
      "grad_norm": 9.903329877706229,
      "learning_rate": 4.268292682926829e-06,
      "loss": 0.4451,
      "step": 405
    },
    {
      "epoch": 3.313131313131313,
      "grad_norm": 10.77521515420911,
      "learning_rate": 4.166666666666667e-06,
      "loss": 0.4332,
      "step": 410
    },
    {
      "epoch": 3.3535353535353534,
      "grad_norm": 9.850095186977901,
      "learning_rate": 4.0650406504065046e-06,
      "loss": 0.4571,
      "step": 415
    },
    {
      "epoch": 3.393939393939394,
      "grad_norm": 9.719974950746256,
      "learning_rate": 3.963414634146342e-06,
      "loss": 0.4651,
      "step": 420
    },
    {
      "epoch": 3.4343434343434343,
      "grad_norm": 8.909903135988007,
      "learning_rate": 3.861788617886179e-06,
      "loss": 0.4412,
      "step": 425
    },
    {
      "epoch": 3.474747474747475,
      "grad_norm": 9.513088350689786,
      "learning_rate": 3.760162601626017e-06,
      "loss": 0.4713,
      "step": 430
    },
    {
      "epoch": 3.515151515151515,
      "grad_norm": 8.770653506187902,
      "learning_rate": 3.6585365853658537e-06,
      "loss": 0.4325,
      "step": 435
    },
    {
      "epoch": 3.5555555555555554,
      "grad_norm": 10.698789680572704,
      "learning_rate": 3.5569105691056914e-06,
      "loss": 0.456,
      "step": 440
    },
    {
      "epoch": 3.595959595959596,
      "grad_norm": 9.719819699090596,
      "learning_rate": 3.4552845528455287e-06,
      "loss": 0.4673,
      "step": 445
    },
    {
      "epoch": 3.6363636363636362,
      "grad_norm": 10.503333896340955,
      "learning_rate": 3.3536585365853664e-06,
      "loss": 0.4571,
      "step": 450
    },
    {
      "epoch": 3.676767676767677,
      "grad_norm": 9.503329386063092,
      "learning_rate": 3.2520325203252037e-06,
      "loss": 0.4389,
      "step": 455
    },
    {
      "epoch": 3.717171717171717,
      "grad_norm": 10.009998037929371,
      "learning_rate": 3.150406504065041e-06,
      "loss": 0.4638,
      "step": 460
    },
    {
      "epoch": 3.757575757575758,
      "grad_norm": 9.165548124514086,
      "learning_rate": 3.0487804878048782e-06,
      "loss": 0.4911,
      "step": 465
    },
    {
      "epoch": 3.797979797979798,
      "grad_norm": 8.906764540289775,
      "learning_rate": 2.947154471544716e-06,
      "loss": 0.4568,
      "step": 470
    },
    {
      "epoch": 3.8383838383838382,
      "grad_norm": 9.034724254956192,
      "learning_rate": 2.845528455284553e-06,
      "loss": 0.4336,
      "step": 475
    },
    {
      "epoch": 3.878787878787879,
      "grad_norm": 11.166876327596686,
      "learning_rate": 2.7439024390243905e-06,
      "loss": 0.4499,
      "step": 480
    },
    {
      "epoch": 3.919191919191919,
      "grad_norm": 9.217158341597267,
      "learning_rate": 2.6422764227642278e-06,
      "loss": 0.421,
      "step": 485
    },
    {
      "epoch": 3.9595959595959593,
      "grad_norm": 10.35588055954636,
      "learning_rate": 2.5406504065040655e-06,
      "loss": 0.461,
      "step": 490
    },
    {
      "epoch": 4.0,
      "grad_norm": 9.702783400257974,
      "learning_rate": 2.4390243902439027e-06,
      "loss": 0.4693,
      "step": 495
    },
    {
      "epoch": 4.040404040404041,
      "grad_norm": 7.785172054818246,
      "learning_rate": 2.33739837398374e-06,
      "loss": 0.2154,
      "step": 500
    },
    {
      "epoch": 4.08080808080808,
      "grad_norm": 8.930594851346704,
      "learning_rate": 2.2357723577235773e-06,
      "loss": 0.1967,
      "step": 505
    },
    {
      "epoch": 4.121212121212121,
      "grad_norm": 10.107028846989817,
      "learning_rate": 2.1341463414634146e-06,
      "loss": 0.1844,
      "step": 510
    },
    {
      "epoch": 4.161616161616162,
      "grad_norm": 7.826371662630265,
      "learning_rate": 2.0325203252032523e-06,
      "loss": 0.19,
      "step": 515
    },
    {
      "epoch": 4.202020202020202,
      "grad_norm": 7.454368070701538,
      "learning_rate": 1.9308943089430896e-06,
      "loss": 0.204,
      "step": 520
    },
    {
      "epoch": 4.242424242424242,
      "grad_norm": 10.555419961914236,
      "learning_rate": 1.8292682926829268e-06,
      "loss": 0.1838,
      "step": 525
    },
    {
      "epoch": 4.282828282828283,
      "grad_norm": 7.50612949709692,
      "learning_rate": 1.7276422764227643e-06,
      "loss": 0.2054,
      "step": 530
    },
    {
      "epoch": 4.3232323232323235,
      "grad_norm": 7.8740842398900055,
      "learning_rate": 1.6260162601626018e-06,
      "loss": 0.1933,
      "step": 535
    },
    {
      "epoch": 4.363636363636363,
      "grad_norm": 7.239232049440118,
      "learning_rate": 1.5243902439024391e-06,
      "loss": 0.195,
      "step": 540
    },
    {
      "epoch": 4.404040404040404,
      "grad_norm": 8.52719981728453,
      "learning_rate": 1.4227642276422766e-06,
      "loss": 0.2201,
      "step": 545
    },
    {
      "epoch": 4.444444444444445,
      "grad_norm": 6.870928125369453,
      "learning_rate": 1.3211382113821139e-06,
      "loss": 0.1876,
      "step": 550
    },
    {
      "epoch": 4.484848484848484,
      "grad_norm": 7.742011223629616,
      "learning_rate": 1.2195121951219514e-06,
      "loss": 0.1811,
      "step": 555
    },
    {
      "epoch": 4.525252525252525,
      "grad_norm": 7.506513525908142,
      "learning_rate": 1.1178861788617887e-06,
      "loss": 0.1805,
      "step": 560
    },
    {
      "epoch": 4.565656565656566,
      "grad_norm": 7.401202667774116,
      "learning_rate": 1.0162601626016261e-06,
      "loss": 0.1951,
      "step": 565
    },
    {
      "epoch": 4.606060606060606,
      "grad_norm": 8.135929523728391,
      "learning_rate": 9.146341463414634e-07,
      "loss": 0.1856,
      "step": 570
    },
    {
      "epoch": 4.646464646464646,
      "grad_norm": 9.307497468880753,
      "learning_rate": 8.130081300813009e-07,
      "loss": 0.1807,
      "step": 575
    },
    {
      "epoch": 4.686868686868687,
      "grad_norm": 8.362839951501785,
      "learning_rate": 7.113821138211383e-07,
      "loss": 0.1653,
      "step": 580
    },
    {
      "epoch": 4.7272727272727275,
      "grad_norm": 9.06145198185612,
      "learning_rate": 6.097560975609757e-07,
      "loss": 0.1747,
      "step": 585
    },
    {
      "epoch": 4.767676767676767,
      "grad_norm": 7.9663701451927516,
      "learning_rate": 5.081300813008131e-07,
      "loss": 0.1922,
      "step": 590
    },
    {
      "epoch": 4.808080808080808,
      "grad_norm": 8.512067102195044,
      "learning_rate": 4.0650406504065046e-07,
      "loss": 0.1837,
      "step": 595
    },
    {
      "epoch": 4.848484848484849,
      "grad_norm": 7.294652883363056,
      "learning_rate": 3.0487804878048784e-07,
      "loss": 0.1877,
      "step": 600
    },
    {
      "epoch": 4.888888888888889,
      "grad_norm": 8.551601154075781,
      "learning_rate": 2.0325203252032523e-07,
      "loss": 0.1851,
      "step": 605
    },
    {
      "epoch": 4.929292929292929,
      "grad_norm": 8.034884715447463,
      "learning_rate": 1.0162601626016261e-07,
      "loss": 0.187,
      "step": 610
    },
    {
      "epoch": 4.96969696969697,
      "grad_norm": 7.727841885701067,
      "learning_rate": 0.0,
      "loss": 0.1957,
      "step": 615
    },
    {
      "epoch": 4.96969696969697,
      "step": 615,
      "total_flos": 849370300416.0,
      "train_loss": 0.9517717417662706,
      "train_runtime": 734.4559,
      "train_samples_per_second": 26.959,
      "train_steps_per_second": 0.837
    }
  ],
  "logging_steps": 5,
  "max_steps": 615,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 849370300416.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}