{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.2792022792022792,
  "eval_steps": 500,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.011396011396011397,
      "grad_norm": 1.0688412189483643,
      "learning_rate": 0.0001,
      "loss": 2.5442,
      "step": 1
    },
    {
      "epoch": 0.022792022792022793,
      "grad_norm": 0.9676798582077026,
      "learning_rate": 9.949748743718594e-05,
      "loss": 2.4169,
      "step": 2
    },
    {
      "epoch": 0.03418803418803419,
      "grad_norm": 0.9322265386581421,
      "learning_rate": 9.899497487437186e-05,
      "loss": 2.4585,
      "step": 3
    },
    {
      "epoch": 0.045584045584045586,
      "grad_norm": 1.0203752517700195,
      "learning_rate": 9.84924623115578e-05,
      "loss": 2.2605,
      "step": 4
    },
    {
      "epoch": 0.05698005698005698,
      "grad_norm": 1.0615314245224,
      "learning_rate": 9.798994974874372e-05,
      "loss": 2.2029,
      "step": 5
    },
    {
      "epoch": 0.06837606837606838,
      "grad_norm": 1.1191452741622925,
      "learning_rate": 9.748743718592965e-05,
      "loss": 1.977,
      "step": 6
    },
    {
      "epoch": 0.07977207977207977,
      "grad_norm": 1.0572948455810547,
      "learning_rate": 9.698492462311559e-05,
      "loss": 1.8041,
      "step": 7
    },
    {
      "epoch": 0.09116809116809117,
      "grad_norm": 1.0000556707382202,
      "learning_rate": 9.64824120603015e-05,
      "loss": 1.8135,
      "step": 8
    },
    {
      "epoch": 0.10256410256410256,
      "grad_norm": 0.815621554851532,
      "learning_rate": 9.597989949748745e-05,
      "loss": 1.6268,
      "step": 9
    },
    {
      "epoch": 0.11396011396011396,
      "grad_norm": 0.8220420479774475,
      "learning_rate": 9.547738693467337e-05,
      "loss": 1.556,
      "step": 10
    },
    {
      "epoch": 0.12535612535612536,
      "grad_norm": 0.8398631811141968,
      "learning_rate": 9.49748743718593e-05,
      "loss": 1.4275,
      "step": 11
    },
    {
      "epoch": 0.13675213675213677,
      "grad_norm": 0.7354830503463745,
      "learning_rate": 9.447236180904523e-05,
      "loss": 1.4687,
      "step": 12
    },
    {
      "epoch": 0.14814814814814814,
      "grad_norm": 0.4654861092567444,
      "learning_rate": 9.396984924623115e-05,
      "loss": 1.3937,
      "step": 13
    },
    {
      "epoch": 0.15954415954415954,
      "grad_norm": 0.5842018127441406,
      "learning_rate": 9.34673366834171e-05,
      "loss": 1.4105,
      "step": 14
    },
    {
      "epoch": 0.17094017094017094,
      "grad_norm": 0.6819984912872314,
      "learning_rate": 9.296482412060302e-05,
      "loss": 1.3469,
      "step": 15
    },
    {
      "epoch": 0.18233618233618235,
      "grad_norm": 0.5046871900558472,
      "learning_rate": 9.246231155778895e-05,
      "loss": 1.3817,
      "step": 16
    },
    {
      "epoch": 0.19373219373219372,
      "grad_norm": 0.4931313693523407,
      "learning_rate": 9.195979899497488e-05,
      "loss": 1.4015,
      "step": 17
    },
    {
      "epoch": 0.20512820512820512,
      "grad_norm": 0.5237986445426941,
      "learning_rate": 9.14572864321608e-05,
      "loss": 1.3251,
      "step": 18
    },
    {
      "epoch": 0.21652421652421652,
      "grad_norm": 0.48392462730407715,
      "learning_rate": 9.095477386934675e-05,
      "loss": 1.2821,
      "step": 19
    },
    {
      "epoch": 0.22792022792022792,
      "grad_norm": 0.5208500623703003,
      "learning_rate": 9.045226130653267e-05,
      "loss": 1.3226,
      "step": 20
    },
    {
      "epoch": 0.23931623931623933,
      "grad_norm": 0.5178021192550659,
      "learning_rate": 8.99497487437186e-05,
      "loss": 1.2268,
      "step": 21
    },
    {
      "epoch": 0.25071225071225073,
      "grad_norm": 0.5599659085273743,
      "learning_rate": 8.944723618090453e-05,
      "loss": 1.3742,
      "step": 22
    },
    {
      "epoch": 0.2621082621082621,
      "grad_norm": 0.5551819801330566,
      "learning_rate": 8.894472361809045e-05,
      "loss": 1.3156,
      "step": 23
    },
    {
      "epoch": 0.27350427350427353,
      "grad_norm": 0.7024655342102051,
      "learning_rate": 8.84422110552764e-05,
      "loss": 1.2187,
      "step": 24
    },
    {
      "epoch": 0.2849002849002849,
      "grad_norm": 0.6340409517288208,
      "learning_rate": 8.793969849246232e-05,
      "loss": 1.3423,
      "step": 25
    },
    {
      "epoch": 0.2962962962962963,
      "grad_norm": 0.5692094564437866,
      "learning_rate": 8.743718592964825e-05,
      "loss": 1.2315,
      "step": 26
    },
    {
      "epoch": 0.3076923076923077,
      "grad_norm": 0.5804877877235413,
      "learning_rate": 8.693467336683418e-05,
      "loss": 1.2316,
      "step": 27
    },
    {
      "epoch": 0.3190883190883191,
      "grad_norm": 0.7234011292457581,
      "learning_rate": 8.64321608040201e-05,
      "loss": 1.2583,
      "step": 28
    },
    {
      "epoch": 0.33048433048433046,
      "grad_norm": 0.8010082244873047,
      "learning_rate": 8.592964824120603e-05,
      "loss": 1.3704,
      "step": 29
    },
    {
      "epoch": 0.3418803418803419,
      "grad_norm": 0.6479464173316956,
      "learning_rate": 8.542713567839196e-05,
      "loss": 1.3294,
      "step": 30
    },
    {
      "epoch": 0.35327635327635326,
      "grad_norm": 0.778668224811554,
      "learning_rate": 8.49246231155779e-05,
      "loss": 1.2566,
      "step": 31
    },
    {
      "epoch": 0.3646723646723647,
      "grad_norm": 0.8151825070381165,
      "learning_rate": 8.442211055276383e-05,
      "loss": 1.2372,
      "step": 32
    },
    {
      "epoch": 0.37606837606837606,
      "grad_norm": 0.777619481086731,
      "learning_rate": 8.391959798994975e-05,
      "loss": 1.2749,
      "step": 33
    },
    {
      "epoch": 0.38746438746438744,
      "grad_norm": 0.8822659850120544,
      "learning_rate": 8.341708542713568e-05,
      "loss": 1.2229,
      "step": 34
    },
    {
      "epoch": 0.39886039886039887,
      "grad_norm": 0.870242178440094,
      "learning_rate": 8.291457286432161e-05,
      "loss": 1.1641,
      "step": 35
    },
    {
      "epoch": 0.41025641025641024,
      "grad_norm": 0.8463490605354309,
      "learning_rate": 8.241206030150754e-05,
      "loss": 1.2443,
      "step": 36
    },
    {
      "epoch": 0.42165242165242167,
      "grad_norm": 0.8682456612586975,
      "learning_rate": 8.190954773869348e-05,
      "loss": 1.2818,
      "step": 37
    },
    {
      "epoch": 0.43304843304843305,
      "grad_norm": 0.6559503674507141,
      "learning_rate": 8.14070351758794e-05,
      "loss": 1.1601,
      "step": 38
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 0.603629469871521,
      "learning_rate": 8.090452261306533e-05,
      "loss": 1.2256,
      "step": 39
    },
    {
      "epoch": 0.45584045584045585,
      "grad_norm": 0.5692597031593323,
      "learning_rate": 8.040201005025126e-05,
      "loss": 1.2869,
      "step": 40
    },
    {
      "epoch": 0.4672364672364672,
      "grad_norm": 0.5320606231689453,
      "learning_rate": 7.989949748743719e-05,
      "loss": 1.179,
      "step": 41
    },
    {
      "epoch": 0.47863247863247865,
      "grad_norm": 0.5642545819282532,
      "learning_rate": 7.939698492462313e-05,
      "loss": 1.1968,
      "step": 42
    },
    {
      "epoch": 0.49002849002849,
      "grad_norm": 0.4435971975326538,
      "learning_rate": 7.889447236180904e-05,
      "loss": 1.2094,
      "step": 43
    },
    {
      "epoch": 0.5014245014245015,
      "grad_norm": 0.4562317430973053,
      "learning_rate": 7.839195979899498e-05,
      "loss": 1.1734,
      "step": 44
    },
    {
      "epoch": 0.5128205128205128,
      "grad_norm": 0.5164006948471069,
      "learning_rate": 7.788944723618091e-05,
      "loss": 1.2177,
      "step": 45
    },
    {
      "epoch": 0.5242165242165242,
      "grad_norm": 0.5158300399780273,
      "learning_rate": 7.738693467336684e-05,
      "loss": 1.2289,
      "step": 46
    },
    {
      "epoch": 0.5356125356125356,
      "grad_norm": 0.49650484323501587,
      "learning_rate": 7.688442211055277e-05,
      "loss": 1.3316,
      "step": 47
    },
    {
      "epoch": 0.5470085470085471,
      "grad_norm": 0.4756147563457489,
      "learning_rate": 7.638190954773869e-05,
      "loss": 1.333,
      "step": 48
    },
    {
      "epoch": 0.5584045584045584,
      "grad_norm": 0.4514218866825104,
      "learning_rate": 7.587939698492463e-05,
      "loss": 1.2733,
      "step": 49
    },
    {
      "epoch": 0.5698005698005698,
      "grad_norm": 0.445081889629364,
      "learning_rate": 7.537688442211056e-05,
      "loss": 1.2024,
      "step": 50
    },
    {
      "epoch": 0.5811965811965812,
      "grad_norm": 0.45231911540031433,
      "learning_rate": 7.487437185929649e-05,
      "loss": 1.267,
      "step": 51
    },
    {
      "epoch": 0.5925925925925926,
      "grad_norm": 0.5107349753379822,
      "learning_rate": 7.437185929648241e-05,
      "loss": 1.2068,
      "step": 52
    },
    {
      "epoch": 0.603988603988604,
      "grad_norm": 0.4770635664463043,
      "learning_rate": 7.386934673366834e-05,
      "loss": 1.2372,
      "step": 53
    },
    {
      "epoch": 0.6153846153846154,
      "grad_norm": 0.4619287848472595,
      "learning_rate": 7.336683417085427e-05,
      "loss": 1.2132,
      "step": 54
    },
    {
      "epoch": 0.6267806267806267,
      "grad_norm": 0.5125857591629028,
      "learning_rate": 7.28643216080402e-05,
      "loss": 1.2367,
      "step": 55
    },
    {
      "epoch": 0.6381766381766382,
      "grad_norm": 0.456436425447464,
      "learning_rate": 7.236180904522614e-05,
      "loss": 1.1989,
      "step": 56
    },
    {
      "epoch": 0.6495726495726496,
      "grad_norm": 0.4466511011123657,
      "learning_rate": 7.185929648241206e-05,
      "loss": 1.2789,
      "step": 57
    },
    {
      "epoch": 0.6609686609686609,
      "grad_norm": 0.45993903279304504,
      "learning_rate": 7.135678391959799e-05,
      "loss": 1.2228,
      "step": 58
    },
    {
      "epoch": 0.6723646723646723,
      "grad_norm": 0.4762590825557709,
      "learning_rate": 7.085427135678392e-05,
      "loss": 1.1418,
      "step": 59
    },
    {
      "epoch": 0.6837606837606838,
      "grad_norm": 0.4990002512931824,
      "learning_rate": 7.035175879396985e-05,
      "loss": 1.1788,
      "step": 60
    },
    {
      "epoch": 0.6951566951566952,
      "grad_norm": 0.4810471534729004,
      "learning_rate": 6.984924623115579e-05,
      "loss": 1.1878,
      "step": 61
    },
    {
      "epoch": 0.7065527065527065,
      "grad_norm": 0.4777512550354004,
      "learning_rate": 6.93467336683417e-05,
      "loss": 1.2621,
      "step": 62
    },
    {
      "epoch": 0.717948717948718,
      "grad_norm": 0.5083452463150024,
      "learning_rate": 6.884422110552764e-05,
      "loss": 1.1642,
      "step": 63
    },
    {
      "epoch": 0.7293447293447294,
      "grad_norm": 0.4965672194957733,
      "learning_rate": 6.834170854271357e-05,
      "loss": 1.1356,
      "step": 64
    },
    {
      "epoch": 0.7407407407407407,
      "grad_norm": 0.48566991090774536,
      "learning_rate": 6.78391959798995e-05,
      "loss": 1.0954,
      "step": 65
    },
    {
      "epoch": 0.7521367521367521,
      "grad_norm": 0.46452316641807556,
      "learning_rate": 6.733668341708544e-05,
      "loss": 1.1905,
      "step": 66
    },
    {
      "epoch": 0.7635327635327636,
      "grad_norm": 0.51093989610672,
      "learning_rate": 6.683417085427135e-05,
      "loss": 1.2285,
      "step": 67
    },
    {
      "epoch": 0.7749287749287749,
      "grad_norm": 0.5919416546821594,
      "learning_rate": 6.633165829145729e-05,
      "loss": 1.2368,
      "step": 68
    },
    {
      "epoch": 0.7863247863247863,
      "grad_norm": 0.46627846360206604,
      "learning_rate": 6.582914572864322e-05,
      "loss": 1.1286,
      "step": 69
    },
    {
      "epoch": 0.7977207977207977,
      "grad_norm": 0.4974450170993805,
      "learning_rate": 6.532663316582915e-05,
      "loss": 1.2874,
      "step": 70
    },
    {
      "epoch": 0.8091168091168092,
      "grad_norm": 0.5373516082763672,
      "learning_rate": 6.482412060301508e-05,
      "loss": 1.2573,
      "step": 71
    },
    {
      "epoch": 0.8205128205128205,
      "grad_norm": 0.4522000849246979,
      "learning_rate": 6.4321608040201e-05,
      "loss": 1.137,
      "step": 72
    },
    {
      "epoch": 0.8319088319088319,
      "grad_norm": 0.4694693088531494,
      "learning_rate": 6.381909547738694e-05,
      "loss": 1.1971,
      "step": 73
    },
    {
      "epoch": 0.8433048433048433,
      "grad_norm": 0.4889134466648102,
      "learning_rate": 6.331658291457287e-05,
      "loss": 1.1435,
      "step": 74
    },
    {
      "epoch": 0.8547008547008547,
      "grad_norm": 0.512048602104187,
      "learning_rate": 6.28140703517588e-05,
      "loss": 1.2086,
      "step": 75
    },
    {
      "epoch": 0.8660968660968661,
      "grad_norm": 0.49475014209747314,
      "learning_rate": 6.231155778894473e-05,
      "loss": 1.2496,
      "step": 76
    },
    {
      "epoch": 0.8774928774928775,
      "grad_norm": 0.4658482074737549,
      "learning_rate": 6.180904522613065e-05,
      "loss": 1.0942,
      "step": 77
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 0.5323877334594727,
      "learning_rate": 6.130653266331658e-05,
      "loss": 1.1853,
      "step": 78
    },
    {
      "epoch": 0.9002849002849003,
      "grad_norm": 0.6251657605171204,
      "learning_rate": 6.080402010050251e-05,
      "loss": 1.2209,
      "step": 79
    },
    {
      "epoch": 0.9116809116809117,
      "grad_norm": 0.4768678545951843,
      "learning_rate": 6.030150753768844e-05,
      "loss": 1.164,
      "step": 80
    },
    {
      "epoch": 0.9230769230769231,
      "grad_norm": 0.5291458964347839,
      "learning_rate": 5.979899497487438e-05,
      "loss": 1.2754,
      "step": 81
    },
    {
      "epoch": 0.9344729344729344,
      "grad_norm": 0.5092456340789795,
      "learning_rate": 5.929648241206031e-05,
      "loss": 1.248,
      "step": 82
    },
    {
      "epoch": 0.9458689458689459,
      "grad_norm": 0.4834723174571991,
      "learning_rate": 5.879396984924623e-05,
      "loss": 1.2681,
      "step": 83
    },
    {
      "epoch": 0.9572649572649573,
      "grad_norm": 0.5478146076202393,
      "learning_rate": 5.829145728643216e-05,
      "loss": 1.2045,
      "step": 84
    },
    {
      "epoch": 0.9686609686609686,
      "grad_norm": 0.4918864071369171,
      "learning_rate": 5.778894472361809e-05,
      "loss": 1.1216,
      "step": 85
    },
    {
      "epoch": 0.98005698005698,
      "grad_norm": 0.4812568128108978,
      "learning_rate": 5.728643216080403e-05,
      "loss": 1.1732,
      "step": 86
    },
    {
      "epoch": 0.9914529914529915,
      "grad_norm": 0.5308701992034912,
      "learning_rate": 5.6783919597989955e-05,
      "loss": 1.2643,
      "step": 87
    },
    {
      "epoch": 1.002849002849003,
      "grad_norm": 0.7254398465156555,
      "learning_rate": 5.628140703517588e-05,
      "loss": 1.4278,
      "step": 88
    },
    {
      "epoch": 1.0142450142450143,
      "grad_norm": 0.5298280715942383,
      "learning_rate": 5.577889447236181e-05,
      "loss": 1.2691,
      "step": 89
    },
    {
      "epoch": 1.0256410256410255,
      "grad_norm": 0.4411105811595917,
      "learning_rate": 5.527638190954774e-05,
      "loss": 1.1033,
      "step": 90
    },
    {
      "epoch": 1.037037037037037,
      "grad_norm": 0.5160195231437683,
      "learning_rate": 5.477386934673368e-05,
      "loss": 1.2041,
      "step": 91
    },
    {
      "epoch": 1.0484330484330484,
      "grad_norm": 0.4904952943325043,
      "learning_rate": 5.4271356783919604e-05,
      "loss": 1.1634,
      "step": 92
    },
    {
      "epoch": 1.0598290598290598,
      "grad_norm": 0.4598182439804077,
      "learning_rate": 5.376884422110553e-05,
      "loss": 1.1095,
      "step": 93
    },
    {
      "epoch": 1.0712250712250713,
      "grad_norm": 0.4579429626464844,
      "learning_rate": 5.3266331658291455e-05,
      "loss": 1.0614,
      "step": 94
    },
    {
      "epoch": 1.0826210826210827,
      "grad_norm": 0.49531427025794983,
      "learning_rate": 5.276381909547739e-05,
      "loss": 1.2415,
      "step": 95
    },
    {
      "epoch": 1.0940170940170941,
      "grad_norm": 0.4900212287902832,
      "learning_rate": 5.226130653266332e-05,
      "loss": 1.08,
      "step": 96
    },
    {
      "epoch": 1.1054131054131053,
      "grad_norm": 0.5321851372718811,
      "learning_rate": 5.175879396984925e-05,
      "loss": 1.1464,
      "step": 97
    },
    {
      "epoch": 1.1168091168091168,
      "grad_norm": 0.4976498782634735,
      "learning_rate": 5.125628140703518e-05,
      "loss": 1.1962,
      "step": 98
    },
    {
      "epoch": 1.1282051282051282,
      "grad_norm": 0.48046809434890747,
      "learning_rate": 5.0753768844221104e-05,
      "loss": 1.0534,
      "step": 99
    },
    {
      "epoch": 1.1396011396011396,
      "grad_norm": 0.5641231536865234,
      "learning_rate": 5.0251256281407036e-05,
      "loss": 1.2342,
      "step": 100
    },
    {
      "epoch": 1.150997150997151,
      "grad_norm": 0.5218859314918518,
      "learning_rate": 4.974874371859297e-05,
      "loss": 1.1125,
      "step": 101
    },
    {
      "epoch": 1.1623931623931625,
      "grad_norm": 0.672106146812439,
      "learning_rate": 4.92462311557789e-05,
      "loss": 1.2577,
      "step": 102
    },
    {
      "epoch": 1.173789173789174,
      "grad_norm": 0.5047236680984497,
      "learning_rate": 4.874371859296483e-05,
      "loss": 1.0518,
      "step": 103
    },
    {
      "epoch": 1.1851851851851851,
      "grad_norm": 0.5189758539199829,
      "learning_rate": 4.824120603015075e-05,
      "loss": 1.1672,
      "step": 104
    },
    {
      "epoch": 1.1965811965811965,
      "grad_norm": 0.5212465524673462,
      "learning_rate": 4.7738693467336685e-05,
      "loss": 1.0869,
      "step": 105
    },
    {
      "epoch": 1.207977207977208,
      "grad_norm": 0.5158497095108032,
      "learning_rate": 4.723618090452262e-05,
      "loss": 1.0674,
      "step": 106
    },
    {
      "epoch": 1.2193732193732194,
      "grad_norm": 0.6164978742599487,
      "learning_rate": 4.673366834170855e-05,
      "loss": 1.2204,
      "step": 107
    },
    {
      "epoch": 1.2307692307692308,
      "grad_norm": 0.5276105403900146,
      "learning_rate": 4.6231155778894475e-05,
      "loss": 1.0782,
      "step": 108
    },
    {
      "epoch": 1.242165242165242,
      "grad_norm": 0.5990796685218811,
      "learning_rate": 4.57286432160804e-05,
      "loss": 1.1748,
      "step": 109
    },
    {
      "epoch": 1.2535612535612537,
      "grad_norm": 0.5942894816398621,
      "learning_rate": 4.522613065326633e-05,
      "loss": 1.1417,
      "step": 110
    },
    {
      "epoch": 1.264957264957265,
      "grad_norm": 0.5517327189445496,
      "learning_rate": 4.4723618090452266e-05,
      "loss": 1.0214,
      "step": 111
    },
    {
      "epoch": 1.2763532763532763,
      "grad_norm": 0.599429190158844,
      "learning_rate": 4.42211055276382e-05,
      "loss": 1.2503,
      "step": 112
    },
    {
      "epoch": 1.2877492877492878,
      "grad_norm": 0.5922709107398987,
      "learning_rate": 4.3718592964824124e-05,
      "loss": 1.1294,
      "step": 113
    },
    {
      "epoch": 1.2991452991452992,
      "grad_norm": 0.569146990776062,
      "learning_rate": 4.321608040201005e-05,
      "loss": 1.1523,
      "step": 114
    },
    {
      "epoch": 1.3105413105413106,
      "grad_norm": 0.5592817664146423,
      "learning_rate": 4.271356783919598e-05,
      "loss": 1.1484,
      "step": 115
    },
    {
      "epoch": 1.3219373219373218,
      "grad_norm": 0.5917912125587463,
      "learning_rate": 4.2211055276381914e-05,
      "loss": 1.0667,
      "step": 116
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 0.6044990420341492,
      "learning_rate": 4.170854271356784e-05,
      "loss": 1.1646,
      "step": 117
    },
    {
      "epoch": 1.3447293447293447,
      "grad_norm": 0.5149083137512207,
      "learning_rate": 4.120603015075377e-05,
      "loss": 1.0567,
      "step": 118
    },
    {
      "epoch": 1.3561253561253561,
      "grad_norm": 0.5668403506278992,
      "learning_rate": 4.07035175879397e-05,
      "loss": 1.2072,
      "step": 119
    },
    {
      "epoch": 1.3675213675213675,
      "grad_norm": 0.5478379726409912,
      "learning_rate": 4.020100502512563e-05,
      "loss": 1.1353,
      "step": 120
    },
    {
      "epoch": 1.378917378917379,
      "grad_norm": 0.5570518374443054,
      "learning_rate": 3.969849246231156e-05,
      "loss": 1.0336,
      "step": 121
    },
    {
      "epoch": 1.3903133903133904,
      "grad_norm": 0.5794707536697388,
      "learning_rate": 3.919597989949749e-05,
      "loss": 1.1696,
      "step": 122
    },
    {
      "epoch": 1.4017094017094016,
      "grad_norm": 0.567593514919281,
      "learning_rate": 3.869346733668342e-05,
      "loss": 1.0635,
      "step": 123
    },
    {
      "epoch": 1.413105413105413,
      "grad_norm": 0.5602433085441589,
      "learning_rate": 3.8190954773869346e-05,
      "loss": 1.1443,
      "step": 124
    },
    {
      "epoch": 1.4245014245014245,
      "grad_norm": 0.578921377658844,
      "learning_rate": 3.768844221105528e-05,
      "loss": 1.1843,
      "step": 125
    },
    {
      "epoch": 1.435897435897436,
      "grad_norm": 0.5648573040962219,
      "learning_rate": 3.7185929648241204e-05,
      "loss": 1.1076,
      "step": 126
    },
    {
      "epoch": 1.4472934472934473,
      "grad_norm": 0.5635711550712585,
      "learning_rate": 3.668341708542714e-05,
      "loss": 1.0589,
      "step": 127
    },
    {
      "epoch": 1.4586894586894588,
      "grad_norm": 0.6067689061164856,
      "learning_rate": 3.618090452261307e-05,
      "loss": 1.1941,
      "step": 128
    },
    {
      "epoch": 1.4700854700854702,
      "grad_norm": 0.5677550435066223,
      "learning_rate": 3.5678391959798995e-05,
      "loss": 1.0384,
      "step": 129
    },
    {
      "epoch": 1.4814814814814814,
      "grad_norm": 0.599470853805542,
      "learning_rate": 3.517587939698493e-05,
      "loss": 1.2259,
      "step": 130
    },
    {
      "epoch": 1.4928774928774928,
      "grad_norm": 0.6475313305854797,
      "learning_rate": 3.467336683417085e-05,
      "loss": 1.1618,
      "step": 131
    },
    {
      "epoch": 1.5042735042735043,
      "grad_norm": 0.6463358402252197,
      "learning_rate": 3.4170854271356785e-05,
      "loss": 1.1712,
      "step": 132
    },
    {
      "epoch": 1.5156695156695157,
      "grad_norm": 0.5662721395492554,
      "learning_rate": 3.366834170854272e-05,
      "loss": 1.0731,
      "step": 133
    },
    {
      "epoch": 1.5270655270655271,
      "grad_norm": 0.5981451869010925,
      "learning_rate": 3.3165829145728643e-05,
      "loss": 1.0298,
      "step": 134
    },
    {
      "epoch": 1.5384615384615383,
      "grad_norm": 0.5991083979606628,
      "learning_rate": 3.2663316582914576e-05,
      "loss": 1.2169,
      "step": 135
    },
    {
      "epoch": 1.54985754985755,
      "grad_norm": 0.6036386489868164,
      "learning_rate": 3.21608040201005e-05,
      "loss": 1.0498,
      "step": 136
    },
    {
      "epoch": 1.5612535612535612,
      "grad_norm": 0.5927392840385437,
      "learning_rate": 3.1658291457286434e-05,
      "loss": 1.0389,
      "step": 137
    },
    {
      "epoch": 1.5726495726495726,
      "grad_norm": 0.5733420848846436,
      "learning_rate": 3.1155778894472366e-05,
      "loss": 1.0608,
      "step": 138
    },
    {
      "epoch": 1.584045584045584,
      "grad_norm": 0.6083365678787231,
      "learning_rate": 3.065326633165829e-05,
      "loss": 1.1203,
      "step": 139
    },
    {
      "epoch": 1.5954415954415955,
      "grad_norm": 0.6153535842895508,
      "learning_rate": 3.015075376884422e-05,
      "loss": 1.1729,
      "step": 140
    },
    {
      "epoch": 1.606837606837607,
      "grad_norm": 0.6425400376319885,
      "learning_rate": 2.9648241206030153e-05,
      "loss": 1.0155,
      "step": 141
    },
    {
      "epoch": 1.618233618233618,
      "grad_norm": 0.6353899240493774,
      "learning_rate": 2.914572864321608e-05,
      "loss": 1.245,
      "step": 142
    },
    {
      "epoch": 1.6296296296296298,
      "grad_norm": 0.6624664664268494,
      "learning_rate": 2.8643216080402015e-05,
      "loss": 0.9795,
      "step": 143
    },
    {
      "epoch": 1.641025641025641,
      "grad_norm": 0.6005571484565735,
      "learning_rate": 2.814070351758794e-05,
      "loss": 1.1581,
      "step": 144
    },
    {
      "epoch": 1.6524216524216524,
      "grad_norm": 0.6583634614944458,
      "learning_rate": 2.763819095477387e-05,
      "loss": 1.0538,
      "step": 145
    },
    {
      "epoch": 1.6638176638176638,
      "grad_norm": 0.5750309228897095,
      "learning_rate": 2.7135678391959802e-05,
      "loss": 0.9153,
      "step": 146
    },
    {
      "epoch": 1.6752136752136753,
      "grad_norm": 0.6019430160522461,
      "learning_rate": 2.6633165829145728e-05,
      "loss": 1.159,
      "step": 147
    },
    {
      "epoch": 1.6866096866096867,
      "grad_norm": 0.6395899057388306,
      "learning_rate": 2.613065326633166e-05,
      "loss": 1.1699,
      "step": 148
    },
    {
      "epoch": 1.698005698005698,
      "grad_norm": 0.635377824306488,
      "learning_rate": 2.562814070351759e-05,
      "loss": 1.0286,
      "step": 149
    },
    {
      "epoch": 1.7094017094017095,
      "grad_norm": 0.6403070688247681,
      "learning_rate": 2.5125628140703518e-05,
      "loss": 1.1748,
      "step": 150
    },
    {
      "epoch": 1.7207977207977208,
      "grad_norm": 0.6614859104156494,
      "learning_rate": 2.462311557788945e-05,
      "loss": 1.0518,
      "step": 151
    },
    {
      "epoch": 1.7321937321937322,
      "grad_norm": 0.7036588788032532,
      "learning_rate": 2.4120603015075376e-05,
      "loss": 1.2191,
      "step": 152
    },
    {
      "epoch": 1.7435897435897436,
      "grad_norm": 0.6717102527618408,
      "learning_rate": 2.361809045226131e-05,
      "loss": 1.1376,
      "step": 153
    },
    {
      "epoch": 1.7549857549857548,
      "grad_norm": 0.6435654759407043,
      "learning_rate": 2.3115577889447238e-05,
      "loss": 1.1601,
      "step": 154
    },
    {
      "epoch": 1.7663817663817665,
      "grad_norm": 0.6151710748672485,
      "learning_rate": 2.2613065326633167e-05,
      "loss": 1.0929,
      "step": 155
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 0.6438819169998169,
      "learning_rate": 2.21105527638191e-05,
      "loss": 1.111,
      "step": 156
    },
    {
      "epoch": 1.7891737891737893,
      "grad_norm": 0.6459853649139404,
      "learning_rate": 2.1608040201005025e-05,
      "loss": 1.2303,
      "step": 157
    },
    {
      "epoch": 1.8005698005698005,
      "grad_norm": 0.6052287220954895,
      "learning_rate": 2.1105527638190957e-05,
      "loss": 1.1341,
      "step": 158
    },
    {
      "epoch": 1.811965811965812,
      "grad_norm": 0.6797654032707214,
      "learning_rate": 2.0603015075376886e-05,
      "loss": 1.1724,
      "step": 159
    },
    {
      "epoch": 1.8233618233618234,
      "grad_norm": 0.6900933980941772,
      "learning_rate": 2.0100502512562815e-05,
      "loss": 1.0881,
      "step": 160
    },
    {
      "epoch": 1.8347578347578346,
      "grad_norm": 0.6317200064659119,
      "learning_rate": 1.9597989949748744e-05,
      "loss": 1.1256,
      "step": 161
    },
    {
      "epoch": 1.8461538461538463,
      "grad_norm": 0.6044368743896484,
      "learning_rate": 1.9095477386934673e-05,
      "loss": 1.0595,
      "step": 162
    },
    {
      "epoch": 1.8575498575498575,
      "grad_norm": 0.6719862818717957,
      "learning_rate": 1.8592964824120602e-05,
      "loss": 0.9983,
      "step": 163
    },
    {
      "epoch": 1.868945868945869,
      "grad_norm": 0.6419474482536316,
      "learning_rate": 1.8090452261306535e-05,
      "loss": 1.0876,
      "step": 164
    },
    {
      "epoch": 1.8803418803418803,
      "grad_norm": 0.6861122250556946,
      "learning_rate": 1.7587939698492464e-05,
      "loss": 1.0904,
      "step": 165
    },
    {
      "epoch": 1.8917378917378918,
      "grad_norm": 0.6277052760124207,
      "learning_rate": 1.7085427135678393e-05,
      "loss": 1.1036,
      "step": 166
    },
    {
      "epoch": 1.9031339031339032,
      "grad_norm": 0.7358347177505493,
      "learning_rate": 1.6582914572864322e-05,
      "loss": 1.0499,
      "step": 167
    },
    {
      "epoch": 1.9145299145299144,
      "grad_norm": 0.6961327195167542,
      "learning_rate": 1.608040201005025e-05,
      "loss": 1.097,
      "step": 168
    },
    {
      "epoch": 1.925925925925926,
      "grad_norm": 0.6499162316322327,
      "learning_rate": 1.5577889447236183e-05,
      "loss": 1.0566,
      "step": 169
    },
    {
      "epoch": 1.9373219373219372,
      "grad_norm": 0.6426655650138855,
      "learning_rate": 1.507537688442211e-05,
      "loss": 1.1127,
      "step": 170
    },
    {
      "epoch": 1.9487179487179487,
      "grad_norm": 0.6038071513175964,
      "learning_rate": 1.457286432160804e-05,
      "loss": 1.0345,
      "step": 171
    },
    {
      "epoch": 1.96011396011396,
      "grad_norm": 0.6887624263763428,
      "learning_rate": 1.407035175879397e-05,
      "loss": 1.2775,
      "step": 172
    },
    {
      "epoch": 1.9715099715099715,
      "grad_norm": 0.6664908528327942,
      "learning_rate": 1.3567839195979901e-05,
      "loss": 1.099,
      "step": 173
    },
    {
      "epoch": 1.982905982905983,
      "grad_norm": 0.6395720839500427,
      "learning_rate": 1.306532663316583e-05,
      "loss": 1.0795,
      "step": 174
    },
    {
      "epoch": 1.9943019943019942,
      "grad_norm": 0.8258576989173889,
      "learning_rate": 1.2562814070351759e-05,
      "loss": 1.4081,
      "step": 175
    },
    {
      "epoch": 2.005698005698006,
      "grad_norm": 0.7664657831192017,
      "learning_rate": 1.2060301507537688e-05,
      "loss": 1.2999,
      "step": 176
    },
    {
      "epoch": 2.017094017094017,
      "grad_norm": 0.6419854164123535,
      "learning_rate": 1.1557788944723619e-05,
      "loss": 1.0661,
      "step": 177
    },
    {
      "epoch": 2.0284900284900287,
      "grad_norm": 0.6126803755760193,
      "learning_rate": 1.105527638190955e-05,
      "loss": 1.0989,
      "step": 178
    },
    {
      "epoch": 2.03988603988604,
      "grad_norm": 0.5971490740776062,
      "learning_rate": 1.0552763819095479e-05,
      "loss": 0.9485,
      "step": 179
    },
    {
      "epoch": 2.051282051282051,
      "grad_norm": 0.6343738436698914,
      "learning_rate": 1.0050251256281408e-05,
      "loss": 1.1226,
      "step": 180
    },
    {
      "epoch": 2.0626780626780628,
      "grad_norm": 0.6217495203018188,
      "learning_rate": 9.547738693467337e-06,
      "loss": 1.0628,
      "step": 181
    },
    {
      "epoch": 2.074074074074074,
      "grad_norm": 0.5966653823852539,
      "learning_rate": 9.045226130653267e-06,
      "loss": 1.0092,
      "step": 182
    },
    {
      "epoch": 2.0854700854700856,
      "grad_norm": 0.6323566436767578,
      "learning_rate": 8.542713567839196e-06,
      "loss": 1.0992,
      "step": 183
    },
    {
      "epoch": 2.096866096866097,
      "grad_norm": 0.6440880298614502,
      "learning_rate": 8.040201005025125e-06,
      "loss": 1.0804,
      "step": 184
    },
    {
      "epoch": 2.1082621082621085,
      "grad_norm": 0.5995816588401794,
      "learning_rate": 7.537688442211055e-06,
      "loss": 1.1067,
      "step": 185
    },
    {
      "epoch": 2.1196581196581197,
      "grad_norm": 0.6618144512176514,
      "learning_rate": 7.035175879396985e-06,
      "loss": 1.0689,
      "step": 186
    },
    {
      "epoch": 2.131054131054131,
      "grad_norm": 0.6532097458839417,
      "learning_rate": 6.532663316582915e-06,
      "loss": 1.1028,
      "step": 187
    },
    {
      "epoch": 2.1424501424501425,
      "grad_norm": 0.5830849409103394,
      "learning_rate": 6.030150753768844e-06,
      "loss": 0.9996,
      "step": 188
    },
    {
      "epoch": 2.1538461538461537,
      "grad_norm": 0.6783652901649475,
      "learning_rate": 5.527638190954775e-06,
      "loss": 1.2311,
      "step": 189
    },
    {
      "epoch": 2.1652421652421654,
      "grad_norm": 0.6712796688079834,
      "learning_rate": 5.025125628140704e-06,
      "loss": 0.9609,
      "step": 190
    },
    {
      "epoch": 2.1766381766381766,
      "grad_norm": 0.6146546006202698,
      "learning_rate": 4.522613065326634e-06,
      "loss": 1.1135,
      "step": 191
    },
    {
      "epoch": 2.1880341880341883,
      "grad_norm": 0.6589621901512146,
      "learning_rate": 4.020100502512563e-06,
      "loss": 1.0524,
      "step": 192
    },
    {
      "epoch": 2.1994301994301995,
      "grad_norm": 0.648345947265625,
      "learning_rate": 3.5175879396984926e-06,
      "loss": 1.1136,
      "step": 193
    },
    {
      "epoch": 2.2108262108262107,
      "grad_norm": 0.6554787158966064,
      "learning_rate": 3.015075376884422e-06,
      "loss": 1.0393,
      "step": 194
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 0.60575270652771,
      "learning_rate": 2.512562814070352e-06,
      "loss": 1.0688,
      "step": 195
    },
    {
      "epoch": 2.2336182336182335,
      "grad_norm": 0.609583854675293,
      "learning_rate": 2.0100502512562813e-06,
      "loss": 1.047,
      "step": 196
    },
    {
      "epoch": 2.245014245014245,
      "grad_norm": 0.6172504425048828,
      "learning_rate": 1.507537688442211e-06,
      "loss": 1.0835,
      "step": 197
    },
    {
      "epoch": 2.2564102564102564,
      "grad_norm": 0.6250450015068054,
      "learning_rate": 1.0050251256281407e-06,
      "loss": 1.0684,
      "step": 198
    },
    {
      "epoch": 2.267806267806268,
      "grad_norm": 0.5692541599273682,
      "learning_rate": 5.025125628140703e-07,
      "loss": 0.9156,
      "step": 199
    },
    {
      "epoch": 2.2792022792022792,
      "grad_norm": 0.6259585022926331,
      "learning_rate": 0.0,
      "loss": 1.2106,
      "step": 200
    }
  ],
  "logging_steps": 1,
  "max_steps": 200,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.469981127242547e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}