{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.2792022792022792,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011396011396011397,
"grad_norm": 1.0688412189483643,
"learning_rate": 0.0001,
"loss": 2.5442,
"step": 1
},
{
"epoch": 0.022792022792022793,
"grad_norm": 0.9676798582077026,
"learning_rate": 9.949748743718594e-05,
"loss": 2.4169,
"step": 2
},
{
"epoch": 0.03418803418803419,
"grad_norm": 0.9322265386581421,
"learning_rate": 9.899497487437186e-05,
"loss": 2.4585,
"step": 3
},
{
"epoch": 0.045584045584045586,
"grad_norm": 1.0203752517700195,
"learning_rate": 9.84924623115578e-05,
"loss": 2.2605,
"step": 4
},
{
"epoch": 0.05698005698005698,
"grad_norm": 1.0615314245224,
"learning_rate": 9.798994974874372e-05,
"loss": 2.2029,
"step": 5
},
{
"epoch": 0.06837606837606838,
"grad_norm": 1.1191452741622925,
"learning_rate": 9.748743718592965e-05,
"loss": 1.977,
"step": 6
},
{
"epoch": 0.07977207977207977,
"grad_norm": 1.0572948455810547,
"learning_rate": 9.698492462311559e-05,
"loss": 1.8041,
"step": 7
},
{
"epoch": 0.09116809116809117,
"grad_norm": 1.0000556707382202,
"learning_rate": 9.64824120603015e-05,
"loss": 1.8135,
"step": 8
},
{
"epoch": 0.10256410256410256,
"grad_norm": 0.815621554851532,
"learning_rate": 9.597989949748745e-05,
"loss": 1.6268,
"step": 9
},
{
"epoch": 0.11396011396011396,
"grad_norm": 0.8220420479774475,
"learning_rate": 9.547738693467337e-05,
"loss": 1.556,
"step": 10
},
{
"epoch": 0.12535612535612536,
"grad_norm": 0.8398631811141968,
"learning_rate": 9.49748743718593e-05,
"loss": 1.4275,
"step": 11
},
{
"epoch": 0.13675213675213677,
"grad_norm": 0.7354830503463745,
"learning_rate": 9.447236180904523e-05,
"loss": 1.4687,
"step": 12
},
{
"epoch": 0.14814814814814814,
"grad_norm": 0.4654861092567444,
"learning_rate": 9.396984924623115e-05,
"loss": 1.3937,
"step": 13
},
{
"epoch": 0.15954415954415954,
"grad_norm": 0.5842018127441406,
"learning_rate": 9.34673366834171e-05,
"loss": 1.4105,
"step": 14
},
{
"epoch": 0.17094017094017094,
"grad_norm": 0.6819984912872314,
"learning_rate": 9.296482412060302e-05,
"loss": 1.3469,
"step": 15
},
{
"epoch": 0.18233618233618235,
"grad_norm": 0.5046871900558472,
"learning_rate": 9.246231155778895e-05,
"loss": 1.3817,
"step": 16
},
{
"epoch": 0.19373219373219372,
"grad_norm": 0.4931313693523407,
"learning_rate": 9.195979899497488e-05,
"loss": 1.4015,
"step": 17
},
{
"epoch": 0.20512820512820512,
"grad_norm": 0.5237986445426941,
"learning_rate": 9.14572864321608e-05,
"loss": 1.3251,
"step": 18
},
{
"epoch": 0.21652421652421652,
"grad_norm": 0.48392462730407715,
"learning_rate": 9.095477386934675e-05,
"loss": 1.2821,
"step": 19
},
{
"epoch": 0.22792022792022792,
"grad_norm": 0.5208500623703003,
"learning_rate": 9.045226130653267e-05,
"loss": 1.3226,
"step": 20
},
{
"epoch": 0.23931623931623933,
"grad_norm": 0.5178021192550659,
"learning_rate": 8.99497487437186e-05,
"loss": 1.2268,
"step": 21
},
{
"epoch": 0.25071225071225073,
"grad_norm": 0.5599659085273743,
"learning_rate": 8.944723618090453e-05,
"loss": 1.3742,
"step": 22
},
{
"epoch": 0.2621082621082621,
"grad_norm": 0.5551819801330566,
"learning_rate": 8.894472361809045e-05,
"loss": 1.3156,
"step": 23
},
{
"epoch": 0.27350427350427353,
"grad_norm": 0.7024655342102051,
"learning_rate": 8.84422110552764e-05,
"loss": 1.2187,
"step": 24
},
{
"epoch": 0.2849002849002849,
"grad_norm": 0.6340409517288208,
"learning_rate": 8.793969849246232e-05,
"loss": 1.3423,
"step": 25
},
{
"epoch": 0.2962962962962963,
"grad_norm": 0.5692094564437866,
"learning_rate": 8.743718592964825e-05,
"loss": 1.2315,
"step": 26
},
{
"epoch": 0.3076923076923077,
"grad_norm": 0.5804877877235413,
"learning_rate": 8.693467336683418e-05,
"loss": 1.2316,
"step": 27
},
{
"epoch": 0.3190883190883191,
"grad_norm": 0.7234011292457581,
"learning_rate": 8.64321608040201e-05,
"loss": 1.2583,
"step": 28
},
{
"epoch": 0.33048433048433046,
"grad_norm": 0.8010082244873047,
"learning_rate": 8.592964824120603e-05,
"loss": 1.3704,
"step": 29
},
{
"epoch": 0.3418803418803419,
"grad_norm": 0.6479464173316956,
"learning_rate": 8.542713567839196e-05,
"loss": 1.3294,
"step": 30
},
{
"epoch": 0.35327635327635326,
"grad_norm": 0.778668224811554,
"learning_rate": 8.49246231155779e-05,
"loss": 1.2566,
"step": 31
},
{
"epoch": 0.3646723646723647,
"grad_norm": 0.8151825070381165,
"learning_rate": 8.442211055276383e-05,
"loss": 1.2372,
"step": 32
},
{
"epoch": 0.37606837606837606,
"grad_norm": 0.777619481086731,
"learning_rate": 8.391959798994975e-05,
"loss": 1.2749,
"step": 33
},
{
"epoch": 0.38746438746438744,
"grad_norm": 0.8822659850120544,
"learning_rate": 8.341708542713568e-05,
"loss": 1.2229,
"step": 34
},
{
"epoch": 0.39886039886039887,
"grad_norm": 0.870242178440094,
"learning_rate": 8.291457286432161e-05,
"loss": 1.1641,
"step": 35
},
{
"epoch": 0.41025641025641024,
"grad_norm": 0.8463490605354309,
"learning_rate": 8.241206030150754e-05,
"loss": 1.2443,
"step": 36
},
{
"epoch": 0.42165242165242167,
"grad_norm": 0.8682456612586975,
"learning_rate": 8.190954773869348e-05,
"loss": 1.2818,
"step": 37
},
{
"epoch": 0.43304843304843305,
"grad_norm": 0.6559503674507141,
"learning_rate": 8.14070351758794e-05,
"loss": 1.1601,
"step": 38
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.603629469871521,
"learning_rate": 8.090452261306533e-05,
"loss": 1.2256,
"step": 39
},
{
"epoch": 0.45584045584045585,
"grad_norm": 0.5692597031593323,
"learning_rate": 8.040201005025126e-05,
"loss": 1.2869,
"step": 40
},
{
"epoch": 0.4672364672364672,
"grad_norm": 0.5320606231689453,
"learning_rate": 7.989949748743719e-05,
"loss": 1.179,
"step": 41
},
{
"epoch": 0.47863247863247865,
"grad_norm": 0.5642545819282532,
"learning_rate": 7.939698492462313e-05,
"loss": 1.1968,
"step": 42
},
{
"epoch": 0.49002849002849,
"grad_norm": 0.4435971975326538,
"learning_rate": 7.889447236180904e-05,
"loss": 1.2094,
"step": 43
},
{
"epoch": 0.5014245014245015,
"grad_norm": 0.4562317430973053,
"learning_rate": 7.839195979899498e-05,
"loss": 1.1734,
"step": 44
},
{
"epoch": 0.5128205128205128,
"grad_norm": 0.5164006948471069,
"learning_rate": 7.788944723618091e-05,
"loss": 1.2177,
"step": 45
},
{
"epoch": 0.5242165242165242,
"grad_norm": 0.5158300399780273,
"learning_rate": 7.738693467336684e-05,
"loss": 1.2289,
"step": 46
},
{
"epoch": 0.5356125356125356,
"grad_norm": 0.49650484323501587,
"learning_rate": 7.688442211055277e-05,
"loss": 1.3316,
"step": 47
},
{
"epoch": 0.5470085470085471,
"grad_norm": 0.4756147563457489,
"learning_rate": 7.638190954773869e-05,
"loss": 1.333,
"step": 48
},
{
"epoch": 0.5584045584045584,
"grad_norm": 0.4514218866825104,
"learning_rate": 7.587939698492463e-05,
"loss": 1.2733,
"step": 49
},
{
"epoch": 0.5698005698005698,
"grad_norm": 0.445081889629364,
"learning_rate": 7.537688442211056e-05,
"loss": 1.2024,
"step": 50
},
{
"epoch": 0.5811965811965812,
"grad_norm": 0.45231911540031433,
"learning_rate": 7.487437185929649e-05,
"loss": 1.267,
"step": 51
},
{
"epoch": 0.5925925925925926,
"grad_norm": 0.5107349753379822,
"learning_rate": 7.437185929648241e-05,
"loss": 1.2068,
"step": 52
},
{
"epoch": 0.603988603988604,
"grad_norm": 0.4770635664463043,
"learning_rate": 7.386934673366834e-05,
"loss": 1.2372,
"step": 53
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.4619287848472595,
"learning_rate": 7.336683417085427e-05,
"loss": 1.2132,
"step": 54
},
{
"epoch": 0.6267806267806267,
"grad_norm": 0.5125857591629028,
"learning_rate": 7.28643216080402e-05,
"loss": 1.2367,
"step": 55
},
{
"epoch": 0.6381766381766382,
"grad_norm": 0.456436425447464,
"learning_rate": 7.236180904522614e-05,
"loss": 1.1989,
"step": 56
},
{
"epoch": 0.6495726495726496,
"grad_norm": 0.4466511011123657,
"learning_rate": 7.185929648241206e-05,
"loss": 1.2789,
"step": 57
},
{
"epoch": 0.6609686609686609,
"grad_norm": 0.45993903279304504,
"learning_rate": 7.135678391959799e-05,
"loss": 1.2228,
"step": 58
},
{
"epoch": 0.6723646723646723,
"grad_norm": 0.4762590825557709,
"learning_rate": 7.085427135678392e-05,
"loss": 1.1418,
"step": 59
},
{
"epoch": 0.6837606837606838,
"grad_norm": 0.4990002512931824,
"learning_rate": 7.035175879396985e-05,
"loss": 1.1788,
"step": 60
},
{
"epoch": 0.6951566951566952,
"grad_norm": 0.4810471534729004,
"learning_rate": 6.984924623115579e-05,
"loss": 1.1878,
"step": 61
},
{
"epoch": 0.7065527065527065,
"grad_norm": 0.4777512550354004,
"learning_rate": 6.93467336683417e-05,
"loss": 1.2621,
"step": 62
},
{
"epoch": 0.717948717948718,
"grad_norm": 0.5083452463150024,
"learning_rate": 6.884422110552764e-05,
"loss": 1.1642,
"step": 63
},
{
"epoch": 0.7293447293447294,
"grad_norm": 0.4965672194957733,
"learning_rate": 6.834170854271357e-05,
"loss": 1.1356,
"step": 64
},
{
"epoch": 0.7407407407407407,
"grad_norm": 0.48566991090774536,
"learning_rate": 6.78391959798995e-05,
"loss": 1.0954,
"step": 65
},
{
"epoch": 0.7521367521367521,
"grad_norm": 0.46452316641807556,
"learning_rate": 6.733668341708544e-05,
"loss": 1.1905,
"step": 66
},
{
"epoch": 0.7635327635327636,
"grad_norm": 0.51093989610672,
"learning_rate": 6.683417085427135e-05,
"loss": 1.2285,
"step": 67
},
{
"epoch": 0.7749287749287749,
"grad_norm": 0.5919416546821594,
"learning_rate": 6.633165829145729e-05,
"loss": 1.2368,
"step": 68
},
{
"epoch": 0.7863247863247863,
"grad_norm": 0.46627846360206604,
"learning_rate": 6.582914572864322e-05,
"loss": 1.1286,
"step": 69
},
{
"epoch": 0.7977207977207977,
"grad_norm": 0.4974450170993805,
"learning_rate": 6.532663316582915e-05,
"loss": 1.2874,
"step": 70
},
{
"epoch": 0.8091168091168092,
"grad_norm": 0.5373516082763672,
"learning_rate": 6.482412060301508e-05,
"loss": 1.2573,
"step": 71
},
{
"epoch": 0.8205128205128205,
"grad_norm": 0.4522000849246979,
"learning_rate": 6.4321608040201e-05,
"loss": 1.137,
"step": 72
},
{
"epoch": 0.8319088319088319,
"grad_norm": 0.4694693088531494,
"learning_rate": 6.381909547738694e-05,
"loss": 1.1971,
"step": 73
},
{
"epoch": 0.8433048433048433,
"grad_norm": 0.4889134466648102,
"learning_rate": 6.331658291457287e-05,
"loss": 1.1435,
"step": 74
},
{
"epoch": 0.8547008547008547,
"grad_norm": 0.512048602104187,
"learning_rate": 6.28140703517588e-05,
"loss": 1.2086,
"step": 75
},
{
"epoch": 0.8660968660968661,
"grad_norm": 0.49475014209747314,
"learning_rate": 6.231155778894473e-05,
"loss": 1.2496,
"step": 76
},
{
"epoch": 0.8774928774928775,
"grad_norm": 0.4658482074737549,
"learning_rate": 6.180904522613065e-05,
"loss": 1.0942,
"step": 77
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.5323877334594727,
"learning_rate": 6.130653266331658e-05,
"loss": 1.1853,
"step": 78
},
{
"epoch": 0.9002849002849003,
"grad_norm": 0.6251657605171204,
"learning_rate": 6.080402010050251e-05,
"loss": 1.2209,
"step": 79
},
{
"epoch": 0.9116809116809117,
"grad_norm": 0.4768678545951843,
"learning_rate": 6.030150753768844e-05,
"loss": 1.164,
"step": 80
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.5291458964347839,
"learning_rate": 5.979899497487438e-05,
"loss": 1.2754,
"step": 81
},
{
"epoch": 0.9344729344729344,
"grad_norm": 0.5092456340789795,
"learning_rate": 5.929648241206031e-05,
"loss": 1.248,
"step": 82
},
{
"epoch": 0.9458689458689459,
"grad_norm": 0.4834723174571991,
"learning_rate": 5.879396984924623e-05,
"loss": 1.2681,
"step": 83
},
{
"epoch": 0.9572649572649573,
"grad_norm": 0.5478146076202393,
"learning_rate": 5.829145728643216e-05,
"loss": 1.2045,
"step": 84
},
{
"epoch": 0.9686609686609686,
"grad_norm": 0.4918864071369171,
"learning_rate": 5.778894472361809e-05,
"loss": 1.1216,
"step": 85
},
{
"epoch": 0.98005698005698,
"grad_norm": 0.4812568128108978,
"learning_rate": 5.728643216080403e-05,
"loss": 1.1732,
"step": 86
},
{
"epoch": 0.9914529914529915,
"grad_norm": 0.5308701992034912,
"learning_rate": 5.6783919597989955e-05,
"loss": 1.2643,
"step": 87
},
{
"epoch": 1.002849002849003,
"grad_norm": 0.7254398465156555,
"learning_rate": 5.628140703517588e-05,
"loss": 1.4278,
"step": 88
},
{
"epoch": 1.0142450142450143,
"grad_norm": 0.5298280715942383,
"learning_rate": 5.577889447236181e-05,
"loss": 1.2691,
"step": 89
},
{
"epoch": 1.0256410256410255,
"grad_norm": 0.4411105811595917,
"learning_rate": 5.527638190954774e-05,
"loss": 1.1033,
"step": 90
},
{
"epoch": 1.037037037037037,
"grad_norm": 0.5160195231437683,
"learning_rate": 5.477386934673368e-05,
"loss": 1.2041,
"step": 91
},
{
"epoch": 1.0484330484330484,
"grad_norm": 0.4904952943325043,
"learning_rate": 5.4271356783919604e-05,
"loss": 1.1634,
"step": 92
},
{
"epoch": 1.0598290598290598,
"grad_norm": 0.4598182439804077,
"learning_rate": 5.376884422110553e-05,
"loss": 1.1095,
"step": 93
},
{
"epoch": 1.0712250712250713,
"grad_norm": 0.4579429626464844,
"learning_rate": 5.3266331658291455e-05,
"loss": 1.0614,
"step": 94
},
{
"epoch": 1.0826210826210827,
"grad_norm": 0.49531427025794983,
"learning_rate": 5.276381909547739e-05,
"loss": 1.2415,
"step": 95
},
{
"epoch": 1.0940170940170941,
"grad_norm": 0.4900212287902832,
"learning_rate": 5.226130653266332e-05,
"loss": 1.08,
"step": 96
},
{
"epoch": 1.1054131054131053,
"grad_norm": 0.5321851372718811,
"learning_rate": 5.175879396984925e-05,
"loss": 1.1464,
"step": 97
},
{
"epoch": 1.1168091168091168,
"grad_norm": 0.4976498782634735,
"learning_rate": 5.125628140703518e-05,
"loss": 1.1962,
"step": 98
},
{
"epoch": 1.1282051282051282,
"grad_norm": 0.48046809434890747,
"learning_rate": 5.0753768844221104e-05,
"loss": 1.0534,
"step": 99
},
{
"epoch": 1.1396011396011396,
"grad_norm": 0.5641231536865234,
"learning_rate": 5.0251256281407036e-05,
"loss": 1.2342,
"step": 100
},
{
"epoch": 1.150997150997151,
"grad_norm": 0.5218859314918518,
"learning_rate": 4.974874371859297e-05,
"loss": 1.1125,
"step": 101
},
{
"epoch": 1.1623931623931625,
"grad_norm": 0.672106146812439,
"learning_rate": 4.92462311557789e-05,
"loss": 1.2577,
"step": 102
},
{
"epoch": 1.173789173789174,
"grad_norm": 0.5047236680984497,
"learning_rate": 4.874371859296483e-05,
"loss": 1.0518,
"step": 103
},
{
"epoch": 1.1851851851851851,
"grad_norm": 0.5189758539199829,
"learning_rate": 4.824120603015075e-05,
"loss": 1.1672,
"step": 104
},
{
"epoch": 1.1965811965811965,
"grad_norm": 0.5212465524673462,
"learning_rate": 4.7738693467336685e-05,
"loss": 1.0869,
"step": 105
},
{
"epoch": 1.207977207977208,
"grad_norm": 0.5158497095108032,
"learning_rate": 4.723618090452262e-05,
"loss": 1.0674,
"step": 106
},
{
"epoch": 1.2193732193732194,
"grad_norm": 0.6164978742599487,
"learning_rate": 4.673366834170855e-05,
"loss": 1.2204,
"step": 107
},
{
"epoch": 1.2307692307692308,
"grad_norm": 0.5276105403900146,
"learning_rate": 4.6231155778894475e-05,
"loss": 1.0782,
"step": 108
},
{
"epoch": 1.242165242165242,
"grad_norm": 0.5990796685218811,
"learning_rate": 4.57286432160804e-05,
"loss": 1.1748,
"step": 109
},
{
"epoch": 1.2535612535612537,
"grad_norm": 0.5942894816398621,
"learning_rate": 4.522613065326633e-05,
"loss": 1.1417,
"step": 110
},
{
"epoch": 1.264957264957265,
"grad_norm": 0.5517327189445496,
"learning_rate": 4.4723618090452266e-05,
"loss": 1.0214,
"step": 111
},
{
"epoch": 1.2763532763532763,
"grad_norm": 0.599429190158844,
"learning_rate": 4.42211055276382e-05,
"loss": 1.2503,
"step": 112
},
{
"epoch": 1.2877492877492878,
"grad_norm": 0.5922709107398987,
"learning_rate": 4.3718592964824124e-05,
"loss": 1.1294,
"step": 113
},
{
"epoch": 1.2991452991452992,
"grad_norm": 0.569146990776062,
"learning_rate": 4.321608040201005e-05,
"loss": 1.1523,
"step": 114
},
{
"epoch": 1.3105413105413106,
"grad_norm": 0.5592817664146423,
"learning_rate": 4.271356783919598e-05,
"loss": 1.1484,
"step": 115
},
{
"epoch": 1.3219373219373218,
"grad_norm": 0.5917912125587463,
"learning_rate": 4.2211055276381914e-05,
"loss": 1.0667,
"step": 116
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.6044990420341492,
"learning_rate": 4.170854271356784e-05,
"loss": 1.1646,
"step": 117
},
{
"epoch": 1.3447293447293447,
"grad_norm": 0.5149083137512207,
"learning_rate": 4.120603015075377e-05,
"loss": 1.0567,
"step": 118
},
{
"epoch": 1.3561253561253561,
"grad_norm": 0.5668403506278992,
"learning_rate": 4.07035175879397e-05,
"loss": 1.2072,
"step": 119
},
{
"epoch": 1.3675213675213675,
"grad_norm": 0.5478379726409912,
"learning_rate": 4.020100502512563e-05,
"loss": 1.1353,
"step": 120
},
{
"epoch": 1.378917378917379,
"grad_norm": 0.5570518374443054,
"learning_rate": 3.969849246231156e-05,
"loss": 1.0336,
"step": 121
},
{
"epoch": 1.3903133903133904,
"grad_norm": 0.5794707536697388,
"learning_rate": 3.919597989949749e-05,
"loss": 1.1696,
"step": 122
},
{
"epoch": 1.4017094017094016,
"grad_norm": 0.567593514919281,
"learning_rate": 3.869346733668342e-05,
"loss": 1.0635,
"step": 123
},
{
"epoch": 1.413105413105413,
"grad_norm": 0.5602433085441589,
"learning_rate": 3.8190954773869346e-05,
"loss": 1.1443,
"step": 124
},
{
"epoch": 1.4245014245014245,
"grad_norm": 0.578921377658844,
"learning_rate": 3.768844221105528e-05,
"loss": 1.1843,
"step": 125
},
{
"epoch": 1.435897435897436,
"grad_norm": 0.5648573040962219,
"learning_rate": 3.7185929648241204e-05,
"loss": 1.1076,
"step": 126
},
{
"epoch": 1.4472934472934473,
"grad_norm": 0.5635711550712585,
"learning_rate": 3.668341708542714e-05,
"loss": 1.0589,
"step": 127
},
{
"epoch": 1.4586894586894588,
"grad_norm": 0.6067689061164856,
"learning_rate": 3.618090452261307e-05,
"loss": 1.1941,
"step": 128
},
{
"epoch": 1.4700854700854702,
"grad_norm": 0.5677550435066223,
"learning_rate": 3.5678391959798995e-05,
"loss": 1.0384,
"step": 129
},
{
"epoch": 1.4814814814814814,
"grad_norm": 0.599470853805542,
"learning_rate": 3.517587939698493e-05,
"loss": 1.2259,
"step": 130
},
{
"epoch": 1.4928774928774928,
"grad_norm": 0.6475313305854797,
"learning_rate": 3.467336683417085e-05,
"loss": 1.1618,
"step": 131
},
{
"epoch": 1.5042735042735043,
"grad_norm": 0.6463358402252197,
"learning_rate": 3.4170854271356785e-05,
"loss": 1.1712,
"step": 132
},
{
"epoch": 1.5156695156695157,
"grad_norm": 0.5662721395492554,
"learning_rate": 3.366834170854272e-05,
"loss": 1.0731,
"step": 133
},
{
"epoch": 1.5270655270655271,
"grad_norm": 0.5981451869010925,
"learning_rate": 3.3165829145728643e-05,
"loss": 1.0298,
"step": 134
},
{
"epoch": 1.5384615384615383,
"grad_norm": 0.5991083979606628,
"learning_rate": 3.2663316582914576e-05,
"loss": 1.2169,
"step": 135
},
{
"epoch": 1.54985754985755,
"grad_norm": 0.6036386489868164,
"learning_rate": 3.21608040201005e-05,
"loss": 1.0498,
"step": 136
},
{
"epoch": 1.5612535612535612,
"grad_norm": 0.5927392840385437,
"learning_rate": 3.1658291457286434e-05,
"loss": 1.0389,
"step": 137
},
{
"epoch": 1.5726495726495726,
"grad_norm": 0.5733420848846436,
"learning_rate": 3.1155778894472366e-05,
"loss": 1.0608,
"step": 138
},
{
"epoch": 1.584045584045584,
"grad_norm": 0.6083365678787231,
"learning_rate": 3.065326633165829e-05,
"loss": 1.1203,
"step": 139
},
{
"epoch": 1.5954415954415955,
"grad_norm": 0.6153535842895508,
"learning_rate": 3.015075376884422e-05,
"loss": 1.1729,
"step": 140
},
{
"epoch": 1.606837606837607,
"grad_norm": 0.6425400376319885,
"learning_rate": 2.9648241206030153e-05,
"loss": 1.0155,
"step": 141
},
{
"epoch": 1.618233618233618,
"grad_norm": 0.6353899240493774,
"learning_rate": 2.914572864321608e-05,
"loss": 1.245,
"step": 142
},
{
"epoch": 1.6296296296296298,
"grad_norm": 0.6624664664268494,
"learning_rate": 2.8643216080402015e-05,
"loss": 0.9795,
"step": 143
},
{
"epoch": 1.641025641025641,
"grad_norm": 0.6005571484565735,
"learning_rate": 2.814070351758794e-05,
"loss": 1.1581,
"step": 144
},
{
"epoch": 1.6524216524216524,
"grad_norm": 0.6583634614944458,
"learning_rate": 2.763819095477387e-05,
"loss": 1.0538,
"step": 145
},
{
"epoch": 1.6638176638176638,
"grad_norm": 0.5750309228897095,
"learning_rate": 2.7135678391959802e-05,
"loss": 0.9153,
"step": 146
},
{
"epoch": 1.6752136752136753,
"grad_norm": 0.6019430160522461,
"learning_rate": 2.6633165829145728e-05,
"loss": 1.159,
"step": 147
},
{
"epoch": 1.6866096866096867,
"grad_norm": 0.6395899057388306,
"learning_rate": 2.613065326633166e-05,
"loss": 1.1699,
"step": 148
},
{
"epoch": 1.698005698005698,
"grad_norm": 0.635377824306488,
"learning_rate": 2.562814070351759e-05,
"loss": 1.0286,
"step": 149
},
{
"epoch": 1.7094017094017095,
"grad_norm": 0.6403070688247681,
"learning_rate": 2.5125628140703518e-05,
"loss": 1.1748,
"step": 150
},
{
"epoch": 1.7207977207977208,
"grad_norm": 0.6614859104156494,
"learning_rate": 2.462311557788945e-05,
"loss": 1.0518,
"step": 151
},
{
"epoch": 1.7321937321937322,
"grad_norm": 0.7036588788032532,
"learning_rate": 2.4120603015075376e-05,
"loss": 1.2191,
"step": 152
},
{
"epoch": 1.7435897435897436,
"grad_norm": 0.6717102527618408,
"learning_rate": 2.361809045226131e-05,
"loss": 1.1376,
"step": 153
},
{
"epoch": 1.7549857549857548,
"grad_norm": 0.6435654759407043,
"learning_rate": 2.3115577889447238e-05,
"loss": 1.1601,
"step": 154
},
{
"epoch": 1.7663817663817665,
"grad_norm": 0.6151710748672485,
"learning_rate": 2.2613065326633167e-05,
"loss": 1.0929,
"step": 155
},
{
"epoch": 1.7777777777777777,
"grad_norm": 0.6438819169998169,
"learning_rate": 2.21105527638191e-05,
"loss": 1.111,
"step": 156
},
{
"epoch": 1.7891737891737893,
"grad_norm": 0.6459853649139404,
"learning_rate": 2.1608040201005025e-05,
"loss": 1.2303,
"step": 157
},
{
"epoch": 1.8005698005698005,
"grad_norm": 0.6052287220954895,
"learning_rate": 2.1105527638190957e-05,
"loss": 1.1341,
"step": 158
},
{
"epoch": 1.811965811965812,
"grad_norm": 0.6797654032707214,
"learning_rate": 2.0603015075376886e-05,
"loss": 1.1724,
"step": 159
},
{
"epoch": 1.8233618233618234,
"grad_norm": 0.6900933980941772,
"learning_rate": 2.0100502512562815e-05,
"loss": 1.0881,
"step": 160
},
{
"epoch": 1.8347578347578346,
"grad_norm": 0.6317200064659119,
"learning_rate": 1.9597989949748744e-05,
"loss": 1.1256,
"step": 161
},
{
"epoch": 1.8461538461538463,
"grad_norm": 0.6044368743896484,
"learning_rate": 1.9095477386934673e-05,
"loss": 1.0595,
"step": 162
},
{
"epoch": 1.8575498575498575,
"grad_norm": 0.6719862818717957,
"learning_rate": 1.8592964824120602e-05,
"loss": 0.9983,
"step": 163
},
{
"epoch": 1.868945868945869,
"grad_norm": 0.6419474482536316,
"learning_rate": 1.8090452261306535e-05,
"loss": 1.0876,
"step": 164
},
{
"epoch": 1.8803418803418803,
"grad_norm": 0.6861122250556946,
"learning_rate": 1.7587939698492464e-05,
"loss": 1.0904,
"step": 165
},
{
"epoch": 1.8917378917378918,
"grad_norm": 0.6277052760124207,
"learning_rate": 1.7085427135678393e-05,
"loss": 1.1036,
"step": 166
},
{
"epoch": 1.9031339031339032,
"grad_norm": 0.7358347177505493,
"learning_rate": 1.6582914572864322e-05,
"loss": 1.0499,
"step": 167
},
{
"epoch": 1.9145299145299144,
"grad_norm": 0.6961327195167542,
"learning_rate": 1.608040201005025e-05,
"loss": 1.097,
"step": 168
},
{
"epoch": 1.925925925925926,
"grad_norm": 0.6499162316322327,
"learning_rate": 1.5577889447236183e-05,
"loss": 1.0566,
"step": 169
},
{
"epoch": 1.9373219373219372,
"grad_norm": 0.6426655650138855,
"learning_rate": 1.507537688442211e-05,
"loss": 1.1127,
"step": 170
},
{
"epoch": 1.9487179487179487,
"grad_norm": 0.6038071513175964,
"learning_rate": 1.457286432160804e-05,
"loss": 1.0345,
"step": 171
},
{
"epoch": 1.96011396011396,
"grad_norm": 0.6887624263763428,
"learning_rate": 1.407035175879397e-05,
"loss": 1.2775,
"step": 172
},
{
"epoch": 1.9715099715099715,
"grad_norm": 0.6664908528327942,
"learning_rate": 1.3567839195979901e-05,
"loss": 1.099,
"step": 173
},
{
"epoch": 1.982905982905983,
"grad_norm": 0.6395720839500427,
"learning_rate": 1.306532663316583e-05,
"loss": 1.0795,
"step": 174
},
{
"epoch": 1.9943019943019942,
"grad_norm": 0.8258576989173889,
"learning_rate": 1.2562814070351759e-05,
"loss": 1.4081,
"step": 175
},
{
"epoch": 2.005698005698006,
"grad_norm": 0.7664657831192017,
"learning_rate": 1.2060301507537688e-05,
"loss": 1.2999,
"step": 176
},
{
"epoch": 2.017094017094017,
"grad_norm": 0.6419854164123535,
"learning_rate": 1.1557788944723619e-05,
"loss": 1.0661,
"step": 177
},
{
"epoch": 2.0284900284900287,
"grad_norm": 0.6126803755760193,
"learning_rate": 1.105527638190955e-05,
"loss": 1.0989,
"step": 178
},
{
"epoch": 2.03988603988604,
"grad_norm": 0.5971490740776062,
"learning_rate": 1.0552763819095479e-05,
"loss": 0.9485,
"step": 179
},
{
"epoch": 2.051282051282051,
"grad_norm": 0.6343738436698914,
"learning_rate": 1.0050251256281408e-05,
"loss": 1.1226,
"step": 180
},
{
"epoch": 2.0626780626780628,
"grad_norm": 0.6217495203018188,
"learning_rate": 9.547738693467337e-06,
"loss": 1.0628,
"step": 181
},
{
"epoch": 2.074074074074074,
"grad_norm": 0.5966653823852539,
"learning_rate": 9.045226130653267e-06,
"loss": 1.0092,
"step": 182
},
{
"epoch": 2.0854700854700856,
"grad_norm": 0.6323566436767578,
"learning_rate": 8.542713567839196e-06,
"loss": 1.0992,
"step": 183
},
{
"epoch": 2.096866096866097,
"grad_norm": 0.6440880298614502,
"learning_rate": 8.040201005025125e-06,
"loss": 1.0804,
"step": 184
},
{
"epoch": 2.1082621082621085,
"grad_norm": 0.5995816588401794,
"learning_rate": 7.537688442211055e-06,
"loss": 1.1067,
"step": 185
},
{
"epoch": 2.1196581196581197,
"grad_norm": 0.6618144512176514,
"learning_rate": 7.035175879396985e-06,
"loss": 1.0689,
"step": 186
},
{
"epoch": 2.131054131054131,
"grad_norm": 0.6532097458839417,
"learning_rate": 6.532663316582915e-06,
"loss": 1.1028,
"step": 187
},
{
"epoch": 2.1424501424501425,
"grad_norm": 0.5830849409103394,
"learning_rate": 6.030150753768844e-06,
"loss": 0.9996,
"step": 188
},
{
"epoch": 2.1538461538461537,
"grad_norm": 0.6783652901649475,
"learning_rate": 5.527638190954775e-06,
"loss": 1.2311,
"step": 189
},
{
"epoch": 2.1652421652421654,
"grad_norm": 0.6712796688079834,
"learning_rate": 5.025125628140704e-06,
"loss": 0.9609,
"step": 190
},
{
"epoch": 2.1766381766381766,
"grad_norm": 0.6146546006202698,
"learning_rate": 4.522613065326634e-06,
"loss": 1.1135,
"step": 191
},
{
"epoch": 2.1880341880341883,
"grad_norm": 0.6589621901512146,
"learning_rate": 4.020100502512563e-06,
"loss": 1.0524,
"step": 192
},
{
"epoch": 2.1994301994301995,
"grad_norm": 0.648345947265625,
"learning_rate": 3.5175879396984926e-06,
"loss": 1.1136,
"step": 193
},
{
"epoch": 2.2108262108262107,
"grad_norm": 0.6554787158966064,
"learning_rate": 3.015075376884422e-06,
"loss": 1.0393,
"step": 194
},
{
"epoch": 2.2222222222222223,
"grad_norm": 0.60575270652771,
"learning_rate": 2.512562814070352e-06,
"loss": 1.0688,
"step": 195
},
{
"epoch": 2.2336182336182335,
"grad_norm": 0.609583854675293,
"learning_rate": 2.0100502512562813e-06,
"loss": 1.047,
"step": 196
},
{
"epoch": 2.245014245014245,
"grad_norm": 0.6172504425048828,
"learning_rate": 1.507537688442211e-06,
"loss": 1.0835,
"step": 197
},
{
"epoch": 2.2564102564102564,
"grad_norm": 0.6250450015068054,
"learning_rate": 1.0050251256281407e-06,
"loss": 1.0684,
"step": 198
},
{
"epoch": 2.267806267806268,
"grad_norm": 0.5692541599273682,
"learning_rate": 5.025125628140703e-07,
"loss": 0.9156,
"step": 199
},
{
"epoch": 2.2792022792022792,
"grad_norm": 0.6259585022926331,
"learning_rate": 0.0,
"loss": 1.2106,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.469981127242547e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}