{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.96969696969697,
"eval_steps": 500,
"global_step": 615,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04040404040404041,
"grad_norm": 80.8431859261654,
"learning_rate": 4.0650406504065046e-07,
"loss": 2.64,
"step": 5
},
{
"epoch": 0.08080808080808081,
"grad_norm": 58.95328571668908,
"learning_rate": 8.130081300813009e-07,
"loss": 2.5295,
"step": 10
},
{
"epoch": 0.12121212121212122,
"grad_norm": 30.259656389849923,
"learning_rate": 1.2195121951219514e-06,
"loss": 2.3595,
"step": 15
},
{
"epoch": 0.16161616161616163,
"grad_norm": 19.031626660884115,
"learning_rate": 1.6260162601626018e-06,
"loss": 2.2046,
"step": 20
},
{
"epoch": 0.20202020202020202,
"grad_norm": 14.285496207217582,
"learning_rate": 2.0325203252032523e-06,
"loss": 2.0114,
"step": 25
},
{
"epoch": 0.24242424242424243,
"grad_norm": 11.705575031693748,
"learning_rate": 2.4390243902439027e-06,
"loss": 1.9542,
"step": 30
},
{
"epoch": 0.2828282828282828,
"grad_norm": 10.031255451818282,
"learning_rate": 2.845528455284553e-06,
"loss": 1.9027,
"step": 35
},
{
"epoch": 0.32323232323232326,
"grad_norm": 9.87064503889936,
"learning_rate": 3.2520325203252037e-06,
"loss": 1.8376,
"step": 40
},
{
"epoch": 0.36363636363636365,
"grad_norm": 9.463342602070677,
"learning_rate": 3.6585365853658537e-06,
"loss": 1.8335,
"step": 45
},
{
"epoch": 0.40404040404040403,
"grad_norm": 8.14740265971335,
"learning_rate": 4.0650406504065046e-06,
"loss": 1.7531,
"step": 50
},
{
"epoch": 0.4444444444444444,
"grad_norm": 8.579660709279775,
"learning_rate": 4.471544715447155e-06,
"loss": 1.746,
"step": 55
},
{
"epoch": 0.48484848484848486,
"grad_norm": 7.764594425498635,
"learning_rate": 4.8780487804878055e-06,
"loss": 1.7795,
"step": 60
},
{
"epoch": 0.5252525252525253,
"grad_norm": 7.818967776047545,
"learning_rate": 5.2845528455284555e-06,
"loss": 1.7845,
"step": 65
},
{
"epoch": 0.5656565656565656,
"grad_norm": 8.571992272728625,
"learning_rate": 5.691056910569106e-06,
"loss": 1.7176,
"step": 70
},
{
"epoch": 0.6060606060606061,
"grad_norm": 7.157751637340014,
"learning_rate": 6.0975609756097564e-06,
"loss": 1.6894,
"step": 75
},
{
"epoch": 0.6464646464646465,
"grad_norm": 7.776730945646382,
"learning_rate": 6.504065040650407e-06,
"loss": 1.6733,
"step": 80
},
{
"epoch": 0.6868686868686869,
"grad_norm": 7.538790813241334,
"learning_rate": 6.910569105691057e-06,
"loss": 1.6661,
"step": 85
},
{
"epoch": 0.7272727272727273,
"grad_norm": 7.237611731257021,
"learning_rate": 7.317073170731707e-06,
"loss": 1.6413,
"step": 90
},
{
"epoch": 0.7676767676767676,
"grad_norm": 8.713375291094371,
"learning_rate": 7.723577235772358e-06,
"loss": 1.6712,
"step": 95
},
{
"epoch": 0.8080808080808081,
"grad_norm": 8.148761177726017,
"learning_rate": 8.130081300813009e-06,
"loss": 1.6491,
"step": 100
},
{
"epoch": 0.8484848484848485,
"grad_norm": 7.670084788858041,
"learning_rate": 8.536585365853658e-06,
"loss": 1.6661,
"step": 105
},
{
"epoch": 0.8888888888888888,
"grad_norm": 8.193767782896925,
"learning_rate": 8.94308943089431e-06,
"loss": 1.6061,
"step": 110
},
{
"epoch": 0.9292929292929293,
"grad_norm": 7.327023272280465,
"learning_rate": 9.34959349593496e-06,
"loss": 1.6177,
"step": 115
},
{
"epoch": 0.9696969696969697,
"grad_norm": 7.6240834480577995,
"learning_rate": 9.756097560975611e-06,
"loss": 1.7007,
"step": 120
},
{
"epoch": 1.0101010101010102,
"grad_norm": 7.022808705113093,
"learning_rate": 9.959349593495936e-06,
"loss": 1.5861,
"step": 125
},
{
"epoch": 1.0505050505050506,
"grad_norm": 7.831382750074179,
"learning_rate": 9.857723577235772e-06,
"loss": 1.397,
"step": 130
},
{
"epoch": 1.0909090909090908,
"grad_norm": 7.482634887493619,
"learning_rate": 9.756097560975611e-06,
"loss": 1.3207,
"step": 135
},
{
"epoch": 1.1313131313131313,
"grad_norm": 7.77405709671628,
"learning_rate": 9.654471544715448e-06,
"loss": 1.4016,
"step": 140
},
{
"epoch": 1.1717171717171717,
"grad_norm": 8.316039462095251,
"learning_rate": 9.552845528455286e-06,
"loss": 1.4044,
"step": 145
},
{
"epoch": 1.2121212121212122,
"grad_norm": 8.220962058193507,
"learning_rate": 9.451219512195122e-06,
"loss": 1.4525,
"step": 150
},
{
"epoch": 1.2525252525252526,
"grad_norm": 8.628523413754008,
"learning_rate": 9.34959349593496e-06,
"loss": 1.3725,
"step": 155
},
{
"epoch": 1.2929292929292928,
"grad_norm": 9.049067713299571,
"learning_rate": 9.247967479674797e-06,
"loss": 1.3988,
"step": 160
},
{
"epoch": 1.3333333333333333,
"grad_norm": 8.79216448432911,
"learning_rate": 9.146341463414635e-06,
"loss": 1.3923,
"step": 165
},
{
"epoch": 1.3737373737373737,
"grad_norm": 7.749843189602475,
"learning_rate": 9.044715447154472e-06,
"loss": 1.3965,
"step": 170
},
{
"epoch": 1.4141414141414141,
"grad_norm": 8.10742211898071,
"learning_rate": 8.94308943089431e-06,
"loss": 1.3586,
"step": 175
},
{
"epoch": 1.4545454545454546,
"grad_norm": 8.61374938206157,
"learning_rate": 8.841463414634148e-06,
"loss": 1.3455,
"step": 180
},
{
"epoch": 1.494949494949495,
"grad_norm": 7.806546434692009,
"learning_rate": 8.739837398373985e-06,
"loss": 1.3164,
"step": 185
},
{
"epoch": 1.5353535353535355,
"grad_norm": 8.055069468513308,
"learning_rate": 8.638211382113821e-06,
"loss": 1.3719,
"step": 190
},
{
"epoch": 1.5757575757575757,
"grad_norm": 7.286077733666312,
"learning_rate": 8.536585365853658e-06,
"loss": 1.331,
"step": 195
},
{
"epoch": 1.6161616161616161,
"grad_norm": 7.722391890399128,
"learning_rate": 8.434959349593497e-06,
"loss": 1.347,
"step": 200
},
{
"epoch": 1.6565656565656566,
"grad_norm": 7.963303799143797,
"learning_rate": 8.333333333333334e-06,
"loss": 1.2988,
"step": 205
},
{
"epoch": 1.696969696969697,
"grad_norm": 8.363381324799755,
"learning_rate": 8.23170731707317e-06,
"loss": 1.3731,
"step": 210
},
{
"epoch": 1.7373737373737375,
"grad_norm": 8.650050591837509,
"learning_rate": 8.130081300813009e-06,
"loss": 1.3554,
"step": 215
},
{
"epoch": 1.7777777777777777,
"grad_norm": 7.821382086934233,
"learning_rate": 8.028455284552846e-06,
"loss": 1.3257,
"step": 220
},
{
"epoch": 1.8181818181818183,
"grad_norm": 7.593249783363984,
"learning_rate": 7.926829268292685e-06,
"loss": 1.2994,
"step": 225
},
{
"epoch": 1.8585858585858586,
"grad_norm": 8.265959609765153,
"learning_rate": 7.82520325203252e-06,
"loss": 1.258,
"step": 230
},
{
"epoch": 1.898989898989899,
"grad_norm": 7.47340446063849,
"learning_rate": 7.723577235772358e-06,
"loss": 1.3744,
"step": 235
},
{
"epoch": 1.9393939393939394,
"grad_norm": 7.8636893423505505,
"learning_rate": 7.621951219512196e-06,
"loss": 1.2867,
"step": 240
},
{
"epoch": 1.9797979797979797,
"grad_norm": 8.66108025838036,
"learning_rate": 7.520325203252034e-06,
"loss": 1.3423,
"step": 245
},
{
"epoch": 2.0202020202020203,
"grad_norm": 7.253266730457967,
"learning_rate": 7.41869918699187e-06,
"loss": 1.1248,
"step": 250
},
{
"epoch": 2.0606060606060606,
"grad_norm": 7.304801005516647,
"learning_rate": 7.317073170731707e-06,
"loss": 0.8695,
"step": 255
},
{
"epoch": 2.101010101010101,
"grad_norm": 8.148533408280995,
"learning_rate": 7.215447154471545e-06,
"loss": 0.8396,
"step": 260
},
{
"epoch": 2.1414141414141414,
"grad_norm": 9.202743834871297,
"learning_rate": 7.113821138211383e-06,
"loss": 0.8835,
"step": 265
},
{
"epoch": 2.1818181818181817,
"grad_norm": 9.248143695051853,
"learning_rate": 7.01219512195122e-06,
"loss": 0.769,
"step": 270
},
{
"epoch": 2.2222222222222223,
"grad_norm": 8.877400578549704,
"learning_rate": 6.910569105691057e-06,
"loss": 0.8883,
"step": 275
},
{
"epoch": 2.2626262626262625,
"grad_norm": 9.858300545714043,
"learning_rate": 6.808943089430895e-06,
"loss": 0.8892,
"step": 280
},
{
"epoch": 2.303030303030303,
"grad_norm": 8.940708831871842,
"learning_rate": 6.707317073170733e-06,
"loss": 0.8526,
"step": 285
},
{
"epoch": 2.3434343434343434,
"grad_norm": 9.167915788662723,
"learning_rate": 6.60569105691057e-06,
"loss": 0.9309,
"step": 290
},
{
"epoch": 2.3838383838383836,
"grad_norm": 8.435170262522817,
"learning_rate": 6.504065040650407e-06,
"loss": 0.8693,
"step": 295
},
{
"epoch": 2.4242424242424243,
"grad_norm": 9.022959586969035,
"learning_rate": 6.402439024390244e-06,
"loss": 0.8659,
"step": 300
},
{
"epoch": 2.4646464646464645,
"grad_norm": 8.705530302904208,
"learning_rate": 6.300813008130082e-06,
"loss": 0.9076,
"step": 305
},
{
"epoch": 2.505050505050505,
"grad_norm": 8.449067406312437,
"learning_rate": 6.199186991869919e-06,
"loss": 0.8896,
"step": 310
},
{
"epoch": 2.5454545454545454,
"grad_norm": 8.962552587001122,
"learning_rate": 6.0975609756097564e-06,
"loss": 0.8568,
"step": 315
},
{
"epoch": 2.5858585858585856,
"grad_norm": 9.26680724967832,
"learning_rate": 5.995934959349594e-06,
"loss": 0.8707,
"step": 320
},
{
"epoch": 2.6262626262626263,
"grad_norm": 9.852323988179384,
"learning_rate": 5.894308943089432e-06,
"loss": 0.9007,
"step": 325
},
{
"epoch": 2.6666666666666665,
"grad_norm": 9.473031970955077,
"learning_rate": 5.792682926829269e-06,
"loss": 0.8907,
"step": 330
},
{
"epoch": 2.707070707070707,
"grad_norm": 8.423216825316242,
"learning_rate": 5.691056910569106e-06,
"loss": 0.8408,
"step": 335
},
{
"epoch": 2.7474747474747474,
"grad_norm": 8.772022355651819,
"learning_rate": 5.589430894308944e-06,
"loss": 0.8791,
"step": 340
},
{
"epoch": 2.787878787878788,
"grad_norm": 9.456726517429484,
"learning_rate": 5.487804878048781e-06,
"loss": 0.9048,
"step": 345
},
{
"epoch": 2.8282828282828283,
"grad_norm": 8.932119436113132,
"learning_rate": 5.386178861788618e-06,
"loss": 0.9458,
"step": 350
},
{
"epoch": 2.8686868686868685,
"grad_norm": 9.076984191036512,
"learning_rate": 5.2845528455284555e-06,
"loss": 0.887,
"step": 355
},
{
"epoch": 2.909090909090909,
"grad_norm": 9.228482912276068,
"learning_rate": 5.182926829268293e-06,
"loss": 0.9164,
"step": 360
},
{
"epoch": 2.9494949494949494,
"grad_norm": 9.39115941622314,
"learning_rate": 5.081300813008131e-06,
"loss": 0.9146,
"step": 365
},
{
"epoch": 2.98989898989899,
"grad_norm": 9.590905474617363,
"learning_rate": 4.979674796747968e-06,
"loss": 0.8975,
"step": 370
},
{
"epoch": 3.0303030303030303,
"grad_norm": 8.147437990412763,
"learning_rate": 4.8780487804878055e-06,
"loss": 0.5845,
"step": 375
},
{
"epoch": 3.0707070707070705,
"grad_norm": 8.966178510928351,
"learning_rate": 4.776422764227643e-06,
"loss": 0.4348,
"step": 380
},
{
"epoch": 3.111111111111111,
"grad_norm": 9.411638909598917,
"learning_rate": 4.67479674796748e-06,
"loss": 0.4666,
"step": 385
},
{
"epoch": 3.1515151515151514,
"grad_norm": 8.787258663179166,
"learning_rate": 4.573170731707318e-06,
"loss": 0.4412,
"step": 390
},
{
"epoch": 3.191919191919192,
"grad_norm": 8.448686917354301,
"learning_rate": 4.471544715447155e-06,
"loss": 0.4644,
"step": 395
},
{
"epoch": 3.2323232323232323,
"grad_norm": 10.794449599661947,
"learning_rate": 4.369918699186992e-06,
"loss": 0.4697,
"step": 400
},
{
"epoch": 3.2727272727272725,
"grad_norm": 9.903329877706229,
"learning_rate": 4.268292682926829e-06,
"loss": 0.4451,
"step": 405
},
{
"epoch": 3.313131313131313,
"grad_norm": 10.77521515420911,
"learning_rate": 4.166666666666667e-06,
"loss": 0.4332,
"step": 410
},
{
"epoch": 3.3535353535353534,
"grad_norm": 9.850095186977901,
"learning_rate": 4.0650406504065046e-06,
"loss": 0.4571,
"step": 415
},
{
"epoch": 3.393939393939394,
"grad_norm": 9.719974950746256,
"learning_rate": 3.963414634146342e-06,
"loss": 0.4651,
"step": 420
},
{
"epoch": 3.4343434343434343,
"grad_norm": 8.909903135988007,
"learning_rate": 3.861788617886179e-06,
"loss": 0.4412,
"step": 425
},
{
"epoch": 3.474747474747475,
"grad_norm": 9.513088350689786,
"learning_rate": 3.760162601626017e-06,
"loss": 0.4713,
"step": 430
},
{
"epoch": 3.515151515151515,
"grad_norm": 8.770653506187902,
"learning_rate": 3.6585365853658537e-06,
"loss": 0.4325,
"step": 435
},
{
"epoch": 3.5555555555555554,
"grad_norm": 10.698789680572704,
"learning_rate": 3.5569105691056914e-06,
"loss": 0.456,
"step": 440
},
{
"epoch": 3.595959595959596,
"grad_norm": 9.719819699090596,
"learning_rate": 3.4552845528455287e-06,
"loss": 0.4673,
"step": 445
},
{
"epoch": 3.6363636363636362,
"grad_norm": 10.503333896340955,
"learning_rate": 3.3536585365853664e-06,
"loss": 0.4571,
"step": 450
},
{
"epoch": 3.676767676767677,
"grad_norm": 9.503329386063092,
"learning_rate": 3.2520325203252037e-06,
"loss": 0.4389,
"step": 455
},
{
"epoch": 3.717171717171717,
"grad_norm": 10.009998037929371,
"learning_rate": 3.150406504065041e-06,
"loss": 0.4638,
"step": 460
},
{
"epoch": 3.757575757575758,
"grad_norm": 9.165548124514086,
"learning_rate": 3.0487804878048782e-06,
"loss": 0.4911,
"step": 465
},
{
"epoch": 3.797979797979798,
"grad_norm": 8.906764540289775,
"learning_rate": 2.947154471544716e-06,
"loss": 0.4568,
"step": 470
},
{
"epoch": 3.8383838383838382,
"grad_norm": 9.034724254956192,
"learning_rate": 2.845528455284553e-06,
"loss": 0.4336,
"step": 475
},
{
"epoch": 3.878787878787879,
"grad_norm": 11.166876327596686,
"learning_rate": 2.7439024390243905e-06,
"loss": 0.4499,
"step": 480
},
{
"epoch": 3.919191919191919,
"grad_norm": 9.217158341597267,
"learning_rate": 2.6422764227642278e-06,
"loss": 0.421,
"step": 485
},
{
"epoch": 3.9595959595959593,
"grad_norm": 10.35588055954636,
"learning_rate": 2.5406504065040655e-06,
"loss": 0.461,
"step": 490
},
{
"epoch": 4.0,
"grad_norm": 9.702783400257974,
"learning_rate": 2.4390243902439027e-06,
"loss": 0.4693,
"step": 495
},
{
"epoch": 4.040404040404041,
"grad_norm": 7.785172054818246,
"learning_rate": 2.33739837398374e-06,
"loss": 0.2154,
"step": 500
},
{
"epoch": 4.08080808080808,
"grad_norm": 8.930594851346704,
"learning_rate": 2.2357723577235773e-06,
"loss": 0.1967,
"step": 505
},
{
"epoch": 4.121212121212121,
"grad_norm": 10.107028846989817,
"learning_rate": 2.1341463414634146e-06,
"loss": 0.1844,
"step": 510
},
{
"epoch": 4.161616161616162,
"grad_norm": 7.826371662630265,
"learning_rate": 2.0325203252032523e-06,
"loss": 0.19,
"step": 515
},
{
"epoch": 4.202020202020202,
"grad_norm": 7.454368070701538,
"learning_rate": 1.9308943089430896e-06,
"loss": 0.204,
"step": 520
},
{
"epoch": 4.242424242424242,
"grad_norm": 10.555419961914236,
"learning_rate": 1.8292682926829268e-06,
"loss": 0.1838,
"step": 525
},
{
"epoch": 4.282828282828283,
"grad_norm": 7.50612949709692,
"learning_rate": 1.7276422764227643e-06,
"loss": 0.2054,
"step": 530
},
{
"epoch": 4.3232323232323235,
"grad_norm": 7.8740842398900055,
"learning_rate": 1.6260162601626018e-06,
"loss": 0.1933,
"step": 535
},
{
"epoch": 4.363636363636363,
"grad_norm": 7.239232049440118,
"learning_rate": 1.5243902439024391e-06,
"loss": 0.195,
"step": 540
},
{
"epoch": 4.404040404040404,
"grad_norm": 8.52719981728453,
"learning_rate": 1.4227642276422766e-06,
"loss": 0.2201,
"step": 545
},
{
"epoch": 4.444444444444445,
"grad_norm": 6.870928125369453,
"learning_rate": 1.3211382113821139e-06,
"loss": 0.1876,
"step": 550
},
{
"epoch": 4.484848484848484,
"grad_norm": 7.742011223629616,
"learning_rate": 1.2195121951219514e-06,
"loss": 0.1811,
"step": 555
},
{
"epoch": 4.525252525252525,
"grad_norm": 7.506513525908142,
"learning_rate": 1.1178861788617887e-06,
"loss": 0.1805,
"step": 560
},
{
"epoch": 4.565656565656566,
"grad_norm": 7.401202667774116,
"learning_rate": 1.0162601626016261e-06,
"loss": 0.1951,
"step": 565
},
{
"epoch": 4.606060606060606,
"grad_norm": 8.135929523728391,
"learning_rate": 9.146341463414634e-07,
"loss": 0.1856,
"step": 570
},
{
"epoch": 4.646464646464646,
"grad_norm": 9.307497468880753,
"learning_rate": 8.130081300813009e-07,
"loss": 0.1807,
"step": 575
},
{
"epoch": 4.686868686868687,
"grad_norm": 8.362839951501785,
"learning_rate": 7.113821138211383e-07,
"loss": 0.1653,
"step": 580
},
{
"epoch": 4.7272727272727275,
"grad_norm": 9.06145198185612,
"learning_rate": 6.097560975609757e-07,
"loss": 0.1747,
"step": 585
},
{
"epoch": 4.767676767676767,
"grad_norm": 7.9663701451927516,
"learning_rate": 5.081300813008131e-07,
"loss": 0.1922,
"step": 590
},
{
"epoch": 4.808080808080808,
"grad_norm": 8.512067102195044,
"learning_rate": 4.0650406504065046e-07,
"loss": 0.1837,
"step": 595
},
{
"epoch": 4.848484848484849,
"grad_norm": 7.294652883363056,
"learning_rate": 3.0487804878048784e-07,
"loss": 0.1877,
"step": 600
},
{
"epoch": 4.888888888888889,
"grad_norm": 8.551601154075781,
"learning_rate": 2.0325203252032523e-07,
"loss": 0.1851,
"step": 605
},
{
"epoch": 4.929292929292929,
"grad_norm": 8.034884715447463,
"learning_rate": 1.0162601626016261e-07,
"loss": 0.187,
"step": 610
},
{
"epoch": 4.96969696969697,
"grad_norm": 7.727841885701067,
"learning_rate": 0.0,
"loss": 0.1957,
"step": 615
},
{
"epoch": 4.96969696969697,
"step": 615,
"total_flos": 849370300416.0,
"train_loss": 0.9517717417662706,
"train_runtime": 734.4559,
"train_samples_per_second": 26.959,
"train_steps_per_second": 0.837
}
],
"logging_steps": 5,
"max_steps": 615,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 849370300416.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}