{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.7036059806508356,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003518029903254178,
"grad_norm": 1.260473608970642,
"learning_rate": 0.0001,
"loss": 2.9049,
"step": 1
},
{
"epoch": 0.007036059806508356,
"grad_norm": 1.2573540210723877,
"learning_rate": 9.949748743718594e-05,
"loss": 2.9229,
"step": 2
},
{
"epoch": 0.010554089709762533,
"grad_norm": 1.1971232891082764,
"learning_rate": 9.899497487437186e-05,
"loss": 2.7224,
"step": 3
},
{
"epoch": 0.014072119613016711,
"grad_norm": 1.3032786846160889,
"learning_rate": 9.84924623115578e-05,
"loss": 2.5683,
"step": 4
},
{
"epoch": 0.01759014951627089,
"grad_norm": 1.358597755432129,
"learning_rate": 9.798994974874372e-05,
"loss": 2.4848,
"step": 5
},
{
"epoch": 0.021108179419525065,
"grad_norm": 1.439229130744934,
"learning_rate": 9.748743718592965e-05,
"loss": 2.2625,
"step": 6
},
{
"epoch": 0.024626209322779244,
"grad_norm": 1.313475251197815,
"learning_rate": 9.698492462311559e-05,
"loss": 2.0213,
"step": 7
},
{
"epoch": 0.028144239226033423,
"grad_norm": 1.3972655534744263,
"learning_rate": 9.64824120603015e-05,
"loss": 2.0339,
"step": 8
},
{
"epoch": 0.0316622691292876,
"grad_norm": 1.2032045125961304,
"learning_rate": 9.597989949748745e-05,
"loss": 1.7561,
"step": 9
},
{
"epoch": 0.03518029903254178,
"grad_norm": 1.1419575214385986,
"learning_rate": 9.547738693467337e-05,
"loss": 1.6908,
"step": 10
},
{
"epoch": 0.03869832893579595,
"grad_norm": 1.0084314346313477,
"learning_rate": 9.49748743718593e-05,
"loss": 1.4594,
"step": 11
},
{
"epoch": 0.04221635883905013,
"grad_norm": 0.8950707912445068,
"learning_rate": 9.447236180904523e-05,
"loss": 1.4036,
"step": 12
},
{
"epoch": 0.04573438874230431,
"grad_norm": 0.735814094543457,
"learning_rate": 9.396984924623115e-05,
"loss": 1.3852,
"step": 13
},
{
"epoch": 0.04925241864555849,
"grad_norm": 0.7151777148246765,
"learning_rate": 9.34673366834171e-05,
"loss": 1.4866,
"step": 14
},
{
"epoch": 0.052770448548812667,
"grad_norm": 0.6819817423820496,
"learning_rate": 9.296482412060302e-05,
"loss": 1.342,
"step": 15
},
{
"epoch": 0.056288478452066845,
"grad_norm": 0.6794399619102478,
"learning_rate": 9.246231155778895e-05,
"loss": 1.3458,
"step": 16
},
{
"epoch": 0.05980650835532102,
"grad_norm": 0.7077609896659851,
"learning_rate": 9.195979899497488e-05,
"loss": 1.3624,
"step": 17
},
{
"epoch": 0.0633245382585752,
"grad_norm": 0.6987395286560059,
"learning_rate": 9.14572864321608e-05,
"loss": 1.2826,
"step": 18
},
{
"epoch": 0.06684256816182937,
"grad_norm": 0.7677096724510193,
"learning_rate": 9.095477386934675e-05,
"loss": 1.3643,
"step": 19
},
{
"epoch": 0.07036059806508356,
"grad_norm": 0.7091855406761169,
"learning_rate": 9.045226130653267e-05,
"loss": 1.2746,
"step": 20
},
{
"epoch": 0.07387862796833773,
"grad_norm": 0.7494800090789795,
"learning_rate": 8.99497487437186e-05,
"loss": 1.3333,
"step": 21
},
{
"epoch": 0.0773966578715919,
"grad_norm": 0.7067513465881348,
"learning_rate": 8.944723618090453e-05,
"loss": 1.1819,
"step": 22
},
{
"epoch": 0.08091468777484609,
"grad_norm": 0.7177988290786743,
"learning_rate": 8.894472361809045e-05,
"loss": 1.248,
"step": 23
},
{
"epoch": 0.08443271767810026,
"grad_norm": 0.8215416669845581,
"learning_rate": 8.84422110552764e-05,
"loss": 1.2657,
"step": 24
},
{
"epoch": 0.08795074758135445,
"grad_norm": 0.7781690955162048,
"learning_rate": 8.793969849246232e-05,
"loss": 1.2697,
"step": 25
},
{
"epoch": 0.09146877748460862,
"grad_norm": 0.7476558685302734,
"learning_rate": 8.743718592964825e-05,
"loss": 1.1899,
"step": 26
},
{
"epoch": 0.09498680738786279,
"grad_norm": 0.7552306056022644,
"learning_rate": 8.693467336683418e-05,
"loss": 1.2209,
"step": 27
},
{
"epoch": 0.09850483729111698,
"grad_norm": 0.8574095368385315,
"learning_rate": 8.64321608040201e-05,
"loss": 1.2418,
"step": 28
},
{
"epoch": 0.10202286719437115,
"grad_norm": 0.8529037833213806,
"learning_rate": 8.592964824120603e-05,
"loss": 1.2028,
"step": 29
},
{
"epoch": 0.10554089709762533,
"grad_norm": 0.9519858360290527,
"learning_rate": 8.542713567839196e-05,
"loss": 1.2501,
"step": 30
},
{
"epoch": 0.1090589270008795,
"grad_norm": 0.929736852645874,
"learning_rate": 8.49246231155779e-05,
"loss": 1.162,
"step": 31
},
{
"epoch": 0.11257695690413369,
"grad_norm": 1.0004149675369263,
"learning_rate": 8.442211055276383e-05,
"loss": 1.297,
"step": 32
},
{
"epoch": 0.11609498680738786,
"grad_norm": 0.9739089608192444,
"learning_rate": 8.391959798994975e-05,
"loss": 1.2161,
"step": 33
},
{
"epoch": 0.11961301671064203,
"grad_norm": 1.010574460029602,
"learning_rate": 8.341708542713568e-05,
"loss": 1.2653,
"step": 34
},
{
"epoch": 0.12313104661389622,
"grad_norm": 1.1182818412780762,
"learning_rate": 8.291457286432161e-05,
"loss": 1.2406,
"step": 35
},
{
"epoch": 0.1266490765171504,
"grad_norm": 1.0420522689819336,
"learning_rate": 8.241206030150754e-05,
"loss": 1.1295,
"step": 36
},
{
"epoch": 0.13016710642040458,
"grad_norm": 0.9873694777488708,
"learning_rate": 8.190954773869348e-05,
"loss": 1.1819,
"step": 37
},
{
"epoch": 0.13368513632365875,
"grad_norm": 0.9985349774360657,
"learning_rate": 8.14070351758794e-05,
"loss": 1.0707,
"step": 38
},
{
"epoch": 0.13720316622691292,
"grad_norm": 0.9374362826347351,
"learning_rate": 8.090452261306533e-05,
"loss": 1.1942,
"step": 39
},
{
"epoch": 0.14072119613016712,
"grad_norm": 0.9439392685890198,
"learning_rate": 8.040201005025126e-05,
"loss": 1.249,
"step": 40
},
{
"epoch": 0.1442392260334213,
"grad_norm": 0.7446234822273254,
"learning_rate": 7.989949748743719e-05,
"loss": 1.2198,
"step": 41
},
{
"epoch": 0.14775725593667546,
"grad_norm": 0.6614280939102173,
"learning_rate": 7.939698492462313e-05,
"loss": 1.147,
"step": 42
},
{
"epoch": 0.15127528583992964,
"grad_norm": 0.7048504948616028,
"learning_rate": 7.889447236180904e-05,
"loss": 1.2024,
"step": 43
},
{
"epoch": 0.1547933157431838,
"grad_norm": 0.6736462116241455,
"learning_rate": 7.839195979899498e-05,
"loss": 1.1622,
"step": 44
},
{
"epoch": 0.158311345646438,
"grad_norm": 0.651723325252533,
"learning_rate": 7.788944723618091e-05,
"loss": 1.115,
"step": 45
},
{
"epoch": 0.16182937554969218,
"grad_norm": 0.6364433169364929,
"learning_rate": 7.738693467336684e-05,
"loss": 1.1094,
"step": 46
},
{
"epoch": 0.16534740545294635,
"grad_norm": 0.6916666626930237,
"learning_rate": 7.688442211055277e-05,
"loss": 1.0492,
"step": 47
},
{
"epoch": 0.16886543535620052,
"grad_norm": 0.6645711064338684,
"learning_rate": 7.638190954773869e-05,
"loss": 1.1187,
"step": 48
},
{
"epoch": 0.1723834652594547,
"grad_norm": 0.6615894436836243,
"learning_rate": 7.587939698492463e-05,
"loss": 1.0937,
"step": 49
},
{
"epoch": 0.1759014951627089,
"grad_norm": 0.6639358997344971,
"learning_rate": 7.537688442211056e-05,
"loss": 1.1598,
"step": 50
},
{
"epoch": 0.17941952506596306,
"grad_norm": 0.7072437405586243,
"learning_rate": 7.487437185929649e-05,
"loss": 1.0835,
"step": 51
},
{
"epoch": 0.18293755496921724,
"grad_norm": 0.6130443811416626,
"learning_rate": 7.437185929648241e-05,
"loss": 1.1116,
"step": 52
},
{
"epoch": 0.1864555848724714,
"grad_norm": 0.6709645986557007,
"learning_rate": 7.386934673366834e-05,
"loss": 1.0814,
"step": 53
},
{
"epoch": 0.18997361477572558,
"grad_norm": 0.724520742893219,
"learning_rate": 7.336683417085427e-05,
"loss": 1.1467,
"step": 54
},
{
"epoch": 0.19349164467897978,
"grad_norm": 0.7026971578598022,
"learning_rate": 7.28643216080402e-05,
"loss": 1.1671,
"step": 55
},
{
"epoch": 0.19700967458223395,
"grad_norm": 0.6167840957641602,
"learning_rate": 7.236180904522614e-05,
"loss": 1.0342,
"step": 56
},
{
"epoch": 0.20052770448548812,
"grad_norm": 0.6177359819412231,
"learning_rate": 7.185929648241206e-05,
"loss": 0.9947,
"step": 57
},
{
"epoch": 0.2040457343887423,
"grad_norm": 0.6578599810600281,
"learning_rate": 7.135678391959799e-05,
"loss": 1.083,
"step": 58
},
{
"epoch": 0.2075637642919965,
"grad_norm": 0.6553827524185181,
"learning_rate": 7.085427135678392e-05,
"loss": 1.1451,
"step": 59
},
{
"epoch": 0.21108179419525067,
"grad_norm": 0.6147003769874573,
"learning_rate": 7.035175879396985e-05,
"loss": 1.1026,
"step": 60
},
{
"epoch": 0.21459982409850484,
"grad_norm": 0.6656669974327087,
"learning_rate": 6.984924623115579e-05,
"loss": 1.0638,
"step": 61
},
{
"epoch": 0.218117854001759,
"grad_norm": 0.6732162237167358,
"learning_rate": 6.93467336683417e-05,
"loss": 1.1639,
"step": 62
},
{
"epoch": 0.22163588390501318,
"grad_norm": 0.6583305597305298,
"learning_rate": 6.884422110552764e-05,
"loss": 1.0694,
"step": 63
},
{
"epoch": 0.22515391380826738,
"grad_norm": 0.573226809501648,
"learning_rate": 6.834170854271357e-05,
"loss": 1.0237,
"step": 64
},
{
"epoch": 0.22867194371152155,
"grad_norm": 0.6894761919975281,
"learning_rate": 6.78391959798995e-05,
"loss": 1.0884,
"step": 65
},
{
"epoch": 0.23218997361477572,
"grad_norm": 0.6620854735374451,
"learning_rate": 6.733668341708544e-05,
"loss": 1.0515,
"step": 66
},
{
"epoch": 0.2357080035180299,
"grad_norm": 0.695426344871521,
"learning_rate": 6.683417085427135e-05,
"loss": 1.1149,
"step": 67
},
{
"epoch": 0.23922603342128407,
"grad_norm": 0.6343066096305847,
"learning_rate": 6.633165829145729e-05,
"loss": 1.0736,
"step": 68
},
{
"epoch": 0.24274406332453827,
"grad_norm": 0.6478216052055359,
"learning_rate": 6.582914572864322e-05,
"loss": 1.0576,
"step": 69
},
{
"epoch": 0.24626209322779244,
"grad_norm": 0.7264822125434875,
"learning_rate": 6.532663316582915e-05,
"loss": 1.16,
"step": 70
},
{
"epoch": 0.2497801231310466,
"grad_norm": 0.7927188277244568,
"learning_rate": 6.482412060301508e-05,
"loss": 1.0784,
"step": 71
},
{
"epoch": 0.2532981530343008,
"grad_norm": 0.6734123826026917,
"learning_rate": 6.4321608040201e-05,
"loss": 1.1155,
"step": 72
},
{
"epoch": 0.256816182937555,
"grad_norm": 0.6928442120552063,
"learning_rate": 6.381909547738694e-05,
"loss": 1.1,
"step": 73
},
{
"epoch": 0.26033421284080915,
"grad_norm": 0.6205620765686035,
"learning_rate": 6.331658291457287e-05,
"loss": 1.0557,
"step": 74
},
{
"epoch": 0.2638522427440633,
"grad_norm": 0.6895455718040466,
"learning_rate": 6.28140703517588e-05,
"loss": 1.0893,
"step": 75
},
{
"epoch": 0.2673702726473175,
"grad_norm": 0.7075064778327942,
"learning_rate": 6.231155778894473e-05,
"loss": 1.0281,
"step": 76
},
{
"epoch": 0.27088830255057167,
"grad_norm": 0.8777890801429749,
"learning_rate": 6.180904522613065e-05,
"loss": 1.1,
"step": 77
},
{
"epoch": 0.27440633245382584,
"grad_norm": 0.7415732145309448,
"learning_rate": 6.130653266331658e-05,
"loss": 1.1266,
"step": 78
},
{
"epoch": 0.27792436235708,
"grad_norm": 0.6941065192222595,
"learning_rate": 6.080402010050251e-05,
"loss": 1.1073,
"step": 79
},
{
"epoch": 0.28144239226033424,
"grad_norm": 0.713752269744873,
"learning_rate": 6.030150753768844e-05,
"loss": 1.0299,
"step": 80
},
{
"epoch": 0.2849604221635884,
"grad_norm": 0.672386884689331,
"learning_rate": 5.979899497487438e-05,
"loss": 1.1285,
"step": 81
},
{
"epoch": 0.2884784520668426,
"grad_norm": 0.6600875854492188,
"learning_rate": 5.929648241206031e-05,
"loss": 1.0618,
"step": 82
},
{
"epoch": 0.29199648197009676,
"grad_norm": 0.7304966449737549,
"learning_rate": 5.879396984924623e-05,
"loss": 1.068,
"step": 83
},
{
"epoch": 0.2955145118733509,
"grad_norm": 0.7191479206085205,
"learning_rate": 5.829145728643216e-05,
"loss": 1.0915,
"step": 84
},
{
"epoch": 0.2990325417766051,
"grad_norm": 0.6817315220832825,
"learning_rate": 5.778894472361809e-05,
"loss": 1.0081,
"step": 85
},
{
"epoch": 0.30255057167985927,
"grad_norm": 0.7097010016441345,
"learning_rate": 5.728643216080403e-05,
"loss": 1.0442,
"step": 86
},
{
"epoch": 0.30606860158311344,
"grad_norm": 0.7585952281951904,
"learning_rate": 5.6783919597989955e-05,
"loss": 1.0238,
"step": 87
},
{
"epoch": 0.3095866314863676,
"grad_norm": 0.7607995271682739,
"learning_rate": 5.628140703517588e-05,
"loss": 1.0959,
"step": 88
},
{
"epoch": 0.3131046613896218,
"grad_norm": 0.67258220911026,
"learning_rate": 5.577889447236181e-05,
"loss": 0.9929,
"step": 89
},
{
"epoch": 0.316622691292876,
"grad_norm": 0.75568026304245,
"learning_rate": 5.527638190954774e-05,
"loss": 1.105,
"step": 90
},
{
"epoch": 0.3201407211961302,
"grad_norm": 0.8852781057357788,
"learning_rate": 5.477386934673368e-05,
"loss": 1.083,
"step": 91
},
{
"epoch": 0.32365875109938436,
"grad_norm": 0.6639973521232605,
"learning_rate": 5.4271356783919604e-05,
"loss": 1.073,
"step": 92
},
{
"epoch": 0.32717678100263853,
"grad_norm": 0.7528688311576843,
"learning_rate": 5.376884422110553e-05,
"loss": 1.0957,
"step": 93
},
{
"epoch": 0.3306948109058927,
"grad_norm": 0.7375084757804871,
"learning_rate": 5.3266331658291455e-05,
"loss": 1.0804,
"step": 94
},
{
"epoch": 0.33421284080914687,
"grad_norm": 0.8116129040718079,
"learning_rate": 5.276381909547739e-05,
"loss": 1.0797,
"step": 95
},
{
"epoch": 0.33773087071240104,
"grad_norm": 0.7964279055595398,
"learning_rate": 5.226130653266332e-05,
"loss": 1.1213,
"step": 96
},
{
"epoch": 0.3412489006156552,
"grad_norm": 0.765575110912323,
"learning_rate": 5.175879396984925e-05,
"loss": 1.0384,
"step": 97
},
{
"epoch": 0.3447669305189094,
"grad_norm": 0.6614196300506592,
"learning_rate": 5.125628140703518e-05,
"loss": 1.0332,
"step": 98
},
{
"epoch": 0.3482849604221636,
"grad_norm": 0.7407499551773071,
"learning_rate": 5.0753768844221104e-05,
"loss": 1.0688,
"step": 99
},
{
"epoch": 0.3518029903254178,
"grad_norm": 0.8672274947166443,
"learning_rate": 5.0251256281407036e-05,
"loss": 1.0742,
"step": 100
},
{
"epoch": 0.35532102022867196,
"grad_norm": 0.6899972558021545,
"learning_rate": 4.974874371859297e-05,
"loss": 0.9776,
"step": 101
},
{
"epoch": 0.35883905013192613,
"grad_norm": 0.7466877698898315,
"learning_rate": 4.92462311557789e-05,
"loss": 1.0293,
"step": 102
},
{
"epoch": 0.3623570800351803,
"grad_norm": 0.7986593842506409,
"learning_rate": 4.874371859296483e-05,
"loss": 1.0399,
"step": 103
},
{
"epoch": 0.3658751099384345,
"grad_norm": 0.6813223958015442,
"learning_rate": 4.824120603015075e-05,
"loss": 1.063,
"step": 104
},
{
"epoch": 0.36939313984168864,
"grad_norm": 0.7377122044563293,
"learning_rate": 4.7738693467336685e-05,
"loss": 0.9959,
"step": 105
},
{
"epoch": 0.3729111697449428,
"grad_norm": 0.7429965138435364,
"learning_rate": 4.723618090452262e-05,
"loss": 1.0617,
"step": 106
},
{
"epoch": 0.376429199648197,
"grad_norm": 0.8200985193252563,
"learning_rate": 4.673366834170855e-05,
"loss": 1.069,
"step": 107
},
{
"epoch": 0.37994722955145116,
"grad_norm": 0.734062910079956,
"learning_rate": 4.6231155778894475e-05,
"loss": 1.1513,
"step": 108
},
{
"epoch": 0.3834652594547054,
"grad_norm": 0.8677653670310974,
"learning_rate": 4.57286432160804e-05,
"loss": 1.1646,
"step": 109
},
{
"epoch": 0.38698328935795956,
"grad_norm": 0.7318121790885925,
"learning_rate": 4.522613065326633e-05,
"loss": 1.0443,
"step": 110
},
{
"epoch": 0.39050131926121373,
"grad_norm": 0.8211216330528259,
"learning_rate": 4.4723618090452266e-05,
"loss": 1.1295,
"step": 111
},
{
"epoch": 0.3940193491644679,
"grad_norm": 0.6949535608291626,
"learning_rate": 4.42211055276382e-05,
"loss": 1.0175,
"step": 112
},
{
"epoch": 0.3975373790677221,
"grad_norm": 0.7230639457702637,
"learning_rate": 4.3718592964824124e-05,
"loss": 1.0341,
"step": 113
},
{
"epoch": 0.40105540897097625,
"grad_norm": 0.793847918510437,
"learning_rate": 4.321608040201005e-05,
"loss": 1.0576,
"step": 114
},
{
"epoch": 0.4045734388742304,
"grad_norm": 0.7108281850814819,
"learning_rate": 4.271356783919598e-05,
"loss": 1.0636,
"step": 115
},
{
"epoch": 0.4080914687774846,
"grad_norm": 0.7297809720039368,
"learning_rate": 4.2211055276381914e-05,
"loss": 1.0821,
"step": 116
},
{
"epoch": 0.41160949868073876,
"grad_norm": 0.6856512427330017,
"learning_rate": 4.170854271356784e-05,
"loss": 0.9826,
"step": 117
},
{
"epoch": 0.415127528583993,
"grad_norm": 0.7112051248550415,
"learning_rate": 4.120603015075377e-05,
"loss": 1.0463,
"step": 118
},
{
"epoch": 0.41864555848724716,
"grad_norm": 0.6769644021987915,
"learning_rate": 4.07035175879397e-05,
"loss": 1.0091,
"step": 119
},
{
"epoch": 0.42216358839050133,
"grad_norm": 0.7250102758407593,
"learning_rate": 4.020100502512563e-05,
"loss": 1.0686,
"step": 120
},
{
"epoch": 0.4256816182937555,
"grad_norm": 0.7410470843315125,
"learning_rate": 3.969849246231156e-05,
"loss": 1.0755,
"step": 121
},
{
"epoch": 0.4291996481970097,
"grad_norm": 0.7236255407333374,
"learning_rate": 3.919597989949749e-05,
"loss": 1.0721,
"step": 122
},
{
"epoch": 0.43271767810026385,
"grad_norm": 0.7625666856765747,
"learning_rate": 3.869346733668342e-05,
"loss": 0.966,
"step": 123
},
{
"epoch": 0.436235708003518,
"grad_norm": 0.7245182394981384,
"learning_rate": 3.8190954773869346e-05,
"loss": 1.0801,
"step": 124
},
{
"epoch": 0.4397537379067722,
"grad_norm": 0.7869658470153809,
"learning_rate": 3.768844221105528e-05,
"loss": 1.022,
"step": 125
},
{
"epoch": 0.44327176781002636,
"grad_norm": 0.7516188621520996,
"learning_rate": 3.7185929648241204e-05,
"loss": 1.0499,
"step": 126
},
{
"epoch": 0.4467897977132806,
"grad_norm": 0.7964783906936646,
"learning_rate": 3.668341708542714e-05,
"loss": 1.0321,
"step": 127
},
{
"epoch": 0.45030782761653476,
"grad_norm": 0.8271761536598206,
"learning_rate": 3.618090452261307e-05,
"loss": 1.0488,
"step": 128
},
{
"epoch": 0.45382585751978893,
"grad_norm": 0.69193434715271,
"learning_rate": 3.5678391959798995e-05,
"loss": 0.9999,
"step": 129
},
{
"epoch": 0.4573438874230431,
"grad_norm": 0.7824375033378601,
"learning_rate": 3.517587939698493e-05,
"loss": 1.0199,
"step": 130
},
{
"epoch": 0.4608619173262973,
"grad_norm": 0.7616211771965027,
"learning_rate": 3.467336683417085e-05,
"loss": 0.9752,
"step": 131
},
{
"epoch": 0.46437994722955145,
"grad_norm": 0.7464612126350403,
"learning_rate": 3.4170854271356785e-05,
"loss": 0.9756,
"step": 132
},
{
"epoch": 0.4678979771328056,
"grad_norm": 0.7916256189346313,
"learning_rate": 3.366834170854272e-05,
"loss": 1.1048,
"step": 133
},
{
"epoch": 0.4714160070360598,
"grad_norm": 0.7534184455871582,
"learning_rate": 3.3165829145728643e-05,
"loss": 0.9938,
"step": 134
},
{
"epoch": 0.47493403693931396,
"grad_norm": 0.6909853219985962,
"learning_rate": 3.2663316582914576e-05,
"loss": 0.9762,
"step": 135
},
{
"epoch": 0.47845206684256814,
"grad_norm": 0.7753147482872009,
"learning_rate": 3.21608040201005e-05,
"loss": 1.105,
"step": 136
},
{
"epoch": 0.48197009674582236,
"grad_norm": 0.7884505391120911,
"learning_rate": 3.1658291457286434e-05,
"loss": 1.0783,
"step": 137
},
{
"epoch": 0.48548812664907653,
"grad_norm": 0.7265881896018982,
"learning_rate": 3.1155778894472366e-05,
"loss": 0.9864,
"step": 138
},
{
"epoch": 0.4890061565523307,
"grad_norm": 0.7939391732215881,
"learning_rate": 3.065326633165829e-05,
"loss": 1.1004,
"step": 139
},
{
"epoch": 0.4925241864555849,
"grad_norm": 0.739389955997467,
"learning_rate": 3.015075376884422e-05,
"loss": 1.0617,
"step": 140
},
{
"epoch": 0.49604221635883905,
"grad_norm": 0.8098007440567017,
"learning_rate": 2.9648241206030153e-05,
"loss": 1.0949,
"step": 141
},
{
"epoch": 0.4995602462620932,
"grad_norm": 0.8120628595352173,
"learning_rate": 2.914572864321608e-05,
"loss": 1.0509,
"step": 142
},
{
"epoch": 0.5030782761653474,
"grad_norm": 0.8424797654151917,
"learning_rate": 2.8643216080402015e-05,
"loss": 1.1095,
"step": 143
},
{
"epoch": 0.5065963060686016,
"grad_norm": 0.7528412938117981,
"learning_rate": 2.814070351758794e-05,
"loss": 0.9922,
"step": 144
},
{
"epoch": 0.5101143359718557,
"grad_norm": 0.7280577421188354,
"learning_rate": 2.763819095477387e-05,
"loss": 1.0284,
"step": 145
},
{
"epoch": 0.51363236587511,
"grad_norm": 0.7835600972175598,
"learning_rate": 2.7135678391959802e-05,
"loss": 1.0266,
"step": 146
},
{
"epoch": 0.5171503957783641,
"grad_norm": 0.7442212104797363,
"learning_rate": 2.6633165829145728e-05,
"loss": 1.0308,
"step": 147
},
{
"epoch": 0.5206684256816183,
"grad_norm": 0.7954034209251404,
"learning_rate": 2.613065326633166e-05,
"loss": 1.1187,
"step": 148
},
{
"epoch": 0.5241864555848724,
"grad_norm": 0.6770613193511963,
"learning_rate": 2.562814070351759e-05,
"loss": 0.9689,
"step": 149
},
{
"epoch": 0.5277044854881267,
"grad_norm": 0.7365478277206421,
"learning_rate": 2.5125628140703518e-05,
"loss": 0.9841,
"step": 150
},
{
"epoch": 0.5312225153913809,
"grad_norm": 0.7578640580177307,
"learning_rate": 2.462311557788945e-05,
"loss": 1.041,
"step": 151
},
{
"epoch": 0.534740545294635,
"grad_norm": 0.7007668614387512,
"learning_rate": 2.4120603015075376e-05,
"loss": 1.0407,
"step": 152
},
{
"epoch": 0.5382585751978892,
"grad_norm": 0.7602474689483643,
"learning_rate": 2.361809045226131e-05,
"loss": 0.9901,
"step": 153
},
{
"epoch": 0.5417766051011433,
"grad_norm": 0.8691968321800232,
"learning_rate": 2.3115577889447238e-05,
"loss": 1.0138,
"step": 154
},
{
"epoch": 0.5452946350043976,
"grad_norm": 0.7328104376792908,
"learning_rate": 2.2613065326633167e-05,
"loss": 1.0706,
"step": 155
},
{
"epoch": 0.5488126649076517,
"grad_norm": 0.7762755751609802,
"learning_rate": 2.21105527638191e-05,
"loss": 1.0248,
"step": 156
},
{
"epoch": 0.5523306948109059,
"grad_norm": 0.854016900062561,
"learning_rate": 2.1608040201005025e-05,
"loss": 1.0046,
"step": 157
},
{
"epoch": 0.55584872471416,
"grad_norm": 0.7478740215301514,
"learning_rate": 2.1105527638190957e-05,
"loss": 1.0434,
"step": 158
},
{
"epoch": 0.5593667546174143,
"grad_norm": 0.8043814301490784,
"learning_rate": 2.0603015075376886e-05,
"loss": 1.0655,
"step": 159
},
{
"epoch": 0.5628847845206685,
"grad_norm": 0.8493765592575073,
"learning_rate": 2.0100502512562815e-05,
"loss": 1.0395,
"step": 160
},
{
"epoch": 0.5664028144239226,
"grad_norm": 0.783991813659668,
"learning_rate": 1.9597989949748744e-05,
"loss": 0.9494,
"step": 161
},
{
"epoch": 0.5699208443271768,
"grad_norm": 0.7609344124794006,
"learning_rate": 1.9095477386934673e-05,
"loss": 1.055,
"step": 162
},
{
"epoch": 0.5734388742304309,
"grad_norm": 0.766476035118103,
"learning_rate": 1.8592964824120602e-05,
"loss": 1.0782,
"step": 163
},
{
"epoch": 0.5769569041336852,
"grad_norm": 0.7780715227127075,
"learning_rate": 1.8090452261306535e-05,
"loss": 0.9793,
"step": 164
},
{
"epoch": 0.5804749340369393,
"grad_norm": 0.7344515919685364,
"learning_rate": 1.7587939698492464e-05,
"loss": 1.0129,
"step": 165
},
{
"epoch": 0.5839929639401935,
"grad_norm": 0.7865444421768188,
"learning_rate": 1.7085427135678393e-05,
"loss": 1.0503,
"step": 166
},
{
"epoch": 0.5875109938434476,
"grad_norm": 0.8012449741363525,
"learning_rate": 1.6582914572864322e-05,
"loss": 1.0298,
"step": 167
},
{
"epoch": 0.5910290237467019,
"grad_norm": 0.8140902519226074,
"learning_rate": 1.608040201005025e-05,
"loss": 1.1027,
"step": 168
},
{
"epoch": 0.594547053649956,
"grad_norm": 0.9053994417190552,
"learning_rate": 1.5577889447236183e-05,
"loss": 1.0591,
"step": 169
},
{
"epoch": 0.5980650835532102,
"grad_norm": 0.7367292642593384,
"learning_rate": 1.507537688442211e-05,
"loss": 1.0475,
"step": 170
},
{
"epoch": 0.6015831134564644,
"grad_norm": 0.8504379391670227,
"learning_rate": 1.457286432160804e-05,
"loss": 0.9989,
"step": 171
},
{
"epoch": 0.6051011433597185,
"grad_norm": 0.7499436736106873,
"learning_rate": 1.407035175879397e-05,
"loss": 1.0329,
"step": 172
},
{
"epoch": 0.6086191732629728,
"grad_norm": 0.8187640309333801,
"learning_rate": 1.3567839195979901e-05,
"loss": 1.0425,
"step": 173
},
{
"epoch": 0.6121372031662269,
"grad_norm": 0.7070643305778503,
"learning_rate": 1.306532663316583e-05,
"loss": 0.9766,
"step": 174
},
{
"epoch": 0.6156552330694811,
"grad_norm": 0.8162341713905334,
"learning_rate": 1.2562814070351759e-05,
"loss": 0.9974,
"step": 175
},
{
"epoch": 0.6191732629727352,
"grad_norm": 0.7759721875190735,
"learning_rate": 1.2060301507537688e-05,
"loss": 1.0475,
"step": 176
},
{
"epoch": 0.6226912928759895,
"grad_norm": 0.7885333299636841,
"learning_rate": 1.1557788944723619e-05,
"loss": 1.0531,
"step": 177
},
{
"epoch": 0.6262093227792436,
"grad_norm": 0.7671830654144287,
"learning_rate": 1.105527638190955e-05,
"loss": 0.9974,
"step": 178
},
{
"epoch": 0.6297273526824978,
"grad_norm": 0.7737442851066589,
"learning_rate": 1.0552763819095479e-05,
"loss": 1.0145,
"step": 179
},
{
"epoch": 0.633245382585752,
"grad_norm": 0.8488346338272095,
"learning_rate": 1.0050251256281408e-05,
"loss": 1.024,
"step": 180
},
{
"epoch": 0.6367634124890061,
"grad_norm": 0.7485771775245667,
"learning_rate": 9.547738693467337e-06,
"loss": 1.0519,
"step": 181
},
{
"epoch": 0.6402814423922604,
"grad_norm": 0.8044915795326233,
"learning_rate": 9.045226130653267e-06,
"loss": 0.977,
"step": 182
},
{
"epoch": 0.6437994722955145,
"grad_norm": 0.8789907693862915,
"learning_rate": 8.542713567839196e-06,
"loss": 1.0284,
"step": 183
},
{
"epoch": 0.6473175021987687,
"grad_norm": 0.7542572617530823,
"learning_rate": 8.040201005025125e-06,
"loss": 1.0125,
"step": 184
},
{
"epoch": 0.6508355321020228,
"grad_norm": 0.7793267965316772,
"learning_rate": 7.537688442211055e-06,
"loss": 1.0383,
"step": 185
},
{
"epoch": 0.6543535620052771,
"grad_norm": 0.774917721748352,
"learning_rate": 7.035175879396985e-06,
"loss": 1.0392,
"step": 186
},
{
"epoch": 0.6578715919085312,
"grad_norm": 0.8436054587364197,
"learning_rate": 6.532663316582915e-06,
"loss": 1.0772,
"step": 187
},
{
"epoch": 0.6613896218117854,
"grad_norm": 0.7968306541442871,
"learning_rate": 6.030150753768844e-06,
"loss": 1.0723,
"step": 188
},
{
"epoch": 0.6649076517150396,
"grad_norm": 0.8724409341812134,
"learning_rate": 5.527638190954775e-06,
"loss": 1.0429,
"step": 189
},
{
"epoch": 0.6684256816182937,
"grad_norm": 0.9110769033432007,
"learning_rate": 5.025125628140704e-06,
"loss": 1.0439,
"step": 190
},
{
"epoch": 0.671943711521548,
"grad_norm": 0.8945828080177307,
"learning_rate": 4.522613065326634e-06,
"loss": 1.0797,
"step": 191
},
{
"epoch": 0.6754617414248021,
"grad_norm": 0.8030518889427185,
"learning_rate": 4.020100502512563e-06,
"loss": 1.0457,
"step": 192
},
{
"epoch": 0.6789797713280563,
"grad_norm": 0.8692275285720825,
"learning_rate": 3.5175879396984926e-06,
"loss": 1.0895,
"step": 193
},
{
"epoch": 0.6824978012313104,
"grad_norm": 0.7445128560066223,
"learning_rate": 3.015075376884422e-06,
"loss": 0.9904,
"step": 194
},
{
"epoch": 0.6860158311345647,
"grad_norm": 0.7754868865013123,
"learning_rate": 2.512562814070352e-06,
"loss": 1.0576,
"step": 195
},
{
"epoch": 0.6895338610378188,
"grad_norm": 0.8235899806022644,
"learning_rate": 2.0100502512562813e-06,
"loss": 0.9928,
"step": 196
},
{
"epoch": 0.693051890941073,
"grad_norm": 0.8219490051269531,
"learning_rate": 1.507537688442211e-06,
"loss": 1.0847,
"step": 197
},
{
"epoch": 0.6965699208443272,
"grad_norm": 0.7800722122192383,
"learning_rate": 1.0050251256281407e-06,
"loss": 1.0303,
"step": 198
},
{
"epoch": 0.7000879507475813,
"grad_norm": 0.8147994875907898,
"learning_rate": 5.025125628140703e-07,
"loss": 1.045,
"step": 199
},
{
"epoch": 0.7036059806508356,
"grad_norm": 0.7462975978851318,
"learning_rate": 0.0,
"loss": 0.9956,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.871454142739251e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}