|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.7036059806508356, |
|
"eval_steps": 500, |
|
"global_step": 200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.003518029903254178, |
|
"grad_norm": 1.260473608970642, |
|
"learning_rate": 0.0001, |
|
"loss": 2.9049, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.007036059806508356, |
|
"grad_norm": 1.2573540210723877, |
|
"learning_rate": 9.949748743718594e-05, |
|
"loss": 2.9229, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.010554089709762533, |
|
"grad_norm": 1.1971232891082764, |
|
"learning_rate": 9.899497487437186e-05, |
|
"loss": 2.7224, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.014072119613016711, |
|
"grad_norm": 1.3032786846160889, |
|
"learning_rate": 9.84924623115578e-05, |
|
"loss": 2.5683, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.01759014951627089, |
|
"grad_norm": 1.358597755432129, |
|
"learning_rate": 9.798994974874372e-05, |
|
"loss": 2.4848, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.021108179419525065, |
|
"grad_norm": 1.439229130744934, |
|
"learning_rate": 9.748743718592965e-05, |
|
"loss": 2.2625, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.024626209322779244, |
|
"grad_norm": 1.313475251197815, |
|
"learning_rate": 9.698492462311559e-05, |
|
"loss": 2.0213, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.028144239226033423, |
|
"grad_norm": 1.3972655534744263, |
|
"learning_rate": 9.64824120603015e-05, |
|
"loss": 2.0339, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0316622691292876, |
|
"grad_norm": 1.2032045125961304, |
|
"learning_rate": 9.597989949748745e-05, |
|
"loss": 1.7561, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.03518029903254178, |
|
"grad_norm": 1.1419575214385986, |
|
"learning_rate": 9.547738693467337e-05, |
|
"loss": 1.6908, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.03869832893579595, |
|
"grad_norm": 1.0084314346313477, |
|
"learning_rate": 9.49748743718593e-05, |
|
"loss": 1.4594, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.04221635883905013, |
|
"grad_norm": 0.8950707912445068, |
|
"learning_rate": 9.447236180904523e-05, |
|
"loss": 1.4036, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.04573438874230431, |
|
"grad_norm": 0.735814094543457, |
|
"learning_rate": 9.396984924623115e-05, |
|
"loss": 1.3852, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.04925241864555849, |
|
"grad_norm": 0.7151777148246765, |
|
"learning_rate": 9.34673366834171e-05, |
|
"loss": 1.4866, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.052770448548812667, |
|
"grad_norm": 0.6819817423820496, |
|
"learning_rate": 9.296482412060302e-05, |
|
"loss": 1.342, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.056288478452066845, |
|
"grad_norm": 0.6794399619102478, |
|
"learning_rate": 9.246231155778895e-05, |
|
"loss": 1.3458, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.05980650835532102, |
|
"grad_norm": 0.7077609896659851, |
|
"learning_rate": 9.195979899497488e-05, |
|
"loss": 1.3624, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.0633245382585752, |
|
"grad_norm": 0.6987395286560059, |
|
"learning_rate": 9.14572864321608e-05, |
|
"loss": 1.2826, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.06684256816182937, |
|
"grad_norm": 0.7677096724510193, |
|
"learning_rate": 9.095477386934675e-05, |
|
"loss": 1.3643, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.07036059806508356, |
|
"grad_norm": 0.7091855406761169, |
|
"learning_rate": 9.045226130653267e-05, |
|
"loss": 1.2746, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.07387862796833773, |
|
"grad_norm": 0.7494800090789795, |
|
"learning_rate": 8.99497487437186e-05, |
|
"loss": 1.3333, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.0773966578715919, |
|
"grad_norm": 0.7067513465881348, |
|
"learning_rate": 8.944723618090453e-05, |
|
"loss": 1.1819, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.08091468777484609, |
|
"grad_norm": 0.7177988290786743, |
|
"learning_rate": 8.894472361809045e-05, |
|
"loss": 1.248, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.08443271767810026, |
|
"grad_norm": 0.8215416669845581, |
|
"learning_rate": 8.84422110552764e-05, |
|
"loss": 1.2657, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.08795074758135445, |
|
"grad_norm": 0.7781690955162048, |
|
"learning_rate": 8.793969849246232e-05, |
|
"loss": 1.2697, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.09146877748460862, |
|
"grad_norm": 0.7476558685302734, |
|
"learning_rate": 8.743718592964825e-05, |
|
"loss": 1.1899, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.09498680738786279, |
|
"grad_norm": 0.7552306056022644, |
|
"learning_rate": 8.693467336683418e-05, |
|
"loss": 1.2209, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.09850483729111698, |
|
"grad_norm": 0.8574095368385315, |
|
"learning_rate": 8.64321608040201e-05, |
|
"loss": 1.2418, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.10202286719437115, |
|
"grad_norm": 0.8529037833213806, |
|
"learning_rate": 8.592964824120603e-05, |
|
"loss": 1.2028, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.10554089709762533, |
|
"grad_norm": 0.9519858360290527, |
|
"learning_rate": 8.542713567839196e-05, |
|
"loss": 1.2501, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.1090589270008795, |
|
"grad_norm": 0.929736852645874, |
|
"learning_rate": 8.49246231155779e-05, |
|
"loss": 1.162, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.11257695690413369, |
|
"grad_norm": 1.0004149675369263, |
|
"learning_rate": 8.442211055276383e-05, |
|
"loss": 1.297, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.11609498680738786, |
|
"grad_norm": 0.9739089608192444, |
|
"learning_rate": 8.391959798994975e-05, |
|
"loss": 1.2161, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.11961301671064203, |
|
"grad_norm": 1.010574460029602, |
|
"learning_rate": 8.341708542713568e-05, |
|
"loss": 1.2653, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.12313104661389622, |
|
"grad_norm": 1.1182818412780762, |
|
"learning_rate": 8.291457286432161e-05, |
|
"loss": 1.2406, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.1266490765171504, |
|
"grad_norm": 1.0420522689819336, |
|
"learning_rate": 8.241206030150754e-05, |
|
"loss": 1.1295, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.13016710642040458, |
|
"grad_norm": 0.9873694777488708, |
|
"learning_rate": 8.190954773869348e-05, |
|
"loss": 1.1819, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.13368513632365875, |
|
"grad_norm": 0.9985349774360657, |
|
"learning_rate": 8.14070351758794e-05, |
|
"loss": 1.0707, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.13720316622691292, |
|
"grad_norm": 0.9374362826347351, |
|
"learning_rate": 8.090452261306533e-05, |
|
"loss": 1.1942, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.14072119613016712, |
|
"grad_norm": 0.9439392685890198, |
|
"learning_rate": 8.040201005025126e-05, |
|
"loss": 1.249, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1442392260334213, |
|
"grad_norm": 0.7446234822273254, |
|
"learning_rate": 7.989949748743719e-05, |
|
"loss": 1.2198, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.14775725593667546, |
|
"grad_norm": 0.6614280939102173, |
|
"learning_rate": 7.939698492462313e-05, |
|
"loss": 1.147, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.15127528583992964, |
|
"grad_norm": 0.7048504948616028, |
|
"learning_rate": 7.889447236180904e-05, |
|
"loss": 1.2024, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.1547933157431838, |
|
"grad_norm": 0.6736462116241455, |
|
"learning_rate": 7.839195979899498e-05, |
|
"loss": 1.1622, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.158311345646438, |
|
"grad_norm": 0.651723325252533, |
|
"learning_rate": 7.788944723618091e-05, |
|
"loss": 1.115, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.16182937554969218, |
|
"grad_norm": 0.6364433169364929, |
|
"learning_rate": 7.738693467336684e-05, |
|
"loss": 1.1094, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.16534740545294635, |
|
"grad_norm": 0.6916666626930237, |
|
"learning_rate": 7.688442211055277e-05, |
|
"loss": 1.0492, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.16886543535620052, |
|
"grad_norm": 0.6645711064338684, |
|
"learning_rate": 7.638190954773869e-05, |
|
"loss": 1.1187, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.1723834652594547, |
|
"grad_norm": 0.6615894436836243, |
|
"learning_rate": 7.587939698492463e-05, |
|
"loss": 1.0937, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.1759014951627089, |
|
"grad_norm": 0.6639358997344971, |
|
"learning_rate": 7.537688442211056e-05, |
|
"loss": 1.1598, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.17941952506596306, |
|
"grad_norm": 0.7072437405586243, |
|
"learning_rate": 7.487437185929649e-05, |
|
"loss": 1.0835, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.18293755496921724, |
|
"grad_norm": 0.6130443811416626, |
|
"learning_rate": 7.437185929648241e-05, |
|
"loss": 1.1116, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.1864555848724714, |
|
"grad_norm": 0.6709645986557007, |
|
"learning_rate": 7.386934673366834e-05, |
|
"loss": 1.0814, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.18997361477572558, |
|
"grad_norm": 0.724520742893219, |
|
"learning_rate": 7.336683417085427e-05, |
|
"loss": 1.1467, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.19349164467897978, |
|
"grad_norm": 0.7026971578598022, |
|
"learning_rate": 7.28643216080402e-05, |
|
"loss": 1.1671, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.19700967458223395, |
|
"grad_norm": 0.6167840957641602, |
|
"learning_rate": 7.236180904522614e-05, |
|
"loss": 1.0342, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.20052770448548812, |
|
"grad_norm": 0.6177359819412231, |
|
"learning_rate": 7.185929648241206e-05, |
|
"loss": 0.9947, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.2040457343887423, |
|
"grad_norm": 0.6578599810600281, |
|
"learning_rate": 7.135678391959799e-05, |
|
"loss": 1.083, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.2075637642919965, |
|
"grad_norm": 0.6553827524185181, |
|
"learning_rate": 7.085427135678392e-05, |
|
"loss": 1.1451, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.21108179419525067, |
|
"grad_norm": 0.6147003769874573, |
|
"learning_rate": 7.035175879396985e-05, |
|
"loss": 1.1026, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.21459982409850484, |
|
"grad_norm": 0.6656669974327087, |
|
"learning_rate": 6.984924623115579e-05, |
|
"loss": 1.0638, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.218117854001759, |
|
"grad_norm": 0.6732162237167358, |
|
"learning_rate": 6.93467336683417e-05, |
|
"loss": 1.1639, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.22163588390501318, |
|
"grad_norm": 0.6583305597305298, |
|
"learning_rate": 6.884422110552764e-05, |
|
"loss": 1.0694, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.22515391380826738, |
|
"grad_norm": 0.573226809501648, |
|
"learning_rate": 6.834170854271357e-05, |
|
"loss": 1.0237, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.22867194371152155, |
|
"grad_norm": 0.6894761919975281, |
|
"learning_rate": 6.78391959798995e-05, |
|
"loss": 1.0884, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.23218997361477572, |
|
"grad_norm": 0.6620854735374451, |
|
"learning_rate": 6.733668341708544e-05, |
|
"loss": 1.0515, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.2357080035180299, |
|
"grad_norm": 0.695426344871521, |
|
"learning_rate": 6.683417085427135e-05, |
|
"loss": 1.1149, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.23922603342128407, |
|
"grad_norm": 0.6343066096305847, |
|
"learning_rate": 6.633165829145729e-05, |
|
"loss": 1.0736, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.24274406332453827, |
|
"grad_norm": 0.6478216052055359, |
|
"learning_rate": 6.582914572864322e-05, |
|
"loss": 1.0576, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.24626209322779244, |
|
"grad_norm": 0.7264822125434875, |
|
"learning_rate": 6.532663316582915e-05, |
|
"loss": 1.16, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.2497801231310466, |
|
"grad_norm": 0.7927188277244568, |
|
"learning_rate": 6.482412060301508e-05, |
|
"loss": 1.0784, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.2532981530343008, |
|
"grad_norm": 0.6734123826026917, |
|
"learning_rate": 6.4321608040201e-05, |
|
"loss": 1.1155, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.256816182937555, |
|
"grad_norm": 0.6928442120552063, |
|
"learning_rate": 6.381909547738694e-05, |
|
"loss": 1.1, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.26033421284080915, |
|
"grad_norm": 0.6205620765686035, |
|
"learning_rate": 6.331658291457287e-05, |
|
"loss": 1.0557, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.2638522427440633, |
|
"grad_norm": 0.6895455718040466, |
|
"learning_rate": 6.28140703517588e-05, |
|
"loss": 1.0893, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.2673702726473175, |
|
"grad_norm": 0.7075064778327942, |
|
"learning_rate": 6.231155778894473e-05, |
|
"loss": 1.0281, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.27088830255057167, |
|
"grad_norm": 0.8777890801429749, |
|
"learning_rate": 6.180904522613065e-05, |
|
"loss": 1.1, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.27440633245382584, |
|
"grad_norm": 0.7415732145309448, |
|
"learning_rate": 6.130653266331658e-05, |
|
"loss": 1.1266, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.27792436235708, |
|
"grad_norm": 0.6941065192222595, |
|
"learning_rate": 6.080402010050251e-05, |
|
"loss": 1.1073, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.28144239226033424, |
|
"grad_norm": 0.713752269744873, |
|
"learning_rate": 6.030150753768844e-05, |
|
"loss": 1.0299, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2849604221635884, |
|
"grad_norm": 0.672386884689331, |
|
"learning_rate": 5.979899497487438e-05, |
|
"loss": 1.1285, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.2884784520668426, |
|
"grad_norm": 0.6600875854492188, |
|
"learning_rate": 5.929648241206031e-05, |
|
"loss": 1.0618, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.29199648197009676, |
|
"grad_norm": 0.7304966449737549, |
|
"learning_rate": 5.879396984924623e-05, |
|
"loss": 1.068, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.2955145118733509, |
|
"grad_norm": 0.7191479206085205, |
|
"learning_rate": 5.829145728643216e-05, |
|
"loss": 1.0915, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.2990325417766051, |
|
"grad_norm": 0.6817315220832825, |
|
"learning_rate": 5.778894472361809e-05, |
|
"loss": 1.0081, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.30255057167985927, |
|
"grad_norm": 0.7097010016441345, |
|
"learning_rate": 5.728643216080403e-05, |
|
"loss": 1.0442, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.30606860158311344, |
|
"grad_norm": 0.7585952281951904, |
|
"learning_rate": 5.6783919597989955e-05, |
|
"loss": 1.0238, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.3095866314863676, |
|
"grad_norm": 0.7607995271682739, |
|
"learning_rate": 5.628140703517588e-05, |
|
"loss": 1.0959, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.3131046613896218, |
|
"grad_norm": 0.67258220911026, |
|
"learning_rate": 5.577889447236181e-05, |
|
"loss": 0.9929, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.316622691292876, |
|
"grad_norm": 0.75568026304245, |
|
"learning_rate": 5.527638190954774e-05, |
|
"loss": 1.105, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3201407211961302, |
|
"grad_norm": 0.8852781057357788, |
|
"learning_rate": 5.477386934673368e-05, |
|
"loss": 1.083, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.32365875109938436, |
|
"grad_norm": 0.6639973521232605, |
|
"learning_rate": 5.4271356783919604e-05, |
|
"loss": 1.073, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.32717678100263853, |
|
"grad_norm": 0.7528688311576843, |
|
"learning_rate": 5.376884422110553e-05, |
|
"loss": 1.0957, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.3306948109058927, |
|
"grad_norm": 0.7375084757804871, |
|
"learning_rate": 5.3266331658291455e-05, |
|
"loss": 1.0804, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.33421284080914687, |
|
"grad_norm": 0.8116129040718079, |
|
"learning_rate": 5.276381909547739e-05, |
|
"loss": 1.0797, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.33773087071240104, |
|
"grad_norm": 0.7964279055595398, |
|
"learning_rate": 5.226130653266332e-05, |
|
"loss": 1.1213, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.3412489006156552, |
|
"grad_norm": 0.765575110912323, |
|
"learning_rate": 5.175879396984925e-05, |
|
"loss": 1.0384, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.3447669305189094, |
|
"grad_norm": 0.6614196300506592, |
|
"learning_rate": 5.125628140703518e-05, |
|
"loss": 1.0332, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.3482849604221636, |
|
"grad_norm": 0.7407499551773071, |
|
"learning_rate": 5.0753768844221104e-05, |
|
"loss": 1.0688, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.3518029903254178, |
|
"grad_norm": 0.8672274947166443, |
|
"learning_rate": 5.0251256281407036e-05, |
|
"loss": 1.0742, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.35532102022867196, |
|
"grad_norm": 0.6899972558021545, |
|
"learning_rate": 4.974874371859297e-05, |
|
"loss": 0.9776, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.35883905013192613, |
|
"grad_norm": 0.7466877698898315, |
|
"learning_rate": 4.92462311557789e-05, |
|
"loss": 1.0293, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.3623570800351803, |
|
"grad_norm": 0.7986593842506409, |
|
"learning_rate": 4.874371859296483e-05, |
|
"loss": 1.0399, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.3658751099384345, |
|
"grad_norm": 0.6813223958015442, |
|
"learning_rate": 4.824120603015075e-05, |
|
"loss": 1.063, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.36939313984168864, |
|
"grad_norm": 0.7377122044563293, |
|
"learning_rate": 4.7738693467336685e-05, |
|
"loss": 0.9959, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.3729111697449428, |
|
"grad_norm": 0.7429965138435364, |
|
"learning_rate": 4.723618090452262e-05, |
|
"loss": 1.0617, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.376429199648197, |
|
"grad_norm": 0.8200985193252563, |
|
"learning_rate": 4.673366834170855e-05, |
|
"loss": 1.069, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.37994722955145116, |
|
"grad_norm": 0.734062910079956, |
|
"learning_rate": 4.6231155778894475e-05, |
|
"loss": 1.1513, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.3834652594547054, |
|
"grad_norm": 0.8677653670310974, |
|
"learning_rate": 4.57286432160804e-05, |
|
"loss": 1.1646, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.38698328935795956, |
|
"grad_norm": 0.7318121790885925, |
|
"learning_rate": 4.522613065326633e-05, |
|
"loss": 1.0443, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.39050131926121373, |
|
"grad_norm": 0.8211216330528259, |
|
"learning_rate": 4.4723618090452266e-05, |
|
"loss": 1.1295, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.3940193491644679, |
|
"grad_norm": 0.6949535608291626, |
|
"learning_rate": 4.42211055276382e-05, |
|
"loss": 1.0175, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.3975373790677221, |
|
"grad_norm": 0.7230639457702637, |
|
"learning_rate": 4.3718592964824124e-05, |
|
"loss": 1.0341, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.40105540897097625, |
|
"grad_norm": 0.793847918510437, |
|
"learning_rate": 4.321608040201005e-05, |
|
"loss": 1.0576, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.4045734388742304, |
|
"grad_norm": 0.7108281850814819, |
|
"learning_rate": 4.271356783919598e-05, |
|
"loss": 1.0636, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.4080914687774846, |
|
"grad_norm": 0.7297809720039368, |
|
"learning_rate": 4.2211055276381914e-05, |
|
"loss": 1.0821, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.41160949868073876, |
|
"grad_norm": 0.6856512427330017, |
|
"learning_rate": 4.170854271356784e-05, |
|
"loss": 0.9826, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.415127528583993, |
|
"grad_norm": 0.7112051248550415, |
|
"learning_rate": 4.120603015075377e-05, |
|
"loss": 1.0463, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.41864555848724716, |
|
"grad_norm": 0.6769644021987915, |
|
"learning_rate": 4.07035175879397e-05, |
|
"loss": 1.0091, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.42216358839050133, |
|
"grad_norm": 0.7250102758407593, |
|
"learning_rate": 4.020100502512563e-05, |
|
"loss": 1.0686, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4256816182937555, |
|
"grad_norm": 0.7410470843315125, |
|
"learning_rate": 3.969849246231156e-05, |
|
"loss": 1.0755, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.4291996481970097, |
|
"grad_norm": 0.7236255407333374, |
|
"learning_rate": 3.919597989949749e-05, |
|
"loss": 1.0721, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.43271767810026385, |
|
"grad_norm": 0.7625666856765747, |
|
"learning_rate": 3.869346733668342e-05, |
|
"loss": 0.966, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.436235708003518, |
|
"grad_norm": 0.7245182394981384, |
|
"learning_rate": 3.8190954773869346e-05, |
|
"loss": 1.0801, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.4397537379067722, |
|
"grad_norm": 0.7869658470153809, |
|
"learning_rate": 3.768844221105528e-05, |
|
"loss": 1.022, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.44327176781002636, |
|
"grad_norm": 0.7516188621520996, |
|
"learning_rate": 3.7185929648241204e-05, |
|
"loss": 1.0499, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.4467897977132806, |
|
"grad_norm": 0.7964783906936646, |
|
"learning_rate": 3.668341708542714e-05, |
|
"loss": 1.0321, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.45030782761653476, |
|
"grad_norm": 0.8271761536598206, |
|
"learning_rate": 3.618090452261307e-05, |
|
"loss": 1.0488, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.45382585751978893, |
|
"grad_norm": 0.69193434715271, |
|
"learning_rate": 3.5678391959798995e-05, |
|
"loss": 0.9999, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.4573438874230431, |
|
"grad_norm": 0.7824375033378601, |
|
"learning_rate": 3.517587939698493e-05, |
|
"loss": 1.0199, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.4608619173262973, |
|
"grad_norm": 0.7616211771965027, |
|
"learning_rate": 3.467336683417085e-05, |
|
"loss": 0.9752, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.46437994722955145, |
|
"grad_norm": 0.7464612126350403, |
|
"learning_rate": 3.4170854271356785e-05, |
|
"loss": 0.9756, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.4678979771328056, |
|
"grad_norm": 0.7916256189346313, |
|
"learning_rate": 3.366834170854272e-05, |
|
"loss": 1.1048, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.4714160070360598, |
|
"grad_norm": 0.7534184455871582, |
|
"learning_rate": 3.3165829145728643e-05, |
|
"loss": 0.9938, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.47493403693931396, |
|
"grad_norm": 0.6909853219985962, |
|
"learning_rate": 3.2663316582914576e-05, |
|
"loss": 0.9762, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.47845206684256814, |
|
"grad_norm": 0.7753147482872009, |
|
"learning_rate": 3.21608040201005e-05, |
|
"loss": 1.105, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.48197009674582236, |
|
"grad_norm": 0.7884505391120911, |
|
"learning_rate": 3.1658291457286434e-05, |
|
"loss": 1.0783, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.48548812664907653, |
|
"grad_norm": 0.7265881896018982, |
|
"learning_rate": 3.1155778894472366e-05, |
|
"loss": 0.9864, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.4890061565523307, |
|
"grad_norm": 0.7939391732215881, |
|
"learning_rate": 3.065326633165829e-05, |
|
"loss": 1.1004, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.4925241864555849, |
|
"grad_norm": 0.739389955997467, |
|
"learning_rate": 3.015075376884422e-05, |
|
"loss": 1.0617, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.49604221635883905, |
|
"grad_norm": 0.8098007440567017, |
|
"learning_rate": 2.9648241206030153e-05, |
|
"loss": 1.0949, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.4995602462620932, |
|
"grad_norm": 0.8120628595352173, |
|
"learning_rate": 2.914572864321608e-05, |
|
"loss": 1.0509, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.5030782761653474, |
|
"grad_norm": 0.8424797654151917, |
|
"learning_rate": 2.8643216080402015e-05, |
|
"loss": 1.1095, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.5065963060686016, |
|
"grad_norm": 0.7528412938117981, |
|
"learning_rate": 2.814070351758794e-05, |
|
"loss": 0.9922, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.5101143359718557, |
|
"grad_norm": 0.7280577421188354, |
|
"learning_rate": 2.763819095477387e-05, |
|
"loss": 1.0284, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.51363236587511, |
|
"grad_norm": 0.7835600972175598, |
|
"learning_rate": 2.7135678391959802e-05, |
|
"loss": 1.0266, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.5171503957783641, |
|
"grad_norm": 0.7442212104797363, |
|
"learning_rate": 2.6633165829145728e-05, |
|
"loss": 1.0308, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.5206684256816183, |
|
"grad_norm": 0.7954034209251404, |
|
"learning_rate": 2.613065326633166e-05, |
|
"loss": 1.1187, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.5241864555848724, |
|
"grad_norm": 0.6770613193511963, |
|
"learning_rate": 2.562814070351759e-05, |
|
"loss": 0.9689, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.5277044854881267, |
|
"grad_norm": 0.7365478277206421, |
|
"learning_rate": 2.5125628140703518e-05, |
|
"loss": 0.9841, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5312225153913809, |
|
"grad_norm": 0.7578640580177307, |
|
"learning_rate": 2.462311557788945e-05, |
|
"loss": 1.041, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.534740545294635, |
|
"grad_norm": 0.7007668614387512, |
|
"learning_rate": 2.4120603015075376e-05, |
|
"loss": 1.0407, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.5382585751978892, |
|
"grad_norm": 0.7602474689483643, |
|
"learning_rate": 2.361809045226131e-05, |
|
"loss": 0.9901, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.5417766051011433, |
|
"grad_norm": 0.8691968321800232, |
|
"learning_rate": 2.3115577889447238e-05, |
|
"loss": 1.0138, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.5452946350043976, |
|
"grad_norm": 0.7328104376792908, |
|
"learning_rate": 2.2613065326633167e-05, |
|
"loss": 1.0706, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.5488126649076517, |
|
"grad_norm": 0.7762755751609802, |
|
"learning_rate": 2.21105527638191e-05, |
|
"loss": 1.0248, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.5523306948109059, |
|
"grad_norm": 0.854016900062561, |
|
"learning_rate": 2.1608040201005025e-05, |
|
"loss": 1.0046, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.55584872471416, |
|
"grad_norm": 0.7478740215301514, |
|
"learning_rate": 2.1105527638190957e-05, |
|
"loss": 1.0434, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.5593667546174143, |
|
"grad_norm": 0.8043814301490784, |
|
"learning_rate": 2.0603015075376886e-05, |
|
"loss": 1.0655, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.5628847845206685, |
|
"grad_norm": 0.8493765592575073, |
|
"learning_rate": 2.0100502512562815e-05, |
|
"loss": 1.0395, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5664028144239226, |
|
"grad_norm": 0.783991813659668, |
|
"learning_rate": 1.9597989949748744e-05, |
|
"loss": 0.9494, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.5699208443271768, |
|
"grad_norm": 0.7609344124794006, |
|
"learning_rate": 1.9095477386934673e-05, |
|
"loss": 1.055, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.5734388742304309, |
|
"grad_norm": 0.766476035118103, |
|
"learning_rate": 1.8592964824120602e-05, |
|
"loss": 1.0782, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.5769569041336852, |
|
"grad_norm": 0.7780715227127075, |
|
"learning_rate": 1.8090452261306535e-05, |
|
"loss": 0.9793, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.5804749340369393, |
|
"grad_norm": 0.7344515919685364, |
|
"learning_rate": 1.7587939698492464e-05, |
|
"loss": 1.0129, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.5839929639401935, |
|
"grad_norm": 0.7865444421768188, |
|
"learning_rate": 1.7085427135678393e-05, |
|
"loss": 1.0503, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.5875109938434476, |
|
"grad_norm": 0.8012449741363525, |
|
"learning_rate": 1.6582914572864322e-05, |
|
"loss": 1.0298, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.5910290237467019, |
|
"grad_norm": 0.8140902519226074, |
|
"learning_rate": 1.608040201005025e-05, |
|
"loss": 1.1027, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.594547053649956, |
|
"grad_norm": 0.9053994417190552, |
|
"learning_rate": 1.5577889447236183e-05, |
|
"loss": 1.0591, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.5980650835532102, |
|
"grad_norm": 0.7367292642593384, |
|
"learning_rate": 1.507537688442211e-05, |
|
"loss": 1.0475, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.6015831134564644, |
|
"grad_norm": 0.8504379391670227, |
|
"learning_rate": 1.457286432160804e-05, |
|
"loss": 0.9989, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.6051011433597185, |
|
"grad_norm": 0.7499436736106873, |
|
"learning_rate": 1.407035175879397e-05, |
|
"loss": 1.0329, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.6086191732629728, |
|
"grad_norm": 0.8187640309333801, |
|
"learning_rate": 1.3567839195979901e-05, |
|
"loss": 1.0425, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.6121372031662269, |
|
"grad_norm": 0.7070643305778503, |
|
"learning_rate": 1.306532663316583e-05, |
|
"loss": 0.9766, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.6156552330694811, |
|
"grad_norm": 0.8162341713905334, |
|
"learning_rate": 1.2562814070351759e-05, |
|
"loss": 0.9974, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.6191732629727352, |
|
"grad_norm": 0.7759721875190735, |
|
"learning_rate": 1.2060301507537688e-05, |
|
"loss": 1.0475, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.6226912928759895, |
|
"grad_norm": 0.7885333299636841, |
|
"learning_rate": 1.1557788944723619e-05, |
|
"loss": 1.0531, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.6262093227792436, |
|
"grad_norm": 0.7671830654144287, |
|
"learning_rate": 1.105527638190955e-05, |
|
"loss": 0.9974, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.6297273526824978, |
|
"grad_norm": 0.7737442851066589, |
|
"learning_rate": 1.0552763819095479e-05, |
|
"loss": 1.0145, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.633245382585752, |
|
"grad_norm": 0.8488346338272095, |
|
"learning_rate": 1.0050251256281408e-05, |
|
"loss": 1.024, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.6367634124890061, |
|
"grad_norm": 0.7485771775245667, |
|
"learning_rate": 9.547738693467337e-06, |
|
"loss": 1.0519, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.6402814423922604, |
|
"grad_norm": 0.8044915795326233, |
|
"learning_rate": 9.045226130653267e-06, |
|
"loss": 0.977, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.6437994722955145, |
|
"grad_norm": 0.8789907693862915, |
|
"learning_rate": 8.542713567839196e-06, |
|
"loss": 1.0284, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.6473175021987687, |
|
"grad_norm": 0.7542572617530823, |
|
"learning_rate": 8.040201005025125e-06, |
|
"loss": 1.0125, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.6508355321020228, |
|
"grad_norm": 0.7793267965316772, |
|
"learning_rate": 7.537688442211055e-06, |
|
"loss": 1.0383, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.6543535620052771, |
|
"grad_norm": 0.774917721748352, |
|
"learning_rate": 7.035175879396985e-06, |
|
"loss": 1.0392, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.6578715919085312, |
|
"grad_norm": 0.8436054587364197, |
|
"learning_rate": 6.532663316582915e-06, |
|
"loss": 1.0772, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.6613896218117854, |
|
"grad_norm": 0.7968306541442871, |
|
"learning_rate": 6.030150753768844e-06, |
|
"loss": 1.0723, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.6649076517150396, |
|
"grad_norm": 0.8724409341812134, |
|
"learning_rate": 5.527638190954775e-06, |
|
"loss": 1.0429, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.6684256816182937, |
|
"grad_norm": 0.9110769033432007, |
|
"learning_rate": 5.025125628140704e-06, |
|
"loss": 1.0439, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.671943711521548, |
|
"grad_norm": 0.8945828080177307, |
|
"learning_rate": 4.522613065326634e-06, |
|
"loss": 1.0797, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.6754617414248021, |
|
"grad_norm": 0.8030518889427185, |
|
"learning_rate": 4.020100502512563e-06, |
|
"loss": 1.0457, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.6789797713280563, |
|
"grad_norm": 0.8692275285720825, |
|
"learning_rate": 3.5175879396984926e-06, |
|
"loss": 1.0895, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.6824978012313104, |
|
"grad_norm": 0.7445128560066223, |
|
"learning_rate": 3.015075376884422e-06, |
|
"loss": 0.9904, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.6860158311345647, |
|
"grad_norm": 0.7754868865013123, |
|
"learning_rate": 2.512562814070352e-06, |
|
"loss": 1.0576, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.6895338610378188, |
|
"grad_norm": 0.8235899806022644, |
|
"learning_rate": 2.0100502512562813e-06, |
|
"loss": 0.9928, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.693051890941073, |
|
"grad_norm": 0.8219490051269531, |
|
"learning_rate": 1.507537688442211e-06, |
|
"loss": 1.0847, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.6965699208443272, |
|
"grad_norm": 0.7800722122192383, |
|
"learning_rate": 1.0050251256281407e-06, |
|
"loss": 1.0303, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.7000879507475813, |
|
"grad_norm": 0.8147994875907898, |
|
"learning_rate": 5.025125628140703e-07, |
|
"loss": 1.045, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.7036059806508356, |
|
"grad_norm": 0.7462975978851318, |
|
"learning_rate": 0.0, |
|
"loss": 0.9956, |
|
"step": 200 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 200, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.871454142739251e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|