|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.4444444444444444, |
|
"eval_steps": 500, |
|
"global_step": 325000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0022222222222222222, |
|
"grad_norm": 4388.67578125, |
|
"learning_rate": 5.988e-07, |
|
"loss": 891.7704, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.0044444444444444444, |
|
"grad_norm": 1130.9163818359375, |
|
"learning_rate": 1.1988e-06, |
|
"loss": 365.2608, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.006666666666666667, |
|
"grad_norm": 647.9131469726562, |
|
"learning_rate": 1.7988e-06, |
|
"loss": 143.2146, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.008888888888888889, |
|
"grad_norm": 862.5914916992188, |
|
"learning_rate": 2.3988000000000002e-06, |
|
"loss": 101.8926, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.011111111111111112, |
|
"grad_norm": 874.10302734375, |
|
"learning_rate": 2.9988e-06, |
|
"loss": 86.5583, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.013333333333333334, |
|
"grad_norm": 732.438720703125, |
|
"learning_rate": 3.5988e-06, |
|
"loss": 80.9323, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.015555555555555555, |
|
"grad_norm": 493.2248229980469, |
|
"learning_rate": 4.1988e-06, |
|
"loss": 73.8484, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.017777777777777778, |
|
"grad_norm": 678.9496459960938, |
|
"learning_rate": 4.7988e-06, |
|
"loss": 68.8807, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2241.881103515625, |
|
"learning_rate": 5.398800000000001e-06, |
|
"loss": 69.1163, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.022222222222222223, |
|
"grad_norm": 572.26318359375, |
|
"learning_rate": 5.9988e-06, |
|
"loss": 65.9477, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.024444444444444446, |
|
"grad_norm": 472.3359069824219, |
|
"learning_rate": 6.5988e-06, |
|
"loss": 60.6877, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.02666666666666667, |
|
"grad_norm": 713.2996215820312, |
|
"learning_rate": 7.1988000000000004e-06, |
|
"loss": 62.0643, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.028888888888888888, |
|
"grad_norm": 399.187255859375, |
|
"learning_rate": 7.7988e-06, |
|
"loss": 58.1376, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.03111111111111111, |
|
"grad_norm": 494.1978454589844, |
|
"learning_rate": 8.3988e-06, |
|
"loss": 56.4748, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.03333333333333333, |
|
"grad_norm": 338.4364318847656, |
|
"learning_rate": 8.998800000000001e-06, |
|
"loss": 59.7625, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.035555555555555556, |
|
"grad_norm": 287.89202880859375, |
|
"learning_rate": 9.5988e-06, |
|
"loss": 55.0997, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.03777777777777778, |
|
"grad_norm": 213.35813903808594, |
|
"learning_rate": 1.01988e-05, |
|
"loss": 53.2111, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 354.8004455566406, |
|
"learning_rate": 1.07988e-05, |
|
"loss": 53.5394, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.042222222222222223, |
|
"grad_norm": 875.28955078125, |
|
"learning_rate": 1.1398800000000002e-05, |
|
"loss": 52.944, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.044444444444444446, |
|
"grad_norm": 523.3621215820312, |
|
"learning_rate": 1.19988e-05, |
|
"loss": 50.8715, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.04666666666666667, |
|
"grad_norm": 545.8438720703125, |
|
"learning_rate": 1.25988e-05, |
|
"loss": 51.0906, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.04888888888888889, |
|
"grad_norm": 371.3891296386719, |
|
"learning_rate": 1.3198800000000001e-05, |
|
"loss": 49.5472, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.051111111111111114, |
|
"grad_norm": 175.73524475097656, |
|
"learning_rate": 1.3798799999999999e-05, |
|
"loss": 47.1287, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.05333333333333334, |
|
"grad_norm": 335.2581481933594, |
|
"learning_rate": 1.43988e-05, |
|
"loss": 47.6528, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.05555555555555555, |
|
"grad_norm": 1022.18115234375, |
|
"learning_rate": 1.4998800000000001e-05, |
|
"loss": 46.9557, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.057777777777777775, |
|
"grad_norm": 380.919677734375, |
|
"learning_rate": 1.55988e-05, |
|
"loss": 44.6385, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 305.0384826660156, |
|
"learning_rate": 1.61988e-05, |
|
"loss": 44.5282, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.06222222222222222, |
|
"grad_norm": 458.19122314453125, |
|
"learning_rate": 1.67988e-05, |
|
"loss": 44.6465, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.06444444444444444, |
|
"grad_norm": 143.66160583496094, |
|
"learning_rate": 1.73988e-05, |
|
"loss": 44.0934, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.06666666666666667, |
|
"grad_norm": 436.7533874511719, |
|
"learning_rate": 1.79988e-05, |
|
"loss": 43.5587, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.06888888888888889, |
|
"grad_norm": 455.068359375, |
|
"learning_rate": 1.85988e-05, |
|
"loss": 41.507, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.07111111111111111, |
|
"grad_norm": 394.86676025390625, |
|
"learning_rate": 1.91988e-05, |
|
"loss": 40.521, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.07333333333333333, |
|
"grad_norm": 371.15753173828125, |
|
"learning_rate": 1.97988e-05, |
|
"loss": 40.0934, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.07555555555555556, |
|
"grad_norm": 476.3223571777344, |
|
"learning_rate": 2.0398800000000002e-05, |
|
"loss": 42.2142, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.07777777777777778, |
|
"grad_norm": 498.6954650878906, |
|
"learning_rate": 2.0998800000000003e-05, |
|
"loss": 39.011, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 327.6210632324219, |
|
"learning_rate": 2.15988e-05, |
|
"loss": 39.5519, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.08222222222222222, |
|
"grad_norm": 210.87628173828125, |
|
"learning_rate": 2.2198799999999998e-05, |
|
"loss": 39.4893, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.08444444444444445, |
|
"grad_norm": 357.408203125, |
|
"learning_rate": 2.27988e-05, |
|
"loss": 39.7812, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.08666666666666667, |
|
"grad_norm": 312.556640625, |
|
"learning_rate": 2.33988e-05, |
|
"loss": 37.975, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.08888888888888889, |
|
"grad_norm": 363.57891845703125, |
|
"learning_rate": 2.39988e-05, |
|
"loss": 36.2815, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.09111111111111111, |
|
"grad_norm": 332.95977783203125, |
|
"learning_rate": 2.4598800000000002e-05, |
|
"loss": 36.7108, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.09333333333333334, |
|
"grad_norm": 483.03765869140625, |
|
"learning_rate": 2.5198800000000003e-05, |
|
"loss": 36.0883, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.09555555555555556, |
|
"grad_norm": 266.86065673828125, |
|
"learning_rate": 2.5798799999999998e-05, |
|
"loss": 38.5255, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.09777777777777778, |
|
"grad_norm": 371.4537048339844, |
|
"learning_rate": 2.63988e-05, |
|
"loss": 34.8224, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1334.1453857421875, |
|
"learning_rate": 2.69988e-05, |
|
"loss": 36.1617, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.10222222222222223, |
|
"grad_norm": 234.84649658203125, |
|
"learning_rate": 2.75988e-05, |
|
"loss": 35.088, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.10444444444444445, |
|
"grad_norm": 2964.02978515625, |
|
"learning_rate": 2.8198800000000002e-05, |
|
"loss": 34.028, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.10666666666666667, |
|
"grad_norm": 456.6842956542969, |
|
"learning_rate": 2.8798800000000003e-05, |
|
"loss": 36.25, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.10888888888888888, |
|
"grad_norm": 306.76007080078125, |
|
"learning_rate": 2.9398800000000004e-05, |
|
"loss": 33.3643, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.1111111111111111, |
|
"grad_norm": 818.77783203125, |
|
"learning_rate": 2.9998799999999998e-05, |
|
"loss": 36.2583, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.11333333333333333, |
|
"grad_norm": 173.24815368652344, |
|
"learning_rate": 2.9999918308948427e-05, |
|
"loss": 36.2218, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.11555555555555555, |
|
"grad_norm": 542.15234375, |
|
"learning_rate": 2.9999672581521505e-05, |
|
"loss": 33.669, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.11777777777777777, |
|
"grad_norm": 663.7468872070312, |
|
"learning_rate": 2.999926282007839e-05, |
|
"loss": 33.3195, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 237.98435974121094, |
|
"learning_rate": 2.9998689029100164e-05, |
|
"loss": 34.6775, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.12222222222222222, |
|
"grad_norm": 350.93109130859375, |
|
"learning_rate": 2.9997951214861724e-05, |
|
"loss": 32.0158, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.12444444444444444, |
|
"grad_norm": 648.1705322265625, |
|
"learning_rate": 2.999704938543168e-05, |
|
"loss": 33.583, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.12666666666666668, |
|
"grad_norm": 263.5220642089844, |
|
"learning_rate": 2.9995983550672296e-05, |
|
"loss": 33.9471, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.1288888888888889, |
|
"grad_norm": 193.79708862304688, |
|
"learning_rate": 2.9994753722239374e-05, |
|
"loss": 32.0882, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.13111111111111112, |
|
"grad_norm": 584.5958862304688, |
|
"learning_rate": 2.999335991358211e-05, |
|
"loss": 32.2817, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.13333333333333333, |
|
"grad_norm": 498.8976745605469, |
|
"learning_rate": 2.999180213994299e-05, |
|
"loss": 31.1343, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.13555555555555557, |
|
"grad_norm": 492.1926574707031, |
|
"learning_rate": 2.9990080418357563e-05, |
|
"loss": 30.703, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.13777777777777778, |
|
"grad_norm": 389.2348937988281, |
|
"learning_rate": 2.99881947676543e-05, |
|
"loss": 32.2483, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 687.1718139648438, |
|
"learning_rate": 2.9986145208454382e-05, |
|
"loss": 31.1763, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.14222222222222222, |
|
"grad_norm": 404.84326171875, |
|
"learning_rate": 2.998393176317146e-05, |
|
"loss": 31.7738, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.14444444444444443, |
|
"grad_norm": 492.9033203125, |
|
"learning_rate": 2.9981554456011407e-05, |
|
"loss": 31.7717, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.14666666666666667, |
|
"grad_norm": 393.6338195800781, |
|
"learning_rate": 2.997901331297209e-05, |
|
"loss": 30.5822, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.14888888888888888, |
|
"grad_norm": 510.1676025390625, |
|
"learning_rate": 2.9976308361843024e-05, |
|
"loss": 28.6046, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 0.1511111111111111, |
|
"grad_norm": 547.7921142578125, |
|
"learning_rate": 2.997343963220513e-05, |
|
"loss": 29.9463, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.15333333333333332, |
|
"grad_norm": 481.76092529296875, |
|
"learning_rate": 2.997040715543038e-05, |
|
"loss": 29.8005, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 0.15555555555555556, |
|
"grad_norm": 394.83935546875, |
|
"learning_rate": 2.9967210964681447e-05, |
|
"loss": 29.8433, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.15777777777777777, |
|
"grad_norm": 223.97235107421875, |
|
"learning_rate": 2.9963851094911362e-05, |
|
"loss": 30.1751, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 587.9564819335938, |
|
"learning_rate": 2.9960327582863126e-05, |
|
"loss": 28.0523, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.1622222222222222, |
|
"grad_norm": 786.5308227539062, |
|
"learning_rate": 2.9956640467069298e-05, |
|
"loss": 30.0858, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 0.16444444444444445, |
|
"grad_norm": 627.6124267578125, |
|
"learning_rate": 2.995278978785159e-05, |
|
"loss": 27.514, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.16666666666666666, |
|
"grad_norm": 135.85784912109375, |
|
"learning_rate": 2.9948775587320413e-05, |
|
"loss": 29.0652, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 0.1688888888888889, |
|
"grad_norm": 516.0145874023438, |
|
"learning_rate": 2.9944597909374416e-05, |
|
"loss": 28.7626, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.1711111111111111, |
|
"grad_norm": 381.4872131347656, |
|
"learning_rate": 2.994025679970002e-05, |
|
"loss": 30.4396, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 0.17333333333333334, |
|
"grad_norm": 612.7399291992188, |
|
"learning_rate": 2.99357523057709e-05, |
|
"loss": 26.5003, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.17555555555555555, |
|
"grad_norm": 365.5273132324219, |
|
"learning_rate": 2.9931084476847486e-05, |
|
"loss": 27.6445, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 0.17777777777777778, |
|
"grad_norm": 117.53230285644531, |
|
"learning_rate": 2.99262533639764e-05, |
|
"loss": 26.8894, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 895.5122680664062, |
|
"learning_rate": 2.9921259019989926e-05, |
|
"loss": 26.3664, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 0.18222222222222223, |
|
"grad_norm": 493.69683837890625, |
|
"learning_rate": 2.9916101499505408e-05, |
|
"loss": 25.5829, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.18444444444444444, |
|
"grad_norm": 469.6036376953125, |
|
"learning_rate": 2.9910780858924657e-05, |
|
"loss": 27.9183, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 0.18666666666666668, |
|
"grad_norm": 539.50390625, |
|
"learning_rate": 2.9905297156433357e-05, |
|
"loss": 27.7629, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.18888888888888888, |
|
"grad_norm": 127.55433654785156, |
|
"learning_rate": 2.9899650452000393e-05, |
|
"loss": 26.9212, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 0.19111111111111112, |
|
"grad_norm": 361.29010009765625, |
|
"learning_rate": 2.9893840807377214e-05, |
|
"loss": 25.828, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.19333333333333333, |
|
"grad_norm": 603.46533203125, |
|
"learning_rate": 2.988786828609718e-05, |
|
"loss": 27.1813, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 0.19555555555555557, |
|
"grad_norm": 94.64213562011719, |
|
"learning_rate": 2.988173295347481e-05, |
|
"loss": 28.3537, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.19777777777777777, |
|
"grad_norm": 1213.6317138671875, |
|
"learning_rate": 2.987543487660513e-05, |
|
"loss": 25.5299, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 504.8955993652344, |
|
"learning_rate": 2.986897412436289e-05, |
|
"loss": 29.0305, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 0.20222222222222222, |
|
"grad_norm": 734.322021484375, |
|
"learning_rate": 2.9862350767401846e-05, |
|
"loss": 28.3809, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 0.20444444444444446, |
|
"grad_norm": 1137.0435791015625, |
|
"learning_rate": 2.9855564878153972e-05, |
|
"loss": 26.6201, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 0.20666666666666667, |
|
"grad_norm": 373.8830871582031, |
|
"learning_rate": 2.984861653082866e-05, |
|
"loss": 25.7129, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 0.2088888888888889, |
|
"grad_norm": 263.8885498046875, |
|
"learning_rate": 2.9841505801411928e-05, |
|
"loss": 26.2681, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 0.2111111111111111, |
|
"grad_norm": 1805.83984375, |
|
"learning_rate": 2.983423276766557e-05, |
|
"loss": 26.6592, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 0.21333333333333335, |
|
"grad_norm": 286.2330627441406, |
|
"learning_rate": 2.982679750912632e-05, |
|
"loss": 25.0459, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 0.21555555555555556, |
|
"grad_norm": 219.3948516845703, |
|
"learning_rate": 2.9819200107104972e-05, |
|
"loss": 25.5699, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 0.21777777777777776, |
|
"grad_norm": 412.9397888183594, |
|
"learning_rate": 2.98114406446855e-05, |
|
"loss": 26.1915, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 602.8424682617188, |
|
"learning_rate": 2.9803519206724136e-05, |
|
"loss": 27.0685, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 0.2222222222222222, |
|
"grad_norm": 149.6744384765625, |
|
"learning_rate": 2.9795435879848466e-05, |
|
"loss": 24.8978, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 0.22444444444444445, |
|
"grad_norm": 339.0307312011719, |
|
"learning_rate": 2.9787190752456448e-05, |
|
"loss": 23.1352, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 0.22666666666666666, |
|
"grad_norm": 627.1898193359375, |
|
"learning_rate": 2.977878391471548e-05, |
|
"loss": 25.7614, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 0.2288888888888889, |
|
"grad_norm": 959.9122924804688, |
|
"learning_rate": 2.9770215458561394e-05, |
|
"loss": 23.909, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 0.2311111111111111, |
|
"grad_norm": 290.6165466308594, |
|
"learning_rate": 2.976148547769745e-05, |
|
"loss": 25.6165, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 0.23333333333333334, |
|
"grad_norm": 337.4861755371094, |
|
"learning_rate": 2.9752594067593318e-05, |
|
"loss": 24.7856, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 0.23555555555555555, |
|
"grad_norm": 1252.9945068359375, |
|
"learning_rate": 2.974354132548404e-05, |
|
"loss": 25.353, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 0.23777777777777778, |
|
"grad_norm": 186.39710998535156, |
|
"learning_rate": 2.973432735036895e-05, |
|
"loss": 24.7965, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 795.011962890625, |
|
"learning_rate": 2.9724952243010605e-05, |
|
"loss": 24.6118, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 0.24222222222222223, |
|
"grad_norm": 217.4955291748047, |
|
"learning_rate": 2.9715416105933675e-05, |
|
"loss": 24.6205, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 0.24444444444444444, |
|
"grad_norm": 310.7270812988281, |
|
"learning_rate": 2.970571904342383e-05, |
|
"loss": 24.1833, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 0.24666666666666667, |
|
"grad_norm": 250.29307556152344, |
|
"learning_rate": 2.969586116152659e-05, |
|
"loss": 24.082, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 0.24888888888888888, |
|
"grad_norm": 243.90106201171875, |
|
"learning_rate": 2.9685842568046167e-05, |
|
"loss": 23.5486, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 0.2511111111111111, |
|
"grad_norm": 281.5003967285156, |
|
"learning_rate": 2.967566337254431e-05, |
|
"loss": 22.6343, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 0.25333333333333335, |
|
"grad_norm": 190.99545288085938, |
|
"learning_rate": 2.9665323686339052e-05, |
|
"loss": 25.0189, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 0.25555555555555554, |
|
"grad_norm": 400.95361328125, |
|
"learning_rate": 2.9654823622503557e-05, |
|
"loss": 23.9388, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 0.2577777777777778, |
|
"grad_norm": 74.59510040283203, |
|
"learning_rate": 2.9644163295864836e-05, |
|
"loss": 24.4699, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 650.9434204101562, |
|
"learning_rate": 2.9633342823002515e-05, |
|
"loss": 22.5825, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 0.26222222222222225, |
|
"grad_norm": 359.67315673828125, |
|
"learning_rate": 2.9622362322247548e-05, |
|
"loss": 24.1618, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 0.2644444444444444, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.9611221913680935e-05, |
|
"loss": 22.4548, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 0.26666666666666666, |
|
"grad_norm": 392.0536804199219, |
|
"learning_rate": 2.9599921719132397e-05, |
|
"loss": 22.0985, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 0.2688888888888889, |
|
"grad_norm": 220.76341247558594, |
|
"learning_rate": 2.9588461862179055e-05, |
|
"loss": 22.2635, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 0.27111111111111114, |
|
"grad_norm": 179.5050048828125, |
|
"learning_rate": 2.9576842468144067e-05, |
|
"loss": 22.9824, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 0.2733333333333333, |
|
"grad_norm": 625.1077270507812, |
|
"learning_rate": 2.9565063664095265e-05, |
|
"loss": 23.0385, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 0.27555555555555555, |
|
"grad_norm": 787.576171875, |
|
"learning_rate": 2.955312557884376e-05, |
|
"loss": 23.6391, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 0.2777777777777778, |
|
"grad_norm": 287.6144714355469, |
|
"learning_rate": 2.954102834294254e-05, |
|
"loss": 22.4223, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 598.0758666992188, |
|
"learning_rate": 2.9528772088685042e-05, |
|
"loss": 22.2955, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 0.2822222222222222, |
|
"grad_norm": 567.0135498046875, |
|
"learning_rate": 2.9516356950103695e-05, |
|
"loss": 22.5473, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 0.28444444444444444, |
|
"grad_norm": 209.81381225585938, |
|
"learning_rate": 2.950378306296847e-05, |
|
"loss": 23.5631, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 0.2866666666666667, |
|
"grad_norm": 413.2209167480469, |
|
"learning_rate": 2.9491050564785384e-05, |
|
"loss": 23.1249, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 0.28888888888888886, |
|
"grad_norm": 140.22494506835938, |
|
"learning_rate": 2.9478159594794985e-05, |
|
"loss": 23.2432, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 0.2911111111111111, |
|
"grad_norm": 322.0098571777344, |
|
"learning_rate": 2.946511029397087e-05, |
|
"loss": 23.1568, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 0.29333333333333333, |
|
"grad_norm": 204.205810546875, |
|
"learning_rate": 2.945190280501809e-05, |
|
"loss": 23.9367, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 0.29555555555555557, |
|
"grad_norm": 247.4243621826172, |
|
"learning_rate": 2.943853727237164e-05, |
|
"loss": 23.2841, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 0.29777777777777775, |
|
"grad_norm": 767.0619506835938, |
|
"learning_rate": 2.9425013842194833e-05, |
|
"loss": 23.7975, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1255.4112548828125, |
|
"learning_rate": 2.9411332662377744e-05, |
|
"loss": 23.7579, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 0.3022222222222222, |
|
"grad_norm": 444.0653991699219, |
|
"learning_rate": 2.9397493882535556e-05, |
|
"loss": 22.0943, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 0.30444444444444446, |
|
"grad_norm": 362.8856506347656, |
|
"learning_rate": 2.9383497654006945e-05, |
|
"loss": 22.6397, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 0.30666666666666664, |
|
"grad_norm": 450.62237548828125, |
|
"learning_rate": 2.936934412985244e-05, |
|
"loss": 22.2143, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 0.3088888888888889, |
|
"grad_norm": 148.87391662597656, |
|
"learning_rate": 2.9355033464852697e-05, |
|
"loss": 21.7673, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 0.3111111111111111, |
|
"grad_norm": 182.1023406982422, |
|
"learning_rate": 2.9340565815506865e-05, |
|
"loss": 22.5551, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 0.31333333333333335, |
|
"grad_norm": 289.2044677734375, |
|
"learning_rate": 2.932594134003083e-05, |
|
"loss": 22.7895, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 0.31555555555555553, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.931116019835553e-05, |
|
"loss": 22.729, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 0.31777777777777777, |
|
"grad_norm": 361.7475891113281, |
|
"learning_rate": 2.9296222552125148e-05, |
|
"loss": 21.4155, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 391.5496520996094, |
|
"learning_rate": 2.928112856469539e-05, |
|
"loss": 22.2849, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 0.32222222222222224, |
|
"grad_norm": 429.3208923339844, |
|
"learning_rate": 2.9265878401131687e-05, |
|
"loss": 20.7871, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 0.3244444444444444, |
|
"grad_norm": 912.58154296875, |
|
"learning_rate": 2.9250472228207387e-05, |
|
"loss": 20.8959, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 0.32666666666666666, |
|
"grad_norm": 145.02476501464844, |
|
"learning_rate": 2.9234910214401926e-05, |
|
"loss": 22.3574, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 0.3288888888888889, |
|
"grad_norm": 313.38629150390625, |
|
"learning_rate": 2.9219192529899e-05, |
|
"loss": 22.3035, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 0.33111111111111113, |
|
"grad_norm": 416.150146484375, |
|
"learning_rate": 2.9203319346584673e-05, |
|
"loss": 22.091, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 0.3333333333333333, |
|
"grad_norm": 125.51025390625, |
|
"learning_rate": 2.9187290838045552e-05, |
|
"loss": 21.6607, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 0.33555555555555555, |
|
"grad_norm": 256.96875, |
|
"learning_rate": 2.9171107179566826e-05, |
|
"loss": 21.8178, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 0.3377777777777778, |
|
"grad_norm": 1280.6885986328125, |
|
"learning_rate": 2.91547685481304e-05, |
|
"loss": 21.1816, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 276.4981994628906, |
|
"learning_rate": 2.9138275122412927e-05, |
|
"loss": 21.1474, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 0.3422222222222222, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.9121627082783864e-05, |
|
"loss": 21.2128, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 0.34444444444444444, |
|
"grad_norm": 779.9710693359375, |
|
"learning_rate": 2.910482461130351e-05, |
|
"loss": 21.6096, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 0.3466666666666667, |
|
"grad_norm": 726.4488525390625, |
|
"learning_rate": 2.9087867891721e-05, |
|
"loss": 20.5737, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 0.3488888888888889, |
|
"grad_norm": 867.9049682617188, |
|
"learning_rate": 2.90707571094723e-05, |
|
"loss": 21.431, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 0.3511111111111111, |
|
"grad_norm": 1406.6778564453125, |
|
"learning_rate": 2.905349245167819e-05, |
|
"loss": 22.8944, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 0.35333333333333333, |
|
"grad_norm": 30.834983825683594, |
|
"learning_rate": 2.903607410714219e-05, |
|
"loss": 20.6775, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 0.35555555555555557, |
|
"grad_norm": 307.98822021484375, |
|
"learning_rate": 2.9018502266348537e-05, |
|
"loss": 19.7868, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 0.35777777777777775, |
|
"grad_norm": 897.4186401367188, |
|
"learning_rate": 2.900077712146006e-05, |
|
"loss": 22.5855, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 203.12339782714844, |
|
"learning_rate": 2.8982898866316107e-05, |
|
"loss": 21.1752, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 0.3622222222222222, |
|
"grad_norm": 220.3880157470703, |
|
"learning_rate": 2.8964867696430412e-05, |
|
"loss": 21.3629, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 0.36444444444444446, |
|
"grad_norm": 46.697349548339844, |
|
"learning_rate": 2.8946683808988956e-05, |
|
"loss": 21.3887, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 0.36666666666666664, |
|
"grad_norm": 179.70164489746094, |
|
"learning_rate": 2.892834740284782e-05, |
|
"loss": 21.825, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 0.3688888888888889, |
|
"grad_norm": 518.6677856445312, |
|
"learning_rate": 2.8909858678531007e-05, |
|
"loss": 20.7174, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 0.3711111111111111, |
|
"grad_norm": 643.6600952148438, |
|
"learning_rate": 2.889121783822824e-05, |
|
"loss": 22.1913, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 0.37333333333333335, |
|
"grad_norm": 262.9464111328125, |
|
"learning_rate": 2.887242508579277e-05, |
|
"loss": 22.0347, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 0.37555555555555553, |
|
"grad_norm": 1396.894775390625, |
|
"learning_rate": 2.8853480626739115e-05, |
|
"loss": 20.4351, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 0.37777777777777777, |
|
"grad_norm": 597.5517578125, |
|
"learning_rate": 2.883438466824085e-05, |
|
"loss": 19.2972, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 238.96986389160156, |
|
"learning_rate": 2.8815137419128317e-05, |
|
"loss": 20.8544, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 0.38222222222222224, |
|
"grad_norm": 118.68024444580078, |
|
"learning_rate": 2.8795739089886353e-05, |
|
"loss": 20.0097, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 0.3844444444444444, |
|
"grad_norm": 281.3915100097656, |
|
"learning_rate": 2.877618989265197e-05, |
|
"loss": 19.3276, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 0.38666666666666666, |
|
"grad_norm": 412.5798645019531, |
|
"learning_rate": 2.8756490041212067e-05, |
|
"loss": 20.9107, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 0.3888888888888889, |
|
"grad_norm": 897.23095703125, |
|
"learning_rate": 2.8736639751001056e-05, |
|
"loss": 21.3243, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 0.39111111111111113, |
|
"grad_norm": 1561.7535400390625, |
|
"learning_rate": 2.871663923909853e-05, |
|
"loss": 20.2997, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 0.3933333333333333, |
|
"grad_norm": 219.94825744628906, |
|
"learning_rate": 2.8696488724226884e-05, |
|
"loss": 19.0194, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 0.39555555555555555, |
|
"grad_norm": 175.09353637695312, |
|
"learning_rate": 2.8676188426748923e-05, |
|
"loss": 20.7055, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 0.3977777777777778, |
|
"grad_norm": 282.50933837890625, |
|
"learning_rate": 2.8655738568665447e-05, |
|
"loss": 19.1337, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 60.395172119140625, |
|
"learning_rate": 2.863513937361283e-05, |
|
"loss": 20.728, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 0.4022222222222222, |
|
"grad_norm": 314.94561767578125, |
|
"learning_rate": 2.861439106686056e-05, |
|
"loss": 19.575, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 0.40444444444444444, |
|
"grad_norm": 473.822998046875, |
|
"learning_rate": 2.8593493875308805e-05, |
|
"loss": 20.2208, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 0.4066666666666667, |
|
"grad_norm": 412.5682373046875, |
|
"learning_rate": 2.8572448027485896e-05, |
|
"loss": 19.7487, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 0.4088888888888889, |
|
"grad_norm": 155.67567443847656, |
|
"learning_rate": 2.855125375354586e-05, |
|
"loss": 18.5899, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 0.4111111111111111, |
|
"grad_norm": 401.43621826171875, |
|
"learning_rate": 2.8529911285265876e-05, |
|
"loss": 21.001, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 0.41333333333333333, |
|
"grad_norm": 379.79302978515625, |
|
"learning_rate": 2.8508420856043763e-05, |
|
"loss": 19.6731, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 0.41555555555555557, |
|
"grad_norm": 224.41383361816406, |
|
"learning_rate": 2.8486782700895407e-05, |
|
"loss": 19.2887, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 0.4177777777777778, |
|
"grad_norm": 164.6722412109375, |
|
"learning_rate": 2.8464997056452206e-05, |
|
"loss": 20.0013, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 241.1973876953125, |
|
"learning_rate": 2.8443064160958483e-05, |
|
"loss": 18.3981, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 0.4222222222222222, |
|
"grad_norm": 790.732421875, |
|
"learning_rate": 2.8420984254268863e-05, |
|
"loss": 18.5947, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 0.42444444444444446, |
|
"grad_norm": 446.4692687988281, |
|
"learning_rate": 2.8398757577845665e-05, |
|
"loss": 19.8438, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 0.4266666666666667, |
|
"grad_norm": 17.384523391723633, |
|
"learning_rate": 2.837638437475627e-05, |
|
"loss": 19.1518, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 0.4288888888888889, |
|
"grad_norm": 292.8326416015625, |
|
"learning_rate": 2.8353864889670442e-05, |
|
"loss": 18.9518, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 0.4311111111111111, |
|
"grad_norm": 1216.1114501953125, |
|
"learning_rate": 2.8331199368857656e-05, |
|
"loss": 19.3502, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 0.43333333333333335, |
|
"grad_norm": 256.9949035644531, |
|
"learning_rate": 2.830838806018442e-05, |
|
"loss": 18.1643, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 0.43555555555555553, |
|
"grad_norm": 203.0587615966797, |
|
"learning_rate": 2.8285431213111548e-05, |
|
"loss": 19.173, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 0.43777777777777777, |
|
"grad_norm": 290.00775146484375, |
|
"learning_rate": 2.826232907869145e-05, |
|
"loss": 20.2496, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 437.4803771972656, |
|
"learning_rate": 2.823908190956535e-05, |
|
"loss": 19.568, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 0.44222222222222224, |
|
"grad_norm": 79.48589324951172, |
|
"learning_rate": 2.821568995996058e-05, |
|
"loss": 18.2379, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 252.00978088378906, |
|
"learning_rate": 2.8192153485687752e-05, |
|
"loss": 19.322, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 0.44666666666666666, |
|
"grad_norm": 220.2042999267578, |
|
"learning_rate": 2.8168472744137977e-05, |
|
"loss": 18.7556, |
|
"step": 100500 |
|
}, |
|
{ |
|
"epoch": 0.4488888888888889, |
|
"grad_norm": 260.3736572265625, |
|
"learning_rate": 2.814464799428004e-05, |
|
"loss": 18.9124, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 0.45111111111111113, |
|
"grad_norm": 593.2783203125, |
|
"learning_rate": 2.8120679496657602e-05, |
|
"loss": 19.0002, |
|
"step": 101500 |
|
}, |
|
{ |
|
"epoch": 0.4533333333333333, |
|
"grad_norm": 1167.1844482421875, |
|
"learning_rate": 2.80965675133863e-05, |
|
"loss": 19.2148, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 0.45555555555555555, |
|
"grad_norm": 15.313830375671387, |
|
"learning_rate": 2.8072312308150934e-05, |
|
"loss": 18.2168, |
|
"step": 102500 |
|
}, |
|
{ |
|
"epoch": 0.4577777777777778, |
|
"grad_norm": 200.6254119873047, |
|
"learning_rate": 2.8047914146202533e-05, |
|
"loss": 19.3346, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 426.6332702636719, |
|
"learning_rate": 2.8023373294355492e-05, |
|
"loss": 17.3282, |
|
"step": 103500 |
|
}, |
|
{ |
|
"epoch": 0.4622222222222222, |
|
"grad_norm": 432.8354187011719, |
|
"learning_rate": 2.799869002098463e-05, |
|
"loss": 19.5463, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 0.46444444444444444, |
|
"grad_norm": 298.2032775878906, |
|
"learning_rate": 2.7973864596022273e-05, |
|
"loss": 18.7725, |
|
"step": 104500 |
|
}, |
|
{ |
|
"epoch": 0.4666666666666667, |
|
"grad_norm": 403.9524841308594, |
|
"learning_rate": 2.7948897290955293e-05, |
|
"loss": 19.5364, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 0.4688888888888889, |
|
"grad_norm": 51.500240325927734, |
|
"learning_rate": 2.7923788378822135e-05, |
|
"loss": 18.9839, |
|
"step": 105500 |
|
}, |
|
{ |
|
"epoch": 0.4711111111111111, |
|
"grad_norm": 521.7046508789062, |
|
"learning_rate": 2.7898538134209837e-05, |
|
"loss": 18.7831, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 0.47333333333333333, |
|
"grad_norm": 105.23808288574219, |
|
"learning_rate": 2.787314683325104e-05, |
|
"loss": 18.1615, |
|
"step": 106500 |
|
}, |
|
{ |
|
"epoch": 0.47555555555555556, |
|
"grad_norm": 332.540283203125, |
|
"learning_rate": 2.7847614753620926e-05, |
|
"loss": 19.3657, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 0.4777777777777778, |
|
"grad_norm": 901.9822387695312, |
|
"learning_rate": 2.7821942174534243e-05, |
|
"loss": 18.9534, |
|
"step": 107500 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 437.5888977050781, |
|
"learning_rate": 2.779612937674219e-05, |
|
"loss": 18.7374, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 0.4822222222222222, |
|
"grad_norm": 438.2900390625, |
|
"learning_rate": 2.7770176642529397e-05, |
|
"loss": 20.7495, |
|
"step": 108500 |
|
}, |
|
{ |
|
"epoch": 0.48444444444444446, |
|
"grad_norm": 369.8582763671875, |
|
"learning_rate": 2.7744084255710804e-05, |
|
"loss": 17.091, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 0.4866666666666667, |
|
"grad_norm": 734.362548828125, |
|
"learning_rate": 2.7717852501628574e-05, |
|
"loss": 19.0611, |
|
"step": 109500 |
|
}, |
|
{ |
|
"epoch": 0.4888888888888889, |
|
"grad_norm": 425.8333435058594, |
|
"learning_rate": 2.769148166714897e-05, |
|
"loss": 18.6956, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 0.4911111111111111, |
|
"grad_norm": 273.7350158691406, |
|
"learning_rate": 2.76649720406592e-05, |
|
"loss": 18.9581, |
|
"step": 110500 |
|
}, |
|
{ |
|
"epoch": 0.49333333333333335, |
|
"grad_norm": 501.64019775390625, |
|
"learning_rate": 2.763832391206431e-05, |
|
"loss": 17.5245, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 0.4955555555555556, |
|
"grad_norm": 1036.9017333984375, |
|
"learning_rate": 2.7611537572783953e-05, |
|
"loss": 17.9539, |
|
"step": 111500 |
|
}, |
|
{ |
|
"epoch": 0.49777777777777776, |
|
"grad_norm": 63.28369140625, |
|
"learning_rate": 2.7584613315749247e-05, |
|
"loss": 17.5569, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 144.62741088867188, |
|
"learning_rate": 2.7557551435399554e-05, |
|
"loss": 18.3981, |
|
"step": 112500 |
|
}, |
|
{ |
|
"epoch": 0.5022222222222222, |
|
"grad_norm": 50.069549560546875, |
|
"learning_rate": 2.753035222767926e-05, |
|
"loss": 18.6216, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 0.5044444444444445, |
|
"grad_norm": 733.9398193359375, |
|
"learning_rate": 2.7503015990034543e-05, |
|
"loss": 17.1969, |
|
"step": 113500 |
|
}, |
|
{ |
|
"epoch": 0.5066666666666667, |
|
"grad_norm": 444.6294250488281, |
|
"learning_rate": 2.747554302141012e-05, |
|
"loss": 18.0202, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 0.5088888888888888, |
|
"grad_norm": 59.344337463378906, |
|
"learning_rate": 2.7447933622245974e-05, |
|
"loss": 17.6973, |
|
"step": 114500 |
|
}, |
|
{ |
|
"epoch": 0.5111111111111111, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.742018809447407e-05, |
|
"loss": 18.7046, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 0.5133333333333333, |
|
"grad_norm": 421.5881652832031, |
|
"learning_rate": 2.7392306741515056e-05, |
|
"loss": 17.8755, |
|
"step": 115500 |
|
}, |
|
{ |
|
"epoch": 0.5155555555555555, |
|
"grad_norm": 292.31060791015625, |
|
"learning_rate": 2.736428986827494e-05, |
|
"loss": 18.5183, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 0.5177777777777778, |
|
"grad_norm": 448.3764343261719, |
|
"learning_rate": 2.7336137781141758e-05, |
|
"loss": 18.2446, |
|
"step": 116500 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 312.8506164550781, |
|
"learning_rate": 2.730785078798222e-05, |
|
"loss": 17.2551, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 0.5222222222222223, |
|
"grad_norm": 198.42645263671875, |
|
"learning_rate": 2.7279429198138368e-05, |
|
"loss": 17.8948, |
|
"step": 117500 |
|
}, |
|
{ |
|
"epoch": 0.5244444444444445, |
|
"grad_norm": 148.22213745117188, |
|
"learning_rate": 2.7250873322424135e-05, |
|
"loss": 17.4501, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 0.5266666666666666, |
|
"grad_norm": 537.1702270507812, |
|
"learning_rate": 2.7222183473122015e-05, |
|
"loss": 18.9861, |
|
"step": 118500 |
|
}, |
|
{ |
|
"epoch": 0.5288888888888889, |
|
"grad_norm": 363.04833984375, |
|
"learning_rate": 2.71933599639796e-05, |
|
"loss": 18.2579, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 0.5311111111111111, |
|
"grad_norm": 550.2840576171875, |
|
"learning_rate": 2.7164403110206168e-05, |
|
"loss": 17.3876, |
|
"step": 119500 |
|
}, |
|
{ |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 99.29381561279297, |
|
"learning_rate": 2.713531322846923e-05, |
|
"loss": 18.4671, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 0.5355555555555556, |
|
"grad_norm": 267.3313293457031, |
|
"learning_rate": 2.7106090636891077e-05, |
|
"loss": 19.6639, |
|
"step": 120500 |
|
}, |
|
{ |
|
"epoch": 0.5377777777777778, |
|
"grad_norm": 356.0230407714844, |
|
"learning_rate": 2.7076735655045283e-05, |
|
"loss": 18.553, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 72.5117416381836, |
|
"learning_rate": 2.7047248603953233e-05, |
|
"loss": 16.9581, |
|
"step": 121500 |
|
}, |
|
{ |
|
"epoch": 0.5422222222222223, |
|
"grad_norm": 283.059326171875, |
|
"learning_rate": 2.701762980608059e-05, |
|
"loss": 17.3513, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 0.5444444444444444, |
|
"grad_norm": 455.74267578125, |
|
"learning_rate": 2.698787958533378e-05, |
|
"loss": 18.527, |
|
"step": 122500 |
|
}, |
|
{ |
|
"epoch": 0.5466666666666666, |
|
"grad_norm": 264.24700927734375, |
|
"learning_rate": 2.6957998267056454e-05, |
|
"loss": 18.6227, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 0.5488888888888889, |
|
"grad_norm": 563.1781005859375, |
|
"learning_rate": 2.692798617802592e-05, |
|
"loss": 17.3232, |
|
"step": 123500 |
|
}, |
|
{ |
|
"epoch": 0.5511111111111111, |
|
"grad_norm": 488.3459777832031, |
|
"learning_rate": 2.6897843646449575e-05, |
|
"loss": 17.4262, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 0.5533333333333333, |
|
"grad_norm": 119.61053466796875, |
|
"learning_rate": 2.6867571001961312e-05, |
|
"loss": 17.022, |
|
"step": 124500 |
|
}, |
|
{ |
|
"epoch": 0.5555555555555556, |
|
"grad_norm": 239.64756774902344, |
|
"learning_rate": 2.683716857561793e-05, |
|
"loss": 17.9908, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 0.5577777777777778, |
|
"grad_norm": 418.17547607421875, |
|
"learning_rate": 2.6806636699895484e-05, |
|
"loss": 18.6269, |
|
"step": 125500 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 551.5980224609375, |
|
"learning_rate": 2.677597570868568e-05, |
|
"loss": 18.3972, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 0.5622222222222222, |
|
"grad_norm": 304.7643127441406, |
|
"learning_rate": 2.6745185937292207e-05, |
|
"loss": 18.2829, |
|
"step": 126500 |
|
}, |
|
{ |
|
"epoch": 0.5644444444444444, |
|
"grad_norm": 144.07781982421875, |
|
"learning_rate": 2.6714267722427064e-05, |
|
"loss": 18.218, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 0.5666666666666667, |
|
"grad_norm": 353.9224548339844, |
|
"learning_rate": 2.66832214022069e-05, |
|
"loss": 18.1345, |
|
"step": 127500 |
|
}, |
|
{ |
|
"epoch": 0.5688888888888889, |
|
"grad_norm": 197.71298217773438, |
|
"learning_rate": 2.66520473161493e-05, |
|
"loss": 17.18, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 0.5711111111111111, |
|
"grad_norm": 783.7542114257812, |
|
"learning_rate": 2.6620745805169076e-05, |
|
"loss": 16.7577, |
|
"step": 128500 |
|
}, |
|
{ |
|
"epoch": 0.5733333333333334, |
|
"grad_norm": 331.999755859375, |
|
"learning_rate": 2.6589317211574535e-05, |
|
"loss": 16.8293, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 0.5755555555555556, |
|
"grad_norm": 386.9215393066406, |
|
"learning_rate": 2.6557761879063737e-05, |
|
"loss": 16.7488, |
|
"step": 129500 |
|
}, |
|
{ |
|
"epoch": 0.5777777777777777, |
|
"grad_norm": 670.8016357421875, |
|
"learning_rate": 2.652608015272075e-05, |
|
"loss": 16.6633, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 130.0618133544922, |
|
"learning_rate": 2.6494272379011853e-05, |
|
"loss": 17.5815, |
|
"step": 130500 |
|
}, |
|
{ |
|
"epoch": 0.5822222222222222, |
|
"grad_norm": 363.4728698730469, |
|
"learning_rate": 2.6462338905781766e-05, |
|
"loss": 17.5676, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 0.5844444444444444, |
|
"grad_norm": 194.19207763671875, |
|
"learning_rate": 2.6430280082249832e-05, |
|
"loss": 19.0677, |
|
"step": 131500 |
|
}, |
|
{ |
|
"epoch": 0.5866666666666667, |
|
"grad_norm": 478.40692138671875, |
|
"learning_rate": 2.6398096259006212e-05, |
|
"loss": 16.4278, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 0.5888888888888889, |
|
"grad_norm": 673.5048828125, |
|
"learning_rate": 2.636578778800804e-05, |
|
"loss": 17.7745, |
|
"step": 132500 |
|
}, |
|
{ |
|
"epoch": 0.5911111111111111, |
|
"grad_norm": 208.15098571777344, |
|
"learning_rate": 2.633335502257558e-05, |
|
"loss": 17.4536, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 0.5933333333333334, |
|
"grad_norm": 1426.62109375, |
|
"learning_rate": 2.6300798317388357e-05, |
|
"loss": 17.152, |
|
"step": 133500 |
|
}, |
|
{ |
|
"epoch": 0.5955555555555555, |
|
"grad_norm": 253.73455810546875, |
|
"learning_rate": 2.626811802848128e-05, |
|
"loss": 16.4736, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 0.5977777777777777, |
|
"grad_norm": 890.9122924804688, |
|
"learning_rate": 2.623531451324076e-05, |
|
"loss": 17.913, |
|
"step": 134500 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 880.38671875, |
|
"learning_rate": 2.6202388130400772e-05, |
|
"loss": 17.0165, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 0.6022222222222222, |
|
"grad_norm": 284.1332702636719, |
|
"learning_rate": 2.616933924003898e-05, |
|
"loss": 17.0189, |
|
"step": 135500 |
|
}, |
|
{ |
|
"epoch": 0.6044444444444445, |
|
"grad_norm": 23.394821166992188, |
|
"learning_rate": 2.6136168203572742e-05, |
|
"loss": 17.2017, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 0.6066666666666667, |
|
"grad_norm": 790.5655517578125, |
|
"learning_rate": 2.61028753837552e-05, |
|
"loss": 15.7028, |
|
"step": 136500 |
|
}, |
|
{ |
|
"epoch": 0.6088888888888889, |
|
"grad_norm": 196.9662628173828, |
|
"learning_rate": 2.6069461144671298e-05, |
|
"loss": 16.4864, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 0.6111111111111112, |
|
"grad_norm": 178.7125244140625, |
|
"learning_rate": 2.6035925851733808e-05, |
|
"loss": 17.2559, |
|
"step": 137500 |
|
}, |
|
{ |
|
"epoch": 0.6133333333333333, |
|
"grad_norm": 402.0807800292969, |
|
"learning_rate": 2.600226987167931e-05, |
|
"loss": 17.2757, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 0.6155555555555555, |
|
"grad_norm": 252.41526794433594, |
|
"learning_rate": 2.5968493572564218e-05, |
|
"loss": 16.8407, |
|
"step": 138500 |
|
}, |
|
{ |
|
"epoch": 0.6177777777777778, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.593459732376072e-05, |
|
"loss": 16.4473, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 324.2782287597656, |
|
"learning_rate": 2.590058149595277e-05, |
|
"loss": 17.0955, |
|
"step": 139500 |
|
}, |
|
{ |
|
"epoch": 0.6222222222222222, |
|
"grad_norm": 259.27532958984375, |
|
"learning_rate": 2.5866446461132007e-05, |
|
"loss": 17.8668, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 0.6244444444444445, |
|
"grad_norm": 504.20550537109375, |
|
"learning_rate": 2.5832192592593707e-05, |
|
"loss": 18.1582, |
|
"step": 140500 |
|
}, |
|
{ |
|
"epoch": 0.6266666666666667, |
|
"grad_norm": 464.8078918457031, |
|
"learning_rate": 2.5797820264932682e-05, |
|
"loss": 16.0802, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 0.6288888888888889, |
|
"grad_norm": 294.2264099121094, |
|
"learning_rate": 2.5763329854039204e-05, |
|
"loss": 16.0784, |
|
"step": 141500 |
|
}, |
|
{ |
|
"epoch": 0.6311111111111111, |
|
"grad_norm": 212.64166259765625, |
|
"learning_rate": 2.572872173709488e-05, |
|
"loss": 16.1939, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 0.6333333333333333, |
|
"grad_norm": 313.9952087402344, |
|
"learning_rate": 2.5693996292568535e-05, |
|
"loss": 16.6863, |
|
"step": 142500 |
|
}, |
|
{ |
|
"epoch": 0.6355555555555555, |
|
"grad_norm": 350.9505615234375, |
|
"learning_rate": 2.565915390021206e-05, |
|
"loss": 15.5249, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 0.6377777777777778, |
|
"grad_norm": 113.72864532470703, |
|
"learning_rate": 2.562419494105628e-05, |
|
"loss": 17.4712, |
|
"step": 143500 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 439.85784912109375, |
|
"learning_rate": 2.558911979740677e-05, |
|
"loss": 16.1441, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 0.6422222222222222, |
|
"grad_norm": 107.58014678955078, |
|
"learning_rate": 2.5553928852839686e-05, |
|
"loss": 17.8531, |
|
"step": 144500 |
|
}, |
|
{ |
|
"epoch": 0.6444444444444445, |
|
"grad_norm": 314.7883605957031, |
|
"learning_rate": 2.5518622492197558e-05, |
|
"loss": 16.5554, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 0.6466666666666666, |
|
"grad_norm": 146.2752227783203, |
|
"learning_rate": 2.5483201101585085e-05, |
|
"loss": 17.0876, |
|
"step": 145500 |
|
}, |
|
{ |
|
"epoch": 0.6488888888888888, |
|
"grad_norm": 493.06488037109375, |
|
"learning_rate": 2.544766506836492e-05, |
|
"loss": 16.4471, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 0.6511111111111111, |
|
"grad_norm": 331.6954040527344, |
|
"learning_rate": 2.5412014781153433e-05, |
|
"loss": 16.6836, |
|
"step": 146500 |
|
}, |
|
{ |
|
"epoch": 0.6533333333333333, |
|
"grad_norm": 324.4432373046875, |
|
"learning_rate": 2.537625062981645e-05, |
|
"loss": 16.9327, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 0.6555555555555556, |
|
"grad_norm": 447.0750732421875, |
|
"learning_rate": 2.5340373005465007e-05, |
|
"loss": 16.6021, |
|
"step": 147500 |
|
}, |
|
{ |
|
"epoch": 0.6577777777777778, |
|
"grad_norm": 74.82227325439453, |
|
"learning_rate": 2.530438230045105e-05, |
|
"loss": 16.6877, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 408.71380615234375, |
|
"learning_rate": 2.5268278908363157e-05, |
|
"loss": 15.4423, |
|
"step": 148500 |
|
}, |
|
{ |
|
"epoch": 0.6622222222222223, |
|
"grad_norm": 434.0395812988281, |
|
"learning_rate": 2.523206322402225e-05, |
|
"loss": 16.9507, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 0.6644444444444444, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.5195735643477244e-05, |
|
"loss": 17.0505, |
|
"step": 149500 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 744.4578857421875, |
|
"learning_rate": 2.5159296564000744e-05, |
|
"loss": 16.4468, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 0.6688888888888889, |
|
"grad_norm": 203.68789672851562, |
|
"learning_rate": 2.5122746384084683e-05, |
|
"loss": 15.6102, |
|
"step": 150500 |
|
}, |
|
{ |
|
"epoch": 0.6711111111111111, |
|
"grad_norm": 304.8150329589844, |
|
"learning_rate": 2.5086085503435973e-05, |
|
"loss": 16.5682, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 0.6733333333333333, |
|
"grad_norm": 212.24891662597656, |
|
"learning_rate": 2.504931432297213e-05, |
|
"loss": 16.6716, |
|
"step": 151500 |
|
}, |
|
{ |
|
"epoch": 0.6755555555555556, |
|
"grad_norm": 143.3702392578125, |
|
"learning_rate": 2.5012433244816894e-05, |
|
"loss": 17.2561, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 0.6777777777777778, |
|
"grad_norm": 82.70915985107422, |
|
"learning_rate": 2.4975442672295827e-05, |
|
"loss": 17.7661, |
|
"step": 152500 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 81.59647369384766, |
|
"learning_rate": 2.4938343009931908e-05, |
|
"loss": 15.6807, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 0.6822222222222222, |
|
"grad_norm": 483.339111328125, |
|
"learning_rate": 2.4901134663441088e-05, |
|
"loss": 16.8148, |
|
"step": 153500 |
|
}, |
|
{ |
|
"epoch": 0.6844444444444444, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.4863818039727895e-05, |
|
"loss": 17.1794, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 0.6866666666666666, |
|
"grad_norm": 211.79966735839844, |
|
"learning_rate": 2.482639354688094e-05, |
|
"loss": 15.5973, |
|
"step": 154500 |
|
}, |
|
{ |
|
"epoch": 0.6888888888888889, |
|
"grad_norm": 242.6669464111328, |
|
"learning_rate": 2.4788861594168485e-05, |
|
"loss": 16.9753, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 0.6911111111111111, |
|
"grad_norm": 186.95126342773438, |
|
"learning_rate": 2.475122259203395e-05, |
|
"loss": 15.0561, |
|
"step": 155500 |
|
}, |
|
{ |
|
"epoch": 0.6933333333333334, |
|
"grad_norm": 332.6864929199219, |
|
"learning_rate": 2.471347695209143e-05, |
|
"loss": 16.4118, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 0.6955555555555556, |
|
"grad_norm": 373.36944580078125, |
|
"learning_rate": 2.4675625087121204e-05, |
|
"loss": 16.9823, |
|
"step": 156500 |
|
}, |
|
{ |
|
"epoch": 0.6977777777777778, |
|
"grad_norm": 61.25292205810547, |
|
"learning_rate": 2.4637667411065197e-05, |
|
"loss": 16.2012, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 549.8672485351562, |
|
"learning_rate": 2.459960433902247e-05, |
|
"loss": 17.6019, |
|
"step": 157500 |
|
}, |
|
{ |
|
"epoch": 0.7022222222222222, |
|
"grad_norm": 478.5077209472656, |
|
"learning_rate": 2.4561436287244685e-05, |
|
"loss": 17.6805, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 0.7044444444444444, |
|
"grad_norm": 218.25418090820312, |
|
"learning_rate": 2.4523163673131538e-05, |
|
"loss": 15.3333, |
|
"step": 158500 |
|
}, |
|
{ |
|
"epoch": 0.7066666666666667, |
|
"grad_norm": 383.55767822265625, |
|
"learning_rate": 2.4484786915226213e-05, |
|
"loss": 16.3707, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 0.7088888888888889, |
|
"grad_norm": 729.36474609375, |
|
"learning_rate": 2.444630643321078e-05, |
|
"loss": 15.4495, |
|
"step": 159500 |
|
}, |
|
{ |
|
"epoch": 0.7111111111111111, |
|
"grad_norm": 578.4398193359375, |
|
"learning_rate": 2.4407722647901624e-05, |
|
"loss": 17.7177, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 0.7133333333333334, |
|
"grad_norm": 284.87823486328125, |
|
"learning_rate": 2.4369035981244836e-05, |
|
"loss": 16.7006, |
|
"step": 160500 |
|
}, |
|
{ |
|
"epoch": 0.7155555555555555, |
|
"grad_norm": 287.9507751464844, |
|
"learning_rate": 2.4330246856311613e-05, |
|
"loss": 16.7623, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 0.7177777777777777, |
|
"grad_norm": 518.5828857421875, |
|
"learning_rate": 2.429135569729361e-05, |
|
"loss": 18.6743, |
|
"step": 161500 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 741.138916015625, |
|
"learning_rate": 2.42523629294983e-05, |
|
"loss": 15.989, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 0.7222222222222222, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.4213268979344362e-05, |
|
"loss": 16.102, |
|
"step": 162500 |
|
}, |
|
{ |
|
"epoch": 0.7244444444444444, |
|
"grad_norm": 358.8752746582031, |
|
"learning_rate": 2.417407427435696e-05, |
|
"loss": 15.923, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 0.7266666666666667, |
|
"grad_norm": 570.9427490234375, |
|
"learning_rate": 2.4134779243163105e-05, |
|
"loss": 16.5887, |
|
"step": 163500 |
|
}, |
|
{ |
|
"epoch": 0.7288888888888889, |
|
"grad_norm": 435.3963928222656, |
|
"learning_rate": 2.409538431548697e-05, |
|
"loss": 15.2045, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 0.7311111111111112, |
|
"grad_norm": 298.369140625, |
|
"learning_rate": 2.405588992214517e-05, |
|
"loss": 16.1364, |
|
"step": 164500 |
|
}, |
|
{ |
|
"epoch": 0.7333333333333333, |
|
"grad_norm": 161.4807586669922, |
|
"learning_rate": 2.4016296495042065e-05, |
|
"loss": 16.3397, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 0.7355555555555555, |
|
"grad_norm": 450.2773742675781, |
|
"learning_rate": 2.3976604467165035e-05, |
|
"loss": 14.8856, |
|
"step": 165500 |
|
}, |
|
{ |
|
"epoch": 0.7377777777777778, |
|
"grad_norm": 62.63951110839844, |
|
"learning_rate": 2.3936814272579718e-05, |
|
"loss": 16.1214, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 295.8753662109375, |
|
"learning_rate": 2.389692634642533e-05, |
|
"loss": 16.7177, |
|
"step": 166500 |
|
}, |
|
{ |
|
"epoch": 0.7422222222222222, |
|
"grad_norm": 83.56742858886719, |
|
"learning_rate": 2.385694112490983e-05, |
|
"loss": 16.233, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 0.7444444444444445, |
|
"grad_norm": 859.1819458007812, |
|
"learning_rate": 2.381685904530519e-05, |
|
"loss": 16.7252, |
|
"step": 167500 |
|
}, |
|
{ |
|
"epoch": 0.7466666666666667, |
|
"grad_norm": 414.3497009277344, |
|
"learning_rate": 2.377668054594262e-05, |
|
"loss": 16.0818, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 0.7488888888888889, |
|
"grad_norm": 291.54498291015625, |
|
"learning_rate": 2.373640606620775e-05, |
|
"loss": 14.5691, |
|
"step": 168500 |
|
}, |
|
{ |
|
"epoch": 0.7511111111111111, |
|
"grad_norm": 594.7430419921875, |
|
"learning_rate": 2.369603604653583e-05, |
|
"loss": 16.9945, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 0.7533333333333333, |
|
"grad_norm": 202.13864135742188, |
|
"learning_rate": 2.3655570928406937e-05, |
|
"loss": 15.3943, |
|
"step": 169500 |
|
}, |
|
{ |
|
"epoch": 0.7555555555555555, |
|
"grad_norm": 212.4605712890625, |
|
"learning_rate": 2.361501115434112e-05, |
|
"loss": 16.8734, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 0.7577777777777778, |
|
"grad_norm": 414.1224060058594, |
|
"learning_rate": 2.357435716789356e-05, |
|
"loss": 15.8502, |
|
"step": 170500 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 92.9588394165039, |
|
"learning_rate": 2.3533609413649745e-05, |
|
"loss": 16.2583, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 0.7622222222222222, |
|
"grad_norm": 308.7859802246094, |
|
"learning_rate": 2.349276833722059e-05, |
|
"loss": 16.0059, |
|
"step": 171500 |
|
}, |
|
{ |
|
"epoch": 0.7644444444444445, |
|
"grad_norm": 437.89178466796875, |
|
"learning_rate": 2.345183438523756e-05, |
|
"loss": 16.7771, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 0.7666666666666667, |
|
"grad_norm": 28.078920364379883, |
|
"learning_rate": 2.3410808005347798e-05, |
|
"loss": 17.1159, |
|
"step": 172500 |
|
}, |
|
{ |
|
"epoch": 0.7688888888888888, |
|
"grad_norm": 243.4501495361328, |
|
"learning_rate": 2.336968964620922e-05, |
|
"loss": 17.4442, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 0.7711111111111111, |
|
"grad_norm": 873.5339965820312, |
|
"learning_rate": 2.3328479757485615e-05, |
|
"loss": 16.389, |
|
"step": 173500 |
|
}, |
|
{ |
|
"epoch": 0.7733333333333333, |
|
"grad_norm": 487.0278015136719, |
|
"learning_rate": 2.328717878984172e-05, |
|
"loss": 15.1246, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 0.7755555555555556, |
|
"grad_norm": 1256.6805419921875, |
|
"learning_rate": 2.32457871949383e-05, |
|
"loss": 16.0509, |
|
"step": 174500 |
|
}, |
|
{ |
|
"epoch": 0.7777777777777778, |
|
"grad_norm": 437.3548278808594, |
|
"learning_rate": 2.320430542542721e-05, |
|
"loss": 14.2762, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 50.979103088378906, |
|
"learning_rate": 2.3162733934946437e-05, |
|
"loss": 15.7425, |
|
"step": 175500 |
|
}, |
|
{ |
|
"epoch": 0.7822222222222223, |
|
"grad_norm": 461.4090270996094, |
|
"learning_rate": 2.3121073178115136e-05, |
|
"loss": 17.1488, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 0.7844444444444445, |
|
"grad_norm": 163.63095092773438, |
|
"learning_rate": 2.307932361052867e-05, |
|
"loss": 14.9277, |
|
"step": 176500 |
|
}, |
|
{ |
|
"epoch": 0.7866666666666666, |
|
"grad_norm": 349.4720458984375, |
|
"learning_rate": 2.3037485688753623e-05, |
|
"loss": 15.1278, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 0.7888888888888889, |
|
"grad_norm": 266.4578857421875, |
|
"learning_rate": 2.2995559870322797e-05, |
|
"loss": 14.9445, |
|
"step": 177500 |
|
}, |
|
{ |
|
"epoch": 0.7911111111111111, |
|
"grad_norm": 259.8016357421875, |
|
"learning_rate": 2.2953546613730237e-05, |
|
"loss": 15.8992, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 0.7933333333333333, |
|
"grad_norm": 302.3138732910156, |
|
"learning_rate": 2.2911446378426177e-05, |
|
"loss": 16.151, |
|
"step": 178500 |
|
}, |
|
{ |
|
"epoch": 0.7955555555555556, |
|
"grad_norm": 302.546142578125, |
|
"learning_rate": 2.286925962481205e-05, |
|
"loss": 15.9711, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 0.7977777777777778, |
|
"grad_norm": 161.2322998046875, |
|
"learning_rate": 2.282698681423543e-05, |
|
"loss": 15.3818, |
|
"step": 179500 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 338.44873046875, |
|
"learning_rate": 2.2784628408985005e-05, |
|
"loss": 16.7231, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 0.8022222222222222, |
|
"grad_norm": 331.5046691894531, |
|
"learning_rate": 2.2742184872285507e-05, |
|
"loss": 15.7784, |
|
"step": 180500 |
|
}, |
|
{ |
|
"epoch": 0.8044444444444444, |
|
"grad_norm": 532.013671875, |
|
"learning_rate": 2.2699656668292653e-05, |
|
"loss": 15.8937, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 0.8066666666666666, |
|
"grad_norm": 30.83024024963379, |
|
"learning_rate": 2.2657044262088068e-05, |
|
"loss": 14.8331, |
|
"step": 181500 |
|
}, |
|
{ |
|
"epoch": 0.8088888888888889, |
|
"grad_norm": 208.97105407714844, |
|
"learning_rate": 2.26143481196742e-05, |
|
"loss": 14.8417, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 0.8111111111111111, |
|
"grad_norm": 178.349609375, |
|
"learning_rate": 2.2571568707969224e-05, |
|
"loss": 15.9551, |
|
"step": 182500 |
|
}, |
|
{ |
|
"epoch": 0.8133333333333334, |
|
"grad_norm": 191.2917938232422, |
|
"learning_rate": 2.2528706494801933e-05, |
|
"loss": 15.4303, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 0.8155555555555556, |
|
"grad_norm": 379.2752685546875, |
|
"learning_rate": 2.248576194890661e-05, |
|
"loss": 17.1609, |
|
"step": 183500 |
|
}, |
|
{ |
|
"epoch": 0.8177777777777778, |
|
"grad_norm": 49.782352447509766, |
|
"learning_rate": 2.244273553991795e-05, |
|
"loss": 16.6368, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 164.4068603515625, |
|
"learning_rate": 2.239962773836585e-05, |
|
"loss": 16.0915, |
|
"step": 184500 |
|
}, |
|
{ |
|
"epoch": 0.8222222222222222, |
|
"grad_norm": 120.09187316894531, |
|
"learning_rate": 2.2356439015670335e-05, |
|
"loss": 15.3172, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 0.8244444444444444, |
|
"grad_norm": 119.5110855102539, |
|
"learning_rate": 2.2313169844136342e-05, |
|
"loss": 15.7401, |
|
"step": 185500 |
|
}, |
|
{ |
|
"epoch": 0.8266666666666667, |
|
"grad_norm": 238.2360076904297, |
|
"learning_rate": 2.226982069694861e-05, |
|
"loss": 15.5555, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 0.8288888888888889, |
|
"grad_norm": 234.07911682128906, |
|
"learning_rate": 2.2226392048166467e-05, |
|
"loss": 15.8124, |
|
"step": 186500 |
|
}, |
|
{ |
|
"epoch": 0.8311111111111111, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.218288437271865e-05, |
|
"loss": 14.9297, |
|
"step": 187000 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.213929814639814e-05, |
|
"loss": 14.9676, |
|
"step": 187500 |
|
}, |
|
{ |
|
"epoch": 0.8355555555555556, |
|
"grad_norm": 221.94076538085938, |
|
"learning_rate": 2.2095633845856912e-05, |
|
"loss": 14.5759, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 0.8377777777777777, |
|
"grad_norm": 798.3099365234375, |
|
"learning_rate": 2.2051891948600773e-05, |
|
"loss": 16.8336, |
|
"step": 188500 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 148.87489318847656, |
|
"learning_rate": 2.2008072932984095e-05, |
|
"loss": 15.6524, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 0.8422222222222222, |
|
"grad_norm": 979.9264526367188, |
|
"learning_rate": 2.196417727820461e-05, |
|
"loss": 14.5125, |
|
"step": 189500 |
|
}, |
|
{ |
|
"epoch": 0.8444444444444444, |
|
"grad_norm": 273.1609191894531, |
|
"learning_rate": 2.1920205464298174e-05, |
|
"loss": 14.7308, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 0.8466666666666667, |
|
"grad_norm": 494.7351989746094, |
|
"learning_rate": 2.187615797213349e-05, |
|
"loss": 14.448, |
|
"step": 190500 |
|
}, |
|
{ |
|
"epoch": 0.8488888888888889, |
|
"grad_norm": 2433.17529296875, |
|
"learning_rate": 2.183203528340689e-05, |
|
"loss": 15.0146, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 0.8511111111111112, |
|
"grad_norm": 446.34490966796875, |
|
"learning_rate": 2.1787837880637014e-05, |
|
"loss": 15.0511, |
|
"step": 191500 |
|
}, |
|
{ |
|
"epoch": 0.8533333333333334, |
|
"grad_norm": 596.4390869140625, |
|
"learning_rate": 2.1743566247159586e-05, |
|
"loss": 14.3164, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 0.8555555555555555, |
|
"grad_norm": 927.9017333984375, |
|
"learning_rate": 2.1699220867122087e-05, |
|
"loss": 14.7031, |
|
"step": 192500 |
|
}, |
|
{ |
|
"epoch": 0.8577777777777778, |
|
"grad_norm": 174.5888671875, |
|
"learning_rate": 2.16548022254785e-05, |
|
"loss": 14.77, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 346.9240417480469, |
|
"learning_rate": 2.161031080798397e-05, |
|
"loss": 14.618, |
|
"step": 193500 |
|
}, |
|
{ |
|
"epoch": 0.8622222222222222, |
|
"grad_norm": 533.3963623046875, |
|
"learning_rate": 2.156574710118951e-05, |
|
"loss": 14.1816, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 0.8644444444444445, |
|
"grad_norm": 234.50579833984375, |
|
"learning_rate": 2.1521111592436673e-05, |
|
"loss": 15.6746, |
|
"step": 194500 |
|
}, |
|
{ |
|
"epoch": 0.8666666666666667, |
|
"grad_norm": 654.4329833984375, |
|
"learning_rate": 2.1476404769852238e-05, |
|
"loss": 16.4027, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 0.8688888888888889, |
|
"grad_norm": 97.57040405273438, |
|
"learning_rate": 2.143162712234285e-05, |
|
"loss": 14.6315, |
|
"step": 195500 |
|
}, |
|
{ |
|
"epoch": 0.8711111111111111, |
|
"grad_norm": 347.2988586425781, |
|
"learning_rate": 2.138677913958969e-05, |
|
"loss": 14.8534, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 0.8733333333333333, |
|
"grad_norm": 61.20378112792969, |
|
"learning_rate": 2.1341861312043116e-05, |
|
"loss": 14.0666, |
|
"step": 196500 |
|
}, |
|
{ |
|
"epoch": 0.8755555555555555, |
|
"grad_norm": 57.949256896972656, |
|
"learning_rate": 2.1296874130917282e-05, |
|
"loss": 13.8681, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 0.8777777777777778, |
|
"grad_norm": 417.0851745605469, |
|
"learning_rate": 2.1251818088184808e-05, |
|
"loss": 15.6193, |
|
"step": 197500 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 261.3269958496094, |
|
"learning_rate": 2.1206693676571347e-05, |
|
"loss": 15.1966, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 0.8822222222222222, |
|
"grad_norm": 105.9546890258789, |
|
"learning_rate": 2.1161501389550242e-05, |
|
"loss": 15.0815, |
|
"step": 198500 |
|
}, |
|
{ |
|
"epoch": 0.8844444444444445, |
|
"grad_norm": 453.0606994628906, |
|
"learning_rate": 2.11162417213371e-05, |
|
"loss": 15.7839, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 0.8866666666666667, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.10709151668844e-05, |
|
"loss": 15.5458, |
|
"step": 199500 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 373.2171630859375, |
|
"learning_rate": 2.1025522221876087e-05, |
|
"loss": 14.8535, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 0.8911111111111111, |
|
"grad_norm": 182.15408325195312, |
|
"learning_rate": 2.098006338272212e-05, |
|
"loss": 15.9142, |
|
"step": 200500 |
|
}, |
|
{ |
|
"epoch": 0.8933333333333333, |
|
"grad_norm": 159.78123474121094, |
|
"learning_rate": 2.09345391465531e-05, |
|
"loss": 17.2029, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 0.8955555555555555, |
|
"grad_norm": 761.6434326171875, |
|
"learning_rate": 2.0888950011214763e-05, |
|
"loss": 14.7574, |
|
"step": 201500 |
|
}, |
|
{ |
|
"epoch": 0.8977777777777778, |
|
"grad_norm": 602.9556274414062, |
|
"learning_rate": 2.0843296475262604e-05, |
|
"loss": 15.3703, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 44.228267669677734, |
|
"learning_rate": 2.0797579037956364e-05, |
|
"loss": 16.191, |
|
"step": 202500 |
|
}, |
|
{ |
|
"epoch": 0.9022222222222223, |
|
"grad_norm": 191.9353485107422, |
|
"learning_rate": 2.075179819925462e-05, |
|
"loss": 15.4188, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 0.9044444444444445, |
|
"grad_norm": 41.51668930053711, |
|
"learning_rate": 2.0705954459809293e-05, |
|
"loss": 14.5222, |
|
"step": 203500 |
|
}, |
|
{ |
|
"epoch": 0.9066666666666666, |
|
"grad_norm": 281.99273681640625, |
|
"learning_rate": 2.0660048320960164e-05, |
|
"loss": 15.4986, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 0.9088888888888889, |
|
"grad_norm": 3.3990941047668457, |
|
"learning_rate": 2.061408028472942e-05, |
|
"loss": 15.7127, |
|
"step": 204500 |
|
}, |
|
{ |
|
"epoch": 0.9111111111111111, |
|
"grad_norm": 151.7320556640625, |
|
"learning_rate": 2.0568050853816137e-05, |
|
"loss": 14.9146, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 0.9133333333333333, |
|
"grad_norm": 223.80499267578125, |
|
"learning_rate": 2.0521960531590795e-05, |
|
"loss": 15.3864, |
|
"step": 205500 |
|
}, |
|
{ |
|
"epoch": 0.9155555555555556, |
|
"grad_norm": 394.2869567871094, |
|
"learning_rate": 2.0475809822089774e-05, |
|
"loss": 15.7962, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 0.9177777777777778, |
|
"grad_norm": 471.55072021484375, |
|
"learning_rate": 2.0429599230009844e-05, |
|
"loss": 14.9467, |
|
"step": 206500 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 773.841552734375, |
|
"learning_rate": 2.0383329260702634e-05, |
|
"loss": 14.1642, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 0.9222222222222223, |
|
"grad_norm": 269.2467346191406, |
|
"learning_rate": 2.0337000420169113e-05, |
|
"loss": 14.8939, |
|
"step": 207500 |
|
}, |
|
{ |
|
"epoch": 0.9244444444444444, |
|
"grad_norm": 262.891357421875, |
|
"learning_rate": 2.0290613215054063e-05, |
|
"loss": 14.6107, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 0.9266666666666666, |
|
"grad_norm": 370.94036865234375, |
|
"learning_rate": 2.0244168152640522e-05, |
|
"loss": 14.8097, |
|
"step": 208500 |
|
}, |
|
{ |
|
"epoch": 0.9288888888888889, |
|
"grad_norm": 526.1622924804688, |
|
"learning_rate": 2.0197665740844254e-05, |
|
"loss": 13.5514, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 0.9311111111111111, |
|
"grad_norm": 402.8370361328125, |
|
"learning_rate": 2.0151106488208185e-05, |
|
"loss": 15.5235, |
|
"step": 209500 |
|
}, |
|
{ |
|
"epoch": 0.9333333333333333, |
|
"grad_norm": 240.7682647705078, |
|
"learning_rate": 2.0104490903896834e-05, |
|
"loss": 15.7625, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 0.9355555555555556, |
|
"grad_norm": 929.83447265625, |
|
"learning_rate": 2.0057819497690778e-05, |
|
"loss": 13.7892, |
|
"step": 210500 |
|
}, |
|
{ |
|
"epoch": 0.9377777777777778, |
|
"grad_norm": 50.330322265625, |
|
"learning_rate": 2.0011092779981027e-05, |
|
"loss": 14.8297, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 106.34629821777344, |
|
"learning_rate": 1.9964311261763482e-05, |
|
"loss": 14.0396, |
|
"step": 211500 |
|
}, |
|
{ |
|
"epoch": 0.9422222222222222, |
|
"grad_norm": 519.3964233398438, |
|
"learning_rate": 1.991747545463333e-05, |
|
"loss": 14.4548, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 0.9444444444444444, |
|
"grad_norm": 496.7522888183594, |
|
"learning_rate": 1.987058587077946e-05, |
|
"loss": 15.0954, |
|
"step": 212500 |
|
}, |
|
{ |
|
"epoch": 0.9466666666666667, |
|
"grad_norm": 79.46224975585938, |
|
"learning_rate": 1.9823643022978844e-05, |
|
"loss": 15.5782, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 0.9488888888888889, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9776647424590937e-05, |
|
"loss": 14.1761, |
|
"step": 213500 |
|
}, |
|
{ |
|
"epoch": 0.9511111111111111, |
|
"grad_norm": 328.0174560546875, |
|
"learning_rate": 1.9729599589552084e-05, |
|
"loss": 14.5482, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 0.9533333333333334, |
|
"grad_norm": 223.33721923828125, |
|
"learning_rate": 1.968250003236987e-05, |
|
"loss": 14.5949, |
|
"step": 214500 |
|
}, |
|
{ |
|
"epoch": 0.9555555555555556, |
|
"grad_norm": 233.63478088378906, |
|
"learning_rate": 1.9635349268117507e-05, |
|
"loss": 14.8437, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 0.9577777777777777, |
|
"grad_norm": 4.987401485443115, |
|
"learning_rate": 1.9588147812428197e-05, |
|
"loss": 15.7183, |
|
"step": 215500 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 341.9475402832031, |
|
"learning_rate": 1.954089618148949e-05, |
|
"loss": 15.5074, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 0.9622222222222222, |
|
"grad_norm": 186.303466796875, |
|
"learning_rate": 1.9493594892037667e-05, |
|
"loss": 14.1594, |
|
"step": 216500 |
|
}, |
|
{ |
|
"epoch": 0.9644444444444444, |
|
"grad_norm": 196.6855010986328, |
|
"learning_rate": 1.9446244461352033e-05, |
|
"loss": 16.0385, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 0.9666666666666667, |
|
"grad_norm": 536.9638061523438, |
|
"learning_rate": 1.9398845407249326e-05, |
|
"loss": 15.1219, |
|
"step": 217500 |
|
}, |
|
{ |
|
"epoch": 0.9688888888888889, |
|
"grad_norm": 369.9173889160156, |
|
"learning_rate": 1.9351398248078004e-05, |
|
"loss": 14.1767, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 0.9711111111111111, |
|
"grad_norm": 36.90256118774414, |
|
"learning_rate": 1.9303903502712592e-05, |
|
"loss": 15.2894, |
|
"step": 218500 |
|
}, |
|
{ |
|
"epoch": 0.9733333333333334, |
|
"grad_norm": 475.021240234375, |
|
"learning_rate": 1.9256361690548026e-05, |
|
"loss": 14.8856, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 0.9755555555555555, |
|
"grad_norm": 805.5115356445312, |
|
"learning_rate": 1.9208773331493938e-05, |
|
"loss": 14.159, |
|
"step": 219500 |
|
}, |
|
{ |
|
"epoch": 0.9777777777777777, |
|
"grad_norm": 767.4393310546875, |
|
"learning_rate": 1.9161138945969007e-05, |
|
"loss": 14.6288, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 122.41221618652344, |
|
"learning_rate": 1.911345905489523e-05, |
|
"loss": 13.795, |
|
"step": 220500 |
|
}, |
|
{ |
|
"epoch": 0.9822222222222222, |
|
"grad_norm": 432.9138488769531, |
|
"learning_rate": 1.9065734179692262e-05, |
|
"loss": 14.115, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 0.9844444444444445, |
|
"grad_norm": 630.0858764648438, |
|
"learning_rate": 1.90179648422717e-05, |
|
"loss": 13.5404, |
|
"step": 221500 |
|
}, |
|
{ |
|
"epoch": 0.9866666666666667, |
|
"grad_norm": 681.5342407226562, |
|
"learning_rate": 1.897015156503135e-05, |
|
"loss": 14.8603, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 0.9888888888888889, |
|
"grad_norm": 18.26776885986328, |
|
"learning_rate": 1.8922294870849566e-05, |
|
"loss": 14.8978, |
|
"step": 222500 |
|
}, |
|
{ |
|
"epoch": 0.9911111111111112, |
|
"grad_norm": 610.2125244140625, |
|
"learning_rate": 1.8874395283079478e-05, |
|
"loss": 14.0042, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 0.9933333333333333, |
|
"grad_norm": 236.45591735839844, |
|
"learning_rate": 1.8826453325543308e-05, |
|
"loss": 13.2571, |
|
"step": 223500 |
|
}, |
|
{ |
|
"epoch": 0.9955555555555555, |
|
"grad_norm": 146.5922393798828, |
|
"learning_rate": 1.877846952252662e-05, |
|
"loss": 14.9317, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 0.9977777777777778, |
|
"grad_norm": 831.205078125, |
|
"learning_rate": 1.8730444398772605e-05, |
|
"loss": 14.2085, |
|
"step": 224500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 465.5499267578125, |
|
"learning_rate": 1.8682378479476307e-05, |
|
"loss": 15.6298, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 1.0022222222222221, |
|
"grad_norm": 130.86990356445312, |
|
"learning_rate": 1.8634272290278932e-05, |
|
"loss": 12.7156, |
|
"step": 225500 |
|
}, |
|
{ |
|
"epoch": 1.0044444444444445, |
|
"grad_norm": 394.0591125488281, |
|
"learning_rate": 1.8586126357262054e-05, |
|
"loss": 12.0245, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 1.0066666666666666, |
|
"grad_norm": 144.7230682373047, |
|
"learning_rate": 1.853794120694187e-05, |
|
"loss": 12.68, |
|
"step": 226500 |
|
}, |
|
{ |
|
"epoch": 1.008888888888889, |
|
"grad_norm": 108.50147247314453, |
|
"learning_rate": 1.8489717366263487e-05, |
|
"loss": 11.755, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 1.011111111111111, |
|
"grad_norm": 45.11106872558594, |
|
"learning_rate": 1.8441455362595082e-05, |
|
"loss": 12.0449, |
|
"step": 227500 |
|
}, |
|
{ |
|
"epoch": 1.0133333333333334, |
|
"grad_norm": 321.0522155761719, |
|
"learning_rate": 1.8393155723722205e-05, |
|
"loss": 12.5334, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 1.0155555555555555, |
|
"grad_norm": 409.6867370605469, |
|
"learning_rate": 1.8344818977841967e-05, |
|
"loss": 12.5081, |
|
"step": 228500 |
|
}, |
|
{ |
|
"epoch": 1.0177777777777777, |
|
"grad_norm": 293.31866455078125, |
|
"learning_rate": 1.829644565355727e-05, |
|
"loss": 11.9373, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 182.61883544921875, |
|
"learning_rate": 1.8248036279871043e-05, |
|
"loss": 12.3983, |
|
"step": 229500 |
|
}, |
|
{ |
|
"epoch": 1.0222222222222221, |
|
"grad_norm": 152.36061096191406, |
|
"learning_rate": 1.819959138618044e-05, |
|
"loss": 13.1577, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 1.0244444444444445, |
|
"grad_norm": 31.093074798583984, |
|
"learning_rate": 1.8151111502271063e-05, |
|
"loss": 13.6112, |
|
"step": 230500 |
|
}, |
|
{ |
|
"epoch": 1.0266666666666666, |
|
"grad_norm": 504.9164733886719, |
|
"learning_rate": 1.810259715831115e-05, |
|
"loss": 12.9236, |
|
"step": 231000 |
|
}, |
|
{ |
|
"epoch": 1.028888888888889, |
|
"grad_norm": 118.45124053955078, |
|
"learning_rate": 1.8054048884845784e-05, |
|
"loss": 14.7912, |
|
"step": 231500 |
|
}, |
|
{ |
|
"epoch": 1.031111111111111, |
|
"grad_norm": 247.5614776611328, |
|
"learning_rate": 1.8005467212791124e-05, |
|
"loss": 13.3697, |
|
"step": 232000 |
|
}, |
|
{ |
|
"epoch": 1.0333333333333334, |
|
"grad_norm": 431.06396484375, |
|
"learning_rate": 1.795685267342854e-05, |
|
"loss": 13.0248, |
|
"step": 232500 |
|
}, |
|
{ |
|
"epoch": 1.0355555555555556, |
|
"grad_norm": 209.7031707763672, |
|
"learning_rate": 1.7908205798398853e-05, |
|
"loss": 13.0866, |
|
"step": 233000 |
|
}, |
|
{ |
|
"epoch": 1.0377777777777777, |
|
"grad_norm": 127.96566009521484, |
|
"learning_rate": 1.7859527119696487e-05, |
|
"loss": 13.5331, |
|
"step": 233500 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 117.52790832519531, |
|
"learning_rate": 1.7810817169663676e-05, |
|
"loss": 11.3817, |
|
"step": 234000 |
|
}, |
|
{ |
|
"epoch": 1.0422222222222222, |
|
"grad_norm": 1179.1375732421875, |
|
"learning_rate": 1.7762076480984635e-05, |
|
"loss": 12.7315, |
|
"step": 234500 |
|
}, |
|
{ |
|
"epoch": 1.0444444444444445, |
|
"grad_norm": 357.2664489746094, |
|
"learning_rate": 1.771330558667971e-05, |
|
"loss": 12.4928, |
|
"step": 235000 |
|
}, |
|
{ |
|
"epoch": 1.0466666666666666, |
|
"grad_norm": 230.9121551513672, |
|
"learning_rate": 1.766450502009961e-05, |
|
"loss": 13.6869, |
|
"step": 235500 |
|
}, |
|
{ |
|
"epoch": 1.048888888888889, |
|
"grad_norm": 236.51214599609375, |
|
"learning_rate": 1.7615675314919504e-05, |
|
"loss": 13.8959, |
|
"step": 236000 |
|
}, |
|
{ |
|
"epoch": 1.051111111111111, |
|
"grad_norm": 32.029823303222656, |
|
"learning_rate": 1.7566817005133215e-05, |
|
"loss": 11.7484, |
|
"step": 236500 |
|
}, |
|
{ |
|
"epoch": 1.0533333333333332, |
|
"grad_norm": 487.9048767089844, |
|
"learning_rate": 1.7517930625047403e-05, |
|
"loss": 12.8478, |
|
"step": 237000 |
|
}, |
|
{ |
|
"epoch": 1.0555555555555556, |
|
"grad_norm": 64.5386962890625, |
|
"learning_rate": 1.7469016709275678e-05, |
|
"loss": 13.1321, |
|
"step": 237500 |
|
}, |
|
{ |
|
"epoch": 1.0577777777777777, |
|
"grad_norm": 123.01608276367188, |
|
"learning_rate": 1.7420075792732797e-05, |
|
"loss": 12.7279, |
|
"step": 238000 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 418.50323486328125, |
|
"learning_rate": 1.7371108410628778e-05, |
|
"loss": 12.7196, |
|
"step": 238500 |
|
}, |
|
{ |
|
"epoch": 1.0622222222222222, |
|
"grad_norm": 15.958662986755371, |
|
"learning_rate": 1.732211509846306e-05, |
|
"loss": 12.8302, |
|
"step": 239000 |
|
}, |
|
{ |
|
"epoch": 1.0644444444444445, |
|
"grad_norm": 903.5818481445312, |
|
"learning_rate": 1.7273096392018664e-05, |
|
"loss": 12.5959, |
|
"step": 239500 |
|
}, |
|
{ |
|
"epoch": 1.0666666666666667, |
|
"grad_norm": 132.69081115722656, |
|
"learning_rate": 1.7224052827356306e-05, |
|
"loss": 12.4179, |
|
"step": 240000 |
|
}, |
|
{ |
|
"epoch": 1.068888888888889, |
|
"grad_norm": 72.78104400634766, |
|
"learning_rate": 1.7174984940808555e-05, |
|
"loss": 12.6991, |
|
"step": 240500 |
|
}, |
|
{ |
|
"epoch": 1.0711111111111111, |
|
"grad_norm": 19.8783016204834, |
|
"learning_rate": 1.7125893268973953e-05, |
|
"loss": 12.3093, |
|
"step": 241000 |
|
}, |
|
{ |
|
"epoch": 1.0733333333333333, |
|
"grad_norm": 53.51363754272461, |
|
"learning_rate": 1.707677834871116e-05, |
|
"loss": 12.2946, |
|
"step": 241500 |
|
}, |
|
{ |
|
"epoch": 1.0755555555555556, |
|
"grad_norm": 310.8068542480469, |
|
"learning_rate": 1.7027640717133074e-05, |
|
"loss": 12.9432, |
|
"step": 242000 |
|
}, |
|
{ |
|
"epoch": 1.0777777777777777, |
|
"grad_norm": 448.7236633300781, |
|
"learning_rate": 1.697848091160096e-05, |
|
"loss": 12.162, |
|
"step": 242500 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 802.4764404296875, |
|
"learning_rate": 1.6929299469718585e-05, |
|
"loss": 13.7779, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 1.0822222222222222, |
|
"grad_norm": 429.84564208984375, |
|
"learning_rate": 1.68800969293263e-05, |
|
"loss": 12.5977, |
|
"step": 243500 |
|
}, |
|
{ |
|
"epoch": 1.0844444444444445, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.6830873828495226e-05, |
|
"loss": 11.7274, |
|
"step": 244000 |
|
}, |
|
{ |
|
"epoch": 1.0866666666666667, |
|
"grad_norm": 194.27366638183594, |
|
"learning_rate": 1.6781630705521288e-05, |
|
"loss": 13.384, |
|
"step": 244500 |
|
}, |
|
{ |
|
"epoch": 1.0888888888888888, |
|
"grad_norm": 28.86142921447754, |
|
"learning_rate": 1.67323680989194e-05, |
|
"loss": 12.4926, |
|
"step": 245000 |
|
}, |
|
{ |
|
"epoch": 1.0911111111111111, |
|
"grad_norm": 729.71875, |
|
"learning_rate": 1.6683086547417527e-05, |
|
"loss": 12.177, |
|
"step": 245500 |
|
}, |
|
{ |
|
"epoch": 1.0933333333333333, |
|
"grad_norm": 17.39883804321289, |
|
"learning_rate": 1.663378658995083e-05, |
|
"loss": 11.7948, |
|
"step": 246000 |
|
}, |
|
{ |
|
"epoch": 1.0955555555555556, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.6584468765655737e-05, |
|
"loss": 12.777, |
|
"step": 246500 |
|
}, |
|
{ |
|
"epoch": 1.0977777777777777, |
|
"grad_norm": 214.7503204345703, |
|
"learning_rate": 1.653513361386408e-05, |
|
"loss": 12.8227, |
|
"step": 247000 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 279.39007568359375, |
|
"learning_rate": 1.6485781674097173e-05, |
|
"loss": 12.6121, |
|
"step": 247500 |
|
}, |
|
{ |
|
"epoch": 1.1022222222222222, |
|
"grad_norm": 74.43594360351562, |
|
"learning_rate": 1.643641348605992e-05, |
|
"loss": 11.8667, |
|
"step": 248000 |
|
}, |
|
{ |
|
"epoch": 1.1044444444444443, |
|
"grad_norm": 35.02223587036133, |
|
"learning_rate": 1.638702958963492e-05, |
|
"loss": 12.2564, |
|
"step": 248500 |
|
}, |
|
{ |
|
"epoch": 1.1066666666666667, |
|
"grad_norm": 23.571346282958984, |
|
"learning_rate": 1.6337630524876546e-05, |
|
"loss": 11.9732, |
|
"step": 249000 |
|
}, |
|
{ |
|
"epoch": 1.1088888888888888, |
|
"grad_norm": 15.899101257324219, |
|
"learning_rate": 1.628821683200506e-05, |
|
"loss": 13.1795, |
|
"step": 249500 |
|
}, |
|
{ |
|
"epoch": 1.1111111111111112, |
|
"grad_norm": 272.45257568359375, |
|
"learning_rate": 1.6238789051400688e-05, |
|
"loss": 12.9309, |
|
"step": 250000 |
|
}, |
|
{ |
|
"epoch": 1.1133333333333333, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.6189347723597725e-05, |
|
"loss": 12.8293, |
|
"step": 250500 |
|
}, |
|
{ |
|
"epoch": 1.1155555555555556, |
|
"grad_norm": 10.567012786865234, |
|
"learning_rate": 1.6139893389278608e-05, |
|
"loss": 11.9302, |
|
"step": 251000 |
|
}, |
|
{ |
|
"epoch": 1.1177777777777778, |
|
"grad_norm": 823.9113159179688, |
|
"learning_rate": 1.609042658926801e-05, |
|
"loss": 11.3798, |
|
"step": 251500 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 449.7940673828125, |
|
"learning_rate": 1.6040947864526935e-05, |
|
"loss": 12.5211, |
|
"step": 252000 |
|
}, |
|
{ |
|
"epoch": 1.1222222222222222, |
|
"grad_norm": 427.29150390625, |
|
"learning_rate": 1.5991457756146786e-05, |
|
"loss": 12.1701, |
|
"step": 252500 |
|
}, |
|
{ |
|
"epoch": 1.1244444444444444, |
|
"grad_norm": 108.2233657836914, |
|
"learning_rate": 1.5941956805343463e-05, |
|
"loss": 12.4913, |
|
"step": 253000 |
|
}, |
|
{ |
|
"epoch": 1.1266666666666667, |
|
"grad_norm": 92.11042022705078, |
|
"learning_rate": 1.589244555345143e-05, |
|
"loss": 11.8749, |
|
"step": 253500 |
|
}, |
|
{ |
|
"epoch": 1.1288888888888888, |
|
"grad_norm": 177.92575073242188, |
|
"learning_rate": 1.584292454191781e-05, |
|
"loss": 13.8006, |
|
"step": 254000 |
|
}, |
|
{ |
|
"epoch": 1.1311111111111112, |
|
"grad_norm": 203.5926513671875, |
|
"learning_rate": 1.5793394312296444e-05, |
|
"loss": 12.2695, |
|
"step": 254500 |
|
}, |
|
{ |
|
"epoch": 1.1333333333333333, |
|
"grad_norm": 339.7933654785156, |
|
"learning_rate": 1.5743855406242e-05, |
|
"loss": 12.3823, |
|
"step": 255000 |
|
}, |
|
{ |
|
"epoch": 1.1355555555555557, |
|
"grad_norm": 334.1343688964844, |
|
"learning_rate": 1.5694308365504e-05, |
|
"loss": 13.8132, |
|
"step": 255500 |
|
}, |
|
{ |
|
"epoch": 1.1377777777777778, |
|
"grad_norm": 206.6999969482422, |
|
"learning_rate": 1.5644753731920954e-05, |
|
"loss": 12.8192, |
|
"step": 256000 |
|
}, |
|
{ |
|
"epoch": 1.1400000000000001, |
|
"grad_norm": 237.3104248046875, |
|
"learning_rate": 1.5595192047414395e-05, |
|
"loss": 11.9175, |
|
"step": 256500 |
|
}, |
|
{ |
|
"epoch": 1.1422222222222222, |
|
"grad_norm": 673.7626953125, |
|
"learning_rate": 1.5545623853982966e-05, |
|
"loss": 13.1039, |
|
"step": 257000 |
|
}, |
|
{ |
|
"epoch": 1.1444444444444444, |
|
"grad_norm": 40.97128677368164, |
|
"learning_rate": 1.549604969369649e-05, |
|
"loss": 11.9416, |
|
"step": 257500 |
|
}, |
|
{ |
|
"epoch": 1.1466666666666667, |
|
"grad_norm": 125.23896789550781, |
|
"learning_rate": 1.544647010869003e-05, |
|
"loss": 12.4299, |
|
"step": 258000 |
|
}, |
|
{ |
|
"epoch": 1.1488888888888888, |
|
"grad_norm": 297.3369140625, |
|
"learning_rate": 1.5396885641158002e-05, |
|
"loss": 12.2724, |
|
"step": 258500 |
|
}, |
|
{ |
|
"epoch": 1.1511111111111112, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.534729683334818e-05, |
|
"loss": 10.8568, |
|
"step": 259000 |
|
}, |
|
{ |
|
"epoch": 1.1533333333333333, |
|
"grad_norm": 222.6666717529297, |
|
"learning_rate": 1.529770422755583e-05, |
|
"loss": 11.321, |
|
"step": 259500 |
|
}, |
|
{ |
|
"epoch": 1.1555555555555554, |
|
"grad_norm": 258.8761291503906, |
|
"learning_rate": 1.524810836611775e-05, |
|
"loss": 11.3846, |
|
"step": 260000 |
|
}, |
|
{ |
|
"epoch": 1.1577777777777778, |
|
"grad_norm": 362.4846496582031, |
|
"learning_rate": 1.5198509791406325e-05, |
|
"loss": 12.1888, |
|
"step": 260500 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 325.5453186035156, |
|
"learning_rate": 1.5148909045823626e-05, |
|
"loss": 11.6617, |
|
"step": 261000 |
|
}, |
|
{ |
|
"epoch": 1.1622222222222223, |
|
"grad_norm": 346.42791748046875, |
|
"learning_rate": 1.509930667179546e-05, |
|
"loss": 12.4993, |
|
"step": 261500 |
|
}, |
|
{ |
|
"epoch": 1.1644444444444444, |
|
"grad_norm": 427.6278991699219, |
|
"learning_rate": 1.5049703211765442e-05, |
|
"loss": 12.6815, |
|
"step": 262000 |
|
}, |
|
{ |
|
"epoch": 1.1666666666666667, |
|
"grad_norm": 416.53680419921875, |
|
"learning_rate": 1.5000099208189061e-05, |
|
"loss": 12.9896, |
|
"step": 262500 |
|
}, |
|
{ |
|
"epoch": 1.1688888888888889, |
|
"grad_norm": 181.99703979492188, |
|
"learning_rate": 1.4950495203527755e-05, |
|
"loss": 12.7223, |
|
"step": 263000 |
|
}, |
|
{ |
|
"epoch": 1.1711111111111112, |
|
"grad_norm": 38.73680114746094, |
|
"learning_rate": 1.4900891740242976e-05, |
|
"loss": 12.5012, |
|
"step": 263500 |
|
}, |
|
{ |
|
"epoch": 1.1733333333333333, |
|
"grad_norm": 527.49267578125, |
|
"learning_rate": 1.4851289360790243e-05, |
|
"loss": 11.8226, |
|
"step": 264000 |
|
}, |
|
{ |
|
"epoch": 1.1755555555555555, |
|
"grad_norm": 593.9708862304688, |
|
"learning_rate": 1.480168860761324e-05, |
|
"loss": 11.9695, |
|
"step": 264500 |
|
}, |
|
{ |
|
"epoch": 1.1777777777777778, |
|
"grad_norm": 743.0066528320312, |
|
"learning_rate": 1.4752090023137843e-05, |
|
"loss": 12.0286, |
|
"step": 265000 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 201.2530059814453, |
|
"learning_rate": 1.4702494149766239e-05, |
|
"loss": 10.9088, |
|
"step": 265500 |
|
}, |
|
{ |
|
"epoch": 1.1822222222222223, |
|
"grad_norm": 548.9100952148438, |
|
"learning_rate": 1.465290152987095e-05, |
|
"loss": 11.889, |
|
"step": 266000 |
|
}, |
|
{ |
|
"epoch": 1.1844444444444444, |
|
"grad_norm": 233.81863403320312, |
|
"learning_rate": 1.4603312705788917e-05, |
|
"loss": 12.1066, |
|
"step": 266500 |
|
}, |
|
{ |
|
"epoch": 1.1866666666666668, |
|
"grad_norm": 163.2041015625, |
|
"learning_rate": 1.4553728219815586e-05, |
|
"loss": 12.8837, |
|
"step": 267000 |
|
}, |
|
{ |
|
"epoch": 1.1888888888888889, |
|
"grad_norm": 153.75701904296875, |
|
"learning_rate": 1.4504148614198935e-05, |
|
"loss": 11.7215, |
|
"step": 267500 |
|
}, |
|
{ |
|
"epoch": 1.1911111111111112, |
|
"grad_norm": 32.576324462890625, |
|
"learning_rate": 1.4454574431133605e-05, |
|
"loss": 12.7392, |
|
"step": 268000 |
|
}, |
|
{ |
|
"epoch": 1.1933333333333334, |
|
"grad_norm": 690.4747314453125, |
|
"learning_rate": 1.4405006212754901e-05, |
|
"loss": 12.4667, |
|
"step": 268500 |
|
}, |
|
{ |
|
"epoch": 1.1955555555555555, |
|
"grad_norm": 70.4339828491211, |
|
"learning_rate": 1.4355444501132934e-05, |
|
"loss": 12.3897, |
|
"step": 269000 |
|
}, |
|
{ |
|
"epoch": 1.1977777777777778, |
|
"grad_norm": 1018.3383178710938, |
|
"learning_rate": 1.430588983826664e-05, |
|
"loss": 11.7094, |
|
"step": 269500 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 64.98046112060547, |
|
"learning_rate": 1.4256342766077859e-05, |
|
"loss": 11.031, |
|
"step": 270000 |
|
}, |
|
{ |
|
"epoch": 1.2022222222222223, |
|
"grad_norm": 507.530029296875, |
|
"learning_rate": 1.4206803826405453e-05, |
|
"loss": 11.7225, |
|
"step": 270500 |
|
}, |
|
{ |
|
"epoch": 1.2044444444444444, |
|
"grad_norm": 396.6742248535156, |
|
"learning_rate": 1.4157273560999311e-05, |
|
"loss": 12.0661, |
|
"step": 271000 |
|
}, |
|
{ |
|
"epoch": 1.2066666666666666, |
|
"grad_norm": 741.4268188476562, |
|
"learning_rate": 1.4107752511514499e-05, |
|
"loss": 12.1401, |
|
"step": 271500 |
|
}, |
|
{ |
|
"epoch": 1.208888888888889, |
|
"grad_norm": 977.9871826171875, |
|
"learning_rate": 1.405824121950526e-05, |
|
"loss": 11.8266, |
|
"step": 272000 |
|
}, |
|
{ |
|
"epoch": 1.211111111111111, |
|
"grad_norm": 172.49072265625, |
|
"learning_rate": 1.4008740226419166e-05, |
|
"loss": 12.024, |
|
"step": 272500 |
|
}, |
|
{ |
|
"epoch": 1.2133333333333334, |
|
"grad_norm": 148.6393585205078, |
|
"learning_rate": 1.3959250073591146e-05, |
|
"loss": 11.7095, |
|
"step": 273000 |
|
}, |
|
{ |
|
"epoch": 1.2155555555555555, |
|
"grad_norm": 50.63189697265625, |
|
"learning_rate": 1.390977130223757e-05, |
|
"loss": 11.5046, |
|
"step": 273500 |
|
}, |
|
{ |
|
"epoch": 1.2177777777777778, |
|
"grad_norm": 101.87459564208984, |
|
"learning_rate": 1.3860304453450373e-05, |
|
"loss": 11.3638, |
|
"step": 274000 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 274.5159606933594, |
|
"learning_rate": 1.3810850068191069e-05, |
|
"loss": 12.2588, |
|
"step": 274500 |
|
}, |
|
{ |
|
"epoch": 1.2222222222222223, |
|
"grad_norm": 108.9557876586914, |
|
"learning_rate": 1.3761408687284907e-05, |
|
"loss": 12.7642, |
|
"step": 275000 |
|
}, |
|
{ |
|
"epoch": 1.2244444444444444, |
|
"grad_norm": 455.4017028808594, |
|
"learning_rate": 1.3711980851414898e-05, |
|
"loss": 11.3841, |
|
"step": 275500 |
|
}, |
|
{ |
|
"epoch": 1.2266666666666666, |
|
"grad_norm": 239.2037811279297, |
|
"learning_rate": 1.3662567101115934e-05, |
|
"loss": 12.0606, |
|
"step": 276000 |
|
}, |
|
{ |
|
"epoch": 1.228888888888889, |
|
"grad_norm": 56.60507583618164, |
|
"learning_rate": 1.3613167976768886e-05, |
|
"loss": 11.4546, |
|
"step": 276500 |
|
}, |
|
{ |
|
"epoch": 1.231111111111111, |
|
"grad_norm": 310.4095458984375, |
|
"learning_rate": 1.3563784018594645e-05, |
|
"loss": 11.4747, |
|
"step": 277000 |
|
}, |
|
{ |
|
"epoch": 1.2333333333333334, |
|
"grad_norm": 335.875, |
|
"learning_rate": 1.3514415766648284e-05, |
|
"loss": 11.9081, |
|
"step": 277500 |
|
}, |
|
{ |
|
"epoch": 1.2355555555555555, |
|
"grad_norm": 594.1018676757812, |
|
"learning_rate": 1.346506376081308e-05, |
|
"loss": 11.2674, |
|
"step": 278000 |
|
}, |
|
{ |
|
"epoch": 1.2377777777777779, |
|
"grad_norm": 275.7675476074219, |
|
"learning_rate": 1.3415728540794674e-05, |
|
"loss": 10.7813, |
|
"step": 278500 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 214.95712280273438, |
|
"learning_rate": 1.3366410646115118e-05, |
|
"loss": 12.3449, |
|
"step": 279000 |
|
}, |
|
{ |
|
"epoch": 1.2422222222222223, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.331711061610701e-05, |
|
"loss": 11.6398, |
|
"step": 279500 |
|
}, |
|
{ |
|
"epoch": 1.2444444444444445, |
|
"grad_norm": 15.316904067993164, |
|
"learning_rate": 1.3267828989907592e-05, |
|
"loss": 11.7452, |
|
"step": 280000 |
|
}, |
|
{ |
|
"epoch": 1.2466666666666666, |
|
"grad_norm": 529.526611328125, |
|
"learning_rate": 1.3218566306452813e-05, |
|
"loss": 12.7856, |
|
"step": 280500 |
|
}, |
|
{ |
|
"epoch": 1.248888888888889, |
|
"grad_norm": 4.096035480499268, |
|
"learning_rate": 1.31693231044715e-05, |
|
"loss": 11.2883, |
|
"step": 281000 |
|
}, |
|
{ |
|
"epoch": 1.251111111111111, |
|
"grad_norm": 641.160888671875, |
|
"learning_rate": 1.3120099922479414e-05, |
|
"loss": 12.2018, |
|
"step": 281500 |
|
}, |
|
{ |
|
"epoch": 1.2533333333333334, |
|
"grad_norm": 218.7012939453125, |
|
"learning_rate": 1.3070897298773392e-05, |
|
"loss": 11.9625, |
|
"step": 282000 |
|
}, |
|
{ |
|
"epoch": 1.2555555555555555, |
|
"grad_norm": 1709.0491943359375, |
|
"learning_rate": 1.3021715771425437e-05, |
|
"loss": 11.9818, |
|
"step": 282500 |
|
}, |
|
{ |
|
"epoch": 1.2577777777777777, |
|
"grad_norm": 325.7183532714844, |
|
"learning_rate": 1.2972555878276857e-05, |
|
"loss": 12.171, |
|
"step": 283000 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 463.99432373046875, |
|
"learning_rate": 1.292341815693237e-05, |
|
"loss": 12.996, |
|
"step": 283500 |
|
}, |
|
{ |
|
"epoch": 1.2622222222222224, |
|
"grad_norm": 30.650217056274414, |
|
"learning_rate": 1.2874303144754219e-05, |
|
"loss": 11.0988, |
|
"step": 284000 |
|
}, |
|
{ |
|
"epoch": 1.2644444444444445, |
|
"grad_norm": 308.7669372558594, |
|
"learning_rate": 1.2825211378856311e-05, |
|
"loss": 11.6588, |
|
"step": 284500 |
|
}, |
|
{ |
|
"epoch": 1.2666666666666666, |
|
"grad_norm": 813.3473510742188, |
|
"learning_rate": 1.2776143396098331e-05, |
|
"loss": 11.7966, |
|
"step": 285000 |
|
}, |
|
{ |
|
"epoch": 1.268888888888889, |
|
"grad_norm": 277.6453857421875, |
|
"learning_rate": 1.272709973307988e-05, |
|
"loss": 11.957, |
|
"step": 285500 |
|
}, |
|
{ |
|
"epoch": 1.271111111111111, |
|
"grad_norm": 614.5536499023438, |
|
"learning_rate": 1.2678080926134595e-05, |
|
"loss": 12.0953, |
|
"step": 286000 |
|
}, |
|
{ |
|
"epoch": 1.2733333333333334, |
|
"grad_norm": 600.1682739257812, |
|
"learning_rate": 1.2629087511324295e-05, |
|
"loss": 12.4912, |
|
"step": 286500 |
|
}, |
|
{ |
|
"epoch": 1.2755555555555556, |
|
"grad_norm": 291.91387939453125, |
|
"learning_rate": 1.2580120024433123e-05, |
|
"loss": 11.737, |
|
"step": 287000 |
|
}, |
|
{ |
|
"epoch": 1.2777777777777777, |
|
"grad_norm": 645.7890625, |
|
"learning_rate": 1.2531179000961662e-05, |
|
"loss": 11.1851, |
|
"step": 287500 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 390.1597900390625, |
|
"learning_rate": 1.2482264976121108e-05, |
|
"loss": 11.5208, |
|
"step": 288000 |
|
}, |
|
{ |
|
"epoch": 1.2822222222222222, |
|
"grad_norm": 15.699028968811035, |
|
"learning_rate": 1.2433378484827395e-05, |
|
"loss": 12.3516, |
|
"step": 288500 |
|
}, |
|
{ |
|
"epoch": 1.2844444444444445, |
|
"grad_norm": 35.82905578613281, |
|
"learning_rate": 1.2384520061695367e-05, |
|
"loss": 11.0025, |
|
"step": 289000 |
|
}, |
|
{ |
|
"epoch": 1.2866666666666666, |
|
"grad_norm": 112.55397033691406, |
|
"learning_rate": 1.2335690241032904e-05, |
|
"loss": 11.9212, |
|
"step": 289500 |
|
}, |
|
{ |
|
"epoch": 1.2888888888888888, |
|
"grad_norm": 143.4647979736328, |
|
"learning_rate": 1.2286889556835105e-05, |
|
"loss": 11.8427, |
|
"step": 290000 |
|
}, |
|
{ |
|
"epoch": 1.291111111111111, |
|
"grad_norm": 83.45748138427734, |
|
"learning_rate": 1.2238118542778435e-05, |
|
"loss": 11.4673, |
|
"step": 290500 |
|
}, |
|
{ |
|
"epoch": 1.2933333333333334, |
|
"grad_norm": 128.21621704101562, |
|
"learning_rate": 1.2189377732214886e-05, |
|
"loss": 10.8374, |
|
"step": 291000 |
|
}, |
|
{ |
|
"epoch": 1.2955555555555556, |
|
"grad_norm": 987.85302734375, |
|
"learning_rate": 1.2140667658166162e-05, |
|
"loss": 12.346, |
|
"step": 291500 |
|
}, |
|
{ |
|
"epoch": 1.2977777777777777, |
|
"grad_norm": 250.47520446777344, |
|
"learning_rate": 1.2091988853317817e-05, |
|
"loss": 10.7999, |
|
"step": 292000 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 33.65868377685547, |
|
"learning_rate": 1.2043341850013472e-05, |
|
"loss": 12.6021, |
|
"step": 292500 |
|
}, |
|
{ |
|
"epoch": 1.3022222222222222, |
|
"grad_norm": 207.2305450439453, |
|
"learning_rate": 1.1994727180248953e-05, |
|
"loss": 12.2435, |
|
"step": 293000 |
|
}, |
|
{ |
|
"epoch": 1.3044444444444445, |
|
"grad_norm": 210.83741760253906, |
|
"learning_rate": 1.1946145375666504e-05, |
|
"loss": 11.2422, |
|
"step": 293500 |
|
}, |
|
{ |
|
"epoch": 1.3066666666666666, |
|
"grad_norm": 289.1300964355469, |
|
"learning_rate": 1.189759696754896e-05, |
|
"loss": 11.7366, |
|
"step": 294000 |
|
}, |
|
{ |
|
"epoch": 1.3088888888888888, |
|
"grad_norm": 491.4790954589844, |
|
"learning_rate": 1.1849082486813923e-05, |
|
"loss": 11.8805, |
|
"step": 294500 |
|
}, |
|
{ |
|
"epoch": 1.3111111111111111, |
|
"grad_norm": 286.23681640625, |
|
"learning_rate": 1.1800602464007995e-05, |
|
"loss": 11.8487, |
|
"step": 295000 |
|
}, |
|
{ |
|
"epoch": 1.3133333333333335, |
|
"grad_norm": 150.55995178222656, |
|
"learning_rate": 1.175215742930093e-05, |
|
"loss": 11.2674, |
|
"step": 295500 |
|
}, |
|
{ |
|
"epoch": 1.3155555555555556, |
|
"grad_norm": 90.90438842773438, |
|
"learning_rate": 1.1703747912479867e-05, |
|
"loss": 12.0513, |
|
"step": 296000 |
|
}, |
|
{ |
|
"epoch": 1.3177777777777777, |
|
"grad_norm": 402.916748046875, |
|
"learning_rate": 1.1655374442943526e-05, |
|
"loss": 11.3287, |
|
"step": 296500 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 221.00369262695312, |
|
"learning_rate": 1.160703754969642e-05, |
|
"loss": 10.8907, |
|
"step": 297000 |
|
}, |
|
{ |
|
"epoch": 1.3222222222222222, |
|
"grad_norm": 84.22000885009766, |
|
"learning_rate": 1.1558737761343074e-05, |
|
"loss": 12.0133, |
|
"step": 297500 |
|
}, |
|
{ |
|
"epoch": 1.3244444444444445, |
|
"grad_norm": 19.054018020629883, |
|
"learning_rate": 1.1510475606082226e-05, |
|
"loss": 10.2377, |
|
"step": 298000 |
|
}, |
|
{ |
|
"epoch": 1.3266666666666667, |
|
"grad_norm": 453.34326171875, |
|
"learning_rate": 1.1462251611701084e-05, |
|
"loss": 11.93, |
|
"step": 298500 |
|
}, |
|
{ |
|
"epoch": 1.3288888888888888, |
|
"grad_norm": 275.5953063964844, |
|
"learning_rate": 1.1414066305569514e-05, |
|
"loss": 13.0519, |
|
"step": 299000 |
|
}, |
|
{ |
|
"epoch": 1.3311111111111111, |
|
"grad_norm": 279.2978210449219, |
|
"learning_rate": 1.1365920214634312e-05, |
|
"loss": 11.8949, |
|
"step": 299500 |
|
}, |
|
{ |
|
"epoch": 1.3333333333333333, |
|
"grad_norm": 278.0643310546875, |
|
"learning_rate": 1.1317813865413409e-05, |
|
"loss": 10.4946, |
|
"step": 300000 |
|
}, |
|
{ |
|
"epoch": 1.3355555555555556, |
|
"grad_norm": 685.4400024414062, |
|
"learning_rate": 1.1269747783990135e-05, |
|
"loss": 11.1153, |
|
"step": 300500 |
|
}, |
|
{ |
|
"epoch": 1.3377777777777777, |
|
"grad_norm": 312.36724853515625, |
|
"learning_rate": 1.1221722496007462e-05, |
|
"loss": 12.0323, |
|
"step": 301000 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 1231.820068359375, |
|
"learning_rate": 1.1173738526662234e-05, |
|
"loss": 10.8594, |
|
"step": 301500 |
|
}, |
|
{ |
|
"epoch": 1.3422222222222222, |
|
"grad_norm": 273.9977111816406, |
|
"learning_rate": 1.1125796400699458e-05, |
|
"loss": 11.2889, |
|
"step": 302000 |
|
}, |
|
{ |
|
"epoch": 1.3444444444444446, |
|
"grad_norm": 222.45266723632812, |
|
"learning_rate": 1.1077896642406542e-05, |
|
"loss": 11.6009, |
|
"step": 302500 |
|
}, |
|
{ |
|
"epoch": 1.3466666666666667, |
|
"grad_norm": 1616.0927734375, |
|
"learning_rate": 1.103003977560757e-05, |
|
"loss": 11.7312, |
|
"step": 303000 |
|
}, |
|
{ |
|
"epoch": 1.3488888888888888, |
|
"grad_norm": 172.6010284423828, |
|
"learning_rate": 1.0982226323657565e-05, |
|
"loss": 11.6923, |
|
"step": 303500 |
|
}, |
|
{ |
|
"epoch": 1.3511111111111112, |
|
"grad_norm": 188.0960235595703, |
|
"learning_rate": 1.093445680943678e-05, |
|
"loss": 10.7696, |
|
"step": 304000 |
|
}, |
|
{ |
|
"epoch": 1.3533333333333333, |
|
"grad_norm": 708.9501342773438, |
|
"learning_rate": 1.0886731755344972e-05, |
|
"loss": 11.5035, |
|
"step": 304500 |
|
}, |
|
{ |
|
"epoch": 1.3555555555555556, |
|
"grad_norm": 112.46131896972656, |
|
"learning_rate": 1.0839051683295682e-05, |
|
"loss": 11.0951, |
|
"step": 305000 |
|
}, |
|
{ |
|
"epoch": 1.3577777777777778, |
|
"grad_norm": 42.40409469604492, |
|
"learning_rate": 1.0791417114710543e-05, |
|
"loss": 12.8662, |
|
"step": 305500 |
|
}, |
|
{ |
|
"epoch": 1.3599999999999999, |
|
"grad_norm": 447.1692810058594, |
|
"learning_rate": 1.074382857051356e-05, |
|
"loss": 11.2495, |
|
"step": 306000 |
|
}, |
|
{ |
|
"epoch": 1.3622222222222222, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.0696286571125437e-05, |
|
"loss": 12.0512, |
|
"step": 306500 |
|
}, |
|
{ |
|
"epoch": 1.3644444444444446, |
|
"grad_norm": 1327.175537109375, |
|
"learning_rate": 1.0648791636457847e-05, |
|
"loss": 11.3486, |
|
"step": 307000 |
|
}, |
|
{ |
|
"epoch": 1.3666666666666667, |
|
"grad_norm": 114.16178894042969, |
|
"learning_rate": 1.0601344285907797e-05, |
|
"loss": 12.0348, |
|
"step": 307500 |
|
}, |
|
{ |
|
"epoch": 1.3688888888888888, |
|
"grad_norm": 410.4014587402344, |
|
"learning_rate": 1.0553945038351914e-05, |
|
"loss": 11.0606, |
|
"step": 308000 |
|
}, |
|
{ |
|
"epoch": 1.3711111111111112, |
|
"grad_norm": 205.14894104003906, |
|
"learning_rate": 1.0506594412140768e-05, |
|
"loss": 12.0553, |
|
"step": 308500 |
|
}, |
|
{ |
|
"epoch": 1.3733333333333333, |
|
"grad_norm": 70.4958267211914, |
|
"learning_rate": 1.0459292925093228e-05, |
|
"loss": 11.5397, |
|
"step": 309000 |
|
}, |
|
{ |
|
"epoch": 1.3755555555555556, |
|
"grad_norm": 194.81698608398438, |
|
"learning_rate": 1.0412041094490767e-05, |
|
"loss": 10.2973, |
|
"step": 309500 |
|
}, |
|
{ |
|
"epoch": 1.3777777777777778, |
|
"grad_norm": 15.8478364944458, |
|
"learning_rate": 1.0364839437071848e-05, |
|
"loss": 11.748, |
|
"step": 310000 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 117.78498840332031, |
|
"learning_rate": 1.0317688469026219e-05, |
|
"loss": 11.4108, |
|
"step": 310500 |
|
}, |
|
{ |
|
"epoch": 1.3822222222222222, |
|
"grad_norm": 497.8285217285156, |
|
"learning_rate": 1.0270588705989322e-05, |
|
"loss": 11.4724, |
|
"step": 311000 |
|
}, |
|
{ |
|
"epoch": 1.3844444444444444, |
|
"grad_norm": 339.4550476074219, |
|
"learning_rate": 1.0223540663036624e-05, |
|
"loss": 12.0662, |
|
"step": 311500 |
|
}, |
|
{ |
|
"epoch": 1.3866666666666667, |
|
"grad_norm": 191.90072631835938, |
|
"learning_rate": 1.017654485467797e-05, |
|
"loss": 12.0687, |
|
"step": 312000 |
|
}, |
|
{ |
|
"epoch": 1.3888888888888888, |
|
"grad_norm": 201.05392456054688, |
|
"learning_rate": 1.0129601794852007e-05, |
|
"loss": 12.6799, |
|
"step": 312500 |
|
}, |
|
{ |
|
"epoch": 1.3911111111111112, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.00827119969205e-05, |
|
"loss": 11.8095, |
|
"step": 313000 |
|
}, |
|
{ |
|
"epoch": 1.3933333333333333, |
|
"grad_norm": 110.20486450195312, |
|
"learning_rate": 1.0035875973662787e-05, |
|
"loss": 11.1245, |
|
"step": 313500 |
|
}, |
|
{ |
|
"epoch": 1.3955555555555557, |
|
"grad_norm": 142.47384643554688, |
|
"learning_rate": 9.989094237270094e-06, |
|
"loss": 11.5409, |
|
"step": 314000 |
|
}, |
|
{ |
|
"epoch": 1.3977777777777778, |
|
"grad_norm": 408.63031005859375, |
|
"learning_rate": 9.942367299340003e-06, |
|
"loss": 11.8593, |
|
"step": 314500 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 470.6206970214844, |
|
"learning_rate": 9.89569567087083e-06, |
|
"loss": 11.6008, |
|
"step": 315000 |
|
}, |
|
{ |
|
"epoch": 1.4022222222222223, |
|
"grad_norm": 67.78047180175781, |
|
"learning_rate": 9.84907986225601e-06, |
|
"loss": 10.926, |
|
"step": 315500 |
|
}, |
|
{ |
|
"epoch": 1.4044444444444444, |
|
"grad_norm": 293.5436706542969, |
|
"learning_rate": 9.802520383278574e-06, |
|
"loss": 10.8069, |
|
"step": 316000 |
|
}, |
|
{ |
|
"epoch": 1.4066666666666667, |
|
"grad_norm": 480.68389892578125, |
|
"learning_rate": 9.75601774310551e-06, |
|
"loss": 11.2341, |
|
"step": 316500 |
|
}, |
|
{ |
|
"epoch": 1.4088888888888889, |
|
"grad_norm": 235.30406188964844, |
|
"learning_rate": 9.709572450282253e-06, |
|
"loss": 11.3084, |
|
"step": 317000 |
|
}, |
|
{ |
|
"epoch": 1.411111111111111, |
|
"grad_norm": 977.0435180664062, |
|
"learning_rate": 9.663185012727075e-06, |
|
"loss": 12.978, |
|
"step": 317500 |
|
}, |
|
{ |
|
"epoch": 1.4133333333333333, |
|
"grad_norm": 26.692384719848633, |
|
"learning_rate": 9.61685593772556e-06, |
|
"loss": 11.2446, |
|
"step": 318000 |
|
}, |
|
{ |
|
"epoch": 1.4155555555555557, |
|
"grad_norm": 691.8837280273438, |
|
"learning_rate": 9.570585731925064e-06, |
|
"loss": 11.2801, |
|
"step": 318500 |
|
}, |
|
{ |
|
"epoch": 1.4177777777777778, |
|
"grad_norm": 210.51527404785156, |
|
"learning_rate": 9.524374901329125e-06, |
|
"loss": 10.0809, |
|
"step": 319000 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.478223951292001e-06, |
|
"loss": 11.3325, |
|
"step": 319500 |
|
}, |
|
{ |
|
"epoch": 1.4222222222222223, |
|
"grad_norm": 302.1672668457031, |
|
"learning_rate": 9.432133386513075e-06, |
|
"loss": 10.449, |
|
"step": 320000 |
|
}, |
|
{ |
|
"epoch": 1.4244444444444444, |
|
"grad_norm": 210.6238250732422, |
|
"learning_rate": 9.386103711031384e-06, |
|
"loss": 12.6131, |
|
"step": 320500 |
|
}, |
|
{ |
|
"epoch": 1.4266666666666667, |
|
"grad_norm": 615.0394897460938, |
|
"learning_rate": 9.340135428220081e-06, |
|
"loss": 11.892, |
|
"step": 321000 |
|
}, |
|
{ |
|
"epoch": 1.4288888888888889, |
|
"grad_norm": 44.33654022216797, |
|
"learning_rate": 9.294229040780948e-06, |
|
"loss": 11.7791, |
|
"step": 321500 |
|
}, |
|
{ |
|
"epoch": 1.431111111111111, |
|
"grad_norm": 423.60943603515625, |
|
"learning_rate": 9.248385050738874e-06, |
|
"loss": 11.8577, |
|
"step": 322000 |
|
}, |
|
{ |
|
"epoch": 1.4333333333333333, |
|
"grad_norm": 750.2989501953125, |
|
"learning_rate": 9.202603959436398e-06, |
|
"loss": 11.5078, |
|
"step": 322500 |
|
}, |
|
{ |
|
"epoch": 1.4355555555555555, |
|
"grad_norm": 14.56828784942627, |
|
"learning_rate": 9.156886267528198e-06, |
|
"loss": 11.1005, |
|
"step": 323000 |
|
}, |
|
{ |
|
"epoch": 1.4377777777777778, |
|
"grad_norm": 98.75641632080078, |
|
"learning_rate": 9.111232474975624e-06, |
|
"loss": 10.4616, |
|
"step": 323500 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 58.22282409667969, |
|
"learning_rate": 9.065643081041242e-06, |
|
"loss": 10.8385, |
|
"step": 324000 |
|
}, |
|
{ |
|
"epoch": 1.4422222222222223, |
|
"grad_norm": 95.11531066894531, |
|
"learning_rate": 9.020118584283357e-06, |
|
"loss": 10.93, |
|
"step": 324500 |
|
}, |
|
{ |
|
"epoch": 1.4444444444444444, |
|
"grad_norm": 402.2863464355469, |
|
"learning_rate": 8.974659482550576e-06, |
|
"loss": 10.7504, |
|
"step": 325000 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 500000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 5000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|