{
  "best_metric": 2.3947181701660156,
  "best_model_checkpoint": "../checkpoints/FinalTrains_Runde2_720samples\\checkpoint-2300",
  "epoch": 27.07017543859649,
  "eval_steps": 50,
  "global_step": 2301,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.5847953216374269,
      "grad_norm": 2511.836181640625,
      "learning_rate": 1.6399999999999998e-06,
      "loss": 11.5208,
      "step": 50
    },
    {
      "epoch": 0.5847953216374269,
      "eval_loss": 7.463321208953857,
      "eval_runtime": 48.9349,
      "eval_samples_per_second": 2.943,
      "eval_steps_per_second": 2.943,
      "step": 50
    },
    {
      "epoch": 1.1695906432748537,
      "grad_norm": 582.1835327148438,
      "learning_rate": 1.9672e-06,
      "loss": 5.9458,
      "step": 100
    },
    {
      "epoch": 1.1695906432748537,
      "eval_loss": 4.85849142074585,
      "eval_runtime": 51.5611,
      "eval_samples_per_second": 2.793,
      "eval_steps_per_second": 2.793,
      "step": 100
    },
    {
      "epoch": 1.7543859649122808,
      "grad_norm": 429.77252197265625,
      "learning_rate": 1.9272e-06,
      "loss": 4.5189,
      "step": 150
    },
    {
      "epoch": 1.7543859649122808,
      "eval_loss": 4.148195743560791,
      "eval_runtime": 53.3669,
      "eval_samples_per_second": 2.698,
      "eval_steps_per_second": 2.698,
      "step": 150
    },
    {
      "epoch": 2.3391812865497075,
      "grad_norm": 246.91941833496094,
      "learning_rate": 1.8872e-06,
      "loss": 3.9677,
      "step": 200
    },
    {
      "epoch": 2.3391812865497075,
      "eval_loss": 3.7681965827941895,
      "eval_runtime": 48.4728,
      "eval_samples_per_second": 2.971,
      "eval_steps_per_second": 2.971,
      "step": 200
    },
    {
      "epoch": 2.9239766081871346,
      "grad_norm": 326.88507080078125,
      "learning_rate": 1.8471999999999999e-06,
      "loss": 3.6828,
      "step": 250
    },
    {
      "epoch": 2.9239766081871346,
      "eval_loss": 3.4826574325561523,
      "eval_runtime": 47.8574,
      "eval_samples_per_second": 3.009,
      "eval_steps_per_second": 3.009,
      "step": 250
    },
    {
      "epoch": 3.5087719298245617,
      "grad_norm": 216.14739990234375,
      "learning_rate": 1.8071999999999998e-06,
      "loss": 3.3901,
      "step": 300
    },
    {
      "epoch": 3.5087719298245617,
      "eval_loss": 3.304009437561035,
      "eval_runtime": 48.0808,
      "eval_samples_per_second": 2.995,
      "eval_steps_per_second": 2.995,
      "step": 300
    },
    {
      "epoch": 4.093567251461988,
      "grad_norm": 159.58961486816406,
      "learning_rate": 1.7672e-06,
      "loss": 3.2532,
      "step": 350
    },
    {
      "epoch": 4.093567251461988,
      "eval_loss": 3.1738274097442627,
      "eval_runtime": 47.8126,
      "eval_samples_per_second": 3.012,
      "eval_steps_per_second": 3.012,
      "step": 350
    },
    {
      "epoch": 4.678362573099415,
      "grad_norm": 161.6717071533203,
      "learning_rate": 1.7272e-06,
      "loss": 3.1071,
      "step": 400
    },
    {
      "epoch": 4.678362573099415,
      "eval_loss": 3.0628068447113037,
      "eval_runtime": 48.0252,
      "eval_samples_per_second": 2.998,
      "eval_steps_per_second": 2.998,
      "step": 400
    },
    {
      "epoch": 5.2631578947368425,
      "grad_norm": 146.9420166015625,
      "learning_rate": 1.6872e-06,
      "loss": 2.9944,
      "step": 450
    },
    {
      "epoch": 5.2631578947368425,
      "eval_loss": 2.990321159362793,
      "eval_runtime": 47.8573,
      "eval_samples_per_second": 3.009,
      "eval_steps_per_second": 3.009,
      "step": 450
    },
    {
      "epoch": 5.847953216374269,
      "grad_norm": 233.46127319335938,
      "learning_rate": 1.6471999999999999e-06,
      "loss": 2.898,
      "step": 500
    },
    {
      "epoch": 5.847953216374269,
      "eval_loss": 2.9129867553710938,
      "eval_runtime": 47.7606,
      "eval_samples_per_second": 3.015,
      "eval_steps_per_second": 3.015,
      "step": 500
    },
    {
      "epoch": 6.432748538011696,
      "grad_norm": 217.460205078125,
      "learning_rate": 1.6071999999999998e-06,
      "loss": 2.8098,
      "step": 550
    },
    {
      "epoch": 6.432748538011696,
      "eval_loss": 2.862227439880371,
      "eval_runtime": 47.7417,
      "eval_samples_per_second": 3.016,
      "eval_steps_per_second": 3.016,
      "step": 550
    },
    {
      "epoch": 7.017543859649122,
      "grad_norm": 171.82847595214844,
      "learning_rate": 1.5671999999999998e-06,
      "loss": 2.7165,
      "step": 600
    },
    {
      "epoch": 7.017543859649122,
      "eval_loss": 2.827782392501831,
      "eval_runtime": 47.7602,
      "eval_samples_per_second": 3.015,
      "eval_steps_per_second": 3.015,
      "step": 600
    },
    {
      "epoch": 7.60233918128655,
      "grad_norm": 255.87489318847656,
      "learning_rate": 1.5271999999999998e-06,
      "loss": 2.6446,
      "step": 650
    },
    {
      "epoch": 7.60233918128655,
      "eval_loss": 2.7759788036346436,
      "eval_runtime": 47.7257,
      "eval_samples_per_second": 3.017,
      "eval_steps_per_second": 3.017,
      "step": 650
    },
    {
      "epoch": 8.187134502923977,
      "grad_norm": 349.6673889160156,
      "learning_rate": 1.4872e-06,
      "loss": 2.5134,
      "step": 700
    },
    {
      "epoch": 8.187134502923977,
      "eval_loss": 2.7408220767974854,
      "eval_runtime": 47.7358,
      "eval_samples_per_second": 3.017,
      "eval_steps_per_second": 3.017,
      "step": 700
    },
    {
      "epoch": 8.771929824561404,
      "grad_norm": 303.1741943359375,
      "learning_rate": 1.4471999999999999e-06,
      "loss": 2.4957,
      "step": 750
    },
    {
      "epoch": 8.771929824561404,
      "eval_loss": 2.7205865383148193,
      "eval_runtime": 47.7327,
      "eval_samples_per_second": 3.017,
      "eval_steps_per_second": 3.017,
      "step": 750
    },
    {
      "epoch": 9.35672514619883,
      "grad_norm": 277.3189697265625,
      "learning_rate": 1.4071999999999998e-06,
      "loss": 2.4521,
      "step": 800
    },
    {
      "epoch": 9.35672514619883,
      "eval_loss": 2.699465036392212,
      "eval_runtime": 47.8368,
      "eval_samples_per_second": 3.01,
      "eval_steps_per_second": 3.01,
      "step": 800
    },
    {
      "epoch": 9.941520467836257,
      "grad_norm": 342.6287536621094,
      "learning_rate": 1.3672e-06,
      "loss": 2.3549,
      "step": 850
    },
    {
      "epoch": 9.941520467836257,
      "eval_loss": 2.657285690307617,
      "eval_runtime": 47.7948,
      "eval_samples_per_second": 3.013,
      "eval_steps_per_second": 3.013,
      "step": 850
    },
    {
      "epoch": 10.526315789473685,
      "grad_norm": 314.8951416015625,
      "learning_rate": 1.3272e-06,
      "loss": 2.3118,
      "step": 900
    },
    {
      "epoch": 10.526315789473685,
      "eval_loss": 2.6494998931884766,
      "eval_runtime": 47.8289,
      "eval_samples_per_second": 3.011,
      "eval_steps_per_second": 3.011,
      "step": 900
    },
    {
      "epoch": 11.11111111111111,
      "grad_norm": 259.777099609375,
      "learning_rate": 1.2872e-06,
      "loss": 2.2292,
      "step": 950
    },
    {
      "epoch": 11.11111111111111,
      "eval_loss": 2.6447384357452393,
      "eval_runtime": 47.7489,
      "eval_samples_per_second": 3.016,
      "eval_steps_per_second": 3.016,
      "step": 950
    },
    {
      "epoch": 11.695906432748538,
      "grad_norm": 298.0020446777344,
      "learning_rate": 1.2472e-06,
      "loss": 2.191,
      "step": 1000
    },
    {
      "epoch": 11.695906432748538,
      "eval_loss": 2.6040868759155273,
      "eval_runtime": 47.8314,
      "eval_samples_per_second": 3.011,
      "eval_steps_per_second": 3.011,
      "step": 1000
    },
    {
      "epoch": 12.280701754385966,
      "grad_norm": 276.3546447753906,
      "learning_rate": 1.2072e-06,
      "loss": 2.1521,
      "step": 1050
    },
    {
      "epoch": 12.280701754385966,
      "eval_loss": 2.6078646183013916,
      "eval_runtime": 47.8526,
      "eval_samples_per_second": 3.009,
      "eval_steps_per_second": 3.009,
      "step": 1050
    },
    {
      "epoch": 12.865497076023392,
      "grad_norm": 336.7939453125,
      "learning_rate": 1.1672e-06,
      "loss": 2.0565,
      "step": 1100
    },
    {
      "epoch": 12.865497076023392,
      "eval_loss": 2.5721495151519775,
      "eval_runtime": 47.7559,
      "eval_samples_per_second": 3.015,
      "eval_steps_per_second": 3.015,
      "step": 1100
    },
    {
      "epoch": 13.450292397660819,
      "grad_norm": 302.5251770019531,
      "learning_rate": 1.1272e-06,
      "loss": 2.096,
      "step": 1150
    },
    {
      "epoch": 13.450292397660819,
      "eval_loss": 2.5710203647613525,
      "eval_runtime": 47.8318,
      "eval_samples_per_second": 3.011,
      "eval_steps_per_second": 3.011,
      "step": 1150
    },
    {
      "epoch": 14.035087719298245,
      "grad_norm": 262.70208740234375,
      "learning_rate": 1.0872e-06,
      "loss": 1.9957,
      "step": 1200
    },
    {
      "epoch": 14.035087719298245,
      "eval_loss": 2.561436414718628,
      "eval_runtime": 47.868,
      "eval_samples_per_second": 3.008,
      "eval_steps_per_second": 3.008,
      "step": 1200
    },
    {
      "epoch": 14.619883040935672,
      "grad_norm": 339.61175537109375,
      "learning_rate": 1.0472e-06,
      "loss": 1.9382,
      "step": 1250
    },
    {
      "epoch": 14.619883040935672,
      "eval_loss": 2.544802665710449,
      "eval_runtime": 47.8634,
      "eval_samples_per_second": 3.009,
      "eval_steps_per_second": 3.009,
      "step": 1250
    },
    {
      "epoch": 15.2046783625731,
      "grad_norm": 259.406005859375,
      "learning_rate": 1.0072e-06,
      "loss": 1.9409,
      "step": 1300
    },
    {
      "epoch": 15.2046783625731,
      "eval_loss": 2.541916847229004,
      "eval_runtime": 47.8691,
      "eval_samples_per_second": 3.008,
      "eval_steps_per_second": 3.008,
      "step": 1300
    },
    {
      "epoch": 15.789473684210526,
      "grad_norm": 277.8048400878906,
      "learning_rate": 9.671999999999998e-07,
      "loss": 1.9007,
      "step": 1350
    },
    {
      "epoch": 15.789473684210526,
      "eval_loss": 2.5277278423309326,
      "eval_runtime": 47.8083,
      "eval_samples_per_second": 3.012,
      "eval_steps_per_second": 3.012,
      "step": 1350
    },
    {
      "epoch": 16.374269005847953,
      "grad_norm": 241.31704711914062,
      "learning_rate": 9.272e-07,
      "loss": 1.9013,
      "step": 1400
    },
    {
      "epoch": 16.374269005847953,
      "eval_loss": 2.5117082595825195,
      "eval_runtime": 47.755,
      "eval_samples_per_second": 3.015,
      "eval_steps_per_second": 3.015,
      "step": 1400
    },
    {
      "epoch": 16.95906432748538,
      "grad_norm": 307.3838806152344,
      "learning_rate": 8.872e-07,
      "loss": 1.7729,
      "step": 1450
    },
    {
      "epoch": 16.95906432748538,
      "eval_loss": 2.4952661991119385,
      "eval_runtime": 47.9247,
      "eval_samples_per_second": 3.005,
      "eval_steps_per_second": 3.005,
      "step": 1450
    },
    {
      "epoch": 17.54385964912281,
      "grad_norm": 264.11541748046875,
      "learning_rate": 8.471999999999999e-07,
      "loss": 1.7591,
      "step": 1500
    },
    {
      "epoch": 17.54385964912281,
      "eval_loss": 2.4815564155578613,
      "eval_runtime": 47.8623,
      "eval_samples_per_second": 3.009,
      "eval_steps_per_second": 3.009,
      "step": 1500
    },
    {
      "epoch": 18.128654970760234,
      "grad_norm": 268.2859802246094,
      "learning_rate": 8.072e-07,
      "loss": 1.787,
      "step": 1550
    },
    {
      "epoch": 18.128654970760234,
      "eval_loss": 2.4869062900543213,
      "eval_runtime": 47.8575,
      "eval_samples_per_second": 3.009,
      "eval_steps_per_second": 3.009,
      "step": 1550
    },
    {
      "epoch": 18.71345029239766,
      "grad_norm": 245.68939208984375,
      "learning_rate": 7.671999999999999e-07,
      "loss": 1.7344,
      "step": 1600
    },
    {
      "epoch": 18.71345029239766,
      "eval_loss": 2.4752299785614014,
      "eval_runtime": 47.7262,
      "eval_samples_per_second": 3.017,
      "eval_steps_per_second": 3.017,
      "step": 1600
    },
    {
      "epoch": 19.29824561403509,
      "grad_norm": 328.9492492675781,
      "learning_rate": 7.271999999999999e-07,
      "loss": 1.6783,
      "step": 1650
    },
    {
      "epoch": 19.29824561403509,
      "eval_loss": 2.4677534103393555,
      "eval_runtime": 47.7929,
      "eval_samples_per_second": 3.013,
      "eval_steps_per_second": 3.013,
      "step": 1650
    },
    {
      "epoch": 19.883040935672515,
      "grad_norm": 271.9133605957031,
      "learning_rate": 6.872e-07,
      "loss": 1.6848,
      "step": 1700
    },
    {
      "epoch": 19.883040935672515,
      "eval_loss": 2.458709478378296,
      "eval_runtime": 47.8355,
      "eval_samples_per_second": 3.01,
      "eval_steps_per_second": 3.01,
      "step": 1700
    },
    {
      "epoch": 20.46783625730994,
      "grad_norm": 280.5169372558594,
      "learning_rate": 6.471999999999999e-07,
      "loss": 1.6688,
      "step": 1750
    },
    {
      "epoch": 20.46783625730994,
      "eval_loss": 2.451404333114624,
      "eval_runtime": 47.8049,
      "eval_samples_per_second": 3.012,
      "eval_steps_per_second": 3.012,
      "step": 1750
    },
    {
      "epoch": 21.05263157894737,
      "grad_norm": 255.56361389160156,
      "learning_rate": 6.072e-07,
      "loss": 1.5875,
      "step": 1800
    },
    {
      "epoch": 21.05263157894737,
      "eval_loss": 2.446828842163086,
      "eval_runtime": 47.8033,
      "eval_samples_per_second": 3.012,
      "eval_steps_per_second": 3.012,
      "step": 1800
    },
    {
      "epoch": 21.637426900584796,
      "grad_norm": 307.4700622558594,
      "learning_rate": 5.672e-07,
      "loss": 1.5884,
      "step": 1850
    },
    {
      "epoch": 21.637426900584796,
      "eval_loss": 2.4349937438964844,
      "eval_runtime": 47.8413,
      "eval_samples_per_second": 3.01,
      "eval_steps_per_second": 3.01,
      "step": 1850
    },
    {
      "epoch": 22.22222222222222,
      "grad_norm": 283.2776184082031,
      "learning_rate": 5.272e-07,
      "loss": 1.6004,
      "step": 1900
    },
    {
      "epoch": 22.22222222222222,
      "eval_loss": 2.434246063232422,
      "eval_runtime": 47.7863,
      "eval_samples_per_second": 3.013,
      "eval_steps_per_second": 3.013,
      "step": 1900
    },
    {
      "epoch": 22.80701754385965,
      "grad_norm": 250.710693359375,
      "learning_rate": 4.872e-07,
      "loss": 1.5452,
      "step": 1950
    },
    {
      "epoch": 22.80701754385965,
      "eval_loss": 2.4212119579315186,
      "eval_runtime": 47.8378,
      "eval_samples_per_second": 3.01,
      "eval_steps_per_second": 3.01,
      "step": 1950
    },
    {
      "epoch": 23.391812865497077,
      "grad_norm": 270.8255310058594,
      "learning_rate": 4.472e-07,
      "loss": 1.541,
      "step": 2000
    },
    {
      "epoch": 23.391812865497077,
      "eval_loss": 2.417588472366333,
      "eval_runtime": 47.7987,
      "eval_samples_per_second": 3.013,
      "eval_steps_per_second": 3.013,
      "step": 2000
    },
    {
      "epoch": 23.976608187134502,
      "grad_norm": 254.624755859375,
      "learning_rate": 4.072e-07,
      "loss": 1.4856,
      "step": 2050
    },
    {
      "epoch": 23.976608187134502,
      "eval_loss": 2.409721851348877,
      "eval_runtime": 47.8588,
      "eval_samples_per_second": 3.009,
      "eval_steps_per_second": 3.009,
      "step": 2050
    },
    {
      "epoch": 24.56140350877193,
      "grad_norm": 317.4898986816406,
      "learning_rate": 3.672e-07,
      "loss": 1.5132,
      "step": 2100
    },
    {
      "epoch": 24.56140350877193,
      "eval_loss": 2.408442497253418,
      "eval_runtime": 47.9177,
      "eval_samples_per_second": 3.005,
      "eval_steps_per_second": 3.005,
      "step": 2100
    },
    {
      "epoch": 25.146198830409357,
      "grad_norm": 276.2957763671875,
      "learning_rate": 3.2719999999999997e-07,
      "loss": 1.4667,
      "step": 2150
    },
    {
      "epoch": 25.146198830409357,
      "eval_loss": 2.407714605331421,
      "eval_runtime": 47.6759,
      "eval_samples_per_second": 3.02,
      "eval_steps_per_second": 3.02,
      "step": 2150
    },
    {
      "epoch": 25.730994152046783,
      "grad_norm": 269.7268981933594,
      "learning_rate": 2.872e-07,
      "loss": 1.4582,
      "step": 2200
    },
    {
      "epoch": 25.730994152046783,
      "eval_loss": 2.3980703353881836,
      "eval_runtime": 47.9246,
      "eval_samples_per_second": 3.005,
      "eval_steps_per_second": 3.005,
      "step": 2200
    },
    {
      "epoch": 26.31578947368421,
      "grad_norm": 259.5769958496094,
      "learning_rate": 2.472e-07,
      "loss": 1.4627,
      "step": 2250
    },
    {
      "epoch": 26.31578947368421,
      "eval_loss": 2.396810293197632,
      "eval_runtime": 47.988,
      "eval_samples_per_second": 3.001,
      "eval_steps_per_second": 3.001,
      "step": 2250
    },
    {
      "epoch": 26.900584795321638,
      "grad_norm": 229.35121154785156,
      "learning_rate": 2.0719999999999998e-07,
      "loss": 1.4554,
      "step": 2300
    },
    {
      "epoch": 26.900584795321638,
      "eval_loss": 2.3947181701660156,
      "eval_runtime": 47.8901,
      "eval_samples_per_second": 3.007,
      "eval_steps_per_second": 3.007,
      "step": 2300
    },
    {
      "epoch": 27.07017543859649,
      "step": 2301,
      "total_flos": 0.0,
      "train_loss": 0.0005499045261348242,
      "train_runtime": 49.212,
      "train_samples_per_second": 1496.22,
      "train_steps_per_second": 46.757
    }
  ],
  "logging_steps": 50,
  "max_steps": 2301,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 28,
  "save_steps": 50,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 5,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}