havok2's picture
Upload 17 files
9e7fec5 verified
{
"best_metric": 2.3947181701660156,
"best_model_checkpoint": "../checkpoints/FinalTrains_Runde2_720samples\\checkpoint-2300",
"epoch": 27.07017543859649,
"eval_steps": 50,
"global_step": 2301,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.5847953216374269,
"grad_norm": 2511.836181640625,
"learning_rate": 1.6399999999999998e-06,
"loss": 11.5208,
"step": 50
},
{
"epoch": 0.5847953216374269,
"eval_loss": 7.463321208953857,
"eval_runtime": 48.9349,
"eval_samples_per_second": 2.943,
"eval_steps_per_second": 2.943,
"step": 50
},
{
"epoch": 1.1695906432748537,
"grad_norm": 582.1835327148438,
"learning_rate": 1.9672e-06,
"loss": 5.9458,
"step": 100
},
{
"epoch": 1.1695906432748537,
"eval_loss": 4.85849142074585,
"eval_runtime": 51.5611,
"eval_samples_per_second": 2.793,
"eval_steps_per_second": 2.793,
"step": 100
},
{
"epoch": 1.7543859649122808,
"grad_norm": 429.77252197265625,
"learning_rate": 1.9272e-06,
"loss": 4.5189,
"step": 150
},
{
"epoch": 1.7543859649122808,
"eval_loss": 4.148195743560791,
"eval_runtime": 53.3669,
"eval_samples_per_second": 2.698,
"eval_steps_per_second": 2.698,
"step": 150
},
{
"epoch": 2.3391812865497075,
"grad_norm": 246.91941833496094,
"learning_rate": 1.8872e-06,
"loss": 3.9677,
"step": 200
},
{
"epoch": 2.3391812865497075,
"eval_loss": 3.7681965827941895,
"eval_runtime": 48.4728,
"eval_samples_per_second": 2.971,
"eval_steps_per_second": 2.971,
"step": 200
},
{
"epoch": 2.9239766081871346,
"grad_norm": 326.88507080078125,
"learning_rate": 1.8471999999999999e-06,
"loss": 3.6828,
"step": 250
},
{
"epoch": 2.9239766081871346,
"eval_loss": 3.4826574325561523,
"eval_runtime": 47.8574,
"eval_samples_per_second": 3.009,
"eval_steps_per_second": 3.009,
"step": 250
},
{
"epoch": 3.5087719298245617,
"grad_norm": 216.14739990234375,
"learning_rate": 1.8071999999999998e-06,
"loss": 3.3901,
"step": 300
},
{
"epoch": 3.5087719298245617,
"eval_loss": 3.304009437561035,
"eval_runtime": 48.0808,
"eval_samples_per_second": 2.995,
"eval_steps_per_second": 2.995,
"step": 300
},
{
"epoch": 4.093567251461988,
"grad_norm": 159.58961486816406,
"learning_rate": 1.7672e-06,
"loss": 3.2532,
"step": 350
},
{
"epoch": 4.093567251461988,
"eval_loss": 3.1738274097442627,
"eval_runtime": 47.8126,
"eval_samples_per_second": 3.012,
"eval_steps_per_second": 3.012,
"step": 350
},
{
"epoch": 4.678362573099415,
"grad_norm": 161.6717071533203,
"learning_rate": 1.7272e-06,
"loss": 3.1071,
"step": 400
},
{
"epoch": 4.678362573099415,
"eval_loss": 3.0628068447113037,
"eval_runtime": 48.0252,
"eval_samples_per_second": 2.998,
"eval_steps_per_second": 2.998,
"step": 400
},
{
"epoch": 5.2631578947368425,
"grad_norm": 146.9420166015625,
"learning_rate": 1.6872e-06,
"loss": 2.9944,
"step": 450
},
{
"epoch": 5.2631578947368425,
"eval_loss": 2.990321159362793,
"eval_runtime": 47.8573,
"eval_samples_per_second": 3.009,
"eval_steps_per_second": 3.009,
"step": 450
},
{
"epoch": 5.847953216374269,
"grad_norm": 233.46127319335938,
"learning_rate": 1.6471999999999999e-06,
"loss": 2.898,
"step": 500
},
{
"epoch": 5.847953216374269,
"eval_loss": 2.9129867553710938,
"eval_runtime": 47.7606,
"eval_samples_per_second": 3.015,
"eval_steps_per_second": 3.015,
"step": 500
},
{
"epoch": 6.432748538011696,
"grad_norm": 217.460205078125,
"learning_rate": 1.6071999999999998e-06,
"loss": 2.8098,
"step": 550
},
{
"epoch": 6.432748538011696,
"eval_loss": 2.862227439880371,
"eval_runtime": 47.7417,
"eval_samples_per_second": 3.016,
"eval_steps_per_second": 3.016,
"step": 550
},
{
"epoch": 7.017543859649122,
"grad_norm": 171.82847595214844,
"learning_rate": 1.5671999999999998e-06,
"loss": 2.7165,
"step": 600
},
{
"epoch": 7.017543859649122,
"eval_loss": 2.827782392501831,
"eval_runtime": 47.7602,
"eval_samples_per_second": 3.015,
"eval_steps_per_second": 3.015,
"step": 600
},
{
"epoch": 7.60233918128655,
"grad_norm": 255.87489318847656,
"learning_rate": 1.5271999999999998e-06,
"loss": 2.6446,
"step": 650
},
{
"epoch": 7.60233918128655,
"eval_loss": 2.7759788036346436,
"eval_runtime": 47.7257,
"eval_samples_per_second": 3.017,
"eval_steps_per_second": 3.017,
"step": 650
},
{
"epoch": 8.187134502923977,
"grad_norm": 349.6673889160156,
"learning_rate": 1.4872e-06,
"loss": 2.5134,
"step": 700
},
{
"epoch": 8.187134502923977,
"eval_loss": 2.7408220767974854,
"eval_runtime": 47.7358,
"eval_samples_per_second": 3.017,
"eval_steps_per_second": 3.017,
"step": 700
},
{
"epoch": 8.771929824561404,
"grad_norm": 303.1741943359375,
"learning_rate": 1.4471999999999999e-06,
"loss": 2.4957,
"step": 750
},
{
"epoch": 8.771929824561404,
"eval_loss": 2.7205865383148193,
"eval_runtime": 47.7327,
"eval_samples_per_second": 3.017,
"eval_steps_per_second": 3.017,
"step": 750
},
{
"epoch": 9.35672514619883,
"grad_norm": 277.3189697265625,
"learning_rate": 1.4071999999999998e-06,
"loss": 2.4521,
"step": 800
},
{
"epoch": 9.35672514619883,
"eval_loss": 2.699465036392212,
"eval_runtime": 47.8368,
"eval_samples_per_second": 3.01,
"eval_steps_per_second": 3.01,
"step": 800
},
{
"epoch": 9.941520467836257,
"grad_norm": 342.6287536621094,
"learning_rate": 1.3672e-06,
"loss": 2.3549,
"step": 850
},
{
"epoch": 9.941520467836257,
"eval_loss": 2.657285690307617,
"eval_runtime": 47.7948,
"eval_samples_per_second": 3.013,
"eval_steps_per_second": 3.013,
"step": 850
},
{
"epoch": 10.526315789473685,
"grad_norm": 314.8951416015625,
"learning_rate": 1.3272e-06,
"loss": 2.3118,
"step": 900
},
{
"epoch": 10.526315789473685,
"eval_loss": 2.6494998931884766,
"eval_runtime": 47.8289,
"eval_samples_per_second": 3.011,
"eval_steps_per_second": 3.011,
"step": 900
},
{
"epoch": 11.11111111111111,
"grad_norm": 259.777099609375,
"learning_rate": 1.2872e-06,
"loss": 2.2292,
"step": 950
},
{
"epoch": 11.11111111111111,
"eval_loss": 2.6447384357452393,
"eval_runtime": 47.7489,
"eval_samples_per_second": 3.016,
"eval_steps_per_second": 3.016,
"step": 950
},
{
"epoch": 11.695906432748538,
"grad_norm": 298.0020446777344,
"learning_rate": 1.2472e-06,
"loss": 2.191,
"step": 1000
},
{
"epoch": 11.695906432748538,
"eval_loss": 2.6040868759155273,
"eval_runtime": 47.8314,
"eval_samples_per_second": 3.011,
"eval_steps_per_second": 3.011,
"step": 1000
},
{
"epoch": 12.280701754385966,
"grad_norm": 276.3546447753906,
"learning_rate": 1.2072e-06,
"loss": 2.1521,
"step": 1050
},
{
"epoch": 12.280701754385966,
"eval_loss": 2.6078646183013916,
"eval_runtime": 47.8526,
"eval_samples_per_second": 3.009,
"eval_steps_per_second": 3.009,
"step": 1050
},
{
"epoch": 12.865497076023392,
"grad_norm": 336.7939453125,
"learning_rate": 1.1672e-06,
"loss": 2.0565,
"step": 1100
},
{
"epoch": 12.865497076023392,
"eval_loss": 2.5721495151519775,
"eval_runtime": 47.7559,
"eval_samples_per_second": 3.015,
"eval_steps_per_second": 3.015,
"step": 1100
},
{
"epoch": 13.450292397660819,
"grad_norm": 302.5251770019531,
"learning_rate": 1.1272e-06,
"loss": 2.096,
"step": 1150
},
{
"epoch": 13.450292397660819,
"eval_loss": 2.5710203647613525,
"eval_runtime": 47.8318,
"eval_samples_per_second": 3.011,
"eval_steps_per_second": 3.011,
"step": 1150
},
{
"epoch": 14.035087719298245,
"grad_norm": 262.70208740234375,
"learning_rate": 1.0872e-06,
"loss": 1.9957,
"step": 1200
},
{
"epoch": 14.035087719298245,
"eval_loss": 2.561436414718628,
"eval_runtime": 47.868,
"eval_samples_per_second": 3.008,
"eval_steps_per_second": 3.008,
"step": 1200
},
{
"epoch": 14.619883040935672,
"grad_norm": 339.61175537109375,
"learning_rate": 1.0472e-06,
"loss": 1.9382,
"step": 1250
},
{
"epoch": 14.619883040935672,
"eval_loss": 2.544802665710449,
"eval_runtime": 47.8634,
"eval_samples_per_second": 3.009,
"eval_steps_per_second": 3.009,
"step": 1250
},
{
"epoch": 15.2046783625731,
"grad_norm": 259.406005859375,
"learning_rate": 1.0072e-06,
"loss": 1.9409,
"step": 1300
},
{
"epoch": 15.2046783625731,
"eval_loss": 2.541916847229004,
"eval_runtime": 47.8691,
"eval_samples_per_second": 3.008,
"eval_steps_per_second": 3.008,
"step": 1300
},
{
"epoch": 15.789473684210526,
"grad_norm": 277.8048400878906,
"learning_rate": 9.671999999999998e-07,
"loss": 1.9007,
"step": 1350
},
{
"epoch": 15.789473684210526,
"eval_loss": 2.5277278423309326,
"eval_runtime": 47.8083,
"eval_samples_per_second": 3.012,
"eval_steps_per_second": 3.012,
"step": 1350
},
{
"epoch": 16.374269005847953,
"grad_norm": 241.31704711914062,
"learning_rate": 9.272e-07,
"loss": 1.9013,
"step": 1400
},
{
"epoch": 16.374269005847953,
"eval_loss": 2.5117082595825195,
"eval_runtime": 47.755,
"eval_samples_per_second": 3.015,
"eval_steps_per_second": 3.015,
"step": 1400
},
{
"epoch": 16.95906432748538,
"grad_norm": 307.3838806152344,
"learning_rate": 8.872e-07,
"loss": 1.7729,
"step": 1450
},
{
"epoch": 16.95906432748538,
"eval_loss": 2.4952661991119385,
"eval_runtime": 47.9247,
"eval_samples_per_second": 3.005,
"eval_steps_per_second": 3.005,
"step": 1450
},
{
"epoch": 17.54385964912281,
"grad_norm": 264.11541748046875,
"learning_rate": 8.471999999999999e-07,
"loss": 1.7591,
"step": 1500
},
{
"epoch": 17.54385964912281,
"eval_loss": 2.4815564155578613,
"eval_runtime": 47.8623,
"eval_samples_per_second": 3.009,
"eval_steps_per_second": 3.009,
"step": 1500
},
{
"epoch": 18.128654970760234,
"grad_norm": 268.2859802246094,
"learning_rate": 8.072e-07,
"loss": 1.787,
"step": 1550
},
{
"epoch": 18.128654970760234,
"eval_loss": 2.4869062900543213,
"eval_runtime": 47.8575,
"eval_samples_per_second": 3.009,
"eval_steps_per_second": 3.009,
"step": 1550
},
{
"epoch": 18.71345029239766,
"grad_norm": 245.68939208984375,
"learning_rate": 7.671999999999999e-07,
"loss": 1.7344,
"step": 1600
},
{
"epoch": 18.71345029239766,
"eval_loss": 2.4752299785614014,
"eval_runtime": 47.7262,
"eval_samples_per_second": 3.017,
"eval_steps_per_second": 3.017,
"step": 1600
},
{
"epoch": 19.29824561403509,
"grad_norm": 328.9492492675781,
"learning_rate": 7.271999999999999e-07,
"loss": 1.6783,
"step": 1650
},
{
"epoch": 19.29824561403509,
"eval_loss": 2.4677534103393555,
"eval_runtime": 47.7929,
"eval_samples_per_second": 3.013,
"eval_steps_per_second": 3.013,
"step": 1650
},
{
"epoch": 19.883040935672515,
"grad_norm": 271.9133605957031,
"learning_rate": 6.872e-07,
"loss": 1.6848,
"step": 1700
},
{
"epoch": 19.883040935672515,
"eval_loss": 2.458709478378296,
"eval_runtime": 47.8355,
"eval_samples_per_second": 3.01,
"eval_steps_per_second": 3.01,
"step": 1700
},
{
"epoch": 20.46783625730994,
"grad_norm": 280.5169372558594,
"learning_rate": 6.471999999999999e-07,
"loss": 1.6688,
"step": 1750
},
{
"epoch": 20.46783625730994,
"eval_loss": 2.451404333114624,
"eval_runtime": 47.8049,
"eval_samples_per_second": 3.012,
"eval_steps_per_second": 3.012,
"step": 1750
},
{
"epoch": 21.05263157894737,
"grad_norm": 255.56361389160156,
"learning_rate": 6.072e-07,
"loss": 1.5875,
"step": 1800
},
{
"epoch": 21.05263157894737,
"eval_loss": 2.446828842163086,
"eval_runtime": 47.8033,
"eval_samples_per_second": 3.012,
"eval_steps_per_second": 3.012,
"step": 1800
},
{
"epoch": 21.637426900584796,
"grad_norm": 307.4700622558594,
"learning_rate": 5.672e-07,
"loss": 1.5884,
"step": 1850
},
{
"epoch": 21.637426900584796,
"eval_loss": 2.4349937438964844,
"eval_runtime": 47.8413,
"eval_samples_per_second": 3.01,
"eval_steps_per_second": 3.01,
"step": 1850
},
{
"epoch": 22.22222222222222,
"grad_norm": 283.2776184082031,
"learning_rate": 5.272e-07,
"loss": 1.6004,
"step": 1900
},
{
"epoch": 22.22222222222222,
"eval_loss": 2.434246063232422,
"eval_runtime": 47.7863,
"eval_samples_per_second": 3.013,
"eval_steps_per_second": 3.013,
"step": 1900
},
{
"epoch": 22.80701754385965,
"grad_norm": 250.710693359375,
"learning_rate": 4.872e-07,
"loss": 1.5452,
"step": 1950
},
{
"epoch": 22.80701754385965,
"eval_loss": 2.4212119579315186,
"eval_runtime": 47.8378,
"eval_samples_per_second": 3.01,
"eval_steps_per_second": 3.01,
"step": 1950
},
{
"epoch": 23.391812865497077,
"grad_norm": 270.8255310058594,
"learning_rate": 4.472e-07,
"loss": 1.541,
"step": 2000
},
{
"epoch": 23.391812865497077,
"eval_loss": 2.417588472366333,
"eval_runtime": 47.7987,
"eval_samples_per_second": 3.013,
"eval_steps_per_second": 3.013,
"step": 2000
},
{
"epoch": 23.976608187134502,
"grad_norm": 254.624755859375,
"learning_rate": 4.072e-07,
"loss": 1.4856,
"step": 2050
},
{
"epoch": 23.976608187134502,
"eval_loss": 2.409721851348877,
"eval_runtime": 47.8588,
"eval_samples_per_second": 3.009,
"eval_steps_per_second": 3.009,
"step": 2050
},
{
"epoch": 24.56140350877193,
"grad_norm": 317.4898986816406,
"learning_rate": 3.672e-07,
"loss": 1.5132,
"step": 2100
},
{
"epoch": 24.56140350877193,
"eval_loss": 2.408442497253418,
"eval_runtime": 47.9177,
"eval_samples_per_second": 3.005,
"eval_steps_per_second": 3.005,
"step": 2100
},
{
"epoch": 25.146198830409357,
"grad_norm": 276.2957763671875,
"learning_rate": 3.2719999999999997e-07,
"loss": 1.4667,
"step": 2150
},
{
"epoch": 25.146198830409357,
"eval_loss": 2.407714605331421,
"eval_runtime": 47.6759,
"eval_samples_per_second": 3.02,
"eval_steps_per_second": 3.02,
"step": 2150
},
{
"epoch": 25.730994152046783,
"grad_norm": 269.7268981933594,
"learning_rate": 2.872e-07,
"loss": 1.4582,
"step": 2200
},
{
"epoch": 25.730994152046783,
"eval_loss": 2.3980703353881836,
"eval_runtime": 47.9246,
"eval_samples_per_second": 3.005,
"eval_steps_per_second": 3.005,
"step": 2200
},
{
"epoch": 26.31578947368421,
"grad_norm": 259.5769958496094,
"learning_rate": 2.472e-07,
"loss": 1.4627,
"step": 2250
},
{
"epoch": 26.31578947368421,
"eval_loss": 2.396810293197632,
"eval_runtime": 47.988,
"eval_samples_per_second": 3.001,
"eval_steps_per_second": 3.001,
"step": 2250
},
{
"epoch": 26.900584795321638,
"grad_norm": 229.35121154785156,
"learning_rate": 2.0719999999999998e-07,
"loss": 1.4554,
"step": 2300
},
{
"epoch": 26.900584795321638,
"eval_loss": 2.3947181701660156,
"eval_runtime": 47.8901,
"eval_samples_per_second": 3.007,
"eval_steps_per_second": 3.007,
"step": 2300
},
{
"epoch": 27.07017543859649,
"step": 2301,
"total_flos": 0.0,
"train_loss": 0.0005499045261348242,
"train_runtime": 49.212,
"train_samples_per_second": 1496.22,
"train_steps_per_second": 46.757
}
],
"logging_steps": 50,
"max_steps": 2301,
"num_input_tokens_seen": 0,
"num_train_epochs": 28,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}