{ |
|
"best_global_step": 1200, |
|
"best_metric": 1.777273416519165, |
|
"best_model_checkpoint": "./output_dir/th-Llama-3.1-8B-lr4e-06-atten0.25-ffn0.25_20250430_142946/checkpoint-1200", |
|
"epoch": 0.7199856002879942, |
|
"eval_steps": 50, |
|
"global_step": 1200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0005999880002399952, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 0.0, |
|
"loss": 1.7675, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0011999760004799903, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 8e-08, |
|
"loss": 1.6481, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0017999640007199855, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 1.6e-07, |
|
"loss": 1.6866, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0023999520009599807, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 2.4e-07, |
|
"loss": 1.7062, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.002999940001199976, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 3.2e-07, |
|
"loss": 1.774, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.003599928001439971, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 4e-07, |
|
"loss": 1.6995, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.004199916001679967, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 4.8e-07, |
|
"loss": 1.6063, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.004799904001919961, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 5.6e-07, |
|
"loss": 1.5995, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.005399892002159957, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 6.4e-07, |
|
"loss": 1.7045, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.005999880002399952, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 7.2e-07, |
|
"loss": 1.5958, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.006599868002639947, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 8e-07, |
|
"loss": 1.6149, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.007199856002879942, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 8.799999999999999e-07, |
|
"loss": 1.7559, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.007799844003119938, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 9.6e-07, |
|
"loss": 1.682, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.008399832003359933, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.04e-06, |
|
"loss": 1.6184, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.008999820003599928, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.12e-06, |
|
"loss": 1.7178, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.009599808003839923, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 1.2e-06, |
|
"loss": 1.5901, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.01019979600407992, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 1.28e-06, |
|
"loss": 1.6869, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.010799784004319914, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 1.3600000000000001e-06, |
|
"loss": 1.6398, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.011399772004559909, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.44e-06, |
|
"loss": 1.7697, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.011999760004799903, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.5199999999999998e-06, |
|
"loss": 1.7484, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0125997480050399, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.6e-06, |
|
"loss": 1.7561, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.013199736005279895, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.6799999999999998e-06, |
|
"loss": 1.5346, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.01379972400551989, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.7599999999999999e-06, |
|
"loss": 1.7269, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.014399712005759884, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 1.84e-06, |
|
"loss": 1.6799, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.01499970000599988, |
|
"grad_norm": 2.25, |
|
"learning_rate": 1.92e-06, |
|
"loss": 1.6713, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.015599688006239875, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 2e-06, |
|
"loss": 1.6378, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.016199676006479872, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 2.08e-06, |
|
"loss": 1.7315, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.016799664006719867, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 2.16e-06, |
|
"loss": 1.7283, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.01739965200695986, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 2.24e-06, |
|
"loss": 1.7627, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.017999640007199856, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 2.32e-06, |
|
"loss": 1.6382, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.01859962800743985, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 2.4e-06, |
|
"loss": 1.6786, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.019199616007679846, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 2.48e-06, |
|
"loss": 1.6262, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.01979960400791984, |
|
"grad_norm": 2.0, |
|
"learning_rate": 2.56e-06, |
|
"loss": 1.6589, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.02039959200815984, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 2.64e-06, |
|
"loss": 1.671, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.020999580008399833, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 2.7200000000000002e-06, |
|
"loss": 1.7393, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.021599568008639828, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 2.8e-06, |
|
"loss": 1.6027, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.022199556008879823, |
|
"grad_norm": 2.0, |
|
"learning_rate": 2.88e-06, |
|
"loss": 1.8158, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.022799544009119817, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 2.96e-06, |
|
"loss": 1.7158, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.023399532009359812, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 3.0399999999999997e-06, |
|
"loss": 1.7778, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.023999520009599807, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 3.1199999999999998e-06, |
|
"loss": 1.6903, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0245995080098398, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 3.2e-06, |
|
"loss": 1.6403, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.0251994960100798, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 3.2799999999999995e-06, |
|
"loss": 1.7292, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.025799484010319795, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 3.3599999999999996e-06, |
|
"loss": 1.6956, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.02639947201055979, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 3.4399999999999997e-06, |
|
"loss": 1.6927, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.026999460010799784, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 3.5199999999999998e-06, |
|
"loss": 1.6794, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.02759944801103978, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 3.6e-06, |
|
"loss": 1.7373, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.028199436011279774, |
|
"grad_norm": 2.0, |
|
"learning_rate": 3.68e-06, |
|
"loss": 1.6971, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.02879942401151977, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 3.7599999999999996e-06, |
|
"loss": 1.7465, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.029399412011759767, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 3.84e-06, |
|
"loss": 1.6621, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.02999940001199976, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 3.92e-06, |
|
"loss": 1.6802, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.02999940001199976, |
|
"eval_loss": 1.7964001893997192, |
|
"eval_model_preparation_time": 0.0037, |
|
"eval_runtime": 66.1162, |
|
"eval_samples_per_second": 151.249, |
|
"eval_steps_per_second": 25.213, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.030599388012239756, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 4e-06, |
|
"loss": 1.6904, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.03119937601247975, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 3.997524752475248e-06, |
|
"loss": 1.6148, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.031799364012719745, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 3.9950495049504945e-06, |
|
"loss": 1.6905, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.032399352012959744, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 3.992574257425742e-06, |
|
"loss": 1.7708, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.032999340013199735, |
|
"grad_norm": 1.75, |
|
"learning_rate": 3.99009900990099e-06, |
|
"loss": 1.7365, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.03359932801343973, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 3.987623762376238e-06, |
|
"loss": 1.5333, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.034199316013679724, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 3.985148514851485e-06, |
|
"loss": 1.7474, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.03479930401391972, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 3.9826732673267325e-06, |
|
"loss": 1.667, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.035399292014159714, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 3.98019801980198e-06, |
|
"loss": 1.652, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.03599928001439971, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 3.977722772277228e-06, |
|
"loss": 1.6359, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.03659926801463971, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 3.975247524752475e-06, |
|
"loss": 1.6885, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.0371992560148797, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 3.972772277227723e-06, |
|
"loss": 1.6758, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.0377992440151197, |
|
"grad_norm": 2.0, |
|
"learning_rate": 3.9702970297029705e-06, |
|
"loss": 1.6944, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.03839923201535969, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 3.967821782178218e-06, |
|
"loss": 1.5645, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.03899922001559969, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 3.965346534653465e-06, |
|
"loss": 1.7534, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.03959920801583968, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 3.962871287128713e-06, |
|
"loss": 1.6782, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.04019919601607968, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 3.96039603960396e-06, |
|
"loss": 1.6826, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.04079918401631968, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 3.957920792079208e-06, |
|
"loss": 1.692, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.04139917201655967, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 3.955445544554455e-06, |
|
"loss": 1.5594, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.041999160016799666, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 3.952970297029703e-06, |
|
"loss": 1.677, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.04259914801703966, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 3.95049504950495e-06, |
|
"loss": 1.6147, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.043199136017279656, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 3.948019801980198e-06, |
|
"loss": 1.6306, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.04379912401751965, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 3.945544554455446e-06, |
|
"loss": 1.5884, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.044399112017759645, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 3.943069306930693e-06, |
|
"loss": 1.5904, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.04499910001799964, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 3.94059405940594e-06, |
|
"loss": 1.628, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.045599088018239635, |
|
"grad_norm": 1.875, |
|
"learning_rate": 3.938118811881188e-06, |
|
"loss": 1.7228, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.04619907601847963, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 3.935643564356436e-06, |
|
"loss": 1.7077, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.046799064018719624, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 3.933168316831683e-06, |
|
"loss": 1.5831, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.04739905201895962, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 3.9306930693069305e-06, |
|
"loss": 1.6097, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.047999040019199614, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 3.928217821782178e-06, |
|
"loss": 1.6748, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.04859902801943961, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 3.925742574257425e-06, |
|
"loss": 1.598, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.0491990160196796, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 3.923267326732673e-06, |
|
"loss": 1.6844, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.0497990040199196, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 3.920792079207921e-06, |
|
"loss": 1.7564, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.0503989920201596, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 3.9183168316831685e-06, |
|
"loss": 1.6621, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.05099898002039959, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 3.915841584158415e-06, |
|
"loss": 1.6924, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.05159896802063959, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 3.913366336633663e-06, |
|
"loss": 1.6198, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.05219895602087958, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 3.910891089108911e-06, |
|
"loss": 1.6174, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.05279894402111958, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 3.908415841584159e-06, |
|
"loss": 1.67, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.05339893202135957, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 3.905940594059406e-06, |
|
"loss": 1.7446, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.05399892002159957, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 3.903465346534653e-06, |
|
"loss": 1.6408, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.054598908021839566, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 3.9009900990099e-06, |
|
"loss": 1.7832, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.05519889602207956, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 3.898514851485148e-06, |
|
"loss": 1.6397, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.055798884022319556, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 3.896039603960396e-06, |
|
"loss": 1.6805, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.05639887202255955, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 3.893564356435644e-06, |
|
"loss": 1.742, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.056998860022799545, |
|
"grad_norm": 1.875, |
|
"learning_rate": 3.8910891089108905e-06, |
|
"loss": 1.5952, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.05759884802303954, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 3.888613861386138e-06, |
|
"loss": 1.7769, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.058198836023279535, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 3.886138613861386e-06, |
|
"loss": 1.6168, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.05879882402351953, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 3.883663366336634e-06, |
|
"loss": 1.6283, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.059398812023759524, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 3.881188118811881e-06, |
|
"loss": 1.6354, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.05999880002399952, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 3.8787128712871285e-06, |
|
"loss": 1.6169, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05999880002399952, |
|
"eval_loss": 1.7915641069412231, |
|
"eval_model_preparation_time": 0.0037, |
|
"eval_runtime": 66.2561, |
|
"eval_samples_per_second": 150.93, |
|
"eval_steps_per_second": 25.16, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.060598788024239514, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 3.876237623762376e-06, |
|
"loss": 1.785, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.06119877602447951, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 3.873762376237624e-06, |
|
"loss": 1.7006, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.0617987640247195, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 3.871287128712871e-06, |
|
"loss": 1.7633, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.0623987520249595, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 3.868811881188119e-06, |
|
"loss": 1.7084, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.06299874002519949, |
|
"grad_norm": 1.75, |
|
"learning_rate": 3.866336633663366e-06, |
|
"loss": 1.7583, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.06359872802543949, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 3.8638613861386134e-06, |
|
"loss": 1.6945, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.06419871602567949, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 3.861386138613861e-06, |
|
"loss": 1.7145, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.06479870402591949, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 3.858910891089109e-06, |
|
"loss": 1.6482, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.06539869202615947, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 3.856435643564356e-06, |
|
"loss": 1.7366, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.06599868002639947, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 3.853960396039604e-06, |
|
"loss": 1.6372, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.06659866802663947, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 3.851485148514851e-06, |
|
"loss": 1.7359, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.06719865602687947, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 3.849009900990099e-06, |
|
"loss": 1.6406, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.06779864402711945, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 3.846534653465346e-06, |
|
"loss": 1.696, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.06839863202735945, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 3.844059405940594e-06, |
|
"loss": 1.6321, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.06899862002759945, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 3.841584158415842e-06, |
|
"loss": 1.706, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.06959860802783945, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 3.839108910891089e-06, |
|
"loss": 1.7333, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.07019859602807944, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 3.836633663366336e-06, |
|
"loss": 1.7489, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.07079858402831943, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 3.834158415841584e-06, |
|
"loss": 1.6628, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.07139857202855943, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 3.831683168316831e-06, |
|
"loss": 1.7741, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.07199856002879942, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 3.829207920792079e-06, |
|
"loss": 1.6783, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.07259854802903942, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 3.8267326732673265e-06, |
|
"loss": 1.6955, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.07319853602927942, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 3.824257425742574e-06, |
|
"loss": 1.721, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.0737985240295194, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 3.821782178217821e-06, |
|
"loss": 1.6035, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.0743985120297594, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 3.819306930693069e-06, |
|
"loss": 1.5864, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.0749985000299994, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 3.816831683168317e-06, |
|
"loss": 1.6237, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.0755984880302394, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 3.814356435643564e-06, |
|
"loss": 1.6389, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.07619847603047938, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 3.8118811881188114e-06, |
|
"loss": 1.6965, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.07679846403071938, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 3.809405940594059e-06, |
|
"loss": 1.5421, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.07739845203095938, |
|
"grad_norm": 1.875, |
|
"learning_rate": 3.8069306930693065e-06, |
|
"loss": 1.7131, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.07799844003119938, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 3.8044554455445543e-06, |
|
"loss": 1.7201, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.07859842803143938, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 3.8019801980198017e-06, |
|
"loss": 1.7214, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.07919841603167936, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 3.7995049504950494e-06, |
|
"loss": 1.7073, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.07979840403191936, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 3.7970297029702968e-06, |
|
"loss": 1.6527, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.08039839203215936, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 3.7945544554455445e-06, |
|
"loss": 1.6767, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.08099838003239936, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 3.792079207920792e-06, |
|
"loss": 1.7262, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.08159836803263935, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 3.7896039603960396e-06, |
|
"loss": 1.7001, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.08219835603287934, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 3.7871287128712866e-06, |
|
"loss": 1.6917, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.08279834403311934, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 3.7846534653465343e-06, |
|
"loss": 1.6897, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.08339833203335933, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 3.7821782178217817e-06, |
|
"loss": 1.6675, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.08399832003359933, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 3.7797029702970294e-06, |
|
"loss": 1.5694, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.08459830803383932, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 3.7772277227722768e-06, |
|
"loss": 1.633, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.08519829603407932, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 3.7747524752475245e-06, |
|
"loss": 1.7887, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.08579828403431931, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 3.772277227722772e-06, |
|
"loss": 1.593, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.08639827203455931, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 3.7698019801980197e-06, |
|
"loss": 1.6052, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.08699826003479931, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 3.767326732673267e-06, |
|
"loss": 1.5389, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.0875982480350393, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 3.7648514851485148e-06, |
|
"loss": 1.6326, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.08819823603527929, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 3.762376237623762e-06, |
|
"loss": 1.6813, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.08879822403551929, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 3.75990099009901e-06, |
|
"loss": 1.6882, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.08939821203575929, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 3.7574257425742572e-06, |
|
"loss": 1.5999, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.08999820003599927, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 3.754950495049505e-06, |
|
"loss": 1.5369, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.08999820003599927, |
|
"eval_loss": 1.788702130317688, |
|
"eval_model_preparation_time": 0.0037, |
|
"eval_runtime": 68.5232, |
|
"eval_samples_per_second": 145.936, |
|
"eval_steps_per_second": 24.328, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.09059818803623927, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 3.752475247524752e-06, |
|
"loss": 1.6148, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.09119817603647927, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 3.7499999999999997e-06, |
|
"loss": 1.6697, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.09179816403671927, |
|
"grad_norm": 1.75, |
|
"learning_rate": 3.7475247524752474e-06, |
|
"loss": 1.6048, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.09239815203695927, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 3.7450495049504948e-06, |
|
"loss": 1.6984, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.09299814003719925, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 3.7425742574257425e-06, |
|
"loss": 1.5856, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.09359812803743925, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 3.74009900990099e-06, |
|
"loss": 1.6575, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.09419811603767925, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 3.7376237623762377e-06, |
|
"loss": 1.6302, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.09479810403791925, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 3.735148514851485e-06, |
|
"loss": 1.644, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.09539809203815924, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 3.7326732673267328e-06, |
|
"loss": 1.7122, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.09599808003839923, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 3.73019801980198e-06, |
|
"loss": 1.6811, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.09659806803863923, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 3.727722772277228e-06, |
|
"loss": 1.6262, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.09719805603887922, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 3.7252475247524752e-06, |
|
"loss": 1.5868, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.09779804403911922, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 3.722772277227723e-06, |
|
"loss": 1.7002, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.0983980320393592, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 3.72029702970297e-06, |
|
"loss": 1.6849, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.0989980200395992, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 3.7178217821782177e-06, |
|
"loss": 1.7176, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.0995980080398392, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 3.715346534653465e-06, |
|
"loss": 1.6203, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.1001979960400792, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 3.7128712871287128e-06, |
|
"loss": 1.715, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.1007979840403192, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 3.71039603960396e-06, |
|
"loss": 1.6671, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.10139797204055918, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 3.707920792079208e-06, |
|
"loss": 1.7003, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.10199796004079918, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 3.7054455445544552e-06, |
|
"loss": 1.658, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.10259794804103918, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 3.702970297029703e-06, |
|
"loss": 1.6661, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.10319793604127918, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 3.7004950495049503e-06, |
|
"loss": 1.6916, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.10379792404151916, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 3.698019801980198e-06, |
|
"loss": 1.7125, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.10439791204175916, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 3.6955445544554455e-06, |
|
"loss": 1.6975, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.10499790004199916, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 3.6930693069306932e-06, |
|
"loss": 1.7408, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.10559788804223916, |
|
"grad_norm": 1.625, |
|
"learning_rate": 3.6905940594059406e-06, |
|
"loss": 1.7682, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.10619787604247916, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 3.6881188118811883e-06, |
|
"loss": 1.6703, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.10679786404271914, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 3.6856435643564352e-06, |
|
"loss": 1.6898, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.10739785204295914, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 3.683168316831683e-06, |
|
"loss": 1.7006, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.10799784004319914, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 3.6806930693069304e-06, |
|
"loss": 1.659, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.10859782804343913, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 3.678217821782178e-06, |
|
"loss": 1.685, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.10919781604367913, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 3.6757425742574255e-06, |
|
"loss": 1.7294, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.10979780404391912, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 3.6732673267326732e-06, |
|
"loss": 1.7036, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.11039779204415912, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 3.6707920792079206e-06, |
|
"loss": 1.6733, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.11099778004439911, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 3.6683168316831683e-06, |
|
"loss": 1.7434, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.11159776804463911, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 3.6658415841584157e-06, |
|
"loss": 1.6844, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.1121977560448791, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 3.6633663366336635e-06, |
|
"loss": 1.7077, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.1127977440451191, |
|
"grad_norm": 1.75, |
|
"learning_rate": 3.660891089108911e-06, |
|
"loss": 1.7586, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.11339773204535909, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 3.6584158415841586e-06, |
|
"loss": 1.6478, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.11399772004559909, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 3.6559405940594055e-06, |
|
"loss": 1.6893, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.11459770804583909, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 3.6534653465346532e-06, |
|
"loss": 1.7054, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.11519769604607907, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 3.6509900990099006e-06, |
|
"loss": 1.7179, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.11579768404631907, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 3.6485148514851484e-06, |
|
"loss": 1.5688, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.11639767204655907, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 3.6460396039603957e-06, |
|
"loss": 1.807, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.11699766004679907, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 3.6435643564356435e-06, |
|
"loss": 1.6499, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.11759764804703907, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 3.641089108910891e-06, |
|
"loss": 1.6366, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.11819763604727905, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 3.6386138613861386e-06, |
|
"loss": 1.7076, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.11879762404751905, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 3.636138613861386e-06, |
|
"loss": 1.6531, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.11939761204775905, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 3.6336633663366337e-06, |
|
"loss": 1.6723, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.11999760004799905, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 3.631188118811881e-06, |
|
"loss": 1.5718, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.11999760004799905, |
|
"eval_loss": 1.7864090204238892, |
|
"eval_model_preparation_time": 0.0037, |
|
"eval_runtime": 65.9536, |
|
"eval_samples_per_second": 151.622, |
|
"eval_steps_per_second": 25.275, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.12059758804823903, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 3.628712871287129e-06, |
|
"loss": 1.6047, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.12119757604847903, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 3.626237623762376e-06, |
|
"loss": 1.5836, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.12179756404871903, |
|
"grad_norm": 1.625, |
|
"learning_rate": 3.623762376237624e-06, |
|
"loss": 1.7101, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.12239755204895902, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 3.621287128712871e-06, |
|
"loss": 1.7021, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.12299754004919902, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 3.6188118811881186e-06, |
|
"loss": 1.673, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.123597528049439, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 3.616336633663366e-06, |
|
"loss": 1.6808, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.124197516049679, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 3.6138613861386137e-06, |
|
"loss": 1.6939, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.124797504049919, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 3.611386138613861e-06, |
|
"loss": 1.691, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.125397492050159, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 3.608910891089109e-06, |
|
"loss": 1.6701, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.12599748005039899, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 3.606435643564356e-06, |
|
"loss": 1.7267, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.12659746805063898, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 3.603960396039604e-06, |
|
"loss": 1.626, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.12719745605087898, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 3.6014851485148513e-06, |
|
"loss": 1.6452, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.12779744405111898, |
|
"grad_norm": 1.625, |
|
"learning_rate": 3.599009900990099e-06, |
|
"loss": 1.6894, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.12839743205135898, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 3.5965346534653464e-06, |
|
"loss": 1.5345, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.12899742005159898, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 3.594059405940594e-06, |
|
"loss": 1.6531, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.12959740805183897, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 3.5915841584158415e-06, |
|
"loss": 1.6251, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.13019739605207895, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 3.589108910891089e-06, |
|
"loss": 1.6431, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.13079738405231894, |
|
"grad_norm": 1.5, |
|
"learning_rate": 3.586633663366336e-06, |
|
"loss": 1.6415, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.13139737205255894, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 3.584158415841584e-06, |
|
"loss": 1.777, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.13199736005279894, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 3.5816831683168313e-06, |
|
"loss": 1.6775, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.13259734805303894, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 3.579207920792079e-06, |
|
"loss": 1.6495, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.13319733605327894, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 3.5767326732673264e-06, |
|
"loss": 1.6738, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.13379732405351893, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 3.574257425742574e-06, |
|
"loss": 1.6602, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.13439731205375893, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 3.5717821782178215e-06, |
|
"loss": 1.6473, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.13499730005399893, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 3.5693069306930693e-06, |
|
"loss": 1.6683, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.1355972880542389, |
|
"grad_norm": 1.5, |
|
"learning_rate": 3.5668316831683166e-06, |
|
"loss": 1.6702, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.1361972760544789, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 3.5643564356435644e-06, |
|
"loss": 1.6736, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.1367972640547189, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 3.5618811881188117e-06, |
|
"loss": 1.7395, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.1373972520549589, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 3.5594059405940595e-06, |
|
"loss": 1.6352, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.1379972400551989, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 3.5569306930693064e-06, |
|
"loss": 1.7035, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.1385972280554389, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 3.554455445544554e-06, |
|
"loss": 1.6634, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.1391972160556789, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 3.5519801980198015e-06, |
|
"loss": 1.5949, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.1397972040559189, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 3.5495049504950493e-06, |
|
"loss": 1.6848, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.1403971920561589, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 3.5470297029702966e-06, |
|
"loss": 1.6158, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.14099718005639889, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 3.5445544554455444e-06, |
|
"loss": 1.6746, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.14159716805663886, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 3.5420792079207917e-06, |
|
"loss": 1.6911, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.14219715605687885, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 3.5396039603960395e-06, |
|
"loss": 1.6304, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.14279714405711885, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 3.537128712871287e-06, |
|
"loss": 1.6485, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.14339713205735885, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 3.5346534653465346e-06, |
|
"loss": 1.6509, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.14399712005759885, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 3.532178217821782e-06, |
|
"loss": 1.7299, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.14459710805783885, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 3.5297029702970297e-06, |
|
"loss": 1.5903, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.14519709605807885, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 3.527227722772277e-06, |
|
"loss": 1.7034, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.14579708405831884, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 3.524752475247525e-06, |
|
"loss": 1.604, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.14639707205855884, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 3.5222772277227717e-06, |
|
"loss": 1.6242, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.1469970600587988, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 3.5198019801980195e-06, |
|
"loss": 1.6533, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.1475970480590388, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 3.517326732673267e-06, |
|
"loss": 1.6703, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.1481970360592788, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 3.5148514851485146e-06, |
|
"loss": 1.6464, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.1487970240595188, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 3.512376237623762e-06, |
|
"loss": 1.8545, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.1493970120597588, |
|
"grad_norm": 1.5, |
|
"learning_rate": 3.5099009900990097e-06, |
|
"loss": 1.7212, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.1499970000599988, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 3.507425742574257e-06, |
|
"loss": 1.637, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.1499970000599988, |
|
"eval_loss": 1.7847853899002075, |
|
"eval_model_preparation_time": 0.0037, |
|
"eval_runtime": 65.9306, |
|
"eval_samples_per_second": 151.675, |
|
"eval_steps_per_second": 25.284, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.1505969880602388, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 3.504950495049505e-06, |
|
"loss": 1.7151, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.1511969760604788, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 3.502475247524752e-06, |
|
"loss": 1.6645, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.1517969640607188, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 3.5e-06, |
|
"loss": 1.6666, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.15239695206095877, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 3.4975247524752477e-06, |
|
"loss": 1.5748, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.15299694006119877, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 3.495049504950495e-06, |
|
"loss": 1.6351, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.15359692806143876, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 3.492574257425743e-06, |
|
"loss": 1.6819, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.15419691606167876, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 3.4900990099009897e-06, |
|
"loss": 1.714, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.15479690406191876, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 3.4876237623762375e-06, |
|
"loss": 1.6284, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.15539689206215876, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 3.485148514851485e-06, |
|
"loss": 1.5917, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.15599688006239876, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 3.4826732673267326e-06, |
|
"loss": 1.7185, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.15659686806263876, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 3.48019801980198e-06, |
|
"loss": 1.6383, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.15719685606287875, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 3.4777227722772277e-06, |
|
"loss": 1.6595, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.15779684406311872, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 3.475247524752475e-06, |
|
"loss": 1.6335, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.15839683206335872, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 3.472772277227723e-06, |
|
"loss": 1.7455, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.15899682006359872, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 3.47029702970297e-06, |
|
"loss": 1.7527, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.15959680806383872, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 3.467821782178218e-06, |
|
"loss": 1.6564, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.16019679606407872, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 3.4653465346534653e-06, |
|
"loss": 1.6865, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.16079678406431872, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 3.462871287128713e-06, |
|
"loss": 1.5996, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.1613967720645587, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 3.4603960396039604e-06, |
|
"loss": 1.5936, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.1619967600647987, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 3.4579207920792077e-06, |
|
"loss": 1.5691, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.1625967480650387, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 3.455445544554455e-06, |
|
"loss": 1.5869, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.1631967360652787, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 3.452970297029703e-06, |
|
"loss": 1.675, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.16379672406551868, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 3.45049504950495e-06, |
|
"loss": 1.7083, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.16439671206575868, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 3.448019801980198e-06, |
|
"loss": 1.7339, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.16499670006599867, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 3.4455445544554453e-06, |
|
"loss": 1.6974, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.16559668806623867, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 3.443069306930693e-06, |
|
"loss": 1.6987, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.16619667606647867, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 3.4405940594059404e-06, |
|
"loss": 1.6543, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.16679666406671867, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 3.438118811881188e-06, |
|
"loss": 1.7186, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.16739665206695867, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 3.4356435643564355e-06, |
|
"loss": 1.5786, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.16799664006719867, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 3.4331683168316833e-06, |
|
"loss": 1.6934, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.16859662806743866, |
|
"grad_norm": 2.125, |
|
"learning_rate": 3.4306930693069306e-06, |
|
"loss": 1.6231, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.16919661606767863, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 3.4282178217821784e-06, |
|
"loss": 1.7557, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.16979660406791863, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 3.4257425742574253e-06, |
|
"loss": 1.7097, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.17039659206815863, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 3.423267326732673e-06, |
|
"loss": 1.6093, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.17099658006839863, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 3.4207920792079204e-06, |
|
"loss": 1.6127, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.17159656806863863, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 3.418316831683168e-06, |
|
"loss": 1.6953, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.17219655606887863, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 3.4158415841584155e-06, |
|
"loss": 1.5976, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.17279654406911862, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 3.4133663366336633e-06, |
|
"loss": 1.696, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.17339653206935862, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 3.4108910891089106e-06, |
|
"loss": 1.6185, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.17399652006959862, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 3.4084158415841584e-06, |
|
"loss": 1.671, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.1745965080698386, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 3.4059405940594058e-06, |
|
"loss": 1.6512, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.1751964960700786, |
|
"grad_norm": 1.75, |
|
"learning_rate": 3.4034653465346535e-06, |
|
"loss": 1.6009, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.1757964840703186, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 3.400990099009901e-06, |
|
"loss": 1.5156, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.17639647207055859, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 3.3985148514851486e-06, |
|
"loss": 1.694, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.17699646007079858, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 3.396039603960396e-06, |
|
"loss": 1.6629, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.17759644807103858, |
|
"grad_norm": 1.625, |
|
"learning_rate": 3.3935643564356437e-06, |
|
"loss": 1.697, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.17819643607127858, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 3.3910891089108907e-06, |
|
"loss": 1.597, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.17879642407151858, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 3.3886138613861384e-06, |
|
"loss": 1.6441, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.17939641207175858, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 3.3861386138613858e-06, |
|
"loss": 1.6373, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.17999640007199855, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 3.3836633663366335e-06, |
|
"loss": 1.75, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.17999640007199855, |
|
"eval_loss": 1.7828515768051147, |
|
"eval_model_preparation_time": 0.0037, |
|
"eval_runtime": 68.7039, |
|
"eval_samples_per_second": 145.552, |
|
"eval_steps_per_second": 24.264, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.18059638807223855, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 3.381188118811881e-06, |
|
"loss": 1.648, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.18119637607247854, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 3.3787128712871286e-06, |
|
"loss": 1.7426, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.18179636407271854, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 3.376237623762376e-06, |
|
"loss": 1.6039, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.18239635207295854, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 3.3737623762376238e-06, |
|
"loss": 1.6396, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.18299634007319854, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 3.371287128712871e-06, |
|
"loss": 1.6994, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.18359632807343854, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 3.368811881188119e-06, |
|
"loss": 1.4593, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.18419631607367853, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 3.366336633663366e-06, |
|
"loss": 1.5756, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.18479630407391853, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 3.363861386138614e-06, |
|
"loss": 1.5962, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.1853962920741585, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 3.3613861386138613e-06, |
|
"loss": 1.5964, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.1859962800743985, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 3.3589108910891087e-06, |
|
"loss": 1.6447, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.1865962680746385, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 3.356435643564356e-06, |
|
"loss": 1.7261, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.1871962560748785, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 3.3539603960396038e-06, |
|
"loss": 1.6419, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.1877962440751185, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 3.351485148514851e-06, |
|
"loss": 1.6787, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.1883962320753585, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 3.349009900990099e-06, |
|
"loss": 1.7553, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.1889962200755985, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 3.3465346534653462e-06, |
|
"loss": 1.7045, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.1895962080758385, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 3.344059405940594e-06, |
|
"loss": 1.715, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.1901961960760785, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 3.3415841584158413e-06, |
|
"loss": 1.6036, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.1907961840763185, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 3.339108910891089e-06, |
|
"loss": 1.7363, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.19139617207655846, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 3.3366336633663364e-06, |
|
"loss": 1.7064, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.19199616007679846, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 3.334158415841584e-06, |
|
"loss": 1.7196, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.19259614807703845, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 3.3316831683168316e-06, |
|
"loss": 1.6718, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.19319613607727845, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 3.3292079207920793e-06, |
|
"loss": 1.689, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.19379612407751845, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 3.3267326732673262e-06, |
|
"loss": 1.716, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.19439611207775845, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 3.324257425742574e-06, |
|
"loss": 1.7491, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.19499610007799845, |
|
"grad_norm": 1.75, |
|
"learning_rate": 3.3217821782178213e-06, |
|
"loss": 1.6944, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.19559608807823844, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 3.319306930693069e-06, |
|
"loss": 1.691, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.19619607607847844, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 3.3168316831683165e-06, |
|
"loss": 1.662, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.1967960640787184, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 3.3143564356435642e-06, |
|
"loss": 1.7127, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.1973960520789584, |
|
"grad_norm": 1.75, |
|
"learning_rate": 3.3118811881188116e-06, |
|
"loss": 1.6894, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.1979960400791984, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 3.3094059405940593e-06, |
|
"loss": 1.6596, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.1985960280794384, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 3.3069306930693067e-06, |
|
"loss": 1.7074, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.1991960160796784, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 3.3044554455445544e-06, |
|
"loss": 1.7924, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.1997960040799184, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 3.3019801980198018e-06, |
|
"loss": 1.6684, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.2003959920801584, |
|
"grad_norm": 1.875, |
|
"learning_rate": 3.2995049504950496e-06, |
|
"loss": 1.6355, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.2009959800803984, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 3.297029702970297e-06, |
|
"loss": 1.6694, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.2015959680806384, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 3.2945544554455442e-06, |
|
"loss": 1.722, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.20219595608087837, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 3.2920792079207916e-06, |
|
"loss": 1.6248, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.20279594408111837, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 3.2896039603960393e-06, |
|
"loss": 1.6511, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.20339593208135837, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 3.2871287128712867e-06, |
|
"loss": 1.6785, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.20399592008159836, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 3.2846534653465345e-06, |
|
"loss": 1.6779, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.20459590808183836, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 3.282178217821782e-06, |
|
"loss": 1.6735, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.20519589608207836, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 3.2797029702970296e-06, |
|
"loss": 1.6365, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.20579588408231836, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 3.277227722772277e-06, |
|
"loss": 1.6497, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.20639587208255836, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 3.2747524752475247e-06, |
|
"loss": 1.6381, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.20699586008279836, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 3.272277227722772e-06, |
|
"loss": 1.669, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.20759584808303833, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 3.2698019801980198e-06, |
|
"loss": 1.6485, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.20819583608327832, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 3.267326732673267e-06, |
|
"loss": 1.5217, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.20879582408351832, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 3.264851485148515e-06, |
|
"loss": 1.5988, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.20939581208375832, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 3.262376237623762e-06, |
|
"loss": 1.6752, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.20999580008399832, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 3.2599009900990096e-06, |
|
"loss": 1.6443, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.20999580008399832, |
|
"eval_loss": 1.7816474437713623, |
|
"eval_model_preparation_time": 0.0037, |
|
"eval_runtime": 65.9284, |
|
"eval_samples_per_second": 151.68, |
|
"eval_steps_per_second": 25.285, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.21059578808423832, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 3.257425742574257e-06, |
|
"loss": 1.6992, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.21119577608447831, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 3.2549504950495047e-06, |
|
"loss": 1.6317, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.2117957640847183, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 3.252475247524752e-06, |
|
"loss": 1.6773, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.2123957520849583, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 3.25e-06, |
|
"loss": 1.6705, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.2129957400851983, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 3.2475247524752476e-06, |
|
"loss": 1.6884, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.21359572808543828, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 3.245049504950495e-06, |
|
"loss": 1.6452, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.21419571608567828, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 3.2425742574257427e-06, |
|
"loss": 1.6493, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.21479570408591828, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 3.24009900990099e-06, |
|
"loss": 1.648, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.21539569208615827, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 3.2376237623762378e-06, |
|
"loss": 1.633, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.21599568008639827, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 3.235148514851485e-06, |
|
"loss": 1.678, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.21659566808663827, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 3.232673267326733e-06, |
|
"loss": 1.7183, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.21719565608687827, |
|
"grad_norm": 1.75, |
|
"learning_rate": 3.2301980198019802e-06, |
|
"loss": 1.6863, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.21779564408711827, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 3.2277227722772276e-06, |
|
"loss": 1.8238, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.21839563208735827, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 3.225247524752475e-06, |
|
"loss": 1.6829, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.21899562008759824, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 3.2227722772277227e-06, |
|
"loss": 1.5743, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.21959560808783823, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 3.22029702970297e-06, |
|
"loss": 1.7275, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.22019559608807823, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 3.217821782178218e-06, |
|
"loss": 1.7962, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.22079558408831823, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 3.215346534653465e-06, |
|
"loss": 1.6825, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.22139557208855823, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 3.212871287128713e-06, |
|
"loss": 1.7308, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.22199556008879823, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 3.2103960396039603e-06, |
|
"loss": 1.6639, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.22259554808903823, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 3.207920792079208e-06, |
|
"loss": 1.7165, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.22319553608927822, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 3.2054455445544554e-06, |
|
"loss": 1.655, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.22379552408951822, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 3.202970297029703e-06, |
|
"loss": 1.6016, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.2243955120897582, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 3.2004950495049505e-06, |
|
"loss": 1.6397, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.2249955000899982, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 3.1980198019801982e-06, |
|
"loss": 1.6711, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.2255954880902382, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 3.195544554455445e-06, |
|
"loss": 1.5991, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.2261954760904782, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 3.193069306930693e-06, |
|
"loss": 1.6873, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.22679546409071819, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 3.1905940594059403e-06, |
|
"loss": 1.7299, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.22739545209095818, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 3.188118811881188e-06, |
|
"loss": 1.6678, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.22799544009119818, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 3.1856435643564354e-06, |
|
"loss": 1.632, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.22859542809143818, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 3.183168316831683e-06, |
|
"loss": 1.6137, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.22919541609167818, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 3.1806930693069305e-06, |
|
"loss": 1.5535, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.22979540409191815, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 3.1782178217821783e-06, |
|
"loss": 1.6658, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.23039539209215815, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 3.1757425742574256e-06, |
|
"loss": 1.636, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.23099538009239814, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 3.1732673267326734e-06, |
|
"loss": 1.64, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.23159536809263814, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 3.1707920792079207e-06, |
|
"loss": 1.6077, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.23219535609287814, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 3.1683168316831685e-06, |
|
"loss": 1.6389, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.23279534409311814, |
|
"grad_norm": 1.625, |
|
"learning_rate": 3.165841584158416e-06, |
|
"loss": 1.6041, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.23339533209335814, |
|
"grad_norm": 1.75, |
|
"learning_rate": 3.163366336633663e-06, |
|
"loss": 1.6911, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.23399532009359814, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 3.1608910891089105e-06, |
|
"loss": 1.5314, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.23459530809383813, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 3.1584158415841583e-06, |
|
"loss": 1.5704, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.23519529609407813, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 3.1559405940594056e-06, |
|
"loss": 1.7246, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.2357952840943181, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 3.1534653465346534e-06, |
|
"loss": 1.6135, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.2363952720945581, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 3.1509900990099007e-06, |
|
"loss": 1.7003, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.2369952600947981, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 3.1485148514851485e-06, |
|
"loss": 1.7987, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.2375952480950381, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 3.146039603960396e-06, |
|
"loss": 1.6395, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.2381952360952781, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 3.1435643564356436e-06, |
|
"loss": 1.6497, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.2387952240955181, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 3.141089108910891e-06, |
|
"loss": 1.692, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.2393952120957581, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 3.1386138613861387e-06, |
|
"loss": 1.7406, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.2399952000959981, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 3.136138613861386e-06, |
|
"loss": 1.7191, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2399952000959981, |
|
"eval_loss": 1.7807087898254395, |
|
"eval_model_preparation_time": 0.0037, |
|
"eval_runtime": 66.0367, |
|
"eval_samples_per_second": 151.431, |
|
"eval_steps_per_second": 25.244, |
|
"step": 400 |
|
}, |
|
{
"epoch": 0.2405951880962381,
"grad_norm": 1.7734375,
"learning_rate": 3.133663366336634e-06,
"loss": 1.6603,
"step": 401
},
{
"epoch": 0.24119517609647806,
"grad_norm": 1.609375,
"learning_rate": 3.1311881188118807e-06,
"loss": 1.6492,
"step": 402
},
{
"epoch": 0.24179516409671806,
"grad_norm": 1.671875,
"learning_rate": 3.1287128712871285e-06,
"loss": 1.7367,
"step": 403
},
{
"epoch": 0.24239515209695806,
"grad_norm": 1.6484375,
"learning_rate": 3.126237623762376e-06,
"loss": 1.7123,
"step": 404
},
{
"epoch": 0.24299514009719805,
"grad_norm": 1.7265625,
"learning_rate": 3.1237623762376236e-06,
"loss": 1.7351,
"step": 405
},
{
"epoch": 0.24359512809743805,
"grad_norm": 1.7109375,
"learning_rate": 3.121287128712871e-06,
"loss": 1.7273,
"step": 406
},
{
"epoch": 0.24419511609767805,
"grad_norm": 1.6953125,
"learning_rate": 3.1188118811881187e-06,
"loss": 1.6324,
"step": 407
},
{
"epoch": 0.24479510409791805,
"grad_norm": 1.90625,
"learning_rate": 3.116336633663366e-06,
"loss": 1.6113,
"step": 408
},
{
"epoch": 0.24539509209815805,
"grad_norm": 1.5078125,
"learning_rate": 3.113861386138614e-06,
"loss": 1.6485,
"step": 409
},
{
"epoch": 0.24599508009839804,
"grad_norm": 1.9921875,
"learning_rate": 3.111386138613861e-06,
"loss": 1.664,
"step": 410
},
{
"epoch": 0.24659506809863802,
"grad_norm": 1.6796875,
"learning_rate": 3.108910891089109e-06,
"loss": 1.7174,
"step": 411
},
{
"epoch": 0.247195056098878,
"grad_norm": 1.5390625,
"learning_rate": 3.1064356435643563e-06,
"loss": 1.6438,
"step": 412
},
{
"epoch": 0.247795044099118,
"grad_norm": 1.5859375,
"learning_rate": 3.103960396039604e-06,
"loss": 1.7208,
"step": 413
},
{
"epoch": 0.248395032099358,
"grad_norm": 1.4765625,
"learning_rate": 3.1014851485148514e-06,
"loss": 1.7257,
"step": 414
},
{
"epoch": 0.248995020099598,
"grad_norm": 1.5703125,
"learning_rate": 3.099009900990099e-06,
"loss": 1.683,
"step": 415
},
{
"epoch": 0.249595008099838,
"grad_norm": 1.6953125,
"learning_rate": 3.096534653465346e-06,
"loss": 1.7018,
"step": 416
},
{
"epoch": 0.250194996100078,
"grad_norm": 1.59375,
"learning_rate": 3.094059405940594e-06,
"loss": 1.6217,
"step": 417
},
{
"epoch": 0.250794984100318,
"grad_norm": 1.703125,
"learning_rate": 3.091584158415841e-06,
"loss": 1.6733,
"step": 418
},
{
"epoch": 0.251394972100558,
"grad_norm": 1.703125,
"learning_rate": 3.089108910891089e-06,
"loss": 1.7294,
"step": 419
},
{
"epoch": 0.25199496010079797,
"grad_norm": 1.625,
"learning_rate": 3.0866336633663363e-06,
"loss": 1.6648,
"step": 420
},
{
"epoch": 0.252594948101038,
"grad_norm": 1.625,
"learning_rate": 3.084158415841584e-06,
"loss": 1.6886,
"step": 421
},
{
"epoch": 0.25319493610127797,
"grad_norm": 1.5078125,
"learning_rate": 3.0816831683168314e-06,
"loss": 1.6579,
"step": 422
},
{
"epoch": 0.253794924101518,
"grad_norm": 1.71875,
"learning_rate": 3.079207920792079e-06,
"loss": 1.5933,
"step": 423
},
{
"epoch": 0.25439491210175796,
"grad_norm": 1.625,
"learning_rate": 3.0767326732673265e-06,
"loss": 1.6736,
"step": 424
},
{
"epoch": 0.25499490010199793,
"grad_norm": 1.6953125,
"learning_rate": 3.0742574257425743e-06,
"loss": 1.6689,
"step": 425
},
{
"epoch": 0.25559488810223796,
"grad_norm": 1.875,
"learning_rate": 3.0717821782178216e-06,
"loss": 1.6232,
"step": 426
},
{
"epoch": 0.25619487610247793,
"grad_norm": 1.609375,
"learning_rate": 3.0693069306930694e-06,
"loss": 1.6268,
"step": 427
},
{
"epoch": 0.25679486410271796,
"grad_norm": 1.6953125,
"learning_rate": 3.0668316831683167e-06,
"loss": 1.6636,
"step": 428
},
{
"epoch": 0.2573948521029579,
"grad_norm": 1.4921875,
"learning_rate": 3.064356435643564e-06,
"loss": 1.6045,
"step": 429
},
{
"epoch": 0.25799484010319795,
"grad_norm": 1.7265625,
"learning_rate": 3.0618811881188114e-06,
"loss": 1.6552,
"step": 430
},
{
"epoch": 0.2585948281034379,
"grad_norm": 1.59375,
"learning_rate": 3.059405940594059e-06,
"loss": 1.7048,
"step": 431
},
{
"epoch": 0.25919481610367795,
"grad_norm": 1.7578125,
"learning_rate": 3.0569306930693065e-06,
"loss": 1.6747,
"step": 432
},
{
"epoch": 0.2597948041039179,
"grad_norm": 1.6328125,
"learning_rate": 3.0544554455445543e-06,
"loss": 1.811,
"step": 433
},
{
"epoch": 0.2603947921041579,
"grad_norm": 1.6484375,
"learning_rate": 3.0519801980198016e-06,
"loss": 1.5845,
"step": 434
},
{
"epoch": 0.2609947801043979,
"grad_norm": 1.6796875,
"learning_rate": 3.0495049504950494e-06,
"loss": 1.6937,
"step": 435
},
{
"epoch": 0.2615947681046379,
"grad_norm": 2.078125,
"learning_rate": 3.0470297029702967e-06,
"loss": 1.7166,
"step": 436
},
{
"epoch": 0.2621947561048779,
"grad_norm": 1.8671875,
"learning_rate": 3.0445544554455445e-06,
"loss": 1.5966,
"step": 437
},
{
"epoch": 0.2627947441051179,
"grad_norm": 1.46875,
"learning_rate": 3.042079207920792e-06,
"loss": 1.5997,
"step": 438
},
{
"epoch": 0.2633947321053579,
"grad_norm": 1.6171875,
"learning_rate": 3.0396039603960396e-06,
"loss": 1.5692,
"step": 439
},
{
"epoch": 0.2639947201055979,
"grad_norm": 1.8125,
"learning_rate": 3.037128712871287e-06,
"loss": 1.6428,
"step": 440
},
{
"epoch": 0.2645947081058379,
"grad_norm": 1.6484375,
"learning_rate": 3.0346534653465347e-06,
"loss": 1.5924,
"step": 441
},
{
"epoch": 0.2651946961060779,
"grad_norm": 1.5625,
"learning_rate": 3.0321782178217817e-06,
"loss": 1.6718,
"step": 442
},
{
"epoch": 0.26579468410631785,
"grad_norm": 1.53125,
"learning_rate": 3.0297029702970294e-06,
"loss": 1.7045,
"step": 443
},
{
"epoch": 0.2663946721065579,
"grad_norm": 1.8046875,
"learning_rate": 3.0272277227722768e-06,
"loss": 1.6391,
"step": 444
},
{
"epoch": 0.26699466010679784,
"grad_norm": 1.53125,
"learning_rate": 3.0247524752475245e-06,
"loss": 1.648,
"step": 445
},
{
"epoch": 0.26759464810703787,
"grad_norm": 1.75,
"learning_rate": 3.022277227722772e-06,
"loss": 1.6945,
"step": 446
},
{
"epoch": 0.26819463610727784,
"grad_norm": 1.921875,
"learning_rate": 3.0198019801980196e-06,
"loss": 1.6156,
"step": 447
},
{
"epoch": 0.26879462410751787,
"grad_norm": 1.6796875,
"learning_rate": 3.017326732673267e-06,
"loss": 1.7582,
"step": 448
},
{
"epoch": 0.26939461210775784,
"grad_norm": 1.6328125,
"learning_rate": 3.0148514851485147e-06,
"loss": 1.6294,
"step": 449
},
{
"epoch": 0.26999460010799786,
"grad_norm": 1.6875,
"learning_rate": 3.012376237623762e-06,
"loss": 1.7376,
"step": 450
},
{
"epoch": 0.26999460010799786,
"eval_loss": 1.7797411680221558,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 68.9129,
"eval_samples_per_second": 145.111,
"eval_steps_per_second": 24.19,
"step": 450
},
|
{
"epoch": 0.27059458810823783,
"grad_norm": 1.6015625,
"learning_rate": 3.00990099009901e-06,
"loss": 1.6089,
"step": 451
},
{
"epoch": 0.2711945761084778,
"grad_norm": 1.6484375,
"learning_rate": 3.007425742574257e-06,
"loss": 1.657,
"step": 452
},
{
"epoch": 0.27179456410871783,
"grad_norm": 1.5859375,
"learning_rate": 3.004950495049505e-06,
"loss": 1.7169,
"step": 453
},
{
"epoch": 0.2723945521089578,
"grad_norm": 1.6484375,
"learning_rate": 3.0024752475247523e-06,
"loss": 1.5906,
"step": 454
},
{
"epoch": 0.2729945401091978,
"grad_norm": 1.828125,
"learning_rate": 3e-06,
"loss": 1.5775,
"step": 455
},
{
"epoch": 0.2735945281094378,
"grad_norm": 1.6484375,
"learning_rate": 2.9975247524752474e-06,
"loss": 1.7177,
"step": 456
},
{
"epoch": 0.2741945161096778,
"grad_norm": 1.6171875,
"learning_rate": 2.9950495049504948e-06,
"loss": 1.6019,
"step": 457
},
{
"epoch": 0.2747945041099178,
"grad_norm": 1.703125,
"learning_rate": 2.9925742574257425e-06,
"loss": 1.6325,
"step": 458
},
{
"epoch": 0.2753944921101578,
"grad_norm": 1.7109375,
"learning_rate": 2.99009900990099e-06,
"loss": 1.7797,
"step": 459
},
{
"epoch": 0.2759944801103978,
"grad_norm": 1.7890625,
"learning_rate": 2.9876237623762376e-06,
"loss": 1.6406,
"step": 460
},
{
"epoch": 0.2765944681106378,
"grad_norm": 1.71875,
"learning_rate": 2.985148514851485e-06,
"loss": 1.745,
"step": 461
},
{
"epoch": 0.2771944561108778,
"grad_norm": 1.7265625,
"learning_rate": 2.9826732673267327e-06,
"loss": 1.7707,
"step": 462
},
{
"epoch": 0.27779444411111776,
"grad_norm": 1.65625,
"learning_rate": 2.98019801980198e-06,
"loss": 1.7372,
"step": 463
},
{
"epoch": 0.2783944321113578,
"grad_norm": 1.71875,
"learning_rate": 2.977722772277228e-06,
"loss": 1.6024,
"step": 464
},
{
"epoch": 0.27899442011159775,
"grad_norm": 1.6171875,
"learning_rate": 2.975247524752475e-06,
"loss": 1.508,
"step": 465
},
{
"epoch": 0.2795944081118378,
"grad_norm": 1.78125,
"learning_rate": 2.972772277227723e-06,
"loss": 1.8252,
"step": 466
},
{
"epoch": 0.28019439611207775,
"grad_norm": 1.609375,
"learning_rate": 2.9702970297029703e-06,
"loss": 1.5962,
"step": 467
},
{
"epoch": 0.2807943841123178,
"grad_norm": 1.8046875,
"learning_rate": 2.967821782178218e-06,
"loss": 1.6764,
"step": 468
},
{
"epoch": 0.28139437211255774,
"grad_norm": 1.5,
"learning_rate": 2.965346534653465e-06,
"loss": 1.6339,
"step": 469
},
{
"epoch": 0.28199436011279777,
"grad_norm": 1.640625,
"learning_rate": 2.9628712871287128e-06,
"loss": 1.5768,
"step": 470
},
{
"epoch": 0.28259434811303774,
"grad_norm": 1.7265625,
"learning_rate": 2.96039603960396e-06,
"loss": 1.6435,
"step": 471
},
{
"epoch": 0.2831943361132777,
"grad_norm": 1.671875,
"learning_rate": 2.957920792079208e-06,
"loss": 1.6312,
"step": 472
},
{
"epoch": 0.28379432411351774,
"grad_norm": 1.625,
"learning_rate": 2.9554455445544552e-06,
"loss": 1.7725,
"step": 473
},
{
"epoch": 0.2843943121137577,
"grad_norm": 1.6953125,
"learning_rate": 2.952970297029703e-06,
"loss": 1.6755,
"step": 474
},
{
"epoch": 0.28499430011399773,
"grad_norm": 1.78125,
"learning_rate": 2.9504950495049503e-06,
"loss": 1.6577,
"step": 475
},
{
"epoch": 0.2855942881142377,
"grad_norm": 1.6171875,
"learning_rate": 2.948019801980198e-06,
"loss": 1.7085,
"step": 476
},
{
"epoch": 0.28619427611447773,
"grad_norm": 1.59375,
"learning_rate": 2.9455445544554454e-06,
"loss": 1.7308,
"step": 477
},
{
"epoch": 0.2867942641147177,
"grad_norm": 1.6171875,
"learning_rate": 2.943069306930693e-06,
"loss": 1.592,
"step": 478
},
{
"epoch": 0.2873942521149577,
"grad_norm": 1.8203125,
"learning_rate": 2.9405940594059405e-06,
"loss": 1.5693,
"step": 479
},
{
"epoch": 0.2879942401151977,
"grad_norm": 1.6328125,
"learning_rate": 2.9381188118811883e-06,
"loss": 1.6561,
"step": 480
},
{
"epoch": 0.28859422811543767,
"grad_norm": 1.7734375,
"learning_rate": 2.9356435643564357e-06,
"loss": 1.6724,
"step": 481
},
{
"epoch": 0.2891942161156777,
"grad_norm": 1.5546875,
"learning_rate": 2.933168316831683e-06,
"loss": 1.6792,
"step": 482
},
{
"epoch": 0.28979420411591766,
"grad_norm": 1.609375,
"learning_rate": 2.9306930693069303e-06,
"loss": 1.5727,
"step": 483
},
{
"epoch": 0.2903941921161577,
"grad_norm": 1.7265625,
"learning_rate": 2.928217821782178e-06,
"loss": 1.6944,
"step": 484
},
{
"epoch": 0.29099418011639766,
"grad_norm": 1.7265625,
"learning_rate": 2.9257425742574254e-06,
"loss": 1.6274,
"step": 485
},
{
"epoch": 0.2915941681166377,
"grad_norm": 1.59375,
"learning_rate": 2.9232673267326732e-06,
"loss": 1.7017,
"step": 486
},
{
"epoch": 0.29219415611687766,
"grad_norm": 1.671875,
"learning_rate": 2.9207920792079206e-06,
"loss": 1.6242,
"step": 487
},
{
"epoch": 0.2927941441171177,
"grad_norm": 1.8125,
"learning_rate": 2.9183168316831683e-06,
"loss": 1.5515,
"step": 488
},
{
"epoch": 0.29339413211735765,
"grad_norm": 1.546875,
"learning_rate": 2.9158415841584157e-06,
"loss": 1.6318,
"step": 489
},
{
"epoch": 0.2939941201175976,
"grad_norm": 1.734375,
"learning_rate": 2.9133663366336634e-06,
"loss": 1.6594,
"step": 490
},
{
"epoch": 0.29459410811783765,
"grad_norm": 1.5703125,
"learning_rate": 2.9108910891089108e-06,
"loss": 1.6918,
"step": 491
},
{
"epoch": 0.2951940961180776,
"grad_norm": 1.53125,
"learning_rate": 2.9084158415841585e-06,
"loss": 1.7203,
"step": 492
},
{
"epoch": 0.29579408411831765,
"grad_norm": 1.703125,
"learning_rate": 2.905940594059406e-06,
"loss": 1.6643,
"step": 493
},
{
"epoch": 0.2963940721185576,
"grad_norm": 1.7890625,
"learning_rate": 2.9034653465346537e-06,
"loss": 1.5509,
"step": 494
},
{
"epoch": 0.29699406011879764,
"grad_norm": 1.7578125,
"learning_rate": 2.9009900990099006e-06,
"loss": 1.7432,
"step": 495
},
{
"epoch": 0.2975940481190376,
"grad_norm": 1.640625,
"learning_rate": 2.8985148514851483e-06,
"loss": 1.6119,
"step": 496
},
{
"epoch": 0.29819403611927764,
"grad_norm": 1.6328125,
"learning_rate": 2.8960396039603957e-06,
"loss": 1.7538,
"step": 497
},
{
"epoch": 0.2987940241195176,
"grad_norm": 1.59375,
"learning_rate": 2.8935643564356434e-06,
"loss": 1.6927,
"step": 498
},
{
"epoch": 0.2993940121197576,
"grad_norm": 1.7265625,
"learning_rate": 2.891089108910891e-06,
"loss": 1.8514,
"step": 499
},
{
"epoch": 0.2999940001199976,
"grad_norm": 1.4921875,
"learning_rate": 2.8886138613861386e-06,
"loss": 1.6127,
"step": 500
},
{
"epoch": 0.2999940001199976,
"eval_loss": 1.7789958715438843,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 66.0169,
"eval_samples_per_second": 151.476,
"eval_steps_per_second": 25.251,
"step": 500
},
|
{
"epoch": 0.3005939881202376,
"grad_norm": 1.703125,
"learning_rate": 2.886138613861386e-06,
"loss": 1.7337,
"step": 501
},
{
"epoch": 0.3011939761204776,
"grad_norm": 1.59375,
"learning_rate": 2.8836633663366337e-06,
"loss": 1.684,
"step": 502
},
{
"epoch": 0.3017939641207176,
"grad_norm": 1.8203125,
"learning_rate": 2.881188118811881e-06,
"loss": 1.6062,
"step": 503
},
{
"epoch": 0.3023939521209576,
"grad_norm": 1.6015625,
"learning_rate": 2.8787128712871288e-06,
"loss": 1.6014,
"step": 504
},
{
"epoch": 0.30299394012119757,
"grad_norm": 1.671875,
"learning_rate": 2.876237623762376e-06,
"loss": 1.5647,
"step": 505
},
{
"epoch": 0.3035939281214376,
"grad_norm": 1.8671875,
"learning_rate": 2.873762376237624e-06,
"loss": 1.7988,
"step": 506
},
{
"epoch": 0.30419391612167757,
"grad_norm": 1.5859375,
"learning_rate": 2.8712871287128712e-06,
"loss": 1.6926,
"step": 507
},
{
"epoch": 0.30479390412191754,
"grad_norm": 1.8515625,
"learning_rate": 2.868811881188119e-06,
"loss": 1.5877,
"step": 508
},
{
"epoch": 0.30539389212215756,
"grad_norm": 1.8203125,
"learning_rate": 2.866336633663366e-06,
"loss": 1.6823,
"step": 509
},
{
"epoch": 0.30599388012239753,
"grad_norm": 1.578125,
"learning_rate": 2.8638613861386137e-06,
"loss": 1.5566,
"step": 510
},
{
"epoch": 0.30659386812263756,
"grad_norm": 1.7578125,
"learning_rate": 2.861386138613861e-06,
"loss": 1.6614,
"step": 511
},
{
"epoch": 0.30719385612287753,
"grad_norm": 1.65625,
"learning_rate": 2.858910891089109e-06,
"loss": 1.6932,
"step": 512
},
{
"epoch": 0.30779384412311755,
"grad_norm": 1.6640625,
"learning_rate": 2.856435643564356e-06,
"loss": 1.7368,
"step": 513
},
{
"epoch": 0.3083938321233575,
"grad_norm": 1.6328125,
"learning_rate": 2.853960396039604e-06,
"loss": 1.6836,
"step": 514
},
{
"epoch": 0.30899382012359755,
"grad_norm": 1.6953125,
"learning_rate": 2.8514851485148512e-06,
"loss": 1.5781,
"step": 515
},
{
"epoch": 0.3095938081238375,
"grad_norm": 1.75,
"learning_rate": 2.849009900990099e-06,
"loss": 1.6617,
"step": 516
},
{
"epoch": 0.3101937961240775,
"grad_norm": 1.8203125,
"learning_rate": 2.8465346534653464e-06,
"loss": 1.6568,
"step": 517
},
{
"epoch": 0.3107937841243175,
"grad_norm": 1.671875,
"learning_rate": 2.844059405940594e-06,
"loss": 1.6805,
"step": 518
},
{
"epoch": 0.3113937721245575,
"grad_norm": 1.8046875,
"learning_rate": 2.8415841584158415e-06,
"loss": 1.6114,
"step": 519
},
{
"epoch": 0.3119937601247975,
"grad_norm": 1.71875,
"learning_rate": 2.8391089108910892e-06,
"loss": 1.6399,
"step": 520
},
{
"epoch": 0.3125937481250375,
"grad_norm": 1.6640625,
"learning_rate": 2.8366336633663366e-06,
"loss": 1.6753,
"step": 521
},
{
"epoch": 0.3131937361252775,
"grad_norm": 1.6640625,
"learning_rate": 2.834158415841584e-06,
"loss": 1.6818,
"step": 522
},
{
"epoch": 0.3137937241255175,
"grad_norm": 1.7421875,
"learning_rate": 2.8316831683168313e-06,
"loss": 1.6999,
"step": 523
},
{
"epoch": 0.3143937121257575,
"grad_norm": 1.6484375,
"learning_rate": 2.829207920792079e-06,
"loss": 1.5612,
"step": 524
},
{
"epoch": 0.3149937001259975,
"grad_norm": 1.8671875,
"learning_rate": 2.8267326732673264e-06,
"loss": 1.7166,
"step": 525
},
{
"epoch": 0.31559368812623745,
"grad_norm": 1.78125,
"learning_rate": 2.824257425742574e-06,
"loss": 1.6112,
"step": 526
},
{
"epoch": 0.3161936761264775,
"grad_norm": 1.5625,
"learning_rate": 2.8217821782178215e-06,
"loss": 1.5862,
"step": 527
},
{
"epoch": 0.31679366412671744,
"grad_norm": 1.5390625,
"learning_rate": 2.8193069306930692e-06,
"loss": 1.6285,
"step": 528
},
{
"epoch": 0.31739365212695747,
"grad_norm": 1.5234375,
"learning_rate": 2.8168316831683166e-06,
"loss": 1.6099,
"step": 529
},
{
"epoch": 0.31799364012719744,
"grad_norm": 1.7109375,
"learning_rate": 2.8143564356435644e-06,
"loss": 1.6994,
"step": 530
},
{
"epoch": 0.31859362812743747,
"grad_norm": 1.703125,
"learning_rate": 2.8118811881188117e-06,
"loss": 1.5936,
"step": 531
},
{
"epoch": 0.31919361612767744,
"grad_norm": 1.53125,
"learning_rate": 2.8094059405940595e-06,
"loss": 1.6696,
"step": 532
},
{
"epoch": 0.31979360412791746,
"grad_norm": 1.765625,
"learning_rate": 2.806930693069307e-06,
"loss": 1.5878,
"step": 533
},
{
"epoch": 0.32039359212815743,
"grad_norm": 1.765625,
"learning_rate": 2.8044554455445546e-06,
"loss": 1.6611,
"step": 534
},
{
"epoch": 0.3209935801283974,
"grad_norm": 1.71875,
"learning_rate": 2.8019801980198015e-06,
"loss": 1.7178,
"step": 535
},
{
"epoch": 0.32159356812863743,
"grad_norm": 1.734375,
"learning_rate": 2.7995049504950493e-06,
"loss": 1.6074,
"step": 536
},
{
"epoch": 0.3221935561288774,
"grad_norm": 1.6484375,
"learning_rate": 2.7970297029702966e-06,
"loss": 1.6906,
"step": 537
},
{
"epoch": 0.3227935441291174,
"grad_norm": 1.6015625,
"learning_rate": 2.7945544554455444e-06,
"loss": 1.6133,
"step": 538
},
{
"epoch": 0.3233935321293574,
"grad_norm": 1.5390625,
"learning_rate": 2.7920792079207917e-06,
"loss": 1.6716,
"step": 539
},
{
"epoch": 0.3239935201295974,
"grad_norm": 1.78125,
"learning_rate": 2.7896039603960395e-06,
"loss": 1.6822,
"step": 540
},
{
"epoch": 0.3245935081298374,
"grad_norm": 1.671875,
"learning_rate": 2.787128712871287e-06,
"loss": 1.6233,
"step": 541
},
{
"epoch": 0.3251934961300774,
"grad_norm": 1.578125,
"learning_rate": 2.7846534653465346e-06,
"loss": 1.6941,
"step": 542
},
{
"epoch": 0.3257934841303174,
"grad_norm": 1.59375,
"learning_rate": 2.782178217821782e-06,
"loss": 1.7407,
"step": 543
},
{
"epoch": 0.3263934721305574,
"grad_norm": 1.7578125,
"learning_rate": 2.7797029702970297e-06,
"loss": 1.6666,
"step": 544
},
{
"epoch": 0.3269934601307974,
"grad_norm": 1.6875,
"learning_rate": 2.777227722772277e-06,
"loss": 1.5721,
"step": 545
},
{
"epoch": 0.32759344813103736,
"grad_norm": 1.6328125,
"learning_rate": 2.774752475247525e-06,
"loss": 1.6345,
"step": 546
},
{
"epoch": 0.3281934361312774,
"grad_norm": 1.578125,
"learning_rate": 2.772277227722772e-06,
"loss": 1.7061,
"step": 547
},
{
"epoch": 0.32879342413151735,
"grad_norm": 1.921875,
"learning_rate": 2.7698019801980195e-06,
"loss": 1.72,
"step": 548
},
{
"epoch": 0.3293934121317574,
"grad_norm": 1.4765625,
"learning_rate": 2.767326732673267e-06,
"loss": 1.5754,
"step": 549
},
{
"epoch": 0.32999340013199735,
"grad_norm": 1.796875,
"learning_rate": 2.7648514851485146e-06,
"loss": 1.6856,
"step": 550
},
{
"epoch": 0.32999340013199735,
"eval_loss": 1.7785999774932861,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 65.9577,
"eval_samples_per_second": 151.612,
"eval_steps_per_second": 25.274,
"step": 550
},
|
{
"epoch": 0.3305933881322374,
"grad_norm": 1.75,
"learning_rate": 2.762376237623762e-06,
"loss": 1.6151,
"step": 551
},
{
"epoch": 0.33119337613247735,
"grad_norm": 1.59375,
"learning_rate": 2.7599009900990097e-06,
"loss": 1.6493,
"step": 552
},
{
"epoch": 0.33179336413271737,
"grad_norm": 1.8203125,
"learning_rate": 2.757425742574257e-06,
"loss": 1.7389,
"step": 553
},
{
"epoch": 0.33239335213295734,
"grad_norm": 1.7109375,
"learning_rate": 2.754950495049505e-06,
"loss": 1.6765,
"step": 554
},
{
"epoch": 0.3329933401331973,
"grad_norm": 1.625,
"learning_rate": 2.752475247524752e-06,
"loss": 1.6237,
"step": 555
},
{
"epoch": 0.33359332813343734,
"grad_norm": 1.5,
"learning_rate": 2.75e-06,
"loss": 1.7249,
"step": 556
},
{
"epoch": 0.3341933161336773,
"grad_norm": 1.71875,
"learning_rate": 2.7475247524752477e-06,
"loss": 1.6253,
"step": 557
},
{
"epoch": 0.33479330413391734,
"grad_norm": 1.671875,
"learning_rate": 2.745049504950495e-06,
"loss": 1.6199,
"step": 558
},
{
"epoch": 0.3353932921341573,
"grad_norm": 1.703125,
"learning_rate": 2.742574257425743e-06,
"loss": 1.6945,
"step": 559
},
{
"epoch": 0.33599328013439733,
"grad_norm": 1.5859375,
"learning_rate": 2.74009900990099e-06,
"loss": 1.6295,
"step": 560
},
{
"epoch": 0.3365932681346373,
"grad_norm": 1.5625,
"learning_rate": 2.737623762376238e-06,
"loss": 1.617,
"step": 561
},
{
"epoch": 0.33719325613487733,
"grad_norm": 1.546875,
"learning_rate": 2.735148514851485e-06,
"loss": 1.7085,
"step": 562
},
{
"epoch": 0.3377932441351173,
"grad_norm": 1.765625,
"learning_rate": 2.7326732673267326e-06,
"loss": 1.7428,
"step": 563
},
{
"epoch": 0.33839323213535727,
"grad_norm": 1.5703125,
"learning_rate": 2.73019801980198e-06,
"loss": 1.5509,
"step": 564
},
{
"epoch": 0.3389932201355973,
"grad_norm": 1.65625,
"learning_rate": 2.7277227722772277e-06,
"loss": 1.6567,
"step": 565
},
{
"epoch": 0.33959320813583727,
"grad_norm": 1.515625,
"learning_rate": 2.725247524752475e-06,
"loss": 1.5832,
"step": 566
},
{
"epoch": 0.3401931961360773,
"grad_norm": 1.7109375,
"learning_rate": 2.722772277227723e-06,
"loss": 1.6637,
"step": 567
},
{
"epoch": 0.34079318413631726,
"grad_norm": 1.5390625,
"learning_rate": 2.72029702970297e-06,
"loss": 1.6742,
"step": 568
},
{
"epoch": 0.3413931721365573,
"grad_norm": 1.4375,
"learning_rate": 2.717821782178218e-06,
"loss": 1.6378,
"step": 569
},
{
"epoch": 0.34199316013679726,
"grad_norm": 1.5234375,
"learning_rate": 2.7153465346534653e-06,
"loss": 1.6595,
"step": 570
},
{
"epoch": 0.3425931481370373,
"grad_norm": 1.765625,
"learning_rate": 2.712871287128713e-06,
"loss": 1.7167,
"step": 571
},
{
"epoch": 0.34319313613727725,
"grad_norm": 1.8046875,
"learning_rate": 2.7103960396039604e-06,
"loss": 1.7184,
"step": 572
},
{
"epoch": 0.3437931241375172,
"grad_norm": 1.6796875,
"learning_rate": 2.707920792079208e-06,
"loss": 1.6598,
"step": 573
},
{
"epoch": 0.34439311213775725,
"grad_norm": 1.578125,
"learning_rate": 2.7054455445544555e-06,
"loss": 1.7302,
"step": 574
},
{
"epoch": 0.3449931001379972,
"grad_norm": 2.0,
"learning_rate": 2.702970297029703e-06,
"loss": 1.6286,
"step": 575
},
{
"epoch": 0.34559308813823725,
"grad_norm": 1.6171875,
"learning_rate": 2.70049504950495e-06,
"loss": 1.7379,
"step": 576
},
{
"epoch": 0.3461930761384772,
"grad_norm": 1.8203125,
"learning_rate": 2.698019801980198e-06,
"loss": 1.7016,
"step": 577
},
{
"epoch": 0.34679306413871724,
"grad_norm": 1.890625,
"learning_rate": 2.6955445544554453e-06,
"loss": 1.6231,
"step": 578
},
{
"epoch": 0.3473930521389572,
"grad_norm": 1.9140625,
"learning_rate": 2.693069306930693e-06,
"loss": 1.7478,
"step": 579
},
{
"epoch": 0.34799304013919724,
"grad_norm": 1.6484375,
"learning_rate": 2.6905940594059404e-06,
"loss": 1.6871,
"step": 580
},
{
"epoch": 0.3485930281394372,
"grad_norm": 1.6015625,
"learning_rate": 2.688118811881188e-06,
"loss": 1.6774,
"step": 581
},
{
"epoch": 0.3491930161396772,
"grad_norm": 1.765625,
"learning_rate": 2.6856435643564355e-06,
"loss": 1.6801,
"step": 582
},
{
"epoch": 0.3497930041399172,
"grad_norm": 1.703125,
"learning_rate": 2.6831683168316833e-06,
"loss": 1.6734,
"step": 583
},
{
"epoch": 0.3503929921401572,
"grad_norm": 1.59375,
"learning_rate": 2.6806930693069306e-06,
"loss": 1.6161,
"step": 584
},
{
"epoch": 0.3509929801403972,
"grad_norm": 1.84375,
"learning_rate": 2.6782178217821784e-06,
"loss": 1.621,
"step": 585
},
{
"epoch": 0.3515929681406372,
"grad_norm": 1.71875,
"learning_rate": 2.6757425742574257e-06,
"loss": 1.6603,
"step": 586
},
{
"epoch": 0.3521929561408772,
"grad_norm": 1.9296875,
"learning_rate": 2.6732673267326735e-06,
"loss": 1.7578,
"step": 587
},
{
"epoch": 0.35279294414111717,
"grad_norm": 1.75,
"learning_rate": 2.6707920792079204e-06,
"loss": 1.7356,
"step": 588
},
{
"epoch": 0.3533929321413572,
"grad_norm": 1.7421875,
"learning_rate": 2.668316831683168e-06,
"loss": 1.7648,
"step": 589
},
{
"epoch": 0.35399292014159717,
"grad_norm": 1.5703125,
"learning_rate": 2.6658415841584155e-06,
"loss": 1.6623,
"step": 590
},
{
"epoch": 0.35459290814183714,
"grad_norm": 1.8515625,
"learning_rate": 2.6633663366336633e-06,
"loss": 1.7709,
"step": 591
},
{
"epoch": 0.35519289614207716,
"grad_norm": 1.7890625,
"learning_rate": 2.6608910891089106e-06,
"loss": 1.6477,
"step": 592
},
{
"epoch": 0.35579288414231713,
"grad_norm": 1.5546875,
"learning_rate": 2.6584158415841584e-06,
"loss": 1.5843,
"step": 593
},
{
"epoch": 0.35639287214255716,
"grad_norm": 1.5703125,
"learning_rate": 2.6559405940594057e-06,
"loss": 1.6412,
"step": 594
},
{
"epoch": 0.35699286014279713,
"grad_norm": 1.6328125,
"learning_rate": 2.6534653465346535e-06,
"loss": 1.646,
"step": 595
},
{
"epoch": 0.35759284814303716,
"grad_norm": 1.65625,
"learning_rate": 2.650990099009901e-06,
"loss": 1.753,
"step": 596
},
{
"epoch": 0.3581928361432771,
"grad_norm": 1.609375,
"learning_rate": 2.6485148514851486e-06,
"loss": 1.6465,
"step": 597
},
{
"epoch": 0.35879282414351715,
"grad_norm": 1.6796875,
"learning_rate": 2.646039603960396e-06,
"loss": 1.6874,
"step": 598
},
{
"epoch": 0.3593928121437571,
"grad_norm": 1.6875,
"learning_rate": 2.6435643564356437e-06,
"loss": 1.6842,
"step": 599
},
{
"epoch": 0.3599928001439971,
"grad_norm": 1.53125,
"learning_rate": 2.641089108910891e-06,
"loss": 1.692,
"step": 600
},
{
"epoch": 0.3599928001439971,
"eval_loss": 1.7782986164093018,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 68.6919,
"eval_samples_per_second": 145.577,
"eval_steps_per_second": 24.268,
"step": 600
},
|
{
"epoch": 0.3605927881442371,
"grad_norm": 1.703125,
"learning_rate": 2.6386138613861384e-06,
"loss": 1.7203,
"step": 601
},
{
"epoch": 0.3611927761444771,
"grad_norm": 1.6953125,
"learning_rate": 2.6361386138613858e-06,
"loss": 1.7563,
"step": 602
},
{
"epoch": 0.3617927641447171,
"grad_norm": 1.625,
"learning_rate": 2.6336633663366335e-06,
"loss": 1.5713,
"step": 603
},
{
"epoch": 0.3623927521449571,
"grad_norm": 1.671875,
"learning_rate": 2.631188118811881e-06,
"loss": 1.7128,
"step": 604
},
{
"epoch": 0.3629927401451971,
"grad_norm": 1.7109375,
"learning_rate": 2.6287128712871286e-06,
"loss": 1.7093,
"step": 605
},
{
"epoch": 0.3635927281454371,
"grad_norm": 1.9765625,
"learning_rate": 2.626237623762376e-06,
"loss": 1.7053,
"step": 606
},
{
"epoch": 0.3641927161456771,
"grad_norm": 1.5703125,
"learning_rate": 2.6237623762376237e-06,
"loss": 1.6861,
"step": 607
},
{
"epoch": 0.3647927041459171,
"grad_norm": 1.515625,
"learning_rate": 2.621287128712871e-06,
"loss": 1.5996,
"step": 608
},
{
"epoch": 0.36539269214615705,
"grad_norm": 1.65625,
"learning_rate": 2.618811881188119e-06,
"loss": 1.6401,
"step": 609
},
{
"epoch": 0.3659926801463971,
"grad_norm": 1.671875,
"learning_rate": 2.616336633663366e-06,
"loss": 1.6489,
"step": 610
},
{
"epoch": 0.36659266814663705,
"grad_norm": 1.59375,
"learning_rate": 2.613861386138614e-06,
"loss": 1.7161,
"step": 611
},
{
"epoch": 0.36719265614687707,
"grad_norm": 1.9453125,
"learning_rate": 2.6113861386138613e-06,
"loss": 1.5981,
"step": 612
},
{
"epoch": 0.36779264414711704,
"grad_norm": 1.7265625,
"learning_rate": 2.608910891089109e-06,
"loss": 1.7098,
"step": 613
},
{
"epoch": 0.36839263214735707,
"grad_norm": 1.796875,
"learning_rate": 2.606435643564356e-06,
"loss": 1.6385,
"step": 614
},
{
"epoch": 0.36899262014759704,
"grad_norm": 1.6328125,
"learning_rate": 2.6039603960396038e-06,
"loss": 1.7776,
"step": 615
},
{
"epoch": 0.36959260814783707,
"grad_norm": 1.765625,
"learning_rate": 2.601485148514851e-06,
"loss": 1.6092,
"step": 616
},
{
"epoch": 0.37019259614807704,
"grad_norm": 1.6015625,
"learning_rate": 2.599009900990099e-06,
"loss": 1.7118,
"step": 617
},
{
"epoch": 0.370792584148317,
"grad_norm": 1.796875,
"learning_rate": 2.596534653465346e-06,
"loss": 1.5554,
"step": 618
},
{
"epoch": 0.37139257214855703,
"grad_norm": 1.671875,
"learning_rate": 2.594059405940594e-06,
"loss": 1.7137,
"step": 619
},
{
"epoch": 0.371992560148797,
"grad_norm": 2.265625,
"learning_rate": 2.5915841584158413e-06,
"loss": 1.568,
"step": 620
},
{
"epoch": 0.37259254814903703,
"grad_norm": 1.6796875,
"learning_rate": 2.589108910891089e-06,
"loss": 1.6098,
"step": 621
},
{
"epoch": 0.373192536149277,
"grad_norm": 1.7265625,
"learning_rate": 2.5866336633663364e-06,
"loss": 1.7429,
"step": 622
},
{
"epoch": 0.373792524149517,
"grad_norm": 1.515625,
"learning_rate": 2.584158415841584e-06,
"loss": 1.5776,
"step": 623
},
{
"epoch": 0.374392512149757,
"grad_norm": 1.6796875,
"learning_rate": 2.5816831683168315e-06,
"loss": 1.7225,
"step": 624
},
{
"epoch": 0.374992500149997,
"grad_norm": 1.671875,
"learning_rate": 2.5792079207920793e-06,
"loss": 1.636,
"step": 625
},
{
"epoch": 0.375592488150237,
"grad_norm": 1.71875,
"learning_rate": 2.5767326732673266e-06,
"loss": 1.6907,
"step": 626
},
{
"epoch": 0.376192476150477,
"grad_norm": 1.515625,
"learning_rate": 2.5742574257425744e-06,
"loss": 1.6,
"step": 627
},
{
"epoch": 0.376792464150717,
"grad_norm": 1.703125,
"learning_rate": 2.5717821782178213e-06,
"loss": 1.7379,
"step": 628
},
{
"epoch": 0.37739245215095696,
"grad_norm": 1.8203125,
"learning_rate": 2.569306930693069e-06,
"loss": 1.7201,
"step": 629
},
{
"epoch": 0.377992440151197,
"grad_norm": 1.75,
"learning_rate": 2.5668316831683164e-06,
"loss": 1.5961,
"step": 630
},
{
"epoch": 0.37859242815143695,
"grad_norm": 1.609375,
"learning_rate": 2.564356435643564e-06,
"loss": 1.6398,
"step": 631
},
{
"epoch": 0.379192416151677,
"grad_norm": 1.84375,
"learning_rate": 2.5618811881188115e-06,
"loss": 1.7187,
"step": 632
},
{
"epoch": 0.37979240415191695,
"grad_norm": 1.7578125,
"learning_rate": 2.5594059405940593e-06,
"loss": 1.6705,
"step": 633
},
{
"epoch": 0.380392392152157,
"grad_norm": 1.5703125,
"learning_rate": 2.5569306930693067e-06,
"loss": 1.6784,
"step": 634
},
{
"epoch": 0.38099238015239695,
"grad_norm": 1.71875,
"learning_rate": 2.5544554455445544e-06,
"loss": 1.6956,
"step": 635
},
{
"epoch": 0.381592368152637,
"grad_norm": 1.5625,
"learning_rate": 2.5519801980198018e-06,
"loss": 1.6573,
"step": 636
},
{
"epoch": 0.38219235615287694,
"grad_norm": 1.53125,
"learning_rate": 2.5495049504950495e-06,
"loss": 1.6514,
"step": 637
},
{
"epoch": 0.3827923441531169,
"grad_norm": 1.6328125,
"learning_rate": 2.547029702970297e-06,
"loss": 1.5906,
"step": 638
},
{
"epoch": 0.38339233215335694,
"grad_norm": 1.6875,
"learning_rate": 2.5445544554455446e-06,
"loss": 1.6045,
"step": 639
},
{
"epoch": 0.3839923201535969,
"grad_norm": 1.8125,
"learning_rate": 2.542079207920792e-06,
"loss": 1.6549,
"step": 640
},
{
"epoch": 0.38459230815383694,
"grad_norm": 1.59375,
"learning_rate": 2.5396039603960393e-06,
"loss": 1.6548,
"step": 641
},
{
"epoch": 0.3851922961540769,
"grad_norm": 1.75,
"learning_rate": 2.5371287128712867e-06,
"loss": 1.6412,
"step": 642
},
{
"epoch": 0.38579228415431693,
"grad_norm": 1.59375,
"learning_rate": 2.5346534653465344e-06,
"loss": 1.6326,
"step": 643
},
{
"epoch": 0.3863922721545569,
"grad_norm": 1.8125,
"learning_rate": 2.5321782178217818e-06,
"loss": 1.5856,
"step": 644
},
{
"epoch": 0.38699226015479693,
"grad_norm": 1.8203125,
"learning_rate": 2.5297029702970295e-06,
"loss": 1.548,
"step": 645
},
{
"epoch": 0.3875922481550369,
"grad_norm": 1.546875,
"learning_rate": 2.527227722772277e-06,
"loss": 1.6333,
"step": 646
},
{
"epoch": 0.38819223615527687,
"grad_norm": 1.796875,
"learning_rate": 2.5247524752475247e-06,
"loss": 1.6828,
"step": 647
},
{
"epoch": 0.3887922241555169,
"grad_norm": 1.6328125,
"learning_rate": 2.522277227722772e-06,
"loss": 1.6181,
"step": 648
},
{
"epoch": 0.38939221215575687,
"grad_norm": 1.625,
"learning_rate": 2.5198019801980198e-06,
"loss": 1.6848,
"step": 649
},
{
"epoch": 0.3899922001559969,
"grad_norm": 1.734375,
"learning_rate": 2.517326732673267e-06,
"loss": 1.7051,
"step": 650
},
{
"epoch": 0.3899922001559969,
"eval_loss": 1.778051733970642,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 65.9746,
"eval_samples_per_second": 151.573,
"eval_steps_per_second": 25.267,
"step": 650
},
|
{
"epoch": 0.39059218815623686,
"grad_norm": 1.5,
"learning_rate": 2.514851485148515e-06,
"loss": 1.6987,
"step": 651
},
{
"epoch": 0.3911921761564769,
"grad_norm": 1.84375,
"learning_rate": 2.5123762376237622e-06,
"loss": 1.6951,
"step": 652
},
{
"epoch": 0.39179216415671686,
"grad_norm": 1.6015625,
"learning_rate": 2.50990099009901e-06,
"loss": 1.6218,
"step": 653
},
{
"epoch": 0.3923921521569569,
"grad_norm": 1.6015625,
"learning_rate": 2.507425742574257e-06,
"loss": 1.7695,
"step": 654
},
{
"epoch": 0.39299214015719686,
"grad_norm": 1.5625,
"learning_rate": 2.5049504950495047e-06,
"loss": 1.6856,
"step": 655
},
{
"epoch": 0.3935921281574368,
"grad_norm": 1.6640625,
"learning_rate": 2.502475247524752e-06,
"loss": 1.658,
"step": 656
},
{
"epoch": 0.39419211615767685,
"grad_norm": 1.8203125,
"learning_rate": 2.4999999999999998e-06,
"loss": 1.6642,
"step": 657
},
{
"epoch": 0.3947921041579168,
"grad_norm": 1.890625,
"learning_rate": 2.4975247524752475e-06,
"loss": 1.6252,
"step": 658
},
{
"epoch": 0.39539209215815685,
"grad_norm": 1.6484375,
"learning_rate": 2.495049504950495e-06,
"loss": 1.6601,
"step": 659
},
{
"epoch": 0.3959920801583968,
"grad_norm": 1.671875,
"learning_rate": 2.4925742574257427e-06,
"loss": 1.6114,
"step": 660
},
{
"epoch": 0.39659206815863685,
"grad_norm": 1.515625,
"learning_rate": 2.49009900990099e-06,
"loss": 1.6902,
"step": 661
},
{
"epoch": 0.3971920561588768,
"grad_norm": 1.6015625,
"learning_rate": 2.4876237623762378e-06,
"loss": 1.5903,
"step": 662
},
{
"epoch": 0.39779204415911684,
"grad_norm": 1.8359375,
"learning_rate": 2.485148514851485e-06,
"loss": 1.7081,
"step": 663
},
{
"epoch": 0.3983920321593568,
"grad_norm": 1.765625,
"learning_rate": 2.482673267326733e-06,
"loss": 1.7127,
"step": 664
},
{
"epoch": 0.3989920201595968,
"grad_norm": 1.78125,
"learning_rate": 2.4801980198019802e-06,
"loss": 1.538,
"step": 665
},
{
"epoch": 0.3995920081598368,
"grad_norm": 1.609375,
"learning_rate": 2.477722772277228e-06,
"loss": 1.7153,
"step": 666
},
{
"epoch": 0.4001919961600768,
"grad_norm": 1.5703125,
"learning_rate": 2.475247524752475e-06,
"loss": 1.5796,
"step": 667
},
{
"epoch": 0.4007919841603168,
"grad_norm": 1.796875,
"learning_rate": 2.4727722772277227e-06,
"loss": 1.7261,
"step": 668
},
{
"epoch": 0.4013919721605568,
"grad_norm": 1.75,
"learning_rate": 2.47029702970297e-06,
"loss": 1.6201,
"step": 669
},
{
"epoch": 0.4019919601607968,
"grad_norm": 1.78125,
"learning_rate": 2.4678217821782178e-06,
"loss": 1.5838,
"step": 670
},
{
"epoch": 0.4025919481610368,
"grad_norm": 1.828125,
"learning_rate": 2.465346534653465e-06,
"loss": 1.675,
"step": 671
},
{
"epoch": 0.4031919361612768,
"grad_norm": 1.625,
"learning_rate": 2.462871287128713e-06,
"loss": 1.5512,
"step": 672
},
{
"epoch": 0.40379192416151677,
"grad_norm": 1.625,
"learning_rate": 2.4603960396039602e-06,
"loss": 1.7064,
"step": 673
},
{
"epoch": 0.40439191216175674,
"grad_norm": 1.7265625,
"learning_rate": 2.457920792079208e-06,
"loss": 1.6955,
"step": 674
},
{
"epoch": 0.40499190016199677,
"grad_norm": 1.7265625,
"learning_rate": 2.4554455445544553e-06,
"loss": 1.5868,
"step": 675
},
{
"epoch": 0.40559188816223674,
"grad_norm": 1.84375,
"learning_rate": 2.452970297029703e-06,
"loss": 1.6344,
"step": 676
},
{
"epoch": 0.40619187616247676,
"grad_norm": 1.7734375,
"learning_rate": 2.4504950495049505e-06,
"loss": 1.7793,
"step": 677
},
{
"epoch": 0.40679186416271673,
"grad_norm": 1.6328125,
"learning_rate": 2.4480198019801982e-06,
"loss": 1.6707,
"step": 678
},
{
"epoch": 0.40739185216295676,
"grad_norm": 1.453125,
"learning_rate": 2.4455445544554456e-06,
"loss": 1.6521,
"step": 679
},
{
"epoch": 0.40799184016319673,
"grad_norm": 1.6796875,
"learning_rate": 2.4430693069306933e-06,
"loss": 1.6611,
"step": 680
},
{
"epoch": 0.40859182816343675,
"grad_norm": 1.625,
"learning_rate": 2.4405940594059402e-06,
"loss": 1.7121,
"step": 681
},
{
"epoch": 0.4091918161636767,
"grad_norm": 1.6796875,
"learning_rate": 2.438118811881188e-06,
"loss": 1.6899,
"step": 682
},
{
"epoch": 0.4097918041639167,
"grad_norm": 1.609375,
"learning_rate": 2.4356435643564354e-06,
"loss": 1.5913,
"step": 683
},
{
"epoch": 0.4103917921641567,
"grad_norm": 1.59375,
"learning_rate": 2.433168316831683e-06,
"loss": 1.6691,
"step": 684
},
{
"epoch": 0.4109917801643967,
"grad_norm": 1.8359375,
"learning_rate": 2.4306930693069305e-06,
"loss": 1.5859,
"step": 685
},
{
"epoch": 0.4115917681646367,
"grad_norm": 1.703125,
"learning_rate": 2.4282178217821782e-06,
"loss": 1.6688,
"step": 686
},
{
"epoch": 0.4121917561648767,
"grad_norm": 1.6953125,
"learning_rate": 2.4257425742574256e-06,
"loss": 1.6196,
"step": 687
},
{
"epoch": 0.4127917441651167,
"grad_norm": 1.6171875,
"learning_rate": 2.4232673267326733e-06,
"loss": 1.6959,
"step": 688
},
{
"epoch": 0.4133917321653567,
"grad_norm": 1.671875,
"learning_rate": 2.4207920792079207e-06,
"loss": 1.6958,
"step": 689
},
{
"epoch": 0.4139917201655967,
"grad_norm": 1.6875,
"learning_rate": 2.4183168316831685e-06,
"loss": 1.6803,
"step": 690
},
{
"epoch": 0.4145917081658367,
"grad_norm": 1.6484375,
"learning_rate": 2.415841584158416e-06,
"loss": 1.67,
"step": 691
},
{
"epoch": 0.41519169616607665,
"grad_norm": 1.6953125,
"learning_rate": 2.4133663366336636e-06,
"loss": 1.6805,
"step": 692
},
{
"epoch": 0.4157916841663167,
"grad_norm": 1.859375,
"learning_rate": 2.410891089108911e-06,
"loss": 1.67,
"step": 693
},
{
"epoch": 0.41639167216655665,
"grad_norm": 1.8203125,
"learning_rate": 2.4084158415841582e-06,
"loss": 1.5639,
"step": 694
},
{
"epoch": 0.4169916601667967,
"grad_norm": 1.84375,
"learning_rate": 2.4059405940594056e-06,
"loss": 1.7609,
"step": 695
},
{
"epoch": 0.41759164816703664,
"grad_norm": 1.5625,
"learning_rate": 2.4034653465346534e-06,
"loss": 1.6972,
"step": 696
},
{
"epoch": 0.41819163616727667,
"grad_norm": 1.640625,
"learning_rate": 2.4009900990099007e-06,
"loss": 1.5793,
"step": 697
},
{
"epoch": 0.41879162416751664,
"grad_norm": 1.6640625,
"learning_rate": 2.3985148514851485e-06,
"loss": 1.6058,
"step": 698
},
{
"epoch": 0.41939161216775667,
"grad_norm": 1.8671875,
"learning_rate": 2.396039603960396e-06,
"loss": 1.7138,
"step": 699
},
{
"epoch": 0.41999160016799664,
"grad_norm": 1.53125,
"learning_rate": 2.3935643564356436e-06,
"loss": 1.6364,
"step": 700
},
{
"epoch": 0.41999160016799664,
"eval_loss": 1.777879238128662,
"eval_model_preparation_time": 0.0037,
"eval_runtime": 66.0918,
"eval_samples_per_second": 151.305,
"eval_steps_per_second": 25.222,
"step": 700
},
|
{
"epoch": 0.4205915881682366,
"grad_norm": 1.75,
"learning_rate": 2.391089108910891e-06,
"loss": 1.7899,
"step": 701
},
{
"epoch": 0.42119157616847663,
"grad_norm": 1.8671875,
"learning_rate": 2.3886138613861387e-06,
"loss": 1.6305,
"step": 702
},
{
"epoch": 0.4217915641687166,
"grad_norm": 1.6953125,
"learning_rate": 2.386138613861386e-06,
"loss": 1.6793,
"step": 703
},
{
"epoch": 0.42239155216895663,
"grad_norm": 1.703125,
"learning_rate": 2.383663366336634e-06,
"loss": 1.657,
"step": 704
},
{
"epoch": 0.4229915401691966,
"grad_norm": 1.6484375,
"learning_rate": 2.381188118811881e-06,
"loss": 1.6353,
"step": 705
},
{
"epoch": 0.4235915281694366,
"grad_norm": 1.8515625,
"learning_rate": 2.378712871287129e-06,
"loss": 1.592,
"step": 706
},
{
"epoch": 0.4241915161696766,
"grad_norm": 1.546875,
"learning_rate": 2.376237623762376e-06,
"loss": 1.615,
"step": 707
},
{
"epoch": 0.4247915041699166,
"grad_norm": 1.6875,
"learning_rate": 2.3737623762376236e-06,
"loss": 1.5985,
"step": 708
},
{
"epoch": 0.4253914921701566,
"grad_norm": 1.8046875,
"learning_rate": 2.371287128712871e-06,
"loss": 1.6037,
"step": 709
},
{
"epoch": 0.4259914801703966,
"grad_norm": 1.671875,
"learning_rate": 2.3688118811881187e-06,
"loss": 1.6959,
"step": 710
},
{
"epoch": 0.4265914681706366,
"grad_norm": 1.6796875,
"learning_rate": 2.366336633663366e-06,
"loss": 1.726,
"step": 711
},
{
"epoch": 0.42719145617087656,
"grad_norm": 1.75,
"learning_rate": 2.363861386138614e-06,
"loss": 1.7963,
"step": 712
},
{
"epoch": 0.4277914441711166,
"grad_norm": 1.6328125,
"learning_rate": 2.361386138613861e-06,
"loss": 1.64,
"step": 713
},
{
"epoch": 0.42839143217135656,
"grad_norm": 1.75,
"learning_rate": 2.358910891089109e-06,
"loss": 1.6623,
"step": 714
},
{
"epoch": 0.4289914201715966,
"grad_norm": 1.609375,
"learning_rate": 2.3564356435643563e-06,
"loss": 1.6111,
"step": 715
},
{
"epoch": 0.42959140817183655,
"grad_norm": 1.765625,
"learning_rate": 2.353960396039604e-06,
"loss": 1.7229,
"step": 716
},
{
"epoch": 0.4301913961720766,
"grad_norm": 1.6640625,
"learning_rate": 2.3514851485148514e-06,
"loss": 1.6864,
"step": 717
},
{
"epoch": 0.43079138417231655,
"grad_norm": 1.453125,
"learning_rate": 2.349009900990099e-06,
"loss": 1.7142,
"step": 718
},
{
"epoch": 0.4313913721725566,
"grad_norm": 1.8984375,
"learning_rate": 2.3465346534653465e-06,
"loss": 1.7028,
"step": 719
},
{
"epoch": 0.43199136017279655,
"grad_norm": 1.765625,
"learning_rate": 2.3440594059405942e-06,
"loss": 1.7853,
"step": 720
},
{
"epoch": 0.4325913481730365,
"grad_norm": 1.6875,
"learning_rate": 2.341584158415841e-06,
"loss": 1.6417,
"step": 721
},
{
"epoch": 0.43319133617327654,
"grad_norm": 1.5546875,
"learning_rate": 2.339108910891089e-06,
"loss": 1.6398,
"step": 722
},
{
"epoch": 0.4337913241735165,
"grad_norm": 1.84375,
"learning_rate": 2.3366336633663363e-06,
"loss": 1.6656,
"step": 723
},
{
"epoch": 0.43439131217375654,
"grad_norm": 1.578125,
"learning_rate": 2.334158415841584e-06,
"loss": 1.6787,
"step": 724
},
{
"epoch": 0.4349913001739965,
"grad_norm": 1.7421875,
"learning_rate": 2.3316831683168314e-06,
"loss": 1.6582,
"step": 725
},
{
"epoch": 0.43559128817423654,
"grad_norm": 1.703125,
"learning_rate": 2.329207920792079e-06,
"loss": 1.6612,
"step": 726
},
{
"epoch": 0.4361912761744765,
"grad_norm": 1.7734375,
"learning_rate": 2.3267326732673265e-06,
"loss": 1.7151,
"step": 727
},
{
"epoch": 0.43679126417471653,
"grad_norm": 1.78125,
"learning_rate": 2.3242574257425743e-06,
"loss": 1.6576,
"step": 728
},
{
"epoch": 0.4373912521749565,
"grad_norm": 1.59375,
"learning_rate": 2.3217821782178216e-06,
"loss": 1.7122,
"step": 729
},
{
"epoch": 0.4379912401751965,
"grad_norm": 1.75,
"learning_rate": 2.3193069306930694e-06,
"loss": 1.6774,
"step": 730
},
{
"epoch": 0.4385912281754365,
"grad_norm": 1.578125,
"learning_rate": 2.3168316831683167e-06,
"loss": 1.6655,
"step": 731
},
{
"epoch": 0.43919121617567647,
"grad_norm": 1.78125,
"learning_rate": 2.3143564356435645e-06,
"loss": 1.6039,
"step": 732
},
{
"epoch": 0.4397912041759165,
"grad_norm": 1.671875,
"learning_rate": 2.3118811881188114e-06,
"loss": 1.6186,
"step": 733
},
{
"epoch": 0.44039119217615647,
"grad_norm": 1.9609375,
"learning_rate": 2.309405940594059e-06,
"loss": 1.628,
"step": 734
},
{
"epoch": 0.4409911801763965,
"grad_norm": 1.625,
"learning_rate": 2.3069306930693065e-06,
"loss": 1.6176,
"step": 735
},
{
"epoch": 0.44159116817663646,
"grad_norm": 1.6171875,
"learning_rate": 2.3044554455445543e-06,
"loss": 1.6363,
"step": 736
},
{
"epoch": 0.4421911561768765,
"grad_norm": 1.5546875,
"learning_rate": 2.3019801980198016e-06,
"loss": 1.6177,
"step": 737
},
{
"epoch": 0.44279114417711646,
"grad_norm": 1.5859375,
"learning_rate": 2.2995049504950494e-06,
"loss": 1.8267,
"step": 738
},
{
"epoch": 0.44339113217735643,
"grad_norm": 1.7109375,
"learning_rate": 2.2970297029702967e-06,
"loss": 1.7321,
"step": 739
},
{
"epoch": 0.44399112017759645,
"grad_norm": 1.671875,
"learning_rate": 2.2945544554455445e-06,
"loss": 1.6686,
"step": 740
},
{
"epoch": 0.4445911081778364,
"grad_norm": 1.828125,
"learning_rate": 2.292079207920792e-06,
"loss": 1.5937,
"step": 741
},
{
"epoch": 0.44519109617807645,
"grad_norm": 1.625,
"learning_rate": 2.2896039603960396e-06,
"loss": 1.6786, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.4457910841783164, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 2.287128712871287e-06, |
|
"loss": 1.6417, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 0.44639107217855645, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 2.2846534653465347e-06, |
|
"loss": 1.7238, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.4469910601787964, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 2.282178217821782e-06, |
|
"loss": 1.6931, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.44759104817903644, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 2.27970297029703e-06, |
|
"loss": 1.6201, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 0.4481910361792764, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 2.2772277227722767e-06, |
|
"loss": 1.6692, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 0.4487910241795164, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 2.2747524752475245e-06, |
|
"loss": 1.6056, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 0.4493910121797564, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 2.272277227722772e-06, |
|
"loss": 1.7107, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 0.4499910001799964, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 2.2698019801980196e-06, |
|
"loss": 1.6711, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.4499910001799964, |
|
"eval_loss": 1.77778959274292, |
|
"eval_model_preparation_time": 0.0037, |
|
"eval_runtime": 69.2035, |
|
"eval_samples_per_second": 144.501, |
|
"eval_steps_per_second": 24.088, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.4505909881802364, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 2.267326732673267e-06, |
|
"loss": 1.6199, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 0.4511909761804764, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 2.2648514851485147e-06, |
|
"loss": 1.6252, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.4517909641807164, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 2.262376237623762e-06, |
|
"loss": 1.6548, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 0.4523909521809564, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 2.25990099009901e-06, |
|
"loss": 1.6134, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 0.4529909401811964, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 2.257425742574257e-06, |
|
"loss": 1.6512, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.45359092818143637, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 2.254950495049505e-06, |
|
"loss": 1.717, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.45419091618167634, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 2.2524752475247523e-06, |
|
"loss": 1.667, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 0.45479090418191637, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 2.25e-06, |
|
"loss": 1.6836, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 0.45539089218215634, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 2.247524752475248e-06, |
|
"loss": 1.7515, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 0.45599088018239636, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 2.2450495049504947e-06, |
|
"loss": 1.7191, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.45659086818263633, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 2.2425742574257425e-06, |
|
"loss": 1.6758, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 0.45719085618287636, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 2.24009900990099e-06, |
|
"loss": 1.7625, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.45779084418311633, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 2.2376237623762376e-06, |
|
"loss": 1.6278, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 0.45839083218335636, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 2.235148514851485e-06, |
|
"loss": 1.6825, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.4589908201835963, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 2.2326732673267327e-06, |
|
"loss": 1.5404, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.4595908081838363, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 2.23019801980198e-06, |
|
"loss": 1.5464, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 0.4601907961840763, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 2.227722772277228e-06, |
|
"loss": 1.7487, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 0.4607907841843163, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 2.225247524752475e-06, |
|
"loss": 1.6372, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.4613907721845563, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 2.222772277227723e-06, |
|
"loss": 1.7136, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 0.4619907601847963, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 2.2202970297029703e-06, |
|
"loss": 1.6032, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.4625907481850363, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 2.217821782178218e-06, |
|
"loss": 1.6514, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 0.4631907361852763, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 2.2153465346534654e-06, |
|
"loss": 1.6665, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.4637907241855163, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 2.212871287128713e-06, |
|
"loss": 1.6432, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 0.4643907121857563, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 2.21039603960396e-06, |
|
"loss": 1.5685, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 0.46499070018599625, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 2.207920792079208e-06, |
|
"loss": 1.7123, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.4655906881862363, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 2.205445544554455e-06, |
|
"loss": 1.6538, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.46619067618647625, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 2.202970297029703e-06, |
|
"loss": 1.6715, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 0.4667906641867163, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 2.2004950495049503e-06, |
|
"loss": 1.6058, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 0.46739065218695625, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 2.198019801980198e-06, |
|
"loss": 1.766, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 0.46799064018719627, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 2.1955445544554454e-06, |
|
"loss": 1.7647, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.46859062818743624, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 2.193069306930693e-06, |
|
"loss": 1.6069, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 0.46919061618767627, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 2.1905940594059405e-06, |
|
"loss": 1.6362, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 0.46979060418791624, |
|
"grad_norm": 1.75, |
|
"learning_rate": 2.1881188118811883e-06, |
|
"loss": 1.6003, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 0.47039059218815626, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 2.1856435643564356e-06, |
|
"loss": 1.676, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.47099058018839624, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 2.1831683168316834e-06, |
|
"loss": 1.6357, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.4715905681886362, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 2.1806930693069307e-06, |
|
"loss": 1.7, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 0.47219055618887623, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 2.178217821782178e-06, |
|
"loss": 1.5975, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 0.4727905441891162, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 2.1757425742574254e-06, |
|
"loss": 1.7075, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 0.47339053218935623, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 2.173267326732673e-06, |
|
"loss": 1.6066, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 0.4739905201895962, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 2.1707920792079205e-06, |
|
"loss": 1.6408, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.4745905081898362, |
|
"grad_norm": 1.5, |
|
"learning_rate": 2.1683168316831683e-06, |
|
"loss": 1.6861, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 0.4751904961900762, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 2.1658415841584156e-06, |
|
"loss": 1.6754, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.4757904841903162, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 2.1633663366336634e-06, |
|
"loss": 1.5473, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 0.4763904721905562, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 2.1608910891089108e-06, |
|
"loss": 1.5907, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 0.47699046019079616, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 2.1584158415841585e-06, |
|
"loss": 1.5811, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.4775904481910362, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 2.155940594059406e-06, |
|
"loss": 1.7476, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 0.47819043619127616, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 2.1534653465346536e-06, |
|
"loss": 1.6973, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 0.4787904241915162, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 2.150990099009901e-06, |
|
"loss": 1.6153, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.47939041219175615, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 2.1485148514851487e-06, |
|
"loss": 1.5589, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 0.4799904001919962, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 2.1460396039603957e-06, |
|
"loss": 1.6918, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.4799904001919962, |
|
"eval_loss": 1.7776175737380981, |
|
"eval_model_preparation_time": 0.0037, |
|
"eval_runtime": 66.0238, |
|
"eval_samples_per_second": 151.461, |
|
"eval_steps_per_second": 25.248, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.48059038819223615, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 2.1435643564356434e-06, |
|
"loss": 1.6829, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 0.4811903761924762, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 2.1410891089108908e-06, |
|
"loss": 1.6587, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 0.48179036419271615, |
|
"grad_norm": 1.75, |
|
"learning_rate": 2.1386138613861385e-06, |
|
"loss": 1.6863, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 0.4823903521929561, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 2.136138613861386e-06, |
|
"loss": 1.7411, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 0.48299034019319614, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 2.1336633663366336e-06, |
|
"loss": 1.6127, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.4835903281934361, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 2.131188118811881e-06, |
|
"loss": 1.6713, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 0.48419031619367614, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 2.1287128712871288e-06, |
|
"loss": 1.6152, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 0.4847903041939161, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 2.126237623762376e-06, |
|
"loss": 1.6861, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 0.48539029219415614, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 2.123762376237624e-06, |
|
"loss": 1.6041, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 0.4859902801943961, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 2.121287128712871e-06, |
|
"loss": 1.6616, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.48659026819463613, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 2.118811881188119e-06, |
|
"loss": 1.6977, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 0.4871902561948761, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 2.1163366336633663e-06, |
|
"loss": 1.7304, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 0.4877902441951161, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 2.1138613861386137e-06, |
|
"loss": 1.6383, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 0.4883902321953561, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 2.111386138613861e-06, |
|
"loss": 1.646, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 0.48899022019559607, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 2.1089108910891088e-06, |
|
"loss": 1.6341, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.4895902081958361, |
|
"grad_norm": 1.625, |
|
"learning_rate": 2.106435643564356e-06, |
|
"loss": 1.625, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 0.49019019619607607, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 2.103960396039604e-06, |
|
"loss": 1.6915, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 0.4907901841963161, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 2.1014851485148512e-06, |
|
"loss": 1.5856, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 0.49139017219655606, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 2.099009900990099e-06, |
|
"loss": 1.6453, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 0.4919901601967961, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 2.0965346534653463e-06, |
|
"loss": 1.6145, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.49259014819703606, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 2.094059405940594e-06, |
|
"loss": 1.6025, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 0.49319013619727603, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 2.0915841584158414e-06, |
|
"loss": 1.753, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 0.49379012419751606, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 2.089108910891089e-06, |
|
"loss": 1.6231, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 0.494390112197756, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 2.0866336633663366e-06, |
|
"loss": 1.6406, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 0.49499010019799605, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 2.0841584158415843e-06, |
|
"loss": 1.6947, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.495590088198236, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 2.0816831683168312e-06, |
|
"loss": 1.6547, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 0.49619007619847605, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 2.079207920792079e-06, |
|
"loss": 1.6789, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 0.496790064198716, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 2.0767326732673263e-06, |
|
"loss": 1.7236, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 0.49739005219895605, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 2.074257425742574e-06, |
|
"loss": 1.6554, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 0.497990040199196, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 2.0717821782178215e-06, |
|
"loss": 1.651, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.498590028199436, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 2.0693069306930692e-06, |
|
"loss": 1.618, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 0.499190016199676, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 2.0668316831683166e-06, |
|
"loss": 1.5994, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 0.499790004199916, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 2.0643564356435643e-06, |
|
"loss": 1.6897, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 0.500389992200156, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 2.0618811881188117e-06, |
|
"loss": 1.5805, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 0.500989980200396, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 2.0594059405940594e-06, |
|
"loss": 1.7514, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.501589968200636, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 2.0569306930693068e-06, |
|
"loss": 1.7048, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 0.502189956200876, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 2.0544554455445546e-06, |
|
"loss": 1.6226, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 0.502789944201116, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 2.051980198019802e-06, |
|
"loss": 1.7454, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 0.503389932201356, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 2.0495049504950497e-06, |
|
"loss": 1.6367, |
|
"step": 839 |
|
}, |
|
{ |
|
"epoch": 0.5039899202015959, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 2.0470297029702966e-06, |
|
"loss": 1.6435, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.5045899082018359, |
|
"grad_norm": 2.0, |
|
"learning_rate": 2.0445544554455443e-06, |
|
"loss": 1.5614, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 0.505189896202076, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 2.0420792079207917e-06, |
|
"loss": 1.7469, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 0.505789884202316, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 2.0396039603960395e-06, |
|
"loss": 1.6062, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 0.5063898722025559, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 2.037128712871287e-06, |
|
"loss": 1.5944, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 0.5069898602027959, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 2.0346534653465346e-06, |
|
"loss": 1.6536, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.507589848203036, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 2.032178217821782e-06, |
|
"loss": 1.6721, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 0.508189836203276, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 2.0297029702970297e-06, |
|
"loss": 1.6654, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 0.5087898242035159, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 2.027227722772277e-06, |
|
"loss": 1.7078, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 0.5093898122037559, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 2.0247524752475248e-06, |
|
"loss": 1.7034, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 0.5099898002039959, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 2.022277227722772e-06, |
|
"loss": 1.6864, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.5099898002039959, |
|
"eval_loss": 1.7774574756622314, |
|
"eval_model_preparation_time": 0.0037, |
|
"eval_runtime": 66.0293, |
|
"eval_samples_per_second": 151.448, |
|
"eval_steps_per_second": 25.246, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.510589788204236, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 2.01980198019802e-06, |
|
"loss": 1.7012, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 0.5111897762044759, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 2.0173267326732672e-06, |
|
"loss": 1.7185, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 0.5117897642047159, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 2.0148514851485146e-06, |
|
"loss": 1.5713, |
|
"step": 853 |
|
}, |
|
{ |
|
"epoch": 0.5123897522049559, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 2.012376237623762e-06, |
|
"loss": 1.6537, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 0.5129897402051959, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 2.0099009900990097e-06, |
|
"loss": 1.6145, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.5135897282054359, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 2.007425742574257e-06, |
|
"loss": 1.6468, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 0.5141897162056759, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 2.004950495049505e-06, |
|
"loss": 1.6845, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 0.5147897042059159, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 2.002475247524752e-06, |
|
"loss": 1.7128, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 0.5153896922061558, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 2e-06, |
|
"loss": 1.58, |
|
"step": 859 |
|
}, |
|
{ |
|
"epoch": 0.5159896802063959, |
|
"grad_norm": 1.625, |
|
"learning_rate": 1.9975247524752473e-06, |
|
"loss": 1.5932, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.5165896682066359, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.995049504950495e-06, |
|
"loss": 1.6651, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 0.5171896562068758, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.9925742574257424e-06, |
|
"loss": 1.6753, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 0.5177896442071158, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.99009900990099e-06, |
|
"loss": 1.7112, |
|
"step": 863 |
|
}, |
|
{ |
|
"epoch": 0.5183896322073559, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.9876237623762375e-06, |
|
"loss": 1.6007, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 0.5189896202075959, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.9851485148514852e-06, |
|
"loss": 1.763, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.5195896082078358, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.9826732673267326e-06, |
|
"loss": 1.6699, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 0.5201895962080758, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.98019801980198e-06, |
|
"loss": 1.6485, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 0.5207895842083158, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.9777227722772277e-06, |
|
"loss": 1.7541, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 0.5213895722085559, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 1.975247524752475e-06, |
|
"loss": 1.7011, |
|
"step": 869 |
|
}, |
|
{ |
|
"epoch": 0.5219895602087958, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.972772277227723e-06, |
|
"loss": 1.5922, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.5225895482090358, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.97029702970297e-06, |
|
"loss": 1.6786, |
|
"step": 871 |
|
}, |
|
{ |
|
"epoch": 0.5231895362092758, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 1.967821782178218e-06, |
|
"loss": 1.6818, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 0.5237895242095159, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.9653465346534653e-06, |
|
"loss": 1.6646, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 0.5243895122097558, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 1.9628712871287126e-06, |
|
"loss": 1.7467, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 0.5249895002099958, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 1.9603960396039604e-06, |
|
"loss": 1.6728, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.5255894882102358, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 1.9579207920792077e-06, |
|
"loss": 1.654, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 0.5261894762104757, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 1.9554455445544555e-06, |
|
"loss": 1.6831, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 0.5267894642107158, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 1.952970297029703e-06, |
|
"loss": 1.5535, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 0.5273894522109558, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.95049504950495e-06, |
|
"loss": 1.6483, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 0.5279894402111958, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 1.948019801980198e-06, |
|
"loss": 1.7029, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.5285894282114357, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.9455445544554453e-06, |
|
"loss": 1.6396, |
|
"step": 881 |
|
}, |
|
{ |
|
"epoch": 0.5291894162116758, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.943069306930693e-06, |
|
"loss": 1.6633, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 0.5297894042119158, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 1.9405940594059404e-06, |
|
"loss": 1.6083, |
|
"step": 883 |
|
}, |
|
{ |
|
"epoch": 0.5303893922121558, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.938118811881188e-06, |
|
"loss": 1.684, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 0.5309893802123957, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 1.9356435643564355e-06, |
|
"loss": 1.6599, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.5315893682126357, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.933168316831683e-06, |
|
"loss": 1.6902, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 0.5321893562128758, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 1.9306930693069306e-06, |
|
"loss": 1.6117, |
|
"step": 887 |
|
}, |
|
{ |
|
"epoch": 0.5327893442131157, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.928217821782178e-06, |
|
"loss": 1.6018, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 0.5333893322133557, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.9257425742574257e-06, |
|
"loss": 1.7181, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 0.5339893202135957, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.923267326732673e-06, |
|
"loss": 1.6801, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.5345893082138358, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 1.920792079207921e-06, |
|
"loss": 1.6134, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 0.5351892962140757, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.918316831683168e-06, |
|
"loss": 1.7768, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 0.5357892842143157, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.9158415841584155e-06, |
|
"loss": 1.6964, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 0.5363892722145557, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.9133663366336633e-06, |
|
"loss": 1.6572, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 0.5369892602147956, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.9108910891089106e-06, |
|
"loss": 1.6688, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.5375892482150357, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 1.9084158415841584e-06, |
|
"loss": 1.5866, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 0.5381892362152757, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.9059405940594057e-06, |
|
"loss": 1.6582, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 0.5387892242155157, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.9034653465346533e-06, |
|
"loss": 1.7269, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 0.5393892122157556, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.9009900990099008e-06, |
|
"loss": 1.5366, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 0.5399892002159957, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.8985148514851484e-06, |
|
"loss": 1.6964, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.5399892002159957, |
|
"eval_loss": 1.7774394750595093, |
|
"eval_model_preparation_time": 0.0037, |
|
"eval_runtime": 68.6659, |
|
"eval_samples_per_second": 145.633, |
|
"eval_steps_per_second": 24.277, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.5405891882162357, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.896039603960396e-06, |
|
"loss": 1.7363, |
|
"step": 901 |
|
}, |
|
{ |
|
"epoch": 0.5411891762164757, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.8935643564356433e-06, |
|
"loss": 1.6164, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 0.5417891642167156, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.8910891089108908e-06, |
|
"loss": 1.6276, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 0.5423891522169556, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.8886138613861384e-06, |
|
"loss": 1.5128, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 0.5429891402171957, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.886138613861386e-06, |
|
"loss": 1.6413, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.5435891282174357, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.8836633663366335e-06, |
|
"loss": 1.6669, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 0.5441891162176756, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 1.881188118811881e-06, |
|
"loss": 1.6869, |
|
"step": 907 |
|
}, |
|
{ |
|
"epoch": 0.5447891042179156, |
|
"grad_norm": 1.625, |
|
"learning_rate": 1.8787128712871286e-06, |
|
"loss": 1.6706, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 0.5453890922181557, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.876237623762376e-06, |
|
"loss": 1.6194, |
|
"step": 909 |
|
}, |
|
{ |
|
"epoch": 0.5459890802183957, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 1.8737623762376237e-06, |
|
"loss": 1.7302, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.5465890682186356, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.8712871287128713e-06, |
|
"loss": 1.698, |
|
"step": 911 |
|
}, |
|
{ |
|
"epoch": 0.5471890562188756, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 1.8688118811881188e-06, |
|
"loss": 1.6802, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 0.5477890442191157, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 1.8663366336633664e-06, |
|
"loss": 1.7833, |
|
"step": 913 |
|
}, |
|
{ |
|
"epoch": 0.5483890322193556, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.863861386138614e-06, |
|
"loss": 1.7153, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 0.5489890202195956, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 1.8613861386138615e-06, |
|
"loss": 1.6458, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.5495890082198356, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.8589108910891088e-06, |
|
"loss": 1.5899, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 0.5501889962200756, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.8564356435643564e-06, |
|
"loss": 1.618, |
|
"step": 917 |
|
}, |
|
{ |
|
"epoch": 0.5507889842203156, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.853960396039604e-06, |
|
"loss": 1.6677, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 0.5513889722205556, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 1.8514851485148515e-06, |
|
"loss": 1.6409, |
|
"step": 919 |
|
}, |
|
{ |
|
"epoch": 0.5519889602207956, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.849009900990099e-06, |
|
"loss": 1.6201, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.5525889482210355, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.8465346534653466e-06, |
|
"loss": 1.6385, |
|
"step": 921 |
|
}, |
|
{ |
|
"epoch": 0.5531889362212756, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.8440594059405942e-06, |
|
"loss": 1.6725, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 0.5537889242215156, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.8415841584158415e-06, |
|
"loss": 1.6324, |
|
"step": 923 |
|
}, |
|
{ |
|
"epoch": 0.5543889122217556, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 1.839108910891089e-06, |
|
"loss": 1.6393, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 0.5549889002219955, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 1.8366336633663366e-06, |
|
"loss": 1.727, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.5555888882222355, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.8341584158415842e-06, |
|
"loss": 1.7974, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 0.5561888762224756, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 1.8316831683168317e-06, |
|
"loss": 1.7538, |
|
"step": 927 |
|
}, |
|
{ |
|
"epoch": 0.5567888642227156, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.8292079207920793e-06, |
|
"loss": 1.6599, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 0.5573888522229555, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 1.8267326732673266e-06, |
|
"loss": 1.6722, |
|
"step": 929 |
|
}, |
|
{ |
|
"epoch": 0.5579888402231955, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.8242574257425742e-06, |
|
"loss": 1.6885, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.5585888282234356, |
|
"grad_norm": 1.625, |
|
"learning_rate": 1.8217821782178217e-06, |
|
"loss": 1.6151, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 0.5591888162236756, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 1.8193069306930693e-06, |
|
"loss": 1.621, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 0.5597888042239155, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.8168316831683168e-06, |
|
"loss": 1.6494, |
|
"step": 933 |
|
}, |
|
{ |
|
"epoch": 0.5603887922241555, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 1.8143564356435644e-06, |
|
"loss": 1.6403, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 0.5609887802243955, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.811881188118812e-06, |
|
"loss": 1.6198, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.5615887682246355, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.8094059405940593e-06, |
|
"loss": 1.6158, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 0.5621887562248755, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 1.8069306930693068e-06, |
|
"loss": 1.7377, |
|
"step": 937 |
|
}, |
|
{ |
|
"epoch": 0.5627887442251155, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.8044554455445544e-06, |
|
"loss": 1.6515, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 0.5633887322253555, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.801980198019802e-06, |
|
"loss": 1.6708, |
|
"step": 939 |
|
}, |
|
{ |
|
"epoch": 0.5639887202255955, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.7995049504950495e-06, |
|
"loss": 1.6375, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.5645887082258355, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.797029702970297e-06, |
|
"loss": 1.7352, |
|
"step": 941 |
|
}, |
|
{ |
|
"epoch": 0.5651886962260755, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 1.7945544554455444e-06, |
|
"loss": 1.5845, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 0.5657886842263155, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.792079207920792e-06, |
|
"loss": 1.6404, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 0.5663886722265554, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 1.7896039603960395e-06, |
|
"loss": 1.693, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 0.5669886602267955, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.787128712871287e-06, |
|
"loss": 1.6918, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.5675886482270355, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.7846534653465346e-06, |
|
"loss": 1.6853, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 0.5681886362272754, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 1.7821782178217822e-06, |
|
"loss": 1.7385, |
|
"step": 947 |
|
}, |
|
{ |
|
"epoch": 0.5687886242275154, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.7797029702970297e-06, |
|
"loss": 1.6999, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 0.5693886122277555, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.777227722772277e-06, |
|
"loss": 1.5892, |
|
"step": 949 |
|
}, |
|
{ |
|
"epoch": 0.5699886002279955, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.7747524752475246e-06, |
|
"loss": 1.7026, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.5699886002279955, |
|
"eval_loss": 1.777536153793335, |
|
"eval_model_preparation_time": 0.0037, |
|
"eval_runtime": 65.9833, |
|
"eval_samples_per_second": 151.554, |
|
"eval_steps_per_second": 25.264, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.5705885882282354, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.7722772277227722e-06, |
|
"loss": 1.6403, |
|
"step": 951 |
|
}, |
|
{ |
|
"epoch": 0.5711885762284754, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 1.7698019801980197e-06, |
|
"loss": 1.5106, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 0.5717885642287154, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.7673267326732673e-06, |
|
"loss": 1.6019, |
|
"step": 953 |
|
}, |
|
{ |
|
"epoch": 0.5723885522289555, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 1.7648514851485149e-06, |
|
"loss": 1.5817, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 0.5729885402291954, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 1.7623762376237624e-06, |
|
"loss": 1.7078, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.5735885282294354, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.7599009900990098e-06, |
|
"loss": 1.6346, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 0.5741885162296754, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 1.7574257425742573e-06, |
|
"loss": 1.6595, |
|
"step": 957 |
|
}, |
|
{ |
|
"epoch": 0.5747885042299155, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 1.7549504950495049e-06, |
|
"loss": 1.6867, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 0.5753884922301554, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 1.7524752475247524e-06, |
|
"loss": 1.6465, |
|
"step": 959 |
|
}, |
|
{ |
|
"epoch": 0.5759884802303954, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.75e-06, |
|
"loss": 1.7808, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.5765884682306354, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 1.7475247524752475e-06, |
|
"loss": 1.648, |
|
"step": 961 |
|
}, |
|
{ |
|
"epoch": 0.5771884562308753, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.7450495049504949e-06, |
|
"loss": 1.6428, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 0.5777884442311154, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.7425742574257424e-06, |
|
"loss": 1.7126, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 0.5783884322313554, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 1.74009900990099e-06, |
|
"loss": 1.6308, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 0.5789884202315954, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.7376237623762375e-06, |
|
"loss": 1.7662, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.5795884082318353, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.735148514851485e-06, |
|
"loss": 1.6381, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 0.5801883962320754, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.7326732673267326e-06, |
|
"loss": 1.7187, |
|
"step": 967 |
|
}, |
|
{ |
|
"epoch": 0.5807883842323154, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 1.7301980198019802e-06, |
|
"loss": 1.6956, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 0.5813883722325554, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.7277227722772275e-06, |
|
"loss": 1.6442, |
|
"step": 969 |
|
}, |
|
{ |
|
"epoch": 0.5819883602327953, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 1.725247524752475e-06, |
|
"loss": 1.6717, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.5825883482330353, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.7227722772277227e-06, |
|
"loss": 1.704, |
|
"step": 971 |
|
}, |
|
{ |
|
"epoch": 0.5831883362332754, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.7202970297029702e-06, |
|
"loss": 1.6427, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 0.5837883242335153, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.7178217821782178e-06, |
|
"loss": 1.7412, |
|
"step": 973 |
|
}, |
|
{ |
|
"epoch": 0.5843883122337553, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 1.7153465346534653e-06, |
|
"loss": 1.6321, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 0.5849883002339953, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.7128712871287127e-06, |
|
"loss": 1.6029, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.5855882882342354, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 1.7103960396039602e-06, |
|
"loss": 1.6052, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 0.5861882762344753, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 1.7079207920792078e-06, |
|
"loss": 1.638, |
|
"step": 977 |
|
}, |
|
{ |
|
"epoch": 0.5867882642347153, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.7054455445544553e-06, |
|
"loss": 1.8175, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 0.5873882522349553, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.7029702970297029e-06, |
|
"loss": 1.7441, |
|
"step": 979 |
|
}, |
|
{ |
|
"epoch": 0.5879882402351952, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 1.7004950495049504e-06, |
|
"loss": 1.5586, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.5885882282354353, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.698019801980198e-06, |
|
"loss": 1.8408, |
|
"step": 981 |
|
}, |
|
{ |
|
"epoch": 0.5891882162356753, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.6955445544554453e-06, |
|
"loss": 1.7207, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 0.5897882042359153, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 1.6930693069306929e-06, |
|
"loss": 1.654, |
|
"step": 983 |
|
}, |
|
{ |
|
"epoch": 0.5903881922361552, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.6905940594059404e-06, |
|
"loss": 1.645, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 0.5909881802363953, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.688118811881188e-06, |
|
"loss": 1.7415, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.5915881682366353, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 1.6856435643564355e-06, |
|
"loss": 1.7001, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 0.5921881562368753, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.683168316831683e-06, |
|
"loss": 1.6615, |
|
"step": 987 |
|
}, |
|
{ |
|
"epoch": 0.5927881442371152, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.6806930693069307e-06, |
|
"loss": 1.7398, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 0.5933881322373552, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 1.678217821782178e-06, |
|
"loss": 1.6766, |
|
"step": 989 |
|
}, |
|
{ |
|
"epoch": 0.5939881202375953, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.6757425742574256e-06, |
|
"loss": 1.6571, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.5945881082378353, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.6732673267326731e-06, |
|
"loss": 1.622, |
|
"step": 991 |
|
}, |
|
{ |
|
"epoch": 0.5951880962380752, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.6707920792079207e-06, |
|
"loss": 1.6102, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 0.5957880842383152, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.6683168316831682e-06, |
|
"loss": 1.7403, |
|
"step": 993 |
|
}, |
|
{ |
|
"epoch": 0.5963880722385553, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.6658415841584158e-06, |
|
"loss": 1.7306, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 0.5969880602387952, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.6633663366336631e-06, |
|
"loss": 1.6436, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.5975880482390352, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.6608910891089107e-06, |
|
"loss": 1.6222, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 0.5981880362392752, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.6584158415841582e-06, |
|
"loss": 1.6213, |
|
"step": 997 |
|
}, |
|
{ |
|
"epoch": 0.5987880242395152, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 1.6559405940594058e-06, |
|
"loss": 1.6387, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 0.5993880122397552, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.6534653465346533e-06, |
|
"loss": 1.6206, |
|
"step": 999 |
|
}, |
|
{ |
|
"epoch": 0.5999880002399952, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 1.6509900990099009e-06, |
|
"loss": 1.633, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.5999880002399952, |
|
"eval_loss": 1.7774205207824707, |
|
"eval_model_preparation_time": 0.0037, |
|
"eval_runtime": 66.1167, |
|
"eval_samples_per_second": 151.248, |
|
"eval_steps_per_second": 25.213, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.6005879882402352, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.6485148514851484e-06, |
|
"loss": 1.7197, |
|
"step": 1001 |
|
}, |
|
{ |
|
"epoch": 0.6011879762404752, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.6460396039603958e-06, |
|
"loss": 1.6299, |
|
"step": 1002 |
|
}, |
|
{ |
|
"epoch": 0.6017879642407152, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.6435643564356433e-06, |
|
"loss": 1.6693, |
|
"step": 1003 |
|
}, |
|
{ |
|
"epoch": 0.6023879522409552, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 1.641089108910891e-06, |
|
"loss": 1.6508, |
|
"step": 1004 |
|
}, |
|
{ |
|
"epoch": 0.6029879402411952, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.6386138613861385e-06, |
|
"loss": 1.6538, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.6035879282414351, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.636138613861386e-06, |
|
"loss": 1.624, |
|
"step": 1006 |
|
}, |
|
{ |
|
"epoch": 0.6041879162416751, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.6336633663366336e-06, |
|
"loss": 1.7978, |
|
"step": 1007 |
|
}, |
|
{ |
|
"epoch": 0.6047879042419152, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.631188118811881e-06, |
|
"loss": 1.5949, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 0.6053878922421552, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.6287128712871285e-06, |
|
"loss": 1.71, |
|
"step": 1009 |
|
}, |
|
{ |
|
"epoch": 0.6059878802423951, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 1.626237623762376e-06, |
|
"loss": 1.7189, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.6065878682426351, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 1.6237623762376238e-06, |
|
"loss": 1.6493, |
|
"step": 1011 |
|
}, |
|
{ |
|
"epoch": 0.6071878562428752, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 1.6212871287128713e-06, |
|
"loss": 1.6566, |
|
"step": 1012 |
|
}, |
|
{ |
|
"epoch": 0.6077878442431152, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 1.6188118811881189e-06, |
|
"loss": 1.6018, |
|
"step": 1013 |
|
}, |
|
{ |
|
"epoch": 0.6083878322433551, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.6163366336633664e-06, |
|
"loss": 1.6503, |
|
"step": 1014 |
|
}, |
|
{ |
|
"epoch": 0.6089878202435951, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.6138613861386138e-06, |
|
"loss": 1.6243, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.6095878082438351, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.6113861386138613e-06, |
|
"loss": 1.6594, |
|
"step": 1016 |
|
}, |
|
{ |
|
"epoch": 0.6101877962440752, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 1.608910891089109e-06, |
|
"loss": 1.6897, |
|
"step": 1017 |
|
}, |
|
{ |
|
"epoch": 0.6107877842443151, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 1.6064356435643565e-06, |
|
"loss": 1.692, |
|
"step": 1018 |
|
}, |
|
{ |
|
"epoch": 0.6113877722445551, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 1.603960396039604e-06, |
|
"loss": 1.6694, |
|
"step": 1019 |
|
}, |
|
{ |
|
"epoch": 0.6119877602447951, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.6014851485148516e-06, |
|
"loss": 1.7478, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.6125877482450351, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 1.5990099009900991e-06, |
|
"loss": 1.5819, |
|
"step": 1021 |
|
}, |
|
{ |
|
"epoch": 0.6131877362452751, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.5965346534653465e-06, |
|
"loss": 1.6587, |
|
"step": 1022 |
|
}, |
|
{ |
|
"epoch": 0.6137877242455151, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 1.594059405940594e-06, |
|
"loss": 1.6205, |
|
"step": 1023 |
|
}, |
|
{ |
|
"epoch": 0.6143877122457551, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.5915841584158416e-06, |
|
"loss": 1.6359, |
|
"step": 1024 |
|
}, |
|
{ |
|
"epoch": 0.614987700245995, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.5891089108910891e-06, |
|
"loss": 1.6837, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.6155876882462351, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 1.5866336633663367e-06, |
|
"loss": 1.6778, |
|
"step": 1026 |
|
}, |
|
{ |
|
"epoch": 0.6161876762464751, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 1.5841584158415842e-06, |
|
"loss": 1.6798, |
|
"step": 1027 |
|
}, |
|
{ |
|
"epoch": 0.616787664246715, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 1.5816831683168316e-06, |
|
"loss": 1.5861, |
|
"step": 1028 |
|
}, |
|
{ |
|
"epoch": 0.617387652246955, |
|
"grad_norm": 1.625, |
|
"learning_rate": 1.5792079207920791e-06, |
|
"loss": 1.6128, |
|
"step": 1029 |
|
}, |
|
{ |
|
"epoch": 0.6179876402471951, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.5767326732673267e-06, |
|
"loss": 1.6962, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.6185876282474351, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.5742574257425742e-06, |
|
"loss": 1.5868, |
|
"step": 1031 |
|
}, |
|
{ |
|
"epoch": 0.619187616247675, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.5717821782178218e-06, |
|
"loss": 1.627, |
|
"step": 1032 |
|
}, |
|
{ |
|
"epoch": 0.619787604247915, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.5693069306930694e-06, |
|
"loss": 1.6367, |
|
"step": 1033 |
|
}, |
|
{ |
|
"epoch": 0.620387592248155, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.566831683168317e-06, |
|
"loss": 1.6377, |
|
"step": 1034 |
|
}, |
|
{ |
|
"epoch": 0.6209875802483951, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.5643564356435643e-06, |
|
"loss": 1.6437, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.621587568248635, |
|
"grad_norm": 1.625, |
|
"learning_rate": 1.5618811881188118e-06, |
|
"loss": 1.6799, |
|
"step": 1036 |
|
}, |
|
{ |
|
"epoch": 0.622187556248875, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.5594059405940594e-06, |
|
"loss": 1.6935, |
|
"step": 1037 |
|
}, |
|
{ |
|
"epoch": 0.622787544249115, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.556930693069307e-06, |
|
"loss": 1.782, |
|
"step": 1038 |
|
}, |
|
{ |
|
"epoch": 0.6233875322493551, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.5544554455445545e-06, |
|
"loss": 1.7106, |
|
"step": 1039 |
|
}, |
|
{ |
|
"epoch": 0.623987520249595, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.551980198019802e-06, |
|
"loss": 1.6429, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.624587508249835, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.5495049504950496e-06, |
|
"loss": 1.6784, |
|
"step": 1041 |
|
}, |
|
{ |
|
"epoch": 0.625187496250075, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.547029702970297e-06, |
|
"loss": 1.6739, |
|
"step": 1042 |
|
}, |
|
{ |
|
"epoch": 0.6257874842503149, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.5445544554455445e-06, |
|
"loss": 1.6582, |
|
"step": 1043 |
|
}, |
|
{ |
|
"epoch": 0.626387472250555, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.542079207920792e-06, |
|
"loss": 1.6651, |
|
"step": 1044 |
|
}, |
|
{ |
|
"epoch": 0.626987460250795, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.5396039603960396e-06, |
|
"loss": 1.6378, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.627587448251035, |
|
"grad_norm": 1.625, |
|
"learning_rate": 1.5371287128712871e-06, |
|
"loss": 1.7835, |
|
"step": 1046 |
|
}, |
|
{ |
|
"epoch": 0.6281874362512749, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.5346534653465347e-06, |
|
"loss": 1.7662, |
|
"step": 1047 |
|
}, |
|
{ |
|
"epoch": 0.628787424251515, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 1.532178217821782e-06, |
|
"loss": 1.6731, |
|
"step": 1048 |
|
}, |
|
{ |
|
"epoch": 0.629387412251755, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 1.5297029702970296e-06, |
|
"loss": 1.671, |
|
"step": 1049 |
|
}, |
|
{ |
|
"epoch": 0.629987400251995, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 1.5272277227722771e-06, |
|
"loss": 1.7948, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.629987400251995, |
|
"eval_loss": 1.777391791343689, |
|
"eval_model_preparation_time": 0.0037, |
|
"eval_runtime": 69.2838, |
|
"eval_samples_per_second": 144.334, |
|
"eval_steps_per_second": 24.06, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.6305873882522349, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.5247524752475247e-06, |
|
"loss": 1.6642, |
|
"step": 1051 |
|
}, |
|
{ |
|
"epoch": 0.6311873762524749, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.5222772277227723e-06, |
|
"loss": 1.6911, |
|
"step": 1052 |
|
}, |
|
{ |
|
"epoch": 0.631787364252715, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.5198019801980198e-06, |
|
"loss": 1.5992, |
|
"step": 1053 |
|
}, |
|
{ |
|
"epoch": 0.632387352252955, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 1.5173267326732674e-06, |
|
"loss": 1.6522, |
|
"step": 1054 |
|
}, |
|
{ |
|
"epoch": 0.6329873402531949, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.5148514851485147e-06, |
|
"loss": 1.6688, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.6335873282534349, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.5123762376237623e-06, |
|
"loss": 1.6327, |
|
"step": 1056 |
|
}, |
|
{ |
|
"epoch": 0.634187316253675, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 1.5099009900990098e-06, |
|
"loss": 1.6012, |
|
"step": 1057 |
|
}, |
|
{ |
|
"epoch": 0.6347873042539149, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 1.5074257425742574e-06, |
|
"loss": 1.7478, |
|
"step": 1058 |
|
}, |
|
{ |
|
"epoch": 0.6353872922541549, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.504950495049505e-06, |
|
"loss": 1.6642, |
|
"step": 1059 |
|
}, |
|
{ |
|
"epoch": 0.6359872802543949, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.5024752475247525e-06, |
|
"loss": 1.6575, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.6365872682546349, |
|
"grad_norm": 1.625, |
|
"learning_rate": 1.5e-06, |
|
"loss": 1.682, |
|
"step": 1061 |
|
}, |
|
{ |
|
"epoch": 0.6371872562548749, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.4975247524752474e-06, |
|
"loss": 1.6323, |
|
"step": 1062 |
|
}, |
|
{ |
|
"epoch": 0.6377872442551149, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.495049504950495e-06, |
|
"loss": 1.6041, |
|
"step": 1063 |
|
}, |
|
{ |
|
"epoch": 0.6383872322553549, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.4925742574257425e-06, |
|
"loss": 1.6278, |
|
"step": 1064 |
|
}, |
|
{ |
|
"epoch": 0.6389872202555948, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.49009900990099e-06, |
|
"loss": 1.6691, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.6395872082558349, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 1.4876237623762376e-06, |
|
"loss": 1.6173, |
|
"step": 1066 |
|
}, |
|
{ |
|
"epoch": 0.6401871962560749, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.4851485148514852e-06, |
|
"loss": 1.6719, |
|
"step": 1067 |
|
}, |
|
{ |
|
"epoch": 0.6407871842563149, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 1.4826732673267325e-06, |
|
"loss": 1.6717, |
|
"step": 1068 |
|
}, |
|
{ |
|
"epoch": 0.6413871722565548, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 1.48019801980198e-06, |
|
"loss": 1.6107, |
|
"step": 1069 |
|
}, |
|
{ |
|
"epoch": 0.6419871602567948, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.4777227722772276e-06, |
|
"loss": 1.6747, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.6425871482570349, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 1.4752475247524752e-06, |
|
"loss": 1.5889, |
|
"step": 1071 |
|
}, |
|
{ |
|
"epoch": 0.6431871362572749, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 1.4727722772277227e-06, |
|
"loss": 1.6483, |
|
"step": 1072 |
|
}, |
|
{ |
|
"epoch": 0.6437871242575148, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.4702970297029703e-06, |
|
"loss": 1.713, |
|
"step": 1073 |
|
}, |
|
{ |
|
"epoch": 0.6443871122577548, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 1.4678217821782178e-06, |
|
"loss": 1.6684, |
|
"step": 1074 |
|
}, |
|
{ |
|
"epoch": 0.6449871002579949, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.4653465346534652e-06, |
|
"loss": 1.566, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.6455870882582349, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 1.4628712871287127e-06, |
|
"loss": 1.6242, |
|
"step": 1076 |
|
}, |
|
{ |
|
"epoch": 0.6461870762584748, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.4603960396039603e-06, |
|
"loss": 1.613, |
|
"step": 1077 |
|
}, |
|
{ |
|
"epoch": 0.6467870642587148, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.4579207920792078e-06, |
|
"loss": 1.595, |
|
"step": 1078 |
|
}, |
|
{ |
|
"epoch": 0.6473870522589549, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.4554455445544554e-06, |
|
"loss": 1.6096, |
|
"step": 1079 |
|
}, |
|
{ |
|
"epoch": 0.6479870402591948, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 1.452970297029703e-06, |
|
"loss": 1.6353, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.6485870282594348, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.4504950495049503e-06, |
|
"loss": 1.6858, |
|
"step": 1081 |
|
}, |
|
{ |
|
"epoch": 0.6491870162596748, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 1.4480198019801978e-06, |
|
"loss": 1.6339, |
|
"step": 1082 |
|
}, |
|
{ |
|
"epoch": 0.6497870042599148, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.4455445544554454e-06, |
|
"loss": 1.6156, |
|
"step": 1083 |
|
}, |
|
{ |
|
"epoch": 0.6503869922601548, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 1.443069306930693e-06, |
|
"loss": 1.6629, |
|
"step": 1084 |
|
}, |
|
{ |
|
"epoch": 0.6509869802603948, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.4405940594059405e-06, |
|
"loss": 1.6012, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.6515869682606348, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.438118811881188e-06, |
|
"loss": 1.7112, |
|
"step": 1086 |
|
}, |
|
{ |
|
"epoch": 0.6521869562608748, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.4356435643564356e-06, |
|
"loss": 1.7422, |
|
"step": 1087 |
|
}, |
|
{ |
|
"epoch": 0.6527869442611148, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 1.433168316831683e-06, |
|
"loss": 1.5988, |
|
"step": 1088 |
|
}, |
|
{ |
|
"epoch": 0.6533869322613548, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.4306930693069305e-06, |
|
"loss": 1.631, |
|
"step": 1089 |
|
}, |
|
{ |
|
"epoch": 0.6539869202615948, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 1.428217821782178e-06, |
|
"loss": 1.7748, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.6545869082618347, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.4257425742574256e-06, |
|
"loss": 1.6101, |
|
"step": 1091 |
|
}, |
|
{ |
|
"epoch": 0.6551868962620747, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 1.4232673267326732e-06, |
|
"loss": 1.6727, |
|
"step": 1092 |
|
}, |
|
{ |
|
"epoch": 0.6557868842623148, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.4207920792079207e-06, |
|
"loss": 1.656, |
|
"step": 1093 |
|
}, |
|
{ |
|
"epoch": 0.6563868722625548, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 1.4183168316831683e-06, |
|
"loss": 1.7244, |
|
"step": 1094 |
|
}, |
|
{ |
|
"epoch": 0.6569868602627947, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 1.4158415841584156e-06, |
|
"loss": 1.6891, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.6575868482630347, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.4133663366336632e-06, |
|
"loss": 1.7265, |
|
"step": 1096 |
|
}, |
|
{ |
|
"epoch": 0.6581868362632748, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.4108910891089107e-06, |
|
"loss": 1.5939, |
|
"step": 1097 |
|
}, |
|
{ |
|
"epoch": 0.6587868242635148, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.4084158415841583e-06, |
|
"loss": 1.6942, |
|
"step": 1098 |
|
}, |
|
{ |
|
"epoch": 0.6593868122637547, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.4059405940594058e-06, |
|
"loss": 1.5412, |
|
"step": 1099 |
|
}, |
|
{ |
|
"epoch": 0.6599868002639947, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 1.4034653465346534e-06, |
|
"loss": 1.5955, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.6599868002639947, |
|
"eval_loss": 1.7773131132125854, |
|
"eval_model_preparation_time": 0.0037, |
|
"eval_runtime": 66.1015, |
|
"eval_samples_per_second": 151.283, |
|
"eval_steps_per_second": 25.219, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.6605867882642347, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.4009900990099007e-06, |
|
"loss": 1.665, |
|
"step": 1101 |
|
}, |
|
{ |
|
"epoch": 0.6611867762644748, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 1.3985148514851483e-06, |
|
"loss": 1.6523, |
|
"step": 1102 |
|
}, |
|
{ |
|
"epoch": 0.6617867642647147, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.3960396039603959e-06, |
|
"loss": 1.5928, |
|
"step": 1103 |
|
}, |
|
{ |
|
"epoch": 0.6623867522649547, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.3935643564356434e-06, |
|
"loss": 1.5513, |
|
"step": 1104 |
|
}, |
|
{ |
|
"epoch": 0.6629867402651947, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.391089108910891e-06, |
|
"loss": 1.7027, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.6635867282654347, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.3886138613861385e-06, |
|
"loss": 1.6501, |
|
"step": 1106 |
|
}, |
|
{ |
|
"epoch": 0.6641867162656747, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 1.386138613861386e-06, |
|
"loss": 1.6262, |
|
"step": 1107 |
|
}, |
|
{ |
|
"epoch": 0.6647867042659147, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 1.3836633663366334e-06, |
|
"loss": 1.6506, |
|
"step": 1108 |
|
}, |
|
{ |
|
"epoch": 0.6653866922661547, |
|
"grad_norm": 1.625, |
|
"learning_rate": 1.381188118811881e-06, |
|
"loss": 1.6444, |
|
"step": 1109 |
|
}, |
|
{ |
|
"epoch": 0.6659866802663946, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.3787128712871285e-06, |
|
"loss": 1.6403, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.6665866682666347, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 1.376237623762376e-06, |
|
"loss": 1.6624, |
|
"step": 1111 |
|
}, |
|
{ |
|
"epoch": 0.6671866562668747, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.3737623762376238e-06, |
|
"loss": 1.7689, |
|
"step": 1112 |
|
}, |
|
{ |
|
"epoch": 0.6677866442671146, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 1.3712871287128714e-06, |
|
"loss": 1.661, |
|
"step": 1113 |
|
}, |
|
{ |
|
"epoch": 0.6683866322673546, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.368811881188119e-06, |
|
"loss": 1.6815, |
|
"step": 1114 |
|
}, |
|
{ |
|
"epoch": 0.6689866202675947, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 1.3663366336633663e-06, |
|
"loss": 1.6806, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.6695866082678347, |
|
"grad_norm": 1.625, |
|
"learning_rate": 1.3638613861386139e-06, |
|
"loss": 1.6907, |
|
"step": 1116 |
|
}, |
|
{ |
|
"epoch": 0.6701865962680746, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.3613861386138614e-06, |
|
"loss": 1.6261, |
|
"step": 1117 |
|
}, |
|
{ |
|
"epoch": 0.6707865842683146, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.358910891089109e-06, |
|
"loss": 1.7169, |
|
"step": 1118 |
|
}, |
|
{ |
|
"epoch": 0.6713865722685546, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.3564356435643565e-06, |
|
"loss": 1.5983, |
|
"step": 1119 |
|
}, |
|
{ |
|
"epoch": 0.6719865602687947, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.353960396039604e-06, |
|
"loss": 1.6789, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.6725865482690346, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 1.3514851485148514e-06, |
|
"loss": 1.6336, |
|
"step": 1121 |
|
}, |
|
{ |
|
"epoch": 0.6731865362692746, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.349009900990099e-06, |
|
"loss": 1.7563, |
|
"step": 1122 |
|
}, |
|
{ |
|
"epoch": 0.6737865242695146, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.3465346534653465e-06, |
|
"loss": 1.6033, |
|
"step": 1123 |
|
}, |
|
{ |
|
"epoch": 0.6743865122697547, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.344059405940594e-06, |
|
"loss": 1.6741, |
|
"step": 1124 |
|
}, |
|
{ |
|
"epoch": 0.6749865002699946, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 1.3415841584158416e-06, |
|
"loss": 1.5703, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.6755864882702346, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 1.3391089108910892e-06, |
|
"loss": 1.7107, |
|
"step": 1126 |
|
}, |
|
{ |
|
"epoch": 0.6761864762704746, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.3366336633663367e-06, |
|
"loss": 1.6928, |
|
"step": 1127 |
|
}, |
|
{ |
|
"epoch": 0.6767864642707145, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.334158415841584e-06, |
|
"loss": 1.6344, |
|
"step": 1128 |
|
}, |
|
{ |
|
"epoch": 0.6773864522709546, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 1.3316831683168316e-06, |
|
"loss": 1.5788, |
|
"step": 1129 |
|
}, |
|
{ |
|
"epoch": 0.6779864402711946, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 1.3292079207920792e-06, |
|
"loss": 1.6234, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.6785864282714346, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.3267326732673268e-06, |
|
"loss": 1.6987, |
|
"step": 1131 |
|
}, |
|
{ |
|
"epoch": 0.6791864162716745, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.3242574257425743e-06, |
|
"loss": 1.5971, |
|
"step": 1132 |
|
}, |
|
{ |
|
"epoch": 0.6797864042719146, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.3217821782178219e-06, |
|
"loss": 1.6059, |
|
"step": 1133 |
|
}, |
|
{ |
|
"epoch": 0.6803863922721546, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.3193069306930692e-06, |
|
"loss": 1.6445, |
|
"step": 1134 |
|
}, |
|
{ |
|
"epoch": 0.6809863802723946, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 1.3168316831683168e-06, |
|
"loss": 1.6312, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.6815863682726345, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.3143564356435643e-06, |
|
"loss": 1.6458, |
|
"step": 1136 |
|
}, |
|
{ |
|
"epoch": 0.6821863562728745, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 1.3118811881188119e-06, |
|
"loss": 1.6534, |
|
"step": 1137 |
|
}, |
|
{ |
|
"epoch": 0.6827863442731146, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 1.3094059405940594e-06, |
|
"loss": 1.6041, |
|
"step": 1138 |
|
}, |
|
{ |
|
"epoch": 0.6833863322733545, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 1.306930693069307e-06, |
|
"loss": 1.7073, |
|
"step": 1139 |
|
}, |
|
{ |
|
"epoch": 0.6839863202735945, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.3044554455445545e-06, |
|
"loss": 1.7033, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.6845863082738345, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 1.3019801980198019e-06, |
|
"loss": 1.6317, |
|
"step": 1141 |
|
}, |
|
{ |
|
"epoch": 0.6851862962740746, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.2995049504950494e-06, |
|
"loss": 1.6607, |
|
"step": 1142 |
|
}, |
|
{ |
|
"epoch": 0.6857862842743145, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 1.297029702970297e-06, |
|
"loss": 1.6282, |
|
"step": 1143 |
|
}, |
|
{ |
|
"epoch": 0.6863862722745545, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 1.2945544554455445e-06, |
|
"loss": 1.7765, |
|
"step": 1144 |
|
}, |
|
{ |
|
"epoch": 0.6869862602747945, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.292079207920792e-06, |
|
"loss": 1.6843, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.6875862482750345, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 1.2896039603960396e-06, |
|
"loss": 1.6195, |
|
"step": 1146 |
|
}, |
|
{ |
|
"epoch": 0.6881862362752745, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.2871287128712872e-06, |
|
"loss": 1.6546, |
|
"step": 1147 |
|
}, |
|
{ |
|
"epoch": 0.6887862242755145, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 1.2846534653465345e-06, |
|
"loss": 1.6723, |
|
"step": 1148 |
|
}, |
|
{ |
|
"epoch": 0.6893862122757545, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.282178217821782e-06, |
|
"loss": 1.6211, |
|
"step": 1149 |
|
}, |
|
{ |
|
"epoch": 0.6899862002759944, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 1.2797029702970297e-06, |
|
"loss": 1.7107, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.6899862002759944, |
|
"eval_loss": 1.7773100137710571, |
|
"eval_model_preparation_time": 0.0037, |
|
"eval_runtime": 66.2071, |
|
"eval_samples_per_second": 151.041, |
|
"eval_steps_per_second": 25.179, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.6905861882762345, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.2772277227722772e-06, |
|
"loss": 1.6873, |
|
"step": 1151 |
|
}, |
|
{ |
|
"epoch": 0.6911861762764745, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 1.2747524752475248e-06, |
|
"loss": 1.5066, |
|
"step": 1152 |
|
}, |
|
{ |
|
"epoch": 0.6917861642767145, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.2722772277227723e-06, |
|
"loss": 1.5879, |
|
"step": 1153 |
|
}, |
|
{ |
|
"epoch": 0.6923861522769544, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 1.2698019801980197e-06, |
|
"loss": 1.7176, |
|
"step": 1154 |
|
}, |
|
{ |
|
"epoch": 0.6929861402771944, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 1.2673267326732672e-06, |
|
"loss": 1.6146, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.6935861282774345, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.2648514851485148e-06, |
|
"loss": 1.6275, |
|
"step": 1156 |
|
}, |
|
{ |
|
"epoch": 0.6941861162776745, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.2623762376237623e-06, |
|
"loss": 1.6011, |
|
"step": 1157 |
|
}, |
|
{ |
|
"epoch": 0.6947861042779144, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.2599009900990099e-06, |
|
"loss": 1.7129, |
|
"step": 1158 |
|
}, |
|
{ |
|
"epoch": 0.6953860922781544, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.2574257425742574e-06, |
|
"loss": 1.6008, |
|
"step": 1159 |
|
}, |
|
{ |
|
"epoch": 0.6959860802783945, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.254950495049505e-06, |
|
"loss": 1.6409, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.6965860682786345, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.2524752475247523e-06, |
|
"loss": 1.7206, |
|
"step": 1161 |
|
}, |
|
{ |
|
"epoch": 0.6971860562788744, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 1.2499999999999999e-06, |
|
"loss": 1.6677, |
|
"step": 1162 |
|
}, |
|
{ |
|
"epoch": 0.6977860442791144, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.2475247524752474e-06, |
|
"loss": 1.4656, |
|
"step": 1163 |
|
}, |
|
{ |
|
"epoch": 0.6983860322793544, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 1.245049504950495e-06, |
|
"loss": 1.6027, |
|
"step": 1164 |
|
}, |
|
{ |
|
"epoch": 0.6989860202795944, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.2425742574257426e-06, |
|
"loss": 1.7318, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.6995860082798344, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.2400990099009901e-06, |
|
"loss": 1.6006, |
|
"step": 1166 |
|
}, |
|
{ |
|
"epoch": 0.7001859962800744, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.2376237623762375e-06, |
|
"loss": 1.6734, |
|
"step": 1167 |
|
}, |
|
{ |
|
"epoch": 0.7007859842803144, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 1.235148514851485e-06, |
|
"loss": 1.7149, |
|
"step": 1168 |
|
}, |
|
{ |
|
"epoch": 0.7013859722805544, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.2326732673267326e-06, |
|
"loss": 1.7003, |
|
"step": 1169 |
|
}, |
|
{ |
|
"epoch": 0.7019859602807944, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 1.2301980198019801e-06, |
|
"loss": 1.5693, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.7025859482810344, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 1.2277227722772277e-06, |
|
"loss": 1.6384, |
|
"step": 1171 |
|
}, |
|
{ |
|
"epoch": 0.7031859362812743, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.2252475247524752e-06, |
|
"loss": 1.6105, |
|
"step": 1172 |
|
}, |
|
{ |
|
"epoch": 0.7037859242815143, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.2227722772277228e-06, |
|
"loss": 1.5411, |
|
"step": 1173 |
|
}, |
|
{ |
|
"epoch": 0.7043859122817544, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.2202970297029701e-06, |
|
"loss": 1.677, |
|
"step": 1174 |
|
}, |
|
{ |
|
"epoch": 0.7049859002819944, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.2178217821782177e-06, |
|
"loss": 1.7706, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.7055858882822343, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 1.2153465346534652e-06, |
|
"loss": 1.666, |
|
"step": 1176 |
|
}, |
|
{ |
|
"epoch": 0.7061858762824743, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 1.2128712871287128e-06, |
|
"loss": 1.6809, |
|
"step": 1177 |
|
}, |
|
{ |
|
"epoch": 0.7067858642827144, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.2103960396039603e-06, |
|
"loss": 1.6347, |
|
"step": 1178 |
|
}, |
|
{ |
|
"epoch": 0.7073858522829544, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.207920792079208e-06, |
|
"loss": 1.7277, |
|
"step": 1179 |
|
}, |
|
{ |
|
"epoch": 0.7079858402831943, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.2054455445544555e-06, |
|
"loss": 1.5418, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.7085858282834343, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.2029702970297028e-06, |
|
"loss": 1.6748, |
|
"step": 1181 |
|
}, |
|
{ |
|
"epoch": 0.7091858162836743, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.2004950495049504e-06, |
|
"loss": 1.6577, |
|
"step": 1182 |
|
}, |
|
{ |
|
"epoch": 0.7097858042839144, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.198019801980198e-06, |
|
"loss": 1.7349, |
|
"step": 1183 |
|
}, |
|
{ |
|
"epoch": 0.7103857922841543, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.1955445544554455e-06, |
|
"loss": 1.5652, |
|
"step": 1184 |
|
}, |
|
{ |
|
"epoch": 0.7109857802843943, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.193069306930693e-06, |
|
"loss": 1.6844, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.7115857682846343, |
|
"grad_norm": 1.625, |
|
"learning_rate": 1.1905940594059406e-06, |
|
"loss": 1.6704, |
|
"step": 1186 |
|
}, |
|
{ |
|
"epoch": 0.7121857562848743, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.188118811881188e-06, |
|
"loss": 1.6941, |
|
"step": 1187 |
|
}, |
|
{ |
|
"epoch": 0.7127857442851143, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 1.1856435643564355e-06, |
|
"loss": 1.6569, |
|
"step": 1188 |
|
}, |
|
{ |
|
"epoch": 0.7133857322853543, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.183168316831683e-06, |
|
"loss": 1.749, |
|
"step": 1189 |
|
}, |
|
{ |
|
"epoch": 0.7139857202855943, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 1.1806930693069306e-06, |
|
"loss": 1.6929, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.7145857082858342, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 1.1782178217821781e-06, |
|
"loss": 1.645, |
|
"step": 1191 |
|
}, |
|
{ |
|
"epoch": 0.7151856962860743, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 1.1757425742574257e-06, |
|
"loss": 1.6773, |
|
"step": 1192 |
|
}, |
|
{ |
|
"epoch": 0.7157856842863143, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.1732673267326732e-06, |
|
"loss": 1.7084, |
|
"step": 1193 |
|
}, |
|
{ |
|
"epoch": 0.7163856722865543, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 1.1707920792079206e-06, |
|
"loss": 1.5868, |
|
"step": 1194 |
|
}, |
|
{ |
|
"epoch": 0.7169856602867942, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.1683168316831681e-06, |
|
"loss": 1.5783, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.7175856482870343, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 1.1658415841584157e-06, |
|
"loss": 1.7283, |
|
"step": 1196 |
|
}, |
|
{ |
|
"epoch": 0.7181856362872743, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.1633663366336632e-06, |
|
"loss": 1.6033, |
|
"step": 1197 |
|
}, |
|
{ |
|
"epoch": 0.7187856242875142, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.1608910891089108e-06, |
|
"loss": 1.7633, |
|
"step": 1198 |
|
}, |
|
{ |
|
"epoch": 0.7193856122877542, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.1584158415841584e-06, |
|
"loss": 1.6804, |
|
"step": 1199 |
|
}, |
|
{ |
|
"epoch": 0.7199856002879942, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.1559405940594057e-06, |
|
"loss": 1.5985, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.7199856002879942, |
|
"eval_loss": 1.777273416519165, |
|
"eval_model_preparation_time": 0.0037, |
|
"eval_runtime": 68.7207, |
|
"eval_samples_per_second": 145.517, |
|
"eval_steps_per_second": 24.258, |
|
"step": 1200 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 1666, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.356578165633057e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|