diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8650 @@ +{ + "best_global_step": 1200, + "best_metric": 1.777273416519165, + "best_model_checkpoint": "./output_dir/th-Llama-3.1-8B-lr4e-06-atten0.25-ffn0.25_20250430_142946/checkpoint-1200", + "epoch": 0.7199856002879942, + "eval_steps": 50, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005999880002399952, + "grad_norm": 2.234375, + "learning_rate": 0.0, + "loss": 1.7675, + "step": 1 + }, + { + "epoch": 0.0011999760004799903, + "grad_norm": 2.1875, + "learning_rate": 8e-08, + "loss": 1.6481, + "step": 2 + }, + { + "epoch": 0.0017999640007199855, + "grad_norm": 2.15625, + "learning_rate": 1.6e-07, + "loss": 1.6866, + "step": 3 + }, + { + "epoch": 0.0023999520009599807, + "grad_norm": 1.953125, + "learning_rate": 2.4e-07, + "loss": 1.7062, + "step": 4 + }, + { + "epoch": 0.002999940001199976, + "grad_norm": 2.3125, + "learning_rate": 3.2e-07, + "loss": 1.774, + "step": 5 + }, + { + "epoch": 0.003599928001439971, + "grad_norm": 2.09375, + "learning_rate": 4e-07, + "loss": 1.6995, + "step": 6 + }, + { + "epoch": 0.004199916001679967, + "grad_norm": 1.9765625, + "learning_rate": 4.8e-07, + "loss": 1.6063, + "step": 7 + }, + { + "epoch": 0.004799904001919961, + "grad_norm": 1.9296875, + "learning_rate": 5.6e-07, + "loss": 1.5995, + "step": 8 + }, + { + "epoch": 0.005399892002159957, + "grad_norm": 2.03125, + "learning_rate": 6.4e-07, + "loss": 1.7045, + "step": 9 + }, + { + "epoch": 0.005999880002399952, + "grad_norm": 2.421875, + "learning_rate": 7.2e-07, + "loss": 1.5958, + "step": 10 + }, + { + "epoch": 0.006599868002639947, + "grad_norm": 2.1875, + "learning_rate": 8e-07, + "loss": 1.6149, + "step": 11 + }, + { + "epoch": 0.007199856002879942, + "grad_norm": 2.09375, + "learning_rate": 8.799999999999999e-07, + "loss": 1.7559, + "step": 12 + }, + { + "epoch": 0.007799844003119938, + "grad_norm": 2.015625, + "learning_rate": 9.6e-07, + "loss": 1.682, + "step": 13 + }, + { + "epoch": 0.008399832003359933, + "grad_norm": 2.4375, + "learning_rate": 1.04e-06, + "loss": 1.6184, + "step": 14 + }, + { + "epoch": 0.008999820003599928, + "grad_norm": 2.0625, + "learning_rate": 1.12e-06, + "loss": 1.7178, + "step": 15 + }, + { + "epoch": 0.009599808003839923, + "grad_norm": 2.15625, + "learning_rate": 1.2e-06, + "loss": 1.5901, + "step": 16 + }, + { + "epoch": 0.01019979600407992, + "grad_norm": 1.9375, + "learning_rate": 1.28e-06, + "loss": 1.6869, + "step": 17 + }, + { + "epoch": 0.010799784004319914, + "grad_norm": 2.3125, + "learning_rate": 1.3600000000000001e-06, + "loss": 1.6398, + "step": 18 + }, + { + "epoch": 0.011399772004559909, + "grad_norm": 1.953125, + "learning_rate": 1.44e-06, + "loss": 1.7697, + "step": 19 + }, + { + "epoch": 0.011999760004799903, + "grad_norm": 2.109375, + "learning_rate": 1.5199999999999998e-06, + "loss": 1.7484, + "step": 20 + }, + { + "epoch": 0.0125997480050399, + "grad_norm": 2.234375, + "learning_rate": 1.6e-06, + "loss": 1.7561, + "step": 21 + }, + { + "epoch": 0.013199736005279895, + "grad_norm": 2.234375, + "learning_rate": 1.6799999999999998e-06, + "loss": 1.5346, + "step": 22 + }, + { + "epoch": 0.01379972400551989, + "grad_norm": 2.390625, + "learning_rate": 1.7599999999999999e-06, + "loss": 1.7269, + "step": 23 + }, + { + "epoch": 0.014399712005759884, + "grad_norm": 1.921875, + "learning_rate": 1.84e-06, + "loss": 
1.6799, + "step": 24 + }, + { + "epoch": 0.01499970000599988, + "grad_norm": 2.25, + "learning_rate": 1.92e-06, + "loss": 1.6713, + "step": 25 + }, + { + "epoch": 0.015599688006239875, + "grad_norm": 2.140625, + "learning_rate": 2e-06, + "loss": 1.6378, + "step": 26 + }, + { + "epoch": 0.016199676006479872, + "grad_norm": 2.140625, + "learning_rate": 2.08e-06, + "loss": 1.7315, + "step": 27 + }, + { + "epoch": 0.016799664006719867, + "grad_norm": 2.34375, + "learning_rate": 2.16e-06, + "loss": 1.7283, + "step": 28 + }, + { + "epoch": 0.01739965200695986, + "grad_norm": 2.015625, + "learning_rate": 2.24e-06, + "loss": 1.7627, + "step": 29 + }, + { + "epoch": 0.017999640007199856, + "grad_norm": 2.1875, + "learning_rate": 2.32e-06, + "loss": 1.6382, + "step": 30 + }, + { + "epoch": 0.01859962800743985, + "grad_norm": 1.78125, + "learning_rate": 2.4e-06, + "loss": 1.6786, + "step": 31 + }, + { + "epoch": 0.019199616007679846, + "grad_norm": 2.359375, + "learning_rate": 2.48e-06, + "loss": 1.6262, + "step": 32 + }, + { + "epoch": 0.01979960400791984, + "grad_norm": 2.0, + "learning_rate": 2.56e-06, + "loss": 1.6589, + "step": 33 + }, + { + "epoch": 0.02039959200815984, + "grad_norm": 2.34375, + "learning_rate": 2.64e-06, + "loss": 1.671, + "step": 34 + }, + { + "epoch": 0.020999580008399833, + "grad_norm": 2.03125, + "learning_rate": 2.7200000000000002e-06, + "loss": 1.7393, + "step": 35 + }, + { + "epoch": 0.021599568008639828, + "grad_norm": 2.140625, + "learning_rate": 2.8e-06, + "loss": 1.6027, + "step": 36 + }, + { + "epoch": 0.022199556008879823, + "grad_norm": 2.0, + "learning_rate": 2.88e-06, + "loss": 1.8158, + "step": 37 + }, + { + "epoch": 0.022799544009119817, + "grad_norm": 2.015625, + "learning_rate": 2.96e-06, + "loss": 1.7158, + "step": 38 + }, + { + "epoch": 0.023399532009359812, + "grad_norm": 1.96875, + "learning_rate": 3.0399999999999997e-06, + "loss": 1.7778, + "step": 39 + }, + { + "epoch": 0.023999520009599807, + "grad_norm": 1.9765625, + "learning_rate": 3.1199999999999998e-06, + "loss": 1.6903, + "step": 40 + }, + { + "epoch": 0.0245995080098398, + "grad_norm": 2.046875, + "learning_rate": 3.2e-06, + "loss": 1.6403, + "step": 41 + }, + { + "epoch": 0.0251994960100798, + "grad_norm": 1.859375, + "learning_rate": 3.2799999999999995e-06, + "loss": 1.7292, + "step": 42 + }, + { + "epoch": 0.025799484010319795, + "grad_norm": 2.09375, + "learning_rate": 3.3599999999999996e-06, + "loss": 1.6956, + "step": 43 + }, + { + "epoch": 0.02639947201055979, + "grad_norm": 1.84375, + "learning_rate": 3.4399999999999997e-06, + "loss": 1.6927, + "step": 44 + }, + { + "epoch": 0.026999460010799784, + "grad_norm": 2.15625, + "learning_rate": 3.5199999999999998e-06, + "loss": 1.6794, + "step": 45 + }, + { + "epoch": 0.02759944801103978, + "grad_norm": 2.046875, + "learning_rate": 3.6e-06, + "loss": 1.7373, + "step": 46 + }, + { + "epoch": 0.028199436011279774, + "grad_norm": 2.0, + "learning_rate": 3.68e-06, + "loss": 1.6971, + "step": 47 + }, + { + "epoch": 0.02879942401151977, + "grad_norm": 2.03125, + "learning_rate": 3.7599999999999996e-06, + "loss": 1.7465, + "step": 48 + }, + { + "epoch": 0.029399412011759767, + "grad_norm": 1.890625, + "learning_rate": 3.84e-06, + "loss": 1.6621, + "step": 49 + }, + { + "epoch": 0.02999940001199976, + "grad_norm": 1.9140625, + "learning_rate": 3.92e-06, + "loss": 1.6802, + "step": 50 + }, + { + "epoch": 0.02999940001199976, + "eval_loss": 1.7964001893997192, + "eval_model_preparation_time": 0.0037, + "eval_runtime": 66.1162, + 
"eval_samples_per_second": 151.249, + "eval_steps_per_second": 25.213, + "step": 50 + }, + { + "epoch": 0.030599388012239756, + "grad_norm": 1.8203125, + "learning_rate": 4e-06, + "loss": 1.6904, + "step": 51 + }, + { + "epoch": 0.03119937601247975, + "grad_norm": 2.140625, + "learning_rate": 3.997524752475248e-06, + "loss": 1.6148, + "step": 52 + }, + { + "epoch": 0.031799364012719745, + "grad_norm": 1.7578125, + "learning_rate": 3.9950495049504945e-06, + "loss": 1.6905, + "step": 53 + }, + { + "epoch": 0.032399352012959744, + "grad_norm": 2.09375, + "learning_rate": 3.992574257425742e-06, + "loss": 1.7708, + "step": 54 + }, + { + "epoch": 0.032999340013199735, + "grad_norm": 1.75, + "learning_rate": 3.99009900990099e-06, + "loss": 1.7365, + "step": 55 + }, + { + "epoch": 0.03359932801343973, + "grad_norm": 2.109375, + "learning_rate": 3.987623762376238e-06, + "loss": 1.5333, + "step": 56 + }, + { + "epoch": 0.034199316013679724, + "grad_norm": 1.921875, + "learning_rate": 3.985148514851485e-06, + "loss": 1.7474, + "step": 57 + }, + { + "epoch": 0.03479930401391972, + "grad_norm": 1.90625, + "learning_rate": 3.9826732673267325e-06, + "loss": 1.667, + "step": 58 + }, + { + "epoch": 0.035399292014159714, + "grad_norm": 2.140625, + "learning_rate": 3.98019801980198e-06, + "loss": 1.652, + "step": 59 + }, + { + "epoch": 0.03599928001439971, + "grad_norm": 1.9140625, + "learning_rate": 3.977722772277228e-06, + "loss": 1.6359, + "step": 60 + }, + { + "epoch": 0.03659926801463971, + "grad_norm": 1.9765625, + "learning_rate": 3.975247524752475e-06, + "loss": 1.6885, + "step": 61 + }, + { + "epoch": 0.0371992560148797, + "grad_norm": 1.9765625, + "learning_rate": 3.972772277227723e-06, + "loss": 1.6758, + "step": 62 + }, + { + "epoch": 0.0377992440151197, + "grad_norm": 2.0, + "learning_rate": 3.9702970297029705e-06, + "loss": 1.6944, + "step": 63 + }, + { + "epoch": 0.03839923201535969, + "grad_norm": 1.734375, + "learning_rate": 3.967821782178218e-06, + "loss": 1.5645, + "step": 64 + }, + { + "epoch": 0.03899922001559969, + "grad_norm": 1.8125, + "learning_rate": 3.965346534653465e-06, + "loss": 1.7534, + "step": 65 + }, + { + "epoch": 0.03959920801583968, + "grad_norm": 2.078125, + "learning_rate": 3.962871287128713e-06, + "loss": 1.6782, + "step": 66 + }, + { + "epoch": 0.04019919601607968, + "grad_norm": 1.90625, + "learning_rate": 3.96039603960396e-06, + "loss": 1.6826, + "step": 67 + }, + { + "epoch": 0.04079918401631968, + "grad_norm": 1.8203125, + "learning_rate": 3.957920792079208e-06, + "loss": 1.692, + "step": 68 + }, + { + "epoch": 0.04139917201655967, + "grad_norm": 1.8984375, + "learning_rate": 3.955445544554455e-06, + "loss": 1.5594, + "step": 69 + }, + { + "epoch": 0.041999160016799666, + "grad_norm": 1.7890625, + "learning_rate": 3.952970297029703e-06, + "loss": 1.677, + "step": 70 + }, + { + "epoch": 0.04259914801703966, + "grad_norm": 1.8984375, + "learning_rate": 3.95049504950495e-06, + "loss": 1.6147, + "step": 71 + }, + { + "epoch": 0.043199136017279656, + "grad_norm": 1.8125, + "learning_rate": 3.948019801980198e-06, + "loss": 1.6306, + "step": 72 + }, + { + "epoch": 0.04379912401751965, + "grad_norm": 1.8984375, + "learning_rate": 3.945544554455446e-06, + "loss": 1.5884, + "step": 73 + }, + { + "epoch": 0.044399112017759645, + "grad_norm": 1.8359375, + "learning_rate": 3.943069306930693e-06, + "loss": 1.5904, + "step": 74 + }, + { + "epoch": 0.04499910001799964, + "grad_norm": 1.8671875, + "learning_rate": 3.94059405940594e-06, + "loss": 1.628, + "step": 75 + }, + { + 
"epoch": 0.045599088018239635, + "grad_norm": 1.875, + "learning_rate": 3.938118811881188e-06, + "loss": 1.7228, + "step": 76 + }, + { + "epoch": 0.04619907601847963, + "grad_norm": 1.8671875, + "learning_rate": 3.935643564356436e-06, + "loss": 1.7077, + "step": 77 + }, + { + "epoch": 0.046799064018719624, + "grad_norm": 1.703125, + "learning_rate": 3.933168316831683e-06, + "loss": 1.5831, + "step": 78 + }, + { + "epoch": 0.04739905201895962, + "grad_norm": 1.6875, + "learning_rate": 3.9306930693069305e-06, + "loss": 1.6097, + "step": 79 + }, + { + "epoch": 0.047999040019199614, + "grad_norm": 1.765625, + "learning_rate": 3.928217821782178e-06, + "loss": 1.6748, + "step": 80 + }, + { + "epoch": 0.04859902801943961, + "grad_norm": 1.78125, + "learning_rate": 3.925742574257425e-06, + "loss": 1.598, + "step": 81 + }, + { + "epoch": 0.0491990160196796, + "grad_norm": 1.6640625, + "learning_rate": 3.923267326732673e-06, + "loss": 1.6844, + "step": 82 + }, + { + "epoch": 0.0497990040199196, + "grad_norm": 2.03125, + "learning_rate": 3.920792079207921e-06, + "loss": 1.7564, + "step": 83 + }, + { + "epoch": 0.0503989920201596, + "grad_norm": 1.984375, + "learning_rate": 3.9183168316831685e-06, + "loss": 1.6621, + "step": 84 + }, + { + "epoch": 0.05099898002039959, + "grad_norm": 1.953125, + "learning_rate": 3.915841584158415e-06, + "loss": 1.6924, + "step": 85 + }, + { + "epoch": 0.05159896802063959, + "grad_norm": 1.8515625, + "learning_rate": 3.913366336633663e-06, + "loss": 1.6198, + "step": 86 + }, + { + "epoch": 0.05219895602087958, + "grad_norm": 1.9140625, + "learning_rate": 3.910891089108911e-06, + "loss": 1.6174, + "step": 87 + }, + { + "epoch": 0.05279894402111958, + "grad_norm": 1.8515625, + "learning_rate": 3.908415841584159e-06, + "loss": 1.67, + "step": 88 + }, + { + "epoch": 0.05339893202135957, + "grad_norm": 2.453125, + "learning_rate": 3.905940594059406e-06, + "loss": 1.7446, + "step": 89 + }, + { + "epoch": 0.05399892002159957, + "grad_norm": 1.828125, + "learning_rate": 3.903465346534653e-06, + "loss": 1.6408, + "step": 90 + }, + { + "epoch": 0.054598908021839566, + "grad_norm": 1.7421875, + "learning_rate": 3.9009900990099e-06, + "loss": 1.7832, + "step": 91 + }, + { + "epoch": 0.05519889602207956, + "grad_norm": 1.890625, + "learning_rate": 3.898514851485148e-06, + "loss": 1.6397, + "step": 92 + }, + { + "epoch": 0.055798884022319556, + "grad_norm": 1.6953125, + "learning_rate": 3.896039603960396e-06, + "loss": 1.6805, + "step": 93 + }, + { + "epoch": 0.05639887202255955, + "grad_norm": 1.84375, + "learning_rate": 3.893564356435644e-06, + "loss": 1.742, + "step": 94 + }, + { + "epoch": 0.056998860022799545, + "grad_norm": 1.875, + "learning_rate": 3.8910891089108905e-06, + "loss": 1.5952, + "step": 95 + }, + { + "epoch": 0.05759884802303954, + "grad_norm": 1.9296875, + "learning_rate": 3.888613861386138e-06, + "loss": 1.7769, + "step": 96 + }, + { + "epoch": 0.058198836023279535, + "grad_norm": 1.6484375, + "learning_rate": 3.886138613861386e-06, + "loss": 1.6168, + "step": 97 + }, + { + "epoch": 0.05879882402351953, + "grad_norm": 1.984375, + "learning_rate": 3.883663366336634e-06, + "loss": 1.6283, + "step": 98 + }, + { + "epoch": 0.059398812023759524, + "grad_norm": 1.8828125, + "learning_rate": 3.881188118811881e-06, + "loss": 1.6354, + "step": 99 + }, + { + "epoch": 0.05999880002399952, + "grad_norm": 1.609375, + "learning_rate": 3.8787128712871285e-06, + "loss": 1.6169, + "step": 100 + }, + { + "epoch": 0.05999880002399952, + "eval_loss": 1.7915641069412231, + 
"eval_model_preparation_time": 0.0037, + "eval_runtime": 66.2561, + "eval_samples_per_second": 150.93, + "eval_steps_per_second": 25.16, + "step": 100 + }, + { + "epoch": 0.060598788024239514, + "grad_norm": 1.703125, + "learning_rate": 3.876237623762376e-06, + "loss": 1.785, + "step": 101 + }, + { + "epoch": 0.06119877602447951, + "grad_norm": 1.6484375, + "learning_rate": 3.873762376237624e-06, + "loss": 1.7006, + "step": 102 + }, + { + "epoch": 0.0617987640247195, + "grad_norm": 1.890625, + "learning_rate": 3.871287128712871e-06, + "loss": 1.7633, + "step": 103 + }, + { + "epoch": 0.0623987520249595, + "grad_norm": 1.6640625, + "learning_rate": 3.868811881188119e-06, + "loss": 1.7084, + "step": 104 + }, + { + "epoch": 0.06299874002519949, + "grad_norm": 1.75, + "learning_rate": 3.866336633663366e-06, + "loss": 1.7583, + "step": 105 + }, + { + "epoch": 0.06359872802543949, + "grad_norm": 1.6875, + "learning_rate": 3.8638613861386134e-06, + "loss": 1.6945, + "step": 106 + }, + { + "epoch": 0.06419871602567949, + "grad_norm": 1.703125, + "learning_rate": 3.861386138613861e-06, + "loss": 1.7145, + "step": 107 + }, + { + "epoch": 0.06479870402591949, + "grad_norm": 1.6328125, + "learning_rate": 3.858910891089109e-06, + "loss": 1.6482, + "step": 108 + }, + { + "epoch": 0.06539869202615947, + "grad_norm": 1.6953125, + "learning_rate": 3.856435643564356e-06, + "loss": 1.7366, + "step": 109 + }, + { + "epoch": 0.06599868002639947, + "grad_norm": 1.84375, + "learning_rate": 3.853960396039604e-06, + "loss": 1.6372, + "step": 110 + }, + { + "epoch": 0.06659866802663947, + "grad_norm": 1.8359375, + "learning_rate": 3.851485148514851e-06, + "loss": 1.7359, + "step": 111 + }, + { + "epoch": 0.06719865602687947, + "grad_norm": 1.71875, + "learning_rate": 3.849009900990099e-06, + "loss": 1.6406, + "step": 112 + }, + { + "epoch": 0.06779864402711945, + "grad_norm": 1.7421875, + "learning_rate": 3.846534653465346e-06, + "loss": 1.696, + "step": 113 + }, + { + "epoch": 0.06839863202735945, + "grad_norm": 1.703125, + "learning_rate": 3.844059405940594e-06, + "loss": 1.6321, + "step": 114 + }, + { + "epoch": 0.06899862002759945, + "grad_norm": 1.8828125, + "learning_rate": 3.841584158415842e-06, + "loss": 1.706, + "step": 115 + }, + { + "epoch": 0.06959860802783945, + "grad_norm": 1.65625, + "learning_rate": 3.839108910891089e-06, + "loss": 1.7333, + "step": 116 + }, + { + "epoch": 0.07019859602807944, + "grad_norm": 1.9296875, + "learning_rate": 3.836633663366336e-06, + "loss": 1.7489, + "step": 117 + }, + { + "epoch": 0.07079858402831943, + "grad_norm": 1.7578125, + "learning_rate": 3.834158415841584e-06, + "loss": 1.6628, + "step": 118 + }, + { + "epoch": 0.07139857202855943, + "grad_norm": 1.7421875, + "learning_rate": 3.831683168316831e-06, + "loss": 1.7741, + "step": 119 + }, + { + "epoch": 0.07199856002879942, + "grad_norm": 1.6796875, + "learning_rate": 3.829207920792079e-06, + "loss": 1.6783, + "step": 120 + }, + { + "epoch": 0.07259854802903942, + "grad_norm": 1.921875, + "learning_rate": 3.8267326732673265e-06, + "loss": 1.6955, + "step": 121 + }, + { + "epoch": 0.07319853602927942, + "grad_norm": 1.984375, + "learning_rate": 3.824257425742574e-06, + "loss": 1.721, + "step": 122 + }, + { + "epoch": 0.0737985240295194, + "grad_norm": 1.6640625, + "learning_rate": 3.821782178217821e-06, + "loss": 1.6035, + "step": 123 + }, + { + "epoch": 0.0743985120297594, + "grad_norm": 2.140625, + "learning_rate": 3.819306930693069e-06, + "loss": 1.5864, + "step": 124 + }, + { + "epoch": 0.0749985000299994, + 
"grad_norm": 1.546875, + "learning_rate": 3.816831683168317e-06, + "loss": 1.6237, + "step": 125 + }, + { + "epoch": 0.0755984880302394, + "grad_norm": 1.734375, + "learning_rate": 3.814356435643564e-06, + "loss": 1.6389, + "step": 126 + }, + { + "epoch": 0.07619847603047938, + "grad_norm": 1.96875, + "learning_rate": 3.8118811881188114e-06, + "loss": 1.6965, + "step": 127 + }, + { + "epoch": 0.07679846403071938, + "grad_norm": 2.34375, + "learning_rate": 3.809405940594059e-06, + "loss": 1.5421, + "step": 128 + }, + { + "epoch": 0.07739845203095938, + "grad_norm": 1.875, + "learning_rate": 3.8069306930693065e-06, + "loss": 1.7131, + "step": 129 + }, + { + "epoch": 0.07799844003119938, + "grad_norm": 1.7265625, + "learning_rate": 3.8044554455445543e-06, + "loss": 1.7201, + "step": 130 + }, + { + "epoch": 0.07859842803143938, + "grad_norm": 1.8125, + "learning_rate": 3.8019801980198017e-06, + "loss": 1.7214, + "step": 131 + }, + { + "epoch": 0.07919841603167936, + "grad_norm": 1.71875, + "learning_rate": 3.7995049504950494e-06, + "loss": 1.7073, + "step": 132 + }, + { + "epoch": 0.07979840403191936, + "grad_norm": 1.578125, + "learning_rate": 3.7970297029702968e-06, + "loss": 1.6527, + "step": 133 + }, + { + "epoch": 0.08039839203215936, + "grad_norm": 1.7890625, + "learning_rate": 3.7945544554455445e-06, + "loss": 1.6767, + "step": 134 + }, + { + "epoch": 0.08099838003239936, + "grad_norm": 1.609375, + "learning_rate": 3.792079207920792e-06, + "loss": 1.7262, + "step": 135 + }, + { + "epoch": 0.08159836803263935, + "grad_norm": 1.65625, + "learning_rate": 3.7896039603960396e-06, + "loss": 1.7001, + "step": 136 + }, + { + "epoch": 0.08219835603287934, + "grad_norm": 1.6171875, + "learning_rate": 3.7871287128712866e-06, + "loss": 1.6917, + "step": 137 + }, + { + "epoch": 0.08279834403311934, + "grad_norm": 1.734375, + "learning_rate": 3.7846534653465343e-06, + "loss": 1.6897, + "step": 138 + }, + { + "epoch": 0.08339833203335933, + "grad_norm": 1.78125, + "learning_rate": 3.7821782178217817e-06, + "loss": 1.6675, + "step": 139 + }, + { + "epoch": 0.08399832003359933, + "grad_norm": 1.6015625, + "learning_rate": 3.7797029702970294e-06, + "loss": 1.5694, + "step": 140 + }, + { + "epoch": 0.08459830803383932, + "grad_norm": 1.71875, + "learning_rate": 3.7772277227722768e-06, + "loss": 1.633, + "step": 141 + }, + { + "epoch": 0.08519829603407932, + "grad_norm": 1.90625, + "learning_rate": 3.7747524752475245e-06, + "loss": 1.7887, + "step": 142 + }, + { + "epoch": 0.08579828403431931, + "grad_norm": 1.6953125, + "learning_rate": 3.772277227722772e-06, + "loss": 1.593, + "step": 143 + }, + { + "epoch": 0.08639827203455931, + "grad_norm": 1.796875, + "learning_rate": 3.7698019801980197e-06, + "loss": 1.6052, + "step": 144 + }, + { + "epoch": 0.08699826003479931, + "grad_norm": 1.6796875, + "learning_rate": 3.767326732673267e-06, + "loss": 1.5389, + "step": 145 + }, + { + "epoch": 0.0875982480350393, + "grad_norm": 1.6953125, + "learning_rate": 3.7648514851485148e-06, + "loss": 1.6326, + "step": 146 + }, + { + "epoch": 0.08819823603527929, + "grad_norm": 1.4921875, + "learning_rate": 3.762376237623762e-06, + "loss": 1.6813, + "step": 147 + }, + { + "epoch": 0.08879822403551929, + "grad_norm": 1.8359375, + "learning_rate": 3.75990099009901e-06, + "loss": 1.6882, + "step": 148 + }, + { + "epoch": 0.08939821203575929, + "grad_norm": 1.8515625, + "learning_rate": 3.7574257425742572e-06, + "loss": 1.5999, + "step": 149 + }, + { + "epoch": 0.08999820003599927, + "grad_norm": 1.6796875, + "learning_rate": 
3.754950495049505e-06, + "loss": 1.5369, + "step": 150 + }, + { + "epoch": 0.08999820003599927, + "eval_loss": 1.788702130317688, + "eval_model_preparation_time": 0.0037, + "eval_runtime": 68.5232, + "eval_samples_per_second": 145.936, + "eval_steps_per_second": 24.328, + "step": 150 + }, + { + "epoch": 0.09059818803623927, + "grad_norm": 1.8046875, + "learning_rate": 3.752475247524752e-06, + "loss": 1.6148, + "step": 151 + }, + { + "epoch": 0.09119817603647927, + "grad_norm": 1.8203125, + "learning_rate": 3.7499999999999997e-06, + "loss": 1.6697, + "step": 152 + }, + { + "epoch": 0.09179816403671927, + "grad_norm": 1.75, + "learning_rate": 3.7475247524752474e-06, + "loss": 1.6048, + "step": 153 + }, + { + "epoch": 0.09239815203695927, + "grad_norm": 1.6953125, + "learning_rate": 3.7450495049504948e-06, + "loss": 1.6984, + "step": 154 + }, + { + "epoch": 0.09299814003719925, + "grad_norm": 1.7421875, + "learning_rate": 3.7425742574257425e-06, + "loss": 1.5856, + "step": 155 + }, + { + "epoch": 0.09359812803743925, + "grad_norm": 1.5546875, + "learning_rate": 3.74009900990099e-06, + "loss": 1.6575, + "step": 156 + }, + { + "epoch": 0.09419811603767925, + "grad_norm": 1.8359375, + "learning_rate": 3.7376237623762377e-06, + "loss": 1.6302, + "step": 157 + }, + { + "epoch": 0.09479810403791925, + "grad_norm": 1.6640625, + "learning_rate": 3.735148514851485e-06, + "loss": 1.644, + "step": 158 + }, + { + "epoch": 0.09539809203815924, + "grad_norm": 1.734375, + "learning_rate": 3.7326732673267328e-06, + "loss": 1.7122, + "step": 159 + }, + { + "epoch": 0.09599808003839923, + "grad_norm": 1.8984375, + "learning_rate": 3.73019801980198e-06, + "loss": 1.6811, + "step": 160 + }, + { + "epoch": 0.09659806803863923, + "grad_norm": 1.65625, + "learning_rate": 3.727722772277228e-06, + "loss": 1.6262, + "step": 161 + }, + { + "epoch": 0.09719805603887922, + "grad_norm": 1.6875, + "learning_rate": 3.7252475247524752e-06, + "loss": 1.5868, + "step": 162 + }, + { + "epoch": 0.09779804403911922, + "grad_norm": 1.6640625, + "learning_rate": 3.722772277227723e-06, + "loss": 1.7002, + "step": 163 + }, + { + "epoch": 0.0983980320393592, + "grad_norm": 1.6796875, + "learning_rate": 3.72029702970297e-06, + "loss": 1.6849, + "step": 164 + }, + { + "epoch": 0.0989980200395992, + "grad_norm": 1.7109375, + "learning_rate": 3.7178217821782177e-06, + "loss": 1.7176, + "step": 165 + }, + { + "epoch": 0.0995980080398392, + "grad_norm": 1.6875, + "learning_rate": 3.715346534653465e-06, + "loss": 1.6203, + "step": 166 + }, + { + "epoch": 0.1001979960400792, + "grad_norm": 1.6953125, + "learning_rate": 3.7128712871287128e-06, + "loss": 1.715, + "step": 167 + }, + { + "epoch": 0.1007979840403192, + "grad_norm": 1.640625, + "learning_rate": 3.71039603960396e-06, + "loss": 1.6671, + "step": 168 + }, + { + "epoch": 0.10139797204055918, + "grad_norm": 1.734375, + "learning_rate": 3.707920792079208e-06, + "loss": 1.7003, + "step": 169 + }, + { + "epoch": 0.10199796004079918, + "grad_norm": 1.6875, + "learning_rate": 3.7054455445544552e-06, + "loss": 1.658, + "step": 170 + }, + { + "epoch": 0.10259794804103918, + "grad_norm": 1.8828125, + "learning_rate": 3.702970297029703e-06, + "loss": 1.6661, + "step": 171 + }, + { + "epoch": 0.10319793604127918, + "grad_norm": 1.859375, + "learning_rate": 3.7004950495049503e-06, + "loss": 1.6916, + "step": 172 + }, + { + "epoch": 0.10379792404151916, + "grad_norm": 2.03125, + "learning_rate": 3.698019801980198e-06, + "loss": 1.7125, + "step": 173 + }, + { + "epoch": 0.10439791204175916, + 
"grad_norm": 1.671875, + "learning_rate": 3.6955445544554455e-06, + "loss": 1.6975, + "step": 174 + }, + { + "epoch": 0.10499790004199916, + "grad_norm": 1.7578125, + "learning_rate": 3.6930693069306932e-06, + "loss": 1.7408, + "step": 175 + }, + { + "epoch": 0.10559788804223916, + "grad_norm": 1.625, + "learning_rate": 3.6905940594059406e-06, + "loss": 1.7682, + "step": 176 + }, + { + "epoch": 0.10619787604247916, + "grad_norm": 1.6796875, + "learning_rate": 3.6881188118811883e-06, + "loss": 1.6703, + "step": 177 + }, + { + "epoch": 0.10679786404271914, + "grad_norm": 1.90625, + "learning_rate": 3.6856435643564352e-06, + "loss": 1.6898, + "step": 178 + }, + { + "epoch": 0.10739785204295914, + "grad_norm": 1.890625, + "learning_rate": 3.683168316831683e-06, + "loss": 1.7006, + "step": 179 + }, + { + "epoch": 0.10799784004319914, + "grad_norm": 1.734375, + "learning_rate": 3.6806930693069304e-06, + "loss": 1.659, + "step": 180 + }, + { + "epoch": 0.10859782804343913, + "grad_norm": 1.7421875, + "learning_rate": 3.678217821782178e-06, + "loss": 1.685, + "step": 181 + }, + { + "epoch": 0.10919781604367913, + "grad_norm": 1.796875, + "learning_rate": 3.6757425742574255e-06, + "loss": 1.7294, + "step": 182 + }, + { + "epoch": 0.10979780404391912, + "grad_norm": 1.890625, + "learning_rate": 3.6732673267326732e-06, + "loss": 1.7036, + "step": 183 + }, + { + "epoch": 0.11039779204415912, + "grad_norm": 1.765625, + "learning_rate": 3.6707920792079206e-06, + "loss": 1.6733, + "step": 184 + }, + { + "epoch": 0.11099778004439911, + "grad_norm": 1.6953125, + "learning_rate": 3.6683168316831683e-06, + "loss": 1.7434, + "step": 185 + }, + { + "epoch": 0.11159776804463911, + "grad_norm": 1.7734375, + "learning_rate": 3.6658415841584157e-06, + "loss": 1.6844, + "step": 186 + }, + { + "epoch": 0.1121977560448791, + "grad_norm": 1.7421875, + "learning_rate": 3.6633663366336635e-06, + "loss": 1.7077, + "step": 187 + }, + { + "epoch": 0.1127977440451191, + "grad_norm": 1.75, + "learning_rate": 3.660891089108911e-06, + "loss": 1.7586, + "step": 188 + }, + { + "epoch": 0.11339773204535909, + "grad_norm": 1.828125, + "learning_rate": 3.6584158415841586e-06, + "loss": 1.6478, + "step": 189 + }, + { + "epoch": 0.11399772004559909, + "grad_norm": 1.7265625, + "learning_rate": 3.6559405940594055e-06, + "loss": 1.6893, + "step": 190 + }, + { + "epoch": 0.11459770804583909, + "grad_norm": 1.9375, + "learning_rate": 3.6534653465346532e-06, + "loss": 1.7054, + "step": 191 + }, + { + "epoch": 0.11519769604607907, + "grad_norm": 1.65625, + "learning_rate": 3.6509900990099006e-06, + "loss": 1.7179, + "step": 192 + }, + { + "epoch": 0.11579768404631907, + "grad_norm": 1.6796875, + "learning_rate": 3.6485148514851484e-06, + "loss": 1.5688, + "step": 193 + }, + { + "epoch": 0.11639767204655907, + "grad_norm": 1.8203125, + "learning_rate": 3.6460396039603957e-06, + "loss": 1.807, + "step": 194 + }, + { + "epoch": 0.11699766004679907, + "grad_norm": 1.734375, + "learning_rate": 3.6435643564356435e-06, + "loss": 1.6499, + "step": 195 + }, + { + "epoch": 0.11759764804703907, + "grad_norm": 1.796875, + "learning_rate": 3.641089108910891e-06, + "loss": 1.6366, + "step": 196 + }, + { + "epoch": 0.11819763604727905, + "grad_norm": 1.7890625, + "learning_rate": 3.6386138613861386e-06, + "loss": 1.7076, + "step": 197 + }, + { + "epoch": 0.11879762404751905, + "grad_norm": 1.671875, + "learning_rate": 3.636138613861386e-06, + "loss": 1.6531, + "step": 198 + }, + { + "epoch": 0.11939761204775905, + "grad_norm": 1.6640625, + 
"learning_rate": 3.6336633663366337e-06, + "loss": 1.6723, + "step": 199 + }, + { + "epoch": 0.11999760004799905, + "grad_norm": 1.5859375, + "learning_rate": 3.631188118811881e-06, + "loss": 1.5718, + "step": 200 + }, + { + "epoch": 0.11999760004799905, + "eval_loss": 1.7864090204238892, + "eval_model_preparation_time": 0.0037, + "eval_runtime": 65.9536, + "eval_samples_per_second": 151.622, + "eval_steps_per_second": 25.275, + "step": 200 + }, + { + "epoch": 0.12059758804823903, + "grad_norm": 1.78125, + "learning_rate": 3.628712871287129e-06, + "loss": 1.6047, + "step": 201 + }, + { + "epoch": 0.12119757604847903, + "grad_norm": 1.5234375, + "learning_rate": 3.626237623762376e-06, + "loss": 1.5836, + "step": 202 + }, + { + "epoch": 0.12179756404871903, + "grad_norm": 1.625, + "learning_rate": 3.623762376237624e-06, + "loss": 1.7101, + "step": 203 + }, + { + "epoch": 0.12239755204895902, + "grad_norm": 1.6953125, + "learning_rate": 3.621287128712871e-06, + "loss": 1.7021, + "step": 204 + }, + { + "epoch": 0.12299754004919902, + "grad_norm": 1.6875, + "learning_rate": 3.6188118811881186e-06, + "loss": 1.673, + "step": 205 + }, + { + "epoch": 0.123597528049439, + "grad_norm": 1.71875, + "learning_rate": 3.616336633663366e-06, + "loss": 1.6808, + "step": 206 + }, + { + "epoch": 0.124197516049679, + "grad_norm": 1.6796875, + "learning_rate": 3.6138613861386137e-06, + "loss": 1.6939, + "step": 207 + }, + { + "epoch": 0.124797504049919, + "grad_norm": 1.6796875, + "learning_rate": 3.611386138613861e-06, + "loss": 1.691, + "step": 208 + }, + { + "epoch": 0.125397492050159, + "grad_norm": 1.6640625, + "learning_rate": 3.608910891089109e-06, + "loss": 1.6701, + "step": 209 + }, + { + "epoch": 0.12599748005039899, + "grad_norm": 1.71875, + "learning_rate": 3.606435643564356e-06, + "loss": 1.7267, + "step": 210 + }, + { + "epoch": 0.12659746805063898, + "grad_norm": 1.6015625, + "learning_rate": 3.603960396039604e-06, + "loss": 1.626, + "step": 211 + }, + { + "epoch": 0.12719745605087898, + "grad_norm": 1.8828125, + "learning_rate": 3.6014851485148513e-06, + "loss": 1.6452, + "step": 212 + }, + { + "epoch": 0.12779744405111898, + "grad_norm": 1.625, + "learning_rate": 3.599009900990099e-06, + "loss": 1.6894, + "step": 213 + }, + { + "epoch": 0.12839743205135898, + "grad_norm": 1.7109375, + "learning_rate": 3.5965346534653464e-06, + "loss": 1.5345, + "step": 214 + }, + { + "epoch": 0.12899742005159898, + "grad_norm": 1.828125, + "learning_rate": 3.594059405940594e-06, + "loss": 1.6531, + "step": 215 + }, + { + "epoch": 0.12959740805183897, + "grad_norm": 1.71875, + "learning_rate": 3.5915841584158415e-06, + "loss": 1.6251, + "step": 216 + }, + { + "epoch": 0.13019739605207895, + "grad_norm": 1.796875, + "learning_rate": 3.589108910891089e-06, + "loss": 1.6431, + "step": 217 + }, + { + "epoch": 0.13079738405231894, + "grad_norm": 1.5, + "learning_rate": 3.586633663366336e-06, + "loss": 1.6415, + "step": 218 + }, + { + "epoch": 0.13139737205255894, + "grad_norm": 1.9765625, + "learning_rate": 3.584158415841584e-06, + "loss": 1.777, + "step": 219 + }, + { + "epoch": 0.13199736005279894, + "grad_norm": 1.7890625, + "learning_rate": 3.5816831683168313e-06, + "loss": 1.6775, + "step": 220 + }, + { + "epoch": 0.13259734805303894, + "grad_norm": 1.6640625, + "learning_rate": 3.579207920792079e-06, + "loss": 1.6495, + "step": 221 + }, + { + "epoch": 0.13319733605327894, + "grad_norm": 1.6796875, + "learning_rate": 3.5767326732673264e-06, + "loss": 1.6738, + "step": 222 + }, + { + "epoch": 
0.13379732405351893, + "grad_norm": 1.8046875, + "learning_rate": 3.574257425742574e-06, + "loss": 1.6602, + "step": 223 + }, + { + "epoch": 0.13439731205375893, + "grad_norm": 1.8046875, + "learning_rate": 3.5717821782178215e-06, + "loss": 1.6473, + "step": 224 + }, + { + "epoch": 0.13499730005399893, + "grad_norm": 1.796875, + "learning_rate": 3.5693069306930693e-06, + "loss": 1.6683, + "step": 225 + }, + { + "epoch": 0.1355972880542389, + "grad_norm": 1.5, + "learning_rate": 3.5668316831683166e-06, + "loss": 1.6702, + "step": 226 + }, + { + "epoch": 0.1361972760544789, + "grad_norm": 1.8125, + "learning_rate": 3.5643564356435644e-06, + "loss": 1.6736, + "step": 227 + }, + { + "epoch": 0.1367972640547189, + "grad_norm": 1.6875, + "learning_rate": 3.5618811881188117e-06, + "loss": 1.7395, + "step": 228 + }, + { + "epoch": 0.1373972520549589, + "grad_norm": 1.7734375, + "learning_rate": 3.5594059405940595e-06, + "loss": 1.6352, + "step": 229 + }, + { + "epoch": 0.1379972400551989, + "grad_norm": 1.703125, + "learning_rate": 3.5569306930693064e-06, + "loss": 1.7035, + "step": 230 + }, + { + "epoch": 0.1385972280554389, + "grad_norm": 1.859375, + "learning_rate": 3.554455445544554e-06, + "loss": 1.6634, + "step": 231 + }, + { + "epoch": 0.1391972160556789, + "grad_norm": 1.5703125, + "learning_rate": 3.5519801980198015e-06, + "loss": 1.5949, + "step": 232 + }, + { + "epoch": 0.1397972040559189, + "grad_norm": 1.6875, + "learning_rate": 3.5495049504950493e-06, + "loss": 1.6848, + "step": 233 + }, + { + "epoch": 0.1403971920561589, + "grad_norm": 1.7421875, + "learning_rate": 3.5470297029702966e-06, + "loss": 1.6158, + "step": 234 + }, + { + "epoch": 0.14099718005639889, + "grad_norm": 1.765625, + "learning_rate": 3.5445544554455444e-06, + "loss": 1.6746, + "step": 235 + }, + { + "epoch": 0.14159716805663886, + "grad_norm": 1.8515625, + "learning_rate": 3.5420792079207917e-06, + "loss": 1.6911, + "step": 236 + }, + { + "epoch": 0.14219715605687885, + "grad_norm": 1.7734375, + "learning_rate": 3.5396039603960395e-06, + "loss": 1.6304, + "step": 237 + }, + { + "epoch": 0.14279714405711885, + "grad_norm": 2.03125, + "learning_rate": 3.537128712871287e-06, + "loss": 1.6485, + "step": 238 + }, + { + "epoch": 0.14339713205735885, + "grad_norm": 1.6875, + "learning_rate": 3.5346534653465346e-06, + "loss": 1.6509, + "step": 239 + }, + { + "epoch": 0.14399712005759885, + "grad_norm": 1.7578125, + "learning_rate": 3.532178217821782e-06, + "loss": 1.7299, + "step": 240 + }, + { + "epoch": 0.14459710805783885, + "grad_norm": 1.6171875, + "learning_rate": 3.5297029702970297e-06, + "loss": 1.5903, + "step": 241 + }, + { + "epoch": 0.14519709605807885, + "grad_norm": 1.8515625, + "learning_rate": 3.527227722772277e-06, + "loss": 1.7034, + "step": 242 + }, + { + "epoch": 0.14579708405831884, + "grad_norm": 1.6953125, + "learning_rate": 3.524752475247525e-06, + "loss": 1.604, + "step": 243 + }, + { + "epoch": 0.14639707205855884, + "grad_norm": 1.578125, + "learning_rate": 3.5222772277227717e-06, + "loss": 1.6242, + "step": 244 + }, + { + "epoch": 0.1469970600587988, + "grad_norm": 1.703125, + "learning_rate": 3.5198019801980195e-06, + "loss": 1.6533, + "step": 245 + }, + { + "epoch": 0.1475970480590388, + "grad_norm": 1.6015625, + "learning_rate": 3.517326732673267e-06, + "loss": 1.6703, + "step": 246 + }, + { + "epoch": 0.1481970360592788, + "grad_norm": 1.6875, + "learning_rate": 3.5148514851485146e-06, + "loss": 1.6464, + "step": 247 + }, + { + "epoch": 0.1487970240595188, + "grad_norm": 1.7421875, + 
"learning_rate": 3.512376237623762e-06, + "loss": 1.8545, + "step": 248 + }, + { + "epoch": 0.1493970120597588, + "grad_norm": 1.5, + "learning_rate": 3.5099009900990097e-06, + "loss": 1.7212, + "step": 249 + }, + { + "epoch": 0.1499970000599988, + "grad_norm": 1.6484375, + "learning_rate": 3.507425742574257e-06, + "loss": 1.637, + "step": 250 + }, + { + "epoch": 0.1499970000599988, + "eval_loss": 1.7847853899002075, + "eval_model_preparation_time": 0.0037, + "eval_runtime": 65.9306, + "eval_samples_per_second": 151.675, + "eval_steps_per_second": 25.284, + "step": 250 + }, + { + "epoch": 0.1505969880602388, + "grad_norm": 1.8828125, + "learning_rate": 3.504950495049505e-06, + "loss": 1.7151, + "step": 251 + }, + { + "epoch": 0.1511969760604788, + "grad_norm": 1.71875, + "learning_rate": 3.502475247524752e-06, + "loss": 1.6645, + "step": 252 + }, + { + "epoch": 0.1517969640607188, + "grad_norm": 1.6640625, + "learning_rate": 3.5e-06, + "loss": 1.6666, + "step": 253 + }, + { + "epoch": 0.15239695206095877, + "grad_norm": 1.84375, + "learning_rate": 3.4975247524752477e-06, + "loss": 1.5748, + "step": 254 + }, + { + "epoch": 0.15299694006119877, + "grad_norm": 1.6796875, + "learning_rate": 3.495049504950495e-06, + "loss": 1.6351, + "step": 255 + }, + { + "epoch": 0.15359692806143876, + "grad_norm": 1.734375, + "learning_rate": 3.492574257425743e-06, + "loss": 1.6819, + "step": 256 + }, + { + "epoch": 0.15419691606167876, + "grad_norm": 1.71875, + "learning_rate": 3.4900990099009897e-06, + "loss": 1.714, + "step": 257 + }, + { + "epoch": 0.15479690406191876, + "grad_norm": 1.78125, + "learning_rate": 3.4876237623762375e-06, + "loss": 1.6284, + "step": 258 + }, + { + "epoch": 0.15539689206215876, + "grad_norm": 1.8515625, + "learning_rate": 3.485148514851485e-06, + "loss": 1.5917, + "step": 259 + }, + { + "epoch": 0.15599688006239876, + "grad_norm": 1.828125, + "learning_rate": 3.4826732673267326e-06, + "loss": 1.7185, + "step": 260 + }, + { + "epoch": 0.15659686806263876, + "grad_norm": 1.734375, + "learning_rate": 3.48019801980198e-06, + "loss": 1.6383, + "step": 261 + }, + { + "epoch": 0.15719685606287875, + "grad_norm": 1.78125, + "learning_rate": 3.4777227722772277e-06, + "loss": 1.6595, + "step": 262 + }, + { + "epoch": 0.15779684406311872, + "grad_norm": 1.71875, + "learning_rate": 3.475247524752475e-06, + "loss": 1.6335, + "step": 263 + }, + { + "epoch": 0.15839683206335872, + "grad_norm": 1.984375, + "learning_rate": 3.472772277227723e-06, + "loss": 1.7455, + "step": 264 + }, + { + "epoch": 0.15899682006359872, + "grad_norm": 1.5859375, + "learning_rate": 3.47029702970297e-06, + "loss": 1.7527, + "step": 265 + }, + { + "epoch": 0.15959680806383872, + "grad_norm": 1.6171875, + "learning_rate": 3.467821782178218e-06, + "loss": 1.6564, + "step": 266 + }, + { + "epoch": 0.16019679606407872, + "grad_norm": 1.5703125, + "learning_rate": 3.4653465346534653e-06, + "loss": 1.6865, + "step": 267 + }, + { + "epoch": 0.16079678406431872, + "grad_norm": 1.578125, + "learning_rate": 3.462871287128713e-06, + "loss": 1.5996, + "step": 268 + }, + { + "epoch": 0.1613967720645587, + "grad_norm": 1.5078125, + "learning_rate": 3.4603960396039604e-06, + "loss": 1.5936, + "step": 269 + }, + { + "epoch": 0.1619967600647987, + "grad_norm": 1.6640625, + "learning_rate": 3.4579207920792077e-06, + "loss": 1.5691, + "step": 270 + }, + { + "epoch": 0.1625967480650387, + "grad_norm": 1.59375, + "learning_rate": 3.455445544554455e-06, + "loss": 1.5869, + "step": 271 + }, + { + "epoch": 0.1631967360652787, + 
"grad_norm": 1.7265625, + "learning_rate": 3.452970297029703e-06, + "loss": 1.675, + "step": 272 + }, + { + "epoch": 0.16379672406551868, + "grad_norm": 1.65625, + "learning_rate": 3.45049504950495e-06, + "loss": 1.7083, + "step": 273 + }, + { + "epoch": 0.16439671206575868, + "grad_norm": 1.7734375, + "learning_rate": 3.448019801980198e-06, + "loss": 1.7339, + "step": 274 + }, + { + "epoch": 0.16499670006599867, + "grad_norm": 1.6640625, + "learning_rate": 3.4455445544554453e-06, + "loss": 1.6974, + "step": 275 + }, + { + "epoch": 0.16559668806623867, + "grad_norm": 1.65625, + "learning_rate": 3.443069306930693e-06, + "loss": 1.6987, + "step": 276 + }, + { + "epoch": 0.16619667606647867, + "grad_norm": 1.90625, + "learning_rate": 3.4405940594059404e-06, + "loss": 1.6543, + "step": 277 + }, + { + "epoch": 0.16679666406671867, + "grad_norm": 2.203125, + "learning_rate": 3.438118811881188e-06, + "loss": 1.7186, + "step": 278 + }, + { + "epoch": 0.16739665206695867, + "grad_norm": 1.6328125, + "learning_rate": 3.4356435643564355e-06, + "loss": 1.5786, + "step": 279 + }, + { + "epoch": 0.16799664006719867, + "grad_norm": 1.78125, + "learning_rate": 3.4331683168316833e-06, + "loss": 1.6934, + "step": 280 + }, + { + "epoch": 0.16859662806743866, + "grad_norm": 2.125, + "learning_rate": 3.4306930693069306e-06, + "loss": 1.6231, + "step": 281 + }, + { + "epoch": 0.16919661606767863, + "grad_norm": 1.8125, + "learning_rate": 3.4282178217821784e-06, + "loss": 1.7557, + "step": 282 + }, + { + "epoch": 0.16979660406791863, + "grad_norm": 1.8359375, + "learning_rate": 3.4257425742574253e-06, + "loss": 1.7097, + "step": 283 + }, + { + "epoch": 0.17039659206815863, + "grad_norm": 1.8828125, + "learning_rate": 3.423267326732673e-06, + "loss": 1.6093, + "step": 284 + }, + { + "epoch": 0.17099658006839863, + "grad_norm": 1.6953125, + "learning_rate": 3.4207920792079204e-06, + "loss": 1.6127, + "step": 285 + }, + { + "epoch": 0.17159656806863863, + "grad_norm": 1.578125, + "learning_rate": 3.418316831683168e-06, + "loss": 1.6953, + "step": 286 + }, + { + "epoch": 0.17219655606887863, + "grad_norm": 1.8515625, + "learning_rate": 3.4158415841584155e-06, + "loss": 1.5976, + "step": 287 + }, + { + "epoch": 0.17279654406911862, + "grad_norm": 1.5703125, + "learning_rate": 3.4133663366336633e-06, + "loss": 1.696, + "step": 288 + }, + { + "epoch": 0.17339653206935862, + "grad_norm": 1.578125, + "learning_rate": 3.4108910891089106e-06, + "loss": 1.6185, + "step": 289 + }, + { + "epoch": 0.17399652006959862, + "grad_norm": 1.6875, + "learning_rate": 3.4084158415841584e-06, + "loss": 1.671, + "step": 290 + }, + { + "epoch": 0.1745965080698386, + "grad_norm": 1.671875, + "learning_rate": 3.4059405940594058e-06, + "loss": 1.6512, + "step": 291 + }, + { + "epoch": 0.1751964960700786, + "grad_norm": 1.75, + "learning_rate": 3.4034653465346535e-06, + "loss": 1.6009, + "step": 292 + }, + { + "epoch": 0.1757964840703186, + "grad_norm": 1.578125, + "learning_rate": 3.400990099009901e-06, + "loss": 1.5156, + "step": 293 + }, + { + "epoch": 0.17639647207055859, + "grad_norm": 1.796875, + "learning_rate": 3.3985148514851486e-06, + "loss": 1.694, + "step": 294 + }, + { + "epoch": 0.17699646007079858, + "grad_norm": 1.7109375, + "learning_rate": 3.396039603960396e-06, + "loss": 1.6629, + "step": 295 + }, + { + "epoch": 0.17759644807103858, + "grad_norm": 1.625, + "learning_rate": 3.3935643564356437e-06, + "loss": 1.697, + "step": 296 + }, + { + "epoch": 0.17819643607127858, + "grad_norm": 1.78125, + "learning_rate": 
3.3910891089108907e-06, + "loss": 1.597, + "step": 297 + }, + { + "epoch": 0.17879642407151858, + "grad_norm": 1.65625, + "learning_rate": 3.3886138613861384e-06, + "loss": 1.6441, + "step": 298 + }, + { + "epoch": 0.17939641207175858, + "grad_norm": 1.9140625, + "learning_rate": 3.3861386138613858e-06, + "loss": 1.6373, + "step": 299 + }, + { + "epoch": 0.17999640007199855, + "grad_norm": 1.703125, + "learning_rate": 3.3836633663366335e-06, + "loss": 1.75, + "step": 300 + }, + { + "epoch": 0.17999640007199855, + "eval_loss": 1.7828515768051147, + "eval_model_preparation_time": 0.0037, + "eval_runtime": 68.7039, + "eval_samples_per_second": 145.552, + "eval_steps_per_second": 24.264, + "step": 300 + }, + { + "epoch": 0.18059638807223855, + "grad_norm": 1.6953125, + "learning_rate": 3.381188118811881e-06, + "loss": 1.648, + "step": 301 + }, + { + "epoch": 0.18119637607247854, + "grad_norm": 1.859375, + "learning_rate": 3.3787128712871286e-06, + "loss": 1.7426, + "step": 302 + }, + { + "epoch": 0.18179636407271854, + "grad_norm": 1.640625, + "learning_rate": 3.376237623762376e-06, + "loss": 1.6039, + "step": 303 + }, + { + "epoch": 0.18239635207295854, + "grad_norm": 1.546875, + "learning_rate": 3.3737623762376238e-06, + "loss": 1.6396, + "step": 304 + }, + { + "epoch": 0.18299634007319854, + "grad_norm": 1.953125, + "learning_rate": 3.371287128712871e-06, + "loss": 1.6994, + "step": 305 + }, + { + "epoch": 0.18359632807343854, + "grad_norm": 1.5390625, + "learning_rate": 3.368811881188119e-06, + "loss": 1.4593, + "step": 306 + }, + { + "epoch": 0.18419631607367853, + "grad_norm": 1.6640625, + "learning_rate": 3.366336633663366e-06, + "loss": 1.5756, + "step": 307 + }, + { + "epoch": 0.18479630407391853, + "grad_norm": 1.78125, + "learning_rate": 3.363861386138614e-06, + "loss": 1.5962, + "step": 308 + }, + { + "epoch": 0.1853962920741585, + "grad_norm": 1.796875, + "learning_rate": 3.3613861386138613e-06, + "loss": 1.5964, + "step": 309 + }, + { + "epoch": 0.1859962800743985, + "grad_norm": 1.5390625, + "learning_rate": 3.3589108910891087e-06, + "loss": 1.6447, + "step": 310 + }, + { + "epoch": 0.1865962680746385, + "grad_norm": 1.5625, + "learning_rate": 3.356435643564356e-06, + "loss": 1.7261, + "step": 311 + }, + { + "epoch": 0.1871962560748785, + "grad_norm": 1.828125, + "learning_rate": 3.3539603960396038e-06, + "loss": 1.6419, + "step": 312 + }, + { + "epoch": 0.1877962440751185, + "grad_norm": 1.7734375, + "learning_rate": 3.351485148514851e-06, + "loss": 1.6787, + "step": 313 + }, + { + "epoch": 0.1883962320753585, + "grad_norm": 1.6328125, + "learning_rate": 3.349009900990099e-06, + "loss": 1.7553, + "step": 314 + }, + { + "epoch": 0.1889962200755985, + "grad_norm": 1.6796875, + "learning_rate": 3.3465346534653462e-06, + "loss": 1.7045, + "step": 315 + }, + { + "epoch": 0.1895962080758385, + "grad_norm": 1.6640625, + "learning_rate": 3.344059405940594e-06, + "loss": 1.715, + "step": 316 + }, + { + "epoch": 0.1901961960760785, + "grad_norm": 1.6328125, + "learning_rate": 3.3415841584158413e-06, + "loss": 1.6036, + "step": 317 + }, + { + "epoch": 0.1907961840763185, + "grad_norm": 1.8515625, + "learning_rate": 3.339108910891089e-06, + "loss": 1.7363, + "step": 318 + }, + { + "epoch": 0.19139617207655846, + "grad_norm": 1.5625, + "learning_rate": 3.3366336633663364e-06, + "loss": 1.7064, + "step": 319 + }, + { + "epoch": 0.19199616007679846, + "grad_norm": 1.5625, + "learning_rate": 3.334158415841584e-06, + "loss": 1.7196, + "step": 320 + }, + { + "epoch": 0.19259614807703845, + 
"grad_norm": 1.7265625, + "learning_rate": 3.3316831683168316e-06, + "loss": 1.6718, + "step": 321 + }, + { + "epoch": 0.19319613607727845, + "grad_norm": 1.546875, + "learning_rate": 3.3292079207920793e-06, + "loss": 1.689, + "step": 322 + }, + { + "epoch": 0.19379612407751845, + "grad_norm": 1.6328125, + "learning_rate": 3.3267326732673262e-06, + "loss": 1.716, + "step": 323 + }, + { + "epoch": 0.19439611207775845, + "grad_norm": 1.8046875, + "learning_rate": 3.324257425742574e-06, + "loss": 1.7491, + "step": 324 + }, + { + "epoch": 0.19499610007799845, + "grad_norm": 1.75, + "learning_rate": 3.3217821782178213e-06, + "loss": 1.6944, + "step": 325 + }, + { + "epoch": 0.19559608807823844, + "grad_norm": 1.890625, + "learning_rate": 3.319306930693069e-06, + "loss": 1.691, + "step": 326 + }, + { + "epoch": 0.19619607607847844, + "grad_norm": 2.0625, + "learning_rate": 3.3168316831683165e-06, + "loss": 1.662, + "step": 327 + }, + { + "epoch": 0.1967960640787184, + "grad_norm": 1.6953125, + "learning_rate": 3.3143564356435642e-06, + "loss": 1.7127, + "step": 328 + }, + { + "epoch": 0.1973960520789584, + "grad_norm": 1.75, + "learning_rate": 3.3118811881188116e-06, + "loss": 1.6894, + "step": 329 + }, + { + "epoch": 0.1979960400791984, + "grad_norm": 1.6640625, + "learning_rate": 3.3094059405940593e-06, + "loss": 1.6596, + "step": 330 + }, + { + "epoch": 0.1985960280794384, + "grad_norm": 1.609375, + "learning_rate": 3.3069306930693067e-06, + "loss": 1.7074, + "step": 331 + }, + { + "epoch": 0.1991960160796784, + "grad_norm": 1.7734375, + "learning_rate": 3.3044554455445544e-06, + "loss": 1.7924, + "step": 332 + }, + { + "epoch": 0.1997960040799184, + "grad_norm": 1.890625, + "learning_rate": 3.3019801980198018e-06, + "loss": 1.6684, + "step": 333 + }, + { + "epoch": 0.2003959920801584, + "grad_norm": 1.875, + "learning_rate": 3.2995049504950496e-06, + "loss": 1.6355, + "step": 334 + }, + { + "epoch": 0.2009959800803984, + "grad_norm": 1.6484375, + "learning_rate": 3.297029702970297e-06, + "loss": 1.6694, + "step": 335 + }, + { + "epoch": 0.2015959680806384, + "grad_norm": 1.6015625, + "learning_rate": 3.2945544554455442e-06, + "loss": 1.722, + "step": 336 + }, + { + "epoch": 0.20219595608087837, + "grad_norm": 1.703125, + "learning_rate": 3.2920792079207916e-06, + "loss": 1.6248, + "step": 337 + }, + { + "epoch": 0.20279594408111837, + "grad_norm": 1.703125, + "learning_rate": 3.2896039603960393e-06, + "loss": 1.6511, + "step": 338 + }, + { + "epoch": 0.20339593208135837, + "grad_norm": 1.9375, + "learning_rate": 3.2871287128712867e-06, + "loss": 1.6785, + "step": 339 + }, + { + "epoch": 0.20399592008159836, + "grad_norm": 1.7734375, + "learning_rate": 3.2846534653465345e-06, + "loss": 1.6779, + "step": 340 + }, + { + "epoch": 0.20459590808183836, + "grad_norm": 1.859375, + "learning_rate": 3.282178217821782e-06, + "loss": 1.6735, + "step": 341 + }, + { + "epoch": 0.20519589608207836, + "grad_norm": 1.59375, + "learning_rate": 3.2797029702970296e-06, + "loss": 1.6365, + "step": 342 + }, + { + "epoch": 0.20579588408231836, + "grad_norm": 1.7578125, + "learning_rate": 3.277227722772277e-06, + "loss": 1.6497, + "step": 343 + }, + { + "epoch": 0.20639587208255836, + "grad_norm": 1.6015625, + "learning_rate": 3.2747524752475247e-06, + "loss": 1.6381, + "step": 344 + }, + { + "epoch": 0.20699586008279836, + "grad_norm": 1.8671875, + "learning_rate": 3.272277227722772e-06, + "loss": 1.669, + "step": 345 + }, + { + "epoch": 0.20759584808303833, + "grad_norm": 1.8359375, + "learning_rate": 
3.2698019801980198e-06, + "loss": 1.6485, + "step": 346 + }, + { + "epoch": 0.20819583608327832, + "grad_norm": 1.6796875, + "learning_rate": 3.267326732673267e-06, + "loss": 1.5217, + "step": 347 + }, + { + "epoch": 0.20879582408351832, + "grad_norm": 1.7265625, + "learning_rate": 3.264851485148515e-06, + "loss": 1.5988, + "step": 348 + }, + { + "epoch": 0.20939581208375832, + "grad_norm": 1.6171875, + "learning_rate": 3.262376237623762e-06, + "loss": 1.6752, + "step": 349 + }, + { + "epoch": 0.20999580008399832, + "grad_norm": 1.6484375, + "learning_rate": 3.2599009900990096e-06, + "loss": 1.6443, + "step": 350 + }, + { + "epoch": 0.20999580008399832, + "eval_loss": 1.7816474437713623, + "eval_model_preparation_time": 0.0037, + "eval_runtime": 65.9284, + "eval_samples_per_second": 151.68, + "eval_steps_per_second": 25.285, + "step": 350 + }, + { + "epoch": 0.21059578808423832, + "grad_norm": 1.84375, + "learning_rate": 3.257425742574257e-06, + "loss": 1.6992, + "step": 351 + }, + { + "epoch": 0.21119577608447831, + "grad_norm": 1.640625, + "learning_rate": 3.2549504950495047e-06, + "loss": 1.6317, + "step": 352 + }, + { + "epoch": 0.2117957640847183, + "grad_norm": 1.71875, + "learning_rate": 3.252475247524752e-06, + "loss": 1.6773, + "step": 353 + }, + { + "epoch": 0.2123957520849583, + "grad_norm": 1.8515625, + "learning_rate": 3.25e-06, + "loss": 1.6705, + "step": 354 + }, + { + "epoch": 0.2129957400851983, + "grad_norm": 1.578125, + "learning_rate": 3.2475247524752476e-06, + "loss": 1.6884, + "step": 355 + }, + { + "epoch": 0.21359572808543828, + "grad_norm": 1.84375, + "learning_rate": 3.245049504950495e-06, + "loss": 1.6452, + "step": 356 + }, + { + "epoch": 0.21419571608567828, + "grad_norm": 1.96875, + "learning_rate": 3.2425742574257427e-06, + "loss": 1.6493, + "step": 357 + }, + { + "epoch": 0.21479570408591828, + "grad_norm": 1.9765625, + "learning_rate": 3.24009900990099e-06, + "loss": 1.648, + "step": 358 + }, + { + "epoch": 0.21539569208615827, + "grad_norm": 1.5390625, + "learning_rate": 3.2376237623762378e-06, + "loss": 1.633, + "step": 359 + }, + { + "epoch": 0.21599568008639827, + "grad_norm": 1.765625, + "learning_rate": 3.235148514851485e-06, + "loss": 1.678, + "step": 360 + }, + { + "epoch": 0.21659566808663827, + "grad_norm": 1.65625, + "learning_rate": 3.232673267326733e-06, + "loss": 1.7183, + "step": 361 + }, + { + "epoch": 0.21719565608687827, + "grad_norm": 1.75, + "learning_rate": 3.2301980198019802e-06, + "loss": 1.6863, + "step": 362 + }, + { + "epoch": 0.21779564408711827, + "grad_norm": 1.71875, + "learning_rate": 3.2277227722772276e-06, + "loss": 1.8238, + "step": 363 + }, + { + "epoch": 0.21839563208735827, + "grad_norm": 1.7265625, + "learning_rate": 3.225247524752475e-06, + "loss": 1.6829, + "step": 364 + }, + { + "epoch": 0.21899562008759824, + "grad_norm": 1.6953125, + "learning_rate": 3.2227722772277227e-06, + "loss": 1.5743, + "step": 365 + }, + { + "epoch": 0.21959560808783823, + "grad_norm": 1.6796875, + "learning_rate": 3.22029702970297e-06, + "loss": 1.7275, + "step": 366 + }, + { + "epoch": 0.22019559608807823, + "grad_norm": 1.84375, + "learning_rate": 3.217821782178218e-06, + "loss": 1.7962, + "step": 367 + }, + { + "epoch": 0.22079558408831823, + "grad_norm": 1.5703125, + "learning_rate": 3.215346534653465e-06, + "loss": 1.6825, + "step": 368 + }, + { + "epoch": 0.22139557208855823, + "grad_norm": 1.5234375, + "learning_rate": 3.212871287128713e-06, + "loss": 1.7308, + "step": 369 + }, + { + "epoch": 0.22199556008879823, + "grad_norm": 
1.6953125, + "learning_rate": 3.2103960396039603e-06, + "loss": 1.6639, + "step": 370 + }, + { + "epoch": 0.22259554808903823, + "grad_norm": 1.546875, + "learning_rate": 3.207920792079208e-06, + "loss": 1.7165, + "step": 371 + }, + { + "epoch": 0.22319553608927822, + "grad_norm": 1.6796875, + "learning_rate": 3.2054455445544554e-06, + "loss": 1.655, + "step": 372 + }, + { + "epoch": 0.22379552408951822, + "grad_norm": 1.859375, + "learning_rate": 3.202970297029703e-06, + "loss": 1.6016, + "step": 373 + }, + { + "epoch": 0.2243955120897582, + "grad_norm": 1.7578125, + "learning_rate": 3.2004950495049505e-06, + "loss": 1.6397, + "step": 374 + }, + { + "epoch": 0.2249955000899982, + "grad_norm": 1.6328125, + "learning_rate": 3.1980198019801982e-06, + "loss": 1.6711, + "step": 375 + }, + { + "epoch": 0.2255954880902382, + "grad_norm": 1.65625, + "learning_rate": 3.195544554455445e-06, + "loss": 1.5991, + "step": 376 + }, + { + "epoch": 0.2261954760904782, + "grad_norm": 1.8203125, + "learning_rate": 3.193069306930693e-06, + "loss": 1.6873, + "step": 377 + }, + { + "epoch": 0.22679546409071819, + "grad_norm": 1.65625, + "learning_rate": 3.1905940594059403e-06, + "loss": 1.7299, + "step": 378 + }, + { + "epoch": 0.22739545209095818, + "grad_norm": 1.6015625, + "learning_rate": 3.188118811881188e-06, + "loss": 1.6678, + "step": 379 + }, + { + "epoch": 0.22799544009119818, + "grad_norm": 1.8046875, + "learning_rate": 3.1856435643564354e-06, + "loss": 1.632, + "step": 380 + }, + { + "epoch": 0.22859542809143818, + "grad_norm": 1.828125, + "learning_rate": 3.183168316831683e-06, + "loss": 1.6137, + "step": 381 + }, + { + "epoch": 0.22919541609167818, + "grad_norm": 1.7421875, + "learning_rate": 3.1806930693069305e-06, + "loss": 1.5535, + "step": 382 + }, + { + "epoch": 0.22979540409191815, + "grad_norm": 1.6484375, + "learning_rate": 3.1782178217821783e-06, + "loss": 1.6658, + "step": 383 + }, + { + "epoch": 0.23039539209215815, + "grad_norm": 1.7734375, + "learning_rate": 3.1757425742574256e-06, + "loss": 1.636, + "step": 384 + }, + { + "epoch": 0.23099538009239814, + "grad_norm": 1.53125, + "learning_rate": 3.1732673267326734e-06, + "loss": 1.64, + "step": 385 + }, + { + "epoch": 0.23159536809263814, + "grad_norm": 1.609375, + "learning_rate": 3.1707920792079207e-06, + "loss": 1.6077, + "step": 386 + }, + { + "epoch": 0.23219535609287814, + "grad_norm": 1.7109375, + "learning_rate": 3.1683168316831685e-06, + "loss": 1.6389, + "step": 387 + }, + { + "epoch": 0.23279534409311814, + "grad_norm": 1.625, + "learning_rate": 3.165841584158416e-06, + "loss": 1.6041, + "step": 388 + }, + { + "epoch": 0.23339533209335814, + "grad_norm": 1.75, + "learning_rate": 3.163366336633663e-06, + "loss": 1.6911, + "step": 389 + }, + { + "epoch": 0.23399532009359814, + "grad_norm": 1.78125, + "learning_rate": 3.1608910891089105e-06, + "loss": 1.5314, + "step": 390 + }, + { + "epoch": 0.23459530809383813, + "grad_norm": 1.765625, + "learning_rate": 3.1584158415841583e-06, + "loss": 1.5704, + "step": 391 + }, + { + "epoch": 0.23519529609407813, + "grad_norm": 1.6171875, + "learning_rate": 3.1559405940594056e-06, + "loss": 1.7246, + "step": 392 + }, + { + "epoch": 0.2357952840943181, + "grad_norm": 1.6640625, + "learning_rate": 3.1534653465346534e-06, + "loss": 1.6135, + "step": 393 + }, + { + "epoch": 0.2363952720945581, + "grad_norm": 1.578125, + "learning_rate": 3.1509900990099007e-06, + "loss": 1.7003, + "step": 394 + }, + { + "epoch": 0.2369952600947981, + "grad_norm": 1.6953125, + "learning_rate": 
3.1485148514851485e-06, + "loss": 1.7987, + "step": 395 + }, + { + "epoch": 0.2375952480950381, + "grad_norm": 1.8125, + "learning_rate": 3.146039603960396e-06, + "loss": 1.6395, + "step": 396 + }, + { + "epoch": 0.2381952360952781, + "grad_norm": 1.6484375, + "learning_rate": 3.1435643564356436e-06, + "loss": 1.6497, + "step": 397 + }, + { + "epoch": 0.2387952240955181, + "grad_norm": 1.8203125, + "learning_rate": 3.141089108910891e-06, + "loss": 1.692, + "step": 398 + }, + { + "epoch": 0.2393952120957581, + "grad_norm": 1.7421875, + "learning_rate": 3.1386138613861387e-06, + "loss": 1.7406, + "step": 399 + }, + { + "epoch": 0.2399952000959981, + "grad_norm": 1.7265625, + "learning_rate": 3.136138613861386e-06, + "loss": 1.7191, + "step": 400 + }, + { + "epoch": 0.2399952000959981, + "eval_loss": 1.7807087898254395, + "eval_model_preparation_time": 0.0037, + "eval_runtime": 66.0367, + "eval_samples_per_second": 151.431, + "eval_steps_per_second": 25.244, + "step": 400 + }, + { + "epoch": 0.2405951880962381, + "grad_norm": 1.7734375, + "learning_rate": 3.133663366336634e-06, + "loss": 1.6603, + "step": 401 + }, + { + "epoch": 0.24119517609647806, + "grad_norm": 1.609375, + "learning_rate": 3.1311881188118807e-06, + "loss": 1.6492, + "step": 402 + }, + { + "epoch": 0.24179516409671806, + "grad_norm": 1.671875, + "learning_rate": 3.1287128712871285e-06, + "loss": 1.7367, + "step": 403 + }, + { + "epoch": 0.24239515209695806, + "grad_norm": 1.6484375, + "learning_rate": 3.126237623762376e-06, + "loss": 1.7123, + "step": 404 + }, + { + "epoch": 0.24299514009719805, + "grad_norm": 1.7265625, + "learning_rate": 3.1237623762376236e-06, + "loss": 1.7351, + "step": 405 + }, + { + "epoch": 0.24359512809743805, + "grad_norm": 1.7109375, + "learning_rate": 3.121287128712871e-06, + "loss": 1.7273, + "step": 406 + }, + { + "epoch": 0.24419511609767805, + "grad_norm": 1.6953125, + "learning_rate": 3.1188118811881187e-06, + "loss": 1.6324, + "step": 407 + }, + { + "epoch": 0.24479510409791805, + "grad_norm": 1.90625, + "learning_rate": 3.116336633663366e-06, + "loss": 1.6113, + "step": 408 + }, + { + "epoch": 0.24539509209815805, + "grad_norm": 1.5078125, + "learning_rate": 3.113861386138614e-06, + "loss": 1.6485, + "step": 409 + }, + { + "epoch": 0.24599508009839804, + "grad_norm": 1.9921875, + "learning_rate": 3.111386138613861e-06, + "loss": 1.664, + "step": 410 + }, + { + "epoch": 0.24659506809863802, + "grad_norm": 1.6796875, + "learning_rate": 3.108910891089109e-06, + "loss": 1.7174, + "step": 411 + }, + { + "epoch": 0.247195056098878, + "grad_norm": 1.5390625, + "learning_rate": 3.1064356435643563e-06, + "loss": 1.6438, + "step": 412 + }, + { + "epoch": 0.247795044099118, + "grad_norm": 1.5859375, + "learning_rate": 3.103960396039604e-06, + "loss": 1.7208, + "step": 413 + }, + { + "epoch": 0.248395032099358, + "grad_norm": 1.4765625, + "learning_rate": 3.1014851485148514e-06, + "loss": 1.7257, + "step": 414 + }, + { + "epoch": 0.248995020099598, + "grad_norm": 1.5703125, + "learning_rate": 3.099009900990099e-06, + "loss": 1.683, + "step": 415 + }, + { + "epoch": 0.249595008099838, + "grad_norm": 1.6953125, + "learning_rate": 3.096534653465346e-06, + "loss": 1.7018, + "step": 416 + }, + { + "epoch": 0.250194996100078, + "grad_norm": 1.59375, + "learning_rate": 3.094059405940594e-06, + "loss": 1.6217, + "step": 417 + }, + { + "epoch": 0.250794984100318, + "grad_norm": 1.703125, + "learning_rate": 3.091584158415841e-06, + "loss": 1.6733, + "step": 418 + }, + { + "epoch": 0.251394972100558, + 
"grad_norm": 1.703125, + "learning_rate": 3.089108910891089e-06, + "loss": 1.7294, + "step": 419 + }, + { + "epoch": 0.25199496010079797, + "grad_norm": 1.625, + "learning_rate": 3.0866336633663363e-06, + "loss": 1.6648, + "step": 420 + }, + { + "epoch": 0.252594948101038, + "grad_norm": 1.625, + "learning_rate": 3.084158415841584e-06, + "loss": 1.6886, + "step": 421 + }, + { + "epoch": 0.25319493610127797, + "grad_norm": 1.5078125, + "learning_rate": 3.0816831683168314e-06, + "loss": 1.6579, + "step": 422 + }, + { + "epoch": 0.253794924101518, + "grad_norm": 1.71875, + "learning_rate": 3.079207920792079e-06, + "loss": 1.5933, + "step": 423 + }, + { + "epoch": 0.25439491210175796, + "grad_norm": 1.625, + "learning_rate": 3.0767326732673265e-06, + "loss": 1.6736, + "step": 424 + }, + { + "epoch": 0.25499490010199793, + "grad_norm": 1.6953125, + "learning_rate": 3.0742574257425743e-06, + "loss": 1.6689, + "step": 425 + }, + { + "epoch": 0.25559488810223796, + "grad_norm": 1.875, + "learning_rate": 3.0717821782178216e-06, + "loss": 1.6232, + "step": 426 + }, + { + "epoch": 0.25619487610247793, + "grad_norm": 1.609375, + "learning_rate": 3.0693069306930694e-06, + "loss": 1.6268, + "step": 427 + }, + { + "epoch": 0.25679486410271796, + "grad_norm": 1.6953125, + "learning_rate": 3.0668316831683167e-06, + "loss": 1.6636, + "step": 428 + }, + { + "epoch": 0.2573948521029579, + "grad_norm": 1.4921875, + "learning_rate": 3.064356435643564e-06, + "loss": 1.6045, + "step": 429 + }, + { + "epoch": 0.25799484010319795, + "grad_norm": 1.7265625, + "learning_rate": 3.0618811881188114e-06, + "loss": 1.6552, + "step": 430 + }, + { + "epoch": 0.2585948281034379, + "grad_norm": 1.59375, + "learning_rate": 3.059405940594059e-06, + "loss": 1.7048, + "step": 431 + }, + { + "epoch": 0.25919481610367795, + "grad_norm": 1.7578125, + "learning_rate": 3.0569306930693065e-06, + "loss": 1.6747, + "step": 432 + }, + { + "epoch": 0.2597948041039179, + "grad_norm": 1.6328125, + "learning_rate": 3.0544554455445543e-06, + "loss": 1.811, + "step": 433 + }, + { + "epoch": 0.2603947921041579, + "grad_norm": 1.6484375, + "learning_rate": 3.0519801980198016e-06, + "loss": 1.5845, + "step": 434 + }, + { + "epoch": 0.2609947801043979, + "grad_norm": 1.6796875, + "learning_rate": 3.0495049504950494e-06, + "loss": 1.6937, + "step": 435 + }, + { + "epoch": 0.2615947681046379, + "grad_norm": 2.078125, + "learning_rate": 3.0470297029702967e-06, + "loss": 1.7166, + "step": 436 + }, + { + "epoch": 0.2621947561048779, + "grad_norm": 1.8671875, + "learning_rate": 3.0445544554455445e-06, + "loss": 1.5966, + "step": 437 + }, + { + "epoch": 0.2627947441051179, + "grad_norm": 1.46875, + "learning_rate": 3.042079207920792e-06, + "loss": 1.5997, + "step": 438 + }, + { + "epoch": 0.2633947321053579, + "grad_norm": 1.6171875, + "learning_rate": 3.0396039603960396e-06, + "loss": 1.5692, + "step": 439 + }, + { + "epoch": 0.2639947201055979, + "grad_norm": 1.8125, + "learning_rate": 3.037128712871287e-06, + "loss": 1.6428, + "step": 440 + }, + { + "epoch": 0.2645947081058379, + "grad_norm": 1.6484375, + "learning_rate": 3.0346534653465347e-06, + "loss": 1.5924, + "step": 441 + }, + { + "epoch": 0.2651946961060779, + "grad_norm": 1.5625, + "learning_rate": 3.0321782178217817e-06, + "loss": 1.6718, + "step": 442 + }, + { + "epoch": 0.26579468410631785, + "grad_norm": 1.53125, + "learning_rate": 3.0297029702970294e-06, + "loss": 1.7045, + "step": 443 + }, + { + "epoch": 0.2663946721065579, + "grad_norm": 1.8046875, + "learning_rate": 
3.0272277227722768e-06, + "loss": 1.6391, + "step": 444 + }, + { + "epoch": 0.26699466010679784, + "grad_norm": 1.53125, + "learning_rate": 3.0247524752475245e-06, + "loss": 1.648, + "step": 445 + }, + { + "epoch": 0.26759464810703787, + "grad_norm": 1.75, + "learning_rate": 3.022277227722772e-06, + "loss": 1.6945, + "step": 446 + }, + { + "epoch": 0.26819463610727784, + "grad_norm": 1.921875, + "learning_rate": 3.0198019801980196e-06, + "loss": 1.6156, + "step": 447 + }, + { + "epoch": 0.26879462410751787, + "grad_norm": 1.6796875, + "learning_rate": 3.017326732673267e-06, + "loss": 1.7582, + "step": 448 + }, + { + "epoch": 0.26939461210775784, + "grad_norm": 1.6328125, + "learning_rate": 3.0148514851485147e-06, + "loss": 1.6294, + "step": 449 + }, + { + "epoch": 0.26999460010799786, + "grad_norm": 1.6875, + "learning_rate": 3.012376237623762e-06, + "loss": 1.7376, + "step": 450 + }, + { + "epoch": 0.26999460010799786, + "eval_loss": 1.7797411680221558, + "eval_model_preparation_time": 0.0037, + "eval_runtime": 68.9129, + "eval_samples_per_second": 145.111, + "eval_steps_per_second": 24.19, + "step": 450 + }, + { + "epoch": 0.27059458810823783, + "grad_norm": 1.6015625, + "learning_rate": 3.00990099009901e-06, + "loss": 1.6089, + "step": 451 + }, + { + "epoch": 0.2711945761084778, + "grad_norm": 1.6484375, + "learning_rate": 3.007425742574257e-06, + "loss": 1.657, + "step": 452 + }, + { + "epoch": 0.27179456410871783, + "grad_norm": 1.5859375, + "learning_rate": 3.004950495049505e-06, + "loss": 1.7169, + "step": 453 + }, + { + "epoch": 0.2723945521089578, + "grad_norm": 1.6484375, + "learning_rate": 3.0024752475247523e-06, + "loss": 1.5906, + "step": 454 + }, + { + "epoch": 0.2729945401091978, + "grad_norm": 1.828125, + "learning_rate": 3e-06, + "loss": 1.5775, + "step": 455 + }, + { + "epoch": 0.2735945281094378, + "grad_norm": 1.6484375, + "learning_rate": 2.9975247524752474e-06, + "loss": 1.7177, + "step": 456 + }, + { + "epoch": 0.2741945161096778, + "grad_norm": 1.6171875, + "learning_rate": 2.9950495049504948e-06, + "loss": 1.6019, + "step": 457 + }, + { + "epoch": 0.2747945041099178, + "grad_norm": 1.703125, + "learning_rate": 2.9925742574257425e-06, + "loss": 1.6325, + "step": 458 + }, + { + "epoch": 0.2753944921101578, + "grad_norm": 1.7109375, + "learning_rate": 2.99009900990099e-06, + "loss": 1.7797, + "step": 459 + }, + { + "epoch": 0.2759944801103978, + "grad_norm": 1.7890625, + "learning_rate": 2.9876237623762376e-06, + "loss": 1.6406, + "step": 460 + }, + { + "epoch": 0.2765944681106378, + "grad_norm": 1.71875, + "learning_rate": 2.985148514851485e-06, + "loss": 1.745, + "step": 461 + }, + { + "epoch": 0.2771944561108778, + "grad_norm": 1.7265625, + "learning_rate": 2.9826732673267327e-06, + "loss": 1.7707, + "step": 462 + }, + { + "epoch": 0.27779444411111776, + "grad_norm": 1.65625, + "learning_rate": 2.98019801980198e-06, + "loss": 1.7372, + "step": 463 + }, + { + "epoch": 0.2783944321113578, + "grad_norm": 1.71875, + "learning_rate": 2.977722772277228e-06, + "loss": 1.6024, + "step": 464 + }, + { + "epoch": 0.27899442011159775, + "grad_norm": 1.6171875, + "learning_rate": 2.975247524752475e-06, + "loss": 1.508, + "step": 465 + }, + { + "epoch": 0.2795944081118378, + "grad_norm": 1.78125, + "learning_rate": 2.972772277227723e-06, + "loss": 1.8252, + "step": 466 + }, + { + "epoch": 0.28019439611207775, + "grad_norm": 1.609375, + "learning_rate": 2.9702970297029703e-06, + "loss": 1.5962, + "step": 467 + }, + { + "epoch": 0.2807943841123178, + "grad_norm": 1.8046875, + 
"learning_rate": 2.967821782178218e-06, + "loss": 1.6764, + "step": 468 + }, + { + "epoch": 0.28139437211255774, + "grad_norm": 1.5, + "learning_rate": 2.965346534653465e-06, + "loss": 1.6339, + "step": 469 + }, + { + "epoch": 0.28199436011279777, + "grad_norm": 1.640625, + "learning_rate": 2.9628712871287128e-06, + "loss": 1.5768, + "step": 470 + }, + { + "epoch": 0.28259434811303774, + "grad_norm": 1.7265625, + "learning_rate": 2.96039603960396e-06, + "loss": 1.6435, + "step": 471 + }, + { + "epoch": 0.2831943361132777, + "grad_norm": 1.671875, + "learning_rate": 2.957920792079208e-06, + "loss": 1.6312, + "step": 472 + }, + { + "epoch": 0.28379432411351774, + "grad_norm": 1.625, + "learning_rate": 2.9554455445544552e-06, + "loss": 1.7725, + "step": 473 + }, + { + "epoch": 0.2843943121137577, + "grad_norm": 1.6953125, + "learning_rate": 2.952970297029703e-06, + "loss": 1.6755, + "step": 474 + }, + { + "epoch": 0.28499430011399773, + "grad_norm": 1.78125, + "learning_rate": 2.9504950495049503e-06, + "loss": 1.6577, + "step": 475 + }, + { + "epoch": 0.2855942881142377, + "grad_norm": 1.6171875, + "learning_rate": 2.948019801980198e-06, + "loss": 1.7085, + "step": 476 + }, + { + "epoch": 0.28619427611447773, + "grad_norm": 1.59375, + "learning_rate": 2.9455445544554454e-06, + "loss": 1.7308, + "step": 477 + }, + { + "epoch": 0.2867942641147177, + "grad_norm": 1.6171875, + "learning_rate": 2.943069306930693e-06, + "loss": 1.592, + "step": 478 + }, + { + "epoch": 0.2873942521149577, + "grad_norm": 1.8203125, + "learning_rate": 2.9405940594059405e-06, + "loss": 1.5693, + "step": 479 + }, + { + "epoch": 0.2879942401151977, + "grad_norm": 1.6328125, + "learning_rate": 2.9381188118811883e-06, + "loss": 1.6561, + "step": 480 + }, + { + "epoch": 0.28859422811543767, + "grad_norm": 1.7734375, + "learning_rate": 2.9356435643564357e-06, + "loss": 1.6724, + "step": 481 + }, + { + "epoch": 0.2891942161156777, + "grad_norm": 1.5546875, + "learning_rate": 2.933168316831683e-06, + "loss": 1.6792, + "step": 482 + }, + { + "epoch": 0.28979420411591766, + "grad_norm": 1.609375, + "learning_rate": 2.9306930693069303e-06, + "loss": 1.5727, + "step": 483 + }, + { + "epoch": 0.2903941921161577, + "grad_norm": 1.7265625, + "learning_rate": 2.928217821782178e-06, + "loss": 1.6944, + "step": 484 + }, + { + "epoch": 0.29099418011639766, + "grad_norm": 1.7265625, + "learning_rate": 2.9257425742574254e-06, + "loss": 1.6274, + "step": 485 + }, + { + "epoch": 0.2915941681166377, + "grad_norm": 1.59375, + "learning_rate": 2.9232673267326732e-06, + "loss": 1.7017, + "step": 486 + }, + { + "epoch": 0.29219415611687766, + "grad_norm": 1.671875, + "learning_rate": 2.9207920792079206e-06, + "loss": 1.6242, + "step": 487 + }, + { + "epoch": 0.2927941441171177, + "grad_norm": 1.8125, + "learning_rate": 2.9183168316831683e-06, + "loss": 1.5515, + "step": 488 + }, + { + "epoch": 0.29339413211735765, + "grad_norm": 1.546875, + "learning_rate": 2.9158415841584157e-06, + "loss": 1.6318, + "step": 489 + }, + { + "epoch": 0.2939941201175976, + "grad_norm": 1.734375, + "learning_rate": 2.9133663366336634e-06, + "loss": 1.6594, + "step": 490 + }, + { + "epoch": 0.29459410811783765, + "grad_norm": 1.5703125, + "learning_rate": 2.9108910891089108e-06, + "loss": 1.6918, + "step": 491 + }, + { + "epoch": 0.2951940961180776, + "grad_norm": 1.53125, + "learning_rate": 2.9084158415841585e-06, + "loss": 1.7203, + "step": 492 + }, + { + "epoch": 0.29579408411831765, + "grad_norm": 1.703125, + "learning_rate": 2.905940594059406e-06, + "loss": 
1.6643, + "step": 493 + }, + { + "epoch": 0.2963940721185576, + "grad_norm": 1.7890625, + "learning_rate": 2.9034653465346537e-06, + "loss": 1.5509, + "step": 494 + }, + { + "epoch": 0.29699406011879764, + "grad_norm": 1.7578125, + "learning_rate": 2.9009900990099006e-06, + "loss": 1.7432, + "step": 495 + }, + { + "epoch": 0.2975940481190376, + "grad_norm": 1.640625, + "learning_rate": 2.8985148514851483e-06, + "loss": 1.6119, + "step": 496 + }, + { + "epoch": 0.29819403611927764, + "grad_norm": 1.6328125, + "learning_rate": 2.8960396039603957e-06, + "loss": 1.7538, + "step": 497 + }, + { + "epoch": 0.2987940241195176, + "grad_norm": 1.59375, + "learning_rate": 2.8935643564356434e-06, + "loss": 1.6927, + "step": 498 + }, + { + "epoch": 0.2993940121197576, + "grad_norm": 1.7265625, + "learning_rate": 2.891089108910891e-06, + "loss": 1.8514, + "step": 499 + }, + { + "epoch": 0.2999940001199976, + "grad_norm": 1.4921875, + "learning_rate": 2.8886138613861386e-06, + "loss": 1.6127, + "step": 500 + }, + { + "epoch": 0.2999940001199976, + "eval_loss": 1.7789958715438843, + "eval_model_preparation_time": 0.0037, + "eval_runtime": 66.0169, + "eval_samples_per_second": 151.476, + "eval_steps_per_second": 25.251, + "step": 500 + }, + { + "epoch": 0.3005939881202376, + "grad_norm": 1.703125, + "learning_rate": 2.886138613861386e-06, + "loss": 1.7337, + "step": 501 + }, + { + "epoch": 0.3011939761204776, + "grad_norm": 1.59375, + "learning_rate": 2.8836633663366337e-06, + "loss": 1.684, + "step": 502 + }, + { + "epoch": 0.3017939641207176, + "grad_norm": 1.8203125, + "learning_rate": 2.881188118811881e-06, + "loss": 1.6062, + "step": 503 + }, + { + "epoch": 0.3023939521209576, + "grad_norm": 1.6015625, + "learning_rate": 2.8787128712871288e-06, + "loss": 1.6014, + "step": 504 + }, + { + "epoch": 0.30299394012119757, + "grad_norm": 1.671875, + "learning_rate": 2.876237623762376e-06, + "loss": 1.5647, + "step": 505 + }, + { + "epoch": 0.3035939281214376, + "grad_norm": 1.8671875, + "learning_rate": 2.873762376237624e-06, + "loss": 1.7988, + "step": 506 + }, + { + "epoch": 0.30419391612167757, + "grad_norm": 1.5859375, + "learning_rate": 2.8712871287128712e-06, + "loss": 1.6926, + "step": 507 + }, + { + "epoch": 0.30479390412191754, + "grad_norm": 1.8515625, + "learning_rate": 2.868811881188119e-06, + "loss": 1.5877, + "step": 508 + }, + { + "epoch": 0.30539389212215756, + "grad_norm": 1.8203125, + "learning_rate": 2.866336633663366e-06, + "loss": 1.6823, + "step": 509 + }, + { + "epoch": 0.30599388012239753, + "grad_norm": 1.578125, + "learning_rate": 2.8638613861386137e-06, + "loss": 1.5566, + "step": 510 + }, + { + "epoch": 0.30659386812263756, + "grad_norm": 1.7578125, + "learning_rate": 2.861386138613861e-06, + "loss": 1.6614, + "step": 511 + }, + { + "epoch": 0.30719385612287753, + "grad_norm": 1.65625, + "learning_rate": 2.858910891089109e-06, + "loss": 1.6932, + "step": 512 + }, + { + "epoch": 0.30779384412311755, + "grad_norm": 1.6640625, + "learning_rate": 2.856435643564356e-06, + "loss": 1.7368, + "step": 513 + }, + { + "epoch": 0.3083938321233575, + "grad_norm": 1.6328125, + "learning_rate": 2.853960396039604e-06, + "loss": 1.6836, + "step": 514 + }, + { + "epoch": 0.30899382012359755, + "grad_norm": 1.6953125, + "learning_rate": 2.8514851485148512e-06, + "loss": 1.5781, + "step": 515 + }, + { + "epoch": 0.3095938081238375, + "grad_norm": 1.75, + "learning_rate": 2.849009900990099e-06, + "loss": 1.6617, + "step": 516 + }, + { + "epoch": 0.3101937961240775, + "grad_norm": 1.8203125, + 
"learning_rate": 2.8465346534653464e-06, + "loss": 1.6568, + "step": 517 + }, + { + "epoch": 0.3107937841243175, + "grad_norm": 1.671875, + "learning_rate": 2.844059405940594e-06, + "loss": 1.6805, + "step": 518 + }, + { + "epoch": 0.3113937721245575, + "grad_norm": 1.8046875, + "learning_rate": 2.8415841584158415e-06, + "loss": 1.6114, + "step": 519 + }, + { + "epoch": 0.3119937601247975, + "grad_norm": 1.71875, + "learning_rate": 2.8391089108910892e-06, + "loss": 1.6399, + "step": 520 + }, + { + "epoch": 0.3125937481250375, + "grad_norm": 1.6640625, + "learning_rate": 2.8366336633663366e-06, + "loss": 1.6753, + "step": 521 + }, + { + "epoch": 0.3131937361252775, + "grad_norm": 1.6640625, + "learning_rate": 2.834158415841584e-06, + "loss": 1.6818, + "step": 522 + }, + { + "epoch": 0.3137937241255175, + "grad_norm": 1.7421875, + "learning_rate": 2.8316831683168313e-06, + "loss": 1.6999, + "step": 523 + }, + { + "epoch": 0.3143937121257575, + "grad_norm": 1.6484375, + "learning_rate": 2.829207920792079e-06, + "loss": 1.5612, + "step": 524 + }, + { + "epoch": 0.3149937001259975, + "grad_norm": 1.8671875, + "learning_rate": 2.8267326732673264e-06, + "loss": 1.7166, + "step": 525 + }, + { + "epoch": 0.31559368812623745, + "grad_norm": 1.78125, + "learning_rate": 2.824257425742574e-06, + "loss": 1.6112, + "step": 526 + }, + { + "epoch": 0.3161936761264775, + "grad_norm": 1.5625, + "learning_rate": 2.8217821782178215e-06, + "loss": 1.5862, + "step": 527 + }, + { + "epoch": 0.31679366412671744, + "grad_norm": 1.5390625, + "learning_rate": 2.8193069306930692e-06, + "loss": 1.6285, + "step": 528 + }, + { + "epoch": 0.31739365212695747, + "grad_norm": 1.5234375, + "learning_rate": 2.8168316831683166e-06, + "loss": 1.6099, + "step": 529 + }, + { + "epoch": 0.31799364012719744, + "grad_norm": 1.7109375, + "learning_rate": 2.8143564356435644e-06, + "loss": 1.6994, + "step": 530 + }, + { + "epoch": 0.31859362812743747, + "grad_norm": 1.703125, + "learning_rate": 2.8118811881188117e-06, + "loss": 1.5936, + "step": 531 + }, + { + "epoch": 0.31919361612767744, + "grad_norm": 1.53125, + "learning_rate": 2.8094059405940595e-06, + "loss": 1.6696, + "step": 532 + }, + { + "epoch": 0.31979360412791746, + "grad_norm": 1.765625, + "learning_rate": 2.806930693069307e-06, + "loss": 1.5878, + "step": 533 + }, + { + "epoch": 0.32039359212815743, + "grad_norm": 1.765625, + "learning_rate": 2.8044554455445546e-06, + "loss": 1.6611, + "step": 534 + }, + { + "epoch": 0.3209935801283974, + "grad_norm": 1.71875, + "learning_rate": 2.8019801980198015e-06, + "loss": 1.7178, + "step": 535 + }, + { + "epoch": 0.32159356812863743, + "grad_norm": 1.734375, + "learning_rate": 2.7995049504950493e-06, + "loss": 1.6074, + "step": 536 + }, + { + "epoch": 0.3221935561288774, + "grad_norm": 1.6484375, + "learning_rate": 2.7970297029702966e-06, + "loss": 1.6906, + "step": 537 + }, + { + "epoch": 0.3227935441291174, + "grad_norm": 1.6015625, + "learning_rate": 2.7945544554455444e-06, + "loss": 1.6133, + "step": 538 + }, + { + "epoch": 0.3233935321293574, + "grad_norm": 1.5390625, + "learning_rate": 2.7920792079207917e-06, + "loss": 1.6716, + "step": 539 + }, + { + "epoch": 0.3239935201295974, + "grad_norm": 1.78125, + "learning_rate": 2.7896039603960395e-06, + "loss": 1.6822, + "step": 540 + }, + { + "epoch": 0.3245935081298374, + "grad_norm": 1.671875, + "learning_rate": 2.787128712871287e-06, + "loss": 1.6233, + "step": 541 + }, + { + "epoch": 0.3251934961300774, + "grad_norm": 1.578125, + "learning_rate": 2.7846534653465346e-06, + 
"loss": 1.6941, + "step": 542 + }, + { + "epoch": 0.3257934841303174, + "grad_norm": 1.59375, + "learning_rate": 2.782178217821782e-06, + "loss": 1.7407, + "step": 543 + }, + { + "epoch": 0.3263934721305574, + "grad_norm": 1.7578125, + "learning_rate": 2.7797029702970297e-06, + "loss": 1.6666, + "step": 544 + }, + { + "epoch": 0.3269934601307974, + "grad_norm": 1.6875, + "learning_rate": 2.777227722772277e-06, + "loss": 1.5721, + "step": 545 + }, + { + "epoch": 0.32759344813103736, + "grad_norm": 1.6328125, + "learning_rate": 2.774752475247525e-06, + "loss": 1.6345, + "step": 546 + }, + { + "epoch": 0.3281934361312774, + "grad_norm": 1.578125, + "learning_rate": 2.772277227722772e-06, + "loss": 1.7061, + "step": 547 + }, + { + "epoch": 0.32879342413151735, + "grad_norm": 1.921875, + "learning_rate": 2.7698019801980195e-06, + "loss": 1.72, + "step": 548 + }, + { + "epoch": 0.3293934121317574, + "grad_norm": 1.4765625, + "learning_rate": 2.767326732673267e-06, + "loss": 1.5754, + "step": 549 + }, + { + "epoch": 0.32999340013199735, + "grad_norm": 1.796875, + "learning_rate": 2.7648514851485146e-06, + "loss": 1.6856, + "step": 550 + }, + { + "epoch": 0.32999340013199735, + "eval_loss": 1.7785999774932861, + "eval_model_preparation_time": 0.0037, + "eval_runtime": 65.9577, + "eval_samples_per_second": 151.612, + "eval_steps_per_second": 25.274, + "step": 550 + }, + { + "epoch": 0.3305933881322374, + "grad_norm": 1.75, + "learning_rate": 2.762376237623762e-06, + "loss": 1.6151, + "step": 551 + }, + { + "epoch": 0.33119337613247735, + "grad_norm": 1.59375, + "learning_rate": 2.7599009900990097e-06, + "loss": 1.6493, + "step": 552 + }, + { + "epoch": 0.33179336413271737, + "grad_norm": 1.8203125, + "learning_rate": 2.757425742574257e-06, + "loss": 1.7389, + "step": 553 + }, + { + "epoch": 0.33239335213295734, + "grad_norm": 1.7109375, + "learning_rate": 2.754950495049505e-06, + "loss": 1.6765, + "step": 554 + }, + { + "epoch": 0.3329933401331973, + "grad_norm": 1.625, + "learning_rate": 2.752475247524752e-06, + "loss": 1.6237, + "step": 555 + }, + { + "epoch": 0.33359332813343734, + "grad_norm": 1.5, + "learning_rate": 2.75e-06, + "loss": 1.7249, + "step": 556 + }, + { + "epoch": 0.3341933161336773, + "grad_norm": 1.71875, + "learning_rate": 2.7475247524752477e-06, + "loss": 1.6253, + "step": 557 + }, + { + "epoch": 0.33479330413391734, + "grad_norm": 1.671875, + "learning_rate": 2.745049504950495e-06, + "loss": 1.6199, + "step": 558 + }, + { + "epoch": 0.3353932921341573, + "grad_norm": 1.703125, + "learning_rate": 2.742574257425743e-06, + "loss": 1.6945, + "step": 559 + }, + { + "epoch": 0.33599328013439733, + "grad_norm": 1.5859375, + "learning_rate": 2.74009900990099e-06, + "loss": 1.6295, + "step": 560 + }, + { + "epoch": 0.3365932681346373, + "grad_norm": 1.5625, + "learning_rate": 2.737623762376238e-06, + "loss": 1.617, + "step": 561 + }, + { + "epoch": 0.33719325613487733, + "grad_norm": 1.546875, + "learning_rate": 2.735148514851485e-06, + "loss": 1.7085, + "step": 562 + }, + { + "epoch": 0.3377932441351173, + "grad_norm": 1.765625, + "learning_rate": 2.7326732673267326e-06, + "loss": 1.7428, + "step": 563 + }, + { + "epoch": 0.33839323213535727, + "grad_norm": 1.5703125, + "learning_rate": 2.73019801980198e-06, + "loss": 1.5509, + "step": 564 + }, + { + "epoch": 0.3389932201355973, + "grad_norm": 1.65625, + "learning_rate": 2.7277227722772277e-06, + "loss": 1.6567, + "step": 565 + }, + { + "epoch": 0.33959320813583727, + "grad_norm": 1.515625, + "learning_rate": 2.725247524752475e-06, + 
"loss": 1.5832, + "step": 566 + }, + { + "epoch": 0.3401931961360773, + "grad_norm": 1.7109375, + "learning_rate": 2.722772277227723e-06, + "loss": 1.6637, + "step": 567 + }, + { + "epoch": 0.34079318413631726, + "grad_norm": 1.5390625, + "learning_rate": 2.72029702970297e-06, + "loss": 1.6742, + "step": 568 + }, + { + "epoch": 0.3413931721365573, + "grad_norm": 1.4375, + "learning_rate": 2.717821782178218e-06, + "loss": 1.6378, + "step": 569 + }, + { + "epoch": 0.34199316013679726, + "grad_norm": 1.5234375, + "learning_rate": 2.7153465346534653e-06, + "loss": 1.6595, + "step": 570 + }, + { + "epoch": 0.3425931481370373, + "grad_norm": 1.765625, + "learning_rate": 2.712871287128713e-06, + "loss": 1.7167, + "step": 571 + }, + { + "epoch": 0.34319313613727725, + "grad_norm": 1.8046875, + "learning_rate": 2.7103960396039604e-06, + "loss": 1.7184, + "step": 572 + }, + { + "epoch": 0.3437931241375172, + "grad_norm": 1.6796875, + "learning_rate": 2.707920792079208e-06, + "loss": 1.6598, + "step": 573 + }, + { + "epoch": 0.34439311213775725, + "grad_norm": 1.578125, + "learning_rate": 2.7054455445544555e-06, + "loss": 1.7302, + "step": 574 + }, + { + "epoch": 0.3449931001379972, + "grad_norm": 2.0, + "learning_rate": 2.702970297029703e-06, + "loss": 1.6286, + "step": 575 + }, + { + "epoch": 0.34559308813823725, + "grad_norm": 1.6171875, + "learning_rate": 2.70049504950495e-06, + "loss": 1.7379, + "step": 576 + }, + { + "epoch": 0.3461930761384772, + "grad_norm": 1.8203125, + "learning_rate": 2.698019801980198e-06, + "loss": 1.7016, + "step": 577 + }, + { + "epoch": 0.34679306413871724, + "grad_norm": 1.890625, + "learning_rate": 2.6955445544554453e-06, + "loss": 1.6231, + "step": 578 + }, + { + "epoch": 0.3473930521389572, + "grad_norm": 1.9140625, + "learning_rate": 2.693069306930693e-06, + "loss": 1.7478, + "step": 579 + }, + { + "epoch": 0.34799304013919724, + "grad_norm": 1.6484375, + "learning_rate": 2.6905940594059404e-06, + "loss": 1.6871, + "step": 580 + }, + { + "epoch": 0.3485930281394372, + "grad_norm": 1.6015625, + "learning_rate": 2.688118811881188e-06, + "loss": 1.6774, + "step": 581 + }, + { + "epoch": 0.3491930161396772, + "grad_norm": 1.765625, + "learning_rate": 2.6856435643564355e-06, + "loss": 1.6801, + "step": 582 + }, + { + "epoch": 0.3497930041399172, + "grad_norm": 1.703125, + "learning_rate": 2.6831683168316833e-06, + "loss": 1.6734, + "step": 583 + }, + { + "epoch": 0.3503929921401572, + "grad_norm": 1.59375, + "learning_rate": 2.6806930693069306e-06, + "loss": 1.6161, + "step": 584 + }, + { + "epoch": 0.3509929801403972, + "grad_norm": 1.84375, + "learning_rate": 2.6782178217821784e-06, + "loss": 1.621, + "step": 585 + }, + { + "epoch": 0.3515929681406372, + "grad_norm": 1.71875, + "learning_rate": 2.6757425742574257e-06, + "loss": 1.6603, + "step": 586 + }, + { + "epoch": 0.3521929561408772, + "grad_norm": 1.9296875, + "learning_rate": 2.6732673267326735e-06, + "loss": 1.7578, + "step": 587 + }, + { + "epoch": 0.35279294414111717, + "grad_norm": 1.75, + "learning_rate": 2.6707920792079204e-06, + "loss": 1.7356, + "step": 588 + }, + { + "epoch": 0.3533929321413572, + "grad_norm": 1.7421875, + "learning_rate": 2.668316831683168e-06, + "loss": 1.7648, + "step": 589 + }, + { + "epoch": 0.35399292014159717, + "grad_norm": 1.5703125, + "learning_rate": 2.6658415841584155e-06, + "loss": 1.6623, + "step": 590 + }, + { + "epoch": 0.35459290814183714, + "grad_norm": 1.8515625, + "learning_rate": 2.6633663366336633e-06, + "loss": 1.7709, + "step": 591 + }, + { + "epoch": 
0.35519289614207716, + "grad_norm": 1.7890625, + "learning_rate": 2.6608910891089106e-06, + "loss": 1.6477, + "step": 592 + }, + { + "epoch": 0.35579288414231713, + "grad_norm": 1.5546875, + "learning_rate": 2.6584158415841584e-06, + "loss": 1.5843, + "step": 593 + }, + { + "epoch": 0.35639287214255716, + "grad_norm": 1.5703125, + "learning_rate": 2.6559405940594057e-06, + "loss": 1.6412, + "step": 594 + }, + { + "epoch": 0.35699286014279713, + "grad_norm": 1.6328125, + "learning_rate": 2.6534653465346535e-06, + "loss": 1.646, + "step": 595 + }, + { + "epoch": 0.35759284814303716, + "grad_norm": 1.65625, + "learning_rate": 2.650990099009901e-06, + "loss": 1.753, + "step": 596 + }, + { + "epoch": 0.3581928361432771, + "grad_norm": 1.609375, + "learning_rate": 2.6485148514851486e-06, + "loss": 1.6465, + "step": 597 + }, + { + "epoch": 0.35879282414351715, + "grad_norm": 1.6796875, + "learning_rate": 2.646039603960396e-06, + "loss": 1.6874, + "step": 598 + }, + { + "epoch": 0.3593928121437571, + "grad_norm": 1.6875, + "learning_rate": 2.6435643564356437e-06, + "loss": 1.6842, + "step": 599 + }, + { + "epoch": 0.3599928001439971, + "grad_norm": 1.53125, + "learning_rate": 2.641089108910891e-06, + "loss": 1.692, + "step": 600 + }, + { + "epoch": 0.3599928001439971, + "eval_loss": 1.7782986164093018, + "eval_model_preparation_time": 0.0037, + "eval_runtime": 68.6919, + "eval_samples_per_second": 145.577, + "eval_steps_per_second": 24.268, + "step": 600 + }, + { + "epoch": 0.3605927881442371, + "grad_norm": 1.703125, + "learning_rate": 2.6386138613861384e-06, + "loss": 1.7203, + "step": 601 + }, + { + "epoch": 0.3611927761444771, + "grad_norm": 1.6953125, + "learning_rate": 2.6361386138613858e-06, + "loss": 1.7563, + "step": 602 + }, + { + "epoch": 0.3617927641447171, + "grad_norm": 1.625, + "learning_rate": 2.6336633663366335e-06, + "loss": 1.5713, + "step": 603 + }, + { + "epoch": 0.3623927521449571, + "grad_norm": 1.671875, + "learning_rate": 2.631188118811881e-06, + "loss": 1.7128, + "step": 604 + }, + { + "epoch": 0.3629927401451971, + "grad_norm": 1.7109375, + "learning_rate": 2.6287128712871286e-06, + "loss": 1.7093, + "step": 605 + }, + { + "epoch": 0.3635927281454371, + "grad_norm": 1.9765625, + "learning_rate": 2.626237623762376e-06, + "loss": 1.7053, + "step": 606 + }, + { + "epoch": 0.3641927161456771, + "grad_norm": 1.5703125, + "learning_rate": 2.6237623762376237e-06, + "loss": 1.6861, + "step": 607 + }, + { + "epoch": 0.3647927041459171, + "grad_norm": 1.515625, + "learning_rate": 2.621287128712871e-06, + "loss": 1.5996, + "step": 608 + }, + { + "epoch": 0.36539269214615705, + "grad_norm": 1.65625, + "learning_rate": 2.618811881188119e-06, + "loss": 1.6401, + "step": 609 + }, + { + "epoch": 0.3659926801463971, + "grad_norm": 1.671875, + "learning_rate": 2.616336633663366e-06, + "loss": 1.6489, + "step": 610 + }, + { + "epoch": 0.36659266814663705, + "grad_norm": 1.59375, + "learning_rate": 2.613861386138614e-06, + "loss": 1.7161, + "step": 611 + }, + { + "epoch": 0.36719265614687707, + "grad_norm": 1.9453125, + "learning_rate": 2.6113861386138613e-06, + "loss": 1.5981, + "step": 612 + }, + { + "epoch": 0.36779264414711704, + "grad_norm": 1.7265625, + "learning_rate": 2.608910891089109e-06, + "loss": 1.7098, + "step": 613 + }, + { + "epoch": 0.36839263214735707, + "grad_norm": 1.796875, + "learning_rate": 2.606435643564356e-06, + "loss": 1.6385, + "step": 614 + }, + { + "epoch": 0.36899262014759704, + "grad_norm": 1.6328125, + "learning_rate": 2.6039603960396038e-06, + "loss": 
1.7776, + "step": 615 + }, + { + "epoch": 0.36959260814783707, + "grad_norm": 1.765625, + "learning_rate": 2.601485148514851e-06, + "loss": 1.6092, + "step": 616 + }, + { + "epoch": 0.37019259614807704, + "grad_norm": 1.6015625, + "learning_rate": 2.599009900990099e-06, + "loss": 1.7118, + "step": 617 + }, + { + "epoch": 0.370792584148317, + "grad_norm": 1.796875, + "learning_rate": 2.596534653465346e-06, + "loss": 1.5554, + "step": 618 + }, + { + "epoch": 0.37139257214855703, + "grad_norm": 1.671875, + "learning_rate": 2.594059405940594e-06, + "loss": 1.7137, + "step": 619 + }, + { + "epoch": 0.371992560148797, + "grad_norm": 2.265625, + "learning_rate": 2.5915841584158413e-06, + "loss": 1.568, + "step": 620 + }, + { + "epoch": 0.37259254814903703, + "grad_norm": 1.6796875, + "learning_rate": 2.589108910891089e-06, + "loss": 1.6098, + "step": 621 + }, + { + "epoch": 0.373192536149277, + "grad_norm": 1.7265625, + "learning_rate": 2.5866336633663364e-06, + "loss": 1.7429, + "step": 622 + }, + { + "epoch": 0.373792524149517, + "grad_norm": 1.515625, + "learning_rate": 2.584158415841584e-06, + "loss": 1.5776, + "step": 623 + }, + { + "epoch": 0.374392512149757, + "grad_norm": 1.6796875, + "learning_rate": 2.5816831683168315e-06, + "loss": 1.7225, + "step": 624 + }, + { + "epoch": 0.374992500149997, + "grad_norm": 1.671875, + "learning_rate": 2.5792079207920793e-06, + "loss": 1.636, + "step": 625 + }, + { + "epoch": 0.375592488150237, + "grad_norm": 1.71875, + "learning_rate": 2.5767326732673266e-06, + "loss": 1.6907, + "step": 626 + }, + { + "epoch": 0.376192476150477, + "grad_norm": 1.515625, + "learning_rate": 2.5742574257425744e-06, + "loss": 1.6, + "step": 627 + }, + { + "epoch": 0.376792464150717, + "grad_norm": 1.703125, + "learning_rate": 2.5717821782178213e-06, + "loss": 1.7379, + "step": 628 + }, + { + "epoch": 0.37739245215095696, + "grad_norm": 1.8203125, + "learning_rate": 2.569306930693069e-06, + "loss": 1.7201, + "step": 629 + }, + { + "epoch": 0.377992440151197, + "grad_norm": 1.75, + "learning_rate": 2.5668316831683164e-06, + "loss": 1.5961, + "step": 630 + }, + { + "epoch": 0.37859242815143695, + "grad_norm": 1.609375, + "learning_rate": 2.564356435643564e-06, + "loss": 1.6398, + "step": 631 + }, + { + "epoch": 0.379192416151677, + "grad_norm": 1.84375, + "learning_rate": 2.5618811881188115e-06, + "loss": 1.7187, + "step": 632 + }, + { + "epoch": 0.37979240415191695, + "grad_norm": 1.7578125, + "learning_rate": 2.5594059405940593e-06, + "loss": 1.6705, + "step": 633 + }, + { + "epoch": 0.380392392152157, + "grad_norm": 1.5703125, + "learning_rate": 2.5569306930693067e-06, + "loss": 1.6784, + "step": 634 + }, + { + "epoch": 0.38099238015239695, + "grad_norm": 1.71875, + "learning_rate": 2.5544554455445544e-06, + "loss": 1.6956, + "step": 635 + }, + { + "epoch": 0.381592368152637, + "grad_norm": 1.5625, + "learning_rate": 2.5519801980198018e-06, + "loss": 1.6573, + "step": 636 + }, + { + "epoch": 0.38219235615287694, + "grad_norm": 1.53125, + "learning_rate": 2.5495049504950495e-06, + "loss": 1.6514, + "step": 637 + }, + { + "epoch": 0.3827923441531169, + "grad_norm": 1.6328125, + "learning_rate": 2.547029702970297e-06, + "loss": 1.5906, + "step": 638 + }, + { + "epoch": 0.38339233215335694, + "grad_norm": 1.6875, + "learning_rate": 2.5445544554455446e-06, + "loss": 1.6045, + "step": 639 + }, + { + "epoch": 0.3839923201535969, + "grad_norm": 1.8125, + "learning_rate": 2.542079207920792e-06, + "loss": 1.6549, + "step": 640 + }, + { + "epoch": 0.38459230815383694, + "grad_norm": 
1.59375, + "learning_rate": 2.5396039603960393e-06, + "loss": 1.6548, + "step": 641 + }, + { + "epoch": 0.3851922961540769, + "grad_norm": 1.75, + "learning_rate": 2.5371287128712867e-06, + "loss": 1.6412, + "step": 642 + }, + { + "epoch": 0.38579228415431693, + "grad_norm": 1.59375, + "learning_rate": 2.5346534653465344e-06, + "loss": 1.6326, + "step": 643 + }, + { + "epoch": 0.3863922721545569, + "grad_norm": 1.8125, + "learning_rate": 2.5321782178217818e-06, + "loss": 1.5856, + "step": 644 + }, + { + "epoch": 0.38699226015479693, + "grad_norm": 1.8203125, + "learning_rate": 2.5297029702970295e-06, + "loss": 1.548, + "step": 645 + }, + { + "epoch": 0.3875922481550369, + "grad_norm": 1.546875, + "learning_rate": 2.527227722772277e-06, + "loss": 1.6333, + "step": 646 + }, + { + "epoch": 0.38819223615527687, + "grad_norm": 1.796875, + "learning_rate": 2.5247524752475247e-06, + "loss": 1.6828, + "step": 647 + }, + { + "epoch": 0.3887922241555169, + "grad_norm": 1.6328125, + "learning_rate": 2.522277227722772e-06, + "loss": 1.6181, + "step": 648 + }, + { + "epoch": 0.38939221215575687, + "grad_norm": 1.625, + "learning_rate": 2.5198019801980198e-06, + "loss": 1.6848, + "step": 649 + }, + { + "epoch": 0.3899922001559969, + "grad_norm": 1.734375, + "learning_rate": 2.517326732673267e-06, + "loss": 1.7051, + "step": 650 + }, + { + "epoch": 0.3899922001559969, + "eval_loss": 1.778051733970642, + "eval_model_preparation_time": 0.0037, + "eval_runtime": 65.9746, + "eval_samples_per_second": 151.573, + "eval_steps_per_second": 25.267, + "step": 650 + }, + { + "epoch": 0.39059218815623686, + "grad_norm": 1.5, + "learning_rate": 2.514851485148515e-06, + "loss": 1.6987, + "step": 651 + }, + { + "epoch": 0.3911921761564769, + "grad_norm": 1.84375, + "learning_rate": 2.5123762376237622e-06, + "loss": 1.6951, + "step": 652 + }, + { + "epoch": 0.39179216415671686, + "grad_norm": 1.6015625, + "learning_rate": 2.50990099009901e-06, + "loss": 1.6218, + "step": 653 + }, + { + "epoch": 0.3923921521569569, + "grad_norm": 1.6015625, + "learning_rate": 2.507425742574257e-06, + "loss": 1.7695, + "step": 654 + }, + { + "epoch": 0.39299214015719686, + "grad_norm": 1.5625, + "learning_rate": 2.5049504950495047e-06, + "loss": 1.6856, + "step": 655 + }, + { + "epoch": 0.3935921281574368, + "grad_norm": 1.6640625, + "learning_rate": 2.502475247524752e-06, + "loss": 1.658, + "step": 656 + }, + { + "epoch": 0.39419211615767685, + "grad_norm": 1.8203125, + "learning_rate": 2.4999999999999998e-06, + "loss": 1.6642, + "step": 657 + }, + { + "epoch": 0.3947921041579168, + "grad_norm": 1.890625, + "learning_rate": 2.4975247524752475e-06, + "loss": 1.6252, + "step": 658 + }, + { + "epoch": 0.39539209215815685, + "grad_norm": 1.6484375, + "learning_rate": 2.495049504950495e-06, + "loss": 1.6601, + "step": 659 + }, + { + "epoch": 0.3959920801583968, + "grad_norm": 1.671875, + "learning_rate": 2.4925742574257427e-06, + "loss": 1.6114, + "step": 660 + }, + { + "epoch": 0.39659206815863685, + "grad_norm": 1.515625, + "learning_rate": 2.49009900990099e-06, + "loss": 1.6902, + "step": 661 + }, + { + "epoch": 0.3971920561588768, + "grad_norm": 1.6015625, + "learning_rate": 2.4876237623762378e-06, + "loss": 1.5903, + "step": 662 + }, + { + "epoch": 0.39779204415911684, + "grad_norm": 1.8359375, + "learning_rate": 2.485148514851485e-06, + "loss": 1.7081, + "step": 663 + }, + { + "epoch": 0.3983920321593568, + "grad_norm": 1.765625, + "learning_rate": 2.482673267326733e-06, + "loss": 1.7127, + "step": 664 + }, + { + "epoch": 
0.3989920201595968, + "grad_norm": 1.78125, + "learning_rate": 2.4801980198019802e-06, + "loss": 1.538, + "step": 665 + }, + { + "epoch": 0.3995920081598368, + "grad_norm": 1.609375, + "learning_rate": 2.477722772277228e-06, + "loss": 1.7153, + "step": 666 + }, + { + "epoch": 0.4001919961600768, + "grad_norm": 1.5703125, + "learning_rate": 2.475247524752475e-06, + "loss": 1.5796, + "step": 667 + }, + { + "epoch": 0.4007919841603168, + "grad_norm": 1.796875, + "learning_rate": 2.4727722772277227e-06, + "loss": 1.7261, + "step": 668 + }, + { + "epoch": 0.4013919721605568, + "grad_norm": 1.75, + "learning_rate": 2.47029702970297e-06, + "loss": 1.6201, + "step": 669 + }, + { + "epoch": 0.4019919601607968, + "grad_norm": 1.78125, + "learning_rate": 2.4678217821782178e-06, + "loss": 1.5838, + "step": 670 + }, + { + "epoch": 0.4025919481610368, + "grad_norm": 1.828125, + "learning_rate": 2.465346534653465e-06, + "loss": 1.675, + "step": 671 + }, + { + "epoch": 0.4031919361612768, + "grad_norm": 1.625, + "learning_rate": 2.462871287128713e-06, + "loss": 1.5512, + "step": 672 + }, + { + "epoch": 0.40379192416151677, + "grad_norm": 1.625, + "learning_rate": 2.4603960396039602e-06, + "loss": 1.7064, + "step": 673 + }, + { + "epoch": 0.40439191216175674, + "grad_norm": 1.7265625, + "learning_rate": 2.457920792079208e-06, + "loss": 1.6955, + "step": 674 + }, + { + "epoch": 0.40499190016199677, + "grad_norm": 1.7265625, + "learning_rate": 2.4554455445544553e-06, + "loss": 1.5868, + "step": 675 + }, + { + "epoch": 0.40559188816223674, + "grad_norm": 1.84375, + "learning_rate": 2.452970297029703e-06, + "loss": 1.6344, + "step": 676 + }, + { + "epoch": 0.40619187616247676, + "grad_norm": 1.7734375, + "learning_rate": 2.4504950495049505e-06, + "loss": 1.7793, + "step": 677 + }, + { + "epoch": 0.40679186416271673, + "grad_norm": 1.6328125, + "learning_rate": 2.4480198019801982e-06, + "loss": 1.6707, + "step": 678 + }, + { + "epoch": 0.40739185216295676, + "grad_norm": 1.453125, + "learning_rate": 2.4455445544554456e-06, + "loss": 1.6521, + "step": 679 + }, + { + "epoch": 0.40799184016319673, + "grad_norm": 1.6796875, + "learning_rate": 2.4430693069306933e-06, + "loss": 1.6611, + "step": 680 + }, + { + "epoch": 0.40859182816343675, + "grad_norm": 1.625, + "learning_rate": 2.4405940594059402e-06, + "loss": 1.7121, + "step": 681 + }, + { + "epoch": 0.4091918161636767, + "grad_norm": 1.6796875, + "learning_rate": 2.438118811881188e-06, + "loss": 1.6899, + "step": 682 + }, + { + "epoch": 0.4097918041639167, + "grad_norm": 1.609375, + "learning_rate": 2.4356435643564354e-06, + "loss": 1.5913, + "step": 683 + }, + { + "epoch": 0.4103917921641567, + "grad_norm": 1.59375, + "learning_rate": 2.433168316831683e-06, + "loss": 1.6691, + "step": 684 + }, + { + "epoch": 0.4109917801643967, + "grad_norm": 1.8359375, + "learning_rate": 2.4306930693069305e-06, + "loss": 1.5859, + "step": 685 + }, + { + "epoch": 0.4115917681646367, + "grad_norm": 1.703125, + "learning_rate": 2.4282178217821782e-06, + "loss": 1.6688, + "step": 686 + }, + { + "epoch": 0.4121917561648767, + "grad_norm": 1.6953125, + "learning_rate": 2.4257425742574256e-06, + "loss": 1.6196, + "step": 687 + }, + { + "epoch": 0.4127917441651167, + "grad_norm": 1.6171875, + "learning_rate": 2.4232673267326733e-06, + "loss": 1.6959, + "step": 688 + }, + { + "epoch": 0.4133917321653567, + "grad_norm": 1.671875, + "learning_rate": 2.4207920792079207e-06, + "loss": 1.6958, + "step": 689 + }, + { + "epoch": 0.4139917201655967, + "grad_norm": 1.6875, + "learning_rate": 
2.4183168316831685e-06, + "loss": 1.6803, + "step": 690 + }, + { + "epoch": 0.4145917081658367, + "grad_norm": 1.6484375, + "learning_rate": 2.415841584158416e-06, + "loss": 1.67, + "step": 691 + }, + { + "epoch": 0.41519169616607665, + "grad_norm": 1.6953125, + "learning_rate": 2.4133663366336636e-06, + "loss": 1.6805, + "step": 692 + }, + { + "epoch": 0.4157916841663167, + "grad_norm": 1.859375, + "learning_rate": 2.410891089108911e-06, + "loss": 1.67, + "step": 693 + }, + { + "epoch": 0.41639167216655665, + "grad_norm": 1.8203125, + "learning_rate": 2.4084158415841582e-06, + "loss": 1.5639, + "step": 694 + }, + { + "epoch": 0.4169916601667967, + "grad_norm": 1.84375, + "learning_rate": 2.4059405940594056e-06, + "loss": 1.7609, + "step": 695 + }, + { + "epoch": 0.41759164816703664, + "grad_norm": 1.5625, + "learning_rate": 2.4034653465346534e-06, + "loss": 1.6972, + "step": 696 + }, + { + "epoch": 0.41819163616727667, + "grad_norm": 1.640625, + "learning_rate": 2.4009900990099007e-06, + "loss": 1.5793, + "step": 697 + }, + { + "epoch": 0.41879162416751664, + "grad_norm": 1.6640625, + "learning_rate": 2.3985148514851485e-06, + "loss": 1.6058, + "step": 698 + }, + { + "epoch": 0.41939161216775667, + "grad_norm": 1.8671875, + "learning_rate": 2.396039603960396e-06, + "loss": 1.7138, + "step": 699 + }, + { + "epoch": 0.41999160016799664, + "grad_norm": 1.53125, + "learning_rate": 2.3935643564356436e-06, + "loss": 1.6364, + "step": 700 + }, + { + "epoch": 0.41999160016799664, + "eval_loss": 1.777879238128662, + "eval_model_preparation_time": 0.0037, + "eval_runtime": 66.0918, + "eval_samples_per_second": 151.305, + "eval_steps_per_second": 25.222, + "step": 700 + }, + { + "epoch": 0.4205915881682366, + "grad_norm": 1.75, + "learning_rate": 2.391089108910891e-06, + "loss": 1.7899, + "step": 701 + }, + { + "epoch": 0.42119157616847663, + "grad_norm": 1.8671875, + "learning_rate": 2.3886138613861387e-06, + "loss": 1.6305, + "step": 702 + }, + { + "epoch": 0.4217915641687166, + "grad_norm": 1.6953125, + "learning_rate": 2.386138613861386e-06, + "loss": 1.6793, + "step": 703 + }, + { + "epoch": 0.42239155216895663, + "grad_norm": 1.703125, + "learning_rate": 2.383663366336634e-06, + "loss": 1.657, + "step": 704 + }, + { + "epoch": 0.4229915401691966, + "grad_norm": 1.6484375, + "learning_rate": 2.381188118811881e-06, + "loss": 1.6353, + "step": 705 + }, + { + "epoch": 0.4235915281694366, + "grad_norm": 1.8515625, + "learning_rate": 2.378712871287129e-06, + "loss": 1.592, + "step": 706 + }, + { + "epoch": 0.4241915161696766, + "grad_norm": 1.546875, + "learning_rate": 2.376237623762376e-06, + "loss": 1.615, + "step": 707 + }, + { + "epoch": 0.4247915041699166, + "grad_norm": 1.6875, + "learning_rate": 2.3737623762376236e-06, + "loss": 1.5985, + "step": 708 + }, + { + "epoch": 0.4253914921701566, + "grad_norm": 1.8046875, + "learning_rate": 2.371287128712871e-06, + "loss": 1.6037, + "step": 709 + }, + { + "epoch": 0.4259914801703966, + "grad_norm": 1.671875, + "learning_rate": 2.3688118811881187e-06, + "loss": 1.6959, + "step": 710 + }, + { + "epoch": 0.4265914681706366, + "grad_norm": 1.6796875, + "learning_rate": 2.366336633663366e-06, + "loss": 1.726, + "step": 711 + }, + { + "epoch": 0.42719145617087656, + "grad_norm": 1.75, + "learning_rate": 2.363861386138614e-06, + "loss": 1.7963, + "step": 712 + }, + { + "epoch": 0.4277914441711166, + "grad_norm": 1.6328125, + "learning_rate": 2.361386138613861e-06, + "loss": 1.64, + "step": 713 + }, + { + "epoch": 0.42839143217135656, + "grad_norm": 1.75, + 
"learning_rate": 2.358910891089109e-06, + "loss": 1.6623, + "step": 714 + }, + { + "epoch": 0.4289914201715966, + "grad_norm": 1.609375, + "learning_rate": 2.3564356435643563e-06, + "loss": 1.6111, + "step": 715 + }, + { + "epoch": 0.42959140817183655, + "grad_norm": 1.765625, + "learning_rate": 2.353960396039604e-06, + "loss": 1.7229, + "step": 716 + }, + { + "epoch": 0.4301913961720766, + "grad_norm": 1.6640625, + "learning_rate": 2.3514851485148514e-06, + "loss": 1.6864, + "step": 717 + }, + { + "epoch": 0.43079138417231655, + "grad_norm": 1.453125, + "learning_rate": 2.349009900990099e-06, + "loss": 1.7142, + "step": 718 + }, + { + "epoch": 0.4313913721725566, + "grad_norm": 1.8984375, + "learning_rate": 2.3465346534653465e-06, + "loss": 1.7028, + "step": 719 + }, + { + "epoch": 0.43199136017279655, + "grad_norm": 1.765625, + "learning_rate": 2.3440594059405942e-06, + "loss": 1.7853, + "step": 720 + }, + { + "epoch": 0.4325913481730365, + "grad_norm": 1.6875, + "learning_rate": 2.341584158415841e-06, + "loss": 1.6417, + "step": 721 + }, + { + "epoch": 0.43319133617327654, + "grad_norm": 1.5546875, + "learning_rate": 2.339108910891089e-06, + "loss": 1.6398, + "step": 722 + }, + { + "epoch": 0.4337913241735165, + "grad_norm": 1.84375, + "learning_rate": 2.3366336633663363e-06, + "loss": 1.6656, + "step": 723 + }, + { + "epoch": 0.43439131217375654, + "grad_norm": 1.578125, + "learning_rate": 2.334158415841584e-06, + "loss": 1.6787, + "step": 724 + }, + { + "epoch": 0.4349913001739965, + "grad_norm": 1.7421875, + "learning_rate": 2.3316831683168314e-06, + "loss": 1.6582, + "step": 725 + }, + { + "epoch": 0.43559128817423654, + "grad_norm": 1.703125, + "learning_rate": 2.329207920792079e-06, + "loss": 1.6612, + "step": 726 + }, + { + "epoch": 0.4361912761744765, + "grad_norm": 1.7734375, + "learning_rate": 2.3267326732673265e-06, + "loss": 1.7151, + "step": 727 + }, + { + "epoch": 0.43679126417471653, + "grad_norm": 1.78125, + "learning_rate": 2.3242574257425743e-06, + "loss": 1.6576, + "step": 728 + }, + { + "epoch": 0.4373912521749565, + "grad_norm": 1.59375, + "learning_rate": 2.3217821782178216e-06, + "loss": 1.7122, + "step": 729 + }, + { + "epoch": 0.4379912401751965, + "grad_norm": 1.75, + "learning_rate": 2.3193069306930694e-06, + "loss": 1.6774, + "step": 730 + }, + { + "epoch": 0.4385912281754365, + "grad_norm": 1.578125, + "learning_rate": 2.3168316831683167e-06, + "loss": 1.6655, + "step": 731 + }, + { + "epoch": 0.43919121617567647, + "grad_norm": 1.78125, + "learning_rate": 2.3143564356435645e-06, + "loss": 1.6039, + "step": 732 + }, + { + "epoch": 0.4397912041759165, + "grad_norm": 1.671875, + "learning_rate": 2.3118811881188114e-06, + "loss": 1.6186, + "step": 733 + }, + { + "epoch": 0.44039119217615647, + "grad_norm": 1.9609375, + "learning_rate": 2.309405940594059e-06, + "loss": 1.628, + "step": 734 + }, + { + "epoch": 0.4409911801763965, + "grad_norm": 1.625, + "learning_rate": 2.3069306930693065e-06, + "loss": 1.6176, + "step": 735 + }, + { + "epoch": 0.44159116817663646, + "grad_norm": 1.6171875, + "learning_rate": 2.3044554455445543e-06, + "loss": 1.6363, + "step": 736 + }, + { + "epoch": 0.4421911561768765, + "grad_norm": 1.5546875, + "learning_rate": 2.3019801980198016e-06, + "loss": 1.6177, + "step": 737 + }, + { + "epoch": 0.44279114417711646, + "grad_norm": 1.5859375, + "learning_rate": 2.2995049504950494e-06, + "loss": 1.8267, + "step": 738 + }, + { + "epoch": 0.44339113217735643, + "grad_norm": 1.7109375, + "learning_rate": 2.2970297029702967e-06, + "loss": 
1.7321, + "step": 739 + }, + { + "epoch": 0.44399112017759645, + "grad_norm": 1.671875, + "learning_rate": 2.2945544554455445e-06, + "loss": 1.6686, + "step": 740 + }, + { + "epoch": 0.4445911081778364, + "grad_norm": 1.828125, + "learning_rate": 2.292079207920792e-06, + "loss": 1.5937, + "step": 741 + }, + { + "epoch": 0.44519109617807645, + "grad_norm": 1.625, + "learning_rate": 2.2896039603960396e-06, + "loss": 1.6786, + "step": 742 + }, + { + "epoch": 0.4457910841783164, + "grad_norm": 1.671875, + "learning_rate": 2.287128712871287e-06, + "loss": 1.6417, + "step": 743 + }, + { + "epoch": 0.44639107217855645, + "grad_norm": 1.7109375, + "learning_rate": 2.2846534653465347e-06, + "loss": 1.7238, + "step": 744 + }, + { + "epoch": 0.4469910601787964, + "grad_norm": 1.8828125, + "learning_rate": 2.282178217821782e-06, + "loss": 1.6931, + "step": 745 + }, + { + "epoch": 0.44759104817903644, + "grad_norm": 1.5546875, + "learning_rate": 2.27970297029703e-06, + "loss": 1.6201, + "step": 746 + }, + { + "epoch": 0.4481910361792764, + "grad_norm": 1.5390625, + "learning_rate": 2.2772277227722767e-06, + "loss": 1.6692, + "step": 747 + }, + { + "epoch": 0.4487910241795164, + "grad_norm": 1.7734375, + "learning_rate": 2.2747524752475245e-06, + "loss": 1.6056, + "step": 748 + }, + { + "epoch": 0.4493910121797564, + "grad_norm": 1.65625, + "learning_rate": 2.272277227722772e-06, + "loss": 1.7107, + "step": 749 + }, + { + "epoch": 0.4499910001799964, + "grad_norm": 1.7421875, + "learning_rate": 2.2698019801980196e-06, + "loss": 1.6711, + "step": 750 + }, + { + "epoch": 0.4499910001799964, + "eval_loss": 1.77778959274292, + "eval_model_preparation_time": 0.0037, + "eval_runtime": 69.2035, + "eval_samples_per_second": 144.501, + "eval_steps_per_second": 24.088, + "step": 750 + }, + { + "epoch": 0.4505909881802364, + "grad_norm": 1.703125, + "learning_rate": 2.267326732673267e-06, + "loss": 1.6199, + "step": 751 + }, + { + "epoch": 0.4511909761804764, + "grad_norm": 1.7734375, + "learning_rate": 2.2648514851485147e-06, + "loss": 1.6252, + "step": 752 + }, + { + "epoch": 0.4517909641807164, + "grad_norm": 1.65625, + "learning_rate": 2.262376237623762e-06, + "loss": 1.6548, + "step": 753 + }, + { + "epoch": 0.4523909521809564, + "grad_norm": 1.5390625, + "learning_rate": 2.25990099009901e-06, + "loss": 1.6134, + "step": 754 + }, + { + "epoch": 0.4529909401811964, + "grad_norm": 1.7109375, + "learning_rate": 2.257425742574257e-06, + "loss": 1.6512, + "step": 755 + }, + { + "epoch": 0.45359092818143637, + "grad_norm": 1.6953125, + "learning_rate": 2.254950495049505e-06, + "loss": 1.717, + "step": 756 + }, + { + "epoch": 0.45419091618167634, + "grad_norm": 1.6953125, + "learning_rate": 2.2524752475247523e-06, + "loss": 1.667, + "step": 757 + }, + { + "epoch": 0.45479090418191637, + "grad_norm": 1.84375, + "learning_rate": 2.25e-06, + "loss": 1.6836, + "step": 758 + }, + { + "epoch": 0.45539089218215634, + "grad_norm": 1.6875, + "learning_rate": 2.247524752475248e-06, + "loss": 1.7515, + "step": 759 + }, + { + "epoch": 0.45599088018239636, + "grad_norm": 1.78125, + "learning_rate": 2.2450495049504947e-06, + "loss": 1.7191, + "step": 760 + }, + { + "epoch": 0.45659086818263633, + "grad_norm": 1.9453125, + "learning_rate": 2.2425742574257425e-06, + "loss": 1.6758, + "step": 761 + }, + { + "epoch": 0.45719085618287636, + "grad_norm": 1.6640625, + "learning_rate": 2.24009900990099e-06, + "loss": 1.7625, + "step": 762 + }, + { + "epoch": 0.45779084418311633, + "grad_norm": 1.6796875, + "learning_rate": 
2.2376237623762376e-06, + "loss": 1.6278, + "step": 763 + }, + { + "epoch": 0.45839083218335636, + "grad_norm": 1.7734375, + "learning_rate": 2.235148514851485e-06, + "loss": 1.6825, + "step": 764 + }, + { + "epoch": 0.4589908201835963, + "grad_norm": 1.65625, + "learning_rate": 2.2326732673267327e-06, + "loss": 1.5404, + "step": 765 + }, + { + "epoch": 0.4595908081838363, + "grad_norm": 1.6640625, + "learning_rate": 2.23019801980198e-06, + "loss": 1.5464, + "step": 766 + }, + { + "epoch": 0.4601907961840763, + "grad_norm": 1.6640625, + "learning_rate": 2.227722772277228e-06, + "loss": 1.7487, + "step": 767 + }, + { + "epoch": 0.4607907841843163, + "grad_norm": 1.4296875, + "learning_rate": 2.225247524752475e-06, + "loss": 1.6372, + "step": 768 + }, + { + "epoch": 0.4613907721845563, + "grad_norm": 1.6484375, + "learning_rate": 2.222772277227723e-06, + "loss": 1.7136, + "step": 769 + }, + { + "epoch": 0.4619907601847963, + "grad_norm": 1.59375, + "learning_rate": 2.2202970297029703e-06, + "loss": 1.6032, + "step": 770 + }, + { + "epoch": 0.4625907481850363, + "grad_norm": 1.5078125, + "learning_rate": 2.217821782178218e-06, + "loss": 1.6514, + "step": 771 + }, + { + "epoch": 0.4631907361852763, + "grad_norm": 1.6171875, + "learning_rate": 2.2153465346534654e-06, + "loss": 1.6665, + "step": 772 + }, + { + "epoch": 0.4637907241855163, + "grad_norm": 1.796875, + "learning_rate": 2.212871287128713e-06, + "loss": 1.6432, + "step": 773 + }, + { + "epoch": 0.4643907121857563, + "grad_norm": 1.8515625, + "learning_rate": 2.21039603960396e-06, + "loss": 1.5685, + "step": 774 + }, + { + "epoch": 0.46499070018599625, + "grad_norm": 1.5546875, + "learning_rate": 2.207920792079208e-06, + "loss": 1.7123, + "step": 775 + }, + { + "epoch": 0.4655906881862363, + "grad_norm": 1.4921875, + "learning_rate": 2.205445544554455e-06, + "loss": 1.6538, + "step": 776 + }, + { + "epoch": 0.46619067618647625, + "grad_norm": 1.8046875, + "learning_rate": 2.202970297029703e-06, + "loss": 1.6715, + "step": 777 + }, + { + "epoch": 0.4667906641867163, + "grad_norm": 1.7421875, + "learning_rate": 2.2004950495049503e-06, + "loss": 1.6058, + "step": 778 + }, + { + "epoch": 0.46739065218695625, + "grad_norm": 1.7265625, + "learning_rate": 2.198019801980198e-06, + "loss": 1.766, + "step": 779 + }, + { + "epoch": 0.46799064018719627, + "grad_norm": 1.5546875, + "learning_rate": 2.1955445544554454e-06, + "loss": 1.7647, + "step": 780 + }, + { + "epoch": 0.46859062818743624, + "grad_norm": 1.6875, + "learning_rate": 2.193069306930693e-06, + "loss": 1.6069, + "step": 781 + }, + { + "epoch": 0.46919061618767627, + "grad_norm": 1.6171875, + "learning_rate": 2.1905940594059405e-06, + "loss": 1.6362, + "step": 782 + }, + { + "epoch": 0.46979060418791624, + "grad_norm": 1.75, + "learning_rate": 2.1881188118811883e-06, + "loss": 1.6003, + "step": 783 + }, + { + "epoch": 0.47039059218815626, + "grad_norm": 1.671875, + "learning_rate": 2.1856435643564356e-06, + "loss": 1.676, + "step": 784 + }, + { + "epoch": 0.47099058018839624, + "grad_norm": 1.5703125, + "learning_rate": 2.1831683168316834e-06, + "loss": 1.6357, + "step": 785 + }, + { + "epoch": 0.4715905681886362, + "grad_norm": 1.7578125, + "learning_rate": 2.1806930693069307e-06, + "loss": 1.7, + "step": 786 + }, + { + "epoch": 0.47219055618887623, + "grad_norm": 1.6875, + "learning_rate": 2.178217821782178e-06, + "loss": 1.5975, + "step": 787 + }, + { + "epoch": 0.4727905441891162, + "grad_norm": 1.7265625, + "learning_rate": 2.1757425742574254e-06, + "loss": 1.7075, + "step": 788 
+ }, + { + "epoch": 0.47339053218935623, + "grad_norm": 1.5546875, + "learning_rate": 2.173267326732673e-06, + "loss": 1.6066, + "step": 789 + }, + { + "epoch": 0.4739905201895962, + "grad_norm": 1.765625, + "learning_rate": 2.1707920792079205e-06, + "loss": 1.6408, + "step": 790 + }, + { + "epoch": 0.4745905081898362, + "grad_norm": 1.5, + "learning_rate": 2.1683168316831683e-06, + "loss": 1.6861, + "step": 791 + }, + { + "epoch": 0.4751904961900762, + "grad_norm": 1.796875, + "learning_rate": 2.1658415841584156e-06, + "loss": 1.6754, + "step": 792 + }, + { + "epoch": 0.4757904841903162, + "grad_norm": 1.7265625, + "learning_rate": 2.1633663366336634e-06, + "loss": 1.5473, + "step": 793 + }, + { + "epoch": 0.4763904721905562, + "grad_norm": 1.6328125, + "learning_rate": 2.1608910891089108e-06, + "loss": 1.5907, + "step": 794 + }, + { + "epoch": 0.47699046019079616, + "grad_norm": 1.4609375, + "learning_rate": 2.1584158415841585e-06, + "loss": 1.5811, + "step": 795 + }, + { + "epoch": 0.4775904481910362, + "grad_norm": 1.65625, + "learning_rate": 2.155940594059406e-06, + "loss": 1.7476, + "step": 796 + }, + { + "epoch": 0.47819043619127616, + "grad_norm": 1.7890625, + "learning_rate": 2.1534653465346536e-06, + "loss": 1.6973, + "step": 797 + }, + { + "epoch": 0.4787904241915162, + "grad_norm": 1.7421875, + "learning_rate": 2.150990099009901e-06, + "loss": 1.6153, + "step": 798 + }, + { + "epoch": 0.47939041219175615, + "grad_norm": 1.6171875, + "learning_rate": 2.1485148514851487e-06, + "loss": 1.5589, + "step": 799 + }, + { + "epoch": 0.4799904001919962, + "grad_norm": 1.6953125, + "learning_rate": 2.1460396039603957e-06, + "loss": 1.6918, + "step": 800 + }, + { + "epoch": 0.4799904001919962, + "eval_loss": 1.7776175737380981, + "eval_model_preparation_time": 0.0037, + "eval_runtime": 66.0238, + "eval_samples_per_second": 151.461, + "eval_steps_per_second": 25.248, + "step": 800 + }, + { + "epoch": 0.48059038819223615, + "grad_norm": 1.7109375, + "learning_rate": 2.1435643564356434e-06, + "loss": 1.6829, + "step": 801 + }, + { + "epoch": 0.4811903761924762, + "grad_norm": 1.65625, + "learning_rate": 2.1410891089108908e-06, + "loss": 1.6587, + "step": 802 + }, + { + "epoch": 0.48179036419271615, + "grad_norm": 1.75, + "learning_rate": 2.1386138613861385e-06, + "loss": 1.6863, + "step": 803 + }, + { + "epoch": 0.4823903521929561, + "grad_norm": 1.5546875, + "learning_rate": 2.136138613861386e-06, + "loss": 1.7411, + "step": 804 + }, + { + "epoch": 0.48299034019319614, + "grad_norm": 1.6015625, + "learning_rate": 2.1336633663366336e-06, + "loss": 1.6127, + "step": 805 + }, + { + "epoch": 0.4835903281934361, + "grad_norm": 1.640625, + "learning_rate": 2.131188118811881e-06, + "loss": 1.6713, + "step": 806 + }, + { + "epoch": 0.48419031619367614, + "grad_norm": 1.6171875, + "learning_rate": 2.1287128712871288e-06, + "loss": 1.6152, + "step": 807 + }, + { + "epoch": 0.4847903041939161, + "grad_norm": 1.6796875, + "learning_rate": 2.126237623762376e-06, + "loss": 1.6861, + "step": 808 + }, + { + "epoch": 0.48539029219415614, + "grad_norm": 1.515625, + "learning_rate": 2.123762376237624e-06, + "loss": 1.6041, + "step": 809 + }, + { + "epoch": 0.4859902801943961, + "grad_norm": 1.6328125, + "learning_rate": 2.121287128712871e-06, + "loss": 1.6616, + "step": 810 + }, + { + "epoch": 0.48659026819463613, + "grad_norm": 1.546875, + "learning_rate": 2.118811881188119e-06, + "loss": 1.6977, + "step": 811 + }, + { + "epoch": 0.4871902561948761, + "grad_norm": 1.7421875, + "learning_rate": 
2.1163366336633663e-06, + "loss": 1.7304, + "step": 812 + }, + { + "epoch": 0.4877902441951161, + "grad_norm": 1.7265625, + "learning_rate": 2.1138613861386137e-06, + "loss": 1.6383, + "step": 813 + }, + { + "epoch": 0.4883902321953561, + "grad_norm": 1.515625, + "learning_rate": 2.111386138613861e-06, + "loss": 1.646, + "step": 814 + }, + { + "epoch": 0.48899022019559607, + "grad_norm": 1.7421875, + "learning_rate": 2.1089108910891088e-06, + "loss": 1.6341, + "step": 815 + }, + { + "epoch": 0.4895902081958361, + "grad_norm": 1.625, + "learning_rate": 2.106435643564356e-06, + "loss": 1.625, + "step": 816 + }, + { + "epoch": 0.49019019619607607, + "grad_norm": 1.703125, + "learning_rate": 2.103960396039604e-06, + "loss": 1.6915, + "step": 817 + }, + { + "epoch": 0.4907901841963161, + "grad_norm": 1.859375, + "learning_rate": 2.1014851485148512e-06, + "loss": 1.5856, + "step": 818 + }, + { + "epoch": 0.49139017219655606, + "grad_norm": 1.7265625, + "learning_rate": 2.099009900990099e-06, + "loss": 1.6453, + "step": 819 + }, + { + "epoch": 0.4919901601967961, + "grad_norm": 1.6640625, + "learning_rate": 2.0965346534653463e-06, + "loss": 1.6145, + "step": 820 + }, + { + "epoch": 0.49259014819703606, + "grad_norm": 1.5625, + "learning_rate": 2.094059405940594e-06, + "loss": 1.6025, + "step": 821 + }, + { + "epoch": 0.49319013619727603, + "grad_norm": 1.65625, + "learning_rate": 2.0915841584158414e-06, + "loss": 1.753, + "step": 822 + }, + { + "epoch": 0.49379012419751606, + "grad_norm": 1.640625, + "learning_rate": 2.089108910891089e-06, + "loss": 1.6231, + "step": 823 + }, + { + "epoch": 0.494390112197756, + "grad_norm": 1.5859375, + "learning_rate": 2.0866336633663366e-06, + "loss": 1.6406, + "step": 824 + }, + { + "epoch": 0.49499010019799605, + "grad_norm": 1.671875, + "learning_rate": 2.0841584158415843e-06, + "loss": 1.6947, + "step": 825 + }, + { + "epoch": 0.495590088198236, + "grad_norm": 1.671875, + "learning_rate": 2.0816831683168312e-06, + "loss": 1.6547, + "step": 826 + }, + { + "epoch": 0.49619007619847605, + "grad_norm": 1.6015625, + "learning_rate": 2.079207920792079e-06, + "loss": 1.6789, + "step": 827 + }, + { + "epoch": 0.496790064198716, + "grad_norm": 1.7890625, + "learning_rate": 2.0767326732673263e-06, + "loss": 1.7236, + "step": 828 + }, + { + "epoch": 0.49739005219895605, + "grad_norm": 1.6171875, + "learning_rate": 2.074257425742574e-06, + "loss": 1.6554, + "step": 829 + }, + { + "epoch": 0.497990040199196, + "grad_norm": 1.7421875, + "learning_rate": 2.0717821782178215e-06, + "loss": 1.651, + "step": 830 + }, + { + "epoch": 0.498590028199436, + "grad_norm": 1.6171875, + "learning_rate": 2.0693069306930692e-06, + "loss": 1.618, + "step": 831 + }, + { + "epoch": 0.499190016199676, + "grad_norm": 1.7265625, + "learning_rate": 2.0668316831683166e-06, + "loss": 1.5994, + "step": 832 + }, + { + "epoch": 0.499790004199916, + "grad_norm": 1.8515625, + "learning_rate": 2.0643564356435643e-06, + "loss": 1.6897, + "step": 833 + }, + { + "epoch": 0.500389992200156, + "grad_norm": 1.6796875, + "learning_rate": 2.0618811881188117e-06, + "loss": 1.5805, + "step": 834 + }, + { + "epoch": 0.500989980200396, + "grad_norm": 1.65625, + "learning_rate": 2.0594059405940594e-06, + "loss": 1.7514, + "step": 835 + }, + { + "epoch": 0.501589968200636, + "grad_norm": 1.6015625, + "learning_rate": 2.0569306930693068e-06, + "loss": 1.7048, + "step": 836 + }, + { + "epoch": 0.502189956200876, + "grad_norm": 1.53125, + "learning_rate": 2.0544554455445546e-06, + "loss": 1.6226, + "step": 837 + }, + 
{ + "epoch": 0.502789944201116, + "grad_norm": 1.703125, + "learning_rate": 2.051980198019802e-06, + "loss": 1.7454, + "step": 838 + }, + { + "epoch": 0.503389932201356, + "grad_norm": 1.515625, + "learning_rate": 2.0495049504950497e-06, + "loss": 1.6367, + "step": 839 + }, + { + "epoch": 0.5039899202015959, + "grad_norm": 1.7578125, + "learning_rate": 2.0470297029702966e-06, + "loss": 1.6435, + "step": 840 + }, + { + "epoch": 0.5045899082018359, + "grad_norm": 2.0, + "learning_rate": 2.0445544554455443e-06, + "loss": 1.5614, + "step": 841 + }, + { + "epoch": 0.505189896202076, + "grad_norm": 1.6796875, + "learning_rate": 2.0420792079207917e-06, + "loss": 1.7469, + "step": 842 + }, + { + "epoch": 0.505789884202316, + "grad_norm": 1.7109375, + "learning_rate": 2.0396039603960395e-06, + "loss": 1.6062, + "step": 843 + }, + { + "epoch": 0.5063898722025559, + "grad_norm": 1.4140625, + "learning_rate": 2.037128712871287e-06, + "loss": 1.5944, + "step": 844 + }, + { + "epoch": 0.5069898602027959, + "grad_norm": 1.6953125, + "learning_rate": 2.0346534653465346e-06, + "loss": 1.6536, + "step": 845 + }, + { + "epoch": 0.507589848203036, + "grad_norm": 1.734375, + "learning_rate": 2.032178217821782e-06, + "loss": 1.6721, + "step": 846 + }, + { + "epoch": 0.508189836203276, + "grad_norm": 1.8046875, + "learning_rate": 2.0297029702970297e-06, + "loss": 1.6654, + "step": 847 + }, + { + "epoch": 0.5087898242035159, + "grad_norm": 1.640625, + "learning_rate": 2.027227722772277e-06, + "loss": 1.7078, + "step": 848 + }, + { + "epoch": 0.5093898122037559, + "grad_norm": 1.6953125, + "learning_rate": 2.0247524752475248e-06, + "loss": 1.7034, + "step": 849 + }, + { + "epoch": 0.5099898002039959, + "grad_norm": 1.7421875, + "learning_rate": 2.022277227722772e-06, + "loss": 1.6864, + "step": 850 + }, + { + "epoch": 0.5099898002039959, + "eval_loss": 1.7774574756622314, + "eval_model_preparation_time": 0.0037, + "eval_runtime": 66.0293, + "eval_samples_per_second": 151.448, + "eval_steps_per_second": 25.246, + "step": 850 + }, + { + "epoch": 0.510589788204236, + "grad_norm": 1.78125, + "learning_rate": 2.01980198019802e-06, + "loss": 1.7012, + "step": 851 + }, + { + "epoch": 0.5111897762044759, + "grad_norm": 1.6484375, + "learning_rate": 2.0173267326732672e-06, + "loss": 1.7185, + "step": 852 + }, + { + "epoch": 0.5117897642047159, + "grad_norm": 1.8203125, + "learning_rate": 2.0148514851485146e-06, + "loss": 1.5713, + "step": 853 + }, + { + "epoch": 0.5123897522049559, + "grad_norm": 1.640625, + "learning_rate": 2.012376237623762e-06, + "loss": 1.6537, + "step": 854 + }, + { + "epoch": 0.5129897402051959, + "grad_norm": 1.640625, + "learning_rate": 2.0099009900990097e-06, + "loss": 1.6145, + "step": 855 + }, + { + "epoch": 0.5135897282054359, + "grad_norm": 1.5859375, + "learning_rate": 2.007425742574257e-06, + "loss": 1.6468, + "step": 856 + }, + { + "epoch": 0.5141897162056759, + "grad_norm": 1.65625, + "learning_rate": 2.004950495049505e-06, + "loss": 1.6845, + "step": 857 + }, + { + "epoch": 0.5147897042059159, + "grad_norm": 1.5078125, + "learning_rate": 2.002475247524752e-06, + "loss": 1.7128, + "step": 858 + }, + { + "epoch": 0.5153896922061558, + "grad_norm": 1.8359375, + "learning_rate": 2e-06, + "loss": 1.58, + "step": 859 + }, + { + "epoch": 0.5159896802063959, + "grad_norm": 1.625, + "learning_rate": 1.9975247524752473e-06, + "loss": 1.5932, + "step": 860 + }, + { + "epoch": 0.5165896682066359, + "grad_norm": 1.734375, + "learning_rate": 1.995049504950495e-06, + "loss": 1.6651, + "step": 861 + }, + 
{ + "epoch": 0.5171896562068758, + "grad_norm": 1.6953125, + "learning_rate": 1.9925742574257424e-06, + "loss": 1.6753, + "step": 862 + }, + { + "epoch": 0.5177896442071158, + "grad_norm": 1.7578125, + "learning_rate": 1.99009900990099e-06, + "loss": 1.7112, + "step": 863 + }, + { + "epoch": 0.5183896322073559, + "grad_norm": 1.75, + "learning_rate": 1.9876237623762375e-06, + "loss": 1.6007, + "step": 864 + }, + { + "epoch": 0.5189896202075959, + "grad_norm": 1.8125, + "learning_rate": 1.9851485148514852e-06, + "loss": 1.763, + "step": 865 + }, + { + "epoch": 0.5195896082078358, + "grad_norm": 1.6796875, + "learning_rate": 1.9826732673267326e-06, + "loss": 1.6699, + "step": 866 + }, + { + "epoch": 0.5201895962080758, + "grad_norm": 1.6640625, + "learning_rate": 1.98019801980198e-06, + "loss": 1.6485, + "step": 867 + }, + { + "epoch": 0.5207895842083158, + "grad_norm": 1.75, + "learning_rate": 1.9777227722772277e-06, + "loss": 1.7541, + "step": 868 + }, + { + "epoch": 0.5213895722085559, + "grad_norm": 1.703125, + "learning_rate": 1.975247524752475e-06, + "loss": 1.7011, + "step": 869 + }, + { + "epoch": 0.5219895602087958, + "grad_norm": 1.8125, + "learning_rate": 1.972772277227723e-06, + "loss": 1.5922, + "step": 870 + }, + { + "epoch": 0.5225895482090358, + "grad_norm": 1.5078125, + "learning_rate": 1.97029702970297e-06, + "loss": 1.6786, + "step": 871 + }, + { + "epoch": 0.5231895362092758, + "grad_norm": 1.4296875, + "learning_rate": 1.967821782178218e-06, + "loss": 1.6818, + "step": 872 + }, + { + "epoch": 0.5237895242095159, + "grad_norm": 1.6640625, + "learning_rate": 1.9653465346534653e-06, + "loss": 1.6646, + "step": 873 + }, + { + "epoch": 0.5243895122097558, + "grad_norm": 1.5625, + "learning_rate": 1.9628712871287126e-06, + "loss": 1.7467, + "step": 874 + }, + { + "epoch": 0.5249895002099958, + "grad_norm": 1.6015625, + "learning_rate": 1.9603960396039604e-06, + "loss": 1.6728, + "step": 875 + }, + { + "epoch": 0.5255894882102358, + "grad_norm": 1.7109375, + "learning_rate": 1.9579207920792077e-06, + "loss": 1.654, + "step": 876 + }, + { + "epoch": 0.5261894762104757, + "grad_norm": 1.5390625, + "learning_rate": 1.9554455445544555e-06, + "loss": 1.6831, + "step": 877 + }, + { + "epoch": 0.5267894642107158, + "grad_norm": 1.609375, + "learning_rate": 1.952970297029703e-06, + "loss": 1.5535, + "step": 878 + }, + { + "epoch": 0.5273894522109558, + "grad_norm": 1.984375, + "learning_rate": 1.95049504950495e-06, + "loss": 1.6483, + "step": 879 + }, + { + "epoch": 0.5279894402111958, + "grad_norm": 1.59375, + "learning_rate": 1.948019801980198e-06, + "loss": 1.7029, + "step": 880 + }, + { + "epoch": 0.5285894282114357, + "grad_norm": 1.7265625, + "learning_rate": 1.9455445544554453e-06, + "loss": 1.6396, + "step": 881 + }, + { + "epoch": 0.5291894162116758, + "grad_norm": 1.65625, + "learning_rate": 1.943069306930693e-06, + "loss": 1.6633, + "step": 882 + }, + { + "epoch": 0.5297894042119158, + "grad_norm": 1.5859375, + "learning_rate": 1.9405940594059404e-06, + "loss": 1.6083, + "step": 883 + }, + { + "epoch": 0.5303893922121558, + "grad_norm": 1.6171875, + "learning_rate": 1.938118811881188e-06, + "loss": 1.684, + "step": 884 + }, + { + "epoch": 0.5309893802123957, + "grad_norm": 1.5703125, + "learning_rate": 1.9356435643564355e-06, + "loss": 1.6599, + "step": 885 + }, + { + "epoch": 0.5315893682126357, + "grad_norm": 1.6328125, + "learning_rate": 1.933168316831683e-06, + "loss": 1.6902, + "step": 886 + }, + { + "epoch": 0.5321893562128758, + "grad_norm": 1.6484375, + 
"learning_rate": 1.9306930693069306e-06, + "loss": 1.6117, + "step": 887 + }, + { + "epoch": 0.5327893442131157, + "grad_norm": 1.6875, + "learning_rate": 1.928217821782178e-06, + "loss": 1.6018, + "step": 888 + }, + { + "epoch": 0.5333893322133557, + "grad_norm": 1.671875, + "learning_rate": 1.9257425742574257e-06, + "loss": 1.7181, + "step": 889 + }, + { + "epoch": 0.5339893202135957, + "grad_norm": 1.8046875, + "learning_rate": 1.923267326732673e-06, + "loss": 1.6801, + "step": 890 + }, + { + "epoch": 0.5345893082138358, + "grad_norm": 1.5703125, + "learning_rate": 1.920792079207921e-06, + "loss": 1.6134, + "step": 891 + }, + { + "epoch": 0.5351892962140757, + "grad_norm": 1.65625, + "learning_rate": 1.918316831683168e-06, + "loss": 1.7768, + "step": 892 + }, + { + "epoch": 0.5357892842143157, + "grad_norm": 1.65625, + "learning_rate": 1.9158415841584155e-06, + "loss": 1.6964, + "step": 893 + }, + { + "epoch": 0.5363892722145557, + "grad_norm": 1.6171875, + "learning_rate": 1.9133663366336633e-06, + "loss": 1.6572, + "step": 894 + }, + { + "epoch": 0.5369892602147956, + "grad_norm": 1.6171875, + "learning_rate": 1.9108910891089106e-06, + "loss": 1.6688, + "step": 895 + }, + { + "epoch": 0.5375892482150357, + "grad_norm": 1.515625, + "learning_rate": 1.9084158415841584e-06, + "loss": 1.5866, + "step": 896 + }, + { + "epoch": 0.5381892362152757, + "grad_norm": 1.71875, + "learning_rate": 1.9059405940594057e-06, + "loss": 1.6582, + "step": 897 + }, + { + "epoch": 0.5387892242155157, + "grad_norm": 1.78125, + "learning_rate": 1.9034653465346533e-06, + "loss": 1.7269, + "step": 898 + }, + { + "epoch": 0.5393892122157556, + "grad_norm": 1.8125, + "learning_rate": 1.9009900990099008e-06, + "loss": 1.5366, + "step": 899 + }, + { + "epoch": 0.5399892002159957, + "grad_norm": 1.8046875, + "learning_rate": 1.8985148514851484e-06, + "loss": 1.6964, + "step": 900 + }, + { + "epoch": 0.5399892002159957, + "eval_loss": 1.7774394750595093, + "eval_model_preparation_time": 0.0037, + "eval_runtime": 68.6659, + "eval_samples_per_second": 145.633, + "eval_steps_per_second": 24.277, + "step": 900 + }, + { + "epoch": 0.5405891882162357, + "grad_norm": 1.6953125, + "learning_rate": 1.896039603960396e-06, + "loss": 1.7363, + "step": 901 + }, + { + "epoch": 0.5411891762164757, + "grad_norm": 1.6328125, + "learning_rate": 1.8935643564356433e-06, + "loss": 1.6164, + "step": 902 + }, + { + "epoch": 0.5417891642167156, + "grad_norm": 1.6328125, + "learning_rate": 1.8910891089108908e-06, + "loss": 1.6276, + "step": 903 + }, + { + "epoch": 0.5423891522169556, + "grad_norm": 1.765625, + "learning_rate": 1.8886138613861384e-06, + "loss": 1.5128, + "step": 904 + }, + { + "epoch": 0.5429891402171957, + "grad_norm": 1.7578125, + "learning_rate": 1.886138613861386e-06, + "loss": 1.6413, + "step": 905 + }, + { + "epoch": 0.5435891282174357, + "grad_norm": 1.8671875, + "learning_rate": 1.8836633663366335e-06, + "loss": 1.6669, + "step": 906 + }, + { + "epoch": 0.5441891162176756, + "grad_norm": 1.84375, + "learning_rate": 1.881188118811881e-06, + "loss": 1.6869, + "step": 907 + }, + { + "epoch": 0.5447891042179156, + "grad_norm": 1.625, + "learning_rate": 1.8787128712871286e-06, + "loss": 1.6706, + "step": 908 + }, + { + "epoch": 0.5453890922181557, + "grad_norm": 1.6328125, + "learning_rate": 1.876237623762376e-06, + "loss": 1.6194, + "step": 909 + }, + { + "epoch": 0.5459890802183957, + "grad_norm": 1.7421875, + "learning_rate": 1.8737623762376237e-06, + "loss": 1.7302, + "step": 910 + }, + { + "epoch": 0.5465890682186356, 
+ "grad_norm": 1.7734375, + "learning_rate": 1.8712871287128713e-06, + "loss": 1.698, + "step": 911 + }, + { + "epoch": 0.5471890562188756, + "grad_norm": 1.7421875, + "learning_rate": 1.8688118811881188e-06, + "loss": 1.6802, + "step": 912 + }, + { + "epoch": 0.5477890442191157, + "grad_norm": 1.9453125, + "learning_rate": 1.8663366336633664e-06, + "loss": 1.7833, + "step": 913 + }, + { + "epoch": 0.5483890322193556, + "grad_norm": 1.7734375, + "learning_rate": 1.863861386138614e-06, + "loss": 1.7153, + "step": 914 + }, + { + "epoch": 0.5489890202195956, + "grad_norm": 1.84375, + "learning_rate": 1.8613861386138615e-06, + "loss": 1.6458, + "step": 915 + }, + { + "epoch": 0.5495890082198356, + "grad_norm": 1.65625, + "learning_rate": 1.8589108910891088e-06, + "loss": 1.5899, + "step": 916 + }, + { + "epoch": 0.5501889962200756, + "grad_norm": 1.890625, + "learning_rate": 1.8564356435643564e-06, + "loss": 1.618, + "step": 917 + }, + { + "epoch": 0.5507889842203156, + "grad_norm": 1.6953125, + "learning_rate": 1.853960396039604e-06, + "loss": 1.6677, + "step": 918 + }, + { + "epoch": 0.5513889722205556, + "grad_norm": 1.5703125, + "learning_rate": 1.8514851485148515e-06, + "loss": 1.6409, + "step": 919 + }, + { + "epoch": 0.5519889602207956, + "grad_norm": 1.671875, + "learning_rate": 1.849009900990099e-06, + "loss": 1.6201, + "step": 920 + }, + { + "epoch": 0.5525889482210355, + "grad_norm": 1.6171875, + "learning_rate": 1.8465346534653466e-06, + "loss": 1.6385, + "step": 921 + }, + { + "epoch": 0.5531889362212756, + "grad_norm": 1.6328125, + "learning_rate": 1.8440594059405942e-06, + "loss": 1.6725, + "step": 922 + }, + { + "epoch": 0.5537889242215156, + "grad_norm": 1.6640625, + "learning_rate": 1.8415841584158415e-06, + "loss": 1.6324, + "step": 923 + }, + { + "epoch": 0.5543889122217556, + "grad_norm": 1.546875, + "learning_rate": 1.839108910891089e-06, + "loss": 1.6393, + "step": 924 + }, + { + "epoch": 0.5549889002219955, + "grad_norm": 1.578125, + "learning_rate": 1.8366336633663366e-06, + "loss": 1.727, + "step": 925 + }, + { + "epoch": 0.5555888882222355, + "grad_norm": 1.6953125, + "learning_rate": 1.8341584158415842e-06, + "loss": 1.7974, + "step": 926 + }, + { + "epoch": 0.5561888762224756, + "grad_norm": 1.5625, + "learning_rate": 1.8316831683168317e-06, + "loss": 1.7538, + "step": 927 + }, + { + "epoch": 0.5567888642227156, + "grad_norm": 1.859375, + "learning_rate": 1.8292079207920793e-06, + "loss": 1.6599, + "step": 928 + }, + { + "epoch": 0.5573888522229555, + "grad_norm": 1.546875, + "learning_rate": 1.8267326732673266e-06, + "loss": 1.6722, + "step": 929 + }, + { + "epoch": 0.5579888402231955, + "grad_norm": 1.8046875, + "learning_rate": 1.8242574257425742e-06, + "loss": 1.6885, + "step": 930 + }, + { + "epoch": 0.5585888282234356, + "grad_norm": 1.625, + "learning_rate": 1.8217821782178217e-06, + "loss": 1.6151, + "step": 931 + }, + { + "epoch": 0.5591888162236756, + "grad_norm": 1.5625, + "learning_rate": 1.8193069306930693e-06, + "loss": 1.621, + "step": 932 + }, + { + "epoch": 0.5597888042239155, + "grad_norm": 1.7578125, + "learning_rate": 1.8168316831683168e-06, + "loss": 1.6494, + "step": 933 + }, + { + "epoch": 0.5603887922241555, + "grad_norm": 1.5546875, + "learning_rate": 1.8143564356435644e-06, + "loss": 1.6403, + "step": 934 + }, + { + "epoch": 0.5609887802243955, + "grad_norm": 1.71875, + "learning_rate": 1.811881188118812e-06, + "loss": 1.6198, + "step": 935 + }, + { + "epoch": 0.5615887682246355, + "grad_norm": 1.6328125, + "learning_rate": 
1.8094059405940593e-06, + "loss": 1.6158, + "step": 936 + }, + { + "epoch": 0.5621887562248755, + "grad_norm": 1.578125, + "learning_rate": 1.8069306930693068e-06, + "loss": 1.7377, + "step": 937 + }, + { + "epoch": 0.5627887442251155, + "grad_norm": 1.765625, + "learning_rate": 1.8044554455445544e-06, + "loss": 1.6515, + "step": 938 + }, + { + "epoch": 0.5633887322253555, + "grad_norm": 1.734375, + "learning_rate": 1.801980198019802e-06, + "loss": 1.6708, + "step": 939 + }, + { + "epoch": 0.5639887202255955, + "grad_norm": 1.7265625, + "learning_rate": 1.7995049504950495e-06, + "loss": 1.6375, + "step": 940 + }, + { + "epoch": 0.5645887082258355, + "grad_norm": 1.6171875, + "learning_rate": 1.797029702970297e-06, + "loss": 1.7352, + "step": 941 + }, + { + "epoch": 0.5651886962260755, + "grad_norm": 1.515625, + "learning_rate": 1.7945544554455444e-06, + "loss": 1.5845, + "step": 942 + }, + { + "epoch": 0.5657886842263155, + "grad_norm": 1.6953125, + "learning_rate": 1.792079207920792e-06, + "loss": 1.6404, + "step": 943 + }, + { + "epoch": 0.5663886722265554, + "grad_norm": 1.484375, + "learning_rate": 1.7896039603960395e-06, + "loss": 1.693, + "step": 944 + }, + { + "epoch": 0.5669886602267955, + "grad_norm": 1.65625, + "learning_rate": 1.787128712871287e-06, + "loss": 1.6918, + "step": 945 + }, + { + "epoch": 0.5675886482270355, + "grad_norm": 1.8125, + "learning_rate": 1.7846534653465346e-06, + "loss": 1.6853, + "step": 946 + }, + { + "epoch": 0.5681886362272754, + "grad_norm": 1.703125, + "learning_rate": 1.7821782178217822e-06, + "loss": 1.7385, + "step": 947 + }, + { + "epoch": 0.5687886242275154, + "grad_norm": 1.734375, + "learning_rate": 1.7797029702970297e-06, + "loss": 1.6999, + "step": 948 + }, + { + "epoch": 0.5693886122277555, + "grad_norm": 1.71875, + "learning_rate": 1.777227722772277e-06, + "loss": 1.5892, + "step": 949 + }, + { + "epoch": 0.5699886002279955, + "grad_norm": 1.6328125, + "learning_rate": 1.7747524752475246e-06, + "loss": 1.7026, + "step": 950 + }, + { + "epoch": 0.5699886002279955, + "eval_loss": 1.777536153793335, + "eval_model_preparation_time": 0.0037, + "eval_runtime": 65.9833, + "eval_samples_per_second": 151.554, + "eval_steps_per_second": 25.264, + "step": 950 + }, + { + "epoch": 0.5705885882282354, + "grad_norm": 1.6328125, + "learning_rate": 1.7722772277227722e-06, + "loss": 1.6403, + "step": 951 + }, + { + "epoch": 0.5711885762284754, + "grad_norm": 1.5234375, + "learning_rate": 1.7698019801980197e-06, + "loss": 1.5106, + "step": 952 + }, + { + "epoch": 0.5717885642287154, + "grad_norm": 1.8515625, + "learning_rate": 1.7673267326732673e-06, + "loss": 1.6019, + "step": 953 + }, + { + "epoch": 0.5723885522289555, + "grad_norm": 1.6484375, + "learning_rate": 1.7648514851485149e-06, + "loss": 1.5817, + "step": 954 + }, + { + "epoch": 0.5729885402291954, + "grad_norm": 1.5859375, + "learning_rate": 1.7623762376237624e-06, + "loss": 1.7078, + "step": 955 + }, + { + "epoch": 0.5735885282294354, + "grad_norm": 1.671875, + "learning_rate": 1.7599009900990098e-06, + "loss": 1.6346, + "step": 956 + }, + { + "epoch": 0.5741885162296754, + "grad_norm": 1.7421875, + "learning_rate": 1.7574257425742573e-06, + "loss": 1.6595, + "step": 957 + }, + { + "epoch": 0.5747885042299155, + "grad_norm": 1.6484375, + "learning_rate": 1.7549504950495049e-06, + "loss": 1.6867, + "step": 958 + }, + { + "epoch": 0.5753884922301554, + "grad_norm": 1.7421875, + "learning_rate": 1.7524752475247524e-06, + "loss": 1.6465, + "step": 959 + }, + { + "epoch": 0.5759884802303954, + 
"grad_norm": 1.6875, + "learning_rate": 1.75e-06, + "loss": 1.7808, + "step": 960 + }, + { + "epoch": 0.5765884682306354, + "grad_norm": 1.6015625, + "learning_rate": 1.7475247524752475e-06, + "loss": 1.648, + "step": 961 + }, + { + "epoch": 0.5771884562308753, + "grad_norm": 1.65625, + "learning_rate": 1.7450495049504949e-06, + "loss": 1.6428, + "step": 962 + }, + { + "epoch": 0.5777884442311154, + "grad_norm": 1.8046875, + "learning_rate": 1.7425742574257424e-06, + "loss": 1.7126, + "step": 963 + }, + { + "epoch": 0.5783884322313554, + "grad_norm": 1.578125, + "learning_rate": 1.74009900990099e-06, + "loss": 1.6308, + "step": 964 + }, + { + "epoch": 0.5789884202315954, + "grad_norm": 1.75, + "learning_rate": 1.7376237623762375e-06, + "loss": 1.7662, + "step": 965 + }, + { + "epoch": 0.5795884082318353, + "grad_norm": 1.671875, + "learning_rate": 1.735148514851485e-06, + "loss": 1.6381, + "step": 966 + }, + { + "epoch": 0.5801883962320754, + "grad_norm": 1.65625, + "learning_rate": 1.7326732673267326e-06, + "loss": 1.7187, + "step": 967 + }, + { + "epoch": 0.5807883842323154, + "grad_norm": 1.6484375, + "learning_rate": 1.7301980198019802e-06, + "loss": 1.6956, + "step": 968 + }, + { + "epoch": 0.5813883722325554, + "grad_norm": 1.65625, + "learning_rate": 1.7277227722772275e-06, + "loss": 1.6442, + "step": 969 + }, + { + "epoch": 0.5819883602327953, + "grad_norm": 1.5859375, + "learning_rate": 1.725247524752475e-06, + "loss": 1.6717, + "step": 970 + }, + { + "epoch": 0.5825883482330353, + "grad_norm": 1.71875, + "learning_rate": 1.7227722772277227e-06, + "loss": 1.704, + "step": 971 + }, + { + "epoch": 0.5831883362332754, + "grad_norm": 1.765625, + "learning_rate": 1.7202970297029702e-06, + "loss": 1.6427, + "step": 972 + }, + { + "epoch": 0.5837883242335153, + "grad_norm": 1.671875, + "learning_rate": 1.7178217821782178e-06, + "loss": 1.7412, + "step": 973 + }, + { + "epoch": 0.5843883122337553, + "grad_norm": 1.46875, + "learning_rate": 1.7153465346534653e-06, + "loss": 1.6321, + "step": 974 + }, + { + "epoch": 0.5849883002339953, + "grad_norm": 1.7578125, + "learning_rate": 1.7128712871287127e-06, + "loss": 1.6029, + "step": 975 + }, + { + "epoch": 0.5855882882342354, + "grad_norm": 1.5703125, + "learning_rate": 1.7103960396039602e-06, + "loss": 1.6052, + "step": 976 + }, + { + "epoch": 0.5861882762344753, + "grad_norm": 1.5546875, + "learning_rate": 1.7079207920792078e-06, + "loss": 1.638, + "step": 977 + }, + { + "epoch": 0.5867882642347153, + "grad_norm": 1.65625, + "learning_rate": 1.7054455445544553e-06, + "loss": 1.8175, + "step": 978 + }, + { + "epoch": 0.5873882522349553, + "grad_norm": 1.7265625, + "learning_rate": 1.7029702970297029e-06, + "loss": 1.7441, + "step": 979 + }, + { + "epoch": 0.5879882402351952, + "grad_norm": 1.53125, + "learning_rate": 1.7004950495049504e-06, + "loss": 1.5586, + "step": 980 + }, + { + "epoch": 0.5885882282354353, + "grad_norm": 1.71875, + "learning_rate": 1.698019801980198e-06, + "loss": 1.8408, + "step": 981 + }, + { + "epoch": 0.5891882162356753, + "grad_norm": 1.6171875, + "learning_rate": 1.6955445544554453e-06, + "loss": 1.7207, + "step": 982 + }, + { + "epoch": 0.5897882042359153, + "grad_norm": 1.796875, + "learning_rate": 1.6930693069306929e-06, + "loss": 1.654, + "step": 983 + }, + { + "epoch": 0.5903881922361552, + "grad_norm": 1.859375, + "learning_rate": 1.6905940594059404e-06, + "loss": 1.645, + "step": 984 + }, + { + "epoch": 0.5909881802363953, + "grad_norm": 1.78125, + "learning_rate": 1.688118811881188e-06, + "loss": 1.7415, + 
"step": 985 + }, + { + "epoch": 0.5915881682366353, + "grad_norm": 1.578125, + "learning_rate": 1.6856435643564355e-06, + "loss": 1.7001, + "step": 986 + }, + { + "epoch": 0.5921881562368753, + "grad_norm": 1.6328125, + "learning_rate": 1.683168316831683e-06, + "loss": 1.6615, + "step": 987 + }, + { + "epoch": 0.5927881442371152, + "grad_norm": 1.78125, + "learning_rate": 1.6806930693069307e-06, + "loss": 1.7398, + "step": 988 + }, + { + "epoch": 0.5933881322373552, + "grad_norm": 1.6484375, + "learning_rate": 1.678217821782178e-06, + "loss": 1.6766, + "step": 989 + }, + { + "epoch": 0.5939881202375953, + "grad_norm": 1.8515625, + "learning_rate": 1.6757425742574256e-06, + "loss": 1.6571, + "step": 990 + }, + { + "epoch": 0.5945881082378353, + "grad_norm": 1.5078125, + "learning_rate": 1.6732673267326731e-06, + "loss": 1.622, + "step": 991 + }, + { + "epoch": 0.5951880962380752, + "grad_norm": 1.671875, + "learning_rate": 1.6707920792079207e-06, + "loss": 1.6102, + "step": 992 + }, + { + "epoch": 0.5957880842383152, + "grad_norm": 1.7265625, + "learning_rate": 1.6683168316831682e-06, + "loss": 1.7403, + "step": 993 + }, + { + "epoch": 0.5963880722385553, + "grad_norm": 1.8359375, + "learning_rate": 1.6658415841584158e-06, + "loss": 1.7306, + "step": 994 + }, + { + "epoch": 0.5969880602387952, + "grad_norm": 1.6328125, + "learning_rate": 1.6633663366336631e-06, + "loss": 1.6436, + "step": 995 + }, + { + "epoch": 0.5975880482390352, + "grad_norm": 2.140625, + "learning_rate": 1.6608910891089107e-06, + "loss": 1.6222, + "step": 996 + }, + { + "epoch": 0.5981880362392752, + "grad_norm": 1.640625, + "learning_rate": 1.6584158415841582e-06, + "loss": 1.6213, + "step": 997 + }, + { + "epoch": 0.5987880242395152, + "grad_norm": 1.5859375, + "learning_rate": 1.6559405940594058e-06, + "loss": 1.6387, + "step": 998 + }, + { + "epoch": 0.5993880122397552, + "grad_norm": 1.734375, + "learning_rate": 1.6534653465346533e-06, + "loss": 1.6206, + "step": 999 + }, + { + "epoch": 0.5999880002399952, + "grad_norm": 1.546875, + "learning_rate": 1.6509900990099009e-06, + "loss": 1.633, + "step": 1000 + }, + { + "epoch": 0.5999880002399952, + "eval_loss": 1.7774205207824707, + "eval_model_preparation_time": 0.0037, + "eval_runtime": 66.1167, + "eval_samples_per_second": 151.248, + "eval_steps_per_second": 25.213, + "step": 1000 + }, + { + "epoch": 0.6005879882402352, + "grad_norm": 1.671875, + "learning_rate": 1.6485148514851484e-06, + "loss": 1.7197, + "step": 1001 + }, + { + "epoch": 0.6011879762404752, + "grad_norm": 1.640625, + "learning_rate": 1.6460396039603958e-06, + "loss": 1.6299, + "step": 1002 + }, + { + "epoch": 0.6017879642407152, + "grad_norm": 1.6875, + "learning_rate": 1.6435643564356433e-06, + "loss": 1.6693, + "step": 1003 + }, + { + "epoch": 0.6023879522409552, + "grad_norm": 1.59375, + "learning_rate": 1.641089108910891e-06, + "loss": 1.6508, + "step": 1004 + }, + { + "epoch": 0.6029879402411952, + "grad_norm": 1.8046875, + "learning_rate": 1.6386138613861385e-06, + "loss": 1.6538, + "step": 1005 + }, + { + "epoch": 0.6035879282414351, + "grad_norm": 1.875, + "learning_rate": 1.636138613861386e-06, + "loss": 1.624, + "step": 1006 + }, + { + "epoch": 0.6041879162416751, + "grad_norm": 1.6796875, + "learning_rate": 1.6336633663366336e-06, + "loss": 1.7978, + "step": 1007 + }, + { + "epoch": 0.6047879042419152, + "grad_norm": 1.953125, + "learning_rate": 1.631188118811881e-06, + "loss": 1.5949, + "step": 1008 + }, + { + "epoch": 0.6053878922421552, + "grad_norm": 1.6171875, + "learning_rate": 
1.6287128712871285e-06, + "loss": 1.71, + "step": 1009 + }, + { + "epoch": 0.6059878802423951, + "grad_norm": 1.6015625, + "learning_rate": 1.626237623762376e-06, + "loss": 1.7189, + "step": 1010 + }, + { + "epoch": 0.6065878682426351, + "grad_norm": 1.7109375, + "learning_rate": 1.6237623762376238e-06, + "loss": 1.6493, + "step": 1011 + }, + { + "epoch": 0.6071878562428752, + "grad_norm": 1.5625, + "learning_rate": 1.6212871287128713e-06, + "loss": 1.6566, + "step": 1012 + }, + { + "epoch": 0.6077878442431152, + "grad_norm": 1.7421875, + "learning_rate": 1.6188118811881189e-06, + "loss": 1.6018, + "step": 1013 + }, + { + "epoch": 0.6083878322433551, + "grad_norm": 2.046875, + "learning_rate": 1.6163366336633664e-06, + "loss": 1.6503, + "step": 1014 + }, + { + "epoch": 0.6089878202435951, + "grad_norm": 1.6640625, + "learning_rate": 1.6138613861386138e-06, + "loss": 1.6243, + "step": 1015 + }, + { + "epoch": 0.6095878082438351, + "grad_norm": 1.6328125, + "learning_rate": 1.6113861386138613e-06, + "loss": 1.6594, + "step": 1016 + }, + { + "epoch": 0.6101877962440752, + "grad_norm": 1.5859375, + "learning_rate": 1.608910891089109e-06, + "loss": 1.6897, + "step": 1017 + }, + { + "epoch": 0.6107877842443151, + "grad_norm": 1.59375, + "learning_rate": 1.6064356435643565e-06, + "loss": 1.692, + "step": 1018 + }, + { + "epoch": 0.6113877722445551, + "grad_norm": 1.5859375, + "learning_rate": 1.603960396039604e-06, + "loss": 1.6694, + "step": 1019 + }, + { + "epoch": 0.6119877602447951, + "grad_norm": 1.75, + "learning_rate": 1.6014851485148516e-06, + "loss": 1.7478, + "step": 1020 + }, + { + "epoch": 0.6125877482450351, + "grad_norm": 1.703125, + "learning_rate": 1.5990099009900991e-06, + "loss": 1.5819, + "step": 1021 + }, + { + "epoch": 0.6131877362452751, + "grad_norm": 1.8125, + "learning_rate": 1.5965346534653465e-06, + "loss": 1.6587, + "step": 1022 + }, + { + "epoch": 0.6137877242455151, + "grad_norm": 1.4765625, + "learning_rate": 1.594059405940594e-06, + "loss": 1.6205, + "step": 1023 + }, + { + "epoch": 0.6143877122457551, + "grad_norm": 1.671875, + "learning_rate": 1.5915841584158416e-06, + "loss": 1.6359, + "step": 1024 + }, + { + "epoch": 0.614987700245995, + "grad_norm": 1.6171875, + "learning_rate": 1.5891089108910891e-06, + "loss": 1.6837, + "step": 1025 + }, + { + "epoch": 0.6155876882462351, + "grad_norm": 1.5546875, + "learning_rate": 1.5866336633663367e-06, + "loss": 1.6778, + "step": 1026 + }, + { + "epoch": 0.6161876762464751, + "grad_norm": 1.9453125, + "learning_rate": 1.5841584158415842e-06, + "loss": 1.6798, + "step": 1027 + }, + { + "epoch": 0.616787664246715, + "grad_norm": 1.5546875, + "learning_rate": 1.5816831683168316e-06, + "loss": 1.5861, + "step": 1028 + }, + { + "epoch": 0.617387652246955, + "grad_norm": 1.625, + "learning_rate": 1.5792079207920791e-06, + "loss": 1.6128, + "step": 1029 + }, + { + "epoch": 0.6179876402471951, + "grad_norm": 1.7578125, + "learning_rate": 1.5767326732673267e-06, + "loss": 1.6962, + "step": 1030 + }, + { + "epoch": 0.6185876282474351, + "grad_norm": 1.6875, + "learning_rate": 1.5742574257425742e-06, + "loss": 1.5868, + "step": 1031 + }, + { + "epoch": 0.619187616247675, + "grad_norm": 1.859375, + "learning_rate": 1.5717821782178218e-06, + "loss": 1.627, + "step": 1032 + }, + { + "epoch": 0.619787604247915, + "grad_norm": 1.71875, + "learning_rate": 1.5693069306930694e-06, + "loss": 1.6367, + "step": 1033 + }, + { + "epoch": 0.620387592248155, + "grad_norm": 1.65625, + "learning_rate": 1.566831683168317e-06, + "loss": 1.6377, + 
"step": 1034 + }, + { + "epoch": 0.6209875802483951, + "grad_norm": 1.71875, + "learning_rate": 1.5643564356435643e-06, + "loss": 1.6437, + "step": 1035 + }, + { + "epoch": 0.621587568248635, + "grad_norm": 1.625, + "learning_rate": 1.5618811881188118e-06, + "loss": 1.6799, + "step": 1036 + }, + { + "epoch": 0.622187556248875, + "grad_norm": 1.6640625, + "learning_rate": 1.5594059405940594e-06, + "loss": 1.6935, + "step": 1037 + }, + { + "epoch": 0.622787544249115, + "grad_norm": 1.6171875, + "learning_rate": 1.556930693069307e-06, + "loss": 1.782, + "step": 1038 + }, + { + "epoch": 0.6233875322493551, + "grad_norm": 1.8359375, + "learning_rate": 1.5544554455445545e-06, + "loss": 1.7106, + "step": 1039 + }, + { + "epoch": 0.623987520249595, + "grad_norm": 1.6328125, + "learning_rate": 1.551980198019802e-06, + "loss": 1.6429, + "step": 1040 + }, + { + "epoch": 0.624587508249835, + "grad_norm": 1.6328125, + "learning_rate": 1.5495049504950496e-06, + "loss": 1.6784, + "step": 1041 + }, + { + "epoch": 0.625187496250075, + "grad_norm": 1.671875, + "learning_rate": 1.547029702970297e-06, + "loss": 1.6739, + "step": 1042 + }, + { + "epoch": 0.6257874842503149, + "grad_norm": 1.8671875, + "learning_rate": 1.5445544554455445e-06, + "loss": 1.6582, + "step": 1043 + }, + { + "epoch": 0.626387472250555, + "grad_norm": 1.640625, + "learning_rate": 1.542079207920792e-06, + "loss": 1.6651, + "step": 1044 + }, + { + "epoch": 0.626987460250795, + "grad_norm": 1.6328125, + "learning_rate": 1.5396039603960396e-06, + "loss": 1.6378, + "step": 1045 + }, + { + "epoch": 0.627587448251035, + "grad_norm": 1.625, + "learning_rate": 1.5371287128712871e-06, + "loss": 1.7835, + "step": 1046 + }, + { + "epoch": 0.6281874362512749, + "grad_norm": 1.734375, + "learning_rate": 1.5346534653465347e-06, + "loss": 1.7662, + "step": 1047 + }, + { + "epoch": 0.628787424251515, + "grad_norm": 1.59375, + "learning_rate": 1.532178217821782e-06, + "loss": 1.6731, + "step": 1048 + }, + { + "epoch": 0.629387412251755, + "grad_norm": 1.59375, + "learning_rate": 1.5297029702970296e-06, + "loss": 1.671, + "step": 1049 + }, + { + "epoch": 0.629987400251995, + "grad_norm": 1.6484375, + "learning_rate": 1.5272277227722771e-06, + "loss": 1.7948, + "step": 1050 + }, + { + "epoch": 0.629987400251995, + "eval_loss": 1.777391791343689, + "eval_model_preparation_time": 0.0037, + "eval_runtime": 69.2838, + "eval_samples_per_second": 144.334, + "eval_steps_per_second": 24.06, + "step": 1050 + }, + { + "epoch": 0.6305873882522349, + "grad_norm": 1.90625, + "learning_rate": 1.5247524752475247e-06, + "loss": 1.6642, + "step": 1051 + }, + { + "epoch": 0.6311873762524749, + "grad_norm": 1.6640625, + "learning_rate": 1.5222772277227723e-06, + "loss": 1.6911, + "step": 1052 + }, + { + "epoch": 0.631787364252715, + "grad_norm": 1.65625, + "learning_rate": 1.5198019801980198e-06, + "loss": 1.5992, + "step": 1053 + }, + { + "epoch": 0.632387352252955, + "grad_norm": 1.7421875, + "learning_rate": 1.5173267326732674e-06, + "loss": 1.6522, + "step": 1054 + }, + { + "epoch": 0.6329873402531949, + "grad_norm": 1.7734375, + "learning_rate": 1.5148514851485147e-06, + "loss": 1.6688, + "step": 1055 + }, + { + "epoch": 0.6335873282534349, + "grad_norm": 1.6796875, + "learning_rate": 1.5123762376237623e-06, + "loss": 1.6327, + "step": 1056 + }, + { + "epoch": 0.634187316253675, + "grad_norm": 1.59375, + "learning_rate": 1.5099009900990098e-06, + "loss": 1.6012, + "step": 1057 + }, + { + "epoch": 0.6347873042539149, + "grad_norm": 1.5625, + "learning_rate": 
1.5074257425742574e-06, + "loss": 1.7478, + "step": 1058 + }, + { + "epoch": 0.6353872922541549, + "grad_norm": 1.7265625, + "learning_rate": 1.504950495049505e-06, + "loss": 1.6642, + "step": 1059 + }, + { + "epoch": 0.6359872802543949, + "grad_norm": 1.875, + "learning_rate": 1.5024752475247525e-06, + "loss": 1.6575, + "step": 1060 + }, + { + "epoch": 0.6365872682546349, + "grad_norm": 1.625, + "learning_rate": 1.5e-06, + "loss": 1.682, + "step": 1061 + }, + { + "epoch": 0.6371872562548749, + "grad_norm": 1.640625, + "learning_rate": 1.4975247524752474e-06, + "loss": 1.6323, + "step": 1062 + }, + { + "epoch": 0.6377872442551149, + "grad_norm": 1.65625, + "learning_rate": 1.495049504950495e-06, + "loss": 1.6041, + "step": 1063 + }, + { + "epoch": 0.6383872322553549, + "grad_norm": 1.8203125, + "learning_rate": 1.4925742574257425e-06, + "loss": 1.6278, + "step": 1064 + }, + { + "epoch": 0.6389872202555948, + "grad_norm": 1.765625, + "learning_rate": 1.49009900990099e-06, + "loss": 1.6691, + "step": 1065 + }, + { + "epoch": 0.6395872082558349, + "grad_norm": 1.546875, + "learning_rate": 1.4876237623762376e-06, + "loss": 1.6173, + "step": 1066 + }, + { + "epoch": 0.6401871962560749, + "grad_norm": 1.7734375, + "learning_rate": 1.4851485148514852e-06, + "loss": 1.6719, + "step": 1067 + }, + { + "epoch": 0.6407871842563149, + "grad_norm": 1.6015625, + "learning_rate": 1.4826732673267325e-06, + "loss": 1.6717, + "step": 1068 + }, + { + "epoch": 0.6413871722565548, + "grad_norm": 1.5703125, + "learning_rate": 1.48019801980198e-06, + "loss": 1.6107, + "step": 1069 + }, + { + "epoch": 0.6419871602567948, + "grad_norm": 1.71875, + "learning_rate": 1.4777227722772276e-06, + "loss": 1.6747, + "step": 1070 + }, + { + "epoch": 0.6425871482570349, + "grad_norm": 1.59375, + "learning_rate": 1.4752475247524752e-06, + "loss": 1.5889, + "step": 1071 + }, + { + "epoch": 0.6431871362572749, + "grad_norm": 1.5859375, + "learning_rate": 1.4727722772277227e-06, + "loss": 1.6483, + "step": 1072 + }, + { + "epoch": 0.6437871242575148, + "grad_norm": 1.7578125, + "learning_rate": 1.4702970297029703e-06, + "loss": 1.713, + "step": 1073 + }, + { + "epoch": 0.6443871122577548, + "grad_norm": 1.6484375, + "learning_rate": 1.4678217821782178e-06, + "loss": 1.6684, + "step": 1074 + }, + { + "epoch": 0.6449871002579949, + "grad_norm": 1.6640625, + "learning_rate": 1.4653465346534652e-06, + "loss": 1.566, + "step": 1075 + }, + { + "epoch": 0.6455870882582349, + "grad_norm": 1.5546875, + "learning_rate": 1.4628712871287127e-06, + "loss": 1.6242, + "step": 1076 + }, + { + "epoch": 0.6461870762584748, + "grad_norm": 1.640625, + "learning_rate": 1.4603960396039603e-06, + "loss": 1.613, + "step": 1077 + }, + { + "epoch": 0.6467870642587148, + "grad_norm": 1.6640625, + "learning_rate": 1.4579207920792078e-06, + "loss": 1.595, + "step": 1078 + }, + { + "epoch": 0.6473870522589549, + "grad_norm": 1.640625, + "learning_rate": 1.4554455445544554e-06, + "loss": 1.6096, + "step": 1079 + }, + { + "epoch": 0.6479870402591948, + "grad_norm": 1.703125, + "learning_rate": 1.452970297029703e-06, + "loss": 1.6353, + "step": 1080 + }, + { + "epoch": 0.6485870282594348, + "grad_norm": 2.03125, + "learning_rate": 1.4504950495049503e-06, + "loss": 1.6858, + "step": 1081 + }, + { + "epoch": 0.6491870162596748, + "grad_norm": 1.609375, + "learning_rate": 1.4480198019801978e-06, + "loss": 1.6339, + "step": 1082 + }, + { + "epoch": 0.6497870042599148, + "grad_norm": 1.8515625, + "learning_rate": 1.4455445544554454e-06, + "loss": 1.6156, + "step": 
1083 + }, + { + "epoch": 0.6503869922601548, + "grad_norm": 1.703125, + "learning_rate": 1.443069306930693e-06, + "loss": 1.6629, + "step": 1084 + }, + { + "epoch": 0.6509869802603948, + "grad_norm": 1.734375, + "learning_rate": 1.4405940594059405e-06, + "loss": 1.6012, + "step": 1085 + }, + { + "epoch": 0.6515869682606348, + "grad_norm": 1.75, + "learning_rate": 1.438118811881188e-06, + "loss": 1.7112, + "step": 1086 + }, + { + "epoch": 0.6521869562608748, + "grad_norm": 1.6796875, + "learning_rate": 1.4356435643564356e-06, + "loss": 1.7422, + "step": 1087 + }, + { + "epoch": 0.6527869442611148, + "grad_norm": 1.7421875, + "learning_rate": 1.433168316831683e-06, + "loss": 1.5988, + "step": 1088 + }, + { + "epoch": 0.6533869322613548, + "grad_norm": 1.6640625, + "learning_rate": 1.4306930693069305e-06, + "loss": 1.631, + "step": 1089 + }, + { + "epoch": 0.6539869202615948, + "grad_norm": 1.84375, + "learning_rate": 1.428217821782178e-06, + "loss": 1.7748, + "step": 1090 + }, + { + "epoch": 0.6545869082618347, + "grad_norm": 1.6875, + "learning_rate": 1.4257425742574256e-06, + "loss": 1.6101, + "step": 1091 + }, + { + "epoch": 0.6551868962620747, + "grad_norm": 1.578125, + "learning_rate": 1.4232673267326732e-06, + "loss": 1.6727, + "step": 1092 + }, + { + "epoch": 0.6557868842623148, + "grad_norm": 1.765625, + "learning_rate": 1.4207920792079207e-06, + "loss": 1.656, + "step": 1093 + }, + { + "epoch": 0.6563868722625548, + "grad_norm": 1.84375, + "learning_rate": 1.4183168316831683e-06, + "loss": 1.7244, + "step": 1094 + }, + { + "epoch": 0.6569868602627947, + "grad_norm": 1.703125, + "learning_rate": 1.4158415841584156e-06, + "loss": 1.6891, + "step": 1095 + }, + { + "epoch": 0.6575868482630347, + "grad_norm": 1.7265625, + "learning_rate": 1.4133663366336632e-06, + "loss": 1.7265, + "step": 1096 + }, + { + "epoch": 0.6581868362632748, + "grad_norm": 1.6796875, + "learning_rate": 1.4108910891089107e-06, + "loss": 1.5939, + "step": 1097 + }, + { + "epoch": 0.6587868242635148, + "grad_norm": 1.765625, + "learning_rate": 1.4084158415841583e-06, + "loss": 1.6942, + "step": 1098 + }, + { + "epoch": 0.6593868122637547, + "grad_norm": 1.6796875, + "learning_rate": 1.4059405940594058e-06, + "loss": 1.5412, + "step": 1099 + }, + { + "epoch": 0.6599868002639947, + "grad_norm": 1.5703125, + "learning_rate": 1.4034653465346534e-06, + "loss": 1.5955, + "step": 1100 + }, + { + "epoch": 0.6599868002639947, + "eval_loss": 1.7773131132125854, + "eval_model_preparation_time": 0.0037, + "eval_runtime": 66.1015, + "eval_samples_per_second": 151.283, + "eval_steps_per_second": 25.219, + "step": 1100 + }, + { + "epoch": 0.6605867882642347, + "grad_norm": 1.65625, + "learning_rate": 1.4009900990099007e-06, + "loss": 1.665, + "step": 1101 + }, + { + "epoch": 0.6611867762644748, + "grad_norm": 1.515625, + "learning_rate": 1.3985148514851483e-06, + "loss": 1.6523, + "step": 1102 + }, + { + "epoch": 0.6617867642647147, + "grad_norm": 1.640625, + "learning_rate": 1.3960396039603959e-06, + "loss": 1.5928, + "step": 1103 + }, + { + "epoch": 0.6623867522649547, + "grad_norm": 1.6796875, + "learning_rate": 1.3935643564356434e-06, + "loss": 1.5513, + "step": 1104 + }, + { + "epoch": 0.6629867402651947, + "grad_norm": 1.8359375, + "learning_rate": 1.391089108910891e-06, + "loss": 1.7027, + "step": 1105 + }, + { + "epoch": 0.6635867282654347, + "grad_norm": 1.65625, + "learning_rate": 1.3886138613861385e-06, + "loss": 1.6501, + "step": 1106 + }, + { + "epoch": 0.6641867162656747, + "grad_norm": 1.5625, + "learning_rate": 
1.386138613861386e-06, + "loss": 1.6262, + "step": 1107 + }, + { + "epoch": 0.6647867042659147, + "grad_norm": 1.546875, + "learning_rate": 1.3836633663366334e-06, + "loss": 1.6506, + "step": 1108 + }, + { + "epoch": 0.6653866922661547, + "grad_norm": 1.625, + "learning_rate": 1.381188118811881e-06, + "loss": 1.6444, + "step": 1109 + }, + { + "epoch": 0.6659866802663946, + "grad_norm": 1.640625, + "learning_rate": 1.3787128712871285e-06, + "loss": 1.6403, + "step": 1110 + }, + { + "epoch": 0.6665866682666347, + "grad_norm": 1.515625, + "learning_rate": 1.376237623762376e-06, + "loss": 1.6624, + "step": 1111 + }, + { + "epoch": 0.6671866562668747, + "grad_norm": 1.8515625, + "learning_rate": 1.3737623762376238e-06, + "loss": 1.7689, + "step": 1112 + }, + { + "epoch": 0.6677866442671146, + "grad_norm": 1.7421875, + "learning_rate": 1.3712871287128714e-06, + "loss": 1.661, + "step": 1113 + }, + { + "epoch": 0.6683866322673546, + "grad_norm": 2.03125, + "learning_rate": 1.368811881188119e-06, + "loss": 1.6815, + "step": 1114 + }, + { + "epoch": 0.6689866202675947, + "grad_norm": 1.6484375, + "learning_rate": 1.3663366336633663e-06, + "loss": 1.6806, + "step": 1115 + }, + { + "epoch": 0.6695866082678347, + "grad_norm": 1.625, + "learning_rate": 1.3638613861386139e-06, + "loss": 1.6907, + "step": 1116 + }, + { + "epoch": 0.6701865962680746, + "grad_norm": 1.6796875, + "learning_rate": 1.3613861386138614e-06, + "loss": 1.6261, + "step": 1117 + }, + { + "epoch": 0.6707865842683146, + "grad_norm": 1.6171875, + "learning_rate": 1.358910891089109e-06, + "loss": 1.7169, + "step": 1118 + }, + { + "epoch": 0.6713865722685546, + "grad_norm": 1.7734375, + "learning_rate": 1.3564356435643565e-06, + "loss": 1.5983, + "step": 1119 + }, + { + "epoch": 0.6719865602687947, + "grad_norm": 1.6328125, + "learning_rate": 1.353960396039604e-06, + "loss": 1.6789, + "step": 1120 + }, + { + "epoch": 0.6725865482690346, + "grad_norm": 1.703125, + "learning_rate": 1.3514851485148514e-06, + "loss": 1.6336, + "step": 1121 + }, + { + "epoch": 0.6731865362692746, + "grad_norm": 1.671875, + "learning_rate": 1.349009900990099e-06, + "loss": 1.7563, + "step": 1122 + }, + { + "epoch": 0.6737865242695146, + "grad_norm": 1.7578125, + "learning_rate": 1.3465346534653465e-06, + "loss": 1.6033, + "step": 1123 + }, + { + "epoch": 0.6743865122697547, + "grad_norm": 1.6640625, + "learning_rate": 1.344059405940594e-06, + "loss": 1.6741, + "step": 1124 + }, + { + "epoch": 0.6749865002699946, + "grad_norm": 1.5234375, + "learning_rate": 1.3415841584158416e-06, + "loss": 1.5703, + "step": 1125 + }, + { + "epoch": 0.6755864882702346, + "grad_norm": 1.5546875, + "learning_rate": 1.3391089108910892e-06, + "loss": 1.7107, + "step": 1126 + }, + { + "epoch": 0.6761864762704746, + "grad_norm": 1.640625, + "learning_rate": 1.3366336633663367e-06, + "loss": 1.6928, + "step": 1127 + }, + { + "epoch": 0.6767864642707145, + "grad_norm": 1.640625, + "learning_rate": 1.334158415841584e-06, + "loss": 1.6344, + "step": 1128 + }, + { + "epoch": 0.6773864522709546, + "grad_norm": 1.6015625, + "learning_rate": 1.3316831683168316e-06, + "loss": 1.5788, + "step": 1129 + }, + { + "epoch": 0.6779864402711946, + "grad_norm": 1.609375, + "learning_rate": 1.3292079207920792e-06, + "loss": 1.6234, + "step": 1130 + }, + { + "epoch": 0.6785864282714346, + "grad_norm": 1.8046875, + "learning_rate": 1.3267326732673268e-06, + "loss": 1.6987, + "step": 1131 + }, + { + "epoch": 0.6791864162716745, + "grad_norm": 1.7265625, + "learning_rate": 1.3242574257425743e-06, + 
"loss": 1.5971, + "step": 1132 + }, + { + "epoch": 0.6797864042719146, + "grad_norm": 1.75, + "learning_rate": 1.3217821782178219e-06, + "loss": 1.6059, + "step": 1133 + }, + { + "epoch": 0.6803863922721546, + "grad_norm": 1.6640625, + "learning_rate": 1.3193069306930692e-06, + "loss": 1.6445, + "step": 1134 + }, + { + "epoch": 0.6809863802723946, + "grad_norm": 1.828125, + "learning_rate": 1.3168316831683168e-06, + "loss": 1.6312, + "step": 1135 + }, + { + "epoch": 0.6815863682726345, + "grad_norm": 1.7578125, + "learning_rate": 1.3143564356435643e-06, + "loss": 1.6458, + "step": 1136 + }, + { + "epoch": 0.6821863562728745, + "grad_norm": 1.796875, + "learning_rate": 1.3118811881188119e-06, + "loss": 1.6534, + "step": 1137 + }, + { + "epoch": 0.6827863442731146, + "grad_norm": 1.515625, + "learning_rate": 1.3094059405940594e-06, + "loss": 1.6041, + "step": 1138 + }, + { + "epoch": 0.6833863322733545, + "grad_norm": 1.921875, + "learning_rate": 1.306930693069307e-06, + "loss": 1.7073, + "step": 1139 + }, + { + "epoch": 0.6839863202735945, + "grad_norm": 1.671875, + "learning_rate": 1.3044554455445545e-06, + "loss": 1.7033, + "step": 1140 + }, + { + "epoch": 0.6845863082738345, + "grad_norm": 1.546875, + "learning_rate": 1.3019801980198019e-06, + "loss": 1.6317, + "step": 1141 + }, + { + "epoch": 0.6851862962740746, + "grad_norm": 1.734375, + "learning_rate": 1.2995049504950494e-06, + "loss": 1.6607, + "step": 1142 + }, + { + "epoch": 0.6857862842743145, + "grad_norm": 1.609375, + "learning_rate": 1.297029702970297e-06, + "loss": 1.6282, + "step": 1143 + }, + { + "epoch": 0.6863862722745545, + "grad_norm": 1.7109375, + "learning_rate": 1.2945544554455445e-06, + "loss": 1.7765, + "step": 1144 + }, + { + "epoch": 0.6869862602747945, + "grad_norm": 1.640625, + "learning_rate": 1.292079207920792e-06, + "loss": 1.6843, + "step": 1145 + }, + { + "epoch": 0.6875862482750345, + "grad_norm": 1.5859375, + "learning_rate": 1.2896039603960396e-06, + "loss": 1.6195, + "step": 1146 + }, + { + "epoch": 0.6881862362752745, + "grad_norm": 1.6328125, + "learning_rate": 1.2871287128712872e-06, + "loss": 1.6546, + "step": 1147 + }, + { + "epoch": 0.6887862242755145, + "grad_norm": 1.59375, + "learning_rate": 1.2846534653465345e-06, + "loss": 1.6723, + "step": 1148 + }, + { + "epoch": 0.6893862122757545, + "grad_norm": 1.6640625, + "learning_rate": 1.282178217821782e-06, + "loss": 1.6211, + "step": 1149 + }, + { + "epoch": 0.6899862002759944, + "grad_norm": 1.5625, + "learning_rate": 1.2797029702970297e-06, + "loss": 1.7107, + "step": 1150 + }, + { + "epoch": 0.6899862002759944, + "eval_loss": 1.7773100137710571, + "eval_model_preparation_time": 0.0037, + "eval_runtime": 66.2071, + "eval_samples_per_second": 151.041, + "eval_steps_per_second": 25.179, + "step": 1150 + }, + { + "epoch": 0.6905861882762345, + "grad_norm": 1.6796875, + "learning_rate": 1.2772277227722772e-06, + "loss": 1.6873, + "step": 1151 + }, + { + "epoch": 0.6911861762764745, + "grad_norm": 1.5859375, + "learning_rate": 1.2747524752475248e-06, + "loss": 1.5066, + "step": 1152 + }, + { + "epoch": 0.6917861642767145, + "grad_norm": 1.640625, + "learning_rate": 1.2722772277227723e-06, + "loss": 1.5879, + "step": 1153 + }, + { + "epoch": 0.6923861522769544, + "grad_norm": 1.5390625, + "learning_rate": 1.2698019801980197e-06, + "loss": 1.7176, + "step": 1154 + }, + { + "epoch": 0.6929861402771944, + "grad_norm": 1.5625, + "learning_rate": 1.2673267326732672e-06, + "loss": 1.6146, + "step": 1155 + }, + { + "epoch": 0.6935861282774345, + 
"grad_norm": 1.8359375, + "learning_rate": 1.2648514851485148e-06, + "loss": 1.6275, + "step": 1156 + }, + { + "epoch": 0.6941861162776745, + "grad_norm": 1.6171875, + "learning_rate": 1.2623762376237623e-06, + "loss": 1.6011, + "step": 1157 + }, + { + "epoch": 0.6947861042779144, + "grad_norm": 1.6171875, + "learning_rate": 1.2599009900990099e-06, + "loss": 1.7129, + "step": 1158 + }, + { + "epoch": 0.6953860922781544, + "grad_norm": 1.671875, + "learning_rate": 1.2574257425742574e-06, + "loss": 1.6008, + "step": 1159 + }, + { + "epoch": 0.6959860802783945, + "grad_norm": 1.7265625, + "learning_rate": 1.254950495049505e-06, + "loss": 1.6409, + "step": 1160 + }, + { + "epoch": 0.6965860682786345, + "grad_norm": 1.7578125, + "learning_rate": 1.2524752475247523e-06, + "loss": 1.7206, + "step": 1161 + }, + { + "epoch": 0.6971860562788744, + "grad_norm": 1.6484375, + "learning_rate": 1.2499999999999999e-06, + "loss": 1.6677, + "step": 1162 + }, + { + "epoch": 0.6977860442791144, + "grad_norm": 1.71875, + "learning_rate": 1.2475247524752474e-06, + "loss": 1.4656, + "step": 1163 + }, + { + "epoch": 0.6983860322793544, + "grad_norm": 1.703125, + "learning_rate": 1.245049504950495e-06, + "loss": 1.6027, + "step": 1164 + }, + { + "epoch": 0.6989860202795944, + "grad_norm": 1.6640625, + "learning_rate": 1.2425742574257426e-06, + "loss": 1.7318, + "step": 1165 + }, + { + "epoch": 0.6995860082798344, + "grad_norm": 1.734375, + "learning_rate": 1.2400990099009901e-06, + "loss": 1.6006, + "step": 1166 + }, + { + "epoch": 0.7001859962800744, + "grad_norm": 1.6328125, + "learning_rate": 1.2376237623762375e-06, + "loss": 1.6734, + "step": 1167 + }, + { + "epoch": 0.7007859842803144, + "grad_norm": 1.7421875, + "learning_rate": 1.235148514851485e-06, + "loss": 1.7149, + "step": 1168 + }, + { + "epoch": 0.7013859722805544, + "grad_norm": 1.8515625, + "learning_rate": 1.2326732673267326e-06, + "loss": 1.7003, + "step": 1169 + }, + { + "epoch": 0.7019859602807944, + "grad_norm": 1.578125, + "learning_rate": 1.2301980198019801e-06, + "loss": 1.5693, + "step": 1170 + }, + { + "epoch": 0.7025859482810344, + "grad_norm": 1.6015625, + "learning_rate": 1.2277227722772277e-06, + "loss": 1.6384, + "step": 1171 + }, + { + "epoch": 0.7031859362812743, + "grad_norm": 1.6953125, + "learning_rate": 1.2252475247524752e-06, + "loss": 1.6105, + "step": 1172 + }, + { + "epoch": 0.7037859242815143, + "grad_norm": 1.75, + "learning_rate": 1.2227722772277228e-06, + "loss": 1.5411, + "step": 1173 + }, + { + "epoch": 0.7043859122817544, + "grad_norm": 1.6796875, + "learning_rate": 1.2202970297029701e-06, + "loss": 1.677, + "step": 1174 + }, + { + "epoch": 0.7049859002819944, + "grad_norm": 1.6640625, + "learning_rate": 1.2178217821782177e-06, + "loss": 1.7706, + "step": 1175 + }, + { + "epoch": 0.7055858882822343, + "grad_norm": 1.4921875, + "learning_rate": 1.2153465346534652e-06, + "loss": 1.666, + "step": 1176 + }, + { + "epoch": 0.7061858762824743, + "grad_norm": 1.578125, + "learning_rate": 1.2128712871287128e-06, + "loss": 1.6809, + "step": 1177 + }, + { + "epoch": 0.7067858642827144, + "grad_norm": 1.6171875, + "learning_rate": 1.2103960396039603e-06, + "loss": 1.6347, + "step": 1178 + }, + { + "epoch": 0.7073858522829544, + "grad_norm": 1.640625, + "learning_rate": 1.207920792079208e-06, + "loss": 1.7277, + "step": 1179 + }, + { + "epoch": 0.7079858402831943, + "grad_norm": 1.75, + "learning_rate": 1.2054455445544555e-06, + "loss": 1.5418, + "step": 1180 + }, + { + "epoch": 0.7085858282834343, + "grad_norm": 1.5078125, + 
"learning_rate": 1.2029702970297028e-06, + "loss": 1.6748, + "step": 1181 + }, + { + "epoch": 0.7091858162836743, + "grad_norm": 1.6953125, + "learning_rate": 1.2004950495049504e-06, + "loss": 1.6577, + "step": 1182 + }, + { + "epoch": 0.7097858042839144, + "grad_norm": 1.6796875, + "learning_rate": 1.198019801980198e-06, + "loss": 1.7349, + "step": 1183 + }, + { + "epoch": 0.7103857922841543, + "grad_norm": 1.65625, + "learning_rate": 1.1955445544554455e-06, + "loss": 1.5652, + "step": 1184 + }, + { + "epoch": 0.7109857802843943, + "grad_norm": 1.6875, + "learning_rate": 1.193069306930693e-06, + "loss": 1.6844, + "step": 1185 + }, + { + "epoch": 0.7115857682846343, + "grad_norm": 1.625, + "learning_rate": 1.1905940594059406e-06, + "loss": 1.6704, + "step": 1186 + }, + { + "epoch": 0.7121857562848743, + "grad_norm": 1.6953125, + "learning_rate": 1.188118811881188e-06, + "loss": 1.6941, + "step": 1187 + }, + { + "epoch": 0.7127857442851143, + "grad_norm": 1.609375, + "learning_rate": 1.1856435643564355e-06, + "loss": 1.6569, + "step": 1188 + }, + { + "epoch": 0.7133857322853543, + "grad_norm": 1.7578125, + "learning_rate": 1.183168316831683e-06, + "loss": 1.749, + "step": 1189 + }, + { + "epoch": 0.7139857202855943, + "grad_norm": 1.5703125, + "learning_rate": 1.1806930693069306e-06, + "loss": 1.6929, + "step": 1190 + }, + { + "epoch": 0.7145857082858342, + "grad_norm": 1.5859375, + "learning_rate": 1.1782178217821781e-06, + "loss": 1.645, + "step": 1191 + }, + { + "epoch": 0.7151856962860743, + "grad_norm": 1.59375, + "learning_rate": 1.1757425742574257e-06, + "loss": 1.6773, + "step": 1192 + }, + { + "epoch": 0.7157856842863143, + "grad_norm": 1.6640625, + "learning_rate": 1.1732673267326732e-06, + "loss": 1.7084, + "step": 1193 + }, + { + "epoch": 0.7163856722865543, + "grad_norm": 1.59375, + "learning_rate": 1.1707920792079206e-06, + "loss": 1.5868, + "step": 1194 + }, + { + "epoch": 0.7169856602867942, + "grad_norm": 1.7734375, + "learning_rate": 1.1683168316831681e-06, + "loss": 1.5783, + "step": 1195 + }, + { + "epoch": 0.7175856482870343, + "grad_norm": 1.7421875, + "learning_rate": 1.1658415841584157e-06, + "loss": 1.7283, + "step": 1196 + }, + { + "epoch": 0.7181856362872743, + "grad_norm": 1.640625, + "learning_rate": 1.1633663366336632e-06, + "loss": 1.6033, + "step": 1197 + }, + { + "epoch": 0.7187856242875142, + "grad_norm": 1.6640625, + "learning_rate": 1.1608910891089108e-06, + "loss": 1.7633, + "step": 1198 + }, + { + "epoch": 0.7193856122877542, + "grad_norm": 1.890625, + "learning_rate": 1.1584158415841584e-06, + "loss": 1.6804, + "step": 1199 + }, + { + "epoch": 0.7199856002879942, + "grad_norm": 1.6328125, + "learning_rate": 1.1559405940594057e-06, + "loss": 1.5985, + "step": 1200 + }, + { + "epoch": 0.7199856002879942, + "eval_loss": 1.777273416519165, + "eval_model_preparation_time": 0.0037, + "eval_runtime": 68.7207, + "eval_samples_per_second": 145.517, + "eval_steps_per_second": 24.258, + "step": 1200 + } + ], + "logging_steps": 1, + "max_steps": 1666, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.356578165633057e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}