diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5060 @@ +{ + "best_global_step": 700, + "best_metric": 2.0324764251708984, + "best_model_checkpoint": "./output_dir/fr-Llama-3.1-8B-lr4e-06-atten0.25-ffn0.25_20250430_122245/checkpoint-700", + "epoch": 0.41999160016799664, + "eval_steps": 50, + "global_step": 700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005999880002399952, + "grad_norm": 1.9765625, + "learning_rate": 0.0, + "loss": 1.9517, + "step": 1 + }, + { + "epoch": 0.0011999760004799903, + "grad_norm": 1.7265625, + "learning_rate": 8e-08, + "loss": 1.8084, + "step": 2 + }, + { + "epoch": 0.0017999640007199855, + "grad_norm": 2.046875, + "learning_rate": 1.6e-07, + "loss": 1.8214, + "step": 3 + }, + { + "epoch": 0.0023999520009599807, + "grad_norm": 1.7109375, + "learning_rate": 2.4e-07, + "loss": 1.8709, + "step": 4 + }, + { + "epoch": 0.002999940001199976, + "grad_norm": 2.125, + "learning_rate": 3.2e-07, + "loss": 1.8741, + "step": 5 + }, + { + "epoch": 0.003599928001439971, + "grad_norm": 2.015625, + "learning_rate": 4e-07, + "loss": 1.8875, + "step": 6 + }, + { + "epoch": 0.004199916001679967, + "grad_norm": 1.8359375, + "learning_rate": 4.8e-07, + "loss": 1.9111, + "step": 7 + }, + { + "epoch": 0.004799904001919961, + "grad_norm": 1.6875, + "learning_rate": 5.6e-07, + "loss": 1.9349, + "step": 8 + }, + { + "epoch": 0.005399892002159957, + "grad_norm": 1.8671875, + "learning_rate": 6.4e-07, + "loss": 1.891, + "step": 9 + }, + { + "epoch": 0.005999880002399952, + "grad_norm": 2.65625, + "learning_rate": 7.2e-07, + "loss": 1.9387, + "step": 10 + }, + { + "epoch": 0.006599868002639947, + "grad_norm": 1.8203125, + "learning_rate": 8e-07, + "loss": 1.8719, + "step": 11 + }, + { + "epoch": 0.007199856002879942, + "grad_norm": 1.71875, + "learning_rate": 8.799999999999999e-07, + "loss": 2.0476, + "step": 12 + }, + { + "epoch": 0.007799844003119938, + "grad_norm": 1.828125, + "learning_rate": 9.6e-07, + "loss": 1.8709, + "step": 13 + }, + { + "epoch": 0.008399832003359933, + "grad_norm": 1.9375, + "learning_rate": 1.04e-06, + "loss": 1.83, + "step": 14 + }, + { + "epoch": 0.008999820003599928, + "grad_norm": 1.796875, + "learning_rate": 1.12e-06, + "loss": 1.8584, + "step": 15 + }, + { + "epoch": 0.009599808003839923, + "grad_norm": 1.9375, + "learning_rate": 1.2e-06, + "loss": 1.9224, + "step": 16 + }, + { + "epoch": 0.01019979600407992, + "grad_norm": 1.953125, + "learning_rate": 1.28e-06, + "loss": 1.8243, + "step": 17 + }, + { + "epoch": 0.010799784004319914, + "grad_norm": 2.0625, + "learning_rate": 1.3600000000000001e-06, + "loss": 1.9473, + "step": 18 + }, + { + "epoch": 0.011399772004559909, + "grad_norm": 1.8828125, + "learning_rate": 1.44e-06, + "loss": 1.9006, + "step": 19 + }, + { + "epoch": 0.011999760004799903, + "grad_norm": 2.15625, + "learning_rate": 1.5199999999999998e-06, + "loss": 1.8361, + "step": 20 + }, + { + "epoch": 0.0125997480050399, + "grad_norm": 1.9140625, + "learning_rate": 1.6e-06, + "loss": 1.9316, + "step": 21 + }, + { + "epoch": 0.013199736005279895, + "grad_norm": 2.015625, + "learning_rate": 1.6799999999999998e-06, + "loss": 1.8865, + "step": 22 + }, + { + "epoch": 0.01379972400551989, + "grad_norm": 1.9453125, + "learning_rate": 1.7599999999999999e-06, + "loss": 1.9309, + "step": 23 + }, + { + "epoch": 0.014399712005759884, + "grad_norm": 2.0625, + "learning_rate": 1.84e-06, + "loss": 1.8815, + "step": 24 + }, + { + "epoch": 0.01499970000599988, + "grad_norm": 1.9140625, + "learning_rate": 1.92e-06, + "loss": 1.8123, + "step": 25 + }, + { + "epoch": 0.015599688006239875, + "grad_norm": 1.703125, + "learning_rate": 2e-06, + "loss": 1.8855, + "step": 26 + }, + { + "epoch": 0.016199676006479872, + "grad_norm": 2.0, + "learning_rate": 2.08e-06, + "loss": 1.8313, + "step": 27 + }, + { + "epoch": 0.016799664006719867, + "grad_norm": 1.671875, + "learning_rate": 2.16e-06, + "loss": 1.8148, + "step": 28 + }, + { + "epoch": 0.01739965200695986, + "grad_norm": 1.7890625, + "learning_rate": 2.24e-06, + "loss": 1.9719, + "step": 29 + }, + { + "epoch": 0.017999640007199856, + "grad_norm": 2.1875, + "learning_rate": 2.32e-06, + "loss": 1.8331, + "step": 30 + }, + { + "epoch": 0.01859962800743985, + "grad_norm": 1.890625, + "learning_rate": 2.4e-06, + "loss": 1.9469, + "step": 31 + }, + { + "epoch": 0.019199616007679846, + "grad_norm": 1.6953125, + "learning_rate": 2.48e-06, + "loss": 1.884, + "step": 32 + }, + { + "epoch": 0.01979960400791984, + "grad_norm": 1.8203125, + "learning_rate": 2.56e-06, + "loss": 1.9041, + "step": 33 + }, + { + "epoch": 0.02039959200815984, + "grad_norm": 1.9609375, + "learning_rate": 2.64e-06, + "loss": 2.0232, + "step": 34 + }, + { + "epoch": 0.020999580008399833, + "grad_norm": 2.125, + "learning_rate": 2.7200000000000002e-06, + "loss": 1.832, + "step": 35 + }, + { + "epoch": 0.021599568008639828, + "grad_norm": 1.8984375, + "learning_rate": 2.8e-06, + "loss": 1.7751, + "step": 36 + }, + { + "epoch": 0.022199556008879823, + "grad_norm": 1.875, + "learning_rate": 2.88e-06, + "loss": 1.8467, + "step": 37 + }, + { + "epoch": 0.022799544009119817, + "grad_norm": 1.84375, + "learning_rate": 2.96e-06, + "loss": 1.7807, + "step": 38 + }, + { + "epoch": 0.023399532009359812, + "grad_norm": 1.9765625, + "learning_rate": 3.0399999999999997e-06, + "loss": 1.7781, + "step": 39 + }, + { + "epoch": 0.023999520009599807, + "grad_norm": 1.609375, + "learning_rate": 3.1199999999999998e-06, + "loss": 1.8486, + "step": 40 + }, + { + "epoch": 0.0245995080098398, + "grad_norm": 1.9296875, + "learning_rate": 3.2e-06, + "loss": 1.8328, + "step": 41 + }, + { + "epoch": 0.0251994960100798, + "grad_norm": 1.7265625, + "learning_rate": 3.2799999999999995e-06, + "loss": 2.0436, + "step": 42 + }, + { + "epoch": 0.025799484010319795, + "grad_norm": 1.8125, + "learning_rate": 3.3599999999999996e-06, + "loss": 1.9575, + "step": 43 + }, + { + "epoch": 0.02639947201055979, + "grad_norm": 1.8125, + "learning_rate": 3.4399999999999997e-06, + "loss": 1.9247, + "step": 44 + }, + { + "epoch": 0.026999460010799784, + "grad_norm": 1.734375, + "learning_rate": 3.5199999999999998e-06, + "loss": 1.9135, + "step": 45 + }, + { + "epoch": 0.02759944801103978, + "grad_norm": 1.78125, + "learning_rate": 3.6e-06, + "loss": 1.9564, + "step": 46 + }, + { + "epoch": 0.028199436011279774, + "grad_norm": 1.828125, + "learning_rate": 3.68e-06, + "loss": 1.8047, + "step": 47 + }, + { + "epoch": 0.02879942401151977, + "grad_norm": 1.5859375, + "learning_rate": 3.7599999999999996e-06, + "loss": 1.9223, + "step": 48 + }, + { + "epoch": 0.029399412011759767, + "grad_norm": 1.7109375, + "learning_rate": 3.84e-06, + "loss": 1.9724, + "step": 49 + }, + { + "epoch": 0.02999940001199976, + "grad_norm": 1.7890625, + "learning_rate": 3.92e-06, + "loss": 1.8151, + "step": 50 + }, + { + "epoch": 0.02999940001199976, + "eval_loss": 2.060624361038208, + "eval_model_preparation_time": 0.0036, + "eval_runtime": 65.0119, + "eval_samples_per_second": 153.818, + "eval_steps_per_second": 25.641, + "step": 50 + }, + { + "epoch": 0.030599388012239756, + "grad_norm": 1.46875, + "learning_rate": 4e-06, + "loss": 1.8375, + "step": 51 + }, + { + "epoch": 0.03119937601247975, + "grad_norm": 1.7109375, + "learning_rate": 3.997524752475248e-06, + "loss": 1.8369, + "step": 52 + }, + { + "epoch": 0.031799364012719745, + "grad_norm": 1.84375, + "learning_rate": 3.9950495049504945e-06, + "loss": 1.7909, + "step": 53 + }, + { + "epoch": 0.032399352012959744, + "grad_norm": 1.46875, + "learning_rate": 3.992574257425742e-06, + "loss": 1.819, + "step": 54 + }, + { + "epoch": 0.032999340013199735, + "grad_norm": 1.8046875, + "learning_rate": 3.99009900990099e-06, + "loss": 1.8014, + "step": 55 + }, + { + "epoch": 0.03359932801343973, + "grad_norm": 1.96875, + "learning_rate": 3.987623762376238e-06, + "loss": 1.7094, + "step": 56 + }, + { + "epoch": 0.034199316013679724, + "grad_norm": 1.546875, + "learning_rate": 3.985148514851485e-06, + "loss": 1.8507, + "step": 57 + }, + { + "epoch": 0.03479930401391972, + "grad_norm": 1.640625, + "learning_rate": 3.9826732673267325e-06, + "loss": 1.8693, + "step": 58 + }, + { + "epoch": 0.035399292014159714, + "grad_norm": 1.4765625, + "learning_rate": 3.98019801980198e-06, + "loss": 1.9363, + "step": 59 + }, + { + "epoch": 0.03599928001439971, + "grad_norm": 1.6328125, + "learning_rate": 3.977722772277228e-06, + "loss": 1.8308, + "step": 60 + }, + { + "epoch": 0.03659926801463971, + "grad_norm": 1.71875, + "learning_rate": 3.975247524752475e-06, + "loss": 1.8361, + "step": 61 + }, + { + "epoch": 0.0371992560148797, + "grad_norm": 1.375, + "learning_rate": 3.972772277227723e-06, + "loss": 1.839, + "step": 62 + }, + { + "epoch": 0.0377992440151197, + "grad_norm": 1.53125, + "learning_rate": 3.9702970297029705e-06, + "loss": 1.8829, + "step": 63 + }, + { + "epoch": 0.03839923201535969, + "grad_norm": 1.703125, + "learning_rate": 3.967821782178218e-06, + "loss": 1.88, + "step": 64 + }, + { + "epoch": 0.03899922001559969, + "grad_norm": 1.546875, + "learning_rate": 3.965346534653465e-06, + "loss": 1.9815, + "step": 65 + }, + { + "epoch": 0.03959920801583968, + "grad_norm": 1.921875, + "learning_rate": 3.962871287128713e-06, + "loss": 1.8474, + "step": 66 + }, + { + "epoch": 0.04019919601607968, + "grad_norm": 1.40625, + "learning_rate": 3.96039603960396e-06, + "loss": 1.8953, + "step": 67 + }, + { + "epoch": 0.04079918401631968, + "grad_norm": 1.640625, + "learning_rate": 3.957920792079208e-06, + "loss": 1.861, + "step": 68 + }, + { + "epoch": 0.04139917201655967, + "grad_norm": 1.6171875, + "learning_rate": 3.955445544554455e-06, + "loss": 1.9094, + "step": 69 + }, + { + "epoch": 0.041999160016799666, + "grad_norm": 1.421875, + "learning_rate": 3.952970297029703e-06, + "loss": 2.0281, + "step": 70 + }, + { + "epoch": 0.04259914801703966, + "grad_norm": 1.5234375, + "learning_rate": 3.95049504950495e-06, + "loss": 1.896, + "step": 71 + }, + { + "epoch": 0.043199136017279656, + "grad_norm": 1.796875, + "learning_rate": 3.948019801980198e-06, + "loss": 1.8792, + "step": 72 + }, + { + "epoch": 0.04379912401751965, + "grad_norm": 1.6484375, + "learning_rate": 3.945544554455446e-06, + "loss": 1.8139, + "step": 73 + }, + { + "epoch": 0.044399112017759645, + "grad_norm": 1.53125, + "learning_rate": 3.943069306930693e-06, + "loss": 1.813, + "step": 74 + }, + { + "epoch": 0.04499910001799964, + "grad_norm": 1.546875, + "learning_rate": 3.94059405940594e-06, + "loss": 1.8587, + "step": 75 + }, + { + "epoch": 0.045599088018239635, + "grad_norm": 1.3125, + "learning_rate": 3.938118811881188e-06, + "loss": 1.8793, + "step": 76 + }, + { + "epoch": 0.04619907601847963, + "grad_norm": 1.59375, + "learning_rate": 3.935643564356436e-06, + "loss": 1.7305, + "step": 77 + }, + { + "epoch": 0.046799064018719624, + "grad_norm": 1.46875, + "learning_rate": 3.933168316831683e-06, + "loss": 1.8243, + "step": 78 + }, + { + "epoch": 0.04739905201895962, + "grad_norm": 1.4609375, + "learning_rate": 3.9306930693069305e-06, + "loss": 1.9018, + "step": 79 + }, + { + "epoch": 0.047999040019199614, + "grad_norm": 1.59375, + "learning_rate": 3.928217821782178e-06, + "loss": 1.8979, + "step": 80 + }, + { + "epoch": 0.04859902801943961, + "grad_norm": 1.359375, + "learning_rate": 3.925742574257425e-06, + "loss": 1.9308, + "step": 81 + }, + { + "epoch": 0.0491990160196796, + "grad_norm": 1.6796875, + "learning_rate": 3.923267326732673e-06, + "loss": 1.8194, + "step": 82 + }, + { + "epoch": 0.0497990040199196, + "grad_norm": 1.6015625, + "learning_rate": 3.920792079207921e-06, + "loss": 1.8127, + "step": 83 + }, + { + "epoch": 0.0503989920201596, + "grad_norm": 1.625, + "learning_rate": 3.9183168316831685e-06, + "loss": 1.7168, + "step": 84 + }, + { + "epoch": 0.05099898002039959, + "grad_norm": 1.921875, + "learning_rate": 3.915841584158415e-06, + "loss": 1.9451, + "step": 85 + }, + { + "epoch": 0.05159896802063959, + "grad_norm": 1.484375, + "learning_rate": 3.913366336633663e-06, + "loss": 1.8582, + "step": 86 + }, + { + "epoch": 0.05219895602087958, + "grad_norm": 1.5703125, + "learning_rate": 3.910891089108911e-06, + "loss": 1.7901, + "step": 87 + }, + { + "epoch": 0.05279894402111958, + "grad_norm": 1.484375, + "learning_rate": 3.908415841584159e-06, + "loss": 1.7489, + "step": 88 + }, + { + "epoch": 0.05339893202135957, + "grad_norm": 1.625, + "learning_rate": 3.905940594059406e-06, + "loss": 1.8365, + "step": 89 + }, + { + "epoch": 0.05399892002159957, + "grad_norm": 1.375, + "learning_rate": 3.903465346534653e-06, + "loss": 1.8664, + "step": 90 + }, + { + "epoch": 0.054598908021839566, + "grad_norm": 1.34375, + "learning_rate": 3.9009900990099e-06, + "loss": 1.7951, + "step": 91 + }, + { + "epoch": 0.05519889602207956, + "grad_norm": 1.7421875, + "learning_rate": 3.898514851485148e-06, + "loss": 1.6784, + "step": 92 + }, + { + "epoch": 0.055798884022319556, + "grad_norm": 1.6171875, + "learning_rate": 3.896039603960396e-06, + "loss": 1.8358, + "step": 93 + }, + { + "epoch": 0.05639887202255955, + "grad_norm": 1.5546875, + "learning_rate": 3.893564356435644e-06, + "loss": 1.7865, + "step": 94 + }, + { + "epoch": 0.056998860022799545, + "grad_norm": 1.5234375, + "learning_rate": 3.8910891089108905e-06, + "loss": 1.8682, + "step": 95 + }, + { + "epoch": 0.05759884802303954, + "grad_norm": 1.4296875, + "learning_rate": 3.888613861386138e-06, + "loss": 1.8253, + "step": 96 + }, + { + "epoch": 0.058198836023279535, + "grad_norm": 1.5703125, + "learning_rate": 3.886138613861386e-06, + "loss": 1.7567, + "step": 97 + }, + { + "epoch": 0.05879882402351953, + "grad_norm": 1.5703125, + "learning_rate": 3.883663366336634e-06, + "loss": 2.0378, + "step": 98 + }, + { + "epoch": 0.059398812023759524, + "grad_norm": 1.6171875, + "learning_rate": 3.881188118811881e-06, + "loss": 1.9027, + "step": 99 + }, + { + "epoch": 0.05999880002399952, + "grad_norm": 1.5234375, + "learning_rate": 3.8787128712871285e-06, + "loss": 2.0289, + "step": 100 + }, + { + "epoch": 0.05999880002399952, + "eval_loss": 2.0494163036346436, + "eval_model_preparation_time": 0.0036, + "eval_runtime": 65.139, + "eval_samples_per_second": 153.518, + "eval_steps_per_second": 25.591, + "step": 100 + }, + { + "epoch": 0.060598788024239514, + "grad_norm": 1.6328125, + "learning_rate": 3.876237623762376e-06, + "loss": 1.8943, + "step": 101 + }, + { + "epoch": 0.06119877602447951, + "grad_norm": 1.59375, + "learning_rate": 3.873762376237624e-06, + "loss": 1.8859, + "step": 102 + }, + { + "epoch": 0.0617987640247195, + "grad_norm": 1.5625, + "learning_rate": 3.871287128712871e-06, + "loss": 1.8459, + "step": 103 + }, + { + "epoch": 0.0623987520249595, + "grad_norm": 1.46875, + "learning_rate": 3.868811881188119e-06, + "loss": 1.9159, + "step": 104 + }, + { + "epoch": 0.06299874002519949, + "grad_norm": 1.4375, + "learning_rate": 3.866336633663366e-06, + "loss": 1.8295, + "step": 105 + }, + { + "epoch": 0.06359872802543949, + "grad_norm": 1.2734375, + "learning_rate": 3.8638613861386134e-06, + "loss": 1.9225, + "step": 106 + }, + { + "epoch": 0.06419871602567949, + "grad_norm": 1.5234375, + "learning_rate": 3.861386138613861e-06, + "loss": 1.8648, + "step": 107 + }, + { + "epoch": 0.06479870402591949, + "grad_norm": 1.5078125, + "learning_rate": 3.858910891089109e-06, + "loss": 1.8594, + "step": 108 + }, + { + "epoch": 0.06539869202615947, + "grad_norm": 1.4375, + "learning_rate": 3.856435643564356e-06, + "loss": 1.979, + "step": 109 + }, + { + "epoch": 0.06599868002639947, + "grad_norm": 1.7734375, + "learning_rate": 3.853960396039604e-06, + "loss": 1.8235, + "step": 110 + }, + { + "epoch": 0.06659866802663947, + "grad_norm": 1.40625, + "learning_rate": 3.851485148514851e-06, + "loss": 1.7781, + "step": 111 + }, + { + "epoch": 0.06719865602687947, + "grad_norm": 1.5625, + "learning_rate": 3.849009900990099e-06, + "loss": 1.8508, + "step": 112 + }, + { + "epoch": 0.06779864402711945, + "grad_norm": 1.25, + "learning_rate": 3.846534653465346e-06, + "loss": 1.8538, + "step": 113 + }, + { + "epoch": 0.06839863202735945, + "grad_norm": 1.296875, + "learning_rate": 3.844059405940594e-06, + "loss": 1.7364, + "step": 114 + }, + { + "epoch": 0.06899862002759945, + "grad_norm": 1.703125, + "learning_rate": 3.841584158415842e-06, + "loss": 1.7373, + "step": 115 + }, + { + "epoch": 0.06959860802783945, + "grad_norm": 1.4140625, + "learning_rate": 3.839108910891089e-06, + "loss": 1.8901, + "step": 116 + }, + { + "epoch": 0.07019859602807944, + "grad_norm": 1.5625, + "learning_rate": 3.836633663366336e-06, + "loss": 1.8541, + "step": 117 + }, + { + "epoch": 0.07079858402831943, + "grad_norm": 1.6640625, + "learning_rate": 3.834158415841584e-06, + "loss": 1.7285, + "step": 118 + }, + { + "epoch": 0.07139857202855943, + "grad_norm": 1.390625, + "learning_rate": 3.831683168316831e-06, + "loss": 1.7322, + "step": 119 + }, + { + "epoch": 0.07199856002879942, + "grad_norm": 1.390625, + "learning_rate": 3.829207920792079e-06, + "loss": 1.8256, + "step": 120 + }, + { + "epoch": 0.07259854802903942, + "grad_norm": 1.3203125, + "learning_rate": 3.8267326732673265e-06, + "loss": 1.7352, + "step": 121 + }, + { + "epoch": 0.07319853602927942, + "grad_norm": 1.4609375, + "learning_rate": 3.824257425742574e-06, + "loss": 1.8343, + "step": 122 + }, + { + "epoch": 0.0737985240295194, + "grad_norm": 1.5078125, + "learning_rate": 3.821782178217821e-06, + "loss": 1.9866, + "step": 123 + }, + { + "epoch": 0.0743985120297594, + "grad_norm": 1.359375, + "learning_rate": 3.819306930693069e-06, + "loss": 1.8854, + "step": 124 + }, + { + "epoch": 0.0749985000299994, + "grad_norm": 1.484375, + "learning_rate": 3.816831683168317e-06, + "loss": 1.8341, + "step": 125 + }, + { + "epoch": 0.0755984880302394, + "grad_norm": 1.3984375, + "learning_rate": 3.814356435643564e-06, + "loss": 1.829, + "step": 126 + }, + { + "epoch": 0.07619847603047938, + "grad_norm": 1.4375, + "learning_rate": 3.8118811881188114e-06, + "loss": 1.8042, + "step": 127 + }, + { + "epoch": 0.07679846403071938, + "grad_norm": 1.484375, + "learning_rate": 3.809405940594059e-06, + "loss": 1.7094, + "step": 128 + }, + { + "epoch": 0.07739845203095938, + "grad_norm": 1.40625, + "learning_rate": 3.8069306930693065e-06, + "loss": 1.7705, + "step": 129 + }, + { + "epoch": 0.07799844003119938, + "grad_norm": 1.3984375, + "learning_rate": 3.8044554455445543e-06, + "loss": 1.8023, + "step": 130 + }, + { + "epoch": 0.07859842803143938, + "grad_norm": 1.390625, + "learning_rate": 3.8019801980198017e-06, + "loss": 1.9408, + "step": 131 + }, + { + "epoch": 0.07919841603167936, + "grad_norm": 1.4921875, + "learning_rate": 3.7995049504950494e-06, + "loss": 1.8589, + "step": 132 + }, + { + "epoch": 0.07979840403191936, + "grad_norm": 1.4140625, + "learning_rate": 3.7970297029702968e-06, + "loss": 1.8806, + "step": 133 + }, + { + "epoch": 0.08039839203215936, + "grad_norm": 1.5234375, + "learning_rate": 3.7945544554455445e-06, + "loss": 1.8629, + "step": 134 + }, + { + "epoch": 0.08099838003239936, + "grad_norm": 1.734375, + "learning_rate": 3.792079207920792e-06, + "loss": 1.7811, + "step": 135 + }, + { + "epoch": 0.08159836803263935, + "grad_norm": 1.421875, + "learning_rate": 3.7896039603960396e-06, + "loss": 1.8703, + "step": 136 + }, + { + "epoch": 0.08219835603287934, + "grad_norm": 1.4375, + "learning_rate": 3.7871287128712866e-06, + "loss": 1.9697, + "step": 137 + }, + { + "epoch": 0.08279834403311934, + "grad_norm": 1.484375, + "learning_rate": 3.7846534653465343e-06, + "loss": 1.8083, + "step": 138 + }, + { + "epoch": 0.08339833203335933, + "grad_norm": 1.5078125, + "learning_rate": 3.7821782178217817e-06, + "loss": 1.8348, + "step": 139 + }, + { + "epoch": 0.08399832003359933, + "grad_norm": 1.3671875, + "learning_rate": 3.7797029702970294e-06, + "loss": 1.7378, + "step": 140 + }, + { + "epoch": 0.08459830803383932, + "grad_norm": 1.3046875, + "learning_rate": 3.7772277227722768e-06, + "loss": 1.9362, + "step": 141 + }, + { + "epoch": 0.08519829603407932, + "grad_norm": 1.75, + "learning_rate": 3.7747524752475245e-06, + "loss": 1.9628, + "step": 142 + }, + { + "epoch": 0.08579828403431931, + "grad_norm": 1.3203125, + "learning_rate": 3.772277227722772e-06, + "loss": 1.8524, + "step": 143 + }, + { + "epoch": 0.08639827203455931, + "grad_norm": 1.6875, + "learning_rate": 3.7698019801980197e-06, + "loss": 1.7373, + "step": 144 + }, + { + "epoch": 0.08699826003479931, + "grad_norm": 1.546875, + "learning_rate": 3.767326732673267e-06, + "loss": 1.8752, + "step": 145 + }, + { + "epoch": 0.0875982480350393, + "grad_norm": 1.484375, + "learning_rate": 3.7648514851485148e-06, + "loss": 1.7939, + "step": 146 + }, + { + "epoch": 0.08819823603527929, + "grad_norm": 1.265625, + "learning_rate": 3.762376237623762e-06, + "loss": 1.8857, + "step": 147 + }, + { + "epoch": 0.08879822403551929, + "grad_norm": 1.4921875, + "learning_rate": 3.75990099009901e-06, + "loss": 1.8343, + "step": 148 + }, + { + "epoch": 0.08939821203575929, + "grad_norm": 1.4921875, + "learning_rate": 3.7574257425742572e-06, + "loss": 1.8464, + "step": 149 + }, + { + "epoch": 0.08999820003599927, + "grad_norm": 1.375, + "learning_rate": 3.754950495049505e-06, + "loss": 1.6997, + "step": 150 + }, + { + "epoch": 0.08999820003599927, + "eval_loss": 2.043649673461914, + "eval_model_preparation_time": 0.0036, + "eval_runtime": 67.4909, + "eval_samples_per_second": 148.168, + "eval_steps_per_second": 24.7, + "step": 150 + }, + { + "epoch": 0.09059818803623927, + "grad_norm": 1.3203125, + "learning_rate": 3.752475247524752e-06, + "loss": 1.8642, + "step": 151 + }, + { + "epoch": 0.09119817603647927, + "grad_norm": 1.390625, + "learning_rate": 3.7499999999999997e-06, + "loss": 1.7887, + "step": 152 + }, + { + "epoch": 0.09179816403671927, + "grad_norm": 1.3203125, + "learning_rate": 3.7475247524752474e-06, + "loss": 1.8596, + "step": 153 + }, + { + "epoch": 0.09239815203695927, + "grad_norm": 1.484375, + "learning_rate": 3.7450495049504948e-06, + "loss": 2.0243, + "step": 154 + }, + { + "epoch": 0.09299814003719925, + "grad_norm": 1.3203125, + "learning_rate": 3.7425742574257425e-06, + "loss": 1.7887, + "step": 155 + }, + { + "epoch": 0.09359812803743925, + "grad_norm": 1.359375, + "learning_rate": 3.74009900990099e-06, + "loss": 1.828, + "step": 156 + }, + { + "epoch": 0.09419811603767925, + "grad_norm": 1.3984375, + "learning_rate": 3.7376237623762377e-06, + "loss": 1.7897, + "step": 157 + }, + { + "epoch": 0.09479810403791925, + "grad_norm": 1.4453125, + "learning_rate": 3.735148514851485e-06, + "loss": 1.761, + "step": 158 + }, + { + "epoch": 0.09539809203815924, + "grad_norm": 1.4609375, + "learning_rate": 3.7326732673267328e-06, + "loss": 1.812, + "step": 159 + }, + { + "epoch": 0.09599808003839923, + "grad_norm": 1.3203125, + "learning_rate": 3.73019801980198e-06, + "loss": 1.8758, + "step": 160 + }, + { + "epoch": 0.09659806803863923, + "grad_norm": 1.4453125, + "learning_rate": 3.727722772277228e-06, + "loss": 1.7899, + "step": 161 + }, + { + "epoch": 0.09719805603887922, + "grad_norm": 1.296875, + "learning_rate": 3.7252475247524752e-06, + "loss": 1.7542, + "step": 162 + }, + { + "epoch": 0.09779804403911922, + "grad_norm": 1.53125, + "learning_rate": 3.722772277227723e-06, + "loss": 1.765, + "step": 163 + }, + { + "epoch": 0.0983980320393592, + "grad_norm": 1.578125, + "learning_rate": 3.72029702970297e-06, + "loss": 1.7773, + "step": 164 + }, + { + "epoch": 0.0989980200395992, + "grad_norm": 1.3203125, + "learning_rate": 3.7178217821782177e-06, + "loss": 1.9011, + "step": 165 + }, + { + "epoch": 0.0995980080398392, + "grad_norm": 1.2578125, + "learning_rate": 3.715346534653465e-06, + "loss": 1.8742, + "step": 166 + }, + { + "epoch": 0.1001979960400792, + "grad_norm": 1.4140625, + "learning_rate": 3.7128712871287128e-06, + "loss": 1.8805, + "step": 167 + }, + { + "epoch": 0.1007979840403192, + "grad_norm": 1.3046875, + "learning_rate": 3.71039603960396e-06, + "loss": 1.9764, + "step": 168 + }, + { + "epoch": 0.10139797204055918, + "grad_norm": 1.3203125, + "learning_rate": 3.707920792079208e-06, + "loss": 1.8004, + "step": 169 + }, + { + "epoch": 0.10199796004079918, + "grad_norm": 1.265625, + "learning_rate": 3.7054455445544552e-06, + "loss": 1.8625, + "step": 170 + }, + { + "epoch": 0.10259794804103918, + "grad_norm": 1.328125, + "learning_rate": 3.702970297029703e-06, + "loss": 1.7509, + "step": 171 + }, + { + "epoch": 0.10319793604127918, + "grad_norm": 1.5546875, + "learning_rate": 3.7004950495049503e-06, + "loss": 1.7476, + "step": 172 + }, + { + "epoch": 0.10379792404151916, + "grad_norm": 1.4140625, + "learning_rate": 3.698019801980198e-06, + "loss": 1.8179, + "step": 173 + }, + { + "epoch": 0.10439791204175916, + "grad_norm": 1.4375, + "learning_rate": 3.6955445544554455e-06, + "loss": 1.9345, + "step": 174 + }, + { + "epoch": 0.10499790004199916, + "grad_norm": 1.3984375, + "learning_rate": 3.6930693069306932e-06, + "loss": 1.8446, + "step": 175 + }, + { + "epoch": 0.10559788804223916, + "grad_norm": 1.34375, + "learning_rate": 3.6905940594059406e-06, + "loss": 1.9043, + "step": 176 + }, + { + "epoch": 0.10619787604247916, + "grad_norm": 1.3203125, + "learning_rate": 3.6881188118811883e-06, + "loss": 1.8542, + "step": 177 + }, + { + "epoch": 0.10679786404271914, + "grad_norm": 1.3671875, + "learning_rate": 3.6856435643564352e-06, + "loss": 1.9043, + "step": 178 + }, + { + "epoch": 0.10739785204295914, + "grad_norm": 1.328125, + "learning_rate": 3.683168316831683e-06, + "loss": 1.8059, + "step": 179 + }, + { + "epoch": 0.10799784004319914, + "grad_norm": 1.21875, + "learning_rate": 3.6806930693069304e-06, + "loss": 1.6908, + "step": 180 + }, + { + "epoch": 0.10859782804343913, + "grad_norm": 1.2265625, + "learning_rate": 3.678217821782178e-06, + "loss": 1.8294, + "step": 181 + }, + { + "epoch": 0.10919781604367913, + "grad_norm": 1.5859375, + "learning_rate": 3.6757425742574255e-06, + "loss": 2.0129, + "step": 182 + }, + { + "epoch": 0.10979780404391912, + "grad_norm": 1.5, + "learning_rate": 3.6732673267326732e-06, + "loss": 1.7686, + "step": 183 + }, + { + "epoch": 0.11039779204415912, + "grad_norm": 1.7421875, + "learning_rate": 3.6707920792079206e-06, + "loss": 1.9643, + "step": 184 + }, + { + "epoch": 0.11099778004439911, + "grad_norm": 1.5234375, + "learning_rate": 3.6683168316831683e-06, + "loss": 1.7562, + "step": 185 + }, + { + "epoch": 0.11159776804463911, + "grad_norm": 1.2421875, + "learning_rate": 3.6658415841584157e-06, + "loss": 1.8898, + "step": 186 + }, + { + "epoch": 0.1121977560448791, + "grad_norm": 1.59375, + "learning_rate": 3.6633663366336635e-06, + "loss": 1.885, + "step": 187 + }, + { + "epoch": 0.1127977440451191, + "grad_norm": 5.5625, + "learning_rate": 3.660891089108911e-06, + "loss": 1.9108, + "step": 188 + }, + { + "epoch": 0.11339773204535909, + "grad_norm": 1.546875, + "learning_rate": 3.6584158415841586e-06, + "loss": 1.826, + "step": 189 + }, + { + "epoch": 0.11399772004559909, + "grad_norm": 1.3515625, + "learning_rate": 3.6559405940594055e-06, + "loss": 1.8905, + "step": 190 + }, + { + "epoch": 0.11459770804583909, + "grad_norm": 1.25, + "learning_rate": 3.6534653465346532e-06, + "loss": 1.8051, + "step": 191 + }, + { + "epoch": 0.11519769604607907, + "grad_norm": 1.21875, + "learning_rate": 3.6509900990099006e-06, + "loss": 1.7055, + "step": 192 + }, + { + "epoch": 0.11579768404631907, + "grad_norm": 1.3671875, + "learning_rate": 3.6485148514851484e-06, + "loss": 1.8768, + "step": 193 + }, + { + "epoch": 0.11639767204655907, + "grad_norm": 1.2265625, + "learning_rate": 3.6460396039603957e-06, + "loss": 1.7644, + "step": 194 + }, + { + "epoch": 0.11699766004679907, + "grad_norm": 1.4375, + "learning_rate": 3.6435643564356435e-06, + "loss": 1.9606, + "step": 195 + }, + { + "epoch": 0.11759764804703907, + "grad_norm": 1.34375, + "learning_rate": 3.641089108910891e-06, + "loss": 1.7722, + "step": 196 + }, + { + "epoch": 0.11819763604727905, + "grad_norm": 1.3984375, + "learning_rate": 3.6386138613861386e-06, + "loss": 1.7471, + "step": 197 + }, + { + "epoch": 0.11879762404751905, + "grad_norm": 1.2265625, + "learning_rate": 3.636138613861386e-06, + "loss": 1.8394, + "step": 198 + }, + { + "epoch": 0.11939761204775905, + "grad_norm": 1.4765625, + "learning_rate": 3.6336633663366337e-06, + "loss": 1.8986, + "step": 199 + }, + { + "epoch": 0.11999760004799905, + "grad_norm": 1.4296875, + "learning_rate": 3.631188118811881e-06, + "loss": 1.6904, + "step": 200 + }, + { + "epoch": 0.11999760004799905, + "eval_loss": 2.040451765060425, + "eval_model_preparation_time": 0.0036, + "eval_runtime": 65.1419, + "eval_samples_per_second": 153.511, + "eval_steps_per_second": 25.59, + "step": 200 + }, + { + "epoch": 0.12059758804823903, + "grad_norm": 1.421875, + "learning_rate": 3.628712871287129e-06, + "loss": 1.8726, + "step": 201 + }, + { + "epoch": 0.12119757604847903, + "grad_norm": 1.3671875, + "learning_rate": 3.626237623762376e-06, + "loss": 1.8508, + "step": 202 + }, + { + "epoch": 0.12179756404871903, + "grad_norm": 1.3984375, + "learning_rate": 3.623762376237624e-06, + "loss": 1.9293, + "step": 203 + }, + { + "epoch": 0.12239755204895902, + "grad_norm": 1.3671875, + "learning_rate": 3.621287128712871e-06, + "loss": 1.9013, + "step": 204 + }, + { + "epoch": 0.12299754004919902, + "grad_norm": 2.09375, + "learning_rate": 3.6188118811881186e-06, + "loss": 1.9189, + "step": 205 + }, + { + "epoch": 0.123597528049439, + "grad_norm": 1.2890625, + "learning_rate": 3.616336633663366e-06, + "loss": 1.8183, + "step": 206 + }, + { + "epoch": 0.124197516049679, + "grad_norm": 1.296875, + "learning_rate": 3.6138613861386137e-06, + "loss": 1.8873, + "step": 207 + }, + { + "epoch": 0.124797504049919, + "grad_norm": 1.2265625, + "learning_rate": 3.611386138613861e-06, + "loss": 1.8153, + "step": 208 + }, + { + "epoch": 0.125397492050159, + "grad_norm": 1.375, + "learning_rate": 3.608910891089109e-06, + "loss": 1.7048, + "step": 209 + }, + { + "epoch": 0.12599748005039899, + "grad_norm": 1.5234375, + "learning_rate": 3.606435643564356e-06, + "loss": 1.6584, + "step": 210 + }, + { + "epoch": 0.12659746805063898, + "grad_norm": 1.2109375, + "learning_rate": 3.603960396039604e-06, + "loss": 1.7825, + "step": 211 + }, + { + "epoch": 0.12719745605087898, + "grad_norm": 1.3046875, + "learning_rate": 3.6014851485148513e-06, + "loss": 1.8709, + "step": 212 + }, + { + "epoch": 0.12779744405111898, + "grad_norm": 1.46875, + "learning_rate": 3.599009900990099e-06, + "loss": 1.858, + "step": 213 + }, + { + "epoch": 0.12839743205135898, + "grad_norm": 1.46875, + "learning_rate": 3.5965346534653464e-06, + "loss": 1.9115, + "step": 214 + }, + { + "epoch": 0.12899742005159898, + "grad_norm": 1.171875, + "learning_rate": 3.594059405940594e-06, + "loss": 1.7853, + "step": 215 + }, + { + "epoch": 0.12959740805183897, + "grad_norm": 1.234375, + "learning_rate": 3.5915841584158415e-06, + "loss": 1.8218, + "step": 216 + }, + { + "epoch": 0.13019739605207895, + "grad_norm": 1.484375, + "learning_rate": 3.589108910891089e-06, + "loss": 1.7609, + "step": 217 + }, + { + "epoch": 0.13079738405231894, + "grad_norm": 1.4921875, + "learning_rate": 3.586633663366336e-06, + "loss": 2.1418, + "step": 218 + }, + { + "epoch": 0.13139737205255894, + "grad_norm": 1.4375, + "learning_rate": 3.584158415841584e-06, + "loss": 1.7065, + "step": 219 + }, + { + "epoch": 0.13199736005279894, + "grad_norm": 1.328125, + "learning_rate": 3.5816831683168313e-06, + "loss": 1.8543, + "step": 220 + }, + { + "epoch": 0.13259734805303894, + "grad_norm": 1.59375, + "learning_rate": 3.579207920792079e-06, + "loss": 1.8026, + "step": 221 + }, + { + "epoch": 0.13319733605327894, + "grad_norm": 1.4140625, + "learning_rate": 3.5767326732673264e-06, + "loss": 1.7373, + "step": 222 + }, + { + "epoch": 0.13379732405351893, + "grad_norm": 1.65625, + "learning_rate": 3.574257425742574e-06, + "loss": 1.7388, + "step": 223 + }, + { + "epoch": 0.13439731205375893, + "grad_norm": 1.265625, + "learning_rate": 3.5717821782178215e-06, + "loss": 1.8013, + "step": 224 + }, + { + "epoch": 0.13499730005399893, + "grad_norm": 1.375, + "learning_rate": 3.5693069306930693e-06, + "loss": 1.9171, + "step": 225 + }, + { + "epoch": 0.1355972880542389, + "grad_norm": 1.234375, + "learning_rate": 3.5668316831683166e-06, + "loss": 1.9055, + "step": 226 + }, + { + "epoch": 0.1361972760544789, + "grad_norm": 1.21875, + "learning_rate": 3.5643564356435644e-06, + "loss": 1.9024, + "step": 227 + }, + { + "epoch": 0.1367972640547189, + "grad_norm": 1.453125, + "learning_rate": 3.5618811881188117e-06, + "loss": 1.8366, + "step": 228 + }, + { + "epoch": 0.1373972520549589, + "grad_norm": 1.2109375, + "learning_rate": 3.5594059405940595e-06, + "loss": 1.8003, + "step": 229 + }, + { + "epoch": 0.1379972400551989, + "grad_norm": 1.3359375, + "learning_rate": 3.5569306930693064e-06, + "loss": 1.9299, + "step": 230 + }, + { + "epoch": 0.1385972280554389, + "grad_norm": 1.25, + "learning_rate": 3.554455445544554e-06, + "loss": 1.8557, + "step": 231 + }, + { + "epoch": 0.1391972160556789, + "grad_norm": 1.2421875, + "learning_rate": 3.5519801980198015e-06, + "loss": 1.8823, + "step": 232 + }, + { + "epoch": 0.1397972040559189, + "grad_norm": 1.3046875, + "learning_rate": 3.5495049504950493e-06, + "loss": 1.891, + "step": 233 + }, + { + "epoch": 0.1403971920561589, + "grad_norm": 1.734375, + "learning_rate": 3.5470297029702966e-06, + "loss": 1.6939, + "step": 234 + }, + { + "epoch": 0.14099718005639889, + "grad_norm": 1.421875, + "learning_rate": 3.5445544554455444e-06, + "loss": 1.8792, + "step": 235 + }, + { + "epoch": 0.14159716805663886, + "grad_norm": 1.25, + "learning_rate": 3.5420792079207917e-06, + "loss": 1.856, + "step": 236 + }, + { + "epoch": 0.14219715605687885, + "grad_norm": 1.34375, + "learning_rate": 3.5396039603960395e-06, + "loss": 1.8268, + "step": 237 + }, + { + "epoch": 0.14279714405711885, + "grad_norm": 1.203125, + "learning_rate": 3.537128712871287e-06, + "loss": 1.7116, + "step": 238 + }, + { + "epoch": 0.14339713205735885, + "grad_norm": 1.234375, + "learning_rate": 3.5346534653465346e-06, + "loss": 1.922, + "step": 239 + }, + { + "epoch": 0.14399712005759885, + "grad_norm": 1.5, + "learning_rate": 3.532178217821782e-06, + "loss": 1.792, + "step": 240 + }, + { + "epoch": 0.14459710805783885, + "grad_norm": 1.390625, + "learning_rate": 3.5297029702970297e-06, + "loss": 1.6891, + "step": 241 + }, + { + "epoch": 0.14519709605807885, + "grad_norm": 1.359375, + "learning_rate": 3.527227722772277e-06, + "loss": 1.8681, + "step": 242 + }, + { + "epoch": 0.14579708405831884, + "grad_norm": 1.2265625, + "learning_rate": 3.524752475247525e-06, + "loss": 1.6985, + "step": 243 + }, + { + "epoch": 0.14639707205855884, + "grad_norm": 1.3203125, + "learning_rate": 3.5222772277227717e-06, + "loss": 1.7168, + "step": 244 + }, + { + "epoch": 0.1469970600587988, + "grad_norm": 2.921875, + "learning_rate": 3.5198019801980195e-06, + "loss": 1.7725, + "step": 245 + }, + { + "epoch": 0.1475970480590388, + "grad_norm": 1.609375, + "learning_rate": 3.517326732673267e-06, + "loss": 1.8263, + "step": 246 + }, + { + "epoch": 0.1481970360592788, + "grad_norm": 1.3515625, + "learning_rate": 3.5148514851485146e-06, + "loss": 1.791, + "step": 247 + }, + { + "epoch": 0.1487970240595188, + "grad_norm": 1.359375, + "learning_rate": 3.512376237623762e-06, + "loss": 1.867, + "step": 248 + }, + { + "epoch": 0.1493970120597588, + "grad_norm": 1.6953125, + "learning_rate": 3.5099009900990097e-06, + "loss": 1.92, + "step": 249 + }, + { + "epoch": 0.1499970000599988, + "grad_norm": 1.3984375, + "learning_rate": 3.507425742574257e-06, + "loss": 1.8612, + "step": 250 + }, + { + "epoch": 0.1499970000599988, + "eval_loss": 2.0383219718933105, + "eval_model_preparation_time": 0.0036, + "eval_runtime": 65.0682, + "eval_samples_per_second": 153.685, + "eval_steps_per_second": 25.619, + "step": 250 + }, + { + "epoch": 0.1505969880602388, + "grad_norm": 1.4375, + "learning_rate": 3.504950495049505e-06, + "loss": 1.8017, + "step": 251 + }, + { + "epoch": 0.1511969760604788, + "grad_norm": 1.375, + "learning_rate": 3.502475247524752e-06, + "loss": 1.944, + "step": 252 + }, + { + "epoch": 0.1517969640607188, + "grad_norm": 1.265625, + "learning_rate": 3.5e-06, + "loss": 1.7536, + "step": 253 + }, + { + "epoch": 0.15239695206095877, + "grad_norm": 1.203125, + "learning_rate": 3.4975247524752477e-06, + "loss": 1.7696, + "step": 254 + }, + { + "epoch": 0.15299694006119877, + "grad_norm": 1.3515625, + "learning_rate": 3.495049504950495e-06, + "loss": 1.8545, + "step": 255 + }, + { + "epoch": 0.15359692806143876, + "grad_norm": 1.4609375, + "learning_rate": 3.492574257425743e-06, + "loss": 1.9009, + "step": 256 + }, + { + "epoch": 0.15419691606167876, + "grad_norm": 1.328125, + "learning_rate": 3.4900990099009897e-06, + "loss": 1.915, + "step": 257 + }, + { + "epoch": 0.15479690406191876, + "grad_norm": 1.546875, + "learning_rate": 3.4876237623762375e-06, + "loss": 1.7616, + "step": 258 + }, + { + "epoch": 0.15539689206215876, + "grad_norm": 1.4140625, + "learning_rate": 3.485148514851485e-06, + "loss": 1.843, + "step": 259 + }, + { + "epoch": 0.15599688006239876, + "grad_norm": 1.2265625, + "learning_rate": 3.4826732673267326e-06, + "loss": 1.8939, + "step": 260 + }, + { + "epoch": 0.15659686806263876, + "grad_norm": 1.2421875, + "learning_rate": 3.48019801980198e-06, + "loss": 1.8774, + "step": 261 + }, + { + "epoch": 0.15719685606287875, + "grad_norm": 1.25, + "learning_rate": 3.4777227722772277e-06, + "loss": 1.7766, + "step": 262 + }, + { + "epoch": 0.15779684406311872, + "grad_norm": 1.46875, + "learning_rate": 3.475247524752475e-06, + "loss": 1.806, + "step": 263 + }, + { + "epoch": 0.15839683206335872, + "grad_norm": 1.2734375, + "learning_rate": 3.472772277227723e-06, + "loss": 1.8267, + "step": 264 + }, + { + "epoch": 0.15899682006359872, + "grad_norm": 1.609375, + "learning_rate": 3.47029702970297e-06, + "loss": 1.8525, + "step": 265 + }, + { + "epoch": 0.15959680806383872, + "grad_norm": 1.6796875, + "learning_rate": 3.467821782178218e-06, + "loss": 1.7749, + "step": 266 + }, + { + "epoch": 0.16019679606407872, + "grad_norm": 1.6328125, + "learning_rate": 3.4653465346534653e-06, + "loss": 1.905, + "step": 267 + }, + { + "epoch": 0.16079678406431872, + "grad_norm": 1.328125, + "learning_rate": 3.462871287128713e-06, + "loss": 1.7351, + "step": 268 + }, + { + "epoch": 0.1613967720645587, + "grad_norm": 1.703125, + "learning_rate": 3.4603960396039604e-06, + "loss": 1.8969, + "step": 269 + }, + { + "epoch": 0.1619967600647987, + "grad_norm": 1.53125, + "learning_rate": 3.4579207920792077e-06, + "loss": 1.7878, + "step": 270 + }, + { + "epoch": 0.1625967480650387, + "grad_norm": 1.703125, + "learning_rate": 3.455445544554455e-06, + "loss": 1.84, + "step": 271 + }, + { + "epoch": 0.1631967360652787, + "grad_norm": 1.375, + "learning_rate": 3.452970297029703e-06, + "loss": 1.9861, + "step": 272 + }, + { + "epoch": 0.16379672406551868, + "grad_norm": 1.3359375, + "learning_rate": 3.45049504950495e-06, + "loss": 1.8173, + "step": 273 + }, + { + "epoch": 0.16439671206575868, + "grad_norm": 1.2890625, + "learning_rate": 3.448019801980198e-06, + "loss": 1.8155, + "step": 274 + }, + { + "epoch": 0.16499670006599867, + "grad_norm": 1.4140625, + "learning_rate": 3.4455445544554453e-06, + "loss": 1.7419, + "step": 275 + }, + { + "epoch": 0.16559668806623867, + "grad_norm": 1.265625, + "learning_rate": 3.443069306930693e-06, + "loss": 1.7549, + "step": 276 + }, + { + "epoch": 0.16619667606647867, + "grad_norm": 1.515625, + "learning_rate": 3.4405940594059404e-06, + "loss": 1.8526, + "step": 277 + }, + { + "epoch": 0.16679666406671867, + "grad_norm": 1.4765625, + "learning_rate": 3.438118811881188e-06, + "loss": 1.8459, + "step": 278 + }, + { + "epoch": 0.16739665206695867, + "grad_norm": 1.390625, + "learning_rate": 3.4356435643564355e-06, + "loss": 1.8812, + "step": 279 + }, + { + "epoch": 0.16799664006719867, + "grad_norm": 1.328125, + "learning_rate": 3.4331683168316833e-06, + "loss": 1.8331, + "step": 280 + }, + { + "epoch": 0.16859662806743866, + "grad_norm": 1.6015625, + "learning_rate": 3.4306930693069306e-06, + "loss": 1.965, + "step": 281 + }, + { + "epoch": 0.16919661606767863, + "grad_norm": 1.5859375, + "learning_rate": 3.4282178217821784e-06, + "loss": 1.9069, + "step": 282 + }, + { + "epoch": 0.16979660406791863, + "grad_norm": 1.890625, + "learning_rate": 3.4257425742574253e-06, + "loss": 1.7781, + "step": 283 + }, + { + "epoch": 0.17039659206815863, + "grad_norm": 1.1953125, + "learning_rate": 3.423267326732673e-06, + "loss": 1.9266, + "step": 284 + }, + { + "epoch": 0.17099658006839863, + "grad_norm": 1.359375, + "learning_rate": 3.4207920792079204e-06, + "loss": 1.8376, + "step": 285 + }, + { + "epoch": 0.17159656806863863, + "grad_norm": 1.3203125, + "learning_rate": 3.418316831683168e-06, + "loss": 1.7678, + "step": 286 + }, + { + "epoch": 0.17219655606887863, + "grad_norm": 1.15625, + "learning_rate": 3.4158415841584155e-06, + "loss": 1.8636, + "step": 287 + }, + { + "epoch": 0.17279654406911862, + "grad_norm": 1.1484375, + "learning_rate": 3.4133663366336633e-06, + "loss": 1.8833, + "step": 288 + }, + { + "epoch": 0.17339653206935862, + "grad_norm": 1.375, + "learning_rate": 3.4108910891089106e-06, + "loss": 1.7882, + "step": 289 + }, + { + "epoch": 0.17399652006959862, + "grad_norm": 1.3046875, + "learning_rate": 3.4084158415841584e-06, + "loss": 1.9467, + "step": 290 + }, + { + "epoch": 0.1745965080698386, + "grad_norm": 1.2890625, + "learning_rate": 3.4059405940594058e-06, + "loss": 1.7262, + "step": 291 + }, + { + "epoch": 0.1751964960700786, + "grad_norm": 1.484375, + "learning_rate": 3.4034653465346535e-06, + "loss": 1.8879, + "step": 292 + }, + { + "epoch": 0.1757964840703186, + "grad_norm": 1.2734375, + "learning_rate": 3.400990099009901e-06, + "loss": 1.8687, + "step": 293 + }, + { + "epoch": 0.17639647207055859, + "grad_norm": 1.3828125, + "learning_rate": 3.3985148514851486e-06, + "loss": 1.7491, + "step": 294 + }, + { + "epoch": 0.17699646007079858, + "grad_norm": 1.3359375, + "learning_rate": 3.396039603960396e-06, + "loss": 1.939, + "step": 295 + }, + { + "epoch": 0.17759644807103858, + "grad_norm": 1.4375, + "learning_rate": 3.3935643564356437e-06, + "loss": 1.8243, + "step": 296 + }, + { + "epoch": 0.17819643607127858, + "grad_norm": 1.3125, + "learning_rate": 3.3910891089108907e-06, + "loss": 1.8302, + "step": 297 + }, + { + "epoch": 0.17879642407151858, + "grad_norm": 1.4140625, + "learning_rate": 3.3886138613861384e-06, + "loss": 1.8882, + "step": 298 + }, + { + "epoch": 0.17939641207175858, + "grad_norm": 1.375, + "learning_rate": 3.3861386138613858e-06, + "loss": 1.8015, + "step": 299 + }, + { + "epoch": 0.17999640007199855, + "grad_norm": 1.296875, + "learning_rate": 3.3836633663366335e-06, + "loss": 1.9455, + "step": 300 + }, + { + "epoch": 0.17999640007199855, + "eval_loss": 2.0367591381073, + "eval_model_preparation_time": 0.0036, + "eval_runtime": 67.518, + "eval_samples_per_second": 148.109, + "eval_steps_per_second": 24.69, + "step": 300 + }, + { + "epoch": 0.18059638807223855, + "grad_norm": 1.40625, + "learning_rate": 3.381188118811881e-06, + "loss": 1.7753, + "step": 301 + }, + { + "epoch": 0.18119637607247854, + "grad_norm": 1.3203125, + "learning_rate": 3.3787128712871286e-06, + "loss": 1.9745, + "step": 302 + }, + { + "epoch": 0.18179636407271854, + "grad_norm": 1.59375, + "learning_rate": 3.376237623762376e-06, + "loss": 1.7602, + "step": 303 + }, + { + "epoch": 0.18239635207295854, + "grad_norm": 1.296875, + "learning_rate": 3.3737623762376238e-06, + "loss": 1.9368, + "step": 304 + }, + { + "epoch": 0.18299634007319854, + "grad_norm": 1.9609375, + "learning_rate": 3.371287128712871e-06, + "loss": 1.9062, + "step": 305 + }, + { + "epoch": 0.18359632807343854, + "grad_norm": 1.2578125, + "learning_rate": 3.368811881188119e-06, + "loss": 1.7124, + "step": 306 + }, + { + "epoch": 0.18419631607367853, + "grad_norm": 1.328125, + "learning_rate": 3.366336633663366e-06, + "loss": 1.7911, + "step": 307 + }, + { + "epoch": 0.18479630407391853, + "grad_norm": 1.296875, + "learning_rate": 3.363861386138614e-06, + "loss": 1.8183, + "step": 308 + }, + { + "epoch": 0.1853962920741585, + "grad_norm": 1.484375, + "learning_rate": 3.3613861386138613e-06, + "loss": 1.8593, + "step": 309 + }, + { + "epoch": 0.1859962800743985, + "grad_norm": 2.046875, + "learning_rate": 3.3589108910891087e-06, + "loss": 1.9335, + "step": 310 + }, + { + "epoch": 0.1865962680746385, + "grad_norm": 1.3359375, + "learning_rate": 3.356435643564356e-06, + "loss": 1.8858, + "step": 311 + }, + { + "epoch": 0.1871962560748785, + "grad_norm": 1.359375, + "learning_rate": 3.3539603960396038e-06, + "loss": 1.9154, + "step": 312 + }, + { + "epoch": 0.1877962440751185, + "grad_norm": 1.28125, + "learning_rate": 3.351485148514851e-06, + "loss": 1.7767, + "step": 313 + }, + { + "epoch": 0.1883962320753585, + "grad_norm": 1.4765625, + "learning_rate": 3.349009900990099e-06, + "loss": 1.8943, + "step": 314 + }, + { + "epoch": 0.1889962200755985, + "grad_norm": 1.2109375, + "learning_rate": 3.3465346534653462e-06, + "loss": 1.7825, + "step": 315 + }, + { + "epoch": 0.1895962080758385, + "grad_norm": 1.484375, + "learning_rate": 3.344059405940594e-06, + "loss": 1.891, + "step": 316 + }, + { + "epoch": 0.1901961960760785, + "grad_norm": 1.359375, + "learning_rate": 3.3415841584158413e-06, + "loss": 1.8893, + "step": 317 + }, + { + "epoch": 0.1907961840763185, + "grad_norm": 1.296875, + "learning_rate": 3.339108910891089e-06, + "loss": 1.7661, + "step": 318 + }, + { + "epoch": 0.19139617207655846, + "grad_norm": 1.5078125, + "learning_rate": 3.3366336633663364e-06, + "loss": 1.7951, + "step": 319 + }, + { + "epoch": 0.19199616007679846, + "grad_norm": 1.203125, + "learning_rate": 3.334158415841584e-06, + "loss": 1.825, + "step": 320 + }, + { + "epoch": 0.19259614807703845, + "grad_norm": 1.265625, + "learning_rate": 3.3316831683168316e-06, + "loss": 1.7065, + "step": 321 + }, + { + "epoch": 0.19319613607727845, + "grad_norm": 1.3984375, + "learning_rate": 3.3292079207920793e-06, + "loss": 1.8174, + "step": 322 + }, + { + "epoch": 0.19379612407751845, + "grad_norm": 1.3203125, + "learning_rate": 3.3267326732673262e-06, + "loss": 1.8812, + "step": 323 + }, + { + "epoch": 0.19439611207775845, + "grad_norm": 1.2578125, + "learning_rate": 3.324257425742574e-06, + "loss": 1.8562, + "step": 324 + }, + { + "epoch": 0.19499610007799845, + "grad_norm": 1.2734375, + "learning_rate": 3.3217821782178213e-06, + "loss": 1.7252, + "step": 325 + }, + { + "epoch": 0.19559608807823844, + "grad_norm": 1.6015625, + "learning_rate": 3.319306930693069e-06, + "loss": 1.7466, + "step": 326 + }, + { + "epoch": 0.19619607607847844, + "grad_norm": 1.34375, + "learning_rate": 3.3168316831683165e-06, + "loss": 1.7714, + "step": 327 + }, + { + "epoch": 0.1967960640787184, + "grad_norm": 1.2265625, + "learning_rate": 3.3143564356435642e-06, + "loss": 1.8033, + "step": 328 + }, + { + "epoch": 0.1973960520789584, + "grad_norm": 1.71875, + "learning_rate": 3.3118811881188116e-06, + "loss": 1.812, + "step": 329 + }, + { + "epoch": 0.1979960400791984, + "grad_norm": 1.3984375, + "learning_rate": 3.3094059405940593e-06, + "loss": 1.8678, + "step": 330 + }, + { + "epoch": 0.1985960280794384, + "grad_norm": 1.25, + "learning_rate": 3.3069306930693067e-06, + "loss": 1.9161, + "step": 331 + }, + { + "epoch": 0.1991960160796784, + "grad_norm": 1.3984375, + "learning_rate": 3.3044554455445544e-06, + "loss": 1.7642, + "step": 332 + }, + { + "epoch": 0.1997960040799184, + "grad_norm": 1.390625, + "learning_rate": 3.3019801980198018e-06, + "loss": 1.8247, + "step": 333 + }, + { + "epoch": 0.2003959920801584, + "grad_norm": 1.2578125, + "learning_rate": 3.2995049504950496e-06, + "loss": 1.9492, + "step": 334 + }, + { + "epoch": 0.2009959800803984, + "grad_norm": 1.34375, + "learning_rate": 3.297029702970297e-06, + "loss": 1.7759, + "step": 335 + }, + { + "epoch": 0.2015959680806384, + "grad_norm": 1.203125, + "learning_rate": 3.2945544554455442e-06, + "loss": 1.8604, + "step": 336 + }, + { + "epoch": 0.20219595608087837, + "grad_norm": 1.21875, + "learning_rate": 3.2920792079207916e-06, + "loss": 1.8555, + "step": 337 + }, + { + "epoch": 0.20279594408111837, + "grad_norm": 1.359375, + "learning_rate": 3.2896039603960393e-06, + "loss": 1.7694, + "step": 338 + }, + { + "epoch": 0.20339593208135837, + "grad_norm": 1.421875, + "learning_rate": 3.2871287128712867e-06, + "loss": 1.7972, + "step": 339 + }, + { + "epoch": 0.20399592008159836, + "grad_norm": 1.3203125, + "learning_rate": 3.2846534653465345e-06, + "loss": 1.902, + "step": 340 + }, + { + "epoch": 0.20459590808183836, + "grad_norm": 1.296875, + "learning_rate": 3.282178217821782e-06, + "loss": 1.9065, + "step": 341 + }, + { + "epoch": 0.20519589608207836, + "grad_norm": 1.40625, + "learning_rate": 3.2797029702970296e-06, + "loss": 1.7904, + "step": 342 + }, + { + "epoch": 0.20579588408231836, + "grad_norm": 1.203125, + "learning_rate": 3.277227722772277e-06, + "loss": 1.7835, + "step": 343 + }, + { + "epoch": 0.20639587208255836, + "grad_norm": 1.34375, + "learning_rate": 3.2747524752475247e-06, + "loss": 1.8314, + "step": 344 + }, + { + "epoch": 0.20699586008279836, + "grad_norm": 1.4296875, + "learning_rate": 3.272277227722772e-06, + "loss": 1.774, + "step": 345 + }, + { + "epoch": 0.20759584808303833, + "grad_norm": 1.2265625, + "learning_rate": 3.2698019801980198e-06, + "loss": 1.8417, + "step": 346 + }, + { + "epoch": 0.20819583608327832, + "grad_norm": 1.28125, + "learning_rate": 3.267326732673267e-06, + "loss": 1.8123, + "step": 347 + }, + { + "epoch": 0.20879582408351832, + "grad_norm": 1.2890625, + "learning_rate": 3.264851485148515e-06, + "loss": 1.8038, + "step": 348 + }, + { + "epoch": 0.20939581208375832, + "grad_norm": 1.28125, + "learning_rate": 3.262376237623762e-06, + "loss": 1.9785, + "step": 349 + }, + { + "epoch": 0.20999580008399832, + "grad_norm": 1.328125, + "learning_rate": 3.2599009900990096e-06, + "loss": 1.756, + "step": 350 + }, + { + "epoch": 0.20999580008399832, + "eval_loss": 2.0354440212249756, + "eval_model_preparation_time": 0.0036, + "eval_runtime": 65.0421, + "eval_samples_per_second": 153.747, + "eval_steps_per_second": 25.63, + "step": 350 + }, + { + "epoch": 0.21059578808423832, + "grad_norm": 1.34375, + "learning_rate": 3.257425742574257e-06, + "loss": 1.7987, + "step": 351 + }, + { + "epoch": 0.21119577608447831, + "grad_norm": 1.2578125, + "learning_rate": 3.2549504950495047e-06, + "loss": 1.868, + "step": 352 + }, + { + "epoch": 0.2117957640847183, + "grad_norm": 1.2734375, + "learning_rate": 3.252475247524752e-06, + "loss": 1.945, + "step": 353 + }, + { + "epoch": 0.2123957520849583, + "grad_norm": 1.5625, + "learning_rate": 3.25e-06, + "loss": 1.869, + "step": 354 + }, + { + "epoch": 0.2129957400851983, + "grad_norm": 1.28125, + "learning_rate": 3.2475247524752476e-06, + "loss": 1.6786, + "step": 355 + }, + { + "epoch": 0.21359572808543828, + "grad_norm": 1.40625, + "learning_rate": 3.245049504950495e-06, + "loss": 1.885, + "step": 356 + }, + { + "epoch": 0.21419571608567828, + "grad_norm": 1.2578125, + "learning_rate": 3.2425742574257427e-06, + "loss": 1.8202, + "step": 357 + }, + { + "epoch": 0.21479570408591828, + "grad_norm": 1.234375, + "learning_rate": 3.24009900990099e-06, + "loss": 1.8872, + "step": 358 + }, + { + "epoch": 0.21539569208615827, + "grad_norm": 1.3515625, + "learning_rate": 3.2376237623762378e-06, + "loss": 1.8847, + "step": 359 + }, + { + "epoch": 0.21599568008639827, + "grad_norm": 1.4375, + "learning_rate": 3.235148514851485e-06, + "loss": 1.769, + "step": 360 + }, + { + "epoch": 0.21659566808663827, + "grad_norm": 1.25, + "learning_rate": 3.232673267326733e-06, + "loss": 1.8455, + "step": 361 + }, + { + "epoch": 0.21719565608687827, + "grad_norm": 1.375, + "learning_rate": 3.2301980198019802e-06, + "loss": 1.8822, + "step": 362 + }, + { + "epoch": 0.21779564408711827, + "grad_norm": 1.2109375, + "learning_rate": 3.2277227722772276e-06, + "loss": 1.863, + "step": 363 + }, + { + "epoch": 0.21839563208735827, + "grad_norm": 1.359375, + "learning_rate": 3.225247524752475e-06, + "loss": 1.8595, + "step": 364 + }, + { + "epoch": 0.21899562008759824, + "grad_norm": 1.53125, + "learning_rate": 3.2227722772277227e-06, + "loss": 1.8043, + "step": 365 + }, + { + "epoch": 0.21959560808783823, + "grad_norm": 1.2109375, + "learning_rate": 3.22029702970297e-06, + "loss": 1.8976, + "step": 366 + }, + { + "epoch": 0.22019559608807823, + "grad_norm": 1.2265625, + "learning_rate": 3.217821782178218e-06, + "loss": 1.8178, + "step": 367 + }, + { + "epoch": 0.22079558408831823, + "grad_norm": 1.234375, + "learning_rate": 3.215346534653465e-06, + "loss": 1.7554, + "step": 368 + }, + { + "epoch": 0.22139557208855823, + "grad_norm": 1.4296875, + "learning_rate": 3.212871287128713e-06, + "loss": 1.8248, + "step": 369 + }, + { + "epoch": 0.22199556008879823, + "grad_norm": 1.296875, + "learning_rate": 3.2103960396039603e-06, + "loss": 1.93, + "step": 370 + }, + { + "epoch": 0.22259554808903823, + "grad_norm": 1.3203125, + "learning_rate": 3.207920792079208e-06, + "loss": 1.9148, + "step": 371 + }, + { + "epoch": 0.22319553608927822, + "grad_norm": 1.3359375, + "learning_rate": 3.2054455445544554e-06, + "loss": 1.8438, + "step": 372 + }, + { + "epoch": 0.22379552408951822, + "grad_norm": 1.2734375, + "learning_rate": 3.202970297029703e-06, + "loss": 1.7993, + "step": 373 + }, + { + "epoch": 0.2243955120897582, + "grad_norm": 1.28125, + "learning_rate": 3.2004950495049505e-06, + "loss": 1.8202, + "step": 374 + }, + { + "epoch": 0.2249955000899982, + "grad_norm": 1.328125, + "learning_rate": 3.1980198019801982e-06, + "loss": 1.8704, + "step": 375 + }, + { + "epoch": 0.2255954880902382, + "grad_norm": 1.40625, + "learning_rate": 3.195544554455445e-06, + "loss": 1.8558, + "step": 376 + }, + { + "epoch": 0.2261954760904782, + "grad_norm": 1.234375, + "learning_rate": 3.193069306930693e-06, + "loss": 2.0454, + "step": 377 + }, + { + "epoch": 0.22679546409071819, + "grad_norm": 1.2734375, + "learning_rate": 3.1905940594059403e-06, + "loss": 1.8577, + "step": 378 + }, + { + "epoch": 0.22739545209095818, + "grad_norm": 1.8125, + "learning_rate": 3.188118811881188e-06, + "loss": 1.8602, + "step": 379 + }, + { + "epoch": 0.22799544009119818, + "grad_norm": 1.3125, + "learning_rate": 3.1856435643564354e-06, + "loss": 1.7231, + "step": 380 + }, + { + "epoch": 0.22859542809143818, + "grad_norm": 1.359375, + "learning_rate": 3.183168316831683e-06, + "loss": 1.712, + "step": 381 + }, + { + "epoch": 0.22919541609167818, + "grad_norm": 1.25, + "learning_rate": 3.1806930693069305e-06, + "loss": 1.8823, + "step": 382 + }, + { + "epoch": 0.22979540409191815, + "grad_norm": 1.3828125, + "learning_rate": 3.1782178217821783e-06, + "loss": 1.845, + "step": 383 + }, + { + "epoch": 0.23039539209215815, + "grad_norm": 1.4296875, + "learning_rate": 3.1757425742574256e-06, + "loss": 1.8086, + "step": 384 + }, + { + "epoch": 0.23099538009239814, + "grad_norm": 1.40625, + "learning_rate": 3.1732673267326734e-06, + "loss": 1.8122, + "step": 385 + }, + { + "epoch": 0.23159536809263814, + "grad_norm": 1.2734375, + "learning_rate": 3.1707920792079207e-06, + "loss": 1.8456, + "step": 386 + }, + { + "epoch": 0.23219535609287814, + "grad_norm": 1.4765625, + "learning_rate": 3.1683168316831685e-06, + "loss": 1.8158, + "step": 387 + }, + { + "epoch": 0.23279534409311814, + "grad_norm": 1.1875, + "learning_rate": 3.165841584158416e-06, + "loss": 1.8107, + "step": 388 + }, + { + "epoch": 0.23339533209335814, + "grad_norm": 1.4375, + "learning_rate": 3.163366336633663e-06, + "loss": 1.825, + "step": 389 + }, + { + "epoch": 0.23399532009359814, + "grad_norm": 1.4921875, + "learning_rate": 3.1608910891089105e-06, + "loss": 1.8038, + "step": 390 + }, + { + "epoch": 0.23459530809383813, + "grad_norm": 1.203125, + "learning_rate": 3.1584158415841583e-06, + "loss": 1.8152, + "step": 391 + }, + { + "epoch": 0.23519529609407813, + "grad_norm": 1.4453125, + "learning_rate": 3.1559405940594056e-06, + "loss": 1.926, + "step": 392 + }, + { + "epoch": 0.2357952840943181, + "grad_norm": 1.328125, + "learning_rate": 3.1534653465346534e-06, + "loss": 1.8543, + "step": 393 + }, + { + "epoch": 0.2363952720945581, + "grad_norm": 1.25, + "learning_rate": 3.1509900990099007e-06, + "loss": 1.8222, + "step": 394 + }, + { + "epoch": 0.2369952600947981, + "grad_norm": 1.2578125, + "learning_rate": 3.1485148514851485e-06, + "loss": 1.9147, + "step": 395 + }, + { + "epoch": 0.2375952480950381, + "grad_norm": 1.21875, + "learning_rate": 3.146039603960396e-06, + "loss": 1.814, + "step": 396 + }, + { + "epoch": 0.2381952360952781, + "grad_norm": 1.0625, + "learning_rate": 3.1435643564356436e-06, + "loss": 1.8623, + "step": 397 + }, + { + "epoch": 0.2387952240955181, + "grad_norm": 1.34375, + "learning_rate": 3.141089108910891e-06, + "loss": 1.8996, + "step": 398 + }, + { + "epoch": 0.2393952120957581, + "grad_norm": 1.2265625, + "learning_rate": 3.1386138613861387e-06, + "loss": 1.8078, + "step": 399 + }, + { + "epoch": 0.2399952000959981, + "grad_norm": 1.125, + "learning_rate": 3.136138613861386e-06, + "loss": 1.8248, + "step": 400 + }, + { + "epoch": 0.2399952000959981, + "eval_loss": 2.0347118377685547, + "eval_model_preparation_time": 0.0036, + "eval_runtime": 65.1996, + "eval_samples_per_second": 153.375, + "eval_steps_per_second": 25.568, + "step": 400 + }, + { + "epoch": 0.2405951880962381, + "grad_norm": 1.203125, + "learning_rate": 3.133663366336634e-06, + "loss": 1.9232, + "step": 401 + }, + { + "epoch": 0.24119517609647806, + "grad_norm": 1.3359375, + "learning_rate": 3.1311881188118807e-06, + "loss": 1.8487, + "step": 402 + }, + { + "epoch": 0.24179516409671806, + "grad_norm": 1.3828125, + "learning_rate": 3.1287128712871285e-06, + "loss": 1.8713, + "step": 403 + }, + { + "epoch": 0.24239515209695806, + "grad_norm": 1.5078125, + "learning_rate": 3.126237623762376e-06, + "loss": 1.9976, + "step": 404 + }, + { + "epoch": 0.24299514009719805, + "grad_norm": 1.3359375, + "learning_rate": 3.1237623762376236e-06, + "loss": 1.769, + "step": 405 + }, + { + "epoch": 0.24359512809743805, + "grad_norm": 1.296875, + "learning_rate": 3.121287128712871e-06, + "loss": 1.9225, + "step": 406 + }, + { + "epoch": 0.24419511609767805, + "grad_norm": 1.3046875, + "learning_rate": 3.1188118811881187e-06, + "loss": 1.8012, + "step": 407 + }, + { + "epoch": 0.24479510409791805, + "grad_norm": 1.3125, + "learning_rate": 3.116336633663366e-06, + "loss": 1.8264, + "step": 408 + }, + { + "epoch": 0.24539509209815805, + "grad_norm": 1.3203125, + "learning_rate": 3.113861386138614e-06, + "loss": 1.9196, + "step": 409 + }, + { + "epoch": 0.24599508009839804, + "grad_norm": 1.2109375, + "learning_rate": 3.111386138613861e-06, + "loss": 1.7687, + "step": 410 + }, + { + "epoch": 0.24659506809863802, + "grad_norm": 1.2578125, + "learning_rate": 3.108910891089109e-06, + "loss": 1.8449, + "step": 411 + }, + { + "epoch": 0.247195056098878, + "grad_norm": 1.2265625, + "learning_rate": 3.1064356435643563e-06, + "loss": 1.9169, + "step": 412 + }, + { + "epoch": 0.247795044099118, + "grad_norm": 1.4453125, + "learning_rate": 3.103960396039604e-06, + "loss": 1.83, + "step": 413 + }, + { + "epoch": 0.248395032099358, + "grad_norm": 1.3203125, + "learning_rate": 3.1014851485148514e-06, + "loss": 1.8564, + "step": 414 + }, + { + "epoch": 0.248995020099598, + "grad_norm": 1.203125, + "learning_rate": 3.099009900990099e-06, + "loss": 1.8824, + "step": 415 + }, + { + "epoch": 0.249595008099838, + "grad_norm": 1.421875, + "learning_rate": 3.096534653465346e-06, + "loss": 1.7244, + "step": 416 + }, + { + "epoch": 0.250194996100078, + "grad_norm": 1.3203125, + "learning_rate": 3.094059405940594e-06, + "loss": 1.7585, + "step": 417 + }, + { + "epoch": 0.250794984100318, + "grad_norm": 1.28125, + "learning_rate": 3.091584158415841e-06, + "loss": 1.8895, + "step": 418 + }, + { + "epoch": 0.251394972100558, + "grad_norm": 1.3203125, + "learning_rate": 3.089108910891089e-06, + "loss": 1.8884, + "step": 419 + }, + { + "epoch": 0.25199496010079797, + "grad_norm": 2.0625, + "learning_rate": 3.0866336633663363e-06, + "loss": 1.8346, + "step": 420 + }, + { + "epoch": 0.252594948101038, + "grad_norm": 1.4140625, + "learning_rate": 3.084158415841584e-06, + "loss": 1.8132, + "step": 421 + }, + { + "epoch": 0.25319493610127797, + "grad_norm": 1.125, + "learning_rate": 3.0816831683168314e-06, + "loss": 1.7638, + "step": 422 + }, + { + "epoch": 0.253794924101518, + "grad_norm": 1.28125, + "learning_rate": 3.079207920792079e-06, + "loss": 1.8337, + "step": 423 + }, + { + "epoch": 0.25439491210175796, + "grad_norm": 1.421875, + "learning_rate": 3.0767326732673265e-06, + "loss": 1.8597, + "step": 424 + }, + { + "epoch": 0.25499490010199793, + "grad_norm": 4.25, + "learning_rate": 3.0742574257425743e-06, + "loss": 1.7829, + "step": 425 + }, + { + "epoch": 0.25559488810223796, + "grad_norm": 1.40625, + "learning_rate": 3.0717821782178216e-06, + "loss": 1.8581, + "step": 426 + }, + { + "epoch": 0.25619487610247793, + "grad_norm": 1.4921875, + "learning_rate": 3.0693069306930694e-06, + "loss": 1.8007, + "step": 427 + }, + { + "epoch": 0.25679486410271796, + "grad_norm": 1.359375, + "learning_rate": 3.0668316831683167e-06, + "loss": 1.7999, + "step": 428 + }, + { + "epoch": 0.2573948521029579, + "grad_norm": 1.2734375, + "learning_rate": 3.064356435643564e-06, + "loss": 1.9848, + "step": 429 + }, + { + "epoch": 0.25799484010319795, + "grad_norm": 1.5390625, + "learning_rate": 3.0618811881188114e-06, + "loss": 1.7019, + "step": 430 + }, + { + "epoch": 0.2585948281034379, + "grad_norm": 1.3359375, + "learning_rate": 3.059405940594059e-06, + "loss": 1.8219, + "step": 431 + }, + { + "epoch": 0.25919481610367795, + "grad_norm": 1.3671875, + "learning_rate": 3.0569306930693065e-06, + "loss": 1.8192, + "step": 432 + }, + { + "epoch": 0.2597948041039179, + "grad_norm": 1.3828125, + "learning_rate": 3.0544554455445543e-06, + "loss": 1.9334, + "step": 433 + }, + { + "epoch": 0.2603947921041579, + "grad_norm": 1.1484375, + "learning_rate": 3.0519801980198016e-06, + "loss": 1.7363, + "step": 434 + }, + { + "epoch": 0.2609947801043979, + "grad_norm": 1.34375, + "learning_rate": 3.0495049504950494e-06, + "loss": 1.8116, + "step": 435 + }, + { + "epoch": 0.2615947681046379, + "grad_norm": 2.453125, + "learning_rate": 3.0470297029702967e-06, + "loss": 1.849, + "step": 436 + }, + { + "epoch": 0.2621947561048779, + "grad_norm": 1.28125, + "learning_rate": 3.0445544554455445e-06, + "loss": 1.8584, + "step": 437 + }, + { + "epoch": 0.2627947441051179, + "grad_norm": 1.1875, + "learning_rate": 3.042079207920792e-06, + "loss": 1.7644, + "step": 438 + }, + { + "epoch": 0.2633947321053579, + "grad_norm": 1.34375, + "learning_rate": 3.0396039603960396e-06, + "loss": 1.71, + "step": 439 + }, + { + "epoch": 0.2639947201055979, + "grad_norm": 1.3515625, + "learning_rate": 3.037128712871287e-06, + "loss": 1.7796, + "step": 440 + }, + { + "epoch": 0.2645947081058379, + "grad_norm": 1.3046875, + "learning_rate": 3.0346534653465347e-06, + "loss": 1.8207, + "step": 441 + }, + { + "epoch": 0.2651946961060779, + "grad_norm": 1.2734375, + "learning_rate": 3.0321782178217817e-06, + "loss": 1.8873, + "step": 442 + }, + { + "epoch": 0.26579468410631785, + "grad_norm": 1.328125, + "learning_rate": 3.0297029702970294e-06, + "loss": 1.9306, + "step": 443 + }, + { + "epoch": 0.2663946721065579, + "grad_norm": 1.3203125, + "learning_rate": 3.0272277227722768e-06, + "loss": 1.8683, + "step": 444 + }, + { + "epoch": 0.26699466010679784, + "grad_norm": 1.3671875, + "learning_rate": 3.0247524752475245e-06, + "loss": 1.8457, + "step": 445 + }, + { + "epoch": 0.26759464810703787, + "grad_norm": 1.40625, + "learning_rate": 3.022277227722772e-06, + "loss": 1.7025, + "step": 446 + }, + { + "epoch": 0.26819463610727784, + "grad_norm": 1.7421875, + "learning_rate": 3.0198019801980196e-06, + "loss": 1.7783, + "step": 447 + }, + { + "epoch": 0.26879462410751787, + "grad_norm": 1.328125, + "learning_rate": 3.017326732673267e-06, + "loss": 1.9892, + "step": 448 + }, + { + "epoch": 0.26939461210775784, + "grad_norm": 1.4609375, + "learning_rate": 3.0148514851485147e-06, + "loss": 1.7737, + "step": 449 + }, + { + "epoch": 0.26999460010799786, + "grad_norm": 1.2109375, + "learning_rate": 3.012376237623762e-06, + "loss": 1.8656, + "step": 450 + }, + { + "epoch": 0.26999460010799786, + "eval_loss": 2.033843994140625, + "eval_model_preparation_time": 0.0036, + "eval_runtime": 67.9747, + "eval_samples_per_second": 147.114, + "eval_steps_per_second": 24.524, + "step": 450 + }, + { + "epoch": 0.27059458810823783, + "grad_norm": 1.3984375, + "learning_rate": 3.00990099009901e-06, + "loss": 1.7738, + "step": 451 + }, + { + "epoch": 0.2711945761084778, + "grad_norm": 1.1640625, + "learning_rate": 3.007425742574257e-06, + "loss": 1.8191, + "step": 452 + }, + { + "epoch": 0.27179456410871783, + "grad_norm": 1.171875, + "learning_rate": 3.004950495049505e-06, + "loss": 1.7908, + "step": 453 + }, + { + "epoch": 0.2723945521089578, + "grad_norm": 1.46875, + "learning_rate": 3.0024752475247523e-06, + "loss": 1.7694, + "step": 454 + }, + { + "epoch": 0.2729945401091978, + "grad_norm": 1.40625, + "learning_rate": 3e-06, + "loss": 1.8002, + "step": 455 + }, + { + "epoch": 0.2735945281094378, + "grad_norm": 1.390625, + "learning_rate": 2.9975247524752474e-06, + "loss": 1.8376, + "step": 456 + }, + { + "epoch": 0.2741945161096778, + "grad_norm": 1.65625, + "learning_rate": 2.9950495049504948e-06, + "loss": 1.8418, + "step": 457 + }, + { + "epoch": 0.2747945041099178, + "grad_norm": 1.5625, + "learning_rate": 2.9925742574257425e-06, + "loss": 1.9308, + "step": 458 + }, + { + "epoch": 0.2753944921101578, + "grad_norm": 1.359375, + "learning_rate": 2.99009900990099e-06, + "loss": 1.7999, + "step": 459 + }, + { + "epoch": 0.2759944801103978, + "grad_norm": 1.3984375, + "learning_rate": 2.9876237623762376e-06, + "loss": 1.8369, + "step": 460 + }, + { + "epoch": 0.2765944681106378, + "grad_norm": 1.3359375, + "learning_rate": 2.985148514851485e-06, + "loss": 1.8621, + "step": 461 + }, + { + "epoch": 0.2771944561108778, + "grad_norm": 1.3046875, + "learning_rate": 2.9826732673267327e-06, + "loss": 1.7909, + "step": 462 + }, + { + "epoch": 0.27779444411111776, + "grad_norm": 2.21875, + "learning_rate": 2.98019801980198e-06, + "loss": 1.7593, + "step": 463 + }, + { + "epoch": 0.2783944321113578, + "grad_norm": 1.5703125, + "learning_rate": 2.977722772277228e-06, + "loss": 1.8737, + "step": 464 + }, + { + "epoch": 0.27899442011159775, + "grad_norm": 1.3125, + "learning_rate": 2.975247524752475e-06, + "loss": 1.7006, + "step": 465 + }, + { + "epoch": 0.2795944081118378, + "grad_norm": 1.203125, + "learning_rate": 2.972772277227723e-06, + "loss": 1.9047, + "step": 466 + }, + { + "epoch": 0.28019439611207775, + "grad_norm": 1.28125, + "learning_rate": 2.9702970297029703e-06, + "loss": 1.7895, + "step": 467 + }, + { + "epoch": 0.2807943841123178, + "grad_norm": 1.3046875, + "learning_rate": 2.967821782178218e-06, + "loss": 1.7947, + "step": 468 + }, + { + "epoch": 0.28139437211255774, + "grad_norm": 1.3203125, + "learning_rate": 2.965346534653465e-06, + "loss": 1.7682, + "step": 469 + }, + { + "epoch": 0.28199436011279777, + "grad_norm": 1.1953125, + "learning_rate": 2.9628712871287128e-06, + "loss": 1.8296, + "step": 470 + }, + { + "epoch": 0.28259434811303774, + "grad_norm": 1.1640625, + "learning_rate": 2.96039603960396e-06, + "loss": 1.9427, + "step": 471 + }, + { + "epoch": 0.2831943361132777, + "grad_norm": 1.28125, + "learning_rate": 2.957920792079208e-06, + "loss": 1.9328, + "step": 472 + }, + { + "epoch": 0.28379432411351774, + "grad_norm": 1.390625, + "learning_rate": 2.9554455445544552e-06, + "loss": 1.8043, + "step": 473 + }, + { + "epoch": 0.2843943121137577, + "grad_norm": 2.375, + "learning_rate": 2.952970297029703e-06, + "loss": 1.7856, + "step": 474 + }, + { + "epoch": 0.28499430011399773, + "grad_norm": 1.421875, + "learning_rate": 2.9504950495049503e-06, + "loss": 1.7967, + "step": 475 + }, + { + "epoch": 0.2855942881142377, + "grad_norm": 1.296875, + "learning_rate": 2.948019801980198e-06, + "loss": 1.8577, + "step": 476 + }, + { + "epoch": 0.28619427611447773, + "grad_norm": 1.328125, + "learning_rate": 2.9455445544554454e-06, + "loss": 1.8487, + "step": 477 + }, + { + "epoch": 0.2867942641147177, + "grad_norm": 1.8515625, + "learning_rate": 2.943069306930693e-06, + "loss": 1.9896, + "step": 478 + }, + { + "epoch": 0.2873942521149577, + "grad_norm": 1.3125, + "learning_rate": 2.9405940594059405e-06, + "loss": 1.9078, + "step": 479 + }, + { + "epoch": 0.2879942401151977, + "grad_norm": 1.40625, + "learning_rate": 2.9381188118811883e-06, + "loss": 1.8419, + "step": 480 + }, + { + "epoch": 0.28859422811543767, + "grad_norm": 1.1953125, + "learning_rate": 2.9356435643564357e-06, + "loss": 1.8428, + "step": 481 + }, + { + "epoch": 0.2891942161156777, + "grad_norm": 1.4453125, + "learning_rate": 2.933168316831683e-06, + "loss": 1.9351, + "step": 482 + }, + { + "epoch": 0.28979420411591766, + "grad_norm": 1.1953125, + "learning_rate": 2.9306930693069303e-06, + "loss": 1.8918, + "step": 483 + }, + { + "epoch": 0.2903941921161577, + "grad_norm": 1.3203125, + "learning_rate": 2.928217821782178e-06, + "loss": 2.0012, + "step": 484 + }, + { + "epoch": 0.29099418011639766, + "grad_norm": 1.1796875, + "learning_rate": 2.9257425742574254e-06, + "loss": 1.7536, + "step": 485 + }, + { + "epoch": 0.2915941681166377, + "grad_norm": 1.265625, + "learning_rate": 2.9232673267326732e-06, + "loss": 1.8447, + "step": 486 + }, + { + "epoch": 0.29219415611687766, + "grad_norm": 1.34375, + "learning_rate": 2.9207920792079206e-06, + "loss": 1.7318, + "step": 487 + }, + { + "epoch": 0.2927941441171177, + "grad_norm": 1.4609375, + "learning_rate": 2.9183168316831683e-06, + "loss": 1.8265, + "step": 488 + }, + { + "epoch": 0.29339413211735765, + "grad_norm": 1.234375, + "learning_rate": 2.9158415841584157e-06, + "loss": 1.7215, + "step": 489 + }, + { + "epoch": 0.2939941201175976, + "grad_norm": 1.3984375, + "learning_rate": 2.9133663366336634e-06, + "loss": 1.8658, + "step": 490 + }, + { + "epoch": 0.29459410811783765, + "grad_norm": 1.2109375, + "learning_rate": 2.9108910891089108e-06, + "loss": 1.8764, + "step": 491 + }, + { + "epoch": 0.2951940961180776, + "grad_norm": 1.28125, + "learning_rate": 2.9084158415841585e-06, + "loss": 1.9107, + "step": 492 + }, + { + "epoch": 0.29579408411831765, + "grad_norm": 1.28125, + "learning_rate": 2.905940594059406e-06, + "loss": 1.8486, + "step": 493 + }, + { + "epoch": 0.2963940721185576, + "grad_norm": 1.2890625, + "learning_rate": 2.9034653465346537e-06, + "loss": 1.7087, + "step": 494 + }, + { + "epoch": 0.29699406011879764, + "grad_norm": 1.25, + "learning_rate": 2.9009900990099006e-06, + "loss": 1.8336, + "step": 495 + }, + { + "epoch": 0.2975940481190376, + "grad_norm": 1.5546875, + "learning_rate": 2.8985148514851483e-06, + "loss": 1.9546, + "step": 496 + }, + { + "epoch": 0.29819403611927764, + "grad_norm": 1.2890625, + "learning_rate": 2.8960396039603957e-06, + "loss": 1.8021, + "step": 497 + }, + { + "epoch": 0.2987940241195176, + "grad_norm": 1.296875, + "learning_rate": 2.8935643564356434e-06, + "loss": 1.8557, + "step": 498 + }, + { + "epoch": 0.2993940121197576, + "grad_norm": 1.3203125, + "learning_rate": 2.891089108910891e-06, + "loss": 1.8676, + "step": 499 + }, + { + "epoch": 0.2999940001199976, + "grad_norm": 1.3671875, + "learning_rate": 2.8886138613861386e-06, + "loss": 1.8587, + "step": 500 + }, + { + "epoch": 0.2999940001199976, + "eval_loss": 2.0334386825561523, + "eval_model_preparation_time": 0.0036, + "eval_runtime": 65.1108, + "eval_samples_per_second": 153.584, + "eval_steps_per_second": 25.603, + "step": 500 + }, + { + "epoch": 0.3005939881202376, + "grad_norm": 1.4140625, + "learning_rate": 2.886138613861386e-06, + "loss": 1.828, + "step": 501 + }, + { + "epoch": 0.3011939761204776, + "grad_norm": 1.3125, + "learning_rate": 2.8836633663366337e-06, + "loss": 1.8682, + "step": 502 + }, + { + "epoch": 0.3017939641207176, + "grad_norm": 1.34375, + "learning_rate": 2.881188118811881e-06, + "loss": 1.7229, + "step": 503 + }, + { + "epoch": 0.3023939521209576, + "grad_norm": 2.765625, + "learning_rate": 2.8787128712871288e-06, + "loss": 1.9047, + "step": 504 + }, + { + "epoch": 0.30299394012119757, + "grad_norm": 1.5390625, + "learning_rate": 2.876237623762376e-06, + "loss": 1.8765, + "step": 505 + }, + { + "epoch": 0.3035939281214376, + "grad_norm": 1.265625, + "learning_rate": 2.873762376237624e-06, + "loss": 1.8074, + "step": 506 + }, + { + "epoch": 0.30419391612167757, + "grad_norm": 1.46875, + "learning_rate": 2.8712871287128712e-06, + "loss": 1.8522, + "step": 507 + }, + { + "epoch": 0.30479390412191754, + "grad_norm": 1.390625, + "learning_rate": 2.868811881188119e-06, + "loss": 1.7335, + "step": 508 + }, + { + "epoch": 0.30539389212215756, + "grad_norm": 1.1875, + "learning_rate": 2.866336633663366e-06, + "loss": 1.8103, + "step": 509 + }, + { + "epoch": 0.30599388012239753, + "grad_norm": 1.3203125, + "learning_rate": 2.8638613861386137e-06, + "loss": 1.7559, + "step": 510 + }, + { + "epoch": 0.30659386812263756, + "grad_norm": 1.5546875, + "learning_rate": 2.861386138613861e-06, + "loss": 1.7131, + "step": 511 + }, + { + "epoch": 0.30719385612287753, + "grad_norm": 1.375, + "learning_rate": 2.858910891089109e-06, + "loss": 1.889, + "step": 512 + }, + { + "epoch": 0.30779384412311755, + "grad_norm": 1.265625, + "learning_rate": 2.856435643564356e-06, + "loss": 1.843, + "step": 513 + }, + { + "epoch": 0.3083938321233575, + "grad_norm": 1.359375, + "learning_rate": 2.853960396039604e-06, + "loss": 1.7522, + "step": 514 + }, + { + "epoch": 0.30899382012359755, + "grad_norm": 1.2265625, + "learning_rate": 2.8514851485148512e-06, + "loss": 1.8443, + "step": 515 + }, + { + "epoch": 0.3095938081238375, + "grad_norm": 1.296875, + "learning_rate": 2.849009900990099e-06, + "loss": 1.7471, + "step": 516 + }, + { + "epoch": 0.3101937961240775, + "grad_norm": 1.2109375, + "learning_rate": 2.8465346534653464e-06, + "loss": 1.8667, + "step": 517 + }, + { + "epoch": 0.3107937841243175, + "grad_norm": 1.328125, + "learning_rate": 2.844059405940594e-06, + "loss": 1.7233, + "step": 518 + }, + { + "epoch": 0.3113937721245575, + "grad_norm": 1.296875, + "learning_rate": 2.8415841584158415e-06, + "loss": 1.8619, + "step": 519 + }, + { + "epoch": 0.3119937601247975, + "grad_norm": 1.2734375, + "learning_rate": 2.8391089108910892e-06, + "loss": 1.8094, + "step": 520 + }, + { + "epoch": 0.3125937481250375, + "grad_norm": 1.3515625, + "learning_rate": 2.8366336633663366e-06, + "loss": 1.8636, + "step": 521 + }, + { + "epoch": 0.3131937361252775, + "grad_norm": 1.421875, + "learning_rate": 2.834158415841584e-06, + "loss": 1.8544, + "step": 522 + }, + { + "epoch": 0.3137937241255175, + "grad_norm": 1.1796875, + "learning_rate": 2.8316831683168313e-06, + "loss": 1.7636, + "step": 523 + }, + { + "epoch": 0.3143937121257575, + "grad_norm": 1.3046875, + "learning_rate": 2.829207920792079e-06, + "loss": 1.8123, + "step": 524 + }, + { + "epoch": 0.3149937001259975, + "grad_norm": 1.390625, + "learning_rate": 2.8267326732673264e-06, + "loss": 1.7643, + "step": 525 + }, + { + "epoch": 0.31559368812623745, + "grad_norm": 1.15625, + "learning_rate": 2.824257425742574e-06, + "loss": 1.8393, + "step": 526 + }, + { + "epoch": 0.3161936761264775, + "grad_norm": 1.5390625, + "learning_rate": 2.8217821782178215e-06, + "loss": 1.7629, + "step": 527 + }, + { + "epoch": 0.31679366412671744, + "grad_norm": 1.3203125, + "learning_rate": 2.8193069306930692e-06, + "loss": 1.8098, + "step": 528 + }, + { + "epoch": 0.31739365212695747, + "grad_norm": 1.4296875, + "learning_rate": 2.8168316831683166e-06, + "loss": 1.7945, + "step": 529 + }, + { + "epoch": 0.31799364012719744, + "grad_norm": 1.234375, + "learning_rate": 2.8143564356435644e-06, + "loss": 1.7641, + "step": 530 + }, + { + "epoch": 0.31859362812743747, + "grad_norm": 1.2734375, + "learning_rate": 2.8118811881188117e-06, + "loss": 1.9744, + "step": 531 + }, + { + "epoch": 0.31919361612767744, + "grad_norm": 1.2109375, + "learning_rate": 2.8094059405940595e-06, + "loss": 1.7955, + "step": 532 + }, + { + "epoch": 0.31979360412791746, + "grad_norm": 1.40625, + "learning_rate": 2.806930693069307e-06, + "loss": 1.8048, + "step": 533 + }, + { + "epoch": 0.32039359212815743, + "grad_norm": 1.515625, + "learning_rate": 2.8044554455445546e-06, + "loss": 1.8906, + "step": 534 + }, + { + "epoch": 0.3209935801283974, + "grad_norm": 1.1953125, + "learning_rate": 2.8019801980198015e-06, + "loss": 1.8028, + "step": 535 + }, + { + "epoch": 0.32159356812863743, + "grad_norm": 1.359375, + "learning_rate": 2.7995049504950493e-06, + "loss": 1.7173, + "step": 536 + }, + { + "epoch": 0.3221935561288774, + "grad_norm": 1.3984375, + "learning_rate": 2.7970297029702966e-06, + "loss": 1.8689, + "step": 537 + }, + { + "epoch": 0.3227935441291174, + "grad_norm": 1.3125, + "learning_rate": 2.7945544554455444e-06, + "loss": 1.7717, + "step": 538 + }, + { + "epoch": 0.3233935321293574, + "grad_norm": 1.3125, + "learning_rate": 2.7920792079207917e-06, + "loss": 1.9151, + "step": 539 + }, + { + "epoch": 0.3239935201295974, + "grad_norm": 1.21875, + "learning_rate": 2.7896039603960395e-06, + "loss": 1.9621, + "step": 540 + }, + { + "epoch": 0.3245935081298374, + "grad_norm": 1.3046875, + "learning_rate": 2.787128712871287e-06, + "loss": 1.801, + "step": 541 + }, + { + "epoch": 0.3251934961300774, + "grad_norm": 1.8828125, + "learning_rate": 2.7846534653465346e-06, + "loss": 1.9634, + "step": 542 + }, + { + "epoch": 0.3257934841303174, + "grad_norm": 1.390625, + "learning_rate": 2.782178217821782e-06, + "loss": 1.7282, + "step": 543 + }, + { + "epoch": 0.3263934721305574, + "grad_norm": 1.453125, + "learning_rate": 2.7797029702970297e-06, + "loss": 1.6894, + "step": 544 + }, + { + "epoch": 0.3269934601307974, + "grad_norm": 1.6015625, + "learning_rate": 2.777227722772277e-06, + "loss": 1.7942, + "step": 545 + }, + { + "epoch": 0.32759344813103736, + "grad_norm": 1.1796875, + "learning_rate": 2.774752475247525e-06, + "loss": 1.7971, + "step": 546 + }, + { + "epoch": 0.3281934361312774, + "grad_norm": 1.4609375, + "learning_rate": 2.772277227722772e-06, + "loss": 1.7986, + "step": 547 + }, + { + "epoch": 0.32879342413151735, + "grad_norm": 1.1328125, + "learning_rate": 2.7698019801980195e-06, + "loss": 2.0276, + "step": 548 + }, + { + "epoch": 0.3293934121317574, + "grad_norm": 1.28125, + "learning_rate": 2.767326732673267e-06, + "loss": 1.8949, + "step": 549 + }, + { + "epoch": 0.32999340013199735, + "grad_norm": 1.53125, + "learning_rate": 2.7648514851485146e-06, + "loss": 1.8993, + "step": 550 + }, + { + "epoch": 0.32999340013199735, + "eval_loss": 2.0332491397857666, + "eval_model_preparation_time": 0.0036, + "eval_runtime": 65.0973, + "eval_samples_per_second": 153.616, + "eval_steps_per_second": 25.608, + "step": 550 + }, + { + "epoch": 0.3305933881322374, + "grad_norm": 1.359375, + "learning_rate": 2.762376237623762e-06, + "loss": 1.7827, + "step": 551 + }, + { + "epoch": 0.33119337613247735, + "grad_norm": 1.34375, + "learning_rate": 2.7599009900990097e-06, + "loss": 1.754, + "step": 552 + }, + { + "epoch": 0.33179336413271737, + "grad_norm": 1.3203125, + "learning_rate": 2.757425742574257e-06, + "loss": 1.7121, + "step": 553 + }, + { + "epoch": 0.33239335213295734, + "grad_norm": 1.703125, + "learning_rate": 2.754950495049505e-06, + "loss": 1.8265, + "step": 554 + }, + { + "epoch": 0.3329933401331973, + "grad_norm": 1.3359375, + "learning_rate": 2.752475247524752e-06, + "loss": 1.8452, + "step": 555 + }, + { + "epoch": 0.33359332813343734, + "grad_norm": 1.3046875, + "learning_rate": 2.75e-06, + "loss": 1.9355, + "step": 556 + }, + { + "epoch": 0.3341933161336773, + "grad_norm": 1.5859375, + "learning_rate": 2.7475247524752477e-06, + "loss": 1.7684, + "step": 557 + }, + { + "epoch": 0.33479330413391734, + "grad_norm": 1.3125, + "learning_rate": 2.745049504950495e-06, + "loss": 1.8229, + "step": 558 + }, + { + "epoch": 0.3353932921341573, + "grad_norm": 1.4375, + "learning_rate": 2.742574257425743e-06, + "loss": 1.9012, + "step": 559 + }, + { + "epoch": 0.33599328013439733, + "grad_norm": 1.7890625, + "learning_rate": 2.74009900990099e-06, + "loss": 1.8624, + "step": 560 + }, + { + "epoch": 0.3365932681346373, + "grad_norm": 1.34375, + "learning_rate": 2.737623762376238e-06, + "loss": 1.8385, + "step": 561 + }, + { + "epoch": 0.33719325613487733, + "grad_norm": 1.3671875, + "learning_rate": 2.735148514851485e-06, + "loss": 1.9174, + "step": 562 + }, + { + "epoch": 0.3377932441351173, + "grad_norm": 1.4921875, + "learning_rate": 2.7326732673267326e-06, + "loss": 1.6744, + "step": 563 + }, + { + "epoch": 0.33839323213535727, + "grad_norm": 1.28125, + "learning_rate": 2.73019801980198e-06, + "loss": 1.9218, + "step": 564 + }, + { + "epoch": 0.3389932201355973, + "grad_norm": 1.3828125, + "learning_rate": 2.7277227722772277e-06, + "loss": 1.9251, + "step": 565 + }, + { + "epoch": 0.33959320813583727, + "grad_norm": 1.5390625, + "learning_rate": 2.725247524752475e-06, + "loss": 2.0032, + "step": 566 + }, + { + "epoch": 0.3401931961360773, + "grad_norm": 1.3671875, + "learning_rate": 2.722772277227723e-06, + "loss": 1.7809, + "step": 567 + }, + { + "epoch": 0.34079318413631726, + "grad_norm": 1.3359375, + "learning_rate": 2.72029702970297e-06, + "loss": 1.7717, + "step": 568 + }, + { + "epoch": 0.3413931721365573, + "grad_norm": 1.25, + "learning_rate": 2.717821782178218e-06, + "loss": 1.881, + "step": 569 + }, + { + "epoch": 0.34199316013679726, + "grad_norm": 1.296875, + "learning_rate": 2.7153465346534653e-06, + "loss": 1.8142, + "step": 570 + }, + { + "epoch": 0.3425931481370373, + "grad_norm": 1.234375, + "learning_rate": 2.712871287128713e-06, + "loss": 1.9001, + "step": 571 + }, + { + "epoch": 0.34319313613727725, + "grad_norm": 1.34375, + "learning_rate": 2.7103960396039604e-06, + "loss": 1.7508, + "step": 572 + }, + { + "epoch": 0.3437931241375172, + "grad_norm": 1.5625, + "learning_rate": 2.707920792079208e-06, + "loss": 2.0335, + "step": 573 + }, + { + "epoch": 0.34439311213775725, + "grad_norm": 1.1875, + "learning_rate": 2.7054455445544555e-06, + "loss": 1.9974, + "step": 574 + }, + { + "epoch": 0.3449931001379972, + "grad_norm": 1.3984375, + "learning_rate": 2.702970297029703e-06, + "loss": 1.9025, + "step": 575 + }, + { + "epoch": 0.34559308813823725, + "grad_norm": 1.2578125, + "learning_rate": 2.70049504950495e-06, + "loss": 1.7668, + "step": 576 + }, + { + "epoch": 0.3461930761384772, + "grad_norm": 1.3984375, + "learning_rate": 2.698019801980198e-06, + "loss": 1.8825, + "step": 577 + }, + { + "epoch": 0.34679306413871724, + "grad_norm": 1.453125, + "learning_rate": 2.6955445544554453e-06, + "loss": 1.9186, + "step": 578 + }, + { + "epoch": 0.3473930521389572, + "grad_norm": 1.6796875, + "learning_rate": 2.693069306930693e-06, + "loss": 1.8675, + "step": 579 + }, + { + "epoch": 0.34799304013919724, + "grad_norm": 1.328125, + "learning_rate": 2.6905940594059404e-06, + "loss": 1.9197, + "step": 580 + }, + { + "epoch": 0.3485930281394372, + "grad_norm": 1.359375, + "learning_rate": 2.688118811881188e-06, + "loss": 1.7689, + "step": 581 + }, + { + "epoch": 0.3491930161396772, + "grad_norm": 1.2109375, + "learning_rate": 2.6856435643564355e-06, + "loss": 1.7074, + "step": 582 + }, + { + "epoch": 0.3497930041399172, + "grad_norm": 1.546875, + "learning_rate": 2.6831683168316833e-06, + "loss": 1.8839, + "step": 583 + }, + { + "epoch": 0.3503929921401572, + "grad_norm": 1.28125, + "learning_rate": 2.6806930693069306e-06, + "loss": 1.7625, + "step": 584 + }, + { + "epoch": 0.3509929801403972, + "grad_norm": 1.234375, + "learning_rate": 2.6782178217821784e-06, + "loss": 1.767, + "step": 585 + }, + { + "epoch": 0.3515929681406372, + "grad_norm": 1.328125, + "learning_rate": 2.6757425742574257e-06, + "loss": 1.9642, + "step": 586 + }, + { + "epoch": 0.3521929561408772, + "grad_norm": 1.328125, + "learning_rate": 2.6732673267326735e-06, + "loss": 1.9743, + "step": 587 + }, + { + "epoch": 0.35279294414111717, + "grad_norm": 1.3515625, + "learning_rate": 2.6707920792079204e-06, + "loss": 1.9189, + "step": 588 + }, + { + "epoch": 0.3533929321413572, + "grad_norm": 1.15625, + "learning_rate": 2.668316831683168e-06, + "loss": 1.793, + "step": 589 + }, + { + "epoch": 0.35399292014159717, + "grad_norm": 1.28125, + "learning_rate": 2.6658415841584155e-06, + "loss": 1.8837, + "step": 590 + }, + { + "epoch": 0.35459290814183714, + "grad_norm": 1.3203125, + "learning_rate": 2.6633663366336633e-06, + "loss": 1.8041, + "step": 591 + }, + { + "epoch": 0.35519289614207716, + "grad_norm": 1.359375, + "learning_rate": 2.6608910891089106e-06, + "loss": 1.797, + "step": 592 + }, + { + "epoch": 0.35579288414231713, + "grad_norm": 1.359375, + "learning_rate": 2.6584158415841584e-06, + "loss": 1.6775, + "step": 593 + }, + { + "epoch": 0.35639287214255716, + "grad_norm": 1.1953125, + "learning_rate": 2.6559405940594057e-06, + "loss": 1.8262, + "step": 594 + }, + { + "epoch": 0.35699286014279713, + "grad_norm": 1.4453125, + "learning_rate": 2.6534653465346535e-06, + "loss": 1.8204, + "step": 595 + }, + { + "epoch": 0.35759284814303716, + "grad_norm": 1.2734375, + "learning_rate": 2.650990099009901e-06, + "loss": 1.8847, + "step": 596 + }, + { + "epoch": 0.3581928361432771, + "grad_norm": 1.1953125, + "learning_rate": 2.6485148514851486e-06, + "loss": 1.8749, + "step": 597 + }, + { + "epoch": 0.35879282414351715, + "grad_norm": 1.2578125, + "learning_rate": 2.646039603960396e-06, + "loss": 1.816, + "step": 598 + }, + { + "epoch": 0.3593928121437571, + "grad_norm": 1.296875, + "learning_rate": 2.6435643564356437e-06, + "loss": 1.8531, + "step": 599 + }, + { + "epoch": 0.3599928001439971, + "grad_norm": 1.2265625, + "learning_rate": 2.641089108910891e-06, + "loss": 1.8029, + "step": 600 + }, + { + "epoch": 0.3599928001439971, + "eval_loss": 2.0328757762908936, + "eval_model_preparation_time": 0.0036, + "eval_runtime": 67.4254, + "eval_samples_per_second": 148.312, + "eval_steps_per_second": 24.724, + "step": 600 + }, + { + "epoch": 0.3605927881442371, + "grad_norm": 1.3671875, + "learning_rate": 2.6386138613861384e-06, + "loss": 1.8685, + "step": 601 + }, + { + "epoch": 0.3611927761444771, + "grad_norm": 1.296875, + "learning_rate": 2.6361386138613858e-06, + "loss": 1.8958, + "step": 602 + }, + { + "epoch": 0.3617927641447171, + "grad_norm": 1.2734375, + "learning_rate": 2.6336633663366335e-06, + "loss": 1.7194, + "step": 603 + }, + { + "epoch": 0.3623927521449571, + "grad_norm": 1.296875, + "learning_rate": 2.631188118811881e-06, + "loss": 1.9711, + "step": 604 + }, + { + "epoch": 0.3629927401451971, + "grad_norm": 1.265625, + "learning_rate": 2.6287128712871286e-06, + "loss": 1.7863, + "step": 605 + }, + { + "epoch": 0.3635927281454371, + "grad_norm": 1.2421875, + "learning_rate": 2.626237623762376e-06, + "loss": 1.8361, + "step": 606 + }, + { + "epoch": 0.3641927161456771, + "grad_norm": 1.3125, + "learning_rate": 2.6237623762376237e-06, + "loss": 1.9021, + "step": 607 + }, + { + "epoch": 0.3647927041459171, + "grad_norm": 1.3125, + "learning_rate": 2.621287128712871e-06, + "loss": 1.9613, + "step": 608 + }, + { + "epoch": 0.36539269214615705, + "grad_norm": 1.2109375, + "learning_rate": 2.618811881188119e-06, + "loss": 1.8389, + "step": 609 + }, + { + "epoch": 0.3659926801463971, + "grad_norm": 1.578125, + "learning_rate": 2.616336633663366e-06, + "loss": 1.8752, + "step": 610 + }, + { + "epoch": 0.36659266814663705, + "grad_norm": 1.234375, + "learning_rate": 2.613861386138614e-06, + "loss": 1.8041, + "step": 611 + }, + { + "epoch": 0.36719265614687707, + "grad_norm": 1.4375, + "learning_rate": 2.6113861386138613e-06, + "loss": 1.8281, + "step": 612 + }, + { + "epoch": 0.36779264414711704, + "grad_norm": 1.3046875, + "learning_rate": 2.608910891089109e-06, + "loss": 1.7512, + "step": 613 + }, + { + "epoch": 0.36839263214735707, + "grad_norm": 1.2578125, + "learning_rate": 2.606435643564356e-06, + "loss": 1.9474, + "step": 614 + }, + { + "epoch": 0.36899262014759704, + "grad_norm": 1.265625, + "learning_rate": 2.6039603960396038e-06, + "loss": 1.8229, + "step": 615 + }, + { + "epoch": 0.36959260814783707, + "grad_norm": 1.171875, + "learning_rate": 2.601485148514851e-06, + "loss": 1.6716, + "step": 616 + }, + { + "epoch": 0.37019259614807704, + "grad_norm": 1.3828125, + "learning_rate": 2.599009900990099e-06, + "loss": 1.958, + "step": 617 + }, + { + "epoch": 0.370792584148317, + "grad_norm": 1.3125, + "learning_rate": 2.596534653465346e-06, + "loss": 1.886, + "step": 618 + }, + { + "epoch": 0.37139257214855703, + "grad_norm": 1.3671875, + "learning_rate": 2.594059405940594e-06, + "loss": 1.904, + "step": 619 + }, + { + "epoch": 0.371992560148797, + "grad_norm": 1.5234375, + "learning_rate": 2.5915841584158413e-06, + "loss": 1.8177, + "step": 620 + }, + { + "epoch": 0.37259254814903703, + "grad_norm": 1.1875, + "learning_rate": 2.589108910891089e-06, + "loss": 1.7927, + "step": 621 + }, + { + "epoch": 0.373192536149277, + "grad_norm": 1.2890625, + "learning_rate": 2.5866336633663364e-06, + "loss": 1.8946, + "step": 622 + }, + { + "epoch": 0.373792524149517, + "grad_norm": 1.21875, + "learning_rate": 2.584158415841584e-06, + "loss": 1.7751, + "step": 623 + }, + { + "epoch": 0.374392512149757, + "grad_norm": 1.4453125, + "learning_rate": 2.5816831683168315e-06, + "loss": 1.9221, + "step": 624 + }, + { + "epoch": 0.374992500149997, + "grad_norm": 1.3515625, + "learning_rate": 2.5792079207920793e-06, + "loss": 1.9133, + "step": 625 + }, + { + "epoch": 0.375592488150237, + "grad_norm": 1.28125, + "learning_rate": 2.5767326732673266e-06, + "loss": 1.8273, + "step": 626 + }, + { + "epoch": 0.376192476150477, + "grad_norm": 1.5390625, + "learning_rate": 2.5742574257425744e-06, + "loss": 1.7544, + "step": 627 + }, + { + "epoch": 0.376792464150717, + "grad_norm": 1.3046875, + "learning_rate": 2.5717821782178213e-06, + "loss": 1.7994, + "step": 628 + }, + { + "epoch": 0.37739245215095696, + "grad_norm": 1.3359375, + "learning_rate": 2.569306930693069e-06, + "loss": 1.9355, + "step": 629 + }, + { + "epoch": 0.377992440151197, + "grad_norm": 1.2578125, + "learning_rate": 2.5668316831683164e-06, + "loss": 1.7472, + "step": 630 + }, + { + "epoch": 0.37859242815143695, + "grad_norm": 1.28125, + "learning_rate": 2.564356435643564e-06, + "loss": 1.9562, + "step": 631 + }, + { + "epoch": 0.379192416151677, + "grad_norm": 1.4765625, + "learning_rate": 2.5618811881188115e-06, + "loss": 1.9807, + "step": 632 + }, + { + "epoch": 0.37979240415191695, + "grad_norm": 1.3046875, + "learning_rate": 2.5594059405940593e-06, + "loss": 1.9289, + "step": 633 + }, + { + "epoch": 0.380392392152157, + "grad_norm": 1.4375, + "learning_rate": 2.5569306930693067e-06, + "loss": 1.9202, + "step": 634 + }, + { + "epoch": 0.38099238015239695, + "grad_norm": 1.265625, + "learning_rate": 2.5544554455445544e-06, + "loss": 1.8881, + "step": 635 + }, + { + "epoch": 0.381592368152637, + "grad_norm": 1.2578125, + "learning_rate": 2.5519801980198018e-06, + "loss": 1.8555, + "step": 636 + }, + { + "epoch": 0.38219235615287694, + "grad_norm": 1.2734375, + "learning_rate": 2.5495049504950495e-06, + "loss": 1.9314, + "step": 637 + }, + { + "epoch": 0.3827923441531169, + "grad_norm": 1.2578125, + "learning_rate": 2.547029702970297e-06, + "loss": 1.7896, + "step": 638 + }, + { + "epoch": 0.38339233215335694, + "grad_norm": 1.4375, + "learning_rate": 2.5445544554455446e-06, + "loss": 1.8339, + "step": 639 + }, + { + "epoch": 0.3839923201535969, + "grad_norm": 1.3125, + "learning_rate": 2.542079207920792e-06, + "loss": 1.9142, + "step": 640 + }, + { + "epoch": 0.38459230815383694, + "grad_norm": 1.390625, + "learning_rate": 2.5396039603960393e-06, + "loss": 1.9154, + "step": 641 + }, + { + "epoch": 0.3851922961540769, + "grad_norm": 1.1328125, + "learning_rate": 2.5371287128712867e-06, + "loss": 1.8165, + "step": 642 + }, + { + "epoch": 0.38579228415431693, + "grad_norm": 1.1953125, + "learning_rate": 2.5346534653465344e-06, + "loss": 1.6729, + "step": 643 + }, + { + "epoch": 0.3863922721545569, + "grad_norm": 1.1875, + "learning_rate": 2.5321782178217818e-06, + "loss": 1.8441, + "step": 644 + }, + { + "epoch": 0.38699226015479693, + "grad_norm": 1.4453125, + "learning_rate": 2.5297029702970295e-06, + "loss": 1.7124, + "step": 645 + }, + { + "epoch": 0.3875922481550369, + "grad_norm": 1.359375, + "learning_rate": 2.527227722772277e-06, + "loss": 1.8526, + "step": 646 + }, + { + "epoch": 0.38819223615527687, + "grad_norm": 1.2578125, + "learning_rate": 2.5247524752475247e-06, + "loss": 1.7846, + "step": 647 + }, + { + "epoch": 0.3887922241555169, + "grad_norm": 1.140625, + "learning_rate": 2.522277227722772e-06, + "loss": 1.8099, + "step": 648 + }, + { + "epoch": 0.38939221215575687, + "grad_norm": 1.390625, + "learning_rate": 2.5198019801980198e-06, + "loss": 1.8258, + "step": 649 + }, + { + "epoch": 0.3899922001559969, + "grad_norm": 1.3125, + "learning_rate": 2.517326732673267e-06, + "loss": 1.7954, + "step": 650 + }, + { + "epoch": 0.3899922001559969, + "eval_loss": 2.032578468322754, + "eval_model_preparation_time": 0.0036, + "eval_runtime": 65.0425, + "eval_samples_per_second": 153.746, + "eval_steps_per_second": 25.629, + "step": 650 + }, + { + "epoch": 0.39059218815623686, + "grad_norm": 2.40625, + "learning_rate": 2.514851485148515e-06, + "loss": 1.8582, + "step": 651 + }, + { + "epoch": 0.3911921761564769, + "grad_norm": 1.265625, + "learning_rate": 2.5123762376237622e-06, + "loss": 1.7973, + "step": 652 + }, + { + "epoch": 0.39179216415671686, + "grad_norm": 1.34375, + "learning_rate": 2.50990099009901e-06, + "loss": 1.8664, + "step": 653 + }, + { + "epoch": 0.3923921521569569, + "grad_norm": 1.1953125, + "learning_rate": 2.507425742574257e-06, + "loss": 1.8547, + "step": 654 + }, + { + "epoch": 0.39299214015719686, + "grad_norm": 1.2421875, + "learning_rate": 2.5049504950495047e-06, + "loss": 1.7987, + "step": 655 + }, + { + "epoch": 0.3935921281574368, + "grad_norm": 1.3828125, + "learning_rate": 2.502475247524752e-06, + "loss": 1.7597, + "step": 656 + }, + { + "epoch": 0.39419211615767685, + "grad_norm": 1.28125, + "learning_rate": 2.4999999999999998e-06, + "loss": 1.8468, + "step": 657 + }, + { + "epoch": 0.3947921041579168, + "grad_norm": 1.328125, + "learning_rate": 2.4975247524752475e-06, + "loss": 1.7945, + "step": 658 + }, + { + "epoch": 0.39539209215815685, + "grad_norm": 1.390625, + "learning_rate": 2.495049504950495e-06, + "loss": 1.7316, + "step": 659 + }, + { + "epoch": 0.3959920801583968, + "grad_norm": 1.328125, + "learning_rate": 2.4925742574257427e-06, + "loss": 1.7246, + "step": 660 + }, + { + "epoch": 0.39659206815863685, + "grad_norm": 1.1875, + "learning_rate": 2.49009900990099e-06, + "loss": 1.7835, + "step": 661 + }, + { + "epoch": 0.3971920561588768, + "grad_norm": 1.4453125, + "learning_rate": 2.4876237623762378e-06, + "loss": 1.8527, + "step": 662 + }, + { + "epoch": 0.39779204415911684, + "grad_norm": 1.234375, + "learning_rate": 2.485148514851485e-06, + "loss": 1.8803, + "step": 663 + }, + { + "epoch": 0.3983920321593568, + "grad_norm": 1.3125, + "learning_rate": 2.482673267326733e-06, + "loss": 1.8751, + "step": 664 + }, + { + "epoch": 0.3989920201595968, + "grad_norm": 1.640625, + "learning_rate": 2.4801980198019802e-06, + "loss": 1.9245, + "step": 665 + }, + { + "epoch": 0.3995920081598368, + "grad_norm": 1.25, + "learning_rate": 2.477722772277228e-06, + "loss": 1.7848, + "step": 666 + }, + { + "epoch": 0.4001919961600768, + "grad_norm": 1.9765625, + "learning_rate": 2.475247524752475e-06, + "loss": 1.7866, + "step": 667 + }, + { + "epoch": 0.4007919841603168, + "grad_norm": 1.3515625, + "learning_rate": 2.4727722772277227e-06, + "loss": 1.696, + "step": 668 + }, + { + "epoch": 0.4013919721605568, + "grad_norm": 1.1953125, + "learning_rate": 2.47029702970297e-06, + "loss": 1.768, + "step": 669 + }, + { + "epoch": 0.4019919601607968, + "grad_norm": 1.6328125, + "learning_rate": 2.4678217821782178e-06, + "loss": 1.8123, + "step": 670 + }, + { + "epoch": 0.4025919481610368, + "grad_norm": 1.2421875, + "learning_rate": 2.465346534653465e-06, + "loss": 1.8934, + "step": 671 + }, + { + "epoch": 0.4031919361612768, + "grad_norm": 1.3359375, + "learning_rate": 2.462871287128713e-06, + "loss": 1.8896, + "step": 672 + }, + { + "epoch": 0.40379192416151677, + "grad_norm": 1.28125, + "learning_rate": 2.4603960396039602e-06, + "loss": 1.9371, + "step": 673 + }, + { + "epoch": 0.40439191216175674, + "grad_norm": 1.296875, + "learning_rate": 2.457920792079208e-06, + "loss": 1.68, + "step": 674 + }, + { + "epoch": 0.40499190016199677, + "grad_norm": 1.5, + "learning_rate": 2.4554455445544553e-06, + "loss": 1.9408, + "step": 675 + }, + { + "epoch": 0.40559188816223674, + "grad_norm": 1.4609375, + "learning_rate": 2.452970297029703e-06, + "loss": 1.7283, + "step": 676 + }, + { + "epoch": 0.40619187616247676, + "grad_norm": 2.203125, + "learning_rate": 2.4504950495049505e-06, + "loss": 1.8666, + "step": 677 + }, + { + "epoch": 0.40679186416271673, + "grad_norm": 1.3125, + "learning_rate": 2.4480198019801982e-06, + "loss": 1.8113, + "step": 678 + }, + { + "epoch": 0.40739185216295676, + "grad_norm": 1.71875, + "learning_rate": 2.4455445544554456e-06, + "loss": 1.8736, + "step": 679 + }, + { + "epoch": 0.40799184016319673, + "grad_norm": 1.3359375, + "learning_rate": 2.4430693069306933e-06, + "loss": 1.8091, + "step": 680 + }, + { + "epoch": 0.40859182816343675, + "grad_norm": 1.3515625, + "learning_rate": 2.4405940594059402e-06, + "loss": 1.8982, + "step": 681 + }, + { + "epoch": 0.4091918161636767, + "grad_norm": 1.1796875, + "learning_rate": 2.438118811881188e-06, + "loss": 1.8069, + "step": 682 + }, + { + "epoch": 0.4097918041639167, + "grad_norm": 1.296875, + "learning_rate": 2.4356435643564354e-06, + "loss": 1.8585, + "step": 683 + }, + { + "epoch": 0.4103917921641567, + "grad_norm": 1.65625, + "learning_rate": 2.433168316831683e-06, + "loss": 1.9333, + "step": 684 + }, + { + "epoch": 0.4109917801643967, + "grad_norm": 1.390625, + "learning_rate": 2.4306930693069305e-06, + "loss": 1.8872, + "step": 685 + }, + { + "epoch": 0.4115917681646367, + "grad_norm": 1.421875, + "learning_rate": 2.4282178217821782e-06, + "loss": 1.8252, + "step": 686 + }, + { + "epoch": 0.4121917561648767, + "grad_norm": 1.3359375, + "learning_rate": 2.4257425742574256e-06, + "loss": 1.7709, + "step": 687 + }, + { + "epoch": 0.4127917441651167, + "grad_norm": 1.3046875, + "learning_rate": 2.4232673267326733e-06, + "loss": 1.8485, + "step": 688 + }, + { + "epoch": 0.4133917321653567, + "grad_norm": 1.390625, + "learning_rate": 2.4207920792079207e-06, + "loss": 1.7962, + "step": 689 + }, + { + "epoch": 0.4139917201655967, + "grad_norm": 1.2578125, + "learning_rate": 2.4183168316831685e-06, + "loss": 1.8375, + "step": 690 + }, + { + "epoch": 0.4145917081658367, + "grad_norm": 1.265625, + "learning_rate": 2.415841584158416e-06, + "loss": 1.8245, + "step": 691 + }, + { + "epoch": 0.41519169616607665, + "grad_norm": 1.265625, + "learning_rate": 2.4133663366336636e-06, + "loss": 1.9368, + "step": 692 + }, + { + "epoch": 0.4157916841663167, + "grad_norm": 1.3125, + "learning_rate": 2.410891089108911e-06, + "loss": 1.8899, + "step": 693 + }, + { + "epoch": 0.41639167216655665, + "grad_norm": 1.3125, + "learning_rate": 2.4084158415841582e-06, + "loss": 1.8622, + "step": 694 + }, + { + "epoch": 0.4169916601667967, + "grad_norm": 1.2421875, + "learning_rate": 2.4059405940594056e-06, + "loss": 1.77, + "step": 695 + }, + { + "epoch": 0.41759164816703664, + "grad_norm": 1.28125, + "learning_rate": 2.4034653465346534e-06, + "loss": 1.983, + "step": 696 + }, + { + "epoch": 0.41819163616727667, + "grad_norm": 1.296875, + "learning_rate": 2.4009900990099007e-06, + "loss": 1.9289, + "step": 697 + }, + { + "epoch": 0.41879162416751664, + "grad_norm": 1.3984375, + "learning_rate": 2.3985148514851485e-06, + "loss": 1.7553, + "step": 698 + }, + { + "epoch": 0.41939161216775667, + "grad_norm": 1.28125, + "learning_rate": 2.396039603960396e-06, + "loss": 1.9037, + "step": 699 + }, + { + "epoch": 0.41999160016799664, + "grad_norm": 1.3125, + "learning_rate": 2.3935643564356436e-06, + "loss": 1.8117, + "step": 700 + }, + { + "epoch": 0.41999160016799664, + "eval_loss": 2.0324764251708984, + "eval_model_preparation_time": 0.0036, + "eval_runtime": 65.1863, + "eval_samples_per_second": 153.407, + "eval_steps_per_second": 25.573, + "step": 700 + } + ], + "logging_steps": 1, + "max_steps": 1666, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3404255362492662e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}