{ "best_global_step": 700, "best_metric": 2.0324764251708984, "best_model_checkpoint": "./output_dir/fr-Llama-3.1-8B-lr4e-06-atten0.25-ffn0.25_20250430_122245/checkpoint-700", "epoch": 0.41999160016799664, "eval_steps": 50, "global_step": 700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005999880002399952, "grad_norm": 1.9765625, "learning_rate": 0.0, "loss": 1.9517, "step": 1 }, { "epoch": 0.0011999760004799903, "grad_norm": 1.7265625, "learning_rate": 8e-08, "loss": 1.8084, "step": 2 }, { "epoch": 0.0017999640007199855, "grad_norm": 2.046875, "learning_rate": 1.6e-07, "loss": 1.8214, "step": 3 }, { "epoch": 0.0023999520009599807, "grad_norm": 1.7109375, "learning_rate": 2.4e-07, "loss": 1.8709, "step": 4 }, { "epoch": 0.002999940001199976, "grad_norm": 2.125, "learning_rate": 3.2e-07, "loss": 1.8741, "step": 5 }, { "epoch": 0.003599928001439971, "grad_norm": 2.015625, "learning_rate": 4e-07, "loss": 1.8875, "step": 6 }, { "epoch": 0.004199916001679967, "grad_norm": 1.8359375, "learning_rate": 4.8e-07, "loss": 1.9111, "step": 7 }, { "epoch": 0.004799904001919961, "grad_norm": 1.6875, "learning_rate": 5.6e-07, "loss": 1.9349, "step": 8 }, { "epoch": 0.005399892002159957, "grad_norm": 1.8671875, "learning_rate": 6.4e-07, "loss": 1.891, "step": 9 }, { "epoch": 0.005999880002399952, "grad_norm": 2.65625, "learning_rate": 7.2e-07, "loss": 1.9387, "step": 10 }, { "epoch": 0.006599868002639947, "grad_norm": 1.8203125, "learning_rate": 8e-07, "loss": 1.8719, "step": 11 }, { "epoch": 0.007199856002879942, "grad_norm": 1.71875, "learning_rate": 8.799999999999999e-07, "loss": 2.0476, "step": 12 }, { "epoch": 0.007799844003119938, "grad_norm": 1.828125, "learning_rate": 9.6e-07, "loss": 1.8709, "step": 13 }, { "epoch": 0.008399832003359933, "grad_norm": 1.9375, "learning_rate": 1.04e-06, "loss": 1.83, "step": 14 }, { "epoch": 0.008999820003599928, "grad_norm": 1.796875, "learning_rate": 1.12e-06, "loss": 1.8584, "step": 15 }, { "epoch": 0.009599808003839923, "grad_norm": 1.9375, "learning_rate": 1.2e-06, "loss": 1.9224, "step": 16 }, { "epoch": 0.01019979600407992, "grad_norm": 1.953125, "learning_rate": 1.28e-06, "loss": 1.8243, "step": 17 }, { "epoch": 0.010799784004319914, "grad_norm": 2.0625, "learning_rate": 1.3600000000000001e-06, "loss": 1.9473, "step": 18 }, { "epoch": 0.011399772004559909, "grad_norm": 1.8828125, "learning_rate": 1.44e-06, "loss": 1.9006, "step": 19 }, { "epoch": 0.011999760004799903, "grad_norm": 2.15625, "learning_rate": 1.5199999999999998e-06, "loss": 1.8361, "step": 20 }, { "epoch": 0.0125997480050399, "grad_norm": 1.9140625, "learning_rate": 1.6e-06, "loss": 1.9316, "step": 21 }, { "epoch": 0.013199736005279895, "grad_norm": 2.015625, "learning_rate": 1.6799999999999998e-06, "loss": 1.8865, "step": 22 }, { "epoch": 0.01379972400551989, "grad_norm": 1.9453125, "learning_rate": 1.7599999999999999e-06, "loss": 1.9309, "step": 23 }, { "epoch": 0.014399712005759884, "grad_norm": 2.0625, "learning_rate": 1.84e-06, "loss": 1.8815, "step": 24 }, { "epoch": 0.01499970000599988, "grad_norm": 1.9140625, "learning_rate": 1.92e-06, "loss": 1.8123, "step": 25 }, { "epoch": 0.015599688006239875, "grad_norm": 1.703125, "learning_rate": 2e-06, "loss": 1.8855, "step": 26 }, { "epoch": 0.016199676006479872, "grad_norm": 2.0, "learning_rate": 2.08e-06, "loss": 1.8313, "step": 27 }, { "epoch": 0.016799664006719867, "grad_norm": 1.671875, "learning_rate": 2.16e-06, "loss": 1.8148, "step": 28 }, { "epoch": 0.01739965200695986, "grad_norm": 1.7890625, "learning_rate": 2.24e-06, "loss": 1.9719, "step": 29 }, { "epoch": 0.017999640007199856, "grad_norm": 2.1875, "learning_rate": 2.32e-06, "loss": 1.8331, "step": 30 }, { "epoch": 0.01859962800743985, "grad_norm": 1.890625, "learning_rate": 2.4e-06, "loss": 1.9469, "step": 31 }, { "epoch": 0.019199616007679846, "grad_norm": 1.6953125, "learning_rate": 2.48e-06, "loss": 1.884, "step": 32 }, { "epoch": 0.01979960400791984, "grad_norm": 1.8203125, "learning_rate": 2.56e-06, "loss": 1.9041, "step": 33 }, { "epoch": 0.02039959200815984, "grad_norm": 1.9609375, "learning_rate": 2.64e-06, "loss": 2.0232, "step": 34 }, { "epoch": 0.020999580008399833, "grad_norm": 2.125, "learning_rate": 2.7200000000000002e-06, "loss": 1.832, "step": 35 }, { "epoch": 0.021599568008639828, "grad_norm": 1.8984375, "learning_rate": 2.8e-06, "loss": 1.7751, "step": 36 }, { "epoch": 0.022199556008879823, "grad_norm": 1.875, "learning_rate": 2.88e-06, "loss": 1.8467, "step": 37 }, { "epoch": 0.022799544009119817, "grad_norm": 1.84375, "learning_rate": 2.96e-06, "loss": 1.7807, "step": 38 }, { "epoch": 0.023399532009359812, "grad_norm": 1.9765625, "learning_rate": 3.0399999999999997e-06, "loss": 1.7781, "step": 39 }, { "epoch": 0.023999520009599807, "grad_norm": 1.609375, "learning_rate": 3.1199999999999998e-06, "loss": 1.8486, "step": 40 }, { "epoch": 0.0245995080098398, "grad_norm": 1.9296875, "learning_rate": 3.2e-06, "loss": 1.8328, "step": 41 }, { "epoch": 0.0251994960100798, "grad_norm": 1.7265625, "learning_rate": 3.2799999999999995e-06, "loss": 2.0436, "step": 42 }, { "epoch": 0.025799484010319795, "grad_norm": 1.8125, "learning_rate": 3.3599999999999996e-06, "loss": 1.9575, "step": 43 }, { "epoch": 0.02639947201055979, "grad_norm": 1.8125, "learning_rate": 3.4399999999999997e-06, "loss": 1.9247, "step": 44 }, { "epoch": 0.026999460010799784, "grad_norm": 1.734375, "learning_rate": 3.5199999999999998e-06, "loss": 1.9135, "step": 45 }, { "epoch": 0.02759944801103978, "grad_norm": 1.78125, "learning_rate": 3.6e-06, "loss": 1.9564, "step": 46 }, { "epoch": 0.028199436011279774, "grad_norm": 1.828125, "learning_rate": 3.68e-06, "loss": 1.8047, "step": 47 }, { "epoch": 0.02879942401151977, "grad_norm": 1.5859375, "learning_rate": 3.7599999999999996e-06, "loss": 1.9223, "step": 48 }, { "epoch": 0.029399412011759767, "grad_norm": 1.7109375, "learning_rate": 3.84e-06, "loss": 1.9724, "step": 49 }, { "epoch": 0.02999940001199976, "grad_norm": 1.7890625, "learning_rate": 3.92e-06, "loss": 1.8151, "step": 50 }, { "epoch": 0.02999940001199976, "eval_loss": 2.060624361038208, "eval_model_preparation_time": 0.0036, "eval_runtime": 65.0119, "eval_samples_per_second": 153.818, "eval_steps_per_second": 25.641, "step": 50 }, { "epoch": 0.030599388012239756, "grad_norm": 1.46875, "learning_rate": 4e-06, "loss": 1.8375, "step": 51 }, { "epoch": 0.03119937601247975, "grad_norm": 1.7109375, "learning_rate": 3.997524752475248e-06, "loss": 1.8369, "step": 52 }, { "epoch": 0.031799364012719745, "grad_norm": 1.84375, "learning_rate": 3.9950495049504945e-06, "loss": 1.7909, "step": 53 }, { "epoch": 0.032399352012959744, "grad_norm": 1.46875, "learning_rate": 3.992574257425742e-06, "loss": 1.819, "step": 54 }, { "epoch": 0.032999340013199735, "grad_norm": 1.8046875, "learning_rate": 3.99009900990099e-06, "loss": 1.8014, "step": 55 }, { "epoch": 0.03359932801343973, "grad_norm": 1.96875, "learning_rate": 3.987623762376238e-06, "loss": 1.7094, "step": 56 }, { "epoch": 0.034199316013679724, "grad_norm": 1.546875, "learning_rate": 3.985148514851485e-06, "loss": 1.8507, "step": 57 }, { "epoch": 0.03479930401391972, "grad_norm": 1.640625, "learning_rate": 3.9826732673267325e-06, "loss": 1.8693, "step": 58 }, { "epoch": 0.035399292014159714, "grad_norm": 1.4765625, "learning_rate": 3.98019801980198e-06, "loss": 1.9363, "step": 59 }, { "epoch": 0.03599928001439971, "grad_norm": 1.6328125, "learning_rate": 3.977722772277228e-06, "loss": 1.8308, "step": 60 }, { "epoch": 0.03659926801463971, "grad_norm": 1.71875, "learning_rate": 3.975247524752475e-06, "loss": 1.8361, "step": 61 }, { "epoch": 0.0371992560148797, "grad_norm": 1.375, "learning_rate": 3.972772277227723e-06, "loss": 1.839, "step": 62 }, { "epoch": 0.0377992440151197, "grad_norm": 1.53125, "learning_rate": 3.9702970297029705e-06, "loss": 1.8829, "step": 63 }, { "epoch": 0.03839923201535969, "grad_norm": 1.703125, "learning_rate": 3.967821782178218e-06, "loss": 1.88, "step": 64 }, { "epoch": 0.03899922001559969, "grad_norm": 1.546875, "learning_rate": 3.965346534653465e-06, "loss": 1.9815, "step": 65 }, { "epoch": 0.03959920801583968, "grad_norm": 1.921875, "learning_rate": 3.962871287128713e-06, "loss": 1.8474, "step": 66 }, { "epoch": 0.04019919601607968, "grad_norm": 1.40625, "learning_rate": 3.96039603960396e-06, "loss": 1.8953, "step": 67 }, { "epoch": 0.04079918401631968, "grad_norm": 1.640625, "learning_rate": 3.957920792079208e-06, "loss": 1.861, "step": 68 }, { "epoch": 0.04139917201655967, "grad_norm": 1.6171875, "learning_rate": 3.955445544554455e-06, "loss": 1.9094, "step": 69 }, { "epoch": 0.041999160016799666, "grad_norm": 1.421875, "learning_rate": 3.952970297029703e-06, "loss": 2.0281, "step": 70 }, { "epoch": 0.04259914801703966, "grad_norm": 1.5234375, "learning_rate": 3.95049504950495e-06, "loss": 1.896, "step": 71 }, { "epoch": 0.043199136017279656, "grad_norm": 1.796875, "learning_rate": 3.948019801980198e-06, "loss": 1.8792, "step": 72 }, { "epoch": 0.04379912401751965, "grad_norm": 1.6484375, "learning_rate": 3.945544554455446e-06, "loss": 1.8139, "step": 73 }, { "epoch": 0.044399112017759645, "grad_norm": 1.53125, "learning_rate": 3.943069306930693e-06, "loss": 1.813, "step": 74 }, { "epoch": 0.04499910001799964, "grad_norm": 1.546875, "learning_rate": 3.94059405940594e-06, "loss": 1.8587, "step": 75 }, { "epoch": 0.045599088018239635, "grad_norm": 1.3125, "learning_rate": 3.938118811881188e-06, "loss": 1.8793, "step": 76 }, { "epoch": 0.04619907601847963, "grad_norm": 1.59375, "learning_rate": 3.935643564356436e-06, "loss": 1.7305, "step": 77 }, { "epoch": 0.046799064018719624, "grad_norm": 1.46875, "learning_rate": 3.933168316831683e-06, "loss": 1.8243, "step": 78 }, { "epoch": 0.04739905201895962, "grad_norm": 1.4609375, "learning_rate": 3.9306930693069305e-06, "loss": 1.9018, "step": 79 }, { "epoch": 0.047999040019199614, "grad_norm": 1.59375, "learning_rate": 3.928217821782178e-06, "loss": 1.8979, "step": 80 }, { "epoch": 0.04859902801943961, "grad_norm": 1.359375, "learning_rate": 3.925742574257425e-06, "loss": 1.9308, "step": 81 }, { "epoch": 0.0491990160196796, "grad_norm": 1.6796875, "learning_rate": 3.923267326732673e-06, "loss": 1.8194, "step": 82 }, { "epoch": 0.0497990040199196, "grad_norm": 1.6015625, "learning_rate": 3.920792079207921e-06, "loss": 1.8127, "step": 83 }, { "epoch": 0.0503989920201596, "grad_norm": 1.625, "learning_rate": 3.9183168316831685e-06, "loss": 1.7168, "step": 84 }, { "epoch": 0.05099898002039959, "grad_norm": 1.921875, "learning_rate": 3.915841584158415e-06, "loss": 1.9451, "step": 85 }, { "epoch": 0.05159896802063959, "grad_norm": 1.484375, "learning_rate": 3.913366336633663e-06, "loss": 1.8582, "step": 86 }, { "epoch": 0.05219895602087958, "grad_norm": 1.5703125, "learning_rate": 3.910891089108911e-06, "loss": 1.7901, "step": 87 }, { "epoch": 0.05279894402111958, "grad_norm": 1.484375, "learning_rate": 3.908415841584159e-06, "loss": 1.7489, "step": 88 }, { "epoch": 0.05339893202135957, "grad_norm": 1.625, "learning_rate": 3.905940594059406e-06, "loss": 1.8365, "step": 89 }, { "epoch": 0.05399892002159957, "grad_norm": 1.375, "learning_rate": 3.903465346534653e-06, "loss": 1.8664, "step": 90 }, { "epoch": 0.054598908021839566, "grad_norm": 1.34375, "learning_rate": 3.9009900990099e-06, "loss": 1.7951, "step": 91 }, { "epoch": 0.05519889602207956, "grad_norm": 1.7421875, "learning_rate": 3.898514851485148e-06, "loss": 1.6784, "step": 92 }, { "epoch": 0.055798884022319556, "grad_norm": 1.6171875, "learning_rate": 3.896039603960396e-06, "loss": 1.8358, "step": 93 }, { "epoch": 0.05639887202255955, "grad_norm": 1.5546875, "learning_rate": 3.893564356435644e-06, "loss": 1.7865, "step": 94 }, { "epoch": 0.056998860022799545, "grad_norm": 1.5234375, "learning_rate": 3.8910891089108905e-06, "loss": 1.8682, "step": 95 }, { "epoch": 0.05759884802303954, "grad_norm": 1.4296875, "learning_rate": 3.888613861386138e-06, "loss": 1.8253, "step": 96 }, { "epoch": 0.058198836023279535, "grad_norm": 1.5703125, "learning_rate": 3.886138613861386e-06, "loss": 1.7567, "step": 97 }, { "epoch": 0.05879882402351953, "grad_norm": 1.5703125, "learning_rate": 3.883663366336634e-06, "loss": 2.0378, "step": 98 }, { "epoch": 0.059398812023759524, "grad_norm": 1.6171875, "learning_rate": 3.881188118811881e-06, "loss": 1.9027, "step": 99 }, { "epoch": 0.05999880002399952, "grad_norm": 1.5234375, "learning_rate": 3.8787128712871285e-06, "loss": 2.0289, "step": 100 }, { "epoch": 0.05999880002399952, "eval_loss": 2.0494163036346436, "eval_model_preparation_time": 0.0036, "eval_runtime": 65.139, "eval_samples_per_second": 153.518, "eval_steps_per_second": 25.591, "step": 100 }, { "epoch": 0.060598788024239514, "grad_norm": 1.6328125, "learning_rate": 3.876237623762376e-06, "loss": 1.8943, "step": 101 }, { "epoch": 0.06119877602447951, "grad_norm": 1.59375, "learning_rate": 3.873762376237624e-06, "loss": 1.8859, "step": 102 }, { "epoch": 0.0617987640247195, "grad_norm": 1.5625, "learning_rate": 3.871287128712871e-06, "loss": 1.8459, "step": 103 }, { "epoch": 0.0623987520249595, "grad_norm": 1.46875, "learning_rate": 3.868811881188119e-06, "loss": 1.9159, "step": 104 }, { "epoch": 0.06299874002519949, "grad_norm": 1.4375, "learning_rate": 3.866336633663366e-06, "loss": 1.8295, "step": 105 }, { "epoch": 0.06359872802543949, "grad_norm": 1.2734375, "learning_rate": 3.8638613861386134e-06, "loss": 1.9225, "step": 106 }, { "epoch": 0.06419871602567949, "grad_norm": 1.5234375, "learning_rate": 3.861386138613861e-06, "loss": 1.8648, "step": 107 }, { "epoch": 0.06479870402591949, "grad_norm": 1.5078125, "learning_rate": 3.858910891089109e-06, "loss": 1.8594, "step": 108 }, { "epoch": 0.06539869202615947, "grad_norm": 1.4375, "learning_rate": 3.856435643564356e-06, "loss": 1.979, "step": 109 }, { "epoch": 0.06599868002639947, "grad_norm": 1.7734375, "learning_rate": 3.853960396039604e-06, "loss": 1.8235, "step": 110 }, { "epoch": 0.06659866802663947, "grad_norm": 1.40625, "learning_rate": 3.851485148514851e-06, "loss": 1.7781, "step": 111 }, { "epoch": 0.06719865602687947, "grad_norm": 1.5625, "learning_rate": 3.849009900990099e-06, "loss": 1.8508, "step": 112 }, { "epoch": 0.06779864402711945, "grad_norm": 1.25, "learning_rate": 3.846534653465346e-06, "loss": 1.8538, "step": 113 }, { "epoch": 0.06839863202735945, "grad_norm": 1.296875, "learning_rate": 3.844059405940594e-06, "loss": 1.7364, "step": 114 }, { "epoch": 0.06899862002759945, "grad_norm": 1.703125, "learning_rate": 3.841584158415842e-06, "loss": 1.7373, "step": 115 }, { "epoch": 0.06959860802783945, "grad_norm": 1.4140625, "learning_rate": 3.839108910891089e-06, "loss": 1.8901, "step": 116 }, { "epoch": 0.07019859602807944, "grad_norm": 1.5625, "learning_rate": 3.836633663366336e-06, "loss": 1.8541, "step": 117 }, { "epoch": 0.07079858402831943, "grad_norm": 1.6640625, "learning_rate": 3.834158415841584e-06, "loss": 1.7285, "step": 118 }, { "epoch": 0.07139857202855943, "grad_norm": 1.390625, "learning_rate": 3.831683168316831e-06, "loss": 1.7322, "step": 119 }, { "epoch": 0.07199856002879942, "grad_norm": 1.390625, "learning_rate": 3.829207920792079e-06, "loss": 1.8256, "step": 120 }, { "epoch": 0.07259854802903942, "grad_norm": 1.3203125, "learning_rate": 3.8267326732673265e-06, "loss": 1.7352, "step": 121 }, { "epoch": 0.07319853602927942, "grad_norm": 1.4609375, "learning_rate": 3.824257425742574e-06, "loss": 1.8343, "step": 122 }, { "epoch": 0.0737985240295194, "grad_norm": 1.5078125, "learning_rate": 3.821782178217821e-06, "loss": 1.9866, "step": 123 }, { "epoch": 0.0743985120297594, "grad_norm": 1.359375, "learning_rate": 3.819306930693069e-06, "loss": 1.8854, "step": 124 }, { "epoch": 0.0749985000299994, "grad_norm": 1.484375, "learning_rate": 3.816831683168317e-06, "loss": 1.8341, "step": 125 }, { "epoch": 0.0755984880302394, "grad_norm": 1.3984375, "learning_rate": 3.814356435643564e-06, "loss": 1.829, "step": 126 }, { "epoch": 0.07619847603047938, "grad_norm": 1.4375, "learning_rate": 3.8118811881188114e-06, "loss": 1.8042, "step": 127 }, { "epoch": 0.07679846403071938, "grad_norm": 1.484375, "learning_rate": 3.809405940594059e-06, "loss": 1.7094, "step": 128 }, { "epoch": 0.07739845203095938, "grad_norm": 1.40625, "learning_rate": 3.8069306930693065e-06, "loss": 1.7705, "step": 129 }, { "epoch": 0.07799844003119938, "grad_norm": 1.3984375, "learning_rate": 3.8044554455445543e-06, "loss": 1.8023, "step": 130 }, { "epoch": 0.07859842803143938, "grad_norm": 1.390625, "learning_rate": 3.8019801980198017e-06, "loss": 1.9408, "step": 131 }, { "epoch": 0.07919841603167936, "grad_norm": 1.4921875, "learning_rate": 3.7995049504950494e-06, "loss": 1.8589, "step": 132 }, { "epoch": 0.07979840403191936, "grad_norm": 1.4140625, "learning_rate": 3.7970297029702968e-06, "loss": 1.8806, "step": 133 }, { "epoch": 0.08039839203215936, "grad_norm": 1.5234375, "learning_rate": 3.7945544554455445e-06, "loss": 1.8629, "step": 134 }, { "epoch": 0.08099838003239936, "grad_norm": 1.734375, "learning_rate": 3.792079207920792e-06, "loss": 1.7811, "step": 135 }, { "epoch": 0.08159836803263935, "grad_norm": 1.421875, "learning_rate": 3.7896039603960396e-06, "loss": 1.8703, "step": 136 }, { "epoch": 0.08219835603287934, "grad_norm": 1.4375, "learning_rate": 3.7871287128712866e-06, "loss": 1.9697, "step": 137 }, { "epoch": 0.08279834403311934, "grad_norm": 1.484375, "learning_rate": 3.7846534653465343e-06, "loss": 1.8083, "step": 138 }, { "epoch": 0.08339833203335933, "grad_norm": 1.5078125, "learning_rate": 3.7821782178217817e-06, "loss": 1.8348, "step": 139 }, { "epoch": 0.08399832003359933, "grad_norm": 1.3671875, "learning_rate": 3.7797029702970294e-06, "loss": 1.7378, "step": 140 }, { "epoch": 0.08459830803383932, "grad_norm": 1.3046875, "learning_rate": 3.7772277227722768e-06, "loss": 1.9362, "step": 141 }, { "epoch": 0.08519829603407932, "grad_norm": 1.75, "learning_rate": 3.7747524752475245e-06, "loss": 1.9628, "step": 142 }, { "epoch": 0.08579828403431931, "grad_norm": 1.3203125, "learning_rate": 3.772277227722772e-06, "loss": 1.8524, "step": 143 }, { "epoch": 0.08639827203455931, "grad_norm": 1.6875, "learning_rate": 3.7698019801980197e-06, "loss": 1.7373, "step": 144 }, { "epoch": 0.08699826003479931, "grad_norm": 1.546875, "learning_rate": 3.767326732673267e-06, "loss": 1.8752, "step": 145 }, { "epoch": 0.0875982480350393, "grad_norm": 1.484375, "learning_rate": 3.7648514851485148e-06, "loss": 1.7939, "step": 146 }, { "epoch": 0.08819823603527929, "grad_norm": 1.265625, "learning_rate": 3.762376237623762e-06, "loss": 1.8857, "step": 147 }, { "epoch": 0.08879822403551929, "grad_norm": 1.4921875, "learning_rate": 3.75990099009901e-06, "loss": 1.8343, "step": 148 }, { "epoch": 0.08939821203575929, "grad_norm": 1.4921875, "learning_rate": 3.7574257425742572e-06, "loss": 1.8464, "step": 149 }, { "epoch": 0.08999820003599927, "grad_norm": 1.375, "learning_rate": 3.754950495049505e-06, "loss": 1.6997, "step": 150 }, { "epoch": 0.08999820003599927, "eval_loss": 2.043649673461914, "eval_model_preparation_time": 0.0036, "eval_runtime": 67.4909, "eval_samples_per_second": 148.168, "eval_steps_per_second": 24.7, "step": 150 }, { "epoch": 0.09059818803623927, "grad_norm": 1.3203125, "learning_rate": 3.752475247524752e-06, "loss": 1.8642, "step": 151 }, { "epoch": 0.09119817603647927, "grad_norm": 1.390625, "learning_rate": 3.7499999999999997e-06, "loss": 1.7887, "step": 152 }, { "epoch": 0.09179816403671927, "grad_norm": 1.3203125, "learning_rate": 3.7475247524752474e-06, "loss": 1.8596, "step": 153 }, { "epoch": 0.09239815203695927, "grad_norm": 1.484375, "learning_rate": 3.7450495049504948e-06, "loss": 2.0243, "step": 154 }, { "epoch": 0.09299814003719925, "grad_norm": 1.3203125, "learning_rate": 3.7425742574257425e-06, "loss": 1.7887, "step": 155 }, { "epoch": 0.09359812803743925, "grad_norm": 1.359375, "learning_rate": 3.74009900990099e-06, "loss": 1.828, "step": 156 }, { "epoch": 0.09419811603767925, "grad_norm": 1.3984375, "learning_rate": 3.7376237623762377e-06, "loss": 1.7897, "step": 157 }, { "epoch": 0.09479810403791925, "grad_norm": 1.4453125, "learning_rate": 3.735148514851485e-06, "loss": 1.761, "step": 158 }, { "epoch": 0.09539809203815924, "grad_norm": 1.4609375, "learning_rate": 3.7326732673267328e-06, "loss": 1.812, "step": 159 }, { "epoch": 0.09599808003839923, "grad_norm": 1.3203125, "learning_rate": 3.73019801980198e-06, "loss": 1.8758, "step": 160 }, { "epoch": 0.09659806803863923, "grad_norm": 1.4453125, "learning_rate": 3.727722772277228e-06, "loss": 1.7899, "step": 161 }, { "epoch": 0.09719805603887922, "grad_norm": 1.296875, "learning_rate": 3.7252475247524752e-06, "loss": 1.7542, "step": 162 }, { "epoch": 0.09779804403911922, "grad_norm": 1.53125, "learning_rate": 3.722772277227723e-06, "loss": 1.765, "step": 163 }, { "epoch": 0.0983980320393592, "grad_norm": 1.578125, "learning_rate": 3.72029702970297e-06, "loss": 1.7773, "step": 164 }, { "epoch": 0.0989980200395992, "grad_norm": 1.3203125, "learning_rate": 3.7178217821782177e-06, "loss": 1.9011, "step": 165 }, { "epoch": 0.0995980080398392, "grad_norm": 1.2578125, "learning_rate": 3.715346534653465e-06, "loss": 1.8742, "step": 166 }, { "epoch": 0.1001979960400792, "grad_norm": 1.4140625, "learning_rate": 3.7128712871287128e-06, "loss": 1.8805, "step": 167 }, { "epoch": 0.1007979840403192, "grad_norm": 1.3046875, "learning_rate": 3.71039603960396e-06, "loss": 1.9764, "step": 168 }, { "epoch": 0.10139797204055918, "grad_norm": 1.3203125, "learning_rate": 3.707920792079208e-06, "loss": 1.8004, "step": 169 }, { "epoch": 0.10199796004079918, "grad_norm": 1.265625, "learning_rate": 3.7054455445544552e-06, "loss": 1.8625, "step": 170 }, { "epoch": 0.10259794804103918, "grad_norm": 1.328125, "learning_rate": 3.702970297029703e-06, "loss": 1.7509, "step": 171 }, { "epoch": 0.10319793604127918, "grad_norm": 1.5546875, "learning_rate": 3.7004950495049503e-06, "loss": 1.7476, "step": 172 }, { "epoch": 0.10379792404151916, "grad_norm": 1.4140625, "learning_rate": 3.698019801980198e-06, "loss": 1.8179, "step": 173 }, { "epoch": 0.10439791204175916, "grad_norm": 1.4375, "learning_rate": 3.6955445544554455e-06, "loss": 1.9345, "step": 174 }, { "epoch": 0.10499790004199916, "grad_norm": 1.3984375, "learning_rate": 3.6930693069306932e-06, "loss": 1.8446, "step": 175 }, { "epoch": 0.10559788804223916, "grad_norm": 1.34375, "learning_rate": 3.6905940594059406e-06, "loss": 1.9043, "step": 176 }, { "epoch": 0.10619787604247916, "grad_norm": 1.3203125, "learning_rate": 3.6881188118811883e-06, "loss": 1.8542, "step": 177 }, { "epoch": 0.10679786404271914, "grad_norm": 1.3671875, "learning_rate": 3.6856435643564352e-06, "loss": 1.9043, "step": 178 }, { "epoch": 0.10739785204295914, "grad_norm": 1.328125, "learning_rate": 3.683168316831683e-06, "loss": 1.8059, "step": 179 }, { "epoch": 0.10799784004319914, "grad_norm": 1.21875, "learning_rate": 3.6806930693069304e-06, "loss": 1.6908, "step": 180 }, { "epoch": 0.10859782804343913, "grad_norm": 1.2265625, "learning_rate": 3.678217821782178e-06, "loss": 1.8294, "step": 181 }, { "epoch": 0.10919781604367913, "grad_norm": 1.5859375, "learning_rate": 3.6757425742574255e-06, "loss": 2.0129, "step": 182 }, { "epoch": 0.10979780404391912, "grad_norm": 1.5, "learning_rate": 3.6732673267326732e-06, "loss": 1.7686, "step": 183 }, { "epoch": 0.11039779204415912, "grad_norm": 1.7421875, "learning_rate": 3.6707920792079206e-06, "loss": 1.9643, "step": 184 }, { "epoch": 0.11099778004439911, "grad_norm": 1.5234375, "learning_rate": 3.6683168316831683e-06, "loss": 1.7562, "step": 185 }, { "epoch": 0.11159776804463911, "grad_norm": 1.2421875, "learning_rate": 3.6658415841584157e-06, "loss": 1.8898, "step": 186 }, { "epoch": 0.1121977560448791, "grad_norm": 1.59375, "learning_rate": 3.6633663366336635e-06, "loss": 1.885, "step": 187 }, { "epoch": 0.1127977440451191, "grad_norm": 5.5625, "learning_rate": 3.660891089108911e-06, "loss": 1.9108, "step": 188 }, { "epoch": 0.11339773204535909, "grad_norm": 1.546875, "learning_rate": 3.6584158415841586e-06, "loss": 1.826, "step": 189 }, { "epoch": 0.11399772004559909, "grad_norm": 1.3515625, "learning_rate": 3.6559405940594055e-06, "loss": 1.8905, "step": 190 }, { "epoch": 0.11459770804583909, "grad_norm": 1.25, "learning_rate": 3.6534653465346532e-06, "loss": 1.8051, "step": 191 }, { "epoch": 0.11519769604607907, "grad_norm": 1.21875, "learning_rate": 3.6509900990099006e-06, "loss": 1.7055, "step": 192 }, { "epoch": 0.11579768404631907, "grad_norm": 1.3671875, "learning_rate": 3.6485148514851484e-06, "loss": 1.8768, "step": 193 }, { "epoch": 0.11639767204655907, "grad_norm": 1.2265625, "learning_rate": 3.6460396039603957e-06, "loss": 1.7644, "step": 194 }, { "epoch": 0.11699766004679907, "grad_norm": 1.4375, "learning_rate": 3.6435643564356435e-06, "loss": 1.9606, "step": 195 }, { "epoch": 0.11759764804703907, "grad_norm": 1.34375, "learning_rate": 3.641089108910891e-06, "loss": 1.7722, "step": 196 }, { "epoch": 0.11819763604727905, "grad_norm": 1.3984375, "learning_rate": 3.6386138613861386e-06, "loss": 1.7471, "step": 197 }, { "epoch": 0.11879762404751905, "grad_norm": 1.2265625, "learning_rate": 3.636138613861386e-06, "loss": 1.8394, "step": 198 }, { "epoch": 0.11939761204775905, "grad_norm": 1.4765625, "learning_rate": 3.6336633663366337e-06, "loss": 1.8986, "step": 199 }, { "epoch": 0.11999760004799905, "grad_norm": 1.4296875, "learning_rate": 3.631188118811881e-06, "loss": 1.6904, "step": 200 }, { "epoch": 0.11999760004799905, "eval_loss": 2.040451765060425, "eval_model_preparation_time": 0.0036, "eval_runtime": 65.1419, "eval_samples_per_second": 153.511, "eval_steps_per_second": 25.59, "step": 200 }, { "epoch": 0.12059758804823903, "grad_norm": 1.421875, "learning_rate": 3.628712871287129e-06, "loss": 1.8726, "step": 201 }, { "epoch": 0.12119757604847903, "grad_norm": 1.3671875, "learning_rate": 3.626237623762376e-06, "loss": 1.8508, "step": 202 }, { "epoch": 0.12179756404871903, "grad_norm": 1.3984375, "learning_rate": 3.623762376237624e-06, "loss": 1.9293, "step": 203 }, { "epoch": 0.12239755204895902, "grad_norm": 1.3671875, "learning_rate": 3.621287128712871e-06, "loss": 1.9013, "step": 204 }, { "epoch": 0.12299754004919902, "grad_norm": 2.09375, "learning_rate": 3.6188118811881186e-06, "loss": 1.9189, "step": 205 }, { "epoch": 0.123597528049439, "grad_norm": 1.2890625, "learning_rate": 3.616336633663366e-06, "loss": 1.8183, "step": 206 }, { "epoch": 0.124197516049679, "grad_norm": 1.296875, "learning_rate": 3.6138613861386137e-06, "loss": 1.8873, "step": 207 }, { "epoch": 0.124797504049919, "grad_norm": 1.2265625, "learning_rate": 3.611386138613861e-06, "loss": 1.8153, "step": 208 }, { "epoch": 0.125397492050159, "grad_norm": 1.375, "learning_rate": 3.608910891089109e-06, "loss": 1.7048, "step": 209 }, { "epoch": 0.12599748005039899, "grad_norm": 1.5234375, "learning_rate": 3.606435643564356e-06, "loss": 1.6584, "step": 210 }, { "epoch": 0.12659746805063898, "grad_norm": 1.2109375, "learning_rate": 3.603960396039604e-06, "loss": 1.7825, "step": 211 }, { "epoch": 0.12719745605087898, "grad_norm": 1.3046875, "learning_rate": 3.6014851485148513e-06, "loss": 1.8709, "step": 212 }, { "epoch": 0.12779744405111898, "grad_norm": 1.46875, "learning_rate": 3.599009900990099e-06, "loss": 1.858, "step": 213 }, { "epoch": 0.12839743205135898, "grad_norm": 1.46875, "learning_rate": 3.5965346534653464e-06, "loss": 1.9115, "step": 214 }, { "epoch": 0.12899742005159898, "grad_norm": 1.171875, "learning_rate": 3.594059405940594e-06, "loss": 1.7853, "step": 215 }, { "epoch": 0.12959740805183897, "grad_norm": 1.234375, "learning_rate": 3.5915841584158415e-06, "loss": 1.8218, "step": 216 }, { "epoch": 0.13019739605207895, "grad_norm": 1.484375, "learning_rate": 3.589108910891089e-06, "loss": 1.7609, "step": 217 }, { "epoch": 0.13079738405231894, "grad_norm": 1.4921875, "learning_rate": 3.586633663366336e-06, "loss": 2.1418, "step": 218 }, { "epoch": 0.13139737205255894, "grad_norm": 1.4375, "learning_rate": 3.584158415841584e-06, "loss": 1.7065, "step": 219 }, { "epoch": 0.13199736005279894, "grad_norm": 1.328125, "learning_rate": 3.5816831683168313e-06, "loss": 1.8543, "step": 220 }, { "epoch": 0.13259734805303894, "grad_norm": 1.59375, "learning_rate": 3.579207920792079e-06, "loss": 1.8026, "step": 221 }, { "epoch": 0.13319733605327894, "grad_norm": 1.4140625, "learning_rate": 3.5767326732673264e-06, "loss": 1.7373, "step": 222 }, { "epoch": 0.13379732405351893, "grad_norm": 1.65625, "learning_rate": 3.574257425742574e-06, "loss": 1.7388, "step": 223 }, { "epoch": 0.13439731205375893, "grad_norm": 1.265625, "learning_rate": 3.5717821782178215e-06, "loss": 1.8013, "step": 224 }, { "epoch": 0.13499730005399893, "grad_norm": 1.375, "learning_rate": 3.5693069306930693e-06, "loss": 1.9171, "step": 225 }, { "epoch": 0.1355972880542389, "grad_norm": 1.234375, "learning_rate": 3.5668316831683166e-06, "loss": 1.9055, "step": 226 }, { "epoch": 0.1361972760544789, "grad_norm": 1.21875, "learning_rate": 3.5643564356435644e-06, "loss": 1.9024, "step": 227 }, { "epoch": 0.1367972640547189, "grad_norm": 1.453125, "learning_rate": 3.5618811881188117e-06, "loss": 1.8366, "step": 228 }, { "epoch": 0.1373972520549589, "grad_norm": 1.2109375, "learning_rate": 3.5594059405940595e-06, "loss": 1.8003, "step": 229 }, { "epoch": 0.1379972400551989, "grad_norm": 1.3359375, "learning_rate": 3.5569306930693064e-06, "loss": 1.9299, "step": 230 }, { "epoch": 0.1385972280554389, "grad_norm": 1.25, "learning_rate": 3.554455445544554e-06, "loss": 1.8557, "step": 231 }, { "epoch": 0.1391972160556789, "grad_norm": 1.2421875, "learning_rate": 3.5519801980198015e-06, "loss": 1.8823, "step": 232 }, { "epoch": 0.1397972040559189, "grad_norm": 1.3046875, "learning_rate": 3.5495049504950493e-06, "loss": 1.891, "step": 233 }, { "epoch": 0.1403971920561589, "grad_norm": 1.734375, "learning_rate": 3.5470297029702966e-06, "loss": 1.6939, "step": 234 }, { "epoch": 0.14099718005639889, "grad_norm": 1.421875, "learning_rate": 3.5445544554455444e-06, "loss": 1.8792, "step": 235 }, { "epoch": 0.14159716805663886, "grad_norm": 1.25, "learning_rate": 3.5420792079207917e-06, "loss": 1.856, "step": 236 }, { "epoch": 0.14219715605687885, "grad_norm": 1.34375, "learning_rate": 3.5396039603960395e-06, "loss": 1.8268, "step": 237 }, { "epoch": 0.14279714405711885, "grad_norm": 1.203125, "learning_rate": 3.537128712871287e-06, "loss": 1.7116, "step": 238 }, { "epoch": 0.14339713205735885, "grad_norm": 1.234375, "learning_rate": 3.5346534653465346e-06, "loss": 1.922, "step": 239 }, { "epoch": 0.14399712005759885, "grad_norm": 1.5, "learning_rate": 3.532178217821782e-06, "loss": 1.792, "step": 240 }, { "epoch": 0.14459710805783885, "grad_norm": 1.390625, "learning_rate": 3.5297029702970297e-06, "loss": 1.6891, "step": 241 }, { "epoch": 0.14519709605807885, "grad_norm": 1.359375, "learning_rate": 3.527227722772277e-06, "loss": 1.8681, "step": 242 }, { "epoch": 0.14579708405831884, "grad_norm": 1.2265625, "learning_rate": 3.524752475247525e-06, "loss": 1.6985, "step": 243 }, { "epoch": 0.14639707205855884, "grad_norm": 1.3203125, "learning_rate": 3.5222772277227717e-06, "loss": 1.7168, "step": 244 }, { "epoch": 0.1469970600587988, "grad_norm": 2.921875, "learning_rate": 3.5198019801980195e-06, "loss": 1.7725, "step": 245 }, { "epoch": 0.1475970480590388, "grad_norm": 1.609375, "learning_rate": 3.517326732673267e-06, "loss": 1.8263, "step": 246 }, { "epoch": 0.1481970360592788, "grad_norm": 1.3515625, "learning_rate": 3.5148514851485146e-06, "loss": 1.791, "step": 247 }, { "epoch": 0.1487970240595188, "grad_norm": 1.359375, "learning_rate": 3.512376237623762e-06, "loss": 1.867, "step": 248 }, { "epoch": 0.1493970120597588, "grad_norm": 1.6953125, "learning_rate": 3.5099009900990097e-06, "loss": 1.92, "step": 249 }, { "epoch": 0.1499970000599988, "grad_norm": 1.3984375, "learning_rate": 3.507425742574257e-06, "loss": 1.8612, "step": 250 }, { "epoch": 0.1499970000599988, "eval_loss": 2.0383219718933105, "eval_model_preparation_time": 0.0036, "eval_runtime": 65.0682, "eval_samples_per_second": 153.685, "eval_steps_per_second": 25.619, "step": 250 }, { "epoch": 0.1505969880602388, "grad_norm": 1.4375, "learning_rate": 3.504950495049505e-06, "loss": 1.8017, "step": 251 }, { "epoch": 0.1511969760604788, "grad_norm": 1.375, "learning_rate": 3.502475247524752e-06, "loss": 1.944, "step": 252 }, { "epoch": 0.1517969640607188, "grad_norm": 1.265625, "learning_rate": 3.5e-06, "loss": 1.7536, "step": 253 }, { "epoch": 0.15239695206095877, "grad_norm": 1.203125, "learning_rate": 3.4975247524752477e-06, "loss": 1.7696, "step": 254 }, { "epoch": 0.15299694006119877, "grad_norm": 1.3515625, "learning_rate": 3.495049504950495e-06, "loss": 1.8545, "step": 255 }, { "epoch": 0.15359692806143876, "grad_norm": 1.4609375, "learning_rate": 3.492574257425743e-06, "loss": 1.9009, "step": 256 }, { "epoch": 0.15419691606167876, "grad_norm": 1.328125, "learning_rate": 3.4900990099009897e-06, "loss": 1.915, "step": 257 }, { "epoch": 0.15479690406191876, "grad_norm": 1.546875, "learning_rate": 3.4876237623762375e-06, "loss": 1.7616, "step": 258 }, { "epoch": 0.15539689206215876, "grad_norm": 1.4140625, "learning_rate": 3.485148514851485e-06, "loss": 1.843, "step": 259 }, { "epoch": 0.15599688006239876, "grad_norm": 1.2265625, "learning_rate": 3.4826732673267326e-06, "loss": 1.8939, "step": 260 }, { "epoch": 0.15659686806263876, "grad_norm": 1.2421875, "learning_rate": 3.48019801980198e-06, "loss": 1.8774, "step": 261 }, { "epoch": 0.15719685606287875, "grad_norm": 1.25, "learning_rate": 3.4777227722772277e-06, "loss": 1.7766, "step": 262 }, { "epoch": 0.15779684406311872, "grad_norm": 1.46875, "learning_rate": 3.475247524752475e-06, "loss": 1.806, "step": 263 }, { "epoch": 0.15839683206335872, "grad_norm": 1.2734375, "learning_rate": 3.472772277227723e-06, "loss": 1.8267, "step": 264 }, { "epoch": 0.15899682006359872, "grad_norm": 1.609375, "learning_rate": 3.47029702970297e-06, "loss": 1.8525, "step": 265 }, { "epoch": 0.15959680806383872, "grad_norm": 1.6796875, "learning_rate": 3.467821782178218e-06, "loss": 1.7749, "step": 266 }, { "epoch": 0.16019679606407872, "grad_norm": 1.6328125, "learning_rate": 3.4653465346534653e-06, "loss": 1.905, "step": 267 }, { "epoch": 0.16079678406431872, "grad_norm": 1.328125, "learning_rate": 3.462871287128713e-06, "loss": 1.7351, "step": 268 }, { "epoch": 0.1613967720645587, "grad_norm": 1.703125, "learning_rate": 3.4603960396039604e-06, "loss": 1.8969, "step": 269 }, { "epoch": 0.1619967600647987, "grad_norm": 1.53125, "learning_rate": 3.4579207920792077e-06, "loss": 1.7878, "step": 270 }, { "epoch": 0.1625967480650387, "grad_norm": 1.703125, "learning_rate": 3.455445544554455e-06, "loss": 1.84, "step": 271 }, { "epoch": 0.1631967360652787, "grad_norm": 1.375, "learning_rate": 3.452970297029703e-06, "loss": 1.9861, "step": 272 }, { "epoch": 0.16379672406551868, "grad_norm": 1.3359375, "learning_rate": 3.45049504950495e-06, "loss": 1.8173, "step": 273 }, { "epoch": 0.16439671206575868, "grad_norm": 1.2890625, "learning_rate": 3.448019801980198e-06, "loss": 1.8155, "step": 274 }, { "epoch": 0.16499670006599867, "grad_norm": 1.4140625, "learning_rate": 3.4455445544554453e-06, "loss": 1.7419, "step": 275 }, { "epoch": 0.16559668806623867, "grad_norm": 1.265625, "learning_rate": 3.443069306930693e-06, "loss": 1.7549, "step": 276 }, { "epoch": 0.16619667606647867, "grad_norm": 1.515625, "learning_rate": 3.4405940594059404e-06, "loss": 1.8526, "step": 277 }, { "epoch": 0.16679666406671867, "grad_norm": 1.4765625, "learning_rate": 3.438118811881188e-06, "loss": 1.8459, "step": 278 }, { "epoch": 0.16739665206695867, "grad_norm": 1.390625, "learning_rate": 3.4356435643564355e-06, "loss": 1.8812, "step": 279 }, { "epoch": 0.16799664006719867, "grad_norm": 1.328125, "learning_rate": 3.4331683168316833e-06, "loss": 1.8331, "step": 280 }, { "epoch": 0.16859662806743866, "grad_norm": 1.6015625, "learning_rate": 3.4306930693069306e-06, "loss": 1.965, "step": 281 }, { "epoch": 0.16919661606767863, "grad_norm": 1.5859375, "learning_rate": 3.4282178217821784e-06, "loss": 1.9069, "step": 282 }, { "epoch": 0.16979660406791863, "grad_norm": 1.890625, "learning_rate": 3.4257425742574253e-06, "loss": 1.7781, "step": 283 }, { "epoch": 0.17039659206815863, "grad_norm": 1.1953125, "learning_rate": 3.423267326732673e-06, "loss": 1.9266, "step": 284 }, { "epoch": 0.17099658006839863, "grad_norm": 1.359375, "learning_rate": 3.4207920792079204e-06, "loss": 1.8376, "step": 285 }, { "epoch": 0.17159656806863863, "grad_norm": 1.3203125, "learning_rate": 3.418316831683168e-06, "loss": 1.7678, "step": 286 }, { "epoch": 0.17219655606887863, "grad_norm": 1.15625, "learning_rate": 3.4158415841584155e-06, "loss": 1.8636, "step": 287 }, { "epoch": 0.17279654406911862, "grad_norm": 1.1484375, "learning_rate": 3.4133663366336633e-06, "loss": 1.8833, "step": 288 }, { "epoch": 0.17339653206935862, "grad_norm": 1.375, "learning_rate": 3.4108910891089106e-06, "loss": 1.7882, "step": 289 }, { "epoch": 0.17399652006959862, "grad_norm": 1.3046875, "learning_rate": 3.4084158415841584e-06, "loss": 1.9467, "step": 290 }, { "epoch": 0.1745965080698386, "grad_norm": 1.2890625, "learning_rate": 3.4059405940594058e-06, "loss": 1.7262, "step": 291 }, { "epoch": 0.1751964960700786, "grad_norm": 1.484375, "learning_rate": 3.4034653465346535e-06, "loss": 1.8879, "step": 292 }, { "epoch": 0.1757964840703186, "grad_norm": 1.2734375, "learning_rate": 3.400990099009901e-06, "loss": 1.8687, "step": 293 }, { "epoch": 0.17639647207055859, "grad_norm": 1.3828125, "learning_rate": 3.3985148514851486e-06, "loss": 1.7491, "step": 294 }, { "epoch": 0.17699646007079858, "grad_norm": 1.3359375, "learning_rate": 3.396039603960396e-06, "loss": 1.939, "step": 295 }, { "epoch": 0.17759644807103858, "grad_norm": 1.4375, "learning_rate": 3.3935643564356437e-06, "loss": 1.8243, "step": 296 }, { "epoch": 0.17819643607127858, "grad_norm": 1.3125, "learning_rate": 3.3910891089108907e-06, "loss": 1.8302, "step": 297 }, { "epoch": 0.17879642407151858, "grad_norm": 1.4140625, "learning_rate": 3.3886138613861384e-06, "loss": 1.8882, "step": 298 }, { "epoch": 0.17939641207175858, "grad_norm": 1.375, "learning_rate": 3.3861386138613858e-06, "loss": 1.8015, "step": 299 }, { "epoch": 0.17999640007199855, "grad_norm": 1.296875, "learning_rate": 3.3836633663366335e-06, "loss": 1.9455, "step": 300 }, { "epoch": 0.17999640007199855, "eval_loss": 2.0367591381073, "eval_model_preparation_time": 0.0036, "eval_runtime": 67.518, "eval_samples_per_second": 148.109, "eval_steps_per_second": 24.69, "step": 300 }, { "epoch": 0.18059638807223855, "grad_norm": 1.40625, "learning_rate": 3.381188118811881e-06, "loss": 1.7753, "step": 301 }, { "epoch": 0.18119637607247854, "grad_norm": 1.3203125, "learning_rate": 3.3787128712871286e-06, "loss": 1.9745, "step": 302 }, { "epoch": 0.18179636407271854, "grad_norm": 1.59375, "learning_rate": 3.376237623762376e-06, "loss": 1.7602, "step": 303 }, { "epoch": 0.18239635207295854, "grad_norm": 1.296875, "learning_rate": 3.3737623762376238e-06, "loss": 1.9368, "step": 304 }, { "epoch": 0.18299634007319854, "grad_norm": 1.9609375, "learning_rate": 3.371287128712871e-06, "loss": 1.9062, "step": 305 }, { "epoch": 0.18359632807343854, "grad_norm": 1.2578125, "learning_rate": 3.368811881188119e-06, "loss": 1.7124, "step": 306 }, { "epoch": 0.18419631607367853, "grad_norm": 1.328125, "learning_rate": 3.366336633663366e-06, "loss": 1.7911, "step": 307 }, { "epoch": 0.18479630407391853, "grad_norm": 1.296875, "learning_rate": 3.363861386138614e-06, "loss": 1.8183, "step": 308 }, { "epoch": 0.1853962920741585, "grad_norm": 1.484375, "learning_rate": 3.3613861386138613e-06, "loss": 1.8593, "step": 309 }, { "epoch": 0.1859962800743985, "grad_norm": 2.046875, "learning_rate": 3.3589108910891087e-06, "loss": 1.9335, "step": 310 }, { "epoch": 0.1865962680746385, "grad_norm": 1.3359375, "learning_rate": 3.356435643564356e-06, "loss": 1.8858, "step": 311 }, { "epoch": 0.1871962560748785, "grad_norm": 1.359375, "learning_rate": 3.3539603960396038e-06, "loss": 1.9154, "step": 312 }, { "epoch": 0.1877962440751185, "grad_norm": 1.28125, "learning_rate": 3.351485148514851e-06, "loss": 1.7767, "step": 313 }, { "epoch": 0.1883962320753585, "grad_norm": 1.4765625, "learning_rate": 3.349009900990099e-06, "loss": 1.8943, "step": 314 }, { "epoch": 0.1889962200755985, "grad_norm": 1.2109375, "learning_rate": 3.3465346534653462e-06, "loss": 1.7825, "step": 315 }, { "epoch": 0.1895962080758385, "grad_norm": 1.484375, "learning_rate": 3.344059405940594e-06, "loss": 1.891, "step": 316 }, { "epoch": 0.1901961960760785, "grad_norm": 1.359375, "learning_rate": 3.3415841584158413e-06, "loss": 1.8893, "step": 317 }, { "epoch": 0.1907961840763185, "grad_norm": 1.296875, "learning_rate": 3.339108910891089e-06, "loss": 1.7661, "step": 318 }, { "epoch": 0.19139617207655846, "grad_norm": 1.5078125, "learning_rate": 3.3366336633663364e-06, "loss": 1.7951, "step": 319 }, { "epoch": 0.19199616007679846, "grad_norm": 1.203125, "learning_rate": 3.334158415841584e-06, "loss": 1.825, "step": 320 }, { "epoch": 0.19259614807703845, "grad_norm": 1.265625, "learning_rate": 3.3316831683168316e-06, "loss": 1.7065, "step": 321 }, { "epoch": 0.19319613607727845, "grad_norm": 1.3984375, "learning_rate": 3.3292079207920793e-06, "loss": 1.8174, "step": 322 }, { "epoch": 0.19379612407751845, "grad_norm": 1.3203125, "learning_rate": 3.3267326732673262e-06, "loss": 1.8812, "step": 323 }, { "epoch": 0.19439611207775845, "grad_norm": 1.2578125, "learning_rate": 3.324257425742574e-06, "loss": 1.8562, "step": 324 }, { "epoch": 0.19499610007799845, "grad_norm": 1.2734375, "learning_rate": 3.3217821782178213e-06, "loss": 1.7252, "step": 325 }, { "epoch": 0.19559608807823844, "grad_norm": 1.6015625, "learning_rate": 3.319306930693069e-06, "loss": 1.7466, "step": 326 }, { "epoch": 0.19619607607847844, "grad_norm": 1.34375, "learning_rate": 3.3168316831683165e-06, "loss": 1.7714, "step": 327 }, { "epoch": 0.1967960640787184, "grad_norm": 1.2265625, "learning_rate": 3.3143564356435642e-06, "loss": 1.8033, "step": 328 }, { "epoch": 0.1973960520789584, "grad_norm": 1.71875, "learning_rate": 3.3118811881188116e-06, "loss": 1.812, "step": 329 }, { "epoch": 0.1979960400791984, "grad_norm": 1.3984375, "learning_rate": 3.3094059405940593e-06, "loss": 1.8678, "step": 330 }, { "epoch": 0.1985960280794384, "grad_norm": 1.25, "learning_rate": 3.3069306930693067e-06, "loss": 1.9161, "step": 331 }, { "epoch": 0.1991960160796784, "grad_norm": 1.3984375, "learning_rate": 3.3044554455445544e-06, "loss": 1.7642, "step": 332 }, { "epoch": 0.1997960040799184, "grad_norm": 1.390625, "learning_rate": 3.3019801980198018e-06, "loss": 1.8247, "step": 333 }, { "epoch": 0.2003959920801584, "grad_norm": 1.2578125, "learning_rate": 3.2995049504950496e-06, "loss": 1.9492, "step": 334 }, { "epoch": 0.2009959800803984, "grad_norm": 1.34375, "learning_rate": 3.297029702970297e-06, "loss": 1.7759, "step": 335 }, { "epoch": 0.2015959680806384, "grad_norm": 1.203125, "learning_rate": 3.2945544554455442e-06, "loss": 1.8604, "step": 336 }, { "epoch": 0.20219595608087837, "grad_norm": 1.21875, "learning_rate": 3.2920792079207916e-06, "loss": 1.8555, "step": 337 }, { "epoch": 0.20279594408111837, "grad_norm": 1.359375, "learning_rate": 3.2896039603960393e-06, "loss": 1.7694, "step": 338 }, { "epoch": 0.20339593208135837, "grad_norm": 1.421875, "learning_rate": 3.2871287128712867e-06, "loss": 1.7972, "step": 339 }, { "epoch": 0.20399592008159836, "grad_norm": 1.3203125, "learning_rate": 3.2846534653465345e-06, "loss": 1.902, "step": 340 }, { "epoch": 0.20459590808183836, "grad_norm": 1.296875, "learning_rate": 3.282178217821782e-06, "loss": 1.9065, "step": 341 }, { "epoch": 0.20519589608207836, "grad_norm": 1.40625, "learning_rate": 3.2797029702970296e-06, "loss": 1.7904, "step": 342 }, { "epoch": 0.20579588408231836, "grad_norm": 1.203125, "learning_rate": 3.277227722772277e-06, "loss": 1.7835, "step": 343 }, { "epoch": 0.20639587208255836, "grad_norm": 1.34375, "learning_rate": 3.2747524752475247e-06, "loss": 1.8314, "step": 344 }, { "epoch": 0.20699586008279836, "grad_norm": 1.4296875, "learning_rate": 3.272277227722772e-06, "loss": 1.774, "step": 345 }, { "epoch": 0.20759584808303833, "grad_norm": 1.2265625, "learning_rate": 3.2698019801980198e-06, "loss": 1.8417, "step": 346 }, { "epoch": 0.20819583608327832, "grad_norm": 1.28125, "learning_rate": 3.267326732673267e-06, "loss": 1.8123, "step": 347 }, { "epoch": 0.20879582408351832, "grad_norm": 1.2890625, "learning_rate": 3.264851485148515e-06, "loss": 1.8038, "step": 348 }, { "epoch": 0.20939581208375832, "grad_norm": 1.28125, "learning_rate": 3.262376237623762e-06, "loss": 1.9785, "step": 349 }, { "epoch": 0.20999580008399832, "grad_norm": 1.328125, "learning_rate": 3.2599009900990096e-06, "loss": 1.756, "step": 350 }, { "epoch": 0.20999580008399832, "eval_loss": 2.0354440212249756, "eval_model_preparation_time": 0.0036, "eval_runtime": 65.0421, "eval_samples_per_second": 153.747, "eval_steps_per_second": 25.63, "step": 350 }, { "epoch": 0.21059578808423832, "grad_norm": 1.34375, "learning_rate": 3.257425742574257e-06, "loss": 1.7987, "step": 351 }, { "epoch": 0.21119577608447831, "grad_norm": 1.2578125, "learning_rate": 3.2549504950495047e-06, "loss": 1.868, "step": 352 }, { "epoch": 0.2117957640847183, "grad_norm": 1.2734375, "learning_rate": 3.252475247524752e-06, "loss": 1.945, "step": 353 }, { "epoch": 0.2123957520849583, "grad_norm": 1.5625, "learning_rate": 3.25e-06, "loss": 1.869, "step": 354 }, { "epoch": 0.2129957400851983, "grad_norm": 1.28125, "learning_rate": 3.2475247524752476e-06, "loss": 1.6786, "step": 355 }, { "epoch": 0.21359572808543828, "grad_norm": 1.40625, "learning_rate": 3.245049504950495e-06, "loss": 1.885, "step": 356 }, { "epoch": 0.21419571608567828, "grad_norm": 1.2578125, "learning_rate": 3.2425742574257427e-06, "loss": 1.8202, "step": 357 }, { "epoch": 0.21479570408591828, "grad_norm": 1.234375, "learning_rate": 3.24009900990099e-06, "loss": 1.8872, "step": 358 }, { "epoch": 0.21539569208615827, "grad_norm": 1.3515625, "learning_rate": 3.2376237623762378e-06, "loss": 1.8847, "step": 359 }, { "epoch": 0.21599568008639827, "grad_norm": 1.4375, "learning_rate": 3.235148514851485e-06, "loss": 1.769, "step": 360 }, { "epoch": 0.21659566808663827, "grad_norm": 1.25, "learning_rate": 3.232673267326733e-06, "loss": 1.8455, "step": 361 }, { "epoch": 0.21719565608687827, "grad_norm": 1.375, "learning_rate": 3.2301980198019802e-06, "loss": 1.8822, "step": 362 }, { "epoch": 0.21779564408711827, "grad_norm": 1.2109375, "learning_rate": 3.2277227722772276e-06, "loss": 1.863, "step": 363 }, { "epoch": 0.21839563208735827, "grad_norm": 1.359375, "learning_rate": 3.225247524752475e-06, "loss": 1.8595, "step": 364 }, { "epoch": 0.21899562008759824, "grad_norm": 1.53125, "learning_rate": 3.2227722772277227e-06, "loss": 1.8043, "step": 365 }, { "epoch": 0.21959560808783823, "grad_norm": 1.2109375, "learning_rate": 3.22029702970297e-06, "loss": 1.8976, "step": 366 }, { "epoch": 0.22019559608807823, "grad_norm": 1.2265625, "learning_rate": 3.217821782178218e-06, "loss": 1.8178, "step": 367 }, { "epoch": 0.22079558408831823, "grad_norm": 1.234375, "learning_rate": 3.215346534653465e-06, "loss": 1.7554, "step": 368 }, { "epoch": 0.22139557208855823, "grad_norm": 1.4296875, "learning_rate": 3.212871287128713e-06, "loss": 1.8248, "step": 369 }, { "epoch": 0.22199556008879823, "grad_norm": 1.296875, "learning_rate": 3.2103960396039603e-06, "loss": 1.93, "step": 370 }, { "epoch": 0.22259554808903823, "grad_norm": 1.3203125, "learning_rate": 3.207920792079208e-06, "loss": 1.9148, "step": 371 }, { "epoch": 0.22319553608927822, "grad_norm": 1.3359375, "learning_rate": 3.2054455445544554e-06, "loss": 1.8438, "step": 372 }, { "epoch": 0.22379552408951822, "grad_norm": 1.2734375, "learning_rate": 3.202970297029703e-06, "loss": 1.7993, "step": 373 }, { "epoch": 0.2243955120897582, "grad_norm": 1.28125, "learning_rate": 3.2004950495049505e-06, "loss": 1.8202, "step": 374 }, { "epoch": 0.2249955000899982, "grad_norm": 1.328125, "learning_rate": 3.1980198019801982e-06, "loss": 1.8704, "step": 375 }, { "epoch": 0.2255954880902382, "grad_norm": 1.40625, "learning_rate": 3.195544554455445e-06, "loss": 1.8558, "step": 376 }, { "epoch": 0.2261954760904782, "grad_norm": 1.234375, "learning_rate": 3.193069306930693e-06, "loss": 2.0454, "step": 377 }, { "epoch": 0.22679546409071819, "grad_norm": 1.2734375, "learning_rate": 3.1905940594059403e-06, "loss": 1.8577, "step": 378 }, { "epoch": 0.22739545209095818, "grad_norm": 1.8125, "learning_rate": 3.188118811881188e-06, "loss": 1.8602, "step": 379 }, { "epoch": 0.22799544009119818, "grad_norm": 1.3125, "learning_rate": 3.1856435643564354e-06, "loss": 1.7231, "step": 380 }, { "epoch": 0.22859542809143818, "grad_norm": 1.359375, "learning_rate": 3.183168316831683e-06, "loss": 1.712, "step": 381 }, { "epoch": 0.22919541609167818, "grad_norm": 1.25, "learning_rate": 3.1806930693069305e-06, "loss": 1.8823, "step": 382 }, { "epoch": 0.22979540409191815, "grad_norm": 1.3828125, "learning_rate": 3.1782178217821783e-06, "loss": 1.845, "step": 383 }, { "epoch": 0.23039539209215815, "grad_norm": 1.4296875, "learning_rate": 3.1757425742574256e-06, "loss": 1.8086, "step": 384 }, { "epoch": 0.23099538009239814, "grad_norm": 1.40625, "learning_rate": 3.1732673267326734e-06, "loss": 1.8122, "step": 385 }, { "epoch": 0.23159536809263814, "grad_norm": 1.2734375, "learning_rate": 3.1707920792079207e-06, "loss": 1.8456, "step": 386 }, { "epoch": 0.23219535609287814, "grad_norm": 1.4765625, "learning_rate": 3.1683168316831685e-06, "loss": 1.8158, "step": 387 }, { "epoch": 0.23279534409311814, "grad_norm": 1.1875, "learning_rate": 3.165841584158416e-06, "loss": 1.8107, "step": 388 }, { "epoch": 0.23339533209335814, "grad_norm": 1.4375, "learning_rate": 3.163366336633663e-06, "loss": 1.825, "step": 389 }, { "epoch": 0.23399532009359814, "grad_norm": 1.4921875, "learning_rate": 3.1608910891089105e-06, "loss": 1.8038, "step": 390 }, { "epoch": 0.23459530809383813, "grad_norm": 1.203125, "learning_rate": 3.1584158415841583e-06, "loss": 1.8152, "step": 391 }, { "epoch": 0.23519529609407813, "grad_norm": 1.4453125, "learning_rate": 3.1559405940594056e-06, "loss": 1.926, "step": 392 }, { "epoch": 0.2357952840943181, "grad_norm": 1.328125, "learning_rate": 3.1534653465346534e-06, "loss": 1.8543, "step": 393 }, { "epoch": 0.2363952720945581, "grad_norm": 1.25, "learning_rate": 3.1509900990099007e-06, "loss": 1.8222, "step": 394 }, { "epoch": 0.2369952600947981, "grad_norm": 1.2578125, "learning_rate": 3.1485148514851485e-06, "loss": 1.9147, "step": 395 }, { "epoch": 0.2375952480950381, "grad_norm": 1.21875, "learning_rate": 3.146039603960396e-06, "loss": 1.814, "step": 396 }, { "epoch": 0.2381952360952781, "grad_norm": 1.0625, "learning_rate": 3.1435643564356436e-06, "loss": 1.8623, "step": 397 }, { "epoch": 0.2387952240955181, "grad_norm": 1.34375, "learning_rate": 3.141089108910891e-06, "loss": 1.8996, "step": 398 }, { "epoch": 0.2393952120957581, "grad_norm": 1.2265625, "learning_rate": 3.1386138613861387e-06, "loss": 1.8078, "step": 399 }, { "epoch": 0.2399952000959981, "grad_norm": 1.125, "learning_rate": 3.136138613861386e-06, "loss": 1.8248, "step": 400 }, { "epoch": 0.2399952000959981, "eval_loss": 2.0347118377685547, "eval_model_preparation_time": 0.0036, "eval_runtime": 65.1996, "eval_samples_per_second": 153.375, "eval_steps_per_second": 25.568, "step": 400 }, { "epoch": 0.2405951880962381, "grad_norm": 1.203125, "learning_rate": 3.133663366336634e-06, "loss": 1.9232, "step": 401 }, { "epoch": 0.24119517609647806, "grad_norm": 1.3359375, "learning_rate": 3.1311881188118807e-06, "loss": 1.8487, "step": 402 }, { "epoch": 0.24179516409671806, "grad_norm": 1.3828125, "learning_rate": 3.1287128712871285e-06, "loss": 1.8713, "step": 403 }, { "epoch": 0.24239515209695806, "grad_norm": 1.5078125, "learning_rate": 3.126237623762376e-06, "loss": 1.9976, "step": 404 }, { "epoch": 0.24299514009719805, "grad_norm": 1.3359375, "learning_rate": 3.1237623762376236e-06, "loss": 1.769, "step": 405 }, { "epoch": 0.24359512809743805, "grad_norm": 1.296875, "learning_rate": 3.121287128712871e-06, "loss": 1.9225, "step": 406 }, { "epoch": 0.24419511609767805, "grad_norm": 1.3046875, "learning_rate": 3.1188118811881187e-06, "loss": 1.8012, "step": 407 }, { "epoch": 0.24479510409791805, "grad_norm": 1.3125, "learning_rate": 3.116336633663366e-06, "loss": 1.8264, "step": 408 }, { "epoch": 0.24539509209815805, "grad_norm": 1.3203125, "learning_rate": 3.113861386138614e-06, "loss": 1.9196, "step": 409 }, { "epoch": 0.24599508009839804, "grad_norm": 1.2109375, "learning_rate": 3.111386138613861e-06, "loss": 1.7687, "step": 410 }, { "epoch": 0.24659506809863802, "grad_norm": 1.2578125, "learning_rate": 3.108910891089109e-06, "loss": 1.8449, "step": 411 }, { "epoch": 0.247195056098878, "grad_norm": 1.2265625, "learning_rate": 3.1064356435643563e-06, "loss": 1.9169, "step": 412 }, { "epoch": 0.247795044099118, "grad_norm": 1.4453125, "learning_rate": 3.103960396039604e-06, "loss": 1.83, "step": 413 }, { "epoch": 0.248395032099358, "grad_norm": 1.3203125, "learning_rate": 3.1014851485148514e-06, "loss": 1.8564, "step": 414 }, { "epoch": 0.248995020099598, "grad_norm": 1.203125, "learning_rate": 3.099009900990099e-06, "loss": 1.8824, "step": 415 }, { "epoch": 0.249595008099838, "grad_norm": 1.421875, "learning_rate": 3.096534653465346e-06, "loss": 1.7244, "step": 416 }, { "epoch": 0.250194996100078, "grad_norm": 1.3203125, "learning_rate": 3.094059405940594e-06, "loss": 1.7585, "step": 417 }, { "epoch": 0.250794984100318, "grad_norm": 1.28125, "learning_rate": 3.091584158415841e-06, "loss": 1.8895, "step": 418 }, { "epoch": 0.251394972100558, "grad_norm": 1.3203125, "learning_rate": 3.089108910891089e-06, "loss": 1.8884, "step": 419 }, { "epoch": 0.25199496010079797, "grad_norm": 2.0625, "learning_rate": 3.0866336633663363e-06, "loss": 1.8346, "step": 420 }, { "epoch": 0.252594948101038, "grad_norm": 1.4140625, "learning_rate": 3.084158415841584e-06, "loss": 1.8132, "step": 421 }, { "epoch": 0.25319493610127797, "grad_norm": 1.125, "learning_rate": 3.0816831683168314e-06, "loss": 1.7638, "step": 422 }, { "epoch": 0.253794924101518, "grad_norm": 1.28125, "learning_rate": 3.079207920792079e-06, "loss": 1.8337, "step": 423 }, { "epoch": 0.25439491210175796, "grad_norm": 1.421875, "learning_rate": 3.0767326732673265e-06, "loss": 1.8597, "step": 424 }, { "epoch": 0.25499490010199793, "grad_norm": 4.25, "learning_rate": 3.0742574257425743e-06, "loss": 1.7829, "step": 425 }, { "epoch": 0.25559488810223796, "grad_norm": 1.40625, "learning_rate": 3.0717821782178216e-06, "loss": 1.8581, "step": 426 }, { "epoch": 0.25619487610247793, "grad_norm": 1.4921875, "learning_rate": 3.0693069306930694e-06, "loss": 1.8007, "step": 427 }, { "epoch": 0.25679486410271796, "grad_norm": 1.359375, "learning_rate": 3.0668316831683167e-06, "loss": 1.7999, "step": 428 }, { "epoch": 0.2573948521029579, "grad_norm": 1.2734375, "learning_rate": 3.064356435643564e-06, "loss": 1.9848, "step": 429 }, { "epoch": 0.25799484010319795, "grad_norm": 1.5390625, "learning_rate": 3.0618811881188114e-06, "loss": 1.7019, "step": 430 }, { "epoch": 0.2585948281034379, "grad_norm": 1.3359375, "learning_rate": 3.059405940594059e-06, "loss": 1.8219, "step": 431 }, { "epoch": 0.25919481610367795, "grad_norm": 1.3671875, "learning_rate": 3.0569306930693065e-06, "loss": 1.8192, "step": 432 }, { "epoch": 0.2597948041039179, "grad_norm": 1.3828125, "learning_rate": 3.0544554455445543e-06, "loss": 1.9334, "step": 433 }, { "epoch": 0.2603947921041579, "grad_norm": 1.1484375, "learning_rate": 3.0519801980198016e-06, "loss": 1.7363, "step": 434 }, { "epoch": 0.2609947801043979, "grad_norm": 1.34375, "learning_rate": 3.0495049504950494e-06, "loss": 1.8116, "step": 435 }, { "epoch": 0.2615947681046379, "grad_norm": 2.453125, "learning_rate": 3.0470297029702967e-06, "loss": 1.849, "step": 436 }, { "epoch": 0.2621947561048779, "grad_norm": 1.28125, "learning_rate": 3.0445544554455445e-06, "loss": 1.8584, "step": 437 }, { "epoch": 0.2627947441051179, "grad_norm": 1.1875, "learning_rate": 3.042079207920792e-06, "loss": 1.7644, "step": 438 }, { "epoch": 0.2633947321053579, "grad_norm": 1.34375, "learning_rate": 3.0396039603960396e-06, "loss": 1.71, "step": 439 }, { "epoch": 0.2639947201055979, "grad_norm": 1.3515625, "learning_rate": 3.037128712871287e-06, "loss": 1.7796, "step": 440 }, { "epoch": 0.2645947081058379, "grad_norm": 1.3046875, "learning_rate": 3.0346534653465347e-06, "loss": 1.8207, "step": 441 }, { "epoch": 0.2651946961060779, "grad_norm": 1.2734375, "learning_rate": 3.0321782178217817e-06, "loss": 1.8873, "step": 442 }, { "epoch": 0.26579468410631785, "grad_norm": 1.328125, "learning_rate": 3.0297029702970294e-06, "loss": 1.9306, "step": 443 }, { "epoch": 0.2663946721065579, "grad_norm": 1.3203125, "learning_rate": 3.0272277227722768e-06, "loss": 1.8683, "step": 444 }, { "epoch": 0.26699466010679784, "grad_norm": 1.3671875, "learning_rate": 3.0247524752475245e-06, "loss": 1.8457, "step": 445 }, { "epoch": 0.26759464810703787, "grad_norm": 1.40625, "learning_rate": 3.022277227722772e-06, "loss": 1.7025, "step": 446 }, { "epoch": 0.26819463610727784, "grad_norm": 1.7421875, "learning_rate": 3.0198019801980196e-06, "loss": 1.7783, "step": 447 }, { "epoch": 0.26879462410751787, "grad_norm": 1.328125, "learning_rate": 3.017326732673267e-06, "loss": 1.9892, "step": 448 }, { "epoch": 0.26939461210775784, "grad_norm": 1.4609375, "learning_rate": 3.0148514851485147e-06, "loss": 1.7737, "step": 449 }, { "epoch": 0.26999460010799786, "grad_norm": 1.2109375, "learning_rate": 3.012376237623762e-06, "loss": 1.8656, "step": 450 }, { "epoch": 0.26999460010799786, "eval_loss": 2.033843994140625, "eval_model_preparation_time": 0.0036, "eval_runtime": 67.9747, "eval_samples_per_second": 147.114, "eval_steps_per_second": 24.524, "step": 450 }, { "epoch": 0.27059458810823783, "grad_norm": 1.3984375, "learning_rate": 3.00990099009901e-06, "loss": 1.7738, "step": 451 }, { "epoch": 0.2711945761084778, "grad_norm": 1.1640625, "learning_rate": 3.007425742574257e-06, "loss": 1.8191, "step": 452 }, { "epoch": 0.27179456410871783, "grad_norm": 1.171875, "learning_rate": 3.004950495049505e-06, "loss": 1.7908, "step": 453 }, { "epoch": 0.2723945521089578, "grad_norm": 1.46875, "learning_rate": 3.0024752475247523e-06, "loss": 1.7694, "step": 454 }, { "epoch": 0.2729945401091978, "grad_norm": 1.40625, "learning_rate": 3e-06, "loss": 1.8002, "step": 455 }, { "epoch": 0.2735945281094378, "grad_norm": 1.390625, "learning_rate": 2.9975247524752474e-06, "loss": 1.8376, "step": 456 }, { "epoch": 0.2741945161096778, "grad_norm": 1.65625, "learning_rate": 2.9950495049504948e-06, "loss": 1.8418, "step": 457 }, { "epoch": 0.2747945041099178, "grad_norm": 1.5625, "learning_rate": 2.9925742574257425e-06, "loss": 1.9308, "step": 458 }, { "epoch": 0.2753944921101578, "grad_norm": 1.359375, "learning_rate": 2.99009900990099e-06, "loss": 1.7999, "step": 459 }, { "epoch": 0.2759944801103978, "grad_norm": 1.3984375, "learning_rate": 2.9876237623762376e-06, "loss": 1.8369, "step": 460 }, { "epoch": 0.2765944681106378, "grad_norm": 1.3359375, "learning_rate": 2.985148514851485e-06, "loss": 1.8621, "step": 461 }, { "epoch": 0.2771944561108778, "grad_norm": 1.3046875, "learning_rate": 2.9826732673267327e-06, "loss": 1.7909, "step": 462 }, { "epoch": 0.27779444411111776, "grad_norm": 2.21875, "learning_rate": 2.98019801980198e-06, "loss": 1.7593, "step": 463 }, { "epoch": 0.2783944321113578, "grad_norm": 1.5703125, "learning_rate": 2.977722772277228e-06, "loss": 1.8737, "step": 464 }, { "epoch": 0.27899442011159775, "grad_norm": 1.3125, "learning_rate": 2.975247524752475e-06, "loss": 1.7006, "step": 465 }, { "epoch": 0.2795944081118378, "grad_norm": 1.203125, "learning_rate": 2.972772277227723e-06, "loss": 1.9047, "step": 466 }, { "epoch": 0.28019439611207775, "grad_norm": 1.28125, "learning_rate": 2.9702970297029703e-06, "loss": 1.7895, "step": 467 }, { "epoch": 0.2807943841123178, "grad_norm": 1.3046875, "learning_rate": 2.967821782178218e-06, "loss": 1.7947, "step": 468 }, { "epoch": 0.28139437211255774, "grad_norm": 1.3203125, "learning_rate": 2.965346534653465e-06, "loss": 1.7682, "step": 469 }, { "epoch": 0.28199436011279777, "grad_norm": 1.1953125, "learning_rate": 2.9628712871287128e-06, "loss": 1.8296, "step": 470 }, { "epoch": 0.28259434811303774, "grad_norm": 1.1640625, "learning_rate": 2.96039603960396e-06, "loss": 1.9427, "step": 471 }, { "epoch": 0.2831943361132777, "grad_norm": 1.28125, "learning_rate": 2.957920792079208e-06, "loss": 1.9328, "step": 472 }, { "epoch": 0.28379432411351774, "grad_norm": 1.390625, "learning_rate": 2.9554455445544552e-06, "loss": 1.8043, "step": 473 }, { "epoch": 0.2843943121137577, "grad_norm": 2.375, "learning_rate": 2.952970297029703e-06, "loss": 1.7856, "step": 474 }, { "epoch": 0.28499430011399773, "grad_norm": 1.421875, "learning_rate": 2.9504950495049503e-06, "loss": 1.7967, "step": 475 }, { "epoch": 0.2855942881142377, "grad_norm": 1.296875, "learning_rate": 2.948019801980198e-06, "loss": 1.8577, "step": 476 }, { "epoch": 0.28619427611447773, "grad_norm": 1.328125, "learning_rate": 2.9455445544554454e-06, "loss": 1.8487, "step": 477 }, { "epoch": 0.2867942641147177, "grad_norm": 1.8515625, "learning_rate": 2.943069306930693e-06, "loss": 1.9896, "step": 478 }, { "epoch": 0.2873942521149577, "grad_norm": 1.3125, "learning_rate": 2.9405940594059405e-06, "loss": 1.9078, "step": 479 }, { "epoch": 0.2879942401151977, "grad_norm": 1.40625, "learning_rate": 2.9381188118811883e-06, "loss": 1.8419, "step": 480 }, { "epoch": 0.28859422811543767, "grad_norm": 1.1953125, "learning_rate": 2.9356435643564357e-06, "loss": 1.8428, "step": 481 }, { "epoch": 0.2891942161156777, "grad_norm": 1.4453125, "learning_rate": 2.933168316831683e-06, "loss": 1.9351, "step": 482 }, { "epoch": 0.28979420411591766, "grad_norm": 1.1953125, "learning_rate": 2.9306930693069303e-06, "loss": 1.8918, "step": 483 }, { "epoch": 0.2903941921161577, "grad_norm": 1.3203125, "learning_rate": 2.928217821782178e-06, "loss": 2.0012, "step": 484 }, { "epoch": 0.29099418011639766, "grad_norm": 1.1796875, "learning_rate": 2.9257425742574254e-06, "loss": 1.7536, "step": 485 }, { "epoch": 0.2915941681166377, "grad_norm": 1.265625, "learning_rate": 2.9232673267326732e-06, "loss": 1.8447, "step": 486 }, { "epoch": 0.29219415611687766, "grad_norm": 1.34375, "learning_rate": 2.9207920792079206e-06, "loss": 1.7318, "step": 487 }, { "epoch": 0.2927941441171177, "grad_norm": 1.4609375, "learning_rate": 2.9183168316831683e-06, "loss": 1.8265, "step": 488 }, { "epoch": 0.29339413211735765, "grad_norm": 1.234375, "learning_rate": 2.9158415841584157e-06, "loss": 1.7215, "step": 489 }, { "epoch": 0.2939941201175976, "grad_norm": 1.3984375, "learning_rate": 2.9133663366336634e-06, "loss": 1.8658, "step": 490 }, { "epoch": 0.29459410811783765, "grad_norm": 1.2109375, "learning_rate": 2.9108910891089108e-06, "loss": 1.8764, "step": 491 }, { "epoch": 0.2951940961180776, "grad_norm": 1.28125, "learning_rate": 2.9084158415841585e-06, "loss": 1.9107, "step": 492 }, { "epoch": 0.29579408411831765, "grad_norm": 1.28125, "learning_rate": 2.905940594059406e-06, "loss": 1.8486, "step": 493 }, { "epoch": 0.2963940721185576, "grad_norm": 1.2890625, "learning_rate": 2.9034653465346537e-06, "loss": 1.7087, "step": 494 }, { "epoch": 0.29699406011879764, "grad_norm": 1.25, "learning_rate": 2.9009900990099006e-06, "loss": 1.8336, "step": 495 }, { "epoch": 0.2975940481190376, "grad_norm": 1.5546875, "learning_rate": 2.8985148514851483e-06, "loss": 1.9546, "step": 496 }, { "epoch": 0.29819403611927764, "grad_norm": 1.2890625, "learning_rate": 2.8960396039603957e-06, "loss": 1.8021, "step": 497 }, { "epoch": 0.2987940241195176, "grad_norm": 1.296875, "learning_rate": 2.8935643564356434e-06, "loss": 1.8557, "step": 498 }, { "epoch": 0.2993940121197576, "grad_norm": 1.3203125, "learning_rate": 2.891089108910891e-06, "loss": 1.8676, "step": 499 }, { "epoch": 0.2999940001199976, "grad_norm": 1.3671875, "learning_rate": 2.8886138613861386e-06, "loss": 1.8587, "step": 500 }, { "epoch": 0.2999940001199976, "eval_loss": 2.0334386825561523, "eval_model_preparation_time": 0.0036, "eval_runtime": 65.1108, "eval_samples_per_second": 153.584, "eval_steps_per_second": 25.603, "step": 500 }, { "epoch": 0.3005939881202376, "grad_norm": 1.4140625, "learning_rate": 2.886138613861386e-06, "loss": 1.828, "step": 501 }, { "epoch": 0.3011939761204776, "grad_norm": 1.3125, "learning_rate": 2.8836633663366337e-06, "loss": 1.8682, "step": 502 }, { "epoch": 0.3017939641207176, "grad_norm": 1.34375, "learning_rate": 2.881188118811881e-06, "loss": 1.7229, "step": 503 }, { "epoch": 0.3023939521209576, "grad_norm": 2.765625, "learning_rate": 2.8787128712871288e-06, "loss": 1.9047, "step": 504 }, { "epoch": 0.30299394012119757, "grad_norm": 1.5390625, "learning_rate": 2.876237623762376e-06, "loss": 1.8765, "step": 505 }, { "epoch": 0.3035939281214376, "grad_norm": 1.265625, "learning_rate": 2.873762376237624e-06, "loss": 1.8074, "step": 506 }, { "epoch": 0.30419391612167757, "grad_norm": 1.46875, "learning_rate": 2.8712871287128712e-06, "loss": 1.8522, "step": 507 }, { "epoch": 0.30479390412191754, "grad_norm": 1.390625, "learning_rate": 2.868811881188119e-06, "loss": 1.7335, "step": 508 }, { "epoch": 0.30539389212215756, "grad_norm": 1.1875, "learning_rate": 2.866336633663366e-06, "loss": 1.8103, "step": 509 }, { "epoch": 0.30599388012239753, "grad_norm": 1.3203125, "learning_rate": 2.8638613861386137e-06, "loss": 1.7559, "step": 510 }, { "epoch": 0.30659386812263756, "grad_norm": 1.5546875, "learning_rate": 2.861386138613861e-06, "loss": 1.7131, "step": 511 }, { "epoch": 0.30719385612287753, "grad_norm": 1.375, "learning_rate": 2.858910891089109e-06, "loss": 1.889, "step": 512 }, { "epoch": 0.30779384412311755, "grad_norm": 1.265625, "learning_rate": 2.856435643564356e-06, "loss": 1.843, "step": 513 }, { "epoch": 0.3083938321233575, "grad_norm": 1.359375, "learning_rate": 2.853960396039604e-06, "loss": 1.7522, "step": 514 }, { "epoch": 0.30899382012359755, "grad_norm": 1.2265625, "learning_rate": 2.8514851485148512e-06, "loss": 1.8443, "step": 515 }, { "epoch": 0.3095938081238375, "grad_norm": 1.296875, "learning_rate": 2.849009900990099e-06, "loss": 1.7471, "step": 516 }, { "epoch": 0.3101937961240775, "grad_norm": 1.2109375, "learning_rate": 2.8465346534653464e-06, "loss": 1.8667, "step": 517 }, { "epoch": 0.3107937841243175, "grad_norm": 1.328125, "learning_rate": 2.844059405940594e-06, "loss": 1.7233, "step": 518 }, { "epoch": 0.3113937721245575, "grad_norm": 1.296875, "learning_rate": 2.8415841584158415e-06, "loss": 1.8619, "step": 519 }, { "epoch": 0.3119937601247975, "grad_norm": 1.2734375, "learning_rate": 2.8391089108910892e-06, "loss": 1.8094, "step": 520 }, { "epoch": 0.3125937481250375, "grad_norm": 1.3515625, "learning_rate": 2.8366336633663366e-06, "loss": 1.8636, "step": 521 }, { "epoch": 0.3131937361252775, "grad_norm": 1.421875, "learning_rate": 2.834158415841584e-06, "loss": 1.8544, "step": 522 }, { "epoch": 0.3137937241255175, "grad_norm": 1.1796875, "learning_rate": 2.8316831683168313e-06, "loss": 1.7636, "step": 523 }, { "epoch": 0.3143937121257575, "grad_norm": 1.3046875, "learning_rate": 2.829207920792079e-06, "loss": 1.8123, "step": 524 }, { "epoch": 0.3149937001259975, "grad_norm": 1.390625, "learning_rate": 2.8267326732673264e-06, "loss": 1.7643, "step": 525 }, { "epoch": 0.31559368812623745, "grad_norm": 1.15625, "learning_rate": 2.824257425742574e-06, "loss": 1.8393, "step": 526 }, { "epoch": 0.3161936761264775, "grad_norm": 1.5390625, "learning_rate": 2.8217821782178215e-06, "loss": 1.7629, "step": 527 }, { "epoch": 0.31679366412671744, "grad_norm": 1.3203125, "learning_rate": 2.8193069306930692e-06, "loss": 1.8098, "step": 528 }, { "epoch": 0.31739365212695747, "grad_norm": 1.4296875, "learning_rate": 2.8168316831683166e-06, "loss": 1.7945, "step": 529 }, { "epoch": 0.31799364012719744, "grad_norm": 1.234375, "learning_rate": 2.8143564356435644e-06, "loss": 1.7641, "step": 530 }, { "epoch": 0.31859362812743747, "grad_norm": 1.2734375, "learning_rate": 2.8118811881188117e-06, "loss": 1.9744, "step": 531 }, { "epoch": 0.31919361612767744, "grad_norm": 1.2109375, "learning_rate": 2.8094059405940595e-06, "loss": 1.7955, "step": 532 }, { "epoch": 0.31979360412791746, "grad_norm": 1.40625, "learning_rate": 2.806930693069307e-06, "loss": 1.8048, "step": 533 }, { "epoch": 0.32039359212815743, "grad_norm": 1.515625, "learning_rate": 2.8044554455445546e-06, "loss": 1.8906, "step": 534 }, { "epoch": 0.3209935801283974, "grad_norm": 1.1953125, "learning_rate": 2.8019801980198015e-06, "loss": 1.8028, "step": 535 }, { "epoch": 0.32159356812863743, "grad_norm": 1.359375, "learning_rate": 2.7995049504950493e-06, "loss": 1.7173, "step": 536 }, { "epoch": 0.3221935561288774, "grad_norm": 1.3984375, "learning_rate": 2.7970297029702966e-06, "loss": 1.8689, "step": 537 }, { "epoch": 0.3227935441291174, "grad_norm": 1.3125, "learning_rate": 2.7945544554455444e-06, "loss": 1.7717, "step": 538 }, { "epoch": 0.3233935321293574, "grad_norm": 1.3125, "learning_rate": 2.7920792079207917e-06, "loss": 1.9151, "step": 539 }, { "epoch": 0.3239935201295974, "grad_norm": 1.21875, "learning_rate": 2.7896039603960395e-06, "loss": 1.9621, "step": 540 }, { "epoch": 0.3245935081298374, "grad_norm": 1.3046875, "learning_rate": 2.787128712871287e-06, "loss": 1.801, "step": 541 }, { "epoch": 0.3251934961300774, "grad_norm": 1.8828125, "learning_rate": 2.7846534653465346e-06, "loss": 1.9634, "step": 542 }, { "epoch": 0.3257934841303174, "grad_norm": 1.390625, "learning_rate": 2.782178217821782e-06, "loss": 1.7282, "step": 543 }, { "epoch": 0.3263934721305574, "grad_norm": 1.453125, "learning_rate": 2.7797029702970297e-06, "loss": 1.6894, "step": 544 }, { "epoch": 0.3269934601307974, "grad_norm": 1.6015625, "learning_rate": 2.777227722772277e-06, "loss": 1.7942, "step": 545 }, { "epoch": 0.32759344813103736, "grad_norm": 1.1796875, "learning_rate": 2.774752475247525e-06, "loss": 1.7971, "step": 546 }, { "epoch": 0.3281934361312774, "grad_norm": 1.4609375, "learning_rate": 2.772277227722772e-06, "loss": 1.7986, "step": 547 }, { "epoch": 0.32879342413151735, "grad_norm": 1.1328125, "learning_rate": 2.7698019801980195e-06, "loss": 2.0276, "step": 548 }, { "epoch": 0.3293934121317574, "grad_norm": 1.28125, "learning_rate": 2.767326732673267e-06, "loss": 1.8949, "step": 549 }, { "epoch": 0.32999340013199735, "grad_norm": 1.53125, "learning_rate": 2.7648514851485146e-06, "loss": 1.8993, "step": 550 }, { "epoch": 0.32999340013199735, "eval_loss": 2.0332491397857666, "eval_model_preparation_time": 0.0036, "eval_runtime": 65.0973, "eval_samples_per_second": 153.616, "eval_steps_per_second": 25.608, "step": 550 }, { "epoch": 0.3305933881322374, "grad_norm": 1.359375, "learning_rate": 2.762376237623762e-06, "loss": 1.7827, "step": 551 }, { "epoch": 0.33119337613247735, "grad_norm": 1.34375, "learning_rate": 2.7599009900990097e-06, "loss": 1.754, "step": 552 }, { "epoch": 0.33179336413271737, "grad_norm": 1.3203125, "learning_rate": 2.757425742574257e-06, "loss": 1.7121, "step": 553 }, { "epoch": 0.33239335213295734, "grad_norm": 1.703125, "learning_rate": 2.754950495049505e-06, "loss": 1.8265, "step": 554 }, { "epoch": 0.3329933401331973, "grad_norm": 1.3359375, "learning_rate": 2.752475247524752e-06, "loss": 1.8452, "step": 555 }, { "epoch": 0.33359332813343734, "grad_norm": 1.3046875, "learning_rate": 2.75e-06, "loss": 1.9355, "step": 556 }, { "epoch": 0.3341933161336773, "grad_norm": 1.5859375, "learning_rate": 2.7475247524752477e-06, "loss": 1.7684, "step": 557 }, { "epoch": 0.33479330413391734, "grad_norm": 1.3125, "learning_rate": 2.745049504950495e-06, "loss": 1.8229, "step": 558 }, { "epoch": 0.3353932921341573, "grad_norm": 1.4375, "learning_rate": 2.742574257425743e-06, "loss": 1.9012, "step": 559 }, { "epoch": 0.33599328013439733, "grad_norm": 1.7890625, "learning_rate": 2.74009900990099e-06, "loss": 1.8624, "step": 560 }, { "epoch": 0.3365932681346373, "grad_norm": 1.34375, "learning_rate": 2.737623762376238e-06, "loss": 1.8385, "step": 561 }, { "epoch": 0.33719325613487733, "grad_norm": 1.3671875, "learning_rate": 2.735148514851485e-06, "loss": 1.9174, "step": 562 }, { "epoch": 0.3377932441351173, "grad_norm": 1.4921875, "learning_rate": 2.7326732673267326e-06, "loss": 1.6744, "step": 563 }, { "epoch": 0.33839323213535727, "grad_norm": 1.28125, "learning_rate": 2.73019801980198e-06, "loss": 1.9218, "step": 564 }, { "epoch": 0.3389932201355973, "grad_norm": 1.3828125, "learning_rate": 2.7277227722772277e-06, "loss": 1.9251, "step": 565 }, { "epoch": 0.33959320813583727, "grad_norm": 1.5390625, "learning_rate": 2.725247524752475e-06, "loss": 2.0032, "step": 566 }, { "epoch": 0.3401931961360773, "grad_norm": 1.3671875, "learning_rate": 2.722772277227723e-06, "loss": 1.7809, "step": 567 }, { "epoch": 0.34079318413631726, "grad_norm": 1.3359375, "learning_rate": 2.72029702970297e-06, "loss": 1.7717, "step": 568 }, { "epoch": 0.3413931721365573, "grad_norm": 1.25, "learning_rate": 2.717821782178218e-06, "loss": 1.881, "step": 569 }, { "epoch": 0.34199316013679726, "grad_norm": 1.296875, "learning_rate": 2.7153465346534653e-06, "loss": 1.8142, "step": 570 }, { "epoch": 0.3425931481370373, "grad_norm": 1.234375, "learning_rate": 2.712871287128713e-06, "loss": 1.9001, "step": 571 }, { "epoch": 0.34319313613727725, "grad_norm": 1.34375, "learning_rate": 2.7103960396039604e-06, "loss": 1.7508, "step": 572 }, { "epoch": 0.3437931241375172, "grad_norm": 1.5625, "learning_rate": 2.707920792079208e-06, "loss": 2.0335, "step": 573 }, { "epoch": 0.34439311213775725, "grad_norm": 1.1875, "learning_rate": 2.7054455445544555e-06, "loss": 1.9974, "step": 574 }, { "epoch": 0.3449931001379972, "grad_norm": 1.3984375, "learning_rate": 2.702970297029703e-06, "loss": 1.9025, "step": 575 }, { "epoch": 0.34559308813823725, "grad_norm": 1.2578125, "learning_rate": 2.70049504950495e-06, "loss": 1.7668, "step": 576 }, { "epoch": 0.3461930761384772, "grad_norm": 1.3984375, "learning_rate": 2.698019801980198e-06, "loss": 1.8825, "step": 577 }, { "epoch": 0.34679306413871724, "grad_norm": 1.453125, "learning_rate": 2.6955445544554453e-06, "loss": 1.9186, "step": 578 }, { "epoch": 0.3473930521389572, "grad_norm": 1.6796875, "learning_rate": 2.693069306930693e-06, "loss": 1.8675, "step": 579 }, { "epoch": 0.34799304013919724, "grad_norm": 1.328125, "learning_rate": 2.6905940594059404e-06, "loss": 1.9197, "step": 580 }, { "epoch": 0.3485930281394372, "grad_norm": 1.359375, "learning_rate": 2.688118811881188e-06, "loss": 1.7689, "step": 581 }, { "epoch": 0.3491930161396772, "grad_norm": 1.2109375, "learning_rate": 2.6856435643564355e-06, "loss": 1.7074, "step": 582 }, { "epoch": 0.3497930041399172, "grad_norm": 1.546875, "learning_rate": 2.6831683168316833e-06, "loss": 1.8839, "step": 583 }, { "epoch": 0.3503929921401572, "grad_norm": 1.28125, "learning_rate": 2.6806930693069306e-06, "loss": 1.7625, "step": 584 }, { "epoch": 0.3509929801403972, "grad_norm": 1.234375, "learning_rate": 2.6782178217821784e-06, "loss": 1.767, "step": 585 }, { "epoch": 0.3515929681406372, "grad_norm": 1.328125, "learning_rate": 2.6757425742574257e-06, "loss": 1.9642, "step": 586 }, { "epoch": 0.3521929561408772, "grad_norm": 1.328125, "learning_rate": 2.6732673267326735e-06, "loss": 1.9743, "step": 587 }, { "epoch": 0.35279294414111717, "grad_norm": 1.3515625, "learning_rate": 2.6707920792079204e-06, "loss": 1.9189, "step": 588 }, { "epoch": 0.3533929321413572, "grad_norm": 1.15625, "learning_rate": 2.668316831683168e-06, "loss": 1.793, "step": 589 }, { "epoch": 0.35399292014159717, "grad_norm": 1.28125, "learning_rate": 2.6658415841584155e-06, "loss": 1.8837, "step": 590 }, { "epoch": 0.35459290814183714, "grad_norm": 1.3203125, "learning_rate": 2.6633663366336633e-06, "loss": 1.8041, "step": 591 }, { "epoch": 0.35519289614207716, "grad_norm": 1.359375, "learning_rate": 2.6608910891089106e-06, "loss": 1.797, "step": 592 }, { "epoch": 0.35579288414231713, "grad_norm": 1.359375, "learning_rate": 2.6584158415841584e-06, "loss": 1.6775, "step": 593 }, { "epoch": 0.35639287214255716, "grad_norm": 1.1953125, "learning_rate": 2.6559405940594057e-06, "loss": 1.8262, "step": 594 }, { "epoch": 0.35699286014279713, "grad_norm": 1.4453125, "learning_rate": 2.6534653465346535e-06, "loss": 1.8204, "step": 595 }, { "epoch": 0.35759284814303716, "grad_norm": 1.2734375, "learning_rate": 2.650990099009901e-06, "loss": 1.8847, "step": 596 }, { "epoch": 0.3581928361432771, "grad_norm": 1.1953125, "learning_rate": 2.6485148514851486e-06, "loss": 1.8749, "step": 597 }, { "epoch": 0.35879282414351715, "grad_norm": 1.2578125, "learning_rate": 2.646039603960396e-06, "loss": 1.816, "step": 598 }, { "epoch": 0.3593928121437571, "grad_norm": 1.296875, "learning_rate": 2.6435643564356437e-06, "loss": 1.8531, "step": 599 }, { "epoch": 0.3599928001439971, "grad_norm": 1.2265625, "learning_rate": 2.641089108910891e-06, "loss": 1.8029, "step": 600 }, { "epoch": 0.3599928001439971, "eval_loss": 2.0328757762908936, "eval_model_preparation_time": 0.0036, "eval_runtime": 67.4254, "eval_samples_per_second": 148.312, "eval_steps_per_second": 24.724, "step": 600 }, { "epoch": 0.3605927881442371, "grad_norm": 1.3671875, "learning_rate": 2.6386138613861384e-06, "loss": 1.8685, "step": 601 }, { "epoch": 0.3611927761444771, "grad_norm": 1.296875, "learning_rate": 2.6361386138613858e-06, "loss": 1.8958, "step": 602 }, { "epoch": 0.3617927641447171, "grad_norm": 1.2734375, "learning_rate": 2.6336633663366335e-06, "loss": 1.7194, "step": 603 }, { "epoch": 0.3623927521449571, "grad_norm": 1.296875, "learning_rate": 2.631188118811881e-06, "loss": 1.9711, "step": 604 }, { "epoch": 0.3629927401451971, "grad_norm": 1.265625, "learning_rate": 2.6287128712871286e-06, "loss": 1.7863, "step": 605 }, { "epoch": 0.3635927281454371, "grad_norm": 1.2421875, "learning_rate": 2.626237623762376e-06, "loss": 1.8361, "step": 606 }, { "epoch": 0.3641927161456771, "grad_norm": 1.3125, "learning_rate": 2.6237623762376237e-06, "loss": 1.9021, "step": 607 }, { "epoch": 0.3647927041459171, "grad_norm": 1.3125, "learning_rate": 2.621287128712871e-06, "loss": 1.9613, "step": 608 }, { "epoch": 0.36539269214615705, "grad_norm": 1.2109375, "learning_rate": 2.618811881188119e-06, "loss": 1.8389, "step": 609 }, { "epoch": 0.3659926801463971, "grad_norm": 1.578125, "learning_rate": 2.616336633663366e-06, "loss": 1.8752, "step": 610 }, { "epoch": 0.36659266814663705, "grad_norm": 1.234375, "learning_rate": 2.613861386138614e-06, "loss": 1.8041, "step": 611 }, { "epoch": 0.36719265614687707, "grad_norm": 1.4375, "learning_rate": 2.6113861386138613e-06, "loss": 1.8281, "step": 612 }, { "epoch": 0.36779264414711704, "grad_norm": 1.3046875, "learning_rate": 2.608910891089109e-06, "loss": 1.7512, "step": 613 }, { "epoch": 0.36839263214735707, "grad_norm": 1.2578125, "learning_rate": 2.606435643564356e-06, "loss": 1.9474, "step": 614 }, { "epoch": 0.36899262014759704, "grad_norm": 1.265625, "learning_rate": 2.6039603960396038e-06, "loss": 1.8229, "step": 615 }, { "epoch": 0.36959260814783707, "grad_norm": 1.171875, "learning_rate": 2.601485148514851e-06, "loss": 1.6716, "step": 616 }, { "epoch": 0.37019259614807704, "grad_norm": 1.3828125, "learning_rate": 2.599009900990099e-06, "loss": 1.958, "step": 617 }, { "epoch": 0.370792584148317, "grad_norm": 1.3125, "learning_rate": 2.596534653465346e-06, "loss": 1.886, "step": 618 }, { "epoch": 0.37139257214855703, "grad_norm": 1.3671875, "learning_rate": 2.594059405940594e-06, "loss": 1.904, "step": 619 }, { "epoch": 0.371992560148797, "grad_norm": 1.5234375, "learning_rate": 2.5915841584158413e-06, "loss": 1.8177, "step": 620 }, { "epoch": 0.37259254814903703, "grad_norm": 1.1875, "learning_rate": 2.589108910891089e-06, "loss": 1.7927, "step": 621 }, { "epoch": 0.373192536149277, "grad_norm": 1.2890625, "learning_rate": 2.5866336633663364e-06, "loss": 1.8946, "step": 622 }, { "epoch": 0.373792524149517, "grad_norm": 1.21875, "learning_rate": 2.584158415841584e-06, "loss": 1.7751, "step": 623 }, { "epoch": 0.374392512149757, "grad_norm": 1.4453125, "learning_rate": 2.5816831683168315e-06, "loss": 1.9221, "step": 624 }, { "epoch": 0.374992500149997, "grad_norm": 1.3515625, "learning_rate": 2.5792079207920793e-06, "loss": 1.9133, "step": 625 }, { "epoch": 0.375592488150237, "grad_norm": 1.28125, "learning_rate": 2.5767326732673266e-06, "loss": 1.8273, "step": 626 }, { "epoch": 0.376192476150477, "grad_norm": 1.5390625, "learning_rate": 2.5742574257425744e-06, "loss": 1.7544, "step": 627 }, { "epoch": 0.376792464150717, "grad_norm": 1.3046875, "learning_rate": 2.5717821782178213e-06, "loss": 1.7994, "step": 628 }, { "epoch": 0.37739245215095696, "grad_norm": 1.3359375, "learning_rate": 2.569306930693069e-06, "loss": 1.9355, "step": 629 }, { "epoch": 0.377992440151197, "grad_norm": 1.2578125, "learning_rate": 2.5668316831683164e-06, "loss": 1.7472, "step": 630 }, { "epoch": 0.37859242815143695, "grad_norm": 1.28125, "learning_rate": 2.564356435643564e-06, "loss": 1.9562, "step": 631 }, { "epoch": 0.379192416151677, "grad_norm": 1.4765625, "learning_rate": 2.5618811881188115e-06, "loss": 1.9807, "step": 632 }, { "epoch": 0.37979240415191695, "grad_norm": 1.3046875, "learning_rate": 2.5594059405940593e-06, "loss": 1.9289, "step": 633 }, { "epoch": 0.380392392152157, "grad_norm": 1.4375, "learning_rate": 2.5569306930693067e-06, "loss": 1.9202, "step": 634 }, { "epoch": 0.38099238015239695, "grad_norm": 1.265625, "learning_rate": 2.5544554455445544e-06, "loss": 1.8881, "step": 635 }, { "epoch": 0.381592368152637, "grad_norm": 1.2578125, "learning_rate": 2.5519801980198018e-06, "loss": 1.8555, "step": 636 }, { "epoch": 0.38219235615287694, "grad_norm": 1.2734375, "learning_rate": 2.5495049504950495e-06, "loss": 1.9314, "step": 637 }, { "epoch": 0.3827923441531169, "grad_norm": 1.2578125, "learning_rate": 2.547029702970297e-06, "loss": 1.7896, "step": 638 }, { "epoch": 0.38339233215335694, "grad_norm": 1.4375, "learning_rate": 2.5445544554455446e-06, "loss": 1.8339, "step": 639 }, { "epoch": 0.3839923201535969, "grad_norm": 1.3125, "learning_rate": 2.542079207920792e-06, "loss": 1.9142, "step": 640 }, { "epoch": 0.38459230815383694, "grad_norm": 1.390625, "learning_rate": 2.5396039603960393e-06, "loss": 1.9154, "step": 641 }, { "epoch": 0.3851922961540769, "grad_norm": 1.1328125, "learning_rate": 2.5371287128712867e-06, "loss": 1.8165, "step": 642 }, { "epoch": 0.38579228415431693, "grad_norm": 1.1953125, "learning_rate": 2.5346534653465344e-06, "loss": 1.6729, "step": 643 }, { "epoch": 0.3863922721545569, "grad_norm": 1.1875, "learning_rate": 2.5321782178217818e-06, "loss": 1.8441, "step": 644 }, { "epoch": 0.38699226015479693, "grad_norm": 1.4453125, "learning_rate": 2.5297029702970295e-06, "loss": 1.7124, "step": 645 }, { "epoch": 0.3875922481550369, "grad_norm": 1.359375, "learning_rate": 2.527227722772277e-06, "loss": 1.8526, "step": 646 }, { "epoch": 0.38819223615527687, "grad_norm": 1.2578125, "learning_rate": 2.5247524752475247e-06, "loss": 1.7846, "step": 647 }, { "epoch": 0.3887922241555169, "grad_norm": 1.140625, "learning_rate": 2.522277227722772e-06, "loss": 1.8099, "step": 648 }, { "epoch": 0.38939221215575687, "grad_norm": 1.390625, "learning_rate": 2.5198019801980198e-06, "loss": 1.8258, "step": 649 }, { "epoch": 0.3899922001559969, "grad_norm": 1.3125, "learning_rate": 2.517326732673267e-06, "loss": 1.7954, "step": 650 }, { "epoch": 0.3899922001559969, "eval_loss": 2.032578468322754, "eval_model_preparation_time": 0.0036, "eval_runtime": 65.0425, "eval_samples_per_second": 153.746, "eval_steps_per_second": 25.629, "step": 650 }, { "epoch": 0.39059218815623686, "grad_norm": 2.40625, "learning_rate": 2.514851485148515e-06, "loss": 1.8582, "step": 651 }, { "epoch": 0.3911921761564769, "grad_norm": 1.265625, "learning_rate": 2.5123762376237622e-06, "loss": 1.7973, "step": 652 }, { "epoch": 0.39179216415671686, "grad_norm": 1.34375, "learning_rate": 2.50990099009901e-06, "loss": 1.8664, "step": 653 }, { "epoch": 0.3923921521569569, "grad_norm": 1.1953125, "learning_rate": 2.507425742574257e-06, "loss": 1.8547, "step": 654 }, { "epoch": 0.39299214015719686, "grad_norm": 1.2421875, "learning_rate": 2.5049504950495047e-06, "loss": 1.7987, "step": 655 }, { "epoch": 0.3935921281574368, "grad_norm": 1.3828125, "learning_rate": 2.502475247524752e-06, "loss": 1.7597, "step": 656 }, { "epoch": 0.39419211615767685, "grad_norm": 1.28125, "learning_rate": 2.4999999999999998e-06, "loss": 1.8468, "step": 657 }, { "epoch": 0.3947921041579168, "grad_norm": 1.328125, "learning_rate": 2.4975247524752475e-06, "loss": 1.7945, "step": 658 }, { "epoch": 0.39539209215815685, "grad_norm": 1.390625, "learning_rate": 2.495049504950495e-06, "loss": 1.7316, "step": 659 }, { "epoch": 0.3959920801583968, "grad_norm": 1.328125, "learning_rate": 2.4925742574257427e-06, "loss": 1.7246, "step": 660 }, { "epoch": 0.39659206815863685, "grad_norm": 1.1875, "learning_rate": 2.49009900990099e-06, "loss": 1.7835, "step": 661 }, { "epoch": 0.3971920561588768, "grad_norm": 1.4453125, "learning_rate": 2.4876237623762378e-06, "loss": 1.8527, "step": 662 }, { "epoch": 0.39779204415911684, "grad_norm": 1.234375, "learning_rate": 2.485148514851485e-06, "loss": 1.8803, "step": 663 }, { "epoch": 0.3983920321593568, "grad_norm": 1.3125, "learning_rate": 2.482673267326733e-06, "loss": 1.8751, "step": 664 }, { "epoch": 0.3989920201595968, "grad_norm": 1.640625, "learning_rate": 2.4801980198019802e-06, "loss": 1.9245, "step": 665 }, { "epoch": 0.3995920081598368, "grad_norm": 1.25, "learning_rate": 2.477722772277228e-06, "loss": 1.7848, "step": 666 }, { "epoch": 0.4001919961600768, "grad_norm": 1.9765625, "learning_rate": 2.475247524752475e-06, "loss": 1.7866, "step": 667 }, { "epoch": 0.4007919841603168, "grad_norm": 1.3515625, "learning_rate": 2.4727722772277227e-06, "loss": 1.696, "step": 668 }, { "epoch": 0.4013919721605568, "grad_norm": 1.1953125, "learning_rate": 2.47029702970297e-06, "loss": 1.768, "step": 669 }, { "epoch": 0.4019919601607968, "grad_norm": 1.6328125, "learning_rate": 2.4678217821782178e-06, "loss": 1.8123, "step": 670 }, { "epoch": 0.4025919481610368, "grad_norm": 1.2421875, "learning_rate": 2.465346534653465e-06, "loss": 1.8934, "step": 671 }, { "epoch": 0.4031919361612768, "grad_norm": 1.3359375, "learning_rate": 2.462871287128713e-06, "loss": 1.8896, "step": 672 }, { "epoch": 0.40379192416151677, "grad_norm": 1.28125, "learning_rate": 2.4603960396039602e-06, "loss": 1.9371, "step": 673 }, { "epoch": 0.40439191216175674, "grad_norm": 1.296875, "learning_rate": 2.457920792079208e-06, "loss": 1.68, "step": 674 }, { "epoch": 0.40499190016199677, "grad_norm": 1.5, "learning_rate": 2.4554455445544553e-06, "loss": 1.9408, "step": 675 }, { "epoch": 0.40559188816223674, "grad_norm": 1.4609375, "learning_rate": 2.452970297029703e-06, "loss": 1.7283, "step": 676 }, { "epoch": 0.40619187616247676, "grad_norm": 2.203125, "learning_rate": 2.4504950495049505e-06, "loss": 1.8666, "step": 677 }, { "epoch": 0.40679186416271673, "grad_norm": 1.3125, "learning_rate": 2.4480198019801982e-06, "loss": 1.8113, "step": 678 }, { "epoch": 0.40739185216295676, "grad_norm": 1.71875, "learning_rate": 2.4455445544554456e-06, "loss": 1.8736, "step": 679 }, { "epoch": 0.40799184016319673, "grad_norm": 1.3359375, "learning_rate": 2.4430693069306933e-06, "loss": 1.8091, "step": 680 }, { "epoch": 0.40859182816343675, "grad_norm": 1.3515625, "learning_rate": 2.4405940594059402e-06, "loss": 1.8982, "step": 681 }, { "epoch": 0.4091918161636767, "grad_norm": 1.1796875, "learning_rate": 2.438118811881188e-06, "loss": 1.8069, "step": 682 }, { "epoch": 0.4097918041639167, "grad_norm": 1.296875, "learning_rate": 2.4356435643564354e-06, "loss": 1.8585, "step": 683 }, { "epoch": 0.4103917921641567, "grad_norm": 1.65625, "learning_rate": 2.433168316831683e-06, "loss": 1.9333, "step": 684 }, { "epoch": 0.4109917801643967, "grad_norm": 1.390625, "learning_rate": 2.4306930693069305e-06, "loss": 1.8872, "step": 685 }, { "epoch": 0.4115917681646367, "grad_norm": 1.421875, "learning_rate": 2.4282178217821782e-06, "loss": 1.8252, "step": 686 }, { "epoch": 0.4121917561648767, "grad_norm": 1.3359375, "learning_rate": 2.4257425742574256e-06, "loss": 1.7709, "step": 687 }, { "epoch": 0.4127917441651167, "grad_norm": 1.3046875, "learning_rate": 2.4232673267326733e-06, "loss": 1.8485, "step": 688 }, { "epoch": 0.4133917321653567, "grad_norm": 1.390625, "learning_rate": 2.4207920792079207e-06, "loss": 1.7962, "step": 689 }, { "epoch": 0.4139917201655967, "grad_norm": 1.2578125, "learning_rate": 2.4183168316831685e-06, "loss": 1.8375, "step": 690 }, { "epoch": 0.4145917081658367, "grad_norm": 1.265625, "learning_rate": 2.415841584158416e-06, "loss": 1.8245, "step": 691 }, { "epoch": 0.41519169616607665, "grad_norm": 1.265625, "learning_rate": 2.4133663366336636e-06, "loss": 1.9368, "step": 692 }, { "epoch": 0.4157916841663167, "grad_norm": 1.3125, "learning_rate": 2.410891089108911e-06, "loss": 1.8899, "step": 693 }, { "epoch": 0.41639167216655665, "grad_norm": 1.3125, "learning_rate": 2.4084158415841582e-06, "loss": 1.8622, "step": 694 }, { "epoch": 0.4169916601667967, "grad_norm": 1.2421875, "learning_rate": 2.4059405940594056e-06, "loss": 1.77, "step": 695 }, { "epoch": 0.41759164816703664, "grad_norm": 1.28125, "learning_rate": 2.4034653465346534e-06, "loss": 1.983, "step": 696 }, { "epoch": 0.41819163616727667, "grad_norm": 1.296875, "learning_rate": 2.4009900990099007e-06, "loss": 1.9289, "step": 697 }, { "epoch": 0.41879162416751664, "grad_norm": 1.3984375, "learning_rate": 2.3985148514851485e-06, "loss": 1.7553, "step": 698 }, { "epoch": 0.41939161216775667, "grad_norm": 1.28125, "learning_rate": 2.396039603960396e-06, "loss": 1.9037, "step": 699 }, { "epoch": 0.41999160016799664, "grad_norm": 1.3125, "learning_rate": 2.3935643564356436e-06, "loss": 1.8117, "step": 700 }, { "epoch": 0.41999160016799664, "eval_loss": 2.0324764251708984, "eval_model_preparation_time": 0.0036, "eval_runtime": 65.1863, "eval_samples_per_second": 153.407, "eval_steps_per_second": 25.573, "step": 700 } ], "logging_steps": 1, "max_steps": 1666, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3404255362492662e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }