{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.994535519125683, "eval_steps": 23, "global_step": 182, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01092896174863388, "grad_norm": 4.323166356121248, "learning_rate": 2.5000000000000004e-07, "loss": 1.46, "step": 1 }, { "epoch": 0.01092896174863388, "eval_loss": 1.471703052520752, "eval_runtime": 67.3533, "eval_samples_per_second": 13.036, "eval_steps_per_second": 0.817, "step": 1 }, { "epoch": 0.02185792349726776, "grad_norm": 4.1781600437358755, "learning_rate": 5.000000000000001e-07, "loss": 1.415, "step": 2 }, { "epoch": 0.03278688524590164, "grad_norm": 4.4797610006391855, "learning_rate": 7.5e-07, "loss": 1.4586, "step": 3 }, { "epoch": 0.04371584699453552, "grad_norm": 4.522211465781013, "learning_rate": 1.0000000000000002e-06, "loss": 1.4732, "step": 4 }, { "epoch": 0.0546448087431694, "grad_norm": 4.117560041368195, "learning_rate": 1.25e-06, "loss": 1.4663, "step": 5 }, { "epoch": 0.06557377049180328, "grad_norm": 3.1952397139828306, "learning_rate": 1.5e-06, "loss": 1.4197, "step": 6 }, { "epoch": 0.07650273224043716, "grad_norm": 3.0943887809741044, "learning_rate": 1.75e-06, "loss": 1.4206, "step": 7 }, { "epoch": 0.08743169398907104, "grad_norm": 1.6636573080749355, "learning_rate": 2.0000000000000003e-06, "loss": 1.4173, "step": 8 }, { "epoch": 0.09836065573770492, "grad_norm": 1.4931794668592633, "learning_rate": 2.25e-06, "loss": 1.4325, "step": 9 }, { "epoch": 0.1092896174863388, "grad_norm": 1.9926906649884222, "learning_rate": 2.5e-06, "loss": 1.3892, "step": 10 }, { "epoch": 0.12021857923497267, "grad_norm": 2.068898356029309, "learning_rate": 2.7500000000000004e-06, "loss": 1.4158, "step": 11 }, { "epoch": 0.13114754098360656, "grad_norm": 1.8938233779693168, "learning_rate": 3e-06, "loss": 1.4257, "step": 12 }, { "epoch": 0.14207650273224043, "grad_norm": 1.6754958457589977, "learning_rate": 3.2500000000000002e-06, "loss": 1.394, "step": 13 }, { "epoch": 0.15300546448087432, "grad_norm": 1.7031384010788408, "learning_rate": 3.5e-06, "loss": 1.4318, "step": 14 }, { "epoch": 0.16393442622950818, "grad_norm": 1.4871557710052379, "learning_rate": 3.7500000000000005e-06, "loss": 1.4186, "step": 15 }, { "epoch": 0.17486338797814208, "grad_norm": 1.2824217624009038, "learning_rate": 4.000000000000001e-06, "loss": 1.3989, "step": 16 }, { "epoch": 0.18579234972677597, "grad_norm": 1.0624643257712287, "learning_rate": 4.25e-06, "loss": 1.4084, "step": 17 }, { "epoch": 0.19672131147540983, "grad_norm": 0.9369571625662664, "learning_rate": 4.5e-06, "loss": 1.3921, "step": 18 }, { "epoch": 0.20765027322404372, "grad_norm": 1.002838786114982, "learning_rate": 4.75e-06, "loss": 1.396, "step": 19 }, { "epoch": 0.2185792349726776, "grad_norm": 1.0819346486503056, "learning_rate": 5e-06, "loss": 1.3853, "step": 20 }, { "epoch": 0.22950819672131148, "grad_norm": 1.0437618103232364, "learning_rate": 5.2500000000000006e-06, "loss": 1.4124, "step": 21 }, { "epoch": 0.24043715846994534, "grad_norm": 0.9870933148158793, "learning_rate": 5.500000000000001e-06, "loss": 1.3836, "step": 22 }, { "epoch": 0.25136612021857924, "grad_norm": 0.8609592841643053, "learning_rate": 5.75e-06, "loss": 1.369, "step": 23 }, { "epoch": 0.25136612021857924, "eval_loss": 1.3864474296569824, "eval_runtime": 59.1953, "eval_samples_per_second": 14.832, "eval_steps_per_second": 0.929, "step": 23 }, { "epoch": 0.26229508196721313, "grad_norm": 0.8818014288027249, "learning_rate": 6e-06, "loss": 1.3342, "step": 24 }, { "epoch": 0.273224043715847, "grad_norm": 0.842658047302736, "learning_rate": 6.25e-06, "loss": 1.4017, "step": 25 }, { "epoch": 0.28415300546448086, "grad_norm": 0.8367707055500635, "learning_rate": 6.5000000000000004e-06, "loss": 1.367, "step": 26 }, { "epoch": 0.29508196721311475, "grad_norm": 0.823707416803997, "learning_rate": 6.750000000000001e-06, "loss": 1.3824, "step": 27 }, { "epoch": 0.30601092896174864, "grad_norm": 0.8123802637531551, "learning_rate": 7e-06, "loss": 1.3783, "step": 28 }, { "epoch": 0.31693989071038253, "grad_norm": 0.7265068511749192, "learning_rate": 7.25e-06, "loss": 1.3502, "step": 29 }, { "epoch": 0.32786885245901637, "grad_norm": 0.8212154430417543, "learning_rate": 7.500000000000001e-06, "loss": 1.375, "step": 30 }, { "epoch": 0.33879781420765026, "grad_norm": 0.7201546959585452, "learning_rate": 7.75e-06, "loss": 1.3501, "step": 31 }, { "epoch": 0.34972677595628415, "grad_norm": 0.7118289859939233, "learning_rate": 8.000000000000001e-06, "loss": 1.3551, "step": 32 }, { "epoch": 0.36065573770491804, "grad_norm": 0.7021806935938936, "learning_rate": 8.25e-06, "loss": 1.3533, "step": 33 }, { "epoch": 0.37158469945355194, "grad_norm": 0.7308638407523511, "learning_rate": 8.5e-06, "loss": 1.3388, "step": 34 }, { "epoch": 0.3825136612021858, "grad_norm": 0.690342645073811, "learning_rate": 8.750000000000001e-06, "loss": 1.3404, "step": 35 }, { "epoch": 0.39344262295081966, "grad_norm": 0.7147253386601637, "learning_rate": 9e-06, "loss": 1.3625, "step": 36 }, { "epoch": 0.40437158469945356, "grad_norm": 0.706260198509053, "learning_rate": 9.250000000000001e-06, "loss": 1.3502, "step": 37 }, { "epoch": 0.41530054644808745, "grad_norm": 0.6897623596194925, "learning_rate": 9.5e-06, "loss": 1.3206, "step": 38 }, { "epoch": 0.4262295081967213, "grad_norm": 0.6514959571687202, "learning_rate": 9.75e-06, "loss": 1.3494, "step": 39 }, { "epoch": 0.4371584699453552, "grad_norm": 0.6886259367989683, "learning_rate": 1e-05, "loss": 1.3344, "step": 40 }, { "epoch": 0.44808743169398907, "grad_norm": 0.6559836313039776, "learning_rate": 9.998776383426217e-06, "loss": 1.3489, "step": 41 }, { "epoch": 0.45901639344262296, "grad_norm": 0.658015735271608, "learning_rate": 9.995106132599869e-06, "loss": 1.3239, "step": 42 }, { "epoch": 0.46994535519125685, "grad_norm": 0.6161892869495798, "learning_rate": 9.988991043912857e-06, "loss": 1.3196, "step": 43 }, { "epoch": 0.4808743169398907, "grad_norm": 0.6513414770808108, "learning_rate": 9.980434110374725e-06, "loss": 1.2841, "step": 44 }, { "epoch": 0.4918032786885246, "grad_norm": 0.6827797259661088, "learning_rate": 9.969439520147754e-06, "loss": 1.3395, "step": 45 }, { "epoch": 0.5027322404371585, "grad_norm": 0.6781840183933557, "learning_rate": 9.956012654497073e-06, "loss": 1.3301, "step": 46 }, { "epoch": 0.5027322404371585, "eval_loss": 1.3286651372909546, "eval_runtime": 58.1829, "eval_samples_per_second": 15.09, "eval_steps_per_second": 0.945, "step": 46 }, { "epoch": 0.5136612021857924, "grad_norm": 0.6559060986211395, "learning_rate": 9.94016008515682e-06, "loss": 1.3495, "step": 47 }, { "epoch": 0.5245901639344263, "grad_norm": 0.7179302729276815, "learning_rate": 9.921889571113629e-06, "loss": 1.3468, "step": 48 }, { "epoch": 0.5355191256830601, "grad_norm": 0.6222614483506688, "learning_rate": 9.901210054809015e-06, "loss": 1.3149, "step": 49 }, { "epoch": 0.546448087431694, "grad_norm": 0.6735276021304194, "learning_rate": 9.878131657762535e-06, "loss": 1.3102, "step": 50 }, { "epoch": 0.5573770491803278, "grad_norm": 0.6525530650655267, "learning_rate": 9.852665675617837e-06, "loss": 1.3233, "step": 51 }, { "epoch": 0.5683060109289617, "grad_norm": 0.6218720239491331, "learning_rate": 9.82482457261405e-06, "loss": 1.3007, "step": 52 }, { "epoch": 0.5792349726775956, "grad_norm": 0.6588103953111063, "learning_rate": 9.7946219754852e-06, "loss": 1.3246, "step": 53 }, { "epoch": 0.5901639344262295, "grad_norm": 0.6419385916682782, "learning_rate": 9.762072666790658e-06, "loss": 1.2913, "step": 54 }, { "epoch": 0.6010928961748634, "grad_norm": 0.6560409133992583, "learning_rate": 9.727192577679852e-06, "loss": 1.3193, "step": 55 }, { "epoch": 0.6120218579234973, "grad_norm": 0.6699402457543598, "learning_rate": 9.689998780094839e-06, "loss": 1.2974, "step": 56 }, { "epoch": 0.6229508196721312, "grad_norm": 0.6326591480970687, "learning_rate": 9.650509478414483e-06, "loss": 1.2922, "step": 57 }, { "epoch": 0.6338797814207651, "grad_norm": 0.6243610225526046, "learning_rate": 9.608744000544392e-06, "loss": 1.2781, "step": 58 }, { "epoch": 0.644808743169399, "grad_norm": 0.6346243862103911, "learning_rate": 9.564722788456943e-06, "loss": 1.3148, "step": 59 }, { "epoch": 0.6557377049180327, "grad_norm": 0.686173369501883, "learning_rate": 9.51846738818602e-06, "loss": 1.3012, "step": 60 }, { "epoch": 0.6666666666666666, "grad_norm": 0.6396528010720596, "learning_rate": 9.470000439281379e-06, "loss": 1.323, "step": 61 }, { "epoch": 0.6775956284153005, "grad_norm": 0.6280592659051639, "learning_rate": 9.419345663727805e-06, "loss": 1.2583, "step": 62 }, { "epoch": 0.6885245901639344, "grad_norm": 0.6420274496941516, "learning_rate": 9.366527854334464e-06, "loss": 1.2989, "step": 63 }, { "epoch": 0.6994535519125683, "grad_norm": 0.713814113456601, "learning_rate": 9.31157286260014e-06, "loss": 1.3011, "step": 64 }, { "epoch": 0.7103825136612022, "grad_norm": 0.6763883811386611, "learning_rate": 9.25450758606031e-06, "loss": 1.3357, "step": 65 }, { "epoch": 0.7213114754098361, "grad_norm": 0.8257959597194754, "learning_rate": 9.195359955122244e-06, "loss": 1.2578, "step": 66 }, { "epoch": 0.73224043715847, "grad_norm": 0.6993051236307988, "learning_rate": 9.134158919394545e-06, "loss": 1.2804, "step": 67 }, { "epoch": 0.7431693989071039, "grad_norm": 0.6366195914456914, "learning_rate": 9.070934433517872e-06, "loss": 1.2788, "step": 68 }, { "epoch": 0.7540983606557377, "grad_norm": 0.6138838190949962, "learning_rate": 9.005717442503741e-06, "loss": 1.2993, "step": 69 }, { "epoch": 0.7540983606557377, "eval_loss": 1.3025286197662354, "eval_runtime": 58.4556, "eval_samples_per_second": 15.02, "eval_steps_per_second": 0.941, "step": 69 }, { "epoch": 0.7650273224043715, "grad_norm": 0.843204915752198, "learning_rate": 8.938539866588593e-06, "loss": 1.3, "step": 70 }, { "epoch": 0.7759562841530054, "grad_norm": 0.6912329842396053, "learning_rate": 8.869434585610534e-06, "loss": 1.2828, "step": 71 }, { "epoch": 0.7868852459016393, "grad_norm": 0.6473743835448474, "learning_rate": 8.798435422916425e-06, "loss": 1.288, "step": 72 }, { "epoch": 0.7978142076502732, "grad_norm": 0.639092935937692, "learning_rate": 8.725577128807144e-06, "loss": 1.3105, "step": 73 }, { "epoch": 0.8087431693989071, "grad_norm": 0.8137557719640728, "learning_rate": 8.650895363529172e-06, "loss": 1.3156, "step": 74 }, { "epoch": 0.819672131147541, "grad_norm": 0.7383465569055297, "learning_rate": 8.574426679820813e-06, "loss": 1.3037, "step": 75 }, { "epoch": 0.8306010928961749, "grad_norm": 0.6171745331371523, "learning_rate": 8.496208505021572e-06, "loss": 1.3028, "step": 76 }, { "epoch": 0.8415300546448088, "grad_norm": 0.6291247778309232, "learning_rate": 8.416279122753468e-06, "loss": 1.3273, "step": 77 }, { "epoch": 0.8524590163934426, "grad_norm": 0.6987699462118843, "learning_rate": 8.334677654183254e-06, "loss": 1.2971, "step": 78 }, { "epoch": 0.8633879781420765, "grad_norm": 0.6353842002955319, "learning_rate": 8.251444038874685e-06, "loss": 1.2652, "step": 79 }, { "epoch": 0.8743169398907104, "grad_norm": 0.6365174883375557, "learning_rate": 8.166619015240236e-06, "loss": 1.3079, "step": 80 }, { "epoch": 0.8852459016393442, "grad_norm": 0.6261104038690326, "learning_rate": 8.080244100601822e-06, "loss": 1.2919, "step": 81 }, { "epoch": 0.8961748633879781, "grad_norm": 0.602681991114368, "learning_rate": 7.992361570870289e-06, "loss": 1.2903, "step": 82 }, { "epoch": 0.907103825136612, "grad_norm": 0.6280750704621436, "learning_rate": 7.903014439853605e-06, "loss": 1.2749, "step": 83 }, { "epoch": 0.9180327868852459, "grad_norm": 0.603325888267899, "learning_rate": 7.812246438203905e-06, "loss": 1.283, "step": 84 }, { "epoch": 0.9289617486338798, "grad_norm": 0.6537444009300672, "learning_rate": 7.720101992013661e-06, "loss": 1.2899, "step": 85 }, { "epoch": 0.9398907103825137, "grad_norm": 1.0009624572991398, "learning_rate": 7.626626201071494e-06, "loss": 1.2982, "step": 86 }, { "epoch": 0.9508196721311475, "grad_norm": 0.6053851732533568, "learning_rate": 7.53186481678822e-06, "loss": 1.2905, "step": 87 }, { "epoch": 0.9617486338797814, "grad_norm": 0.6388873189001231, "learning_rate": 7.4358642198039835e-06, "loss": 1.2719, "step": 88 }, { "epoch": 0.9726775956284153, "grad_norm": 0.6698323369862678, "learning_rate": 7.338671397287409e-06, "loss": 1.302, "step": 89 }, { "epoch": 0.9836065573770492, "grad_norm": 0.610169592291778, "learning_rate": 7.240333919937893e-06, "loss": 1.27, "step": 90 }, { "epoch": 0.994535519125683, "grad_norm": 0.6741974281210771, "learning_rate": 7.140899918702276e-06, "loss": 1.2652, "step": 91 }, { "epoch": 1.010928961748634, "grad_norm": 1.349139914041096, "learning_rate": 7.040418061217325e-06, "loss": 2.4756, "step": 92 }, { "epoch": 1.010928961748634, "eval_loss": 1.2848957777023315, "eval_runtime": 58.901, "eval_samples_per_second": 14.906, "eval_steps_per_second": 0.934, "step": 92 }, { "epoch": 1.0218579234972678, "grad_norm": 0.7847574907233984, "learning_rate": 6.938937527989511e-06, "loss": 1.227, "step": 93 }, { "epoch": 1.0327868852459017, "grad_norm": 0.7794251316397642, "learning_rate": 6.836507988323785e-06, "loss": 1.2365, "step": 94 }, { "epoch": 1.0437158469945356, "grad_norm": 0.6992975886317206, "learning_rate": 6.733179576013098e-06, "loss": 1.1739, "step": 95 }, { "epoch": 1.0546448087431695, "grad_norm": 0.9341037131870787, "learning_rate": 6.629002864800589e-06, "loss": 1.1329, "step": 96 }, { "epoch": 1.0655737704918034, "grad_norm": 0.7994619006093533, "learning_rate": 6.524028843626433e-06, "loss": 1.1783, "step": 97 }, { "epoch": 1.0765027322404372, "grad_norm": 0.6861103500973762, "learning_rate": 6.418308891671484e-06, "loss": 1.1951, "step": 98 }, { "epoch": 1.0874316939890711, "grad_norm": 0.6992308118882774, "learning_rate": 6.311894753209896e-06, "loss": 1.1571, "step": 99 }, { "epoch": 1.098360655737705, "grad_norm": 0.7268267626106699, "learning_rate": 6.204838512283073e-06, "loss": 1.198, "step": 100 }, { "epoch": 1.1092896174863387, "grad_norm": 0.7004085489452564, "learning_rate": 6.097192567207304e-06, "loss": 1.159, "step": 101 }, { "epoch": 1.1202185792349726, "grad_norm": 0.7081495917097259, "learning_rate": 5.989009604927587e-06, "loss": 1.2062, "step": 102 }, { "epoch": 1.1311475409836065, "grad_norm": 0.7399808878498276, "learning_rate": 5.8803425752301814e-06, "loss": 1.1718, "step": 103 }, { "epoch": 1.1420765027322404, "grad_norm": 0.6863305017042436, "learning_rate": 5.771244664826512e-06, "loss": 1.1899, "step": 104 }, { "epoch": 1.1530054644808743, "grad_norm": 0.7471284178004858, "learning_rate": 5.661769271321113e-06, "loss": 1.1902, "step": 105 }, { "epoch": 1.1639344262295082, "grad_norm": 0.7273544033515342, "learning_rate": 5.55196997707635e-06, "loss": 1.1942, "step": 106 }, { "epoch": 1.174863387978142, "grad_norm": 0.7335993688112395, "learning_rate": 5.441900522986712e-06, "loss": 1.1664, "step": 107 }, { "epoch": 1.185792349726776, "grad_norm": 0.6213254185402152, "learning_rate": 5.33161478217552e-06, "loss": 1.1913, "step": 108 }, { "epoch": 1.1967213114754098, "grad_norm": 0.7044768944751936, "learning_rate": 5.221166733626895e-06, "loss": 1.1805, "step": 109 }, { "epoch": 1.2076502732240437, "grad_norm": 0.7774085220214105, "learning_rate": 5.110610435765935e-06, "loss": 1.2001, "step": 110 }, { "epoch": 1.2185792349726776, "grad_norm": 0.648958403837677, "learning_rate": 5e-06, "loss": 1.1737, "step": 111 }, { "epoch": 1.2295081967213115, "grad_norm": 0.6438647531031693, "learning_rate": 4.8893895642340665e-06, "loss": 1.1653, "step": 112 }, { "epoch": 1.2404371584699454, "grad_norm": 0.6535212242202219, "learning_rate": 4.778833266373107e-06, "loss": 1.1804, "step": 113 }, { "epoch": 1.2513661202185793, "grad_norm": 0.6349246304669695, "learning_rate": 4.668385217824482e-06, "loss": 1.183, "step": 114 }, { "epoch": 1.2622950819672132, "grad_norm": 0.6118998202183843, "learning_rate": 4.558099477013288e-06, "loss": 1.1485, "step": 115 }, { "epoch": 1.2622950819672132, "eval_loss": 1.279318928718567, "eval_runtime": 59.2311, "eval_samples_per_second": 14.823, "eval_steps_per_second": 0.929, "step": 115 }, { "epoch": 1.273224043715847, "grad_norm": 0.6644690764921367, "learning_rate": 4.4480300229236525e-06, "loss": 1.1774, "step": 116 }, { "epoch": 1.2841530054644807, "grad_norm": 0.6176573501543406, "learning_rate": 4.338230728678888e-06, "loss": 1.1829, "step": 117 }, { "epoch": 1.2950819672131146, "grad_norm": 0.6520998519813272, "learning_rate": 4.228755335173488e-06, "loss": 1.157, "step": 118 }, { "epoch": 1.3060109289617485, "grad_norm": 0.6163587130274223, "learning_rate": 4.119657424769819e-06, "loss": 1.1929, "step": 119 }, { "epoch": 1.3169398907103824, "grad_norm": 0.6334729039491491, "learning_rate": 4.010990395072414e-06, "loss": 1.1702, "step": 120 }, { "epoch": 1.3278688524590163, "grad_norm": 0.6353071109842556, "learning_rate": 3.902807432792698e-06, "loss": 1.1765, "step": 121 }, { "epoch": 1.3387978142076502, "grad_norm": 0.6256962037064312, "learning_rate": 3.7951614877169285e-06, "loss": 1.1438, "step": 122 }, { "epoch": 1.349726775956284, "grad_norm": 0.6737260420418549, "learning_rate": 3.6881052467901056e-06, "loss": 1.1803, "step": 123 }, { "epoch": 1.360655737704918, "grad_norm": 0.6279777460538392, "learning_rate": 3.5816911083285165e-06, "loss": 1.1589, "step": 124 }, { "epoch": 1.3715846994535519, "grad_norm": 0.5795711246988038, "learning_rate": 3.4759711563735676e-06, "loss": 1.1426, "step": 125 }, { "epoch": 1.3825136612021858, "grad_norm": 0.6494142257139885, "learning_rate": 3.370997135199413e-06, "loss": 1.1773, "step": 126 }, { "epoch": 1.3934426229508197, "grad_norm": 0.6698609975979317, "learning_rate": 3.2668204239869046e-06, "loss": 1.1726, "step": 127 }, { "epoch": 1.4043715846994536, "grad_norm": 0.6181461595436926, "learning_rate": 3.1634920116762175e-06, "loss": 1.1889, "step": 128 }, { "epoch": 1.4153005464480874, "grad_norm": 0.5826475422912465, "learning_rate": 3.061062472010489e-06, "loss": 1.183, "step": 129 }, { "epoch": 1.4262295081967213, "grad_norm": 0.6370383256377398, "learning_rate": 2.9595819387826753e-06, "loss": 1.1717, "step": 130 }, { "epoch": 1.4371584699453552, "grad_norm": 0.6014048070136724, "learning_rate": 2.8591000812977245e-06, "loss": 1.1879, "step": 131 }, { "epoch": 1.4480874316939891, "grad_norm": 0.6237735238977697, "learning_rate": 2.7596660800621076e-06, "loss": 1.1608, "step": 132 }, { "epoch": 1.459016393442623, "grad_norm": 0.6931128987738802, "learning_rate": 2.661328602712592e-06, "loss": 1.1722, "step": 133 }, { "epoch": 1.469945355191257, "grad_norm": 0.6306146197889505, "learning_rate": 2.5641357801960186e-06, "loss": 1.167, "step": 134 }, { "epoch": 1.4808743169398908, "grad_norm": 0.5867961543489938, "learning_rate": 2.4681351832117815e-06, "loss": 1.1548, "step": 135 }, { "epoch": 1.4918032786885247, "grad_norm": 0.656763324147763, "learning_rate": 2.373373798928507e-06, "loss": 1.1853, "step": 136 }, { "epoch": 1.5027322404371586, "grad_norm": 0.6408411450777574, "learning_rate": 2.2798980079863386e-06, "loss": 1.1578, "step": 137 }, { "epoch": 1.5136612021857925, "grad_norm": 0.621288746974463, "learning_rate": 2.187753561796097e-06, "loss": 1.194, "step": 138 }, { "epoch": 1.5136612021857925, "eval_loss": 1.2727837562561035, "eval_runtime": 58.9805, "eval_samples_per_second": 14.886, "eval_steps_per_second": 0.933, "step": 138 }, { "epoch": 1.5245901639344264, "grad_norm": 0.6007003020421285, "learning_rate": 2.0969855601463966e-06, "loss": 1.1602, "step": 139 }, { "epoch": 1.5355191256830603, "grad_norm": 0.6401788493150212, "learning_rate": 2.0076384291297134e-06, "loss": 1.1668, "step": 140 }, { "epoch": 1.5464480874316942, "grad_norm": 0.6218252779777648, "learning_rate": 1.9197558993981784e-06, "loss": 1.1621, "step": 141 }, { "epoch": 1.5573770491803278, "grad_norm": 0.5843465914656714, "learning_rate": 1.8333809847597644e-06, "loss": 1.1507, "step": 142 }, { "epoch": 1.5683060109289617, "grad_norm": 0.5882355294498812, "learning_rate": 1.748555961125315e-06, "loss": 1.1651, "step": 143 }, { "epoch": 1.5792349726775956, "grad_norm": 0.6132409501369763, "learning_rate": 1.665322345816746e-06, "loss": 1.177, "step": 144 }, { "epoch": 1.5901639344262295, "grad_norm": 0.5807021272968182, "learning_rate": 1.583720877246533e-06, "loss": 1.1459, "step": 145 }, { "epoch": 1.6010928961748634, "grad_norm": 0.6055553753427836, "learning_rate": 1.50379149497843e-06, "loss": 1.1549, "step": 146 }, { "epoch": 1.6120218579234973, "grad_norm": 0.654732872405666, "learning_rate": 1.4255733201791883e-06, "loss": 1.1482, "step": 147 }, { "epoch": 1.6229508196721312, "grad_norm": 0.6062073245967988, "learning_rate": 1.3491046364708294e-06, "loss": 1.1698, "step": 148 }, { "epoch": 1.633879781420765, "grad_norm": 0.6087933206349436, "learning_rate": 1.2744228711928585e-06, "loss": 1.1899, "step": 149 }, { "epoch": 1.644808743169399, "grad_norm": 0.5998767147150661, "learning_rate": 1.2015645770835765e-06, "loss": 1.1843, "step": 150 }, { "epoch": 1.6557377049180326, "grad_norm": 0.577767106715806, "learning_rate": 1.1305654143894674e-06, "loss": 1.1686, "step": 151 }, { "epoch": 1.6666666666666665, "grad_norm": 0.6057300867263614, "learning_rate": 1.0614601334114099e-06, "loss": 1.1801, "step": 152 }, { "epoch": 1.6775956284153004, "grad_norm": 0.6495835993842239, "learning_rate": 9.942825574962595e-07, "loss": 1.1871, "step": 153 }, { "epoch": 1.6885245901639343, "grad_norm": 0.5842017627194859, "learning_rate": 9.290655664821296e-07, "loss": 1.1676, "step": 154 }, { "epoch": 1.6994535519125682, "grad_norm": 0.6068106844021876, "learning_rate": 8.658410806054568e-07, "loss": 1.2038, "step": 155 }, { "epoch": 1.710382513661202, "grad_norm": 0.5805150609705962, "learning_rate": 8.046400448777575e-07, "loss": 1.1816, "step": 156 }, { "epoch": 1.721311475409836, "grad_norm": 0.5936012728557775, "learning_rate": 7.45492413939689e-07, "loss": 1.1574, "step": 157 }, { "epoch": 1.7322404371584699, "grad_norm": 0.6125651408906958, "learning_rate": 6.884271373998608e-07, "loss": 1.1712, "step": 158 }, { "epoch": 1.7431693989071038, "grad_norm": 0.569193812462443, "learning_rate": 6.334721456655363e-07, "loss": 1.1404, "step": 159 }, { "epoch": 1.7540983606557377, "grad_norm": 0.6296357089840062, "learning_rate": 5.806543362721945e-07, "loss": 1.1991, "step": 160 }, { "epoch": 1.7650273224043715, "grad_norm": 0.6123103143915483, "learning_rate": 5.29999560718622e-07, "loss": 1.1609, "step": 161 }, { "epoch": 1.7650273224043715, "eval_loss": 1.2695263624191284, "eval_runtime": 59.067, "eval_samples_per_second": 14.864, "eval_steps_per_second": 0.931, "step": 161 }, { "epoch": 1.7759562841530054, "grad_norm": 0.5921257066861633, "learning_rate": 4.815326118139813e-07, "loss": 1.1631, "step": 162 }, { "epoch": 1.7868852459016393, "grad_norm": 0.5952780410674572, "learning_rate": 4.3527721154305703e-07, "loss": 1.1713, "step": 163 }, { "epoch": 1.7978142076502732, "grad_norm": 0.5699617828228588, "learning_rate": 3.9125599945560866e-07, "loss": 1.1511, "step": 164 }, { "epoch": 1.8087431693989071, "grad_norm": 0.6002641655642085, "learning_rate": 3.4949052158551875e-07, "loss": 1.1649, "step": 165 }, { "epoch": 1.819672131147541, "grad_norm": 0.6301400306145488, "learning_rate": 3.100012199051627e-07, "loss": 1.1665, "step": 166 }, { "epoch": 1.830601092896175, "grad_norm": 0.584278868067724, "learning_rate": 2.728074223201488e-07, "loss": 1.1412, "step": 167 }, { "epoch": 1.8415300546448088, "grad_norm": 0.5953997436020207, "learning_rate": 2.3792733320934348e-07, "loss": 1.1639, "step": 168 }, { "epoch": 1.8524590163934427, "grad_norm": 0.5985913900987108, "learning_rate": 2.053780245147996e-07, "loss": 1.1761, "step": 169 }, { "epoch": 1.8633879781420766, "grad_norm": 0.6024385137849072, "learning_rate": 1.7517542738595071e-07, "loss": 1.1844, "step": 170 }, { "epoch": 1.8743169398907105, "grad_norm": 0.6392054588893102, "learning_rate": 1.47334324382164e-07, "loss": 1.1946, "step": 171 }, { "epoch": 1.8852459016393444, "grad_norm": 0.5893969366629673, "learning_rate": 1.2186834223746612e-07, "loss": 1.1686, "step": 172 }, { "epoch": 1.8961748633879782, "grad_norm": 0.6189772219632783, "learning_rate": 9.878994519098573e-08, "loss": 1.1921, "step": 173 }, { "epoch": 1.9071038251366121, "grad_norm": 0.5898365619848454, "learning_rate": 7.81104288863721e-08, "loss": 1.174, "step": 174 }, { "epoch": 1.918032786885246, "grad_norm": 0.5693284124016588, "learning_rate": 5.983991484317997e-08, "loss": 1.1794, "step": 175 }, { "epoch": 1.92896174863388, "grad_norm": 0.5986212948833937, "learning_rate": 4.398734550292716e-08, "loss": 1.1968, "step": 176 }, { "epoch": 1.9398907103825138, "grad_norm": 0.5655509208847077, "learning_rate": 3.0560479852246304e-08, "loss": 1.1613, "step": 177 }, { "epoch": 1.9508196721311475, "grad_norm": 0.5909594336596956, "learning_rate": 1.9565889625275945e-08, "loss": 1.1501, "step": 178 }, { "epoch": 1.9617486338797814, "grad_norm": 0.5749125303246629, "learning_rate": 1.1008956087144585e-08, "loss": 1.1631, "step": 179 }, { "epoch": 1.9726775956284153, "grad_norm": 0.6109694333516082, "learning_rate": 4.89386740013198e-09, "loss": 1.1787, "step": 180 }, { "epoch": 1.9836065573770492, "grad_norm": 0.5681425630460843, "learning_rate": 1.2236165737850025e-09, "loss": 1.1755, "step": 181 }, { "epoch": 1.994535519125683, "grad_norm": 0.6845144994561815, "learning_rate": 0.0, "loss": 1.1956, "step": 182 } ], "logging_steps": 1, "max_steps": 182, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 91, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.784224770465464e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }