{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025, "grad_norm": 11520.0, "learning_rate": 4.5000000000000003e-07, "loss": 9.2625, "step": 10 }, { "epoch": 0.005, "grad_norm": 3056.0, "learning_rate": 9.500000000000001e-07, "loss": 8.2125, "step": 20 }, { "epoch": 0.0075, "grad_norm": 3056.0, "learning_rate": 1.45e-06, "loss": 8.3969, "step": 30 }, { "epoch": 0.01, "grad_norm": 2128.0, "learning_rate": 1.9500000000000004e-06, "loss": 9.4531, "step": 40 }, { "epoch": 0.0125, "grad_norm": 1912.0, "learning_rate": 2.4500000000000003e-06, "loss": 8.8375, "step": 50 }, { "epoch": 0.015, "grad_norm": 1888.0, "learning_rate": 2.95e-06, "loss": 7.4313, "step": 60 }, { "epoch": 0.0175, "grad_norm": 1512.0, "learning_rate": 3.45e-06, "loss": 7.2719, "step": 70 }, { "epoch": 0.02, "grad_norm": 1384.0, "learning_rate": 3.95e-06, "loss": 5.3406, "step": 80 }, { "epoch": 0.0225, "grad_norm": 2928.0, "learning_rate": 4.450000000000001e-06, "loss": 5.5117, "step": 90 }, { "epoch": 0.025, "grad_norm": 2512.0, "learning_rate": 4.95e-06, "loss": 2.4094, "step": 100 }, { "epoch": 0.0275, "grad_norm": 16.75, "learning_rate": 5.450000000000001e-06, "loss": 1.0006, "step": 110 }, { "epoch": 0.03, "grad_norm": 26.75, "learning_rate": 5.950000000000001e-06, "loss": 0.8576, "step": 120 }, { "epoch": 0.0325, "grad_norm": 20.375, "learning_rate": 6.450000000000001e-06, "loss": 0.4145, "step": 130 }, { "epoch": 0.035, "grad_norm": 139.0, "learning_rate": 6.95e-06, "loss": 1.4863, "step": 140 }, { "epoch": 0.0375, "grad_norm": 24.375, "learning_rate": 7.450000000000001e-06, "loss": 0.4854, "step": 150 }, { "epoch": 0.04, "grad_norm": 60.5, "learning_rate": 7.950000000000002e-06, "loss": 0.3298, "step": 160 }, { "epoch": 0.0425, "grad_norm": 6.21875, "learning_rate": 8.45e-06, "loss": 0.2929, "step": 170 }, { "epoch": 0.045, "grad_norm": 5.3125, "learning_rate": 8.95e-06, "loss": 0.2917, "step": 180 }, { "epoch": 0.0475, "grad_norm": 6.875, "learning_rate": 9.450000000000001e-06, "loss": 0.2365, "step": 190 }, { "epoch": 0.05, "grad_norm": 3.609375, "learning_rate": 9.950000000000001e-06, "loss": 0.92, "step": 200 }, { "epoch": 0.0525, "grad_norm": 2.578125, "learning_rate": 1.045e-05, "loss": 0.2194, "step": 210 }, { "epoch": 0.055, "grad_norm": 2.8125, "learning_rate": 1.095e-05, "loss": 0.2138, "step": 220 }, { "epoch": 0.0575, "grad_norm": 23.0, "learning_rate": 1.145e-05, "loss": 0.2114, "step": 230 }, { "epoch": 0.06, "grad_norm": 3.8125, "learning_rate": 1.195e-05, "loss": 0.2169, "step": 240 }, { "epoch": 0.0625, "grad_norm": 8.125, "learning_rate": 1.2450000000000003e-05, "loss": 0.2167, "step": 250 }, { "epoch": 0.065, "grad_norm": 2.375, "learning_rate": 1.295e-05, "loss": 0.1904, "step": 260 }, { "epoch": 0.0675, "grad_norm": 2.015625, "learning_rate": 1.3450000000000002e-05, "loss": 0.198, "step": 270 }, { "epoch": 0.07, "grad_norm": 2.359375, "learning_rate": 1.3950000000000002e-05, "loss": 0.1944, "step": 280 }, { "epoch": 0.0725, "grad_norm": 2.328125, "learning_rate": 1.4450000000000002e-05, "loss": 0.1808, "step": 290 }, { "epoch": 0.075, "grad_norm": 1.6640625, "learning_rate": 1.4950000000000003e-05, "loss": 0.1957, "step": 300 }, { "epoch": 0.0775, "grad_norm": 1.7265625, "learning_rate": 1.545e-05, "loss": 0.1956, "step": 310 }, { "epoch": 0.08, "grad_norm": 1.8359375, "learning_rate": 1.595e-05, "loss": 0.1965, "step": 320 }, { "epoch": 0.0825, "grad_norm": 2.078125, "learning_rate": 1.645e-05, "loss": 0.1774, "step": 330 }, { "epoch": 0.085, "grad_norm": 2.21875, "learning_rate": 1.6950000000000002e-05, "loss": 0.1829, "step": 340 }, { "epoch": 0.0875, "grad_norm": 2.296875, "learning_rate": 1.7450000000000004e-05, "loss": 0.1771, "step": 350 }, { "epoch": 0.09, "grad_norm": 1.765625, "learning_rate": 1.795e-05, "loss": 0.1938, "step": 360 }, { "epoch": 0.0925, "grad_norm": 2.140625, "learning_rate": 1.845e-05, "loss": 0.1987, "step": 370 }, { "epoch": 0.095, "grad_norm": 3.390625, "learning_rate": 1.8950000000000003e-05, "loss": 0.1864, "step": 380 }, { "epoch": 0.0975, "grad_norm": 3.125, "learning_rate": 1.9450000000000002e-05, "loss": 0.1918, "step": 390 }, { "epoch": 0.1, "grad_norm": 1.8359375, "learning_rate": 1.9950000000000004e-05, "loss": 0.1674, "step": 400 }, { "epoch": 0.1025, "grad_norm": 2.4375, "learning_rate": 1.99996915764479e-05, "loss": 0.1662, "step": 410 }, { "epoch": 0.105, "grad_norm": 1.828125, "learning_rate": 1.9998625445384374e-05, "loss": 0.1918, "step": 420 }, { "epoch": 0.1075, "grad_norm": 1.5234375, "learning_rate": 1.9996797880281935e-05, "loss": 0.168, "step": 430 }, { "epoch": 0.11, "grad_norm": 2.078125, "learning_rate": 1.999420902031673e-05, "loss": 0.1941, "step": 440 }, { "epoch": 0.1125, "grad_norm": 1.9609375, "learning_rate": 1.9990859062640478e-05, "loss": 0.1858, "step": 450 }, { "epoch": 0.115, "grad_norm": 2.203125, "learning_rate": 1.998674826236542e-05, "loss": 0.19, "step": 460 }, { "epoch": 0.1175, "grad_norm": 2.671875, "learning_rate": 1.9981876932544918e-05, "loss": 0.165, "step": 470 }, { "epoch": 0.12, "grad_norm": 1.65625, "learning_rate": 1.997624544414959e-05, "loss": 0.185, "step": 480 }, { "epoch": 0.1225, "grad_norm": 1.9921875, "learning_rate": 1.9969854226039088e-05, "loss": 0.1903, "step": 490 }, { "epoch": 0.125, "grad_norm": 1.34375, "learning_rate": 1.9962703764929413e-05, "loss": 0.1676, "step": 500 }, { "epoch": 0.1275, "grad_norm": 1.2734375, "learning_rate": 1.995479460535586e-05, "loss": 0.1657, "step": 510 }, { "epoch": 0.13, "grad_norm": 1.515625, "learning_rate": 1.9946127349631564e-05, "loss": 0.1604, "step": 520 }, { "epoch": 0.1325, "grad_norm": 1.2734375, "learning_rate": 1.9936702657801586e-05, "loss": 0.1662, "step": 530 }, { "epoch": 0.135, "grad_norm": 1.2578125, "learning_rate": 1.992652124759271e-05, "loss": 0.1697, "step": 540 }, { "epoch": 0.1375, "grad_norm": 1.1640625, "learning_rate": 1.9915583894358744e-05, "loss": 0.1707, "step": 550 }, { "epoch": 0.14, "grad_norm": 1.1796875, "learning_rate": 1.9903891431021477e-05, "loss": 0.1626, "step": 560 }, { "epoch": 0.1425, "grad_norm": 2.0625, "learning_rate": 1.989144474800726e-05, "loss": 0.1898, "step": 570 }, { "epoch": 0.145, "grad_norm": 1.6484375, "learning_rate": 1.9878244793179198e-05, "loss": 0.1819, "step": 580 }, { "epoch": 0.1475, "grad_norm": 1.09375, "learning_rate": 1.9864292571764956e-05, "loss": 0.1628, "step": 590 }, { "epoch": 0.15, "grad_norm": 1.0625, "learning_rate": 1.9849589146280212e-05, "loss": 0.1547, "step": 600 }, { "epoch": 0.1525, "grad_norm": 1.0625, "learning_rate": 1.9834135636447745e-05, "loss": 0.1722, "step": 610 }, { "epoch": 0.155, "grad_norm": 1.8828125, "learning_rate": 1.981793321911216e-05, "loss": 0.1672, "step": 620 }, { "epoch": 0.1575, "grad_norm": 1.390625, "learning_rate": 1.9800983128150263e-05, "loss": 0.1637, "step": 630 }, { "epoch": 0.16, "grad_norm": 1.59375, "learning_rate": 1.978328665437711e-05, "loss": 0.1812, "step": 640 }, { "epoch": 0.1625, "grad_norm": 1.2734375, "learning_rate": 1.9764845145447687e-05, "loss": 0.1768, "step": 650 }, { "epoch": 0.165, "grad_norm": 1.2421875, "learning_rate": 1.974566000575431e-05, "loss": 0.1678, "step": 660 }, { "epoch": 0.1675, "grad_norm": 1.765625, "learning_rate": 1.9725732696319634e-05, "loss": 0.167, "step": 670 }, { "epoch": 0.17, "grad_norm": 1.3125, "learning_rate": 1.9705064734685425e-05, "loss": 0.1616, "step": 680 }, { "epoch": 0.1725, "grad_norm": 1.28125, "learning_rate": 1.9683657694796988e-05, "loss": 0.1851, "step": 690 }, { "epoch": 0.175, "grad_norm": 1.2265625, "learning_rate": 1.9661513206883288e-05, "loss": 0.1734, "step": 700 }, { "epoch": 0.1775, "grad_norm": 1.234375, "learning_rate": 1.963863295733281e-05, "loss": 0.174, "step": 710 }, { "epoch": 0.18, "grad_norm": 2.03125, "learning_rate": 1.961501868856515e-05, "loss": 0.1758, "step": 720 }, { "epoch": 0.1825, "grad_norm": 1.4453125, "learning_rate": 1.9590672198898297e-05, "loss": 0.1601, "step": 730 }, { "epoch": 0.185, "grad_norm": 1.09375, "learning_rate": 1.95655953424117e-05, "loss": 0.1822, "step": 740 }, { "epoch": 0.1875, "grad_norm": 1.421875, "learning_rate": 1.953979002880507e-05, "loss": 0.1604, "step": 750 }, { "epoch": 0.19, "grad_norm": 1.03125, "learning_rate": 1.951325822325295e-05, "loss": 0.1543, "step": 760 }, { "epoch": 0.1925, "grad_norm": 1.5, "learning_rate": 1.9486001946255046e-05, "loss": 0.1754, "step": 770 }, { "epoch": 0.195, "grad_norm": 1.390625, "learning_rate": 1.945802327348239e-05, "loss": 0.1712, "step": 780 }, { "epoch": 0.1975, "grad_norm": 1.296875, "learning_rate": 1.9429324335619234e-05, "loss": 0.1739, "step": 790 }, { "epoch": 0.2, "grad_norm": 1.015625, "learning_rate": 1.93999073182008e-05, "loss": 0.1738, "step": 800 }, { "epoch": 0.2025, "grad_norm": 1.0234375, "learning_rate": 1.936977446144687e-05, "loss": 0.1605, "step": 810 }, { "epoch": 0.205, "grad_norm": 1.15625, "learning_rate": 1.9338928060091145e-05, "loss": 0.1641, "step": 820 }, { "epoch": 0.2075, "grad_norm": 1.3359375, "learning_rate": 1.9307370463206512e-05, "loss": 0.1535, "step": 830 }, { "epoch": 0.21, "grad_norm": 1.4921875, "learning_rate": 1.9275104074026152e-05, "loss": 0.1786, "step": 840 }, { "epoch": 0.2125, "grad_norm": 1.21875, "learning_rate": 1.9242131349760536e-05, "loss": 0.1547, "step": 850 }, { "epoch": 0.215, "grad_norm": 1.34375, "learning_rate": 1.9208454801410267e-05, "loss": 0.1586, "step": 860 }, { "epoch": 0.2175, "grad_norm": 1.2265625, "learning_rate": 1.9174076993574883e-05, "loss": 0.1588, "step": 870 }, { "epoch": 0.22, "grad_norm": 1.21875, "learning_rate": 1.913900054425756e-05, "loss": 0.1701, "step": 880 }, { "epoch": 0.2225, "grad_norm": 1.171875, "learning_rate": 1.9103228124665713e-05, "loss": 0.146, "step": 890 }, { "epoch": 0.225, "grad_norm": 1.359375, "learning_rate": 1.906676245900759e-05, "loss": 0.1494, "step": 900 }, { "epoch": 0.2275, "grad_norm": 0.97265625, "learning_rate": 1.9029606324284814e-05, "loss": 0.1433, "step": 910 }, { "epoch": 0.23, "grad_norm": 1.3203125, "learning_rate": 1.8991762550080905e-05, "loss": 0.1541, "step": 920 }, { "epoch": 0.2325, "grad_norm": 1.2109375, "learning_rate": 1.895323401834578e-05, "loss": 0.1653, "step": 930 }, { "epoch": 0.235, "grad_norm": 1.3125, "learning_rate": 1.8914023663176306e-05, "loss": 0.1694, "step": 940 }, { "epoch": 0.2375, "grad_norm": 1.265625, "learning_rate": 1.8874134470592836e-05, "loss": 0.159, "step": 950 }, { "epoch": 0.24, "grad_norm": 1.15625, "learning_rate": 1.8833569478311818e-05, "loss": 0.1498, "step": 960 }, { "epoch": 0.2425, "grad_norm": 1.234375, "learning_rate": 1.879233177551447e-05, "loss": 0.1562, "step": 970 }, { "epoch": 0.245, "grad_norm": 0.98828125, "learning_rate": 1.8750424502611528e-05, "loss": 0.153, "step": 980 }, { "epoch": 0.2475, "grad_norm": 1.3125, "learning_rate": 1.870785085100406e-05, "loss": 0.1437, "step": 990 }, { "epoch": 0.25, "grad_norm": 1.0390625, "learning_rate": 1.8664614062840472e-05, "loss": 0.1529, "step": 1000 }, { "epoch": 0.2525, "grad_norm": 1.21875, "learning_rate": 1.8620717430769586e-05, "loss": 0.1583, "step": 1010 }, { "epoch": 0.255, "grad_norm": 1.328125, "learning_rate": 1.8576164297689877e-05, "loss": 0.1521, "step": 1020 }, { "epoch": 0.2575, "grad_norm": 1.421875, "learning_rate": 1.8530958056494934e-05, "loss": 0.1493, "step": 1030 }, { "epoch": 0.26, "grad_norm": 1.2109375, "learning_rate": 1.8485102149815038e-05, "loss": 0.1373, "step": 1040 }, { "epoch": 0.2625, "grad_norm": 1.453125, "learning_rate": 1.8438600069755027e-05, "loss": 0.1409, "step": 1050 }, { "epoch": 0.265, "grad_norm": 0.94921875, "learning_rate": 1.8391455357628334e-05, "loss": 0.143, "step": 1060 }, { "epoch": 0.2675, "grad_norm": 1.0703125, "learning_rate": 1.834367160368732e-05, "loss": 0.1576, "step": 1070 }, { "epoch": 0.27, "grad_norm": 1.4140625, "learning_rate": 1.8295252446849842e-05, "loss": 0.1499, "step": 1080 }, { "epoch": 0.2725, "grad_norm": 1.140625, "learning_rate": 1.8246201574422164e-05, "loss": 0.1485, "step": 1090 }, { "epoch": 0.275, "grad_norm": 0.828125, "learning_rate": 1.8196522721818128e-05, "loss": 0.1275, "step": 1100 }, { "epoch": 0.2775, "grad_norm": 1.2109375, "learning_rate": 1.8146219672274693e-05, "loss": 0.126, "step": 1110 }, { "epoch": 0.28, "grad_norm": 1.2265625, "learning_rate": 1.8095296256563845e-05, "loss": 0.1368, "step": 1120 }, { "epoch": 0.2825, "grad_norm": 1.125, "learning_rate": 1.8043756352700844e-05, "loss": 0.1352, "step": 1130 }, { "epoch": 0.285, "grad_norm": 1.21875, "learning_rate": 1.799160388564892e-05, "loss": 0.1349, "step": 1140 }, { "epoch": 0.2875, "grad_norm": 2.96875, "learning_rate": 1.7938842827020347e-05, "loss": 0.1525, "step": 1150 }, { "epoch": 0.29, "grad_norm": 1.1015625, "learning_rate": 1.788547719477402e-05, "loss": 0.1312, "step": 1160 }, { "epoch": 0.2925, "grad_norm": 0.98828125, "learning_rate": 1.7831511052909442e-05, "loss": 0.1346, "step": 1170 }, { "epoch": 0.295, "grad_norm": 1.03125, "learning_rate": 1.777694851115726e-05, "loss": 0.1398, "step": 1180 }, { "epoch": 0.2975, "grad_norm": 1.0078125, "learning_rate": 1.772179372466627e-05, "loss": 0.1519, "step": 1190 }, { "epoch": 0.3, "grad_norm": 1.171875, "learning_rate": 1.7666050893687007e-05, "loss": 0.1287, "step": 1200 }, { "epoch": 0.3025, "grad_norm": 1.15625, "learning_rate": 1.760972426325187e-05, "loss": 0.1235, "step": 1210 }, { "epoch": 0.305, "grad_norm": 1.0078125, "learning_rate": 1.7552818122851838e-05, "loss": 0.1182, "step": 1220 }, { "epoch": 0.3075, "grad_norm": 1.203125, "learning_rate": 1.7495336806109828e-05, "loss": 0.1382, "step": 1230 }, { "epoch": 0.31, "grad_norm": 1.015625, "learning_rate": 1.7437284690450656e-05, "loss": 0.1331, "step": 1240 }, { "epoch": 0.3125, "grad_norm": 1.40625, "learning_rate": 1.7378666196767685e-05, "loss": 0.1365, "step": 1250 }, { "epoch": 0.315, "grad_norm": 1.1796875, "learning_rate": 1.7319485789086164e-05, "loss": 0.1214, "step": 1260 }, { "epoch": 0.3175, "grad_norm": 0.97265625, "learning_rate": 1.7259747974223264e-05, "loss": 0.1443, "step": 1270 }, { "epoch": 0.32, "grad_norm": 1.0390625, "learning_rate": 1.719945730144487e-05, "loss": 0.134, "step": 1280 }, { "epoch": 0.3225, "grad_norm": 0.97265625, "learning_rate": 1.7138618362119136e-05, "loss": 0.1256, "step": 1290 }, { "epoch": 0.325, "grad_norm": 1.515625, "learning_rate": 1.7077235789366845e-05, "loss": 0.1234, "step": 1300 }, { "epoch": 0.3275, "grad_norm": 1.140625, "learning_rate": 1.701531425770856e-05, "loss": 0.1358, "step": 1310 }, { "epoch": 0.33, "grad_norm": 0.88671875, "learning_rate": 1.6952858482708657e-05, "loss": 0.1177, "step": 1320 }, { "epoch": 0.3325, "grad_norm": 0.92578125, "learning_rate": 1.6889873220616208e-05, "loss": 0.134, "step": 1330 }, { "epoch": 0.335, "grad_norm": 1.0546875, "learning_rate": 1.682636326800278e-05, "loss": 0.1412, "step": 1340 }, { "epoch": 0.3375, "grad_norm": 1.6484375, "learning_rate": 1.6762333461397157e-05, "loss": 0.1308, "step": 1350 }, { "epoch": 0.34, "grad_norm": 1.234375, "learning_rate": 1.6697788676917007e-05, "loss": 0.1279, "step": 1360 }, { "epoch": 0.3425, "grad_norm": 1.1484375, "learning_rate": 1.6632733829897567e-05, "loss": 0.1292, "step": 1370 }, { "epoch": 0.345, "grad_norm": 1.140625, "learning_rate": 1.656717387451731e-05, "loss": 0.1227, "step": 1380 }, { "epoch": 0.3475, "grad_norm": 0.9609375, "learning_rate": 1.650111380342066e-05, "loss": 0.1399, "step": 1390 }, { "epoch": 0.35, "grad_norm": 1.3203125, "learning_rate": 1.643455864733779e-05, "loss": 0.1266, "step": 1400 }, { "epoch": 0.3525, "grad_norm": 1.2578125, "learning_rate": 1.636751347470152e-05, "loss": 0.1357, "step": 1410 }, { "epoch": 0.355, "grad_norm": 1.125, "learning_rate": 1.6299983391261323e-05, "loss": 0.129, "step": 1420 }, { "epoch": 0.3575, "grad_norm": 1.046875, "learning_rate": 1.6231973539694506e-05, "loss": 0.1121, "step": 1430 }, { "epoch": 0.36, "grad_norm": 1.3828125, "learning_rate": 1.616348909921457e-05, "loss": 0.1432, "step": 1440 }, { "epoch": 0.3625, "grad_norm": 0.90625, "learning_rate": 1.6094535285176812e-05, "loss": 0.1196, "step": 1450 }, { "epoch": 0.365, "grad_norm": 1.125, "learning_rate": 1.6025117348681133e-05, "loss": 0.1102, "step": 1460 }, { "epoch": 0.3675, "grad_norm": 1.0078125, "learning_rate": 1.5955240576172165e-05, "loss": 0.1382, "step": 1470 }, { "epoch": 0.37, "grad_norm": 0.94140625, "learning_rate": 1.588491028903667e-05, "loss": 0.1265, "step": 1480 }, { "epoch": 0.3725, "grad_norm": 0.953125, "learning_rate": 1.5814131843198307e-05, "loss": 0.132, "step": 1490 }, { "epoch": 0.375, "grad_norm": 1.1640625, "learning_rate": 1.5742910628709757e-05, "loss": 0.1338, "step": 1500 }, { "epoch": 0.3775, "grad_norm": 1.0625, "learning_rate": 1.567125206934225e-05, "loss": 0.1113, "step": 1510 }, { "epoch": 0.38, "grad_norm": 1.15625, "learning_rate": 1.5599161622172515e-05, "loss": 0.1284, "step": 1520 }, { "epoch": 0.3825, "grad_norm": 1.296875, "learning_rate": 1.552664477716722e-05, "loss": 0.1238, "step": 1530 }, { "epoch": 0.385, "grad_norm": 1.34375, "learning_rate": 1.5453707056764865e-05, "loss": 0.1266, "step": 1540 }, { "epoch": 0.3875, "grad_norm": 1.2890625, "learning_rate": 1.538035401545525e-05, "loss": 0.1168, "step": 1550 }, { "epoch": 0.39, "grad_norm": 1.046875, "learning_rate": 1.5306591239356477e-05, "loss": 0.1228, "step": 1560 }, { "epoch": 0.3925, "grad_norm": 1.125, "learning_rate": 1.5232424345789517e-05, "loss": 0.1217, "step": 1570 }, { "epoch": 0.395, "grad_norm": 0.75, "learning_rate": 1.5157858982850475e-05, "loss": 0.1247, "step": 1580 }, { "epoch": 0.3975, "grad_norm": 0.92578125, "learning_rate": 1.5082900828980425e-05, "loss": 0.1108, "step": 1590 }, { "epoch": 0.4, "grad_norm": 1.40625, "learning_rate": 1.5007555592532996e-05, "loss": 0.1141, "step": 1600 }, { "epoch": 0.4025, "grad_norm": 0.87890625, "learning_rate": 1.4931829011339658e-05, "loss": 0.1063, "step": 1610 }, { "epoch": 0.405, "grad_norm": 1.25, "learning_rate": 1.4855726852272754e-05, "loss": 0.1221, "step": 1620 }, { "epoch": 0.4075, "grad_norm": 0.703125, "learning_rate": 1.4779254910806336e-05, "loss": 0.125, "step": 1630 }, { "epoch": 0.41, "grad_norm": 1.03125, "learning_rate": 1.4702419010574824e-05, "loss": 0.1349, "step": 1640 }, { "epoch": 0.4125, "grad_norm": 1.203125, "learning_rate": 1.4625225002929501e-05, "loss": 0.121, "step": 1650 }, { "epoch": 0.415, "grad_norm": 0.91015625, "learning_rate": 1.4547678766492919e-05, "loss": 0.1073, "step": 1660 }, { "epoch": 0.4175, "grad_norm": 1.015625, "learning_rate": 1.4469786206711213e-05, "loss": 0.119, "step": 1670 }, { "epoch": 0.42, "grad_norm": 1.0546875, "learning_rate": 1.4391553255404385e-05, "loss": 0.1125, "step": 1680 }, { "epoch": 0.4225, "grad_norm": 0.7421875, "learning_rate": 1.4312985870314568e-05, "loss": 0.1129, "step": 1690 }, { "epoch": 0.425, "grad_norm": 1.6328125, "learning_rate": 1.4234090034652324e-05, "loss": 0.1219, "step": 1700 }, { "epoch": 0.4275, "grad_norm": 1.34375, "learning_rate": 1.4154871756640996e-05, "loss": 0.1299, "step": 1710 }, { "epoch": 0.43, "grad_norm": 0.82421875, "learning_rate": 1.4075337069059159e-05, "loss": 0.1122, "step": 1720 }, { "epoch": 0.4325, "grad_norm": 0.9921875, "learning_rate": 1.3995492028781202e-05, "loss": 0.1127, "step": 1730 }, { "epoch": 0.435, "grad_norm": 0.7578125, "learning_rate": 1.3915342716316075e-05, "loss": 0.1033, "step": 1740 }, { "epoch": 0.4375, "grad_norm": 1.1015625, "learning_rate": 1.3834895235344242e-05, "loss": 0.1149, "step": 1750 }, { "epoch": 0.44, "grad_norm": 0.94140625, "learning_rate": 1.3754155712252832e-05, "loss": 0.1083, "step": 1760 }, { "epoch": 0.4425, "grad_norm": 0.9296875, "learning_rate": 1.367313029566913e-05, "loss": 0.116, "step": 1770 }, { "epoch": 0.445, "grad_norm": 1.328125, "learning_rate": 1.3591825155992311e-05, "loss": 0.1157, "step": 1780 }, { "epoch": 0.4475, "grad_norm": 0.875, "learning_rate": 1.3510246484923547e-05, "loss": 0.1066, "step": 1790 }, { "epoch": 0.45, "grad_norm": 0.93359375, "learning_rate": 1.3428400494994484e-05, "loss": 0.0956, "step": 1800 }, { "epoch": 0.4525, "grad_norm": 1.09375, "learning_rate": 1.3346293419094134e-05, "loss": 0.0991, "step": 1810 }, { "epoch": 0.455, "grad_norm": 1.0390625, "learning_rate": 1.326393150999422e-05, "loss": 0.1087, "step": 1820 }, { "epoch": 0.4575, "grad_norm": 1.015625, "learning_rate": 1.3181321039872995e-05, "loss": 0.0984, "step": 1830 }, { "epoch": 0.46, "grad_norm": 1.0625, "learning_rate": 1.3098468299837599e-05, "loss": 0.1007, "step": 1840 }, { "epoch": 0.4625, "grad_norm": 1.1171875, "learning_rate": 1.3015379599444958e-05, "loss": 0.1127, "step": 1850 }, { "epoch": 0.465, "grad_norm": 1.625, "learning_rate": 1.2932061266221304e-05, "loss": 0.1068, "step": 1860 }, { "epoch": 0.4675, "grad_norm": 0.87890625, "learning_rate": 1.2848519645180296e-05, "loss": 0.0954, "step": 1870 }, { "epoch": 0.47, "grad_norm": 1.09375, "learning_rate": 1.276476109833981e-05, "loss": 0.1052, "step": 1880 }, { "epoch": 0.4725, "grad_norm": 0.88671875, "learning_rate": 1.268079200423748e-05, "loss": 0.1035, "step": 1890 }, { "epoch": 0.475, "grad_norm": 0.88671875, "learning_rate": 1.2596618757444918e-05, "loss": 0.0977, "step": 1900 }, { "epoch": 0.4775, "grad_norm": 0.79296875, "learning_rate": 1.2512247768080756e-05, "loss": 0.1016, "step": 1910 }, { "epoch": 0.48, "grad_norm": 0.890625, "learning_rate": 1.2427685461322497e-05, "loss": 0.0858, "step": 1920 }, { "epoch": 0.4825, "grad_norm": 1.078125, "learning_rate": 1.2342938276917187e-05, "loss": 0.1034, "step": 1930 }, { "epoch": 0.485, "grad_norm": 1.1171875, "learning_rate": 1.2258012668691039e-05, "loss": 0.1034, "step": 1940 }, { "epoch": 0.4875, "grad_norm": 1.0078125, "learning_rate": 1.2172915104057919e-05, "loss": 0.0958, "step": 1950 }, { "epoch": 0.49, "grad_norm": 1.0390625, "learning_rate": 1.2087652063526837e-05, "loss": 0.1055, "step": 1960 }, { "epoch": 0.4925, "grad_norm": 0.8125, "learning_rate": 1.2002230040208446e-05, "loss": 0.1064, "step": 1970 }, { "epoch": 0.495, "grad_norm": 0.91796875, "learning_rate": 1.1916655539320547e-05, "loss": 0.1016, "step": 1980 }, { "epoch": 0.4975, "grad_norm": 0.82421875, "learning_rate": 1.1830935077692696e-05, "loss": 0.0929, "step": 1990 }, { "epoch": 0.5, "grad_norm": 0.93359375, "learning_rate": 1.174507518326992e-05, "loss": 0.0931, "step": 2000 }, { "epoch": 0.5025, "grad_norm": 0.92578125, "learning_rate": 1.1659082394615608e-05, "loss": 0.1083, "step": 2010 }, { "epoch": 0.505, "grad_norm": 1.171875, "learning_rate": 1.1572963260413547e-05, "loss": 0.0919, "step": 2020 }, { "epoch": 0.5075, "grad_norm": 1.0234375, "learning_rate": 1.1486724338969233e-05, "loss": 0.1003, "step": 2030 }, { "epoch": 0.51, "grad_norm": 0.99609375, "learning_rate": 1.1400372197710414e-05, "loss": 0.1094, "step": 2040 }, { "epoch": 0.5125, "grad_norm": 0.796875, "learning_rate": 1.131391341268698e-05, "loss": 0.0971, "step": 2050 }, { "epoch": 0.515, "grad_norm": 0.98828125, "learning_rate": 1.122735456807015e-05, "loss": 1.1244, "step": 2060 }, { "epoch": 0.5175, "grad_norm": 1.40625, "learning_rate": 1.1140702255651064e-05, "loss": 0.0971, "step": 2070 }, { "epoch": 0.52, "grad_norm": 0.9921875, "learning_rate": 1.1053963074338798e-05, "loss": 0.0965, "step": 2080 }, { "epoch": 0.5225, "grad_norm": 1.125, "learning_rate": 1.0967143629657842e-05, "loss": 0.1068, "step": 2090 }, { "epoch": 0.525, "grad_norm": 0.765625, "learning_rate": 1.0880250533245038e-05, "loss": 0.0962, "step": 2100 }, { "epoch": 0.5275, "grad_norm": 0.890625, "learning_rate": 1.0793290402346094e-05, "loss": 0.1056, "step": 2110 }, { "epoch": 0.53, "grad_norm": 0.73828125, "learning_rate": 1.0706269859311669e-05, "loss": 0.0993, "step": 2120 }, { "epoch": 0.5325, "grad_norm": 0.8046875, "learning_rate": 1.0619195531093019e-05, "loss": 0.0913, "step": 2130 }, { "epoch": 0.535, "grad_norm": 1.1640625, "learning_rate": 1.0532074048737364e-05, "loss": 0.113, "step": 2140 }, { "epoch": 0.5375, "grad_norm": 0.87890625, "learning_rate": 1.0444912046882889e-05, "loss": 0.0884, "step": 2150 }, { "epoch": 0.54, "grad_norm": 0.78515625, "learning_rate": 1.0357716163253498e-05, "loss": 0.0867, "step": 2160 }, { "epoch": 0.5425, "grad_norm": 0.9765625, "learning_rate": 1.027049303815332e-05, "loss": 0.1046, "step": 2170 }, { "epoch": 0.545, "grad_norm": 0.9765625, "learning_rate": 1.018324931396103e-05, "loss": 0.1045, "step": 2180 }, { "epoch": 0.5475, "grad_norm": 1.0078125, "learning_rate": 1.0095991634624001e-05, "loss": 0.097, "step": 2190 }, { "epoch": 0.55, "grad_norm": 0.94921875, "learning_rate": 1.0008726645152354e-05, "loss": 0.0992, "step": 2200 }, { "epoch": 0.5525, "grad_norm": 1.2421875, "learning_rate": 9.921460991112891e-06, "loss": 0.0876, "step": 2210 }, { "epoch": 0.555, "grad_norm": 0.9765625, "learning_rate": 9.834201318123026e-06, "loss": 0.0966, "step": 2220 }, { "epoch": 0.5575, "grad_norm": 0.8046875, "learning_rate": 9.746954271344703e-06, "loss": 0.0945, "step": 2230 }, { "epoch": 0.56, "grad_norm": 0.8359375, "learning_rate": 9.659726494978325e-06, "loss": 0.0938, "step": 2240 }, { "epoch": 0.5625, "grad_norm": 0.7265625, "learning_rate": 9.572524631756779e-06, "loss": 0.0946, "step": 2250 }, { "epoch": 0.565, "grad_norm": 1.03125, "learning_rate": 9.48535532243956e-06, "loss": 0.1001, "step": 2260 }, { "epoch": 0.5675, "grad_norm": 0.98828125, "learning_rate": 9.398225205307067e-06, "loss": 0.0928, "step": 2270 }, { "epoch": 0.57, "grad_norm": 0.953125, "learning_rate": 9.311140915655055e-06, "loss": 0.0982, "step": 2280 }, { "epoch": 0.5725, "grad_norm": 0.734375, "learning_rate": 9.224109085289343e-06, "loss": 0.0866, "step": 2290 }, { "epoch": 0.575, "grad_norm": 0.8515625, "learning_rate": 9.137136342020768e-06, "loss": 0.0922, "step": 2300 }, { "epoch": 0.5775, "grad_norm": 0.84765625, "learning_rate": 9.050229309160462e-06, "loss": 0.0811, "step": 2310 }, { "epoch": 0.58, "grad_norm": 0.796875, "learning_rate": 8.963394605015453e-06, "loss": 0.0965, "step": 2320 }, { "epoch": 0.5825, "grad_norm": 0.85546875, "learning_rate": 8.876638842384645e-06, "loss": 0.0972, "step": 2330 }, { "epoch": 0.585, "grad_norm": 0.84375, "learning_rate": 8.789968628055262e-06, "loss": 0.104, "step": 2340 }, { "epoch": 0.5875, "grad_norm": 0.8203125, "learning_rate": 8.703390562299683e-06, "loss": 0.0911, "step": 2350 }, { "epoch": 0.59, "grad_norm": 1.03125, "learning_rate": 8.616911238372812e-06, "loss": 0.0935, "step": 2360 }, { "epoch": 0.5925, "grad_norm": 0.8671875, "learning_rate": 8.530537242009985e-06, "loss": 0.0901, "step": 2370 }, { "epoch": 0.595, "grad_norm": 0.87890625, "learning_rate": 8.444275150925431e-06, "loss": 0.0931, "step": 2380 }, { "epoch": 0.5975, "grad_norm": 1.3046875, "learning_rate": 8.358131534311372e-06, "loss": 0.0908, "step": 2390 }, { "epoch": 0.6, "grad_norm": 1.5078125, "learning_rate": 8.272112952337728e-06, "loss": 0.0948, "step": 2400 }, { "epoch": 0.6025, "grad_norm": 0.76953125, "learning_rate": 8.186225955652547e-06, "loss": 0.0927, "step": 2410 }, { "epoch": 0.605, "grad_norm": 1.1640625, "learning_rate": 8.100477084883156e-06, "loss": 0.0915, "step": 2420 }, { "epoch": 0.6075, "grad_norm": 0.921875, "learning_rate": 8.014872870138055e-06, "loss": 0.0853, "step": 2430 }, { "epoch": 0.61, "grad_norm": 0.75390625, "learning_rate": 7.929419830509619e-06, "loss": 0.0804, "step": 2440 }, { "epoch": 0.6125, "grad_norm": 0.796875, "learning_rate": 7.844124473577672e-06, "loss": 0.0952, "step": 2450 }, { "epoch": 0.615, "grad_norm": 0.9921875, "learning_rate": 7.758993294913873e-06, "loss": 0.0886, "step": 2460 }, { "epoch": 0.6175, "grad_norm": 0.83203125, "learning_rate": 7.674032777587076e-06, "loss": 0.0942, "step": 2470 }, { "epoch": 0.62, "grad_norm": 0.75390625, "learning_rate": 7.589249391669615e-06, "loss": 0.0926, "step": 2480 }, { "epoch": 0.6225, "grad_norm": 0.796875, "learning_rate": 7.5046495937445774e-06, "loss": 0.0877, "step": 2490 }, { "epoch": 0.625, "grad_norm": 0.7734375, "learning_rate": 7.4202398264141195e-06, "loss": 0.0781, "step": 2500 }, { "epoch": 0.6275, "grad_norm": 0.75, "learning_rate": 7.336026517808828e-06, "loss": 0.076, "step": 2510 }, { "epoch": 0.63, "grad_norm": 0.8359375, "learning_rate": 7.252016081098195e-06, "loss": 0.0922, "step": 2520 }, { "epoch": 0.6325, "grad_norm": 1.1796875, "learning_rate": 7.168214914002243e-06, "loss": 0.0816, "step": 2530 }, { "epoch": 0.635, "grad_norm": 1.0859375, "learning_rate": 7.08462939830429e-06, "loss": 0.084, "step": 2540 }, { "epoch": 0.6375, "grad_norm": 0.9140625, "learning_rate": 7.001265899364968e-06, "loss": 0.0965, "step": 2550 }, { "epoch": 0.64, "grad_norm": 0.7734375, "learning_rate": 6.918130765637485e-06, "loss": 0.0843, "step": 2560 }, { "epoch": 0.6425, "grad_norm": 0.87890625, "learning_rate": 6.835230328184139e-06, "loss": 0.0765, "step": 2570 }, { "epoch": 0.645, "grad_norm": 1.3828125, "learning_rate": 6.752570900194206e-06, "loss": 0.0901, "step": 2580 }, { "epoch": 0.6475, "grad_norm": 0.85546875, "learning_rate": 6.670158776503158e-06, "loss": 0.0867, "step": 2590 }, { "epoch": 0.65, "grad_norm": 0.6640625, "learning_rate": 6.588000233113282e-06, "loss": 0.0768, "step": 2600 }, { "epoch": 0.6525, "grad_norm": 0.8203125, "learning_rate": 6.506101526715749e-06, "loss": 0.0793, "step": 2610 }, { "epoch": 0.655, "grad_norm": 0.859375, "learning_rate": 6.424468894214137e-06, "loss": 0.074, "step": 2620 }, { "epoch": 0.6575, "grad_norm": 0.8125, "learning_rate": 6.343108552249457e-06, "loss": 0.0854, "step": 2630 }, { "epoch": 0.66, "grad_norm": 0.96484375, "learning_rate": 6.262026696726759e-06, "loss": 0.0863, "step": 2640 }, { "epoch": 0.6625, "grad_norm": 0.84375, "learning_rate": 6.181229502343256e-06, "loss": 0.0836, "step": 2650 }, { "epoch": 0.665, "grad_norm": 0.79296875, "learning_rate": 6.100723122118121e-06, "loss": 0.0874, "step": 2660 }, { "epoch": 0.6675, "grad_norm": 0.69921875, "learning_rate": 6.020513686923897e-06, "loss": 0.0918, "step": 2670 }, { "epoch": 0.67, "grad_norm": 2.328125, "learning_rate": 5.940607305019623e-06, "loss": 0.0879, "step": 2680 }, { "epoch": 0.6725, "grad_norm": 0.93359375, "learning_rate": 5.861010061585652e-06, "loss": 0.0759, "step": 2690 }, { "epoch": 0.675, "grad_norm": 0.9296875, "learning_rate": 5.781728018260242e-06, "loss": 0.0771, "step": 2700 }, { "epoch": 0.6775, "grad_norm": 0.89453125, "learning_rate": 5.7027672126779465e-06, "loss": 0.0883, "step": 2710 }, { "epoch": 0.68, "grad_norm": 0.82421875, "learning_rate": 5.624133658009817e-06, "loss": 0.0766, "step": 2720 }, { "epoch": 0.6825, "grad_norm": 0.73046875, "learning_rate": 5.5458333425054825e-06, "loss": 0.0898, "step": 2730 }, { "epoch": 0.685, "grad_norm": 0.9453125, "learning_rate": 5.467872229037128e-06, "loss": 0.0862, "step": 2740 }, { "epoch": 0.6875, "grad_norm": 0.79296875, "learning_rate": 5.390256254645379e-06, "loss": 0.079, "step": 2750 }, { "epoch": 0.69, "grad_norm": 1.140625, "learning_rate": 5.3129913300871846e-06, "loss": 0.0916, "step": 2760 }, { "epoch": 0.6925, "grad_norm": 0.83984375, "learning_rate": 5.236083339385698e-06, "loss": 0.0799, "step": 2770 }, { "epoch": 0.695, "grad_norm": 1.0625, "learning_rate": 5.159538139382168e-06, "loss": 0.0832, "step": 2780 }, { "epoch": 0.6975, "grad_norm": 0.80078125, "learning_rate": 5.0833615592899355e-06, "loss": 0.0911, "step": 2790 }, { "epoch": 0.7, "grad_norm": 1.1640625, "learning_rate": 5.0075594002505035e-06, "loss": 0.0826, "step": 2800 }, { "epoch": 0.7025, "grad_norm": 0.66015625, "learning_rate": 4.932137434891759e-06, "loss": 0.09, "step": 2810 }, { "epoch": 0.705, "grad_norm": 0.8671875, "learning_rate": 4.857101406888378e-06, "loss": 0.0826, "step": 2820 }, { "epoch": 0.7075, "grad_norm": 1.015625, "learning_rate": 4.782457030524405e-06, "loss": 0.0807, "step": 2830 }, { "epoch": 0.71, "grad_norm": 0.73828125, "learning_rate": 4.708209990258096e-06, "loss": 0.0894, "step": 2840 }, { "epoch": 0.7125, "grad_norm": 0.65625, "learning_rate": 4.634365940289026e-06, "loss": 0.0748, "step": 2850 }, { "epoch": 0.715, "grad_norm": 0.84375, "learning_rate": 4.560930504127501e-06, "loss": 0.0733, "step": 2860 }, { "epoch": 0.7175, "grad_norm": 0.765625, "learning_rate": 4.4879092741663e-06, "loss": 0.0892, "step": 2870 }, { "epoch": 0.72, "grad_norm": 0.93359375, "learning_rate": 4.4153078112547955e-06, "loss": 0.0944, "step": 2880 }, { "epoch": 0.7225, "grad_norm": 0.97265625, "learning_rate": 4.343131644275478e-06, "loss": 0.0806, "step": 2890 }, { "epoch": 0.725, "grad_norm": 1.1171875, "learning_rate": 4.271386269722909e-06, "loss": 0.0833, "step": 2900 }, { "epoch": 0.7275, "grad_norm": 1.0625, "learning_rate": 4.20007715128513e-06, "loss": 0.0834, "step": 2910 }, { "epoch": 0.73, "grad_norm": 0.87890625, "learning_rate": 4.129209719427596e-06, "loss": 0.078, "step": 2920 }, { "epoch": 0.7325, "grad_norm": 1.0546875, "learning_rate": 4.058789370979616e-06, "loss": 0.0851, "step": 2930 }, { "epoch": 0.735, "grad_norm": 0.87109375, "learning_rate": 3.988821468723368e-06, "loss": 0.0817, "step": 2940 }, { "epoch": 0.7375, "grad_norm": 1.140625, "learning_rate": 3.919311340985501e-06, "loss": 0.0807, "step": 2950 }, { "epoch": 0.74, "grad_norm": 0.75390625, "learning_rate": 3.8502642812313574e-06, "loss": 0.0837, "step": 2960 }, { "epoch": 0.7425, "grad_norm": 0.76953125, "learning_rate": 3.781685547661872e-06, "loss": 0.0832, "step": 2970 }, { "epoch": 0.745, "grad_norm": 0.59765625, "learning_rate": 3.7135803628131152e-06, "loss": 0.0776, "step": 2980 }, { "epoch": 0.7475, "grad_norm": 1.0078125, "learning_rate": 3.645953913158593e-06, "loss": 0.0766, "step": 2990 }, { "epoch": 0.75, "grad_norm": 0.62890625, "learning_rate": 3.5788113487142707e-06, "loss": 0.0801, "step": 3000 }, { "epoch": 0.7525, "grad_norm": 0.7734375, "learning_rate": 3.5121577826463894e-06, "loss": 0.0714, "step": 3010 }, { "epoch": 0.755, "grad_norm": 0.62890625, "learning_rate": 3.445998290882062e-06, "loss": 0.0807, "step": 3020 }, { "epoch": 0.7575, "grad_norm": 0.94921875, "learning_rate": 3.380337911722731e-06, "loss": 0.089, "step": 3030 }, { "epoch": 0.76, "grad_norm": 0.84765625, "learning_rate": 3.315181645460488e-06, "loss": 0.089, "step": 3040 }, { "epoch": 0.7625, "grad_norm": 0.85546875, "learning_rate": 3.2505344539972703e-06, "loss": 0.0827, "step": 3050 }, { "epoch": 0.765, "grad_norm": 0.828125, "learning_rate": 3.1864012604669993e-06, "loss": 0.0844, "step": 3060 }, { "epoch": 0.7675, "grad_norm": 1.09375, "learning_rate": 3.1227869488606643e-06, "loss": 0.084, "step": 3070 }, { "epoch": 0.77, "grad_norm": 0.78515625, "learning_rate": 3.0596963636543865e-06, "loss": 0.0827, "step": 3080 }, { "epoch": 0.7725, "grad_norm": 0.73828125, "learning_rate": 2.9971343094404935e-06, "loss": 0.085, "step": 3090 }, { "epoch": 0.775, "grad_norm": 1.0859375, "learning_rate": 2.935105550561631e-06, "loss": 0.085, "step": 3100 }, { "epoch": 0.7775, "grad_norm": 1.0703125, "learning_rate": 2.8736148107479467e-06, "loss": 0.0843, "step": 3110 }, { "epoch": 0.78, "grad_norm": 0.640625, "learning_rate": 2.8126667727573444e-06, "loss": 0.0808, "step": 3120 }, { "epoch": 0.7825, "grad_norm": 0.6640625, "learning_rate": 2.7522660780188904e-06, "loss": 0.0753, "step": 3130 }, { "epoch": 0.785, "grad_norm": 0.78125, "learning_rate": 2.6924173262793386e-06, "loss": 0.0722, "step": 3140 }, { "epoch": 0.7875, "grad_norm": 0.70703125, "learning_rate": 2.633125075252848e-06, "loss": 0.0869, "step": 3150 }, { "epoch": 0.79, "grad_norm": 0.73046875, "learning_rate": 2.5743938402738923e-06, "loss": 0.0783, "step": 3160 }, { "epoch": 0.7925, "grad_norm": 1.1328125, "learning_rate": 2.5162280939534024e-06, "loss": 0.0836, "step": 3170 }, { "epoch": 0.795, "grad_norm": 0.7421875, "learning_rate": 2.4586322658381566e-06, "loss": 0.0835, "step": 3180 }, { "epoch": 0.7975, "grad_norm": 0.921875, "learning_rate": 2.4016107420734625e-06, "loss": 0.078, "step": 3190 }, { "epoch": 0.8, "grad_norm": 0.734375, "learning_rate": 2.3451678650691202e-06, "loss": 0.0689, "step": 3200 }, { "epoch": 0.8025, "grad_norm": 0.8359375, "learning_rate": 2.2893079331687397e-06, "loss": 0.0754, "step": 3210 }, { "epoch": 0.805, "grad_norm": 0.71484375, "learning_rate": 2.2340352003224065e-06, "loss": 0.0768, "step": 3220 }, { "epoch": 0.8075, "grad_norm": 0.92578125, "learning_rate": 2.1793538757627217e-06, "loss": 0.0758, "step": 3230 }, { "epoch": 0.81, "grad_norm": 1.640625, "learning_rate": 2.125268123684262e-06, "loss": 0.0782, "step": 3240 }, { "epoch": 0.8125, "grad_norm": 0.69140625, "learning_rate": 2.0717820629264496e-06, "loss": 0.0795, "step": 3250 }, { "epoch": 0.815, "grad_norm": 0.8125, "learning_rate": 2.0188997666598853e-06, "loss": 0.0745, "step": 3260 }, { "epoch": 0.8175, "grad_norm": 0.99609375, "learning_rate": 1.9666252620761782e-06, "loss": 0.0716, "step": 3270 }, { "epoch": 0.82, "grad_norm": 0.74609375, "learning_rate": 1.9149625300812403e-06, "loss": 0.0802, "step": 3280 }, { "epoch": 0.8225, "grad_norm": 0.8671875, "learning_rate": 1.863915504992131e-06, "loss": 0.0793, "step": 3290 }, { "epoch": 0.825, "grad_norm": 0.69140625, "learning_rate": 1.8134880742374483e-06, "loss": 0.0735, "step": 3300 }, { "epoch": 0.8275, "grad_norm": 0.7890625, "learning_rate": 1.763684078061283e-06, "loss": 0.0728, "step": 3310 }, { "epoch": 0.83, "grad_norm": 1.0625, "learning_rate": 1.7145073092307663e-06, "loss": 0.074, "step": 3320 }, { "epoch": 0.8325, "grad_norm": 0.82421875, "learning_rate": 1.6659615127472384e-06, "loss": 0.08, "step": 3330 }, { "epoch": 0.835, "grad_norm": 0.67578125, "learning_rate": 1.6180503855610563e-06, "loss": 0.0757, "step": 3340 }, { "epoch": 0.8375, "grad_norm": 1.015625, "learning_rate": 1.5707775762900546e-06, "loss": 0.0767, "step": 3350 }, { "epoch": 0.84, "grad_norm": 0.6875, "learning_rate": 1.5241466849416797e-06, "loss": 0.0759, "step": 3360 }, { "epoch": 0.8425, "grad_norm": 0.91796875, "learning_rate": 1.4781612626388475e-06, "loss": 0.0832, "step": 3370 }, { "epoch": 0.845, "grad_norm": 0.86328125, "learning_rate": 1.4328248113495046e-06, "loss": 0.0805, "step": 3380 }, { "epoch": 0.8475, "grad_norm": 0.68359375, "learning_rate": 1.3881407836199446e-06, "loss": 0.0706, "step": 3390 }, { "epoch": 0.85, "grad_norm": 0.96484375, "learning_rate": 1.3441125823118816e-06, "loss": 0.077, "step": 3400 }, { "epoch": 0.8525, "grad_norm": 0.75, "learning_rate": 1.3007435603433039e-06, "loss": 0.0767, "step": 3410 }, { "epoch": 0.855, "grad_norm": 0.8515625, "learning_rate": 1.2580370204331482e-06, "loss": 0.0736, "step": 3420 }, { "epoch": 0.8575, "grad_norm": 0.80078125, "learning_rate": 1.2159962148497728e-06, "loss": 0.0834, "step": 3430 }, { "epoch": 0.86, "grad_norm": 0.86328125, "learning_rate": 1.17462434516329e-06, "loss": 0.082, "step": 3440 }, { "epoch": 0.8625, "grad_norm": 0.93359375, "learning_rate": 1.1339245620017525e-06, "loss": 0.0748, "step": 3450 }, { "epoch": 0.865, "grad_norm": 0.8515625, "learning_rate": 1.093899964811227e-06, "loss": 0.0843, "step": 3460 }, { "epoch": 0.8675, "grad_norm": 0.7890625, "learning_rate": 1.05455360161975e-06, "loss": 0.0746, "step": 3470 }, { "epoch": 0.87, "grad_norm": 0.91015625, "learning_rate": 1.0158884688052107e-06, "loss": 0.0829, "step": 3480 }, { "epoch": 0.8725, "grad_norm": 0.91796875, "learning_rate": 9.779075108671766e-07, "loss": 0.0839, "step": 3490 }, { "epoch": 0.875, "grad_norm": 0.79296875, "learning_rate": 9.406136202026417e-07, "loss": 0.0751, "step": 3500 }, { "epoch": 0.8775, "grad_norm": 0.8125, "learning_rate": 9.040096368857676e-07, "loss": 0.0854, "step": 3510 }, { "epoch": 0.88, "grad_norm": 0.60546875, "learning_rate": 8.680983484516048e-07, "loss": 0.0787, "step": 3520 }, { "epoch": 0.8825, "grad_norm": 0.9296875, "learning_rate": 8.328824896838006e-07, "loss": 0.0797, "step": 3530 }, { "epoch": 0.885, "grad_norm": 0.7890625, "learning_rate": 7.983647424063468e-07, "loss": 0.0849, "step": 3540 }, { "epoch": 0.8875, "grad_norm": 1.0859375, "learning_rate": 7.645477352793362e-07, "loss": 0.0772, "step": 3550 }, { "epoch": 0.89, "grad_norm": 0.765625, "learning_rate": 7.314340435987921e-07, "loss": 0.0699, "step": 3560 }, { "epoch": 0.8925, "grad_norm": 0.9375, "learning_rate": 6.990261891005401e-07, "loss": 0.0888, "step": 3570 }, { "epoch": 0.895, "grad_norm": 0.8359375, "learning_rate": 6.673266397681732e-07, "loss": 0.0847, "step": 3580 }, { "epoch": 0.8975, "grad_norm": 1.0, "learning_rate": 6.363378096451011e-07, "loss": 0.082, "step": 3590 }, { "epoch": 0.9, "grad_norm": 0.76171875, "learning_rate": 6.060620586507183e-07, "loss": 0.0807, "step": 3600 }, { "epoch": 0.9025, "grad_norm": 0.859375, "learning_rate": 5.765016924006784e-07, "loss": 0.0756, "step": 3610 }, { "epoch": 0.905, "grad_norm": 0.9296875, "learning_rate": 5.476589620313189e-07, "loss": 0.0739, "step": 3620 }, { "epoch": 0.9075, "grad_norm": 0.81640625, "learning_rate": 5.195360640282254e-07, "loss": 0.0855, "step": 3630 }, { "epoch": 0.91, "grad_norm": 0.8828125, "learning_rate": 4.92135140058967e-07, "loss": 0.0788, "step": 3640 }, { "epoch": 0.9125, "grad_norm": 0.73828125, "learning_rate": 4.65458276809988e-07, "loss": 0.0777, "step": 3650 }, { "epoch": 0.915, "grad_norm": 0.8828125, "learning_rate": 4.3950750582770985e-07, "loss": 0.0854, "step": 3660 }, { "epoch": 0.9175, "grad_norm": 1.1640625, "learning_rate": 4.142848033638136e-07, "loss": 0.0809, "step": 3670 }, { "epoch": 0.92, "grad_norm": 0.78125, "learning_rate": 3.897920902247465e-07, "loss": 0.0777, "step": 3680 }, { "epoch": 0.9225, "grad_norm": 0.9765625, "learning_rate": 3.6603123162544216e-07, "loss": 0.0778, "step": 3690 }, { "epoch": 0.925, "grad_norm": 0.64453125, "learning_rate": 3.4300403704727536e-07, "loss": 0.0748, "step": 3700 }, { "epoch": 0.9275, "grad_norm": 0.8125, "learning_rate": 3.207122601002677e-07, "loss": 0.0739, "step": 3710 }, { "epoch": 0.93, "grad_norm": 1.15625, "learning_rate": 2.991575983895401e-07, "loss": 0.085, "step": 3720 }, { "epoch": 0.9325, "grad_norm": 0.99609375, "learning_rate": 2.783416933860339e-07, "loss": 0.0871, "step": 3730 }, { "epoch": 0.935, "grad_norm": 1.6796875, "learning_rate": 2.582661303015066e-07, "loss": 0.0875, "step": 3740 }, { "epoch": 0.9375, "grad_norm": 0.97265625, "learning_rate": 2.3893243796781374e-07, "loss": 0.0806, "step": 3750 }, { "epoch": 0.94, "grad_norm": 0.82421875, "learning_rate": 2.203420887204788e-07, "loss": 0.0823, "step": 3760 }, { "epoch": 0.9425, "grad_norm": 0.65625, "learning_rate": 2.0249649828657292e-07, "loss": 0.0795, "step": 3770 }, { "epoch": 0.945, "grad_norm": 0.64453125, "learning_rate": 1.8539702567689888e-07, "loss": 0.0783, "step": 3780 }, { "epoch": 0.9475, "grad_norm": 0.67578125, "learning_rate": 1.6904497308250056e-07, "loss": 0.0833, "step": 3790 }, { "epoch": 0.95, "grad_norm": 1.1796875, "learning_rate": 1.534415857754923e-07, "loss": 0.0948, "step": 3800 }, { "epoch": 0.9525, "grad_norm": 1.109375, "learning_rate": 1.3858805201422798e-07, "loss": 0.0802, "step": 3810 }, { "epoch": 0.955, "grad_norm": 0.71875, "learning_rate": 1.2448550295281136e-07, "loss": 0.075, "step": 3820 }, { "epoch": 0.9575, "grad_norm": 0.8203125, "learning_rate": 1.1113501255495485e-07, "loss": 0.08, "step": 3830 }, { "epoch": 0.96, "grad_norm": 0.84375, "learning_rate": 9.853759751218828e-08, "loss": 0.0726, "step": 3840 }, { "epoch": 0.9625, "grad_norm": 1.0390625, "learning_rate": 8.66942171664431e-08, "loss": 0.0858, "step": 3850 }, { "epoch": 0.965, "grad_norm": 1.0234375, "learning_rate": 7.560577343698306e-08, "loss": 0.0814, "step": 3860 }, { "epoch": 0.9675, "grad_norm": 0.78515625, "learning_rate": 6.527311075173127e-08, "loss": 0.0812, "step": 3870 }, { "epoch": 0.97, "grad_norm": 0.7578125, "learning_rate": 5.5697015982955074e-08, "loss": 0.0802, "step": 3880 }, { "epoch": 0.9725, "grad_norm": 0.78515625, "learning_rate": 4.687821838734618e-08, "loss": 0.0798, "step": 3890 }, { "epoch": 0.975, "grad_norm": 0.71875, "learning_rate": 3.881738955048398e-08, "loss": 0.0711, "step": 3900 }, { "epoch": 0.9775, "grad_norm": 0.8203125, "learning_rate": 3.1515143335695274e-08, "loss": 0.0792, "step": 3910 }, { "epoch": 0.98, "grad_norm": 0.8125, "learning_rate": 2.4972035837299567e-08, "loss": 0.073, "step": 3920 }, { "epoch": 0.9825, "grad_norm": 0.70703125, "learning_rate": 1.9188565338265073e-08, "loss": 0.0808, "step": 3930 }, { "epoch": 0.985, "grad_norm": 0.7734375, "learning_rate": 1.416517227226355e-08, "loss": 0.0819, "step": 3940 }, { "epoch": 0.9875, "grad_norm": 0.83984375, "learning_rate": 9.902239190124896e-09, "loss": 0.0793, "step": 3950 }, { "epoch": 0.99, "grad_norm": 0.6640625, "learning_rate": 6.400090730708242e-09, "loss": 0.0752, "step": 3960 }, { "epoch": 0.9925, "grad_norm": 0.89453125, "learning_rate": 3.6589935961772736e-09, "loss": 0.077, "step": 3970 }, { "epoch": 0.995, "grad_norm": 0.7109375, "learning_rate": 1.6791565316920388e-09, "loss": 0.0783, "step": 3980 }, { "epoch": 0.9975, "grad_norm": 1.09375, "learning_rate": 4.6073030951054866e-10, "loss": 0.0784, "step": 3990 }, { "epoch": 1.0, "grad_norm": 0.984375, "learning_rate": 3.807717505743824e-12, "loss": 0.0748, "step": 4000 } ], "logging_steps": 10, "max_steps": 4000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.739849055207424e+17, "train_batch_size": 12, "trial_name": null, "trial_params": null }