{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9829619921363041, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0163826998689384, "grad_norm": 10.452404975891113, "learning_rate": 0.00010434782608695653, "loss": 2.7741, "step": 25 }, { "epoch": 0.0327653997378768, "grad_norm": 11.673263549804688, "learning_rate": 0.0001999979723762913, "loss": 2.2936, "step": 50 }, { "epoch": 0.0491480996068152, "grad_norm": 0.0, "learning_rate": 0.00019982342283922738, "loss": 2.0868, "step": 75 }, { "epoch": 0.0655307994757536, "grad_norm": 12.91142463684082, "learning_rate": 0.000199367821181338, "loss": 1.9288, "step": 100 }, { "epoch": 0.08191349934469201, "grad_norm": 0.0, "learning_rate": 0.00019863245014577668, "loss": 10.3322, "step": 125 }, { "epoch": 0.0982961992136304, "grad_norm": 11.256654739379883, "learning_rate": 0.00019761938016434448, "loss": 2.2306, "step": 150 }, { "epoch": 0.11467889908256881, "grad_norm": 5.196111679077148, "learning_rate": 0.0001963314635282044, "loss": 7.8393, "step": 175 }, { "epoch": 0.1310615989515072, "grad_norm": 6.82452917098999, "learning_rate": 0.00019477232635727637, "loss": 3.5118, "step": 200 }, { "epoch": 0.1474442988204456, "grad_norm": 0.0, "learning_rate": 0.000192946358390923, "loss": 3.4974, "step": 225 }, { "epoch": 0.16382699868938402, "grad_norm": 0.0, "learning_rate": 0.0001908587006286703, "loss": 4.7892, "step": 250 }, { "epoch": 0.18020969855832242, "grad_norm": 7.4760284423828125, "learning_rate": 0.00018851523085576096, "loss": 1.7774, "step": 275 }, { "epoch": 0.1965923984272608, "grad_norm": 0.0, "learning_rate": 0.0001859225470942928, "loss": 2.1847, "step": 300 }, { "epoch": 0.2129750982961992, "grad_norm": 3.8474948406219482, "learning_rate": 0.00018308794902653533, "loss": 3.0351, "step": 325 }, { "epoch": 0.22935779816513763, "grad_norm": 69.14743041992188, "learning_rate": 0.00018001941744272767, "loss": 2.0028, "step": 350 }, { "epoch": 0.24574049803407602, "grad_norm": 10.878889083862305, "learning_rate": 0.00017672559177122165, "loss": 1.89, "step": 375 }, { "epoch": 0.2621231979030144, "grad_norm": 6.749469757080078, "learning_rate": 0.00017321574575423406, "loss": 2.203, "step": 400 }, { "epoch": 0.27850589777195284, "grad_norm": 6.272464752197266, "learning_rate": 0.0001694997613376928, "loss": 1.8728, "step": 425 }, { "epoch": 0.2948885976408912, "grad_norm": 65.30072784423828, "learning_rate": 0.0001655881008486903, "loss": 2.1526, "step": 450 }, { "epoch": 0.3112712975098296, "grad_norm": 0.0, "learning_rate": 0.00016149177753887746, "loss": 1.8009, "step": 475 }, { "epoch": 0.32765399737876805, "grad_norm": 14.52846622467041, "learning_rate": 0.0001572223245767338, "loss": 2.0906, "step": 500 }, { "epoch": 0.3440366972477064, "grad_norm": 10.511902809143066, "learning_rate": 0.00015279176257601557, "loss": 2.9393, "step": 525 }, { "epoch": 0.36041939711664484, "grad_norm": 8.591642379760742, "learning_rate": 0.00014821256575180507, "loss": 1.1961, "step": 550 }, { "epoch": 0.3768020969855832, "grad_norm": 9.35624885559082, "learning_rate": 0.00014349762679944896, "loss": 1.3758, "step": 575 }, { "epoch": 0.3931847968545216, "grad_norm": 51.37662887573242, "learning_rate": 0.0001386602205952681, "loss": 1.4211, "step": 600 }, { "epoch": 0.40956749672346004, "grad_norm": 9.251007080078125, "learning_rate": 0.00013371396682124005, "loss": 3.5546, "step": 625 }, { "epoch": 0.4259501965923984, "grad_norm": 0.0, "learning_rate": 0.0001286727916188834, "loss": 2.1943, "step": 650 }, { "epoch": 0.44233289646133683, "grad_norm": 8.098660469055176, "learning_rate": 0.00012355088838030776, "loss": 1.8534, "step": 675 }, { "epoch": 0.45871559633027525, "grad_norm": 13.290648460388184, "learning_rate": 0.00011836267778682133, "loss": 2.822, "step": 700 }, { "epoch": 0.4750982961992136, "grad_norm": 4.341573715209961, "learning_rate": 0.00011312276720760782, "loss": 4.252, "step": 725 }, { "epoch": 0.49148099606815204, "grad_norm": 49.69887924194336, "learning_rate": 0.0001078459095727845, "loss": 1.6486, "step": 750 }, { "epoch": 0.5078636959370905, "grad_norm": 12.931384086608887, "learning_rate": 0.00010254696183663511, "loss": 1.824, "step": 775 }, { "epoch": 0.5242463958060288, "grad_norm": 8.917986869812012, "learning_rate": 9.724084314796292e-05, "loss": 1.1574, "step": 800 }, { "epoch": 0.5406290956749672, "grad_norm": 11.101146697998047, "learning_rate": 9.194249284533576e-05, "loss": 3.1605, "step": 825 }, { "epoch": 0.5570117955439057, "grad_norm": 8.709878921508789, "learning_rate": 8.666682839548719e-05, "loss": 2.3045, "step": 850 }, { "epoch": 0.573394495412844, "grad_norm": 5.888107776641846, "learning_rate": 8.142870339329723e-05, "loss": 1.6091, "step": 875 }, { "epoch": 0.5897771952817824, "grad_norm": 0.0, "learning_rate": 7.624286574160409e-05, "loss": 1.6872, "step": 900 }, { "epoch": 0.6061598951507209, "grad_norm": 9.354924201965332, "learning_rate": 7.112391612859118e-05, "loss": 1.5703, "step": 925 }, { "epoch": 0.6225425950196593, "grad_norm": 8.378700256347656, "learning_rate": 6.608626691965541e-05, "loss": 3.3726, "step": 950 }, { "epoch": 0.6389252948885976, "grad_norm": 40.91008758544922, "learning_rate": 6.114410157949745e-05, "loss": 3.2629, "step": 975 }, { "epoch": 0.6553079947575361, "grad_norm": 14.765325546264648, "learning_rate": 5.631133473868018e-05, "loss": 1.8236, "step": 1000 }, { "epoch": 0.6716906946264745, "grad_norm": 7.710732936859131, "learning_rate": 5.160157301708732e-05, "loss": 1.7716, "step": 1025 }, { "epoch": 0.6880733944954128, "grad_norm": 21.088882446289062, "learning_rate": 4.70280767145842e-05, "loss": 1.5484, "step": 1050 }, { "epoch": 0.7044560943643512, "grad_norm": 6.960242748260498, "learning_rate": 4.260372247674004e-05, "loss": 1.1968, "step": 1075 }, { "epoch": 0.7208387942332897, "grad_norm": 0.0, "learning_rate": 3.8340967040725995e-05, "loss": 1.7298, "step": 1100 }, { "epoch": 0.737221494102228, "grad_norm": 0.0, "learning_rate": 3.425181216346213e-05, "loss": 21.8374, "step": 1125 }, { "epoch": 0.7536041939711664, "grad_norm": 14.913447380065918, "learning_rate": 3.0347770830758316e-05, "loss": 2.7944, "step": 1150 }, { "epoch": 0.7699868938401049, "grad_norm": 12.520014762878418, "learning_rate": 2.6639834842586365e-05, "loss": 2.1394, "step": 1175 }, { "epoch": 0.7863695937090432, "grad_norm": 8.859058380126953, "learning_rate": 2.3138443865747062e-05, "loss": 1.3066, "step": 1200 }, { "epoch": 0.8027522935779816, "grad_norm": 14.920854568481445, "learning_rate": 1.985345604106439e-05, "loss": 2.6596, "step": 1225 }, { "epoch": 0.8191349934469201, "grad_norm": 32.096649169921875, "learning_rate": 1.679412022786172e-05, "loss": 9.7768, "step": 1250 }, { "epoch": 0.8355176933158585, "grad_norm": 0.0, "learning_rate": 1.396904996386551e-05, "loss": 8.7551, "step": 1275 }, { "epoch": 0.8519003931847968, "grad_norm": 10.609190940856934, "learning_rate": 1.1386199213852755e-05, "loss": 1.4679, "step": 1300 }, { "epoch": 0.8682830930537353, "grad_norm": 4.4171624183654785, "learning_rate": 9.052839975320836e-06, "loss": 1.4764, "step": 1325 }, { "epoch": 0.8846657929226737, "grad_norm": 0.0, "learning_rate": 6.975541804231478e-06, "loss": 3.5415, "step": 1350 }, { "epoch": 0.901048492791612, "grad_norm": 0.0, "learning_rate": 5.160153318473815e-06, "loss": 2.2546, "step": 1375 }, { "epoch": 0.9174311926605505, "grad_norm": 5.587663173675537, "learning_rate": 3.611785731123274e-06, "loss": 2.6802, "step": 1400 }, { "epoch": 0.9338138925294889, "grad_norm": 9.083599090576172, "learning_rate": 2.3347984598581783e-06, "loss": 1.555, "step": 1425 }, { "epoch": 0.9501965923984272, "grad_norm": 4.485108852386475, "learning_rate": 1.3327868530511934e-06, "loss": 1.736, "step": 1450 }, { "epoch": 0.9665792922673656, "grad_norm": 9.273879051208496, "learning_rate": 6.08572067092017e-07, "loss": 1.7019, "step": 1475 }, { "epoch": 0.9829619921363041, "grad_norm": 8.116653442382812, "learning_rate": 1.6419312344211347e-07, "loss": 5.0805, "step": 1500 } ], "logging_steps": 25, "max_steps": 1526, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6665443461341952.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }