{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0206685378923195, "eval_steps": 500, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025516713447307986, "grad_norm": 22.250633239746094, "learning_rate": 1.1862244897959185e-06, "loss": 2.3912, "step": 100 }, { "epoch": 0.05103342689461597, "grad_norm": 18.358823776245117, "learning_rate": 2.461734693877551e-06, "loss": 0.9986, "step": 200 }, { "epoch": 0.07655014034192396, "grad_norm": 72.52508544921875, "learning_rate": 3.737244897959184e-06, "loss": 0.8566, "step": 300 }, { "epoch": 0.10206685378923194, "grad_norm": 23.034740447998047, "learning_rate": 5.012755102040817e-06, "loss": 0.7413, "step": 400 }, { "epoch": 0.12758356723653994, "grad_norm": 45.18895721435547, "learning_rate": 6.288265306122449e-06, "loss": 0.7472, "step": 500 }, { "epoch": 0.15310028068384793, "grad_norm": 49.649410247802734, "learning_rate": 7.563775510204082e-06, "loss": 0.7018, "step": 600 }, { "epoch": 0.17861699413115592, "grad_norm": 37.48847198486328, "learning_rate": 8.839285714285714e-06, "loss": 0.6664, "step": 700 }, { "epoch": 0.20413370757846389, "grad_norm": 41.822566986083984, "learning_rate": 9.987241281542388e-06, "loss": 0.6929, "step": 800 }, { "epoch": 0.22965042102577188, "grad_norm": 31.396697998046875, "learning_rate": 9.84547774312447e-06, "loss": 0.6417, "step": 900 }, { "epoch": 0.25516713447307987, "grad_norm": 18.564287185668945, "learning_rate": 9.70371420470655e-06, "loss": 0.6491, "step": 1000 }, { "epoch": 0.28068384792038786, "grad_norm": 40.36125946044922, "learning_rate": 9.561950666288631e-06, "loss": 0.6364, "step": 1100 }, { "epoch": 0.30620056136769586, "grad_norm": 22.446331024169922, "learning_rate": 9.420187127870712e-06, "loss": 0.6668, "step": 1200 }, { "epoch": 0.33171727481500385, "grad_norm": 19.624980926513672, "learning_rate": 9.278423589452793e-06, "loss": 0.6713, "step": 1300 }, { "epoch": 0.35723398826231184, "grad_norm": 14.320337295532227, "learning_rate": 9.136660051034874e-06, "loss": 0.5721, "step": 1400 }, { "epoch": 0.3827507017096198, "grad_norm": 25.894556045532227, "learning_rate": 8.994896512616955e-06, "loss": 0.597, "step": 1500 }, { "epoch": 0.40826741515692777, "grad_norm": 40.72581100463867, "learning_rate": 8.853132974199036e-06, "loss": 0.594, "step": 1600 }, { "epoch": 0.43378412860423576, "grad_norm": 21.20204734802246, "learning_rate": 8.711369435781117e-06, "loss": 0.5882, "step": 1700 }, { "epoch": 0.45930084205154376, "grad_norm": 49.4229850769043, "learning_rate": 8.5696058973632e-06, "loss": 0.6018, "step": 1800 }, { "epoch": 0.48481755549885175, "grad_norm": 21.961654663085938, "learning_rate": 8.42784235894528e-06, "loss": 0.5872, "step": 1900 }, { "epoch": 0.5103342689461597, "grad_norm": 15.664958000183105, "learning_rate": 8.286078820527362e-06, "loss": 0.5802, "step": 2000 }, { "epoch": 0.5358509823934677, "grad_norm": 32.18100357055664, "learning_rate": 8.144315282109441e-06, "loss": 0.5614, "step": 2100 }, { "epoch": 0.5613676958407757, "grad_norm": 14.913948059082031, "learning_rate": 8.002551743691524e-06, "loss": 0.5166, "step": 2200 }, { "epoch": 0.5868844092880837, "grad_norm": 14.455422401428223, "learning_rate": 7.860788205273603e-06, "loss": 0.5722, "step": 2300 }, { "epoch": 0.6124011227353917, "grad_norm": 49.46743392944336, "learning_rate": 7.719024666855686e-06, "loss": 0.5289, "step": 2400 }, { "epoch": 0.6379178361826997, "grad_norm": 16.0176944732666, "learning_rate": 7.577261128437766e-06, "loss": 0.5803, "step": 2500 }, { "epoch": 0.6634345496300077, "grad_norm": 11.620992660522461, "learning_rate": 7.435497590019848e-06, "loss": 0.5355, "step": 2600 }, { "epoch": 0.6889512630773157, "grad_norm": 24.707399368286133, "learning_rate": 7.293734051601928e-06, "loss": 0.5694, "step": 2700 }, { "epoch": 0.7144679765246237, "grad_norm": 35.875511169433594, "learning_rate": 7.15197051318401e-06, "loss": 0.569, "step": 2800 }, { "epoch": 0.7399846899719316, "grad_norm": 32.23057556152344, "learning_rate": 7.010206974766091e-06, "loss": 0.5016, "step": 2900 }, { "epoch": 0.7655014034192396, "grad_norm": 24.113296508789062, "learning_rate": 6.868443436348172e-06, "loss": 0.5586, "step": 3000 }, { "epoch": 0.7910181168665475, "grad_norm": 25.87450408935547, "learning_rate": 6.726679897930253e-06, "loss": 0.5154, "step": 3100 }, { "epoch": 0.8165348303138555, "grad_norm": 32.350250244140625, "learning_rate": 6.584916359512335e-06, "loss": 0.5318, "step": 3200 }, { "epoch": 0.8420515437611635, "grad_norm": 22.004989624023438, "learning_rate": 6.443152821094415e-06, "loss": 0.5461, "step": 3300 }, { "epoch": 0.8675682572084715, "grad_norm": 39.10634994506836, "learning_rate": 6.301389282676497e-06, "loss": 0.4885, "step": 3400 }, { "epoch": 0.8930849706557795, "grad_norm": 24.000411987304688, "learning_rate": 6.159625744258577e-06, "loss": 0.573, "step": 3500 }, { "epoch": 0.9186016841030875, "grad_norm": 15.122103691101074, "learning_rate": 6.017862205840658e-06, "loss": 0.5145, "step": 3600 }, { "epoch": 0.9441183975503955, "grad_norm": 29.39118003845215, "learning_rate": 5.87609866742274e-06, "loss": 0.5214, "step": 3700 }, { "epoch": 0.9696351109977035, "grad_norm": 21.783594131469727, "learning_rate": 5.73433512900482e-06, "loss": 0.493, "step": 3800 }, { "epoch": 0.9951518244450115, "grad_norm": 6.728275299072266, "learning_rate": 5.592571590586902e-06, "loss": 0.5413, "step": 3900 }, { "epoch": 1.0206685378923195, "grad_norm": 23.733306884765625, "learning_rate": 5.450808052168982e-06, "loss": 0.442, "step": 4000 } ], "logging_steps": 100, "max_steps": 7838, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }