{ "best_metric": null, "best_model_checkpoint": null, "epoch": 22.857142857142858, "eval_steps": 100, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 8.333333333333334, "grad_norm": 0.8547727465629578, "learning_rate": 4.962019382530521e-05, "loss": 0.2973, "num_input_tokens_seen": 1240440, "step": 50 }, { "epoch": 16.666666666666668, "grad_norm": 0.5775834321975708, "learning_rate": 4.849231551964771e-05, "loss": 0.0165, "num_input_tokens_seen": 2467560, "step": 100 }, { "epoch": 16.666666666666668, "eval_loss": 1.8753944635391235, "eval_runtime": 0.0881, "eval_samples_per_second": 669.817, "eval_steps_per_second": 22.706, "num_input_tokens_seen": 2467560, "step": 100 }, { "epoch": 25.0, "grad_norm": 2.726205825805664, "learning_rate": 4.665063509461097e-05, "loss": 0.0117, "num_input_tokens_seen": 3714840, "step": 150 }, { "epoch": 33.333333333333336, "grad_norm": 0.17272137105464935, "learning_rate": 4.415111107797445e-05, "loss": 0.0053, "num_input_tokens_seen": 4957560, "step": 200 }, { "epoch": 33.333333333333336, "eval_loss": 2.176852226257324, "eval_runtime": 0.0894, "eval_samples_per_second": 660.304, "eval_steps_per_second": 22.383, "num_input_tokens_seen": 4957560, "step": 200 }, { "epoch": 41.666666666666664, "grad_norm": 0.2948448061943054, "learning_rate": 4.1069690242163484e-05, "loss": 0.0054, "num_input_tokens_seen": 6181800, "step": 250 }, { "epoch": 50.0, "grad_norm": 0.24040719866752625, "learning_rate": 3.7500000000000003e-05, "loss": 0.0033, "num_input_tokens_seen": 7418880, "step": 300 }, { "epoch": 50.0, "eval_loss": 2.383711814880371, "eval_runtime": 0.1033, "eval_samples_per_second": 571.316, "eval_steps_per_second": 19.367, "num_input_tokens_seen": 7418880, "step": 300 }, { "epoch": 38.888888888888886, "grad_norm": 0.5190167427062988, "learning_rate": 4.215604094671835e-05, "loss": 0.3195, "num_input_tokens_seen": 13014840, "step": 350 }, { "epoch": 44.44444444444444, "grad_norm": 0.37368133664131165, "learning_rate": 3.9928964792569655e-05, "loss": 0.0144, "num_input_tokens_seen": 18598920, "step": 400 }, { "epoch": 44.44444444444444, "eval_loss": 0.9897297024726868, "eval_runtime": 1.0997, "eval_samples_per_second": 75.477, "eval_steps_per_second": 2.728, "num_input_tokens_seen": 18598920, "step": 400 }, { "epoch": 50.0, "grad_norm": 0.15123282372951508, "learning_rate": 3.7500000000000003e-05, "loss": 0.0082, "num_input_tokens_seen": 24251520, "step": 450 }, { "epoch": 55.55555555555556, "grad_norm": 0.056949373334646225, "learning_rate": 3.490199415097892e-05, "loss": 0.0012, "num_input_tokens_seen": 29856960, "step": 500 }, { "epoch": 55.55555555555556, "eval_loss": 1.1057275533676147, "eval_runtime": 1.1105, "eval_samples_per_second": 74.743, "eval_steps_per_second": 2.702, "num_input_tokens_seen": 29856960, "step": 500 }, { "epoch": 15.714285714285714, "grad_norm": 1.0327025651931763, "learning_rate": 4.865818459497911e-05, "loss": 1.2219, "num_input_tokens_seen": 40827720, "step": 550 }, { "epoch": 17.142857142857142, "grad_norm": 0.8191797137260437, "learning_rate": 4.8405871765993433e-05, "loss": 0.4354, "num_input_tokens_seen": 51641040, "step": 600 }, { "epoch": 17.142857142857142, "eval_loss": 1.7303279638290405, "eval_runtime": 7.2384, "eval_samples_per_second": 48.63, "eval_steps_per_second": 1.52, "num_input_tokens_seen": 51641040, "step": 600 }, { "epoch": 18.571428571428573, "grad_norm": 0.4633180797100067, "learning_rate": 4.813260751184992e-05, "loss": 0.0876, "num_input_tokens_seen": 62482320, "step": 650 }, { "epoch": 20.0, "grad_norm": 0.23285113275051117, "learning_rate": 4.783863644106502e-05, "loss": 0.0268, "num_input_tokens_seen": 73404120, "step": 700 }, { "epoch": 20.0, "eval_loss": 1.9282101392745972, "eval_runtime": 7.2488, "eval_samples_per_second": 48.56, "eval_steps_per_second": 1.517, "num_input_tokens_seen": 73404120, "step": 700 }, { "epoch": 21.428571428571427, "grad_norm": 0.15514932572841644, "learning_rate": 4.752422169756048e-05, "loss": 0.0085, "num_input_tokens_seen": 84276120, "step": 750 }, { "epoch": 22.857142857142858, "grad_norm": 0.12508106231689453, "learning_rate": 4.718964472511386e-05, "loss": 0.0051, "num_input_tokens_seen": 95276640, "step": 800 }, { "epoch": 22.857142857142858, "eval_loss": 2.1112966537475586, "eval_runtime": 7.2618, "eval_samples_per_second": 48.473, "eval_steps_per_second": 1.515, "num_input_tokens_seen": 95276640, "step": 800 } ], "logging_steps": 50, "max_steps": 5250, "num_input_tokens_seen": 95276640, "num_train_epochs": 150, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.314993447114375e+18, "train_batch_size": 15, "trial_name": null, "trial_params": null }