{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 5, "global_step": 125, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008, "grad_norm": 0.2523050010204315, "learning_rate": 1.5384615384615387e-05, "loss": 0.4523, "mean_token_accuracy": 0.9231184720993042, "step": 1 }, { "epoch": 0.04, "grad_norm": 0.24141842126846313, "learning_rate": 7.692307692307693e-05, "loss": 0.4396, "mean_token_accuracy": 0.9300202205777168, "step": 5 }, { "epoch": 0.04, "eval_loss": 0.38802939653396606, "eval_mean_token_accuracy": 0.931274386882782, "eval_runtime": 572.2225, "eval_samples_per_second": 1.748, "eval_steps_per_second": 0.218, "step": 5 }, { "epoch": 0.08, "grad_norm": 0.16969740390777588, "learning_rate": 0.00015384615384615385, "loss": 0.3426, "mean_token_accuracy": 0.93111452460289, "step": 10 }, { "epoch": 0.08, "eval_loss": 0.24519848823547363, "eval_mean_token_accuracy": 0.943490716934204, "eval_runtime": 572.2716, "eval_samples_per_second": 1.747, "eval_steps_per_second": 0.218, "step": 10 }, { "epoch": 0.12, "grad_norm": 0.15904344618320465, "learning_rate": 0.00019984268150178167, "loss": 0.1922, "mean_token_accuracy": 0.9507227122783661, "step": 15 }, { "epoch": 0.12, "eval_loss": 0.12690386176109314, "eval_mean_token_accuracy": 0.962551230430603, "eval_runtime": 572.2269, "eval_samples_per_second": 1.748, "eval_steps_per_second": 0.218, "step": 15 }, { "epoch": 0.16, "grad_norm": 0.12561501562595367, "learning_rate": 0.00019807852804032305, "loss": 0.094, "mean_token_accuracy": 0.9698278903961182, "step": 20 }, { "epoch": 0.16, "eval_loss": 0.07218066602945328, "eval_mean_token_accuracy": 0.9754553689956665, "eval_runtime": 572.3734, "eval_samples_per_second": 1.747, "eval_steps_per_second": 0.218, "step": 20 }, { "epoch": 0.2, "grad_norm": 0.09696544706821442, "learning_rate": 0.00019438833303083678, "loss": 0.0578, "mean_token_accuracy": 0.9804269909858704, "step": 25 }, { "epoch": 0.2, "eval_loss": 0.06172483041882515, "eval_mean_token_accuracy": 0.9775349216461182, "eval_runtime": 571.3615, "eval_samples_per_second": 1.75, "eval_steps_per_second": 0.219, "step": 25 }, { "epoch": 0.24, "grad_norm": 0.07436518371105194, "learning_rate": 0.00018884456359788724, "loss": 0.0474, "mean_token_accuracy": 0.9817552506923676, "step": 30 }, { "epoch": 0.24, "eval_loss": 0.051653869450092316, "eval_mean_token_accuracy": 0.97948202419281, "eval_runtime": 572.6759, "eval_samples_per_second": 1.746, "eval_steps_per_second": 0.218, "step": 30 }, { "epoch": 0.28, "grad_norm": 0.058379314839839935, "learning_rate": 0.00018155608689592604, "loss": 0.0473, "mean_token_accuracy": 0.9801956593990326, "step": 35 }, { "epoch": 0.28, "eval_loss": 0.04774034023284912, "eval_mean_token_accuracy": 0.9803278684616089, "eval_runtime": 572.9678, "eval_samples_per_second": 1.745, "eval_steps_per_second": 0.218, "step": 35 }, { "epoch": 0.32, "grad_norm": 0.05066407099366188, "learning_rate": 0.0001726660322034027, "loss": 0.0471, "mean_token_accuracy": 0.9803119540214539, "step": 40 }, { "epoch": 0.32, "eval_loss": 0.04579387605190277, "eval_mean_token_accuracy": 0.980647020816803, "eval_runtime": 572.7685, "eval_samples_per_second": 1.746, "eval_steps_per_second": 0.218, "step": 40 }, { "epoch": 0.36, "grad_norm": 0.05025569722056389, "learning_rate": 0.00016234898018587337, "loss": 0.0423, "mean_token_accuracy": 0.9822601974010468, "step": 45 }, { "epoch": 0.36, "eval_loss": 0.04449079558253288, "eval_mean_token_accuracy": 0.981070707321167, "eval_runtime": 572.4781, "eval_samples_per_second": 1.747, "eval_steps_per_second": 0.218, "step": 45 }, { "epoch": 0.4, "grad_norm": 0.03607247769832611, "learning_rate": 0.00015080753452465296, "loss": 0.0465, "mean_token_accuracy": 0.9798099577426911, "step": 50 }, { "epoch": 0.4, "eval_loss": 0.04439844563603401, "eval_mean_token_accuracy": 0.9811892700195313, "eval_runtime": 572.6901, "eval_samples_per_second": 1.746, "eval_steps_per_second": 0.218, "step": 50 }, { "epoch": 0.44, "grad_norm": 0.037045057862997055, "learning_rate": 0.000138268343236509, "loss": 0.0431, "mean_token_accuracy": 0.9816874027252197, "step": 55 }, { "epoch": 0.44, "eval_loss": 0.04384024068713188, "eval_mean_token_accuracy": 0.9812552394866944, "eval_runtime": 572.4389, "eval_samples_per_second": 1.747, "eval_steps_per_second": 0.218, "step": 55 }, { "epoch": 0.48, "grad_norm": 0.03762717917561531, "learning_rate": 0.0001249776478167227, "loss": 0.0437, "mean_token_accuracy": 0.9811290085315705, "step": 60 }, { "epoch": 0.48, "eval_loss": 0.04316685348749161, "eval_mean_token_accuracy": 0.9816457834243775, "eval_runtime": 572.5081, "eval_samples_per_second": 1.747, "eval_steps_per_second": 0.218, "step": 60 }, { "epoch": 0.52, "grad_norm": 0.02974274381995201, "learning_rate": 0.00011119644761033078, "loss": 0.0412, "mean_token_accuracy": 0.9832498252391815, "step": 65 }, { "epoch": 0.52, "eval_loss": 0.04281109571456909, "eval_mean_token_accuracy": 0.9816416425704956, "eval_runtime": 572.3877, "eval_samples_per_second": 1.747, "eval_steps_per_second": 0.218, "step": 65 }, { "epoch": 0.56, "grad_norm": 0.03527842089533806, "learning_rate": 9.719537437241312e-05, "loss": 0.0422, "mean_token_accuracy": 0.9818739414215087, "step": 70 }, { "epoch": 0.56, "eval_loss": 0.042156100273132324, "eval_mean_token_accuracy": 0.9818385190963745, "eval_runtime": 572.8793, "eval_samples_per_second": 1.746, "eval_steps_per_second": 0.218, "step": 70 }, { "epoch": 0.6, "grad_norm": 0.03159947320818901, "learning_rate": 8.324937766952638e-05, "loss": 0.0415, "mean_token_accuracy": 0.9812052190303803, "step": 75 }, { "epoch": 0.6, "eval_loss": 0.04201458767056465, "eval_mean_token_accuracy": 0.9818556265830993, "eval_runtime": 572.8981, "eval_samples_per_second": 1.746, "eval_steps_per_second": 0.218, "step": 75 }, { "epoch": 0.64, "grad_norm": 0.05335124209523201, "learning_rate": 6.963232548903853e-05, "loss": 0.0434, "mean_token_accuracy": 0.981024295091629, "step": 80 }, { "epoch": 0.64, "eval_loss": 0.0419701524078846, "eval_mean_token_accuracy": 0.9820275835990906, "eval_runtime": 572.5542, "eval_samples_per_second": 1.747, "eval_steps_per_second": 0.218, "step": 80 }, { "epoch": 0.68, "grad_norm": 0.03178861737251282, "learning_rate": 5.6611626088244194e-05, "loss": 0.0446, "mean_token_accuracy": 0.9806674957275391, "step": 85 }, { "epoch": 0.68, "eval_loss": 0.04141935706138611, "eval_mean_token_accuracy": 0.9820602812767029, "eval_runtime": 571.1387, "eval_samples_per_second": 1.751, "eval_steps_per_second": 0.219, "step": 85 }, { "epoch": 0.72, "grad_norm": 0.040076956152915955, "learning_rate": 4.444297669803981e-05, "loss": 0.04, "mean_token_accuracy": 0.9825624287128448, "step": 90 }, { "epoch": 0.72, "eval_loss": 0.04136450216174126, "eval_mean_token_accuracy": 0.9821350569725037, "eval_runtime": 571.1079, "eval_samples_per_second": 1.751, "eval_steps_per_second": 0.219, "step": 90 }, { "epoch": 0.76, "grad_norm": 0.037734489887952805, "learning_rate": 3.336534220479961e-05, "loss": 0.0443, "mean_token_accuracy": 0.9814210355281829, "step": 95 }, { "epoch": 0.76, "eval_loss": 0.041187919676303864, "eval_mean_token_accuracy": 0.9821804766654968, "eval_runtime": 572.5641, "eval_samples_per_second": 1.747, "eval_steps_per_second": 0.218, "step": 95 }, { "epoch": 0.8, "grad_norm": 0.04483392462134361, "learning_rate": 2.3596262417839255e-05, "loss": 0.0394, "mean_token_accuracy": 0.9827538132667542, "step": 100 }, { "epoch": 0.8, "eval_loss": 0.041005875915288925, "eval_mean_token_accuracy": 0.9823568396568298, "eval_runtime": 572.6853, "eval_samples_per_second": 1.746, "eval_steps_per_second": 0.218, "step": 100 }, { "epoch": 0.84, "grad_norm": 0.035200804471969604, "learning_rate": 1.5327580077171587e-05, "loss": 0.0406, "mean_token_accuracy": 0.9826211035251617, "step": 105 }, { "epoch": 0.84, "eval_loss": 0.040836114436388016, "eval_mean_token_accuracy": 0.9824077115058899, "eval_runtime": 572.7135, "eval_samples_per_second": 1.746, "eval_steps_per_second": 0.218, "step": 105 }, { "epoch": 0.88, "grad_norm": 0.033276792615652084, "learning_rate": 8.72167349386811e-06, "loss": 0.0418, "mean_token_accuracy": 0.9821678340435028, "step": 110 }, { "epoch": 0.88, "eval_loss": 0.040718287229537964, "eval_mean_token_accuracy": 0.9824112377166748, "eval_runtime": 571.9537, "eval_samples_per_second": 1.748, "eval_steps_per_second": 0.219, "step": 110 }, { "epoch": 0.92, "grad_norm": 0.03913981840014458, "learning_rate": 3.908267805490051e-06, "loss": 0.0413, "mean_token_accuracy": 0.9817866742610931, "step": 115 }, { "epoch": 0.92, "eval_loss": 0.040665969252586365, "eval_mean_token_accuracy": 0.9824027805328369, "eval_runtime": 572.0692, "eval_samples_per_second": 1.748, "eval_steps_per_second": 0.219, "step": 115 }, { "epoch": 0.96, "grad_norm": 0.053303398191928864, "learning_rate": 9.818874663554357e-07, "loss": 0.0391, "mean_token_accuracy": 0.9823383331298828, "step": 120 }, { "epoch": 0.96, "eval_loss": 0.04064611718058586, "eval_mean_token_accuracy": 0.9824066462516785, "eval_runtime": 571.9701, "eval_samples_per_second": 1.748, "eval_steps_per_second": 0.219, "step": 120 }, { "epoch": 1.0, "grad_norm": 0.031917620450258255, "learning_rate": 0.0, "loss": 0.0414, "mean_token_accuracy": 0.9825141310691834, "step": 125 }, { "epoch": 1.0, "eval_loss": 0.04065406322479248, "eval_mean_token_accuracy": 0.9823896341323852, "eval_runtime": 571.6781, "eval_samples_per_second": 1.749, "eval_steps_per_second": 0.219, "step": 125 }, { "epoch": 1.0, "step": 125, "total_flos": 8.539757562822656e+16, "train_loss": 0.0, "train_runtime": 2.0051, "train_samples_per_second": 498.739, "train_steps_per_second": 62.342 } ], "logging_steps": 5, "max_steps": 125, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.539757562822656e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }