{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.658586374075019, "eval_steps": 500, "global_step": 6500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025516713447307986, "grad_norm": 22.250633239746094, "learning_rate": 1.1862244897959185e-06, "loss": 2.3912, "step": 100 }, { "epoch": 0.05103342689461597, "grad_norm": 18.358823776245117, "learning_rate": 2.461734693877551e-06, "loss": 0.9986, "step": 200 }, { "epoch": 0.07655014034192396, "grad_norm": 72.52508544921875, "learning_rate": 3.737244897959184e-06, "loss": 0.8566, "step": 300 }, { "epoch": 0.10206685378923194, "grad_norm": 23.034740447998047, "learning_rate": 5.012755102040817e-06, "loss": 0.7413, "step": 400 }, { "epoch": 0.12758356723653994, "grad_norm": 45.18895721435547, "learning_rate": 6.288265306122449e-06, "loss": 0.7472, "step": 500 }, { "epoch": 0.15310028068384793, "grad_norm": 49.649410247802734, "learning_rate": 7.563775510204082e-06, "loss": 0.7018, "step": 600 }, { "epoch": 0.17861699413115592, "grad_norm": 37.48847198486328, "learning_rate": 8.839285714285714e-06, "loss": 0.6664, "step": 700 }, { "epoch": 0.20413370757846389, "grad_norm": 41.822566986083984, "learning_rate": 9.987241281542388e-06, "loss": 0.6929, "step": 800 }, { "epoch": 0.22965042102577188, "grad_norm": 31.396697998046875, "learning_rate": 9.84547774312447e-06, "loss": 0.6417, "step": 900 }, { "epoch": 0.25516713447307987, "grad_norm": 18.564287185668945, "learning_rate": 9.70371420470655e-06, "loss": 0.6491, "step": 1000 }, { "epoch": 0.28068384792038786, "grad_norm": 40.36125946044922, "learning_rate": 9.561950666288631e-06, "loss": 0.6364, "step": 1100 }, { "epoch": 0.30620056136769586, "grad_norm": 22.446331024169922, "learning_rate": 9.420187127870712e-06, "loss": 0.6668, "step": 1200 }, { "epoch": 0.33171727481500385, "grad_norm": 19.624980926513672, "learning_rate": 9.278423589452793e-06, "loss": 0.6713, "step": 1300 }, { "epoch": 0.35723398826231184, "grad_norm": 14.320337295532227, "learning_rate": 9.136660051034874e-06, "loss": 0.5721, "step": 1400 }, { "epoch": 0.3827507017096198, "grad_norm": 25.894556045532227, "learning_rate": 8.994896512616955e-06, "loss": 0.597, "step": 1500 }, { "epoch": 0.40826741515692777, "grad_norm": 40.72581100463867, "learning_rate": 8.853132974199036e-06, "loss": 0.594, "step": 1600 }, { "epoch": 0.43378412860423576, "grad_norm": 21.20204734802246, "learning_rate": 8.711369435781117e-06, "loss": 0.5882, "step": 1700 }, { "epoch": 0.45930084205154376, "grad_norm": 49.4229850769043, "learning_rate": 8.5696058973632e-06, "loss": 0.6018, "step": 1800 }, { "epoch": 0.48481755549885175, "grad_norm": 21.961654663085938, "learning_rate": 8.42784235894528e-06, "loss": 0.5872, "step": 1900 }, { "epoch": 0.5103342689461597, "grad_norm": 15.664958000183105, "learning_rate": 8.286078820527362e-06, "loss": 0.5802, "step": 2000 }, { "epoch": 0.5358509823934677, "grad_norm": 32.18100357055664, "learning_rate": 8.144315282109441e-06, "loss": 0.5614, "step": 2100 }, { "epoch": 0.5613676958407757, "grad_norm": 14.913948059082031, "learning_rate": 8.002551743691524e-06, "loss": 0.5166, "step": 2200 }, { "epoch": 0.5868844092880837, "grad_norm": 14.455422401428223, "learning_rate": 7.860788205273603e-06, "loss": 0.5722, "step": 2300 }, { "epoch": 0.6124011227353917, "grad_norm": 49.46743392944336, "learning_rate": 7.719024666855686e-06, "loss": 0.5289, "step": 2400 }, { "epoch": 0.6379178361826997, "grad_norm": 16.0176944732666, 
"learning_rate": 7.577261128437766e-06, "loss": 0.5803, "step": 2500 }, { "epoch": 0.6634345496300077, "grad_norm": 11.620992660522461, "learning_rate": 7.435497590019848e-06, "loss": 0.5355, "step": 2600 }, { "epoch": 0.6889512630773157, "grad_norm": 24.707399368286133, "learning_rate": 7.293734051601928e-06, "loss": 0.5694, "step": 2700 }, { "epoch": 0.7144679765246237, "grad_norm": 35.875511169433594, "learning_rate": 7.15197051318401e-06, "loss": 0.569, "step": 2800 }, { "epoch": 0.7399846899719316, "grad_norm": 32.23057556152344, "learning_rate": 7.010206974766091e-06, "loss": 0.5016, "step": 2900 }, { "epoch": 0.7655014034192396, "grad_norm": 24.113296508789062, "learning_rate": 6.868443436348172e-06, "loss": 0.5586, "step": 3000 }, { "epoch": 0.7910181168665475, "grad_norm": 25.87450408935547, "learning_rate": 6.726679897930253e-06, "loss": 0.5154, "step": 3100 }, { "epoch": 0.8165348303138555, "grad_norm": 32.350250244140625, "learning_rate": 6.584916359512335e-06, "loss": 0.5318, "step": 3200 }, { "epoch": 0.8420515437611635, "grad_norm": 22.004989624023438, "learning_rate": 6.443152821094415e-06, "loss": 0.5461, "step": 3300 }, { "epoch": 0.8675682572084715, "grad_norm": 39.10634994506836, "learning_rate": 6.301389282676497e-06, "loss": 0.4885, "step": 3400 }, { "epoch": 0.8930849706557795, "grad_norm": 24.000411987304688, "learning_rate": 6.159625744258577e-06, "loss": 0.573, "step": 3500 }, { "epoch": 0.9186016841030875, "grad_norm": 15.122103691101074, "learning_rate": 6.017862205840658e-06, "loss": 0.5145, "step": 3600 }, { "epoch": 0.9441183975503955, "grad_norm": 29.39118003845215, "learning_rate": 5.87609866742274e-06, "loss": 0.5214, "step": 3700 }, { "epoch": 0.9696351109977035, "grad_norm": 21.783594131469727, "learning_rate": 5.73433512900482e-06, "loss": 0.493, "step": 3800 }, { "epoch": 0.9951518244450115, "grad_norm": 6.728275299072266, "learning_rate": 5.592571590586902e-06, "loss": 0.5413, "step": 3900 }, { "epoch": 1.0206685378923195, "grad_norm": 23.733306884765625, "learning_rate": 5.450808052168982e-06, "loss": 0.442, "step": 4000 }, { "epoch": 1.0461852513396275, "grad_norm": 27.818904876708984, "learning_rate": 5.309044513751064e-06, "loss": 0.4068, "step": 4100 }, { "epoch": 1.0717019647869355, "grad_norm": 27.924034118652344, "learning_rate": 5.167280975333145e-06, "loss": 0.4142, "step": 4200 }, { "epoch": 1.0972186782342435, "grad_norm": 30.92214012145996, "learning_rate": 5.025517436915226e-06, "loss": 0.4112, "step": 4300 }, { "epoch": 1.1227353916815515, "grad_norm": 31.626100540161133, "learning_rate": 4.8837538984973074e-06, "loss": 0.4168, "step": 4400 }, { "epoch": 1.1482521051288594, "grad_norm": 37.9410514831543, "learning_rate": 4.7419903600793884e-06, "loss": 0.399, "step": 4500 }, { "epoch": 1.1737688185761674, "grad_norm": 23.123756408691406, "learning_rate": 4.6002268216614694e-06, "loss": 0.4104, "step": 4600 }, { "epoch": 1.1992855320234754, "grad_norm": 16.32610511779785, "learning_rate": 4.45846328324355e-06, "loss": 0.4082, "step": 4700 }, { "epoch": 1.2248022454707834, "grad_norm": 28.599788665771484, "learning_rate": 4.316699744825631e-06, "loss": 0.4197, "step": 4800 }, { "epoch": 1.2503189589180914, "grad_norm": 27.3553524017334, "learning_rate": 4.174936206407712e-06, "loss": 0.4162, "step": 4900 }, { "epoch": 1.2758356723653994, "grad_norm": 30.498682022094727, "learning_rate": 4.033172667989793e-06, "loss": 0.4178, "step": 5000 }, { "epoch": 1.3013523858127074, "grad_norm": 33.331581115722656, "learning_rate": 
3.891409129571874e-06, "loss": 0.3608, "step": 5100 }, { "epoch": 1.3268690992600152, "grad_norm": 32.784950256347656, "learning_rate": 3.7496455911539554e-06, "loss": 0.4207, "step": 5200 }, { "epoch": 1.3523858127073232, "grad_norm": 31.74955177307129, "learning_rate": 3.6078820527360364e-06, "loss": 0.3906, "step": 5300 }, { "epoch": 1.3779025261546312, "grad_norm": 23.00249671936035, "learning_rate": 3.4661185143181174e-06, "loss": 0.3973, "step": 5400 }, { "epoch": 1.4034192396019392, "grad_norm": 31.479957580566406, "learning_rate": 3.324354975900199e-06, "loss": 0.4064, "step": 5500 }, { "epoch": 1.4289359530492471, "grad_norm": 8.337751388549805, "learning_rate": 3.18259143748228e-06, "loss": 0.3594, "step": 5600 }, { "epoch": 1.4544526664965551, "grad_norm": 27.91620445251465, "learning_rate": 3.040827899064361e-06, "loss": 0.4218, "step": 5700 }, { "epoch": 1.4799693799438631, "grad_norm": 26.840124130249023, "learning_rate": 2.899064360646442e-06, "loss": 0.368, "step": 5800 }, { "epoch": 1.5054860933911711, "grad_norm": 38.38251876831055, "learning_rate": 2.757300822228523e-06, "loss": 0.3994, "step": 5900 }, { "epoch": 1.5310028068384791, "grad_norm": 31.26285743713379, "learning_rate": 2.615537283810604e-06, "loss": 0.3785, "step": 6000 }, { "epoch": 1.556519520285787, "grad_norm": 17.311967849731445, "learning_rate": 2.473773745392685e-06, "loss": 0.3602, "step": 6100 }, { "epoch": 1.582036233733095, "grad_norm": 10.20802116394043, "learning_rate": 2.332010206974766e-06, "loss": 0.3891, "step": 6200 }, { "epoch": 1.607552947180403, "grad_norm": 15.230350494384766, "learning_rate": 2.1902466685568476e-06, "loss": 0.3718, "step": 6300 }, { "epoch": 1.633069660627711, "grad_norm": 27.391233444213867, "learning_rate": 2.0484831301389286e-06, "loss": 0.3699, "step": 6400 }, { "epoch": 1.658586374075019, "grad_norm": 22.2066707611084, "learning_rate": 1.9067195917210096e-06, "loss": 0.3702, "step": 6500 } ], "logging_steps": 100, "max_steps": 7838, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }