|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 5000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": null,
|
"learning_rate": 2.4000000000000003e-07, |
|
"loss": 1.3992, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.1072826385498047, |
|
"learning_rate": 6.000000000000001e-07, |
|
"loss": 1.4497, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 3.9411494731903076, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 1.2599, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 3.928187847137451, |
|
"learning_rate": 1.4000000000000001e-06, |
|
"loss": 1.4124, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.6260775923728943, |
|
"learning_rate": 1.8000000000000001e-06, |
|
"loss": 1.3542, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.5522129535675049, |
|
"learning_rate": 2.2e-06, |
|
"loss": 1.2123, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.6981067657470703, |
|
"learning_rate": 2.6e-06, |
|
"loss": 1.2098, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 5.6391496658325195, |
|
"learning_rate": 3e-06, |
|
"loss": 1.2226, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.5151563286781311, |
|
"learning_rate": 3.3600000000000004e-06, |
|
"loss": 1.0624, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.4428874254226685, |
|
"learning_rate": 3.7600000000000004e-06, |
|
"loss": 1.0969, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 2.130734920501709, |
|
"learning_rate": 4.16e-06, |
|
"loss": 1.0879, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 2.011439561843872, |
|
"learning_rate": 4.56e-06, |
|
"loss": 1.0195, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 2.217247486114502, |
|
"learning_rate": 4.960000000000001e-06, |
|
"loss": 0.9765, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.946567952632904, |
|
"learning_rate": 5.36e-06, |
|
"loss": 1.0205, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 6.922210693359375, |
|
"learning_rate": 5.76e-06, |
|
"loss": 0.9517, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 3.384115219116211, |
|
"learning_rate": 6.16e-06, |
|
"loss": 0.9324, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.7061117887496948, |
|
"learning_rate": 6.560000000000001e-06, |
|
"loss": 0.804, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.5616205930709839, |
|
"learning_rate": 6.96e-06, |
|
"loss": 0.7821, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.8393518924713135, |
|
"learning_rate": 7.360000000000001e-06, |
|
"loss": 0.8086, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.4879248142242432, |
|
"learning_rate": 7.76e-06, |
|
"loss": 0.7655, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.6370295882225037, |
|
"learning_rate": 8.16e-06, |
|
"loss": 0.7508, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.5269752144813538, |
|
"learning_rate": 8.560000000000001e-06, |
|
"loss": 0.7429, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.2695356607437134, |
|
"learning_rate": 8.96e-06, |
|
"loss": 0.7502, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.5492205619812012, |
|
"learning_rate": 9.360000000000002e-06, |
|
"loss": 0.7029, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.7380893230438232, |
|
"learning_rate": 9.760000000000001e-06, |
|
"loss": 0.7324, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.8911452293395996, |
|
"learning_rate": 1.0160000000000001e-05, |
|
"loss": 0.7521, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 2.0408151149749756, |
|
"learning_rate": 1.056e-05, |
|
"loss": 0.698, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.9015631675720215, |
|
"learning_rate": 1.0960000000000002e-05, |
|
"loss": 0.6859, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.5284056663513184, |
|
"learning_rate": 1.136e-05, |
|
"loss": 0.6716, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.3547126054763794, |
|
"learning_rate": 1.1760000000000001e-05, |
|
"loss": 0.6978, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.4986441135406494, |
|
"learning_rate": 1.216e-05, |
|
"loss": 0.6584, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.1250969171524048, |
|
"learning_rate": 1.2560000000000002e-05, |
|
"loss": 0.7188, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.1186408996582031, |
|
"learning_rate": 1.2960000000000001e-05, |
|
"loss": 0.6687, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.1250578165054321, |
|
"learning_rate": 1.3360000000000003e-05, |
|
"loss": 0.623, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.8931149840354919, |
|
"learning_rate": 1.376e-05, |
|
"loss": 0.6795, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.5052251815795898, |
|
"learning_rate": 1.416e-05, |
|
"loss": 0.6455, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.8043763637542725, |
|
"learning_rate": 1.4560000000000001e-05, |
|
"loss": 0.6548, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.7357759475708008, |
|
"learning_rate": 1.496e-05, |
|
"loss": 0.6508, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.4080872535705566, |
|
"learning_rate": 1.5360000000000002e-05, |
|
"loss": 0.7105, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.7395206689834595, |
|
"learning_rate": 1.576e-05, |
|
"loss": 0.6738, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.4803540706634521, |
|
"learning_rate": 1.616e-05, |
|
"loss": 0.6741, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.9231945276260376, |
|
"learning_rate": 1.656e-05, |
|
"loss": 0.6385, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.0000849962234497, |
|
"learning_rate": 1.696e-05, |
|
"loss": 0.6304, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.8198318481445312, |
|
"learning_rate": 1.736e-05, |
|
"loss": 0.652, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.8213591575622559, |
|
"learning_rate": 1.7760000000000003e-05, |
|
"loss": 0.6517, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 2.0881271362304688, |
|
"learning_rate": 1.8160000000000002e-05, |
|
"loss": 0.7044, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 2.583738088607788, |
|
"learning_rate": 1.8560000000000002e-05, |
|
"loss": 0.6801, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 2.3651039600372314, |
|
"learning_rate": 1.896e-05, |
|
"loss": 0.6531, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.174816608428955, |
|
"learning_rate": 1.936e-05, |
|
"loss": 0.6837, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.93290376663208, |
|
"learning_rate": 1.976e-05, |
|
"loss": 0.6659, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 1.616199254989624, |
|
"learning_rate": 1.9999961008995607e-05, |
|
"loss": 0.6212, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 1.4221971035003662, |
|
"learning_rate": 1.99995223636881e-05, |
|
"loss": 0.6705, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 1.4205855131149292, |
|
"learning_rate": 1.9998596355767805e-05, |
|
"loss": 0.6346, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 1.5640236139297485, |
|
"learning_rate": 1.999718303036705e-05, |
|
"loss": 0.6698, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 1.3953174352645874, |
|
"learning_rate": 1.9995282456369313e-05, |
|
"loss": 0.5925, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 1.3201889991760254, |
|
"learning_rate": 1.9992894726405894e-05, |
|
"loss": 0.6795, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.1400000000000001, |
|
"grad_norm": 1.8795799016952515, |
|
"learning_rate": 1.9990019956851384e-05, |
|
"loss": 0.6096, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 1.9675489664077759, |
|
"learning_rate": 1.998665828781799e-05, |
|
"loss": 0.5971, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 1.0514065027236938, |
|
"learning_rate": 1.998280988314872e-05, |
|
"loss": 0.6055, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.7312430143356323, |
|
"learning_rate": 1.9978474930409396e-05, |
|
"loss": 0.6304, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 11.12009048461914, |
|
"learning_rate": 1.9973653640879486e-05, |
|
"loss": 0.6812, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 2.484487771987915, |
|
"learning_rate": 1.9968346249541848e-05, |
|
"loss": 0.5842, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 1.4185147285461426, |
|
"learning_rate": 1.996255301507125e-05, |
|
"loss": 0.6478, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.939939022064209, |
|
"learning_rate": 1.995627421982176e-05, |
|
"loss": 0.6003, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 2.303175687789917, |
|
"learning_rate": 1.9949510169813006e-05, |
|
"loss": 0.6087, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 1.1472731828689575, |
|
"learning_rate": 1.9942261194715236e-05, |
|
"loss": 0.5905, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.8142107129096985, |
|
"learning_rate": 1.9934527647833276e-05, |
|
"loss": 0.593, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.3599999999999999, |
|
"grad_norm": 1.4084548950195312, |
|
"learning_rate": 1.992630990608929e-05, |
|
"loss": 0.6253, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 1.6054160594940186, |
|
"learning_rate": 1.9917608370004417e-05, |
|
"loss": 0.6338, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 2.442007541656494, |
|
"learning_rate": 1.9908423463679246e-05, |
|
"loss": 0.6148, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.7885130047798157, |
|
"learning_rate": 1.989875563477316e-05, |
|
"loss": 0.6271, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 2.8302054405212402, |
|
"learning_rate": 1.9888605354482494e-05, |
|
"loss": 0.638, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 1.134475827217102, |
|
"learning_rate": 1.987797311751759e-05, |
|
"loss": 0.6304, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.7951676249504089, |
|
"learning_rate": 1.986685944207868e-05, |
|
"loss": 0.5877, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 1.5261722803115845, |
|
"learning_rate": 1.985526486983063e-05, |
|
"loss": 0.5747, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 4.633131980895996, |
|
"learning_rate": 1.9843189965876525e-05, |
|
"loss": 0.6514, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.7688568830490112, |
|
"learning_rate": 1.9830635318730155e-05, |
|
"loss": 0.5879, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.859425961971283, |
|
"learning_rate": 1.981760154028731e-05, |
|
"loss": 0.6152, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 1.939563274383545, |
|
"learning_rate": 1.980408926579596e-05, |
|
"loss": 0.6342, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.7612221837043762, |
|
"learning_rate": 1.97900991538253e-05, |
|
"loss": 0.6167, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 2.2331180572509766, |
|
"learning_rate": 1.9775631886233655e-05, |
|
"loss": 0.5688, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.6400000000000001, |
|
"grad_norm": 3.1707897186279297, |
|
"learning_rate": 1.9760688168135233e-05, |
|
"loss": 0.6023, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.6600000000000001, |
|
"grad_norm": 1.5035152435302734, |
|
"learning_rate": 1.9745268727865774e-05, |
|
"loss": 0.5957, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.6800000000000002, |
|
"grad_norm": 1.8150962591171265, |
|
"learning_rate": 1.972937431694704e-05, |
|
"loss": 0.5409, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 1.5338727235794067, |
|
"learning_rate": 1.9713005710050203e-05, |
|
"loss": 0.6286, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 1.0299500226974487, |
|
"learning_rate": 1.969616370495806e-05, |
|
"loss": 0.5936, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 3.0667810440063477, |
|
"learning_rate": 1.967884912252619e-05, |
|
"loss": 0.6535, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.9165984988212585, |
|
"learning_rate": 1.9661062806642903e-05, |
|
"loss": 0.5864, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 1.398553729057312, |
|
"learning_rate": 1.964280562418815e-05, |
|
"loss": 0.6181, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 1.1232646703720093, |
|
"learning_rate": 1.962407846499124e-05, |
|
"loss": 0.5736, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.8199999999999998, |
|
"grad_norm": 1.905674695968628, |
|
"learning_rate": 1.96048822417875e-05, |
|
"loss": 0.5769, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.8399999999999999, |
|
"grad_norm": 0.9369404911994934, |
|
"learning_rate": 1.958521789017376e-05, |
|
"loss": 0.6056, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.8599999999999999, |
|
"grad_norm": 1.0187280178070068, |
|
"learning_rate": 1.956508636856278e-05, |
|
"loss": 0.6632, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 1.4954912662506104, |
|
"learning_rate": 1.9546569379242446e-05, |
|
"loss": 0.5803, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 1.1077364683151245, |
|
"learning_rate": 1.9525552956573244e-05, |
|
"loss": 0.6028, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 1.777431607246399, |
|
"learning_rate": 1.9504072271891486e-05, |
|
"loss": 0.5932, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 1.1748569011688232, |
|
"learning_rate": 1.9482128372135446e-05, |
|
"loss": 0.581, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 1.5400365591049194, |
|
"learning_rate": 1.945972232681984e-05, |
|
"loss": 0.6207, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 1.3064255714416504, |
|
"learning_rate": 1.9436855227983695e-05, |
|
"loss": 0.5576, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.6377707719802856, |
|
"learning_rate": 1.9413528190137158e-05, |
|
"loss": 0.6121, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 0.8060258030891418, |
|
"learning_rate": 1.938974235020714e-05, |
|
"loss": 0.5909, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 2.174511671066284, |
|
"learning_rate": 1.9365498867481926e-05, |
|
"loss": 0.5459, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 2.0883657932281494, |
|
"learning_rate": 1.9340798923554657e-05, |
|
"loss": 0.5781, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 1.4199424982070923, |
|
"learning_rate": 1.931564372226576e-05, |
|
"loss": 0.5284, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 1.2759448289871216, |
|
"learning_rate": 1.9290034489644247e-05, |
|
"loss": 0.5476, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 0.9871682524681091, |
|
"learning_rate": 1.9263972473847995e-05, |
|
"loss": 0.5386, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 1.325584053993225, |
|
"learning_rate": 1.923745894510288e-05, |
|
"loss": 0.5864, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 1.4707293510437012, |
|
"learning_rate": 1.9210495195640895e-05, |
|
"loss": 0.5413, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 1.222374439239502, |
|
"learning_rate": 1.918308253963715e-05, |
|
"loss": 0.5201, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 2.6065587997436523, |
|
"learning_rate": 1.9155222313145817e-05, |
|
"loss": 0.5658, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 1.2004817724227905, |
|
"learning_rate": 1.912691587403503e-05, |
|
"loss": 0.5578, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 2.0765957832336426, |
|
"learning_rate": 1.9098164601920702e-05, |
|
"loss": 0.4792, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 1.2682489156723022, |
|
"learning_rate": 1.906896989809927e-05, |
|
"loss": 0.6048, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.2800000000000002, |
|
"grad_norm": 3.5352697372436523, |
|
"learning_rate": 1.903933318547942e-05, |
|
"loss": 0.567, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 1.590265154838562, |
|
"learning_rate": 1.9009255908512704e-05, |
|
"loss": 0.5965, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 2.2210605144500732, |
|
"learning_rate": 1.897873953312317e-05, |
|
"loss": 0.5561, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 1.0093090534210205, |
|
"learning_rate": 1.8947785546635905e-05, |
|
"loss": 0.5529, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 1.523364543914795, |
|
"learning_rate": 1.8916395457704536e-05, |
|
"loss": 0.5818, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 0.9908381700515747, |
|
"learning_rate": 1.888457079623772e-05, |
|
"loss": 0.5558, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 1.499123215675354, |
|
"learning_rate": 1.8852313113324553e-05, |
|
"loss": 0.5833, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 1.700926661491394, |
|
"learning_rate": 1.8819623981158996e-05, |
|
"loss": 0.527, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 0.7639631032943726, |
|
"learning_rate": 1.878650499296323e-05, |
|
"loss": 0.5605, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 0.9155722856521606, |
|
"learning_rate": 1.8752957762910016e-05, |
|
"loss": 0.5528, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 1.4307914972305298, |
|
"learning_rate": 1.871898392604402e-05, |
|
"loss": 0.5239, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.9646509885787964, |
|
"learning_rate": 1.8684585138202122e-05, |
|
"loss": 0.5825, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 0.823187530040741, |
|
"learning_rate": 1.864976307593271e-05, |
|
"loss": 0.5816, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 0.9524659514427185, |
|
"learning_rate": 1.8614519436413968e-05, |
|
"loss": 0.595, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 3.476361036300659, |
|
"learning_rate": 1.8578855937371176e-05, |
|
"loss": 0.5828, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 0.8790251016616821, |
|
"learning_rate": 1.8542774316992953e-05, |
|
"loss": 0.5408, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 1.8205924034118652, |
|
"learning_rate": 1.850627633384658e-05, |
|
"loss": 0.5683, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 0.7987878918647766, |
|
"learning_rate": 1.8469363766792258e-05, |
|
"loss": 0.5734, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 1.1774803400039673, |
|
"learning_rate": 1.8432038414896432e-05, |
|
"loss": 0.5581, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 1.4109526872634888, |
|
"learning_rate": 1.8394302097344103e-05, |
|
"loss": 0.5781, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 2.104931116104126, |
|
"learning_rate": 1.8356156653350138e-05, |
|
"loss": 0.5468, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 1.7022193670272827, |
|
"learning_rate": 1.8317603942069665e-05, |
|
"loss": 0.543, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.7199999999999998, |
|
"grad_norm": 1.9831078052520752, |
|
"learning_rate": 1.8278645842507448e-05, |
|
"loss": 0.5416, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 1.4346486330032349, |
|
"learning_rate": 1.8239284253426294e-05, |
|
"loss": 0.5692, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 3.786205768585205, |
|
"learning_rate": 1.8199521093254524e-05, |
|
"loss": 0.5372, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.7800000000000002, |
|
"grad_norm": 1.2855556011199951, |
|
"learning_rate": 1.815935829999247e-05, |
|
"loss": 0.5205, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 1.4858529567718506, |
|
"learning_rate": 1.811879783111801e-05, |
|
"loss": 0.5159, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 1.3300436735153198, |
|
"learning_rate": 1.8077841663491174e-05, |
|
"loss": 0.5405, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 1.2626831531524658, |
|
"learning_rate": 1.80364917932578e-05, |
|
"loss": 0.5769, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 1.4615288972854614, |
|
"learning_rate": 1.799475023575222e-05, |
|
"loss": 0.5724, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 1.2775332927703857, |
|
"learning_rate": 1.795261902539906e-05, |
|
"loss": 0.5603, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 1.9211925268173218, |
|
"learning_rate": 1.791010021561407e-05, |
|
"loss": 0.5609, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 3.850076675415039, |
|
"learning_rate": 1.7867195878704062e-05, |
|
"loss": 0.585, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 1.0455843210220337, |
|
"learning_rate": 1.7823908105765883e-05, |
|
"loss": 0.5818, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 2.406511068344116, |
|
"learning_rate": 1.7780239006584515e-05, |
|
"loss": 0.5453, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 1.212849736213684, |
|
"learning_rate": 1.773619070953025e-05, |
|
"loss": 0.5526, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 1.608502984046936, |
|
"learning_rate": 1.769176536145494e-05, |
|
"loss": 0.5664, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 2.083935499191284, |
|
"learning_rate": 1.7646965127587373e-05, |
|
"loss": 0.4993, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 3.191681146621704, |
|
"learning_rate": 1.760179219142774e-05, |
|
"loss": 0.5302, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 2.241614580154419, |
|
"learning_rate": 1.7556248754641237e-05, |
|
"loss": 0.4995, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 2.2549211978912354, |
|
"learning_rate": 1.7510337036950703e-05, |
|
"loss": 0.4902, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 2.623849630355835, |
|
"learning_rate": 1.7464059276028497e-05, |
|
"loss": 0.5571, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 1.099225401878357, |
|
"learning_rate": 1.7417417727387392e-05, |
|
"loss": 0.5441, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"grad_norm": 1.0318654775619507, |
|
"learning_rate": 1.7370414664270675e-05, |
|
"loss": 0.5498, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 1.4295272827148438, |
|
"learning_rate": 1.732305237754132e-05, |
|
"loss": 0.4799, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 3.886106014251709, |
|
"learning_rate": 1.727533317557037e-05, |
|
"loss": 0.5285, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 0.9527933597564697, |
|
"learning_rate": 1.7227259384124408e-05, |
|
"loss": 0.5328, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 1.0082141160964966, |
|
"learning_rate": 1.7178833346252208e-05, |
|
"loss": 0.5333, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 0.9779771566390991, |
|
"learning_rate": 1.713005742217053e-05, |
|
"loss": 0.5163, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 2.230241060256958, |
|
"learning_rate": 1.7080933989149112e-05, |
|
"loss": 0.5173, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 3.2800000000000002, |
|
"grad_norm": 4.50337028503418, |
|
"learning_rate": 1.7031465441394766e-05, |
|
"loss": 0.5187, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"grad_norm": 1.1370171308517456, |
|
"learning_rate": 1.698165418993473e-05, |
|
"loss": 0.5611, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 0.9167156219482422, |
|
"learning_rate": 1.6931502662499116e-05, |
|
"loss": 0.5381, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 2.5473690032958984, |
|
"learning_rate": 1.688101330340263e-05, |
|
"loss": 0.5089, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 0.9971883893013, |
|
"learning_rate": 1.683018857342539e-05, |
|
"loss": 0.5538, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 1.7401093244552612, |
|
"learning_rate": 1.6779030949693044e-05, |
|
"loss": 0.5216, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 6.5078935623168945, |
|
"learning_rate": 1.6727542925556e-05, |
|
"loss": 0.5356, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 0.9460225105285645, |
|
"learning_rate": 1.667572701046791e-05, |
|
"loss": 0.497, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 1.153794765472412, |
|
"learning_rate": 1.662358572986337e-05, |
|
"loss": 0.4934, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 1.1257123947143555, |
|
"learning_rate": 1.6571121625034847e-05, |
|
"loss": 0.5327, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 2.454582691192627, |
|
"learning_rate": 1.651833725300879e-05, |
|
"loss": 0.4995, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 1.7266925573349, |
|
"learning_rate": 1.6465235186421024e-05, |
|
"loss": 0.4945, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 2.5186126232147217, |
|
"learning_rate": 1.6411818013391357e-05, |
|
"loss": 0.4969, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 2.2978458404541016, |
|
"learning_rate": 1.6358088337397444e-05, |
|
"loss": 0.5133, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 9.083796501159668, |
|
"learning_rate": 1.630404877714789e-05, |
|
"loss": 0.4598, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 1.687730312347412, |
|
"learning_rate": 1.6249701966454626e-05, |
|
"loss": 0.5721, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 0.949276328086853, |
|
"learning_rate": 1.619505055410453e-05, |
|
"loss": 0.5549, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 1.7653878927230835, |
|
"learning_rate": 1.614009720373034e-05, |
|
"loss": 0.5192, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 0.6442993879318237, |
|
"learning_rate": 1.608484459368082e-05, |
|
"loss": 0.4927, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 2.4791717529296875, |
|
"learning_rate": 1.602929541689025e-05, |
|
"loss": 0.5319, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 1.8095901012420654, |
|
"learning_rate": 1.5973452380747125e-05, |
|
"loss": 0.5025, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 1.8692318201065063, |
|
"learning_rate": 1.591731820696224e-05, |
|
"loss": 0.497, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 3.7199999999999998, |
|
"grad_norm": 1.3001285791397095, |
|
"learning_rate": 1.5860895631436044e-05, |
|
"loss": 0.556, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 1.0697414875030518, |
|
"learning_rate": 1.580418740412526e-05, |
|
"loss": 0.506, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 3.5158541202545166, |
|
"learning_rate": 1.5747196288908887e-05, |
|
"loss": 0.5154, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 3.7800000000000002, |
|
"grad_norm": 1.699308156967163, |
|
"learning_rate": 1.5689925063453483e-05, |
|
"loss": 0.5887, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 0.825036883354187, |
|
"learning_rate": 1.563237651907777e-05, |
|
"loss": 0.508, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 1.7058846950531006, |
|
"learning_rate": 1.5574553460616608e-05, |
|
"loss": 0.4954, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 1.7128701210021973, |
|
"learning_rate": 1.5516458706284306e-05, |
|
"loss": 0.5628, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 3.86, |
|
"grad_norm": 0.8009471297264099, |
|
"learning_rate": 1.5458095087537216e-05, |
|
"loss": 0.4494, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 1.8152306079864502, |
|
"learning_rate": 1.5399465448935788e-05, |
|
"loss": 0.522, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 2.59840989112854, |
|
"learning_rate": 1.5340572648005887e-05, |
|
"loss": 0.5225, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 3.8816378116607666, |
|
"learning_rate": 1.5281419555099547e-05, |
|
"loss": 0.5092, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 1.4815788269042969, |
|
"learning_rate": 1.5222009053255061e-05, |
|
"loss": 0.5167, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 1.5924495458602905, |
|
"learning_rate": 1.5162344038056476e-05, |
|
"loss": 0.5198, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 1.44657301902771, |
|
"learning_rate": 1.510242741749246e-05, |
|
"loss": 0.5723, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 2.8102152347564697, |
|
"learning_rate": 1.5042262111814566e-05, |
|
"loss": 0.4707, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 4.02, |
|
"grad_norm": 1.1032963991165161, |
|
"learning_rate": 1.498185105339491e-05, |
|
"loss": 0.523, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"grad_norm": 1.0523751974105835, |
|
"learning_rate": 1.4921197186583256e-05, |
|
"loss": 0.4433, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 4.06, |
|
"grad_norm": 3.989997148513794, |
|
"learning_rate": 1.4860303467563504e-05, |
|
"loss": 0.4861, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 2.7621233463287354, |
|
"learning_rate": 1.4799172864209607e-05, |
|
"loss": 0.4621, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"grad_norm": 0.960752010345459, |
|
"learning_rate": 1.4737808355940932e-05, |
|
"loss": 0.5454, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"grad_norm": 1.279720664024353, |
|
"learning_rate": 1.467621293357704e-05, |
|
"loss": 0.4984, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"grad_norm": 2.1226606369018555, |
|
"learning_rate": 1.4614389599191917e-05, |
|
"loss": 0.5375, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"grad_norm": 1.6314127445220947, |
|
"learning_rate": 1.455234136596766e-05, |
|
"loss": 0.5191, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 4.18, |
|
"grad_norm": 2.72589111328125, |
|
"learning_rate": 1.4490071258047625e-05, |
|
"loss": 0.4872, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 4.3185648918151855, |
|
"learning_rate": 1.442758231038902e-05, |
|
"loss": 0.4883, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 4.22, |
|
"grad_norm": 0.8917685747146606, |
|
"learning_rate": 1.436487756861499e-05, |
|
"loss": 0.4564, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"grad_norm": 1.1269738674163818, |
|
"learning_rate": 1.4301960088866187e-05, |
|
"loss": 0.4311, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"grad_norm": 4.341186046600342, |
|
"learning_rate": 1.4238832937651816e-05, |
|
"loss": 0.4663, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 0.9497487545013428, |
|
"learning_rate": 1.4175499191700169e-05, |
|
"loss": 0.5048, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"grad_norm": 1.7001278400421143, |
|
"learning_rate": 1.4111961937808665e-05, |
|
"loss": 0.493, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 4.32, |
|
"grad_norm": 1.7037960290908813, |
|
"learning_rate": 1.4048224272693426e-05, |
|
"loss": 0.4712, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 4.34, |
|
"grad_norm": 0.9490267634391785, |
|
"learning_rate": 1.3984289302838327e-05, |
|
"loss": 0.4641, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"grad_norm": 2.2001664638519287, |
|
"learning_rate": 1.3920160144343604e-05, |
|
"loss": 0.4929, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"grad_norm": 1.0687891244888306, |
|
"learning_rate": 1.3855839922773968e-05, |
|
"loss": 0.5269, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 1.1579177379608154, |
|
"learning_rate": 1.3791331773006272e-05, |
|
"loss": 0.4857, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 4.42, |
|
"grad_norm": 0.9180253744125366, |
|
"learning_rate": 1.3726638839076732e-05, |
|
"loss": 0.5613, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 1.170154333114624, |
|
"learning_rate": 1.3661764274027678e-05, |
|
"loss": 0.4884, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 1.8389673233032227, |
|
"learning_rate": 1.3596711239753889e-05, |
|
"loss": 0.4849, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"grad_norm": 1.43435537815094, |
|
"learning_rate": 1.3531482906848474e-05, |
|
"loss": 0.4752, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 1.2561147212982178, |
|
"learning_rate": 1.3466082454448364e-05, |
|
"loss": 0.4804, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"grad_norm": 1.3098878860473633, |
|
"learning_rate": 1.340051307007933e-05, |
|
"loss": 0.4719, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 4.54, |
|
"grad_norm": 7.966071128845215, |
|
"learning_rate": 1.3334777949500673e-05, |
|
"loss": 0.4599, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 4.5600000000000005, |
|
"grad_norm": 2.28067946434021, |
|
"learning_rate": 1.3268880296549424e-05, |
|
"loss": 0.4712, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"grad_norm": 1.1881040334701538, |
|
"learning_rate": 1.3202823322984228e-05, |
|
"loss": 0.4772, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 1.482059359550476, |
|
"learning_rate": 1.3136610248328779e-05, |
|
"loss": 0.453, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"grad_norm": 2.1320247650146484, |
|
"learning_rate": 1.307024429971492e-05, |
|
"loss": 0.4657, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"grad_norm": 1.2124568223953247, |
|
"learning_rate": 1.3003728711725364e-05, |
|
"loss": 0.4791, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 4.66, |
|
"grad_norm": 0.7249093055725098, |
|
"learning_rate": 1.2937066726236029e-05, |
|
"loss": 0.5586, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"grad_norm": 1.5038686990737915, |
|
"learning_rate": 1.2870261592258038e-05, |
|
"loss": 0.4603, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"grad_norm": 1.320827603340149, |
|
"learning_rate": 1.2803316565779378e-05, |
|
"loss": 0.4618, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"grad_norm": 1.4801580905914307, |
|
"learning_rate": 1.2736234909606186e-05, |
|
"loss": 0.4643, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 4.74, |
|
"grad_norm": 2.649113655090332, |
|
"learning_rate": 1.2669019893203758e-05, |
|
"loss": 0.5017, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 4.76, |
|
"grad_norm": 1.3265902996063232, |
|
"learning_rate": 1.2601674792537157e-05, |
|
"loss": 0.451, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"grad_norm": 1.6874167919158936, |
|
"learning_rate": 1.2534202889911584e-05, |
|
"loss": 0.4763, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"grad_norm": 0.7948456406593323, |
|
"learning_rate": 1.2466607473812386e-05, |
|
"loss": 0.4984, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 4.82, |
|
"grad_norm": 1.3831831216812134, |
|
"learning_rate": 1.2398891838744777e-05, |
|
"loss": 0.4594, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"grad_norm": 2.660630941390991, |
|
"learning_rate": 1.233105928507328e-05, |
|
"loss": 0.476, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 4.86, |
|
"grad_norm": 1.7315685749053955, |
|
"learning_rate": 1.226311311886086e-05, |
|
"loss": 0.4599, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 1.0656920671463013, |
|
"learning_rate": 1.2195056651707806e-05, |
|
"loss": 0.4786, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 4.9, |
|
"grad_norm": 1.317185878753662, |
|
"learning_rate": 1.2126893200590309e-05, |
|
"loss": 0.539, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 2.1588151454925537, |
|
"learning_rate": 1.2058626087698814e-05, |
|
"loss": 0.442, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 4.9399999999999995, |
|
"grad_norm": 1.3337817192077637, |
|
"learning_rate": 1.1990258640276094e-05, |
|
"loss": 0.4829, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"grad_norm": 1.3558602333068848, |
|
"learning_rate": 1.1921794190455082e-05, |
|
"loss": 0.5055, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 4.98, |
|
"grad_norm": 1.18630850315094, |
|
"learning_rate": 1.1853236075096474e-05, |
|
"loss": 0.4857, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 1.4073606729507446, |
|
"learning_rate": 1.1784587635626095e-05, |
|
"loss": 0.4962, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 5.02, |
|
"grad_norm": 2.1152431964874268, |
|
"learning_rate": 1.171585221787203e-05, |
|
"loss": 0.4185, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 5.04, |
|
"grad_norm": 1.4434751272201538, |
|
"learning_rate": 1.1647033171901573e-05, |
|
"loss": 0.4545, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 5.06, |
|
"grad_norm": 2.575100898742676, |
|
"learning_rate": 1.157813385185794e-05, |
|
"loss": 0.4162, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 5.08, |
|
"grad_norm": 1.808670163154602, |
|
"learning_rate": 1.1509157615796775e-05, |
|
"loss": 0.425, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 5.1, |
|
"grad_norm": 2.0756947994232178, |
|
"learning_rate": 1.1440107825522522e-05, |
|
"loss": 0.4514, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 5.12, |
|
"grad_norm": 1.3622123003005981, |
|
"learning_rate": 1.1370987846424547e-05, |
|
"loss": 0.4687, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 5.14, |
|
"grad_norm": 1.938477873802185, |
|
"learning_rate": 1.1301801047313106e-05, |
|
"loss": 0.4892, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 5.16, |
|
"grad_norm": 1.3552794456481934, |
|
"learning_rate": 1.1232550800255188e-05, |
|
"loss": 0.4675, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 5.18, |
|
"grad_norm": 2.3927013874053955, |
|
"learning_rate": 1.1163240480410136e-05, |
|
"loss": 0.4336, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 5.2, |
|
"grad_norm": 1.3408219814300537, |
|
"learning_rate": 1.1093873465865156e-05, |
|
"loss": 0.4358, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 5.22, |
|
"grad_norm": 2.9869275093078613, |
|
"learning_rate": 1.1024453137470677e-05, |
|
"loss": 0.4709, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 5.24, |
|
"grad_norm": 2.7815663814544678, |
|
"learning_rate": 1.0954982878675564e-05, |
|
"loss": 0.4349, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 5.26, |
|
"grad_norm": 1.9527360200881958, |
|
"learning_rate": 1.0885466075362224e-05, |
|
"loss": 0.4581, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 5.28, |
|
"grad_norm": 1.804969072341919, |
|
"learning_rate": 1.0815906115681579e-05, |
|
"loss": 0.4482, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 5.3, |
|
"grad_norm": 1.6230815649032593, |
|
"learning_rate": 1.0746306389887924e-05, |
|
"loss": 0.4771, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 5.32, |
|
"grad_norm": 2.6288340091705322, |
|
"learning_rate": 1.067667029017371e-05, |
|
"loss": 0.4893, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 5.34, |
|
"grad_norm": 0.7628895044326782, |
|
"learning_rate": 1.060700121050419e-05, |
|
"loss": 0.4823, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 5.36, |
|
"grad_norm": 1.395524501800537, |
|
"learning_rate": 1.0537302546452022e-05, |
|
"loss": 0.45, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 5.38, |
|
"grad_norm": 1.0264369249343872, |
|
"learning_rate": 1.0467577695031763e-05, |
|
"loss": 0.4817, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 5.4, |
|
"grad_norm": 1.3651304244995117, |
|
"learning_rate": 1.03978300545343e-05, |
|
"loss": 0.4472, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 5.42, |
|
"grad_norm": 1.520727276802063, |
|
"learning_rate": 1.0328063024361232e-05, |
|
"loss": 0.4351, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 5.44, |
|
"grad_norm": 2.283327102661133, |
|
"learning_rate": 1.0258280004859189e-05, |
|
"loss": 0.4052, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 5.46, |
|
"grad_norm": 5.243598937988281, |
|
"learning_rate": 1.0188484397154083e-05, |
|
"loss": 0.51, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 5.48, |
|
"grad_norm": 2.3326563835144043, |
|
"learning_rate": 1.0118679602985373e-05, |
|
"loss": 0.4678, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"grad_norm": 1.8756747245788574, |
|
"learning_rate": 1.0048869024540247e-05, |
|
"loss": 0.4802, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 5.52, |
|
"grad_norm": 2.212642192840576, |
|
"learning_rate": 9.979056064287807e-06, |
|
"loss": 0.4416, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 5.54, |
|
"grad_norm": 1.893557071685791, |
|
"learning_rate": 9.909244124813246e-06, |
|
"loss": 0.4613, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 5.5600000000000005, |
|
"grad_norm": 3.211782217025757, |
|
"learning_rate": 9.839436608652007e-06, |
|
"loss": 0.4163, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 5.58, |
|
"grad_norm": 1.4164925813674927, |
|
"learning_rate": 9.76963691812394e-06, |
|
"loss": 0.4753, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 5.6, |
|
"grad_norm": 1.139273762702942, |
|
"learning_rate": 9.699848455167489e-06, |
|
"loss": 0.4725, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 5.62, |
|
"grad_norm": 1.5774643421173096, |
|
"learning_rate": 9.630074621173882e-06, |
|
"loss": 0.4521, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 5.64, |
|
"grad_norm": 2.0061256885528564, |
|
"learning_rate": 9.560318816821354e-06, |
|
"loss": 0.3838, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 5.66, |
|
"grad_norm": 3.7671396732330322, |
|
"learning_rate": 9.490584441909392e-06, |
|
"loss": 0.4603, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 5.68, |
|
"grad_norm": 1.6197257041931152, |
|
"learning_rate": 9.420874895193056e-06, |
|
"loss": 0.4538, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 5.7, |
|
"grad_norm": 3.386794328689575, |
|
"learning_rate": 9.351193574217305e-06, |
|
"loss": 0.4527, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 5.72, |
|
"grad_norm": 1.4062769412994385, |
|
"learning_rate": 9.281543875151419e-06, |
|
"loss": 0.4915, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 5.74, |
|
"grad_norm": 2.367417573928833, |
|
"learning_rate": 9.211929192623466e-06, |
|
"loss": 0.4338, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 5.76, |
|
"grad_norm": 1.7326956987380981, |
|
"learning_rate": 9.142352919554862e-06, |
|
"loss": 0.4573, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 5.78, |
|
"grad_norm": 2.8575878143310547, |
|
"learning_rate": 9.072818446995e-06, |
|
"loss": 0.4494, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 5.8, |
|
"grad_norm": 1.1295793056488037, |
|
"learning_rate": 9.003329163955973e-06, |
|
"loss": 0.5061, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 5.82, |
|
"grad_norm": 1.31191885471344, |
|
"learning_rate": 8.933888457247402e-06, |
|
"loss": 0.4537, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 5.84, |
|
"grad_norm": 1.8330936431884766, |
|
"learning_rate": 8.864499711311362e-06, |
|
"loss": 0.4764, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 5.86, |
|
"grad_norm": 1.8839877843856812, |
|
"learning_rate": 8.79516630805745e-06, |
|
"loss": 0.4563, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 5.88, |
|
"grad_norm": 2.6970791816711426, |
|
"learning_rate": 8.725891626697912e-06, |
|
"loss": 0.4887, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 5.9, |
|
"grad_norm": 3.178072214126587, |
|
"learning_rate": 8.656679043582986e-06, |
|
"loss": 0.446, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 5.92, |
|
"grad_norm": 2.423067569732666, |
|
"learning_rate": 8.587531932036334e-06, |
|
"loss": 0.4533, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 5.9399999999999995, |
|
"grad_norm": 3.043440580368042, |
|
"learning_rate": 8.518453662190622e-06, |
|
"loss": 0.4451, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 5.96, |
|
"grad_norm": 2.4324257373809814, |
|
"learning_rate": 8.449447600823262e-06, |
|
"loss": 0.393, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 5.98, |
|
"grad_norm": 7.399738311767578, |
|
"learning_rate": 8.380517111192336e-06, |
|
"loss": 0.4406, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.8923618197441101, |
|
"learning_rate": 8.311665552872662e-06, |
|
"loss": 0.474, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 6.02, |
|
"grad_norm": 1.7500466108322144, |
|
"learning_rate": 8.242896281592057e-06, |
|
"loss": 0.3953, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 6.04, |
|
"grad_norm": 3.8324530124664307, |
|
"learning_rate": 8.174212649067781e-06, |
|
"loss": 0.4117, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 6.06, |
|
"grad_norm": 2.275822639465332, |
|
"learning_rate": 8.10561800284319e-06, |
|
"loss": 0.3988, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 6.08, |
|
"grad_norm": 1.2562943696975708, |
|
"learning_rate": 8.037115686124564e-06, |
|
"loss": 0.418, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 6.1, |
|
"grad_norm": 1.3214370012283325, |
|
"learning_rate": 7.96870903761818e-06, |
|
"loss": 0.4084, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 6.12, |
|
"grad_norm": 2.595797061920166, |
|
"learning_rate": 7.900401391367576e-06, |
|
"loss": 0.3739, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 6.14, |
|
"grad_norm": 2.055779457092285, |
|
"learning_rate": 7.832196076591067e-06, |
|
"loss": 0.3763, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 6.16, |
|
"grad_norm": 2.5182206630706787, |
|
"learning_rate": 7.76409641751947e-06, |
|
"loss": 0.4522, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 6.18, |
|
"grad_norm": 6.86693000793457, |
|
"learning_rate": 7.696105733234099e-06, |
|
"loss": 0.4661, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 6.2, |
|
"grad_norm": 0.7651225924491882, |
|
"learning_rate": 7.628227337504972e-06, |
|
"loss": 0.4519, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 6.22, |
|
"grad_norm": 1.531447172164917, |
|
"learning_rate": 7.560464538629345e-06, |
|
"loss": 0.4073, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 6.24, |
|
"grad_norm": 2.4921135902404785, |
|
"learning_rate": 7.492820639270435e-06, |
|
"loss": 0.4458, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 6.26, |
|
"grad_norm": 1.5849100351333618, |
|
"learning_rate": 7.4252989362964635e-06, |
|
"loss": 0.3703, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 6.28, |
|
"grad_norm": 1.8190685510635376, |
|
"learning_rate": 7.357902720619976e-06, |
|
"loss": 0.4393, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 6.3, |
|
"grad_norm": 1.195379376411438, |
|
"learning_rate": 7.290635277037442e-06, |
|
"loss": 0.437, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 6.32, |
|
"grad_norm": 1.6389209032058716, |
|
"learning_rate": 7.22349988406916e-06, |
|
"loss": 0.3979, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 6.34, |
|
"grad_norm": 1.7568351030349731, |
|
"learning_rate": 7.156499813799477e-06, |
|
"loss": 0.4078, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 6.36, |
|
"grad_norm": 2.5609893798828125, |
|
"learning_rate": 7.0896383317172845e-06, |
|
"loss": 0.4182, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 6.38, |
|
"grad_norm": 2.070969343185425, |
|
"learning_rate": 7.022918696556896e-06, |
|
"loss": 0.4239, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"grad_norm": 3.3474345207214355, |
|
"learning_rate": 6.956344160139201e-06, |
|
"loss": 0.4369, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 6.42, |
|
"grad_norm": 1.3559445142745972, |
|
"learning_rate": 6.889917967213184e-06, |
|
"loss": 0.4469, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 6.44, |
|
"grad_norm": 1.4630825519561768, |
|
"learning_rate": 6.823643355297774e-06, |
|
"loss": 0.4312, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 6.46, |
|
"grad_norm": 2.0589451789855957, |
|
"learning_rate": 6.757523554524056e-06, |
|
"loss": 0.4465, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 6.48, |
|
"grad_norm": 2.448317766189575, |
|
"learning_rate": 6.69156178747784e-06, |
|
"loss": 0.4201, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"grad_norm": 1.911927580833435, |
|
"learning_rate": 6.62576126904259e-06, |
|
"loss": 0.3882, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 6.52, |
|
"grad_norm": 3.176950216293335, |
|
"learning_rate": 6.560125206242746e-06, |
|
"loss": 0.4448, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 6.54, |
|
"grad_norm": 1.4145492315292358, |
|
"learning_rate": 6.494656798087412e-06, |
|
"loss": 0.3915, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 6.5600000000000005, |
|
"grad_norm": 4.982487201690674, |
|
"learning_rate": 6.4293592354144365e-06, |
|
"loss": 0.3769, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 6.58, |
|
"grad_norm": 3.9301717281341553, |
|
"learning_rate": 6.364235700734903e-06, |
|
"loss": 0.4503, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 6.6, |
|
"grad_norm": 3.643587112426758, |
|
"learning_rate": 6.299289368078016e-06, |
|
"loss": 0.4398, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 6.62, |
|
"grad_norm": 2.1195595264434814, |
|
"learning_rate": 6.234523402836408e-06, |
|
"loss": 0.4199, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 6.64, |
|
"grad_norm": 1.3784760236740112, |
|
"learning_rate": 6.169940961611853e-06, |
|
"loss": 0.4574, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 6.66, |
|
"grad_norm": 4.373683452606201, |
|
"learning_rate": 6.1055451920614165e-06, |
|
"loss": 0.4252, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 6.68, |
|
"grad_norm": 1.9143246412277222, |
|
"learning_rate": 6.0413392327440635e-06, |
|
"loss": 0.4069, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 6.7, |
|
"grad_norm": 1.9608592987060547, |
|
"learning_rate": 5.977326212967671e-06, |
|
"loss": 0.4173, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 6.72, |
|
"grad_norm": 2.7381646633148193, |
|
"learning_rate": 5.913509252636511e-06, |
|
"loss": 0.3737, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 6.74, |
|
"grad_norm": 1.9294111728668213, |
|
"learning_rate": 5.849891462099199e-06, |
|
"loss": 0.437, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 6.76, |
|
"grad_norm": 0.8937060236930847, |
|
"learning_rate": 5.786475941997094e-06, |
|
"loss": 0.4457, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 6.78, |
|
"grad_norm": 1.2962634563446045, |
|
"learning_rate": 5.723265783113181e-06, |
|
"loss": 0.3989, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 6.8, |
|
"grad_norm": 1.0391006469726562, |
|
"learning_rate": 5.660264066221426e-06, |
|
"loss": 0.4314, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 6.82, |
|
"grad_norm": 7.157230377197266, |
|
"learning_rate": 5.59747386193663e-06, |
|
"loss": 0.3989, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 6.84, |
|
"grad_norm": 2.1349549293518066, |
|
"learning_rate": 5.534898230564765e-06, |
|
"loss": 0.3792, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 6.86, |
|
"grad_norm": 1.362468957901001, |
|
"learning_rate": 5.472540221953824e-06, |
|
"loss": 0.4115, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 6.88, |
|
"grad_norm": 3.6166296005249023, |
|
"learning_rate": 5.41040287534517e-06, |
|
"loss": 0.4067, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 6.9, |
|
"grad_norm": 2.424628257751465, |
|
"learning_rate": 5.348489219225417e-06, |
|
"loss": 0.4424, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 6.92, |
|
"grad_norm": 2.7839276790618896, |
|
"learning_rate": 5.286802271178815e-06, |
|
"loss": 0.4508, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 6.9399999999999995, |
|
"grad_norm": 3.2447237968444824, |
|
"learning_rate": 5.225345037740186e-06, |
|
"loss": 0.3984, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 6.96, |
|
"grad_norm": 1.6200125217437744, |
|
"learning_rate": 5.16412051424839e-06, |
|
"loss": 0.4499, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 6.98, |
|
"grad_norm": 1.5760829448699951, |
|
"learning_rate": 5.103131684700315e-06, |
|
"loss": 0.4154, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 11.93620491027832, |
|
"learning_rate": 5.042381521605473e-06, |
|
"loss": 0.391, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 7.02, |
|
"grad_norm": 4.725837707519531, |
|
"learning_rate": 4.981872985841115e-06, |
|
"loss": 0.38, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 7.04, |
|
"grad_norm": 1.6126409769058228, |
|
"learning_rate": 4.921609026507907e-06, |
|
"loss": 0.3478, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 7.06, |
|
"grad_norm": 2.6732842922210693, |
|
"learning_rate": 4.861592580786205e-06, |
|
"loss": 0.3712, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 7.08, |
|
"grad_norm": 2.0151851177215576, |
|
"learning_rate": 4.801826573792905e-06, |
|
"loss": 0.3801, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 7.1, |
|
"grad_norm": 1.151302456855774, |
|
"learning_rate": 4.7423139184388725e-06, |
|
"loss": 0.3881, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 7.12, |
|
"grad_norm": 3.5267462730407715, |
|
"learning_rate": 4.6830575152869615e-06, |
|
"loss": 0.3489, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 7.14, |
|
"grad_norm": 1.8429639339447021, |
|
"learning_rate": 4.62406025241067e-06, |
|
"loss": 0.4284, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 7.16, |
|
"grad_norm": 4.2320051193237305, |
|
"learning_rate": 4.565325005253356e-06, |
|
"loss": 0.4055, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 7.18, |
|
"grad_norm": 2.368800163269043, |
|
"learning_rate": 4.506854636488103e-06, |
|
"loss": 0.3627, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 7.2, |
|
"grad_norm": 7.869661331176758, |
|
"learning_rate": 4.44865199587819e-06, |
|
"loss": 0.3866, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 7.22, |
|
"grad_norm": 1.2418557405471802, |
|
"learning_rate": 4.39071992013822e-06, |
|
"loss": 0.3947, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 7.24, |
|
"grad_norm": 1.7556277513504028, |
|
"learning_rate": 4.3330612327958265e-06, |
|
"loss": 0.4266, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 7.26, |
|
"grad_norm": 4.239712238311768, |
|
"learning_rate": 4.275678744054094e-06, |
|
"loss": 0.3495, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 7.28, |
|
"grad_norm": 2.917245626449585, |
|
"learning_rate": 4.218575250654559e-06, |
|
"loss": 0.4153, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 7.3, |
|
"grad_norm": 1.490869402885437, |
|
"learning_rate": 4.161753535740932e-06, |
|
"loss": 0.3819, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 7.32, |
|
"grad_norm": 1.5143734216690063, |
|
"learning_rate": 4.105216368723437e-06, |
|
"loss": 0.4032, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 7.34, |
|
"grad_norm": 3.434727907180786, |
|
"learning_rate": 4.048966505143831e-06, |
|
"loss": 0.358, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 7.36, |
|
"grad_norm": 1.666413426399231, |
|
"learning_rate": 3.993006686541108e-06, |
|
"loss": 0.4101, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 7.38, |
|
"grad_norm": 2.142817974090576, |
|
"learning_rate": 3.937339640317879e-06, |
|
"loss": 0.3803, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 7.4, |
|
"grad_norm": 0.9919471740722656, |
|
"learning_rate": 3.88196807960744e-06, |
|
"loss": 0.3844, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 7.42, |
|
"grad_norm": 2.370820999145508, |
|
"learning_rate": 3.826894703141552e-06, |
|
"loss": 0.3536, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 7.44, |
|
"grad_norm": 1.761391520500183, |
|
"learning_rate": 3.772122195118877e-06, |
|
"loss": 0.3957, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 7.46, |
|
"grad_norm": 1.3135240077972412, |
|
"learning_rate": 3.7176532250741857e-06, |
|
"loss": 0.4308, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 7.48, |
|
"grad_norm": 7.84911584854126, |
|
"learning_rate": 3.663490447748236e-06, |
|
"loss": 0.3988, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 2.369114875793457, |
|
"learning_rate": 3.6096365029583803e-06, |
|
"loss": 0.3983, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 7.52, |
|
"grad_norm": 3.6399729251861572, |
|
"learning_rate": 3.5560940154699133e-06, |
|
"loss": 0.37, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 7.54, |
|
"grad_norm": 3.580399751663208, |
|
"learning_rate": 3.502865594868136e-06, |
|
"loss": 0.3645, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 7.5600000000000005, |
|
"grad_norm": 3.4667141437530518, |
|
"learning_rate": 3.4499538354311757e-06, |
|
"loss": 0.4179, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 7.58, |
|
"grad_norm": 2.440298318862915, |
|
"learning_rate": 3.397361316003539e-06, |
|
"loss": 0.324, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 7.6, |
|
"grad_norm": 2.0638418197631836, |
|
"learning_rate": 3.3450905998704274e-06, |
|
"loss": 0.3789, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 7.62, |
|
"grad_norm": 2.4778010845184326, |
|
"learning_rate": 3.2931442346328e-06, |
|
"loss": 0.3608, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"grad_norm": 2.4514052867889404, |
|
"learning_rate": 3.241524752083215e-06, |
|
"loss": 0.3985, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 7.66, |
|
"grad_norm": 1.3875998258590698, |
|
"learning_rate": 3.190234668082427e-06, |
|
"loss": 0.3447, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 7.68, |
|
"grad_norm": 1.2562263011932373, |
|
"learning_rate": 3.1392764824367706e-06, |
|
"loss": 0.3426, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 7.7, |
|
"grad_norm": 1.41866135597229, |
|
"learning_rate": 3.0886526787763237e-06, |
|
"loss": 0.3576, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 7.72, |
|
"grad_norm": 3.5626509189605713, |
|
"learning_rate": 3.038365724433858e-06, |
|
"loss": 0.3928, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 7.74, |
|
"grad_norm": 5.021074295043945, |
|
"learning_rate": 2.988418070324577e-06, |
|
"loss": 0.3589, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 7.76, |
|
"grad_norm": 1.392564058303833, |
|
"learning_rate": 2.938812150826684e-06, |
|
"loss": 0.3851, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 7.78, |
|
"grad_norm": 3.992396116256714, |
|
"learning_rate": 2.8895503836627105e-06, |
|
"loss": 0.3688, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 7.8, |
|
"grad_norm": 2.653534173965454, |
|
"learning_rate": 2.840635169781688e-06, |
|
"loss": 0.3585, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 7.82, |
|
"grad_norm": 6.615116596221924, |
|
"learning_rate": 2.7920688932421337e-06, |
|
"loss": 0.3653, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 7.84, |
|
"grad_norm": 5.369716644287109, |
|
"learning_rate": 2.7438539210958483e-06, |
|
"loss": 0.3512, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 7.86, |
|
"grad_norm": 1.679790735244751, |
|
"learning_rate": 2.6959926032725537e-06, |
|
"loss": 0.3717, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 7.88, |
|
"grad_norm": 2.233903646469116, |
|
"learning_rate": 2.648487272465361e-06, |
|
"loss": 0.3806, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 7.9, |
|
"grad_norm": 3.0028107166290283, |
|
"learning_rate": 2.6013402440170676e-06, |
|
"loss": 0.3993, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 7.92, |
|
"grad_norm": 3.600489854812622, |
|
"learning_rate": 2.5545538158073278e-06, |
|
"loss": 0.3387, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 7.9399999999999995, |
|
"grad_norm": 7.545295715332031, |
|
"learning_rate": 2.512756228659141e-06, |
|
"loss": 0.37, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 7.96, |
|
"grad_norm": 2.566960573196411, |
|
"learning_rate": 2.4666612085261344e-06, |
|
"loss": 0.3967, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 7.98, |
|
"grad_norm": 2.3997247219085693, |
|
"learning_rate": 2.420933352697865e-06, |
|
"loss": 0.4029, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 2.916670560836792, |
|
"learning_rate": 2.37557488988552e-06, |
|
"loss": 0.3713, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 8.02, |
|
"grad_norm": 1.7952624559402466, |
|
"learning_rate": 2.3305880307965834e-06, |
|
"loss": 0.3232, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 8.04, |
|
"grad_norm": 1.91434645652771, |
|
"learning_rate": 2.2859749680270983e-06, |
|
"loss": 0.331, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 8.06, |
|
"grad_norm": 3.671706438064575, |
|
"learning_rate": 2.241737875954808e-06, |
|
"loss": 0.3818, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 8.08, |
|
"grad_norm": 1.5308889150619507, |
|
"learning_rate": 2.1978789106331666e-06, |
|
"loss": 0.3482, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 8.1, |
|
"grad_norm": 1.8674166202545166, |
|
"learning_rate": 2.154400209686268e-06, |
|
"loss": 0.3195, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 8.12, |
|
"grad_norm": 1.5842407941818237, |
|
"learning_rate": 2.1113038922046603e-06, |
|
"loss": 0.3557, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 8.14, |
|
"grad_norm": 2.2769813537597656, |
|
"learning_rate": 2.0685920586420562e-06, |
|
"loss": 0.2853, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 8.16, |
|
"grad_norm": 1.7789726257324219, |
|
"learning_rate": 2.026266790712965e-06, |
|
"loss": 0.316, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 8.18, |
|
"grad_norm": 2.544579029083252, |
|
"learning_rate": 1.984330151291233e-06, |
|
"loss": 0.3328, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 8.2, |
|
"grad_norm": 5.644877910614014, |
|
"learning_rate": 1.9427841843095063e-06, |
|
"loss": 0.3338, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 8.22, |
|
"grad_norm": 2.9125707149505615, |
|
"learning_rate": 1.9016309146596024e-06, |
|
"loss": 0.3226, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 8.24, |
|
"grad_norm": 2.9386703968048096, |
|
"learning_rate": 1.8608723480938207e-06, |
|
"loss": 0.3147, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 8.26, |
|
"grad_norm": 5.057535648345947, |
|
"learning_rate": 1.820510471127196e-06, |
|
"loss": 0.3549, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"grad_norm": 1.1568169593811035, |
|
"learning_rate": 1.7805472509406695e-06, |
|
"loss": 0.3701, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 8.3, |
|
"grad_norm": 2.978498697280884, |
|
"learning_rate": 1.7409846352852144e-06, |
|
"loss": 0.341, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 8.32, |
|
"grad_norm": 1.8623732328414917, |
|
"learning_rate": 1.7018245523869038e-06, |
|
"loss": 0.2754, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 8.34, |
|
"grad_norm": 3.46683406829834, |
|
"learning_rate": 1.6630689108529286e-06, |
|
"loss": 0.3958, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 8.36, |
|
"grad_norm": 5.015219211578369, |
|
"learning_rate": 1.6247195995785836e-06, |
|
"loss": 0.3512, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 8.38, |
|
"grad_norm": 1.5242810249328613, |
|
"learning_rate": 1.5867784876551973e-06, |
|
"loss": 0.3533, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 8.4, |
|
"grad_norm": 4.693676948547363, |
|
"learning_rate": 1.5492474242790368e-06, |
|
"loss": 0.3746, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 8.42, |
|
"grad_norm": 2.436262845993042, |
|
"learning_rate": 1.5121282386611823e-06, |
|
"loss": 0.3274, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 8.44, |
|
"grad_norm": 2.2660608291625977, |
|
"learning_rate": 1.4754227399383758e-06, |
|
"loss": 0.3055, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 8.46, |
|
"grad_norm": 2.7948834896087646, |
|
"learning_rate": 1.439132717084839e-06, |
|
"loss": 0.3078, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 8.48, |
|
"grad_norm": 1.3765865564346313, |
|
"learning_rate": 1.40325993882509e-06, |
|
"loss": 0.3194, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"grad_norm": 1.2223212718963623, |
|
"learning_rate": 1.3678061535477305e-06, |
|
"loss": 0.352, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 8.52, |
|
"grad_norm": 2.556001663208008, |
|
"learning_rate": 1.3327730892202384e-06, |
|
"loss": 0.3061, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 8.54, |
|
"grad_norm": 4.0893168449401855, |
|
"learning_rate": 1.2981624533047432e-06, |
|
"loss": 0.406, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 8.56, |
|
"grad_norm": 1.8102929592132568, |
|
"learning_rate": 1.2639759326748136e-06, |
|
"loss": 0.3335, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 8.58, |
|
"grad_norm": 0.6934239268302917, |
|
"learning_rate": 1.230215193533233e-06, |
|
"loss": 0.4048, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"grad_norm": 2.5495901107788086, |
|
"learning_rate": 1.196881881330798e-06, |
|
"loss": 0.3388, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 8.62, |
|
"grad_norm": 2.681366443634033, |
|
"learning_rate": 1.1639776206861197e-06, |
|
"loss": 0.358, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 8.64, |
|
"grad_norm": 1.624990463256836, |
|
"learning_rate": 1.1315040153064416e-06, |
|
"loss": 0.3628, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 8.66, |
|
"grad_norm": 7.331467151641846, |
|
"learning_rate": 1.0994626479094749e-06, |
|
"loss": 0.3585, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 8.68, |
|
"grad_norm": 1.2213658094406128, |
|
"learning_rate": 1.0678550801462662e-06, |
|
"loss": 0.3583, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 8.7, |
|
"grad_norm": 2.539713144302368, |
|
"learning_rate": 1.0366828525250728e-06, |
|
"loss": 0.2861, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 8.72, |
|
"grad_norm": 4.281270980834961, |
|
"learning_rate": 1.0059474843362893e-06, |
|
"loss": 0.3422, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 8.74, |
|
"grad_norm": 1.515568733215332, |
|
"learning_rate": 9.756504735784067e-07, |
|
"loss": 0.3337, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 8.76, |
|
"grad_norm": 1.730093240737915, |
|
"learning_rate": 9.457932968849826e-07, |
|
"loss": 0.3163, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 8.78, |
|
"grad_norm": 4.305525302886963, |
|
"learning_rate": 9.16377409452689e-07, |
|
"loss": 0.3132, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 8.8, |
|
"grad_norm": 1.6857116222381592, |
|
"learning_rate": 8.874042449703779e-07, |
|
"loss": 0.3108, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 8.82, |
|
"grad_norm": 1.7638370990753174, |
|
"learning_rate": 8.58875215549212e-07, |
|
"loss": 0.3444, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 8.84, |
|
"grad_norm": 2.9410483837127686, |
|
"learning_rate": 8.307917116538378e-07, |
|
"loss": 0.3582, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 8.86, |
|
"grad_norm": 1.4133245944976807, |
|
"learning_rate": 8.031551020346129e-07, |
|
"loss": 0.3014, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 8.88, |
|
"grad_norm": 2.466925621032715, |
|
"learning_rate": 7.759667336609011e-07, |
|
"loss": 0.3578, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 8.9, |
|
"grad_norm": 1.979108214378357, |
|
"learning_rate": 7.492279316554207e-07, |
|
"loss": 0.3253, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 8.92, |
|
"grad_norm": 1.6241674423217773, |
|
"learning_rate": 7.22939999229657e-07, |
|
"loss": 0.3839, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 8.94, |
|
"grad_norm": 2.58152174949646, |
|
"learning_rate": 6.971042176203535e-07, |
|
"loss": 0.268, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 8.96, |
|
"grad_norm": 2.5680618286132812, |
|
"learning_rate": 6.717218460270536e-07, |
|
"loss": 0.332, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 8.98, |
|
"grad_norm": 2.1524252891540527, |
|
"learning_rate": 6.467941215507434e-07, |
|
"loss": 0.361, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 1.6696678400039673, |
|
"learning_rate": 6.223222591335409e-07, |
|
"loss": 0.3358, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 9.02, |
|
"grad_norm": 1.1763229370117188, |
|
"learning_rate": 5.98307451499498e-07, |
|
"loss": 0.2874, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 9.04, |
|
"grad_norm": 1.381579041481018, |
|
"learning_rate": 5.747508690964599e-07, |
|
"loss": 0.361, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 9.06, |
|
"grad_norm": 1.560950756072998, |
|
"learning_rate": 5.516536600390188e-07, |
|
"loss": 0.2929, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 9.08, |
|
"grad_norm": 2.703350782394409, |
|
"learning_rate": 5.290169500525577e-07, |
|
"loss": 0.2854, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 9.1, |
|
"grad_norm": 1.5632970333099365, |
|
"learning_rate": 5.068418424183874e-07, |
|
"loss": 0.3173, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 9.12, |
|
"grad_norm": 1.8101422786712646, |
|
"learning_rate": 4.851294179199673e-07, |
|
"loss": 0.3683, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 9.14, |
|
"grad_norm": 1.0653437376022339, |
|
"learning_rate": 4.638807347902408e-07, |
|
"loss": 0.3256, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 9.16, |
|
"grad_norm": 2.522818088531494, |
|
"learning_rate": 4.4309682866004124e-07, |
|
"loss": 0.319, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 9.18, |
|
"grad_norm": 3.289451837539673, |
|
"learning_rate": 4.2277871250763327e-07, |
|
"loss": 0.3221, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 9.2, |
|
"grad_norm": 2.0382297039031982, |
|
"learning_rate": 4.0292737660933335e-07, |
|
"loss": 0.2951, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 9.22, |
|
"grad_norm": 2.1435165405273438, |
|
"learning_rate": 3.835437884912474e-07, |
|
"loss": 0.3738, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 9.24, |
|
"grad_norm": 1.6461173295974731, |
|
"learning_rate": 3.646288928821151e-07, |
|
"loss": 0.2898, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 9.26, |
|
"grad_norm": 2.4130804538726807, |
|
"learning_rate": 3.4618361166726123e-07, |
|
"loss": 0.3792, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 9.28, |
|
"grad_norm": 2.4446017742156982, |
|
"learning_rate": 3.282088438436715e-07, |
|
"loss": 0.3424, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 9.3, |
|
"grad_norm": 1.3320859670639038, |
|
"learning_rate": 3.10705465476171e-07, |
|
"loss": 0.358, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 9.32, |
|
"grad_norm": 3.8511667251586914, |
|
"learning_rate": 2.936743296547273e-07, |
|
"loss": 0.32, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 9.34, |
|
"grad_norm": 5.630286693572998, |
|
"learning_rate": 2.771162664528726e-07, |
|
"loss": 0.3079, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 9.36, |
|
"grad_norm": 1.600219964981079, |
|
"learning_rate": 2.6103208288724815e-07, |
|
"loss": 0.2834, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 9.38, |
|
"grad_norm": 1.0820380449295044, |
|
"learning_rate": 2.4542256287826915e-07, |
|
"loss": 0.354, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 9.4, |
|
"grad_norm": 1.4870035648345947, |
|
"learning_rate": 2.3028846721191878e-07, |
|
"loss": 0.3243, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 9.42, |
|
"grad_norm": 3.148569107055664, |
|
"learning_rate": 2.1563053350266983e-07, |
|
"loss": 0.3121, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 9.44, |
|
"grad_norm": 1.8829195499420166, |
|
"learning_rate": 2.014494761575314e-07, |
|
"loss": 0.3142, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 9.46, |
|
"grad_norm": 3.1038215160369873, |
|
"learning_rate": 1.877459863412323e-07, |
|
"loss": 0.3287, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 9.48, |
|
"grad_norm": 1.9286001920700073, |
|
"learning_rate": 1.7452073194253237e-07, |
|
"loss": 0.2989, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 9.5, |
|
"grad_norm": 2.0495471954345703, |
|
"learning_rate": 1.6177435754167413e-07, |
|
"loss": 0.3632, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 9.52, |
|
"grad_norm": 2.1833696365356445, |
|
"learning_rate": 1.4950748437896235e-07, |
|
"loss": 0.265, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 9.54, |
|
"grad_norm": 3.15493106842041, |
|
"learning_rate": 1.377207103244904e-07, |
|
"loss": 0.283, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 9.56, |
|
"grad_norm": 1.2273836135864258, |
|
"learning_rate": 1.26414609848996e-07, |
|
"loss": 0.2264, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 9.58, |
|
"grad_norm": 1.6316149234771729, |
|
"learning_rate": 1.1558973399586671e-07, |
|
"loss": 0.3198, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 9.6, |
|
"grad_norm": 2.2700629234313965, |
|
"learning_rate": 1.052466103542793e-07, |
|
"loss": 0.2258, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 9.62, |
|
"grad_norm": 1.132501244544983, |
|
"learning_rate": 9.538574303348813e-08, |
|
"loss": 0.3053, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 9.64, |
|
"grad_norm": 1.9846259355545044, |
|
"learning_rate": 8.600761263825475e-08, |
|
"loss": 0.278, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 9.66, |
|
"grad_norm": 1.3777711391448975, |
|
"learning_rate": 7.71126762454233e-08, |
|
"loss": 0.3211, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 9.68, |
|
"grad_norm": 1.355865240097046, |
|
"learning_rate": 6.870136738164612e-08, |
|
"loss": 0.3079, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 9.7, |
|
"grad_norm": 2.8824238777160645, |
|
"learning_rate": 6.07740960022507e-08, |
|
"loss": 0.3717, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 9.72, |
|
"grad_norm": 2.4788002967834473, |
|
"learning_rate": 5.3331248471258926e-08, |
|
"loss": 0.3052, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 9.74, |
|
"grad_norm": 1.3350954055786133, |
|
"learning_rate": 4.6373187542561036e-08, |
|
"loss": 0.3018, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 9.76, |
|
"grad_norm": 1.485877513885498, |
|
"learning_rate": 3.990025234222872e-08, |
|
"loss": 0.2694, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 9.78, |
|
"grad_norm": 2.1320180892944336, |
|
"learning_rate": 3.391275835199159e-08, |
|
"loss": 0.323, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 9.8, |
|
"grad_norm": 1.2034118175506592, |
|
"learning_rate": 2.8410997393860663e-08, |
|
"loss": 0.302, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 9.82, |
|
"grad_norm": 1.9911288022994995, |
|
"learning_rate": 2.339523761590301e-08, |
|
"loss": 0.3561, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 9.84, |
|
"grad_norm": 4.588063716888428, |
|
"learning_rate": 1.886572347917337e-08, |
|
"loss": 0.3486, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 9.86, |
|
"grad_norm": 1.2661594152450562, |
|
"learning_rate": 1.482267574580143e-08, |
|
"loss": 0.3651, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 9.88, |
|
"grad_norm": 1.7865337133407593, |
|
"learning_rate": 1.126629146822933e-08, |
|
"loss": 0.2544, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 9.9, |
|
"grad_norm": 3.7908692359924316, |
|
"learning_rate": 8.196743979610455e-09, |
|
"loss": 0.2575, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 9.92, |
|
"grad_norm": 2.3560678958892822, |
|
"learning_rate": 5.614182885357311e-09, |
|
"loss": 0.2833, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 9.94, |
|
"grad_norm": 1.629074215888977, |
|
"learning_rate": 3.518734055855122e-09, |
|
"loss": 0.3345, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 9.96, |
|
"grad_norm": 2.738832473754883, |
|
"learning_rate": 1.910499620322304e-09, |
|
"loss": 0.3034, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 9.98, |
|
"grad_norm": 1.9189926385879517, |
|
"learning_rate": 7.895579618388827e-10, |
|
"loss": 0.2543, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 1.597418189048767, |
|
"learning_rate": 1.559637135173375e-10, |
|
"loss": 0.3149, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 5000, |
|
"total_flos": 1.1899956660640154e+17, |
|
"train_loss": 0.49240388979911803, |
|
"train_runtime": 35927.6647, |
|
"train_samples_per_second": 0.278, |
|
"train_steps_per_second": 0.139 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 5000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1899956660640154e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|