{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9999557541701695, |
|
"eval_steps": 1000, |
|
"global_step": 11300, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 8.849165966107695e-05, |
|
"eval_accuracy": 0.337381835495043, |
|
"eval_loss": 5.641775608062744, |
|
"eval_runtime": 12.0942, |
|
"eval_samples_per_second": 26.294, |
|
"eval_steps_per_second": 0.413, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0008849165966107694, |
|
"grad_norm": 9.357199668884277, |
|
"learning_rate": 5e-05, |
|
"loss": 5.9637, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0017698331932215388, |
|
"grad_norm": 3.343987226486206, |
|
"learning_rate": 0.0001, |
|
"loss": 5.3257, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.002654749789832308, |
|
"grad_norm": 1.7534539699554443, |
|
"learning_rate": 0.00015, |
|
"loss": 4.6726, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0035396663864430775, |
|
"grad_norm": 0.7918533086776733, |
|
"learning_rate": 0.0002, |
|
"loss": 4.1863, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.004424582983053847, |
|
"grad_norm": 0.5450736880302429, |
|
"learning_rate": 0.00025, |
|
"loss": 3.8618, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.005309499579664616, |
|
"grad_norm": 0.4212624132633209, |
|
"learning_rate": 0.0003, |
|
"loss": 3.4744, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.006194416176275386, |
|
"grad_norm": 0.5554006695747375, |
|
"learning_rate": 0.00035, |
|
"loss": 3.209, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.007079332772886155, |
|
"grad_norm": 0.45975297689437866, |
|
"learning_rate": 0.0004, |
|
"loss": 3.0271, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.007964249369496926, |
|
"grad_norm": 0.4895482361316681, |
|
"learning_rate": 0.00045000000000000004, |
|
"loss": 2.8733, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.008849165966107695, |
|
"grad_norm": 0.6686625480651855, |
|
"learning_rate": 0.0005, |
|
"loss": 2.7663, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.009734082562718464, |
|
"grad_norm": 0.6015220880508423, |
|
"learning_rate": 0.0004999990165021195, |
|
"loss": 2.6571, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.010618999159329233, |
|
"grad_norm": 0.5138149261474609, |
|
"learning_rate": 0.0004999960660162163, |
|
"loss": 2.6347, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.011503915755940003, |
|
"grad_norm": 0.5395781397819519, |
|
"learning_rate": 0.0004999911485655047, |
|
"loss": 2.5949, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.012388832352550772, |
|
"grad_norm": 0.47543829679489136, |
|
"learning_rate": 0.0004999842641886751, |
|
"loss": 2.5891, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.013273748949161541, |
|
"grad_norm": 0.6284042000770569, |
|
"learning_rate": 0.0004999754129398938, |
|
"loss": 2.5148, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.01415866554577231, |
|
"grad_norm": 0.6861183047294617, |
|
"learning_rate": 0.000499964594888802, |
|
"loss": 2.4946, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.01504358214238308, |
|
"grad_norm": 0.5336579084396362, |
|
"learning_rate": 0.0004999518101205162, |
|
"loss": 2.4972, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.01592849873899385, |
|
"grad_norm": 0.6088137626647949, |
|
"learning_rate": 0.0004999370587356267, |
|
"loss": 2.4671, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.01681341533560462, |
|
"grad_norm": 0.8096277117729187, |
|
"learning_rate": 0.000499920340850197, |
|
"loss": 2.4413, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.01769833193221539, |
|
"grad_norm": 0.6315465569496155, |
|
"learning_rate": 0.0004999016565957633, |
|
"loss": 2.455, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.018583248528826157, |
|
"grad_norm": 0.49901631474494934, |
|
"learning_rate": 0.0004998810061193329, |
|
"loss": 2.4232, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.019468165125436927, |
|
"grad_norm": 0.46556416153907776, |
|
"learning_rate": 0.0004998583895833834, |
|
"loss": 2.4315, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.020353081722047698, |
|
"grad_norm": 0.7050290703773499, |
|
"learning_rate": 0.0004998338071658613, |
|
"loss": 2.4305, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.021237998318658465, |
|
"grad_norm": 0.89899742603302, |
|
"learning_rate": 0.0004998072590601808, |
|
"loss": 2.41, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.022122914915269236, |
|
"grad_norm": 0.5768831968307495, |
|
"learning_rate": 0.0004997787454752217, |
|
"loss": 2.4048, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.023007831511880007, |
|
"grad_norm": 0.659796953201294, |
|
"learning_rate": 0.0004997482666353287, |
|
"loss": 2.3955, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.023892748108490774, |
|
"grad_norm": 0.546999454498291, |
|
"learning_rate": 0.0004997158227803086, |
|
"loss": 2.3885, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.024777664705101544, |
|
"grad_norm": 0.9052286744117737, |
|
"learning_rate": 0.000499681414165429, |
|
"loss": 2.3975, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.025662581301712315, |
|
"grad_norm": 0.7265748977661133, |
|
"learning_rate": 0.0004996450410614166, |
|
"loss": 2.3928, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.026547497898323082, |
|
"grad_norm": 1.073904275894165, |
|
"learning_rate": 0.0004996067037544541, |
|
"loss": 2.3627, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.027432414494933853, |
|
"grad_norm": 0.6801771521568298, |
|
"learning_rate": 0.000499566402546179, |
|
"loss": 2.3652, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.02831733109154462, |
|
"grad_norm": 0.5730037689208984, |
|
"learning_rate": 0.0004995241377536803, |
|
"loss": 2.3587, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.02920224768815539, |
|
"grad_norm": 0.8887476325035095, |
|
"learning_rate": 0.0004994799097094969, |
|
"loss": 2.3759, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.03008716428476616, |
|
"grad_norm": 0.495370477437973, |
|
"learning_rate": 0.000499433718761614, |
|
"loss": 2.362, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.03097208088137693, |
|
"grad_norm": 0.5238328576087952, |
|
"learning_rate": 0.0004993855652734615, |
|
"loss": 2.3779, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.0318569974779877, |
|
"grad_norm": 0.7129193544387817, |
|
"learning_rate": 0.0004993354496239101, |
|
"loss": 2.3643, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.03274191407459847, |
|
"grad_norm": 0.6504082679748535, |
|
"learning_rate": 0.0004992833722072688, |
|
"loss": 2.3448, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.03362683067120924, |
|
"grad_norm": 0.690351665019989, |
|
"learning_rate": 0.000499229333433282, |
|
"loss": 2.3695, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.034511747267820005, |
|
"grad_norm": 0.762799859046936, |
|
"learning_rate": 0.0004991733337271258, |
|
"loss": 2.3541, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.03539666386443078, |
|
"grad_norm": 0.7155598998069763, |
|
"learning_rate": 0.0004991153735294048, |
|
"loss": 2.3481, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.036281580461041546, |
|
"grad_norm": 0.4801159203052521, |
|
"learning_rate": 0.000499055453296149, |
|
"loss": 2.3555, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.03716649705765231, |
|
"grad_norm": 0.48848673701286316, |
|
"learning_rate": 0.0004989935734988098, |
|
"loss": 2.3622, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.03805141365426309, |
|
"grad_norm": 0.46054649353027344, |
|
"learning_rate": 0.0004989297346242562, |
|
"loss": 2.3634, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.038936330250873855, |
|
"grad_norm": 0.5708670020103455, |
|
"learning_rate": 0.0004988639371747717, |
|
"loss": 2.34, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.03982124684748462, |
|
"grad_norm": 0.7245877981185913, |
|
"learning_rate": 0.0004987961816680492, |
|
"loss": 2.3564, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.040706163444095396, |
|
"grad_norm": 0.513332724571228, |
|
"learning_rate": 0.0004987264686371881, |
|
"loss": 2.3544, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.04159108004070616, |
|
"grad_norm": 0.5079577565193176, |
|
"learning_rate": 0.0004986547986306892, |
|
"loss": 2.3531, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.04247599663731693, |
|
"grad_norm": 0.8436957001686096, |
|
"learning_rate": 0.000498581172212451, |
|
"loss": 2.3402, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.043360913233927705, |
|
"grad_norm": 0.5677080750465393, |
|
"learning_rate": 0.0004985055899617649, |
|
"loss": 2.3315, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.04424582983053847, |
|
"grad_norm": 0.4759403467178345, |
|
"learning_rate": 0.0004984280524733107, |
|
"loss": 2.326, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.04513074642714924, |
|
"grad_norm": 0.45146846771240234, |
|
"learning_rate": 0.0004983485603571521, |
|
"loss": 2.3177, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.04601566302376001, |
|
"grad_norm": 0.6578854322433472, |
|
"learning_rate": 0.0004982671142387316, |
|
"loss": 2.3379, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.04690057962037078, |
|
"grad_norm": 0.8977625370025635, |
|
"learning_rate": 0.000498183714758866, |
|
"loss": 2.3233, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.04778549621698155, |
|
"grad_norm": 0.5207841396331787, |
|
"learning_rate": 0.0004980983625737411, |
|
"loss": 2.3449, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.04867041281359232, |
|
"grad_norm": 0.563421905040741, |
|
"learning_rate": 0.0004980110583549062, |
|
"loss": 2.3111, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.04955532941020309, |
|
"grad_norm": 0.6460586786270142, |
|
"learning_rate": 0.0004979218027892695, |
|
"loss": 2.3382, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.050440246006813856, |
|
"grad_norm": 0.7345250844955444, |
|
"learning_rate": 0.0004978305965790924, |
|
"loss": 2.3141, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.05132516260342463, |
|
"grad_norm": 0.6413494348526001, |
|
"learning_rate": 0.0004977374404419837, |
|
"loss": 2.3172, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.0522100792000354, |
|
"grad_norm": 0.5809776186943054, |
|
"learning_rate": 0.0004976423351108943, |
|
"loss": 2.3214, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.053094995796646165, |
|
"grad_norm": 0.5282315015792847, |
|
"learning_rate": 0.0004975452813341115, |
|
"loss": 2.3188, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.05397991239325694, |
|
"grad_norm": 0.673841655254364, |
|
"learning_rate": 0.0004974462798752524, |
|
"loss": 2.3226, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.054864828989867706, |
|
"grad_norm": 0.8785530924797058, |
|
"learning_rate": 0.0004973453315132592, |
|
"loss": 2.3097, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.05574974558647847, |
|
"grad_norm": 0.7876306772232056, |
|
"learning_rate": 0.0004972424370423917, |
|
"loss": 2.3342, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.05663466218308924, |
|
"grad_norm": 0.5609032511711121, |
|
"learning_rate": 0.0004971375972722218, |
|
"loss": 2.3265, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.057519578779700015, |
|
"grad_norm": 0.730330228805542, |
|
"learning_rate": 0.0004970308130276272, |
|
"loss": 2.3289, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.05840449537631078, |
|
"grad_norm": 0.7334195971488953, |
|
"learning_rate": 0.0004969220851487844, |
|
"loss": 2.3107, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.05928941197292155, |
|
"grad_norm": 0.7410897612571716, |
|
"learning_rate": 0.0004968114144911626, |
|
"loss": 2.316, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.06017432856953232, |
|
"grad_norm": 0.5102954506874084, |
|
"learning_rate": 0.0004966988019255166, |
|
"loss": 2.3348, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.06105924516614309, |
|
"grad_norm": 0.48943185806274414, |
|
"learning_rate": 0.0004965842483378802, |
|
"loss": 2.324, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.06194416176275386, |
|
"grad_norm": 0.7627712488174438, |
|
"learning_rate": 0.0004964677546295589, |
|
"loss": 2.3016, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.06282907835936463, |
|
"grad_norm": 0.5588313937187195, |
|
"learning_rate": 0.0004963493217171235, |
|
"loss": 2.3134, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.0637139949559754, |
|
"grad_norm": 0.5578395128250122, |
|
"learning_rate": 0.0004962289505324021, |
|
"loss": 2.2991, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.06459891155258617, |
|
"grad_norm": 0.6175896525382996, |
|
"learning_rate": 0.0004961066420224729, |
|
"loss": 2.3257, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.06548382814919694, |
|
"grad_norm": 0.727881908416748, |
|
"learning_rate": 0.0004959823971496574, |
|
"loss": 2.2855, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.06636874474580771, |
|
"grad_norm": 0.6838656663894653, |
|
"learning_rate": 0.0004958562168915122, |
|
"loss": 2.2925, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.06725366134241847, |
|
"grad_norm": 0.6439931988716125, |
|
"learning_rate": 0.0004957281022408211, |
|
"loss": 2.3086, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.06813857793902925, |
|
"grad_norm": 0.7111929655075073, |
|
"learning_rate": 0.0004955980542055883, |
|
"loss": 2.3276, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.06902349453564001, |
|
"grad_norm": 0.5941621661186218, |
|
"learning_rate": 0.0004954660738090296, |
|
"loss": 2.2986, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.06990841113225078, |
|
"grad_norm": 1.118166446685791, |
|
"learning_rate": 0.0004953321620895643, |
|
"loss": 2.3091, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.07079332772886156, |
|
"grad_norm": 0.543308675289154, |
|
"learning_rate": 0.0004951963201008077, |
|
"loss": 2.3208, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.07167824432547232, |
|
"grad_norm": 0.6741182208061218, |
|
"learning_rate": 0.000495058548911562, |
|
"loss": 2.3007, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.07256316092208309, |
|
"grad_norm": 0.577864408493042, |
|
"learning_rate": 0.0004949188496058089, |
|
"loss": 2.3049, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.07344807751869387, |
|
"grad_norm": 0.5314656496047974, |
|
"learning_rate": 0.0004947772232827, |
|
"loss": 2.2865, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.07433299411530463, |
|
"grad_norm": 0.5619907975196838, |
|
"learning_rate": 0.0004946336710565488, |
|
"loss": 2.2991, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.0752179107119154, |
|
"grad_norm": 0.7731435298919678, |
|
"learning_rate": 0.0004944881940568219, |
|
"loss": 2.2954, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.07610282730852617, |
|
"grad_norm": 0.6936209201812744, |
|
"learning_rate": 0.0004943407934281299, |
|
"loss": 2.2966, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.07698774390513693, |
|
"grad_norm": 0.7555710673332214, |
|
"learning_rate": 0.0004941914703302181, |
|
"loss": 2.2794, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.07787266050174771, |
|
"grad_norm": 0.5199636220932007, |
|
"learning_rate": 0.0004940402259379585, |
|
"loss": 2.3113, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.07875757709835848, |
|
"grad_norm": 0.4673093557357788, |
|
"learning_rate": 0.0004938870614413392, |
|
"loss": 2.2965, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.07964249369496924, |
|
"grad_norm": 0.49087241291999817, |
|
"learning_rate": 0.0004937319780454559, |
|
"loss": 2.2903, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.08052741029158002, |
|
"grad_norm": 0.5380146503448486, |
|
"learning_rate": 0.0004935749769705022, |
|
"loss": 2.311, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.08141232688819079, |
|
"grad_norm": 0.8122909665107727, |
|
"learning_rate": 0.0004934160594517598, |
|
"loss": 2.2972, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.08229724348480155, |
|
"grad_norm": 0.4482613503932953, |
|
"learning_rate": 0.0004932552267395891, |
|
"loss": 2.2864, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.08318216008141233, |
|
"grad_norm": 0.4072429835796356, |
|
"learning_rate": 0.0004930924800994192, |
|
"loss": 2.2931, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.0840670766780231, |
|
"grad_norm": 0.840983510017395, |
|
"learning_rate": 0.0004929278208117378, |
|
"loss": 2.2763, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.08495199327463386, |
|
"grad_norm": 0.5435421466827393, |
|
"learning_rate": 0.0004927612501720814, |
|
"loss": 2.2896, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.08583690987124463, |
|
"grad_norm": 0.5765254497528076, |
|
"learning_rate": 0.000492592769491025, |
|
"loss": 2.2992, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.08672182646785541, |
|
"grad_norm": 0.6193447113037109, |
|
"learning_rate": 0.0004924223800941717, |
|
"loss": 2.3071, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.08760674306446617, |
|
"grad_norm": 0.9472047686576843, |
|
"learning_rate": 0.0004922500833221425, |
|
"loss": 2.2825, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.08849165966107694, |
|
"grad_norm": 0.5508486032485962, |
|
"learning_rate": 0.0004920758805305654, |
|
"loss": 2.2914, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.08849165966107694, |
|
"eval_accuracy": 0.5370846484054032, |
|
"eval_loss": 2.197066068649292, |
|
"eval_runtime": 12.287, |
|
"eval_samples_per_second": 25.881, |
|
"eval_steps_per_second": 0.407, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.08937657625768772, |
|
"grad_norm": 0.4986010789871216, |
|
"learning_rate": 0.0004918997730900649, |
|
"loss": 2.2682, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.09026149285429848, |
|
"grad_norm": 0.5728856921195984, |
|
"learning_rate": 0.0004917217623862517, |
|
"loss": 2.2828, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.09114640945090925, |
|
"grad_norm": 0.5309883952140808, |
|
"learning_rate": 0.0004915418498197105, |
|
"loss": 2.3083, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.09203132604752003, |
|
"grad_norm": 0.46175774931907654, |
|
"learning_rate": 0.0004913600368059907, |
|
"loss": 2.2686, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.09291624264413079, |
|
"grad_norm": 0.4882391691207886, |
|
"learning_rate": 0.000491176324775594, |
|
"loss": 2.2916, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.09380115924074156, |
|
"grad_norm": 0.7018927335739136, |
|
"learning_rate": 0.0004909907151739633, |
|
"loss": 2.2805, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.09468607583735233, |
|
"grad_norm": 0.6598804593086243, |
|
"learning_rate": 0.0004908032094614721, |
|
"loss": 2.3002, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.0955709924339631, |
|
"grad_norm": 0.5327743887901306, |
|
"learning_rate": 0.0004906138091134118, |
|
"loss": 2.3015, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.09645590903057387, |
|
"grad_norm": 0.5282323956489563, |
|
"learning_rate": 0.0004904225156199815, |
|
"loss": 2.2905, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.09734082562718464, |
|
"grad_norm": 0.4804977774620056, |
|
"learning_rate": 0.000490229330486275, |
|
"loss": 2.278, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.0982257422237954, |
|
"grad_norm": 0.6388362646102905, |
|
"learning_rate": 0.0004900342552322694, |
|
"loss": 2.2983, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.09911065882040618, |
|
"grad_norm": 0.6902673244476318, |
|
"learning_rate": 0.000489837291392814, |
|
"loss": 2.2952, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.09999557541701695, |
|
"grad_norm": 0.8888295292854309, |
|
"learning_rate": 0.0004896384405176167, |
|
"loss": 2.2779, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.10088049201362771, |
|
"grad_norm": 0.7182716131210327, |
|
"learning_rate": 0.0004894377041712326, |
|
"loss": 2.2783, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.10176540861023849, |
|
"grad_norm": 0.4691000282764435, |
|
"learning_rate": 0.0004892350839330522, |
|
"loss": 2.2996, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.10265032520684926, |
|
"grad_norm": 0.6428681015968323, |
|
"learning_rate": 0.000489030581397288, |
|
"loss": 2.2673, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.10353524180346002, |
|
"grad_norm": 0.6838648319244385, |
|
"learning_rate": 0.0004888241981729624, |
|
"loss": 2.251, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.1044201584000708, |
|
"grad_norm": 0.5071864128112793, |
|
"learning_rate": 0.0004886159358838952, |
|
"loss": 2.281, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.10530507499668157, |
|
"grad_norm": 0.5089443325996399, |
|
"learning_rate": 0.0004884057961686906, |
|
"loss": 2.2951, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.10618999159329233, |
|
"grad_norm": 0.5187750458717346, |
|
"learning_rate": 0.00048819378068072405, |
|
"loss": 2.264, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.1070749081899031, |
|
"grad_norm": 0.4623073637485504, |
|
"learning_rate": 0.00048797989108813013, |
|
"loss": 2.2772, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.10795982478651388, |
|
"grad_norm": 0.5824326872825623, |
|
"learning_rate": 0.0004877641290737884, |
|
"loss": 2.2703, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.10884474138312464, |
|
"grad_norm": 0.644314706325531, |
|
"learning_rate": 0.00048754649633531074, |
|
"loss": 2.2779, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.10972965797973541, |
|
"grad_norm": 0.6066089272499084, |
|
"learning_rate": 0.00048732699458502784, |
|
"loss": 2.305, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.11061457457634619, |
|
"grad_norm": 0.8288434147834778, |
|
"learning_rate": 0.00048710562554997574, |
|
"loss": 2.2944, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.11149949117295695, |
|
"grad_norm": 0.5620648264884949, |
|
"learning_rate": 0.00048688239097188226, |
|
"loss": 2.2584, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.11238440776956772, |
|
"grad_norm": 0.5757160186767578, |
|
"learning_rate": 0.0004866572926071532, |
|
"loss": 2.2949, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.11326932436617848, |
|
"grad_norm": 0.5411326885223389, |
|
"learning_rate": 0.00048643033222685886, |
|
"loss": 2.2671, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.11415424096278926, |
|
"grad_norm": 0.8147817254066467, |
|
"learning_rate": 0.00048620151161671955, |
|
"loss": 2.3014, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.11503915755940003, |
|
"grad_norm": 0.600642204284668, |
|
"learning_rate": 0.0004859708325770919, |
|
"loss": 2.2699, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.11592407415601079, |
|
"grad_norm": 0.6259739398956299, |
|
"learning_rate": 0.0004857382969229548, |
|
"loss": 2.2599, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.11680899075262156, |
|
"grad_norm": 0.546262800693512, |
|
"learning_rate": 0.00048550390648389476, |
|
"loss": 2.2823, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.11769390734923234, |
|
"grad_norm": 0.4821476340293884, |
|
"learning_rate": 0.00048526766310409176, |
|
"loss": 2.2521, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.1185788239458431, |
|
"grad_norm": 0.825333833694458, |
|
"learning_rate": 0.00048502956864230473, |
|
"loss": 2.2572, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.11946374054245387, |
|
"grad_norm": 0.4751971960067749, |
|
"learning_rate": 0.000484789624971857, |
|
"loss": 2.2741, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.12034865713906465, |
|
"grad_norm": 0.5960304737091064, |
|
"learning_rate": 0.0004845478339806211, |
|
"loss": 2.2763, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.1212335737356754, |
|
"grad_norm": 0.6432631015777588, |
|
"learning_rate": 0.0004843041975710044, |
|
"loss": 2.2609, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.12211849033228618, |
|
"grad_norm": 0.7140398621559143, |
|
"learning_rate": 0.0004840587176599343, |
|
"loss": 2.3021, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.12300340692889696, |
|
"grad_norm": 0.519575834274292, |
|
"learning_rate": 0.0004838113961788424, |
|
"loss": 2.2788, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.12388832352550772, |
|
"grad_norm": 0.6823663711547852, |
|
"learning_rate": 0.00048356223507364993, |
|
"loss": 2.2905, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.12477324012211849, |
|
"grad_norm": 0.553036093711853, |
|
"learning_rate": 0.0004833112363047524, |
|
"loss": 2.2917, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.12565815671872926, |
|
"grad_norm": 0.4933728277683258, |
|
"learning_rate": 0.00048305840184700356, |
|
"loss": 2.2589, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.12654307331534004, |
|
"grad_norm": 0.9149543642997742, |
|
"learning_rate": 0.00048280373368970086, |
|
"loss": 2.264, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.1274279899119508, |
|
"grad_norm": 0.4670112729072571, |
|
"learning_rate": 0.0004825472338365691, |
|
"loss": 2.2684, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.12831290650856156, |
|
"grad_norm": 0.5053747296333313, |
|
"learning_rate": 0.0004822889043057446, |
|
"loss": 2.2563, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.12919782310517233, |
|
"grad_norm": 0.5054446458816528, |
|
"learning_rate": 0.00048202874712975977, |
|
"loss": 2.2829, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.1300827397017831, |
|
"grad_norm": 0.6858576536178589, |
|
"learning_rate": 0.0004817667643555269, |
|
"loss": 2.2531, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.13096765629839388, |
|
"grad_norm": 0.7087405920028687, |
|
"learning_rate": 0.00048150295804432196, |
|
"loss": 2.2693, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.13185257289500465, |
|
"grad_norm": 0.44586825370788574, |
|
"learning_rate": 0.0004812373302717686, |
|
"loss": 2.2751, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.13273748949161543, |
|
"grad_norm": 0.4149426221847534, |
|
"learning_rate": 0.0004809698831278217, |
|
"loss": 2.2507, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.13362240608822618, |
|
"grad_norm": 0.6579311490058899, |
|
"learning_rate": 0.0004807006187167507, |
|
"loss": 2.274, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.13450732268483695, |
|
"grad_norm": 0.46561411023139954, |
|
"learning_rate": 0.0004804295391571235, |
|
"loss": 2.262, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.13539223928144772, |
|
"grad_norm": 0.5864225625991821, |
|
"learning_rate": 0.00048015664658178944, |
|
"loss": 2.2859, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.1362771558780585, |
|
"grad_norm": 0.6503337621688843, |
|
"learning_rate": 0.0004798819431378627, |
|
"loss": 2.2601, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.13716207247466927, |
|
"grad_norm": 0.5384878516197205, |
|
"learning_rate": 0.0004796054309867053, |
|
"loss": 2.2657, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.13804698907128002, |
|
"grad_norm": 0.8244152665138245, |
|
"learning_rate": 0.00047932711230391014, |
|
"loss": 2.2766, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.1389319056678908, |
|
"grad_norm": 0.4444003999233246, |
|
"learning_rate": 0.00047904698927928404, |
|
"loss": 2.246, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.13981682226450157, |
|
"grad_norm": 0.5598679780960083, |
|
"learning_rate": 0.00047876506411683, |
|
"loss": 2.2731, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.14070173886111234, |
|
"grad_norm": 0.5661593675613403, |
|
"learning_rate": 0.0004784813390347305, |
|
"loss": 2.2549, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.14158665545772312, |
|
"grad_norm": 0.6023704409599304, |
|
"learning_rate": 0.0004781958162653297, |
|
"loss": 2.2782, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.1424715720543339, |
|
"grad_norm": 0.8696288466453552, |
|
"learning_rate": 0.00047790849805511595, |
|
"loss": 2.248, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.14335648865094464, |
|
"grad_norm": 0.7130827903747559, |
|
"learning_rate": 0.000477619386664704, |
|
"loss": 2.2693, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.1442414052475554, |
|
"grad_norm": 0.7435203790664673, |
|
"learning_rate": 0.00047732848436881736, |
|
"loss": 2.2648, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.14512632184416618, |
|
"grad_norm": 0.5171283483505249, |
|
"learning_rate": 0.00047703579345627036, |
|
"loss": 2.2506, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.14601123844077696, |
|
"grad_norm": 0.5777902007102966, |
|
"learning_rate": 0.0004767413162299501, |
|
"loss": 2.2732, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.14689615503738773, |
|
"grad_norm": 0.5333867073059082, |
|
"learning_rate": 0.0004764450550067985, |
|
"loss": 2.2803, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.1477810716339985, |
|
"grad_norm": 0.5803987979888916, |
|
"learning_rate": 0.0004761470121177938, |
|
"loss": 2.2928, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.14866598823060925, |
|
"grad_norm": 0.5488025546073914, |
|
"learning_rate": 0.0004758471899079324, |
|
"loss": 2.2695, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.14955090482722003, |
|
"grad_norm": 0.7418103814125061, |
|
"learning_rate": 0.00047554559073621034, |
|
"loss": 2.2442, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.1504358214238308, |
|
"grad_norm": 0.5090646147727966, |
|
"learning_rate": 0.00047524221697560476, |
|
"loss": 2.2637, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.15132073802044158, |
|
"grad_norm": 0.46209344267845154, |
|
"learning_rate": 0.0004749370710130554, |
|
"loss": 2.235, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.15220565461705235, |
|
"grad_norm": 0.5527107119560242, |
|
"learning_rate": 0.0004746301552494453, |
|
"loss": 2.2815, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.15309057121366312, |
|
"grad_norm": 0.617348849773407, |
|
"learning_rate": 0.0004743214720995827, |
|
"loss": 2.2734, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.15397548781027387, |
|
"grad_norm": 0.8233256340026855, |
|
"learning_rate": 0.00047401102399218133, |
|
"loss": 2.258, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.15486040440688464, |
|
"grad_norm": 0.5554172992706299, |
|
"learning_rate": 0.0004736988133698416, |
|
"loss": 2.2703, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.15574532100349542, |
|
"grad_norm": 0.6374910473823547, |
|
"learning_rate": 0.0004733848426890313, |
|
"loss": 2.2656, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.1566302376001062, |
|
"grad_norm": 0.5161751508712769, |
|
"learning_rate": 0.00047306911442006653, |
|
"loss": 2.2636, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.15751515419671697, |
|
"grad_norm": 0.6015154719352722, |
|
"learning_rate": 0.00047275163104709196, |
|
"loss": 2.2511, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.15840007079332774, |
|
"grad_norm": 0.5937806367874146, |
|
"learning_rate": 0.0004724323950680614, |
|
"loss": 2.2593, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.1592849873899385, |
|
"grad_norm": 0.5501092672348022, |
|
"learning_rate": 0.00047211140899471813, |
|
"loss": 2.2621, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.16016990398654926, |
|
"grad_norm": 0.6284824013710022, |
|
"learning_rate": 0.000471788675352575, |
|
"loss": 2.2683, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.16105482058316004, |
|
"grad_norm": 0.46114546060562134, |
|
"learning_rate": 0.000471464196680895, |
|
"loss": 2.2593, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.1619397371797708, |
|
"grad_norm": 0.5204902291297913, |
|
"learning_rate": 0.0004711379755326707, |
|
"loss": 2.2511, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.16282465377638158, |
|
"grad_norm": 0.5937714576721191, |
|
"learning_rate": 0.00047081001447460457, |
|
"loss": 2.2603, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.16370957037299236, |
|
"grad_norm": 0.5259864330291748, |
|
"learning_rate": 0.00047048031608708875, |
|
"loss": 2.2427, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.1645944869696031, |
|
"grad_norm": 0.43361151218414307, |
|
"learning_rate": 0.0004701488829641845, |
|
"loss": 2.2455, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.16547940356621388, |
|
"grad_norm": 0.5359675884246826, |
|
"learning_rate": 0.000469815717713602, |
|
"loss": 2.268, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.16636432016282465, |
|
"grad_norm": 0.6381211876869202, |
|
"learning_rate": 0.00046948082295667984, |
|
"loss": 2.2709, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.16724923675943543, |
|
"grad_norm": 0.5162480473518372, |
|
"learning_rate": 0.0004691442013283642, |
|
"loss": 2.2489, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.1681341533560462, |
|
"grad_norm": 0.4458593726158142, |
|
"learning_rate": 0.00046880585547718847, |
|
"loss": 2.2603, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.16901906995265698, |
|
"grad_norm": 0.46709561347961426, |
|
"learning_rate": 0.00046846578806525194, |
|
"loss": 2.2666, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.16990398654926772, |
|
"grad_norm": 0.7030518054962158, |
|
"learning_rate": 0.0004681240017681993, |
|
"loss": 2.2222, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.1707889031458785, |
|
"grad_norm": 0.5679172277450562, |
|
"learning_rate": 0.00046778049927519936, |
|
"loss": 2.2753, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.17167381974248927, |
|
"grad_norm": 0.5176842212677002, |
|
"learning_rate": 0.0004674352832889239, |
|
"loss": 2.2578, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.17255873633910004, |
|
"grad_norm": 0.5601808428764343, |
|
"learning_rate": 0.0004670883565255264, |
|
"loss": 2.2406, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.17344365293571082, |
|
"grad_norm": 0.6519585847854614, |
|
"learning_rate": 0.00046673972171462077, |
|
"loss": 2.2535, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.1743285695323216, |
|
"grad_norm": 0.5103752017021179, |
|
"learning_rate": 0.0004663893815992599, |
|
"loss": 2.2528, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.17521348612893234, |
|
"grad_norm": 0.496896892786026, |
|
"learning_rate": 0.0004660373389359137, |
|
"loss": 2.247, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.1760984027255431, |
|
"grad_norm": 0.7528384327888489, |
|
"learning_rate": 0.00046568359649444796, |
|
"loss": 2.2525, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.1769833193221539, |
|
"grad_norm": 0.5633223056793213, |
|
"learning_rate": 0.0004653281570581023, |
|
"loss": 2.2471, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.1769833193221539, |
|
"eval_accuracy": 0.5411561883259997, |
|
"eval_loss": 2.162017583847046, |
|
"eval_runtime": 11.2812, |
|
"eval_samples_per_second": 28.189, |
|
"eval_steps_per_second": 0.443, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.17786823591876466, |
|
"grad_norm": 0.713107168674469, |
|
"learning_rate": 0.000464971023423468, |
|
"loss": 2.2638, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.17875315251537544, |
|
"grad_norm": 0.5677906274795532, |
|
"learning_rate": 0.0004646121984004665, |
|
"loss": 2.2495, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.17963806911198618, |
|
"grad_norm": 0.637523353099823, |
|
"learning_rate": 0.0004642516848123272, |
|
"loss": 2.2509, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.18052298570859696, |
|
"grad_norm": 0.5341629385948181, |
|
"learning_rate": 0.00046388948549556453, |
|
"loss": 2.2659, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.18140790230520773, |
|
"grad_norm": 0.5201821327209473, |
|
"learning_rate": 0.00046352560329995687, |
|
"loss": 2.2512, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.1822928189018185, |
|
"grad_norm": 0.49913713335990906, |
|
"learning_rate": 0.00046316004108852305, |
|
"loss": 2.2724, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.18317773549842928, |
|
"grad_norm": 0.5114869475364685, |
|
"learning_rate": 0.0004627928017375004, |
|
"loss": 2.2714, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.18406265209504005, |
|
"grad_norm": 0.8079931139945984, |
|
"learning_rate": 0.00046242388813632187, |
|
"loss": 2.2608, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.1849475686916508, |
|
"grad_norm": 0.469683974981308, |
|
"learning_rate": 0.0004620533031875934, |
|
"loss": 2.2567, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.18583248528826157, |
|
"grad_norm": 0.7134404182434082, |
|
"learning_rate": 0.00046168104980707104, |
|
"loss": 2.2418, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.18671740188487235, |
|
"grad_norm": 0.8264422416687012, |
|
"learning_rate": 0.0004613071309236382, |
|
"loss": 2.2404, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.18760231848148312, |
|
"grad_norm": 0.6578531265258789, |
|
"learning_rate": 0.00046093154947928226, |
|
"loss": 2.2531, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.1884872350780939, |
|
"grad_norm": 0.6748083829879761, |
|
"learning_rate": 0.0004605543084290716, |
|
"loss": 2.2349, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.18937215167470467, |
|
"grad_norm": 0.9525237083435059, |
|
"learning_rate": 0.00046017541074113257, |
|
"loss": 2.2385, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.19025706827131542, |
|
"grad_norm": 0.7239274382591248, |
|
"learning_rate": 0.00045979485939662556, |
|
"loss": 2.2345, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.1911419848679262, |
|
"grad_norm": 0.5265571475028992, |
|
"learning_rate": 0.00045941265738972217, |
|
"loss": 2.2621, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.19202690146453696, |
|
"grad_norm": 0.45004111528396606, |
|
"learning_rate": 0.0004590288077275814, |
|
"loss": 2.2504, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.19291181806114774, |
|
"grad_norm": 0.622985303401947, |
|
"learning_rate": 0.00045864331343032565, |
|
"loss": 2.2176, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.1937967346577585, |
|
"grad_norm": 0.500320553779602, |
|
"learning_rate": 0.00045825617753101776, |
|
"loss": 2.2466, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.1946816512543693, |
|
"grad_norm": 0.6258721351623535, |
|
"learning_rate": 0.00045786740307563633, |
|
"loss": 2.2386, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.19556656785098003, |
|
"grad_norm": 0.9133718013763428, |
|
"learning_rate": 0.0004574769931230521, |
|
"loss": 2.2468, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.1964514844475908, |
|
"grad_norm": 0.8068430423736572, |
|
"learning_rate": 0.0004570849507450041, |
|
"loss": 2.2421, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.19733640104420158, |
|
"grad_norm": 0.651720404624939, |
|
"learning_rate": 0.0004566912790260751, |
|
"loss": 2.2868, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.19822131764081236, |
|
"grad_norm": 0.4649779200553894, |
|
"learning_rate": 0.0004562959810636674, |
|
"loss": 2.2455, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.19910623423742313, |
|
"grad_norm": 0.7452356815338135, |
|
"learning_rate": 0.0004558990599679787, |
|
"loss": 2.2457, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.1999911508340339, |
|
"grad_norm": 0.4573175013065338, |
|
"learning_rate": 0.00045550051886197754, |
|
"loss": 2.2525, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.20087606743064465, |
|
"grad_norm": 0.46977052092552185, |
|
"learning_rate": 0.0004551003608813784, |
|
"loss": 2.2605, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.20176098402725542, |
|
"grad_norm": 0.43524104356765747, |
|
"learning_rate": 0.0004546985891746177, |
|
"loss": 2.2411, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.2026459006238662, |
|
"grad_norm": 0.5056027173995972, |
|
"learning_rate": 0.00045429520690282827, |
|
"loss": 2.2434, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.20353081722047697, |
|
"grad_norm": 0.46207907795906067, |
|
"learning_rate": 0.00045389021723981504, |
|
"loss": 2.2489, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.20441573381708775, |
|
"grad_norm": 0.4407023787498474, |
|
"learning_rate": 0.00045348362337202985, |
|
"loss": 2.229, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.20530065041369852, |
|
"grad_norm": 0.4465203583240509, |
|
"learning_rate": 0.00045307542849854626, |
|
"loss": 2.2567, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.20618556701030927, |
|
"grad_norm": 0.64149010181427, |
|
"learning_rate": 0.00045266563583103473, |
|
"loss": 2.2637, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.20707048360692004, |
|
"grad_norm": 0.5589755177497864, |
|
"learning_rate": 0.0004522542485937369, |
|
"loss": 2.2421, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.20795540020353082, |
|
"grad_norm": 0.4988935589790344, |
|
"learning_rate": 0.0004518412700234406, |
|
"loss": 2.25, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.2088403168001416, |
|
"grad_norm": 0.5745148062705994, |
|
"learning_rate": 0.0004514267033694543, |
|
"loss": 2.2564, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.20972523339675236, |
|
"grad_norm": 0.4813830256462097, |
|
"learning_rate": 0.0004510105518935813, |
|
"loss": 2.2491, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.21061014999336314, |
|
"grad_norm": 0.4937480092048645, |
|
"learning_rate": 0.0004505928188700945, |
|
"loss": 2.2467, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.21149506658997388, |
|
"grad_norm": 0.5905641913414001, |
|
"learning_rate": 0.0004501735075857101, |
|
"loss": 2.2548, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.21237998318658466, |
|
"grad_norm": 0.5014283657073975, |
|
"learning_rate": 0.0004497526213395623, |
|
"loss": 2.2366, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.21326489978319543, |
|
"grad_norm": 0.5339481830596924, |
|
"learning_rate": 0.0004493301634431768, |
|
"loss": 2.2451, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.2141498163798062, |
|
"grad_norm": 0.7018898129463196, |
|
"learning_rate": 0.00044890613722044524, |
|
"loss": 2.2499, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.21503473297641698, |
|
"grad_norm": 0.8874839544296265, |
|
"learning_rate": 0.0004484805460075988, |
|
"loss": 2.2615, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.21591964957302776, |
|
"grad_norm": 0.5717945694923401, |
|
"learning_rate": 0.0004480533931531819, |
|
"loss": 2.2245, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.2168045661696385, |
|
"grad_norm": 0.7452505826950073, |
|
"learning_rate": 0.00044762468201802586, |
|
"loss": 2.2589, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.21768948276624928, |
|
"grad_norm": 0.5501087307929993, |
|
"learning_rate": 0.0004471944159752228, |
|
"loss": 2.2288, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.21857439936286005, |
|
"grad_norm": 0.5167734622955322, |
|
"learning_rate": 0.00044676259841009845, |
|
"loss": 2.234, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.21945931595947082, |
|
"grad_norm": 0.5619193911552429, |
|
"learning_rate": 0.0004463292327201862, |
|
"loss": 2.2395, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.2203442325560816, |
|
"grad_norm": 0.45388907194137573, |
|
"learning_rate": 0.0004458943223152, |
|
"loss": 2.2539, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.22122914915269237, |
|
"grad_norm": 0.5973688364028931, |
|
"learning_rate": 0.0004454578706170075, |
|
"loss": 2.2375, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.22211406574930312, |
|
"grad_norm": 0.5753281712532043, |
|
"learning_rate": 0.00044501988105960315, |
|
"loss": 2.2295, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.2229989823459139, |
|
"grad_norm": 0.4576527178287506, |
|
"learning_rate": 0.00044458035708908153, |
|
"loss": 2.2607, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.22388389894252467, |
|
"grad_norm": 0.6270558834075928, |
|
"learning_rate": 0.00044413930216360964, |
|
"loss": 2.2444, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.22476881553913544, |
|
"grad_norm": 0.45903804898262024, |
|
"learning_rate": 0.00044369671975340026, |
|
"loss": 2.2355, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.22565373213574622, |
|
"grad_norm": 0.4801378846168518, |
|
"learning_rate": 0.0004432526133406842, |
|
"loss": 2.2208, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.22653864873235696, |
|
"grad_norm": 0.5071857571601868, |
|
"learning_rate": 0.0004428069864196833, |
|
"loss": 2.2471, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.22742356532896774, |
|
"grad_norm": 0.4489947259426117, |
|
"learning_rate": 0.00044235984249658256, |
|
"loss": 2.2241, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.2283084819255785, |
|
"grad_norm": 0.7193836569786072, |
|
"learning_rate": 0.00044191118508950277, |
|
"loss": 2.2384, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.22919339852218928, |
|
"grad_norm": 0.42136234045028687, |
|
"learning_rate": 0.0004414610177284728, |
|
"loss": 2.2508, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.23007831511880006, |
|
"grad_norm": 0.43367722630500793, |
|
"learning_rate": 0.0004410093439554019, |
|
"loss": 2.2279, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.23096323171541083, |
|
"grad_norm": 0.6120412945747375, |
|
"learning_rate": 0.00044055616732405147, |
|
"loss": 2.2308, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.23184814831202158, |
|
"grad_norm": 0.4905729293823242, |
|
"learning_rate": 0.0004401014914000078, |
|
"loss": 2.2417, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.23273306490863235, |
|
"grad_norm": 0.6039494276046753, |
|
"learning_rate": 0.00043964531976065313, |
|
"loss": 2.25, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.23361798150524313, |
|
"grad_norm": 0.4729721248149872, |
|
"learning_rate": 0.00043918765599513826, |
|
"loss": 2.2409, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.2345028981018539, |
|
"grad_norm": 0.5762743353843689, |
|
"learning_rate": 0.00043872850370435404, |
|
"loss": 2.2175, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.23538781469846468, |
|
"grad_norm": 0.4876422584056854, |
|
"learning_rate": 0.00043826786650090276, |
|
"loss": 2.2264, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.23627273129507545, |
|
"grad_norm": 0.6149749159812927, |
|
"learning_rate": 0.0004378057480090702, |
|
"loss": 2.2494, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.2371576478916862, |
|
"grad_norm": 0.5245630741119385, |
|
"learning_rate": 0.0004373421518647968, |
|
"loss": 2.2434, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.23804256448829697, |
|
"grad_norm": 0.46496084332466125, |
|
"learning_rate": 0.00043687708171564923, |
|
"loss": 2.2323, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.23892748108490774, |
|
"grad_norm": 0.4286579191684723, |
|
"learning_rate": 0.00043641054122079136, |
|
"loss": 2.2202, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.23981239768151852, |
|
"grad_norm": 0.45216891169548035, |
|
"learning_rate": 0.00043594253405095616, |
|
"loss": 2.2416, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.2406973142781293, |
|
"grad_norm": 0.49955543875694275, |
|
"learning_rate": 0.0004354730638884159, |
|
"loss": 2.2494, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.24158223087474007, |
|
"grad_norm": 0.4823377728462219, |
|
"learning_rate": 0.0004350021344269539, |
|
"loss": 2.244, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.2424671474713508, |
|
"grad_norm": 0.4880935847759247, |
|
"learning_rate": 0.0004345297493718352, |
|
"loss": 2.2297, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.2433520640679616, |
|
"grad_norm": 0.5247299075126648, |
|
"learning_rate": 0.00043405591243977736, |
|
"loss": 2.2463, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.24423698066457236, |
|
"grad_norm": 0.5798355340957642, |
|
"learning_rate": 0.0004335806273589214, |
|
"loss": 2.2325, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.24512189726118314, |
|
"grad_norm": 0.6595978140830994, |
|
"learning_rate": 0.0004331038978688022, |
|
"loss": 2.2407, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.2460068138577939, |
|
"grad_norm": 0.4189043641090393, |
|
"learning_rate": 0.0004326257277203194, |
|
"loss": 2.2523, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.24689173045440468, |
|
"grad_norm": 0.4813700318336487, |
|
"learning_rate": 0.00043214612067570755, |
|
"loss": 2.243, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.24777664705101543, |
|
"grad_norm": 0.8035611510276794, |
|
"learning_rate": 0.0004316650805085068, |
|
"loss": 2.2522, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.2486615636476262, |
|
"grad_norm": 0.5478577017784119, |
|
"learning_rate": 0.00043118261100353293, |
|
"loss": 2.2334, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.24954648024423698, |
|
"grad_norm": 0.39827045798301697, |
|
"learning_rate": 0.0004306987159568479, |
|
"loss": 2.2208, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.25043139684084775, |
|
"grad_norm": 0.5026851296424866, |
|
"learning_rate": 0.0004302133991757297, |
|
"loss": 2.2375, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.2513163134374585, |
|
"grad_norm": 0.5758419036865234, |
|
"learning_rate": 0.00042972666447864264, |
|
"loss": 2.2377, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.2522012300340693, |
|
"grad_norm": 0.5519118309020996, |
|
"learning_rate": 0.00042923851569520683, |
|
"loss": 2.2492, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.2530861466306801, |
|
"grad_norm": 0.5647429823875427, |
|
"learning_rate": 0.00042874895666616887, |
|
"loss": 2.2255, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.25397106322729085, |
|
"grad_norm": 0.8345276117324829, |
|
"learning_rate": 0.0004282579912433707, |
|
"loss": 2.2529, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.2548559798239016, |
|
"grad_norm": 0.6527183651924133, |
|
"learning_rate": 0.0004277656232897201, |
|
"loss": 2.2267, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.25574089642051234, |
|
"grad_norm": 0.555591881275177, |
|
"learning_rate": 0.00042727185667915975, |
|
"loss": 2.2088, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.2566258130171231, |
|
"grad_norm": 0.7504310011863708, |
|
"learning_rate": 0.00042677669529663686, |
|
"loss": 2.2205, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.2575107296137339, |
|
"grad_norm": 0.6328213810920715, |
|
"learning_rate": 0.00042628014303807294, |
|
"loss": 2.2329, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.25839564621034466, |
|
"grad_norm": 0.4501785337924957, |
|
"learning_rate": 0.00042578220381033263, |
|
"loss": 2.2063, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.25928056280695544, |
|
"grad_norm": 0.4656241834163666, |
|
"learning_rate": 0.0004252828815311934, |
|
"loss": 2.2345, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.2601654794035662, |
|
"grad_norm": 0.43484318256378174, |
|
"learning_rate": 0.00042478218012931436, |
|
"loss": 2.2351, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.261050396000177, |
|
"grad_norm": 0.43128812313079834, |
|
"learning_rate": 0.00042428010354420584, |
|
"loss": 2.2253, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.26193531259678776, |
|
"grad_norm": 0.4222058951854706, |
|
"learning_rate": 0.00042377665572619774, |
|
"loss": 2.2426, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.26282022919339854, |
|
"grad_norm": 0.791580319404602, |
|
"learning_rate": 0.000423271840636409, |
|
"loss": 2.2227, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.2637051457900093, |
|
"grad_norm": 0.551547110080719, |
|
"learning_rate": 0.0004227656622467162, |
|
"loss": 2.2591, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.2645900623866201, |
|
"grad_norm": 0.4557199478149414, |
|
"learning_rate": 0.0004222581245397223, |
|
"loss": 2.2383, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.26547497898323086, |
|
"grad_norm": 0.6194254159927368, |
|
"learning_rate": 0.0004217492315087254, |
|
"loss": 2.2295, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.26547497898323086, |
|
"eval_accuracy": 0.5435882077391512, |
|
"eval_loss": 2.143740177154541, |
|
"eval_runtime": 11.414, |
|
"eval_samples_per_second": 27.861, |
|
"eval_steps_per_second": 0.438, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.2663598955798416, |
|
"grad_norm": 0.7033931612968445, |
|
"learning_rate": 0.0004212389871576873, |
|
"loss": 2.2397, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.26724481217645235, |
|
"grad_norm": 0.43708673119544983, |
|
"learning_rate": 0.00042072739550120175, |
|
"loss": 2.2126, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.2681297287730631, |
|
"grad_norm": 0.6861522793769836, |
|
"learning_rate": 0.00042021446056446333, |
|
"loss": 2.2453, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.2690146453696739, |
|
"grad_norm": 0.4883657395839691, |
|
"learning_rate": 0.00041970018638323546, |
|
"loss": 2.2502, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.2698995619662847, |
|
"grad_norm": 0.5091878175735474, |
|
"learning_rate": 0.00041918457700381855, |
|
"loss": 2.2258, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.27078447856289545, |
|
"grad_norm": 0.7300602197647095, |
|
"learning_rate": 0.00041866763648301864, |
|
"loss": 2.2418, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.2716693951595062, |
|
"grad_norm": 0.5962792038917542, |
|
"learning_rate": 0.00041814936888811475, |
|
"loss": 2.2388, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.272554311756117, |
|
"grad_norm": 0.5023945569992065, |
|
"learning_rate": 0.0004176297782968277, |
|
"loss": 2.2196, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.27343922835272777, |
|
"grad_norm": 0.9406496286392212, |
|
"learning_rate": 0.00041710886879728744, |
|
"loss": 2.2477, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.27432414494933854, |
|
"grad_norm": 0.6057345867156982, |
|
"learning_rate": 0.000416586644488001, |
|
"loss": 2.2268, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.2752090615459493, |
|
"grad_norm": 0.7561666369438171, |
|
"learning_rate": 0.00041606310947782046, |
|
"loss": 2.2292, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.27609397814256004, |
|
"grad_norm": 0.5512217283248901, |
|
"learning_rate": 0.0004155382678859103, |
|
"loss": 2.237, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.2769788947391708, |
|
"grad_norm": 0.4626982510089874, |
|
"learning_rate": 0.00041501212384171545, |
|
"loss": 2.2396, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.2778638113357816, |
|
"grad_norm": 0.41738298535346985, |
|
"learning_rate": 0.0004144846814849282, |
|
"loss": 2.2221, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.27874872793239236, |
|
"grad_norm": 0.5502617359161377, |
|
"learning_rate": 0.00041395594496545607, |
|
"loss": 2.2323, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.27963364452900313, |
|
"grad_norm": 0.5470404624938965, |
|
"learning_rate": 0.0004134259184433891, |
|
"loss": 2.2321, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.2805185611256139, |
|
"grad_norm": 0.5013723969459534, |
|
"learning_rate": 0.0004128946060889668, |
|
"loss": 2.2229, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.2814034777222247, |
|
"grad_norm": 0.4174317419528961, |
|
"learning_rate": 0.0004123620120825459, |
|
"loss": 2.2022, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.28228839431883546, |
|
"grad_norm": 0.4371856451034546, |
|
"learning_rate": 0.00041182814061456707, |
|
"loss": 2.2515, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.28317331091544623, |
|
"grad_norm": 0.4697751998901367, |
|
"learning_rate": 0.00041129299588552195, |
|
"loss": 2.2225, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.284058227512057, |
|
"grad_norm": 0.4424945116043091, |
|
"learning_rate": 0.00041075658210592, |
|
"loss": 2.2382, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.2849431441086678, |
|
"grad_norm": 0.600278377532959, |
|
"learning_rate": 0.000410218903496256, |
|
"loss": 2.2321, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.28582806070527855, |
|
"grad_norm": 0.4000381827354431, |
|
"learning_rate": 0.0004096799642869761, |
|
"loss": 2.2241, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.28671297730188927, |
|
"grad_norm": 0.6013597249984741, |
|
"learning_rate": 0.0004091397687184446, |
|
"loss": 2.2457, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.28759789389850005, |
|
"grad_norm": 0.6114123463630676, |
|
"learning_rate": 0.0004085983210409114, |
|
"loss": 2.2292, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.2884828104951108, |
|
"grad_norm": 0.7183189988136292, |
|
"learning_rate": 0.00040805562551447745, |
|
"loss": 2.2317, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.2893677270917216, |
|
"grad_norm": 0.5196136236190796, |
|
"learning_rate": 0.000407511686409062, |
|
"loss": 2.2159, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.29025264368833237, |
|
"grad_norm": 0.5873175859451294, |
|
"learning_rate": 0.0004069665080043687, |
|
"loss": 2.2169, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.29113756028494314, |
|
"grad_norm": 0.46012935042381287, |
|
"learning_rate": 0.00040642009458985196, |
|
"loss": 2.2217, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.2920224768815539, |
|
"grad_norm": 0.7356370687484741, |
|
"learning_rate": 0.0004058724504646834, |
|
"loss": 2.2384, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.2929073934781647, |
|
"grad_norm": 0.5594988465309143, |
|
"learning_rate": 0.0004053235799377176, |
|
"loss": 2.2454, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.29379231007477546, |
|
"grad_norm": 0.40601104497909546, |
|
"learning_rate": 0.00040477348732745853, |
|
"loss": 2.2355, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.29467722667138624, |
|
"grad_norm": 0.45904645323753357, |
|
"learning_rate": 0.0004042221769620256, |
|
"loss": 2.241, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.295562143267997, |
|
"grad_norm": 0.485551655292511, |
|
"learning_rate": 0.0004036696531791193, |
|
"loss": 2.2299, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.2964470598646078, |
|
"grad_norm": 0.5098533630371094, |
|
"learning_rate": 0.0004031159203259875, |
|
"loss": 2.2382, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.2973319764612185, |
|
"grad_norm": 0.5241096615791321, |
|
"learning_rate": 0.0004025609827593909, |
|
"loss": 2.2331, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.2982168930578293, |
|
"grad_norm": 0.5468673706054688, |
|
"learning_rate": 0.00040200484484556885, |
|
"loss": 2.227, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.29910180965444005, |
|
"grad_norm": 0.3913686275482178, |
|
"learning_rate": 0.000401447510960205, |
|
"loss": 2.2258, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.29998672625105083, |
|
"grad_norm": 0.5263382196426392, |
|
"learning_rate": 0.0004008889854883929, |
|
"loss": 2.2131, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.3008716428476616, |
|
"grad_norm": 0.4936838746070862, |
|
"learning_rate": 0.00040032927282460145, |
|
"loss": 2.2466, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.3017565594442724, |
|
"grad_norm": 0.5192393064498901, |
|
"learning_rate": 0.0003997683773726405, |
|
"loss": 2.2269, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.30264147604088315, |
|
"grad_norm": 0.4334067702293396, |
|
"learning_rate": 0.0003992063035456259, |
|
"loss": 2.2143, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.3035263926374939, |
|
"grad_norm": 0.472269743680954, |
|
"learning_rate": 0.00039864305576594504, |
|
"loss": 2.2426, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.3044113092341047, |
|
"grad_norm": 0.8175429701805115, |
|
"learning_rate": 0.00039807863846522183, |
|
"loss": 2.2166, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.3052962258307155, |
|
"grad_norm": 0.7245342135429382, |
|
"learning_rate": 0.0003975130560842821, |
|
"loss": 2.233, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.30618114242732625, |
|
"grad_norm": 0.4689445197582245, |
|
"learning_rate": 0.0003969463130731183, |
|
"loss": 2.2324, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.307066059023937, |
|
"grad_norm": 0.6678940057754517, |
|
"learning_rate": 0.00039637841389085493, |
|
"loss": 2.2287, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.30795097562054774, |
|
"grad_norm": 0.6060863733291626, |
|
"learning_rate": 0.0003958093630057131, |
|
"loss": 2.2472, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.3088358922171585, |
|
"grad_norm": 0.4979764521121979, |
|
"learning_rate": 0.0003952391648949757, |
|
"loss": 2.2204, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.3097208088137693, |
|
"grad_norm": 0.44741326570510864, |
|
"learning_rate": 0.0003946678240449515, |
|
"loss": 2.2048, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.31060572541038006, |
|
"grad_norm": 0.48008590936660767, |
|
"learning_rate": 0.00039409534495094076, |
|
"loss": 2.2155, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.31149064200699084, |
|
"grad_norm": 0.6374879479408264, |
|
"learning_rate": 0.0003935217321171992, |
|
"loss": 2.2279, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.3123755586036016, |
|
"grad_norm": 0.3989739716053009, |
|
"learning_rate": 0.000392946990056903, |
|
"loss": 2.239, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.3132604752002124, |
|
"grad_norm": 0.44668588042259216, |
|
"learning_rate": 0.000392371123292113, |
|
"loss": 2.2256, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.31414539179682316, |
|
"grad_norm": 0.5262216329574585, |
|
"learning_rate": 0.00039179413635373895, |
|
"loss": 2.2257, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.31503030839343393, |
|
"grad_norm": 0.5311539173126221, |
|
"learning_rate": 0.00039121603378150445, |
|
"loss": 2.2436, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.3159152249900447, |
|
"grad_norm": 0.39581969380378723, |
|
"learning_rate": 0.0003906368201239106, |
|
"loss": 2.2165, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.3168001415866555, |
|
"grad_norm": 0.7394521236419678, |
|
"learning_rate": 0.0003900564999382007, |
|
"loss": 2.2223, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.3176850581832662, |
|
"grad_norm": 0.6843695044517517, |
|
"learning_rate": 0.0003894750777903242, |
|
"loss": 2.2278, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.318569974779877, |
|
"grad_norm": 0.5867041349411011, |
|
"learning_rate": 0.00038889255825490053, |
|
"loss": 2.2326, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.31945489137648775, |
|
"grad_norm": 0.442703515291214, |
|
"learning_rate": 0.0003883089459151837, |
|
"loss": 2.2093, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.3203398079730985, |
|
"grad_norm": 0.6596940755844116, |
|
"learning_rate": 0.0003877242453630256, |
|
"loss": 2.2084, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.3212247245697093, |
|
"grad_norm": 0.42338627576828003, |
|
"learning_rate": 0.00038713846119884033, |
|
"loss": 2.2328, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.32210964116632007, |
|
"grad_norm": 0.48313215374946594, |
|
"learning_rate": 0.0003865515980315677, |
|
"loss": 2.1973, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.32299455776293085, |
|
"grad_norm": 0.6299489736557007, |
|
"learning_rate": 0.0003859636604786372, |
|
"loss": 2.2255, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.3238794743595416, |
|
"grad_norm": 0.4562525153160095, |
|
"learning_rate": 0.00038537465316593146, |
|
"loss": 2.2053, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.3247643909561524, |
|
"grad_norm": 0.4125101566314697, |
|
"learning_rate": 0.0003847845807277501, |
|
"loss": 2.2179, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.32564930755276317, |
|
"grad_norm": 0.531594455242157, |
|
"learning_rate": 0.000384193447806773, |
|
"loss": 2.2211, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.32653422414937394, |
|
"grad_norm": 0.36836495995521545, |
|
"learning_rate": 0.00038360125905402396, |
|
"loss": 2.2381, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.3274191407459847, |
|
"grad_norm": 0.4548482298851013, |
|
"learning_rate": 0.00038300801912883415, |
|
"loss": 2.2213, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.32830405734259543, |
|
"grad_norm": 0.6590666174888611, |
|
"learning_rate": 0.00038241373269880507, |
|
"loss": 2.2244, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.3291889739392062, |
|
"grad_norm": 0.41382184624671936, |
|
"learning_rate": 0.0003818184044397725, |
|
"loss": 2.2275, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.330073890535817, |
|
"grad_norm": 0.4768171012401581, |
|
"learning_rate": 0.0003812220390357689, |
|
"loss": 2.2221, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.33095880713242776, |
|
"grad_norm": 0.5649170875549316, |
|
"learning_rate": 0.0003806246411789872, |
|
"loss": 2.2368, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.33184372372903853, |
|
"grad_norm": 0.7073878645896912, |
|
"learning_rate": 0.00038002621556974364, |
|
"loss": 2.2258, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.3327286403256493, |
|
"grad_norm": 0.4342755675315857, |
|
"learning_rate": 0.0003794267669164408, |
|
"loss": 2.2217, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.3336135569222601, |
|
"grad_norm": 0.43126773834228516, |
|
"learning_rate": 0.0003788262999355304, |
|
"loss": 2.2547, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.33449847351887085, |
|
"grad_norm": 0.4855419993400574, |
|
"learning_rate": 0.00037822481935147656, |
|
"loss": 2.2363, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.3353833901154816, |
|
"grad_norm": 0.39136576652526855, |
|
"learning_rate": 0.00037762232989671827, |
|
"loss": 2.2248, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.3362683067120924, |
|
"grad_norm": 0.5595036745071411, |
|
"learning_rate": 0.0003770188363116324, |
|
"loss": 2.2123, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.3371532233087032, |
|
"grad_norm": 0.4321103096008301, |
|
"learning_rate": 0.0003764143433444962, |
|
"loss": 2.2066, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.33803813990531395, |
|
"grad_norm": 0.4474160075187683, |
|
"learning_rate": 0.00037580885575145005, |
|
"loss": 2.2289, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.33892305650192467, |
|
"grad_norm": 0.46913856267929077, |
|
"learning_rate": 0.0003752023782964601, |
|
"loss": 2.2222, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.33980797309853544, |
|
"grad_norm": 0.516632080078125, |
|
"learning_rate": 0.00037459491575128075, |
|
"loss": 2.2166, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.3406928896951462, |
|
"grad_norm": 0.5410568118095398, |
|
"learning_rate": 0.000373986472895417, |
|
"loss": 2.234, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.341577806291757, |
|
"grad_norm": 0.685844361782074, |
|
"learning_rate": 0.0003733770545160867, |
|
"loss": 2.2179, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.34246272288836777, |
|
"grad_norm": 0.40717822313308716, |
|
"learning_rate": 0.0003727666654081836, |
|
"loss": 2.2165, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.34334763948497854, |
|
"grad_norm": 0.49223434925079346, |
|
"learning_rate": 0.0003721553103742388, |
|
"loss": 2.2215, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.3442325560815893, |
|
"grad_norm": 0.4947943687438965, |
|
"learning_rate": 0.00037154299422438315, |
|
"loss": 2.2213, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.3451174726782001, |
|
"grad_norm": 0.6207186579704285, |
|
"learning_rate": 0.00037092972177631, |
|
"loss": 2.2237, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.34600238927481086, |
|
"grad_norm": 0.5417460799217224, |
|
"learning_rate": 0.00037031549785523633, |
|
"loss": 2.2149, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.34688730587142164, |
|
"grad_norm": 0.5263400077819824, |
|
"learning_rate": 0.0003697003272938657, |
|
"loss": 2.212, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.3477722224680324, |
|
"grad_norm": 0.40013402700424194, |
|
"learning_rate": 0.00036908421493234963, |
|
"loss": 2.2225, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.3486571390646432, |
|
"grad_norm": 0.5232884883880615, |
|
"learning_rate": 0.00036846716561824967, |
|
"loss": 2.2182, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.3495420556612539, |
|
"grad_norm": 0.3998386263847351, |
|
"learning_rate": 0.0003678491842064995, |
|
"loss": 2.224, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.3504269722578647, |
|
"grad_norm": 0.45659396052360535, |
|
"learning_rate": 0.0003672302755593661, |
|
"loss": 2.2309, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.35131188885447545, |
|
"grad_norm": 0.8522054553031921, |
|
"learning_rate": 0.00036661044454641255, |
|
"loss": 2.189, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.3521968054510862, |
|
"grad_norm": 0.5786595344543457, |
|
"learning_rate": 0.00036598969604445856, |
|
"loss": 2.2328, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.353081722047697, |
|
"grad_norm": 0.6108263731002808, |
|
"learning_rate": 0.00036536803493754285, |
|
"loss": 2.2325, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.3539666386443078, |
|
"grad_norm": 0.4051169753074646, |
|
"learning_rate": 0.00036474546611688443, |
|
"loss": 2.2336, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.3539666386443078, |
|
"eval_accuracy": 0.5453820812311378, |
|
"eval_loss": 2.1286191940307617, |
|
"eval_runtime": 12.2098, |
|
"eval_samples_per_second": 26.045, |
|
"eval_steps_per_second": 0.41, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.35485155524091855, |
|
"grad_norm": 0.6282593011856079, |
|
"learning_rate": 0.0003641219944808443, |
|
"loss": 2.221, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.3557364718375293, |
|
"grad_norm": 0.7185233235359192, |
|
"learning_rate": 0.00036349762493488667, |
|
"loss": 2.2076, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.3566213884341401, |
|
"grad_norm": 0.7254645228385925, |
|
"learning_rate": 0.00036287236239154064, |
|
"loss": 2.2315, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.35750630503075087, |
|
"grad_norm": 0.6520965099334717, |
|
"learning_rate": 0.00036224621177036116, |
|
"loss": 2.2236, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.35839122162736164, |
|
"grad_norm": 0.5554526448249817, |
|
"learning_rate": 0.0003616191779978907, |
|
"loss": 2.2314, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.35927613822397236, |
|
"grad_norm": 0.6766877174377441, |
|
"learning_rate": 0.00036099126600762057, |
|
"loss": 2.2228, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.36016105482058314, |
|
"grad_norm": 0.43382716178894043, |
|
"learning_rate": 0.00036036248073995135, |
|
"loss": 2.217, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.3610459714171939, |
|
"grad_norm": 0.4572659134864807, |
|
"learning_rate": 0.0003597328271421551, |
|
"loss": 2.2222, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.3619308880138047, |
|
"grad_norm": 0.6247086524963379, |
|
"learning_rate": 0.0003591023101683355, |
|
"loss": 2.2301, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.36281580461041546, |
|
"grad_norm": 0.3971577286720276, |
|
"learning_rate": 0.00035847093477938953, |
|
"loss": 2.2116, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.36370072120702623, |
|
"grad_norm": 0.4561915099620819, |
|
"learning_rate": 0.00035783870594296795, |
|
"loss": 2.2151, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.364585637803637, |
|
"grad_norm": 0.6701768636703491, |
|
"learning_rate": 0.0003572056286334366, |
|
"loss": 2.2285, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.3654705544002478, |
|
"grad_norm": 0.49255579710006714, |
|
"learning_rate": 0.000356571707831837, |
|
"loss": 2.2201, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.36635547099685856, |
|
"grad_norm": 0.547238290309906, |
|
"learning_rate": 0.00035593694852584717, |
|
"loss": 2.2295, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.36724038759346933, |
|
"grad_norm": 0.6822459697723389, |
|
"learning_rate": 0.0003553013557097428, |
|
"loss": 2.2206, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.3681253041900801, |
|
"grad_norm": 0.6047173738479614, |
|
"learning_rate": 0.00035466493438435703, |
|
"loss": 2.2155, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.3690102207866909, |
|
"grad_norm": 0.43247050046920776, |
|
"learning_rate": 0.0003540276895570424, |
|
"loss": 2.2315, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.3698951373833016, |
|
"grad_norm": 0.495980441570282, |
|
"learning_rate": 0.0003533896262416302, |
|
"loss": 2.2295, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.37078005397991237, |
|
"grad_norm": 0.5255789756774902, |
|
"learning_rate": 0.00035275074945839187, |
|
"loss": 2.2338, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.37166497057652315, |
|
"grad_norm": 0.5229047536849976, |
|
"learning_rate": 0.0003521110642339991, |
|
"loss": 2.2253, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.3725498871731339, |
|
"grad_norm": 0.42177897691726685, |
|
"learning_rate": 0.00035147057560148433, |
|
"loss": 2.2267, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.3734348037697447, |
|
"grad_norm": 0.45331940054893494, |
|
"learning_rate": 0.0003508292886002013, |
|
"loss": 2.2153, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.37431972036635547, |
|
"grad_norm": 0.6908669471740723, |
|
"learning_rate": 0.0003501872082757852, |
|
"loss": 2.2118, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.37520463696296624, |
|
"grad_norm": 0.4515652656555176, |
|
"learning_rate": 0.00034954433968011333, |
|
"loss": 2.2165, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.376089553559577, |
|
"grad_norm": 0.39639732241630554, |
|
"learning_rate": 0.00034890068787126475, |
|
"loss": 2.2182, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.3769744701561878, |
|
"grad_norm": 0.4517054259777069, |
|
"learning_rate": 0.0003482562579134809, |
|
"loss": 2.2161, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.37785938675279857, |
|
"grad_norm": 0.4925696551799774, |
|
"learning_rate": 0.0003476110548771259, |
|
"loss": 2.2094, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.37874430334940934, |
|
"grad_norm": 0.5564557909965515, |
|
"learning_rate": 0.00034696508383864633, |
|
"loss": 2.1985, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.3796292199460201, |
|
"grad_norm": 0.5726402401924133, |
|
"learning_rate": 0.0003463183498805312, |
|
"loss": 2.207, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.38051413654263083, |
|
"grad_norm": 0.39135950803756714, |
|
"learning_rate": 0.0003456708580912725, |
|
"loss": 2.207, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.3813990531392416, |
|
"grad_norm": 0.5821109414100647, |
|
"learning_rate": 0.0003450226135653245, |
|
"loss": 2.2369, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.3822839697358524, |
|
"grad_norm": 0.44956621527671814, |
|
"learning_rate": 0.00034437362140306424, |
|
"loss": 2.2136, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.38316888633246315, |
|
"grad_norm": 0.3989977538585663, |
|
"learning_rate": 0.000343723886710751, |
|
"loss": 2.2124, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.38405380292907393, |
|
"grad_norm": 0.5621548295021057, |
|
"learning_rate": 0.0003430734146004863, |
|
"loss": 2.2079, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.3849387195256847, |
|
"grad_norm": 0.6707894206047058, |
|
"learning_rate": 0.0003424222101901738, |
|
"loss": 2.2275, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.3858236361222955, |
|
"grad_norm": 0.4202430248260498, |
|
"learning_rate": 0.0003417702786034786, |
|
"loss": 2.2233, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.38670855271890625, |
|
"grad_norm": 0.6233255863189697, |
|
"learning_rate": 0.0003411176249697875, |
|
"loss": 2.2169, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.387593469315517, |
|
"grad_norm": 0.609015703201294, |
|
"learning_rate": 0.00034046425442416805, |
|
"loss": 2.232, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.3884783859121278, |
|
"grad_norm": 0.5517158508300781, |
|
"learning_rate": 0.0003398101721073288, |
|
"loss": 2.229, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.3893633025087386, |
|
"grad_norm": 0.6552030444145203, |
|
"learning_rate": 0.00033915538316557826, |
|
"loss": 2.2172, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.39024821910534935, |
|
"grad_norm": 0.509896993637085, |
|
"learning_rate": 0.00033849989275078473, |
|
"loss": 2.2444, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.39113313570196007, |
|
"grad_norm": 0.5120652914047241, |
|
"learning_rate": 0.0003378437060203357, |
|
"loss": 2.2113, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.39201805229857084, |
|
"grad_norm": 0.4150264859199524, |
|
"learning_rate": 0.00033718682813709715, |
|
"loss": 2.2235, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 0.3929029688951816, |
|
"grad_norm": 0.43550485372543335, |
|
"learning_rate": 0.0003365292642693733, |
|
"loss": 2.2227, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.3937878854917924, |
|
"grad_norm": 0.6145851016044617, |
|
"learning_rate": 0.00033587101959086524, |
|
"loss": 2.2307, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.39467280208840316, |
|
"grad_norm": 0.7754809260368347, |
|
"learning_rate": 0.00033521209928063123, |
|
"loss": 2.1927, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.39555771868501394, |
|
"grad_norm": 0.45462676882743835, |
|
"learning_rate": 0.0003345525085230449, |
|
"loss": 2.2267, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.3964426352816247, |
|
"grad_norm": 0.6465119123458862, |
|
"learning_rate": 0.0003338922525077553, |
|
"loss": 2.2108, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.3973275518782355, |
|
"grad_norm": 0.5132725834846497, |
|
"learning_rate": 0.00033323133642964545, |
|
"loss": 2.2237, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 0.39821246847484626, |
|
"grad_norm": 0.399767130613327, |
|
"learning_rate": 0.00033256976548879183, |
|
"loss": 2.211, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.39909738507145703, |
|
"grad_norm": 0.45235475897789, |
|
"learning_rate": 0.0003319075448904234, |
|
"loss": 2.2203, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 0.3999823016680678, |
|
"grad_norm": 0.3382948935031891, |
|
"learning_rate": 0.00033124467984488066, |
|
"loss": 2.2098, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.4008672182646785, |
|
"grad_norm": 0.5118781328201294, |
|
"learning_rate": 0.00033058117556757457, |
|
"loss": 2.2064, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.4017521348612893, |
|
"grad_norm": 0.39723286032676697, |
|
"learning_rate": 0.0003299170372789454, |
|
"loss": 2.2086, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.4026370514579001, |
|
"grad_norm": 0.46771299839019775, |
|
"learning_rate": 0.0003292522702044221, |
|
"loss": 2.2039, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.40352196805451085, |
|
"grad_norm": 0.43072032928466797, |
|
"learning_rate": 0.0003285868795743805, |
|
"loss": 2.1885, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.4044068846511216, |
|
"grad_norm": 0.5793524980545044, |
|
"learning_rate": 0.0003279208706241031, |
|
"loss": 2.2077, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 0.4052918012477324, |
|
"grad_norm": 0.4353041648864746, |
|
"learning_rate": 0.00032725424859373687, |
|
"loss": 2.2022, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.40617671784434317, |
|
"grad_norm": 0.5556246042251587, |
|
"learning_rate": 0.00032658701872825265, |
|
"loss": 2.2221, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.40706163444095395, |
|
"grad_norm": 0.4479421079158783, |
|
"learning_rate": 0.0003259191862774037, |
|
"loss": 2.2024, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.4079465510375647, |
|
"grad_norm": 0.4452154338359833, |
|
"learning_rate": 0.0003252507564956844, |
|
"loss": 2.2154, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 0.4088314676341755, |
|
"grad_norm": 0.4886869788169861, |
|
"learning_rate": 0.000324581734642289, |
|
"loss": 2.2013, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.40971638423078627, |
|
"grad_norm": 0.4508967101573944, |
|
"learning_rate": 0.0003239121259810701, |
|
"loss": 2.2229, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 0.41060130082739704, |
|
"grad_norm": 0.37643152475357056, |
|
"learning_rate": 0.00032324193578049724, |
|
"loss": 2.2062, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.41148621742400776, |
|
"grad_norm": 0.4318946897983551, |
|
"learning_rate": 0.00032257116931361555, |
|
"loss": 2.2152, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.41237113402061853, |
|
"grad_norm": 0.5894390940666199, |
|
"learning_rate": 0.0003218998318580043, |
|
"loss": 2.2079, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.4132560506172293, |
|
"grad_norm": 0.5740794539451599, |
|
"learning_rate": 0.0003212279286957352, |
|
"loss": 2.2118, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 0.4141409672138401, |
|
"grad_norm": 0.4139980673789978, |
|
"learning_rate": 0.00032055546511333075, |
|
"loss": 2.207, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.41502588381045086, |
|
"grad_norm": 0.5307886004447937, |
|
"learning_rate": 0.00031988244640172327, |
|
"loss": 2.2077, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.41591080040706163, |
|
"grad_norm": 0.5277409553527832, |
|
"learning_rate": 0.00031920887785621233, |
|
"loss": 2.2067, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.4167957170036724, |
|
"grad_norm": 0.4328935444355011, |
|
"learning_rate": 0.0003185347647764241, |
|
"loss": 2.2273, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.4176806336002832, |
|
"grad_norm": 0.4828528165817261, |
|
"learning_rate": 0.00031786011246626855, |
|
"loss": 2.2284, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.41856555019689395, |
|
"grad_norm": 0.6279691457748413, |
|
"learning_rate": 0.00031718492623389896, |
|
"loss": 2.1989, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 0.41945046679350473, |
|
"grad_norm": 0.5163887739181519, |
|
"learning_rate": 0.0003165092113916688, |
|
"loss": 2.2091, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.4203353833901155, |
|
"grad_norm": 0.38124004006385803, |
|
"learning_rate": 0.00031583297325609116, |
|
"loss": 2.209, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.4212202999867263, |
|
"grad_norm": 0.502122700214386, |
|
"learning_rate": 0.00031515621714779636, |
|
"loss": 2.1982, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.422105216583337, |
|
"grad_norm": 0.403822124004364, |
|
"learning_rate": 0.0003144789483914898, |
|
"loss": 2.2207, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.42299013317994777, |
|
"grad_norm": 0.3561129868030548, |
|
"learning_rate": 0.00031380117231591067, |
|
"loss": 2.2032, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.42387504977655854, |
|
"grad_norm": 0.6012318730354309, |
|
"learning_rate": 0.0003131228942537895, |
|
"loss": 2.1987, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 0.4247599663731693, |
|
"grad_norm": 0.5018305778503418, |
|
"learning_rate": 0.00031244411954180673, |
|
"loss": 2.2083, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.4256448829697801, |
|
"grad_norm": 0.4451483488082886, |
|
"learning_rate": 0.00031176485352055015, |
|
"loss": 2.2157, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 0.42652979956639087, |
|
"grad_norm": 0.5879373550415039, |
|
"learning_rate": 0.0003110851015344735, |
|
"loss": 2.2291, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.42741471616300164, |
|
"grad_norm": 0.5806208252906799, |
|
"learning_rate": 0.0003104048689318538, |
|
"loss": 2.2127, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.4282996327596124, |
|
"grad_norm": 0.4805116355419159, |
|
"learning_rate": 0.0003097241610647494, |
|
"loss": 2.2026, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.4291845493562232, |
|
"grad_norm": 0.45545217394828796, |
|
"learning_rate": 0.00030904298328895865, |
|
"loss": 2.2075, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.43006946595283396, |
|
"grad_norm": 0.545505166053772, |
|
"learning_rate": 0.0003083613409639764, |
|
"loss": 2.2201, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.43095438254944474, |
|
"grad_norm": 0.5077127814292908, |
|
"learning_rate": 0.00030767923945295306, |
|
"loss": 2.2317, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 0.4318392991460555, |
|
"grad_norm": 0.5988184213638306, |
|
"learning_rate": 0.00030699668412265173, |
|
"loss": 2.1995, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.43272421574266623, |
|
"grad_norm": 0.4757969081401825, |
|
"learning_rate": 0.00030631368034340624, |
|
"loss": 2.1997, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.433609132339277, |
|
"grad_norm": 0.4290432631969452, |
|
"learning_rate": 0.0003056302334890786, |
|
"loss": 2.2172, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.4344940489358878, |
|
"grad_norm": 0.48155564069747925, |
|
"learning_rate": 0.00030494634893701725, |
|
"loss": 2.2104, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 0.43537896553249855, |
|
"grad_norm": 0.47956278920173645, |
|
"learning_rate": 0.00030426203206801406, |
|
"loss": 2.1989, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.4362638821291093, |
|
"grad_norm": 0.433569073677063, |
|
"learning_rate": 0.00030357728826626266, |
|
"loss": 2.2159, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 0.4371487987257201, |
|
"grad_norm": 0.4097784757614136, |
|
"learning_rate": 0.0003028921229193157, |
|
"loss": 2.2198, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.4380337153223309, |
|
"grad_norm": 0.3822705149650574, |
|
"learning_rate": 0.00030220654141804247, |
|
"loss": 2.2031, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.43891863191894165, |
|
"grad_norm": 0.5066282749176025, |
|
"learning_rate": 0.00030152054915658663, |
|
"loss": 2.2007, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.4398035485155524, |
|
"grad_norm": 0.3919082283973694, |
|
"learning_rate": 0.0003008341515323235, |
|
"loss": 2.2084, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 0.4406884651121632, |
|
"grad_norm": 0.40400826930999756, |
|
"learning_rate": 0.0003001473539458182, |
|
"loss": 2.2039, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.44157338170877397, |
|
"grad_norm": 0.44832319021224976, |
|
"learning_rate": 0.00029946016180078234, |
|
"loss": 2.1997, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 0.44245829830538475, |
|
"grad_norm": 0.42437443137168884, |
|
"learning_rate": 0.0002987725805040321, |
|
"loss": 2.21, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.44245829830538475, |
|
"eval_accuracy": 0.5471037697452792, |
|
"eval_loss": 2.1154496669769287, |
|
"eval_runtime": 12.1652, |
|
"eval_samples_per_second": 26.14, |
|
"eval_steps_per_second": 0.411, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.44334321490199546, |
|
"grad_norm": 0.44694674015045166, |
|
"learning_rate": 0.0002980846154654455, |
|
"loss": 2.2418, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 0.44422813149860624, |
|
"grad_norm": 0.5123154520988464, |
|
"learning_rate": 0.0002973962720979196, |
|
"loss": 2.1966, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 0.445113048095217, |
|
"grad_norm": 0.452332466840744, |
|
"learning_rate": 0.0002967075558173287, |
|
"loss": 2.2079, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 0.4459979646918278, |
|
"grad_norm": 0.42223265767097473, |
|
"learning_rate": 0.00029601847204248046, |
|
"loss": 2.1987, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.44688288128843856, |
|
"grad_norm": 0.37268826365470886, |
|
"learning_rate": 0.00029532902619507464, |
|
"loss": 2.2177, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.44776779788504933, |
|
"grad_norm": 0.45287415385246277, |
|
"learning_rate": 0.0002946392236996592, |
|
"loss": 2.2085, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 0.4486527144816601, |
|
"grad_norm": 0.3807585835456848, |
|
"learning_rate": 0.0002939490699835887, |
|
"loss": 2.2083, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 0.4495376310782709, |
|
"grad_norm": 0.45550355315208435, |
|
"learning_rate": 0.00029325857047698067, |
|
"loss": 2.195, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 0.45042254767488166, |
|
"grad_norm": 0.45763885974884033, |
|
"learning_rate": 0.00029256773061267375, |
|
"loss": 2.21, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 0.45130746427149243, |
|
"grad_norm": 0.3954174816608429, |
|
"learning_rate": 0.0002918765558261841, |
|
"loss": 2.2278, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.4521923808681032, |
|
"grad_norm": 0.4900796413421631, |
|
"learning_rate": 0.00029118505155566334, |
|
"loss": 2.2106, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 0.4530772974647139, |
|
"grad_norm": 0.6190772652626038, |
|
"learning_rate": 0.00029049322324185524, |
|
"loss": 2.2257, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 0.4539622140613247, |
|
"grad_norm": 0.4596320688724518, |
|
"learning_rate": 0.0002898010763280533, |
|
"loss": 2.2085, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 0.4548471306579355, |
|
"grad_norm": 0.391040563583374, |
|
"learning_rate": 0.00028910861626005774, |
|
"loss": 2.2019, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 0.45573204725454625, |
|
"grad_norm": 0.6000839471817017, |
|
"learning_rate": 0.0002884158484861325, |
|
"loss": 2.1821, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.456616963851157, |
|
"grad_norm": 0.4674588739871979, |
|
"learning_rate": 0.00028772277845696287, |
|
"loss": 2.1972, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 0.4575018804477678, |
|
"grad_norm": 0.6962230801582336, |
|
"learning_rate": 0.0002870294116256119, |
|
"loss": 2.2126, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 0.45838679704437857, |
|
"grad_norm": 0.5181962251663208, |
|
"learning_rate": 0.0002863357534474782, |
|
"loss": 2.1981, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 0.45927171364098934, |
|
"grad_norm": 0.7072561979293823, |
|
"learning_rate": 0.0002856418093802525, |
|
"loss": 2.2286, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 0.4601566302376001, |
|
"grad_norm": 0.4124125838279724, |
|
"learning_rate": 0.0002849475848838749, |
|
"loss": 2.1959, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.4610415468342109, |
|
"grad_norm": 0.4569956660270691, |
|
"learning_rate": 0.00028425308542049207, |
|
"loss": 2.2136, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 0.46192646343082167, |
|
"grad_norm": 0.43784773349761963, |
|
"learning_rate": 0.0002835583164544139, |
|
"loss": 2.2099, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 0.46281138002743244, |
|
"grad_norm": 0.4107704758644104, |
|
"learning_rate": 0.00028286328345207096, |
|
"loss": 2.1818, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 0.46369629662404316, |
|
"grad_norm": 0.4356732964515686, |
|
"learning_rate": 0.000282167991881971, |
|
"loss": 2.2151, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 0.46458121322065393, |
|
"grad_norm": 0.4446347653865814, |
|
"learning_rate": 0.00028147244721465637, |
|
"loss": 2.2017, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.4654661298172647, |
|
"grad_norm": 0.3608684539794922, |
|
"learning_rate": 0.00028077665492266075, |
|
"loss": 2.2149, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 0.4663510464138755, |
|
"grad_norm": 0.6555531620979309, |
|
"learning_rate": 0.000280080620480466, |
|
"loss": 2.2124, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 0.46723596301048625, |
|
"grad_norm": 0.5321036577224731, |
|
"learning_rate": 0.00027938434936445943, |
|
"loss": 2.1878, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 0.46812087960709703, |
|
"grad_norm": 0.5092534422874451, |
|
"learning_rate": 0.00027868784705289024, |
|
"loss": 2.1893, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 0.4690057962037078, |
|
"grad_norm": 0.65367192029953, |
|
"learning_rate": 0.00027799111902582696, |
|
"loss": 2.1994, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.4698907128003186, |
|
"grad_norm": 0.44738465547561646, |
|
"learning_rate": 0.0002772941707651138, |
|
"loss": 2.2117, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 0.47077562939692935, |
|
"grad_norm": 0.3654314875602722, |
|
"learning_rate": 0.00027659700775432784, |
|
"loss": 2.1912, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 0.4716605459935401, |
|
"grad_norm": 0.6195691227912903, |
|
"learning_rate": 0.00027589963547873585, |
|
"loss": 2.1965, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 0.4725454625901509, |
|
"grad_norm": 0.40310239791870117, |
|
"learning_rate": 0.0002752020594252511, |
|
"loss": 2.2075, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 0.4734303791867617, |
|
"grad_norm": 0.43661314249038696, |
|
"learning_rate": 0.0002745042850823902, |
|
"loss": 2.1909, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.4743152957833724, |
|
"grad_norm": 0.67514568567276, |
|
"learning_rate": 0.00027380631794022967, |
|
"loss": 2.192, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 0.47520021237998317, |
|
"grad_norm": 0.6213112473487854, |
|
"learning_rate": 0.0002731081634903633, |
|
"loss": 2.205, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 0.47608512897659394, |
|
"grad_norm": 0.6548684239387512, |
|
"learning_rate": 0.0002724098272258584, |
|
"loss": 2.2097, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 0.4769700455732047, |
|
"grad_norm": 0.412087619304657, |
|
"learning_rate": 0.0002717113146412129, |
|
"loss": 2.1935, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 0.4778549621698155, |
|
"grad_norm": 0.49308520555496216, |
|
"learning_rate": 0.0002710126312323119, |
|
"loss": 2.1952, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.47873987876642626, |
|
"grad_norm": 0.6524165272712708, |
|
"learning_rate": 0.00027031378249638474, |
|
"loss": 2.2203, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 0.47962479536303704, |
|
"grad_norm": 0.4079228341579437, |
|
"learning_rate": 0.00026961477393196127, |
|
"loss": 2.211, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 0.4805097119596478, |
|
"grad_norm": 0.46248430013656616, |
|
"learning_rate": 0.0002689156110388292, |
|
"loss": 2.2164, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 0.4813946285562586, |
|
"grad_norm": 0.4539341628551483, |
|
"learning_rate": 0.0002682162993179901, |
|
"loss": 2.2176, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 0.48227954515286936, |
|
"grad_norm": 0.5375662446022034, |
|
"learning_rate": 0.00026751684427161684, |
|
"loss": 2.2197, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.48316446174948013, |
|
"grad_norm": 0.4172353148460388, |
|
"learning_rate": 0.00026681725140300993, |
|
"loss": 2.1972, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 0.4840493783460909, |
|
"grad_norm": 0.4539187252521515, |
|
"learning_rate": 0.0002661175262165541, |
|
"loss": 2.2104, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 0.4849342949427016, |
|
"grad_norm": 0.6489385962486267, |
|
"learning_rate": 0.0002654176742176754, |
|
"loss": 2.2101, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 0.4858192115393124, |
|
"grad_norm": 0.40475186705589294, |
|
"learning_rate": 0.00026471770091279724, |
|
"loss": 2.2076, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 0.4867041281359232, |
|
"grad_norm": 0.4963974356651306, |
|
"learning_rate": 0.00026401761180929796, |
|
"loss": 2.2017, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.48758904473253395, |
|
"grad_norm": 0.40801239013671875, |
|
"learning_rate": 0.0002633174124154666, |
|
"loss": 2.2242, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 0.4884739613291447, |
|
"grad_norm": 0.3968900144100189, |
|
"learning_rate": 0.0002626171082404602, |
|
"loss": 2.1925, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 0.4893588779257555, |
|
"grad_norm": 0.42560887336730957, |
|
"learning_rate": 0.0002619167047942602, |
|
"loss": 2.1916, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 0.49024379452236627, |
|
"grad_norm": 0.42510178685188293, |
|
"learning_rate": 0.00026121620758762877, |
|
"loss": 2.1896, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 0.49112871111897705, |
|
"grad_norm": 0.34354716539382935, |
|
"learning_rate": 0.0002605156221320663, |
|
"loss": 2.1943, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.4920136277155878, |
|
"grad_norm": 0.617936372756958, |
|
"learning_rate": 0.00025981495393976716, |
|
"loss": 2.2124, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 0.4928985443121986, |
|
"grad_norm": 0.6653580069541931, |
|
"learning_rate": 0.00025911420852357695, |
|
"loss": 2.2019, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 0.49378346090880937, |
|
"grad_norm": 0.38025906682014465, |
|
"learning_rate": 0.0002584133913969485, |
|
"loss": 2.1852, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 0.4946683775054201, |
|
"grad_norm": 0.38345324993133545, |
|
"learning_rate": 0.0002577125080738993, |
|
"loss": 2.1998, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 0.49555329410203086, |
|
"grad_norm": 0.41947296261787415, |
|
"learning_rate": 0.00025701156406896723, |
|
"loss": 2.2045, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.49643821069864164, |
|
"grad_norm": 0.6970186233520508, |
|
"learning_rate": 0.0002563105648971681, |
|
"loss": 2.1963, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 0.4973231272952524, |
|
"grad_norm": 0.4399164021015167, |
|
"learning_rate": 0.00025560951607395127, |
|
"loss": 2.2119, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 0.4982080438918632, |
|
"grad_norm": 0.45648398995399475, |
|
"learning_rate": 0.00025490842311515704, |
|
"loss": 2.1836, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 0.49909296048847396, |
|
"grad_norm": 0.43257445096969604, |
|
"learning_rate": 0.00025420729153697306, |
|
"loss": 2.2201, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 0.49997787708508473, |
|
"grad_norm": 0.5123971104621887, |
|
"learning_rate": 0.00025350612685589056, |
|
"loss": 2.1969, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 0.5008627936816955, |
|
"grad_norm": 0.6617570519447327, |
|
"learning_rate": 0.0002528049345886615, |
|
"loss": 2.1973, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 0.5017477102783062, |
|
"grad_norm": 0.35327666997909546, |
|
"learning_rate": 0.0002521037202522546, |
|
"loss": 2.2027, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 0.502632626874917, |
|
"grad_norm": 0.5147636532783508, |
|
"learning_rate": 0.00025140248936381246, |
|
"loss": 2.2133, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 0.5035175434715278, |
|
"grad_norm": 0.4274057447910309, |
|
"learning_rate": 0.0002507012474406077, |
|
"loss": 2.195, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 0.5044024600681386, |
|
"grad_norm": 0.40779900550842285, |
|
"learning_rate": 0.00025, |
|
"loss": 2.214, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.5052873766647493, |
|
"grad_norm": 0.6237518191337585, |
|
"learning_rate": 0.00024929875255939236, |
|
"loss": 2.2385, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 0.5061722932613602, |
|
"grad_norm": 0.4006004333496094, |
|
"learning_rate": 0.0002485975106361876, |
|
"loss": 2.211, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 0.5070572098579709, |
|
"grad_norm": 0.5931833982467651, |
|
"learning_rate": 0.0002478962797477455, |
|
"loss": 2.2033, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 0.5079421264545817, |
|
"grad_norm": 0.5313906073570251, |
|
"learning_rate": 0.00024719506541133853, |
|
"loss": 2.2076, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 0.5088270430511924, |
|
"grad_norm": 0.3830896317958832, |
|
"learning_rate": 0.00024649387314410945, |
|
"loss": 2.1989, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.5097119596478032, |
|
"grad_norm": 0.35289981961250305, |
|
"learning_rate": 0.00024579270846302695, |
|
"loss": 2.2155, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 0.510596876244414, |
|
"grad_norm": 0.4024136960506439, |
|
"learning_rate": 0.00024509157688484297, |
|
"loss": 2.184, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 0.5114817928410247, |
|
"grad_norm": 0.5324482321739197, |
|
"learning_rate": 0.0002443904839260488, |
|
"loss": 2.2059, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 0.5123667094376355, |
|
"grad_norm": 0.3289058208465576, |
|
"learning_rate": 0.000243689435102832, |
|
"loss": 2.1914, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 0.5132516260342462, |
|
"grad_norm": 0.4984874725341797, |
|
"learning_rate": 0.00024298843593103278, |
|
"loss": 2.1879, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.5141365426308571, |
|
"grad_norm": 0.4327697455883026, |
|
"learning_rate": 0.0002422874919261008, |
|
"loss": 2.1919, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 0.5150214592274678, |
|
"grad_norm": 0.5142413973808289, |
|
"learning_rate": 0.0002415866086030516, |
|
"loss": 2.2187, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 0.5159063758240786, |
|
"grad_norm": 0.3675195574760437, |
|
"learning_rate": 0.00024088579147642317, |
|
"loss": 2.2138, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 0.5167912924206893, |
|
"grad_norm": 0.44548624753952026, |
|
"learning_rate": 0.00024018504606023293, |
|
"loss": 2.1809, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 0.5176762090173002, |
|
"grad_norm": 0.39151784777641296, |
|
"learning_rate": 0.0002394843778679338, |
|
"loss": 2.1724, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 0.5185611256139109, |
|
"grad_norm": 0.519999086856842, |
|
"learning_rate": 0.00023878379241237135, |
|
"loss": 2.1958, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 0.5194460422105217, |
|
"grad_norm": 0.467977911233902, |
|
"learning_rate": 0.00023808329520573997, |
|
"loss": 2.1986, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 0.5203309588071324, |
|
"grad_norm": 0.3409745693206787, |
|
"learning_rate": 0.00023738289175953976, |
|
"loss": 2.1951, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 0.5212158754037431, |
|
"grad_norm": 0.41722747683525085, |
|
"learning_rate": 0.00023668258758453338, |
|
"loss": 2.1927, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 0.522100792000354, |
|
"grad_norm": 0.44009125232696533, |
|
"learning_rate": 0.00023598238819070203, |
|
"loss": 2.1973, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.5229857085969647, |
|
"grad_norm": 0.3806867301464081, |
|
"learning_rate": 0.00023528229908720272, |
|
"loss": 2.2134, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 0.5238706251935755, |
|
"grad_norm": 0.3461344242095947, |
|
"learning_rate": 0.00023458232578232462, |
|
"loss": 2.18, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 0.5247555417901862, |
|
"grad_norm": 0.3539530634880066, |
|
"learning_rate": 0.0002338824737834459, |
|
"loss": 2.1847, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 0.5256404583867971, |
|
"grad_norm": 0.3867835998535156, |
|
"learning_rate": 0.0002331827485969901, |
|
"loss": 2.2008, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 0.5265253749834078, |
|
"grad_norm": 0.47032880783081055, |
|
"learning_rate": 0.00023248315572838317, |
|
"loss": 2.2001, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 0.5274102915800186, |
|
"grad_norm": 0.39191240072250366, |
|
"learning_rate": 0.00023178370068201, |
|
"loss": 2.1971, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 0.5282952081766293, |
|
"grad_norm": 0.3935898542404175, |
|
"learning_rate": 0.0002310843889611709, |
|
"loss": 2.2009, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 0.5291801247732402, |
|
"grad_norm": 0.3939705789089203, |
|
"learning_rate": 0.0002303852260680388, |
|
"loss": 2.1976, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 0.5300650413698509, |
|
"grad_norm": 0.37949132919311523, |
|
"learning_rate": 0.00022968621750361532, |
|
"loss": 2.2045, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 0.5309499579664617, |
|
"grad_norm": 0.5454613566398621, |
|
"learning_rate": 0.00022898736876768815, |
|
"loss": 2.1945, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.5309499579664617, |
|
"eval_accuracy": 0.5490304943135131, |
|
"eval_loss": 2.104365110397339, |
|
"eval_runtime": 11.2034, |
|
"eval_samples_per_second": 28.384, |
|
"eval_steps_per_second": 0.446, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.5318348745630724, |
|
"grad_norm": 0.4849005341529846, |
|
"learning_rate": 0.00022828868535878713, |
|
"loss": 2.1681, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 0.5327197911596832, |
|
"grad_norm": 0.5288079380989075, |
|
"learning_rate": 0.00022759017277414165, |
|
"loss": 2.1923, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 0.533604707756294, |
|
"grad_norm": 0.37164533138275146, |
|
"learning_rate": 0.0002268918365096367, |
|
"loss": 2.2136, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 0.5344896243529047, |
|
"grad_norm": 0.35533079504966736, |
|
"learning_rate": 0.00022619368205977036, |
|
"loss": 2.21, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 0.5353745409495155, |
|
"grad_norm": 0.3953758180141449, |
|
"learning_rate": 0.00022549571491760985, |
|
"loss": 2.2034, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 0.5362594575461263, |
|
"grad_norm": 0.41037318110466003, |
|
"learning_rate": 0.0002247979405747489, |
|
"loss": 2.1955, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 0.5371443741427371, |
|
"grad_norm": 0.46826791763305664, |
|
"learning_rate": 0.00022410036452126417, |
|
"loss": 2.1646, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 0.5380292907393478, |
|
"grad_norm": 0.6032235622406006, |
|
"learning_rate": 0.00022340299224567217, |
|
"loss": 2.2023, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 0.5389142073359586, |
|
"grad_norm": 0.41023147106170654, |
|
"learning_rate": 0.00022270582923488626, |
|
"loss": 2.2043, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 0.5397991239325693, |
|
"grad_norm": 0.44359543919563293, |
|
"learning_rate": 0.00022200888097417305, |
|
"loss": 2.1805, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.5406840405291802, |
|
"grad_norm": 0.3892384469509125, |
|
"learning_rate": 0.00022131215294710977, |
|
"loss": 2.1955, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 0.5415689571257909, |
|
"grad_norm": 0.3548850417137146, |
|
"learning_rate": 0.00022061565063554063, |
|
"loss": 2.1924, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 0.5424538737224016, |
|
"grad_norm": 0.4247482419013977, |
|
"learning_rate": 0.00021991937951953405, |
|
"loss": 2.1926, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 0.5433387903190124, |
|
"grad_norm": 0.43686312437057495, |
|
"learning_rate": 0.00021922334507733931, |
|
"loss": 2.187, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 0.5442237069156232, |
|
"grad_norm": 0.4155753552913666, |
|
"learning_rate": 0.0002185275527853437, |
|
"loss": 2.2047, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 0.545108623512234, |
|
"grad_norm": 0.4141368865966797, |
|
"learning_rate": 0.00021783200811802906, |
|
"loss": 2.1979, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 0.5459935401088447, |
|
"grad_norm": 0.4047829210758209, |
|
"learning_rate": 0.00021713671654792916, |
|
"loss": 2.1808, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 0.5468784567054555, |
|
"grad_norm": 0.4952094554901123, |
|
"learning_rate": 0.0002164416835455862, |
|
"loss": 2.2289, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 0.5477633733020663, |
|
"grad_norm": 0.34553036093711853, |
|
"learning_rate": 0.00021574691457950805, |
|
"loss": 2.1779, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 0.5486482898986771, |
|
"grad_norm": 0.37532058358192444, |
|
"learning_rate": 0.00021505241511612523, |
|
"loss": 2.1995, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.5495332064952878, |
|
"grad_norm": 0.5026415586471558, |
|
"learning_rate": 0.0002143581906197476, |
|
"loss": 2.199, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 0.5504181230918986, |
|
"grad_norm": 0.4540441036224365, |
|
"learning_rate": 0.0002136642465525219, |
|
"loss": 2.1923, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 0.5513030396885094, |
|
"grad_norm": 0.43957459926605225, |
|
"learning_rate": 0.0002129705883743881, |
|
"loss": 2.1816, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 0.5521879562851201, |
|
"grad_norm": 0.3559401333332062, |
|
"learning_rate": 0.00021227722154303714, |
|
"loss": 2.1891, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 0.5530728728817309, |
|
"grad_norm": 0.5275906324386597, |
|
"learning_rate": 0.00021158415151386746, |
|
"loss": 2.2062, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 0.5539577894783416, |
|
"grad_norm": 0.44601666927337646, |
|
"learning_rate": 0.00021089138373994224, |
|
"loss": 2.1942, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 0.5548427060749525, |
|
"grad_norm": 0.42079922556877136, |
|
"learning_rate": 0.0002101989236719467, |
|
"loss": 2.2039, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 0.5557276226715632, |
|
"grad_norm": 0.46110275387763977, |
|
"learning_rate": 0.0002095067767581447, |
|
"loss": 2.1989, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 0.556612539268174, |
|
"grad_norm": 0.7742722630500793, |
|
"learning_rate": 0.0002088149484443367, |
|
"loss": 2.1854, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 0.5574974558647847, |
|
"grad_norm": 0.43369126319885254, |
|
"learning_rate": 0.00020812344417381592, |
|
"loss": 2.1982, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.5583823724613955, |
|
"grad_norm": 0.4741725027561188, |
|
"learning_rate": 0.00020743226938732626, |
|
"loss": 2.2027, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 0.5592672890580063, |
|
"grad_norm": 0.3926391303539276, |
|
"learning_rate": 0.00020674142952301934, |
|
"loss": 2.1918, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 0.5601522056546171, |
|
"grad_norm": 0.410693883895874, |
|
"learning_rate": 0.00020605093001641137, |
|
"loss": 2.1705, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 0.5610371222512278, |
|
"grad_norm": 0.6874290108680725, |
|
"learning_rate": 0.00020536077630034085, |
|
"loss": 2.1878, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 0.5619220388478386, |
|
"grad_norm": 0.5000712871551514, |
|
"learning_rate": 0.00020467097380492545, |
|
"loss": 2.1787, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 0.5628069554444494, |
|
"grad_norm": 0.4700552523136139, |
|
"learning_rate": 0.00020398152795751955, |
|
"loss": 2.1964, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 0.5636918720410601, |
|
"grad_norm": 0.4930076003074646, |
|
"learning_rate": 0.00020329244418267138, |
|
"loss": 2.1953, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 0.5645767886376709, |
|
"grad_norm": 0.476241797208786, |
|
"learning_rate": 0.0002026037279020804, |
|
"loss": 2.2076, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 0.5654617052342816, |
|
"grad_norm": 0.34659117460250854, |
|
"learning_rate": 0.00020191538453455458, |
|
"loss": 2.1888, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 0.5663466218308925, |
|
"grad_norm": 0.40348589420318604, |
|
"learning_rate": 0.00020122741949596797, |
|
"loss": 2.1915, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.5672315384275032, |
|
"grad_norm": 0.5236355662345886, |
|
"learning_rate": 0.00020053983819921773, |
|
"loss": 2.2073, |
|
"step": 6410 |
|
}, |
|
{ |
|
"epoch": 0.568116455024114, |
|
"grad_norm": 0.3683709502220154, |
|
"learning_rate": 0.00019985264605418181, |
|
"loss": 2.1811, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 0.5690013716207247, |
|
"grad_norm": 0.349714457988739, |
|
"learning_rate": 0.00019916584846767652, |
|
"loss": 2.1851, |
|
"step": 6430 |
|
}, |
|
{ |
|
"epoch": 0.5698862882173356, |
|
"grad_norm": 0.35854193568229675, |
|
"learning_rate": 0.00019847945084341343, |
|
"loss": 2.1777, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 0.5707712048139463, |
|
"grad_norm": 0.41335052251815796, |
|
"learning_rate": 0.00019779345858195757, |
|
"loss": 2.1887, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 0.5716561214105571, |
|
"grad_norm": 0.3837135434150696, |
|
"learning_rate": 0.0001971078770806843, |
|
"loss": 2.2111, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 0.5725410380071678, |
|
"grad_norm": 0.4042917490005493, |
|
"learning_rate": 0.00019642271173373735, |
|
"loss": 2.2014, |
|
"step": 6470 |
|
}, |
|
{ |
|
"epoch": 0.5734259546037785, |
|
"grad_norm": 0.6200820803642273, |
|
"learning_rate": 0.00019573796793198595, |
|
"loss": 2.2045, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 0.5743108712003894, |
|
"grad_norm": 0.3666927218437195, |
|
"learning_rate": 0.00019505365106298284, |
|
"loss": 2.2006, |
|
"step": 6490 |
|
}, |
|
{ |
|
"epoch": 0.5751957877970001, |
|
"grad_norm": 0.3415856957435608, |
|
"learning_rate": 0.00019436976651092142, |
|
"loss": 2.1875, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.5760807043936109, |
|
"grad_norm": 0.36049988865852356, |
|
"learning_rate": 0.00019368631965659385, |
|
"loss": 2.2193, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 0.5769656209902216, |
|
"grad_norm": 0.47630825638771057, |
|
"learning_rate": 0.00019300331587734833, |
|
"loss": 2.1938, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 0.5778505375868325, |
|
"grad_norm": 0.3804795444011688, |
|
"learning_rate": 0.000192320760547047, |
|
"loss": 2.1808, |
|
"step": 6530 |
|
}, |
|
{ |
|
"epoch": 0.5787354541834432, |
|
"grad_norm": 0.49671676754951477, |
|
"learning_rate": 0.00019163865903602372, |
|
"loss": 2.1902, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 0.579620370780054, |
|
"grad_norm": 0.37095028162002563, |
|
"learning_rate": 0.0001909570167110415, |
|
"loss": 2.178, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 0.5805052873766647, |
|
"grad_norm": 0.3675883114337921, |
|
"learning_rate": 0.00019027583893525067, |
|
"loss": 2.2009, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 0.5813902039732756, |
|
"grad_norm": 0.4657158851623535, |
|
"learning_rate": 0.00018959513106814633, |
|
"loss": 2.1962, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 0.5822751205698863, |
|
"grad_norm": 0.3962918519973755, |
|
"learning_rate": 0.00018891489846552647, |
|
"loss": 2.2035, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 0.583160037166497, |
|
"grad_norm": 0.3245933949947357, |
|
"learning_rate": 0.00018823514647944977, |
|
"loss": 2.1891, |
|
"step": 6590 |
|
}, |
|
{ |
|
"epoch": 0.5840449537631078, |
|
"grad_norm": 0.33663785457611084, |
|
"learning_rate": 0.00018755588045819325, |
|
"loss": 2.1994, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.5849298703597186, |
|
"grad_norm": 0.3755311667919159, |
|
"learning_rate": 0.00018687710574621051, |
|
"loss": 2.1949, |
|
"step": 6610 |
|
}, |
|
{ |
|
"epoch": 0.5858147869563294, |
|
"grad_norm": 0.3784433603286743, |
|
"learning_rate": 0.00018619882768408937, |
|
"loss": 2.1972, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 0.5866997035529401, |
|
"grad_norm": 0.3571663796901703, |
|
"learning_rate": 0.00018552105160851018, |
|
"loss": 2.1922, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 0.5875846201495509, |
|
"grad_norm": 0.5752764344215393, |
|
"learning_rate": 0.00018484378285220365, |
|
"loss": 2.1765, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 0.5884695367461616, |
|
"grad_norm": 0.39760568737983704, |
|
"learning_rate": 0.0001841670267439088, |
|
"loss": 2.1922, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 0.5893544533427725, |
|
"grad_norm": 0.48002487421035767, |
|
"learning_rate": 0.00018349078860833125, |
|
"loss": 2.173, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 0.5902393699393832, |
|
"grad_norm": 0.4568367600440979, |
|
"learning_rate": 0.00018281507376610113, |
|
"loss": 2.2042, |
|
"step": 6670 |
|
}, |
|
{ |
|
"epoch": 0.591124286535994, |
|
"grad_norm": 0.44323500990867615, |
|
"learning_rate": 0.00018213988753373146, |
|
"loss": 2.1808, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 0.5920092031326047, |
|
"grad_norm": 0.4837689697742462, |
|
"learning_rate": 0.00018146523522357595, |
|
"loss": 2.1728, |
|
"step": 6690 |
|
}, |
|
{ |
|
"epoch": 0.5928941197292156, |
|
"grad_norm": 0.4340761601924896, |
|
"learning_rate": 0.00018079112214378768, |
|
"loss": 2.1915, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.5937790363258263, |
|
"grad_norm": 0.42073750495910645, |
|
"learning_rate": 0.00018011755359827677, |
|
"loss": 2.1906, |
|
"step": 6710 |
|
}, |
|
{ |
|
"epoch": 0.594663952922437, |
|
"grad_norm": 0.4763769805431366, |
|
"learning_rate": 0.00017944453488666928, |
|
"loss": 2.1974, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 0.5955488695190478, |
|
"grad_norm": 0.42602041363716125, |
|
"learning_rate": 0.00017877207130426488, |
|
"loss": 2.1697, |
|
"step": 6730 |
|
}, |
|
{ |
|
"epoch": 0.5964337861156586, |
|
"grad_norm": 0.7025333046913147, |
|
"learning_rate": 0.0001781001681419957, |
|
"loss": 2.1958, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 0.5973187027122694, |
|
"grad_norm": 0.46738317608833313, |
|
"learning_rate": 0.00017742883068638446, |
|
"loss": 2.1852, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 0.5982036193088801, |
|
"grad_norm": 0.4451451003551483, |
|
"learning_rate": 0.00017675806421950277, |
|
"loss": 2.1855, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 0.5990885359054909, |
|
"grad_norm": 0.3433722257614136, |
|
"learning_rate": 0.00017608787401892994, |
|
"loss": 2.1793, |
|
"step": 6770 |
|
}, |
|
{ |
|
"epoch": 0.5999734525021017, |
|
"grad_norm": 0.305349200963974, |
|
"learning_rate": 0.000175418265357711, |
|
"loss": 2.1926, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 0.6008583690987125, |
|
"grad_norm": 0.369484007358551, |
|
"learning_rate": 0.00017474924350431565, |
|
"loss": 2.1917, |
|
"step": 6790 |
|
}, |
|
{ |
|
"epoch": 0.6017432856953232, |
|
"grad_norm": 0.42088308930397034, |
|
"learning_rate": 0.00017408081372259632, |
|
"loss": 2.1814, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.602628202291934, |
|
"grad_norm": 0.396318256855011, |
|
"learning_rate": 0.00017341298127174744, |
|
"loss": 2.2017, |
|
"step": 6810 |
|
}, |
|
{ |
|
"epoch": 0.6035131188885448, |
|
"grad_norm": 0.37352317571640015, |
|
"learning_rate": 0.00017274575140626317, |
|
"loss": 2.1785, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 0.6043980354851555, |
|
"grad_norm": 0.37729695439338684, |
|
"learning_rate": 0.00017207912937589696, |
|
"loss": 2.2002, |
|
"step": 6830 |
|
}, |
|
{ |
|
"epoch": 0.6052829520817663, |
|
"grad_norm": 0.3980563282966614, |
|
"learning_rate": 0.0001714131204256195, |
|
"loss": 2.1804, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 0.606167868678377, |
|
"grad_norm": 0.3724077343940735, |
|
"learning_rate": 0.000170747729795578, |
|
"loss": 2.1873, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 0.6070527852749878, |
|
"grad_norm": 0.4318739175796509, |
|
"learning_rate": 0.00017008296272105468, |
|
"loss": 2.1726, |
|
"step": 6860 |
|
}, |
|
{ |
|
"epoch": 0.6079377018715986, |
|
"grad_norm": 0.5658116340637207, |
|
"learning_rate": 0.00016941882443242555, |
|
"loss": 2.1767, |
|
"step": 6870 |
|
}, |
|
{ |
|
"epoch": 0.6088226184682094, |
|
"grad_norm": 0.38237351179122925, |
|
"learning_rate": 0.00016875532015511944, |
|
"loss": 2.1893, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 0.6097075350648201, |
|
"grad_norm": 0.32790622115135193, |
|
"learning_rate": 0.00016809245510957666, |
|
"loss": 2.1834, |
|
"step": 6890 |
|
}, |
|
{ |
|
"epoch": 0.610592451661431, |
|
"grad_norm": 0.3833366334438324, |
|
"learning_rate": 0.00016743023451120832, |
|
"loss": 2.1983, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.6114773682580417, |
|
"grad_norm": 0.2924376130104065, |
|
"learning_rate": 0.00016676866357035467, |
|
"loss": 2.2048, |
|
"step": 6910 |
|
}, |
|
{ |
|
"epoch": 0.6123622848546525, |
|
"grad_norm": 0.4293077290058136, |
|
"learning_rate": 0.00016610774749224483, |
|
"loss": 2.189, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 0.6132472014512632, |
|
"grad_norm": 0.5160631537437439, |
|
"learning_rate": 0.0001654474914769551, |
|
"loss": 2.1937, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 0.614132118047874, |
|
"grad_norm": 0.4933619499206543, |
|
"learning_rate": 0.00016478790071936875, |
|
"loss": 2.1793, |
|
"step": 6940 |
|
}, |
|
{ |
|
"epoch": 0.6150170346444848, |
|
"grad_norm": 0.4233640134334564, |
|
"learning_rate": 0.00016412898040913472, |
|
"loss": 2.1766, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 0.6159019512410955, |
|
"grad_norm": 0.34439942240715027, |
|
"learning_rate": 0.0001634707357306267, |
|
"loss": 2.1895, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 0.6167868678377063, |
|
"grad_norm": 0.34741097688674927, |
|
"learning_rate": 0.00016281317186290283, |
|
"loss": 2.2113, |
|
"step": 6970 |
|
}, |
|
{ |
|
"epoch": 0.617671784434317, |
|
"grad_norm": 0.34368258714675903, |
|
"learning_rate": 0.0001621562939796643, |
|
"loss": 2.1917, |
|
"step": 6980 |
|
}, |
|
{ |
|
"epoch": 0.6185567010309279, |
|
"grad_norm": 0.47897687554359436, |
|
"learning_rate": 0.00016150010724921525, |
|
"loss": 2.1909, |
|
"step": 6990 |
|
}, |
|
{ |
|
"epoch": 0.6194416176275386, |
|
"grad_norm": 0.3656957149505615, |
|
"learning_rate": 0.00016084461683442175, |
|
"loss": 2.1744, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.6194416176275386, |
|
"eval_accuracy": 0.5502146351202954, |
|
"eval_loss": 2.095036029815674, |
|
"eval_runtime": 12.4508, |
|
"eval_samples_per_second": 25.541, |
|
"eval_steps_per_second": 0.402, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.6203265342241494, |
|
"grad_norm": 0.29836395382881165, |
|
"learning_rate": 0.00016018982789267123, |
|
"loss": 2.1902, |
|
"step": 7010 |
|
}, |
|
{ |
|
"epoch": 0.6212114508207601, |
|
"grad_norm": 0.5148487091064453, |
|
"learning_rate": 0.000159535745575832, |
|
"loss": 2.1859, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 0.622096367417371, |
|
"grad_norm": 0.379721075296402, |
|
"learning_rate": 0.0001588823750302126, |
|
"loss": 2.1793, |
|
"step": 7030 |
|
}, |
|
{ |
|
"epoch": 0.6229812840139817, |
|
"grad_norm": 0.3983226716518402, |
|
"learning_rate": 0.00015822972139652148, |
|
"loss": 2.1819, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 0.6238662006105925, |
|
"grad_norm": 0.30344444513320923, |
|
"learning_rate": 0.00015757778980982626, |
|
"loss": 2.19, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 0.6247511172072032, |
|
"grad_norm": 0.3318573236465454, |
|
"learning_rate": 0.00015692658539951372, |
|
"loss": 2.1693, |
|
"step": 7060 |
|
}, |
|
{ |
|
"epoch": 0.6256360338038139, |
|
"grad_norm": 0.3758665919303894, |
|
"learning_rate": 0.00015627611328924903, |
|
"loss": 2.1951, |
|
"step": 7070 |
|
}, |
|
{ |
|
"epoch": 0.6265209504004248, |
|
"grad_norm": 0.3673815131187439, |
|
"learning_rate": 0.00015562637859693586, |
|
"loss": 2.1886, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 0.6274058669970355, |
|
"grad_norm": 0.31688493490219116, |
|
"learning_rate": 0.0001549773864346755, |
|
"loss": 2.2186, |
|
"step": 7090 |
|
}, |
|
{ |
|
"epoch": 0.6282907835936463, |
|
"grad_norm": 0.4083735942840576, |
|
"learning_rate": 0.00015432914190872756, |
|
"loss": 2.1785, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.629175700190257, |
|
"grad_norm": 0.34488794207572937, |
|
"learning_rate": 0.00015368165011946886, |
|
"loss": 2.2075, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 0.6300606167868679, |
|
"grad_norm": 0.31938597559928894, |
|
"learning_rate": 0.00015303491616135373, |
|
"loss": 2.1995, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 0.6309455333834786, |
|
"grad_norm": 0.366621196269989, |
|
"learning_rate": 0.00015238894512287413, |
|
"loss": 2.1676, |
|
"step": 7130 |
|
}, |
|
{ |
|
"epoch": 0.6318304499800894, |
|
"grad_norm": 0.42678719758987427, |
|
"learning_rate": 0.0001517437420865191, |
|
"loss": 2.1845, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 0.6327153665767001, |
|
"grad_norm": 0.41343775391578674, |
|
"learning_rate": 0.00015109931212873535, |
|
"loss": 2.1639, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 0.633600283173311, |
|
"grad_norm": 0.30154532194137573, |
|
"learning_rate": 0.0001504556603198867, |
|
"loss": 2.202, |
|
"step": 7160 |
|
}, |
|
{ |
|
"epoch": 0.6344851997699217, |
|
"grad_norm": 0.33191919326782227, |
|
"learning_rate": 0.00014981279172421482, |
|
"loss": 2.1845, |
|
"step": 7170 |
|
}, |
|
{ |
|
"epoch": 0.6353701163665324, |
|
"grad_norm": 0.38592010736465454, |
|
"learning_rate": 0.00014917071139979875, |
|
"loss": 2.1916, |
|
"step": 7180 |
|
}, |
|
{ |
|
"epoch": 0.6362550329631432, |
|
"grad_norm": 0.3502799868583679, |
|
"learning_rate": 0.00014852942439851576, |
|
"loss": 2.1942, |
|
"step": 7190 |
|
}, |
|
{ |
|
"epoch": 0.637139949559754, |
|
"grad_norm": 0.34366223216056824, |
|
"learning_rate": 0.000147888935766001, |
|
"loss": 2.1823, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.6380248661563648, |
|
"grad_norm": 0.32077911496162415, |
|
"learning_rate": 0.0001472492505416082, |
|
"loss": 2.1911, |
|
"step": 7210 |
|
}, |
|
{ |
|
"epoch": 0.6389097827529755, |
|
"grad_norm": 0.36590802669525146, |
|
"learning_rate": 0.00014661037375836988, |
|
"loss": 2.1914, |
|
"step": 7220 |
|
}, |
|
{ |
|
"epoch": 0.6397946993495863, |
|
"grad_norm": 0.34605008363723755, |
|
"learning_rate": 0.0001459723104429577, |
|
"loss": 2.1833, |
|
"step": 7230 |
|
}, |
|
{ |
|
"epoch": 0.640679615946197, |
|
"grad_norm": 0.3608168065547943, |
|
"learning_rate": 0.00014533506561564306, |
|
"loss": 2.1944, |
|
"step": 7240 |
|
}, |
|
{ |
|
"epoch": 0.6415645325428079, |
|
"grad_norm": 0.330559104681015, |
|
"learning_rate": 0.0001446986442902574, |
|
"loss": 2.192, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 0.6424494491394186, |
|
"grad_norm": 0.32037970423698425, |
|
"learning_rate": 0.00014406305147415284, |
|
"loss": 2.1803, |
|
"step": 7260 |
|
}, |
|
{ |
|
"epoch": 0.6433343657360294, |
|
"grad_norm": 0.37160640954971313, |
|
"learning_rate": 0.00014342829216816309, |
|
"loss": 2.1795, |
|
"step": 7270 |
|
}, |
|
{ |
|
"epoch": 0.6442192823326401, |
|
"grad_norm": 0.38226374983787537, |
|
"learning_rate": 0.00014279437136656336, |
|
"loss": 2.1567, |
|
"step": 7280 |
|
}, |
|
{ |
|
"epoch": 0.645104198929251, |
|
"grad_norm": 0.39466509222984314, |
|
"learning_rate": 0.00014216129405703203, |
|
"loss": 2.1963, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 0.6459891155258617, |
|
"grad_norm": 0.4139063060283661, |
|
"learning_rate": 0.00014152906522061048, |
|
"loss": 2.1784, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.6468740321224724, |
|
"grad_norm": 0.35524383187294006, |
|
"learning_rate": 0.00014089768983166444, |
|
"loss": 2.1712, |
|
"step": 7310 |
|
}, |
|
{ |
|
"epoch": 0.6477589487190832, |
|
"grad_norm": 0.3009701669216156, |
|
"learning_rate": 0.00014026717285784492, |
|
"loss": 2.1888, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 0.648643865315694, |
|
"grad_norm": 0.528045117855072, |
|
"learning_rate": 0.00013963751926004863, |
|
"loss": 2.2064, |
|
"step": 7330 |
|
}, |
|
{ |
|
"epoch": 0.6495287819123048, |
|
"grad_norm": 0.4917372167110443, |
|
"learning_rate": 0.0001390087339923795, |
|
"loss": 2.1688, |
|
"step": 7340 |
|
}, |
|
{ |
|
"epoch": 0.6504136985089155, |
|
"grad_norm": 0.40180593729019165, |
|
"learning_rate": 0.0001383808220021093, |
|
"loss": 2.1832, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 0.6512986151055263, |
|
"grad_norm": 0.4933311939239502, |
|
"learning_rate": 0.00013775378822963882, |
|
"loss": 2.1737, |
|
"step": 7360 |
|
}, |
|
{ |
|
"epoch": 0.652183531702137, |
|
"grad_norm": 0.44765591621398926, |
|
"learning_rate": 0.00013712763760845937, |
|
"loss": 2.1721, |
|
"step": 7370 |
|
}, |
|
{ |
|
"epoch": 0.6530684482987479, |
|
"grad_norm": 0.511542797088623, |
|
"learning_rate": 0.00013650237506511331, |
|
"loss": 2.1801, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 0.6539533648953586, |
|
"grad_norm": 0.33777767419815063, |
|
"learning_rate": 0.00013587800551915575, |
|
"loss": 2.183, |
|
"step": 7390 |
|
}, |
|
{ |
|
"epoch": 0.6548382814919694, |
|
"grad_norm": 0.4522639513015747, |
|
"learning_rate": 0.00013525453388311555, |
|
"loss": 2.1912, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.6557231980885802, |
|
"grad_norm": 0.35160988569259644, |
|
"learning_rate": 0.0001346319650624572, |
|
"loss": 2.1796, |
|
"step": 7410 |
|
}, |
|
{ |
|
"epoch": 0.6566081146851909, |
|
"grad_norm": 0.3671887516975403, |
|
"learning_rate": 0.0001340103039555415, |
|
"loss": 2.1904, |
|
"step": 7420 |
|
}, |
|
{ |
|
"epoch": 0.6574930312818017, |
|
"grad_norm": 0.3306543827056885, |
|
"learning_rate": 0.00013338955545358754, |
|
"loss": 2.1922, |
|
"step": 7430 |
|
}, |
|
{ |
|
"epoch": 0.6583779478784124, |
|
"grad_norm": 0.5238605737686157, |
|
"learning_rate": 0.00013276972444063384, |
|
"loss": 2.2065, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 0.6592628644750232, |
|
"grad_norm": 0.3615835905075073, |
|
"learning_rate": 0.00013215081579350058, |
|
"loss": 2.1936, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 0.660147781071634, |
|
"grad_norm": 0.3860037624835968, |
|
"learning_rate": 0.00013153283438175034, |
|
"loss": 2.1908, |
|
"step": 7460 |
|
}, |
|
{ |
|
"epoch": 0.6610326976682448, |
|
"grad_norm": 0.3101390600204468, |
|
"learning_rate": 0.00013091578506765046, |
|
"loss": 2.2007, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 0.6619176142648555, |
|
"grad_norm": 0.3494165539741516, |
|
"learning_rate": 0.00013029967270613435, |
|
"loss": 2.1729, |
|
"step": 7480 |
|
}, |
|
{ |
|
"epoch": 0.6628025308614663, |
|
"grad_norm": 0.4269309341907501, |
|
"learning_rate": 0.00012968450214476368, |
|
"loss": 2.1882, |
|
"step": 7490 |
|
}, |
|
{ |
|
"epoch": 0.6636874474580771, |
|
"grad_norm": 0.43789321184158325, |
|
"learning_rate": 0.00012907027822369005, |
|
"loss": 2.1983, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.6645723640546879, |
|
"grad_norm": 0.3296915590763092, |
|
"learning_rate": 0.0001284570057756169, |
|
"loss": 2.1759, |
|
"step": 7510 |
|
}, |
|
{ |
|
"epoch": 0.6654572806512986, |
|
"grad_norm": 0.3194144666194916, |
|
"learning_rate": 0.00012784468962576134, |
|
"loss": 2.1868, |
|
"step": 7520 |
|
}, |
|
{ |
|
"epoch": 0.6663421972479093, |
|
"grad_norm": 0.42226454615592957, |
|
"learning_rate": 0.00012723333459181642, |
|
"loss": 2.2096, |
|
"step": 7530 |
|
}, |
|
{ |
|
"epoch": 0.6672271138445202, |
|
"grad_norm": 0.3759223520755768, |
|
"learning_rate": 0.00012662294548391328, |
|
"loss": 2.1851, |
|
"step": 7540 |
|
}, |
|
{ |
|
"epoch": 0.6681120304411309, |
|
"grad_norm": 0.292855441570282, |
|
"learning_rate": 0.00012601352710458314, |
|
"loss": 2.1992, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 0.6689969470377417, |
|
"grad_norm": 0.348399817943573, |
|
"learning_rate": 0.00012540508424871934, |
|
"loss": 2.1677, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 0.6698818636343524, |
|
"grad_norm": 0.36560651659965515, |
|
"learning_rate": 0.00012479762170353997, |
|
"loss": 2.1773, |
|
"step": 7570 |
|
}, |
|
{ |
|
"epoch": 0.6707667802309633, |
|
"grad_norm": 0.3574565351009369, |
|
"learning_rate": 0.00012419114424854998, |
|
"loss": 2.1848, |
|
"step": 7580 |
|
}, |
|
{ |
|
"epoch": 0.671651696827574, |
|
"grad_norm": 0.35882261395454407, |
|
"learning_rate": 0.0001235856566555039, |
|
"loss": 2.1868, |
|
"step": 7590 |
|
}, |
|
{ |
|
"epoch": 0.6725366134241848, |
|
"grad_norm": 0.3170746862888336, |
|
"learning_rate": 0.0001229811636883677, |
|
"loss": 2.1881, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.6734215300207955, |
|
"grad_norm": 0.3750855028629303, |
|
"learning_rate": 0.00012237767010328182, |
|
"loss": 2.1658, |
|
"step": 7610 |
|
}, |
|
{ |
|
"epoch": 0.6743064466174064, |
|
"grad_norm": 0.31514084339141846, |
|
"learning_rate": 0.0001217751806485235, |
|
"loss": 2.1709, |
|
"step": 7620 |
|
}, |
|
{ |
|
"epoch": 0.6751913632140171, |
|
"grad_norm": 0.4474596679210663, |
|
"learning_rate": 0.00012117370006446957, |
|
"loss": 2.1802, |
|
"step": 7630 |
|
}, |
|
{ |
|
"epoch": 0.6760762798106279, |
|
"grad_norm": 0.36836448311805725, |
|
"learning_rate": 0.00012057323308355922, |
|
"loss": 2.1704, |
|
"step": 7640 |
|
}, |
|
{ |
|
"epoch": 0.6769611964072386, |
|
"grad_norm": 0.3137301206588745, |
|
"learning_rate": 0.00011997378443025633, |
|
"loss": 2.2023, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 0.6778461130038493, |
|
"grad_norm": 0.3115224838256836, |
|
"learning_rate": 0.00011937535882101281, |
|
"loss": 2.1702, |
|
"step": 7660 |
|
}, |
|
{ |
|
"epoch": 0.6787310296004602, |
|
"grad_norm": 0.4060702621936798, |
|
"learning_rate": 0.00011877796096423105, |
|
"loss": 2.1803, |
|
"step": 7670 |
|
}, |
|
{ |
|
"epoch": 0.6796159461970709, |
|
"grad_norm": 0.30474671721458435, |
|
"learning_rate": 0.00011818159556022748, |
|
"loss": 2.1892, |
|
"step": 7680 |
|
}, |
|
{ |
|
"epoch": 0.6805008627936817, |
|
"grad_norm": 0.29813340306282043, |
|
"learning_rate": 0.00011758626730119487, |
|
"loss": 2.1831, |
|
"step": 7690 |
|
}, |
|
{ |
|
"epoch": 0.6813857793902924, |
|
"grad_norm": 0.35376375913619995, |
|
"learning_rate": 0.00011699198087116588, |
|
"loss": 2.1768, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.6822706959869033, |
|
"grad_norm": 0.34044161438941956, |
|
"learning_rate": 0.00011639874094597605, |
|
"loss": 2.1696, |
|
"step": 7710 |
|
}, |
|
{ |
|
"epoch": 0.683155612583514, |
|
"grad_norm": 0.3716510534286499, |
|
"learning_rate": 0.000115806552193227, |
|
"loss": 2.1897, |
|
"step": 7720 |
|
}, |
|
{ |
|
"epoch": 0.6840405291801248, |
|
"grad_norm": 0.3332691788673401, |
|
"learning_rate": 0.00011521541927224994, |
|
"loss": 2.1611, |
|
"step": 7730 |
|
}, |
|
{ |
|
"epoch": 0.6849254457767355, |
|
"grad_norm": 0.3617340624332428, |
|
"learning_rate": 0.00011462534683406858, |
|
"loss": 2.1593, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 0.6858103623733464, |
|
"grad_norm": 0.33560147881507874, |
|
"learning_rate": 0.00011403633952136289, |
|
"loss": 2.1933, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 0.6866952789699571, |
|
"grad_norm": 0.30507397651672363, |
|
"learning_rate": 0.00011344840196843228, |
|
"loss": 2.1908, |
|
"step": 7760 |
|
}, |
|
{ |
|
"epoch": 0.6875801955665678, |
|
"grad_norm": 0.28181737661361694, |
|
"learning_rate": 0.00011286153880115966, |
|
"loss": 2.1963, |
|
"step": 7770 |
|
}, |
|
{ |
|
"epoch": 0.6884651121631786, |
|
"grad_norm": 0.3205542266368866, |
|
"learning_rate": 0.0001122757546369744, |
|
"loss": 2.1629, |
|
"step": 7780 |
|
}, |
|
{ |
|
"epoch": 0.6893500287597893, |
|
"grad_norm": 0.4453352391719818, |
|
"learning_rate": 0.00011169105408481634, |
|
"loss": 2.2044, |
|
"step": 7790 |
|
}, |
|
{ |
|
"epoch": 0.6902349453564002, |
|
"grad_norm": 0.29993703961372375, |
|
"learning_rate": 0.00011110744174509952, |
|
"loss": 2.1742, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.6911198619530109, |
|
"grad_norm": 0.3727935552597046, |
|
"learning_rate": 0.00011052492220967583, |
|
"loss": 2.1755, |
|
"step": 7810 |
|
}, |
|
{ |
|
"epoch": 0.6920047785496217, |
|
"grad_norm": 0.32742083072662354, |
|
"learning_rate": 0.00010994350006179932, |
|
"loss": 2.2046, |
|
"step": 7820 |
|
}, |
|
{ |
|
"epoch": 0.6928896951462324, |
|
"grad_norm": 0.45911964774131775, |
|
"learning_rate": 0.00010936317987608946, |
|
"loss": 2.1755, |
|
"step": 7830 |
|
}, |
|
{ |
|
"epoch": 0.6937746117428433, |
|
"grad_norm": 0.3224821090698242, |
|
"learning_rate": 0.00010878396621849565, |
|
"loss": 2.1789, |
|
"step": 7840 |
|
}, |
|
{ |
|
"epoch": 0.694659528339454, |
|
"grad_norm": 0.5779576301574707, |
|
"learning_rate": 0.00010820586364626103, |
|
"loss": 2.186, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 0.6955444449360648, |
|
"grad_norm": 0.36172717809677124, |
|
"learning_rate": 0.00010762887670788701, |
|
"loss": 2.2043, |
|
"step": 7860 |
|
}, |
|
{ |
|
"epoch": 0.6964293615326755, |
|
"grad_norm": 0.3582923710346222, |
|
"learning_rate": 0.00010705300994309697, |
|
"loss": 2.1745, |
|
"step": 7870 |
|
}, |
|
{ |
|
"epoch": 0.6973142781292864, |
|
"grad_norm": 0.3162693977355957, |
|
"learning_rate": 0.00010647826788280083, |
|
"loss": 2.1838, |
|
"step": 7880 |
|
}, |
|
{ |
|
"epoch": 0.6981991947258971, |
|
"grad_norm": 0.3397110402584076, |
|
"learning_rate": 0.0001059046550490593, |
|
"loss": 2.1854, |
|
"step": 7890 |
|
}, |
|
{ |
|
"epoch": 0.6990841113225078, |
|
"grad_norm": 0.30742955207824707, |
|
"learning_rate": 0.00010533217595504857, |
|
"loss": 2.1747, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.6999690279191186, |
|
"grad_norm": 0.32416754961013794, |
|
"learning_rate": 0.00010476083510502443, |
|
"loss": 2.1828, |
|
"step": 7910 |
|
}, |
|
{ |
|
"epoch": 0.7008539445157294, |
|
"grad_norm": 0.41834747791290283, |
|
"learning_rate": 0.00010419063699428691, |
|
"loss": 2.1849, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 0.7017388611123402, |
|
"grad_norm": 0.3807116150856018, |
|
"learning_rate": 0.00010362158610914516, |
|
"loss": 2.1674, |
|
"step": 7930 |
|
}, |
|
{ |
|
"epoch": 0.7026237777089509, |
|
"grad_norm": 0.3779432475566864, |
|
"learning_rate": 0.00010305368692688174, |
|
"loss": 2.1683, |
|
"step": 7940 |
|
}, |
|
{ |
|
"epoch": 0.7035086943055617, |
|
"grad_norm": 0.41769009828567505, |
|
"learning_rate": 0.000102486943915718, |
|
"loss": 2.1788, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 0.7043936109021725, |
|
"grad_norm": 0.33971527218818665, |
|
"learning_rate": 0.00010192136153477825, |
|
"loss": 2.1844, |
|
"step": 7960 |
|
}, |
|
{ |
|
"epoch": 0.7052785274987833, |
|
"grad_norm": 0.2998791038990021, |
|
"learning_rate": 0.00010135694423405506, |
|
"loss": 2.1906, |
|
"step": 7970 |
|
}, |
|
{ |
|
"epoch": 0.706163444095394, |
|
"grad_norm": 0.30726414918899536, |
|
"learning_rate": 0.00010079369645437411, |
|
"loss": 2.1802, |
|
"step": 7980 |
|
}, |
|
{ |
|
"epoch": 0.7070483606920048, |
|
"grad_norm": 0.3112538456916809, |
|
"learning_rate": 0.00010023162262735944, |
|
"loss": 2.1887, |
|
"step": 7990 |
|
}, |
|
{ |
|
"epoch": 0.7079332772886155, |
|
"grad_norm": 0.33356785774230957, |
|
"learning_rate": 9.967072717539852e-05, |
|
"loss": 2.1971, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.7079332772886155, |
|
"eval_accuracy": 0.5514356363412967, |
|
"eval_loss": 2.0872702598571777, |
|
"eval_runtime": 12.2195, |
|
"eval_samples_per_second": 26.024, |
|
"eval_steps_per_second": 0.409, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.7088181938852263, |
|
"grad_norm": 0.35864993929862976, |
|
"learning_rate": 9.911101451160715e-05, |
|
"loss": 2.1773, |
|
"step": 8010 |
|
}, |
|
{ |
|
"epoch": 0.7097031104818371, |
|
"grad_norm": 0.4708113372325897, |
|
"learning_rate": 9.855248903979506e-05, |
|
"loss": 2.1867, |
|
"step": 8020 |
|
}, |
|
{ |
|
"epoch": 0.7105880270784478, |
|
"grad_norm": 0.32000553607940674, |
|
"learning_rate": 9.79951551544311e-05, |
|
"loss": 2.1721, |
|
"step": 8030 |
|
}, |
|
{ |
|
"epoch": 0.7114729436750586, |
|
"grad_norm": 0.3156352639198303, |
|
"learning_rate": 9.743901724060905e-05, |
|
"loss": 2.1935, |
|
"step": 8040 |
|
}, |
|
{ |
|
"epoch": 0.7123578602716694, |
|
"grad_norm": 0.3504098355770111, |
|
"learning_rate": 9.688407967401247e-05, |
|
"loss": 2.1891, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 0.7132427768682802, |
|
"grad_norm": 0.4470130205154419, |
|
"learning_rate": 9.633034682088071e-05, |
|
"loss": 2.1604, |
|
"step": 8060 |
|
}, |
|
{ |
|
"epoch": 0.7141276934648909, |
|
"grad_norm": 0.37104523181915283, |
|
"learning_rate": 9.57778230379745e-05, |
|
"loss": 2.1695, |
|
"step": 8070 |
|
}, |
|
{ |
|
"epoch": 0.7150126100615017, |
|
"grad_norm": 0.30490779876708984, |
|
"learning_rate": 9.522651267254148e-05, |
|
"loss": 2.1722, |
|
"step": 8080 |
|
}, |
|
{ |
|
"epoch": 0.7158975266581125, |
|
"grad_norm": 0.288102924823761, |
|
"learning_rate": 9.467642006228244e-05, |
|
"loss": 2.165, |
|
"step": 8090 |
|
}, |
|
{ |
|
"epoch": 0.7167824432547233, |
|
"grad_norm": 0.2840191125869751, |
|
"learning_rate": 9.412754953531663e-05, |
|
"loss": 2.1873, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.717667359851334, |
|
"grad_norm": 0.3146025836467743, |
|
"learning_rate": 9.357990541014805e-05, |
|
"loss": 2.1756, |
|
"step": 8110 |
|
}, |
|
{ |
|
"epoch": 0.7185522764479447, |
|
"grad_norm": 0.299532413482666, |
|
"learning_rate": 9.30334919956313e-05, |
|
"loss": 2.1809, |
|
"step": 8120 |
|
}, |
|
{ |
|
"epoch": 0.7194371930445556, |
|
"grad_norm": 0.34434300661087036, |
|
"learning_rate": 9.248831359093803e-05, |
|
"loss": 2.1781, |
|
"step": 8130 |
|
}, |
|
{ |
|
"epoch": 0.7203221096411663, |
|
"grad_norm": 0.2795085906982422, |
|
"learning_rate": 9.194437448552259e-05, |
|
"loss": 2.1806, |
|
"step": 8140 |
|
}, |
|
{ |
|
"epoch": 0.7212070262377771, |
|
"grad_norm": 0.32003116607666016, |
|
"learning_rate": 9.140167895908866e-05, |
|
"loss": 2.1801, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 0.7220919428343878, |
|
"grad_norm": 0.2715960144996643, |
|
"learning_rate": 9.086023128155544e-05, |
|
"loss": 2.1647, |
|
"step": 8160 |
|
}, |
|
{ |
|
"epoch": 0.7229768594309987, |
|
"grad_norm": 0.28309276700019836, |
|
"learning_rate": 9.032003571302397e-05, |
|
"loss": 2.177, |
|
"step": 8170 |
|
}, |
|
{ |
|
"epoch": 0.7238617760276094, |
|
"grad_norm": 0.28893986344337463, |
|
"learning_rate": 8.978109650374397e-05, |
|
"loss": 2.1915, |
|
"step": 8180 |
|
}, |
|
{ |
|
"epoch": 0.7247466926242202, |
|
"grad_norm": 0.32400646805763245, |
|
"learning_rate": 8.924341789408e-05, |
|
"loss": 2.1657, |
|
"step": 8190 |
|
}, |
|
{ |
|
"epoch": 0.7256316092208309, |
|
"grad_norm": 0.3074786961078644, |
|
"learning_rate": 8.870700411447816e-05, |
|
"loss": 2.1951, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.7265165258174417, |
|
"grad_norm": 0.30175745487213135, |
|
"learning_rate": 8.817185938543293e-05, |
|
"loss": 2.1753, |
|
"step": 8210 |
|
}, |
|
{ |
|
"epoch": 0.7274014424140525, |
|
"grad_norm": 0.3015293776988983, |
|
"learning_rate": 8.763798791745412e-05, |
|
"loss": 2.1634, |
|
"step": 8220 |
|
}, |
|
{ |
|
"epoch": 0.7282863590106633, |
|
"grad_norm": 0.32819435000419617, |
|
"learning_rate": 8.710539391103328e-05, |
|
"loss": 2.1839, |
|
"step": 8230 |
|
}, |
|
{ |
|
"epoch": 0.729171275607274, |
|
"grad_norm": 0.3071001470088959, |
|
"learning_rate": 8.657408155661109e-05, |
|
"loss": 2.1746, |
|
"step": 8240 |
|
}, |
|
{ |
|
"epoch": 0.7300561922038847, |
|
"grad_norm": 0.38603559136390686, |
|
"learning_rate": 8.604405503454398e-05, |
|
"loss": 2.179, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 0.7309411088004956, |
|
"grad_norm": 0.3808230757713318, |
|
"learning_rate": 8.551531851507186e-05, |
|
"loss": 2.1609, |
|
"step": 8260 |
|
}, |
|
{ |
|
"epoch": 0.7318260253971063, |
|
"grad_norm": 0.3062630295753479, |
|
"learning_rate": 8.49878761582846e-05, |
|
"loss": 2.1762, |
|
"step": 8270 |
|
}, |
|
{ |
|
"epoch": 0.7327109419937171, |
|
"grad_norm": 0.30554407835006714, |
|
"learning_rate": 8.446173211408972e-05, |
|
"loss": 2.18, |
|
"step": 8280 |
|
}, |
|
{ |
|
"epoch": 0.7335958585903278, |
|
"grad_norm": 0.30430638790130615, |
|
"learning_rate": 8.393689052217964e-05, |
|
"loss": 2.17, |
|
"step": 8290 |
|
}, |
|
{ |
|
"epoch": 0.7344807751869387, |
|
"grad_norm": 0.37365472316741943, |
|
"learning_rate": 8.341335551199903e-05, |
|
"loss": 2.1717, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.7353656917835494, |
|
"grad_norm": 0.30844250321388245, |
|
"learning_rate": 8.289113120271264e-05, |
|
"loss": 2.1989, |
|
"step": 8310 |
|
}, |
|
{ |
|
"epoch": 0.7362506083801602, |
|
"grad_norm": 0.3102276623249054, |
|
"learning_rate": 8.237022170317235e-05, |
|
"loss": 2.1967, |
|
"step": 8320 |
|
}, |
|
{ |
|
"epoch": 0.7371355249767709, |
|
"grad_norm": 0.35639065504074097, |
|
"learning_rate": 8.185063111188523e-05, |
|
"loss": 2.1596, |
|
"step": 8330 |
|
}, |
|
{ |
|
"epoch": 0.7380204415733818, |
|
"grad_norm": 0.3256385326385498, |
|
"learning_rate": 8.133236351698142e-05, |
|
"loss": 2.1866, |
|
"step": 8340 |
|
}, |
|
{ |
|
"epoch": 0.7389053581699925, |
|
"grad_norm": 0.4910948872566223, |
|
"learning_rate": 8.081542299618138e-05, |
|
"loss": 2.1701, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 0.7397902747666032, |
|
"grad_norm": 0.4157789647579193, |
|
"learning_rate": 8.029981361676455e-05, |
|
"loss": 2.1664, |
|
"step": 8360 |
|
}, |
|
{ |
|
"epoch": 0.740675191363214, |
|
"grad_norm": 0.3930807411670685, |
|
"learning_rate": 7.978553943553665e-05, |
|
"loss": 2.1909, |
|
"step": 8370 |
|
}, |
|
{ |
|
"epoch": 0.7415601079598247, |
|
"grad_norm": 0.3209260404109955, |
|
"learning_rate": 7.927260449879828e-05, |
|
"loss": 2.1963, |
|
"step": 8380 |
|
}, |
|
{ |
|
"epoch": 0.7424450245564356, |
|
"grad_norm": 0.39223039150238037, |
|
"learning_rate": 7.876101284231277e-05, |
|
"loss": 2.1674, |
|
"step": 8390 |
|
}, |
|
{ |
|
"epoch": 0.7433299411530463, |
|
"grad_norm": 0.3471428453922272, |
|
"learning_rate": 7.825076849127458e-05, |
|
"loss": 2.1933, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.7442148577496571, |
|
"grad_norm": 0.38216593861579895, |
|
"learning_rate": 7.774187546027769e-05, |
|
"loss": 2.1688, |
|
"step": 8410 |
|
}, |
|
{ |
|
"epoch": 0.7450997743462678, |
|
"grad_norm": 0.2885330617427826, |
|
"learning_rate": 7.723433775328384e-05, |
|
"loss": 2.2004, |
|
"step": 8420 |
|
}, |
|
{ |
|
"epoch": 0.7459846909428787, |
|
"grad_norm": 0.3208834230899811, |
|
"learning_rate": 7.672815936359106e-05, |
|
"loss": 2.1859, |
|
"step": 8430 |
|
}, |
|
{ |
|
"epoch": 0.7468696075394894, |
|
"grad_norm": 0.32311803102493286, |
|
"learning_rate": 7.622334427380229e-05, |
|
"loss": 2.1901, |
|
"step": 8440 |
|
}, |
|
{ |
|
"epoch": 0.7477545241361002, |
|
"grad_norm": 0.4310019910335541, |
|
"learning_rate": 7.571989645579418e-05, |
|
"loss": 2.1787, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 0.7486394407327109, |
|
"grad_norm": 0.3954008221626282, |
|
"learning_rate": 7.521781987068566e-05, |
|
"loss": 2.148, |
|
"step": 8460 |
|
}, |
|
{ |
|
"epoch": 0.7495243573293218, |
|
"grad_norm": 0.34704872965812683, |
|
"learning_rate": 7.471711846880669e-05, |
|
"loss": 2.1572, |
|
"step": 8470 |
|
}, |
|
{ |
|
"epoch": 0.7504092739259325, |
|
"grad_norm": 0.3665640950202942, |
|
"learning_rate": 7.421779618966738e-05, |
|
"loss": 2.1767, |
|
"step": 8480 |
|
}, |
|
{ |
|
"epoch": 0.7512941905225432, |
|
"grad_norm": 0.27584344148635864, |
|
"learning_rate": 7.371985696192707e-05, |
|
"loss": 2.1606, |
|
"step": 8490 |
|
}, |
|
{ |
|
"epoch": 0.752179107119154, |
|
"grad_norm": 0.3363373279571533, |
|
"learning_rate": 7.322330470336314e-05, |
|
"loss": 2.1717, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.7530640237157648, |
|
"grad_norm": 0.41205254197120667, |
|
"learning_rate": 7.27281433208403e-05, |
|
"loss": 2.1901, |
|
"step": 8510 |
|
}, |
|
{ |
|
"epoch": 0.7539489403123756, |
|
"grad_norm": 0.29698577523231506, |
|
"learning_rate": 7.223437671027994e-05, |
|
"loss": 2.1762, |
|
"step": 8520 |
|
}, |
|
{ |
|
"epoch": 0.7548338569089863, |
|
"grad_norm": 0.3263265788555145, |
|
"learning_rate": 7.174200875662928e-05, |
|
"loss": 2.1642, |
|
"step": 8530 |
|
}, |
|
{ |
|
"epoch": 0.7557187735055971, |
|
"grad_norm": 0.46074342727661133, |
|
"learning_rate": 7.125104333383118e-05, |
|
"loss": 2.1665, |
|
"step": 8540 |
|
}, |
|
{ |
|
"epoch": 0.7566036901022078, |
|
"grad_norm": 0.31991058588027954, |
|
"learning_rate": 7.07614843047932e-05, |
|
"loss": 2.1941, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 0.7574886066988187, |
|
"grad_norm": 0.3378481864929199, |
|
"learning_rate": 7.027333552135748e-05, |
|
"loss": 2.1801, |
|
"step": 8560 |
|
}, |
|
{ |
|
"epoch": 0.7583735232954294, |
|
"grad_norm": 0.34531331062316895, |
|
"learning_rate": 6.97866008242703e-05, |
|
"loss": 2.1645, |
|
"step": 8570 |
|
}, |
|
{ |
|
"epoch": 0.7592584398920402, |
|
"grad_norm": 0.29340556263923645, |
|
"learning_rate": 6.930128404315214e-05, |
|
"loss": 2.1734, |
|
"step": 8580 |
|
}, |
|
{ |
|
"epoch": 0.760143356488651, |
|
"grad_norm": 0.3951212763786316, |
|
"learning_rate": 6.881738899646713e-05, |
|
"loss": 2.1725, |
|
"step": 8590 |
|
}, |
|
{ |
|
"epoch": 0.7610282730852617, |
|
"grad_norm": 0.3772408664226532, |
|
"learning_rate": 6.833491949149328e-05, |
|
"loss": 2.1778, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.7619131896818725, |
|
"grad_norm": 0.2892204821109772, |
|
"learning_rate": 6.785387932429243e-05, |
|
"loss": 2.1748, |
|
"step": 8610 |
|
}, |
|
{ |
|
"epoch": 0.7627981062784832, |
|
"grad_norm": 0.3378978669643402, |
|
"learning_rate": 6.737427227968062e-05, |
|
"loss": 2.1871, |
|
"step": 8620 |
|
}, |
|
{ |
|
"epoch": 0.763683022875094, |
|
"grad_norm": 0.2745198607444763, |
|
"learning_rate": 6.689610213119782e-05, |
|
"loss": 2.1752, |
|
"step": 8630 |
|
}, |
|
{ |
|
"epoch": 0.7645679394717048, |
|
"grad_norm": 0.3293655514717102, |
|
"learning_rate": 6.641937264107867e-05, |
|
"loss": 2.1811, |
|
"step": 8640 |
|
}, |
|
{ |
|
"epoch": 0.7654528560683156, |
|
"grad_norm": 0.3286610245704651, |
|
"learning_rate": 6.594408756022272e-05, |
|
"loss": 2.1823, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 0.7663377726649263, |
|
"grad_norm": 0.3277672231197357, |
|
"learning_rate": 6.547025062816486e-05, |
|
"loss": 2.1707, |
|
"step": 8660 |
|
}, |
|
{ |
|
"epoch": 0.7672226892615371, |
|
"grad_norm": 0.33863088488578796, |
|
"learning_rate": 6.499786557304618e-05, |
|
"loss": 2.1675, |
|
"step": 8670 |
|
}, |
|
{ |
|
"epoch": 0.7681076058581479, |
|
"grad_norm": 0.27539339661598206, |
|
"learning_rate": 6.452693611158411e-05, |
|
"loss": 2.1991, |
|
"step": 8680 |
|
}, |
|
{ |
|
"epoch": 0.7689925224547587, |
|
"grad_norm": 0.35537365078926086, |
|
"learning_rate": 6.405746594904388e-05, |
|
"loss": 2.185, |
|
"step": 8690 |
|
}, |
|
{ |
|
"epoch": 0.7698774390513694, |
|
"grad_norm": 0.34015700221061707, |
|
"learning_rate": 6.35894587792086e-05, |
|
"loss": 2.1794, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.7707623556479801, |
|
"grad_norm": 0.32763075828552246, |
|
"learning_rate": 6.312291828435076e-05, |
|
"loss": 2.1469, |
|
"step": 8710 |
|
}, |
|
{ |
|
"epoch": 0.771647272244591, |
|
"grad_norm": 0.38287562131881714, |
|
"learning_rate": 6.265784813520318e-05, |
|
"loss": 2.1877, |
|
"step": 8720 |
|
}, |
|
{ |
|
"epoch": 0.7725321888412017, |
|
"grad_norm": 0.33399245142936707, |
|
"learning_rate": 6.219425199092981e-05, |
|
"loss": 2.18, |
|
"step": 8730 |
|
}, |
|
{ |
|
"epoch": 0.7734171054378125, |
|
"grad_norm": 0.4592779278755188, |
|
"learning_rate": 6.173213349909729e-05, |
|
"loss": 2.2047, |
|
"step": 8740 |
|
}, |
|
{ |
|
"epoch": 0.7743020220344232, |
|
"grad_norm": 0.40799829363822937, |
|
"learning_rate": 6.127149629564605e-05, |
|
"loss": 2.1583, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 0.775186938631034, |
|
"grad_norm": 0.3907710909843445, |
|
"learning_rate": 6.081234400486171e-05, |
|
"loss": 2.1608, |
|
"step": 8760 |
|
}, |
|
{ |
|
"epoch": 0.7760718552276448, |
|
"grad_norm": 0.29933851957321167, |
|
"learning_rate": 6.0354680239346925e-05, |
|
"loss": 2.1774, |
|
"step": 8770 |
|
}, |
|
{ |
|
"epoch": 0.7769567718242556, |
|
"grad_norm": 0.33313485980033875, |
|
"learning_rate": 5.989850859999227e-05, |
|
"loss": 2.1656, |
|
"step": 8780 |
|
}, |
|
{ |
|
"epoch": 0.7778416884208663, |
|
"grad_norm": 0.2630128562450409, |
|
"learning_rate": 5.944383267594855e-05, |
|
"loss": 2.1807, |
|
"step": 8790 |
|
}, |
|
{ |
|
"epoch": 0.7787266050174771, |
|
"grad_norm": 0.42465919256210327, |
|
"learning_rate": 5.899065604459813e-05, |
|
"loss": 2.165, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.7796115216140879, |
|
"grad_norm": 0.4454388916492462, |
|
"learning_rate": 5.853898227152718e-05, |
|
"loss": 2.1983, |
|
"step": 8810 |
|
}, |
|
{ |
|
"epoch": 0.7804964382106987, |
|
"grad_norm": 0.32380637526512146, |
|
"learning_rate": 5.808881491049722e-05, |
|
"loss": 2.1738, |
|
"step": 8820 |
|
}, |
|
{ |
|
"epoch": 0.7813813548073094, |
|
"grad_norm": 0.3573935329914093, |
|
"learning_rate": 5.7640157503417444e-05, |
|
"loss": 2.178, |
|
"step": 8830 |
|
}, |
|
{ |
|
"epoch": 0.7822662714039201, |
|
"grad_norm": 0.292910635471344, |
|
"learning_rate": 5.7193013580316646e-05, |
|
"loss": 2.164, |
|
"step": 8840 |
|
}, |
|
{ |
|
"epoch": 0.783151188000531, |
|
"grad_norm": 0.3087019622325897, |
|
"learning_rate": 5.6747386659315755e-05, |
|
"loss": 2.1872, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 0.7840361045971417, |
|
"grad_norm": 0.37168508768081665, |
|
"learning_rate": 5.6303280246599784e-05, |
|
"loss": 2.1645, |
|
"step": 8860 |
|
}, |
|
{ |
|
"epoch": 0.7849210211937525, |
|
"grad_norm": 0.28677845001220703, |
|
"learning_rate": 5.586069783639039e-05, |
|
"loss": 2.16, |
|
"step": 8870 |
|
}, |
|
{ |
|
"epoch": 0.7858059377903632, |
|
"grad_norm": 0.2966271638870239, |
|
"learning_rate": 5.541964291091855e-05, |
|
"loss": 2.1959, |
|
"step": 8880 |
|
}, |
|
{ |
|
"epoch": 0.7866908543869741, |
|
"grad_norm": 0.3936939537525177, |
|
"learning_rate": 5.4980118940396864e-05, |
|
"loss": 2.173, |
|
"step": 8890 |
|
}, |
|
{ |
|
"epoch": 0.7875757709835848, |
|
"grad_norm": 0.3042806386947632, |
|
"learning_rate": 5.454212938299255e-05, |
|
"loss": 2.1841, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.7884606875801956, |
|
"grad_norm": 0.26205936074256897, |
|
"learning_rate": 5.410567768480004e-05, |
|
"loss": 2.1785, |
|
"step": 8910 |
|
}, |
|
{ |
|
"epoch": 0.7893456041768063, |
|
"grad_norm": 0.29167279601097107, |
|
"learning_rate": 5.367076727981382e-05, |
|
"loss": 2.1918, |
|
"step": 8920 |
|
}, |
|
{ |
|
"epoch": 0.7902305207734172, |
|
"grad_norm": 0.29951152205467224, |
|
"learning_rate": 5.3237401589901536e-05, |
|
"loss": 2.1855, |
|
"step": 8930 |
|
}, |
|
{ |
|
"epoch": 0.7911154373700279, |
|
"grad_norm": 0.31647637486457825, |
|
"learning_rate": 5.2805584024777256e-05, |
|
"loss": 2.1795, |
|
"step": 8940 |
|
}, |
|
{ |
|
"epoch": 0.7920003539666386, |
|
"grad_norm": 0.27526384592056274, |
|
"learning_rate": 5.2375317981974145e-05, |
|
"loss": 2.159, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 0.7928852705632494, |
|
"grad_norm": 0.30185645818710327, |
|
"learning_rate": 5.194660684681818e-05, |
|
"loss": 2.1746, |
|
"step": 8960 |
|
}, |
|
{ |
|
"epoch": 0.7937701871598601, |
|
"grad_norm": 0.2594131827354431, |
|
"learning_rate": 5.151945399240127e-05, |
|
"loss": 2.1713, |
|
"step": 8970 |
|
}, |
|
{ |
|
"epoch": 0.794655103756471, |
|
"grad_norm": 0.27709755301475525, |
|
"learning_rate": 5.109386277955477e-05, |
|
"loss": 2.1592, |
|
"step": 8980 |
|
}, |
|
{ |
|
"epoch": 0.7955400203530817, |
|
"grad_norm": 0.30955156683921814, |
|
"learning_rate": 5.066983655682325e-05, |
|
"loss": 2.1832, |
|
"step": 8990 |
|
}, |
|
{ |
|
"epoch": 0.7964249369496925, |
|
"grad_norm": 0.2957077920436859, |
|
"learning_rate": 5.02473786604378e-05, |
|
"loss": 2.1662, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.7964249369496925, |
|
"eval_accuracy": 0.5522150805169673, |
|
"eval_loss": 2.081662654876709, |
|
"eval_runtime": 12.2604, |
|
"eval_samples_per_second": 25.937, |
|
"eval_steps_per_second": 0.408, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.7973098535463032, |
|
"grad_norm": 0.31898653507232666, |
|
"learning_rate": 4.982649241428997e-05, |
|
"loss": 2.1762, |
|
"step": 9010 |
|
}, |
|
{ |
|
"epoch": 0.7981947701429141, |
|
"grad_norm": 0.3314710259437561, |
|
"learning_rate": 4.9407181129905525e-05, |
|
"loss": 2.1888, |
|
"step": 9020 |
|
}, |
|
{ |
|
"epoch": 0.7990796867395248, |
|
"grad_norm": 0.3407798111438751, |
|
"learning_rate": 4.898944810641862e-05, |
|
"loss": 2.1655, |
|
"step": 9030 |
|
}, |
|
{ |
|
"epoch": 0.7999646033361356, |
|
"grad_norm": 0.3467320203781128, |
|
"learning_rate": 4.8573296630545685e-05, |
|
"loss": 2.1822, |
|
"step": 9040 |
|
}, |
|
{ |
|
"epoch": 0.8008495199327463, |
|
"grad_norm": 0.31269845366477966, |
|
"learning_rate": 4.81587299765594e-05, |
|
"loss": 2.1553, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 0.801734436529357, |
|
"grad_norm": 0.3272798955440521, |
|
"learning_rate": 4.7745751406263163e-05, |
|
"loss": 2.1901, |
|
"step": 9060 |
|
}, |
|
{ |
|
"epoch": 0.8026193531259679, |
|
"grad_norm": 0.3094612956047058, |
|
"learning_rate": 4.733436416896528e-05, |
|
"loss": 2.1862, |
|
"step": 9070 |
|
}, |
|
{ |
|
"epoch": 0.8035042697225786, |
|
"grad_norm": 0.32090067863464355, |
|
"learning_rate": 4.692457150145374e-05, |
|
"loss": 2.1739, |
|
"step": 9080 |
|
}, |
|
{ |
|
"epoch": 0.8043891863191894, |
|
"grad_norm": 0.25567829608917236, |
|
"learning_rate": 4.651637662797018e-05, |
|
"loss": 2.1701, |
|
"step": 9090 |
|
}, |
|
{ |
|
"epoch": 0.8052741029158001, |
|
"grad_norm": 0.270111620426178, |
|
"learning_rate": 4.610978276018496e-05, |
|
"loss": 2.1619, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.806159019512411, |
|
"grad_norm": 0.2996044456958771, |
|
"learning_rate": 4.5704793097171766e-05, |
|
"loss": 2.1669, |
|
"step": 9110 |
|
}, |
|
{ |
|
"epoch": 0.8070439361090217, |
|
"grad_norm": 0.2638896703720093, |
|
"learning_rate": 4.5301410825382304e-05, |
|
"loss": 2.1864, |
|
"step": 9120 |
|
}, |
|
{ |
|
"epoch": 0.8079288527056325, |
|
"grad_norm": 0.28163716197013855, |
|
"learning_rate": 4.4899639118621604e-05, |
|
"loss": 2.1842, |
|
"step": 9130 |
|
}, |
|
{ |
|
"epoch": 0.8088137693022432, |
|
"grad_norm": 0.31031861901283264, |
|
"learning_rate": 4.4499481138022546e-05, |
|
"loss": 2.179, |
|
"step": 9140 |
|
}, |
|
{ |
|
"epoch": 0.8096986858988541, |
|
"grad_norm": 0.30774182081222534, |
|
"learning_rate": 4.4100940032021334e-05, |
|
"loss": 2.1791, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 0.8105836024954648, |
|
"grad_norm": 0.2719011604785919, |
|
"learning_rate": 4.3704018936332605e-05, |
|
"loss": 2.1836, |
|
"step": 9160 |
|
}, |
|
{ |
|
"epoch": 0.8114685190920756, |
|
"grad_norm": 0.33051612973213196, |
|
"learning_rate": 4.3308720973924936e-05, |
|
"loss": 2.1712, |
|
"step": 9170 |
|
}, |
|
{ |
|
"epoch": 0.8123534356886863, |
|
"grad_norm": 0.2812555432319641, |
|
"learning_rate": 4.29150492549959e-05, |
|
"loss": 2.1602, |
|
"step": 9180 |
|
}, |
|
{ |
|
"epoch": 0.8132383522852971, |
|
"grad_norm": 0.26210522651672363, |
|
"learning_rate": 4.2523006876947904e-05, |
|
"loss": 2.176, |
|
"step": 9190 |
|
}, |
|
{ |
|
"epoch": 0.8141232688819079, |
|
"grad_norm": 0.29666024446487427, |
|
"learning_rate": 4.213259692436367e-05, |
|
"loss": 2.1815, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.8150081854785186, |
|
"grad_norm": 0.2840687334537506, |
|
"learning_rate": 4.1743822468982226e-05, |
|
"loss": 2.1808, |
|
"step": 9210 |
|
}, |
|
{ |
|
"epoch": 0.8158931020751294, |
|
"grad_norm": 0.2684008479118347, |
|
"learning_rate": 4.135668656967434e-05, |
|
"loss": 2.1932, |
|
"step": 9220 |
|
}, |
|
{ |
|
"epoch": 0.8167780186717402, |
|
"grad_norm": 0.2919354736804962, |
|
"learning_rate": 4.097119227241869e-05, |
|
"loss": 2.1761, |
|
"step": 9230 |
|
}, |
|
{ |
|
"epoch": 0.817662935268351, |
|
"grad_norm": 0.29693448543548584, |
|
"learning_rate": 4.0587342610277886e-05, |
|
"loss": 2.1651, |
|
"step": 9240 |
|
}, |
|
{ |
|
"epoch": 0.8185478518649617, |
|
"grad_norm": 0.2847377061843872, |
|
"learning_rate": 4.020514060337446e-05, |
|
"loss": 2.1867, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 0.8194327684615725, |
|
"grad_norm": 0.29556918144226074, |
|
"learning_rate": 3.982458925886748e-05, |
|
"loss": 2.1802, |
|
"step": 9260 |
|
}, |
|
{ |
|
"epoch": 0.8203176850581833, |
|
"grad_norm": 0.3286089599132538, |
|
"learning_rate": 3.944569157092839e-05, |
|
"loss": 2.1653, |
|
"step": 9270 |
|
}, |
|
{ |
|
"epoch": 0.8212026016547941, |
|
"grad_norm": 0.29342418909072876, |
|
"learning_rate": 3.906845052071778e-05, |
|
"loss": 2.1805, |
|
"step": 9280 |
|
}, |
|
{ |
|
"epoch": 0.8220875182514048, |
|
"grad_norm": 0.26084861159324646, |
|
"learning_rate": 3.8692869076361794e-05, |
|
"loss": 2.1626, |
|
"step": 9290 |
|
}, |
|
{ |
|
"epoch": 0.8229724348480155, |
|
"grad_norm": 0.27467480301856995, |
|
"learning_rate": 3.831895019292897e-05, |
|
"loss": 2.18, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.8238573514446264, |
|
"grad_norm": 0.28592967987060547, |
|
"learning_rate": 3.794669681240667e-05, |
|
"loss": 2.1666, |
|
"step": 9310 |
|
}, |
|
{ |
|
"epoch": 0.8247422680412371, |
|
"grad_norm": 0.2642505168914795, |
|
"learning_rate": 3.757611186367823e-05, |
|
"loss": 2.1843, |
|
"step": 9320 |
|
}, |
|
{ |
|
"epoch": 0.8256271846378479, |
|
"grad_norm": 0.2525290250778198, |
|
"learning_rate": 3.7207198262499684e-05, |
|
"loss": 2.1798, |
|
"step": 9330 |
|
}, |
|
{ |
|
"epoch": 0.8265121012344586, |
|
"grad_norm": 0.2868938744068146, |
|
"learning_rate": 3.6839958911476953e-05, |
|
"loss": 2.1682, |
|
"step": 9340 |
|
}, |
|
{ |
|
"epoch": 0.8273970178310694, |
|
"grad_norm": 0.25756001472473145, |
|
"learning_rate": 3.647439670004315e-05, |
|
"loss": 2.1791, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 0.8282819344276802, |
|
"grad_norm": 0.4099997580051422, |
|
"learning_rate": 3.611051450443551e-05, |
|
"loss": 2.1663, |
|
"step": 9360 |
|
}, |
|
{ |
|
"epoch": 0.829166851024291, |
|
"grad_norm": 0.285736083984375, |
|
"learning_rate": 3.5748315187672935e-05, |
|
"loss": 2.1809, |
|
"step": 9370 |
|
}, |
|
{ |
|
"epoch": 0.8300517676209017, |
|
"grad_norm": 0.2643410265445709, |
|
"learning_rate": 3.5387801599533474e-05, |
|
"loss": 2.2038, |
|
"step": 9380 |
|
}, |
|
{ |
|
"epoch": 0.8309366842175125, |
|
"grad_norm": 0.2976551651954651, |
|
"learning_rate": 3.502897657653201e-05, |
|
"loss": 2.1714, |
|
"step": 9390 |
|
}, |
|
{ |
|
"epoch": 0.8318216008141233, |
|
"grad_norm": 0.25775617361068726, |
|
"learning_rate": 3.467184294189776e-05, |
|
"loss": 2.1549, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.8327065174107341, |
|
"grad_norm": 0.26022714376449585, |
|
"learning_rate": 3.431640350555204e-05, |
|
"loss": 2.1577, |
|
"step": 9410 |
|
}, |
|
{ |
|
"epoch": 0.8335914340073448, |
|
"grad_norm": 0.3256966471672058, |
|
"learning_rate": 3.3962661064086356e-05, |
|
"loss": 2.1812, |
|
"step": 9420 |
|
}, |
|
{ |
|
"epoch": 0.8344763506039555, |
|
"grad_norm": 0.3099459409713745, |
|
"learning_rate": 3.3610618400740146e-05, |
|
"loss": 2.1798, |
|
"step": 9430 |
|
}, |
|
{ |
|
"epoch": 0.8353612672005664, |
|
"grad_norm": 0.2882852554321289, |
|
"learning_rate": 3.326027828537923e-05, |
|
"loss": 2.1551, |
|
"step": 9440 |
|
}, |
|
{ |
|
"epoch": 0.8362461837971771, |
|
"grad_norm": 0.2728523015975952, |
|
"learning_rate": 3.2911643474473646e-05, |
|
"loss": 2.1779, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 0.8371311003937879, |
|
"grad_norm": 0.28265324234962463, |
|
"learning_rate": 3.2564716711076164e-05, |
|
"loss": 2.1652, |
|
"step": 9460 |
|
}, |
|
{ |
|
"epoch": 0.8380160169903986, |
|
"grad_norm": 0.32904767990112305, |
|
"learning_rate": 3.2219500724800705e-05, |
|
"loss": 2.1833, |
|
"step": 9470 |
|
}, |
|
{ |
|
"epoch": 0.8389009335870095, |
|
"grad_norm": 0.27072396874427795, |
|
"learning_rate": 3.187599823180071e-05, |
|
"loss": 2.1993, |
|
"step": 9480 |
|
}, |
|
{ |
|
"epoch": 0.8397858501836202, |
|
"grad_norm": 0.27186015248298645, |
|
"learning_rate": 3.153421193474809e-05, |
|
"loss": 2.1792, |
|
"step": 9490 |
|
}, |
|
{ |
|
"epoch": 0.840670766780231, |
|
"grad_norm": 0.260337769985199, |
|
"learning_rate": 3.119414452281158e-05, |
|
"loss": 2.1747, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.8415556833768417, |
|
"grad_norm": 0.2633416950702667, |
|
"learning_rate": 3.085579867163582e-05, |
|
"loss": 2.1835, |
|
"step": 9510 |
|
}, |
|
{ |
|
"epoch": 0.8424405999734526, |
|
"grad_norm": 0.2791996896266937, |
|
"learning_rate": 3.051917704332016e-05, |
|
"loss": 2.1776, |
|
"step": 9520 |
|
}, |
|
{ |
|
"epoch": 0.8433255165700633, |
|
"grad_norm": 0.3286284804344177, |
|
"learning_rate": 3.0184282286397997e-05, |
|
"loss": 2.1834, |
|
"step": 9530 |
|
}, |
|
{ |
|
"epoch": 0.844210433166674, |
|
"grad_norm": 0.284078449010849, |
|
"learning_rate": 2.98511170358155e-05, |
|
"loss": 2.1815, |
|
"step": 9540 |
|
}, |
|
{ |
|
"epoch": 0.8450953497632848, |
|
"grad_norm": 0.2807869017124176, |
|
"learning_rate": 2.9519683912911265e-05, |
|
"loss": 2.173, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 0.8459802663598955, |
|
"grad_norm": 0.28499874472618103, |
|
"learning_rate": 2.918998552539545e-05, |
|
"loss": 2.1757, |
|
"step": 9560 |
|
}, |
|
{ |
|
"epoch": 0.8468651829565064, |
|
"grad_norm": 0.26381710171699524, |
|
"learning_rate": 2.886202446732933e-05, |
|
"loss": 2.1615, |
|
"step": 9570 |
|
}, |
|
{ |
|
"epoch": 0.8477500995531171, |
|
"grad_norm": 0.29104191064834595, |
|
"learning_rate": 2.8535803319105047e-05, |
|
"loss": 2.189, |
|
"step": 9580 |
|
}, |
|
{ |
|
"epoch": 0.8486350161497279, |
|
"grad_norm": 0.2626052498817444, |
|
"learning_rate": 2.821132464742504e-05, |
|
"loss": 2.1785, |
|
"step": 9590 |
|
}, |
|
{ |
|
"epoch": 0.8495199327463386, |
|
"grad_norm": 0.27080950140953064, |
|
"learning_rate": 2.788859100528196e-05, |
|
"loss": 2.1668, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.8504048493429495, |
|
"grad_norm": 0.29530128836631775, |
|
"learning_rate": 2.7567604931938606e-05, |
|
"loss": 2.1941, |
|
"step": 9610 |
|
}, |
|
{ |
|
"epoch": 0.8512897659395602, |
|
"grad_norm": 0.3058115541934967, |
|
"learning_rate": 2.7248368952908055e-05, |
|
"loss": 2.1511, |
|
"step": 9620 |
|
}, |
|
{ |
|
"epoch": 0.852174682536171, |
|
"grad_norm": 0.25152358412742615, |
|
"learning_rate": 2.6930885579933507e-05, |
|
"loss": 2.1721, |
|
"step": 9630 |
|
}, |
|
{ |
|
"epoch": 0.8530595991327817, |
|
"grad_norm": 0.2745242714881897, |
|
"learning_rate": 2.6615157310968778e-05, |
|
"loss": 2.1737, |
|
"step": 9640 |
|
}, |
|
{ |
|
"epoch": 0.8539445157293925, |
|
"grad_norm": 0.2541252374649048, |
|
"learning_rate": 2.6301186630158486e-05, |
|
"loss": 2.1639, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 0.8548294323260033, |
|
"grad_norm": 0.2664197087287903, |
|
"learning_rate": 2.5988976007818716e-05, |
|
"loss": 2.159, |
|
"step": 9660 |
|
}, |
|
{ |
|
"epoch": 0.855714348922614, |
|
"grad_norm": 0.2929265797138214, |
|
"learning_rate": 2.5678527900417302e-05, |
|
"loss": 2.1731, |
|
"step": 9670 |
|
}, |
|
{ |
|
"epoch": 0.8565992655192248, |
|
"grad_norm": 0.27480408549308777, |
|
"learning_rate": 2.5369844750554705e-05, |
|
"loss": 2.1847, |
|
"step": 9680 |
|
}, |
|
{ |
|
"epoch": 0.8574841821158355, |
|
"grad_norm": 0.25206154584884644, |
|
"learning_rate": 2.5062928986944677e-05, |
|
"loss": 2.1595, |
|
"step": 9690 |
|
}, |
|
{ |
|
"epoch": 0.8583690987124464, |
|
"grad_norm": 0.30056387186050415, |
|
"learning_rate": 2.4757783024395242e-05, |
|
"loss": 2.1659, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.8592540153090571, |
|
"grad_norm": 0.2622138261795044, |
|
"learning_rate": 2.4454409263789694e-05, |
|
"loss": 2.1945, |
|
"step": 9710 |
|
}, |
|
{ |
|
"epoch": 0.8601389319056679, |
|
"grad_norm": 0.29541319608688354, |
|
"learning_rate": 2.4152810092067658e-05, |
|
"loss": 2.1668, |
|
"step": 9720 |
|
}, |
|
{ |
|
"epoch": 0.8610238485022786, |
|
"grad_norm": 0.2514541447162628, |
|
"learning_rate": 2.3852987882206188e-05, |
|
"loss": 2.1667, |
|
"step": 9730 |
|
}, |
|
{ |
|
"epoch": 0.8619087650988895, |
|
"grad_norm": 0.32891350984573364, |
|
"learning_rate": 2.3554944993201487e-05, |
|
"loss": 2.1707, |
|
"step": 9740 |
|
}, |
|
{ |
|
"epoch": 0.8627936816955002, |
|
"grad_norm": 0.30388736724853516, |
|
"learning_rate": 2.325868377004986e-05, |
|
"loss": 2.1727, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 0.863678598292111, |
|
"grad_norm": 0.28365710377693176, |
|
"learning_rate": 2.296420654372966e-05, |
|
"loss": 2.1745, |
|
"step": 9760 |
|
}, |
|
{ |
|
"epoch": 0.8645635148887217, |
|
"grad_norm": 0.29059097170829773, |
|
"learning_rate": 2.2671515631182666e-05, |
|
"loss": 2.175, |
|
"step": 9770 |
|
}, |
|
{ |
|
"epoch": 0.8654484314853325, |
|
"grad_norm": 0.2463858276605606, |
|
"learning_rate": 2.2380613335296037e-05, |
|
"loss": 2.1817, |
|
"step": 9780 |
|
}, |
|
{ |
|
"epoch": 0.8663333480819433, |
|
"grad_norm": 0.30910032987594604, |
|
"learning_rate": 2.2091501944884073e-05, |
|
"loss": 2.1894, |
|
"step": 9790 |
|
}, |
|
{ |
|
"epoch": 0.867218264678554, |
|
"grad_norm": 0.2667929530143738, |
|
"learning_rate": 2.1804183734670273e-05, |
|
"loss": 2.1672, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.8681031812751648, |
|
"grad_norm": 0.2770427167415619, |
|
"learning_rate": 2.15186609652695e-05, |
|
"loss": 2.1676, |
|
"step": 9810 |
|
}, |
|
{ |
|
"epoch": 0.8689880978717756, |
|
"grad_norm": 0.2695925831794739, |
|
"learning_rate": 2.1234935883170047e-05, |
|
"loss": 2.1946, |
|
"step": 9820 |
|
}, |
|
{ |
|
"epoch": 0.8698730144683864, |
|
"grad_norm": 0.24846522510051727, |
|
"learning_rate": 2.0953010720716037e-05, |
|
"loss": 2.1797, |
|
"step": 9830 |
|
}, |
|
{ |
|
"epoch": 0.8707579310649971, |
|
"grad_norm": 0.2570924758911133, |
|
"learning_rate": 2.0672887696089827e-05, |
|
"loss": 2.1913, |
|
"step": 9840 |
|
}, |
|
{ |
|
"epoch": 0.8716428476616079, |
|
"grad_norm": 0.25783997774124146, |
|
"learning_rate": 2.039456901329473e-05, |
|
"loss": 2.1552, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 0.8725277642582187, |
|
"grad_norm": 0.293927937746048, |
|
"learning_rate": 2.0118056862137357e-05, |
|
"loss": 2.1819, |
|
"step": 9860 |
|
}, |
|
{ |
|
"epoch": 0.8734126808548295, |
|
"grad_norm": 0.28890037536621094, |
|
"learning_rate": 1.9843353418210614e-05, |
|
"loss": 2.1729, |
|
"step": 9870 |
|
}, |
|
{ |
|
"epoch": 0.8742975974514402, |
|
"grad_norm": 0.2489652782678604, |
|
"learning_rate": 1.9570460842876532e-05, |
|
"loss": 2.1802, |
|
"step": 9880 |
|
}, |
|
{ |
|
"epoch": 0.8751825140480509, |
|
"grad_norm": 0.2771342098712921, |
|
"learning_rate": 1.9299381283249317e-05, |
|
"loss": 2.18, |
|
"step": 9890 |
|
}, |
|
{ |
|
"epoch": 0.8760674306446617, |
|
"grad_norm": 0.28583505749702454, |
|
"learning_rate": 1.9030116872178316e-05, |
|
"loss": 2.1947, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.8769523472412725, |
|
"grad_norm": 0.2841069996356964, |
|
"learning_rate": 1.8762669728231373e-05, |
|
"loss": 2.1661, |
|
"step": 9910 |
|
}, |
|
{ |
|
"epoch": 0.8778372638378833, |
|
"grad_norm": 0.25715571641921997, |
|
"learning_rate": 1.8497041955678057e-05, |
|
"loss": 2.1709, |
|
"step": 9920 |
|
}, |
|
{ |
|
"epoch": 0.878722180434494, |
|
"grad_norm": 0.2675383985042572, |
|
"learning_rate": 1.823323564447313e-05, |
|
"loss": 2.1752, |
|
"step": 9930 |
|
}, |
|
{ |
|
"epoch": 0.8796070970311048, |
|
"grad_norm": 0.2713576555252075, |
|
"learning_rate": 1.797125287024029e-05, |
|
"loss": 2.1795, |
|
"step": 9940 |
|
}, |
|
{ |
|
"epoch": 0.8804920136277156, |
|
"grad_norm": 0.2969569265842438, |
|
"learning_rate": 1.7711095694255468e-05, |
|
"loss": 2.1897, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 0.8813769302243264, |
|
"grad_norm": 0.27227532863616943, |
|
"learning_rate": 1.7452766163430972e-05, |
|
"loss": 2.1905, |
|
"step": 9960 |
|
}, |
|
{ |
|
"epoch": 0.8822618468209371, |
|
"grad_norm": 0.30250978469848633, |
|
"learning_rate": 1.719626631029911e-05, |
|
"loss": 2.1664, |
|
"step": 9970 |
|
}, |
|
{ |
|
"epoch": 0.8831467634175479, |
|
"grad_norm": 0.2830178737640381, |
|
"learning_rate": 1.6941598152996453e-05, |
|
"loss": 2.1834, |
|
"step": 9980 |
|
}, |
|
{ |
|
"epoch": 0.8840316800141587, |
|
"grad_norm": 0.2569063603878021, |
|
"learning_rate": 1.668876369524769e-05, |
|
"loss": 2.1794, |
|
"step": 9990 |
|
}, |
|
{ |
|
"epoch": 0.8849165966107695, |
|
"grad_norm": 0.2609952390193939, |
|
"learning_rate": 1.6437764926350073e-05, |
|
"loss": 2.1844, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.8849165966107695, |
|
"eval_accuracy": 0.5528685849440567, |
|
"eval_loss": 2.0780797004699707, |
|
"eval_runtime": 12.3984, |
|
"eval_samples_per_second": 25.649, |
|
"eval_steps_per_second": 0.403, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.8858015132073802, |
|
"grad_norm": 0.31888630986213684, |
|
"learning_rate": 1.6188603821157583e-05, |
|
"loss": 2.1743, |
|
"step": 10010 |
|
}, |
|
{ |
|
"epoch": 0.8866864298039909, |
|
"grad_norm": 0.251662015914917, |
|
"learning_rate": 1.59412823400657e-05, |
|
"loss": 2.1553, |
|
"step": 10020 |
|
}, |
|
{ |
|
"epoch": 0.8875713464006018, |
|
"grad_norm": 0.32126525044441223, |
|
"learning_rate": 1.569580242899557e-05, |
|
"loss": 2.1664, |
|
"step": 10030 |
|
}, |
|
{ |
|
"epoch": 0.8884562629972125, |
|
"grad_norm": 0.27544891834259033, |
|
"learning_rate": 1.5452166019378987e-05, |
|
"loss": 2.1668, |
|
"step": 10040 |
|
}, |
|
{ |
|
"epoch": 0.8893411795938233, |
|
"grad_norm": 0.25146690011024475, |
|
"learning_rate": 1.5210375028143097e-05, |
|
"loss": 2.165, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 0.890226096190434, |
|
"grad_norm": 0.2671303153038025, |
|
"learning_rate": 1.497043135769524e-05, |
|
"loss": 2.173, |
|
"step": 10060 |
|
}, |
|
{ |
|
"epoch": 0.8911110127870449, |
|
"grad_norm": 0.25551095604896545, |
|
"learning_rate": 1.4732336895908278e-05, |
|
"loss": 2.1663, |
|
"step": 10070 |
|
}, |
|
{ |
|
"epoch": 0.8919959293836556, |
|
"grad_norm": 0.2685263752937317, |
|
"learning_rate": 1.4496093516105258e-05, |
|
"loss": 2.1714, |
|
"step": 10080 |
|
}, |
|
{ |
|
"epoch": 0.8928808459802664, |
|
"grad_norm": 0.25145915150642395, |
|
"learning_rate": 1.4261703077045218e-05, |
|
"loss": 2.1556, |
|
"step": 10090 |
|
}, |
|
{ |
|
"epoch": 0.8937657625768771, |
|
"grad_norm": 0.25494644045829773, |
|
"learning_rate": 1.4029167422908107e-05, |
|
"loss": 2.1627, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.894650679173488, |
|
"grad_norm": 0.2593258023262024, |
|
"learning_rate": 1.3798488383280488e-05, |
|
"loss": 2.1478, |
|
"step": 10110 |
|
}, |
|
{ |
|
"epoch": 0.8955355957700987, |
|
"grad_norm": 0.36344897747039795, |
|
"learning_rate": 1.3569667773141142e-05, |
|
"loss": 2.1828, |
|
"step": 10120 |
|
}, |
|
{ |
|
"epoch": 0.8964205123667094, |
|
"grad_norm": 0.2809944152832031, |
|
"learning_rate": 1.3342707392846792e-05, |
|
"loss": 2.1709, |
|
"step": 10130 |
|
}, |
|
{ |
|
"epoch": 0.8973054289633202, |
|
"grad_norm": 0.24097634851932526, |
|
"learning_rate": 1.3117609028117817e-05, |
|
"loss": 2.1602, |
|
"step": 10140 |
|
}, |
|
{ |
|
"epoch": 0.8981903455599309, |
|
"grad_norm": 0.28794658184051514, |
|
"learning_rate": 1.2894374450024338e-05, |
|
"loss": 2.1823, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 0.8990752621565418, |
|
"grad_norm": 0.3050318658351898, |
|
"learning_rate": 1.2673005414972184e-05, |
|
"loss": 2.1651, |
|
"step": 10160 |
|
}, |
|
{ |
|
"epoch": 0.8999601787531525, |
|
"grad_norm": 0.25824499130249023, |
|
"learning_rate": 1.2453503664689282e-05, |
|
"loss": 2.1717, |
|
"step": 10170 |
|
}, |
|
{ |
|
"epoch": 0.9008450953497633, |
|
"grad_norm": 0.26507601141929626, |
|
"learning_rate": 1.2235870926211617e-05, |
|
"loss": 2.175, |
|
"step": 10180 |
|
}, |
|
{ |
|
"epoch": 0.901730011946374, |
|
"grad_norm": 0.2640542685985565, |
|
"learning_rate": 1.2020108911869888e-05, |
|
"loss": 2.1734, |
|
"step": 10190 |
|
}, |
|
{ |
|
"epoch": 0.9026149285429849, |
|
"grad_norm": 0.2744889557361603, |
|
"learning_rate": 1.1806219319275918e-05, |
|
"loss": 2.1503, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.9034998451395956, |
|
"grad_norm": 0.2510084807872772, |
|
"learning_rate": 1.1594203831309491e-05, |
|
"loss": 2.1853, |
|
"step": 10210 |
|
}, |
|
{ |
|
"epoch": 0.9043847617362064, |
|
"grad_norm": 0.2631700932979584, |
|
"learning_rate": 1.138406411610482e-05, |
|
"loss": 2.188, |
|
"step": 10220 |
|
}, |
|
{ |
|
"epoch": 0.9052696783328171, |
|
"grad_norm": 0.2694438099861145, |
|
"learning_rate": 1.1175801827037618e-05, |
|
"loss": 2.1763, |
|
"step": 10230 |
|
}, |
|
{ |
|
"epoch": 0.9061545949294278, |
|
"grad_norm": 0.2635989189147949, |
|
"learning_rate": 1.0969418602712e-05, |
|
"loss": 2.1731, |
|
"step": 10240 |
|
}, |
|
{ |
|
"epoch": 0.9070395115260387, |
|
"grad_norm": 0.26256707310676575, |
|
"learning_rate": 1.0764916066947795e-05, |
|
"loss": 2.1564, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 0.9079244281226494, |
|
"grad_norm": 0.2520720958709717, |
|
"learning_rate": 1.0562295828767388e-05, |
|
"loss": 2.158, |
|
"step": 10260 |
|
}, |
|
{ |
|
"epoch": 0.9088093447192602, |
|
"grad_norm": 0.2579314410686493, |
|
"learning_rate": 1.0361559482383404e-05, |
|
"loss": 2.1648, |
|
"step": 10270 |
|
}, |
|
{ |
|
"epoch": 0.909694261315871, |
|
"grad_norm": 0.280298113822937, |
|
"learning_rate": 1.0162708607186045e-05, |
|
"loss": 2.1658, |
|
"step": 10280 |
|
}, |
|
{ |
|
"epoch": 0.9105791779124818, |
|
"grad_norm": 0.27980297803878784, |
|
"learning_rate": 9.965744767730545e-06, |
|
"loss": 2.174, |
|
"step": 10290 |
|
}, |
|
{ |
|
"epoch": 0.9114640945090925, |
|
"grad_norm": 0.30590036511421204, |
|
"learning_rate": 9.770669513725128e-06, |
|
"loss": 2.1489, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.9123490111057033, |
|
"grad_norm": 0.2882310152053833, |
|
"learning_rate": 9.57748438001857e-06, |
|
"loss": 2.1724, |
|
"step": 10310 |
|
}, |
|
{ |
|
"epoch": 0.913233927702314, |
|
"grad_norm": 0.30974969267845154, |
|
"learning_rate": 9.386190886588208e-06, |
|
"loss": 2.1528, |
|
"step": 10320 |
|
}, |
|
{ |
|
"epoch": 0.9141188442989249, |
|
"grad_norm": 0.2538847029209137, |
|
"learning_rate": 9.196790538527982e-06, |
|
"loss": 2.1783, |
|
"step": 10330 |
|
}, |
|
{ |
|
"epoch": 0.9150037608955356, |
|
"grad_norm": 0.3312051594257355, |
|
"learning_rate": 9.00928482603669e-06, |
|
"loss": 2.1768, |
|
"step": 10340 |
|
}, |
|
{ |
|
"epoch": 0.9158886774921464, |
|
"grad_norm": 0.261616975069046, |
|
"learning_rate": 8.823675224406053e-06, |
|
"loss": 2.1665, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 0.9167735940887571, |
|
"grad_norm": 0.26149144768714905, |
|
"learning_rate": 8.639963194009282e-06, |
|
"loss": 2.1672, |
|
"step": 10360 |
|
}, |
|
{ |
|
"epoch": 0.9176585106853679, |
|
"grad_norm": 0.27541011571884155, |
|
"learning_rate": 8.458150180289504e-06, |
|
"loss": 2.1698, |
|
"step": 10370 |
|
}, |
|
{ |
|
"epoch": 0.9185434272819787, |
|
"grad_norm": 0.24506379663944244, |
|
"learning_rate": 8.278237613748408e-06, |
|
"loss": 2.1628, |
|
"step": 10380 |
|
}, |
|
{ |
|
"epoch": 0.9194283438785894, |
|
"grad_norm": 0.304977148771286, |
|
"learning_rate": 8.10022690993506e-06, |
|
"loss": 2.1522, |
|
"step": 10390 |
|
}, |
|
{ |
|
"epoch": 0.9203132604752002, |
|
"grad_norm": 0.2372012585401535, |
|
"learning_rate": 7.924119469434665e-06, |
|
"loss": 2.1558, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.921198177071811, |
|
"grad_norm": 0.26251688599586487, |
|
"learning_rate": 7.749916677857544e-06, |
|
"loss": 2.1829, |
|
"step": 10410 |
|
}, |
|
{ |
|
"epoch": 0.9220830936684218, |
|
"grad_norm": 0.26747065782546997, |
|
"learning_rate": 7.577619905828281e-06, |
|
"loss": 2.1815, |
|
"step": 10420 |
|
}, |
|
{ |
|
"epoch": 0.9229680102650325, |
|
"grad_norm": 0.2626737654209137, |
|
"learning_rate": 7.4072305089750155e-06, |
|
"loss": 2.176, |
|
"step": 10430 |
|
}, |
|
{ |
|
"epoch": 0.9238529268616433, |
|
"grad_norm": 0.33040159940719604, |
|
"learning_rate": 7.238749827918639e-06, |
|
"loss": 2.1718, |
|
"step": 10440 |
|
}, |
|
{ |
|
"epoch": 0.924737843458254, |
|
"grad_norm": 0.2599565386772156, |
|
"learning_rate": 7.072179188262251e-06, |
|
"loss": 2.1781, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 0.9256227600548649, |
|
"grad_norm": 0.24637052416801453, |
|
"learning_rate": 6.907519900580861e-06, |
|
"loss": 2.1651, |
|
"step": 10460 |
|
}, |
|
{ |
|
"epoch": 0.9265076766514756, |
|
"grad_norm": 0.25145140290260315, |
|
"learning_rate": 6.744773260410869e-06, |
|
"loss": 2.1819, |
|
"step": 10470 |
|
}, |
|
{ |
|
"epoch": 0.9273925932480863, |
|
"grad_norm": 0.24591967463493347, |
|
"learning_rate": 6.583940548240186e-06, |
|
"loss": 2.1876, |
|
"step": 10480 |
|
}, |
|
{ |
|
"epoch": 0.9282775098446971, |
|
"grad_norm": 0.3043903708457947, |
|
"learning_rate": 6.425023029497823e-06, |
|
"loss": 2.1796, |
|
"step": 10490 |
|
}, |
|
{ |
|
"epoch": 0.9291624264413079, |
|
"grad_norm": 0.2612435519695282, |
|
"learning_rate": 6.268021954544096e-06, |
|
"loss": 2.1849, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.9300473430379187, |
|
"grad_norm": 0.27911072969436646, |
|
"learning_rate": 6.112938558660852e-06, |
|
"loss": 2.1609, |
|
"step": 10510 |
|
}, |
|
{ |
|
"epoch": 0.9309322596345294, |
|
"grad_norm": 0.23749688267707825, |
|
"learning_rate": 5.95977406204154e-06, |
|
"loss": 2.1698, |
|
"step": 10520 |
|
}, |
|
{ |
|
"epoch": 0.9318171762311402, |
|
"grad_norm": 0.3167470395565033, |
|
"learning_rate": 5.808529669781903e-06, |
|
"loss": 2.1715, |
|
"step": 10530 |
|
}, |
|
{ |
|
"epoch": 0.932702092827751, |
|
"grad_norm": 0.28257447481155396, |
|
"learning_rate": 5.659206571870218e-06, |
|
"loss": 2.1672, |
|
"step": 10540 |
|
}, |
|
{ |
|
"epoch": 0.9335870094243618, |
|
"grad_norm": 0.30499503016471863, |
|
"learning_rate": 5.5118059431781e-06, |
|
"loss": 2.1625, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 0.9344719260209725, |
|
"grad_norm": 0.27074891328811646, |
|
"learning_rate": 5.3663289434511546e-06, |
|
"loss": 2.1757, |
|
"step": 10560 |
|
}, |
|
{ |
|
"epoch": 0.9353568426175833, |
|
"grad_norm": 0.4108920395374298, |
|
"learning_rate": 5.222776717300009e-06, |
|
"loss": 2.1518, |
|
"step": 10570 |
|
}, |
|
{ |
|
"epoch": 0.9362417592141941, |
|
"grad_norm": 0.24994902312755585, |
|
"learning_rate": 5.0811503941911304e-06, |
|
"loss": 2.181, |
|
"step": 10580 |
|
}, |
|
{ |
|
"epoch": 0.9371266758108048, |
|
"grad_norm": 0.2533867657184601, |
|
"learning_rate": 4.941451088437993e-06, |
|
"loss": 2.1959, |
|
"step": 10590 |
|
}, |
|
{ |
|
"epoch": 0.9380115924074156, |
|
"grad_norm": 0.2800501883029938, |
|
"learning_rate": 4.803679899192393e-06, |
|
"loss": 2.1684, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.9388965090040263, |
|
"grad_norm": 0.3431278467178345, |
|
"learning_rate": 4.667837910435707e-06, |
|
"loss": 2.1656, |
|
"step": 10610 |
|
}, |
|
{ |
|
"epoch": 0.9397814256006372, |
|
"grad_norm": 0.2510489225387573, |
|
"learning_rate": 4.5339261909704e-06, |
|
"loss": 2.1566, |
|
"step": 10620 |
|
}, |
|
{ |
|
"epoch": 0.9406663421972479, |
|
"grad_norm": 0.26005586981773376, |
|
"learning_rate": 4.401945794411611e-06, |
|
"loss": 2.1758, |
|
"step": 10630 |
|
}, |
|
{ |
|
"epoch": 0.9415512587938587, |
|
"grad_norm": 0.2854231595993042, |
|
"learning_rate": 4.271897759178883e-06, |
|
"loss": 2.1784, |
|
"step": 10640 |
|
}, |
|
{ |
|
"epoch": 0.9424361753904694, |
|
"grad_norm": 0.2713136672973633, |
|
"learning_rate": 4.143783108487897e-06, |
|
"loss": 2.1916, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 0.9433210919870803, |
|
"grad_norm": 0.2967517375946045, |
|
"learning_rate": 4.017602850342584e-06, |
|
"loss": 2.2032, |
|
"step": 10660 |
|
}, |
|
{ |
|
"epoch": 0.944206008583691, |
|
"grad_norm": 0.2542577087879181, |
|
"learning_rate": 3.893357977527101e-06, |
|
"loss": 2.1896, |
|
"step": 10670 |
|
}, |
|
{ |
|
"epoch": 0.9450909251803018, |
|
"grad_norm": 0.25759467482566833, |
|
"learning_rate": 3.771049467597959e-06, |
|
"loss": 2.1535, |
|
"step": 10680 |
|
}, |
|
{ |
|
"epoch": 0.9459758417769125, |
|
"grad_norm": 0.24098724126815796, |
|
"learning_rate": 3.650678282876463e-06, |
|
"loss": 2.1857, |
|
"step": 10690 |
|
}, |
|
{ |
|
"epoch": 0.9468607583735233, |
|
"grad_norm": 0.24877989292144775, |
|
"learning_rate": 3.5322453704410283e-06, |
|
"loss": 2.1735, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.9477456749701341, |
|
"grad_norm": 0.2515574097633362, |
|
"learning_rate": 3.4157516621198536e-06, |
|
"loss": 2.1655, |
|
"step": 10710 |
|
}, |
|
{ |
|
"epoch": 0.9486305915667448, |
|
"grad_norm": 0.41679874062538147, |
|
"learning_rate": 3.301198074483397e-06, |
|
"loss": 2.1798, |
|
"step": 10720 |
|
}, |
|
{ |
|
"epoch": 0.9495155081633556, |
|
"grad_norm": 0.2788831293582916, |
|
"learning_rate": 3.1885855088374104e-06, |
|
"loss": 2.1772, |
|
"step": 10730 |
|
}, |
|
{ |
|
"epoch": 0.9504004247599663, |
|
"grad_norm": 0.2543758749961853, |
|
"learning_rate": 3.077914851215585e-06, |
|
"loss": 2.1808, |
|
"step": 10740 |
|
}, |
|
{ |
|
"epoch": 0.9512853413565772, |
|
"grad_norm": 0.2551940977573395, |
|
"learning_rate": 2.969186972372806e-06, |
|
"loss": 2.1826, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 0.9521702579531879, |
|
"grad_norm": 0.24164696037769318, |
|
"learning_rate": 2.862402727778185e-06, |
|
"loss": 2.1888, |
|
"step": 10760 |
|
}, |
|
{ |
|
"epoch": 0.9530551745497987, |
|
"grad_norm": 0.2645455002784729, |
|
"learning_rate": 2.757562957608373e-06, |
|
"loss": 2.1626, |
|
"step": 10770 |
|
}, |
|
{ |
|
"epoch": 0.9539400911464094, |
|
"grad_norm": 0.32282203435897827, |
|
"learning_rate": 2.654668486740841e-06, |
|
"loss": 2.1498, |
|
"step": 10780 |
|
}, |
|
{ |
|
"epoch": 0.9548250077430203, |
|
"grad_norm": 0.3102979063987732, |
|
"learning_rate": 2.5537201247475828e-06, |
|
"loss": 2.1726, |
|
"step": 10790 |
|
}, |
|
{ |
|
"epoch": 0.955709924339631, |
|
"grad_norm": 0.2809552252292633, |
|
"learning_rate": 2.454718665888589e-06, |
|
"loss": 2.1684, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.9565948409362418, |
|
"grad_norm": 0.3085186183452606, |
|
"learning_rate": 2.357664889105687e-06, |
|
"loss": 2.1708, |
|
"step": 10810 |
|
}, |
|
{ |
|
"epoch": 0.9574797575328525, |
|
"grad_norm": 0.2588469684123993, |
|
"learning_rate": 2.262559558016325e-06, |
|
"loss": 2.1536, |
|
"step": 10820 |
|
}, |
|
{ |
|
"epoch": 0.9583646741294632, |
|
"grad_norm": 0.30835458636283875, |
|
"learning_rate": 2.169403420907601e-06, |
|
"loss": 2.2094, |
|
"step": 10830 |
|
}, |
|
{ |
|
"epoch": 0.9592495907260741, |
|
"grad_norm": 0.2416762262582779, |
|
"learning_rate": 2.078197210730465e-06, |
|
"loss": 2.1617, |
|
"step": 10840 |
|
}, |
|
{ |
|
"epoch": 0.9601345073226848, |
|
"grad_norm": 0.311798632144928, |
|
"learning_rate": 1.9889416450938334e-06, |
|
"loss": 2.1897, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 0.9610194239192956, |
|
"grad_norm": 0.2555478811264038, |
|
"learning_rate": 1.901637426258984e-06, |
|
"loss": 2.1769, |
|
"step": 10860 |
|
}, |
|
{ |
|
"epoch": 0.9619043405159063, |
|
"grad_norm": 0.2964852452278137, |
|
"learning_rate": 1.8162852411340025e-06, |
|
"loss": 2.1741, |
|
"step": 10870 |
|
}, |
|
{ |
|
"epoch": 0.9627892571125172, |
|
"grad_norm": 0.3563735783100128, |
|
"learning_rate": 1.7328857612684267e-06, |
|
"loss": 2.1444, |
|
"step": 10880 |
|
}, |
|
{ |
|
"epoch": 0.9636741737091279, |
|
"grad_norm": 0.24288448691368103, |
|
"learning_rate": 1.6514396428480017e-06, |
|
"loss": 2.1926, |
|
"step": 10890 |
|
}, |
|
{ |
|
"epoch": 0.9645590903057387, |
|
"grad_norm": 0.3532714247703552, |
|
"learning_rate": 1.571947526689349e-06, |
|
"loss": 2.1464, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.9654440069023494, |
|
"grad_norm": 0.250182569026947, |
|
"learning_rate": 1.494410038235139e-06, |
|
"loss": 2.1819, |
|
"step": 10910 |
|
}, |
|
{ |
|
"epoch": 0.9663289234989603, |
|
"grad_norm": 0.2672261595726013, |
|
"learning_rate": 1.418827787548982e-06, |
|
"loss": 2.1882, |
|
"step": 10920 |
|
}, |
|
{ |
|
"epoch": 0.967213840095571, |
|
"grad_norm": 0.24856804311275482, |
|
"learning_rate": 1.3452013693107667e-06, |
|
"loss": 2.1657, |
|
"step": 10930 |
|
}, |
|
{ |
|
"epoch": 0.9680987566921818, |
|
"grad_norm": 0.30384114384651184, |
|
"learning_rate": 1.273531362811914e-06, |
|
"loss": 2.1746, |
|
"step": 10940 |
|
}, |
|
{ |
|
"epoch": 0.9689836732887925, |
|
"grad_norm": 0.31302690505981445, |
|
"learning_rate": 1.2038183319507957e-06, |
|
"loss": 2.1761, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 0.9698685898854033, |
|
"grad_norm": 0.2600124180316925, |
|
"learning_rate": 1.1360628252283511e-06, |
|
"loss": 2.1481, |
|
"step": 10960 |
|
}, |
|
{ |
|
"epoch": 0.9707535064820141, |
|
"grad_norm": 0.34927940368652344, |
|
"learning_rate": 1.0702653757437564e-06, |
|
"loss": 2.1751, |
|
"step": 10970 |
|
}, |
|
{ |
|
"epoch": 0.9716384230786248, |
|
"grad_norm": 0.34013205766677856, |
|
"learning_rate": 1.006426501190233e-06, |
|
"loss": 2.1871, |
|
"step": 10980 |
|
}, |
|
{ |
|
"epoch": 0.9725233396752356, |
|
"grad_norm": 0.23738481104373932, |
|
"learning_rate": 9.445467038509958e-07, |
|
"loss": 2.1675, |
|
"step": 10990 |
|
}, |
|
{ |
|
"epoch": 0.9734082562718464, |
|
"grad_norm": 0.2345409244298935, |
|
"learning_rate": 8.84626470595229e-07, |
|
"loss": 2.167, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.9734082562718464, |
|
"eval_accuracy": 0.5530912832799625, |
|
"eval_loss": 2.076953411102295, |
|
"eval_runtime": 12.1468, |
|
"eval_samples_per_second": 26.18, |
|
"eval_steps_per_second": 0.412, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.9742931728684572, |
|
"grad_norm": 0.24096181988716125, |
|
"learning_rate": 8.266662728742547e-07, |
|
"loss": 2.168, |
|
"step": 11010 |
|
}, |
|
{ |
|
"epoch": 0.9751780894650679, |
|
"grad_norm": 0.2738332748413086, |
|
"learning_rate": 7.70666566718009e-07, |
|
"loss": 2.1738, |
|
"step": 11020 |
|
}, |
|
{ |
|
"epoch": 0.9760630060616787, |
|
"grad_norm": 0.30326494574546814, |
|
"learning_rate": 7.166277927311837e-07, |
|
"loss": 2.1737, |
|
"step": 11030 |
|
}, |
|
{ |
|
"epoch": 0.9769479226582894, |
|
"grad_norm": 0.2905402183532715, |
|
"learning_rate": 6.645503760899507e-07, |
|
"loss": 2.1725, |
|
"step": 11040 |
|
}, |
|
{ |
|
"epoch": 0.9778328392549003, |
|
"grad_norm": 0.25171908736228943, |
|
"learning_rate": 6.144347265384931e-07, |
|
"loss": 2.18, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 0.978717755851511, |
|
"grad_norm": 0.26368576288223267, |
|
"learning_rate": 5.662812383859795e-07, |
|
"loss": 2.1911, |
|
"step": 11060 |
|
}, |
|
{ |
|
"epoch": 0.9796026724481217, |
|
"grad_norm": 0.2813120484352112, |
|
"learning_rate": 5.20090290503178e-07, |
|
"loss": 2.1725, |
|
"step": 11070 |
|
}, |
|
{ |
|
"epoch": 0.9804875890447325, |
|
"grad_norm": 0.34227675199508667, |
|
"learning_rate": 4.7586224631968047e-07, |
|
"loss": 2.1664, |
|
"step": 11080 |
|
}, |
|
{ |
|
"epoch": 0.9813725056413433, |
|
"grad_norm": 0.2556191384792328, |
|
"learning_rate": 4.335974538210441e-07, |
|
"loss": 2.1698, |
|
"step": 11090 |
|
}, |
|
{ |
|
"epoch": 0.9822574222379541, |
|
"grad_norm": 0.2765117883682251, |
|
"learning_rate": 3.932962455458489e-07, |
|
"loss": 2.1849, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.9831423388345648, |
|
"grad_norm": 0.2285272777080536, |
|
"learning_rate": 3.5495893858342174e-07, |
|
"loss": 2.1529, |
|
"step": 11110 |
|
}, |
|
{ |
|
"epoch": 0.9840272554311756, |
|
"grad_norm": 0.24666184186935425, |
|
"learning_rate": 3.1858583457095026e-07, |
|
"loss": 2.1711, |
|
"step": 11120 |
|
}, |
|
{ |
|
"epoch": 0.9849121720277864, |
|
"grad_norm": 0.2382645606994629, |
|
"learning_rate": 2.8417721969145604e-07, |
|
"loss": 2.1318, |
|
"step": 11130 |
|
}, |
|
{ |
|
"epoch": 0.9857970886243972, |
|
"grad_norm": 0.2539430558681488, |
|
"learning_rate": 2.5173336467135267e-07, |
|
"loss": 2.1724, |
|
"step": 11140 |
|
}, |
|
{ |
|
"epoch": 0.9866820052210079, |
|
"grad_norm": 0.30961015820503235, |
|
"learning_rate": 2.2125452477828045e-07, |
|
"loss": 2.1845, |
|
"step": 11150 |
|
}, |
|
{ |
|
"epoch": 0.9875669218176187, |
|
"grad_norm": 0.24505238234996796, |
|
"learning_rate": 1.9274093981927476e-07, |
|
"loss": 2.1824, |
|
"step": 11160 |
|
}, |
|
{ |
|
"epoch": 0.9884518384142295, |
|
"grad_norm": 0.32805320620536804, |
|
"learning_rate": 1.66192834138712e-07, |
|
"loss": 2.1744, |
|
"step": 11170 |
|
}, |
|
{ |
|
"epoch": 0.9893367550108402, |
|
"grad_norm": 0.2783229649066925, |
|
"learning_rate": 1.4161041661667208e-07, |
|
"loss": 2.1709, |
|
"step": 11180 |
|
}, |
|
{ |
|
"epoch": 0.990221671607451, |
|
"grad_norm": 0.2550203502178192, |
|
"learning_rate": 1.1899388066718975e-07, |
|
"loss": 2.1681, |
|
"step": 11190 |
|
}, |
|
{ |
|
"epoch": 0.9911065882040617, |
|
"grad_norm": 0.2780303657054901, |
|
"learning_rate": 9.834340423678367e-08, |
|
"loss": 2.149, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.9919915048006726, |
|
"grad_norm": 0.24265854060649872, |
|
"learning_rate": 7.965914980304079e-08, |
|
"loss": 2.1807, |
|
"step": 11210 |
|
}, |
|
{ |
|
"epoch": 0.9928764213972833, |
|
"grad_norm": 0.31807270646095276, |
|
"learning_rate": 6.294126437336733e-08, |
|
"loss": 2.1778, |
|
"step": 11220 |
|
}, |
|
{ |
|
"epoch": 0.9937613379938941, |
|
"grad_norm": 0.27723586559295654, |
|
"learning_rate": 4.818987948379538e-08, |
|
"loss": 2.1862, |
|
"step": 11230 |
|
}, |
|
{ |
|
"epoch": 0.9946462545905048, |
|
"grad_norm": 0.3337404131889343, |
|
"learning_rate": 3.5405111197955865e-08, |
|
"loss": 2.1644, |
|
"step": 11240 |
|
}, |
|
{ |
|
"epoch": 0.9955311711871156, |
|
"grad_norm": 0.24611864984035492, |
|
"learning_rate": 2.4587060106245897e-08, |
|
"loss": 2.1762, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 0.9964160877837264, |
|
"grad_norm": 0.2372807413339615, |
|
"learning_rate": 1.5735811324857354e-08, |
|
"loss": 2.1612, |
|
"step": 11260 |
|
}, |
|
{ |
|
"epoch": 0.9973010043803372, |
|
"grad_norm": 0.2733718752861023, |
|
"learning_rate": 8.851434495277256e-09, |
|
"loss": 2.1666, |
|
"step": 11270 |
|
}, |
|
{ |
|
"epoch": 0.9981859209769479, |
|
"grad_norm": 0.3090679943561554, |
|
"learning_rate": 3.933983783677153e-09, |
|
"loss": 2.1616, |
|
"step": 11280 |
|
}, |
|
{ |
|
"epoch": 0.9990708375735587, |
|
"grad_norm": 0.3332498371601105, |
|
"learning_rate": 9.834978804412752e-10, |
|
"loss": 2.1655, |
|
"step": 11290 |
|
}, |
|
{ |
|
"epoch": 0.9999557541701695, |
|
"grad_norm": 0.29820016026496887, |
|
"learning_rate": 0.0, |
|
"loss": 2.1751, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.9999557541701695, |
|
"step": 11300, |
|
"total_flos": 1.1360877451905964e+21, |
|
"train_loss": 2.231945479570237, |
|
"train_runtime": 110078.6466, |
|
"train_samples_per_second": 13.14, |
|
"train_steps_per_second": 0.103 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 11300, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 10000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1360877451905964e+21, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|