{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9999557541701695,
"eval_steps": 1000,
"global_step": 11300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 8.849165966107695e-05,
"eval_accuracy": 0.337381835495043,
"eval_loss": 5.641775608062744,
"eval_runtime": 12.0942,
"eval_samples_per_second": 26.294,
"eval_steps_per_second": 0.413,
"step": 1
},
{
"epoch": 0.0008849165966107694,
"grad_norm": 9.357199668884277,
"learning_rate": 5e-05,
"loss": 5.9637,
"step": 10
},
{
"epoch": 0.0017698331932215388,
"grad_norm": 3.343987226486206,
"learning_rate": 0.0001,
"loss": 5.3257,
"step": 20
},
{
"epoch": 0.002654749789832308,
"grad_norm": 1.7534539699554443,
"learning_rate": 0.00015,
"loss": 4.6726,
"step": 30
},
{
"epoch": 0.0035396663864430775,
"grad_norm": 0.7918533086776733,
"learning_rate": 0.0002,
"loss": 4.1863,
"step": 40
},
{
"epoch": 0.004424582983053847,
"grad_norm": 0.5450736880302429,
"learning_rate": 0.00025,
"loss": 3.8618,
"step": 50
},
{
"epoch": 0.005309499579664616,
"grad_norm": 0.4212624132633209,
"learning_rate": 0.0003,
"loss": 3.4744,
"step": 60
},
{
"epoch": 0.006194416176275386,
"grad_norm": 0.5554006695747375,
"learning_rate": 0.00035,
"loss": 3.209,
"step": 70
},
{
"epoch": 0.007079332772886155,
"grad_norm": 0.45975297689437866,
"learning_rate": 0.0004,
"loss": 3.0271,
"step": 80
},
{
"epoch": 0.007964249369496926,
"grad_norm": 0.4895482361316681,
"learning_rate": 0.00045000000000000004,
"loss": 2.8733,
"step": 90
},
{
"epoch": 0.008849165966107695,
"grad_norm": 0.6686625480651855,
"learning_rate": 0.0005,
"loss": 2.7663,
"step": 100
},
{
"epoch": 0.009734082562718464,
"grad_norm": 0.6015220880508423,
"learning_rate": 0.0004999990165021195,
"loss": 2.6571,
"step": 110
},
{
"epoch": 0.010618999159329233,
"grad_norm": 0.5138149261474609,
"learning_rate": 0.0004999960660162163,
"loss": 2.6347,
"step": 120
},
{
"epoch": 0.011503915755940003,
"grad_norm": 0.5395781397819519,
"learning_rate": 0.0004999911485655047,
"loss": 2.5949,
"step": 130
},
{
"epoch": 0.012388832352550772,
"grad_norm": 0.47543829679489136,
"learning_rate": 0.0004999842641886751,
"loss": 2.5891,
"step": 140
},
{
"epoch": 0.013273748949161541,
"grad_norm": 0.6284042000770569,
"learning_rate": 0.0004999754129398938,
"loss": 2.5148,
"step": 150
},
{
"epoch": 0.01415866554577231,
"grad_norm": 0.6861183047294617,
"learning_rate": 0.000499964594888802,
"loss": 2.4946,
"step": 160
},
{
"epoch": 0.01504358214238308,
"grad_norm": 0.5336579084396362,
"learning_rate": 0.0004999518101205162,
"loss": 2.4972,
"step": 170
},
{
"epoch": 0.01592849873899385,
"grad_norm": 0.6088137626647949,
"learning_rate": 0.0004999370587356267,
"loss": 2.4671,
"step": 180
},
{
"epoch": 0.01681341533560462,
"grad_norm": 0.8096277117729187,
"learning_rate": 0.000499920340850197,
"loss": 2.4413,
"step": 190
},
{
"epoch": 0.01769833193221539,
"grad_norm": 0.6315465569496155,
"learning_rate": 0.0004999016565957633,
"loss": 2.455,
"step": 200
},
{
"epoch": 0.018583248528826157,
"grad_norm": 0.49901631474494934,
"learning_rate": 0.0004998810061193329,
"loss": 2.4232,
"step": 210
},
{
"epoch": 0.019468165125436927,
"grad_norm": 0.46556416153907776,
"learning_rate": 0.0004998583895833834,
"loss": 2.4315,
"step": 220
},
{
"epoch": 0.020353081722047698,
"grad_norm": 0.7050290703773499,
"learning_rate": 0.0004998338071658613,
"loss": 2.4305,
"step": 230
},
{
"epoch": 0.021237998318658465,
"grad_norm": 0.89899742603302,
"learning_rate": 0.0004998072590601808,
"loss": 2.41,
"step": 240
},
{
"epoch": 0.022122914915269236,
"grad_norm": 0.5768831968307495,
"learning_rate": 0.0004997787454752217,
"loss": 2.4048,
"step": 250
},
{
"epoch": 0.023007831511880007,
"grad_norm": 0.659796953201294,
"learning_rate": 0.0004997482666353287,
"loss": 2.3955,
"step": 260
},
{
"epoch": 0.023892748108490774,
"grad_norm": 0.546999454498291,
"learning_rate": 0.0004997158227803086,
"loss": 2.3885,
"step": 270
},
{
"epoch": 0.024777664705101544,
"grad_norm": 0.9052286744117737,
"learning_rate": 0.000499681414165429,
"loss": 2.3975,
"step": 280
},
{
"epoch": 0.025662581301712315,
"grad_norm": 0.7265748977661133,
"learning_rate": 0.0004996450410614166,
"loss": 2.3928,
"step": 290
},
{
"epoch": 0.026547497898323082,
"grad_norm": 1.073904275894165,
"learning_rate": 0.0004996067037544541,
"loss": 2.3627,
"step": 300
},
{
"epoch": 0.027432414494933853,
"grad_norm": 0.6801771521568298,
"learning_rate": 0.000499566402546179,
"loss": 2.3652,
"step": 310
},
{
"epoch": 0.02831733109154462,
"grad_norm": 0.5730037689208984,
"learning_rate": 0.0004995241377536803,
"loss": 2.3587,
"step": 320
},
{
"epoch": 0.02920224768815539,
"grad_norm": 0.8887476325035095,
"learning_rate": 0.0004994799097094969,
"loss": 2.3759,
"step": 330
},
{
"epoch": 0.03008716428476616,
"grad_norm": 0.495370477437973,
"learning_rate": 0.000499433718761614,
"loss": 2.362,
"step": 340
},
{
"epoch": 0.03097208088137693,
"grad_norm": 0.5238328576087952,
"learning_rate": 0.0004993855652734615,
"loss": 2.3779,
"step": 350
},
{
"epoch": 0.0318569974779877,
"grad_norm": 0.7129193544387817,
"learning_rate": 0.0004993354496239101,
"loss": 2.3643,
"step": 360
},
{
"epoch": 0.03274191407459847,
"grad_norm": 0.6504082679748535,
"learning_rate": 0.0004992833722072688,
"loss": 2.3448,
"step": 370
},
{
"epoch": 0.03362683067120924,
"grad_norm": 0.690351665019989,
"learning_rate": 0.000499229333433282,
"loss": 2.3695,
"step": 380
},
{
"epoch": 0.034511747267820005,
"grad_norm": 0.762799859046936,
"learning_rate": 0.0004991733337271258,
"loss": 2.3541,
"step": 390
},
{
"epoch": 0.03539666386443078,
"grad_norm": 0.7155598998069763,
"learning_rate": 0.0004991153735294048,
"loss": 2.3481,
"step": 400
},
{
"epoch": 0.036281580461041546,
"grad_norm": 0.4801159203052521,
"learning_rate": 0.000499055453296149,
"loss": 2.3555,
"step": 410
},
{
"epoch": 0.03716649705765231,
"grad_norm": 0.48848673701286316,
"learning_rate": 0.0004989935734988098,
"loss": 2.3622,
"step": 420
},
{
"epoch": 0.03805141365426309,
"grad_norm": 0.46054649353027344,
"learning_rate": 0.0004989297346242562,
"loss": 2.3634,
"step": 430
},
{
"epoch": 0.038936330250873855,
"grad_norm": 0.5708670020103455,
"learning_rate": 0.0004988639371747717,
"loss": 2.34,
"step": 440
},
{
"epoch": 0.03982124684748462,
"grad_norm": 0.7245877981185913,
"learning_rate": 0.0004987961816680492,
"loss": 2.3564,
"step": 450
},
{
"epoch": 0.040706163444095396,
"grad_norm": 0.513332724571228,
"learning_rate": 0.0004987264686371881,
"loss": 2.3544,
"step": 460
},
{
"epoch": 0.04159108004070616,
"grad_norm": 0.5079577565193176,
"learning_rate": 0.0004986547986306892,
"loss": 2.3531,
"step": 470
},
{
"epoch": 0.04247599663731693,
"grad_norm": 0.8436957001686096,
"learning_rate": 0.000498581172212451,
"loss": 2.3402,
"step": 480
},
{
"epoch": 0.043360913233927705,
"grad_norm": 0.5677080750465393,
"learning_rate": 0.0004985055899617649,
"loss": 2.3315,
"step": 490
},
{
"epoch": 0.04424582983053847,
"grad_norm": 0.4759403467178345,
"learning_rate": 0.0004984280524733107,
"loss": 2.326,
"step": 500
},
{
"epoch": 0.04513074642714924,
"grad_norm": 0.45146846771240234,
"learning_rate": 0.0004983485603571521,
"loss": 2.3177,
"step": 510
},
{
"epoch": 0.04601566302376001,
"grad_norm": 0.6578854322433472,
"learning_rate": 0.0004982671142387316,
"loss": 2.3379,
"step": 520
},
{
"epoch": 0.04690057962037078,
"grad_norm": 0.8977625370025635,
"learning_rate": 0.000498183714758866,
"loss": 2.3233,
"step": 530
},
{
"epoch": 0.04778549621698155,
"grad_norm": 0.5207841396331787,
"learning_rate": 0.0004980983625737411,
"loss": 2.3449,
"step": 540
},
{
"epoch": 0.04867041281359232,
"grad_norm": 0.563421905040741,
"learning_rate": 0.0004980110583549062,
"loss": 2.3111,
"step": 550
},
{
"epoch": 0.04955532941020309,
"grad_norm": 0.6460586786270142,
"learning_rate": 0.0004979218027892695,
"loss": 2.3382,
"step": 560
},
{
"epoch": 0.050440246006813856,
"grad_norm": 0.7345250844955444,
"learning_rate": 0.0004978305965790924,
"loss": 2.3141,
"step": 570
},
{
"epoch": 0.05132516260342463,
"grad_norm": 0.6413494348526001,
"learning_rate": 0.0004977374404419837,
"loss": 2.3172,
"step": 580
},
{
"epoch": 0.0522100792000354,
"grad_norm": 0.5809776186943054,
"learning_rate": 0.0004976423351108943,
"loss": 2.3214,
"step": 590
},
{
"epoch": 0.053094995796646165,
"grad_norm": 0.5282315015792847,
"learning_rate": 0.0004975452813341115,
"loss": 2.3188,
"step": 600
},
{
"epoch": 0.05397991239325694,
"grad_norm": 0.673841655254364,
"learning_rate": 0.0004974462798752524,
"loss": 2.3226,
"step": 610
},
{
"epoch": 0.054864828989867706,
"grad_norm": 0.8785530924797058,
"learning_rate": 0.0004973453315132592,
"loss": 2.3097,
"step": 620
},
{
"epoch": 0.05574974558647847,
"grad_norm": 0.7876306772232056,
"learning_rate": 0.0004972424370423917,
"loss": 2.3342,
"step": 630
},
{
"epoch": 0.05663466218308924,
"grad_norm": 0.5609032511711121,
"learning_rate": 0.0004971375972722218,
"loss": 2.3265,
"step": 640
},
{
"epoch": 0.057519578779700015,
"grad_norm": 0.730330228805542,
"learning_rate": 0.0004970308130276272,
"loss": 2.3289,
"step": 650
},
{
"epoch": 0.05840449537631078,
"grad_norm": 0.7334195971488953,
"learning_rate": 0.0004969220851487844,
"loss": 2.3107,
"step": 660
},
{
"epoch": 0.05928941197292155,
"grad_norm": 0.7410897612571716,
"learning_rate": 0.0004968114144911626,
"loss": 2.316,
"step": 670
},
{
"epoch": 0.06017432856953232,
"grad_norm": 0.5102954506874084,
"learning_rate": 0.0004966988019255166,
"loss": 2.3348,
"step": 680
},
{
"epoch": 0.06105924516614309,
"grad_norm": 0.48943185806274414,
"learning_rate": 0.0004965842483378802,
"loss": 2.324,
"step": 690
},
{
"epoch": 0.06194416176275386,
"grad_norm": 0.7627712488174438,
"learning_rate": 0.0004964677546295589,
"loss": 2.3016,
"step": 700
},
{
"epoch": 0.06282907835936463,
"grad_norm": 0.5588313937187195,
"learning_rate": 0.0004963493217171235,
"loss": 2.3134,
"step": 710
},
{
"epoch": 0.0637139949559754,
"grad_norm": 0.5578395128250122,
"learning_rate": 0.0004962289505324021,
"loss": 2.2991,
"step": 720
},
{
"epoch": 0.06459891155258617,
"grad_norm": 0.6175896525382996,
"learning_rate": 0.0004961066420224729,
"loss": 2.3257,
"step": 730
},
{
"epoch": 0.06548382814919694,
"grad_norm": 0.727881908416748,
"learning_rate": 0.0004959823971496574,
"loss": 2.2855,
"step": 740
},
{
"epoch": 0.06636874474580771,
"grad_norm": 0.6838656663894653,
"learning_rate": 0.0004958562168915122,
"loss": 2.2925,
"step": 750
},
{
"epoch": 0.06725366134241847,
"grad_norm": 0.6439931988716125,
"learning_rate": 0.0004957281022408211,
"loss": 2.3086,
"step": 760
},
{
"epoch": 0.06813857793902925,
"grad_norm": 0.7111929655075073,
"learning_rate": 0.0004955980542055883,
"loss": 2.3276,
"step": 770
},
{
"epoch": 0.06902349453564001,
"grad_norm": 0.5941621661186218,
"learning_rate": 0.0004954660738090296,
"loss": 2.2986,
"step": 780
},
{
"epoch": 0.06990841113225078,
"grad_norm": 1.118166446685791,
"learning_rate": 0.0004953321620895643,
"loss": 2.3091,
"step": 790
},
{
"epoch": 0.07079332772886156,
"grad_norm": 0.543308675289154,
"learning_rate": 0.0004951963201008077,
"loss": 2.3208,
"step": 800
},
{
"epoch": 0.07167824432547232,
"grad_norm": 0.6741182208061218,
"learning_rate": 0.000495058548911562,
"loss": 2.3007,
"step": 810
},
{
"epoch": 0.07256316092208309,
"grad_norm": 0.577864408493042,
"learning_rate": 0.0004949188496058089,
"loss": 2.3049,
"step": 820
},
{
"epoch": 0.07344807751869387,
"grad_norm": 0.5314656496047974,
"learning_rate": 0.0004947772232827,
"loss": 2.2865,
"step": 830
},
{
"epoch": 0.07433299411530463,
"grad_norm": 0.5619907975196838,
"learning_rate": 0.0004946336710565488,
"loss": 2.2991,
"step": 840
},
{
"epoch": 0.0752179107119154,
"grad_norm": 0.7731435298919678,
"learning_rate": 0.0004944881940568219,
"loss": 2.2954,
"step": 850
},
{
"epoch": 0.07610282730852617,
"grad_norm": 0.6936209201812744,
"learning_rate": 0.0004943407934281299,
"loss": 2.2966,
"step": 860
},
{
"epoch": 0.07698774390513693,
"grad_norm": 0.7555710673332214,
"learning_rate": 0.0004941914703302181,
"loss": 2.2794,
"step": 870
},
{
"epoch": 0.07787266050174771,
"grad_norm": 0.5199636220932007,
"learning_rate": 0.0004940402259379585,
"loss": 2.3113,
"step": 880
},
{
"epoch": 0.07875757709835848,
"grad_norm": 0.4673093557357788,
"learning_rate": 0.0004938870614413392,
"loss": 2.2965,
"step": 890
},
{
"epoch": 0.07964249369496924,
"grad_norm": 0.49087241291999817,
"learning_rate": 0.0004937319780454559,
"loss": 2.2903,
"step": 900
},
{
"epoch": 0.08052741029158002,
"grad_norm": 0.5380146503448486,
"learning_rate": 0.0004935749769705022,
"loss": 2.311,
"step": 910
},
{
"epoch": 0.08141232688819079,
"grad_norm": 0.8122909665107727,
"learning_rate": 0.0004934160594517598,
"loss": 2.2972,
"step": 920
},
{
"epoch": 0.08229724348480155,
"grad_norm": 0.4482613503932953,
"learning_rate": 0.0004932552267395891,
"loss": 2.2864,
"step": 930
},
{
"epoch": 0.08318216008141233,
"grad_norm": 0.4072429835796356,
"learning_rate": 0.0004930924800994192,
"loss": 2.2931,
"step": 940
},
{
"epoch": 0.0840670766780231,
"grad_norm": 0.840983510017395,
"learning_rate": 0.0004929278208117378,
"loss": 2.2763,
"step": 950
},
{
"epoch": 0.08495199327463386,
"grad_norm": 0.5435421466827393,
"learning_rate": 0.0004927612501720814,
"loss": 2.2896,
"step": 960
},
{
"epoch": 0.08583690987124463,
"grad_norm": 0.5765254497528076,
"learning_rate": 0.000492592769491025,
"loss": 2.2992,
"step": 970
},
{
"epoch": 0.08672182646785541,
"grad_norm": 0.6193447113037109,
"learning_rate": 0.0004924223800941717,
"loss": 2.3071,
"step": 980
},
{
"epoch": 0.08760674306446617,
"grad_norm": 0.9472047686576843,
"learning_rate": 0.0004922500833221425,
"loss": 2.2825,
"step": 990
},
{
"epoch": 0.08849165966107694,
"grad_norm": 0.5508486032485962,
"learning_rate": 0.0004920758805305654,
"loss": 2.2914,
"step": 1000
},
{
"epoch": 0.08849165966107694,
"eval_accuracy": 0.5370846484054032,
"eval_loss": 2.197066068649292,
"eval_runtime": 12.287,
"eval_samples_per_second": 25.881,
"eval_steps_per_second": 0.407,
"step": 1000
},
{
"epoch": 0.08937657625768772,
"grad_norm": 0.4986010789871216,
"learning_rate": 0.0004918997730900649,
"loss": 2.2682,
"step": 1010
},
{
"epoch": 0.09026149285429848,
"grad_norm": 0.5728856921195984,
"learning_rate": 0.0004917217623862517,
"loss": 2.2828,
"step": 1020
},
{
"epoch": 0.09114640945090925,
"grad_norm": 0.5309883952140808,
"learning_rate": 0.0004915418498197105,
"loss": 2.3083,
"step": 1030
},
{
"epoch": 0.09203132604752003,
"grad_norm": 0.46175774931907654,
"learning_rate": 0.0004913600368059907,
"loss": 2.2686,
"step": 1040
},
{
"epoch": 0.09291624264413079,
"grad_norm": 0.4882391691207886,
"learning_rate": 0.000491176324775594,
"loss": 2.2916,
"step": 1050
},
{
"epoch": 0.09380115924074156,
"grad_norm": 0.7018927335739136,
"learning_rate": 0.0004909907151739633,
"loss": 2.2805,
"step": 1060
},
{
"epoch": 0.09468607583735233,
"grad_norm": 0.6598804593086243,
"learning_rate": 0.0004908032094614721,
"loss": 2.3002,
"step": 1070
},
{
"epoch": 0.0955709924339631,
"grad_norm": 0.5327743887901306,
"learning_rate": 0.0004906138091134118,
"loss": 2.3015,
"step": 1080
},
{
"epoch": 0.09645590903057387,
"grad_norm": 0.5282323956489563,
"learning_rate": 0.0004904225156199815,
"loss": 2.2905,
"step": 1090
},
{
"epoch": 0.09734082562718464,
"grad_norm": 0.4804977774620056,
"learning_rate": 0.000490229330486275,
"loss": 2.278,
"step": 1100
},
{
"epoch": 0.0982257422237954,
"grad_norm": 0.6388362646102905,
"learning_rate": 0.0004900342552322694,
"loss": 2.2983,
"step": 1110
},
{
"epoch": 0.09911065882040618,
"grad_norm": 0.6902673244476318,
"learning_rate": 0.000489837291392814,
"loss": 2.2952,
"step": 1120
},
{
"epoch": 0.09999557541701695,
"grad_norm": 0.8888295292854309,
"learning_rate": 0.0004896384405176167,
"loss": 2.2779,
"step": 1130
},
{
"epoch": 0.10088049201362771,
"grad_norm": 0.7182716131210327,
"learning_rate": 0.0004894377041712326,
"loss": 2.2783,
"step": 1140
},
{
"epoch": 0.10176540861023849,
"grad_norm": 0.4691000282764435,
"learning_rate": 0.0004892350839330522,
"loss": 2.2996,
"step": 1150
},
{
"epoch": 0.10265032520684926,
"grad_norm": 0.6428681015968323,
"learning_rate": 0.000489030581397288,
"loss": 2.2673,
"step": 1160
},
{
"epoch": 0.10353524180346002,
"grad_norm": 0.6838648319244385,
"learning_rate": 0.0004888241981729624,
"loss": 2.251,
"step": 1170
},
{
"epoch": 0.1044201584000708,
"grad_norm": 0.5071864128112793,
"learning_rate": 0.0004886159358838952,
"loss": 2.281,
"step": 1180
},
{
"epoch": 0.10530507499668157,
"grad_norm": 0.5089443325996399,
"learning_rate": 0.0004884057961686906,
"loss": 2.2951,
"step": 1190
},
{
"epoch": 0.10618999159329233,
"grad_norm": 0.5187750458717346,
"learning_rate": 0.00048819378068072405,
"loss": 2.264,
"step": 1200
},
{
"epoch": 0.1070749081899031,
"grad_norm": 0.4623073637485504,
"learning_rate": 0.00048797989108813013,
"loss": 2.2772,
"step": 1210
},
{
"epoch": 0.10795982478651388,
"grad_norm": 0.5824326872825623,
"learning_rate": 0.0004877641290737884,
"loss": 2.2703,
"step": 1220
},
{
"epoch": 0.10884474138312464,
"grad_norm": 0.644314706325531,
"learning_rate": 0.00048754649633531074,
"loss": 2.2779,
"step": 1230
},
{
"epoch": 0.10972965797973541,
"grad_norm": 0.6066089272499084,
"learning_rate": 0.00048732699458502784,
"loss": 2.305,
"step": 1240
},
{
"epoch": 0.11061457457634619,
"grad_norm": 0.8288434147834778,
"learning_rate": 0.00048710562554997574,
"loss": 2.2944,
"step": 1250
},
{
"epoch": 0.11149949117295695,
"grad_norm": 0.5620648264884949,
"learning_rate": 0.00048688239097188226,
"loss": 2.2584,
"step": 1260
},
{
"epoch": 0.11238440776956772,
"grad_norm": 0.5757160186767578,
"learning_rate": 0.0004866572926071532,
"loss": 2.2949,
"step": 1270
},
{
"epoch": 0.11326932436617848,
"grad_norm": 0.5411326885223389,
"learning_rate": 0.00048643033222685886,
"loss": 2.2671,
"step": 1280
},
{
"epoch": 0.11415424096278926,
"grad_norm": 0.8147817254066467,
"learning_rate": 0.00048620151161671955,
"loss": 2.3014,
"step": 1290
},
{
"epoch": 0.11503915755940003,
"grad_norm": 0.600642204284668,
"learning_rate": 0.0004859708325770919,
"loss": 2.2699,
"step": 1300
},
{
"epoch": 0.11592407415601079,
"grad_norm": 0.6259739398956299,
"learning_rate": 0.0004857382969229548,
"loss": 2.2599,
"step": 1310
},
{
"epoch": 0.11680899075262156,
"grad_norm": 0.546262800693512,
"learning_rate": 0.00048550390648389476,
"loss": 2.2823,
"step": 1320
},
{
"epoch": 0.11769390734923234,
"grad_norm": 0.4821476340293884,
"learning_rate": 0.00048526766310409176,
"loss": 2.2521,
"step": 1330
},
{
"epoch": 0.1185788239458431,
"grad_norm": 0.825333833694458,
"learning_rate": 0.00048502956864230473,
"loss": 2.2572,
"step": 1340
},
{
"epoch": 0.11946374054245387,
"grad_norm": 0.4751971960067749,
"learning_rate": 0.000484789624971857,
"loss": 2.2741,
"step": 1350
},
{
"epoch": 0.12034865713906465,
"grad_norm": 0.5960304737091064,
"learning_rate": 0.0004845478339806211,
"loss": 2.2763,
"step": 1360
},
{
"epoch": 0.1212335737356754,
"grad_norm": 0.6432631015777588,
"learning_rate": 0.0004843041975710044,
"loss": 2.2609,
"step": 1370
},
{
"epoch": 0.12211849033228618,
"grad_norm": 0.7140398621559143,
"learning_rate": 0.0004840587176599343,
"loss": 2.3021,
"step": 1380
},
{
"epoch": 0.12300340692889696,
"grad_norm": 0.519575834274292,
"learning_rate": 0.0004838113961788424,
"loss": 2.2788,
"step": 1390
},
{
"epoch": 0.12388832352550772,
"grad_norm": 0.6823663711547852,
"learning_rate": 0.00048356223507364993,
"loss": 2.2905,
"step": 1400
},
{
"epoch": 0.12477324012211849,
"grad_norm": 0.553036093711853,
"learning_rate": 0.0004833112363047524,
"loss": 2.2917,
"step": 1410
},
{
"epoch": 0.12565815671872926,
"grad_norm": 0.4933728277683258,
"learning_rate": 0.00048305840184700356,
"loss": 2.2589,
"step": 1420
},
{
"epoch": 0.12654307331534004,
"grad_norm": 0.9149543642997742,
"learning_rate": 0.00048280373368970086,
"loss": 2.264,
"step": 1430
},
{
"epoch": 0.1274279899119508,
"grad_norm": 0.4670112729072571,
"learning_rate": 0.0004825472338365691,
"loss": 2.2684,
"step": 1440
},
{
"epoch": 0.12831290650856156,
"grad_norm": 0.5053747296333313,
"learning_rate": 0.0004822889043057446,
"loss": 2.2563,
"step": 1450
},
{
"epoch": 0.12919782310517233,
"grad_norm": 0.5054446458816528,
"learning_rate": 0.00048202874712975977,
"loss": 2.2829,
"step": 1460
},
{
"epoch": 0.1300827397017831,
"grad_norm": 0.6858576536178589,
"learning_rate": 0.0004817667643555269,
"loss": 2.2531,
"step": 1470
},
{
"epoch": 0.13096765629839388,
"grad_norm": 0.7087405920028687,
"learning_rate": 0.00048150295804432196,
"loss": 2.2693,
"step": 1480
},
{
"epoch": 0.13185257289500465,
"grad_norm": 0.44586825370788574,
"learning_rate": 0.0004812373302717686,
"loss": 2.2751,
"step": 1490
},
{
"epoch": 0.13273748949161543,
"grad_norm": 0.4149426221847534,
"learning_rate": 0.0004809698831278217,
"loss": 2.2507,
"step": 1500
},
{
"epoch": 0.13362240608822618,
"grad_norm": 0.6579311490058899,
"learning_rate": 0.0004807006187167507,
"loss": 2.274,
"step": 1510
},
{
"epoch": 0.13450732268483695,
"grad_norm": 0.46561411023139954,
"learning_rate": 0.0004804295391571235,
"loss": 2.262,
"step": 1520
},
{
"epoch": 0.13539223928144772,
"grad_norm": 0.5864225625991821,
"learning_rate": 0.00048015664658178944,
"loss": 2.2859,
"step": 1530
},
{
"epoch": 0.1362771558780585,
"grad_norm": 0.6503337621688843,
"learning_rate": 0.0004798819431378627,
"loss": 2.2601,
"step": 1540
},
{
"epoch": 0.13716207247466927,
"grad_norm": 0.5384878516197205,
"learning_rate": 0.0004796054309867053,
"loss": 2.2657,
"step": 1550
},
{
"epoch": 0.13804698907128002,
"grad_norm": 0.8244152665138245,
"learning_rate": 0.00047932711230391014,
"loss": 2.2766,
"step": 1560
},
{
"epoch": 0.1389319056678908,
"grad_norm": 0.4444003999233246,
"learning_rate": 0.00047904698927928404,
"loss": 2.246,
"step": 1570
},
{
"epoch": 0.13981682226450157,
"grad_norm": 0.5598679780960083,
"learning_rate": 0.00047876506411683,
"loss": 2.2731,
"step": 1580
},
{
"epoch": 0.14070173886111234,
"grad_norm": 0.5661593675613403,
"learning_rate": 0.0004784813390347305,
"loss": 2.2549,
"step": 1590
},
{
"epoch": 0.14158665545772312,
"grad_norm": 0.6023704409599304,
"learning_rate": 0.0004781958162653297,
"loss": 2.2782,
"step": 1600
},
{
"epoch": 0.1424715720543339,
"grad_norm": 0.8696288466453552,
"learning_rate": 0.00047790849805511595,
"loss": 2.248,
"step": 1610
},
{
"epoch": 0.14335648865094464,
"grad_norm": 0.7130827903747559,
"learning_rate": 0.000477619386664704,
"loss": 2.2693,
"step": 1620
},
{
"epoch": 0.1442414052475554,
"grad_norm": 0.7435203790664673,
"learning_rate": 0.00047732848436881736,
"loss": 2.2648,
"step": 1630
},
{
"epoch": 0.14512632184416618,
"grad_norm": 0.5171283483505249,
"learning_rate": 0.00047703579345627036,
"loss": 2.2506,
"step": 1640
},
{
"epoch": 0.14601123844077696,
"grad_norm": 0.5777902007102966,
"learning_rate": 0.0004767413162299501,
"loss": 2.2732,
"step": 1650
},
{
"epoch": 0.14689615503738773,
"grad_norm": 0.5333867073059082,
"learning_rate": 0.0004764450550067985,
"loss": 2.2803,
"step": 1660
},
{
"epoch": 0.1477810716339985,
"grad_norm": 0.5803987979888916,
"learning_rate": 0.0004761470121177938,
"loss": 2.2928,
"step": 1670
},
{
"epoch": 0.14866598823060925,
"grad_norm": 0.5488025546073914,
"learning_rate": 0.0004758471899079324,
"loss": 2.2695,
"step": 1680
},
{
"epoch": 0.14955090482722003,
"grad_norm": 0.7418103814125061,
"learning_rate": 0.00047554559073621034,
"loss": 2.2442,
"step": 1690
},
{
"epoch": 0.1504358214238308,
"grad_norm": 0.5090646147727966,
"learning_rate": 0.00047524221697560476,
"loss": 2.2637,
"step": 1700
},
{
"epoch": 0.15132073802044158,
"grad_norm": 0.46209344267845154,
"learning_rate": 0.0004749370710130554,
"loss": 2.235,
"step": 1710
},
{
"epoch": 0.15220565461705235,
"grad_norm": 0.5527107119560242,
"learning_rate": 0.0004746301552494453,
"loss": 2.2815,
"step": 1720
},
{
"epoch": 0.15309057121366312,
"grad_norm": 0.617348849773407,
"learning_rate": 0.0004743214720995827,
"loss": 2.2734,
"step": 1730
},
{
"epoch": 0.15397548781027387,
"grad_norm": 0.8233256340026855,
"learning_rate": 0.00047401102399218133,
"loss": 2.258,
"step": 1740
},
{
"epoch": 0.15486040440688464,
"grad_norm": 0.5554172992706299,
"learning_rate": 0.0004736988133698416,
"loss": 2.2703,
"step": 1750
},
{
"epoch": 0.15574532100349542,
"grad_norm": 0.6374910473823547,
"learning_rate": 0.0004733848426890313,
"loss": 2.2656,
"step": 1760
},
{
"epoch": 0.1566302376001062,
"grad_norm": 0.5161751508712769,
"learning_rate": 0.00047306911442006653,
"loss": 2.2636,
"step": 1770
},
{
"epoch": 0.15751515419671697,
"grad_norm": 0.6015154719352722,
"learning_rate": 0.00047275163104709196,
"loss": 2.2511,
"step": 1780
},
{
"epoch": 0.15840007079332774,
"grad_norm": 0.5937806367874146,
"learning_rate": 0.0004724323950680614,
"loss": 2.2593,
"step": 1790
},
{
"epoch": 0.1592849873899385,
"grad_norm": 0.5501092672348022,
"learning_rate": 0.00047211140899471813,
"loss": 2.2621,
"step": 1800
},
{
"epoch": 0.16016990398654926,
"grad_norm": 0.6284824013710022,
"learning_rate": 0.000471788675352575,
"loss": 2.2683,
"step": 1810
},
{
"epoch": 0.16105482058316004,
"grad_norm": 0.46114546060562134,
"learning_rate": 0.000471464196680895,
"loss": 2.2593,
"step": 1820
},
{
"epoch": 0.1619397371797708,
"grad_norm": 0.5204902291297913,
"learning_rate": 0.0004711379755326707,
"loss": 2.2511,
"step": 1830
},
{
"epoch": 0.16282465377638158,
"grad_norm": 0.5937714576721191,
"learning_rate": 0.00047081001447460457,
"loss": 2.2603,
"step": 1840
},
{
"epoch": 0.16370957037299236,
"grad_norm": 0.5259864330291748,
"learning_rate": 0.00047048031608708875,
"loss": 2.2427,
"step": 1850
},
{
"epoch": 0.1645944869696031,
"grad_norm": 0.43361151218414307,
"learning_rate": 0.0004701488829641845,
"loss": 2.2455,
"step": 1860
},
{
"epoch": 0.16547940356621388,
"grad_norm": 0.5359675884246826,
"learning_rate": 0.000469815717713602,
"loss": 2.268,
"step": 1870
},
{
"epoch": 0.16636432016282465,
"grad_norm": 0.6381211876869202,
"learning_rate": 0.00046948082295667984,
"loss": 2.2709,
"step": 1880
},
{
"epoch": 0.16724923675943543,
"grad_norm": 0.5162480473518372,
"learning_rate": 0.0004691442013283642,
"loss": 2.2489,
"step": 1890
},
{
"epoch": 0.1681341533560462,
"grad_norm": 0.4458593726158142,
"learning_rate": 0.00046880585547718847,
"loss": 2.2603,
"step": 1900
},
{
"epoch": 0.16901906995265698,
"grad_norm": 0.46709561347961426,
"learning_rate": 0.00046846578806525194,
"loss": 2.2666,
"step": 1910
},
{
"epoch": 0.16990398654926772,
"grad_norm": 0.7030518054962158,
"learning_rate": 0.0004681240017681993,
"loss": 2.2222,
"step": 1920
},
{
"epoch": 0.1707889031458785,
"grad_norm": 0.5679172277450562,
"learning_rate": 0.00046778049927519936,
"loss": 2.2753,
"step": 1930
},
{
"epoch": 0.17167381974248927,
"grad_norm": 0.5176842212677002,
"learning_rate": 0.0004674352832889239,
"loss": 2.2578,
"step": 1940
},
{
"epoch": 0.17255873633910004,
"grad_norm": 0.5601808428764343,
"learning_rate": 0.0004670883565255264,
"loss": 2.2406,
"step": 1950
},
{
"epoch": 0.17344365293571082,
"grad_norm": 0.6519585847854614,
"learning_rate": 0.00046673972171462077,
"loss": 2.2535,
"step": 1960
},
{
"epoch": 0.1743285695323216,
"grad_norm": 0.5103752017021179,
"learning_rate": 0.0004663893815992599,
"loss": 2.2528,
"step": 1970
},
{
"epoch": 0.17521348612893234,
"grad_norm": 0.496896892786026,
"learning_rate": 0.0004660373389359137,
"loss": 2.247,
"step": 1980
},
{
"epoch": 0.1760984027255431,
"grad_norm": 0.7528384327888489,
"learning_rate": 0.00046568359649444796,
"loss": 2.2525,
"step": 1990
},
{
"epoch": 0.1769833193221539,
"grad_norm": 0.5633223056793213,
"learning_rate": 0.0004653281570581023,
"loss": 2.2471,
"step": 2000
},
{
"epoch": 0.1769833193221539,
"eval_accuracy": 0.5411561883259997,
"eval_loss": 2.162017583847046,
"eval_runtime": 11.2812,
"eval_samples_per_second": 28.189,
"eval_steps_per_second": 0.443,
"step": 2000
},
{
"epoch": 0.17786823591876466,
"grad_norm": 0.713107168674469,
"learning_rate": 0.000464971023423468,
"loss": 2.2638,
"step": 2010
},
{
"epoch": 0.17875315251537544,
"grad_norm": 0.5677906274795532,
"learning_rate": 0.0004646121984004665,
"loss": 2.2495,
"step": 2020
},
{
"epoch": 0.17963806911198618,
"grad_norm": 0.637523353099823,
"learning_rate": 0.0004642516848123272,
"loss": 2.2509,
"step": 2030
},
{
"epoch": 0.18052298570859696,
"grad_norm": 0.5341629385948181,
"learning_rate": 0.00046388948549556453,
"loss": 2.2659,
"step": 2040
},
{
"epoch": 0.18140790230520773,
"grad_norm": 0.5201821327209473,
"learning_rate": 0.00046352560329995687,
"loss": 2.2512,
"step": 2050
},
{
"epoch": 0.1822928189018185,
"grad_norm": 0.49913713335990906,
"learning_rate": 0.00046316004108852305,
"loss": 2.2724,
"step": 2060
},
{
"epoch": 0.18317773549842928,
"grad_norm": 0.5114869475364685,
"learning_rate": 0.0004627928017375004,
"loss": 2.2714,
"step": 2070
},
{
"epoch": 0.18406265209504005,
"grad_norm": 0.8079931139945984,
"learning_rate": 0.00046242388813632187,
"loss": 2.2608,
"step": 2080
},
{
"epoch": 0.1849475686916508,
"grad_norm": 0.469683974981308,
"learning_rate": 0.0004620533031875934,
"loss": 2.2567,
"step": 2090
},
{
"epoch": 0.18583248528826157,
"grad_norm": 0.7134404182434082,
"learning_rate": 0.00046168104980707104,
"loss": 2.2418,
"step": 2100
},
{
"epoch": 0.18671740188487235,
"grad_norm": 0.8264422416687012,
"learning_rate": 0.0004613071309236382,
"loss": 2.2404,
"step": 2110
},
{
"epoch": 0.18760231848148312,
"grad_norm": 0.6578531265258789,
"learning_rate": 0.00046093154947928226,
"loss": 2.2531,
"step": 2120
},
{
"epoch": 0.1884872350780939,
"grad_norm": 0.6748083829879761,
"learning_rate": 0.0004605543084290716,
"loss": 2.2349,
"step": 2130
},
{
"epoch": 0.18937215167470467,
"grad_norm": 0.9525237083435059,
"learning_rate": 0.00046017541074113257,
"loss": 2.2385,
"step": 2140
},
{
"epoch": 0.19025706827131542,
"grad_norm": 0.7239274382591248,
"learning_rate": 0.00045979485939662556,
"loss": 2.2345,
"step": 2150
},
{
"epoch": 0.1911419848679262,
"grad_norm": 0.5265571475028992,
"learning_rate": 0.00045941265738972217,
"loss": 2.2621,
"step": 2160
},
{
"epoch": 0.19202690146453696,
"grad_norm": 0.45004111528396606,
"learning_rate": 0.0004590288077275814,
"loss": 2.2504,
"step": 2170
},
{
"epoch": 0.19291181806114774,
"grad_norm": 0.622985303401947,
"learning_rate": 0.00045864331343032565,
"loss": 2.2176,
"step": 2180
},
{
"epoch": 0.1937967346577585,
"grad_norm": 0.500320553779602,
"learning_rate": 0.00045825617753101776,
"loss": 2.2466,
"step": 2190
},
{
"epoch": 0.1946816512543693,
"grad_norm": 0.6258721351623535,
"learning_rate": 0.00045786740307563633,
"loss": 2.2386,
"step": 2200
},
{
"epoch": 0.19556656785098003,
"grad_norm": 0.9133718013763428,
"learning_rate": 0.0004574769931230521,
"loss": 2.2468,
"step": 2210
},
{
"epoch": 0.1964514844475908,
"grad_norm": 0.8068430423736572,
"learning_rate": 0.0004570849507450041,
"loss": 2.2421,
"step": 2220
},
{
"epoch": 0.19733640104420158,
"grad_norm": 0.651720404624939,
"learning_rate": 0.0004566912790260751,
"loss": 2.2868,
"step": 2230
},
{
"epoch": 0.19822131764081236,
"grad_norm": 0.4649779200553894,
"learning_rate": 0.0004562959810636674,
"loss": 2.2455,
"step": 2240
},
{
"epoch": 0.19910623423742313,
"grad_norm": 0.7452356815338135,
"learning_rate": 0.0004558990599679787,
"loss": 2.2457,
"step": 2250
},
{
"epoch": 0.1999911508340339,
"grad_norm": 0.4573175013065338,
"learning_rate": 0.00045550051886197754,
"loss": 2.2525,
"step": 2260
},
{
"epoch": 0.20087606743064465,
"grad_norm": 0.46977052092552185,
"learning_rate": 0.0004551003608813784,
"loss": 2.2605,
"step": 2270
},
{
"epoch": 0.20176098402725542,
"grad_norm": 0.43524104356765747,
"learning_rate": 0.0004546985891746177,
"loss": 2.2411,
"step": 2280
},
{
"epoch": 0.2026459006238662,
"grad_norm": 0.5056027173995972,
"learning_rate": 0.00045429520690282827,
"loss": 2.2434,
"step": 2290
},
{
"epoch": 0.20353081722047697,
"grad_norm": 0.46207907795906067,
"learning_rate": 0.00045389021723981504,
"loss": 2.2489,
"step": 2300
},
{
"epoch": 0.20441573381708775,
"grad_norm": 0.4407023787498474,
"learning_rate": 0.00045348362337202985,
"loss": 2.229,
"step": 2310
},
{
"epoch": 0.20530065041369852,
"grad_norm": 0.4465203583240509,
"learning_rate": 0.00045307542849854626,
"loss": 2.2567,
"step": 2320
},
{
"epoch": 0.20618556701030927,
"grad_norm": 0.64149010181427,
"learning_rate": 0.00045266563583103473,
"loss": 2.2637,
"step": 2330
},
{
"epoch": 0.20707048360692004,
"grad_norm": 0.5589755177497864,
"learning_rate": 0.0004522542485937369,
"loss": 2.2421,
"step": 2340
},
{
"epoch": 0.20795540020353082,
"grad_norm": 0.4988935589790344,
"learning_rate": 0.0004518412700234406,
"loss": 2.25,
"step": 2350
},
{
"epoch": 0.2088403168001416,
"grad_norm": 0.5745148062705994,
"learning_rate": 0.0004514267033694543,
"loss": 2.2564,
"step": 2360
},
{
"epoch": 0.20972523339675236,
"grad_norm": 0.4813830256462097,
"learning_rate": 0.0004510105518935813,
"loss": 2.2491,
"step": 2370
},
{
"epoch": 0.21061014999336314,
"grad_norm": 0.4937480092048645,
"learning_rate": 0.0004505928188700945,
"loss": 2.2467,
"step": 2380
},
{
"epoch": 0.21149506658997388,
"grad_norm": 0.5905641913414001,
"learning_rate": 0.0004501735075857101,
"loss": 2.2548,
"step": 2390
},
{
"epoch": 0.21237998318658466,
"grad_norm": 0.5014283657073975,
"learning_rate": 0.0004497526213395623,
"loss": 2.2366,
"step": 2400
},
{
"epoch": 0.21326489978319543,
"grad_norm": 0.5339481830596924,
"learning_rate": 0.0004493301634431768,
"loss": 2.2451,
"step": 2410
},
{
"epoch": 0.2141498163798062,
"grad_norm": 0.7018898129463196,
"learning_rate": 0.00044890613722044524,
"loss": 2.2499,
"step": 2420
},
{
"epoch": 0.21503473297641698,
"grad_norm": 0.8874839544296265,
"learning_rate": 0.0004484805460075988,
"loss": 2.2615,
"step": 2430
},
{
"epoch": 0.21591964957302776,
"grad_norm": 0.5717945694923401,
"learning_rate": 0.0004480533931531819,
"loss": 2.2245,
"step": 2440
},
{
"epoch": 0.2168045661696385,
"grad_norm": 0.7452505826950073,
"learning_rate": 0.00044762468201802586,
"loss": 2.2589,
"step": 2450
},
{
"epoch": 0.21768948276624928,
"grad_norm": 0.5501087307929993,
"learning_rate": 0.0004471944159752228,
"loss": 2.2288,
"step": 2460
},
{
"epoch": 0.21857439936286005,
"grad_norm": 0.5167734622955322,
"learning_rate": 0.00044676259841009845,
"loss": 2.234,
"step": 2470
},
{
"epoch": 0.21945931595947082,
"grad_norm": 0.5619193911552429,
"learning_rate": 0.0004463292327201862,
"loss": 2.2395,
"step": 2480
},
{
"epoch": 0.2203442325560816,
"grad_norm": 0.45388907194137573,
"learning_rate": 0.0004458943223152,
"loss": 2.2539,
"step": 2490
},
{
"epoch": 0.22122914915269237,
"grad_norm": 0.5973688364028931,
"learning_rate": 0.0004454578706170075,
"loss": 2.2375,
"step": 2500
},
{
"epoch": 0.22211406574930312,
"grad_norm": 0.5753281712532043,
"learning_rate": 0.00044501988105960315,
"loss": 2.2295,
"step": 2510
},
{
"epoch": 0.2229989823459139,
"grad_norm": 0.4576527178287506,
"learning_rate": 0.00044458035708908153,
"loss": 2.2607,
"step": 2520
},
{
"epoch": 0.22388389894252467,
"grad_norm": 0.6270558834075928,
"learning_rate": 0.00044413930216360964,
"loss": 2.2444,
"step": 2530
},
{
"epoch": 0.22476881553913544,
"grad_norm": 0.45903804898262024,
"learning_rate": 0.00044369671975340026,
"loss": 2.2355,
"step": 2540
},
{
"epoch": 0.22565373213574622,
"grad_norm": 0.4801378846168518,
"learning_rate": 0.0004432526133406842,
"loss": 2.2208,
"step": 2550
},
{
"epoch": 0.22653864873235696,
"grad_norm": 0.5071857571601868,
"learning_rate": 0.0004428069864196833,
"loss": 2.2471,
"step": 2560
},
{
"epoch": 0.22742356532896774,
"grad_norm": 0.4489947259426117,
"learning_rate": 0.00044235984249658256,
"loss": 2.2241,
"step": 2570
},
{
"epoch": 0.2283084819255785,
"grad_norm": 0.7193836569786072,
"learning_rate": 0.00044191118508950277,
"loss": 2.2384,
"step": 2580
},
{
"epoch": 0.22919339852218928,
"grad_norm": 0.42136234045028687,
"learning_rate": 0.0004414610177284728,
"loss": 2.2508,
"step": 2590
},
{
"epoch": 0.23007831511880006,
"grad_norm": 0.43367722630500793,
"learning_rate": 0.0004410093439554019,
"loss": 2.2279,
"step": 2600
},
{
"epoch": 0.23096323171541083,
"grad_norm": 0.6120412945747375,
"learning_rate": 0.00044055616732405147,
"loss": 2.2308,
"step": 2610
},
{
"epoch": 0.23184814831202158,
"grad_norm": 0.4905729293823242,
"learning_rate": 0.0004401014914000078,
"loss": 2.2417,
"step": 2620
},
{
"epoch": 0.23273306490863235,
"grad_norm": 0.6039494276046753,
"learning_rate": 0.00043964531976065313,
"loss": 2.25,
"step": 2630
},
{
"epoch": 0.23361798150524313,
"grad_norm": 0.4729721248149872,
"learning_rate": 0.00043918765599513826,
"loss": 2.2409,
"step": 2640
},
{
"epoch": 0.2345028981018539,
"grad_norm": 0.5762743353843689,
"learning_rate": 0.00043872850370435404,
"loss": 2.2175,
"step": 2650
},
{
"epoch": 0.23538781469846468,
"grad_norm": 0.4876422584056854,
"learning_rate": 0.00043826786650090276,
"loss": 2.2264,
"step": 2660
},
{
"epoch": 0.23627273129507545,
"grad_norm": 0.6149749159812927,
"learning_rate": 0.0004378057480090702,
"loss": 2.2494,
"step": 2670
},
{
"epoch": 0.2371576478916862,
"grad_norm": 0.5245630741119385,
"learning_rate": 0.0004373421518647968,
"loss": 2.2434,
"step": 2680
},
{
"epoch": 0.23804256448829697,
"grad_norm": 0.46496084332466125,
"learning_rate": 0.00043687708171564923,
"loss": 2.2323,
"step": 2690
},
{
"epoch": 0.23892748108490774,
"grad_norm": 0.4286579191684723,
"learning_rate": 0.00043641054122079136,
"loss": 2.2202,
"step": 2700
},
{
"epoch": 0.23981239768151852,
"grad_norm": 0.45216891169548035,
"learning_rate": 0.00043594253405095616,
"loss": 2.2416,
"step": 2710
},
{
"epoch": 0.2406973142781293,
"grad_norm": 0.49955543875694275,
"learning_rate": 0.0004354730638884159,
"loss": 2.2494,
"step": 2720
},
{
"epoch": 0.24158223087474007,
"grad_norm": 0.4823377728462219,
"learning_rate": 0.0004350021344269539,
"loss": 2.244,
"step": 2730
},
{
"epoch": 0.2424671474713508,
"grad_norm": 0.4880935847759247,
"learning_rate": 0.0004345297493718352,
"loss": 2.2297,
"step": 2740
},
{
"epoch": 0.2433520640679616,
"grad_norm": 0.5247299075126648,
"learning_rate": 0.00043405591243977736,
"loss": 2.2463,
"step": 2750
},
{
"epoch": 0.24423698066457236,
"grad_norm": 0.5798355340957642,
"learning_rate": 0.0004335806273589214,
"loss": 2.2325,
"step": 2760
},
{
"epoch": 0.24512189726118314,
"grad_norm": 0.6595978140830994,
"learning_rate": 0.0004331038978688022,
"loss": 2.2407,
"step": 2770
},
{
"epoch": 0.2460068138577939,
"grad_norm": 0.4189043641090393,
"learning_rate": 0.0004326257277203194,
"loss": 2.2523,
"step": 2780
},
{
"epoch": 0.24689173045440468,
"grad_norm": 0.4813700318336487,
"learning_rate": 0.00043214612067570755,
"loss": 2.243,
"step": 2790
},
{
"epoch": 0.24777664705101543,
"grad_norm": 0.8035611510276794,
"learning_rate": 0.0004316650805085068,
"loss": 2.2522,
"step": 2800
},
{
"epoch": 0.2486615636476262,
"grad_norm": 0.5478577017784119,
"learning_rate": 0.00043118261100353293,
"loss": 2.2334,
"step": 2810
},
{
"epoch": 0.24954648024423698,
"grad_norm": 0.39827045798301697,
"learning_rate": 0.0004306987159568479,
"loss": 2.2208,
"step": 2820
},
{
"epoch": 0.25043139684084775,
"grad_norm": 0.5026851296424866,
"learning_rate": 0.0004302133991757297,
"loss": 2.2375,
"step": 2830
},
{
"epoch": 0.2513163134374585,
"grad_norm": 0.5758419036865234,
"learning_rate": 0.00042972666447864264,
"loss": 2.2377,
"step": 2840
},
{
"epoch": 0.2522012300340693,
"grad_norm": 0.5519118309020996,
"learning_rate": 0.00042923851569520683,
"loss": 2.2492,
"step": 2850
},
{
"epoch": 0.2530861466306801,
"grad_norm": 0.5647429823875427,
"learning_rate": 0.00042874895666616887,
"loss": 2.2255,
"step": 2860
},
{
"epoch": 0.25397106322729085,
"grad_norm": 0.8345276117324829,
"learning_rate": 0.0004282579912433707,
"loss": 2.2529,
"step": 2870
},
{
"epoch": 0.2548559798239016,
"grad_norm": 0.6527183651924133,
"learning_rate": 0.0004277656232897201,
"loss": 2.2267,
"step": 2880
},
{
"epoch": 0.25574089642051234,
"grad_norm": 0.555591881275177,
"learning_rate": 0.00042727185667915975,
"loss": 2.2088,
"step": 2890
},
{
"epoch": 0.2566258130171231,
"grad_norm": 0.7504310011863708,
"learning_rate": 0.00042677669529663686,
"loss": 2.2205,
"step": 2900
},
{
"epoch": 0.2575107296137339,
"grad_norm": 0.6328213810920715,
"learning_rate": 0.00042628014303807294,
"loss": 2.2329,
"step": 2910
},
{
"epoch": 0.25839564621034466,
"grad_norm": 0.4501785337924957,
"learning_rate": 0.00042578220381033263,
"loss": 2.2063,
"step": 2920
},
{
"epoch": 0.25928056280695544,
"grad_norm": 0.4656241834163666,
"learning_rate": 0.0004252828815311934,
"loss": 2.2345,
"step": 2930
},
{
"epoch": 0.2601654794035662,
"grad_norm": 0.43484318256378174,
"learning_rate": 0.00042478218012931436,
"loss": 2.2351,
"step": 2940
},
{
"epoch": 0.261050396000177,
"grad_norm": 0.43128812313079834,
"learning_rate": 0.00042428010354420584,
"loss": 2.2253,
"step": 2950
},
{
"epoch": 0.26193531259678776,
"grad_norm": 0.4222058951854706,
"learning_rate": 0.00042377665572619774,
"loss": 2.2426,
"step": 2960
},
{
"epoch": 0.26282022919339854,
"grad_norm": 0.791580319404602,
"learning_rate": 0.000423271840636409,
"loss": 2.2227,
"step": 2970
},
{
"epoch": 0.2637051457900093,
"grad_norm": 0.551547110080719,
"learning_rate": 0.0004227656622467162,
"loss": 2.2591,
"step": 2980
},
{
"epoch": 0.2645900623866201,
"grad_norm": 0.4557199478149414,
"learning_rate": 0.0004222581245397223,
"loss": 2.2383,
"step": 2990
},
{
"epoch": 0.26547497898323086,
"grad_norm": 0.6194254159927368,
"learning_rate": 0.0004217492315087254,
"loss": 2.2295,
"step": 3000
},
{
"epoch": 0.26547497898323086,
"eval_accuracy": 0.5435882077391512,
"eval_loss": 2.143740177154541,
"eval_runtime": 11.414,
"eval_samples_per_second": 27.861,
"eval_steps_per_second": 0.438,
"step": 3000
},
{
"epoch": 0.2663598955798416,
"grad_norm": 0.7033931612968445,
"learning_rate": 0.0004212389871576873,
"loss": 2.2397,
"step": 3010
},
{
"epoch": 0.26724481217645235,
"grad_norm": 0.43708673119544983,
"learning_rate": 0.00042072739550120175,
"loss": 2.2126,
"step": 3020
},
{
"epoch": 0.2681297287730631,
"grad_norm": 0.6861522793769836,
"learning_rate": 0.00042021446056446333,
"loss": 2.2453,
"step": 3030
},
{
"epoch": 0.2690146453696739,
"grad_norm": 0.4883657395839691,
"learning_rate": 0.00041970018638323546,
"loss": 2.2502,
"step": 3040
},
{
"epoch": 0.2698995619662847,
"grad_norm": 0.5091878175735474,
"learning_rate": 0.00041918457700381855,
"loss": 2.2258,
"step": 3050
},
{
"epoch": 0.27078447856289545,
"grad_norm": 0.7300602197647095,
"learning_rate": 0.00041866763648301864,
"loss": 2.2418,
"step": 3060
},
{
"epoch": 0.2716693951595062,
"grad_norm": 0.5962792038917542,
"learning_rate": 0.00041814936888811475,
"loss": 2.2388,
"step": 3070
},
{
"epoch": 0.272554311756117,
"grad_norm": 0.5023945569992065,
"learning_rate": 0.0004176297782968277,
"loss": 2.2196,
"step": 3080
},
{
"epoch": 0.27343922835272777,
"grad_norm": 0.9406496286392212,
"learning_rate": 0.00041710886879728744,
"loss": 2.2477,
"step": 3090
},
{
"epoch": 0.27432414494933854,
"grad_norm": 0.6057345867156982,
"learning_rate": 0.000416586644488001,
"loss": 2.2268,
"step": 3100
},
{
"epoch": 0.2752090615459493,
"grad_norm": 0.7561666369438171,
"learning_rate": 0.00041606310947782046,
"loss": 2.2292,
"step": 3110
},
{
"epoch": 0.27609397814256004,
"grad_norm": 0.5512217283248901,
"learning_rate": 0.0004155382678859103,
"loss": 2.237,
"step": 3120
},
{
"epoch": 0.2769788947391708,
"grad_norm": 0.4626982510089874,
"learning_rate": 0.00041501212384171545,
"loss": 2.2396,
"step": 3130
},
{
"epoch": 0.2778638113357816,
"grad_norm": 0.41738298535346985,
"learning_rate": 0.0004144846814849282,
"loss": 2.2221,
"step": 3140
},
{
"epoch": 0.27874872793239236,
"grad_norm": 0.5502617359161377,
"learning_rate": 0.00041395594496545607,
"loss": 2.2323,
"step": 3150
},
{
"epoch": 0.27963364452900313,
"grad_norm": 0.5470404624938965,
"learning_rate": 0.0004134259184433891,
"loss": 2.2321,
"step": 3160
},
{
"epoch": 0.2805185611256139,
"grad_norm": 0.5013723969459534,
"learning_rate": 0.0004128946060889668,
"loss": 2.2229,
"step": 3170
},
{
"epoch": 0.2814034777222247,
"grad_norm": 0.4174317419528961,
"learning_rate": 0.0004123620120825459,
"loss": 2.2022,
"step": 3180
},
{
"epoch": 0.28228839431883546,
"grad_norm": 0.4371856451034546,
"learning_rate": 0.00041182814061456707,
"loss": 2.2515,
"step": 3190
},
{
"epoch": 0.28317331091544623,
"grad_norm": 0.4697751998901367,
"learning_rate": 0.00041129299588552195,
"loss": 2.2225,
"step": 3200
},
{
"epoch": 0.284058227512057,
"grad_norm": 0.4424945116043091,
"learning_rate": 0.00041075658210592,
"loss": 2.2382,
"step": 3210
},
{
"epoch": 0.2849431441086678,
"grad_norm": 0.600278377532959,
"learning_rate": 0.000410218903496256,
"loss": 2.2321,
"step": 3220
},
{
"epoch": 0.28582806070527855,
"grad_norm": 0.4000381827354431,
"learning_rate": 0.0004096799642869761,
"loss": 2.2241,
"step": 3230
},
{
"epoch": 0.28671297730188927,
"grad_norm": 0.6013597249984741,
"learning_rate": 0.0004091397687184446,
"loss": 2.2457,
"step": 3240
},
{
"epoch": 0.28759789389850005,
"grad_norm": 0.6114123463630676,
"learning_rate": 0.0004085983210409114,
"loss": 2.2292,
"step": 3250
},
{
"epoch": 0.2884828104951108,
"grad_norm": 0.7183189988136292,
"learning_rate": 0.00040805562551447745,
"loss": 2.2317,
"step": 3260
},
{
"epoch": 0.2893677270917216,
"grad_norm": 0.5196136236190796,
"learning_rate": 0.000407511686409062,
"loss": 2.2159,
"step": 3270
},
{
"epoch": 0.29025264368833237,
"grad_norm": 0.5873175859451294,
"learning_rate": 0.0004069665080043687,
"loss": 2.2169,
"step": 3280
},
{
"epoch": 0.29113756028494314,
"grad_norm": 0.46012935042381287,
"learning_rate": 0.00040642009458985196,
"loss": 2.2217,
"step": 3290
},
{
"epoch": 0.2920224768815539,
"grad_norm": 0.7356370687484741,
"learning_rate": 0.0004058724504646834,
"loss": 2.2384,
"step": 3300
},
{
"epoch": 0.2929073934781647,
"grad_norm": 0.5594988465309143,
"learning_rate": 0.0004053235799377176,
"loss": 2.2454,
"step": 3310
},
{
"epoch": 0.29379231007477546,
"grad_norm": 0.40601104497909546,
"learning_rate": 0.00040477348732745853,
"loss": 2.2355,
"step": 3320
},
{
"epoch": 0.29467722667138624,
"grad_norm": 0.45904645323753357,
"learning_rate": 0.0004042221769620256,
"loss": 2.241,
"step": 3330
},
{
"epoch": 0.295562143267997,
"grad_norm": 0.485551655292511,
"learning_rate": 0.0004036696531791193,
"loss": 2.2299,
"step": 3340
},
{
"epoch": 0.2964470598646078,
"grad_norm": 0.5098533630371094,
"learning_rate": 0.0004031159203259875,
"loss": 2.2382,
"step": 3350
},
{
"epoch": 0.2973319764612185,
"grad_norm": 0.5241096615791321,
"learning_rate": 0.0004025609827593909,
"loss": 2.2331,
"step": 3360
},
{
"epoch": 0.2982168930578293,
"grad_norm": 0.5468673706054688,
"learning_rate": 0.00040200484484556885,
"loss": 2.227,
"step": 3370
},
{
"epoch": 0.29910180965444005,
"grad_norm": 0.3913686275482178,
"learning_rate": 0.000401447510960205,
"loss": 2.2258,
"step": 3380
},
{
"epoch": 0.29998672625105083,
"grad_norm": 0.5263382196426392,
"learning_rate": 0.0004008889854883929,
"loss": 2.2131,
"step": 3390
},
{
"epoch": 0.3008716428476616,
"grad_norm": 0.4936838746070862,
"learning_rate": 0.00040032927282460145,
"loss": 2.2466,
"step": 3400
},
{
"epoch": 0.3017565594442724,
"grad_norm": 0.5192393064498901,
"learning_rate": 0.0003997683773726405,
"loss": 2.2269,
"step": 3410
},
{
"epoch": 0.30264147604088315,
"grad_norm": 0.4334067702293396,
"learning_rate": 0.0003992063035456259,
"loss": 2.2143,
"step": 3420
},
{
"epoch": 0.3035263926374939,
"grad_norm": 0.472269743680954,
"learning_rate": 0.00039864305576594504,
"loss": 2.2426,
"step": 3430
},
{
"epoch": 0.3044113092341047,
"grad_norm": 0.8175429701805115,
"learning_rate": 0.00039807863846522183,
"loss": 2.2166,
"step": 3440
},
{
"epoch": 0.3052962258307155,
"grad_norm": 0.7245342135429382,
"learning_rate": 0.0003975130560842821,
"loss": 2.233,
"step": 3450
},
{
"epoch": 0.30618114242732625,
"grad_norm": 0.4689445197582245,
"learning_rate": 0.0003969463130731183,
"loss": 2.2324,
"step": 3460
},
{
"epoch": 0.307066059023937,
"grad_norm": 0.6678940057754517,
"learning_rate": 0.00039637841389085493,
"loss": 2.2287,
"step": 3470
},
{
"epoch": 0.30795097562054774,
"grad_norm": 0.6060863733291626,
"learning_rate": 0.0003958093630057131,
"loss": 2.2472,
"step": 3480
},
{
"epoch": 0.3088358922171585,
"grad_norm": 0.4979764521121979,
"learning_rate": 0.0003952391648949757,
"loss": 2.2204,
"step": 3490
},
{
"epoch": 0.3097208088137693,
"grad_norm": 0.44741326570510864,
"learning_rate": 0.0003946678240449515,
"loss": 2.2048,
"step": 3500
},
{
"epoch": 0.31060572541038006,
"grad_norm": 0.48008590936660767,
"learning_rate": 0.00039409534495094076,
"loss": 2.2155,
"step": 3510
},
{
"epoch": 0.31149064200699084,
"grad_norm": 0.6374879479408264,
"learning_rate": 0.0003935217321171992,
"loss": 2.2279,
"step": 3520
},
{
"epoch": 0.3123755586036016,
"grad_norm": 0.3989739716053009,
"learning_rate": 0.000392946990056903,
"loss": 2.239,
"step": 3530
},
{
"epoch": 0.3132604752002124,
"grad_norm": 0.44668588042259216,
"learning_rate": 0.000392371123292113,
"loss": 2.2256,
"step": 3540
},
{
"epoch": 0.31414539179682316,
"grad_norm": 0.5262216329574585,
"learning_rate": 0.00039179413635373895,
"loss": 2.2257,
"step": 3550
},
{
"epoch": 0.31503030839343393,
"grad_norm": 0.5311539173126221,
"learning_rate": 0.00039121603378150445,
"loss": 2.2436,
"step": 3560
},
{
"epoch": 0.3159152249900447,
"grad_norm": 0.39581969380378723,
"learning_rate": 0.0003906368201239106,
"loss": 2.2165,
"step": 3570
},
{
"epoch": 0.3168001415866555,
"grad_norm": 0.7394521236419678,
"learning_rate": 0.0003900564999382007,
"loss": 2.2223,
"step": 3580
},
{
"epoch": 0.3176850581832662,
"grad_norm": 0.6843695044517517,
"learning_rate": 0.0003894750777903242,
"loss": 2.2278,
"step": 3590
},
{
"epoch": 0.318569974779877,
"grad_norm": 0.5867041349411011,
"learning_rate": 0.00038889255825490053,
"loss": 2.2326,
"step": 3600
},
{
"epoch": 0.31945489137648775,
"grad_norm": 0.442703515291214,
"learning_rate": 0.0003883089459151837,
"loss": 2.2093,
"step": 3610
},
{
"epoch": 0.3203398079730985,
"grad_norm": 0.6596940755844116,
"learning_rate": 0.0003877242453630256,
"loss": 2.2084,
"step": 3620
},
{
"epoch": 0.3212247245697093,
"grad_norm": 0.42338627576828003,
"learning_rate": 0.00038713846119884033,
"loss": 2.2328,
"step": 3630
},
{
"epoch": 0.32210964116632007,
"grad_norm": 0.48313215374946594,
"learning_rate": 0.0003865515980315677,
"loss": 2.1973,
"step": 3640
},
{
"epoch": 0.32299455776293085,
"grad_norm": 0.6299489736557007,
"learning_rate": 0.0003859636604786372,
"loss": 2.2255,
"step": 3650
},
{
"epoch": 0.3238794743595416,
"grad_norm": 0.4562525153160095,
"learning_rate": 0.00038537465316593146,
"loss": 2.2053,
"step": 3660
},
{
"epoch": 0.3247643909561524,
"grad_norm": 0.4125101566314697,
"learning_rate": 0.0003847845807277501,
"loss": 2.2179,
"step": 3670
},
{
"epoch": 0.32564930755276317,
"grad_norm": 0.531594455242157,
"learning_rate": 0.000384193447806773,
"loss": 2.2211,
"step": 3680
},
{
"epoch": 0.32653422414937394,
"grad_norm": 0.36836495995521545,
"learning_rate": 0.00038360125905402396,
"loss": 2.2381,
"step": 3690
},
{
"epoch": 0.3274191407459847,
"grad_norm": 0.4548482298851013,
"learning_rate": 0.00038300801912883415,
"loss": 2.2213,
"step": 3700
},
{
"epoch": 0.32830405734259543,
"grad_norm": 0.6590666174888611,
"learning_rate": 0.00038241373269880507,
"loss": 2.2244,
"step": 3710
},
{
"epoch": 0.3291889739392062,
"grad_norm": 0.41382184624671936,
"learning_rate": 0.0003818184044397725,
"loss": 2.2275,
"step": 3720
},
{
"epoch": 0.330073890535817,
"grad_norm": 0.4768171012401581,
"learning_rate": 0.0003812220390357689,
"loss": 2.2221,
"step": 3730
},
{
"epoch": 0.33095880713242776,
"grad_norm": 0.5649170875549316,
"learning_rate": 0.0003806246411789872,
"loss": 2.2368,
"step": 3740
},
{
"epoch": 0.33184372372903853,
"grad_norm": 0.7073878645896912,
"learning_rate": 0.00038002621556974364,
"loss": 2.2258,
"step": 3750
},
{
"epoch": 0.3327286403256493,
"grad_norm": 0.4342755675315857,
"learning_rate": 0.0003794267669164408,
"loss": 2.2217,
"step": 3760
},
{
"epoch": 0.3336135569222601,
"grad_norm": 0.43126773834228516,
"learning_rate": 0.0003788262999355304,
"loss": 2.2547,
"step": 3770
},
{
"epoch": 0.33449847351887085,
"grad_norm": 0.4855419993400574,
"learning_rate": 0.00037822481935147656,
"loss": 2.2363,
"step": 3780
},
{
"epoch": 0.3353833901154816,
"grad_norm": 0.39136576652526855,
"learning_rate": 0.00037762232989671827,
"loss": 2.2248,
"step": 3790
},
{
"epoch": 0.3362683067120924,
"grad_norm": 0.5595036745071411,
"learning_rate": 0.0003770188363116324,
"loss": 2.2123,
"step": 3800
},
{
"epoch": 0.3371532233087032,
"grad_norm": 0.4321103096008301,
"learning_rate": 0.0003764143433444962,
"loss": 2.2066,
"step": 3810
},
{
"epoch": 0.33803813990531395,
"grad_norm": 0.4474160075187683,
"learning_rate": 0.00037580885575145005,
"loss": 2.2289,
"step": 3820
},
{
"epoch": 0.33892305650192467,
"grad_norm": 0.46913856267929077,
"learning_rate": 0.0003752023782964601,
"loss": 2.2222,
"step": 3830
},
{
"epoch": 0.33980797309853544,
"grad_norm": 0.516632080078125,
"learning_rate": 0.00037459491575128075,
"loss": 2.2166,
"step": 3840
},
{
"epoch": 0.3406928896951462,
"grad_norm": 0.5410568118095398,
"learning_rate": 0.000373986472895417,
"loss": 2.234,
"step": 3850
},
{
"epoch": 0.341577806291757,
"grad_norm": 0.685844361782074,
"learning_rate": 0.0003733770545160867,
"loss": 2.2179,
"step": 3860
},
{
"epoch": 0.34246272288836777,
"grad_norm": 0.40717822313308716,
"learning_rate": 0.0003727666654081836,
"loss": 2.2165,
"step": 3870
},
{
"epoch": 0.34334763948497854,
"grad_norm": 0.49223434925079346,
"learning_rate": 0.0003721553103742388,
"loss": 2.2215,
"step": 3880
},
{
"epoch": 0.3442325560815893,
"grad_norm": 0.4947943687438965,
"learning_rate": 0.00037154299422438315,
"loss": 2.2213,
"step": 3890
},
{
"epoch": 0.3451174726782001,
"grad_norm": 0.6207186579704285,
"learning_rate": 0.00037092972177631,
"loss": 2.2237,
"step": 3900
},
{
"epoch": 0.34600238927481086,
"grad_norm": 0.5417460799217224,
"learning_rate": 0.00037031549785523633,
"loss": 2.2149,
"step": 3910
},
{
"epoch": 0.34688730587142164,
"grad_norm": 0.5263400077819824,
"learning_rate": 0.0003697003272938657,
"loss": 2.212,
"step": 3920
},
{
"epoch": 0.3477722224680324,
"grad_norm": 0.40013402700424194,
"learning_rate": 0.00036908421493234963,
"loss": 2.2225,
"step": 3930
},
{
"epoch": 0.3486571390646432,
"grad_norm": 0.5232884883880615,
"learning_rate": 0.00036846716561824967,
"loss": 2.2182,
"step": 3940
},
{
"epoch": 0.3495420556612539,
"grad_norm": 0.3998386263847351,
"learning_rate": 0.0003678491842064995,
"loss": 2.224,
"step": 3950
},
{
"epoch": 0.3504269722578647,
"grad_norm": 0.45659396052360535,
"learning_rate": 0.0003672302755593661,
"loss": 2.2309,
"step": 3960
},
{
"epoch": 0.35131188885447545,
"grad_norm": 0.8522054553031921,
"learning_rate": 0.00036661044454641255,
"loss": 2.189,
"step": 3970
},
{
"epoch": 0.3521968054510862,
"grad_norm": 0.5786595344543457,
"learning_rate": 0.00036598969604445856,
"loss": 2.2328,
"step": 3980
},
{
"epoch": 0.353081722047697,
"grad_norm": 0.6108263731002808,
"learning_rate": 0.00036536803493754285,
"loss": 2.2325,
"step": 3990
},
{
"epoch": 0.3539666386443078,
"grad_norm": 0.4051169753074646,
"learning_rate": 0.00036474546611688443,
"loss": 2.2336,
"step": 4000
},
{
"epoch": 0.3539666386443078,
"eval_accuracy": 0.5453820812311378,
"eval_loss": 2.1286191940307617,
"eval_runtime": 12.2098,
"eval_samples_per_second": 26.045,
"eval_steps_per_second": 0.41,
"step": 4000
},
{
"epoch": 0.35485155524091855,
"grad_norm": 0.6282593011856079,
"learning_rate": 0.0003641219944808443,
"loss": 2.221,
"step": 4010
},
{
"epoch": 0.3557364718375293,
"grad_norm": 0.7185233235359192,
"learning_rate": 0.00036349762493488667,
"loss": 2.2076,
"step": 4020
},
{
"epoch": 0.3566213884341401,
"grad_norm": 0.7254645228385925,
"learning_rate": 0.00036287236239154064,
"loss": 2.2315,
"step": 4030
},
{
"epoch": 0.35750630503075087,
"grad_norm": 0.6520965099334717,
"learning_rate": 0.00036224621177036116,
"loss": 2.2236,
"step": 4040
},
{
"epoch": 0.35839122162736164,
"grad_norm": 0.5554526448249817,
"learning_rate": 0.0003616191779978907,
"loss": 2.2314,
"step": 4050
},
{
"epoch": 0.35927613822397236,
"grad_norm": 0.6766877174377441,
"learning_rate": 0.00036099126600762057,
"loss": 2.2228,
"step": 4060
},
{
"epoch": 0.36016105482058314,
"grad_norm": 0.43382716178894043,
"learning_rate": 0.00036036248073995135,
"loss": 2.217,
"step": 4070
},
{
"epoch": 0.3610459714171939,
"grad_norm": 0.4572659134864807,
"learning_rate": 0.0003597328271421551,
"loss": 2.2222,
"step": 4080
},
{
"epoch": 0.3619308880138047,
"grad_norm": 0.6247086524963379,
"learning_rate": 0.0003591023101683355,
"loss": 2.2301,
"step": 4090
},
{
"epoch": 0.36281580461041546,
"grad_norm": 0.3971577286720276,
"learning_rate": 0.00035847093477938953,
"loss": 2.2116,
"step": 4100
},
{
"epoch": 0.36370072120702623,
"grad_norm": 0.4561915099620819,
"learning_rate": 0.00035783870594296795,
"loss": 2.2151,
"step": 4110
},
{
"epoch": 0.364585637803637,
"grad_norm": 0.6701768636703491,
"learning_rate": 0.0003572056286334366,
"loss": 2.2285,
"step": 4120
},
{
"epoch": 0.3654705544002478,
"grad_norm": 0.49255579710006714,
"learning_rate": 0.000356571707831837,
"loss": 2.2201,
"step": 4130
},
{
"epoch": 0.36635547099685856,
"grad_norm": 0.547238290309906,
"learning_rate": 0.00035593694852584717,
"loss": 2.2295,
"step": 4140
},
{
"epoch": 0.36724038759346933,
"grad_norm": 0.6822459697723389,
"learning_rate": 0.0003553013557097428,
"loss": 2.2206,
"step": 4150
},
{
"epoch": 0.3681253041900801,
"grad_norm": 0.6047173738479614,
"learning_rate": 0.00035466493438435703,
"loss": 2.2155,
"step": 4160
},
{
"epoch": 0.3690102207866909,
"grad_norm": 0.43247050046920776,
"learning_rate": 0.0003540276895570424,
"loss": 2.2315,
"step": 4170
},
{
"epoch": 0.3698951373833016,
"grad_norm": 0.495980441570282,
"learning_rate": 0.0003533896262416302,
"loss": 2.2295,
"step": 4180
},
{
"epoch": 0.37078005397991237,
"grad_norm": 0.5255789756774902,
"learning_rate": 0.00035275074945839187,
"loss": 2.2338,
"step": 4190
},
{
"epoch": 0.37166497057652315,
"grad_norm": 0.5229047536849976,
"learning_rate": 0.0003521110642339991,
"loss": 2.2253,
"step": 4200
},
{
"epoch": 0.3725498871731339,
"grad_norm": 0.42177897691726685,
"learning_rate": 0.00035147057560148433,
"loss": 2.2267,
"step": 4210
},
{
"epoch": 0.3734348037697447,
"grad_norm": 0.45331940054893494,
"learning_rate": 0.0003508292886002013,
"loss": 2.2153,
"step": 4220
},
{
"epoch": 0.37431972036635547,
"grad_norm": 0.6908669471740723,
"learning_rate": 0.0003501872082757852,
"loss": 2.2118,
"step": 4230
},
{
"epoch": 0.37520463696296624,
"grad_norm": 0.4515652656555176,
"learning_rate": 0.00034954433968011333,
"loss": 2.2165,
"step": 4240
},
{
"epoch": 0.376089553559577,
"grad_norm": 0.39639732241630554,
"learning_rate": 0.00034890068787126475,
"loss": 2.2182,
"step": 4250
},
{
"epoch": 0.3769744701561878,
"grad_norm": 0.4517054259777069,
"learning_rate": 0.0003482562579134809,
"loss": 2.2161,
"step": 4260
},
{
"epoch": 0.37785938675279857,
"grad_norm": 0.4925696551799774,
"learning_rate": 0.0003476110548771259,
"loss": 2.2094,
"step": 4270
},
{
"epoch": 0.37874430334940934,
"grad_norm": 0.5564557909965515,
"learning_rate": 0.00034696508383864633,
"loss": 2.1985,
"step": 4280
},
{
"epoch": 0.3796292199460201,
"grad_norm": 0.5726402401924133,
"learning_rate": 0.0003463183498805312,
"loss": 2.207,
"step": 4290
},
{
"epoch": 0.38051413654263083,
"grad_norm": 0.39135950803756714,
"learning_rate": 0.0003456708580912725,
"loss": 2.207,
"step": 4300
},
{
"epoch": 0.3813990531392416,
"grad_norm": 0.5821109414100647,
"learning_rate": 0.0003450226135653245,
"loss": 2.2369,
"step": 4310
},
{
"epoch": 0.3822839697358524,
"grad_norm": 0.44956621527671814,
"learning_rate": 0.00034437362140306424,
"loss": 2.2136,
"step": 4320
},
{
"epoch": 0.38316888633246315,
"grad_norm": 0.3989977538585663,
"learning_rate": 0.000343723886710751,
"loss": 2.2124,
"step": 4330
},
{
"epoch": 0.38405380292907393,
"grad_norm": 0.5621548295021057,
"learning_rate": 0.0003430734146004863,
"loss": 2.2079,
"step": 4340
},
{
"epoch": 0.3849387195256847,
"grad_norm": 0.6707894206047058,
"learning_rate": 0.0003424222101901738,
"loss": 2.2275,
"step": 4350
},
{
"epoch": 0.3858236361222955,
"grad_norm": 0.4202430248260498,
"learning_rate": 0.0003417702786034786,
"loss": 2.2233,
"step": 4360
},
{
"epoch": 0.38670855271890625,
"grad_norm": 0.6233255863189697,
"learning_rate": 0.0003411176249697875,
"loss": 2.2169,
"step": 4370
},
{
"epoch": 0.387593469315517,
"grad_norm": 0.609015703201294,
"learning_rate": 0.00034046425442416805,
"loss": 2.232,
"step": 4380
},
{
"epoch": 0.3884783859121278,
"grad_norm": 0.5517158508300781,
"learning_rate": 0.0003398101721073288,
"loss": 2.229,
"step": 4390
},
{
"epoch": 0.3893633025087386,
"grad_norm": 0.6552030444145203,
"learning_rate": 0.00033915538316557826,
"loss": 2.2172,
"step": 4400
},
{
"epoch": 0.39024821910534935,
"grad_norm": 0.509896993637085,
"learning_rate": 0.00033849989275078473,
"loss": 2.2444,
"step": 4410
},
{
"epoch": 0.39113313570196007,
"grad_norm": 0.5120652914047241,
"learning_rate": 0.0003378437060203357,
"loss": 2.2113,
"step": 4420
},
{
"epoch": 0.39201805229857084,
"grad_norm": 0.4150264859199524,
"learning_rate": 0.00033718682813709715,
"loss": 2.2235,
"step": 4430
},
{
"epoch": 0.3929029688951816,
"grad_norm": 0.43550485372543335,
"learning_rate": 0.0003365292642693733,
"loss": 2.2227,
"step": 4440
},
{
"epoch": 0.3937878854917924,
"grad_norm": 0.6145851016044617,
"learning_rate": 0.00033587101959086524,
"loss": 2.2307,
"step": 4450
},
{
"epoch": 0.39467280208840316,
"grad_norm": 0.7754809260368347,
"learning_rate": 0.00033521209928063123,
"loss": 2.1927,
"step": 4460
},
{
"epoch": 0.39555771868501394,
"grad_norm": 0.45462676882743835,
"learning_rate": 0.0003345525085230449,
"loss": 2.2267,
"step": 4470
},
{
"epoch": 0.3964426352816247,
"grad_norm": 0.6465119123458862,
"learning_rate": 0.0003338922525077553,
"loss": 2.2108,
"step": 4480
},
{
"epoch": 0.3973275518782355,
"grad_norm": 0.5132725834846497,
"learning_rate": 0.00033323133642964545,
"loss": 2.2237,
"step": 4490
},
{
"epoch": 0.39821246847484626,
"grad_norm": 0.399767130613327,
"learning_rate": 0.00033256976548879183,
"loss": 2.211,
"step": 4500
},
{
"epoch": 0.39909738507145703,
"grad_norm": 0.45235475897789,
"learning_rate": 0.0003319075448904234,
"loss": 2.2203,
"step": 4510
},
{
"epoch": 0.3999823016680678,
"grad_norm": 0.3382948935031891,
"learning_rate": 0.00033124467984488066,
"loss": 2.2098,
"step": 4520
},
{
"epoch": 0.4008672182646785,
"grad_norm": 0.5118781328201294,
"learning_rate": 0.00033058117556757457,
"loss": 2.2064,
"step": 4530
},
{
"epoch": 0.4017521348612893,
"grad_norm": 0.39723286032676697,
"learning_rate": 0.0003299170372789454,
"loss": 2.2086,
"step": 4540
},
{
"epoch": 0.4026370514579001,
"grad_norm": 0.46771299839019775,
"learning_rate": 0.0003292522702044221,
"loss": 2.2039,
"step": 4550
},
{
"epoch": 0.40352196805451085,
"grad_norm": 0.43072032928466797,
"learning_rate": 0.0003285868795743805,
"loss": 2.1885,
"step": 4560
},
{
"epoch": 0.4044068846511216,
"grad_norm": 0.5793524980545044,
"learning_rate": 0.0003279208706241031,
"loss": 2.2077,
"step": 4570
},
{
"epoch": 0.4052918012477324,
"grad_norm": 0.4353041648864746,
"learning_rate": 0.00032725424859373687,
"loss": 2.2022,
"step": 4580
},
{
"epoch": 0.40617671784434317,
"grad_norm": 0.5556246042251587,
"learning_rate": 0.00032658701872825265,
"loss": 2.2221,
"step": 4590
},
{
"epoch": 0.40706163444095395,
"grad_norm": 0.4479421079158783,
"learning_rate": 0.0003259191862774037,
"loss": 2.2024,
"step": 4600
},
{
"epoch": 0.4079465510375647,
"grad_norm": 0.4452154338359833,
"learning_rate": 0.0003252507564956844,
"loss": 2.2154,
"step": 4610
},
{
"epoch": 0.4088314676341755,
"grad_norm": 0.4886869788169861,
"learning_rate": 0.000324581734642289,
"loss": 2.2013,
"step": 4620
},
{
"epoch": 0.40971638423078627,
"grad_norm": 0.4508967101573944,
"learning_rate": 0.0003239121259810701,
"loss": 2.2229,
"step": 4630
},
{
"epoch": 0.41060130082739704,
"grad_norm": 0.37643152475357056,
"learning_rate": 0.00032324193578049724,
"loss": 2.2062,
"step": 4640
},
{
"epoch": 0.41148621742400776,
"grad_norm": 0.4318946897983551,
"learning_rate": 0.00032257116931361555,
"loss": 2.2152,
"step": 4650
},
{
"epoch": 0.41237113402061853,
"grad_norm": 0.5894390940666199,
"learning_rate": 0.0003218998318580043,
"loss": 2.2079,
"step": 4660
},
{
"epoch": 0.4132560506172293,
"grad_norm": 0.5740794539451599,
"learning_rate": 0.0003212279286957352,
"loss": 2.2118,
"step": 4670
},
{
"epoch": 0.4141409672138401,
"grad_norm": 0.4139980673789978,
"learning_rate": 0.00032055546511333075,
"loss": 2.207,
"step": 4680
},
{
"epoch": 0.41502588381045086,
"grad_norm": 0.5307886004447937,
"learning_rate": 0.00031988244640172327,
"loss": 2.2077,
"step": 4690
},
{
"epoch": 0.41591080040706163,
"grad_norm": 0.5277409553527832,
"learning_rate": 0.00031920887785621233,
"loss": 2.2067,
"step": 4700
},
{
"epoch": 0.4167957170036724,
"grad_norm": 0.4328935444355011,
"learning_rate": 0.0003185347647764241,
"loss": 2.2273,
"step": 4710
},
{
"epoch": 0.4176806336002832,
"grad_norm": 0.4828528165817261,
"learning_rate": 0.00031786011246626855,
"loss": 2.2284,
"step": 4720
},
{
"epoch": 0.41856555019689395,
"grad_norm": 0.6279691457748413,
"learning_rate": 0.00031718492623389896,
"loss": 2.1989,
"step": 4730
},
{
"epoch": 0.41945046679350473,
"grad_norm": 0.5163887739181519,
"learning_rate": 0.0003165092113916688,
"loss": 2.2091,
"step": 4740
},
{
"epoch": 0.4203353833901155,
"grad_norm": 0.38124004006385803,
"learning_rate": 0.00031583297325609116,
"loss": 2.209,
"step": 4750
},
{
"epoch": 0.4212202999867263,
"grad_norm": 0.502122700214386,
"learning_rate": 0.00031515621714779636,
"loss": 2.1982,
"step": 4760
},
{
"epoch": 0.422105216583337,
"grad_norm": 0.403822124004364,
"learning_rate": 0.0003144789483914898,
"loss": 2.2207,
"step": 4770
},
{
"epoch": 0.42299013317994777,
"grad_norm": 0.3561129868030548,
"learning_rate": 0.00031380117231591067,
"loss": 2.2032,
"step": 4780
},
{
"epoch": 0.42387504977655854,
"grad_norm": 0.6012318730354309,
"learning_rate": 0.0003131228942537895,
"loss": 2.1987,
"step": 4790
},
{
"epoch": 0.4247599663731693,
"grad_norm": 0.5018305778503418,
"learning_rate": 0.00031244411954180673,
"loss": 2.2083,
"step": 4800
},
{
"epoch": 0.4256448829697801,
"grad_norm": 0.4451483488082886,
"learning_rate": 0.00031176485352055015,
"loss": 2.2157,
"step": 4810
},
{
"epoch": 0.42652979956639087,
"grad_norm": 0.5879373550415039,
"learning_rate": 0.0003110851015344735,
"loss": 2.2291,
"step": 4820
},
{
"epoch": 0.42741471616300164,
"grad_norm": 0.5806208252906799,
"learning_rate": 0.0003104048689318538,
"loss": 2.2127,
"step": 4830
},
{
"epoch": 0.4282996327596124,
"grad_norm": 0.4805116355419159,
"learning_rate": 0.0003097241610647494,
"loss": 2.2026,
"step": 4840
},
{
"epoch": 0.4291845493562232,
"grad_norm": 0.45545217394828796,
"learning_rate": 0.00030904298328895865,
"loss": 2.2075,
"step": 4850
},
{
"epoch": 0.43006946595283396,
"grad_norm": 0.545505166053772,
"learning_rate": 0.0003083613409639764,
"loss": 2.2201,
"step": 4860
},
{
"epoch": 0.43095438254944474,
"grad_norm": 0.5077127814292908,
"learning_rate": 0.00030767923945295306,
"loss": 2.2317,
"step": 4870
},
{
"epoch": 0.4318392991460555,
"grad_norm": 0.5988184213638306,
"learning_rate": 0.00030699668412265173,
"loss": 2.1995,
"step": 4880
},
{
"epoch": 0.43272421574266623,
"grad_norm": 0.4757969081401825,
"learning_rate": 0.00030631368034340624,
"loss": 2.1997,
"step": 4890
},
{
"epoch": 0.433609132339277,
"grad_norm": 0.4290432631969452,
"learning_rate": 0.0003056302334890786,
"loss": 2.2172,
"step": 4900
},
{
"epoch": 0.4344940489358878,
"grad_norm": 0.48155564069747925,
"learning_rate": 0.00030494634893701725,
"loss": 2.2104,
"step": 4910
},
{
"epoch": 0.43537896553249855,
"grad_norm": 0.47956278920173645,
"learning_rate": 0.00030426203206801406,
"loss": 2.1989,
"step": 4920
},
{
"epoch": 0.4362638821291093,
"grad_norm": 0.433569073677063,
"learning_rate": 0.00030357728826626266,
"loss": 2.2159,
"step": 4930
},
{
"epoch": 0.4371487987257201,
"grad_norm": 0.4097784757614136,
"learning_rate": 0.0003028921229193157,
"loss": 2.2198,
"step": 4940
},
{
"epoch": 0.4380337153223309,
"grad_norm": 0.3822705149650574,
"learning_rate": 0.00030220654141804247,
"loss": 2.2031,
"step": 4950
},
{
"epoch": 0.43891863191894165,
"grad_norm": 0.5066282749176025,
"learning_rate": 0.00030152054915658663,
"loss": 2.2007,
"step": 4960
},
{
"epoch": 0.4398035485155524,
"grad_norm": 0.3919082283973694,
"learning_rate": 0.0003008341515323235,
"loss": 2.2084,
"step": 4970
},
{
"epoch": 0.4406884651121632,
"grad_norm": 0.40400826930999756,
"learning_rate": 0.0003001473539458182,
"loss": 2.2039,
"step": 4980
},
{
"epoch": 0.44157338170877397,
"grad_norm": 0.44832319021224976,
"learning_rate": 0.00029946016180078234,
"loss": 2.1997,
"step": 4990
},
{
"epoch": 0.44245829830538475,
"grad_norm": 0.42437443137168884,
"learning_rate": 0.0002987725805040321,
"loss": 2.21,
"step": 5000
},
{
"epoch": 0.44245829830538475,
"eval_accuracy": 0.5471037697452792,
"eval_loss": 2.1154496669769287,
"eval_runtime": 12.1652,
"eval_samples_per_second": 26.14,
"eval_steps_per_second": 0.411,
"step": 5000
},
{
"epoch": 0.44334321490199546,
"grad_norm": 0.44694674015045166,
"learning_rate": 0.0002980846154654455,
"loss": 2.2418,
"step": 5010
},
{
"epoch": 0.44422813149860624,
"grad_norm": 0.5123154520988464,
"learning_rate": 0.0002973962720979196,
"loss": 2.1966,
"step": 5020
},
{
"epoch": 0.445113048095217,
"grad_norm": 0.452332466840744,
"learning_rate": 0.0002967075558173287,
"loss": 2.2079,
"step": 5030
},
{
"epoch": 0.4459979646918278,
"grad_norm": 0.42223265767097473,
"learning_rate": 0.00029601847204248046,
"loss": 2.1987,
"step": 5040
},
{
"epoch": 0.44688288128843856,
"grad_norm": 0.37268826365470886,
"learning_rate": 0.00029532902619507464,
"loss": 2.2177,
"step": 5050
},
{
"epoch": 0.44776779788504933,
"grad_norm": 0.45287415385246277,
"learning_rate": 0.0002946392236996592,
"loss": 2.2085,
"step": 5060
},
{
"epoch": 0.4486527144816601,
"grad_norm": 0.3807585835456848,
"learning_rate": 0.0002939490699835887,
"loss": 2.2083,
"step": 5070
},
{
"epoch": 0.4495376310782709,
"grad_norm": 0.45550355315208435,
"learning_rate": 0.00029325857047698067,
"loss": 2.195,
"step": 5080
},
{
"epoch": 0.45042254767488166,
"grad_norm": 0.45763885974884033,
"learning_rate": 0.00029256773061267375,
"loss": 2.21,
"step": 5090
},
{
"epoch": 0.45130746427149243,
"grad_norm": 0.3954174816608429,
"learning_rate": 0.0002918765558261841,
"loss": 2.2278,
"step": 5100
},
{
"epoch": 0.4521923808681032,
"grad_norm": 0.4900796413421631,
"learning_rate": 0.00029118505155566334,
"loss": 2.2106,
"step": 5110
},
{
"epoch": 0.4530772974647139,
"grad_norm": 0.6190772652626038,
"learning_rate": 0.00029049322324185524,
"loss": 2.2257,
"step": 5120
},
{
"epoch": 0.4539622140613247,
"grad_norm": 0.4596320688724518,
"learning_rate": 0.0002898010763280533,
"loss": 2.2085,
"step": 5130
},
{
"epoch": 0.4548471306579355,
"grad_norm": 0.391040563583374,
"learning_rate": 0.00028910861626005774,
"loss": 2.2019,
"step": 5140
},
{
"epoch": 0.45573204725454625,
"grad_norm": 0.6000839471817017,
"learning_rate": 0.0002884158484861325,
"loss": 2.1821,
"step": 5150
},
{
"epoch": 0.456616963851157,
"grad_norm": 0.4674588739871979,
"learning_rate": 0.00028772277845696287,
"loss": 2.1972,
"step": 5160
},
{
"epoch": 0.4575018804477678,
"grad_norm": 0.6962230801582336,
"learning_rate": 0.0002870294116256119,
"loss": 2.2126,
"step": 5170
},
{
"epoch": 0.45838679704437857,
"grad_norm": 0.5181962251663208,
"learning_rate": 0.0002863357534474782,
"loss": 2.1981,
"step": 5180
},
{
"epoch": 0.45927171364098934,
"grad_norm": 0.7072561979293823,
"learning_rate": 0.0002856418093802525,
"loss": 2.2286,
"step": 5190
},
{
"epoch": 0.4601566302376001,
"grad_norm": 0.4124125838279724,
"learning_rate": 0.0002849475848838749,
"loss": 2.1959,
"step": 5200
},
{
"epoch": 0.4610415468342109,
"grad_norm": 0.4569956660270691,
"learning_rate": 0.00028425308542049207,
"loss": 2.2136,
"step": 5210
},
{
"epoch": 0.46192646343082167,
"grad_norm": 0.43784773349761963,
"learning_rate": 0.0002835583164544139,
"loss": 2.2099,
"step": 5220
},
{
"epoch": 0.46281138002743244,
"grad_norm": 0.4107704758644104,
"learning_rate": 0.00028286328345207096,
"loss": 2.1818,
"step": 5230
},
{
"epoch": 0.46369629662404316,
"grad_norm": 0.4356732964515686,
"learning_rate": 0.000282167991881971,
"loss": 2.2151,
"step": 5240
},
{
"epoch": 0.46458121322065393,
"grad_norm": 0.4446347653865814,
"learning_rate": 0.00028147244721465637,
"loss": 2.2017,
"step": 5250
},
{
"epoch": 0.4654661298172647,
"grad_norm": 0.3608684539794922,
"learning_rate": 0.00028077665492266075,
"loss": 2.2149,
"step": 5260
},
{
"epoch": 0.4663510464138755,
"grad_norm": 0.6555531620979309,
"learning_rate": 0.000280080620480466,
"loss": 2.2124,
"step": 5270
},
{
"epoch": 0.46723596301048625,
"grad_norm": 0.5321036577224731,
"learning_rate": 0.00027938434936445943,
"loss": 2.1878,
"step": 5280
},
{
"epoch": 0.46812087960709703,
"grad_norm": 0.5092534422874451,
"learning_rate": 0.00027868784705289024,
"loss": 2.1893,
"step": 5290
},
{
"epoch": 0.4690057962037078,
"grad_norm": 0.65367192029953,
"learning_rate": 0.00027799111902582696,
"loss": 2.1994,
"step": 5300
},
{
"epoch": 0.4698907128003186,
"grad_norm": 0.44738465547561646,
"learning_rate": 0.0002772941707651138,
"loss": 2.2117,
"step": 5310
},
{
"epoch": 0.47077562939692935,
"grad_norm": 0.3654314875602722,
"learning_rate": 0.00027659700775432784,
"loss": 2.1912,
"step": 5320
},
{
"epoch": 0.4716605459935401,
"grad_norm": 0.6195691227912903,
"learning_rate": 0.00027589963547873585,
"loss": 2.1965,
"step": 5330
},
{
"epoch": 0.4725454625901509,
"grad_norm": 0.40310239791870117,
"learning_rate": 0.0002752020594252511,
"loss": 2.2075,
"step": 5340
},
{
"epoch": 0.4734303791867617,
"grad_norm": 0.43661314249038696,
"learning_rate": 0.0002745042850823902,
"loss": 2.1909,
"step": 5350
},
{
"epoch": 0.4743152957833724,
"grad_norm": 0.67514568567276,
"learning_rate": 0.00027380631794022967,
"loss": 2.192,
"step": 5360
},
{
"epoch": 0.47520021237998317,
"grad_norm": 0.6213112473487854,
"learning_rate": 0.0002731081634903633,
"loss": 2.205,
"step": 5370
},
{
"epoch": 0.47608512897659394,
"grad_norm": 0.6548684239387512,
"learning_rate": 0.0002724098272258584,
"loss": 2.2097,
"step": 5380
},
{
"epoch": 0.4769700455732047,
"grad_norm": 0.412087619304657,
"learning_rate": 0.0002717113146412129,
"loss": 2.1935,
"step": 5390
},
{
"epoch": 0.4778549621698155,
"grad_norm": 0.49308520555496216,
"learning_rate": 0.0002710126312323119,
"loss": 2.1952,
"step": 5400
},
{
"epoch": 0.47873987876642626,
"grad_norm": 0.6524165272712708,
"learning_rate": 0.00027031378249638474,
"loss": 2.2203,
"step": 5410
},
{
"epoch": 0.47962479536303704,
"grad_norm": 0.4079228341579437,
"learning_rate": 0.00026961477393196127,
"loss": 2.211,
"step": 5420
},
{
"epoch": 0.4805097119596478,
"grad_norm": 0.46248430013656616,
"learning_rate": 0.0002689156110388292,
"loss": 2.2164,
"step": 5430
},
{
"epoch": 0.4813946285562586,
"grad_norm": 0.4539341628551483,
"learning_rate": 0.0002682162993179901,
"loss": 2.2176,
"step": 5440
},
{
"epoch": 0.48227954515286936,
"grad_norm": 0.5375662446022034,
"learning_rate": 0.00026751684427161684,
"loss": 2.2197,
"step": 5450
},
{
"epoch": 0.48316446174948013,
"grad_norm": 0.4172353148460388,
"learning_rate": 0.00026681725140300993,
"loss": 2.1972,
"step": 5460
},
{
"epoch": 0.4840493783460909,
"grad_norm": 0.4539187252521515,
"learning_rate": 0.0002661175262165541,
"loss": 2.2104,
"step": 5470
},
{
"epoch": 0.4849342949427016,
"grad_norm": 0.6489385962486267,
"learning_rate": 0.0002654176742176754,
"loss": 2.2101,
"step": 5480
},
{
"epoch": 0.4858192115393124,
"grad_norm": 0.40475186705589294,
"learning_rate": 0.00026471770091279724,
"loss": 2.2076,
"step": 5490
},
{
"epoch": 0.4867041281359232,
"grad_norm": 0.4963974356651306,
"learning_rate": 0.00026401761180929796,
"loss": 2.2017,
"step": 5500
},
{
"epoch": 0.48758904473253395,
"grad_norm": 0.40801239013671875,
"learning_rate": 0.0002633174124154666,
"loss": 2.2242,
"step": 5510
},
{
"epoch": 0.4884739613291447,
"grad_norm": 0.3968900144100189,
"learning_rate": 0.0002626171082404602,
"loss": 2.1925,
"step": 5520
},
{
"epoch": 0.4893588779257555,
"grad_norm": 0.42560887336730957,
"learning_rate": 0.0002619167047942602,
"loss": 2.1916,
"step": 5530
},
{
"epoch": 0.49024379452236627,
"grad_norm": 0.42510178685188293,
"learning_rate": 0.00026121620758762877,
"loss": 2.1896,
"step": 5540
},
{
"epoch": 0.49112871111897705,
"grad_norm": 0.34354716539382935,
"learning_rate": 0.0002605156221320663,
"loss": 2.1943,
"step": 5550
},
{
"epoch": 0.4920136277155878,
"grad_norm": 0.617936372756958,
"learning_rate": 0.00025981495393976716,
"loss": 2.2124,
"step": 5560
},
{
"epoch": 0.4928985443121986,
"grad_norm": 0.6653580069541931,
"learning_rate": 0.00025911420852357695,
"loss": 2.2019,
"step": 5570
},
{
"epoch": 0.49378346090880937,
"grad_norm": 0.38025906682014465,
"learning_rate": 0.0002584133913969485,
"loss": 2.1852,
"step": 5580
},
{
"epoch": 0.4946683775054201,
"grad_norm": 0.38345324993133545,
"learning_rate": 0.0002577125080738993,
"loss": 2.1998,
"step": 5590
},
{
"epoch": 0.49555329410203086,
"grad_norm": 0.41947296261787415,
"learning_rate": 0.00025701156406896723,
"loss": 2.2045,
"step": 5600
},
{
"epoch": 0.49643821069864164,
"grad_norm": 0.6970186233520508,
"learning_rate": 0.0002563105648971681,
"loss": 2.1963,
"step": 5610
},
{
"epoch": 0.4973231272952524,
"grad_norm": 0.4399164021015167,
"learning_rate": 0.00025560951607395127,
"loss": 2.2119,
"step": 5620
},
{
"epoch": 0.4982080438918632,
"grad_norm": 0.45648398995399475,
"learning_rate": 0.00025490842311515704,
"loss": 2.1836,
"step": 5630
},
{
"epoch": 0.49909296048847396,
"grad_norm": 0.43257445096969604,
"learning_rate": 0.00025420729153697306,
"loss": 2.2201,
"step": 5640
},
{
"epoch": 0.49997787708508473,
"grad_norm": 0.5123971104621887,
"learning_rate": 0.00025350612685589056,
"loss": 2.1969,
"step": 5650
},
{
"epoch": 0.5008627936816955,
"grad_norm": 0.6617570519447327,
"learning_rate": 0.0002528049345886615,
"loss": 2.1973,
"step": 5660
},
{
"epoch": 0.5017477102783062,
"grad_norm": 0.35327666997909546,
"learning_rate": 0.0002521037202522546,
"loss": 2.2027,
"step": 5670
},
{
"epoch": 0.502632626874917,
"grad_norm": 0.5147636532783508,
"learning_rate": 0.00025140248936381246,
"loss": 2.2133,
"step": 5680
},
{
"epoch": 0.5035175434715278,
"grad_norm": 0.4274057447910309,
"learning_rate": 0.0002507012474406077,
"loss": 2.195,
"step": 5690
},
{
"epoch": 0.5044024600681386,
"grad_norm": 0.40779900550842285,
"learning_rate": 0.00025,
"loss": 2.214,
"step": 5700
},
{
"epoch": 0.5052873766647493,
"grad_norm": 0.6237518191337585,
"learning_rate": 0.00024929875255939236,
"loss": 2.2385,
"step": 5710
},
{
"epoch": 0.5061722932613602,
"grad_norm": 0.4006004333496094,
"learning_rate": 0.0002485975106361876,
"loss": 2.211,
"step": 5720
},
{
"epoch": 0.5070572098579709,
"grad_norm": 0.5931833982467651,
"learning_rate": 0.0002478962797477455,
"loss": 2.2033,
"step": 5730
},
{
"epoch": 0.5079421264545817,
"grad_norm": 0.5313906073570251,
"learning_rate": 0.00024719506541133853,
"loss": 2.2076,
"step": 5740
},
{
"epoch": 0.5088270430511924,
"grad_norm": 0.3830896317958832,
"learning_rate": 0.00024649387314410945,
"loss": 2.1989,
"step": 5750
},
{
"epoch": 0.5097119596478032,
"grad_norm": 0.35289981961250305,
"learning_rate": 0.00024579270846302695,
"loss": 2.2155,
"step": 5760
},
{
"epoch": 0.510596876244414,
"grad_norm": 0.4024136960506439,
"learning_rate": 0.00024509157688484297,
"loss": 2.184,
"step": 5770
},
{
"epoch": 0.5114817928410247,
"grad_norm": 0.5324482321739197,
"learning_rate": 0.0002443904839260488,
"loss": 2.2059,
"step": 5780
},
{
"epoch": 0.5123667094376355,
"grad_norm": 0.3289058208465576,
"learning_rate": 0.000243689435102832,
"loss": 2.1914,
"step": 5790
},
{
"epoch": 0.5132516260342462,
"grad_norm": 0.4984874725341797,
"learning_rate": 0.00024298843593103278,
"loss": 2.1879,
"step": 5800
},
{
"epoch": 0.5141365426308571,
"grad_norm": 0.4327697455883026,
"learning_rate": 0.0002422874919261008,
"loss": 2.1919,
"step": 5810
},
{
"epoch": 0.5150214592274678,
"grad_norm": 0.5142413973808289,
"learning_rate": 0.0002415866086030516,
"loss": 2.2187,
"step": 5820
},
{
"epoch": 0.5159063758240786,
"grad_norm": 0.3675195574760437,
"learning_rate": 0.00024088579147642317,
"loss": 2.2138,
"step": 5830
},
{
"epoch": 0.5167912924206893,
"grad_norm": 0.44548624753952026,
"learning_rate": 0.00024018504606023293,
"loss": 2.1809,
"step": 5840
},
{
"epoch": 0.5176762090173002,
"grad_norm": 0.39151784777641296,
"learning_rate": 0.0002394843778679338,
"loss": 2.1724,
"step": 5850
},
{
"epoch": 0.5185611256139109,
"grad_norm": 0.519999086856842,
"learning_rate": 0.00023878379241237135,
"loss": 2.1958,
"step": 5860
},
{
"epoch": 0.5194460422105217,
"grad_norm": 0.467977911233902,
"learning_rate": 0.00023808329520573997,
"loss": 2.1986,
"step": 5870
},
{
"epoch": 0.5203309588071324,
"grad_norm": 0.3409745693206787,
"learning_rate": 0.00023738289175953976,
"loss": 2.1951,
"step": 5880
},
{
"epoch": 0.5212158754037431,
"grad_norm": 0.41722747683525085,
"learning_rate": 0.00023668258758453338,
"loss": 2.1927,
"step": 5890
},
{
"epoch": 0.522100792000354,
"grad_norm": 0.44009125232696533,
"learning_rate": 0.00023598238819070203,
"loss": 2.1973,
"step": 5900
},
{
"epoch": 0.5229857085969647,
"grad_norm": 0.3806867301464081,
"learning_rate": 0.00023528229908720272,
"loss": 2.2134,
"step": 5910
},
{
"epoch": 0.5238706251935755,
"grad_norm": 0.3461344242095947,
"learning_rate": 0.00023458232578232462,
"loss": 2.18,
"step": 5920
},
{
"epoch": 0.5247555417901862,
"grad_norm": 0.3539530634880066,
"learning_rate": 0.0002338824737834459,
"loss": 2.1847,
"step": 5930
},
{
"epoch": 0.5256404583867971,
"grad_norm": 0.3867835998535156,
"learning_rate": 0.0002331827485969901,
"loss": 2.2008,
"step": 5940
},
{
"epoch": 0.5265253749834078,
"grad_norm": 0.47032880783081055,
"learning_rate": 0.00023248315572838317,
"loss": 2.2001,
"step": 5950
},
{
"epoch": 0.5274102915800186,
"grad_norm": 0.39191240072250366,
"learning_rate": 0.00023178370068201,
"loss": 2.1971,
"step": 5960
},
{
"epoch": 0.5282952081766293,
"grad_norm": 0.3935898542404175,
"learning_rate": 0.0002310843889611709,
"loss": 2.2009,
"step": 5970
},
{
"epoch": 0.5291801247732402,
"grad_norm": 0.3939705789089203,
"learning_rate": 0.0002303852260680388,
"loss": 2.1976,
"step": 5980
},
{
"epoch": 0.5300650413698509,
"grad_norm": 0.37949132919311523,
"learning_rate": 0.00022968621750361532,
"loss": 2.2045,
"step": 5990
},
{
"epoch": 0.5309499579664617,
"grad_norm": 0.5454613566398621,
"learning_rate": 0.00022898736876768815,
"loss": 2.1945,
"step": 6000
},
{
"epoch": 0.5309499579664617,
"eval_accuracy": 0.5490304943135131,
"eval_loss": 2.104365110397339,
"eval_runtime": 11.2034,
"eval_samples_per_second": 28.384,
"eval_steps_per_second": 0.446,
"step": 6000
},
{
"epoch": 0.5318348745630724,
"grad_norm": 0.4849005341529846,
"learning_rate": 0.00022828868535878713,
"loss": 2.1681,
"step": 6010
},
{
"epoch": 0.5327197911596832,
"grad_norm": 0.5288079380989075,
"learning_rate": 0.00022759017277414165,
"loss": 2.1923,
"step": 6020
},
{
"epoch": 0.533604707756294,
"grad_norm": 0.37164533138275146,
"learning_rate": 0.0002268918365096367,
"loss": 2.2136,
"step": 6030
},
{
"epoch": 0.5344896243529047,
"grad_norm": 0.35533079504966736,
"learning_rate": 0.00022619368205977036,
"loss": 2.21,
"step": 6040
},
{
"epoch": 0.5353745409495155,
"grad_norm": 0.3953758180141449,
"learning_rate": 0.00022549571491760985,
"loss": 2.2034,
"step": 6050
},
{
"epoch": 0.5362594575461263,
"grad_norm": 0.41037318110466003,
"learning_rate": 0.0002247979405747489,
"loss": 2.1955,
"step": 6060
},
{
"epoch": 0.5371443741427371,
"grad_norm": 0.46826791763305664,
"learning_rate": 0.00022410036452126417,
"loss": 2.1646,
"step": 6070
},
{
"epoch": 0.5380292907393478,
"grad_norm": 0.6032235622406006,
"learning_rate": 0.00022340299224567217,
"loss": 2.2023,
"step": 6080
},
{
"epoch": 0.5389142073359586,
"grad_norm": 0.41023147106170654,
"learning_rate": 0.00022270582923488626,
"loss": 2.2043,
"step": 6090
},
{
"epoch": 0.5397991239325693,
"grad_norm": 0.44359543919563293,
"learning_rate": 0.00022200888097417305,
"loss": 2.1805,
"step": 6100
},
{
"epoch": 0.5406840405291802,
"grad_norm": 0.3892384469509125,
"learning_rate": 0.00022131215294710977,
"loss": 2.1955,
"step": 6110
},
{
"epoch": 0.5415689571257909,
"grad_norm": 0.3548850417137146,
"learning_rate": 0.00022061565063554063,
"loss": 2.1924,
"step": 6120
},
{
"epoch": 0.5424538737224016,
"grad_norm": 0.4247482419013977,
"learning_rate": 0.00021991937951953405,
"loss": 2.1926,
"step": 6130
},
{
"epoch": 0.5433387903190124,
"grad_norm": 0.43686312437057495,
"learning_rate": 0.00021922334507733931,
"loss": 2.187,
"step": 6140
},
{
"epoch": 0.5442237069156232,
"grad_norm": 0.4155753552913666,
"learning_rate": 0.0002185275527853437,
"loss": 2.2047,
"step": 6150
},
{
"epoch": 0.545108623512234,
"grad_norm": 0.4141368865966797,
"learning_rate": 0.00021783200811802906,
"loss": 2.1979,
"step": 6160
},
{
"epoch": 0.5459935401088447,
"grad_norm": 0.4047829210758209,
"learning_rate": 0.00021713671654792916,
"loss": 2.1808,
"step": 6170
},
{
"epoch": 0.5468784567054555,
"grad_norm": 0.4952094554901123,
"learning_rate": 0.0002164416835455862,
"loss": 2.2289,
"step": 6180
},
{
"epoch": 0.5477633733020663,
"grad_norm": 0.34553036093711853,
"learning_rate": 0.00021574691457950805,
"loss": 2.1779,
"step": 6190
},
{
"epoch": 0.5486482898986771,
"grad_norm": 0.37532058358192444,
"learning_rate": 0.00021505241511612523,
"loss": 2.1995,
"step": 6200
},
{
"epoch": 0.5495332064952878,
"grad_norm": 0.5026415586471558,
"learning_rate": 0.0002143581906197476,
"loss": 2.199,
"step": 6210
},
{
"epoch": 0.5504181230918986,
"grad_norm": 0.4540441036224365,
"learning_rate": 0.0002136642465525219,
"loss": 2.1923,
"step": 6220
},
{
"epoch": 0.5513030396885094,
"grad_norm": 0.43957459926605225,
"learning_rate": 0.0002129705883743881,
"loss": 2.1816,
"step": 6230
},
{
"epoch": 0.5521879562851201,
"grad_norm": 0.3559401333332062,
"learning_rate": 0.00021227722154303714,
"loss": 2.1891,
"step": 6240
},
{
"epoch": 0.5530728728817309,
"grad_norm": 0.5275906324386597,
"learning_rate": 0.00021158415151386746,
"loss": 2.2062,
"step": 6250
},
{
"epoch": 0.5539577894783416,
"grad_norm": 0.44601666927337646,
"learning_rate": 0.00021089138373994224,
"loss": 2.1942,
"step": 6260
},
{
"epoch": 0.5548427060749525,
"grad_norm": 0.42079922556877136,
"learning_rate": 0.0002101989236719467,
"loss": 2.2039,
"step": 6270
},
{
"epoch": 0.5557276226715632,
"grad_norm": 0.46110275387763977,
"learning_rate": 0.0002095067767581447,
"loss": 2.1989,
"step": 6280
},
{
"epoch": 0.556612539268174,
"grad_norm": 0.7742722630500793,
"learning_rate": 0.0002088149484443367,
"loss": 2.1854,
"step": 6290
},
{
"epoch": 0.5574974558647847,
"grad_norm": 0.43369126319885254,
"learning_rate": 0.00020812344417381592,
"loss": 2.1982,
"step": 6300
},
{
"epoch": 0.5583823724613955,
"grad_norm": 0.4741725027561188,
"learning_rate": 0.00020743226938732626,
"loss": 2.2027,
"step": 6310
},
{
"epoch": 0.5592672890580063,
"grad_norm": 0.3926391303539276,
"learning_rate": 0.00020674142952301934,
"loss": 2.1918,
"step": 6320
},
{
"epoch": 0.5601522056546171,
"grad_norm": 0.410693883895874,
"learning_rate": 0.00020605093001641137,
"loss": 2.1705,
"step": 6330
},
{
"epoch": 0.5610371222512278,
"grad_norm": 0.6874290108680725,
"learning_rate": 0.00020536077630034085,
"loss": 2.1878,
"step": 6340
},
{
"epoch": 0.5619220388478386,
"grad_norm": 0.5000712871551514,
"learning_rate": 0.00020467097380492545,
"loss": 2.1787,
"step": 6350
},
{
"epoch": 0.5628069554444494,
"grad_norm": 0.4700552523136139,
"learning_rate": 0.00020398152795751955,
"loss": 2.1964,
"step": 6360
},
{
"epoch": 0.5636918720410601,
"grad_norm": 0.4930076003074646,
"learning_rate": 0.00020329244418267138,
"loss": 2.1953,
"step": 6370
},
{
"epoch": 0.5645767886376709,
"grad_norm": 0.476241797208786,
"learning_rate": 0.0002026037279020804,
"loss": 2.2076,
"step": 6380
},
{
"epoch": 0.5654617052342816,
"grad_norm": 0.34659117460250854,
"learning_rate": 0.00020191538453455458,
"loss": 2.1888,
"step": 6390
},
{
"epoch": 0.5663466218308925,
"grad_norm": 0.40348589420318604,
"learning_rate": 0.00020122741949596797,
"loss": 2.1915,
"step": 6400
},
{
"epoch": 0.5672315384275032,
"grad_norm": 0.5236355662345886,
"learning_rate": 0.00020053983819921773,
"loss": 2.2073,
"step": 6410
},
{
"epoch": 0.568116455024114,
"grad_norm": 0.3683709502220154,
"learning_rate": 0.00019985264605418181,
"loss": 2.1811,
"step": 6420
},
{
"epoch": 0.5690013716207247,
"grad_norm": 0.349714457988739,
"learning_rate": 0.00019916584846767652,
"loss": 2.1851,
"step": 6430
},
{
"epoch": 0.5698862882173356,
"grad_norm": 0.35854193568229675,
"learning_rate": 0.00019847945084341343,
"loss": 2.1777,
"step": 6440
},
{
"epoch": 0.5707712048139463,
"grad_norm": 0.41335052251815796,
"learning_rate": 0.00019779345858195757,
"loss": 2.1887,
"step": 6450
},
{
"epoch": 0.5716561214105571,
"grad_norm": 0.3837135434150696,
"learning_rate": 0.0001971078770806843,
"loss": 2.2111,
"step": 6460
},
{
"epoch": 0.5725410380071678,
"grad_norm": 0.4042917490005493,
"learning_rate": 0.00019642271173373735,
"loss": 2.2014,
"step": 6470
},
{
"epoch": 0.5734259546037785,
"grad_norm": 0.6200820803642273,
"learning_rate": 0.00019573796793198595,
"loss": 2.2045,
"step": 6480
},
{
"epoch": 0.5743108712003894,
"grad_norm": 0.3666927218437195,
"learning_rate": 0.00019505365106298284,
"loss": 2.2006,
"step": 6490
},
{
"epoch": 0.5751957877970001,
"grad_norm": 0.3415856957435608,
"learning_rate": 0.00019436976651092142,
"loss": 2.1875,
"step": 6500
},
{
"epoch": 0.5760807043936109,
"grad_norm": 0.36049988865852356,
"learning_rate": 0.00019368631965659385,
"loss": 2.2193,
"step": 6510
},
{
"epoch": 0.5769656209902216,
"grad_norm": 0.47630825638771057,
"learning_rate": 0.00019300331587734833,
"loss": 2.1938,
"step": 6520
},
{
"epoch": 0.5778505375868325,
"grad_norm": 0.3804795444011688,
"learning_rate": 0.000192320760547047,
"loss": 2.1808,
"step": 6530
},
{
"epoch": 0.5787354541834432,
"grad_norm": 0.49671676754951477,
"learning_rate": 0.00019163865903602372,
"loss": 2.1902,
"step": 6540
},
{
"epoch": 0.579620370780054,
"grad_norm": 0.37095028162002563,
"learning_rate": 0.0001909570167110415,
"loss": 2.178,
"step": 6550
},
{
"epoch": 0.5805052873766647,
"grad_norm": 0.3675883114337921,
"learning_rate": 0.00019027583893525067,
"loss": 2.2009,
"step": 6560
},
{
"epoch": 0.5813902039732756,
"grad_norm": 0.4657158851623535,
"learning_rate": 0.00018959513106814633,
"loss": 2.1962,
"step": 6570
},
{
"epoch": 0.5822751205698863,
"grad_norm": 0.3962918519973755,
"learning_rate": 0.00018891489846552647,
"loss": 2.2035,
"step": 6580
},
{
"epoch": 0.583160037166497,
"grad_norm": 0.3245933949947357,
"learning_rate": 0.00018823514647944977,
"loss": 2.1891,
"step": 6590
},
{
"epoch": 0.5840449537631078,
"grad_norm": 0.33663785457611084,
"learning_rate": 0.00018755588045819325,
"loss": 2.1994,
"step": 6600
},
{
"epoch": 0.5849298703597186,
"grad_norm": 0.3755311667919159,
"learning_rate": 0.00018687710574621051,
"loss": 2.1949,
"step": 6610
},
{
"epoch": 0.5858147869563294,
"grad_norm": 0.3784433603286743,
"learning_rate": 0.00018619882768408937,
"loss": 2.1972,
"step": 6620
},
{
"epoch": 0.5866997035529401,
"grad_norm": 0.3571663796901703,
"learning_rate": 0.00018552105160851018,
"loss": 2.1922,
"step": 6630
},
{
"epoch": 0.5875846201495509,
"grad_norm": 0.5752764344215393,
"learning_rate": 0.00018484378285220365,
"loss": 2.1765,
"step": 6640
},
{
"epoch": 0.5884695367461616,
"grad_norm": 0.39760568737983704,
"learning_rate": 0.0001841670267439088,
"loss": 2.1922,
"step": 6650
},
{
"epoch": 0.5893544533427725,
"grad_norm": 0.48002487421035767,
"learning_rate": 0.00018349078860833125,
"loss": 2.173,
"step": 6660
},
{
"epoch": 0.5902393699393832,
"grad_norm": 0.4568367600440979,
"learning_rate": 0.00018281507376610113,
"loss": 2.2042,
"step": 6670
},
{
"epoch": 0.591124286535994,
"grad_norm": 0.44323500990867615,
"learning_rate": 0.00018213988753373146,
"loss": 2.1808,
"step": 6680
},
{
"epoch": 0.5920092031326047,
"grad_norm": 0.4837689697742462,
"learning_rate": 0.00018146523522357595,
"loss": 2.1728,
"step": 6690
},
{
"epoch": 0.5928941197292156,
"grad_norm": 0.4340761601924896,
"learning_rate": 0.00018079112214378768,
"loss": 2.1915,
"step": 6700
},
{
"epoch": 0.5937790363258263,
"grad_norm": 0.42073750495910645,
"learning_rate": 0.00018011755359827677,
"loss": 2.1906,
"step": 6710
},
{
"epoch": 0.594663952922437,
"grad_norm": 0.4763769805431366,
"learning_rate": 0.00017944453488666928,
"loss": 2.1974,
"step": 6720
},
{
"epoch": 0.5955488695190478,
"grad_norm": 0.42602041363716125,
"learning_rate": 0.00017877207130426488,
"loss": 2.1697,
"step": 6730
},
{
"epoch": 0.5964337861156586,
"grad_norm": 0.7025333046913147,
"learning_rate": 0.0001781001681419957,
"loss": 2.1958,
"step": 6740
},
{
"epoch": 0.5973187027122694,
"grad_norm": 0.46738317608833313,
"learning_rate": 0.00017742883068638446,
"loss": 2.1852,
"step": 6750
},
{
"epoch": 0.5982036193088801,
"grad_norm": 0.4451451003551483,
"learning_rate": 0.00017675806421950277,
"loss": 2.1855,
"step": 6760
},
{
"epoch": 0.5990885359054909,
"grad_norm": 0.3433722257614136,
"learning_rate": 0.00017608787401892994,
"loss": 2.1793,
"step": 6770
},
{
"epoch": 0.5999734525021017,
"grad_norm": 0.305349200963974,
"learning_rate": 0.000175418265357711,
"loss": 2.1926,
"step": 6780
},
{
"epoch": 0.6008583690987125,
"grad_norm": 0.369484007358551,
"learning_rate": 0.00017474924350431565,
"loss": 2.1917,
"step": 6790
},
{
"epoch": 0.6017432856953232,
"grad_norm": 0.42088308930397034,
"learning_rate": 0.00017408081372259632,
"loss": 2.1814,
"step": 6800
},
{
"epoch": 0.602628202291934,
"grad_norm": 0.396318256855011,
"learning_rate": 0.00017341298127174744,
"loss": 2.2017,
"step": 6810
},
{
"epoch": 0.6035131188885448,
"grad_norm": 0.37352317571640015,
"learning_rate": 0.00017274575140626317,
"loss": 2.1785,
"step": 6820
},
{
"epoch": 0.6043980354851555,
"grad_norm": 0.37729695439338684,
"learning_rate": 0.00017207912937589696,
"loss": 2.2002,
"step": 6830
},
{
"epoch": 0.6052829520817663,
"grad_norm": 0.3980563282966614,
"learning_rate": 0.0001714131204256195,
"loss": 2.1804,
"step": 6840
},
{
"epoch": 0.606167868678377,
"grad_norm": 0.3724077343940735,
"learning_rate": 0.000170747729795578,
"loss": 2.1873,
"step": 6850
},
{
"epoch": 0.6070527852749878,
"grad_norm": 0.4318739175796509,
"learning_rate": 0.00017008296272105468,
"loss": 2.1726,
"step": 6860
},
{
"epoch": 0.6079377018715986,
"grad_norm": 0.5658116340637207,
"learning_rate": 0.00016941882443242555,
"loss": 2.1767,
"step": 6870
},
{
"epoch": 0.6088226184682094,
"grad_norm": 0.38237351179122925,
"learning_rate": 0.00016875532015511944,
"loss": 2.1893,
"step": 6880
},
{
"epoch": 0.6097075350648201,
"grad_norm": 0.32790622115135193,
"learning_rate": 0.00016809245510957666,
"loss": 2.1834,
"step": 6890
},
{
"epoch": 0.610592451661431,
"grad_norm": 0.3833366334438324,
"learning_rate": 0.00016743023451120832,
"loss": 2.1983,
"step": 6900
},
{
"epoch": 0.6114773682580417,
"grad_norm": 0.2924376130104065,
"learning_rate": 0.00016676866357035467,
"loss": 2.2048,
"step": 6910
},
{
"epoch": 0.6123622848546525,
"grad_norm": 0.4293077290058136,
"learning_rate": 0.00016610774749224483,
"loss": 2.189,
"step": 6920
},
{
"epoch": 0.6132472014512632,
"grad_norm": 0.5160631537437439,
"learning_rate": 0.0001654474914769551,
"loss": 2.1937,
"step": 6930
},
{
"epoch": 0.614132118047874,
"grad_norm": 0.4933619499206543,
"learning_rate": 0.00016478790071936875,
"loss": 2.1793,
"step": 6940
},
{
"epoch": 0.6150170346444848,
"grad_norm": 0.4233640134334564,
"learning_rate": 0.00016412898040913472,
"loss": 2.1766,
"step": 6950
},
{
"epoch": 0.6159019512410955,
"grad_norm": 0.34439942240715027,
"learning_rate": 0.0001634707357306267,
"loss": 2.1895,
"step": 6960
},
{
"epoch": 0.6167868678377063,
"grad_norm": 0.34741097688674927,
"learning_rate": 0.00016281317186290283,
"loss": 2.2113,
"step": 6970
},
{
"epoch": 0.617671784434317,
"grad_norm": 0.34368258714675903,
"learning_rate": 0.0001621562939796643,
"loss": 2.1917,
"step": 6980
},
{
"epoch": 0.6185567010309279,
"grad_norm": 0.47897687554359436,
"learning_rate": 0.00016150010724921525,
"loss": 2.1909,
"step": 6990
},
{
"epoch": 0.6194416176275386,
"grad_norm": 0.3656957149505615,
"learning_rate": 0.00016084461683442175,
"loss": 2.1744,
"step": 7000
},
{
"epoch": 0.6194416176275386,
"eval_accuracy": 0.5502146351202954,
"eval_loss": 2.095036029815674,
"eval_runtime": 12.4508,
"eval_samples_per_second": 25.541,
"eval_steps_per_second": 0.402,
"step": 7000
},
{
"epoch": 0.6203265342241494,
"grad_norm": 0.29836395382881165,
"learning_rate": 0.00016018982789267123,
"loss": 2.1902,
"step": 7010
},
{
"epoch": 0.6212114508207601,
"grad_norm": 0.5148487091064453,
"learning_rate": 0.000159535745575832,
"loss": 2.1859,
"step": 7020
},
{
"epoch": 0.622096367417371,
"grad_norm": 0.379721075296402,
"learning_rate": 0.0001588823750302126,
"loss": 2.1793,
"step": 7030
},
{
"epoch": 0.6229812840139817,
"grad_norm": 0.3983226716518402,
"learning_rate": 0.00015822972139652148,
"loss": 2.1819,
"step": 7040
},
{
"epoch": 0.6238662006105925,
"grad_norm": 0.30344444513320923,
"learning_rate": 0.00015757778980982626,
"loss": 2.19,
"step": 7050
},
{
"epoch": 0.6247511172072032,
"grad_norm": 0.3318573236465454,
"learning_rate": 0.00015692658539951372,
"loss": 2.1693,
"step": 7060
},
{
"epoch": 0.6256360338038139,
"grad_norm": 0.3758665919303894,
"learning_rate": 0.00015627611328924903,
"loss": 2.1951,
"step": 7070
},
{
"epoch": 0.6265209504004248,
"grad_norm": 0.3673815131187439,
"learning_rate": 0.00015562637859693586,
"loss": 2.1886,
"step": 7080
},
{
"epoch": 0.6274058669970355,
"grad_norm": 0.31688493490219116,
"learning_rate": 0.0001549773864346755,
"loss": 2.2186,
"step": 7090
},
{
"epoch": 0.6282907835936463,
"grad_norm": 0.4083735942840576,
"learning_rate": 0.00015432914190872756,
"loss": 2.1785,
"step": 7100
},
{
"epoch": 0.629175700190257,
"grad_norm": 0.34488794207572937,
"learning_rate": 0.00015368165011946886,
"loss": 2.2075,
"step": 7110
},
{
"epoch": 0.6300606167868679,
"grad_norm": 0.31938597559928894,
"learning_rate": 0.00015303491616135373,
"loss": 2.1995,
"step": 7120
},
{
"epoch": 0.6309455333834786,
"grad_norm": 0.366621196269989,
"learning_rate": 0.00015238894512287413,
"loss": 2.1676,
"step": 7130
},
{
"epoch": 0.6318304499800894,
"grad_norm": 0.42678719758987427,
"learning_rate": 0.0001517437420865191,
"loss": 2.1845,
"step": 7140
},
{
"epoch": 0.6327153665767001,
"grad_norm": 0.41343775391578674,
"learning_rate": 0.00015109931212873535,
"loss": 2.1639,
"step": 7150
},
{
"epoch": 0.633600283173311,
"grad_norm": 0.30154532194137573,
"learning_rate": 0.0001504556603198867,
"loss": 2.202,
"step": 7160
},
{
"epoch": 0.6344851997699217,
"grad_norm": 0.33191919326782227,
"learning_rate": 0.00014981279172421482,
"loss": 2.1845,
"step": 7170
},
{
"epoch": 0.6353701163665324,
"grad_norm": 0.38592010736465454,
"learning_rate": 0.00014917071139979875,
"loss": 2.1916,
"step": 7180
},
{
"epoch": 0.6362550329631432,
"grad_norm": 0.3502799868583679,
"learning_rate": 0.00014852942439851576,
"loss": 2.1942,
"step": 7190
},
{
"epoch": 0.637139949559754,
"grad_norm": 0.34366223216056824,
"learning_rate": 0.000147888935766001,
"loss": 2.1823,
"step": 7200
},
{
"epoch": 0.6380248661563648,
"grad_norm": 0.32077911496162415,
"learning_rate": 0.0001472492505416082,
"loss": 2.1911,
"step": 7210
},
{
"epoch": 0.6389097827529755,
"grad_norm": 0.36590802669525146,
"learning_rate": 0.00014661037375836988,
"loss": 2.1914,
"step": 7220
},
{
"epoch": 0.6397946993495863,
"grad_norm": 0.34605008363723755,
"learning_rate": 0.0001459723104429577,
"loss": 2.1833,
"step": 7230
},
{
"epoch": 0.640679615946197,
"grad_norm": 0.3608168065547943,
"learning_rate": 0.00014533506561564306,
"loss": 2.1944,
"step": 7240
},
{
"epoch": 0.6415645325428079,
"grad_norm": 0.330559104681015,
"learning_rate": 0.0001446986442902574,
"loss": 2.192,
"step": 7250
},
{
"epoch": 0.6424494491394186,
"grad_norm": 0.32037970423698425,
"learning_rate": 0.00014406305147415284,
"loss": 2.1803,
"step": 7260
},
{
"epoch": 0.6433343657360294,
"grad_norm": 0.37160640954971313,
"learning_rate": 0.00014342829216816309,
"loss": 2.1795,
"step": 7270
},
{
"epoch": 0.6442192823326401,
"grad_norm": 0.38226374983787537,
"learning_rate": 0.00014279437136656336,
"loss": 2.1567,
"step": 7280
},
{
"epoch": 0.645104198929251,
"grad_norm": 0.39466509222984314,
"learning_rate": 0.00014216129405703203,
"loss": 2.1963,
"step": 7290
},
{
"epoch": 0.6459891155258617,
"grad_norm": 0.4139063060283661,
"learning_rate": 0.00014152906522061048,
"loss": 2.1784,
"step": 7300
},
{
"epoch": 0.6468740321224724,
"grad_norm": 0.35524383187294006,
"learning_rate": 0.00014089768983166444,
"loss": 2.1712,
"step": 7310
},
{
"epoch": 0.6477589487190832,
"grad_norm": 0.3009701669216156,
"learning_rate": 0.00014026717285784492,
"loss": 2.1888,
"step": 7320
},
{
"epoch": 0.648643865315694,
"grad_norm": 0.528045117855072,
"learning_rate": 0.00013963751926004863,
"loss": 2.2064,
"step": 7330
},
{
"epoch": 0.6495287819123048,
"grad_norm": 0.4917372167110443,
"learning_rate": 0.0001390087339923795,
"loss": 2.1688,
"step": 7340
},
{
"epoch": 0.6504136985089155,
"grad_norm": 0.40180593729019165,
"learning_rate": 0.0001383808220021093,
"loss": 2.1832,
"step": 7350
},
{
"epoch": 0.6512986151055263,
"grad_norm": 0.4933311939239502,
"learning_rate": 0.00013775378822963882,
"loss": 2.1737,
"step": 7360
},
{
"epoch": 0.652183531702137,
"grad_norm": 0.44765591621398926,
"learning_rate": 0.00013712763760845937,
"loss": 2.1721,
"step": 7370
},
{
"epoch": 0.6530684482987479,
"grad_norm": 0.511542797088623,
"learning_rate": 0.00013650237506511331,
"loss": 2.1801,
"step": 7380
},
{
"epoch": 0.6539533648953586,
"grad_norm": 0.33777767419815063,
"learning_rate": 0.00013587800551915575,
"loss": 2.183,
"step": 7390
},
{
"epoch": 0.6548382814919694,
"grad_norm": 0.4522639513015747,
"learning_rate": 0.00013525453388311555,
"loss": 2.1912,
"step": 7400
},
{
"epoch": 0.6557231980885802,
"grad_norm": 0.35160988569259644,
"learning_rate": 0.0001346319650624572,
"loss": 2.1796,
"step": 7410
},
{
"epoch": 0.6566081146851909,
"grad_norm": 0.3671887516975403,
"learning_rate": 0.0001340103039555415,
"loss": 2.1904,
"step": 7420
},
{
"epoch": 0.6574930312818017,
"grad_norm": 0.3306543827056885,
"learning_rate": 0.00013338955545358754,
"loss": 2.1922,
"step": 7430
},
{
"epoch": 0.6583779478784124,
"grad_norm": 0.5238605737686157,
"learning_rate": 0.00013276972444063384,
"loss": 2.2065,
"step": 7440
},
{
"epoch": 0.6592628644750232,
"grad_norm": 0.3615835905075073,
"learning_rate": 0.00013215081579350058,
"loss": 2.1936,
"step": 7450
},
{
"epoch": 0.660147781071634,
"grad_norm": 0.3860037624835968,
"learning_rate": 0.00013153283438175034,
"loss": 2.1908,
"step": 7460
},
{
"epoch": 0.6610326976682448,
"grad_norm": 0.3101390600204468,
"learning_rate": 0.00013091578506765046,
"loss": 2.2007,
"step": 7470
},
{
"epoch": 0.6619176142648555,
"grad_norm": 0.3494165539741516,
"learning_rate": 0.00013029967270613435,
"loss": 2.1729,
"step": 7480
},
{
"epoch": 0.6628025308614663,
"grad_norm": 0.4269309341907501,
"learning_rate": 0.00012968450214476368,
"loss": 2.1882,
"step": 7490
},
{
"epoch": 0.6636874474580771,
"grad_norm": 0.43789321184158325,
"learning_rate": 0.00012907027822369005,
"loss": 2.1983,
"step": 7500
},
{
"epoch": 0.6645723640546879,
"grad_norm": 0.3296915590763092,
"learning_rate": 0.0001284570057756169,
"loss": 2.1759,
"step": 7510
},
{
"epoch": 0.6654572806512986,
"grad_norm": 0.3194144666194916,
"learning_rate": 0.00012784468962576134,
"loss": 2.1868,
"step": 7520
},
{
"epoch": 0.6663421972479093,
"grad_norm": 0.42226454615592957,
"learning_rate": 0.00012723333459181642,
"loss": 2.2096,
"step": 7530
},
{
"epoch": 0.6672271138445202,
"grad_norm": 0.3759223520755768,
"learning_rate": 0.00012662294548391328,
"loss": 2.1851,
"step": 7540
},
{
"epoch": 0.6681120304411309,
"grad_norm": 0.292855441570282,
"learning_rate": 0.00012601352710458314,
"loss": 2.1992,
"step": 7550
},
{
"epoch": 0.6689969470377417,
"grad_norm": 0.348399817943573,
"learning_rate": 0.00012540508424871934,
"loss": 2.1677,
"step": 7560
},
{
"epoch": 0.6698818636343524,
"grad_norm": 0.36560651659965515,
"learning_rate": 0.00012479762170353997,
"loss": 2.1773,
"step": 7570
},
{
"epoch": 0.6707667802309633,
"grad_norm": 0.3574565351009369,
"learning_rate": 0.00012419114424854998,
"loss": 2.1848,
"step": 7580
},
{
"epoch": 0.671651696827574,
"grad_norm": 0.35882261395454407,
"learning_rate": 0.0001235856566555039,
"loss": 2.1868,
"step": 7590
},
{
"epoch": 0.6725366134241848,
"grad_norm": 0.3170746862888336,
"learning_rate": 0.0001229811636883677,
"loss": 2.1881,
"step": 7600
},
{
"epoch": 0.6734215300207955,
"grad_norm": 0.3750855028629303,
"learning_rate": 0.00012237767010328182,
"loss": 2.1658,
"step": 7610
},
{
"epoch": 0.6743064466174064,
"grad_norm": 0.31514084339141846,
"learning_rate": 0.0001217751806485235,
"loss": 2.1709,
"step": 7620
},
{
"epoch": 0.6751913632140171,
"grad_norm": 0.4474596679210663,
"learning_rate": 0.00012117370006446957,
"loss": 2.1802,
"step": 7630
},
{
"epoch": 0.6760762798106279,
"grad_norm": 0.36836448311805725,
"learning_rate": 0.00012057323308355922,
"loss": 2.1704,
"step": 7640
},
{
"epoch": 0.6769611964072386,
"grad_norm": 0.3137301206588745,
"learning_rate": 0.00011997378443025633,
"loss": 2.2023,
"step": 7650
},
{
"epoch": 0.6778461130038493,
"grad_norm": 0.3115224838256836,
"learning_rate": 0.00011937535882101281,
"loss": 2.1702,
"step": 7660
},
{
"epoch": 0.6787310296004602,
"grad_norm": 0.4060702621936798,
"learning_rate": 0.00011877796096423105,
"loss": 2.1803,
"step": 7670
},
{
"epoch": 0.6796159461970709,
"grad_norm": 0.30474671721458435,
"learning_rate": 0.00011818159556022748,
"loss": 2.1892,
"step": 7680
},
{
"epoch": 0.6805008627936817,
"grad_norm": 0.29813340306282043,
"learning_rate": 0.00011758626730119487,
"loss": 2.1831,
"step": 7690
},
{
"epoch": 0.6813857793902924,
"grad_norm": 0.35376375913619995,
"learning_rate": 0.00011699198087116588,
"loss": 2.1768,
"step": 7700
},
{
"epoch": 0.6822706959869033,
"grad_norm": 0.34044161438941956,
"learning_rate": 0.00011639874094597605,
"loss": 2.1696,
"step": 7710
},
{
"epoch": 0.683155612583514,
"grad_norm": 0.3716510534286499,
"learning_rate": 0.000115806552193227,
"loss": 2.1897,
"step": 7720
},
{
"epoch": 0.6840405291801248,
"grad_norm": 0.3332691788673401,
"learning_rate": 0.00011521541927224994,
"loss": 2.1611,
"step": 7730
},
{
"epoch": 0.6849254457767355,
"grad_norm": 0.3617340624332428,
"learning_rate": 0.00011462534683406858,
"loss": 2.1593,
"step": 7740
},
{
"epoch": 0.6858103623733464,
"grad_norm": 0.33560147881507874,
"learning_rate": 0.00011403633952136289,
"loss": 2.1933,
"step": 7750
},
{
"epoch": 0.6866952789699571,
"grad_norm": 0.30507397651672363,
"learning_rate": 0.00011344840196843228,
"loss": 2.1908,
"step": 7760
},
{
"epoch": 0.6875801955665678,
"grad_norm": 0.28181737661361694,
"learning_rate": 0.00011286153880115966,
"loss": 2.1963,
"step": 7770
},
{
"epoch": 0.6884651121631786,
"grad_norm": 0.3205542266368866,
"learning_rate": 0.0001122757546369744,
"loss": 2.1629,
"step": 7780
},
{
"epoch": 0.6893500287597893,
"grad_norm": 0.4453352391719818,
"learning_rate": 0.00011169105408481634,
"loss": 2.2044,
"step": 7790
},
{
"epoch": 0.6902349453564002,
"grad_norm": 0.29993703961372375,
"learning_rate": 0.00011110744174509952,
"loss": 2.1742,
"step": 7800
},
{
"epoch": 0.6911198619530109,
"grad_norm": 0.3727935552597046,
"learning_rate": 0.00011052492220967583,
"loss": 2.1755,
"step": 7810
},
{
"epoch": 0.6920047785496217,
"grad_norm": 0.32742083072662354,
"learning_rate": 0.00010994350006179932,
"loss": 2.2046,
"step": 7820
},
{
"epoch": 0.6928896951462324,
"grad_norm": 0.45911964774131775,
"learning_rate": 0.00010936317987608946,
"loss": 2.1755,
"step": 7830
},
{
"epoch": 0.6937746117428433,
"grad_norm": 0.3224821090698242,
"learning_rate": 0.00010878396621849565,
"loss": 2.1789,
"step": 7840
},
{
"epoch": 0.694659528339454,
"grad_norm": 0.5779576301574707,
"learning_rate": 0.00010820586364626103,
"loss": 2.186,
"step": 7850
},
{
"epoch": 0.6955444449360648,
"grad_norm": 0.36172717809677124,
"learning_rate": 0.00010762887670788701,
"loss": 2.2043,
"step": 7860
},
{
"epoch": 0.6964293615326755,
"grad_norm": 0.3582923710346222,
"learning_rate": 0.00010705300994309697,
"loss": 2.1745,
"step": 7870
},
{
"epoch": 0.6973142781292864,
"grad_norm": 0.3162693977355957,
"learning_rate": 0.00010647826788280083,
"loss": 2.1838,
"step": 7880
},
{
"epoch": 0.6981991947258971,
"grad_norm": 0.3397110402584076,
"learning_rate": 0.0001059046550490593,
"loss": 2.1854,
"step": 7890
},
{
"epoch": 0.6990841113225078,
"grad_norm": 0.30742955207824707,
"learning_rate": 0.00010533217595504857,
"loss": 2.1747,
"step": 7900
},
{
"epoch": 0.6999690279191186,
"grad_norm": 0.32416754961013794,
"learning_rate": 0.00010476083510502443,
"loss": 2.1828,
"step": 7910
},
{
"epoch": 0.7008539445157294,
"grad_norm": 0.41834747791290283,
"learning_rate": 0.00010419063699428691,
"loss": 2.1849,
"step": 7920
},
{
"epoch": 0.7017388611123402,
"grad_norm": 0.3807116150856018,
"learning_rate": 0.00010362158610914516,
"loss": 2.1674,
"step": 7930
},
{
"epoch": 0.7026237777089509,
"grad_norm": 0.3779432475566864,
"learning_rate": 0.00010305368692688174,
"loss": 2.1683,
"step": 7940
},
{
"epoch": 0.7035086943055617,
"grad_norm": 0.41769009828567505,
"learning_rate": 0.000102486943915718,
"loss": 2.1788,
"step": 7950
},
{
"epoch": 0.7043936109021725,
"grad_norm": 0.33971527218818665,
"learning_rate": 0.00010192136153477825,
"loss": 2.1844,
"step": 7960
},
{
"epoch": 0.7052785274987833,
"grad_norm": 0.2998791038990021,
"learning_rate": 0.00010135694423405506,
"loss": 2.1906,
"step": 7970
},
{
"epoch": 0.706163444095394,
"grad_norm": 0.30726414918899536,
"learning_rate": 0.00010079369645437411,
"loss": 2.1802,
"step": 7980
},
{
"epoch": 0.7070483606920048,
"grad_norm": 0.3112538456916809,
"learning_rate": 0.00010023162262735944,
"loss": 2.1887,
"step": 7990
},
{
"epoch": 0.7079332772886155,
"grad_norm": 0.33356785774230957,
"learning_rate": 9.967072717539852e-05,
"loss": 2.1971,
"step": 8000
},
{
"epoch": 0.7079332772886155,
"eval_accuracy": 0.5514356363412967,
"eval_loss": 2.0872702598571777,
"eval_runtime": 12.2195,
"eval_samples_per_second": 26.024,
"eval_steps_per_second": 0.409,
"step": 8000
},
{
"epoch": 0.7088181938852263,
"grad_norm": 0.35864993929862976,
"learning_rate": 9.911101451160715e-05,
"loss": 2.1773,
"step": 8010
},
{
"epoch": 0.7097031104818371,
"grad_norm": 0.4708113372325897,
"learning_rate": 9.855248903979506e-05,
"loss": 2.1867,
"step": 8020
},
{
"epoch": 0.7105880270784478,
"grad_norm": 0.32000553607940674,
"learning_rate": 9.79951551544311e-05,
"loss": 2.1721,
"step": 8030
},
{
"epoch": 0.7114729436750586,
"grad_norm": 0.3156352639198303,
"learning_rate": 9.743901724060905e-05,
"loss": 2.1935,
"step": 8040
},
{
"epoch": 0.7123578602716694,
"grad_norm": 0.3504098355770111,
"learning_rate": 9.688407967401247e-05,
"loss": 2.1891,
"step": 8050
},
{
"epoch": 0.7132427768682802,
"grad_norm": 0.4470130205154419,
"learning_rate": 9.633034682088071e-05,
"loss": 2.1604,
"step": 8060
},
{
"epoch": 0.7141276934648909,
"grad_norm": 0.37104523181915283,
"learning_rate": 9.57778230379745e-05,
"loss": 2.1695,
"step": 8070
},
{
"epoch": 0.7150126100615017,
"grad_norm": 0.30490779876708984,
"learning_rate": 9.522651267254148e-05,
"loss": 2.1722,
"step": 8080
},
{
"epoch": 0.7158975266581125,
"grad_norm": 0.288102924823761,
"learning_rate": 9.467642006228244e-05,
"loss": 2.165,
"step": 8090
},
{
"epoch": 0.7167824432547233,
"grad_norm": 0.2840191125869751,
"learning_rate": 9.412754953531663e-05,
"loss": 2.1873,
"step": 8100
},
{
"epoch": 0.717667359851334,
"grad_norm": 0.3146025836467743,
"learning_rate": 9.357990541014805e-05,
"loss": 2.1756,
"step": 8110
},
{
"epoch": 0.7185522764479447,
"grad_norm": 0.299532413482666,
"learning_rate": 9.30334919956313e-05,
"loss": 2.1809,
"step": 8120
},
{
"epoch": 0.7194371930445556,
"grad_norm": 0.34434300661087036,
"learning_rate": 9.248831359093803e-05,
"loss": 2.1781,
"step": 8130
},
{
"epoch": 0.7203221096411663,
"grad_norm": 0.2795085906982422,
"learning_rate": 9.194437448552259e-05,
"loss": 2.1806,
"step": 8140
},
{
"epoch": 0.7212070262377771,
"grad_norm": 0.32003116607666016,
"learning_rate": 9.140167895908866e-05,
"loss": 2.1801,
"step": 8150
},
{
"epoch": 0.7220919428343878,
"grad_norm": 0.2715960144996643,
"learning_rate": 9.086023128155544e-05,
"loss": 2.1647,
"step": 8160
},
{
"epoch": 0.7229768594309987,
"grad_norm": 0.28309276700019836,
"learning_rate": 9.032003571302397e-05,
"loss": 2.177,
"step": 8170
},
{
"epoch": 0.7238617760276094,
"grad_norm": 0.28893986344337463,
"learning_rate": 8.978109650374397e-05,
"loss": 2.1915,
"step": 8180
},
{
"epoch": 0.7247466926242202,
"grad_norm": 0.32400646805763245,
"learning_rate": 8.924341789408e-05,
"loss": 2.1657,
"step": 8190
},
{
"epoch": 0.7256316092208309,
"grad_norm": 0.3074786961078644,
"learning_rate": 8.870700411447816e-05,
"loss": 2.1951,
"step": 8200
},
{
"epoch": 0.7265165258174417,
"grad_norm": 0.30175745487213135,
"learning_rate": 8.817185938543293e-05,
"loss": 2.1753,
"step": 8210
},
{
"epoch": 0.7274014424140525,
"grad_norm": 0.3015293776988983,
"learning_rate": 8.763798791745412e-05,
"loss": 2.1634,
"step": 8220
},
{
"epoch": 0.7282863590106633,
"grad_norm": 0.32819435000419617,
"learning_rate": 8.710539391103328e-05,
"loss": 2.1839,
"step": 8230
},
{
"epoch": 0.729171275607274,
"grad_norm": 0.3071001470088959,
"learning_rate": 8.657408155661109e-05,
"loss": 2.1746,
"step": 8240
},
{
"epoch": 0.7300561922038847,
"grad_norm": 0.38603559136390686,
"learning_rate": 8.604405503454398e-05,
"loss": 2.179,
"step": 8250
},
{
"epoch": 0.7309411088004956,
"grad_norm": 0.3808230757713318,
"learning_rate": 8.551531851507186e-05,
"loss": 2.1609,
"step": 8260
},
{
"epoch": 0.7318260253971063,
"grad_norm": 0.3062630295753479,
"learning_rate": 8.49878761582846e-05,
"loss": 2.1762,
"step": 8270
},
{
"epoch": 0.7327109419937171,
"grad_norm": 0.30554407835006714,
"learning_rate": 8.446173211408972e-05,
"loss": 2.18,
"step": 8280
},
{
"epoch": 0.7335958585903278,
"grad_norm": 0.30430638790130615,
"learning_rate": 8.393689052217964e-05,
"loss": 2.17,
"step": 8290
},
{
"epoch": 0.7344807751869387,
"grad_norm": 0.37365472316741943,
"learning_rate": 8.341335551199903e-05,
"loss": 2.1717,
"step": 8300
},
{
"epoch": 0.7353656917835494,
"grad_norm": 0.30844250321388245,
"learning_rate": 8.289113120271264e-05,
"loss": 2.1989,
"step": 8310
},
{
"epoch": 0.7362506083801602,
"grad_norm": 0.3102276623249054,
"learning_rate": 8.237022170317235e-05,
"loss": 2.1967,
"step": 8320
},
{
"epoch": 0.7371355249767709,
"grad_norm": 0.35639065504074097,
"learning_rate": 8.185063111188523e-05,
"loss": 2.1596,
"step": 8330
},
{
"epoch": 0.7380204415733818,
"grad_norm": 0.3256385326385498,
"learning_rate": 8.133236351698142e-05,
"loss": 2.1866,
"step": 8340
},
{
"epoch": 0.7389053581699925,
"grad_norm": 0.4910948872566223,
"learning_rate": 8.081542299618138e-05,
"loss": 2.1701,
"step": 8350
},
{
"epoch": 0.7397902747666032,
"grad_norm": 0.4157789647579193,
"learning_rate": 8.029981361676455e-05,
"loss": 2.1664,
"step": 8360
},
{
"epoch": 0.740675191363214,
"grad_norm": 0.3930807411670685,
"learning_rate": 7.978553943553665e-05,
"loss": 2.1909,
"step": 8370
},
{
"epoch": 0.7415601079598247,
"grad_norm": 0.3209260404109955,
"learning_rate": 7.927260449879828e-05,
"loss": 2.1963,
"step": 8380
},
{
"epoch": 0.7424450245564356,
"grad_norm": 0.39223039150238037,
"learning_rate": 7.876101284231277e-05,
"loss": 2.1674,
"step": 8390
},
{
"epoch": 0.7433299411530463,
"grad_norm": 0.3471428453922272,
"learning_rate": 7.825076849127458e-05,
"loss": 2.1933,
"step": 8400
},
{
"epoch": 0.7442148577496571,
"grad_norm": 0.38216593861579895,
"learning_rate": 7.774187546027769e-05,
"loss": 2.1688,
"step": 8410
},
{
"epoch": 0.7450997743462678,
"grad_norm": 0.2885330617427826,
"learning_rate": 7.723433775328384e-05,
"loss": 2.2004,
"step": 8420
},
{
"epoch": 0.7459846909428787,
"grad_norm": 0.3208834230899811,
"learning_rate": 7.672815936359106e-05,
"loss": 2.1859,
"step": 8430
},
{
"epoch": 0.7468696075394894,
"grad_norm": 0.32311803102493286,
"learning_rate": 7.622334427380229e-05,
"loss": 2.1901,
"step": 8440
},
{
"epoch": 0.7477545241361002,
"grad_norm": 0.4310019910335541,
"learning_rate": 7.571989645579418e-05,
"loss": 2.1787,
"step": 8450
},
{
"epoch": 0.7486394407327109,
"grad_norm": 0.3954008221626282,
"learning_rate": 7.521781987068566e-05,
"loss": 2.148,
"step": 8460
},
{
"epoch": 0.7495243573293218,
"grad_norm": 0.34704872965812683,
"learning_rate": 7.471711846880669e-05,
"loss": 2.1572,
"step": 8470
},
{
"epoch": 0.7504092739259325,
"grad_norm": 0.3665640950202942,
"learning_rate": 7.421779618966738e-05,
"loss": 2.1767,
"step": 8480
},
{
"epoch": 0.7512941905225432,
"grad_norm": 0.27584344148635864,
"learning_rate": 7.371985696192707e-05,
"loss": 2.1606,
"step": 8490
},
{
"epoch": 0.752179107119154,
"grad_norm": 0.3363373279571533,
"learning_rate": 7.322330470336314e-05,
"loss": 2.1717,
"step": 8500
},
{
"epoch": 0.7530640237157648,
"grad_norm": 0.41205254197120667,
"learning_rate": 7.27281433208403e-05,
"loss": 2.1901,
"step": 8510
},
{
"epoch": 0.7539489403123756,
"grad_norm": 0.29698577523231506,
"learning_rate": 7.223437671027994e-05,
"loss": 2.1762,
"step": 8520
},
{
"epoch": 0.7548338569089863,
"grad_norm": 0.3263265788555145,
"learning_rate": 7.174200875662928e-05,
"loss": 2.1642,
"step": 8530
},
{
"epoch": 0.7557187735055971,
"grad_norm": 0.46074342727661133,
"learning_rate": 7.125104333383118e-05,
"loss": 2.1665,
"step": 8540
},
{
"epoch": 0.7566036901022078,
"grad_norm": 0.31991058588027954,
"learning_rate": 7.07614843047932e-05,
"loss": 2.1941,
"step": 8550
},
{
"epoch": 0.7574886066988187,
"grad_norm": 0.3378481864929199,
"learning_rate": 7.027333552135748e-05,
"loss": 2.1801,
"step": 8560
},
{
"epoch": 0.7583735232954294,
"grad_norm": 0.34531331062316895,
"learning_rate": 6.97866008242703e-05,
"loss": 2.1645,
"step": 8570
},
{
"epoch": 0.7592584398920402,
"grad_norm": 0.29340556263923645,
"learning_rate": 6.930128404315214e-05,
"loss": 2.1734,
"step": 8580
},
{
"epoch": 0.760143356488651,
"grad_norm": 0.3951212763786316,
"learning_rate": 6.881738899646713e-05,
"loss": 2.1725,
"step": 8590
},
{
"epoch": 0.7610282730852617,
"grad_norm": 0.3772408664226532,
"learning_rate": 6.833491949149328e-05,
"loss": 2.1778,
"step": 8600
},
{
"epoch": 0.7619131896818725,
"grad_norm": 0.2892204821109772,
"learning_rate": 6.785387932429243e-05,
"loss": 2.1748,
"step": 8610
},
{
"epoch": 0.7627981062784832,
"grad_norm": 0.3378978669643402,
"learning_rate": 6.737427227968062e-05,
"loss": 2.1871,
"step": 8620
},
{
"epoch": 0.763683022875094,
"grad_norm": 0.2745198607444763,
"learning_rate": 6.689610213119782e-05,
"loss": 2.1752,
"step": 8630
},
{
"epoch": 0.7645679394717048,
"grad_norm": 0.3293655514717102,
"learning_rate": 6.641937264107867e-05,
"loss": 2.1811,
"step": 8640
},
{
"epoch": 0.7654528560683156,
"grad_norm": 0.3286610245704651,
"learning_rate": 6.594408756022272e-05,
"loss": 2.1823,
"step": 8650
},
{
"epoch": 0.7663377726649263,
"grad_norm": 0.3277672231197357,
"learning_rate": 6.547025062816486e-05,
"loss": 2.1707,
"step": 8660
},
{
"epoch": 0.7672226892615371,
"grad_norm": 0.33863088488578796,
"learning_rate": 6.499786557304618e-05,
"loss": 2.1675,
"step": 8670
},
{
"epoch": 0.7681076058581479,
"grad_norm": 0.27539339661598206,
"learning_rate": 6.452693611158411e-05,
"loss": 2.1991,
"step": 8680
},
{
"epoch": 0.7689925224547587,
"grad_norm": 0.35537365078926086,
"learning_rate": 6.405746594904388e-05,
"loss": 2.185,
"step": 8690
},
{
"epoch": 0.7698774390513694,
"grad_norm": 0.34015700221061707,
"learning_rate": 6.35894587792086e-05,
"loss": 2.1794,
"step": 8700
},
{
"epoch": 0.7707623556479801,
"grad_norm": 0.32763075828552246,
"learning_rate": 6.312291828435076e-05,
"loss": 2.1469,
"step": 8710
},
{
"epoch": 0.771647272244591,
"grad_norm": 0.38287562131881714,
"learning_rate": 6.265784813520318e-05,
"loss": 2.1877,
"step": 8720
},
{
"epoch": 0.7725321888412017,
"grad_norm": 0.33399245142936707,
"learning_rate": 6.219425199092981e-05,
"loss": 2.18,
"step": 8730
},
{
"epoch": 0.7734171054378125,
"grad_norm": 0.4592779278755188,
"learning_rate": 6.173213349909729e-05,
"loss": 2.2047,
"step": 8740
},
{
"epoch": 0.7743020220344232,
"grad_norm": 0.40799829363822937,
"learning_rate": 6.127149629564605e-05,
"loss": 2.1583,
"step": 8750
},
{
"epoch": 0.775186938631034,
"grad_norm": 0.3907710909843445,
"learning_rate": 6.081234400486171e-05,
"loss": 2.1608,
"step": 8760
},
{
"epoch": 0.7760718552276448,
"grad_norm": 0.29933851957321167,
"learning_rate": 6.0354680239346925e-05,
"loss": 2.1774,
"step": 8770
},
{
"epoch": 0.7769567718242556,
"grad_norm": 0.33313485980033875,
"learning_rate": 5.989850859999227e-05,
"loss": 2.1656,
"step": 8780
},
{
"epoch": 0.7778416884208663,
"grad_norm": 0.2630128562450409,
"learning_rate": 5.944383267594855e-05,
"loss": 2.1807,
"step": 8790
},
{
"epoch": 0.7787266050174771,
"grad_norm": 0.42465919256210327,
"learning_rate": 5.899065604459813e-05,
"loss": 2.165,
"step": 8800
},
{
"epoch": 0.7796115216140879,
"grad_norm": 0.4454388916492462,
"learning_rate": 5.853898227152718e-05,
"loss": 2.1983,
"step": 8810
},
{
"epoch": 0.7804964382106987,
"grad_norm": 0.32380637526512146,
"learning_rate": 5.808881491049722e-05,
"loss": 2.1738,
"step": 8820
},
{
"epoch": 0.7813813548073094,
"grad_norm": 0.3573935329914093,
"learning_rate": 5.7640157503417444e-05,
"loss": 2.178,
"step": 8830
},
{
"epoch": 0.7822662714039201,
"grad_norm": 0.292910635471344,
"learning_rate": 5.7193013580316646e-05,
"loss": 2.164,
"step": 8840
},
{
"epoch": 0.783151188000531,
"grad_norm": 0.3087019622325897,
"learning_rate": 5.6747386659315755e-05,
"loss": 2.1872,
"step": 8850
},
{
"epoch": 0.7840361045971417,
"grad_norm": 0.37168508768081665,
"learning_rate": 5.6303280246599784e-05,
"loss": 2.1645,
"step": 8860
},
{
"epoch": 0.7849210211937525,
"grad_norm": 0.28677845001220703,
"learning_rate": 5.586069783639039e-05,
"loss": 2.16,
"step": 8870
},
{
"epoch": 0.7858059377903632,
"grad_norm": 0.2966271638870239,
"learning_rate": 5.541964291091855e-05,
"loss": 2.1959,
"step": 8880
},
{
"epoch": 0.7866908543869741,
"grad_norm": 0.3936939537525177,
"learning_rate": 5.4980118940396864e-05,
"loss": 2.173,
"step": 8890
},
{
"epoch": 0.7875757709835848,
"grad_norm": 0.3042806386947632,
"learning_rate": 5.454212938299255e-05,
"loss": 2.1841,
"step": 8900
},
{
"epoch": 0.7884606875801956,
"grad_norm": 0.26205936074256897,
"learning_rate": 5.410567768480004e-05,
"loss": 2.1785,
"step": 8910
},
{
"epoch": 0.7893456041768063,
"grad_norm": 0.29167279601097107,
"learning_rate": 5.367076727981382e-05,
"loss": 2.1918,
"step": 8920
},
{
"epoch": 0.7902305207734172,
"grad_norm": 0.29951152205467224,
"learning_rate": 5.3237401589901536e-05,
"loss": 2.1855,
"step": 8930
},
{
"epoch": 0.7911154373700279,
"grad_norm": 0.31647637486457825,
"learning_rate": 5.2805584024777256e-05,
"loss": 2.1795,
"step": 8940
},
{
"epoch": 0.7920003539666386,
"grad_norm": 0.27526384592056274,
"learning_rate": 5.2375317981974145e-05,
"loss": 2.159,
"step": 8950
},
{
"epoch": 0.7928852705632494,
"grad_norm": 0.30185645818710327,
"learning_rate": 5.194660684681818e-05,
"loss": 2.1746,
"step": 8960
},
{
"epoch": 0.7937701871598601,
"grad_norm": 0.2594131827354431,
"learning_rate": 5.151945399240127e-05,
"loss": 2.1713,
"step": 8970
},
{
"epoch": 0.794655103756471,
"grad_norm": 0.27709755301475525,
"learning_rate": 5.109386277955477e-05,
"loss": 2.1592,
"step": 8980
},
{
"epoch": 0.7955400203530817,
"grad_norm": 0.30955156683921814,
"learning_rate": 5.066983655682325e-05,
"loss": 2.1832,
"step": 8990
},
{
"epoch": 0.7964249369496925,
"grad_norm": 0.2957077920436859,
"learning_rate": 5.02473786604378e-05,
"loss": 2.1662,
"step": 9000
},
{
"epoch": 0.7964249369496925,
"eval_accuracy": 0.5522150805169673,
"eval_loss": 2.081662654876709,
"eval_runtime": 12.2604,
"eval_samples_per_second": 25.937,
"eval_steps_per_second": 0.408,
"step": 9000
},
{
"epoch": 0.7973098535463032,
"grad_norm": 0.31898653507232666,
"learning_rate": 4.982649241428997e-05,
"loss": 2.1762,
"step": 9010
},
{
"epoch": 0.7981947701429141,
"grad_norm": 0.3314710259437561,
"learning_rate": 4.9407181129905525e-05,
"loss": 2.1888,
"step": 9020
},
{
"epoch": 0.7990796867395248,
"grad_norm": 0.3407798111438751,
"learning_rate": 4.898944810641862e-05,
"loss": 2.1655,
"step": 9030
},
{
"epoch": 0.7999646033361356,
"grad_norm": 0.3467320203781128,
"learning_rate": 4.8573296630545685e-05,
"loss": 2.1822,
"step": 9040
},
{
"epoch": 0.8008495199327463,
"grad_norm": 0.31269845366477966,
"learning_rate": 4.81587299765594e-05,
"loss": 2.1553,
"step": 9050
},
{
"epoch": 0.801734436529357,
"grad_norm": 0.3272798955440521,
"learning_rate": 4.7745751406263163e-05,
"loss": 2.1901,
"step": 9060
},
{
"epoch": 0.8026193531259679,
"grad_norm": 0.3094612956047058,
"learning_rate": 4.733436416896528e-05,
"loss": 2.1862,
"step": 9070
},
{
"epoch": 0.8035042697225786,
"grad_norm": 0.32090067863464355,
"learning_rate": 4.692457150145374e-05,
"loss": 2.1739,
"step": 9080
},
{
"epoch": 0.8043891863191894,
"grad_norm": 0.25567829608917236,
"learning_rate": 4.651637662797018e-05,
"loss": 2.1701,
"step": 9090
},
{
"epoch": 0.8052741029158001,
"grad_norm": 0.270111620426178,
"learning_rate": 4.610978276018496e-05,
"loss": 2.1619,
"step": 9100
},
{
"epoch": 0.806159019512411,
"grad_norm": 0.2996044456958771,
"learning_rate": 4.5704793097171766e-05,
"loss": 2.1669,
"step": 9110
},
{
"epoch": 0.8070439361090217,
"grad_norm": 0.2638896703720093,
"learning_rate": 4.5301410825382304e-05,
"loss": 2.1864,
"step": 9120
},
{
"epoch": 0.8079288527056325,
"grad_norm": 0.28163716197013855,
"learning_rate": 4.4899639118621604e-05,
"loss": 2.1842,
"step": 9130
},
{
"epoch": 0.8088137693022432,
"grad_norm": 0.31031861901283264,
"learning_rate": 4.4499481138022546e-05,
"loss": 2.179,
"step": 9140
},
{
"epoch": 0.8096986858988541,
"grad_norm": 0.30774182081222534,
"learning_rate": 4.4100940032021334e-05,
"loss": 2.1791,
"step": 9150
},
{
"epoch": 0.8105836024954648,
"grad_norm": 0.2719011604785919,
"learning_rate": 4.3704018936332605e-05,
"loss": 2.1836,
"step": 9160
},
{
"epoch": 0.8114685190920756,
"grad_norm": 0.33051612973213196,
"learning_rate": 4.3308720973924936e-05,
"loss": 2.1712,
"step": 9170
},
{
"epoch": 0.8123534356886863,
"grad_norm": 0.2812555432319641,
"learning_rate": 4.29150492549959e-05,
"loss": 2.1602,
"step": 9180
},
{
"epoch": 0.8132383522852971,
"grad_norm": 0.26210522651672363,
"learning_rate": 4.2523006876947904e-05,
"loss": 2.176,
"step": 9190
},
{
"epoch": 0.8141232688819079,
"grad_norm": 0.29666024446487427,
"learning_rate": 4.213259692436367e-05,
"loss": 2.1815,
"step": 9200
},
{
"epoch": 0.8150081854785186,
"grad_norm": 0.2840687334537506,
"learning_rate": 4.1743822468982226e-05,
"loss": 2.1808,
"step": 9210
},
{
"epoch": 0.8158931020751294,
"grad_norm": 0.2684008479118347,
"learning_rate": 4.135668656967434e-05,
"loss": 2.1932,
"step": 9220
},
{
"epoch": 0.8167780186717402,
"grad_norm": 0.2919354736804962,
"learning_rate": 4.097119227241869e-05,
"loss": 2.1761,
"step": 9230
},
{
"epoch": 0.817662935268351,
"grad_norm": 0.29693448543548584,
"learning_rate": 4.0587342610277886e-05,
"loss": 2.1651,
"step": 9240
},
{
"epoch": 0.8185478518649617,
"grad_norm": 0.2847377061843872,
"learning_rate": 4.020514060337446e-05,
"loss": 2.1867,
"step": 9250
},
{
"epoch": 0.8194327684615725,
"grad_norm": 0.29556918144226074,
"learning_rate": 3.982458925886748e-05,
"loss": 2.1802,
"step": 9260
},
{
"epoch": 0.8203176850581833,
"grad_norm": 0.3286089599132538,
"learning_rate": 3.944569157092839e-05,
"loss": 2.1653,
"step": 9270
},
{
"epoch": 0.8212026016547941,
"grad_norm": 0.29342418909072876,
"learning_rate": 3.906845052071778e-05,
"loss": 2.1805,
"step": 9280
},
{
"epoch": 0.8220875182514048,
"grad_norm": 0.26084861159324646,
"learning_rate": 3.8692869076361794e-05,
"loss": 2.1626,
"step": 9290
},
{
"epoch": 0.8229724348480155,
"grad_norm": 0.27467480301856995,
"learning_rate": 3.831895019292897e-05,
"loss": 2.18,
"step": 9300
},
{
"epoch": 0.8238573514446264,
"grad_norm": 0.28592967987060547,
"learning_rate": 3.794669681240667e-05,
"loss": 2.1666,
"step": 9310
},
{
"epoch": 0.8247422680412371,
"grad_norm": 0.2642505168914795,
"learning_rate": 3.757611186367823e-05,
"loss": 2.1843,
"step": 9320
},
{
"epoch": 0.8256271846378479,
"grad_norm": 0.2525290250778198,
"learning_rate": 3.7207198262499684e-05,
"loss": 2.1798,
"step": 9330
},
{
"epoch": 0.8265121012344586,
"grad_norm": 0.2868938744068146,
"learning_rate": 3.6839958911476953e-05,
"loss": 2.1682,
"step": 9340
},
{
"epoch": 0.8273970178310694,
"grad_norm": 0.25756001472473145,
"learning_rate": 3.647439670004315e-05,
"loss": 2.1791,
"step": 9350
},
{
"epoch": 0.8282819344276802,
"grad_norm": 0.4099997580051422,
"learning_rate": 3.611051450443551e-05,
"loss": 2.1663,
"step": 9360
},
{
"epoch": 0.829166851024291,
"grad_norm": 0.285736083984375,
"learning_rate": 3.5748315187672935e-05,
"loss": 2.1809,
"step": 9370
},
{
"epoch": 0.8300517676209017,
"grad_norm": 0.2643410265445709,
"learning_rate": 3.5387801599533474e-05,
"loss": 2.2038,
"step": 9380
},
{
"epoch": 0.8309366842175125,
"grad_norm": 0.2976551651954651,
"learning_rate": 3.502897657653201e-05,
"loss": 2.1714,
"step": 9390
},
{
"epoch": 0.8318216008141233,
"grad_norm": 0.25775617361068726,
"learning_rate": 3.467184294189776e-05,
"loss": 2.1549,
"step": 9400
},
{
"epoch": 0.8327065174107341,
"grad_norm": 0.26022714376449585,
"learning_rate": 3.431640350555204e-05,
"loss": 2.1577,
"step": 9410
},
{
"epoch": 0.8335914340073448,
"grad_norm": 0.3256966471672058,
"learning_rate": 3.3962661064086356e-05,
"loss": 2.1812,
"step": 9420
},
{
"epoch": 0.8344763506039555,
"grad_norm": 0.3099459409713745,
"learning_rate": 3.3610618400740146e-05,
"loss": 2.1798,
"step": 9430
},
{
"epoch": 0.8353612672005664,
"grad_norm": 0.2882852554321289,
"learning_rate": 3.326027828537923e-05,
"loss": 2.1551,
"step": 9440
},
{
"epoch": 0.8362461837971771,
"grad_norm": 0.2728523015975952,
"learning_rate": 3.2911643474473646e-05,
"loss": 2.1779,
"step": 9450
},
{
"epoch": 0.8371311003937879,
"grad_norm": 0.28265324234962463,
"learning_rate": 3.2564716711076164e-05,
"loss": 2.1652,
"step": 9460
},
{
"epoch": 0.8380160169903986,
"grad_norm": 0.32904767990112305,
"learning_rate": 3.2219500724800705e-05,
"loss": 2.1833,
"step": 9470
},
{
"epoch": 0.8389009335870095,
"grad_norm": 0.27072396874427795,
"learning_rate": 3.187599823180071e-05,
"loss": 2.1993,
"step": 9480
},
{
"epoch": 0.8397858501836202,
"grad_norm": 0.27186015248298645,
"learning_rate": 3.153421193474809e-05,
"loss": 2.1792,
"step": 9490
},
{
"epoch": 0.840670766780231,
"grad_norm": 0.260337769985199,
"learning_rate": 3.119414452281158e-05,
"loss": 2.1747,
"step": 9500
},
{
"epoch": 0.8415556833768417,
"grad_norm": 0.2633416950702667,
"learning_rate": 3.085579867163582e-05,
"loss": 2.1835,
"step": 9510
},
{
"epoch": 0.8424405999734526,
"grad_norm": 0.2791996896266937,
"learning_rate": 3.051917704332016e-05,
"loss": 2.1776,
"step": 9520
},
{
"epoch": 0.8433255165700633,
"grad_norm": 0.3286284804344177,
"learning_rate": 3.0184282286397997e-05,
"loss": 2.1834,
"step": 9530
},
{
"epoch": 0.844210433166674,
"grad_norm": 0.284078449010849,
"learning_rate": 2.98511170358155e-05,
"loss": 2.1815,
"step": 9540
},
{
"epoch": 0.8450953497632848,
"grad_norm": 0.2807869017124176,
"learning_rate": 2.9519683912911265e-05,
"loss": 2.173,
"step": 9550
},
{
"epoch": 0.8459802663598955,
"grad_norm": 0.28499874472618103,
"learning_rate": 2.918998552539545e-05,
"loss": 2.1757,
"step": 9560
},
{
"epoch": 0.8468651829565064,
"grad_norm": 0.26381710171699524,
"learning_rate": 2.886202446732933e-05,
"loss": 2.1615,
"step": 9570
},
{
"epoch": 0.8477500995531171,
"grad_norm": 0.29104191064834595,
"learning_rate": 2.8535803319105047e-05,
"loss": 2.189,
"step": 9580
},
{
"epoch": 0.8486350161497279,
"grad_norm": 0.2626052498817444,
"learning_rate": 2.821132464742504e-05,
"loss": 2.1785,
"step": 9590
},
{
"epoch": 0.8495199327463386,
"grad_norm": 0.27080950140953064,
"learning_rate": 2.788859100528196e-05,
"loss": 2.1668,
"step": 9600
},
{
"epoch": 0.8504048493429495,
"grad_norm": 0.29530128836631775,
"learning_rate": 2.7567604931938606e-05,
"loss": 2.1941,
"step": 9610
},
{
"epoch": 0.8512897659395602,
"grad_norm": 0.3058115541934967,
"learning_rate": 2.7248368952908055e-05,
"loss": 2.1511,
"step": 9620
},
{
"epoch": 0.852174682536171,
"grad_norm": 0.25152358412742615,
"learning_rate": 2.6930885579933507e-05,
"loss": 2.1721,
"step": 9630
},
{
"epoch": 0.8530595991327817,
"grad_norm": 0.2745242714881897,
"learning_rate": 2.6615157310968778e-05,
"loss": 2.1737,
"step": 9640
},
{
"epoch": 0.8539445157293925,
"grad_norm": 0.2541252374649048,
"learning_rate": 2.6301186630158486e-05,
"loss": 2.1639,
"step": 9650
},
{
"epoch": 0.8548294323260033,
"grad_norm": 0.2664197087287903,
"learning_rate": 2.5988976007818716e-05,
"loss": 2.159,
"step": 9660
},
{
"epoch": 0.855714348922614,
"grad_norm": 0.2929265797138214,
"learning_rate": 2.5678527900417302e-05,
"loss": 2.1731,
"step": 9670
},
{
"epoch": 0.8565992655192248,
"grad_norm": 0.27480408549308777,
"learning_rate": 2.5369844750554705e-05,
"loss": 2.1847,
"step": 9680
},
{
"epoch": 0.8574841821158355,
"grad_norm": 0.25206154584884644,
"learning_rate": 2.5062928986944677e-05,
"loss": 2.1595,
"step": 9690
},
{
"epoch": 0.8583690987124464,
"grad_norm": 0.30056387186050415,
"learning_rate": 2.4757783024395242e-05,
"loss": 2.1659,
"step": 9700
},
{
"epoch": 0.8592540153090571,
"grad_norm": 0.2622138261795044,
"learning_rate": 2.4454409263789694e-05,
"loss": 2.1945,
"step": 9710
},
{
"epoch": 0.8601389319056679,
"grad_norm": 0.29541319608688354,
"learning_rate": 2.4152810092067658e-05,
"loss": 2.1668,
"step": 9720
},
{
"epoch": 0.8610238485022786,
"grad_norm": 0.2514541447162628,
"learning_rate": 2.3852987882206188e-05,
"loss": 2.1667,
"step": 9730
},
{
"epoch": 0.8619087650988895,
"grad_norm": 0.32891350984573364,
"learning_rate": 2.3554944993201487e-05,
"loss": 2.1707,
"step": 9740
},
{
"epoch": 0.8627936816955002,
"grad_norm": 0.30388736724853516,
"learning_rate": 2.325868377004986e-05,
"loss": 2.1727,
"step": 9750
},
{
"epoch": 0.863678598292111,
"grad_norm": 0.28365710377693176,
"learning_rate": 2.296420654372966e-05,
"loss": 2.1745,
"step": 9760
},
{
"epoch": 0.8645635148887217,
"grad_norm": 0.29059097170829773,
"learning_rate": 2.2671515631182666e-05,
"loss": 2.175,
"step": 9770
},
{
"epoch": 0.8654484314853325,
"grad_norm": 0.2463858276605606,
"learning_rate": 2.2380613335296037e-05,
"loss": 2.1817,
"step": 9780
},
{
"epoch": 0.8663333480819433,
"grad_norm": 0.30910032987594604,
"learning_rate": 2.2091501944884073e-05,
"loss": 2.1894,
"step": 9790
},
{
"epoch": 0.867218264678554,
"grad_norm": 0.2667929530143738,
"learning_rate": 2.1804183734670273e-05,
"loss": 2.1672,
"step": 9800
},
{
"epoch": 0.8681031812751648,
"grad_norm": 0.2770427167415619,
"learning_rate": 2.15186609652695e-05,
"loss": 2.1676,
"step": 9810
},
{
"epoch": 0.8689880978717756,
"grad_norm": 0.2695925831794739,
"learning_rate": 2.1234935883170047e-05,
"loss": 2.1946,
"step": 9820
},
{
"epoch": 0.8698730144683864,
"grad_norm": 0.24846522510051727,
"learning_rate": 2.0953010720716037e-05,
"loss": 2.1797,
"step": 9830
},
{
"epoch": 0.8707579310649971,
"grad_norm": 0.2570924758911133,
"learning_rate": 2.0672887696089827e-05,
"loss": 2.1913,
"step": 9840
},
{
"epoch": 0.8716428476616079,
"grad_norm": 0.25783997774124146,
"learning_rate": 2.039456901329473e-05,
"loss": 2.1552,
"step": 9850
},
{
"epoch": 0.8725277642582187,
"grad_norm": 0.293927937746048,
"learning_rate": 2.0118056862137357e-05,
"loss": 2.1819,
"step": 9860
},
{
"epoch": 0.8734126808548295,
"grad_norm": 0.28890037536621094,
"learning_rate": 1.9843353418210614e-05,
"loss": 2.1729,
"step": 9870
},
{
"epoch": 0.8742975974514402,
"grad_norm": 0.2489652782678604,
"learning_rate": 1.9570460842876532e-05,
"loss": 2.1802,
"step": 9880
},
{
"epoch": 0.8751825140480509,
"grad_norm": 0.2771342098712921,
"learning_rate": 1.9299381283249317e-05,
"loss": 2.18,
"step": 9890
},
{
"epoch": 0.8760674306446617,
"grad_norm": 0.28583505749702454,
"learning_rate": 1.9030116872178316e-05,
"loss": 2.1947,
"step": 9900
},
{
"epoch": 0.8769523472412725,
"grad_norm": 0.2841069996356964,
"learning_rate": 1.8762669728231373e-05,
"loss": 2.1661,
"step": 9910
},
{
"epoch": 0.8778372638378833,
"grad_norm": 0.25715571641921997,
"learning_rate": 1.8497041955678057e-05,
"loss": 2.1709,
"step": 9920
},
{
"epoch": 0.878722180434494,
"grad_norm": 0.2675383985042572,
"learning_rate": 1.823323564447313e-05,
"loss": 2.1752,
"step": 9930
},
{
"epoch": 0.8796070970311048,
"grad_norm": 0.2713576555252075,
"learning_rate": 1.797125287024029e-05,
"loss": 2.1795,
"step": 9940
},
{
"epoch": 0.8804920136277156,
"grad_norm": 0.2969569265842438,
"learning_rate": 1.7711095694255468e-05,
"loss": 2.1897,
"step": 9950
},
{
"epoch": 0.8813769302243264,
"grad_norm": 0.27227532863616943,
"learning_rate": 1.7452766163430972e-05,
"loss": 2.1905,
"step": 9960
},
{
"epoch": 0.8822618468209371,
"grad_norm": 0.30250978469848633,
"learning_rate": 1.719626631029911e-05,
"loss": 2.1664,
"step": 9970
},
{
"epoch": 0.8831467634175479,
"grad_norm": 0.2830178737640381,
"learning_rate": 1.6941598152996453e-05,
"loss": 2.1834,
"step": 9980
},
{
"epoch": 0.8840316800141587,
"grad_norm": 0.2569063603878021,
"learning_rate": 1.668876369524769e-05,
"loss": 2.1794,
"step": 9990
},
{
"epoch": 0.8849165966107695,
"grad_norm": 0.2609952390193939,
"learning_rate": 1.6437764926350073e-05,
"loss": 2.1844,
"step": 10000
},
{
"epoch": 0.8849165966107695,
"eval_accuracy": 0.5528685849440567,
"eval_loss": 2.0780797004699707,
"eval_runtime": 12.3984,
"eval_samples_per_second": 25.649,
"eval_steps_per_second": 0.403,
"step": 10000
},
{
"epoch": 0.8858015132073802,
"grad_norm": 0.31888630986213684,
"learning_rate": 1.6188603821157583e-05,
"loss": 2.1743,
"step": 10010
},
{
"epoch": 0.8866864298039909,
"grad_norm": 0.251662015914917,
"learning_rate": 1.59412823400657e-05,
"loss": 2.1553,
"step": 10020
},
{
"epoch": 0.8875713464006018,
"grad_norm": 0.32126525044441223,
"learning_rate": 1.569580242899557e-05,
"loss": 2.1664,
"step": 10030
},
{
"epoch": 0.8884562629972125,
"grad_norm": 0.27544891834259033,
"learning_rate": 1.5452166019378987e-05,
"loss": 2.1668,
"step": 10040
},
{
"epoch": 0.8893411795938233,
"grad_norm": 0.25146690011024475,
"learning_rate": 1.5210375028143097e-05,
"loss": 2.165,
"step": 10050
},
{
"epoch": 0.890226096190434,
"grad_norm": 0.2671303153038025,
"learning_rate": 1.497043135769524e-05,
"loss": 2.173,
"step": 10060
},
{
"epoch": 0.8911110127870449,
"grad_norm": 0.25551095604896545,
"learning_rate": 1.4732336895908278e-05,
"loss": 2.1663,
"step": 10070
},
{
"epoch": 0.8919959293836556,
"grad_norm": 0.2685263752937317,
"learning_rate": 1.4496093516105258e-05,
"loss": 2.1714,
"step": 10080
},
{
"epoch": 0.8928808459802664,
"grad_norm": 0.25145915150642395,
"learning_rate": 1.4261703077045218e-05,
"loss": 2.1556,
"step": 10090
},
{
"epoch": 0.8937657625768771,
"grad_norm": 0.25494644045829773,
"learning_rate": 1.4029167422908107e-05,
"loss": 2.1627,
"step": 10100
},
{
"epoch": 0.894650679173488,
"grad_norm": 0.2593258023262024,
"learning_rate": 1.3798488383280488e-05,
"loss": 2.1478,
"step": 10110
},
{
"epoch": 0.8955355957700987,
"grad_norm": 0.36344897747039795,
"learning_rate": 1.3569667773141142e-05,
"loss": 2.1828,
"step": 10120
},
{
"epoch": 0.8964205123667094,
"grad_norm": 0.2809944152832031,
"learning_rate": 1.3342707392846792e-05,
"loss": 2.1709,
"step": 10130
},
{
"epoch": 0.8973054289633202,
"grad_norm": 0.24097634851932526,
"learning_rate": 1.3117609028117817e-05,
"loss": 2.1602,
"step": 10140
},
{
"epoch": 0.8981903455599309,
"grad_norm": 0.28794658184051514,
"learning_rate": 1.2894374450024338e-05,
"loss": 2.1823,
"step": 10150
},
{
"epoch": 0.8990752621565418,
"grad_norm": 0.3050318658351898,
"learning_rate": 1.2673005414972184e-05,
"loss": 2.1651,
"step": 10160
},
{
"epoch": 0.8999601787531525,
"grad_norm": 0.25824499130249023,
"learning_rate": 1.2453503664689282e-05,
"loss": 2.1717,
"step": 10170
},
{
"epoch": 0.9008450953497633,
"grad_norm": 0.26507601141929626,
"learning_rate": 1.2235870926211617e-05,
"loss": 2.175,
"step": 10180
},
{
"epoch": 0.901730011946374,
"grad_norm": 0.2640542685985565,
"learning_rate": 1.2020108911869888e-05,
"loss": 2.1734,
"step": 10190
},
{
"epoch": 0.9026149285429849,
"grad_norm": 0.2744889557361603,
"learning_rate": 1.1806219319275918e-05,
"loss": 2.1503,
"step": 10200
},
{
"epoch": 0.9034998451395956,
"grad_norm": 0.2510084807872772,
"learning_rate": 1.1594203831309491e-05,
"loss": 2.1853,
"step": 10210
},
{
"epoch": 0.9043847617362064,
"grad_norm": 0.2631700932979584,
"learning_rate": 1.138406411610482e-05,
"loss": 2.188,
"step": 10220
},
{
"epoch": 0.9052696783328171,
"grad_norm": 0.2694438099861145,
"learning_rate": 1.1175801827037618e-05,
"loss": 2.1763,
"step": 10230
},
{
"epoch": 0.9061545949294278,
"grad_norm": 0.2635989189147949,
"learning_rate": 1.0969418602712e-05,
"loss": 2.1731,
"step": 10240
},
{
"epoch": 0.9070395115260387,
"grad_norm": 0.26256707310676575,
"learning_rate": 1.0764916066947795e-05,
"loss": 2.1564,
"step": 10250
},
{
"epoch": 0.9079244281226494,
"grad_norm": 0.2520720958709717,
"learning_rate": 1.0562295828767388e-05,
"loss": 2.158,
"step": 10260
},
{
"epoch": 0.9088093447192602,
"grad_norm": 0.2579314410686493,
"learning_rate": 1.0361559482383404e-05,
"loss": 2.1648,
"step": 10270
},
{
"epoch": 0.909694261315871,
"grad_norm": 0.280298113822937,
"learning_rate": 1.0162708607186045e-05,
"loss": 2.1658,
"step": 10280
},
{
"epoch": 0.9105791779124818,
"grad_norm": 0.27980297803878784,
"learning_rate": 9.965744767730545e-06,
"loss": 2.174,
"step": 10290
},
{
"epoch": 0.9114640945090925,
"grad_norm": 0.30590036511421204,
"learning_rate": 9.770669513725128e-06,
"loss": 2.1489,
"step": 10300
},
{
"epoch": 0.9123490111057033,
"grad_norm": 0.2882310152053833,
"learning_rate": 9.57748438001857e-06,
"loss": 2.1724,
"step": 10310
},
{
"epoch": 0.913233927702314,
"grad_norm": 0.30974969267845154,
"learning_rate": 9.386190886588208e-06,
"loss": 2.1528,
"step": 10320
},
{
"epoch": 0.9141188442989249,
"grad_norm": 0.2538847029209137,
"learning_rate": 9.196790538527982e-06,
"loss": 2.1783,
"step": 10330
},
{
"epoch": 0.9150037608955356,
"grad_norm": 0.3312051594257355,
"learning_rate": 9.00928482603669e-06,
"loss": 2.1768,
"step": 10340
},
{
"epoch": 0.9158886774921464,
"grad_norm": 0.261616975069046,
"learning_rate": 8.823675224406053e-06,
"loss": 2.1665,
"step": 10350
},
{
"epoch": 0.9167735940887571,
"grad_norm": 0.26149144768714905,
"learning_rate": 8.639963194009282e-06,
"loss": 2.1672,
"step": 10360
},
{
"epoch": 0.9176585106853679,
"grad_norm": 0.27541011571884155,
"learning_rate": 8.458150180289504e-06,
"loss": 2.1698,
"step": 10370
},
{
"epoch": 0.9185434272819787,
"grad_norm": 0.24506379663944244,
"learning_rate": 8.278237613748408e-06,
"loss": 2.1628,
"step": 10380
},
{
"epoch": 0.9194283438785894,
"grad_norm": 0.304977148771286,
"learning_rate": 8.10022690993506e-06,
"loss": 2.1522,
"step": 10390
},
{
"epoch": 0.9203132604752002,
"grad_norm": 0.2372012585401535,
"learning_rate": 7.924119469434665e-06,
"loss": 2.1558,
"step": 10400
},
{
"epoch": 0.921198177071811,
"grad_norm": 0.26251688599586487,
"learning_rate": 7.749916677857544e-06,
"loss": 2.1829,
"step": 10410
},
{
"epoch": 0.9220830936684218,
"grad_norm": 0.26747065782546997,
"learning_rate": 7.577619905828281e-06,
"loss": 2.1815,
"step": 10420
},
{
"epoch": 0.9229680102650325,
"grad_norm": 0.2626737654209137,
"learning_rate": 7.4072305089750155e-06,
"loss": 2.176,
"step": 10430
},
{
"epoch": 0.9238529268616433,
"grad_norm": 0.33040159940719604,
"learning_rate": 7.238749827918639e-06,
"loss": 2.1718,
"step": 10440
},
{
"epoch": 0.924737843458254,
"grad_norm": 0.2599565386772156,
"learning_rate": 7.072179188262251e-06,
"loss": 2.1781,
"step": 10450
},
{
"epoch": 0.9256227600548649,
"grad_norm": 0.24637052416801453,
"learning_rate": 6.907519900580861e-06,
"loss": 2.1651,
"step": 10460
},
{
"epoch": 0.9265076766514756,
"grad_norm": 0.25145140290260315,
"learning_rate": 6.744773260410869e-06,
"loss": 2.1819,
"step": 10470
},
{
"epoch": 0.9273925932480863,
"grad_norm": 0.24591967463493347,
"learning_rate": 6.583940548240186e-06,
"loss": 2.1876,
"step": 10480
},
{
"epoch": 0.9282775098446971,
"grad_norm": 0.3043903708457947,
"learning_rate": 6.425023029497823e-06,
"loss": 2.1796,
"step": 10490
},
{
"epoch": 0.9291624264413079,
"grad_norm": 0.2612435519695282,
"learning_rate": 6.268021954544096e-06,
"loss": 2.1849,
"step": 10500
},
{
"epoch": 0.9300473430379187,
"grad_norm": 0.27911072969436646,
"learning_rate": 6.112938558660852e-06,
"loss": 2.1609,
"step": 10510
},
{
"epoch": 0.9309322596345294,
"grad_norm": 0.23749688267707825,
"learning_rate": 5.95977406204154e-06,
"loss": 2.1698,
"step": 10520
},
{
"epoch": 0.9318171762311402,
"grad_norm": 0.3167470395565033,
"learning_rate": 5.808529669781903e-06,
"loss": 2.1715,
"step": 10530
},
{
"epoch": 0.932702092827751,
"grad_norm": 0.28257447481155396,
"learning_rate": 5.659206571870218e-06,
"loss": 2.1672,
"step": 10540
},
{
"epoch": 0.9335870094243618,
"grad_norm": 0.30499503016471863,
"learning_rate": 5.5118059431781e-06,
"loss": 2.1625,
"step": 10550
},
{
"epoch": 0.9344719260209725,
"grad_norm": 0.27074891328811646,
"learning_rate": 5.3663289434511546e-06,
"loss": 2.1757,
"step": 10560
},
{
"epoch": 0.9353568426175833,
"grad_norm": 0.4108920395374298,
"learning_rate": 5.222776717300009e-06,
"loss": 2.1518,
"step": 10570
},
{
"epoch": 0.9362417592141941,
"grad_norm": 0.24994902312755585,
"learning_rate": 5.0811503941911304e-06,
"loss": 2.181,
"step": 10580
},
{
"epoch": 0.9371266758108048,
"grad_norm": 0.2533867657184601,
"learning_rate": 4.941451088437993e-06,
"loss": 2.1959,
"step": 10590
},
{
"epoch": 0.9380115924074156,
"grad_norm": 0.2800501883029938,
"learning_rate": 4.803679899192393e-06,
"loss": 2.1684,
"step": 10600
},
{
"epoch": 0.9388965090040263,
"grad_norm": 0.3431278467178345,
"learning_rate": 4.667837910435707e-06,
"loss": 2.1656,
"step": 10610
},
{
"epoch": 0.9397814256006372,
"grad_norm": 0.2510489225387573,
"learning_rate": 4.5339261909704e-06,
"loss": 2.1566,
"step": 10620
},
{
"epoch": 0.9406663421972479,
"grad_norm": 0.26005586981773376,
"learning_rate": 4.401945794411611e-06,
"loss": 2.1758,
"step": 10630
},
{
"epoch": 0.9415512587938587,
"grad_norm": 0.2854231595993042,
"learning_rate": 4.271897759178883e-06,
"loss": 2.1784,
"step": 10640
},
{
"epoch": 0.9424361753904694,
"grad_norm": 0.2713136672973633,
"learning_rate": 4.143783108487897e-06,
"loss": 2.1916,
"step": 10650
},
{
"epoch": 0.9433210919870803,
"grad_norm": 0.2967517375946045,
"learning_rate": 4.017602850342584e-06,
"loss": 2.2032,
"step": 10660
},
{
"epoch": 0.944206008583691,
"grad_norm": 0.2542577087879181,
"learning_rate": 3.893357977527101e-06,
"loss": 2.1896,
"step": 10670
},
{
"epoch": 0.9450909251803018,
"grad_norm": 0.25759467482566833,
"learning_rate": 3.771049467597959e-06,
"loss": 2.1535,
"step": 10680
},
{
"epoch": 0.9459758417769125,
"grad_norm": 0.24098724126815796,
"learning_rate": 3.650678282876463e-06,
"loss": 2.1857,
"step": 10690
},
{
"epoch": 0.9468607583735233,
"grad_norm": 0.24877989292144775,
"learning_rate": 3.5322453704410283e-06,
"loss": 2.1735,
"step": 10700
},
{
"epoch": 0.9477456749701341,
"grad_norm": 0.2515574097633362,
"learning_rate": 3.4157516621198536e-06,
"loss": 2.1655,
"step": 10710
},
{
"epoch": 0.9486305915667448,
"grad_norm": 0.41679874062538147,
"learning_rate": 3.301198074483397e-06,
"loss": 2.1798,
"step": 10720
},
{
"epoch": 0.9495155081633556,
"grad_norm": 0.2788831293582916,
"learning_rate": 3.1885855088374104e-06,
"loss": 2.1772,
"step": 10730
},
{
"epoch": 0.9504004247599663,
"grad_norm": 0.2543758749961853,
"learning_rate": 3.077914851215585e-06,
"loss": 2.1808,
"step": 10740
},
{
"epoch": 0.9512853413565772,
"grad_norm": 0.2551940977573395,
"learning_rate": 2.969186972372806e-06,
"loss": 2.1826,
"step": 10750
},
{
"epoch": 0.9521702579531879,
"grad_norm": 0.24164696037769318,
"learning_rate": 2.862402727778185e-06,
"loss": 2.1888,
"step": 10760
},
{
"epoch": 0.9530551745497987,
"grad_norm": 0.2645455002784729,
"learning_rate": 2.757562957608373e-06,
"loss": 2.1626,
"step": 10770
},
{
"epoch": 0.9539400911464094,
"grad_norm": 0.32282203435897827,
"learning_rate": 2.654668486740841e-06,
"loss": 2.1498,
"step": 10780
},
{
"epoch": 0.9548250077430203,
"grad_norm": 0.3102979063987732,
"learning_rate": 2.5537201247475828e-06,
"loss": 2.1726,
"step": 10790
},
{
"epoch": 0.955709924339631,
"grad_norm": 0.2809552252292633,
"learning_rate": 2.454718665888589e-06,
"loss": 2.1684,
"step": 10800
},
{
"epoch": 0.9565948409362418,
"grad_norm": 0.3085186183452606,
"learning_rate": 2.357664889105687e-06,
"loss": 2.1708,
"step": 10810
},
{
"epoch": 0.9574797575328525,
"grad_norm": 0.2588469684123993,
"learning_rate": 2.262559558016325e-06,
"loss": 2.1536,
"step": 10820
},
{
"epoch": 0.9583646741294632,
"grad_norm": 0.30835458636283875,
"learning_rate": 2.169403420907601e-06,
"loss": 2.2094,
"step": 10830
},
{
"epoch": 0.9592495907260741,
"grad_norm": 0.2416762262582779,
"learning_rate": 2.078197210730465e-06,
"loss": 2.1617,
"step": 10840
},
{
"epoch": 0.9601345073226848,
"grad_norm": 0.311798632144928,
"learning_rate": 1.9889416450938334e-06,
"loss": 2.1897,
"step": 10850
},
{
"epoch": 0.9610194239192956,
"grad_norm": 0.2555478811264038,
"learning_rate": 1.901637426258984e-06,
"loss": 2.1769,
"step": 10860
},
{
"epoch": 0.9619043405159063,
"grad_norm": 0.2964852452278137,
"learning_rate": 1.8162852411340025e-06,
"loss": 2.1741,
"step": 10870
},
{
"epoch": 0.9627892571125172,
"grad_norm": 0.3563735783100128,
"learning_rate": 1.7328857612684267e-06,
"loss": 2.1444,
"step": 10880
},
{
"epoch": 0.9636741737091279,
"grad_norm": 0.24288448691368103,
"learning_rate": 1.6514396428480017e-06,
"loss": 2.1926,
"step": 10890
},
{
"epoch": 0.9645590903057387,
"grad_norm": 0.3532714247703552,
"learning_rate": 1.571947526689349e-06,
"loss": 2.1464,
"step": 10900
},
{
"epoch": 0.9654440069023494,
"grad_norm": 0.250182569026947,
"learning_rate": 1.494410038235139e-06,
"loss": 2.1819,
"step": 10910
},
{
"epoch": 0.9663289234989603,
"grad_norm": 0.2672261595726013,
"learning_rate": 1.418827787548982e-06,
"loss": 2.1882,
"step": 10920
},
{
"epoch": 0.967213840095571,
"grad_norm": 0.24856804311275482,
"learning_rate": 1.3452013693107667e-06,
"loss": 2.1657,
"step": 10930
},
{
"epoch": 0.9680987566921818,
"grad_norm": 0.30384114384651184,
"learning_rate": 1.273531362811914e-06,
"loss": 2.1746,
"step": 10940
},
{
"epoch": 0.9689836732887925,
"grad_norm": 0.31302690505981445,
"learning_rate": 1.2038183319507957e-06,
"loss": 2.1761,
"step": 10950
},
{
"epoch": 0.9698685898854033,
"grad_norm": 0.2600124180316925,
"learning_rate": 1.1360628252283511e-06,
"loss": 2.1481,
"step": 10960
},
{
"epoch": 0.9707535064820141,
"grad_norm": 0.34927940368652344,
"learning_rate": 1.0702653757437564e-06,
"loss": 2.1751,
"step": 10970
},
{
"epoch": 0.9716384230786248,
"grad_norm": 0.34013205766677856,
"learning_rate": 1.006426501190233e-06,
"loss": 2.1871,
"step": 10980
},
{
"epoch": 0.9725233396752356,
"grad_norm": 0.23738481104373932,
"learning_rate": 9.445467038509958e-07,
"loss": 2.1675,
"step": 10990
},
{
"epoch": 0.9734082562718464,
"grad_norm": 0.2345409244298935,
"learning_rate": 8.84626470595229e-07,
"loss": 2.167,
"step": 11000
},
{
"epoch": 0.9734082562718464,
"eval_accuracy": 0.5530912832799625,
"eval_loss": 2.076953411102295,
"eval_runtime": 12.1468,
"eval_samples_per_second": 26.18,
"eval_steps_per_second": 0.412,
"step": 11000
},
{
"epoch": 0.9742931728684572,
"grad_norm": 0.24096181988716125,
"learning_rate": 8.266662728742547e-07,
"loss": 2.168,
"step": 11010
},
{
"epoch": 0.9751780894650679,
"grad_norm": 0.2738332748413086,
"learning_rate": 7.70666566718009e-07,
"loss": 2.1738,
"step": 11020
},
{
"epoch": 0.9760630060616787,
"grad_norm": 0.30326494574546814,
"learning_rate": 7.166277927311837e-07,
"loss": 2.1737,
"step": 11030
},
{
"epoch": 0.9769479226582894,
"grad_norm": 0.2905402183532715,
"learning_rate": 6.645503760899507e-07,
"loss": 2.1725,
"step": 11040
},
{
"epoch": 0.9778328392549003,
"grad_norm": 0.25171908736228943,
"learning_rate": 6.144347265384931e-07,
"loss": 2.18,
"step": 11050
},
{
"epoch": 0.978717755851511,
"grad_norm": 0.26368576288223267,
"learning_rate": 5.662812383859795e-07,
"loss": 2.1911,
"step": 11060
},
{
"epoch": 0.9796026724481217,
"grad_norm": 0.2813120484352112,
"learning_rate": 5.20090290503178e-07,
"loss": 2.1725,
"step": 11070
},
{
"epoch": 0.9804875890447325,
"grad_norm": 0.34227675199508667,
"learning_rate": 4.7586224631968047e-07,
"loss": 2.1664,
"step": 11080
},
{
"epoch": 0.9813725056413433,
"grad_norm": 0.2556191384792328,
"learning_rate": 4.335974538210441e-07,
"loss": 2.1698,
"step": 11090
},
{
"epoch": 0.9822574222379541,
"grad_norm": 0.2765117883682251,
"learning_rate": 3.932962455458489e-07,
"loss": 2.1849,
"step": 11100
},
{
"epoch": 0.9831423388345648,
"grad_norm": 0.2285272777080536,
"learning_rate": 3.5495893858342174e-07,
"loss": 2.1529,
"step": 11110
},
{
"epoch": 0.9840272554311756,
"grad_norm": 0.24666184186935425,
"learning_rate": 3.1858583457095026e-07,
"loss": 2.1711,
"step": 11120
},
{
"epoch": 0.9849121720277864,
"grad_norm": 0.2382645606994629,
"learning_rate": 2.8417721969145604e-07,
"loss": 2.1318,
"step": 11130
},
{
"epoch": 0.9857970886243972,
"grad_norm": 0.2539430558681488,
"learning_rate": 2.5173336467135267e-07,
"loss": 2.1724,
"step": 11140
},
{
"epoch": 0.9866820052210079,
"grad_norm": 0.30961015820503235,
"learning_rate": 2.2125452477828045e-07,
"loss": 2.1845,
"step": 11150
},
{
"epoch": 0.9875669218176187,
"grad_norm": 0.24505238234996796,
"learning_rate": 1.9274093981927476e-07,
"loss": 2.1824,
"step": 11160
},
{
"epoch": 0.9884518384142295,
"grad_norm": 0.32805320620536804,
"learning_rate": 1.66192834138712e-07,
"loss": 2.1744,
"step": 11170
},
{
"epoch": 0.9893367550108402,
"grad_norm": 0.2783229649066925,
"learning_rate": 1.4161041661667208e-07,
"loss": 2.1709,
"step": 11180
},
{
"epoch": 0.990221671607451,
"grad_norm": 0.2550203502178192,
"learning_rate": 1.1899388066718975e-07,
"loss": 2.1681,
"step": 11190
},
{
"epoch": 0.9911065882040617,
"grad_norm": 0.2780303657054901,
"learning_rate": 9.834340423678367e-08,
"loss": 2.149,
"step": 11200
},
{
"epoch": 0.9919915048006726,
"grad_norm": 0.24265854060649872,
"learning_rate": 7.965914980304079e-08,
"loss": 2.1807,
"step": 11210
},
{
"epoch": 0.9928764213972833,
"grad_norm": 0.31807270646095276,
"learning_rate": 6.294126437336733e-08,
"loss": 2.1778,
"step": 11220
},
{
"epoch": 0.9937613379938941,
"grad_norm": 0.27723586559295654,
"learning_rate": 4.818987948379538e-08,
"loss": 2.1862,
"step": 11230
},
{
"epoch": 0.9946462545905048,
"grad_norm": 0.3337404131889343,
"learning_rate": 3.5405111197955865e-08,
"loss": 2.1644,
"step": 11240
},
{
"epoch": 0.9955311711871156,
"grad_norm": 0.24611864984035492,
"learning_rate": 2.4587060106245897e-08,
"loss": 2.1762,
"step": 11250
},
{
"epoch": 0.9964160877837264,
"grad_norm": 0.2372807413339615,
"learning_rate": 1.5735811324857354e-08,
"loss": 2.1612,
"step": 11260
},
{
"epoch": 0.9973010043803372,
"grad_norm": 0.2733718752861023,
"learning_rate": 8.851434495277256e-09,
"loss": 2.1666,
"step": 11270
},
{
"epoch": 0.9981859209769479,
"grad_norm": 0.3090679943561554,
"learning_rate": 3.933983783677153e-09,
"loss": 2.1616,
"step": 11280
},
{
"epoch": 0.9990708375735587,
"grad_norm": 0.3332498371601105,
"learning_rate": 9.834978804412752e-10,
"loss": 2.1655,
"step": 11290
},
{
"epoch": 0.9999557541701695,
"grad_norm": 0.29820016026496887,
"learning_rate": 0.0,
"loss": 2.1751,
"step": 11300
},
{
"epoch": 0.9999557541701695,
"step": 11300,
"total_flos": 1.1360877451905964e+21,
"train_loss": 2.231945479570237,
"train_runtime": 110078.6466,
"train_samples_per_second": 13.14,
"train_steps_per_second": 0.103
}
],
"logging_steps": 10,
"max_steps": 11300,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.1360877451905964e+21,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}