{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1428,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01050420168067227,
"grad_norm": 21.68925666809082,
"learning_rate": 3.4722222222222224e-06,
"loss": 1.7424,
"num_tokens": 61440.0,
"step": 5
},
{
"epoch": 0.02100840336134454,
"grad_norm": 6.517210483551025,
"learning_rate": 6.944444444444445e-06,
"loss": 0.8531,
"num_tokens": 122880.0,
"step": 10
},
{
"epoch": 0.031512605042016806,
"grad_norm": 1.3076003789901733,
"learning_rate": 1.0416666666666668e-05,
"loss": 0.2121,
"num_tokens": 184320.0,
"step": 15
},
{
"epoch": 0.04201680672268908,
"grad_norm": 1.2455157041549683,
"learning_rate": 1.388888888888889e-05,
"loss": 0.113,
"num_tokens": 245760.0,
"step": 20
},
{
"epoch": 0.052521008403361345,
"grad_norm": 0.854950487613678,
"learning_rate": 1.736111111111111e-05,
"loss": 0.0885,
"num_tokens": 307200.0,
"step": 25
},
{
"epoch": 0.06302521008403361,
"grad_norm": 0.9929510951042175,
"learning_rate": 2.0833333333333336e-05,
"loss": 0.0771,
"num_tokens": 368640.0,
"step": 30
},
{
"epoch": 0.07352941176470588,
"grad_norm": 1.0237308740615845,
"learning_rate": 2.4305555555555558e-05,
"loss": 0.0805,
"num_tokens": 430080.0,
"step": 35
},
{
"epoch": 0.08403361344537816,
"grad_norm": 1.4065616130828857,
"learning_rate": 2.777777777777778e-05,
"loss": 0.0822,
"num_tokens": 491520.0,
"step": 40
},
{
"epoch": 0.09453781512605042,
"grad_norm": 0.7961967587471008,
"learning_rate": 3.125e-05,
"loss": 0.0721,
"num_tokens": 552960.0,
"step": 45
},
{
"epoch": 0.10504201680672269,
"grad_norm": 0.7718358039855957,
"learning_rate": 3.472222222222222e-05,
"loss": 0.0683,
"num_tokens": 614297.0,
"step": 50
},
{
"epoch": 0.11554621848739496,
"grad_norm": 0.6594786047935486,
"learning_rate": 3.8194444444444444e-05,
"loss": 0.0665,
"num_tokens": 675536.0,
"step": 55
},
{
"epoch": 0.12605042016806722,
"grad_norm": 0.637739896774292,
"learning_rate": 4.166666666666667e-05,
"loss": 0.0718,
"num_tokens": 736976.0,
"step": 60
},
{
"epoch": 0.13655462184873948,
"grad_norm": 0.4128863215446472,
"learning_rate": 4.5138888888888894e-05,
"loss": 0.069,
"num_tokens": 798416.0,
"step": 65
},
{
"epoch": 0.14705882352941177,
"grad_norm": 0.7012873888015747,
"learning_rate": 4.8611111111111115e-05,
"loss": 0.0651,
"num_tokens": 859856.0,
"step": 70
},
{
"epoch": 0.15756302521008403,
"grad_norm": 0.6367799639701843,
"learning_rate": 4.9999456532409905e-05,
"loss": 0.0777,
"num_tokens": 921296.0,
"step": 75
},
{
"epoch": 0.16806722689075632,
"grad_norm": 0.49970555305480957,
"learning_rate": 4.999613543665713e-05,
"loss": 0.0642,
"num_tokens": 982736.0,
"step": 80
},
{
"epoch": 0.17857142857142858,
"grad_norm": 0.49360784888267517,
"learning_rate": 4.998979561670338e-05,
"loss": 0.0651,
"num_tokens": 1044176.0,
"step": 85
},
{
"epoch": 0.18907563025210083,
"grad_norm": 0.432689905166626,
"learning_rate": 4.9980437923280036e-05,
"loss": 0.063,
"num_tokens": 1105611.0,
"step": 90
},
{
"epoch": 0.19957983193277312,
"grad_norm": 0.6416879296302795,
"learning_rate": 4.996806361208257e-05,
"loss": 0.0659,
"num_tokens": 1167051.0,
"step": 95
},
{
"epoch": 0.21008403361344538,
"grad_norm": 0.517444908618927,
"learning_rate": 4.995267434360207e-05,
"loss": 0.0638,
"num_tokens": 1228314.0,
"step": 100
},
{
"epoch": 0.22058823529411764,
"grad_norm": 107.99620819091797,
"learning_rate": 4.993427218290246e-05,
"loss": 0.6636,
"num_tokens": 1289602.0,
"step": 105
},
{
"epoch": 0.23109243697478993,
"grad_norm": 0.5925813317298889,
"learning_rate": 4.991285959934332e-05,
"loss": 0.1472,
"num_tokens": 1351042.0,
"step": 110
},
{
"epoch": 0.2415966386554622,
"grad_norm": 0.3515715003013611,
"learning_rate": 4.988843946624858e-05,
"loss": 0.0645,
"num_tokens": 1412482.0,
"step": 115
},
{
"epoch": 0.25210084033613445,
"grad_norm": 0.4023377001285553,
"learning_rate": 4.9861015060520935e-05,
"loss": 0.0632,
"num_tokens": 1473922.0,
"step": 120
},
{
"epoch": 0.26260504201680673,
"grad_norm": 0.37604016065597534,
"learning_rate": 4.9830590062202105e-05,
"loss": 0.0635,
"num_tokens": 1535362.0,
"step": 125
},
{
"epoch": 0.27310924369747897,
"grad_norm": 3.824808359146118,
"learning_rate": 4.9797168553979054e-05,
"loss": 0.0704,
"num_tokens": 1596802.0,
"step": 130
},
{
"epoch": 0.28361344537815125,
"grad_norm": 4.908388137817383,
"learning_rate": 4.976075502063613e-05,
"loss": 0.128,
"num_tokens": 1658080.0,
"step": 135
},
{
"epoch": 0.29411764705882354,
"grad_norm": 0.3574487566947937,
"learning_rate": 4.97213543484532e-05,
"loss": 0.0682,
"num_tokens": 1719520.0,
"step": 140
},
{
"epoch": 0.30462184873949577,
"grad_norm": 0.35017549991607666,
"learning_rate": 4.9678971824550074e-05,
"loss": 0.0593,
"num_tokens": 1780960.0,
"step": 145
},
{
"epoch": 0.31512605042016806,
"grad_norm": 0.36095139384269714,
"learning_rate": 4.9633613136176925e-05,
"loss": 0.0605,
"num_tokens": 1842400.0,
"step": 150
},
{
"epoch": 0.32563025210084034,
"grad_norm": 0.3857307434082031,
"learning_rate": 4.95852843699512e-05,
"loss": 0.059,
"num_tokens": 1903840.0,
"step": 155
},
{
"epoch": 0.33613445378151263,
"grad_norm": 0.27103978395462036,
"learning_rate": 4.953399201104084e-05,
"loss": 0.0581,
"num_tokens": 1965280.0,
"step": 160
},
{
"epoch": 0.34663865546218486,
"grad_norm": 0.29975804686546326,
"learning_rate": 4.9479742942294035e-05,
"loss": 0.0578,
"num_tokens": 2026720.0,
"step": 165
},
{
"epoch": 0.35714285714285715,
"grad_norm": 0.3278215825557709,
"learning_rate": 4.9422544443315635e-05,
"loss": 0.056,
"num_tokens": 2088160.0,
"step": 170
},
{
"epoch": 0.36764705882352944,
"grad_norm": 0.28858262300491333,
"learning_rate": 4.936240418949032e-05,
"loss": 0.0582,
"num_tokens": 2149600.0,
"step": 175
},
{
"epoch": 0.37815126050420167,
"grad_norm": 0.36514896154403687,
"learning_rate": 4.929933025095261e-05,
"loss": 0.0577,
"num_tokens": 2211007.0,
"step": 180
},
{
"epoch": 0.38865546218487396,
"grad_norm": 0.37031087279319763,
"learning_rate": 4.9233331091504034e-05,
"loss": 0.0554,
"num_tokens": 2272203.0,
"step": 185
},
{
"epoch": 0.39915966386554624,
"grad_norm": 0.28869447112083435,
"learning_rate": 4.916441556747727e-05,
"loss": 0.0575,
"num_tokens": 2333494.0,
"step": 190
},
{
"epoch": 0.4096638655462185,
"grad_norm": 0.29715102910995483,
"learning_rate": 4.909259292654782e-05,
"loss": 0.0573,
"num_tokens": 2394934.0,
"step": 195
},
{
"epoch": 0.42016806722689076,
"grad_norm": 0.40769094228744507,
"learning_rate": 4.9017872806492995e-05,
"loss": 0.0583,
"num_tokens": 2456374.0,
"step": 200
},
{
"epoch": 0.43067226890756305,
"grad_norm": 0.3187924325466156,
"learning_rate": 4.8940265233898744e-05,
"loss": 0.0546,
"num_tokens": 2517814.0,
"step": 205
},
{
"epoch": 0.4411764705882353,
"grad_norm": 0.25656041502952576,
"learning_rate": 4.885978062281408e-05,
"loss": 0.057,
"num_tokens": 2579248.0,
"step": 210
},
{
"epoch": 0.45168067226890757,
"grad_norm": 0.2591699957847595,
"learning_rate": 4.877642977335371e-05,
"loss": 0.0573,
"num_tokens": 2640688.0,
"step": 215
},
{
"epoch": 0.46218487394957986,
"grad_norm": 0.3359907567501068,
"learning_rate": 4.869022387024879e-05,
"loss": 0.0538,
"num_tokens": 2702128.0,
"step": 220
},
{
"epoch": 0.4726890756302521,
"grad_norm": 0.3090253174304962,
"learning_rate": 4.8601174481346015e-05,
"loss": 0.057,
"num_tokens": 2763412.0,
"step": 225
},
{
"epoch": 0.4831932773109244,
"grad_norm": 0.2006455659866333,
"learning_rate": 4.8509293556055345e-05,
"loss": 0.0554,
"num_tokens": 2824852.0,
"step": 230
},
{
"epoch": 0.49369747899159666,
"grad_norm": 0.255356103181839,
"learning_rate": 4.84145934237466e-05,
"loss": 0.0557,
"num_tokens": 2886292.0,
"step": 235
},
{
"epoch": 0.5042016806722689,
"grad_norm": 0.21021538972854614,
"learning_rate": 4.8317086792094906e-05,
"loss": 0.0527,
"num_tokens": 2947732.0,
"step": 240
},
{
"epoch": 0.5147058823529411,
"grad_norm": 0.24062202870845795,
"learning_rate": 4.821678674537557e-05,
"loss": 0.0545,
"num_tokens": 3009172.0,
"step": 245
},
{
"epoch": 0.5252100840336135,
"grad_norm": 0.30908721685409546,
"learning_rate": 4.811370674270821e-05,
"loss": 0.0538,
"num_tokens": 3070612.0,
"step": 250
},
{
"epoch": 0.5357142857142857,
"grad_norm": 0.2805303931236267,
"learning_rate": 4.800786061625078e-05,
"loss": 0.0528,
"num_tokens": 3132052.0,
"step": 255
},
{
"epoch": 0.5462184873949579,
"grad_norm": 0.245064839720726,
"learning_rate": 4.789926256934345e-05,
"loss": 0.0566,
"num_tokens": 3193416.0,
"step": 260
},
{
"epoch": 0.5567226890756303,
"grad_norm": 0.2865758538246155,
"learning_rate": 4.778792717460259e-05,
"loss": 0.0542,
"num_tokens": 3254856.0,
"step": 265
},
{
"epoch": 0.5672268907563025,
"grad_norm": 0.2498483657836914,
"learning_rate": 4.7673869371965425e-05,
"loss": 0.0544,
"num_tokens": 3316296.0,
"step": 270
},
{
"epoch": 0.5777310924369747,
"grad_norm": 0.24384859204292297,
"learning_rate": 4.755710446668515e-05,
"loss": 0.0564,
"num_tokens": 3377345.0,
"step": 275
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.22256287932395935,
"learning_rate": 4.7437648127277216e-05,
"loss": 0.0543,
"num_tokens": 3438785.0,
"step": 280
},
{
"epoch": 0.5987394957983193,
"grad_norm": 0.7542266845703125,
"learning_rate": 4.7315516383416736e-05,
"loss": 0.0595,
"num_tokens": 3500225.0,
"step": 285
},
{
"epoch": 0.6092436974789915,
"grad_norm": 0.24494485557079315,
"learning_rate": 4.7190725623787545e-05,
"loss": 0.0565,
"num_tokens": 3561466.0,
"step": 290
},
{
"epoch": 0.6197478991596639,
"grad_norm": 0.26762306690216064,
"learning_rate": 4.706329259388298e-05,
"loss": 0.0557,
"num_tokens": 3622906.0,
"step": 295
},
{
"epoch": 0.6302521008403361,
"grad_norm": 0.285203754901886,
"learning_rate": 4.6933234393758844e-05,
"loss": 0.0537,
"num_tokens": 3684184.0,
"step": 300
},
{
"epoch": 0.6407563025210085,
"grad_norm": 0.5487973690032959,
"learning_rate": 4.680056847573878e-05,
"loss": 0.0551,
"num_tokens": 3745624.0,
"step": 305
},
{
"epoch": 0.6512605042016807,
"grad_norm": 0.2598506808280945,
"learning_rate": 4.666531264207235e-05,
"loss": 0.0542,
"num_tokens": 3806907.0,
"step": 310
},
{
"epoch": 0.6617647058823529,
"grad_norm": 0.30089858174324036,
"learning_rate": 4.6527485042546204e-05,
"loss": 0.0576,
"num_tokens": 3868347.0,
"step": 315
},
{
"epoch": 0.6722689075630253,
"grad_norm": 0.24752771854400635,
"learning_rate": 4.638710417204855e-05,
"loss": 0.0538,
"num_tokens": 3929787.0,
"step": 320
},
{
"epoch": 0.6827731092436975,
"grad_norm": 0.2561896741390228,
"learning_rate": 4.6244188868087395e-05,
"loss": 0.0556,
"num_tokens": 3991227.0,
"step": 325
},
{
"epoch": 0.6932773109243697,
"grad_norm": 0.3156558871269226,
"learning_rate": 4.609875830826272e-05,
"loss": 0.0564,
"num_tokens": 4052667.0,
"step": 330
},
{
"epoch": 0.7037815126050421,
"grad_norm": 0.291674941778183,
"learning_rate": 4.59508320076931e-05,
"loss": 0.0558,
"num_tokens": 4114107.0,
"step": 335
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.29663559794425964,
"learning_rate": 4.580042981639698e-05,
"loss": 0.0545,
"num_tokens": 4175485.0,
"step": 340
},
{
"epoch": 0.7247899159663865,
"grad_norm": 0.2254744917154312,
"learning_rate": 4.5647571916629064e-05,
"loss": 0.0544,
"num_tokens": 4236925.0,
"step": 345
},
{
"epoch": 0.7352941176470589,
"grad_norm": 0.327118843793869,
"learning_rate": 4.549227882017202e-05,
"loss": 0.0556,
"num_tokens": 4298365.0,
"step": 350
},
{
"epoch": 0.7457983193277311,
"grad_norm": 0.24738788604736328,
"learning_rate": 4.533457136558408e-05,
"loss": 0.0533,
"num_tokens": 4359805.0,
"step": 355
},
{
"epoch": 0.7563025210084033,
"grad_norm": 0.17776817083358765,
"learning_rate": 4.5174470715402764e-05,
"loss": 0.0559,
"num_tokens": 4421245.0,
"step": 360
},
{
"epoch": 0.7668067226890757,
"grad_norm": 0.2214263677597046,
"learning_rate": 4.501199835330507e-05,
"loss": 0.0533,
"num_tokens": 4482685.0,
"step": 365
},
{
"epoch": 0.7773109243697479,
"grad_norm": 0.2129214107990265,
"learning_rate": 4.484717608122459e-05,
"loss": 0.0542,
"num_tokens": 4544125.0,
"step": 370
},
{
"epoch": 0.7878151260504201,
"grad_norm": 0.2668643295764923,
"learning_rate": 4.468002601642603e-05,
"loss": 0.052,
"num_tokens": 4605565.0,
"step": 375
},
{
"epoch": 0.7983193277310925,
"grad_norm": 0.4849870502948761,
"learning_rate": 4.4510570588537206e-05,
"loss": 0.057,
"num_tokens": 4666849.0,
"step": 380
},
{
"epoch": 0.8088235294117647,
"grad_norm": 0.2671234607696533,
"learning_rate": 4.433883253653936e-05,
"loss": 0.0533,
"num_tokens": 4728021.0,
"step": 385
},
{
"epoch": 0.819327731092437,
"grad_norm": 0.8898406028747559,
"learning_rate": 4.416483490571574e-05,
"loss": 0.0551,
"num_tokens": 4789461.0,
"step": 390
},
{
"epoch": 0.8298319327731093,
"grad_norm": 0.1947408765554428,
"learning_rate": 4.39886010445593e-05,
"loss": 0.0555,
"num_tokens": 4850862.0,
"step": 395
},
{
"epoch": 0.8403361344537815,
"grad_norm": 0.3551996648311615,
"learning_rate": 4.381015460163949e-05,
"loss": 0.0559,
"num_tokens": 4912302.0,
"step": 400
},
{
"epoch": 0.8508403361344538,
"grad_norm": 0.24865303933620453,
"learning_rate": 4.362951952242898e-05,
"loss": 0.0554,
"num_tokens": 4973742.0,
"step": 405
},
{
"epoch": 0.8613445378151261,
"grad_norm": 0.21480585634708405,
"learning_rate": 4.344672004609037e-05,
"loss": 0.0536,
"num_tokens": 5035005.0,
"step": 410
},
{
"epoch": 0.8718487394957983,
"grad_norm": 0.2362818419933319,
"learning_rate": 4.326178070222364e-05,
"loss": 0.0552,
"num_tokens": 5096371.0,
"step": 415
},
{
"epoch": 0.8823529411764706,
"grad_norm": 0.3188863694667816,
"learning_rate": 4.3074726307574516e-05,
"loss": 0.0579,
"num_tokens": 5157811.0,
"step": 420
},
{
"epoch": 0.8928571428571429,
"grad_norm": 0.2944329082965851,
"learning_rate": 4.2885581962704366e-05,
"loss": 0.0555,
"num_tokens": 5219251.0,
"step": 425
},
{
"epoch": 0.9033613445378151,
"grad_norm": 0.35381075739860535,
"learning_rate": 4.2694373048622e-05,
"loss": 0.0548,
"num_tokens": 5280691.0,
"step": 430
},
{
"epoch": 0.9138655462184874,
"grad_norm": 0.2967956066131592,
"learning_rate": 4.2501125223377754e-05,
"loss": 0.0542,
"num_tokens": 5342131.0,
"step": 435
},
{
"epoch": 0.9243697478991597,
"grad_norm": 1.3979747295379639,
"learning_rate": 4.230586441862062e-05,
"loss": 0.0529,
"num_tokens": 5403410.0,
"step": 440
},
{
"epoch": 0.9348739495798319,
"grad_norm": 0.2130478024482727,
"learning_rate": 4.210861683611837e-05,
"loss": 0.0546,
"num_tokens": 5464723.0,
"step": 445
},
{
"epoch": 0.9453781512605042,
"grad_norm": 0.23148652911186218,
"learning_rate": 4.1909408944241644e-05,
"loss": 0.0543,
"num_tokens": 5526163.0,
"step": 450
},
{
"epoch": 0.9558823529411765,
"grad_norm": 0.17318475246429443,
"learning_rate": 4.1708267474412215e-05,
"loss": 0.0543,
"num_tokens": 5587603.0,
"step": 455
},
{
"epoch": 0.9663865546218487,
"grad_norm": 0.28641510009765625,
"learning_rate": 4.1505219417515884e-05,
"loss": 0.0549,
"num_tokens": 5649043.0,
"step": 460
},
{
"epoch": 0.976890756302521,
"grad_norm": 0.35609665513038635,
"learning_rate": 4.1300292020280645e-05,
"loss": 0.056,
"num_tokens": 5710483.0,
"step": 465
},
{
"epoch": 0.9873949579831933,
"grad_norm": 0.2831043601036072,
"learning_rate": 4.10935127816205e-05,
"loss": 0.0574,
"num_tokens": 5771914.0,
"step": 470
},
{
"epoch": 0.9978991596638656,
"grad_norm": 0.2836240530014038,
"learning_rate": 4.088490944894539e-05,
"loss": 0.0515,
"num_tokens": 5833354.0,
"step": 475
},
{
"epoch": 1.0084033613445378,
"grad_norm": 2.6926634311676025,
"learning_rate": 4.06745100144378e-05,
"loss": 0.0558,
"num_tokens": 5894794.0,
"step": 480
},
{
"epoch": 1.01890756302521,
"grad_norm": 0.17681336402893066,
"learning_rate": 4.0462342711296584e-05,
"loss": 0.0523,
"num_tokens": 5956077.0,
"step": 485
},
{
"epoch": 1.0294117647058822,
"grad_norm": 0.2838696539402008,
"learning_rate": 4.024843600994833e-05,
"loss": 0.0537,
"num_tokens": 6017517.0,
"step": 490
},
{
"epoch": 1.0399159663865547,
"grad_norm": 0.2431504875421524,
"learning_rate": 4.003281861422699e-05,
"loss": 0.0537,
"num_tokens": 6078801.0,
"step": 495
},
{
"epoch": 1.050420168067227,
"grad_norm": 0.2204369157552719,
"learning_rate": 3.981551945752215e-05,
"loss": 0.0538,
"num_tokens": 6140232.0,
"step": 500
},
{
"epoch": 1.0609243697478992,
"grad_norm": 0.2458706945180893,
"learning_rate": 3.959656769889646e-05,
"loss": 0.0545,
"num_tokens": 6201672.0,
"step": 505
},
{
"epoch": 1.0714285714285714,
"grad_norm": 0.21258144080638885,
"learning_rate": 3.937599271917292e-05,
"loss": 0.056,
"num_tokens": 6263112.0,
"step": 510
},
{
"epoch": 1.0819327731092436,
"grad_norm": 0.2708013355731964,
"learning_rate": 3.915382411699218e-05,
"loss": 0.0547,
"num_tokens": 6324552.0,
"step": 515
},
{
"epoch": 1.092436974789916,
"grad_norm": 2.9274137020111084,
"learning_rate": 3.893009170484085e-05,
"loss": 0.0524,
"num_tokens": 6385992.0,
"step": 520
},
{
"epoch": 1.1029411764705883,
"grad_norm": 0.3301822543144226,
"learning_rate": 3.870482550505094e-05,
"loss": 0.0554,
"num_tokens": 6447432.0,
"step": 525
},
{
"epoch": 1.1134453781512605,
"grad_norm": 0.4145117700099945,
"learning_rate": 3.847805574577123e-05,
"loss": 0.0551,
"num_tokens": 6508872.0,
"step": 530
},
{
"epoch": 1.1239495798319328,
"grad_norm": 0.2403404861688614,
"learning_rate": 3.8249812856910985e-05,
"loss": 0.0576,
"num_tokens": 6570312.0,
"step": 535
},
{
"epoch": 1.134453781512605,
"grad_norm": 0.2703566551208496,
"learning_rate": 3.8020127466056636e-05,
"loss": 0.0526,
"num_tokens": 6631553.0,
"step": 540
},
{
"epoch": 1.1449579831932772,
"grad_norm": 0.23629266023635864,
"learning_rate": 3.778903039436189e-05,
"loss": 0.053,
"num_tokens": 6692993.0,
"step": 545
},
{
"epoch": 1.1554621848739495,
"grad_norm": 31.853532791137695,
"learning_rate": 3.755655265241187e-05,
"loss": 0.0551,
"num_tokens": 6754394.0,
"step": 550
},
{
"epoch": 1.165966386554622,
"grad_norm": 0.274700790643692,
"learning_rate": 3.7322725436061875e-05,
"loss": 0.0534,
"num_tokens": 6815834.0,
"step": 555
},
{
"epoch": 1.1764705882352942,
"grad_norm": 0.26774168014526367,
"learning_rate": 3.708758012225125e-05,
"loss": 0.0528,
"num_tokens": 6877269.0,
"step": 560
},
{
"epoch": 1.1869747899159664,
"grad_norm": 0.2192794382572174,
"learning_rate": 3.685114826479292e-05,
"loss": 0.0543,
"num_tokens": 6938555.0,
"step": 565
},
{
"epoch": 1.1974789915966386,
"grad_norm": 0.4298264980316162,
"learning_rate": 3.661346159013929e-05,
"loss": 0.0536,
"num_tokens": 6999704.0,
"step": 570
},
{
"epoch": 1.2079831932773109,
"grad_norm": 0.19733546674251556,
"learning_rate": 3.637455199312488e-05,
"loss": 0.053,
"num_tokens": 7061144.0,
"step": 575
},
{
"epoch": 1.2184873949579833,
"grad_norm": 0.22298553586006165,
"learning_rate": 3.61344515326864e-05,
"loss": 0.0532,
"num_tokens": 7122407.0,
"step": 580
},
{
"epoch": 1.2289915966386555,
"grad_norm": 0.19753730297088623,
"learning_rate": 3.5893192427560834e-05,
"loss": 0.0536,
"num_tokens": 7183847.0,
"step": 585
},
{
"epoch": 1.2394957983193278,
"grad_norm": 0.20278260111808777,
"learning_rate": 3.565080705196202e-05,
"loss": 0.0525,
"num_tokens": 7245125.0,
"step": 590
},
{
"epoch": 1.25,
"grad_norm": 0.8037598133087158,
"learning_rate": 3.5407327931236434e-05,
"loss": 0.0536,
"num_tokens": 7306565.0,
"step": 595
},
{
"epoch": 1.2605042016806722,
"grad_norm": 0.25609511137008667,
"learning_rate": 3.516278773749863e-05,
"loss": 0.0534,
"num_tokens": 7368005.0,
"step": 600
},
{
"epoch": 1.2710084033613445,
"grad_norm": 0.18984173238277435,
"learning_rate": 3.4917219285247036e-05,
"loss": 0.0517,
"num_tokens": 7429445.0,
"step": 605
},
{
"epoch": 1.2815126050420167,
"grad_norm": 0.17902691662311554,
"learning_rate": 3.4670655526960627e-05,
"loss": 0.0538,
"num_tokens": 7490885.0,
"step": 610
},
{
"epoch": 1.2920168067226891,
"grad_norm": 0.19289465248584747,
"learning_rate": 3.4423129548677055e-05,
"loss": 0.0526,
"num_tokens": 7552325.0,
"step": 615
},
{
"epoch": 1.3025210084033614,
"grad_norm": 0.21860499680042267,
"learning_rate": 3.41746745655529e-05,
"loss": 0.0546,
"num_tokens": 7613765.0,
"step": 620
},
{
"epoch": 1.3130252100840336,
"grad_norm": 0.19712497293949127,
"learning_rate": 3.3925323917406574e-05,
"loss": 0.0538,
"num_tokens": 7675205.0,
"step": 625
},
{
"epoch": 1.3235294117647058,
"grad_norm": 0.22082890570163727,
"learning_rate": 3.3675111064244504e-05,
"loss": 0.0537,
"num_tokens": 7736645.0,
"step": 630
},
{
"epoch": 1.334033613445378,
"grad_norm": 0.20152664184570312,
"learning_rate": 3.3424069581771155e-05,
"loss": 0.0529,
"num_tokens": 7798085.0,
"step": 635
},
{
"epoch": 1.3445378151260505,
"grad_norm": 0.22678756713867188,
"learning_rate": 3.317223315688358e-05,
"loss": 0.0539,
"num_tokens": 7859525.0,
"step": 640
},
{
"epoch": 1.3550420168067228,
"grad_norm": 0.2191995084285736,
"learning_rate": 3.2919635583151025e-05,
"loss": 0.0529,
"num_tokens": 7920965.0,
"step": 645
},
{
"epoch": 1.365546218487395,
"grad_norm": 0.18456892669200897,
"learning_rate": 3.2666310756280194e-05,
"loss": 0.0544,
"num_tokens": 7982405.0,
"step": 650
},
{
"epoch": 1.3760504201680672,
"grad_norm": 0.1721736490726471,
"learning_rate": 3.241229266956687e-05,
"loss": 0.054,
"num_tokens": 8043845.0,
"step": 655
},
{
"epoch": 1.3865546218487395,
"grad_norm": 0.22381868958473206,
"learning_rate": 3.215761540933436e-05,
"loss": 0.0525,
"num_tokens": 8105285.0,
"step": 660
},
{
"epoch": 1.3970588235294117,
"grad_norm": 0.19951923191547394,
"learning_rate": 3.190231315035954e-05,
"loss": 0.0514,
"num_tokens": 8166725.0,
"step": 665
},
{
"epoch": 1.407563025210084,
"grad_norm": 0.21700704097747803,
"learning_rate": 3.164642015128694e-05,
"loss": 0.0531,
"num_tokens": 8228159.0,
"step": 670
},
{
"epoch": 1.4180672268907564,
"grad_norm": 0.13238979876041412,
"learning_rate": 3.13899707500317e-05,
"loss": 0.0503,
"num_tokens": 8289370.0,
"step": 675
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.16383114457130432,
"learning_rate": 3.1132999359171737e-05,
"loss": 0.0513,
"num_tokens": 8350810.0,
"step": 680
},
{
"epoch": 1.4390756302521008,
"grad_norm": 0.18902327120304108,
"learning_rate": 3.087554046133004e-05,
"loss": 0.052,
"num_tokens": 8412174.0,
"step": 685
},
{
"epoch": 1.449579831932773,
"grad_norm": 0.16599516570568085,
"learning_rate": 3.0617628604547424e-05,
"loss": 0.0533,
"num_tokens": 8473614.0,
"step": 690
},
{
"epoch": 1.4600840336134453,
"grad_norm": 0.20513266324996948,
"learning_rate": 3.035929839764665e-05,
"loss": 0.0507,
"num_tokens": 8535054.0,
"step": 695
},
{
"epoch": 1.4705882352941178,
"grad_norm": 0.22719748318195343,
"learning_rate": 3.0100584505588275e-05,
"loss": 0.052,
"num_tokens": 8596494.0,
"step": 700
},
{
"epoch": 1.48109243697479,
"grad_norm": 0.18906480073928833,
"learning_rate": 2.9841521644818976e-05,
"loss": 0.0516,
"num_tokens": 8657934.0,
"step": 705
},
{
"epoch": 1.4915966386554622,
"grad_norm": 0.17335395514965057,
"learning_rate": 2.9582144578613102e-05,
"loss": 0.0496,
"num_tokens": 8719374.0,
"step": 710
},
{
"epoch": 1.5021008403361344,
"grad_norm": 0.20907028019428253,
"learning_rate": 2.9322488112407743e-05,
"loss": 0.0523,
"num_tokens": 8780740.0,
"step": 715
},
{
"epoch": 1.5126050420168067,
"grad_norm": 0.21596895158290863,
"learning_rate": 2.906258708913228e-05,
"loss": 0.053,
"num_tokens": 8842180.0,
"step": 720
},
{
"epoch": 1.523109243697479,
"grad_norm": 0.21814534068107605,
"learning_rate": 2.880247638453288e-05,
"loss": 0.0535,
"num_tokens": 8903620.0,
"step": 725
},
{
"epoch": 1.5336134453781511,
"grad_norm": 0.17172180116176605,
"learning_rate": 2.854219090249251e-05,
"loss": 0.0511,
"num_tokens": 8965060.0,
"step": 730
},
{
"epoch": 1.5441176470588234,
"grad_norm": 0.144153892993927,
"learning_rate": 2.8281765570347306e-05,
"loss": 0.0509,
"num_tokens": 9026344.0,
"step": 735
},
{
"epoch": 1.5546218487394958,
"grad_norm": 0.1880050003528595,
"learning_rate": 2.802123533419966e-05,
"loss": 0.0546,
"num_tokens": 9087784.0,
"step": 740
},
{
"epoch": 1.565126050420168,
"grad_norm": 0.15667995810508728,
"learning_rate": 2.7760635154228896e-05,
"loss": 0.051,
"num_tokens": 9149063.0,
"step": 745
},
{
"epoch": 1.5756302521008403,
"grad_norm": 0.2708027958869934,
"learning_rate": 2.7500000000000004e-05,
"loss": 0.0544,
"num_tokens": 9210503.0,
"step": 750
},
{
"epoch": 1.5861344537815127,
"grad_norm": 0.1797020584344864,
"learning_rate": 2.723936484577111e-05,
"loss": 0.0528,
"num_tokens": 9271881.0,
"step": 755
},
{
"epoch": 1.596638655462185,
"grad_norm": 0.21367360651493073,
"learning_rate": 2.6978764665800343e-05,
"loss": 0.0535,
"num_tokens": 9333321.0,
"step": 760
},
{
"epoch": 1.6071428571428572,
"grad_norm": 0.21819233894348145,
"learning_rate": 2.67182344296527e-05,
"loss": 0.0522,
"num_tokens": 9394761.0,
"step": 765
},
{
"epoch": 1.6176470588235294,
"grad_norm": 0.1992005854845047,
"learning_rate": 2.6457809097507496e-05,
"loss": 0.0506,
"num_tokens": 9456201.0,
"step": 770
},
{
"epoch": 1.6281512605042017,
"grad_norm": 0.19866126775741577,
"learning_rate": 2.619752361546713e-05,
"loss": 0.0518,
"num_tokens": 9517492.0,
"step": 775
},
{
"epoch": 1.638655462184874,
"grad_norm": 0.174868643283844,
"learning_rate": 2.593741291086772e-05,
"loss": 0.0532,
"num_tokens": 9578932.0,
"step": 780
},
{
"epoch": 1.6491596638655461,
"grad_norm": 0.22887223958969116,
"learning_rate": 2.567751188759227e-05,
"loss": 0.0523,
"num_tokens": 9640372.0,
"step": 785
},
{
"epoch": 1.6596638655462184,
"grad_norm": 0.17208142578601837,
"learning_rate": 2.541785542138691e-05,
"loss": 0.0502,
"num_tokens": 9701812.0,
"step": 790
},
{
"epoch": 1.6701680672268906,
"grad_norm": 0.21313603222370148,
"learning_rate": 2.515847835518103e-05,
"loss": 0.0526,
"num_tokens": 9763075.0,
"step": 795
},
{
"epoch": 1.680672268907563,
"grad_norm": 0.15035264194011688,
"learning_rate": 2.4899415494411737e-05,
"loss": 0.0507,
"num_tokens": 9824515.0,
"step": 800
},
{
"epoch": 1.6911764705882353,
"grad_norm": 0.2601998746395111,
"learning_rate": 2.464070160235335e-05,
"loss": 0.0526,
"num_tokens": 9885955.0,
"step": 805
},
{
"epoch": 1.7016806722689075,
"grad_norm": 0.17123112082481384,
"learning_rate": 2.438237139545258e-05,
"loss": 0.0521,
"num_tokens": 9947395.0,
"step": 810
},
{
"epoch": 1.71218487394958,
"grad_norm": 0.17243990302085876,
"learning_rate": 2.412445953866997e-05,
"loss": 0.0502,
"num_tokens": 10008835.0,
"step": 815
},
{
"epoch": 1.7226890756302522,
"grad_norm": 0.21723228693008423,
"learning_rate": 2.386700064082827e-05,
"loss": 0.0517,
"num_tokens": 10070123.0,
"step": 820
},
{
"epoch": 1.7331932773109244,
"grad_norm": 0.13738787174224854,
"learning_rate": 2.361002924996831e-05,
"loss": 0.051,
"num_tokens": 10131563.0,
"step": 825
},
{
"epoch": 1.7436974789915967,
"grad_norm": 0.21257147192955017,
"learning_rate": 2.3353579848713063e-05,
"loss": 0.0522,
"num_tokens": 10192967.0,
"step": 830
},
{
"epoch": 1.754201680672269,
"grad_norm": 0.20029078423976898,
"learning_rate": 2.3097686849640476e-05,
"loss": 0.0543,
"num_tokens": 10254407.0,
"step": 835
},
{
"epoch": 1.7647058823529411,
"grad_norm": 0.20497262477874756,
"learning_rate": 2.2842384590665645e-05,
"loss": 0.0526,
"num_tokens": 10315847.0,
"step": 840
},
{
"epoch": 1.7752100840336134,
"grad_norm": 0.19529034197330475,
"learning_rate": 2.2587707330433133e-05,
"loss": 0.052,
"num_tokens": 10377287.0,
"step": 845
},
{
"epoch": 1.7857142857142856,
"grad_norm": 0.19968290627002716,
"learning_rate": 2.23336892437198e-05,
"loss": 0.0511,
"num_tokens": 10438565.0,
"step": 850
},
{
"epoch": 1.7962184873949578,
"grad_norm": 0.20312048494815826,
"learning_rate": 2.2080364416848987e-05,
"loss": 0.0508,
"num_tokens": 10500005.0,
"step": 855
},
{
"epoch": 1.8067226890756303,
"grad_norm": 0.2170594483613968,
"learning_rate": 2.1827766843116428e-05,
"loss": 0.052,
"num_tokens": 10561445.0,
"step": 860
},
{
"epoch": 1.8172268907563025,
"grad_norm": 0.20793762803077698,
"learning_rate": 2.157593041822885e-05,
"loss": 0.0507,
"num_tokens": 10622885.0,
"step": 865
},
{
"epoch": 1.8277310924369747,
"grad_norm": 0.18194827437400818,
"learning_rate": 2.1324888935755498e-05,
"loss": 0.0512,
"num_tokens": 10684325.0,
"step": 870
},
{
"epoch": 1.8382352941176472,
"grad_norm": 0.14043785631656647,
"learning_rate": 2.1074676082593425e-05,
"loss": 0.0507,
"num_tokens": 10745533.0,
"step": 875
},
{
"epoch": 1.8487394957983194,
"grad_norm": 0.17620113492012024,
"learning_rate": 2.0825325434447106e-05,
"loss": 0.0526,
"num_tokens": 10806971.0,
"step": 880
},
{
"epoch": 1.8592436974789917,
"grad_norm": 0.17084655165672302,
"learning_rate": 2.0576870451322953e-05,
"loss": 0.05,
"num_tokens": 10868411.0,
"step": 885
},
{
"epoch": 1.8697478991596639,
"grad_norm": 0.17954443395137787,
"learning_rate": 2.032934447303938e-05,
"loss": 0.0479,
"num_tokens": 10929851.0,
"step": 890
},
{
"epoch": 1.8802521008403361,
"grad_norm": 0.19004443287849426,
"learning_rate": 2.0082780714752963e-05,
"loss": 0.0516,
"num_tokens": 10991291.0,
"step": 895
},
{
"epoch": 1.8907563025210083,
"grad_norm": 0.1933521330356598,
"learning_rate": 1.9837212262501382e-05,
"loss": 0.0526,
"num_tokens": 11052731.0,
"step": 900
},
{
"epoch": 1.9012605042016806,
"grad_norm": 0.1794319450855255,
"learning_rate": 1.9592672068763574e-05,
"loss": 0.052,
"num_tokens": 11114068.0,
"step": 905
},
{
"epoch": 1.9117647058823528,
"grad_norm": 0.16216090321540833,
"learning_rate": 1.934919294803798e-05,
"loss": 0.0519,
"num_tokens": 11175508.0,
"step": 910
},
{
"epoch": 1.9222689075630253,
"grad_norm": 0.19467321038246155,
"learning_rate": 1.9106807572439168e-05,
"loss": 0.0506,
"num_tokens": 11236948.0,
"step": 915
},
{
"epoch": 1.9327731092436975,
"grad_norm": 0.13739857077598572,
"learning_rate": 1.88655484673136e-05,
"loss": 0.0516,
"num_tokens": 11298388.0,
"step": 920
},
{
"epoch": 1.9432773109243697,
"grad_norm": 0.15686306357383728,
"learning_rate": 1.8625448006875123e-05,
"loss": 0.0505,
"num_tokens": 11359828.0,
"step": 925
},
{
"epoch": 1.9537815126050422,
"grad_norm": 0.12999138236045837,
"learning_rate": 1.8386538409860708e-05,
"loss": 0.051,
"num_tokens": 11421268.0,
"step": 930
},
{
"epoch": 1.9642857142857144,
"grad_norm": 0.18375808000564575,
"learning_rate": 1.8148851735207083e-05,
"loss": 0.0523,
"num_tokens": 11482548.0,
"step": 935
},
{
"epoch": 1.9747899159663866,
"grad_norm": 0.19671285152435303,
"learning_rate": 1.791241987774876e-05,
"loss": 0.0509,
"num_tokens": 11543988.0,
"step": 940
},
{
"epoch": 1.9852941176470589,
"grad_norm": 0.1805330216884613,
"learning_rate": 1.7677274563938134e-05,
"loss": 0.0503,
"num_tokens": 11605268.0,
"step": 945
},
{
"epoch": 1.995798319327731,
"grad_norm": 0.19455303251743317,
"learning_rate": 1.744344734758814e-05,
"loss": 0.0517,
"num_tokens": 11666708.0,
"step": 950
},
{
"epoch": 2.0063025210084033,
"grad_norm": 0.17816315591335297,
"learning_rate": 1.721096960563812e-05,
"loss": 0.0507,
"num_tokens": 11728148.0,
"step": 955
},
{
"epoch": 2.0168067226890756,
"grad_norm": 0.12756673991680145,
"learning_rate": 1.697987253394337e-05,
"loss": 0.0491,
"num_tokens": 11789273.0,
"step": 960
},
{
"epoch": 2.027310924369748,
"grad_norm": 0.19657427072525024,
"learning_rate": 1.675018714308902e-05,
"loss": 0.0504,
"num_tokens": 11850713.0,
"step": 965
},
{
"epoch": 2.03781512605042,
"grad_norm": 0.1950300633907318,
"learning_rate": 1.652194425422878e-05,
"loss": 0.0505,
"num_tokens": 11912153.0,
"step": 970
},
{
"epoch": 2.0483193277310923,
"grad_norm": 0.16631367802619934,
"learning_rate": 1.629517449494906e-05,
"loss": 0.0502,
"num_tokens": 11973593.0,
"step": 975
},
{
"epoch": 2.0588235294117645,
"grad_norm": 0.17350395023822784,
"learning_rate": 1.6069908295159146e-05,
"loss": 0.0526,
"num_tokens": 12035033.0,
"step": 980
},
{
"epoch": 2.069327731092437,
"grad_norm": 0.18997882306575775,
"learning_rate": 1.5846175883007815e-05,
"loss": 0.0493,
"num_tokens": 12096473.0,
"step": 985
},
{
"epoch": 2.0798319327731094,
"grad_norm": 0.1386975198984146,
"learning_rate": 1.562400728082709e-05,
"loss": 0.0497,
"num_tokens": 12157913.0,
"step": 990
},
{
"epoch": 2.0903361344537816,
"grad_norm": 0.1656985878944397,
"learning_rate": 1.540343230110354e-05,
"loss": 0.0509,
"num_tokens": 12219353.0,
"step": 995
},
{
"epoch": 2.100840336134454,
"grad_norm": 0.19251607358455658,
"learning_rate": 1.5184480542477869e-05,
"loss": 0.0503,
"num_tokens": 12280793.0,
"step": 1000
},
{
"epoch": 2.111344537815126,
"grad_norm": 0.17274506390094757,
"learning_rate": 1.4967181385773022e-05,
"loss": 0.0491,
"num_tokens": 12342004.0,
"step": 1005
},
{
"epoch": 2.1218487394957983,
"grad_norm": 0.20883677899837494,
"learning_rate": 1.4751563990051675e-05,
"loss": 0.0495,
"num_tokens": 12403444.0,
"step": 1010
},
{
"epoch": 2.1323529411764706,
"grad_norm": 0.20437228679656982,
"learning_rate": 1.453765728870343e-05,
"loss": 0.0514,
"num_tokens": 12464884.0,
"step": 1015
},
{
"epoch": 2.142857142857143,
"grad_norm": 0.20462237298488617,
"learning_rate": 1.432548998556221e-05,
"loss": 0.051,
"num_tokens": 12526175.0,
"step": 1020
},
{
"epoch": 2.153361344537815,
"grad_norm": 0.2599621117115021,
"learning_rate": 1.4115090551054622e-05,
"loss": 0.0517,
"num_tokens": 12587615.0,
"step": 1025
},
{
"epoch": 2.1638655462184873,
"grad_norm": 0.1801358163356781,
"learning_rate": 1.3906487218379504e-05,
"loss": 0.0499,
"num_tokens": 12649055.0,
"step": 1030
},
{
"epoch": 2.1743697478991595,
"grad_norm": 0.1843215674161911,
"learning_rate": 1.3699707979719357e-05,
"loss": 0.0513,
"num_tokens": 12710459.0,
"step": 1035
},
{
"epoch": 2.184873949579832,
"grad_norm": 0.19053132832050323,
"learning_rate": 1.3494780582484126e-05,
"loss": 0.0496,
"num_tokens": 12771899.0,
"step": 1040
},
{
"epoch": 2.1953781512605044,
"grad_norm": 0.15285778045654297,
"learning_rate": 1.329173252558779e-05,
"loss": 0.0497,
"num_tokens": 12833339.0,
"step": 1045
},
{
"epoch": 2.2058823529411766,
"grad_norm": 0.14396464824676514,
"learning_rate": 1.3090591055758356e-05,
"loss": 0.0507,
"num_tokens": 12894779.0,
"step": 1050
},
{
"epoch": 2.216386554621849,
"grad_norm": 0.14991876482963562,
"learning_rate": 1.2891383163881633e-05,
"loss": 0.05,
"num_tokens": 12956219.0,
"step": 1055
},
{
"epoch": 2.226890756302521,
"grad_norm": 0.14839011430740356,
"learning_rate": 1.2694135581379383e-05,
"loss": 0.0499,
"num_tokens": 13017659.0,
"step": 1060
},
{
"epoch": 2.2373949579831933,
"grad_norm": 0.12264993786811829,
"learning_rate": 1.2498874776622246e-05,
"loss": 0.0462,
"num_tokens": 13079099.0,
"step": 1065
},
{
"epoch": 2.2478991596638656,
"grad_norm": 0.1659439504146576,
"learning_rate": 1.2305626951378019e-05,
"loss": 0.0492,
"num_tokens": 13140539.0,
"step": 1070
},
{
"epoch": 2.258403361344538,
"grad_norm": 0.16605842113494873,
"learning_rate": 1.2114418037295636e-05,
"loss": 0.0502,
"num_tokens": 13201979.0,
"step": 1075
},
{
"epoch": 2.26890756302521,
"grad_norm": 0.16009701788425446,
"learning_rate": 1.1925273692425487e-05,
"loss": 0.0496,
"num_tokens": 13263419.0,
"step": 1080
},
{
"epoch": 2.2794117647058822,
"grad_norm": 0.1512678861618042,
"learning_rate": 1.1738219297776371e-05,
"loss": 0.0497,
"num_tokens": 13324859.0,
"step": 1085
},
{
"epoch": 2.2899159663865545,
"grad_norm": 0.18362964689731598,
"learning_rate": 1.1553279953909641e-05,
"loss": 0.0485,
"num_tokens": 13386299.0,
"step": 1090
},
{
"epoch": 2.3004201680672267,
"grad_norm": 0.15746116638183594,
"learning_rate": 1.1370480477571029e-05,
"loss": 0.0503,
"num_tokens": 13447730.0,
"step": 1095
},
{
"epoch": 2.310924369747899,
"grad_norm": 0.2701464891433716,
"learning_rate": 1.118984539836051e-05,
"loss": 0.0521,
"num_tokens": 13509170.0,
"step": 1100
},
{
"epoch": 2.3214285714285716,
"grad_norm": 0.18647603690624237,
"learning_rate": 1.1011398955440702e-05,
"loss": 0.0498,
"num_tokens": 13570409.0,
"step": 1105
},
{
"epoch": 2.331932773109244,
"grad_norm": 0.12975195050239563,
"learning_rate": 1.0835165094284264e-05,
"loss": 0.0507,
"num_tokens": 13631849.0,
"step": 1110
},
{
"epoch": 2.342436974789916,
"grad_norm": 0.15623484551906586,
"learning_rate": 1.066116746346065e-05,
"loss": 0.0499,
"num_tokens": 13693289.0,
"step": 1115
},
{
"epoch": 2.3529411764705883,
"grad_norm": 0.1415032297372818,
"learning_rate": 1.0489429411462794e-05,
"loss": 0.05,
"num_tokens": 13754729.0,
"step": 1120
},
{
"epoch": 2.3634453781512605,
"grad_norm": 0.188720241189003,
"learning_rate": 1.0319973983573971e-05,
"loss": 0.053,
"num_tokens": 13816169.0,
"step": 1125
},
{
"epoch": 2.3739495798319328,
"grad_norm": 0.19719360768795013,
"learning_rate": 1.0152823918775408e-05,
"loss": 0.0503,
"num_tokens": 13877609.0,
"step": 1130
},
{
"epoch": 2.384453781512605,
"grad_norm": 0.1669566035270691,
"learning_rate": 9.988001646694935e-06,
"loss": 0.0499,
"num_tokens": 13939049.0,
"step": 1135
},
{
"epoch": 2.3949579831932772,
"grad_norm": 0.22400623559951782,
"learning_rate": 9.825529284597238e-06,
"loss": 0.0534,
"num_tokens": 14000489.0,
"step": 1140
},
{
"epoch": 2.4054621848739495,
"grad_norm": 0.15708568692207336,
"learning_rate": 9.665428634415923e-06,
"loss": 0.0499,
"num_tokens": 14061697.0,
"step": 1145
},
{
"epoch": 2.4159663865546217,
"grad_norm": 0.16055414080619812,
"learning_rate": 9.50772117982799e-06,
"loss": 0.0506,
"num_tokens": 14123137.0,
"step": 1150
},
{
"epoch": 2.426470588235294,
"grad_norm": 0.14245997369289398,
"learning_rate": 9.352428083370946e-06,
"loss": 0.0497,
"num_tokens": 14184577.0,
"step": 1155
},
{
"epoch": 2.4369747899159666,
"grad_norm": 0.14547857642173767,
"learning_rate": 9.199570183603021e-06,
"loss": 0.0501,
"num_tokens": 14246017.0,
"step": 1160
},
{
"epoch": 2.447478991596639,
"grad_norm": 0.17205478250980377,
"learning_rate": 9.049167992306908e-06,
"loss": 0.0501,
"num_tokens": 14307457.0,
"step": 1165
},
{
"epoch": 2.457983193277311,
"grad_norm": 0.16485774517059326,
"learning_rate": 8.901241691737286e-06,
"loss": 0.0499,
"num_tokens": 14368897.0,
"step": 1170
},
{
"epoch": 2.4684873949579833,
"grad_norm": 0.1967056393623352,
"learning_rate": 8.755811131912612e-06,
"loss": 0.051,
"num_tokens": 14430337.0,
"step": 1175
},
{
"epoch": 2.4789915966386555,
"grad_norm": 0.14919425547122955,
"learning_rate": 8.612895827951451e-06,
"loss": 0.0495,
"num_tokens": 14491744.0,
"step": 1180
},
{
"epoch": 2.4894957983193278,
"grad_norm": 0.1448267251253128,
"learning_rate": 8.472514957453801e-06,
"loss": 0.0512,
"num_tokens": 14553007.0,
"step": 1185
},
{
"epoch": 2.5,
"grad_norm": 0.17768217623233795,
"learning_rate": 8.33468735792765e-06,
"loss": 0.0501,
"num_tokens": 14614447.0,
"step": 1190
},
{
"epoch": 2.5105042016806722,
"grad_norm": 0.1507992148399353,
"learning_rate": 8.199431524261223e-06,
"loss": 0.0503,
"num_tokens": 14675727.0,
"step": 1195
},
{
"epoch": 2.5210084033613445,
"grad_norm": 0.16350729763507843,
"learning_rate": 8.066765606241163e-06,
"loss": 0.0496,
"num_tokens": 14737165.0,
"step": 1200
},
{
"epoch": 2.5315126050420167,
"grad_norm": 0.16035616397857666,
"learning_rate": 7.936707406117028e-06,
"loss": 0.0488,
"num_tokens": 14798605.0,
"step": 1205
},
{
"epoch": 2.542016806722689,
"grad_norm": 0.1894913911819458,
"learning_rate": 7.809274376212464e-06,
"loss": 0.0508,
"num_tokens": 14859883.0,
"step": 1210
},
{
"epoch": 2.552521008403361,
"grad_norm": 0.1903340071439743,
"learning_rate": 7.68448361658327e-06,
"loss": 0.0488,
"num_tokens": 14921105.0,
"step": 1215
},
{
"epoch": 2.5630252100840334,
"grad_norm": 0.14671629667282104,
"learning_rate": 7.5623518727227975e-06,
"loss": 0.0495,
"num_tokens": 14982545.0,
"step": 1220
},
{
"epoch": 2.5735294117647056,
"grad_norm": 0.16081440448760986,
"learning_rate": 7.442895533314856e-06,
"loss": 0.0473,
"num_tokens": 15043985.0,
"step": 1225
},
{
"epoch": 2.5840336134453783,
"grad_norm": 0.1555902659893036,
"learning_rate": 7.326130628034581e-06,
"loss": 0.0492,
"num_tokens": 15105425.0,
"step": 1230
},
{
"epoch": 2.5945378151260505,
"grad_norm": 0.1796170324087143,
"learning_rate": 7.212072825397413e-06,
"loss": 0.0497,
"num_tokens": 15166865.0,
"step": 1235
},
{
"epoch": 2.6050420168067228,
"grad_norm": 0.13445882499217987,
"learning_rate": 7.100737430656561e-06,
"loss": 0.0494,
"num_tokens": 15228139.0,
"step": 1240
},
{
"epoch": 2.615546218487395,
"grad_norm": 0.18797667324543,
"learning_rate": 6.992139383749224e-06,
"loss": 0.0499,
"num_tokens": 15289579.0,
"step": 1245
},
{
"epoch": 2.6260504201680672,
"grad_norm": 0.1478380262851715,
"learning_rate": 6.886293257291801e-06,
"loss": 0.0503,
"num_tokens": 15351019.0,
"step": 1250
},
{
"epoch": 2.6365546218487395,
"grad_norm": 0.19320227205753326,
"learning_rate": 6.78321325462444e-06,
"loss": 0.0486,
"num_tokens": 15412459.0,
"step": 1255
},
{
"epoch": 2.6470588235294117,
"grad_norm": 0.18944524228572845,
"learning_rate": 6.682913207905095e-06,
"loss": 0.0496,
"num_tokens": 15473796.0,
"step": 1260
},
{
"epoch": 2.657563025210084,
"grad_norm": 0.17592737078666687,
"learning_rate": 6.585406576253404e-06,
"loss": 0.0501,
"num_tokens": 15535236.0,
"step": 1265
},
{
"epoch": 2.668067226890756,
"grad_norm": 0.18211396038532257,
"learning_rate": 6.490706443944656e-06,
"loss": 0.0491,
"num_tokens": 15596676.0,
"step": 1270
},
{
"epoch": 2.678571428571429,
"grad_norm": 0.1536846160888672,
"learning_rate": 6.398825518653992e-06,
"loss": 0.05,
"num_tokens": 15658116.0,
"step": 1275
},
{
"epoch": 2.689075630252101,
"grad_norm": 0.18677380681037903,
"learning_rate": 6.30977612975121e-06,
"loss": 0.0493,
"num_tokens": 15719399.0,
"step": 1280
},
{
"epoch": 2.6995798319327733,
"grad_norm": 0.1490916907787323,
"learning_rate": 6.223570226646291e-06,
"loss": 0.0514,
"num_tokens": 15780839.0,
"step": 1285
},
{
"epoch": 2.7100840336134455,
"grad_norm": 0.15238384902477264,
"learning_rate": 6.140219377185933e-06,
"loss": 0.05,
"num_tokens": 15842274.0,
"step": 1290
},
{
"epoch": 2.7205882352941178,
"grad_norm": 0.15011648833751678,
"learning_rate": 6.0597347661012635e-06,
"loss": 0.0493,
"num_tokens": 15903714.0,
"step": 1295
},
{
"epoch": 2.73109243697479,
"grad_norm": 0.1596149504184723,
"learning_rate": 5.982127193507003e-06,
"loss": 0.0494,
"num_tokens": 15965148.0,
"step": 1300
},
{
"epoch": 2.741596638655462,
"grad_norm": 0.16487446427345276,
"learning_rate": 5.907407073452186e-06,
"loss": 0.0506,
"num_tokens": 16026588.0,
"step": 1305
},
{
"epoch": 2.7521008403361344,
"grad_norm": 0.1454056352376938,
"learning_rate": 5.835584432522727e-06,
"loss": 0.0492,
"num_tokens": 16088028.0,
"step": 1310
},
{
"epoch": 2.7626050420168067,
"grad_norm": 0.16204313933849335,
"learning_rate": 5.766668908495966e-06,
"loss": 0.0509,
"num_tokens": 16149468.0,
"step": 1315
},
{
"epoch": 2.773109243697479,
"grad_norm": 0.18910805881023407,
"learning_rate": 5.700669749047387e-06,
"loss": 0.0489,
"num_tokens": 16210908.0,
"step": 1320
},
{
"epoch": 2.783613445378151,
"grad_norm": 0.17493724822998047,
"learning_rate": 5.637595810509689e-06,
"loss": 0.05,
"num_tokens": 16272348.0,
"step": 1325
},
{
"epoch": 2.7941176470588234,
"grad_norm": 0.14875848591327667,
"learning_rate": 5.577455556684369e-06,
"loss": 0.049,
"num_tokens": 16333788.0,
"step": 1330
},
{
"epoch": 2.8046218487394956,
"grad_norm": 0.1500275731086731,
"learning_rate": 5.520257057705971e-06,
"loss": 0.0498,
"num_tokens": 16395228.0,
"step": 1335
},
{
"epoch": 2.815126050420168,
"grad_norm": 0.1598060131072998,
"learning_rate": 5.466007988959163e-06,
"loss": 0.0507,
"num_tokens": 16456417.0,
"step": 1340
},
{
"epoch": 2.82563025210084,
"grad_norm": 0.1572778970003128,
"learning_rate": 5.414715630048797e-06,
"loss": 0.051,
"num_tokens": 16517857.0,
"step": 1345
},
{
"epoch": 2.8361344537815127,
"grad_norm": 0.15581144392490387,
"learning_rate": 5.366386863823077e-06,
"loss": 0.0499,
"num_tokens": 16579297.0,
"step": 1350
},
{
"epoch": 2.846638655462185,
"grad_norm": 0.18151573836803436,
"learning_rate": 5.3210281754499284e-06,
"loss": 0.0496,
"num_tokens": 16640737.0,
"step": 1355
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.1601044237613678,
"learning_rate": 5.278645651546797e-06,
"loss": 0.0487,
"num_tokens": 16702177.0,
"step": 1360
},
{
"epoch": 2.8676470588235294,
"grad_norm": 0.1497681736946106,
"learning_rate": 5.239244979363877e-06,
"loss": 0.0492,
"num_tokens": 16763617.0,
"step": 1365
},
{
"epoch": 2.8781512605042017,
"grad_norm": 0.15907803177833557,
"learning_rate": 5.202831446020945e-06,
"loss": 0.0502,
"num_tokens": 16824905.0,
"step": 1370
},
{
"epoch": 2.888655462184874,
"grad_norm": 0.17641125619411469,
"learning_rate": 5.169409937797901e-06,
"loss": 0.0502,
"num_tokens": 16886146.0,
"step": 1375
},
{
"epoch": 2.899159663865546,
"grad_norm": 0.12964367866516113,
"learning_rate": 5.138984939479077e-06,
"loss": 0.0487,
"num_tokens": 16947586.0,
"step": 1380
},
{
"epoch": 2.9096638655462184,
"grad_norm": 0.14473247528076172,
"learning_rate": 5.111560533751426e-06,
"loss": 0.0491,
"num_tokens": 17009026.0,
"step": 1385
},
{
"epoch": 2.9201680672268906,
"grad_norm": 0.18652838468551636,
"learning_rate": 5.087140400656684e-06,
"loss": 0.0506,
"num_tokens": 17070466.0,
"step": 1390
},
{
"epoch": 2.9306722689075633,
"grad_norm": 0.18603888154029846,
"learning_rate": 5.065727817097544e-06,
"loss": 0.0492,
"num_tokens": 17131779.0,
"step": 1395
},
{
"epoch": 2.9411764705882355,
"grad_norm": 0.15723615884780884,
"learning_rate": 5.047325656397932e-06,
"loss": 0.0494,
"num_tokens": 17193063.0,
"step": 1400
},
{
"epoch": 2.9516806722689077,
"grad_norm": 0.14798587560653687,
"learning_rate": 5.031936387917442e-06,
"loss": 0.049,
"num_tokens": 17254503.0,
"step": 1405
},
{
"epoch": 2.96218487394958,
"grad_norm": 0.19435246288776398,
"learning_rate": 5.019562076719972e-06,
"loss": 0.0494,
"num_tokens": 17315742.0,
"step": 1410
},
{
"epoch": 2.972689075630252,
"grad_norm": 0.17056235671043396,
"learning_rate": 5.0102043832966236e-06,
"loss": 0.0493,
"num_tokens": 17377182.0,
"step": 1415
},
{
"epoch": 2.9831932773109244,
"grad_norm": 0.12487131357192993,
"learning_rate": 5.003864563342878e-06,
"loss": 0.0477,
"num_tokens": 17438622.0,
"step": 1420
},
{
"epoch": 2.9936974789915967,
"grad_norm": 0.13770359754562378,
"learning_rate": 5.0005434675900966e-06,
"loss": 0.0477,
"num_tokens": 17500062.0,
"step": 1425
},
{
"epoch": 3.0,
"num_tokens": 17536926.0,
"step": 1428,
"total_flos": 7.444201440207176e+17,
"train_loss": 0.06586769079210378,
"train_runtime": 7485.0094,
"train_samples_per_second": 9.147,
"train_steps_per_second": 0.191
}
],
"logging_steps": 5,
"max_steps": 1428,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.444201440207176e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}