{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 1428,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01050420168067227,
      "grad_norm": 21.68925666809082,
      "learning_rate": 3.4722222222222224e-06,
      "loss": 1.7424,
      "num_tokens": 61440.0,
      "step": 5
    },
    {
      "epoch": 0.02100840336134454,
      "grad_norm": 6.517210483551025,
      "learning_rate": 6.944444444444445e-06,
      "loss": 0.8531,
      "num_tokens": 122880.0,
      "step": 10
    },
    {
      "epoch": 0.031512605042016806,
      "grad_norm": 1.3076003789901733,
      "learning_rate": 1.0416666666666668e-05,
      "loss": 0.2121,
      "num_tokens": 184320.0,
      "step": 15
    },
    {
      "epoch": 0.04201680672268908,
      "grad_norm": 1.2455157041549683,
      "learning_rate": 1.388888888888889e-05,
      "loss": 0.113,
      "num_tokens": 245760.0,
      "step": 20
    },
    {
      "epoch": 0.052521008403361345,
      "grad_norm": 0.854950487613678,
      "learning_rate": 1.736111111111111e-05,
      "loss": 0.0885,
      "num_tokens": 307200.0,
      "step": 25
    },
    {
      "epoch": 0.06302521008403361,
      "grad_norm": 0.9929510951042175,
      "learning_rate": 2.0833333333333336e-05,
      "loss": 0.0771,
      "num_tokens": 368640.0,
      "step": 30
    },
    {
      "epoch": 0.07352941176470588,
      "grad_norm": 1.0237308740615845,
      "learning_rate": 2.4305555555555558e-05,
      "loss": 0.0805,
      "num_tokens": 430080.0,
      "step": 35
    },
    {
      "epoch": 0.08403361344537816,
      "grad_norm": 1.4065616130828857,
      "learning_rate": 2.777777777777778e-05,
      "loss": 0.0822,
      "num_tokens": 491520.0,
      "step": 40
    },
    {
      "epoch": 0.09453781512605042,
      "grad_norm": 0.7961967587471008,
      "learning_rate": 3.125e-05,
      "loss": 0.0721,
      "num_tokens": 552960.0,
      "step": 45
    },
    {
      "epoch": 0.10504201680672269,
      "grad_norm": 0.7718358039855957,
      "learning_rate": 3.472222222222222e-05,
      "loss": 0.0683,
      "num_tokens": 614297.0,
      "step": 50
    },
    {
      "epoch": 0.11554621848739496,
      "grad_norm": 0.6594786047935486,
      "learning_rate": 3.8194444444444444e-05,
      "loss": 0.0665,
      "num_tokens": 675536.0,
      "step": 55
    },
    {
      "epoch": 0.12605042016806722,
      "grad_norm": 0.637739896774292,
      "learning_rate": 4.166666666666667e-05,
      "loss": 0.0718,
      "num_tokens": 736976.0,
      "step": 60
    },
    {
      "epoch": 0.13655462184873948,
      "grad_norm": 0.4128863215446472,
      "learning_rate": 4.5138888888888894e-05,
      "loss": 0.069,
      "num_tokens": 798416.0,
      "step": 65
    },
    {
      "epoch": 0.14705882352941177,
      "grad_norm": 0.7012873888015747,
      "learning_rate": 4.8611111111111115e-05,
      "loss": 0.0651,
      "num_tokens": 859856.0,
      "step": 70
    },
    {
      "epoch": 0.15756302521008403,
      "grad_norm": 0.6367799639701843,
      "learning_rate": 4.9999456532409905e-05,
      "loss": 0.0777,
      "num_tokens": 921296.0,
      "step": 75
    },
    {
      "epoch": 0.16806722689075632,
      "grad_norm": 0.49970555305480957,
      "learning_rate": 4.999613543665713e-05,
      "loss": 0.0642,
      "num_tokens": 982736.0,
      "step": 80
    },
    {
      "epoch": 0.17857142857142858,
      "grad_norm": 0.49360784888267517,
      "learning_rate": 4.998979561670338e-05,
      "loss": 0.0651,
      "num_tokens": 1044176.0,
      "step": 85
    },
    {
      "epoch": 0.18907563025210083,
      "grad_norm": 0.432689905166626,
      "learning_rate": 4.9980437923280036e-05,
      "loss": 0.063,
      "num_tokens": 1105611.0,
      "step": 90
    },
    {
      "epoch": 0.19957983193277312,
      "grad_norm": 0.6416879296302795,
      "learning_rate": 4.996806361208257e-05,
      "loss": 0.0659,
      "num_tokens": 1167051.0,
      "step": 95
    },
    {
      "epoch": 0.21008403361344538,
      "grad_norm": 0.517444908618927,
      "learning_rate": 4.995267434360207e-05,
      "loss": 0.0638,
      "num_tokens": 1228314.0,
      "step": 100
    },
    {
      "epoch": 0.22058823529411764,
      "grad_norm": 107.99620819091797,
      "learning_rate": 4.993427218290246e-05,
      "loss": 0.6636,
      "num_tokens": 1289602.0,
      "step": 105
    },
    {
      "epoch": 0.23109243697478993,
      "grad_norm": 0.5925813317298889,
      "learning_rate": 4.991285959934332e-05,
      "loss": 0.1472,
      "num_tokens": 1351042.0,
      "step": 110
    },
    {
      "epoch": 0.2415966386554622,
      "grad_norm": 0.3515715003013611,
      "learning_rate": 4.988843946624858e-05,
      "loss": 0.0645,
      "num_tokens": 1412482.0,
      "step": 115
    },
    {
      "epoch": 0.25210084033613445,
      "grad_norm": 0.4023377001285553,
      "learning_rate": 4.9861015060520935e-05,
      "loss": 0.0632,
      "num_tokens": 1473922.0,
      "step": 120
    },
    {
      "epoch": 0.26260504201680673,
      "grad_norm": 0.37604016065597534,
      "learning_rate": 4.9830590062202105e-05,
      "loss": 0.0635,
      "num_tokens": 1535362.0,
      "step": 125
    },
    {
      "epoch": 0.27310924369747897,
      "grad_norm": 3.824808359146118,
      "learning_rate": 4.9797168553979054e-05,
      "loss": 0.0704,
      "num_tokens": 1596802.0,
      "step": 130
    },
    {
      "epoch": 0.28361344537815125,
      "grad_norm": 4.908388137817383,
      "learning_rate": 4.976075502063613e-05,
      "loss": 0.128,
      "num_tokens": 1658080.0,
      "step": 135
    },
    {
      "epoch": 0.29411764705882354,
      "grad_norm": 0.3574487566947937,
      "learning_rate": 4.97213543484532e-05,
      "loss": 0.0682,
      "num_tokens": 1719520.0,
      "step": 140
    },
    {
      "epoch": 0.30462184873949577,
      "grad_norm": 0.35017549991607666,
      "learning_rate": 4.9678971824550074e-05,
      "loss": 0.0593,
      "num_tokens": 1780960.0,
      "step": 145
    },
    {
      "epoch": 0.31512605042016806,
      "grad_norm": 0.36095139384269714,
      "learning_rate": 4.9633613136176925e-05,
      "loss": 0.0605,
      "num_tokens": 1842400.0,
      "step": 150
    },
    {
      "epoch": 0.32563025210084034,
      "grad_norm": 0.3857307434082031,
      "learning_rate": 4.95852843699512e-05,
      "loss": 0.059,
      "num_tokens": 1903840.0,
      "step": 155
    },
    {
      "epoch": 0.33613445378151263,
      "grad_norm": 0.27103978395462036,
      "learning_rate": 4.953399201104084e-05,
      "loss": 0.0581,
      "num_tokens": 1965280.0,
      "step": 160
    },
    {
      "epoch": 0.34663865546218486,
      "grad_norm": 0.29975804686546326,
      "learning_rate": 4.9479742942294035e-05,
      "loss": 0.0578,
      "num_tokens": 2026720.0,
      "step": 165
    },
    {
      "epoch": 0.35714285714285715,
      "grad_norm": 0.3278215825557709,
      "learning_rate": 4.9422544443315635e-05,
      "loss": 0.056,
      "num_tokens": 2088160.0,
      "step": 170
    },
    {
      "epoch": 0.36764705882352944,
      "grad_norm": 0.28858262300491333,
      "learning_rate": 4.936240418949032e-05,
      "loss": 0.0582,
      "num_tokens": 2149600.0,
      "step": 175
    },
    {
      "epoch": 0.37815126050420167,
      "grad_norm": 0.36514896154403687,
      "learning_rate": 4.929933025095261e-05,
      "loss": 0.0577,
      "num_tokens": 2211007.0,
      "step": 180
    },
    {
      "epoch": 0.38865546218487396,
      "grad_norm": 0.37031087279319763,
      "learning_rate": 4.9233331091504034e-05,
      "loss": 0.0554,
      "num_tokens": 2272203.0,
      "step": 185
    },
    {
      "epoch": 0.39915966386554624,
      "grad_norm": 0.28869447112083435,
      "learning_rate": 4.916441556747727e-05,
      "loss": 0.0575,
      "num_tokens": 2333494.0,
      "step": 190
    },
    {
      "epoch": 0.4096638655462185,
      "grad_norm": 0.29715102910995483,
      "learning_rate": 4.909259292654782e-05,
      "loss": 0.0573,
      "num_tokens": 2394934.0,
      "step": 195
    },
    {
      "epoch": 0.42016806722689076,
      "grad_norm": 0.40769094228744507,
      "learning_rate": 4.9017872806492995e-05,
      "loss": 0.0583,
      "num_tokens": 2456374.0,
      "step": 200
    },
    {
      "epoch": 0.43067226890756305,
      "grad_norm": 0.3187924325466156,
      "learning_rate": 4.8940265233898744e-05,
      "loss": 0.0546,
      "num_tokens": 2517814.0,
      "step": 205
    },
    {
      "epoch": 0.4411764705882353,
      "grad_norm": 0.25656041502952576,
      "learning_rate": 4.885978062281408e-05,
      "loss": 0.057,
      "num_tokens": 2579248.0,
      "step": 210
    },
    {
      "epoch": 0.45168067226890757,
      "grad_norm": 0.2591699957847595,
      "learning_rate": 4.877642977335371e-05,
      "loss": 0.0573,
      "num_tokens": 2640688.0,
      "step": 215
    },
    {
      "epoch": 0.46218487394957986,
      "grad_norm": 0.3359907567501068,
      "learning_rate": 4.869022387024879e-05,
      "loss": 0.0538,
      "num_tokens": 2702128.0,
      "step": 220
    },
    {
      "epoch": 0.4726890756302521,
      "grad_norm": 0.3090253174304962,
      "learning_rate": 4.8601174481346015e-05,
      "loss": 0.057,
      "num_tokens": 2763412.0,
      "step": 225
    },
    {
      "epoch": 0.4831932773109244,
      "grad_norm": 0.2006455659866333,
      "learning_rate": 4.8509293556055345e-05,
      "loss": 0.0554,
      "num_tokens": 2824852.0,
      "step": 230
    },
    {
      "epoch": 0.49369747899159666,
      "grad_norm": 0.255356103181839,
      "learning_rate": 4.84145934237466e-05,
      "loss": 0.0557,
      "num_tokens": 2886292.0,
      "step": 235
    },
    {
      "epoch": 0.5042016806722689,
      "grad_norm": 0.21021538972854614,
      "learning_rate": 4.8317086792094906e-05,
      "loss": 0.0527,
      "num_tokens": 2947732.0,
      "step": 240
    },
    {
      "epoch": 0.5147058823529411,
      "grad_norm": 0.24062202870845795,
      "learning_rate": 4.821678674537557e-05,
      "loss": 0.0545,
      "num_tokens": 3009172.0,
      "step": 245
    },
    {
      "epoch": 0.5252100840336135,
      "grad_norm": 0.30908721685409546,
      "learning_rate": 4.811370674270821e-05,
      "loss": 0.0538,
      "num_tokens": 3070612.0,
      "step": 250
    },
    {
      "epoch": 0.5357142857142857,
      "grad_norm": 0.2805303931236267,
      "learning_rate": 4.800786061625078e-05,
      "loss": 0.0528,
      "num_tokens": 3132052.0,
      "step": 255
    },
    {
      "epoch": 0.5462184873949579,
      "grad_norm": 0.245064839720726,
      "learning_rate": 4.789926256934345e-05,
      "loss": 0.0566,
      "num_tokens": 3193416.0,
      "step": 260
    },
    {
      "epoch": 0.5567226890756303,
      "grad_norm": 0.2865758538246155,
      "learning_rate": 4.778792717460259e-05,
      "loss": 0.0542,
      "num_tokens": 3254856.0,
      "step": 265
    },
    {
      "epoch": 0.5672268907563025,
      "grad_norm": 0.2498483657836914,
      "learning_rate": 4.7673869371965425e-05,
      "loss": 0.0544,
      "num_tokens": 3316296.0,
      "step": 270
    },
    {
      "epoch": 0.5777310924369747,
      "grad_norm": 0.24384859204292297,
      "learning_rate": 4.755710446668515e-05,
      "loss": 0.0564,
      "num_tokens": 3377345.0,
      "step": 275
    },
    {
      "epoch": 0.5882352941176471,
      "grad_norm": 0.22256287932395935,
      "learning_rate": 4.7437648127277216e-05,
      "loss": 0.0543,
      "num_tokens": 3438785.0,
      "step": 280
    },
    {
      "epoch": 0.5987394957983193,
      "grad_norm": 0.7542266845703125,
      "learning_rate": 4.7315516383416736e-05,
      "loss": 0.0595,
      "num_tokens": 3500225.0,
      "step": 285
    },
    {
      "epoch": 0.6092436974789915,
      "grad_norm": 0.24494485557079315,
      "learning_rate": 4.7190725623787545e-05,
      "loss": 0.0565,
      "num_tokens": 3561466.0,
      "step": 290
    },
    {
      "epoch": 0.6197478991596639,
      "grad_norm": 0.26762306690216064,
      "learning_rate": 4.706329259388298e-05,
      "loss": 0.0557,
      "num_tokens": 3622906.0,
      "step": 295
    },
    {
      "epoch": 0.6302521008403361,
      "grad_norm": 0.285203754901886,
      "learning_rate": 4.6933234393758844e-05,
      "loss": 0.0537,
      "num_tokens": 3684184.0,
      "step": 300
    },
    {
      "epoch": 0.6407563025210085,
      "grad_norm": 0.5487973690032959,
      "learning_rate": 4.680056847573878e-05,
      "loss": 0.0551,
      "num_tokens": 3745624.0,
      "step": 305
    },
    {
      "epoch": 0.6512605042016807,
      "grad_norm": 0.2598506808280945,
      "learning_rate": 4.666531264207235e-05,
      "loss": 0.0542,
      "num_tokens": 3806907.0,
      "step": 310
    },
    {
      "epoch": 0.6617647058823529,
      "grad_norm": 0.30089858174324036,
      "learning_rate": 4.6527485042546204e-05,
      "loss": 0.0576,
      "num_tokens": 3868347.0,
      "step": 315
    },
    {
      "epoch": 0.6722689075630253,
      "grad_norm": 0.24752771854400635,
      "learning_rate": 4.638710417204855e-05,
      "loss": 0.0538,
      "num_tokens": 3929787.0,
      "step": 320
    },
    {
      "epoch": 0.6827731092436975,
      "grad_norm": 0.2561896741390228,
      "learning_rate": 4.6244188868087395e-05,
      "loss": 0.0556,
      "num_tokens": 3991227.0,
      "step": 325
    },
    {
      "epoch": 0.6932773109243697,
      "grad_norm": 0.3156558871269226,
      "learning_rate": 4.609875830826272e-05,
      "loss": 0.0564,
      "num_tokens": 4052667.0,
      "step": 330
    },
    {
      "epoch": 0.7037815126050421,
      "grad_norm": 0.291674941778183,
      "learning_rate": 4.59508320076931e-05,
      "loss": 0.0558,
      "num_tokens": 4114107.0,
      "step": 335
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 0.29663559794425964,
      "learning_rate": 4.580042981639698e-05,
      "loss": 0.0545,
      "num_tokens": 4175485.0,
      "step": 340
    },
    {
      "epoch": 0.7247899159663865,
      "grad_norm": 0.2254744917154312,
      "learning_rate": 4.5647571916629064e-05,
      "loss": 0.0544,
      "num_tokens": 4236925.0,
      "step": 345
    },
    {
      "epoch": 0.7352941176470589,
      "grad_norm": 0.327118843793869,
      "learning_rate": 4.549227882017202e-05,
      "loss": 0.0556,
      "num_tokens": 4298365.0,
      "step": 350
    },
    {
      "epoch": 0.7457983193277311,
      "grad_norm": 0.24738788604736328,
      "learning_rate": 4.533457136558408e-05,
      "loss": 0.0533,
      "num_tokens": 4359805.0,
      "step": 355
    },
    {
      "epoch": 0.7563025210084033,
      "grad_norm": 0.17776817083358765,
      "learning_rate": 4.5174470715402764e-05,
      "loss": 0.0559,
      "num_tokens": 4421245.0,
      "step": 360
    },
    {
      "epoch": 0.7668067226890757,
      "grad_norm": 0.2214263677597046,
      "learning_rate": 4.501199835330507e-05,
      "loss": 0.0533,
      "num_tokens": 4482685.0,
      "step": 365
    },
    {
      "epoch": 0.7773109243697479,
      "grad_norm": 0.2129214107990265,
      "learning_rate": 4.484717608122459e-05,
      "loss": 0.0542,
      "num_tokens": 4544125.0,
      "step": 370
    },
    {
      "epoch": 0.7878151260504201,
      "grad_norm": 0.2668643295764923,
      "learning_rate": 4.468002601642603e-05,
      "loss": 0.052,
      "num_tokens": 4605565.0,
      "step": 375
    },
    {
      "epoch": 0.7983193277310925,
      "grad_norm": 0.4849870502948761,
      "learning_rate": 4.4510570588537206e-05,
      "loss": 0.057,
      "num_tokens": 4666849.0,
      "step": 380
    },
    {
      "epoch": 0.8088235294117647,
      "grad_norm": 0.2671234607696533,
      "learning_rate": 4.433883253653936e-05,
      "loss": 0.0533,
      "num_tokens": 4728021.0,
      "step": 385
    },
    {
      "epoch": 0.819327731092437,
      "grad_norm": 0.8898406028747559,
      "learning_rate": 4.416483490571574e-05,
      "loss": 0.0551,
      "num_tokens": 4789461.0,
      "step": 390
    },
    {
      "epoch": 0.8298319327731093,
      "grad_norm": 0.1947408765554428,
      "learning_rate": 4.39886010445593e-05,
      "loss": 0.0555,
      "num_tokens": 4850862.0,
      "step": 395
    },
    {
      "epoch": 0.8403361344537815,
      "grad_norm": 0.3551996648311615,
      "learning_rate": 4.381015460163949e-05,
      "loss": 0.0559,
      "num_tokens": 4912302.0,
      "step": 400
    },
    {
      "epoch": 0.8508403361344538,
      "grad_norm": 0.24865303933620453,
      "learning_rate": 4.362951952242898e-05,
      "loss": 0.0554,
      "num_tokens": 4973742.0,
      "step": 405
    },
    {
      "epoch": 0.8613445378151261,
      "grad_norm": 0.21480585634708405,
      "learning_rate": 4.344672004609037e-05,
      "loss": 0.0536,
      "num_tokens": 5035005.0,
      "step": 410
    },
    {
      "epoch": 0.8718487394957983,
      "grad_norm": 0.2362818419933319,
      "learning_rate": 4.326178070222364e-05,
      "loss": 0.0552,
      "num_tokens": 5096371.0,
      "step": 415
    },
    {
      "epoch": 0.8823529411764706,
      "grad_norm": 0.3188863694667816,
      "learning_rate": 4.3074726307574516e-05,
      "loss": 0.0579,
      "num_tokens": 5157811.0,
      "step": 420
    },
    {
      "epoch": 0.8928571428571429,
      "grad_norm": 0.2944329082965851,
      "learning_rate": 4.2885581962704366e-05,
      "loss": 0.0555,
      "num_tokens": 5219251.0,
      "step": 425
    },
    {
      "epoch": 0.9033613445378151,
      "grad_norm": 0.35381075739860535,
      "learning_rate": 4.2694373048622e-05,
      "loss": 0.0548,
      "num_tokens": 5280691.0,
      "step": 430
    },
    {
      "epoch": 0.9138655462184874,
      "grad_norm": 0.2967956066131592,
      "learning_rate": 4.2501125223377754e-05,
      "loss": 0.0542,
      "num_tokens": 5342131.0,
      "step": 435
    },
    {
      "epoch": 0.9243697478991597,
      "grad_norm": 1.3979747295379639,
      "learning_rate": 4.230586441862062e-05,
      "loss": 0.0529,
      "num_tokens": 5403410.0,
      "step": 440
    },
    {
      "epoch": 0.9348739495798319,
      "grad_norm": 0.2130478024482727,
      "learning_rate": 4.210861683611837e-05,
      "loss": 0.0546,
      "num_tokens": 5464723.0,
      "step": 445
    },
    {
      "epoch": 0.9453781512605042,
      "grad_norm": 0.23148652911186218,
      "learning_rate": 4.1909408944241644e-05,
      "loss": 0.0543,
      "num_tokens": 5526163.0,
      "step": 450
    },
    {
      "epoch": 0.9558823529411765,
      "grad_norm": 0.17318475246429443,
      "learning_rate": 4.1708267474412215e-05,
      "loss": 0.0543,
      "num_tokens": 5587603.0,
      "step": 455
    },
    {
      "epoch": 0.9663865546218487,
      "grad_norm": 0.28641510009765625,
      "learning_rate": 4.1505219417515884e-05,
      "loss": 0.0549,
      "num_tokens": 5649043.0,
      "step": 460
    },
    {
      "epoch": 0.976890756302521,
      "grad_norm": 0.35609665513038635,
      "learning_rate": 4.1300292020280645e-05,
      "loss": 0.056,
      "num_tokens": 5710483.0,
      "step": 465
    },
    {
      "epoch": 0.9873949579831933,
      "grad_norm": 0.2831043601036072,
      "learning_rate": 4.10935127816205e-05,
      "loss": 0.0574,
      "num_tokens": 5771914.0,
      "step": 470
    },
    {
      "epoch": 0.9978991596638656,
      "grad_norm": 0.2836240530014038,
      "learning_rate": 4.088490944894539e-05,
      "loss": 0.0515,
      "num_tokens": 5833354.0,
      "step": 475
    },
    {
      "epoch": 1.0084033613445378,
      "grad_norm": 2.6926634311676025,
      "learning_rate": 4.06745100144378e-05,
      "loss": 0.0558,
      "num_tokens": 5894794.0,
      "step": 480
    },
    {
      "epoch": 1.01890756302521,
      "grad_norm": 0.17681336402893066,
      "learning_rate": 4.0462342711296584e-05,
      "loss": 0.0523,
      "num_tokens": 5956077.0,
      "step": 485
    },
    {
      "epoch": 1.0294117647058822,
      "grad_norm": 0.2838696539402008,
      "learning_rate": 4.024843600994833e-05,
      "loss": 0.0537,
      "num_tokens": 6017517.0,
      "step": 490
    },
    {
      "epoch": 1.0399159663865547,
      "grad_norm": 0.2431504875421524,
      "learning_rate": 4.003281861422699e-05,
      "loss": 0.0537,
      "num_tokens": 6078801.0,
      "step": 495
    },
    {
      "epoch": 1.050420168067227,
      "grad_norm": 0.2204369157552719,
      "learning_rate": 3.981551945752215e-05,
      "loss": 0.0538,
      "num_tokens": 6140232.0,
      "step": 500
    },
    {
      "epoch": 1.0609243697478992,
      "grad_norm": 0.2458706945180893,
      "learning_rate": 3.959656769889646e-05,
      "loss": 0.0545,
      "num_tokens": 6201672.0,
      "step": 505
    },
    {
      "epoch": 1.0714285714285714,
      "grad_norm": 0.21258144080638885,
      "learning_rate": 3.937599271917292e-05,
      "loss": 0.056,
      "num_tokens": 6263112.0,
      "step": 510
    },
    {
      "epoch": 1.0819327731092436,
      "grad_norm": 0.2708013355731964,
      "learning_rate": 3.915382411699218e-05,
      "loss": 0.0547,
      "num_tokens": 6324552.0,
      "step": 515
    },
    {
      "epoch": 1.092436974789916,
      "grad_norm": 2.9274137020111084,
      "learning_rate": 3.893009170484085e-05,
      "loss": 0.0524,
      "num_tokens": 6385992.0,
      "step": 520
    },
    {
      "epoch": 1.1029411764705883,
      "grad_norm": 0.3301822543144226,
      "learning_rate": 3.870482550505094e-05,
      "loss": 0.0554,
      "num_tokens": 6447432.0,
      "step": 525
    },
    {
      "epoch": 1.1134453781512605,
      "grad_norm": 0.4145117700099945,
      "learning_rate": 3.847805574577123e-05,
      "loss": 0.0551,
      "num_tokens": 6508872.0,
      "step": 530
    },
    {
      "epoch": 1.1239495798319328,
      "grad_norm": 0.2403404861688614,
      "learning_rate": 3.8249812856910985e-05,
      "loss": 0.0576,
      "num_tokens": 6570312.0,
      "step": 535
    },
    {
      "epoch": 1.134453781512605,
      "grad_norm": 0.2703566551208496,
      "learning_rate": 3.8020127466056636e-05,
      "loss": 0.0526,
      "num_tokens": 6631553.0,
      "step": 540
    },
    {
      "epoch": 1.1449579831932772,
      "grad_norm": 0.23629266023635864,
      "learning_rate": 3.778903039436189e-05,
      "loss": 0.053,
      "num_tokens": 6692993.0,
      "step": 545
    },
    {
      "epoch": 1.1554621848739495,
      "grad_norm": 31.853532791137695,
      "learning_rate": 3.755655265241187e-05,
      "loss": 0.0551,
      "num_tokens": 6754394.0,
      "step": 550
    },
    {
      "epoch": 1.165966386554622,
      "grad_norm": 0.274700790643692,
      "learning_rate": 3.7322725436061875e-05,
      "loss": 0.0534,
      "num_tokens": 6815834.0,
      "step": 555
    },
    {
      "epoch": 1.1764705882352942,
      "grad_norm": 0.26774168014526367,
      "learning_rate": 3.708758012225125e-05,
      "loss": 0.0528,
      "num_tokens": 6877269.0,
      "step": 560
    },
    {
      "epoch": 1.1869747899159664,
      "grad_norm": 0.2192794382572174,
      "learning_rate": 3.685114826479292e-05,
      "loss": 0.0543,
      "num_tokens": 6938555.0,
      "step": 565
    },
    {
      "epoch": 1.1974789915966386,
      "grad_norm": 0.4298264980316162,
      "learning_rate": 3.661346159013929e-05,
      "loss": 0.0536,
      "num_tokens": 6999704.0,
      "step": 570
    },
    {
      "epoch": 1.2079831932773109,
      "grad_norm": 0.19733546674251556,
      "learning_rate": 3.637455199312488e-05,
      "loss": 0.053,
      "num_tokens": 7061144.0,
      "step": 575
    },
    {
      "epoch": 1.2184873949579833,
      "grad_norm": 0.22298553586006165,
      "learning_rate": 3.61344515326864e-05,
      "loss": 0.0532,
      "num_tokens": 7122407.0,
      "step": 580
    },
    {
      "epoch": 1.2289915966386555,
      "grad_norm": 0.19753730297088623,
      "learning_rate": 3.5893192427560834e-05,
      "loss": 0.0536,
      "num_tokens": 7183847.0,
      "step": 585
    },
    {
      "epoch": 1.2394957983193278,
      "grad_norm": 0.20278260111808777,
      "learning_rate": 3.565080705196202e-05,
      "loss": 0.0525,
      "num_tokens": 7245125.0,
      "step": 590
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.8037598133087158,
      "learning_rate": 3.5407327931236434e-05,
      "loss": 0.0536,
      "num_tokens": 7306565.0,
      "step": 595
    },
    {
      "epoch": 1.2605042016806722,
      "grad_norm": 0.25609511137008667,
      "learning_rate": 3.516278773749863e-05,
      "loss": 0.0534,
      "num_tokens": 7368005.0,
      "step": 600
    },
    {
      "epoch": 1.2710084033613445,
      "grad_norm": 0.18984173238277435,
      "learning_rate": 3.4917219285247036e-05,
      "loss": 0.0517,
      "num_tokens": 7429445.0,
      "step": 605
    },
    {
      "epoch": 1.2815126050420167,
      "grad_norm": 0.17902691662311554,
      "learning_rate": 3.4670655526960627e-05,
      "loss": 0.0538,
      "num_tokens": 7490885.0,
      "step": 610
    },
    {
      "epoch": 1.2920168067226891,
      "grad_norm": 0.19289465248584747,
      "learning_rate": 3.4423129548677055e-05,
      "loss": 0.0526,
      "num_tokens": 7552325.0,
      "step": 615
    },
    {
      "epoch": 1.3025210084033614,
      "grad_norm": 0.21860499680042267,
      "learning_rate": 3.41746745655529e-05,
      "loss": 0.0546,
      "num_tokens": 7613765.0,
      "step": 620
    },
    {
      "epoch": 1.3130252100840336,
      "grad_norm": 0.19712497293949127,
      "learning_rate": 3.3925323917406574e-05,
      "loss": 0.0538,
      "num_tokens": 7675205.0,
      "step": 625
    },
    {
      "epoch": 1.3235294117647058,
      "grad_norm": 0.22082890570163727,
      "learning_rate": 3.3675111064244504e-05,
      "loss": 0.0537,
      "num_tokens": 7736645.0,
      "step": 630
    },
    {
      "epoch": 1.334033613445378,
      "grad_norm": 0.20152664184570312,
      "learning_rate": 3.3424069581771155e-05,
      "loss": 0.0529,
      "num_tokens": 7798085.0,
      "step": 635
    },
    {
      "epoch": 1.3445378151260505,
      "grad_norm": 0.22678756713867188,
      "learning_rate": 3.317223315688358e-05,
      "loss": 0.0539,
      "num_tokens": 7859525.0,
      "step": 640
    },
    {
      "epoch": 1.3550420168067228,
      "grad_norm": 0.2191995084285736,
      "learning_rate": 3.2919635583151025e-05,
      "loss": 0.0529,
      "num_tokens": 7920965.0,
      "step": 645
    },
    {
      "epoch": 1.365546218487395,
      "grad_norm": 0.18456892669200897,
      "learning_rate": 3.2666310756280194e-05,
      "loss": 0.0544,
      "num_tokens": 7982405.0,
      "step": 650
    },
    {
      "epoch": 1.3760504201680672,
      "grad_norm": 0.1721736490726471,
      "learning_rate": 3.241229266956687e-05,
      "loss": 0.054,
      "num_tokens": 8043845.0,
      "step": 655
    },
    {
      "epoch": 1.3865546218487395,
      "grad_norm": 0.22381868958473206,
      "learning_rate": 3.215761540933436e-05,
      "loss": 0.0525,
      "num_tokens": 8105285.0,
      "step": 660
    },
    {
      "epoch": 1.3970588235294117,
      "grad_norm": 0.19951923191547394,
      "learning_rate": 3.190231315035954e-05,
      "loss": 0.0514,
      "num_tokens": 8166725.0,
      "step": 665
    },
    {
      "epoch": 1.407563025210084,
      "grad_norm": 0.21700704097747803,
      "learning_rate": 3.164642015128694e-05,
      "loss": 0.0531,
      "num_tokens": 8228159.0,
      "step": 670
    },
    {
      "epoch": 1.4180672268907564,
      "grad_norm": 0.13238979876041412,
      "learning_rate": 3.13899707500317e-05,
      "loss": 0.0503,
      "num_tokens": 8289370.0,
      "step": 675
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 0.16383114457130432,
      "learning_rate": 3.1132999359171737e-05,
      "loss": 0.0513,
      "num_tokens": 8350810.0,
      "step": 680
    },
    {
      "epoch": 1.4390756302521008,
      "grad_norm": 0.18902327120304108,
      "learning_rate": 3.087554046133004e-05,
      "loss": 0.052,
      "num_tokens": 8412174.0,
      "step": 685
    },
    {
      "epoch": 1.449579831932773,
      "grad_norm": 0.16599516570568085,
      "learning_rate": 3.0617628604547424e-05,
      "loss": 0.0533,
      "num_tokens": 8473614.0,
      "step": 690
    },
    {
      "epoch": 1.4600840336134453,
      "grad_norm": 0.20513266324996948,
      "learning_rate": 3.035929839764665e-05,
      "loss": 0.0507,
      "num_tokens": 8535054.0,
      "step": 695
    },
    {
      "epoch": 1.4705882352941178,
      "grad_norm": 0.22719748318195343,
      "learning_rate": 3.0100584505588275e-05,
      "loss": 0.052,
      "num_tokens": 8596494.0,
      "step": 700
    },
    {
      "epoch": 1.48109243697479,
      "grad_norm": 0.18906480073928833,
      "learning_rate": 2.9841521644818976e-05,
      "loss": 0.0516,
      "num_tokens": 8657934.0,
      "step": 705
    },
    {
      "epoch": 1.4915966386554622,
      "grad_norm": 0.17335395514965057,
      "learning_rate": 2.9582144578613102e-05,
      "loss": 0.0496,
      "num_tokens": 8719374.0,
      "step": 710
    },
    {
      "epoch": 1.5021008403361344,
      "grad_norm": 0.20907028019428253,
      "learning_rate": 2.9322488112407743e-05,
      "loss": 0.0523,
      "num_tokens": 8780740.0,
      "step": 715
    },
    {
      "epoch": 1.5126050420168067,
      "grad_norm": 0.21596895158290863,
      "learning_rate": 2.906258708913228e-05,
      "loss": 0.053,
      "num_tokens": 8842180.0,
      "step": 720
    },
    {
      "epoch": 1.523109243697479,
      "grad_norm": 0.21814534068107605,
      "learning_rate": 2.880247638453288e-05,
      "loss": 0.0535,
      "num_tokens": 8903620.0,
      "step": 725
    },
    {
      "epoch": 1.5336134453781511,
      "grad_norm": 0.17172180116176605,
      "learning_rate": 2.854219090249251e-05,
      "loss": 0.0511,
      "num_tokens": 8965060.0,
      "step": 730
    },
    {
      "epoch": 1.5441176470588234,
      "grad_norm": 0.144153892993927,
      "learning_rate": 2.8281765570347306e-05,
      "loss": 0.0509,
      "num_tokens": 9026344.0,
      "step": 735
    },
    {
      "epoch": 1.5546218487394958,
      "grad_norm": 0.1880050003528595,
      "learning_rate": 2.802123533419966e-05,
      "loss": 0.0546,
      "num_tokens": 9087784.0,
      "step": 740
    },
    {
      "epoch": 1.565126050420168,
      "grad_norm": 0.15667995810508728,
      "learning_rate": 2.7760635154228896e-05,
      "loss": 0.051,
      "num_tokens": 9149063.0,
      "step": 745
    },
    {
      "epoch": 1.5756302521008403,
      "grad_norm": 0.2708027958869934,
      "learning_rate": 2.7500000000000004e-05,
      "loss": 0.0544,
      "num_tokens": 9210503.0,
      "step": 750
    },
    {
      "epoch": 1.5861344537815127,
      "grad_norm": 0.1797020584344864,
      "learning_rate": 2.723936484577111e-05,
      "loss": 0.0528,
      "num_tokens": 9271881.0,
      "step": 755
    },
    {
      "epoch": 1.596638655462185,
      "grad_norm": 0.21367360651493073,
      "learning_rate": 2.6978764665800343e-05,
      "loss": 0.0535,
      "num_tokens": 9333321.0,
      "step": 760
    },
    {
      "epoch": 1.6071428571428572,
      "grad_norm": 0.21819233894348145,
      "learning_rate": 2.67182344296527e-05,
      "loss": 0.0522,
      "num_tokens": 9394761.0,
      "step": 765
    },
    {
      "epoch": 1.6176470588235294,
      "grad_norm": 0.1992005854845047,
      "learning_rate": 2.6457809097507496e-05,
      "loss": 0.0506,
      "num_tokens": 9456201.0,
      "step": 770
    },
    {
      "epoch": 1.6281512605042017,
      "grad_norm": 0.19866126775741577,
      "learning_rate": 2.619752361546713e-05,
      "loss": 0.0518,
      "num_tokens": 9517492.0,
      "step": 775
    },
    {
      "epoch": 1.638655462184874,
      "grad_norm": 0.174868643283844,
      "learning_rate": 2.593741291086772e-05,
      "loss": 0.0532,
      "num_tokens": 9578932.0,
      "step": 780
    },
    {
      "epoch": 1.6491596638655461,
      "grad_norm": 0.22887223958969116,
      "learning_rate": 2.567751188759227e-05,
      "loss": 0.0523,
      "num_tokens": 9640372.0,
      "step": 785
    },
    {
      "epoch": 1.6596638655462184,
      "grad_norm": 0.17208142578601837,
      "learning_rate": 2.541785542138691e-05,
      "loss": 0.0502,
      "num_tokens": 9701812.0,
      "step": 790
    },
    {
      "epoch": 1.6701680672268906,
      "grad_norm": 0.21313603222370148,
      "learning_rate": 2.515847835518103e-05,
      "loss": 0.0526,
      "num_tokens": 9763075.0,
      "step": 795
    },
    {
      "epoch": 1.680672268907563,
      "grad_norm": 0.15035264194011688,
      "learning_rate": 2.4899415494411737e-05,
      "loss": 0.0507,
      "num_tokens": 9824515.0,
      "step": 800
    },
    {
      "epoch": 1.6911764705882353,
      "grad_norm": 0.2601998746395111,
      "learning_rate": 2.464070160235335e-05,
      "loss": 0.0526,
      "num_tokens": 9885955.0,
      "step": 805
    },
    {
      "epoch": 1.7016806722689075,
      "grad_norm": 0.17123112082481384,
      "learning_rate": 2.438237139545258e-05,
      "loss": 0.0521,
      "num_tokens": 9947395.0,
      "step": 810
    },
    {
      "epoch": 1.71218487394958,
      "grad_norm": 0.17243990302085876,
      "learning_rate": 2.412445953866997e-05,
      "loss": 0.0502,
      "num_tokens": 10008835.0,
      "step": 815
    },
    {
      "epoch": 1.7226890756302522,
      "grad_norm": 0.21723228693008423,
      "learning_rate": 2.386700064082827e-05,
      "loss": 0.0517,
      "num_tokens": 10070123.0,
      "step": 820
    },
    {
      "epoch": 1.7331932773109244,
      "grad_norm": 0.13738787174224854,
      "learning_rate": 2.361002924996831e-05,
      "loss": 0.051,
      "num_tokens": 10131563.0,
      "step": 825
    },
    {
      "epoch": 1.7436974789915967,
      "grad_norm": 0.21257147192955017,
      "learning_rate": 2.3353579848713063e-05,
      "loss": 0.0522,
      "num_tokens": 10192967.0,
      "step": 830
    },
    {
      "epoch": 1.754201680672269,
      "grad_norm": 0.20029078423976898,
      "learning_rate": 2.3097686849640476e-05,
      "loss": 0.0543,
      "num_tokens": 10254407.0,
      "step": 835
    },
    {
      "epoch": 1.7647058823529411,
      "grad_norm": 0.20497262477874756,
      "learning_rate": 2.2842384590665645e-05,
      "loss": 0.0526,
      "num_tokens": 10315847.0,
      "step": 840
    },
    {
      "epoch": 1.7752100840336134,
      "grad_norm": 0.19529034197330475,
      "learning_rate": 2.2587707330433133e-05,
      "loss": 0.052,
      "num_tokens": 10377287.0,
      "step": 845
    },
    {
      "epoch": 1.7857142857142856,
      "grad_norm": 0.19968290627002716,
      "learning_rate": 2.23336892437198e-05,
      "loss": 0.0511,
      "num_tokens": 10438565.0,
      "step": 850
    },
    {
      "epoch": 1.7962184873949578,
      "grad_norm": 0.20312048494815826,
      "learning_rate": 2.2080364416848987e-05,
      "loss": 0.0508,
      "num_tokens": 10500005.0,
      "step": 855
    },
    {
      "epoch": 1.8067226890756303,
      "grad_norm": 0.2170594483613968,
      "learning_rate": 2.1827766843116428e-05,
      "loss": 0.052,
      "num_tokens": 10561445.0,
      "step": 860
    },
    {
      "epoch": 1.8172268907563025,
      "grad_norm": 0.20793762803077698,
      "learning_rate": 2.157593041822885e-05,
      "loss": 0.0507,
      "num_tokens": 10622885.0,
      "step": 865
    },
    {
      "epoch": 1.8277310924369747,
      "grad_norm": 0.18194827437400818,
      "learning_rate": 2.1324888935755498e-05,
      "loss": 0.0512,
      "num_tokens": 10684325.0,
      "step": 870
    },
    {
      "epoch": 1.8382352941176472,
      "grad_norm": 0.14043785631656647,
      "learning_rate": 2.1074676082593425e-05,
      "loss": 0.0507,
      "num_tokens": 10745533.0,
      "step": 875
    },
    {
      "epoch": 1.8487394957983194,
      "grad_norm": 0.17620113492012024,
      "learning_rate": 2.0825325434447106e-05,
      "loss": 0.0526,
      "num_tokens": 10806971.0,
      "step": 880
    },
    {
      "epoch": 1.8592436974789917,
      "grad_norm": 0.17084655165672302,
      "learning_rate": 2.0576870451322953e-05,
      "loss": 0.05,
      "num_tokens": 10868411.0,
      "step": 885
    },
    {
      "epoch": 1.8697478991596639,
      "grad_norm": 0.17954443395137787,
      "learning_rate": 2.032934447303938e-05,
      "loss": 0.0479,
      "num_tokens": 10929851.0,
      "step": 890
    },
    {
      "epoch": 1.8802521008403361,
      "grad_norm": 0.19004443287849426,
      "learning_rate": 2.0082780714752963e-05,
      "loss": 0.0516,
      "num_tokens": 10991291.0,
      "step": 895
    },
    {
      "epoch": 1.8907563025210083,
      "grad_norm": 0.1933521330356598,
      "learning_rate": 1.9837212262501382e-05,
      "loss": 0.0526,
      "num_tokens": 11052731.0,
      "step": 900
    },
    {
      "epoch": 1.9012605042016806,
      "grad_norm": 0.1794319450855255,
      "learning_rate": 1.9592672068763574e-05,
      "loss": 0.052,
      "num_tokens": 11114068.0,
      "step": 905
    },
    {
      "epoch": 1.9117647058823528,
      "grad_norm": 0.16216090321540833,
      "learning_rate": 1.934919294803798e-05,
      "loss": 0.0519,
      "num_tokens": 11175508.0,
      "step": 910
    },
    {
      "epoch": 1.9222689075630253,
      "grad_norm": 0.19467321038246155,
      "learning_rate": 1.9106807572439168e-05,
      "loss": 0.0506,
      "num_tokens": 11236948.0,
      "step": 915
    },
    {
      "epoch": 1.9327731092436975,
      "grad_norm": 0.13739857077598572,
      "learning_rate": 1.88655484673136e-05,
      "loss": 0.0516,
      "num_tokens": 11298388.0,
      "step": 920
    },
    {
      "epoch": 1.9432773109243697,
      "grad_norm": 0.15686306357383728,
      "learning_rate": 1.8625448006875123e-05,
      "loss": 0.0505,
      "num_tokens": 11359828.0,
      "step": 925
    },
    {
      "epoch": 1.9537815126050422,
      "grad_norm": 0.12999138236045837,
      "learning_rate": 1.8386538409860708e-05,
      "loss": 0.051,
      "num_tokens": 11421268.0,
      "step": 930
    },
    {
      "epoch": 1.9642857142857144,
      "grad_norm": 0.18375808000564575,
      "learning_rate": 1.8148851735207083e-05,
      "loss": 0.0523,
      "num_tokens": 11482548.0,
      "step": 935
    },
    {
      "epoch": 1.9747899159663866,
      "grad_norm": 0.19671285152435303,
      "learning_rate": 1.791241987774876e-05,
      "loss": 0.0509,
      "num_tokens": 11543988.0,
      "step": 940
    },
    {
      "epoch": 1.9852941176470589,
      "grad_norm": 0.1805330216884613,
      "learning_rate": 1.7677274563938134e-05,
      "loss": 0.0503,
      "num_tokens": 11605268.0,
      "step": 945
    },
    {
      "epoch": 1.995798319327731,
      "grad_norm": 0.19455303251743317,
      "learning_rate": 1.744344734758814e-05,
      "loss": 0.0517,
      "num_tokens": 11666708.0,
      "step": 950
    },
    {
      "epoch": 2.0063025210084033,
      "grad_norm": 0.17816315591335297,
      "learning_rate": 1.721096960563812e-05,
      "loss": 0.0507,
      "num_tokens": 11728148.0,
      "step": 955
    },
    {
      "epoch": 2.0168067226890756,
      "grad_norm": 0.12756673991680145,
      "learning_rate": 1.697987253394337e-05,
      "loss": 0.0491,
      "num_tokens": 11789273.0,
      "step": 960
    },
    {
      "epoch": 2.027310924369748,
      "grad_norm": 0.19657427072525024,
      "learning_rate": 1.675018714308902e-05,
      "loss": 0.0504,
      "num_tokens": 11850713.0,
      "step": 965
    },
    {
      "epoch": 2.03781512605042,
      "grad_norm": 0.1950300633907318,
      "learning_rate": 1.652194425422878e-05,
      "loss": 0.0505,
      "num_tokens": 11912153.0,
      "step": 970
    },
    {
      "epoch": 2.0483193277310923,
      "grad_norm": 0.16631367802619934,
      "learning_rate": 1.629517449494906e-05,
      "loss": 0.0502,
      "num_tokens": 11973593.0,
      "step": 975
    },
    {
      "epoch": 2.0588235294117645,
      "grad_norm": 0.17350395023822784,
      "learning_rate": 1.6069908295159146e-05,
      "loss": 0.0526,
      "num_tokens": 12035033.0,
      "step": 980
    },
    {
      "epoch": 2.069327731092437,
      "grad_norm": 0.18997882306575775,
      "learning_rate": 1.5846175883007815e-05,
      "loss": 0.0493,
      "num_tokens": 12096473.0,
      "step": 985
    },
    {
      "epoch": 2.0798319327731094,
      "grad_norm": 0.1386975198984146,
      "learning_rate": 1.562400728082709e-05,
      "loss": 0.0497,
      "num_tokens": 12157913.0,
      "step": 990
    },
    {
      "epoch": 2.0903361344537816,
      "grad_norm": 0.1656985878944397,
      "learning_rate": 1.540343230110354e-05,
      "loss": 0.0509,
      "num_tokens": 12219353.0,
      "step": 995
    },
    {
      "epoch": 2.100840336134454,
      "grad_norm": 0.19251607358455658,
      "learning_rate": 1.5184480542477869e-05,
      "loss": 0.0503,
      "num_tokens": 12280793.0,
      "step": 1000
    },
    {
      "epoch": 2.111344537815126,
      "grad_norm": 0.17274506390094757,
      "learning_rate": 1.4967181385773022e-05,
      "loss": 0.0491,
      "num_tokens": 12342004.0,
      "step": 1005
    },
    {
      "epoch": 2.1218487394957983,
      "grad_norm": 0.20883677899837494,
      "learning_rate": 1.4751563990051675e-05,
      "loss": 0.0495,
      "num_tokens": 12403444.0,
      "step": 1010
    },
    {
      "epoch": 2.1323529411764706,
      "grad_norm": 0.20437228679656982,
      "learning_rate": 1.453765728870343e-05,
      "loss": 0.0514,
      "num_tokens": 12464884.0,
      "step": 1015
    },
    {
      "epoch": 2.142857142857143,
      "grad_norm": 0.20462237298488617,
      "learning_rate": 1.432548998556221e-05,
      "loss": 0.051,
      "num_tokens": 12526175.0,
      "step": 1020
    },
    {
      "epoch": 2.153361344537815,
      "grad_norm": 0.2599621117115021,
      "learning_rate": 1.4115090551054622e-05,
      "loss": 0.0517,
      "num_tokens": 12587615.0,
      "step": 1025
    },
    {
      "epoch": 2.1638655462184873,
      "grad_norm": 0.1801358163356781,
      "learning_rate": 1.3906487218379504e-05,
      "loss": 0.0499,
      "num_tokens": 12649055.0,
      "step": 1030
    },
    {
      "epoch": 2.1743697478991595,
      "grad_norm": 0.1843215674161911,
      "learning_rate": 1.3699707979719357e-05,
      "loss": 0.0513,
      "num_tokens": 12710459.0,
      "step": 1035
    },
    {
      "epoch": 2.184873949579832,
      "grad_norm": 0.19053132832050323,
      "learning_rate": 1.3494780582484126e-05,
      "loss": 0.0496,
      "num_tokens": 12771899.0,
      "step": 1040
    },
    {
      "epoch": 2.1953781512605044,
      "grad_norm": 0.15285778045654297,
      "learning_rate": 1.329173252558779e-05,
      "loss": 0.0497,
      "num_tokens": 12833339.0,
      "step": 1045
    },
    {
      "epoch": 2.2058823529411766,
      "grad_norm": 0.14396464824676514,
      "learning_rate": 1.3090591055758356e-05,
      "loss": 0.0507,
      "num_tokens": 12894779.0,
      "step": 1050
    },
    {
      "epoch": 2.216386554621849,
      "grad_norm": 0.14991876482963562,
      "learning_rate": 1.2891383163881633e-05,
      "loss": 0.05,
      "num_tokens": 12956219.0,
      "step": 1055
    },
    {
      "epoch": 2.226890756302521,
      "grad_norm": 0.14839011430740356,
      "learning_rate": 1.2694135581379383e-05,
      "loss": 0.0499,
      "num_tokens": 13017659.0,
      "step": 1060
    },
    {
      "epoch": 2.2373949579831933,
      "grad_norm": 0.12264993786811829,
      "learning_rate": 1.2498874776622246e-05,
      "loss": 0.0462,
      "num_tokens": 13079099.0,
      "step": 1065
    },
    {
      "epoch": 2.2478991596638656,
      "grad_norm": 0.1659439504146576,
      "learning_rate": 1.2305626951378019e-05,
      "loss": 0.0492,
      "num_tokens": 13140539.0,
      "step": 1070
    },
    {
      "epoch": 2.258403361344538,
      "grad_norm": 0.16605842113494873,
      "learning_rate": 1.2114418037295636e-05,
      "loss": 0.0502,
      "num_tokens": 13201979.0,
      "step": 1075
    },
    {
      "epoch": 2.26890756302521,
      "grad_norm": 0.16009701788425446,
      "learning_rate": 1.1925273692425487e-05,
      "loss": 0.0496,
      "num_tokens": 13263419.0,
      "step": 1080
    },
    {
      "epoch": 2.2794117647058822,
      "grad_norm": 0.1512678861618042,
      "learning_rate": 1.1738219297776371e-05,
      "loss": 0.0497,
      "num_tokens": 13324859.0,
      "step": 1085
    },
    {
      "epoch": 2.2899159663865545,
      "grad_norm": 0.18362964689731598,
      "learning_rate": 1.1553279953909641e-05,
      "loss": 0.0485,
      "num_tokens": 13386299.0,
      "step": 1090
    },
    {
      "epoch": 2.3004201680672267,
      "grad_norm": 0.15746116638183594,
      "learning_rate": 1.1370480477571029e-05,
      "loss": 0.0503,
      "num_tokens": 13447730.0,
      "step": 1095
    },
    {
      "epoch": 2.310924369747899,
      "grad_norm": 0.2701464891433716,
      "learning_rate": 1.118984539836051e-05,
      "loss": 0.0521,
      "num_tokens": 13509170.0,
      "step": 1100
    },
    {
      "epoch": 2.3214285714285716,
      "grad_norm": 0.18647603690624237,
      "learning_rate": 1.1011398955440702e-05,
      "loss": 0.0498,
      "num_tokens": 13570409.0,
      "step": 1105
    },
    {
      "epoch": 2.331932773109244,
      "grad_norm": 0.12975195050239563,
      "learning_rate": 1.0835165094284264e-05,
      "loss": 0.0507,
      "num_tokens": 13631849.0,
      "step": 1110
    },
    {
      "epoch": 2.342436974789916,
      "grad_norm": 0.15623484551906586,
      "learning_rate": 1.066116746346065e-05,
      "loss": 0.0499,
      "num_tokens": 13693289.0,
      "step": 1115
    },
    {
      "epoch": 2.3529411764705883,
      "grad_norm": 0.1415032297372818,
      "learning_rate": 1.0489429411462794e-05,
      "loss": 0.05,
      "num_tokens": 13754729.0,
      "step": 1120
    },
    {
      "epoch": 2.3634453781512605,
      "grad_norm": 0.188720241189003,
      "learning_rate": 1.0319973983573971e-05,
      "loss": 0.053,
      "num_tokens": 13816169.0,
      "step": 1125
    },
    {
      "epoch": 2.3739495798319328,
      "grad_norm": 0.19719360768795013,
      "learning_rate": 1.0152823918775408e-05,
      "loss": 0.0503,
      "num_tokens": 13877609.0,
      "step": 1130
    },
    {
      "epoch": 2.384453781512605,
      "grad_norm": 0.1669566035270691,
      "learning_rate": 9.988001646694935e-06,
      "loss": 0.0499,
      "num_tokens": 13939049.0,
      "step": 1135
    },
    {
      "epoch": 2.3949579831932772,
      "grad_norm": 0.22400623559951782,
      "learning_rate": 9.825529284597238e-06,
      "loss": 0.0534,
      "num_tokens": 14000489.0,
      "step": 1140
    },
    {
      "epoch": 2.4054621848739495,
      "grad_norm": 0.15708568692207336,
      "learning_rate": 9.665428634415923e-06,
      "loss": 0.0499,
      "num_tokens": 14061697.0,
      "step": 1145
    },
    {
      "epoch": 2.4159663865546217,
      "grad_norm": 0.16055414080619812,
      "learning_rate": 9.50772117982799e-06,
      "loss": 0.0506,
      "num_tokens": 14123137.0,
      "step": 1150
    },
    {
      "epoch": 2.426470588235294,
      "grad_norm": 0.14245997369289398,
      "learning_rate": 9.352428083370946e-06,
      "loss": 0.0497,
      "num_tokens": 14184577.0,
      "step": 1155
    },
    {
      "epoch": 2.4369747899159666,
      "grad_norm": 0.14547857642173767,
      "learning_rate": 9.199570183603021e-06,
      "loss": 0.0501,
      "num_tokens": 14246017.0,
      "step": 1160
    },
    {
      "epoch": 2.447478991596639,
      "grad_norm": 0.17205478250980377,
      "learning_rate": 9.049167992306908e-06,
      "loss": 0.0501,
      "num_tokens": 14307457.0,
      "step": 1165
    },
    {
      "epoch": 2.457983193277311,
      "grad_norm": 0.16485774517059326,
      "learning_rate": 8.901241691737286e-06,
      "loss": 0.0499,
      "num_tokens": 14368897.0,
      "step": 1170
    },
    {
      "epoch": 2.4684873949579833,
      "grad_norm": 0.1967056393623352,
      "learning_rate": 8.755811131912612e-06,
      "loss": 0.051,
      "num_tokens": 14430337.0,
      "step": 1175
    },
    {
      "epoch": 2.4789915966386555,
      "grad_norm": 0.14919425547122955,
      "learning_rate": 8.612895827951451e-06,
      "loss": 0.0495,
      "num_tokens": 14491744.0,
      "step": 1180
    },
    {
      "epoch": 2.4894957983193278,
      "grad_norm": 0.1448267251253128,
      "learning_rate": 8.472514957453801e-06,
      "loss": 0.0512,
      "num_tokens": 14553007.0,
      "step": 1185
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.17768217623233795,
      "learning_rate": 8.33468735792765e-06,
      "loss": 0.0501,
      "num_tokens": 14614447.0,
      "step": 1190
    },
    {
      "epoch": 2.5105042016806722,
      "grad_norm": 0.1507992148399353,
      "learning_rate": 8.199431524261223e-06,
      "loss": 0.0503,
      "num_tokens": 14675727.0,
      "step": 1195
    },
    {
      "epoch": 2.5210084033613445,
      "grad_norm": 0.16350729763507843,
      "learning_rate": 8.066765606241163e-06,
      "loss": 0.0496,
      "num_tokens": 14737165.0,
      "step": 1200
    },
    {
      "epoch": 2.5315126050420167,
      "grad_norm": 0.16035616397857666,
      "learning_rate": 7.936707406117028e-06,
      "loss": 0.0488,
      "num_tokens": 14798605.0,
      "step": 1205
    },
    {
      "epoch": 2.542016806722689,
      "grad_norm": 0.1894913911819458,
      "learning_rate": 7.809274376212464e-06,
      "loss": 0.0508,
      "num_tokens": 14859883.0,
      "step": 1210
    },
    {
      "epoch": 2.552521008403361,
      "grad_norm": 0.1903340071439743,
      "learning_rate": 7.68448361658327e-06,
      "loss": 0.0488,
      "num_tokens": 14921105.0,
      "step": 1215
    },
    {
      "epoch": 2.5630252100840334,
      "grad_norm": 0.14671629667282104,
      "learning_rate": 7.5623518727227975e-06,
      "loss": 0.0495,
      "num_tokens": 14982545.0,
      "step": 1220
    },
    {
      "epoch": 2.5735294117647056,
      "grad_norm": 0.16081440448760986,
      "learning_rate": 7.442895533314856e-06,
      "loss": 0.0473,
      "num_tokens": 15043985.0,
      "step": 1225
    },
    {
      "epoch": 2.5840336134453783,
      "grad_norm": 0.1555902659893036,
      "learning_rate": 7.326130628034581e-06,
      "loss": 0.0492,
      "num_tokens": 15105425.0,
      "step": 1230
    },
    {
      "epoch": 2.5945378151260505,
      "grad_norm": 0.1796170324087143,
      "learning_rate": 7.212072825397413e-06,
      "loss": 0.0497,
      "num_tokens": 15166865.0,
      "step": 1235
    },
    {
      "epoch": 2.6050420168067228,
      "grad_norm": 0.13445882499217987,
      "learning_rate": 7.100737430656561e-06,
      "loss": 0.0494,
      "num_tokens": 15228139.0,
      "step": 1240
    },
    {
      "epoch": 2.615546218487395,
      "grad_norm": 0.18797667324543,
      "learning_rate": 6.992139383749224e-06,
      "loss": 0.0499,
      "num_tokens": 15289579.0,
      "step": 1245
    },
    {
      "epoch": 2.6260504201680672,
      "grad_norm": 0.1478380262851715,
      "learning_rate": 6.886293257291801e-06,
      "loss": 0.0503,
      "num_tokens": 15351019.0,
      "step": 1250
    },
    {
      "epoch": 2.6365546218487395,
      "grad_norm": 0.19320227205753326,
      "learning_rate": 6.78321325462444e-06,
      "loss": 0.0486,
      "num_tokens": 15412459.0,
      "step": 1255
    },
    {
      "epoch": 2.6470588235294117,
      "grad_norm": 0.18944524228572845,
      "learning_rate": 6.682913207905095e-06,
      "loss": 0.0496,
      "num_tokens": 15473796.0,
      "step": 1260
    },
    {
      "epoch": 2.657563025210084,
      "grad_norm": 0.17592737078666687,
      "learning_rate": 6.585406576253404e-06,
      "loss": 0.0501,
      "num_tokens": 15535236.0,
      "step": 1265
    },
    {
      "epoch": 2.668067226890756,
      "grad_norm": 0.18211396038532257,
      "learning_rate": 6.490706443944656e-06,
      "loss": 0.0491,
      "num_tokens": 15596676.0,
      "step": 1270
    },
    {
      "epoch": 2.678571428571429,
      "grad_norm": 0.1536846160888672,
      "learning_rate": 6.398825518653992e-06,
      "loss": 0.05,
      "num_tokens": 15658116.0,
      "step": 1275
    },
    {
      "epoch": 2.689075630252101,
      "grad_norm": 0.18677380681037903,
      "learning_rate": 6.30977612975121e-06,
      "loss": 0.0493,
      "num_tokens": 15719399.0,
      "step": 1280
    },
    {
      "epoch": 2.6995798319327733,
      "grad_norm": 0.1490916907787323,
      "learning_rate": 6.223570226646291e-06,
      "loss": 0.0514,
      "num_tokens": 15780839.0,
      "step": 1285
    },
    {
      "epoch": 2.7100840336134455,
      "grad_norm": 0.15238384902477264,
      "learning_rate": 6.140219377185933e-06,
      "loss": 0.05,
      "num_tokens": 15842274.0,
      "step": 1290
    },
    {
      "epoch": 2.7205882352941178,
      "grad_norm": 0.15011648833751678,
      "learning_rate": 6.0597347661012635e-06,
      "loss": 0.0493,
      "num_tokens": 15903714.0,
      "step": 1295
    },
    {
      "epoch": 2.73109243697479,
      "grad_norm": 0.1596149504184723,
      "learning_rate": 5.982127193507003e-06,
      "loss": 0.0494,
      "num_tokens": 15965148.0,
      "step": 1300
    },
    {
      "epoch": 2.741596638655462,
      "grad_norm": 0.16487446427345276,
      "learning_rate": 5.907407073452186e-06,
      "loss": 0.0506,
      "num_tokens": 16026588.0,
      "step": 1305
    },
    {
      "epoch": 2.7521008403361344,
      "grad_norm": 0.1454056352376938,
      "learning_rate": 5.835584432522727e-06,
      "loss": 0.0492,
      "num_tokens": 16088028.0,
      "step": 1310
    },
    {
      "epoch": 2.7626050420168067,
      "grad_norm": 0.16204313933849335,
      "learning_rate": 5.766668908495966e-06,
      "loss": 0.0509,
      "num_tokens": 16149468.0,
      "step": 1315
    },
    {
      "epoch": 2.773109243697479,
      "grad_norm": 0.18910805881023407,
      "learning_rate": 5.700669749047387e-06,
      "loss": 0.0489,
      "num_tokens": 16210908.0,
      "step": 1320
    },
    {
      "epoch": 2.783613445378151,
      "grad_norm": 0.17493724822998047,
      "learning_rate": 5.637595810509689e-06,
      "loss": 0.05,
      "num_tokens": 16272348.0,
      "step": 1325
    },
    {
      "epoch": 2.7941176470588234,
      "grad_norm": 0.14875848591327667,
      "learning_rate": 5.577455556684369e-06,
      "loss": 0.049,
      "num_tokens": 16333788.0,
      "step": 1330
    },
    {
      "epoch": 2.8046218487394956,
      "grad_norm": 0.1500275731086731,
      "learning_rate": 5.520257057705971e-06,
      "loss": 0.0498,
      "num_tokens": 16395228.0,
      "step": 1335
    },
    {
      "epoch": 2.815126050420168,
      "grad_norm": 0.1598060131072998,
      "learning_rate": 5.466007988959163e-06,
      "loss": 0.0507,
      "num_tokens": 16456417.0,
      "step": 1340
    },
    {
      "epoch": 2.82563025210084,
      "grad_norm": 0.1572778970003128,
      "learning_rate": 5.414715630048797e-06,
      "loss": 0.051,
      "num_tokens": 16517857.0,
      "step": 1345
    },
    {
      "epoch": 2.8361344537815127,
      "grad_norm": 0.15581144392490387,
      "learning_rate": 5.366386863823077e-06,
      "loss": 0.0499,
      "num_tokens": 16579297.0,
      "step": 1350
    },
    {
      "epoch": 2.846638655462185,
      "grad_norm": 0.18151573836803436,
      "learning_rate": 5.3210281754499284e-06,
      "loss": 0.0496,
      "num_tokens": 16640737.0,
      "step": 1355
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 0.1601044237613678,
      "learning_rate": 5.278645651546797e-06,
      "loss": 0.0487,
      "num_tokens": 16702177.0,
      "step": 1360
    },
    {
      "epoch": 2.8676470588235294,
      "grad_norm": 0.1497681736946106,
      "learning_rate": 5.239244979363877e-06,
      "loss": 0.0492,
      "num_tokens": 16763617.0,
      "step": 1365
    },
    {
      "epoch": 2.8781512605042017,
      "grad_norm": 0.15907803177833557,
      "learning_rate": 5.202831446020945e-06,
      "loss": 0.0502,
      "num_tokens": 16824905.0,
      "step": 1370
    },
    {
      "epoch": 2.888655462184874,
      "grad_norm": 0.17641125619411469,
      "learning_rate": 5.169409937797901e-06,
      "loss": 0.0502,
      "num_tokens": 16886146.0,
      "step": 1375
    },
    {
      "epoch": 2.899159663865546,
      "grad_norm": 0.12964367866516113,
      "learning_rate": 5.138984939479077e-06,
      "loss": 0.0487,
      "num_tokens": 16947586.0,
      "step": 1380
    },
    {
      "epoch": 2.9096638655462184,
      "grad_norm": 0.14473247528076172,
      "learning_rate": 5.111560533751426e-06,
      "loss": 0.0491,
      "num_tokens": 17009026.0,
      "step": 1385
    },
    {
      "epoch": 2.9201680672268906,
      "grad_norm": 0.18652838468551636,
      "learning_rate": 5.087140400656684e-06,
      "loss": 0.0506,
      "num_tokens": 17070466.0,
      "step": 1390
    },
    {
      "epoch": 2.9306722689075633,
      "grad_norm": 0.18603888154029846,
      "learning_rate": 5.065727817097544e-06,
      "loss": 0.0492,
      "num_tokens": 17131779.0,
      "step": 1395
    },
    {
      "epoch": 2.9411764705882355,
      "grad_norm": 0.15723615884780884,
      "learning_rate": 5.047325656397932e-06,
      "loss": 0.0494,
      "num_tokens": 17193063.0,
      "step": 1400
    },
    {
      "epoch": 2.9516806722689077,
      "grad_norm": 0.14798587560653687,
      "learning_rate": 5.031936387917442e-06,
      "loss": 0.049,
      "num_tokens": 17254503.0,
      "step": 1405
    },
    {
      "epoch": 2.96218487394958,
      "grad_norm": 0.19435246288776398,
      "learning_rate": 5.019562076719972e-06,
      "loss": 0.0494,
      "num_tokens": 17315742.0,
      "step": 1410
    },
    {
      "epoch": 2.972689075630252,
      "grad_norm": 0.17056235671043396,
      "learning_rate": 5.0102043832966236e-06,
      "loss": 0.0493,
      "num_tokens": 17377182.0,
      "step": 1415
    },
    {
      "epoch": 2.9831932773109244,
      "grad_norm": 0.12487131357192993,
      "learning_rate": 5.003864563342878e-06,
      "loss": 0.0477,
      "num_tokens": 17438622.0,
      "step": 1420
    },
    {
      "epoch": 2.9936974789915967,
      "grad_norm": 0.13770359754562378,
      "learning_rate": 5.0005434675900966e-06,
      "loss": 0.0477,
      "num_tokens": 17500062.0,
      "step": 1425
    },
    {
      "epoch": 3.0,
      "num_tokens": 17536926.0,
      "step": 1428,
      "total_flos": 7.444201440207176e+17,
      "train_loss": 0.06586769079210378,
      "train_runtime": 7485.0094,
      "train_samples_per_second": 9.147,
      "train_steps_per_second": 0.191
    }
  ],
  "logging_steps": 5,
  "max_steps": 1428,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7.444201440207176e+17,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}