{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1074, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004657661853749418, "grad_norm": 27.66617259818207, "learning_rate": 5.813953488372093e-07, "loss": 1.2864, "step": 5 }, { "epoch": 0.009315323707498836, "grad_norm": 19.626216359351606, "learning_rate": 1.1627906976744186e-06, "loss": 1.2277, "step": 10 }, { "epoch": 0.013972985561248253, "grad_norm": 7.689899823367007, "learning_rate": 1.744186046511628e-06, "loss": 1.0529, "step": 15 }, { "epoch": 0.018630647414997672, "grad_norm": 4.9632975953671465, "learning_rate": 2.325581395348837e-06, "loss": 0.907, "step": 20 }, { "epoch": 0.02328830926874709, "grad_norm": 2.000237294817273, "learning_rate": 2.9069767441860468e-06, "loss": 0.7884, "step": 25 }, { "epoch": 0.027945971122496506, "grad_norm": 1.3650044798865562, "learning_rate": 3.488372093023256e-06, "loss": 0.7091, "step": 30 }, { "epoch": 0.032603632976245925, "grad_norm": 1.0145438785392298, "learning_rate": 4.0697674418604655e-06, "loss": 0.6666, "step": 35 }, { "epoch": 0.037261294829995344, "grad_norm": 0.7666883370082983, "learning_rate": 4.651162790697674e-06, "loss": 0.6289, "step": 40 }, { "epoch": 0.04191895668374476, "grad_norm": 0.6977041513064055, "learning_rate": 5.232558139534884e-06, "loss": 0.5772, "step": 45 }, { "epoch": 0.04657661853749418, "grad_norm": 0.6177135531852819, "learning_rate": 5.8139534883720935e-06, "loss": 0.5729, "step": 50 }, { "epoch": 0.05123428039124359, "grad_norm": 0.5909988418524453, "learning_rate": 6.395348837209303e-06, "loss": 0.5555, "step": 55 }, { "epoch": 0.05589194224499301, "grad_norm": 0.6995185567241584, "learning_rate": 6.976744186046512e-06, "loss": 0.5472, "step": 60 }, { "epoch": 0.06054960409874243, "grad_norm": 0.7744854841742352, "learning_rate": 7.558139534883721e-06, "loss": 0.5379, "step": 65 }, { "epoch": 0.06520726595249185, "grad_norm": 0.5473128068676195, "learning_rate": 8.139534883720931e-06, "loss": 0.5149, "step": 70 }, { "epoch": 0.06986492780624126, "grad_norm": 0.711636506166052, "learning_rate": 8.72093023255814e-06, "loss": 0.5151, "step": 75 }, { "epoch": 0.07452258965999069, "grad_norm": 0.6813906468319959, "learning_rate": 9.302325581395349e-06, "loss": 0.5252, "step": 80 }, { "epoch": 0.0791802515137401, "grad_norm": 0.7134287439415516, "learning_rate": 9.883720930232558e-06, "loss": 0.5024, "step": 85 }, { "epoch": 0.08383791336748952, "grad_norm": 0.6992933648203333, "learning_rate": 1.0465116279069768e-05, "loss": 0.5085, "step": 90 }, { "epoch": 0.08849557522123894, "grad_norm": 0.6462820616067007, "learning_rate": 1.1046511627906977e-05, "loss": 0.5129, "step": 95 }, { "epoch": 0.09315323707498836, "grad_norm": 0.67397577790992, "learning_rate": 1.1627906976744187e-05, "loss": 0.5034, "step": 100 }, { "epoch": 0.09781089892873777, "grad_norm": 0.7146380645525483, "learning_rate": 1.2209302325581395e-05, "loss": 0.506, "step": 105 }, { "epoch": 0.10246856078248719, "grad_norm": 0.7569877075727182, "learning_rate": 1.2790697674418606e-05, "loss": 0.4939, "step": 110 }, { "epoch": 0.10712622263623661, "grad_norm": 0.5765767741771239, "learning_rate": 1.3372093023255814e-05, "loss": 0.4932, "step": 115 }, { "epoch": 0.11178388448998602, "grad_norm": 0.7689970599513694, "learning_rate": 1.3953488372093024e-05, "loss": 0.5041, "step": 120 }, { "epoch": 0.11644154634373545, "grad_norm": 0.7710814056961762, "learning_rate": 1.4534883720930233e-05, "loss": 0.4846, "step": 125 }, { "epoch": 0.12109920819748486, "grad_norm": 0.8067799411588391, "learning_rate": 1.5116279069767441e-05, "loss": 0.4868, "step": 130 }, { "epoch": 0.1257568700512343, "grad_norm": 0.7337587312339595, "learning_rate": 1.569767441860465e-05, "loss": 0.4937, "step": 135 }, { "epoch": 0.1304145319049837, "grad_norm": 0.8156553975576142, "learning_rate": 1.6279069767441862e-05, "loss": 0.4871, "step": 140 }, { "epoch": 0.1350721937587331, "grad_norm": 0.9350450002033659, "learning_rate": 1.686046511627907e-05, "loss": 0.4972, "step": 145 }, { "epoch": 0.13972985561248252, "grad_norm": 0.7681334266478443, "learning_rate": 1.744186046511628e-05, "loss": 0.4907, "step": 150 }, { "epoch": 0.14438751746623196, "grad_norm": 0.84555038297673, "learning_rate": 1.802325581395349e-05, "loss": 0.4723, "step": 155 }, { "epoch": 0.14904517931998137, "grad_norm": 0.8372039362868026, "learning_rate": 1.8604651162790697e-05, "loss": 0.4793, "step": 160 }, { "epoch": 0.1537028411737308, "grad_norm": 0.7817304569895932, "learning_rate": 1.918604651162791e-05, "loss": 0.4798, "step": 165 }, { "epoch": 0.1583605030274802, "grad_norm": 0.9789922595882735, "learning_rate": 1.9767441860465116e-05, "loss": 0.4783, "step": 170 }, { "epoch": 0.1630181648812296, "grad_norm": 0.7335091977994727, "learning_rate": 2.0348837209302328e-05, "loss": 0.4893, "step": 175 }, { "epoch": 0.16767582673497905, "grad_norm": 0.8155578488051264, "learning_rate": 2.0930232558139536e-05, "loss": 0.484, "step": 180 }, { "epoch": 0.17233348858872846, "grad_norm": 0.8144677339935327, "learning_rate": 2.1511627906976744e-05, "loss": 0.482, "step": 185 }, { "epoch": 0.17699115044247787, "grad_norm": 0.8341691871989111, "learning_rate": 2.2093023255813955e-05, "loss": 0.4753, "step": 190 }, { "epoch": 0.18164881229622729, "grad_norm": 0.7550827853742104, "learning_rate": 2.2674418604651163e-05, "loss": 0.4811, "step": 195 }, { "epoch": 0.18630647414997673, "grad_norm": 0.726150592947813, "learning_rate": 2.3255813953488374e-05, "loss": 0.4548, "step": 200 }, { "epoch": 0.19096413600372614, "grad_norm": 0.9618454710454514, "learning_rate": 2.3837209302325582e-05, "loss": 0.4765, "step": 205 }, { "epoch": 0.19562179785747555, "grad_norm": 0.8357087005031101, "learning_rate": 2.441860465116279e-05, "loss": 0.4664, "step": 210 }, { "epoch": 0.20027945971122496, "grad_norm": 0.9769412255461828, "learning_rate": 2.5e-05, "loss": 0.4772, "step": 215 }, { "epoch": 0.20493712156497437, "grad_norm": 0.819002974551843, "learning_rate": 2.5581395348837212e-05, "loss": 0.4731, "step": 220 }, { "epoch": 0.2095947834187238, "grad_norm": 1.0374332843969527, "learning_rate": 2.616279069767442e-05, "loss": 0.4763, "step": 225 }, { "epoch": 0.21425244527247322, "grad_norm": 0.8985668587375348, "learning_rate": 2.674418604651163e-05, "loss": 0.4745, "step": 230 }, { "epoch": 0.21891010712622264, "grad_norm": 1.0814445549381904, "learning_rate": 2.7325581395348836e-05, "loss": 0.4695, "step": 235 }, { "epoch": 0.22356776897997205, "grad_norm": 1.0714624697860875, "learning_rate": 2.7906976744186048e-05, "loss": 0.4687, "step": 240 }, { "epoch": 0.22822543083372146, "grad_norm": 0.9839374258594388, "learning_rate": 2.848837209302326e-05, "loss": 0.4598, "step": 245 }, { "epoch": 0.2328830926874709, "grad_norm": 0.9846036088169035, "learning_rate": 2.9069767441860467e-05, "loss": 0.4569, "step": 250 }, { "epoch": 0.2375407545412203, "grad_norm": 0.7487150924303477, "learning_rate": 2.9651162790697678e-05, "loss": 0.4676, "step": 255 }, { "epoch": 0.24219841639496972, "grad_norm": 0.8226804932307876, "learning_rate": 3.0232558139534883e-05, "loss": 0.4722, "step": 260 }, { "epoch": 0.24685607824871914, "grad_norm": 0.7711022626726491, "learning_rate": 3.081395348837209e-05, "loss": 0.4658, "step": 265 }, { "epoch": 0.2515137401024686, "grad_norm": 0.8932122698414526, "learning_rate": 3.13953488372093e-05, "loss": 0.4557, "step": 270 }, { "epoch": 0.25617140195621796, "grad_norm": 0.7549553477075355, "learning_rate": 3.197674418604651e-05, "loss": 0.4716, "step": 275 }, { "epoch": 0.2608290638099674, "grad_norm": 1.203597171762952, "learning_rate": 3.2558139534883724e-05, "loss": 0.4602, "step": 280 }, { "epoch": 0.26548672566371684, "grad_norm": 0.6877770638858097, "learning_rate": 3.313953488372093e-05, "loss": 0.4589, "step": 285 }, { "epoch": 0.2701443875174662, "grad_norm": 0.9865065100124113, "learning_rate": 3.372093023255814e-05, "loss": 0.4577, "step": 290 }, { "epoch": 0.27480204937121566, "grad_norm": 0.9375168702189124, "learning_rate": 3.430232558139535e-05, "loss": 0.4516, "step": 295 }, { "epoch": 0.27945971122496505, "grad_norm": 0.7788845536490606, "learning_rate": 3.488372093023256e-05, "loss": 0.4633, "step": 300 }, { "epoch": 0.2841173730787145, "grad_norm": 0.8098072656748624, "learning_rate": 3.5465116279069774e-05, "loss": 0.4691, "step": 305 }, { "epoch": 0.2887750349324639, "grad_norm": 1.3633357913402282, "learning_rate": 3.604651162790698e-05, "loss": 0.4662, "step": 310 }, { "epoch": 0.2934326967862133, "grad_norm": 0.8073757027557402, "learning_rate": 3.662790697674418e-05, "loss": 0.4551, "step": 315 }, { "epoch": 0.29809035863996275, "grad_norm": 1.0699898191361614, "learning_rate": 3.7209302325581394e-05, "loss": 0.4614, "step": 320 }, { "epoch": 0.30274802049371213, "grad_norm": 0.7424537685527064, "learning_rate": 3.7790697674418606e-05, "loss": 0.4637, "step": 325 }, { "epoch": 0.3074056823474616, "grad_norm": 1.0663211887340074, "learning_rate": 3.837209302325582e-05, "loss": 0.463, "step": 330 }, { "epoch": 0.312063344201211, "grad_norm": 0.9616713033039357, "learning_rate": 3.895348837209303e-05, "loss": 0.4483, "step": 335 }, { "epoch": 0.3167210060549604, "grad_norm": 0.9289014338139833, "learning_rate": 3.953488372093023e-05, "loss": 0.4618, "step": 340 }, { "epoch": 0.32137866790870984, "grad_norm": 0.9205940376448726, "learning_rate": 4.0116279069767444e-05, "loss": 0.4516, "step": 345 }, { "epoch": 0.3260363297624592, "grad_norm": 0.9475824983358404, "learning_rate": 4.0697674418604655e-05, "loss": 0.4545, "step": 350 }, { "epoch": 0.33069399161620866, "grad_norm": 1.315055782839884, "learning_rate": 4.127906976744187e-05, "loss": 0.452, "step": 355 }, { "epoch": 0.3353516534699581, "grad_norm": 1.068696802277167, "learning_rate": 4.186046511627907e-05, "loss": 0.4625, "step": 360 }, { "epoch": 0.3400093153237075, "grad_norm": 1.036508382207286, "learning_rate": 4.2441860465116276e-05, "loss": 0.4615, "step": 365 }, { "epoch": 0.3446669771774569, "grad_norm": 0.9771206427059291, "learning_rate": 4.302325581395349e-05, "loss": 0.472, "step": 370 }, { "epoch": 0.3493246390312063, "grad_norm": 0.9364995309041826, "learning_rate": 4.36046511627907e-05, "loss": 0.4662, "step": 375 }, { "epoch": 0.35398230088495575, "grad_norm": 0.8666798423814609, "learning_rate": 4.418604651162791e-05, "loss": 0.4513, "step": 380 }, { "epoch": 0.3586399627387052, "grad_norm": 0.6625516854940565, "learning_rate": 4.476744186046512e-05, "loss": 0.4458, "step": 385 }, { "epoch": 0.36329762459245457, "grad_norm": 0.5910994980517921, "learning_rate": 4.5348837209302326e-05, "loss": 0.4518, "step": 390 }, { "epoch": 0.367955286446204, "grad_norm": 0.8576114090853963, "learning_rate": 4.593023255813954e-05, "loss": 0.4472, "step": 395 }, { "epoch": 0.37261294829995345, "grad_norm": 1.059751439208873, "learning_rate": 4.651162790697675e-05, "loss": 0.4453, "step": 400 }, { "epoch": 0.37727061015370283, "grad_norm": 0.7913724415049757, "learning_rate": 4.709302325581396e-05, "loss": 0.4492, "step": 405 }, { "epoch": 0.3819282720074523, "grad_norm": 0.7471961729660666, "learning_rate": 4.7674418604651164e-05, "loss": 0.4559, "step": 410 }, { "epoch": 0.38658593386120166, "grad_norm": 0.7938017748454547, "learning_rate": 4.8255813953488375e-05, "loss": 0.4556, "step": 415 }, { "epoch": 0.3912435957149511, "grad_norm": 0.861398782650763, "learning_rate": 4.883720930232558e-05, "loss": 0.4457, "step": 420 }, { "epoch": 0.39590125756870054, "grad_norm": 1.164899725237036, "learning_rate": 4.941860465116279e-05, "loss": 0.4619, "step": 425 }, { "epoch": 0.4005589194224499, "grad_norm": 0.9327630997758065, "learning_rate": 5e-05, "loss": 0.4553, "step": 430 }, { "epoch": 0.40521658127619936, "grad_norm": 0.7755438754585425, "learning_rate": 4.9935266701191095e-05, "loss": 0.4589, "step": 435 }, { "epoch": 0.40987424312994875, "grad_norm": 0.6084430372051816, "learning_rate": 4.987053340238219e-05, "loss": 0.4659, "step": 440 }, { "epoch": 0.4145319049836982, "grad_norm": 0.706361406749313, "learning_rate": 4.980580010357328e-05, "loss": 0.4454, "step": 445 }, { "epoch": 0.4191895668374476, "grad_norm": 0.8310997319677449, "learning_rate": 4.9741066804764374e-05, "loss": 0.4492, "step": 450 }, { "epoch": 0.423847228691197, "grad_norm": 0.7992980724203008, "learning_rate": 4.967633350595546e-05, "loss": 0.4677, "step": 455 }, { "epoch": 0.42850489054494645, "grad_norm": 0.7848440239850348, "learning_rate": 4.961160020714656e-05, "loss": 0.4484, "step": 460 }, { "epoch": 0.43316255239869583, "grad_norm": 0.6875355874505797, "learning_rate": 4.954686690833765e-05, "loss": 0.4592, "step": 465 }, { "epoch": 0.43782021425244527, "grad_norm": 0.9019111936374453, "learning_rate": 4.948213360952874e-05, "loss": 0.4507, "step": 470 }, { "epoch": 0.4424778761061947, "grad_norm": 0.9237443867694993, "learning_rate": 4.941740031071983e-05, "loss": 0.46, "step": 475 }, { "epoch": 0.4471355379599441, "grad_norm": 0.7227765777311265, "learning_rate": 4.935266701191093e-05, "loss": 0.4448, "step": 480 }, { "epoch": 0.45179319981369354, "grad_norm": 0.7627976373615327, "learning_rate": 4.9287933713102025e-05, "loss": 0.4467, "step": 485 }, { "epoch": 0.4564508616674429, "grad_norm": 0.8963050574087497, "learning_rate": 4.922320041429311e-05, "loss": 0.45, "step": 490 }, { "epoch": 0.46110852352119236, "grad_norm": 0.629147905901097, "learning_rate": 4.915846711548421e-05, "loss": 0.4427, "step": 495 }, { "epoch": 0.4657661853749418, "grad_norm": 0.5883243359451029, "learning_rate": 4.9093733816675304e-05, "loss": 0.4491, "step": 500 }, { "epoch": 0.4704238472286912, "grad_norm": 0.5369867711481909, "learning_rate": 4.902900051786639e-05, "loss": 0.4435, "step": 505 }, { "epoch": 0.4750815090824406, "grad_norm": 0.7877373044239472, "learning_rate": 4.8964267219057483e-05, "loss": 0.4392, "step": 510 }, { "epoch": 0.47973917093619, "grad_norm": 0.5720363420014942, "learning_rate": 4.889953392024858e-05, "loss": 0.4502, "step": 515 }, { "epoch": 0.48439683278993945, "grad_norm": 0.802768212629227, "learning_rate": 4.883480062143967e-05, "loss": 0.4479, "step": 520 }, { "epoch": 0.4890544946436889, "grad_norm": 0.9089354308220536, "learning_rate": 4.877006732263076e-05, "loss": 0.4562, "step": 525 }, { "epoch": 0.49371215649743827, "grad_norm": 0.9062598882129868, "learning_rate": 4.8705334023821855e-05, "loss": 0.4433, "step": 530 }, { "epoch": 0.4983698183511877, "grad_norm": 1.0614931500809168, "learning_rate": 4.864060072501295e-05, "loss": 0.4509, "step": 535 }, { "epoch": 0.5030274802049371, "grad_norm": 0.6656921373680668, "learning_rate": 4.857586742620404e-05, "loss": 0.4526, "step": 540 }, { "epoch": 0.5076851420586865, "grad_norm": 0.761135804704884, "learning_rate": 4.8511134127395134e-05, "loss": 0.4428, "step": 545 }, { "epoch": 0.5123428039124359, "grad_norm": 0.7524291911003331, "learning_rate": 4.844640082858623e-05, "loss": 0.4559, "step": 550 }, { "epoch": 0.5170004657661854, "grad_norm": 0.7643694469846594, "learning_rate": 4.838166752977732e-05, "loss": 0.4441, "step": 555 }, { "epoch": 0.5216581276199348, "grad_norm": 0.6727074125902812, "learning_rate": 4.831693423096841e-05, "loss": 0.4482, "step": 560 }, { "epoch": 0.5263157894736842, "grad_norm": 0.6080795466448415, "learning_rate": 4.82522009321595e-05, "loss": 0.4464, "step": 565 }, { "epoch": 0.5309734513274337, "grad_norm": 0.7154575256811082, "learning_rate": 4.81874676333506e-05, "loss": 0.4456, "step": 570 }, { "epoch": 0.5356311131811831, "grad_norm": 0.5881145419877987, "learning_rate": 4.812273433454169e-05, "loss": 0.4429, "step": 575 }, { "epoch": 0.5402887750349324, "grad_norm": 0.5629504481988451, "learning_rate": 4.8058001035732785e-05, "loss": 0.4428, "step": 580 }, { "epoch": 0.5449464368886818, "grad_norm": 0.6740507784348432, "learning_rate": 4.799326773692387e-05, "loss": 0.4389, "step": 585 }, { "epoch": 0.5496040987424313, "grad_norm": 0.5590413217515213, "learning_rate": 4.792853443811497e-05, "loss": 0.4543, "step": 590 }, { "epoch": 0.5542617605961807, "grad_norm": 0.648175325079465, "learning_rate": 4.7863801139306064e-05, "loss": 0.4446, "step": 595 }, { "epoch": 0.5589194224499301, "grad_norm": 0.6553497135224098, "learning_rate": 4.779906784049715e-05, "loss": 0.4501, "step": 600 }, { "epoch": 0.5635770843036796, "grad_norm": 0.6606737255970081, "learning_rate": 4.773433454168825e-05, "loss": 0.4256, "step": 605 }, { "epoch": 0.568234746157429, "grad_norm": 0.7570935534531892, "learning_rate": 4.766960124287934e-05, "loss": 0.4434, "step": 610 }, { "epoch": 0.5728924080111784, "grad_norm": 0.6404868107239774, "learning_rate": 4.760486794407043e-05, "loss": 0.4368, "step": 615 }, { "epoch": 0.5775500698649279, "grad_norm": 0.6133747116202044, "learning_rate": 4.754013464526152e-05, "loss": 0.4389, "step": 620 }, { "epoch": 0.5822077317186772, "grad_norm": 1.0158420393440462, "learning_rate": 4.747540134645262e-05, "loss": 0.4492, "step": 625 }, { "epoch": 0.5868653935724266, "grad_norm": 0.6424938062971148, "learning_rate": 4.741066804764371e-05, "loss": 0.4364, "step": 630 }, { "epoch": 0.5915230554261761, "grad_norm": 0.5587540204090086, "learning_rate": 4.73459347488348e-05, "loss": 0.4353, "step": 635 }, { "epoch": 0.5961807172799255, "grad_norm": 0.710771955315232, "learning_rate": 4.7281201450025894e-05, "loss": 0.4401, "step": 640 }, { "epoch": 0.6008383791336749, "grad_norm": 0.8503719196899987, "learning_rate": 4.721646815121699e-05, "loss": 0.4401, "step": 645 }, { "epoch": 0.6054960409874243, "grad_norm": 0.6213357524921049, "learning_rate": 4.715173485240808e-05, "loss": 0.4377, "step": 650 }, { "epoch": 0.6101537028411738, "grad_norm": 0.728615015594888, "learning_rate": 4.708700155359917e-05, "loss": 0.4435, "step": 655 }, { "epoch": 0.6148113646949231, "grad_norm": 0.69572702326182, "learning_rate": 4.7022268254790266e-05, "loss": 0.4438, "step": 660 }, { "epoch": 0.6194690265486725, "grad_norm": 0.717769460394585, "learning_rate": 4.695753495598136e-05, "loss": 0.4382, "step": 665 }, { "epoch": 0.624126688402422, "grad_norm": 0.6390733598139207, "learning_rate": 4.689280165717245e-05, "loss": 0.4419, "step": 670 }, { "epoch": 0.6287843502561714, "grad_norm": 0.8175947498621025, "learning_rate": 4.6828068358363545e-05, "loss": 0.4338, "step": 675 }, { "epoch": 0.6334420121099208, "grad_norm": 0.6392191276829822, "learning_rate": 4.676333505955464e-05, "loss": 0.437, "step": 680 }, { "epoch": 0.6380996739636703, "grad_norm": 0.5385507245755475, "learning_rate": 4.669860176074573e-05, "loss": 0.448, "step": 685 }, { "epoch": 0.6427573358174197, "grad_norm": 0.779484565903713, "learning_rate": 4.6633868461936824e-05, "loss": 0.4388, "step": 690 }, { "epoch": 0.6474149976711691, "grad_norm": 0.609631979942144, "learning_rate": 4.656913516312791e-05, "loss": 0.4371, "step": 695 }, { "epoch": 0.6520726595249184, "grad_norm": 0.6612175365081071, "learning_rate": 4.650440186431901e-05, "loss": 0.4346, "step": 700 }, { "epoch": 0.6567303213786679, "grad_norm": 0.5776526816400351, "learning_rate": 4.64396685655101e-05, "loss": 0.4325, "step": 705 }, { "epoch": 0.6613879832324173, "grad_norm": 0.6777612372991866, "learning_rate": 4.637493526670119e-05, "loss": 0.4433, "step": 710 }, { "epoch": 0.6660456450861667, "grad_norm": 0.652431971316431, "learning_rate": 4.631020196789229e-05, "loss": 0.4383, "step": 715 }, { "epoch": 0.6707033069399162, "grad_norm": 0.8345742527824963, "learning_rate": 4.624546866908338e-05, "loss": 0.4297, "step": 720 }, { "epoch": 0.6753609687936656, "grad_norm": 0.5978893188286112, "learning_rate": 4.618073537027447e-05, "loss": 0.4353, "step": 725 }, { "epoch": 0.680018630647415, "grad_norm": 0.8328268112464421, "learning_rate": 4.611600207146556e-05, "loss": 0.4421, "step": 730 }, { "epoch": 0.6846762925011645, "grad_norm": 0.7087213010225971, "learning_rate": 4.605126877265666e-05, "loss": 0.4304, "step": 735 }, { "epoch": 0.6893339543549138, "grad_norm": 0.6869314447013355, "learning_rate": 4.598653547384775e-05, "loss": 0.4347, "step": 740 }, { "epoch": 0.6939916162086632, "grad_norm": 0.6167757431721599, "learning_rate": 4.592180217503884e-05, "loss": 0.4312, "step": 745 }, { "epoch": 0.6986492780624126, "grad_norm": 0.7676543887073451, "learning_rate": 4.585706887622993e-05, "loss": 0.4393, "step": 750 }, { "epoch": 0.7033069399161621, "grad_norm": 0.6961688773290436, "learning_rate": 4.5792335577421026e-05, "loss": 0.4295, "step": 755 }, { "epoch": 0.7079646017699115, "grad_norm": 0.5967737066278368, "learning_rate": 4.572760227861212e-05, "loss": 0.4317, "step": 760 }, { "epoch": 0.7126222636236609, "grad_norm": 0.5577548927242444, "learning_rate": 4.566286897980321e-05, "loss": 0.4388, "step": 765 }, { "epoch": 0.7172799254774104, "grad_norm": 0.6798109409577441, "learning_rate": 4.5598135680994305e-05, "loss": 0.438, "step": 770 }, { "epoch": 0.7219375873311598, "grad_norm": 0.7079083857791663, "learning_rate": 4.55334023821854e-05, "loss": 0.4266, "step": 775 }, { "epoch": 0.7265952491849091, "grad_norm": 0.8509226139438899, "learning_rate": 4.546866908337649e-05, "loss": 0.4427, "step": 780 }, { "epoch": 0.7312529110386586, "grad_norm": 0.7242979399552838, "learning_rate": 4.5403935784567584e-05, "loss": 0.4362, "step": 785 }, { "epoch": 0.735910572892408, "grad_norm": 0.5877311409433356, "learning_rate": 4.533920248575868e-05, "loss": 0.4284, "step": 790 }, { "epoch": 0.7405682347461574, "grad_norm": 0.6078912772108137, "learning_rate": 4.527446918694977e-05, "loss": 0.4347, "step": 795 }, { "epoch": 0.7452258965999069, "grad_norm": 0.5476766413302819, "learning_rate": 4.520973588814086e-05, "loss": 0.4308, "step": 800 }, { "epoch": 0.7498835584536563, "grad_norm": 0.5195378720227425, "learning_rate": 4.5145002589331956e-05, "loss": 0.4453, "step": 805 }, { "epoch": 0.7545412203074057, "grad_norm": 0.7360682617560098, "learning_rate": 4.508026929052305e-05, "loss": 0.431, "step": 810 }, { "epoch": 0.759198882161155, "grad_norm": 0.5685816437073101, "learning_rate": 4.501553599171414e-05, "loss": 0.4342, "step": 815 }, { "epoch": 0.7638565440149045, "grad_norm": 0.5972759336495264, "learning_rate": 4.495080269290523e-05, "loss": 0.4336, "step": 820 }, { "epoch": 0.7685142058686539, "grad_norm": 0.6728421551142247, "learning_rate": 4.488606939409633e-05, "loss": 0.4226, "step": 825 }, { "epoch": 0.7731718677224033, "grad_norm": 0.5582332660093027, "learning_rate": 4.482133609528742e-05, "loss": 0.4302, "step": 830 }, { "epoch": 0.7778295295761528, "grad_norm": 0.6980425901666201, "learning_rate": 4.475660279647851e-05, "loss": 0.4372, "step": 835 }, { "epoch": 0.7824871914299022, "grad_norm": 0.7390227270424299, "learning_rate": 4.46918694976696e-05, "loss": 0.4232, "step": 840 }, { "epoch": 0.7871448532836516, "grad_norm": 0.6485234571078656, "learning_rate": 4.46271361988607e-05, "loss": 0.4284, "step": 845 }, { "epoch": 0.7918025151374011, "grad_norm": 0.6336992788450944, "learning_rate": 4.4562402900051786e-05, "loss": 0.4307, "step": 850 }, { "epoch": 0.7964601769911505, "grad_norm": 0.7410255325128174, "learning_rate": 4.449766960124288e-05, "loss": 0.4236, "step": 855 }, { "epoch": 0.8011178388448998, "grad_norm": 0.5276824982854947, "learning_rate": 4.443293630243397e-05, "loss": 0.4252, "step": 860 }, { "epoch": 0.8057755006986492, "grad_norm": 0.5896389602903055, "learning_rate": 4.436820300362507e-05, "loss": 0.4284, "step": 865 }, { "epoch": 0.8104331625523987, "grad_norm": 0.5253923050441579, "learning_rate": 4.430346970481616e-05, "loss": 0.4227, "step": 870 }, { "epoch": 0.8150908244061481, "grad_norm": 0.5344210226388759, "learning_rate": 4.423873640600725e-05, "loss": 0.4321, "step": 875 }, { "epoch": 0.8197484862598975, "grad_norm": 0.5189955942586891, "learning_rate": 4.4174003107198344e-05, "loss": 0.4164, "step": 880 }, { "epoch": 0.824406148113647, "grad_norm": 0.505727185520852, "learning_rate": 4.410926980838944e-05, "loss": 0.4311, "step": 885 }, { "epoch": 0.8290638099673964, "grad_norm": 0.6952374910519269, "learning_rate": 4.404453650958053e-05, "loss": 0.4298, "step": 890 }, { "epoch": 0.8337214718211458, "grad_norm": 0.6334651402321975, "learning_rate": 4.397980321077162e-05, "loss": 0.4302, "step": 895 }, { "epoch": 0.8383791336748952, "grad_norm": 0.5871993814882748, "learning_rate": 4.3915069911962716e-05, "loss": 0.4243, "step": 900 }, { "epoch": 0.8430367955286446, "grad_norm": 0.562675211982263, "learning_rate": 4.385033661315381e-05, "loss": 0.4282, "step": 905 }, { "epoch": 0.847694457382394, "grad_norm": 0.55189342404869, "learning_rate": 4.37856033143449e-05, "loss": 0.4342, "step": 910 }, { "epoch": 0.8523521192361434, "grad_norm": 0.7717927793072482, "learning_rate": 4.3720870015535995e-05, "loss": 0.4262, "step": 915 }, { "epoch": 0.8570097810898929, "grad_norm": 0.545706766656389, "learning_rate": 4.365613671672709e-05, "loss": 0.4334, "step": 920 }, { "epoch": 0.8616674429436423, "grad_norm": 0.7308396494889845, "learning_rate": 4.359140341791818e-05, "loss": 0.4276, "step": 925 }, { "epoch": 0.8663251047973917, "grad_norm": 0.6334665220388306, "learning_rate": 4.352667011910927e-05, "loss": 0.4289, "step": 930 }, { "epoch": 0.8709827666511412, "grad_norm": 0.5789727565447382, "learning_rate": 4.346193682030037e-05, "loss": 0.4177, "step": 935 }, { "epoch": 0.8756404285048905, "grad_norm": 0.6071364036108049, "learning_rate": 4.339720352149146e-05, "loss": 0.4187, "step": 940 }, { "epoch": 0.8802980903586399, "grad_norm": 0.48355618067734446, "learning_rate": 4.3332470222682546e-05, "loss": 0.4222, "step": 945 }, { "epoch": 0.8849557522123894, "grad_norm": 0.757016952941287, "learning_rate": 4.326773692387364e-05, "loss": 0.4149, "step": 950 }, { "epoch": 0.8896134140661388, "grad_norm": 0.5970956354199685, "learning_rate": 4.320300362506474e-05, "loss": 0.4285, "step": 955 }, { "epoch": 0.8942710759198882, "grad_norm": 1.7486796656102368, "learning_rate": 4.313827032625583e-05, "loss": 0.4328, "step": 960 }, { "epoch": 0.8989287377736377, "grad_norm": 0.676896991938164, "learning_rate": 4.307353702744692e-05, "loss": 0.4278, "step": 965 }, { "epoch": 0.9035863996273871, "grad_norm": 0.5343157092353157, "learning_rate": 4.300880372863801e-05, "loss": 0.4298, "step": 970 }, { "epoch": 0.9082440614811365, "grad_norm": 0.6312289170160827, "learning_rate": 4.294407042982911e-05, "loss": 0.4237, "step": 975 }, { "epoch": 0.9129017233348858, "grad_norm": 0.7628441977751337, "learning_rate": 4.28793371310202e-05, "loss": 0.4266, "step": 980 }, { "epoch": 0.9175593851886353, "grad_norm": 1.0298339126452174, "learning_rate": 4.281460383221129e-05, "loss": 0.4308, "step": 985 }, { "epoch": 0.9222170470423847, "grad_norm": 0.7772941072113776, "learning_rate": 4.274987053340238e-05, "loss": 0.4042, "step": 990 }, { "epoch": 0.9268747088961341, "grad_norm": 0.8073817232990661, "learning_rate": 4.2685137234593476e-05, "loss": 0.4195, "step": 995 }, { "epoch": 0.9315323707498836, "grad_norm": 0.7213209273575877, "learning_rate": 4.262040393578457e-05, "loss": 0.4224, "step": 1000 }, { "epoch": 0.936190032603633, "grad_norm": 0.7416359254585871, "learning_rate": 4.255567063697566e-05, "loss": 0.4312, "step": 1005 }, { "epoch": 0.9408476944573824, "grad_norm": 0.527102694728885, "learning_rate": 4.2490937338166755e-05, "loss": 0.4201, "step": 1010 }, { "epoch": 0.9455053563111319, "grad_norm": 0.5594511470545082, "learning_rate": 4.242620403935785e-05, "loss": 0.4224, "step": 1015 }, { "epoch": 0.9501630181648812, "grad_norm": 0.5965418648993862, "learning_rate": 4.236147074054894e-05, "loss": 0.4309, "step": 1020 }, { "epoch": 0.9548206800186306, "grad_norm": 0.5450943367909993, "learning_rate": 4.2296737441740034e-05, "loss": 0.4227, "step": 1025 }, { "epoch": 0.95947834187238, "grad_norm": 0.7935428149469217, "learning_rate": 4.223200414293113e-05, "loss": 0.422, "step": 1030 }, { "epoch": 0.9641360037261295, "grad_norm": 0.48700743765986526, "learning_rate": 4.216727084412222e-05, "loss": 0.4186, "step": 1035 }, { "epoch": 0.9687936655798789, "grad_norm": 0.7728666319795747, "learning_rate": 4.2102537545313306e-05, "loss": 0.4177, "step": 1040 }, { "epoch": 0.9734513274336283, "grad_norm": 0.6153911472582105, "learning_rate": 4.2037804246504406e-05, "loss": 0.4185, "step": 1045 }, { "epoch": 0.9781089892873778, "grad_norm": 0.4852026029806901, "learning_rate": 4.19730709476955e-05, "loss": 0.4185, "step": 1050 }, { "epoch": 0.9827666511411272, "grad_norm": 0.4462069599855887, "learning_rate": 4.190833764888659e-05, "loss": 0.4193, "step": 1055 }, { "epoch": 0.9874243129948765, "grad_norm": 0.5229879709804952, "learning_rate": 4.184360435007768e-05, "loss": 0.4221, "step": 1060 }, { "epoch": 0.992081974848626, "grad_norm": 0.9581807172329343, "learning_rate": 4.177887105126878e-05, "loss": 0.4193, "step": 1065 }, { "epoch": 0.9967396367023754, "grad_norm": 0.6118951656784746, "learning_rate": 4.171413775245987e-05, "loss": 0.411, "step": 1070 } ], "logging_steps": 5, "max_steps": 4292, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.201439617780285e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }