Training in progress, step 1600, checkpoint

Browse files

Files changed (5) hide show

last-checkpoint/model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +1123 -3

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c19f4b8400a3385551bd98cf95b6e3f1d64dcf7f82712e395c277f316efdb089
 size 891644712

 version https://git-lfs.github.com/spec/v1
+oid sha256:77e67eae521c3cba24d0ae9eb0d44447bec89794066751cc7d707f2fdc5c29bf
 size 891644712

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:84230522d2dcedf2de4d8922e543f648e253a14cc0db4e635e673f12800b1231
 size 1783444794

 version https://git-lfs.github.com/spec/v1
+oid sha256:42e0c34297a9f3d72c5022f0b3dc2e519698ad0189be335ae6d8648524d74bb2
 size 1783444794

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4e862bee55033739b9ae895cccb1fea0613d44a4ebc98463c3105553aed127ff
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:8dee4a2b51470c2e565b08aae8a4e5156e5c34e8c236adf6153eeb283fa560ec
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9822aef898957bd458dc9360bb1e3058b7e31c090ed4f3ed492670e2394dfa96
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:1affb368014b9c4895e09d750783801a20ec7c8f622ab5a02e1bce055904fb15
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.7099278979478647,
   "eval_steps": 500,
-  "global_step": 1280,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -4487,6 +4487,1126 @@
       "learning_rate": 0.00014958448930255265,
       "loss": 0.5353,
       "step": 1280
     }
   ],
   "logging_steps": 2,
@@ -4506,7 +5626,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 3117864399667200.0,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.8874098724348308,
   "eval_steps": 500,
+  "global_step": 1600,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "learning_rate": 0.00014958448930255265,
       "loss": 0.5353,
       "step": 1280
+    },
+    {
+      "epoch": 0.7110371602884082,
+      "grad_norm": 0.29970696568489075,
+      "learning_rate": 0.00014942837881378465,
+      "loss": 0.4261,
+      "step": 1282
+    },
+    {
+      "epoch": 0.7121464226289518,
+      "grad_norm": 0.36919617652893066,
+      "learning_rate": 0.00014927210875741347,
+      "loss": 0.4935,
+      "step": 1284
+    },
+    {
+      "epoch": 0.7132556849694953,
+      "grad_norm": 0.3045351505279541,
+      "learning_rate": 0.00014911567963791928,
+      "loss": 0.4191,
+      "step": 1286
+    },
+    {
+      "epoch": 0.7143649473100389,
+      "grad_norm": 0.30813243985176086,
+      "learning_rate": 0.00014895909196029585,
+      "loss": 0.3992,
+      "step": 1288
+    },
+    {
+      "epoch": 0.7154742096505824,
+      "grad_norm": 0.3444893956184387,
+      "learning_rate": 0.00014880234623004866,
+      "loss": 0.4351,
+      "step": 1290
+    },
+    {
+      "epoch": 0.7165834719911259,
+      "grad_norm": 0.29215720295906067,
+      "learning_rate": 0.00014864544295319356,
+      "loss": 0.4323,
+      "step": 1292
+    },
+    {
+      "epoch": 0.7176927343316695,
+      "grad_norm": 0.37148571014404297,
+      "learning_rate": 0.00014848838263625496,
+      "loss": 0.4463,
+      "step": 1294
+    },
+    {
+      "epoch": 0.718801996672213,
+      "grad_norm": 0.38779592514038086,
+      "learning_rate": 0.00014833116578626417,
+      "loss": 0.5293,
+      "step": 1296
+    },
+    {
+      "epoch": 0.7199112590127565,
+      "grad_norm": 0.31231850385665894,
+      "learning_rate": 0.00014817379291075792,
+      "loss": 0.4575,
+      "step": 1298
+    },
+    {
+      "epoch": 0.7210205213533001,
+      "grad_norm": 0.2966194450855255,
+      "learning_rate": 0.00014801626451777658,
+      "loss": 0.4337,
+      "step": 1300
+    },
+    {
+      "epoch": 0.7221297836938436,
+      "grad_norm": 0.2993515133857727,
+      "learning_rate": 0.00014785858111586258,
+      "loss": 0.3994,
+      "step": 1302
+    },
+    {
+      "epoch": 0.7232390460343872,
+      "grad_norm": 0.27827030420303345,
+      "learning_rate": 0.00014770074321405878,
+      "loss": 0.4079,
+      "step": 1304
+    },
+    {
+      "epoch": 0.7243483083749307,
+      "grad_norm": 0.2576323449611664,
+      "learning_rate": 0.00014754275132190678,
+      "loss": 0.3798,
+      "step": 1306
+    },
+    {
+      "epoch": 0.7254575707154742,
+      "grad_norm": 0.24843810498714447,
+      "learning_rate": 0.0001473846059494453,
+      "loss": 0.4564,
+      "step": 1308
+    },
+    {
+      "epoch": 0.7265668330560178,
+      "grad_norm": 0.2799087166786194,
+      "learning_rate": 0.00014722630760720856,
+      "loss": 0.3906,
+      "step": 1310
+    },
+    {
+      "epoch": 0.7276760953965613,
+      "grad_norm": 0.3635210394859314,
+      "learning_rate": 0.00014706785680622462,
+      "loss": 0.4409,
+      "step": 1312
+    },
+    {
+      "epoch": 0.7287853577371048,
+      "grad_norm": 0.30838248133659363,
+      "learning_rate": 0.0001469092540580136,
+      "loss": 0.4268,
+      "step": 1314
+    },
+    {
+      "epoch": 0.7298946200776484,
+      "grad_norm": 0.2162405252456665,
+      "learning_rate": 0.0001467504998745863,
+      "loss": 0.4085,
+      "step": 1316
+    },
+    {
+      "epoch": 0.7310038824181919,
+      "grad_norm": 0.30635640025138855,
+      "learning_rate": 0.00014659159476844232,
+      "loss": 0.4089,
+      "step": 1318
+    },
+    {
+      "epoch": 0.7321131447587355,
+      "grad_norm": 0.4048236012458801,
+      "learning_rate": 0.00014643253925256846,
+      "loss": 0.5283,
+      "step": 1320
+    },
+    {
+      "epoch": 0.733222407099279,
+      "grad_norm": 0.4575350284576416,
+      "learning_rate": 0.00014627333384043713,
+      "loss": 0.4675,
+      "step": 1322
+    },
+    {
+      "epoch": 0.7343316694398225,
+      "grad_norm": 0.3289128243923187,
+      "learning_rate": 0.00014611397904600458,
+      "loss": 0.4777,
+      "step": 1324
+    },
+    {
+      "epoch": 0.7354409317803661,
+      "grad_norm": 0.29231977462768555,
+      "learning_rate": 0.00014595447538370935,
+      "loss": 0.3851,
+      "step": 1326
+    },
+    {
+      "epoch": 0.7365501941209096,
+      "grad_norm": 0.33795663714408875,
+      "learning_rate": 0.00014579482336847058,
+      "loss": 0.475,
+      "step": 1328
+    },
+    {
+      "epoch": 0.7376594564614531,
+      "grad_norm": 0.20978425443172455,
+      "learning_rate": 0.00014563502351568625,
+      "loss": 0.3455,
+      "step": 1330
+    },
+    {
+      "epoch": 0.7387687188019967,
+      "grad_norm": 0.4113425016403198,
+      "learning_rate": 0.00014547507634123176,
+      "loss": 0.3741,
+      "step": 1332
+    },
+    {
+      "epoch": 0.7398779811425402,
+      "grad_norm": 0.38009563088417053,
+      "learning_rate": 0.0001453149823614579,
+      "loss": 0.53,
+      "step": 1334
+    },
+    {
+      "epoch": 0.7409872434830838,
+      "grad_norm": 0.2306888997554779,
+      "learning_rate": 0.00014515474209318948,
+      "loss": 0.4229,
+      "step": 1336
+    },
+    {
+      "epoch": 0.7420965058236273,
+      "grad_norm": 0.326107382774353,
+      "learning_rate": 0.00014499435605372366,
+      "loss": 0.3731,
+      "step": 1338
+    },
+    {
+      "epoch": 0.7432057681641708,
+      "grad_norm": 0.26597580313682556,
+      "learning_rate": 0.00014483382476082802,
+      "loss": 0.3841,
+      "step": 1340
+    },
+    {
+      "epoch": 0.7443150305047144,
+      "grad_norm": 0.3690161406993866,
+      "learning_rate": 0.00014467314873273918,
+      "loss": 0.4814,
+      "step": 1342
+    },
+    {
+      "epoch": 0.7454242928452579,
+      "grad_norm": 0.30208808183670044,
+      "learning_rate": 0.0001445123284881609,
+      "loss": 0.4522,
+      "step": 1344
+    },
+    {
+      "epoch": 0.7465335551858014,
+      "grad_norm": 0.28505003452301025,
+      "learning_rate": 0.00014435136454626264,
+      "loss": 0.4283,
+      "step": 1346
+    },
+    {
+      "epoch": 0.747642817526345,
+      "grad_norm": 0.2772189974784851,
+      "learning_rate": 0.0001441902574266776,
+      "loss": 0.3964,
+      "step": 1348
+    },
+    {
+      "epoch": 0.7487520798668885,
+      "grad_norm": 0.4741387665271759,
+      "learning_rate": 0.0001440290076495013,
+      "loss": 0.4577,
+      "step": 1350
+    },
+    {
+      "epoch": 0.7498613422074321,
+      "grad_norm": 0.3234037160873413,
+      "learning_rate": 0.00014386761573528976,
+      "loss": 0.4542,
+      "step": 1352
+    },
+    {
+      "epoch": 0.7509706045479756,
+      "grad_norm": 0.3096826374530792,
+      "learning_rate": 0.0001437060822050579,
+      "loss": 0.3892,
+      "step": 1354
+    },
+    {
+      "epoch": 0.7520798668885191,
+      "grad_norm": 0.2626829743385315,
+      "learning_rate": 0.00014354440758027772,
+      "loss": 0.4441,
+      "step": 1356
+    },
+    {
+      "epoch": 0.7531891292290627,
+      "grad_norm": 0.2835614085197449,
+      "learning_rate": 0.00014338259238287678,
+      "loss": 0.4905,
+      "step": 1358
+    },
+    {
+      "epoch": 0.7542983915696062,
+      "grad_norm": 0.23285934329032898,
+      "learning_rate": 0.00014322063713523647,
+      "loss": 0.3911,
+      "step": 1360
+    },
+    {
+      "epoch": 0.7554076539101497,
+      "grad_norm": 0.2746225595474243,
+      "learning_rate": 0.00014305854236019018,
+      "loss": 0.4681,
+      "step": 1362
+    },
+    {
+      "epoch": 0.7565169162506933,
+      "grad_norm": 0.30551669001579285,
+      "learning_rate": 0.0001428963085810219,
+      "loss": 0.3883,
+      "step": 1364
+    },
+    {
+      "epoch": 0.7576261785912368,
+      "grad_norm": 0.3372795283794403,
+      "learning_rate": 0.0001427339363214642,
+      "loss": 0.4452,
+      "step": 1366
+    },
+    {
+      "epoch": 0.7587354409317804,
+      "grad_norm": 0.2697629928588867,
+      "learning_rate": 0.00014257142610569682,
+      "loss": 0.3853,
+      "step": 1368
+    },
+    {
+      "epoch": 0.7598447032723239,
+      "grad_norm": 0.29290249943733215,
+      "learning_rate": 0.00014240877845834472,
+      "loss": 0.4694,
+      "step": 1370
+    },
+    {
+      "epoch": 0.7609539656128674,
+      "grad_norm": 0.298874169588089,
+      "learning_rate": 0.00014224599390447672,
+      "loss": 0.4644,
+      "step": 1372
+    },
+    {
+      "epoch": 0.762063227953411,
+      "grad_norm": 0.23659248650074005,
+      "learning_rate": 0.00014208307296960344,
+      "loss": 0.395,
+      "step": 1374
+    },
+    {
+      "epoch": 0.7631724902939545,
+      "grad_norm": 0.2965550124645233,
+      "learning_rate": 0.00014192001617967587,
+      "loss": 0.5332,
+      "step": 1376
+    },
+    {
+      "epoch": 0.764281752634498,
+      "grad_norm": 0.3338853716850281,
+      "learning_rate": 0.00014175682406108352,
+      "loss": 0.5176,
+      "step": 1378
+    },
+    {
+      "epoch": 0.7653910149750416,
+      "grad_norm": 0.24134789407253265,
+      "learning_rate": 0.0001415934971406528,
+      "loss": 0.4224,
+      "step": 1380
+    },
+    {
+      "epoch": 0.7665002773155851,
+      "grad_norm": 0.3920575678348541,
+      "learning_rate": 0.00014143003594564528,
+      "loss": 0.4627,
+      "step": 1382
+    },
+    {
+      "epoch": 0.7676095396561287,
+      "grad_norm": 0.3521714508533478,
+      "learning_rate": 0.00014126644100375603,
+      "loss": 0.446,
+      "step": 1384
+    },
+    {
+      "epoch": 0.7687188019966722,
+      "grad_norm": 0.2819899022579193,
+      "learning_rate": 0.0001411027128431119,
+      "loss": 0.3637,
+      "step": 1386
+    },
+    {
+      "epoch": 0.7698280643372157,
+      "grad_norm": 0.1896730363368988,
+      "learning_rate": 0.00014093885199226972,
+      "loss": 0.3206,
+      "step": 1388
+    },
+    {
+      "epoch": 0.7709373266777593,
+      "grad_norm": 0.21066512167453766,
+      "learning_rate": 0.0001407748589802148,
+      "loss": 0.3081,
+      "step": 1390
+    },
+    {
+      "epoch": 0.7720465890183028,
+      "grad_norm": 0.2417469471693039,
+      "learning_rate": 0.000140610734336359,
+      "loss": 0.3954,
+      "step": 1392
+    },
+    {
+      "epoch": 0.7731558513588463,
+      "grad_norm": 0.41810843348503113,
+      "learning_rate": 0.00014044647859053915,
+      "loss": 0.521,
+      "step": 1394
+    },
+    {
+      "epoch": 0.7742651136993899,
+      "grad_norm": 0.21894732117652893,
+      "learning_rate": 0.00014028209227301533,
+      "loss": 0.342,
+      "step": 1396
+    },
+    {
+      "epoch": 0.7753743760399334,
+      "grad_norm": 0.31191563606262207,
+      "learning_rate": 0.00014011757591446918,
+      "loss": 0.4173,
+      "step": 1398
+    },
+    {
+      "epoch": 0.776483638380477,
+      "grad_norm": 0.34966176748275757,
+      "learning_rate": 0.00013995293004600206,
+      "loss": 0.4993,
+      "step": 1400
+    },
+    {
+      "epoch": 0.7775929007210205,
+      "grad_norm": 0.3630419671535492,
+      "learning_rate": 0.00013978815519913345,
+      "loss": 0.4331,
+      "step": 1402
+    },
+    {
+      "epoch": 0.778702163061564,
+      "grad_norm": 0.2934836447238922,
+      "learning_rate": 0.00013962325190579919,
+      "loss": 0.4618,
+      "step": 1404
+    },
+    {
+      "epoch": 0.7798114254021076,
+      "grad_norm": 0.330842524766922,
+      "learning_rate": 0.00013945822069834983,
+      "loss": 0.4437,
+      "step": 1406
+    },
+    {
+      "epoch": 0.7809206877426511,
+      "grad_norm": 0.24101948738098145,
+      "learning_rate": 0.0001392930621095489,
+      "loss": 0.4065,
+      "step": 1408
+    },
+    {
+      "epoch": 0.7820299500831946,
+      "grad_norm": 0.5105953216552734,
+      "learning_rate": 0.00013912777667257094,
+      "loss": 0.5135,
+      "step": 1410
+    },
+    {
+      "epoch": 0.7831392124237382,
+      "grad_norm": 0.2571849226951599,
+      "learning_rate": 0.00013896236492100025,
+      "loss": 0.4153,
+      "step": 1412
+    },
+    {
+      "epoch": 0.7842484747642817,
+      "grad_norm": 0.27226725220680237,
+      "learning_rate": 0.00013879682738882873,
+      "loss": 0.3522,
+      "step": 1414
+    },
+    {
+      "epoch": 0.7853577371048253,
+      "grad_norm": 0.2498832792043686,
+      "learning_rate": 0.0001386311646104544,
+      "loss": 0.4186,
+      "step": 1416
+    },
+    {
+      "epoch": 0.7864669994453688,
+      "grad_norm": 0.18799692392349243,
+      "learning_rate": 0.00013846537712067962,
+      "loss": 0.3278,
+      "step": 1418
+    },
+    {
+      "epoch": 0.7875762617859123,
+      "grad_norm": 0.3741225600242615,
+      "learning_rate": 0.0001382994654547093,
+      "loss": 0.4816,
+      "step": 1420
+    },
+    {
+      "epoch": 0.7886855241264559,
+      "grad_norm": 0.34847941994667053,
+      "learning_rate": 0.00013813343014814925,
+      "loss": 0.5673,
+      "step": 1422
+    },
+    {
+      "epoch": 0.7897947864669994,
+      "grad_norm": 0.38350898027420044,
+      "learning_rate": 0.00013796727173700444,
+      "loss": 0.5009,
+      "step": 1424
+    },
+    {
+      "epoch": 0.790904048807543,
+      "grad_norm": 0.2870427668094635,
+      "learning_rate": 0.0001378009907576772,
+      "loss": 0.4333,
+      "step": 1426
+    },
+    {
+      "epoch": 0.7920133111480865,
+      "grad_norm": 0.2194262146949768,
+      "learning_rate": 0.00013763458774696563,
+      "loss": 0.3433,
+      "step": 1428
+    },
+    {
+      "epoch": 0.79312257348863,
+      "grad_norm": 0.26900714635849,
+      "learning_rate": 0.00013746806324206173,
+      "loss": 0.5099,
+      "step": 1430
+    },
+    {
+      "epoch": 0.7942318358291736,
+      "grad_norm": 0.2423945814371109,
+      "learning_rate": 0.00013730141778054962,
+      "loss": 0.333,
+      "step": 1432
+    },
+    {
+      "epoch": 0.7953410981697171,
+      "grad_norm": 0.31677019596099854,
+      "learning_rate": 0.00013713465190040415,
+      "loss": 0.4285,
+      "step": 1434
+    },
+    {
+      "epoch": 0.7964503605102606,
+      "grad_norm": 0.33169832825660706,
+      "learning_rate": 0.0001369677661399886,
+      "loss": 0.4058,
+      "step": 1436
+    },
+    {
+      "epoch": 0.7975596228508042,
+      "grad_norm": 0.3489621579647064,
+      "learning_rate": 0.0001368007610380535,
+      "loss": 0.4153,
+      "step": 1438
+    },
+    {
+      "epoch": 0.7986688851913477,
+      "grad_norm": 0.2639998495578766,
+      "learning_rate": 0.00013663363713373454,
+      "loss": 0.3959,
+      "step": 1440
+    },
+    {
+      "epoch": 0.7997781475318912,
+      "grad_norm": 0.27134037017822266,
+      "learning_rate": 0.0001364663949665509,
+      "loss": 0.4331,
+      "step": 1442
+    },
+    {
+      "epoch": 0.8008874098724348,
+      "grad_norm": 0.28077998757362366,
+      "learning_rate": 0.00013629903507640369,
+      "loss": 0.5282,
+      "step": 1444
+    },
+    {
+      "epoch": 0.8019966722129783,
+      "grad_norm": 0.2948680818080902,
+      "learning_rate": 0.00013613155800357385,
+      "loss": 0.3951,
+      "step": 1446
+    },
+    {
+      "epoch": 0.8031059345535219,
+      "grad_norm": 0.3196534812450409,
+      "learning_rate": 0.0001359639642887208,
+      "loss": 0.4191,
+      "step": 1448
+    },
+    {
+      "epoch": 0.8042151968940654,
+      "grad_norm": 0.26202476024627686,
+      "learning_rate": 0.00013579625447288044,
+      "loss": 0.352,
+      "step": 1450
+    },
+    {
+      "epoch": 0.8053244592346089,
+      "grad_norm": 0.24866290390491486,
+      "learning_rate": 0.00013562842909746342,
+      "loss": 0.3203,
+      "step": 1452
+    },
+    {
+      "epoch": 0.8064337215751525,
+      "grad_norm": 0.41027557849884033,
+      "learning_rate": 0.00013546048870425356,
+      "loss": 0.4187,
+      "step": 1454
+    },
+    {
+      "epoch": 0.807542983915696,
+      "grad_norm": 0.34640318155288696,
+      "learning_rate": 0.0001352924338354059,
+      "loss": 0.3204,
+      "step": 1456
+    },
+    {
+      "epoch": 0.8086522462562395,
+      "grad_norm": 0.3971330523490906,
+      "learning_rate": 0.0001351242650334451,
+      "loss": 0.4598,
+      "step": 1458
+    },
+    {
+      "epoch": 0.8097615085967831,
+      "grad_norm": 0.3868078887462616,
+      "learning_rate": 0.0001349559828412635,
+      "loss": 0.3641,
+      "step": 1460
+    },
+    {
+      "epoch": 0.8108707709373266,
+      "grad_norm": 0.26136353611946106,
+      "learning_rate": 0.00013478758780211965,
+      "loss": 0.4286,
+      "step": 1462
+    },
+    {
+      "epoch": 0.8119800332778702,
+      "grad_norm": 0.2987557351589203,
+      "learning_rate": 0.00013461908045963634,
+      "loss": 0.4286,
+      "step": 1464
+    },
+    {
+      "epoch": 0.8130892956184138,
+      "grad_norm": 0.33275967836380005,
+      "learning_rate": 0.00013445046135779885,
+      "loss": 0.3616,
+      "step": 1466
+    },
+    {
+      "epoch": 0.8141985579589573,
+      "grad_norm": 0.2795950770378113,
+      "learning_rate": 0.00013428173104095331,
+      "loss": 0.5246,
+      "step": 1468
+    },
+    {
+      "epoch": 0.8153078202995009,
+      "grad_norm": 0.3052369952201843,
+      "learning_rate": 0.00013411289005380494,
+      "loss": 0.3672,
+      "step": 1470
+    },
+    {
+      "epoch": 0.8164170826400444,
+      "grad_norm": 0.2728108763694763,
+      "learning_rate": 0.00013394393894141605,
+      "loss": 0.4897,
+      "step": 1472
+    },
+    {
+      "epoch": 0.817526344980588,
+      "grad_norm": 0.3310782015323639,
+      "learning_rate": 0.00013377487824920459,
+      "loss": 0.5144,
+      "step": 1474
+    },
+    {
+      "epoch": 0.8186356073211315,
+      "grad_norm": 0.4153352677822113,
+      "learning_rate": 0.00013360570852294227,
+      "loss": 0.5313,
+      "step": 1476
+    },
+    {
+      "epoch": 0.819744869661675,
+      "grad_norm": 0.4107600450515747,
+      "learning_rate": 0.00013343643030875276,
+      "loss": 0.4873,
+      "step": 1478
+    },
+    {
+      "epoch": 0.8208541320022186,
+      "grad_norm": 0.2750249207019806,
+      "learning_rate": 0.00013326704415311,
+      "loss": 0.4373,
+      "step": 1480
+    },
+    {
+      "epoch": 0.8219633943427621,
+      "grad_norm": 0.2961103618144989,
+      "learning_rate": 0.00013309755060283626,
+      "loss": 0.4252,
+      "step": 1482
+    },
+    {
+      "epoch": 0.8230726566833056,
+      "grad_norm": 0.25156864523887634,
+      "learning_rate": 0.00013292795020510066,
+      "loss": 0.307,
+      "step": 1484
+    },
+    {
+      "epoch": 0.8241819190238492,
+      "grad_norm": 0.2984169125556946,
+      "learning_rate": 0.00013275824350741716,
+      "loss": 0.4416,
+      "step": 1486
+    },
+    {
+      "epoch": 0.8252911813643927,
+      "grad_norm": 0.27711641788482666,
+      "learning_rate": 0.00013258843105764297,
+      "loss": 0.3382,
+      "step": 1488
+    },
+    {
+      "epoch": 0.8264004437049363,
+      "grad_norm": 0.3105831742286682,
+      "learning_rate": 0.00013241851340397656,
+      "loss": 0.4276,
+      "step": 1490
+    },
+    {
+      "epoch": 0.8275097060454798,
+      "grad_norm": 0.3303448557853699,
+      "learning_rate": 0.00013224849109495622,
+      "loss": 0.4679,
+      "step": 1492
+    },
+    {
+      "epoch": 0.8286189683860233,
+      "grad_norm": 0.3527246415615082,
+      "learning_rate": 0.00013207836467945785,
+      "loss": 0.5059,
+      "step": 1494
+    },
+    {
+      "epoch": 0.8297282307265669,
+      "grad_norm": 0.41127604246139526,
+      "learning_rate": 0.00013190813470669363,
+      "loss": 0.5412,
+      "step": 1496
+    },
+    {
+      "epoch": 0.8308374930671104,
+      "grad_norm": 0.24502909183502197,
+      "learning_rate": 0.00013173780172620999,
+      "loss": 0.3072,
+      "step": 1498
+    },
+    {
+      "epoch": 0.831946755407654,
+      "grad_norm": 0.21016305685043335,
+      "learning_rate": 0.00013156736628788584,
+      "loss": 0.3889,
+      "step": 1500
+    },
+    {
+      "epoch": 0.8330560177481975,
+      "grad_norm": 0.5300863981246948,
+      "learning_rate": 0.000131396828941931,
+      "loss": 0.5858,
+      "step": 1502
+    },
+    {
+      "epoch": 0.834165280088741,
+      "grad_norm": 0.2194632738828659,
+      "learning_rate": 0.00013122619023888402,
+      "loss": 0.3433,
+      "step": 1504
+    },
+    {
+      "epoch": 0.8352745424292846,
+      "grad_norm": 0.7686997056007385,
+      "learning_rate": 0.00013105545072961093,
+      "loss": 0.5005,
+      "step": 1506
+    },
+    {
+      "epoch": 0.8363838047698281,
+      "grad_norm": 0.2888992428779602,
+      "learning_rate": 0.00013088461096530304,
+      "loss": 0.3987,
+      "step": 1508
+    },
+    {
+      "epoch": 0.8374930671103716,
+      "grad_norm": 0.2540237009525299,
+      "learning_rate": 0.00013071367149747535,
+      "loss": 0.4531,
+      "step": 1510
+    },
+    {
+      "epoch": 0.8386023294509152,
+      "grad_norm": 0.3219870924949646,
+      "learning_rate": 0.00013054263287796465,
+      "loss": 0.561,
+      "step": 1512
+    },
+    {
+      "epoch": 0.8397115917914587,
+      "grad_norm": 0.24539148807525635,
+      "learning_rate": 0.00013037149565892794,
+      "loss": 0.3225,
+      "step": 1514
+    },
+    {
+      "epoch": 0.8408208541320022,
+      "grad_norm": 0.23382632434368134,
+      "learning_rate": 0.00013020026039284045,
+      "loss": 0.3557,
+      "step": 1516
+    },
+    {
+      "epoch": 0.8419301164725458,
+      "grad_norm": 0.2739519476890564,
+      "learning_rate": 0.00013002892763249398,
+      "loss": 0.3635,
+      "step": 1518
+    },
+    {
+      "epoch": 0.8430393788130893,
+      "grad_norm": 0.23864848911762238,
+      "learning_rate": 0.000129857497930995,
+      "loss": 0.393,
+      "step": 1520
+    },
+    {
+      "epoch": 0.8441486411536329,
+      "grad_norm": 0.3150426149368286,
+      "learning_rate": 0.00012968597184176298,
+      "loss": 0.4337,
+      "step": 1522
+    },
+    {
+      "epoch": 0.8452579034941764,
+      "grad_norm": 0.25435131788253784,
+      "learning_rate": 0.00012951434991852857,
+      "loss": 0.3681,
+      "step": 1524
+    },
+    {
+      "epoch": 0.8463671658347199,
+      "grad_norm": 0.3123234808444977,
+      "learning_rate": 0.0001293426327153317,
+      "loss": 0.4089,
+      "step": 1526
+    },
+    {
+      "epoch": 0.8474764281752635,
+      "grad_norm": 0.52870112657547,
+      "learning_rate": 0.00012917082078652,
+      "loss": 0.4779,
+      "step": 1528
+    },
+    {
+      "epoch": 0.848585690515807,
+      "grad_norm": 0.2784457206726074,
+      "learning_rate": 0.00012899891468674688,
+      "loss": 0.4163,
+      "step": 1530
+    },
+    {
+      "epoch": 0.8496949528563505,
+      "grad_norm": 0.26158082485198975,
+      "learning_rate": 0.0001288269149709697,
+      "loss": 0.464,
+      "step": 1532
+    },
+    {
+      "epoch": 0.8508042151968941,
+      "grad_norm": 0.2595478594303131,
+      "learning_rate": 0.00012865482219444804,
+      "loss": 0.3473,
+      "step": 1534
+    },
+    {
+      "epoch": 0.8519134775374376,
+      "grad_norm": 0.28629809617996216,
+      "learning_rate": 0.000128482636912742,
+      "loss": 0.4626,
+      "step": 1536
+    },
+    {
+      "epoch": 0.8530227398779812,
+      "grad_norm": 0.2769101560115814,
+      "learning_rate": 0.00012831035968171025,
+      "loss": 0.5318,
+      "step": 1538
+    },
+    {
+      "epoch": 0.8541320022185247,
+      "grad_norm": 0.32585474848747253,
+      "learning_rate": 0.00012813799105750823,
+      "loss": 0.4428,
+      "step": 1540
+    },
+    {
+      "epoch": 0.8552412645590682,
+      "grad_norm": 0.2759438157081604,
+      "learning_rate": 0.00012796553159658653,
+      "loss": 0.3853,
+      "step": 1542
+    },
+    {
+      "epoch": 0.8563505268996118,
+      "grad_norm": 0.34193405508995056,
+      "learning_rate": 0.0001277929818556889,
+      "loss": 0.5144,
+      "step": 1544
+    },
+    {
+      "epoch": 0.8574597892401553,
+      "grad_norm": 0.3385421335697174,
+      "learning_rate": 0.00012762034239185063,
+      "loss": 0.4466,
+      "step": 1546
+    },
+    {
+      "epoch": 0.8585690515806988,
+      "grad_norm": 0.3747256398200989,
+      "learning_rate": 0.00012744761376239655,
+      "loss": 0.517,
+      "step": 1548
+    },
+    {
+      "epoch": 0.8596783139212424,
+      "grad_norm": 0.31231391429901123,
+      "learning_rate": 0.00012727479652493943,
+      "loss": 0.3797,
+      "step": 1550
+    },
+    {
+      "epoch": 0.8607875762617859,
+      "grad_norm": 0.2660077214241028,
+      "learning_rate": 0.00012710189123737802,
+      "loss": 0.3481,
+      "step": 1552
+    },
+    {
+      "epoch": 0.8618968386023295,
+      "grad_norm": 0.34666526317596436,
+      "learning_rate": 0.00012692889845789538,
+      "loss": 0.4369,
+      "step": 1554
+    },
+    {
+      "epoch": 0.863006100942873,
+      "grad_norm": 0.26898959279060364,
+      "learning_rate": 0.00012675581874495697,
+      "loss": 0.4428,
+      "step": 1556
+    },
+    {
+      "epoch": 0.8641153632834165,
+      "grad_norm": 0.378214567899704,
+      "learning_rate": 0.0001265826526573089,
+      "loss": 0.5425,
+      "step": 1558
+    },
+    {
+      "epoch": 0.8652246256239601,
+      "grad_norm": 0.3740502595901489,
+      "learning_rate": 0.0001264094007539762,
+      "loss": 0.5318,
+      "step": 1560
+    },
+    {
+      "epoch": 0.8663338879645036,
+      "grad_norm": 0.25904256105422974,
+      "learning_rate": 0.00012623606359426077,
+      "loss": 0.4032,
+      "step": 1562
+    },
+    {
+      "epoch": 0.8674431503050472,
+      "grad_norm": 0.29247143864631653,
+      "learning_rate": 0.00012606264173773988,
+      "loss": 0.4446,
+      "step": 1564
+    },
+    {
+      "epoch": 0.8685524126455907,
+      "grad_norm": 0.2748325765132904,
+      "learning_rate": 0.0001258891357442642,
+      "loss": 0.4435,
+      "step": 1566
+    },
+    {
+      "epoch": 0.8696616749861342,
+      "grad_norm": 0.4215485155582428,
+      "learning_rate": 0.00012571554617395598,
+      "loss": 0.4285,
+      "step": 1568
+    },
+    {
+      "epoch": 0.8707709373266778,
+      "grad_norm": 0.2875049412250519,
+      "learning_rate": 0.00012554187358720725,
+      "loss": 0.4743,
+      "step": 1570
+    },
+    {
+      "epoch": 0.8718801996672213,
+      "grad_norm": 0.3484318256378174,
+      "learning_rate": 0.00012536811854467817,
+      "loss": 0.4997,
+      "step": 1572
+    },
+    {
+      "epoch": 0.8729894620077648,
+      "grad_norm": 0.33628854155540466,
+      "learning_rate": 0.0001251942816072949,
+      "loss": 0.5044,
+      "step": 1574
+    },
+    {
+      "epoch": 0.8740987243483084,
+      "grad_norm": 0.28208592534065247,
+      "learning_rate": 0.00012502036333624815,
+      "loss": 0.3405,
+      "step": 1576
+    },
+    {
+      "epoch": 0.8752079866888519,
+      "grad_norm": 0.2113669067621231,
+      "learning_rate": 0.00012484636429299114,
+      "loss": 0.3996,
+      "step": 1578
+    },
+    {
+      "epoch": 0.8763172490293955,
+      "grad_norm": 0.2464189976453781,
+      "learning_rate": 0.00012467228503923773,
+      "loss": 0.3572,
+      "step": 1580
+    },
+    {
+      "epoch": 0.877426511369939,
+      "grad_norm": 0.391923189163208,
+      "learning_rate": 0.00012449812613696094,
+      "loss": 0.6324,
+      "step": 1582
+    },
+    {
+      "epoch": 0.8785357737104825,
+      "grad_norm": 0.2782968282699585,
+      "learning_rate": 0.0001243238881483907,
+      "loss": 0.352,
+      "step": 1584
+    },
+    {
+      "epoch": 0.8796450360510261,
+      "grad_norm": 0.30209866166114807,
+      "learning_rate": 0.00012414957163601236,
+      "loss": 0.4611,
+      "step": 1586
+    },
+    {
+      "epoch": 0.8807542983915696,
+      "grad_norm": 0.2990707457065582,
+      "learning_rate": 0.0001239751771625648,
+      "loss": 0.5419,
+      "step": 1588
+    },
+    {
+      "epoch": 0.8818635607321131,
+      "grad_norm": 0.25346022844314575,
+      "learning_rate": 0.00012380070529103852,
+      "loss": 0.4205,
+      "step": 1590
+    },
+    {
+      "epoch": 0.8829728230726567,
+      "grad_norm": 0.33787423372268677,
+      "learning_rate": 0.00012362615658467377,
+      "loss": 0.4186,
+      "step": 1592
+    },
+    {
+      "epoch": 0.8840820854132002,
+      "grad_norm": 0.23708541691303253,
+      "learning_rate": 0.00012345153160695917,
+      "loss": 0.3261,
+      "step": 1594
+    },
+    {
+      "epoch": 0.8851913477537438,
+      "grad_norm": 0.22642351686954498,
+      "learning_rate": 0.00012327683092162918,
+      "loss": 0.3415,
+      "step": 1596
+    },
+    {
+      "epoch": 0.8863006100942873,
+      "grad_norm": 0.27935388684272766,
+      "learning_rate": 0.00012310205509266292,
+      "loss": 0.3471,
+      "step": 1598
+    },
+    {
+      "epoch": 0.8874098724348308,
+      "grad_norm": 0.23772071301937103,
+      "learning_rate": 0.000122927204684282,
+      "loss": 0.3386,
+      "step": 1600
     }
   ],
   "logging_steps": 2,
       "attributes": {}
     }
   },
+  "total_flos": 3897330499584000.0,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null