Training in progress, step 1728, checkpoint

Browse files

Files changed (5) hide show

last-checkpoint/model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +451 -3

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:77e67eae521c3cba24d0ae9eb0d44447bec89794066751cc7d707f2fdc5c29bf
 size 891644712

 version https://git-lfs.github.com/spec/v1
+oid sha256:8d157e7286a6dfc4ba5aacde53caecbbb37a38b4599aefda7a1cea8fa2b162ea
 size 891644712

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:42e0c34297a9f3d72c5022f0b3dc2e519698ad0189be335ae6d8648524d74bb2
 size 1783444794

 version https://git-lfs.github.com/spec/v1
+oid sha256:96ad3332c7a20eba91ce157eec436b4cce08960829f029b3ab7446b861d4f3da
 size 1783444794

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8dee4a2b51470c2e565b08aae8a4e5156e5c34e8c236adf6153eeb283fa560ec
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:edb897ff7c5edb0daaf53db6c8527f7da45dd70041d3faeafb18eb1d69b53ca7
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1affb368014b9c4895e09d750783801a20ec7c8f622ab5a02e1bce055904fb15
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:c92509254bae5d36eb0a64d8994d4ff09c04cd0249ac08027b0f40c16d4a82bf
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.8874098724348308,
   "eval_steps": 500,
-  "global_step": 1600,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -5607,6 +5607,454 @@
       "learning_rate": 0.000122927204684282,
       "loss": 0.3386,
       "step": 1600
     }
   ],
   "logging_steps": 2,
@@ -5626,7 +6074,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 3897330499584000.0,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.9584026622296173,
   "eval_steps": 500,
+  "global_step": 1728,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "learning_rate": 0.000122927204684282,
       "loss": 0.3386,
       "step": 1600
+    },
+    {
+      "epoch": 0.8885191347753744,
+      "grad_norm": 0.3254069983959198,
+      "learning_rate": 0.00012275228026094881,
+      "loss": 0.5074,
+      "step": 1602
+    },
+    {
+      "epoch": 0.8896283971159179,
+      "grad_norm": 0.37417536973953247,
+      "learning_rate": 0.00012257728238736467,
+      "loss": 0.5318,
+      "step": 1604
+    },
+    {
+      "epoch": 0.8907376594564614,
+      "grad_norm": 0.35727638006210327,
+      "learning_rate": 0.000122402211628468,
+      "loss": 0.4729,
+      "step": 1606
+    },
+    {
+      "epoch": 0.891846921797005,
+      "grad_norm": 0.26733312010765076,
+      "learning_rate": 0.00012222706854943255,
+      "loss": 0.421,
+      "step": 1608
+    },
+    {
+      "epoch": 0.8929561841375485,
+      "grad_norm": 0.22087961435317993,
+      "learning_rate": 0.00012205185371566554,
+      "loss": 0.3354,
+      "step": 1610
+    },
+    {
+      "epoch": 0.894065446478092,
+      "grad_norm": 0.4256139099597931,
+      "learning_rate": 0.00012187656769280578,
+      "loss": 0.4,
+      "step": 1612
+    },
+    {
+      "epoch": 0.8951747088186356,
+      "grad_norm": 0.2818162441253662,
+      "learning_rate": 0.00012170121104672196,
+      "loss": 0.4098,
+      "step": 1614
+    },
+    {
+      "epoch": 0.8962839711591791,
+      "grad_norm": 0.2936331331729889,
+      "learning_rate": 0.00012152578434351071,
+      "loss": 0.436,
+      "step": 1616
+    },
+    {
+      "epoch": 0.8973932334997227,
+      "grad_norm": 0.2814910113811493,
+      "learning_rate": 0.00012135028814949487,
+      "loss": 0.4096,
+      "step": 1618
+    },
+    {
+      "epoch": 0.8985024958402662,
+      "grad_norm": 0.3062569797039032,
+      "learning_rate": 0.00012117472303122157,
+      "loss": 0.4595,
+      "step": 1620
+    },
+    {
+      "epoch": 0.8996117581808097,
+      "grad_norm": 0.3199828565120697,
+      "learning_rate": 0.00012099908955546044,
+      "loss": 0.4696,
+      "step": 1622
+    },
+    {
+      "epoch": 0.9007210205213533,
+      "grad_norm": 0.35935017466545105,
+      "learning_rate": 0.00012082338828920185,
+      "loss": 0.5822,
+      "step": 1624
+    },
+    {
+      "epoch": 0.9018302828618968,
+      "grad_norm": 0.2030808925628662,
+      "learning_rate": 0.00012064761979965497,
+      "loss": 0.3524,
+      "step": 1626
+    },
+    {
+      "epoch": 0.9029395452024404,
+      "grad_norm": 0.29535773396492004,
+      "learning_rate": 0.00012047178465424596,
+      "loss": 0.3698,
+      "step": 1628
+    },
+    {
+      "epoch": 0.9040488075429839,
+      "grad_norm": 0.26572179794311523,
+      "learning_rate": 0.00012029588342061621,
+      "loss": 0.3789,
+      "step": 1630
+    },
+    {
+      "epoch": 0.9051580698835274,
+      "grad_norm": 0.4271789491176605,
+      "learning_rate": 0.00012011991666662044,
+      "loss": 0.5669,
+      "step": 1632
+    },
+    {
+      "epoch": 0.906267332224071,
+      "grad_norm": 0.35716575384140015,
+      "learning_rate": 0.00011994388496032487,
+      "loss": 0.4521,
+      "step": 1634
+    },
+    {
+      "epoch": 0.9073765945646145,
+      "grad_norm": 0.2956486642360687,
+      "learning_rate": 0.00011976778887000543,
+      "loss": 0.3755,
+      "step": 1636
+    },
+    {
+      "epoch": 0.908485856905158,
+      "grad_norm": 0.3578818738460541,
+      "learning_rate": 0.0001195916289641459,
+      "loss": 0.4935,
+      "step": 1638
+    },
+    {
+      "epoch": 0.9095951192457016,
+      "grad_norm": 0.3232196867465973,
+      "learning_rate": 0.00011941540581143608,
+      "loss": 0.4826,
+      "step": 1640
+    },
+    {
+      "epoch": 0.9107043815862451,
+      "grad_norm": 0.2944696247577667,
+      "learning_rate": 0.00011923911998076988,
+      "loss": 0.3827,
+      "step": 1642
+    },
+    {
+      "epoch": 0.9118136439267887,
+      "grad_norm": 0.27748194336891174,
+      "learning_rate": 0.00011906277204124363,
+      "loss": 0.5143,
+      "step": 1644
+    },
+    {
+      "epoch": 0.9129229062673322,
+      "grad_norm": 0.2819176912307739,
+      "learning_rate": 0.00011888636256215413,
+      "loss": 0.4159,
+      "step": 1646
+    },
+    {
+      "epoch": 0.9140321686078757,
+      "grad_norm": 0.3712371289730072,
+      "learning_rate": 0.00011870989211299686,
+      "loss": 0.5419,
+      "step": 1648
+    },
+    {
+      "epoch": 0.9151414309484193,
+      "grad_norm": 0.2566871643066406,
+      "learning_rate": 0.00011853336126346406,
+      "loss": 0.4926,
+      "step": 1650
+    },
+    {
+      "epoch": 0.9162506932889628,
+      "grad_norm": 0.38075196743011475,
+      "learning_rate": 0.0001183567705834431,
+      "loss": 0.4784,
+      "step": 1652
+    },
+    {
+      "epoch": 0.9173599556295063,
+      "grad_norm": 0.30149558186531067,
+      "learning_rate": 0.00011818012064301433,
+      "loss": 0.3791,
+      "step": 1654
+    },
+    {
+      "epoch": 0.9184692179700499,
+      "grad_norm": 0.37024736404418945,
+      "learning_rate": 0.00011800341201244954,
+      "loss": 0.4495,
+      "step": 1656
+    },
+    {
+      "epoch": 0.9195784803105934,
+      "grad_norm": 0.30697816610336304,
+      "learning_rate": 0.00011782664526220992,
+      "loss": 0.385,
+      "step": 1658
+    },
+    {
+      "epoch": 0.920687742651137,
+      "grad_norm": 0.48160433769226074,
+      "learning_rate": 0.00011764982096294432,
+      "loss": 0.3435,
+      "step": 1660
+    },
+    {
+      "epoch": 0.9217970049916805,
+      "grad_norm": 0.3319704234600067,
+      "learning_rate": 0.00011747293968548734,
+      "loss": 0.4893,
+      "step": 1662
+    },
+    {
+      "epoch": 0.922906267332224,
+      "grad_norm": 0.2384938895702362,
+      "learning_rate": 0.00011729600200085752,
+      "loss": 0.4826,
+      "step": 1664
+    },
+    {
+      "epoch": 0.9240155296727676,
+      "grad_norm": 0.2830295264720917,
+      "learning_rate": 0.00011711900848025555,
+      "loss": 0.5185,
+      "step": 1666
+    },
+    {
+      "epoch": 0.9251247920133111,
+      "grad_norm": 0.3078785836696625,
+      "learning_rate": 0.0001169419596950623,
+      "loss": 0.6303,
+      "step": 1668
+    },
+    {
+      "epoch": 0.9262340543538546,
+      "grad_norm": 0.2661837637424469,
+      "learning_rate": 0.00011676485621683713,
+      "loss": 0.4059,
+      "step": 1670
+    },
+    {
+      "epoch": 0.9273433166943982,
+      "grad_norm": 0.2619323134422302,
+      "learning_rate": 0.00011658769861731584,
+      "loss": 0.3383,
+      "step": 1672
+    },
+    {
+      "epoch": 0.9284525790349417,
+      "grad_norm": 0.3200634717941284,
+      "learning_rate": 0.00011641048746840912,
+      "loss": 0.42,
+      "step": 1674
+    },
+    {
+      "epoch": 0.9295618413754853,
+      "grad_norm": 0.29915183782577515,
+      "learning_rate": 0.00011623322334220038,
+      "loss": 0.4156,
+      "step": 1676
+    },
+    {
+      "epoch": 0.9306711037160288,
+      "grad_norm": 0.36972782015800476,
+      "learning_rate": 0.0001160559068109441,
+      "loss": 0.3698,
+      "step": 1678
+    },
+    {
+      "epoch": 0.9317803660565723,
+      "grad_norm": 0.24870358407497406,
+      "learning_rate": 0.00011587853844706397,
+      "loss": 0.4126,
+      "step": 1680
+    },
+    {
+      "epoch": 0.9328896283971159,
+      "grad_norm": 0.2665979862213135,
+      "learning_rate": 0.000115701118823151,
+      "loss": 0.4097,
+      "step": 1682
+    },
+    {
+      "epoch": 0.9339988907376594,
+      "grad_norm": 0.34804749488830566,
+      "learning_rate": 0.00011552364851196167,
+      "loss": 0.3956,
+      "step": 1684
+    },
+    {
+      "epoch": 0.9351081530782029,
+      "grad_norm": 0.2750212550163269,
+      "learning_rate": 0.00011534612808641603,
+      "loss": 0.3434,
+      "step": 1686
+    },
+    {
+      "epoch": 0.9362174154187465,
+      "grad_norm": 0.249044269323349,
+      "learning_rate": 0.00011516855811959604,
+      "loss": 0.4786,
+      "step": 1688
+    },
+    {
+      "epoch": 0.93732667775929,
+      "grad_norm": 0.29392093420028687,
+      "learning_rate": 0.00011499093918474348,
+      "loss": 0.3028,
+      "step": 1690
+    },
+    {
+      "epoch": 0.9384359400998337,
+      "grad_norm": 0.2747836112976074,
+      "learning_rate": 0.00011481327185525828,
+      "loss": 0.4296,
+      "step": 1692
+    },
+    {
+      "epoch": 0.9395452024403772,
+      "grad_norm": 0.3494579493999481,
+      "learning_rate": 0.00011463555670469657,
+      "loss": 0.3412,
+      "step": 1694
+    },
+    {
+      "epoch": 0.9406544647809207,
+      "grad_norm": 0.28468579053878784,
+      "learning_rate": 0.00011445779430676884,
+      "loss": 0.5185,
+      "step": 1696
+    },
+    {
+      "epoch": 0.9417637271214643,
+      "grad_norm": 0.27110087871551514,
+      "learning_rate": 0.0001142799852353382,
+      "loss": 0.3075,
+      "step": 1698
+    },
+    {
+      "epoch": 0.9428729894620078,
+      "grad_norm": 0.38002222776412964,
+      "learning_rate": 0.00011410213006441827,
+      "loss": 0.6445,
+      "step": 1700
+    },
+    {
+      "epoch": 0.9439822518025514,
+      "grad_norm": 0.27994948625564575,
+      "learning_rate": 0.00011392422936817166,
+      "loss": 0.3741,
+      "step": 1702
+    },
+    {
+      "epoch": 0.9450915141430949,
+      "grad_norm": 0.26837414503097534,
+      "learning_rate": 0.00011374628372090783,
+      "loss": 0.3902,
+      "step": 1704
+    },
+    {
+      "epoch": 0.9462007764836384,
+      "grad_norm": 0.2525213062763214,
+      "learning_rate": 0.00011356829369708146,
+      "loss": 0.397,
+      "step": 1706
+    },
+    {
+      "epoch": 0.947310038824182,
+      "grad_norm": 0.24402830004692078,
+      "learning_rate": 0.00011339025987129032,
+      "loss": 0.349,
+      "step": 1708
+    },
+    {
+      "epoch": 0.9484193011647255,
+      "grad_norm": 0.2694087624549866,
+      "learning_rate": 0.0001132121828182738,
+      "loss": 0.4212,
+      "step": 1710
+    },
+    {
+      "epoch": 0.949528563505269,
+      "grad_norm": 0.24677637219429016,
+      "learning_rate": 0.00011303406311291065,
+      "loss": 0.4076,
+      "step": 1712
+    },
+    {
+      "epoch": 0.9506378258458126,
+      "grad_norm": 0.23484551906585693,
+      "learning_rate": 0.00011285590133021741,
+      "loss": 0.3533,
+      "step": 1714
+    },
+    {
+      "epoch": 0.9517470881863561,
+      "grad_norm": 0.2949685752391815,
+      "learning_rate": 0.00011267769804534647,
+      "loss": 0.4117,
+      "step": 1716
+    },
+    {
+      "epoch": 0.9528563505268997,
+      "grad_norm": 0.9004955887794495,
+      "learning_rate": 0.00011249945383358414,
+      "loss": 0.4805,
+      "step": 1718
+    },
+    {
+      "epoch": 0.9539656128674432,
+      "grad_norm": 0.33412787318229675,
+      "learning_rate": 0.00011232116927034893,
+      "loss": 0.5482,
+      "step": 1720
+    },
+    {
+      "epoch": 0.9550748752079867,
+      "grad_norm": 0.24728718400001526,
+      "learning_rate": 0.00011214284493118948,
+      "loss": 0.329,
+      "step": 1722
+    },
+    {
+      "epoch": 0.9561841375485303,
+      "grad_norm": 0.3215670883655548,
+      "learning_rate": 0.00011196448139178298,
+      "loss": 0.4933,
+      "step": 1724
+    },
+    {
+      "epoch": 0.9572933998890738,
+      "grad_norm": 0.21227394044399261,
+      "learning_rate": 0.00011178607922793307,
+      "loss": 0.3171,
+      "step": 1726
+    },
+    {
+      "epoch": 0.9584026622296173,
+      "grad_norm": 0.22638051211833954,
+      "learning_rate": 0.0001116076390155682,
+      "loss": 0.4248,
+      "step": 1728
     }
   ],
   "logging_steps": 2,
       "attributes": {}
     }
   },
+  "total_flos": 4209116939550720.0,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null