jasong03 commited on
Commit
5c232d5
·
verified ·
1 Parent(s): 248f30c

Training in progress, step 1728, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:77e67eae521c3cba24d0ae9eb0d44447bec89794066751cc7d707f2fdc5c29bf
3
  size 891644712
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d157e7286a6dfc4ba5aacde53caecbbb37a38b4599aefda7a1cea8fa2b162ea
3
  size 891644712
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:42e0c34297a9f3d72c5022f0b3dc2e519698ad0189be335ae6d8648524d74bb2
3
  size 1783444794
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96ad3332c7a20eba91ce157eec436b4cce08960829f029b3ab7446b861d4f3da
3
  size 1783444794
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8dee4a2b51470c2e565b08aae8a4e5156e5c34e8c236adf6153eeb283fa560ec
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edb897ff7c5edb0daaf53db6c8527f7da45dd70041d3faeafb18eb1d69b53ca7
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1affb368014b9c4895e09d750783801a20ec7c8f622ab5a02e1bce055904fb15
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c92509254bae5d36eb0a64d8994d4ff09c04cd0249ac08027b0f40c16d4a82bf
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.8874098724348308,
5
  "eval_steps": 500,
6
- "global_step": 1600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -5607,6 +5607,454 @@
5607
  "learning_rate": 0.000122927204684282,
5608
  "loss": 0.3386,
5609
  "step": 1600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5610
  }
5611
  ],
5612
  "logging_steps": 2,
@@ -5626,7 +6074,7 @@
5626
  "attributes": {}
5627
  }
5628
  },
5629
- "total_flos": 3897330499584000.0,
5630
  "train_batch_size": 8,
5631
  "trial_name": null,
5632
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9584026622296173,
5
  "eval_steps": 500,
6
+ "global_step": 1728,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
5607
  "learning_rate": 0.000122927204684282,
5608
  "loss": 0.3386,
5609
  "step": 1600
5610
+ },
5611
+ {
5612
+ "epoch": 0.8885191347753744,
5613
+ "grad_norm": 0.3254069983959198,
5614
+ "learning_rate": 0.00012275228026094881,
5615
+ "loss": 0.5074,
5616
+ "step": 1602
5617
+ },
5618
+ {
5619
+ "epoch": 0.8896283971159179,
5620
+ "grad_norm": 0.37417536973953247,
5621
+ "learning_rate": 0.00012257728238736467,
5622
+ "loss": 0.5318,
5623
+ "step": 1604
5624
+ },
5625
+ {
5626
+ "epoch": 0.8907376594564614,
5627
+ "grad_norm": 0.35727638006210327,
5628
+ "learning_rate": 0.000122402211628468,
5629
+ "loss": 0.4729,
5630
+ "step": 1606
5631
+ },
5632
+ {
5633
+ "epoch": 0.891846921797005,
5634
+ "grad_norm": 0.26733312010765076,
5635
+ "learning_rate": 0.00012222706854943255,
5636
+ "loss": 0.421,
5637
+ "step": 1608
5638
+ },
5639
+ {
5640
+ "epoch": 0.8929561841375485,
5641
+ "grad_norm": 0.22087961435317993,
5642
+ "learning_rate": 0.00012205185371566554,
5643
+ "loss": 0.3354,
5644
+ "step": 1610
5645
+ },
5646
+ {
5647
+ "epoch": 0.894065446478092,
5648
+ "grad_norm": 0.4256139099597931,
5649
+ "learning_rate": 0.00012187656769280578,
5650
+ "loss": 0.4,
5651
+ "step": 1612
5652
+ },
5653
+ {
5654
+ "epoch": 0.8951747088186356,
5655
+ "grad_norm": 0.2818162441253662,
5656
+ "learning_rate": 0.00012170121104672196,
5657
+ "loss": 0.4098,
5658
+ "step": 1614
5659
+ },
5660
+ {
5661
+ "epoch": 0.8962839711591791,
5662
+ "grad_norm": 0.2936331331729889,
5663
+ "learning_rate": 0.00012152578434351071,
5664
+ "loss": 0.436,
5665
+ "step": 1616
5666
+ },
5667
+ {
5668
+ "epoch": 0.8973932334997227,
5669
+ "grad_norm": 0.2814910113811493,
5670
+ "learning_rate": 0.00012135028814949487,
5671
+ "loss": 0.4096,
5672
+ "step": 1618
5673
+ },
5674
+ {
5675
+ "epoch": 0.8985024958402662,
5676
+ "grad_norm": 0.3062569797039032,
5677
+ "learning_rate": 0.00012117472303122157,
5678
+ "loss": 0.4595,
5679
+ "step": 1620
5680
+ },
5681
+ {
5682
+ "epoch": 0.8996117581808097,
5683
+ "grad_norm": 0.3199828565120697,
5684
+ "learning_rate": 0.00012099908955546044,
5685
+ "loss": 0.4696,
5686
+ "step": 1622
5687
+ },
5688
+ {
5689
+ "epoch": 0.9007210205213533,
5690
+ "grad_norm": 0.35935017466545105,
5691
+ "learning_rate": 0.00012082338828920185,
5692
+ "loss": 0.5822,
5693
+ "step": 1624
5694
+ },
5695
+ {
5696
+ "epoch": 0.9018302828618968,
5697
+ "grad_norm": 0.2030808925628662,
5698
+ "learning_rate": 0.00012064761979965497,
5699
+ "loss": 0.3524,
5700
+ "step": 1626
5701
+ },
5702
+ {
5703
+ "epoch": 0.9029395452024404,
5704
+ "grad_norm": 0.29535773396492004,
5705
+ "learning_rate": 0.00012047178465424596,
5706
+ "loss": 0.3698,
5707
+ "step": 1628
5708
+ },
5709
+ {
5710
+ "epoch": 0.9040488075429839,
5711
+ "grad_norm": 0.26572179794311523,
5712
+ "learning_rate": 0.00012029588342061621,
5713
+ "loss": 0.3789,
5714
+ "step": 1630
5715
+ },
5716
+ {
5717
+ "epoch": 0.9051580698835274,
5718
+ "grad_norm": 0.4271789491176605,
5719
+ "learning_rate": 0.00012011991666662044,
5720
+ "loss": 0.5669,
5721
+ "step": 1632
5722
+ },
5723
+ {
5724
+ "epoch": 0.906267332224071,
5725
+ "grad_norm": 0.35716575384140015,
5726
+ "learning_rate": 0.00011994388496032487,
5727
+ "loss": 0.4521,
5728
+ "step": 1634
5729
+ },
5730
+ {
5731
+ "epoch": 0.9073765945646145,
5732
+ "grad_norm": 0.2956486642360687,
5733
+ "learning_rate": 0.00011976778887000543,
5734
+ "loss": 0.3755,
5735
+ "step": 1636
5736
+ },
5737
+ {
5738
+ "epoch": 0.908485856905158,
5739
+ "grad_norm": 0.3578818738460541,
5740
+ "learning_rate": 0.0001195916289641459,
5741
+ "loss": 0.4935,
5742
+ "step": 1638
5743
+ },
5744
+ {
5745
+ "epoch": 0.9095951192457016,
5746
+ "grad_norm": 0.3232196867465973,
5747
+ "learning_rate": 0.00011941540581143608,
5748
+ "loss": 0.4826,
5749
+ "step": 1640
5750
+ },
5751
+ {
5752
+ "epoch": 0.9107043815862451,
5753
+ "grad_norm": 0.2944696247577667,
5754
+ "learning_rate": 0.00011923911998076988,
5755
+ "loss": 0.3827,
5756
+ "step": 1642
5757
+ },
5758
+ {
5759
+ "epoch": 0.9118136439267887,
5760
+ "grad_norm": 0.27748194336891174,
5761
+ "learning_rate": 0.00011906277204124363,
5762
+ "loss": 0.5143,
5763
+ "step": 1644
5764
+ },
5765
+ {
5766
+ "epoch": 0.9129229062673322,
5767
+ "grad_norm": 0.2819176912307739,
5768
+ "learning_rate": 0.00011888636256215413,
5769
+ "loss": 0.4159,
5770
+ "step": 1646
5771
+ },
5772
+ {
5773
+ "epoch": 0.9140321686078757,
5774
+ "grad_norm": 0.3712371289730072,
5775
+ "learning_rate": 0.00011870989211299686,
5776
+ "loss": 0.5419,
5777
+ "step": 1648
5778
+ },
5779
+ {
5780
+ "epoch": 0.9151414309484193,
5781
+ "grad_norm": 0.2566871643066406,
5782
+ "learning_rate": 0.00011853336126346406,
5783
+ "loss": 0.4926,
5784
+ "step": 1650
5785
+ },
5786
+ {
5787
+ "epoch": 0.9162506932889628,
5788
+ "grad_norm": 0.38075196743011475,
5789
+ "learning_rate": 0.0001183567705834431,
5790
+ "loss": 0.4784,
5791
+ "step": 1652
5792
+ },
5793
+ {
5794
+ "epoch": 0.9173599556295063,
5795
+ "grad_norm": 0.30149558186531067,
5796
+ "learning_rate": 0.00011818012064301433,
5797
+ "loss": 0.3791,
5798
+ "step": 1654
5799
+ },
5800
+ {
5801
+ "epoch": 0.9184692179700499,
5802
+ "grad_norm": 0.37024736404418945,
5803
+ "learning_rate": 0.00011800341201244954,
5804
+ "loss": 0.4495,
5805
+ "step": 1656
5806
+ },
5807
+ {
5808
+ "epoch": 0.9195784803105934,
5809
+ "grad_norm": 0.30697816610336304,
5810
+ "learning_rate": 0.00011782664526220992,
5811
+ "loss": 0.385,
5812
+ "step": 1658
5813
+ },
5814
+ {
5815
+ "epoch": 0.920687742651137,
5816
+ "grad_norm": 0.48160433769226074,
5817
+ "learning_rate": 0.00011764982096294432,
5818
+ "loss": 0.3435,
5819
+ "step": 1660
5820
+ },
5821
+ {
5822
+ "epoch": 0.9217970049916805,
5823
+ "grad_norm": 0.3319704234600067,
5824
+ "learning_rate": 0.00011747293968548734,
5825
+ "loss": 0.4893,
5826
+ "step": 1662
5827
+ },
5828
+ {
5829
+ "epoch": 0.922906267332224,
5830
+ "grad_norm": 0.2384938895702362,
5831
+ "learning_rate": 0.00011729600200085752,
5832
+ "loss": 0.4826,
5833
+ "step": 1664
5834
+ },
5835
+ {
5836
+ "epoch": 0.9240155296727676,
5837
+ "grad_norm": 0.2830295264720917,
5838
+ "learning_rate": 0.00011711900848025555,
5839
+ "loss": 0.5185,
5840
+ "step": 1666
5841
+ },
5842
+ {
5843
+ "epoch": 0.9251247920133111,
5844
+ "grad_norm": 0.3078785836696625,
5845
+ "learning_rate": 0.0001169419596950623,
5846
+ "loss": 0.6303,
5847
+ "step": 1668
5848
+ },
5849
+ {
5850
+ "epoch": 0.9262340543538546,
5851
+ "grad_norm": 0.2661837637424469,
5852
+ "learning_rate": 0.00011676485621683713,
5853
+ "loss": 0.4059,
5854
+ "step": 1670
5855
+ },
5856
+ {
5857
+ "epoch": 0.9273433166943982,
5858
+ "grad_norm": 0.2619323134422302,
5859
+ "learning_rate": 0.00011658769861731584,
5860
+ "loss": 0.3383,
5861
+ "step": 1672
5862
+ },
5863
+ {
5864
+ "epoch": 0.9284525790349417,
5865
+ "grad_norm": 0.3200634717941284,
5866
+ "learning_rate": 0.00011641048746840912,
5867
+ "loss": 0.42,
5868
+ "step": 1674
5869
+ },
5870
+ {
5871
+ "epoch": 0.9295618413754853,
5872
+ "grad_norm": 0.29915183782577515,
5873
+ "learning_rate": 0.00011623322334220038,
5874
+ "loss": 0.4156,
5875
+ "step": 1676
5876
+ },
5877
+ {
5878
+ "epoch": 0.9306711037160288,
5879
+ "grad_norm": 0.36972782015800476,
5880
+ "learning_rate": 0.0001160559068109441,
5881
+ "loss": 0.3698,
5882
+ "step": 1678
5883
+ },
5884
+ {
5885
+ "epoch": 0.9317803660565723,
5886
+ "grad_norm": 0.24870358407497406,
5887
+ "learning_rate": 0.00011587853844706397,
5888
+ "loss": 0.4126,
5889
+ "step": 1680
5890
+ },
5891
+ {
5892
+ "epoch": 0.9328896283971159,
5893
+ "grad_norm": 0.2665979862213135,
5894
+ "learning_rate": 0.000115701118823151,
5895
+ "loss": 0.4097,
5896
+ "step": 1682
5897
+ },
5898
+ {
5899
+ "epoch": 0.9339988907376594,
5900
+ "grad_norm": 0.34804749488830566,
5901
+ "learning_rate": 0.00011552364851196167,
5902
+ "loss": 0.3956,
5903
+ "step": 1684
5904
+ },
5905
+ {
5906
+ "epoch": 0.9351081530782029,
5907
+ "grad_norm": 0.2750212550163269,
5908
+ "learning_rate": 0.00011534612808641603,
5909
+ "loss": 0.3434,
5910
+ "step": 1686
5911
+ },
5912
+ {
5913
+ "epoch": 0.9362174154187465,
5914
+ "grad_norm": 0.249044269323349,
5915
+ "learning_rate": 0.00011516855811959604,
5916
+ "loss": 0.4786,
5917
+ "step": 1688
5918
+ },
5919
+ {
5920
+ "epoch": 0.93732667775929,
5921
+ "grad_norm": 0.29392093420028687,
5922
+ "learning_rate": 0.00011499093918474348,
5923
+ "loss": 0.3028,
5924
+ "step": 1690
5925
+ },
5926
+ {
5927
+ "epoch": 0.9384359400998337,
5928
+ "grad_norm": 0.2747836112976074,
5929
+ "learning_rate": 0.00011481327185525828,
5930
+ "loss": 0.4296,
5931
+ "step": 1692
5932
+ },
5933
+ {
5934
+ "epoch": 0.9395452024403772,
5935
+ "grad_norm": 0.3494579493999481,
5936
+ "learning_rate": 0.00011463555670469657,
5937
+ "loss": 0.3412,
5938
+ "step": 1694
5939
+ },
5940
+ {
5941
+ "epoch": 0.9406544647809207,
5942
+ "grad_norm": 0.28468579053878784,
5943
+ "learning_rate": 0.00011445779430676884,
5944
+ "loss": 0.5185,
5945
+ "step": 1696
5946
+ },
5947
+ {
5948
+ "epoch": 0.9417637271214643,
5949
+ "grad_norm": 0.27110087871551514,
5950
+ "learning_rate": 0.0001142799852353382,
5951
+ "loss": 0.3075,
5952
+ "step": 1698
5953
+ },
5954
+ {
5955
+ "epoch": 0.9428729894620078,
5956
+ "grad_norm": 0.38002222776412964,
5957
+ "learning_rate": 0.00011410213006441827,
5958
+ "loss": 0.6445,
5959
+ "step": 1700
5960
+ },
5961
+ {
5962
+ "epoch": 0.9439822518025514,
5963
+ "grad_norm": 0.27994948625564575,
5964
+ "learning_rate": 0.00011392422936817166,
5965
+ "loss": 0.3741,
5966
+ "step": 1702
5967
+ },
5968
+ {
5969
+ "epoch": 0.9450915141430949,
5970
+ "grad_norm": 0.26837414503097534,
5971
+ "learning_rate": 0.00011374628372090783,
5972
+ "loss": 0.3902,
5973
+ "step": 1704
5974
+ },
5975
+ {
5976
+ "epoch": 0.9462007764836384,
5977
+ "grad_norm": 0.2525213062763214,
5978
+ "learning_rate": 0.00011356829369708146,
5979
+ "loss": 0.397,
5980
+ "step": 1706
5981
+ },
5982
+ {
5983
+ "epoch": 0.947310038824182,
5984
+ "grad_norm": 0.24402830004692078,
5985
+ "learning_rate": 0.00011339025987129032,
5986
+ "loss": 0.349,
5987
+ "step": 1708
5988
+ },
5989
+ {
5990
+ "epoch": 0.9484193011647255,
5991
+ "grad_norm": 0.2694087624549866,
5992
+ "learning_rate": 0.0001132121828182738,
5993
+ "loss": 0.4212,
5994
+ "step": 1710
5995
+ },
5996
+ {
5997
+ "epoch": 0.949528563505269,
5998
+ "grad_norm": 0.24677637219429016,
5999
+ "learning_rate": 0.00011303406311291065,
6000
+ "loss": 0.4076,
6001
+ "step": 1712
6002
+ },
6003
+ {
6004
+ "epoch": 0.9506378258458126,
6005
+ "grad_norm": 0.23484551906585693,
6006
+ "learning_rate": 0.00011285590133021741,
6007
+ "loss": 0.3533,
6008
+ "step": 1714
6009
+ },
6010
+ {
6011
+ "epoch": 0.9517470881863561,
6012
+ "grad_norm": 0.2949685752391815,
6013
+ "learning_rate": 0.00011267769804534647,
6014
+ "loss": 0.4117,
6015
+ "step": 1716
6016
+ },
6017
+ {
6018
+ "epoch": 0.9528563505268997,
6019
+ "grad_norm": 0.9004955887794495,
6020
+ "learning_rate": 0.00011249945383358414,
6021
+ "loss": 0.4805,
6022
+ "step": 1718
6023
+ },
6024
+ {
6025
+ "epoch": 0.9539656128674432,
6026
+ "grad_norm": 0.33412787318229675,
6027
+ "learning_rate": 0.00011232116927034893,
6028
+ "loss": 0.5482,
6029
+ "step": 1720
6030
+ },
6031
+ {
6032
+ "epoch": 0.9550748752079867,
6033
+ "grad_norm": 0.24728718400001526,
6034
+ "learning_rate": 0.00011214284493118948,
6035
+ "loss": 0.329,
6036
+ "step": 1722
6037
+ },
6038
+ {
6039
+ "epoch": 0.9561841375485303,
6040
+ "grad_norm": 0.3215670883655548,
6041
+ "learning_rate": 0.00011196448139178298,
6042
+ "loss": 0.4933,
6043
+ "step": 1724
6044
+ },
6045
+ {
6046
+ "epoch": 0.9572933998890738,
6047
+ "grad_norm": 0.21227394044399261,
6048
+ "learning_rate": 0.00011178607922793307,
6049
+ "loss": 0.3171,
6050
+ "step": 1726
6051
+ },
6052
+ {
6053
+ "epoch": 0.9584026622296173,
6054
+ "grad_norm": 0.22638051211833954,
6055
+ "learning_rate": 0.0001116076390155682,
6056
+ "loss": 0.4248,
6057
+ "step": 1728
6058
  }
6059
  ],
6060
  "logging_steps": 2,
 
6074
  "attributes": {}
6075
  }
6076
  },
6077
+ "total_flos": 4209116939550720.0,
6078
  "train_batch_size": 8,
6079
  "trial_name": null,
6080
  "trial_params": null