jasong03 commited on
Commit
1be12f3
·
verified ·
1 Parent(s): b344083

Training in progress, step 1280, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c6fdfd04882e54d8c85e4fd1121c9b495bdc478d046c30727f9191fb9a7b6ffd
3
  size 891644712
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c19f4b8400a3385551bd98cf95b6e3f1d64dcf7f82712e395c277f316efdb089
3
  size 891644712
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ba9d4b5e87bde4baa1fbc823936a6569f330e0bb67a3c91ff29b3ed40dbaee5c
3
  size 1783444794
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84230522d2dcedf2de4d8922e543f648e253a14cc0db4e635e673f12800b1231
3
  size 1783444794
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5124869157a4455cd00862d9aaae81127fd59ae804ab56be5a3fc4646cbf1edb
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e862bee55033739b9ae895cccb1fea0613d44a4ebc98463c3105553aed127ff
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1620b49b414a5d168f9b30e31d438f72d39f8987ab280bb69dc2bb863354e75d
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9822aef898957bd458dc9360bb1e3058b7e31c090ed4f3ed492670e2394dfa96
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.6389351081530782,
5
  "eval_steps": 500,
6
- "global_step": 1152,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -4039,6 +4039,454 @@
4039
  "learning_rate": 0.00015922098532995083,
4040
  "loss": 0.5526,
4041
  "step": 1152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4042
  }
4043
  ],
4044
  "logging_steps": 2,
@@ -4058,7 +4506,7 @@
4058
  "attributes": {}
4059
  }
4060
  },
4061
- "total_flos": 2806077959700480.0,
4062
  "train_batch_size": 8,
4063
  "trial_name": null,
4064
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.7099278979478647,
5
  "eval_steps": 500,
6
+ "global_step": 1280,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
4039
  "learning_rate": 0.00015922098532995083,
4040
  "loss": 0.5526,
4041
  "step": 1152
4042
+ },
4043
+ {
4044
+ "epoch": 0.6400443704936217,
4045
+ "grad_norm": 0.2813306152820587,
4046
+ "learning_rate": 0.00015907611175727443,
4047
+ "loss": 0.4691,
4048
+ "step": 1154
4049
+ },
4050
+ {
4051
+ "epoch": 0.6411536328341653,
4052
+ "grad_norm": 0.27971866726875305,
4053
+ "learning_rate": 0.00015893104747161525,
4054
+ "loss": 0.4229,
4055
+ "step": 1156
4056
+ },
4057
+ {
4058
+ "epoch": 0.6422628951747088,
4059
+ "grad_norm": 0.2833665907382965,
4060
+ "learning_rate": 0.00015878579294127833,
4061
+ "loss": 0.3703,
4062
+ "step": 1158
4063
+ },
4064
+ {
4065
+ "epoch": 0.6433721575152523,
4066
+ "grad_norm": 0.2487824559211731,
4067
+ "learning_rate": 0.00015864034863518294,
4068
+ "loss": 0.4713,
4069
+ "step": 1160
4070
+ },
4071
+ {
4072
+ "epoch": 0.6444814198557959,
4073
+ "grad_norm": 0.2890799641609192,
4074
+ "learning_rate": 0.0001584947150228609,
4075
+ "loss": 0.4088,
4076
+ "step": 1162
4077
+ },
4078
+ {
4079
+ "epoch": 0.6455906821963394,
4080
+ "grad_norm": 0.38614341616630554,
4081
+ "learning_rate": 0.00015834889257445526,
4082
+ "loss": 0.6233,
4083
+ "step": 1164
4084
+ },
4085
+ {
4086
+ "epoch": 0.646699944536883,
4087
+ "grad_norm": 0.32349148392677307,
4088
+ "learning_rate": 0.00015820288176071861,
4089
+ "loss": 0.4304,
4090
+ "step": 1166
4091
+ },
4092
+ {
4093
+ "epoch": 0.6478092068774265,
4094
+ "grad_norm": 0.3689476549625397,
4095
+ "learning_rate": 0.0001580566830530117,
4096
+ "loss": 0.4791,
4097
+ "step": 1168
4098
+ },
4099
+ {
4100
+ "epoch": 0.64891846921797,
4101
+ "grad_norm": 0.33342450857162476,
4102
+ "learning_rate": 0.00015791029692330174,
4103
+ "loss": 0.5544,
4104
+ "step": 1170
4105
+ },
4106
+ {
4107
+ "epoch": 0.6500277315585136,
4108
+ "grad_norm": 0.27900078892707825,
4109
+ "learning_rate": 0.00015776372384416107,
4110
+ "loss": 0.3984,
4111
+ "step": 1172
4112
+ },
4113
+ {
4114
+ "epoch": 0.6511369938990571,
4115
+ "grad_norm": 0.2558038532733917,
4116
+ "learning_rate": 0.00015761696428876558,
4117
+ "loss": 0.3949,
4118
+ "step": 1174
4119
+ },
4120
+ {
4121
+ "epoch": 0.6522462562396006,
4122
+ "grad_norm": 0.3070181906223297,
4123
+ "learning_rate": 0.00015747001873089305,
4124
+ "loss": 0.4118,
4125
+ "step": 1176
4126
+ },
4127
+ {
4128
+ "epoch": 0.6533555185801442,
4129
+ "grad_norm": 0.3025922477245331,
4130
+ "learning_rate": 0.00015732288764492184,
4131
+ "loss": 0.4185,
4132
+ "step": 1178
4133
+ },
4134
+ {
4135
+ "epoch": 0.6544647809206877,
4136
+ "grad_norm": 0.2864340543746948,
4137
+ "learning_rate": 0.0001571755715058292,
4138
+ "loss": 0.4129,
4139
+ "step": 1180
4140
+ },
4141
+ {
4142
+ "epoch": 0.6555740432612313,
4143
+ "grad_norm": 0.30463624000549316,
4144
+ "learning_rate": 0.00015702807078918967,
4145
+ "loss": 0.3837,
4146
+ "step": 1182
4147
+ },
4148
+ {
4149
+ "epoch": 0.6566833056017748,
4150
+ "grad_norm": 0.36540600657463074,
4151
+ "learning_rate": 0.0001568803859711738,
4152
+ "loss": 0.4379,
4153
+ "step": 1184
4154
+ },
4155
+ {
4156
+ "epoch": 0.6577925679423183,
4157
+ "grad_norm": 0.30166247487068176,
4158
+ "learning_rate": 0.00015673251752854644,
4159
+ "loss": 0.4748,
4160
+ "step": 1186
4161
+ },
4162
+ {
4163
+ "epoch": 0.6589018302828619,
4164
+ "grad_norm": 0.2592626214027405,
4165
+ "learning_rate": 0.00015658446593866518,
4166
+ "loss": 0.4211,
4167
+ "step": 1188
4168
+ },
4169
+ {
4170
+ "epoch": 0.6600110926234054,
4171
+ "grad_norm": 0.3025457262992859,
4172
+ "learning_rate": 0.0001564362316794789,
4173
+ "loss": 0.4014,
4174
+ "step": 1190
4175
+ },
4176
+ {
4177
+ "epoch": 0.6611203549639489,
4178
+ "grad_norm": 0.3164603114128113,
4179
+ "learning_rate": 0.00015628781522952613,
4180
+ "loss": 0.3479,
4181
+ "step": 1192
4182
+ },
4183
+ {
4184
+ "epoch": 0.6622296173044925,
4185
+ "grad_norm": 0.3951297700405121,
4186
+ "learning_rate": 0.00015613921706793363,
4187
+ "loss": 0.5119,
4188
+ "step": 1194
4189
+ },
4190
+ {
4191
+ "epoch": 0.663338879645036,
4192
+ "grad_norm": 0.3717001974582672,
4193
+ "learning_rate": 0.00015599043767441473,
4194
+ "loss": 0.6534,
4195
+ "step": 1196
4196
+ },
4197
+ {
4198
+ "epoch": 0.6644481419855796,
4199
+ "grad_norm": 0.3157028555870056,
4200
+ "learning_rate": 0.0001558414775292678,
4201
+ "loss": 0.4616,
4202
+ "step": 1198
4203
+ },
4204
+ {
4205
+ "epoch": 0.6655574043261231,
4206
+ "grad_norm": 0.3201621472835541,
4207
+ "learning_rate": 0.00015569233711337476,
4208
+ "loss": 0.5525,
4209
+ "step": 1200
4210
+ },
4211
+ {
4212
+ "epoch": 0.6666666666666666,
4213
+ "grad_norm": 0.2802707254886627,
4214
+ "learning_rate": 0.00015554301690819952,
4215
+ "loss": 0.371,
4216
+ "step": 1202
4217
+ },
4218
+ {
4219
+ "epoch": 0.6677759290072102,
4220
+ "grad_norm": 0.2569611668586731,
4221
+ "learning_rate": 0.00015539351739578632,
4222
+ "loss": 0.4024,
4223
+ "step": 1204
4224
+ },
4225
+ {
4226
+ "epoch": 0.6688851913477537,
4227
+ "grad_norm": 0.20614203810691833,
4228
+ "learning_rate": 0.0001552438390587583,
4229
+ "loss": 0.4023,
4230
+ "step": 1206
4231
+ },
4232
+ {
4233
+ "epoch": 0.6699944536882972,
4234
+ "grad_norm": 0.28919216990470886,
4235
+ "learning_rate": 0.00015509398238031588,
4236
+ "loss": 0.5277,
4237
+ "step": 1208
4238
+ },
4239
+ {
4240
+ "epoch": 0.6711037160288408,
4241
+ "grad_norm": 0.3089144825935364,
4242
+ "learning_rate": 0.00015494394784423525,
4243
+ "loss": 0.4952,
4244
+ "step": 1210
4245
+ },
4246
+ {
4247
+ "epoch": 0.6722129783693843,
4248
+ "grad_norm": 0.3368174433708191,
4249
+ "learning_rate": 0.00015479373593486667,
4250
+ "loss": 0.5854,
4251
+ "step": 1212
4252
+ },
4253
+ {
4254
+ "epoch": 0.6733222407099279,
4255
+ "grad_norm": 0.45089298486709595,
4256
+ "learning_rate": 0.0001546433471371331,
4257
+ "loss": 0.4141,
4258
+ "step": 1214
4259
+ },
4260
+ {
4261
+ "epoch": 0.6744315030504714,
4262
+ "grad_norm": 0.3095158040523529,
4263
+ "learning_rate": 0.00015449278193652854,
4264
+ "loss": 0.4969,
4265
+ "step": 1216
4266
+ },
4267
+ {
4268
+ "epoch": 0.6755407653910149,
4269
+ "grad_norm": 0.36336401104927063,
4270
+ "learning_rate": 0.00015434204081911642,
4271
+ "loss": 0.4679,
4272
+ "step": 1218
4273
+ },
4274
+ {
4275
+ "epoch": 0.6766500277315585,
4276
+ "grad_norm": 0.3150896430015564,
4277
+ "learning_rate": 0.00015419112427152807,
4278
+ "loss": 0.5038,
4279
+ "step": 1220
4280
+ },
4281
+ {
4282
+ "epoch": 0.677759290072102,
4283
+ "grad_norm": 0.29561394453048706,
4284
+ "learning_rate": 0.0001540400327809612,
4285
+ "loss": 0.4697,
4286
+ "step": 1222
4287
+ },
4288
+ {
4289
+ "epoch": 0.6788685524126455,
4290
+ "grad_norm": 0.2975095212459564,
4291
+ "learning_rate": 0.00015388876683517826,
4292
+ "loss": 0.5544,
4293
+ "step": 1224
4294
+ },
4295
+ {
4296
+ "epoch": 0.6799778147531891,
4297
+ "grad_norm": 0.2124488651752472,
4298
+ "learning_rate": 0.00015373732692250486,
4299
+ "loss": 0.3321,
4300
+ "step": 1226
4301
+ },
4302
+ {
4303
+ "epoch": 0.6810870770937326,
4304
+ "grad_norm": 0.37978988885879517,
4305
+ "learning_rate": 0.00015358571353182824,
4306
+ "loss": 0.5268,
4307
+ "step": 1228
4308
+ },
4309
+ {
4310
+ "epoch": 0.6821963394342762,
4311
+ "grad_norm": 0.32258403301239014,
4312
+ "learning_rate": 0.0001534339271525957,
4313
+ "loss": 0.4983,
4314
+ "step": 1230
4315
+ },
4316
+ {
4317
+ "epoch": 0.6833056017748197,
4318
+ "grad_norm": 0.3182342052459717,
4319
+ "learning_rate": 0.00015328196827481302,
4320
+ "loss": 0.4742,
4321
+ "step": 1232
4322
+ },
4323
+ {
4324
+ "epoch": 0.6844148641153632,
4325
+ "grad_norm": 0.26553046703338623,
4326
+ "learning_rate": 0.0001531298373890427,
4327
+ "loss": 0.4627,
4328
+ "step": 1234
4329
+ },
4330
+ {
4331
+ "epoch": 0.6855241264559068,
4332
+ "grad_norm": 0.3853413164615631,
4333
+ "learning_rate": 0.0001529775349864026,
4334
+ "loss": 0.5493,
4335
+ "step": 1236
4336
+ },
4337
+ {
4338
+ "epoch": 0.6866333887964503,
4339
+ "grad_norm": 0.26181191205978394,
4340
+ "learning_rate": 0.0001528250615585644,
4341
+ "loss": 0.4308,
4342
+ "step": 1238
4343
+ },
4344
+ {
4345
+ "epoch": 0.687742651136994,
4346
+ "grad_norm": 0.29632505774497986,
4347
+ "learning_rate": 0.0001526724175977518,
4348
+ "loss": 0.3972,
4349
+ "step": 1240
4350
+ },
4351
+ {
4352
+ "epoch": 0.6888519134775375,
4353
+ "grad_norm": 0.21180076897144318,
4354
+ "learning_rate": 0.000152519603596739,
4355
+ "loss": 0.2871,
4356
+ "step": 1242
4357
+ },
4358
+ {
4359
+ "epoch": 0.689961175818081,
4360
+ "grad_norm": 0.3028866946697235,
4361
+ "learning_rate": 0.00015236662004884912,
4362
+ "loss": 0.5045,
4363
+ "step": 1244
4364
+ },
4365
+ {
4366
+ "epoch": 0.6910704381586246,
4367
+ "grad_norm": 0.22230632603168488,
4368
+ "learning_rate": 0.0001522134674479527,
4369
+ "loss": 0.3913,
4370
+ "step": 1246
4371
+ },
4372
+ {
4373
+ "epoch": 0.6921797004991681,
4374
+ "grad_norm": 0.3115563690662384,
4375
+ "learning_rate": 0.00015206014628846594,
4376
+ "loss": 0.4612,
4377
+ "step": 1248
4378
+ },
4379
+ {
4380
+ "epoch": 0.6932889628397116,
4381
+ "grad_norm": 0.30068281292915344,
4382
+ "learning_rate": 0.00015190665706534925,
4383
+ "loss": 0.4224,
4384
+ "step": 1250
4385
+ },
4386
+ {
4387
+ "epoch": 0.6943982251802552,
4388
+ "grad_norm": 0.3180410861968994,
4389
+ "learning_rate": 0.00015175300027410566,
4390
+ "loss": 0.4094,
4391
+ "step": 1252
4392
+ },
4393
+ {
4394
+ "epoch": 0.6955074875207987,
4395
+ "grad_norm": 0.3450130224227905,
4396
+ "learning_rate": 0.00015159917641077895,
4397
+ "loss": 0.5635,
4398
+ "step": 1254
4399
+ },
4400
+ {
4401
+ "epoch": 0.6966167498613423,
4402
+ "grad_norm": 0.27590128779411316,
4403
+ "learning_rate": 0.00015144518597195243,
4404
+ "loss": 0.4893,
4405
+ "step": 1256
4406
+ },
4407
+ {
4408
+ "epoch": 0.6977260122018858,
4409
+ "grad_norm": 0.25257548689842224,
4410
+ "learning_rate": 0.0001512910294547471,
4411
+ "loss": 0.4252,
4412
+ "step": 1258
4413
+ },
4414
+ {
4415
+ "epoch": 0.6988352745424293,
4416
+ "grad_norm": 0.3455219864845276,
4417
+ "learning_rate": 0.00015113670735682013,
4418
+ "loss": 0.5274,
4419
+ "step": 1260
4420
+ },
4421
+ {
4422
+ "epoch": 0.6999445368829729,
4423
+ "grad_norm": 0.3079371750354767,
4424
+ "learning_rate": 0.0001509822201763632,
4425
+ "loss": 0.3667,
4426
+ "step": 1262
4427
+ },
4428
+ {
4429
+ "epoch": 0.7010537992235164,
4430
+ "grad_norm": 0.27760395407676697,
4431
+ "learning_rate": 0.00015082756841210086,
4432
+ "loss": 0.4693,
4433
+ "step": 1264
4434
+ },
4435
+ {
4436
+ "epoch": 0.7021630615640599,
4437
+ "grad_norm": 0.3691544830799103,
4438
+ "learning_rate": 0.0001506727525632891,
4439
+ "loss": 0.4975,
4440
+ "step": 1266
4441
+ },
4442
+ {
4443
+ "epoch": 0.7032723239046035,
4444
+ "grad_norm": 0.30312782526016235,
4445
+ "learning_rate": 0.00015051777312971357,
4446
+ "loss": 0.4377,
4447
+ "step": 1268
4448
+ },
4449
+ {
4450
+ "epoch": 0.704381586245147,
4451
+ "grad_norm": 0.31264883279800415,
4452
+ "learning_rate": 0.00015036263061168797,
4453
+ "loss": 0.3841,
4454
+ "step": 1270
4455
+ },
4456
+ {
4457
+ "epoch": 0.7054908485856906,
4458
+ "grad_norm": 0.27693864703178406,
4459
+ "learning_rate": 0.0001502073255100525,
4460
+ "loss": 0.4347,
4461
+ "step": 1272
4462
+ },
4463
+ {
4464
+ "epoch": 0.7066001109262341,
4465
+ "grad_norm": 0.3322737514972687,
4466
+ "learning_rate": 0.0001500518583261723,
4467
+ "loss": 0.4424,
4468
+ "step": 1274
4469
+ },
4470
+ {
4471
+ "epoch": 0.7077093732667776,
4472
+ "grad_norm": 0.2735716998577118,
4473
+ "learning_rate": 0.0001498962295619356,
4474
+ "loss": 0.3577,
4475
+ "step": 1276
4476
+ },
4477
+ {
4478
+ "epoch": 0.7088186356073212,
4479
+ "grad_norm": 0.2565724849700928,
4480
+ "learning_rate": 0.00014974043971975243,
4481
+ "loss": 0.4086,
4482
+ "step": 1278
4483
+ },
4484
+ {
4485
+ "epoch": 0.7099278979478647,
4486
+ "grad_norm": 0.4010816216468811,
4487
+ "learning_rate": 0.00014958448930255265,
4488
+ "loss": 0.5353,
4489
+ "step": 1280
4490
  }
4491
  ],
4492
  "logging_steps": 2,
 
4506
  "attributes": {}
4507
  }
4508
  },
4509
+ "total_flos": 3117864399667200.0,
4510
  "train_batch_size": 8,
4511
  "trial_name": null,
4512
  "trial_params": null