li-muyang committed on
Commit
95c82dd
·
verified ·
1 Parent(s): 95eafff

Model save

Browse files
Files changed (5) hide show
  1. README.md +12 -14
  2. all_results.json +5 -18
  3. generation_config.json +3 -3
  4. train_results.json +5 -5
  5. trainer_state.json +539 -1275
README.md CHANGED
@@ -3,7 +3,6 @@ library_name: transformers
3
  tags:
4
  - trl
5
  - dpo
6
- - alignment-handbook
7
  - generated_from_trainer
8
  model-index:
9
  - name: zephyr-7b-dpo-full
@@ -17,15 +16,15 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  This model was trained from scratch on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
- - Logits/chosen: -0.5950
21
- - Logits/rejected: -0.4472
22
- - Logps/chosen: -426.8990
23
- - Logps/rejected: -545.3427
24
- - Loss: 0.5225
25
- - Rewards/accuracies: 0.7734
26
- - Rewards/chosen: -1.5476
27
- - Rewards/margins: 1.1972
28
- - Rewards/rejected: -2.7448
29
 
30
  ## Model description
31
 
@@ -60,10 +59,9 @@ The following hyperparameters were used during training:
60
 
61
  ### Training results
62
 
63
- | Training Loss | Epoch | Step | Logits/chosen | Logits/rejected | Logps/chosen | Logps/rejected | Validation Loss | Rewards/accuracies | Rewards/chosen | Rewards/margins | Rewards/rejected |
64
- |:-------------:|:------:|:----:|:-------------:|:---------------:|:------------:|:--------------:|:---------------:|:------------------:|:--------------:|:---------------:|:----------------:|
65
- | 0.501 | 0.9984 | 477 | -1.2732 | -1.1615 | -346.5501 | -435.7064 | 0.5185 | 0.7773 | -0.7441 | 0.9044 | -1.6484 |
66
- | 0.3187 | 1.9969 | 954 | -0.5950 | -0.4472 | -426.8990 | -545.3427 | 0.5225 | 0.7734 | -1.5476 | 1.1972 | -2.7448 |
67
 
68
 
69
  ### Framework versions
 
3
  tags:
4
  - trl
5
  - dpo
 
6
  - generated_from_trainer
7
  model-index:
8
  - name: zephyr-7b-dpo-full
 
16
 
17
  This model was trained from scratch on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
+ - Loss: 0.5997
20
+ - Rewards/chosen: -0.2062
21
+ - Rewards/rejected: -0.5288
22
+ - Rewards/accuracies: 0.7070
23
+ - Rewards/margins: 0.3226
24
+ - Logps/rejected: -332.8396
25
+ - Logps/chosen: -301.5587
26
+ - Logits/rejected: -1.1529
27
+ - Logits/chosen: -1.2491
28
 
29
  ## Model description
30
 
 
59
 
60
  ### Training results
61
 
62
+ | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
63
+ |:-------------:|:------:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
64
+ | 0.5873 | 0.9984 | 477 | 0.5997 | -0.2062 | -0.5288 | 0.7070 | 0.3226 | -332.8396 | -301.5587 | -1.1529 | -1.2491 |
 
65
 
66
 
67
  ### Framework versions
all_results.json CHANGED
@@ -1,22 +1,9 @@
1
  {
2
- "epoch": 1.9968602825745683,
3
- "eval_logits/chosen": -0.5949550271034241,
4
- "eval_logits/rejected": -0.44720327854156494,
5
- "eval_logps/chosen": -426.89898681640625,
6
- "eval_logps/rejected": -545.3427124023438,
7
- "eval_loss": 1.1966235637664795,
8
- "eval_rewards/accuracies": 0.7734375,
9
- "eval_rewards/chosen": -7.737778663635254,
10
- "eval_rewards/margins": 5.986159324645996,
11
- "eval_rewards/rejected": -13.723937034606934,
12
- "eval_runtime": 182.4829,
13
- "eval_samples": 2000,
14
- "eval_samples_per_second": 10.96,
15
- "eval_steps_per_second": 0.175,
16
  "total_flos": 0.0,
17
- "train_loss": 0.0,
18
- "train_runtime": 3.4239,
19
  "train_samples": 61134,
20
- "train_samples_per_second": 17855.176,
21
- "train_steps_per_second": 139.316
22
  }
 
1
  {
2
+ "epoch": 0.9984301412872841,
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  "total_flos": 0.0,
4
+ "train_loss": 0.6286907725863986,
5
+ "train_runtime": 7498.289,
6
  "train_samples": 61134,
7
+ "train_samples_per_second": 8.153,
8
+ "train_steps_per_second": 0.064
9
  }
generation_config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "_from_model_config": true,
3
- "bos_token_id": 1,
4
- "eos_token_id": 2,
5
  "transformers_version": "4.45.2"
6
  }
 
1
  {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": 151643,
4
+ "max_new_tokens": 2048,
5
  "transformers_version": "4.45.2"
6
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 1.9968602825745683,
3
  "total_flos": 0.0,
4
- "train_loss": 0.0,
5
- "train_runtime": 3.4239,
6
  "train_samples": 61134,
7
- "train_samples_per_second": 17855.176,
8
- "train_steps_per_second": 139.316
9
  }
 
1
  {
2
+ "epoch": 0.9984301412872841,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.6286907725863986,
5
+ "train_runtime": 7498.289,
6
  "train_samples": 61134,
7
+ "train_samples_per_second": 8.153,
8
+ "train_steps_per_second": 0.064
9
  }
trainer_state.json CHANGED
@@ -1,21 +1,21 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.9968602825745683,
5
  "eval_steps": 500,
6
- "global_step": 954,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0020931449502878076,
13
- "grad_norm": 14.88607346284462,
14
- "learning_rate": 5.208333333333333e-09,
15
- "logits/chosen": -2.925722122192383,
16
- "logits/rejected": -2.8885936737060547,
17
- "logps/chosen": -321.0921630859375,
18
- "logps/rejected": -365.8306884765625,
19
  "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
@@ -25,1469 +25,733 @@
25
  },
26
  {
27
  "epoch": 0.020931449502878074,
28
- "grad_norm": 10.30635291782621,
29
- "learning_rate": 5.208333333333333e-08,
30
- "logits/chosen": -2.7202770709991455,
31
- "logits/rejected": -2.695319175720215,
32
- "logps/chosen": -321.58056640625,
33
- "logps/rejected": -289.4584045410156,
34
- "loss": 0.6931,
35
- "rewards/accuracies": 0.4513888955116272,
36
- "rewards/chosen": 0.00041189632611349225,
37
- "rewards/margins": -6.186102837091312e-05,
38
- "rewards/rejected": 0.00047375739086419344,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.04186289900575615,
43
- "grad_norm": 11.851618937043359,
44
- "learning_rate": 1.0416666666666667e-07,
45
- "logits/chosen": -2.7979576587677,
46
- "logits/rejected": -2.743467092514038,
47
- "logps/chosen": -317.7387390136719,
48
- "logps/rejected": -289.9251708984375,
49
  "loss": 0.6931,
50
- "rewards/accuracies": 0.53125,
51
- "rewards/chosen": 0.0007057279581204057,
52
- "rewards/margins": 0.00038364241481758654,
53
- "rewards/rejected": 0.0003220855724066496,
54
  "step": 20
55
  },
56
  {
57
  "epoch": 0.06279434850863422,
58
- "grad_norm": 7.845409123408187,
59
- "learning_rate": 1.5624999999999999e-07,
60
- "logits/chosen": -2.7853832244873047,
61
- "logits/rejected": -2.7237634658813477,
62
- "logps/chosen": -300.7030029296875,
63
- "logps/rejected": -254.2164306640625,
64
- "loss": 0.6924,
65
- "rewards/accuracies": 0.6187499761581421,
66
- "rewards/chosen": 0.004664666019380093,
67
- "rewards/margins": 0.002464447868987918,
68
- "rewards/rejected": 0.002200217917561531,
69
  "step": 30
70
  },
71
  {
72
  "epoch": 0.0837257980115123,
73
- "grad_norm": 7.717127331852517,
74
- "learning_rate": 2.0833333333333333e-07,
75
- "logits/chosen": -2.707097291946411,
76
- "logits/rejected": -2.6731085777282715,
77
- "logps/chosen": -278.5015869140625,
78
- "logps/rejected": -266.61273193359375,
79
- "loss": 0.6904,
80
- "rewards/accuracies": 0.7124999761581421,
81
- "rewards/chosen": 0.012720887549221516,
82
- "rewards/margins": 0.005492700729519129,
83
- "rewards/rejected": 0.007228186819702387,
84
  "step": 40
85
  },
86
  {
87
  "epoch": 0.10465724751439037,
88
- "grad_norm": 15.224224641480427,
89
- "learning_rate": 2.604166666666667e-07,
90
- "logits/chosen": -2.734222888946533,
91
- "logits/rejected": -2.6672184467315674,
92
- "logps/chosen": -284.1479187011719,
93
- "logps/rejected": -280.54168701171875,
94
- "loss": 0.6873,
95
- "rewards/accuracies": 0.699999988079071,
96
- "rewards/chosen": 0.023878615349531174,
97
- "rewards/margins": 0.010178199037909508,
98
- "rewards/rejected": 0.013700416311621666,
99
  "step": 50
100
  },
101
  {
102
  "epoch": 0.12558869701726844,
103
- "grad_norm": 9.724530192766299,
104
- "learning_rate": 3.1249999999999997e-07,
105
- "logits/chosen": -2.7641100883483887,
106
- "logits/rejected": -2.710592746734619,
107
- "logps/chosen": -257.2303161621094,
108
- "logps/rejected": -249.379638671875,
109
- "loss": 0.6824,
110
- "rewards/accuracies": 0.737500011920929,
111
- "rewards/chosen": 0.041937388479709625,
112
- "rewards/margins": 0.02456718310713768,
113
- "rewards/rejected": 0.017370199784636497,
114
  "step": 60
115
  },
116
  {
117
  "epoch": 0.14652014652014653,
118
- "grad_norm": 7.609142075748871,
119
- "learning_rate": 3.645833333333333e-07,
120
- "logits/chosen": -2.7005503177642822,
121
- "logits/rejected": -2.670154094696045,
122
- "logps/chosen": -302.48907470703125,
123
- "logps/rejected": -288.536865234375,
124
- "loss": 0.6728,
125
- "rewards/accuracies": 0.6812499761581421,
126
- "rewards/chosen": 0.037350092083215714,
127
- "rewards/margins": 0.03656148537993431,
128
- "rewards/rejected": 0.0007886036182753742,
129
  "step": 70
130
  },
131
  {
132
  "epoch": 0.1674515960230246,
133
- "grad_norm": 11.507263661586103,
134
- "learning_rate": 4.1666666666666667e-07,
135
- "logits/chosen": -2.795855760574341,
136
- "logits/rejected": -2.709174156188965,
137
- "logps/chosen": -344.7085876464844,
138
- "logps/rejected": -274.75372314453125,
139
- "loss": 0.6558,
140
- "rewards/accuracies": 0.7437499761581421,
141
- "rewards/chosen": 0.013917540200054646,
142
- "rewards/margins": 0.09853295236825943,
143
- "rewards/rejected": -0.08461540192365646,
144
  "step": 80
145
  },
146
  {
147
  "epoch": 0.18838304552590268,
148
- "grad_norm": 12.522918295649193,
149
- "learning_rate": 4.6874999999999996e-07,
150
- "logits/chosen": -2.7532315254211426,
151
- "logits/rejected": -2.712498188018799,
152
- "logps/chosen": -264.0057678222656,
153
- "logps/rejected": -278.46209716796875,
154
- "loss": 0.6358,
155
- "rewards/accuracies": 0.65625,
156
- "rewards/chosen": -0.041521135717630386,
157
- "rewards/margins": 0.12843890488147736,
158
- "rewards/rejected": -0.16996005177497864,
159
  "step": 90
160
  },
161
  {
162
  "epoch": 0.20931449502878074,
163
- "grad_norm": 16.714062222107753,
164
- "learning_rate": 4.999731868769026e-07,
165
- "logits/chosen": -2.7503976821899414,
166
- "logits/rejected": -2.7277534008026123,
167
- "logps/chosen": -317.9760437011719,
168
- "logps/rejected": -314.3436584472656,
169
- "loss": 0.6267,
170
- "rewards/accuracies": 0.762499988079071,
171
- "rewards/chosen": -0.06901533901691437,
172
- "rewards/margins": 0.24417515099048615,
173
- "rewards/rejected": -0.3131905198097229,
174
  "step": 100
175
  },
176
  {
177
  "epoch": 0.2302459445316588,
178
- "grad_norm": 11.040271216087397,
179
- "learning_rate": 4.996716052911017e-07,
180
- "logits/chosen": -2.790177583694458,
181
- "logits/rejected": -2.747727394104004,
182
- "logps/chosen": -327.3770751953125,
183
- "logps/rejected": -331.28594970703125,
184
- "loss": 0.6264,
185
- "rewards/accuracies": 0.675000011920929,
186
- "rewards/chosen": -0.28078389167785645,
187
- "rewards/margins": 0.21150124073028564,
188
- "rewards/rejected": -0.4922851622104645,
189
  "step": 110
190
  },
191
  {
192
  "epoch": 0.25117739403453687,
193
- "grad_norm": 12.319376619341371,
194
- "learning_rate": 4.990353313429303e-07,
195
- "logits/chosen": -2.7004141807556152,
196
- "logits/rejected": -2.7081363201141357,
197
- "logps/chosen": -292.29193115234375,
198
- "logps/rejected": -308.19244384765625,
199
- "loss": 0.6059,
200
- "rewards/accuracies": 0.731249988079071,
201
- "rewards/chosen": -0.08327662199735641,
202
- "rewards/margins": 0.2638171315193176,
203
- "rewards/rejected": -0.34709376096725464,
204
  "step": 120
205
  },
206
  {
207
  "epoch": 0.272108843537415,
208
- "grad_norm": 24.07326364555429,
209
- "learning_rate": 4.980652179769217e-07,
210
- "logits/chosen": -2.7517778873443604,
211
- "logits/rejected": -2.735368013381958,
212
- "logps/chosen": -326.88323974609375,
213
- "logps/rejected": -333.8082275390625,
214
- "loss": 0.5854,
215
- "rewards/accuracies": 0.6625000238418579,
216
- "rewards/chosen": -0.27743759751319885,
217
- "rewards/margins": 0.24996426701545715,
218
- "rewards/rejected": -0.5274018049240112,
219
  "step": 130
220
  },
221
  {
222
  "epoch": 0.29304029304029305,
223
- "grad_norm": 23.35517515247709,
224
- "learning_rate": 4.967625656594781e-07,
225
- "logits/chosen": -2.771867275238037,
226
- "logits/rejected": -2.7476229667663574,
227
- "logps/chosen": -339.9781799316406,
228
- "logps/rejected": -328.70538330078125,
229
- "loss": 0.5878,
230
- "rewards/accuracies": 0.737500011920929,
231
- "rewards/chosen": -0.13580283522605896,
232
- "rewards/margins": 0.391187846660614,
233
- "rewards/rejected": -0.5269905924797058,
234
  "step": 140
235
  },
236
  {
237
  "epoch": 0.3139717425431711,
238
- "grad_norm": 20.972969810387227,
239
- "learning_rate": 4.951291206355559e-07,
240
- "logits/chosen": -2.748628616333008,
241
- "logits/rejected": -2.7197518348693848,
242
- "logps/chosen": -327.03125,
243
- "logps/rejected": -318.48638916015625,
244
- "loss": 0.5967,
245
- "rewards/accuracies": 0.6625000238418579,
246
- "rewards/chosen": -0.3390689790248871,
247
- "rewards/margins": 0.2791776657104492,
248
- "rewards/rejected": -0.6182466745376587,
249
  "step": 150
250
  },
251
  {
252
  "epoch": 0.3349031920460492,
253
- "grad_norm": 13.425598261254686,
254
- "learning_rate": 4.93167072587771e-07,
255
- "logits/chosen": -2.788029193878174,
256
- "logits/rejected": -2.746370553970337,
257
- "logps/chosen": -324.72052001953125,
258
- "logps/rejected": -337.3094787597656,
259
- "loss": 0.5846,
260
- "rewards/accuracies": 0.731249988079071,
261
- "rewards/chosen": -0.3970801830291748,
262
- "rewards/margins": 0.40131622552871704,
263
- "rewards/rejected": -0.7983964681625366,
264
  "step": 160
265
  },
266
  {
267
  "epoch": 0.35583464154892724,
268
- "grad_norm": 17.272372981195964,
269
- "learning_rate": 4.908790517010636e-07,
270
- "logits/chosen": -2.6719164848327637,
271
- "logits/rejected": -2.6612837314605713,
272
- "logps/chosen": -315.0084228515625,
273
- "logps/rejected": -320.8651428222656,
274
- "loss": 0.5888,
275
- "rewards/accuracies": 0.6937500238418579,
276
- "rewards/chosen": -0.4511398375034332,
277
- "rewards/margins": 0.3194407820701599,
278
- "rewards/rejected": -0.7705805897712708,
279
  "step": 170
280
  },
281
  {
282
  "epoch": 0.37676609105180536,
283
- "grad_norm": 14.426317833201416,
284
- "learning_rate": 4.882681251368548e-07,
285
- "logits/chosen": -2.7663211822509766,
286
- "logits/rejected": -2.7407591342926025,
287
- "logps/chosen": -322.9417419433594,
288
- "logps/rejected": -330.96038818359375,
289
- "loss": 0.5596,
290
- "rewards/accuracies": 0.731249988079071,
291
- "rewards/chosen": -0.4329908490180969,
292
- "rewards/margins": 0.358435720205307,
293
- "rewards/rejected": -0.7914265394210815,
294
  "step": 180
295
  },
296
  {
297
  "epoch": 0.3976975405546834,
298
- "grad_norm": 16.424447211666443,
299
- "learning_rate": 4.853377929214243e-07,
300
- "logits/chosen": -2.7122840881347656,
301
- "logits/rejected": -2.6895980834960938,
302
- "logps/chosen": -305.7542419433594,
303
- "logps/rejected": -333.2830810546875,
304
- "loss": 0.5689,
305
- "rewards/accuracies": 0.6812499761581421,
306
- "rewards/chosen": -0.4040308892726898,
307
- "rewards/margins": 0.32926663756370544,
308
- "rewards/rejected": -0.7332974672317505,
309
  "step": 190
310
  },
311
  {
312
  "epoch": 0.4186289900575615,
313
- "grad_norm": 18.075431248646595,
314
- "learning_rate": 4.820919832540181e-07,
315
- "logits/chosen": -2.670199155807495,
316
- "logits/rejected": -2.6688408851623535,
317
- "logps/chosen": -299.871826171875,
318
- "logps/rejected": -358.0787048339844,
319
- "loss": 0.5766,
320
- "rewards/accuracies": 0.7437499761581421,
321
- "rewards/chosen": -0.37743473052978516,
322
- "rewards/margins": 0.4126531183719635,
323
- "rewards/rejected": -0.7900878190994263,
324
  "step": 200
325
  },
326
  {
327
  "epoch": 0.43956043956043955,
328
- "grad_norm": 16.199236277149858,
329
- "learning_rate": 4.785350472409791e-07,
330
- "logits/chosen": -2.718061685562134,
331
- "logits/rejected": -2.6642110347747803,
332
- "logps/chosen": -362.8133544921875,
333
- "logps/rejected": -342.171875,
334
- "loss": 0.5765,
335
- "rewards/accuracies": 0.699999988079071,
336
- "rewards/chosen": -0.42062854766845703,
337
- "rewards/margins": 0.43933743238449097,
338
- "rewards/rejected": -0.859965980052948,
339
  "step": 210
340
  },
341
  {
342
  "epoch": 0.4604918890633176,
343
- "grad_norm": 17.93992307211224,
344
- "learning_rate": 4.7467175306295647e-07,
345
- "logits/chosen": -2.6234521865844727,
346
- "logits/rejected": -2.640157699584961,
347
- "logps/chosen": -310.472412109375,
348
- "logps/rejected": -345.089111328125,
349
- "loss": 0.5611,
350
- "rewards/accuracies": 0.7250000238418579,
351
- "rewards/chosen": -0.47760000824928284,
352
- "rewards/margins": 0.43446213006973267,
353
- "rewards/rejected": -0.9120620489120483,
354
  "step": 220
355
  },
356
  {
357
  "epoch": 0.48142333856619574,
358
- "grad_norm": 20.29472356294206,
359
- "learning_rate": 4.70507279583015e-07,
360
- "logits/chosen": -2.7133331298828125,
361
- "logits/rejected": -2.6721091270446777,
362
- "logps/chosen": -318.923095703125,
363
- "logps/rejected": -366.59710693359375,
364
- "loss": 0.5624,
365
- "rewards/accuracies": 0.768750011920929,
366
- "rewards/chosen": -0.4767213761806488,
367
- "rewards/margins": 0.6975632905960083,
368
- "rewards/rejected": -1.1742846965789795,
369
  "step": 230
370
  },
371
  {
372
  "epoch": 0.5023547880690737,
373
- "grad_norm": 35.1028217518017,
374
- "learning_rate": 4.6604720940421207e-07,
375
- "logits/chosen": -2.640972137451172,
376
- "logits/rejected": -2.5848052501678467,
377
- "logps/chosen": -344.928955078125,
378
- "logps/rejected": -369.1497497558594,
379
- "loss": 0.5347,
380
  "rewards/accuracies": 0.699999988079071,
381
- "rewards/chosen": -0.47312647104263306,
382
- "rewards/margins": 0.495448499917984,
383
- "rewards/rejected": -0.9685748815536499,
384
  "step": 240
385
  },
386
  {
387
  "epoch": 0.5232862375719518,
388
- "grad_norm": 19.843272670557887,
389
- "learning_rate": 4.612975213859487e-07,
390
- "logits/chosen": -2.5373644828796387,
391
- "logits/rejected": -2.4579813480377197,
392
- "logps/chosen": -347.16748046875,
393
- "logps/rejected": -379.2513732910156,
394
- "loss": 0.5437,
395
- "rewards/accuracies": 0.800000011920929,
396
- "rewards/chosen": -0.566421389579773,
397
- "rewards/margins": 0.6545469164848328,
398
- "rewards/rejected": -1.220968246459961,
399
  "step": 250
400
  },
401
  {
402
  "epoch": 0.54421768707483,
403
- "grad_norm": 23.320833527504977,
404
- "learning_rate": 4.5626458262912735e-07,
405
- "logits/chosen": -2.3933730125427246,
406
- "logits/rejected": -2.3900551795959473,
407
- "logps/chosen": -352.5663146972656,
408
- "logps/rejected": -384.38397216796875,
409
- "loss": 0.5419,
410
- "rewards/accuracies": 0.699999988079071,
411
- "rewards/chosen": -0.8031826019287109,
412
- "rewards/margins": 0.5431042909622192,
413
- "rewards/rejected": -1.3462870121002197,
414
  "step": 260
415
  },
416
  {
417
  "epoch": 0.565149136577708,
418
- "grad_norm": 19.483357541000405,
419
- "learning_rate": 4.5095513994085974e-07,
420
- "logits/chosen": -2.200190305709839,
421
- "logits/rejected": -2.142879009246826,
422
- "logps/chosen": -368.5080871582031,
423
- "logps/rejected": -412.9193420410156,
424
- "loss": 0.5489,
425
- "rewards/accuracies": 0.7437499761581421,
426
- "rewards/chosen": -0.6709359884262085,
427
- "rewards/margins": 0.7138775587081909,
428
- "rewards/rejected": -1.3848135471343994,
429
  "step": 270
430
  },
431
  {
432
  "epoch": 0.5860805860805861,
433
- "grad_norm": 23.630987101691453,
434
- "learning_rate": 4.453763107901675e-07,
435
- "logits/chosen": -2.013388156890869,
436
- "logits/rejected": -1.9315751791000366,
437
- "logps/chosen": -335.5148620605469,
438
- "logps/rejected": -353.8980712890625,
439
- "loss": 0.5513,
440
  "rewards/accuracies": 0.6625000238418579,
441
- "rewards/chosen": -0.537543773651123,
442
- "rewards/margins": 0.6034060716629028,
443
- "rewards/rejected": -1.1409497261047363,
444
  "step": 280
445
  },
446
  {
447
  "epoch": 0.6070120355834642,
448
- "grad_norm": 21.50926745793373,
449
- "learning_rate": 4.395355737667985e-07,
450
- "logits/chosen": -2.042461395263672,
451
- "logits/rejected": -1.9405343532562256,
452
- "logps/chosen": -329.45513916015625,
453
- "logps/rejected": -386.4042663574219,
454
- "loss": 0.546,
455
- "rewards/accuracies": 0.731249988079071,
456
- "rewards/chosen": -0.737878680229187,
457
- "rewards/margins": 0.6161252856254578,
458
- "rewards/rejected": -1.3540040254592896,
459
  "step": 290
460
  },
461
  {
462
  "epoch": 0.6279434850863422,
463
- "grad_norm": 21.13385612640552,
464
- "learning_rate": 4.3344075855595097e-07,
465
- "logits/chosen": -2.0211853981018066,
466
- "logits/rejected": -1.9133087396621704,
467
- "logps/chosen": -419.280029296875,
468
- "logps/rejected": -442.6297912597656,
469
- "loss": 0.5279,
470
- "rewards/accuracies": 0.71875,
471
- "rewards/chosen": -0.8050423860549927,
472
- "rewards/margins": 0.6596104502677917,
473
- "rewards/rejected": -1.4646527767181396,
474
  "step": 300
475
  },
476
  {
477
  "epoch": 0.6488749345892203,
478
- "grad_norm": 19.779873187639623,
479
- "learning_rate": 4.271000354423425e-07,
480
- "logits/chosen": -2.02839994430542,
481
- "logits/rejected": -1.9323558807373047,
482
- "logps/chosen": -383.2649841308594,
483
- "logps/rejected": -405.40740966796875,
484
- "loss": 0.5219,
485
- "rewards/accuracies": 0.7562500238418579,
486
- "rewards/chosen": -0.7099379897117615,
487
- "rewards/margins": 0.7654498815536499,
488
- "rewards/rejected": -1.4753879308700562,
489
  "step": 310
490
  },
491
  {
492
  "epoch": 0.6698063840920984,
493
- "grad_norm": 21.08095676735262,
494
- "learning_rate": 4.2052190435769554e-07,
495
- "logits/chosen": -1.9793522357940674,
496
- "logits/rejected": -1.860769271850586,
497
- "logps/chosen": -344.1609191894531,
498
- "logps/rejected": -391.5399475097656,
499
- "loss": 0.5066,
500
- "rewards/accuracies": 0.831250011920929,
501
- "rewards/chosen": -0.7778881192207336,
502
- "rewards/margins": 0.8001760244369507,
503
- "rewards/rejected": -1.5780640840530396,
504
  "step": 320
505
  },
506
  {
507
  "epoch": 0.6907378335949764,
508
- "grad_norm": 27.57188250299686,
509
- "learning_rate": 4.137151834863213e-07,
510
- "logits/chosen": -2.071329116821289,
511
- "logits/rejected": -2.0509676933288574,
512
- "logps/chosen": -345.35186767578125,
513
- "logps/rejected": -415.62261962890625,
514
- "loss": 0.5115,
515
- "rewards/accuracies": 0.762499988079071,
516
- "rewards/chosen": -0.6397430300712585,
517
- "rewards/margins": 0.7728389501571655,
518
- "rewards/rejected": -1.4125821590423584,
519
  "step": 330
520
  },
521
  {
522
  "epoch": 0.7116692830978545,
523
- "grad_norm": 30.579996976984805,
524
- "learning_rate": 4.0668899744407567e-07,
525
- "logits/chosen": -1.8872754573822021,
526
- "logits/rejected": -1.8190815448760986,
527
- "logps/chosen": -379.37225341796875,
528
- "logps/rejected": -436.101318359375,
529
- "loss": 0.52,
530
- "rewards/accuracies": 0.706250011920929,
531
- "rewards/chosen": -0.7692102789878845,
532
- "rewards/margins": 0.7424911260604858,
533
- "rewards/rejected": -1.5117013454437256,
534
  "step": 340
535
  },
536
  {
537
  "epoch": 0.7326007326007326,
538
- "grad_norm": 23.99831270999009,
539
- "learning_rate": 3.994527650465352e-07,
540
- "logits/chosen": -1.8584909439086914,
541
- "logits/rejected": -1.7157665491104126,
542
- "logps/chosen": -362.5115661621094,
543
- "logps/rejected": -413.8670349121094,
544
- "loss": 0.5019,
545
- "rewards/accuracies": 0.78125,
546
- "rewards/chosen": -0.8885606527328491,
547
- "rewards/margins": 0.7002802491188049,
548
- "rewards/rejected": -1.5888408422470093,
549
  "step": 350
550
  },
551
  {
552
  "epoch": 0.7535321821036107,
553
- "grad_norm": 20.569792166483495,
554
- "learning_rate": 3.920161866827889e-07,
555
- "logits/chosen": -1.8123042583465576,
556
- "logits/rejected": -1.6628021001815796,
557
- "logps/chosen": -398.8226623535156,
558
- "logps/rejected": -431.47088623046875,
559
- "loss": 0.542,
560
- "rewards/accuracies": 0.7437499761581421,
561
- "rewards/chosen": -0.8019789457321167,
562
- "rewards/margins": 0.7802454233169556,
563
- "rewards/rejected": -1.5822242498397827,
564
  "step": 360
565
  },
566
  {
567
  "epoch": 0.7744636316064888,
568
- "grad_norm": 21.1660600636486,
569
- "learning_rate": 3.8438923131177237e-07,
570
- "logits/chosen": -1.702923059463501,
571
- "logits/rejected": -1.573188066482544,
572
- "logps/chosen": -352.84765625,
573
- "logps/rejected": -393.1349182128906,
574
- "loss": 0.5227,
575
- "rewards/accuracies": 0.737500011920929,
576
- "rewards/chosen": -0.7422314882278442,
577
- "rewards/margins": 0.7614965438842773,
578
- "rewards/rejected": -1.5037281513214111,
579
  "step": 370
580
  },
581
  {
582
  "epoch": 0.7953950811093669,
583
- "grad_norm": 17.183773141452058,
584
- "learning_rate": 3.765821230985757e-07,
585
- "logits/chosen": -1.6204278469085693,
586
- "logits/rejected": -1.5805397033691406,
587
- "logps/chosen": -352.36553955078125,
588
- "logps/rejected": -456.62127685546875,
589
- "loss": 0.5167,
590
- "rewards/accuracies": 0.731249988079071,
591
- "rewards/chosen": -0.8929269909858704,
592
- "rewards/margins": 0.8663182258605957,
593
- "rewards/rejected": -1.7592451572418213,
594
  "step": 380
595
  },
596
  {
597
  "epoch": 0.8163265306122449,
598
- "grad_norm": 23.349904894629407,
599
- "learning_rate": 3.6860532770864005e-07,
600
- "logits/chosen": -1.9042911529541016,
601
- "logits/rejected": -1.7993557453155518,
602
- "logps/chosen": -437.3020935058594,
603
- "logps/rejected": -478.8271484375,
604
- "loss": 0.5087,
605
  "rewards/accuracies": 0.731249988079071,
606
- "rewards/chosen": -0.8284949064254761,
607
- "rewards/margins": 0.7516407370567322,
608
- "rewards/rejected": -1.580135703086853,
609
  "step": 390
610
  },
611
  {
612
  "epoch": 0.837257980115123,
613
- "grad_norm": 23.752316920168525,
614
- "learning_rate": 3.604695382782159e-07,
615
- "logits/chosen": -1.7125256061553955,
616
- "logits/rejected": -1.591506838798523,
617
- "logps/chosen": -373.38818359375,
618
- "logps/rejected": -433.75799560546875,
619
- "loss": 0.5247,
620
- "rewards/accuracies": 0.768750011920929,
621
- "rewards/chosen": -1.0498483180999756,
622
- "rewards/margins": 0.8007469177246094,
623
- "rewards/rejected": -1.850595235824585,
624
  "step": 400
625
  },
626
  {
627
  "epoch": 0.858189429618001,
628
- "grad_norm": 19.666172854109288,
629
- "learning_rate": 3.5218566107988867e-07,
630
- "logits/chosen": -2.0926361083984375,
631
- "logits/rejected": -2.002737283706665,
632
- "logps/chosen": -392.9161682128906,
633
- "logps/rejected": -434.8521423339844,
634
- "loss": 0.5204,
635
- "rewards/accuracies": 0.7250000238418579,
636
- "rewards/chosen": -0.8088946342468262,
637
- "rewards/margins": 0.8107136487960815,
638
- "rewards/rejected": -1.6196085214614868,
639
  "step": 410
640
  },
641
  {
642
  "epoch": 0.8791208791208791,
643
- "grad_norm": 21.227558543972002,
644
- "learning_rate": 3.4376480090239047e-07,
645
- "logits/chosen": -2.0091476440429688,
646
- "logits/rejected": -1.9699198007583618,
647
- "logps/chosen": -343.46112060546875,
648
- "logps/rejected": -425.591796875,
649
- "loss": 0.5084,
650
- "rewards/accuracies": 0.762499988079071,
651
- "rewards/chosen": -0.7525590658187866,
652
- "rewards/margins": 0.8109620213508606,
653
- "rewards/rejected": -1.563521146774292,
654
  "step": 420
655
  },
656
  {
657
  "epoch": 0.9000523286237572,
658
- "grad_norm": 28.76341659471802,
659
- "learning_rate": 3.3521824616429284e-07,
660
- "logits/chosen": -1.5924561023712158,
661
- "logits/rejected": -1.4453504085540771,
662
- "logps/chosen": -398.48284912109375,
663
- "logps/rejected": -460.5140686035156,
664
- "loss": 0.4912,
665
- "rewards/accuracies": 0.7562500238418579,
666
- "rewards/chosen": -0.9409183263778687,
667
- "rewards/margins": 0.8444429636001587,
668
- "rewards/rejected": -1.7853610515594482,
669
  "step": 430
670
  },
671
  {
672
  "epoch": 0.9209837781266352,
673
- "grad_norm": 24.356244505358386,
674
- "learning_rate": 3.265574537815398e-07,
675
- "logits/chosen": -1.3780838251113892,
676
- "logits/rejected": -1.1997829675674438,
677
- "logps/chosen": -373.2276916503906,
678
- "logps/rejected": -418.62335205078125,
679
- "loss": 0.5069,
680
- "rewards/accuracies": 0.737500011920929,
681
- "rewards/chosen": -1.034752607345581,
682
- "rewards/margins": 0.7990537881851196,
683
- "rewards/rejected": -1.8338062763214111,
684
  "step": 440
685
  },
686
  {
687
  "epoch": 0.9419152276295133,
688
- "grad_norm": 21.508083750235922,
689
- "learning_rate": 3.1779403380910425e-07,
690
- "logits/chosen": -1.2375565767288208,
691
- "logits/rejected": -1.1287505626678467,
692
- "logps/chosen": -379.8437194824219,
693
- "logps/rejected": -447.7130432128906,
694
- "loss": 0.5131,
695
- "rewards/accuracies": 0.737500011920929,
696
- "rewards/chosen": -0.929580807685852,
697
- "rewards/margins": 0.8013578653335571,
698
- "rewards/rejected": -1.7309386730194092,
699
  "step": 450
700
  },
701
  {
702
  "epoch": 0.9628466771323915,
703
- "grad_norm": 29.86693166076459,
704
- "learning_rate": 3.0893973387735683e-07,
705
- "logits/chosen": -1.4521411657333374,
706
- "logits/rejected": -1.2153605222702026,
707
- "logps/chosen": -377.4960021972656,
708
- "logps/rejected": -401.8866882324219,
709
- "loss": 0.5157,
710
- "rewards/accuracies": 0.75,
711
- "rewards/chosen": -0.9293259382247925,
712
- "rewards/margins": 0.7097651362419128,
713
- "rewards/rejected": -1.63909113407135,
714
  "step": 460
715
  },
716
  {
717
  "epoch": 0.9837781266352695,
718
- "grad_norm": 18.514726211898278,
719
- "learning_rate": 3.000064234440111e-07,
720
- "logits/chosen": -1.4076189994812012,
721
- "logits/rejected": -1.2628891468048096,
722
- "logps/chosen": -399.5198059082031,
723
- "logps/rejected": -439.87969970703125,
724
- "loss": 0.501,
725
- "rewards/accuracies": 0.706250011920929,
726
- "rewards/chosen": -0.7778880000114441,
727
- "rewards/margins": 0.7691278457641602,
728
- "rewards/rejected": -1.5470157861709595,
729
  "step": 470
730
  },
731
  {
732
  "epoch": 0.9984301412872841,
733
- "eval_logits/chosen": -1.273218035697937,
734
- "eval_logits/rejected": -1.1614912748336792,
735
- "eval_logps/chosen": -346.5500793457031,
736
- "eval_logps/rejected": -435.70635986328125,
737
- "eval_loss": 0.5185136795043945,
738
- "eval_rewards/accuracies": 0.77734375,
739
- "eval_rewards/chosen": -0.7440664172172546,
740
- "eval_rewards/margins": 0.9043572545051575,
741
- "eval_rewards/rejected": -1.6484237909317017,
742
- "eval_runtime": 171.5848,
743
- "eval_samples_per_second": 11.656,
744
- "eval_steps_per_second": 0.186,
745
  "step": 477
746
  },
747
  {
748
- "epoch": 1.0047095761381475,
749
- "grad_norm": 21.17560575026596,
750
- "learning_rate": 2.910060778827554e-07,
751
- "logits/chosen": -1.3738365173339844,
752
- "logits/rejected": -1.2765980958938599,
753
- "logps/chosen": -379.2554931640625,
754
- "logps/rejected": -449.7948303222656,
755
- "loss": 0.4798,
756
- "rewards/accuracies": 0.7875000238418579,
757
- "rewards/chosen": -0.723404586315155,
758
- "rewards/margins": 0.8685447573661804,
759
- "rewards/rejected": -1.591949224472046,
760
- "step": 480
761
- },
762
- {
763
- "epoch": 1.0256410256410255,
764
- "grad_norm": 16.51017448411662,
765
- "learning_rate": 2.8195076242990116e-07,
766
- "logits/chosen": -1.2624908685684204,
767
- "logits/rejected": -1.0200715065002441,
768
- "logps/chosen": -391.1546325683594,
769
- "logps/rejected": -462.6971130371094,
770
- "loss": 0.3721,
771
- "rewards/accuracies": 0.8374999761581421,
772
- "rewards/chosen": -0.8392894864082336,
773
- "rewards/margins": 1.1826223134994507,
774
- "rewards/rejected": -2.021911859512329,
775
- "step": 490
776
- },
777
- {
778
- "epoch": 1.0465724751439036,
779
- "grad_norm": 28.692357308634083,
780
- "learning_rate": 2.7285261601056697e-07,
781
- "logits/chosen": -1.04556405544281,
782
- "logits/rejected": -0.8983597755432129,
783
- "logps/chosen": -385.15594482421875,
784
- "logps/rejected": -492.3553771972656,
785
- "loss": 0.3623,
786
- "rewards/accuracies": 0.862500011920929,
787
- "rewards/chosen": -1.0549384355545044,
788
- "rewards/margins": 1.216042399406433,
789
- "rewards/rejected": -2.2709805965423584,
790
- "step": 500
791
- },
792
- {
793
- "epoch": 1.0675039246467817,
794
- "grad_norm": 21.922390152982405,
795
- "learning_rate": 2.6372383496608186e-07,
796
- "logits/chosen": -0.9019845128059387,
797
- "logits/rejected": -0.6276781558990479,
798
- "logps/chosen": -385.09625244140625,
799
- "logps/rejected": -487.7792053222656,
800
- "loss": 0.36,
801
- "rewards/accuracies": 0.893750011920929,
802
- "rewards/chosen": -0.9738361239433289,
803
- "rewards/margins": 1.5190062522888184,
804
- "rewards/rejected": -2.492842197418213,
805
- "step": 510
806
- },
807
- {
808
- "epoch": 1.08843537414966,
809
- "grad_norm": 23.056390516413284,
810
- "learning_rate": 2.5457665670441937e-07,
811
- "logits/chosen": -0.8516889810562134,
812
- "logits/rejected": -0.5428006052970886,
813
- "logps/chosen": -382.0596923828125,
814
- "logps/rejected": -487.93701171875,
815
- "loss": 0.3563,
816
- "rewards/accuracies": 0.8687499761581421,
817
- "rewards/chosen": -0.9770641326904297,
818
- "rewards/margins": 1.3976478576660156,
819
- "rewards/rejected": -2.3747119903564453,
820
- "step": 520
821
- },
822
- {
823
- "epoch": 1.109366823652538,
824
- "grad_norm": 24.839002506991555,
825
- "learning_rate": 2.454233432955807e-07,
826
- "logits/chosen": -0.893288254737854,
827
- "logits/rejected": -0.55360347032547,
828
- "logps/chosen": -373.3586120605469,
829
- "logps/rejected": -470.93048095703125,
830
- "loss": 0.3595,
831
- "rewards/accuracies": 0.856249988079071,
832
- "rewards/chosen": -1.079138159751892,
833
- "rewards/margins": 1.3878281116485596,
834
- "rewards/rejected": -2.466966152191162,
835
- "step": 530
836
- },
837
- {
838
- "epoch": 1.130298273155416,
839
- "grad_norm": 24.9976173018764,
840
- "learning_rate": 2.3627616503391812e-07,
841
- "logits/chosen": -0.6603145003318787,
842
- "logits/rejected": -0.45139390230178833,
843
- "logps/chosen": -381.8931884765625,
844
- "logps/rejected": -511.0888671875,
845
- "loss": 0.3615,
846
- "rewards/accuracies": 0.862500011920929,
847
- "rewards/chosen": -1.2078765630722046,
848
- "rewards/margins": 1.415185809135437,
849
- "rewards/rejected": -2.6230626106262207,
850
- "step": 540
851
- },
852
- {
853
- "epoch": 1.1512297226582942,
854
- "grad_norm": 23.70399953245455,
855
- "learning_rate": 2.2714738398943308e-07,
856
- "logits/chosen": -0.7952468395233154,
857
- "logits/rejected": -0.5057519674301147,
858
- "logps/chosen": -405.1830139160156,
859
- "logps/rejected": -512.9744873046875,
860
- "loss": 0.3432,
861
- "rewards/accuracies": 0.8374999761581421,
862
- "rewards/chosen": -1.1684848070144653,
863
- "rewards/margins": 1.4340193271636963,
864
- "rewards/rejected": -2.6025044918060303,
865
- "step": 550
866
- },
867
- {
868
- "epoch": 1.1721611721611722,
869
- "grad_norm": 26.9881287231271,
870
- "learning_rate": 2.1804923757009882e-07,
871
- "logits/chosen": -0.7067749500274658,
872
- "logits/rejected": -0.4858238697052002,
873
- "logps/chosen": -358.0526428222656,
874
- "logps/rejected": -488.3414001464844,
875
- "loss": 0.3631,
876
- "rewards/accuracies": 0.925000011920929,
877
- "rewards/chosen": -0.9131903648376465,
878
- "rewards/margins": 1.4550573825836182,
879
- "rewards/rejected": -2.3682477474212646,
880
- "step": 560
881
- },
882
- {
883
- "epoch": 1.1930926216640503,
884
- "grad_norm": 26.437360887515176,
885
- "learning_rate": 2.089939221172446e-07,
886
- "logits/chosen": -0.6216100454330444,
887
- "logits/rejected": -0.4619014263153076,
888
- "logps/chosen": -427.541015625,
889
- "logps/rejected": -578.9843139648438,
890
- "loss": 0.3526,
891
- "rewards/accuracies": 0.8999999761581421,
892
- "rewards/chosen": -1.259556770324707,
893
- "rewards/margins": 1.5523223876953125,
894
- "rewards/rejected": -2.8118791580200195,
895
- "step": 570
896
- },
897
- {
898
- "epoch": 1.2140240711669283,
899
- "grad_norm": 24.08017206287918,
900
- "learning_rate": 1.9999357655598891e-07,
901
- "logits/chosen": -0.596352219581604,
902
- "logits/rejected": -0.3721233308315277,
903
- "logps/chosen": -412.52301025390625,
904
- "logps/rejected": -562.3777465820312,
905
- "loss": 0.3344,
906
- "rewards/accuracies": 0.887499988079071,
907
- "rewards/chosen": -1.310210943222046,
908
- "rewards/margins": 1.5689070224761963,
909
- "rewards/rejected": -2.879117965698242,
910
- "step": 580
911
- },
912
- {
913
- "epoch": 1.2349555206698064,
914
- "grad_norm": 19.74419678906988,
915
- "learning_rate": 1.9106026612264315e-07,
916
- "logits/chosen": -0.8253545761108398,
917
- "logits/rejected": -0.6318912506103516,
918
- "logps/chosen": -399.89910888671875,
919
- "logps/rejected": -515.3082275390625,
920
- "loss": 0.3424,
921
- "rewards/accuracies": 0.893750011920929,
922
- "rewards/chosen": -1.0740442276000977,
923
- "rewards/margins": 1.4926326274871826,
924
- "rewards/rejected": -2.5666770935058594,
925
- "step": 590
926
- },
927
- {
928
- "epoch": 1.2558869701726845,
929
- "grad_norm": 30.297500591228765,
930
- "learning_rate": 1.8220596619089573e-07,
931
- "logits/chosen": -0.7947873473167419,
932
- "logits/rejected": -0.5936676859855652,
933
- "logps/chosen": -452.333740234375,
934
- "logps/rejected": -589.5086059570312,
935
- "loss": 0.3349,
936
- "rewards/accuracies": 0.8374999761581421,
937
- "rewards/chosen": -1.3032448291778564,
938
- "rewards/margins": 1.449398398399353,
939
- "rewards/rejected": -2.752643346786499,
940
- "step": 600
941
- },
942
- {
943
- "epoch": 1.2768184196755625,
944
- "grad_norm": 25.095514098513064,
945
- "learning_rate": 1.7344254621846017e-07,
946
- "logits/chosen": -0.5728591680526733,
947
- "logits/rejected": -0.2157384604215622,
948
- "logps/chosen": -446.3045959472656,
949
- "logps/rejected": -558.5189819335938,
950
- "loss": 0.3583,
951
- "rewards/accuracies": 0.856249988079071,
952
- "rewards/chosen": -1.3666396141052246,
953
- "rewards/margins": 1.6065607070922852,
954
- "rewards/rejected": -2.9732003211975098,
955
- "step": 610
956
- },
957
- {
958
- "epoch": 1.2977498691784406,
959
- "grad_norm": 26.711706547263283,
960
- "learning_rate": 1.647817538357072e-07,
961
- "logits/chosen": -0.6483644247055054,
962
- "logits/rejected": -0.436624675989151,
963
- "logps/chosen": -375.31463623046875,
964
- "logps/rejected": -502.0403747558594,
965
- "loss": 0.3359,
966
- "rewards/accuracies": 0.824999988079071,
967
- "rewards/chosen": -1.1623198986053467,
968
- "rewards/margins": 1.4659651517868042,
969
- "rewards/rejected": -2.6282851696014404,
970
- "step": 620
971
- },
972
- {
973
- "epoch": 1.3186813186813187,
974
- "grad_norm": 29.34167044151856,
975
- "learning_rate": 1.562351990976095e-07,
976
- "logits/chosen": -0.5509764552116394,
977
- "logits/rejected": -0.2896868884563446,
978
- "logps/chosen": -404.48773193359375,
979
- "logps/rejected": -547.499267578125,
980
- "loss": 0.3416,
981
- "rewards/accuracies": 0.887499988079071,
982
- "rewards/chosen": -1.2804601192474365,
983
- "rewards/margins": 1.6254936456680298,
984
- "rewards/rejected": -2.905953884124756,
985
- "step": 630
986
- },
987
- {
988
- "epoch": 1.3396127681841967,
989
- "grad_norm": 23.153596326850824,
990
- "learning_rate": 1.478143389201113e-07,
991
- "logits/chosen": -0.5090593099594116,
992
- "logits/rejected": -0.1941351443529129,
993
- "logps/chosen": -416.89141845703125,
994
- "logps/rejected": -544.4210815429688,
995
- "loss": 0.3508,
996
- "rewards/accuracies": 0.8687499761581421,
997
- "rewards/chosen": -1.4043817520141602,
998
- "rewards/margins": 1.578391194343567,
999
- "rewards/rejected": -2.9827733039855957,
1000
- "step": 640
1001
- },
1002
- {
1003
- "epoch": 1.3605442176870748,
1004
- "grad_norm": 37.7718208645946,
1005
- "learning_rate": 1.3953046172178413e-07,
1006
- "logits/chosen": -0.4676644206047058,
1007
- "logits/rejected": -0.29953527450561523,
1008
- "logps/chosen": -402.21417236328125,
1009
- "logps/rejected": -548.4046020507812,
1010
- "loss": 0.3546,
1011
- "rewards/accuracies": 0.925000011920929,
1012
- "rewards/chosen": -1.1596171855926514,
1013
- "rewards/margins": 1.624544382095337,
1014
- "rewards/rejected": -2.7841618061065674,
1015
- "step": 650
1016
- },
1017
- {
1018
- "epoch": 1.3814756671899528,
1019
- "grad_norm": 32.318559995180884,
1020
- "learning_rate": 1.3139467229135998e-07,
1021
- "logits/chosen": -0.625167191028595,
1022
- "logits/rejected": -0.40474313497543335,
1023
- "logps/chosen": -410.2447204589844,
1024
- "logps/rejected": -531.7794799804688,
1025
- "loss": 0.3438,
1026
- "rewards/accuracies": 0.8687499761581421,
1027
- "rewards/chosen": -1.2046838998794556,
1028
- "rewards/margins": 1.518923044204712,
1029
- "rewards/rejected": -2.723607063293457,
1030
- "step": 660
1031
- },
1032
- {
1033
- "epoch": 1.402407116692831,
1034
- "grad_norm": 26.63444910378978,
1035
- "learning_rate": 1.2341787690142435e-07,
1036
- "logits/chosen": -0.5080076456069946,
1037
- "logits/rejected": -0.39028996229171753,
1038
- "logps/chosen": -417.2349548339844,
1039
- "logps/rejected": -571.6380615234375,
1040
- "loss": 0.3373,
1041
- "rewards/accuracies": 0.887499988079071,
1042
- "rewards/chosen": -1.290380597114563,
1043
- "rewards/margins": 1.4690136909484863,
1044
- "rewards/rejected": -2.7593941688537598,
1045
- "step": 670
1046
- },
1047
- {
1048
- "epoch": 1.423338566195709,
1049
- "grad_norm": 20.19948420893599,
1050
- "learning_rate": 1.1561076868822755e-07,
1051
- "logits/chosen": -0.6434948444366455,
1052
- "logits/rejected": -0.3340326249599457,
1053
- "logps/chosen": -425.06585693359375,
1054
- "logps/rejected": -558.2105712890625,
1055
- "loss": 0.3408,
1056
- "rewards/accuracies": 0.8687499761581421,
1057
- "rewards/chosen": -1.3932050466537476,
1058
- "rewards/margins": 1.6028707027435303,
1059
- "rewards/rejected": -2.9960758686065674,
1060
- "step": 680
1061
- },
1062
- {
1063
- "epoch": 1.4442700156985873,
1064
- "grad_norm": 23.405895905191862,
1065
- "learning_rate": 1.0798381331721107e-07,
1066
- "logits/chosen": -0.45825833082199097,
1067
- "logits/rejected": -0.26569095253944397,
1068
- "logps/chosen": -413.29705810546875,
1069
- "logps/rejected": -516.1429443359375,
1070
- "loss": 0.3387,
1071
- "rewards/accuracies": 0.84375,
1072
- "rewards/chosen": -1.2925164699554443,
1073
- "rewards/margins": 1.4324119091033936,
1074
- "rewards/rejected": -2.724928379058838,
1075
- "step": 690
1076
- },
1077
- {
1078
- "epoch": 1.4652014652014653,
1079
- "grad_norm": 26.00052056036154,
1080
- "learning_rate": 1.0054723495346482e-07,
1081
- "logits/chosen": -0.5345702767372131,
1082
- "logits/rejected": -0.28709763288497925,
1083
- "logps/chosen": -439.5155334472656,
1084
- "logps/rejected": -595.0430908203125,
1085
- "loss": 0.3385,
1086
- "rewards/accuracies": 0.893750011920929,
1087
- "rewards/chosen": -1.335257649421692,
1088
- "rewards/margins": 1.8328659534454346,
1089
- "rewards/rejected": -3.168123483657837,
1090
- "step": 700
1091
- },
1092
- {
1093
- "epoch": 1.4861329147043434,
1094
- "grad_norm": 29.88922989167434,
1095
- "learning_rate": 9.331100255592436e-08,
1096
- "logits/chosen": -0.44970911741256714,
1097
- "logits/rejected": -0.3438431918621063,
1098
- "logps/chosen": -440.60009765625,
1099
- "logps/rejected": -598.9261474609375,
1100
- "loss": 0.3343,
1101
- "rewards/accuracies": 0.8687499761581421,
1102
- "rewards/chosen": -1.5944675207138062,
1103
- "rewards/margins": 1.555371642112732,
1104
- "rewards/rejected": -3.149839401245117,
1105
- "step": 710
1106
- },
1107
- {
1108
- "epoch": 1.5070643642072215,
1109
- "grad_norm": 35.80679957440786,
1110
- "learning_rate": 8.628481651367875e-08,
1111
- "logits/chosen": -0.6270388960838318,
1112
- "logits/rejected": -0.3709457218647003,
1113
- "logps/chosen": -430.08734130859375,
1114
- "logps/rejected": -598.91357421875,
1115
- "loss": 0.324,
1116
- "rewards/accuracies": 0.918749988079071,
1117
- "rewards/chosen": -1.4218626022338867,
1118
- "rewards/margins": 1.803938865661621,
1119
- "rewards/rejected": -3.2258009910583496,
1120
- "step": 720
1121
- },
1122
- {
1123
- "epoch": 1.5279958137100995,
1124
- "grad_norm": 35.40691778150228,
1125
- "learning_rate": 7.947809564230445e-08,
1126
- "logits/chosen": -0.7805054187774658,
1127
- "logits/rejected": -0.5504633188247681,
1128
- "logps/chosen": -417.5584411621094,
1129
- "logps/rejected": -570.6063232421875,
1130
- "loss": 0.3251,
1131
- "rewards/accuracies": 0.8500000238418579,
1132
- "rewards/chosen": -1.2576138973236084,
1133
- "rewards/margins": 1.5007960796356201,
1134
- "rewards/rejected": -2.7584102153778076,
1135
- "step": 730
1136
- },
1137
- {
1138
- "epoch": 1.5489272632129776,
1139
- "grad_norm": 22.542061727173216,
1140
- "learning_rate": 7.289996455765748e-08,
1141
- "logits/chosen": -0.7156350612640381,
1142
- "logits/rejected": -0.5093780755996704,
1143
- "logps/chosen": -440.34375,
1144
- "logps/rejected": -552.8305053710938,
1145
- "loss": 0.3487,
1146
- "rewards/accuracies": 0.875,
1147
- "rewards/chosen": -1.445813536643982,
1148
- "rewards/margins": 1.539942741394043,
1149
- "rewards/rejected": -2.9857563972473145,
1150
- "step": 740
1151
- },
1152
- {
1153
- "epoch": 1.5698587127158556,
1154
- "grad_norm": 22.391994910590135,
1155
- "learning_rate": 6.655924144404906e-08,
1156
- "logits/chosen": -0.5188966393470764,
1157
- "logits/rejected": -0.31423279643058777,
1158
- "logps/chosen": -427.5140686035156,
1159
- "logps/rejected": -574.3653564453125,
1160
- "loss": 0.3262,
1161
- "rewards/accuracies": 0.8812500238418579,
1162
- "rewards/chosen": -1.3562893867492676,
1163
- "rewards/margins": 1.616431474685669,
1164
- "rewards/rejected": -2.9727206230163574,
1165
- "step": 750
1166
- },
1167
- {
1168
- "epoch": 1.5907901622187337,
1169
- "grad_norm": 27.067458253461155,
1170
- "learning_rate": 6.046442623320145e-08,
1171
- "logits/chosen": -0.7770091891288757,
1172
- "logits/rejected": -0.4757114350795746,
1173
- "logps/chosen": -492.76416015625,
1174
- "logps/rejected": -625.2634887695312,
1175
- "loss": 0.3424,
1176
- "rewards/accuracies": 0.8500000238418579,
1177
- "rewards/chosen": -1.4647096395492554,
1178
- "rewards/margins": 1.647743821144104,
1179
- "rewards/rejected": -3.1124536991119385,
1180
- "step": 760
1181
- },
1182
- {
1183
- "epoch": 1.6117216117216118,
1184
- "grad_norm": 27.917693830137047,
1185
- "learning_rate": 5.4623689209832484e-08,
1186
- "logits/chosen": -0.8214691281318665,
1187
- "logits/rejected": -0.43942517042160034,
1188
- "logps/chosen": -453.11199951171875,
1189
- "logps/rejected": -539.7423706054688,
1190
- "loss": 0.3324,
1191
- "rewards/accuracies": 0.8500000238418579,
1192
- "rewards/chosen": -1.3807549476623535,
1193
- "rewards/margins": 1.52043879032135,
1194
- "rewards/rejected": -2.901193618774414,
1195
- "step": 770
1196
- },
1197
- {
1198
- "epoch": 1.6326530612244898,
1199
- "grad_norm": 35.7523647379527,
1200
- "learning_rate": 4.904486005914027e-08,
1201
- "logits/chosen": -0.7380484342575073,
1202
- "logits/rejected": -0.5423134565353394,
1203
- "logps/chosen": -448.4818420410156,
1204
- "logps/rejected": -601.2604370117188,
1205
- "loss": 0.3395,
1206
- "rewards/accuracies": 0.8687499761581421,
1207
- "rewards/chosen": -1.1892478466033936,
1208
- "rewards/margins": 1.734458565711975,
1209
- "rewards/rejected": -2.923706293106079,
1210
- "step": 780
1211
- },
1212
- {
1213
- "epoch": 1.653584510727368,
1214
- "grad_norm": 31.011854985332448,
1215
- "learning_rate": 4.373541737087263e-08,
1216
- "logits/chosen": -0.7779333591461182,
1217
- "logits/rejected": -0.49003076553344727,
1218
- "logps/chosen": -406.6073303222656,
1219
- "logps/rejected": -552.443359375,
1220
- "loss": 0.3239,
1221
- "rewards/accuracies": 0.8687499761581421,
1222
- "rewards/chosen": -1.2308647632598877,
1223
- "rewards/margins": 1.5439412593841553,
1224
- "rewards/rejected": -2.774806261062622,
1225
- "step": 790
1226
- },
1227
- {
1228
- "epoch": 1.674515960230246,
1229
- "grad_norm": 25.052300316679634,
1230
- "learning_rate": 3.8702478614051345e-08,
1231
- "logits/chosen": -0.7519603371620178,
1232
- "logits/rejected": -0.4932466447353363,
1233
- "logps/chosen": -459.8077087402344,
1234
- "logps/rejected": -562.577392578125,
1235
- "loss": 0.3121,
1236
- "rewards/accuracies": 0.893750011920929,
1237
- "rewards/chosen": -1.291195273399353,
1238
- "rewards/margins": 1.5607550144195557,
1239
- "rewards/rejected": -2.851950168609619,
1240
- "step": 800
1241
- },
1242
- {
1243
- "epoch": 1.695447409733124,
1244
- "grad_norm": 28.051358555998323,
1245
- "learning_rate": 3.3952790595787986e-08,
1246
- "logits/chosen": -0.7064075469970703,
1247
- "logits/rejected": -0.5052956938743591,
1248
- "logps/chosen": -427.84832763671875,
1249
- "logps/rejected": -587.9678955078125,
1250
- "loss": 0.3363,
1251
- "rewards/accuracies": 0.887499988079071,
1252
- "rewards/chosen": -1.3051536083221436,
1253
- "rewards/margins": 1.7959152460098267,
1254
- "rewards/rejected": -3.1010687351226807,
1255
- "step": 810
1256
- },
1257
- {
1258
- "epoch": 1.716378859236002,
1259
- "grad_norm": 31.67321567378694,
1260
- "learning_rate": 2.9492720416985e-08,
1261
- "logits/chosen": -0.8305649757385254,
1262
- "logits/rejected": -0.7014783620834351,
1263
- "logps/chosen": -413.65576171875,
1264
- "logps/rejected": -569.78759765625,
1265
- "loss": 0.3315,
1266
- "rewards/accuracies": 0.862500011920929,
1267
- "rewards/chosen": -1.4015345573425293,
1268
- "rewards/margins": 1.6014522314071655,
1269
- "rewards/rejected": -3.002986431121826,
1270
- "step": 820
1271
- },
1272
- {
1273
- "epoch": 1.7373103087388801,
1274
- "grad_norm": 21.644645673450903,
1275
- "learning_rate": 2.5328246937043525e-08,
1276
- "logits/chosen": -0.8004710078239441,
1277
- "logits/rejected": -0.6543309092521667,
1278
- "logps/chosen": -454.3511657714844,
1279
- "logps/rejected": -579.9097900390625,
1280
- "loss": 0.3224,
1281
- "rewards/accuracies": 0.862500011920929,
1282
- "rewards/chosen": -1.3592346906661987,
1283
- "rewards/margins": 1.5871307849884033,
1284
- "rewards/rejected": -2.9463653564453125,
1285
- "step": 830
1286
- },
1287
- {
1288
- "epoch": 1.7582417582417582,
1289
- "grad_norm": 29.695508251939298,
1290
- "learning_rate": 2.1464952759020856e-08,
1291
- "logits/chosen": -0.5647540092468262,
1292
- "logits/rejected": -0.4591120779514313,
1293
- "logps/chosen": -411.2245178222656,
1294
- "logps/rejected": -579.0653076171875,
1295
- "loss": 0.3265,
1296
- "rewards/accuracies": 0.887499988079071,
1297
- "rewards/chosen": -1.4194848537445068,
1298
- "rewards/margins": 1.6787960529327393,
1299
- "rewards/rejected": -3.098280906677246,
1300
- "step": 840
1301
- },
1302
- {
1303
- "epoch": 1.7791732077446363,
1304
- "grad_norm": 30.24532074338387,
1305
- "learning_rate": 1.7908016745981856e-08,
1306
- "logits/chosen": -0.631384551525116,
1307
- "logits/rejected": -0.5165312886238098,
1308
- "logps/chosen": -408.71173095703125,
1309
- "logps/rejected": -562.2457275390625,
1310
- "loss": 0.3275,
1311
- "rewards/accuracies": 0.8500000238418579,
1312
- "rewards/chosen": -1.459183931350708,
1313
- "rewards/margins": 1.5402036905288696,
1314
- "rewards/rejected": -2.999387264251709,
1315
- "step": 850
1316
- },
1317
- {
1318
- "epoch": 1.8001046572475143,
1319
- "grad_norm": 28.810585688980133,
1320
- "learning_rate": 1.4662207078575684e-08,
1321
- "logits/chosen": -0.8192375302314758,
1322
- "logits/rejected": -0.6066277623176575,
1323
- "logps/chosen": -447.3929748535156,
1324
- "logps/rejected": -533.9404296875,
1325
- "loss": 0.3479,
1326
- "rewards/accuracies": 0.84375,
1327
- "rewards/chosen": -1.381157636642456,
1328
- "rewards/margins": 1.4070510864257812,
1329
- "rewards/rejected": -2.788208484649658,
1330
- "step": 860
1331
- },
1332
- {
1333
- "epoch": 1.8210361067503924,
1334
- "grad_norm": 24.013363968760547,
1335
- "learning_rate": 1.1731874863145142e-08,
1336
- "logits/chosen": -0.7617680430412292,
1337
- "logits/rejected": -0.47873860597610474,
1338
- "logps/chosen": -454.9310607910156,
1339
- "logps/rejected": -573.6890869140625,
1340
- "loss": 0.331,
1341
- "rewards/accuracies": 0.8999999761581421,
1342
- "rewards/chosen": -1.3413336277008057,
1343
- "rewards/margins": 1.7013572454452515,
1344
- "rewards/rejected": -3.0426909923553467,
1345
- "step": 870
1346
- },
1347
- {
1348
- "epoch": 1.8419675562532705,
1349
- "grad_norm": 32.07599972860766,
1350
- "learning_rate": 9.12094829893642e-09,
1351
- "logits/chosen": -0.7640255689620972,
1352
- "logits/rejected": -0.4442223012447357,
1353
- "logps/chosen": -438.59490966796875,
1354
- "logps/rejected": -557.7069091796875,
1355
- "loss": 0.3402,
1356
- "rewards/accuracies": 0.8812500238418579,
1357
- "rewards/chosen": -1.409976840019226,
1358
- "rewards/margins": 1.640062928199768,
1359
- "rewards/rejected": -3.050039529800415,
1360
- "step": 880
1361
- },
1362
- {
1363
- "epoch": 1.8628990057561485,
1364
- "grad_norm": 24.432518139204586,
1365
- "learning_rate": 6.832927412229017e-09,
1366
- "logits/chosen": -0.8613092303276062,
1367
- "logits/rejected": -0.6306430101394653,
1368
- "logps/chosen": -456.99072265625,
1369
- "logps/rejected": -594.9552001953125,
1370
- "loss": 0.3223,
1371
- "rewards/accuracies": 0.9125000238418579,
1372
- "rewards/chosen": -1.3222033977508545,
1373
- "rewards/margins": 1.7843767404556274,
1374
- "rewards/rejected": -3.1065802574157715,
1375
- "step": 890
1376
- },
1377
- {
1378
- "epoch": 1.8838304552590266,
1379
- "grad_norm": 27.645379277195858,
1380
- "learning_rate": 4.8708793644441086e-09,
1381
- "logits/chosen": -0.7418426275253296,
1382
- "logits/rejected": -0.4674626886844635,
1383
- "logps/chosen": -423.3577575683594,
1384
- "logps/rejected": -554.7745361328125,
1385
- "loss": 0.3093,
1386
- "rewards/accuracies": 0.925000011920929,
1387
- "rewards/chosen": -1.3512873649597168,
1388
- "rewards/margins": 1.6987825632095337,
1389
- "rewards/rejected": -3.050069808959961,
1390
- "step": 900
1391
- },
1392
- {
1393
- "epoch": 1.9047619047619047,
1394
- "grad_norm": 28.923783797405456,
1395
- "learning_rate": 3.2374343405217884e-09,
1396
- "logits/chosen": -0.6597197651863098,
1397
- "logits/rejected": -0.4865621030330658,
1398
- "logps/chosen": -408.0905456542969,
1399
- "logps/rejected": -535.84521484375,
1400
- "loss": 0.3367,
1401
- "rewards/accuracies": 0.887499988079071,
1402
- "rewards/chosen": -1.4176795482635498,
1403
- "rewards/margins": 1.6123201847076416,
1404
- "rewards/rejected": -3.0299999713897705,
1405
- "step": 910
1406
- },
1407
- {
1408
- "epoch": 1.9256933542647827,
1409
- "grad_norm": 28.331424470243782,
1410
- "learning_rate": 1.9347820230782295e-09,
1411
- "logits/chosen": -0.8034309148788452,
1412
- "logits/rejected": -0.6734964847564697,
1413
- "logps/chosen": -424.65673828125,
1414
- "logps/rejected": -592.0272216796875,
1415
- "loss": 0.3345,
1416
- "rewards/accuracies": 0.8812500238418579,
1417
- "rewards/chosen": -1.302234172821045,
1418
- "rewards/margins": 1.7047960758209229,
1419
- "rewards/rejected": -3.0070300102233887,
1420
- "step": 920
1421
- },
1422
- {
1423
- "epoch": 1.9466248037676608,
1424
- "grad_norm": 31.096436385559773,
1425
- "learning_rate": 9.64668657069706e-10,
1426
- "logits/chosen": -0.8711435198783875,
1427
- "logits/rejected": -0.6492515206336975,
1428
- "logps/chosen": -412.20159912109375,
1429
- "logps/rejected": -539.3062744140625,
1430
- "loss": 0.3326,
1431
- "rewards/accuracies": 0.856249988079071,
1432
- "rewards/chosen": -1.3435027599334717,
1433
- "rewards/margins": 1.5116230249404907,
1434
- "rewards/rejected": -2.855125904083252,
1435
- "step": 930
1436
- },
1437
- {
1438
- "epoch": 1.9675562532705388,
1439
- "grad_norm": 39.27145853425959,
1440
- "learning_rate": 3.2839470889836627e-10,
1441
- "logits/chosen": -0.8457353711128235,
1442
- "logits/rejected": -0.46064504981040955,
1443
- "logps/chosen": -439.29815673828125,
1444
- "logps/rejected": -539.5534057617188,
1445
- "loss": 0.3418,
1446
- "rewards/accuracies": 0.8812500238418579,
1447
- "rewards/chosen": -1.3375287055969238,
1448
- "rewards/margins": 1.5794165134429932,
1449
- "rewards/rejected": -2.916945457458496,
1450
- "step": 940
1451
- },
1452
- {
1453
- "epoch": 1.988487702773417,
1454
- "grad_norm": 30.98924464228554,
1455
- "learning_rate": 2.6813123097352287e-11,
1456
- "logits/chosen": -0.7241233587265015,
1457
- "logits/rejected": -0.4993128180503845,
1458
- "logps/chosen": -443.7752380371094,
1459
- "logps/rejected": -581.8499755859375,
1460
- "loss": 0.3187,
1461
- "rewards/accuracies": 0.893750011920929,
1462
- "rewards/chosen": -1.385292649269104,
1463
- "rewards/margins": 1.7712970972061157,
1464
- "rewards/rejected": -3.1565897464752197,
1465
- "step": 950
1466
- },
1467
- {
1468
- "epoch": 1.9968602825745683,
1469
- "eval_logits/chosen": -0.5949550271034241,
1470
- "eval_logits/rejected": -0.44720327854156494,
1471
- "eval_logps/chosen": -426.89898681640625,
1472
- "eval_logps/rejected": -545.3427124023438,
1473
- "eval_loss": 0.5225037932395935,
1474
- "eval_rewards/accuracies": 0.7734375,
1475
- "eval_rewards/chosen": -1.547555685043335,
1476
- "eval_rewards/margins": 1.1972318887710571,
1477
- "eval_rewards/rejected": -2.7447874546051025,
1478
- "eval_runtime": 170.0564,
1479
- "eval_samples_per_second": 11.761,
1480
- "eval_steps_per_second": 0.188,
1481
- "step": 954
1482
- },
1483
- {
1484
- "epoch": 1.9968602825745683,
1485
- "step": 954,
1486
  "total_flos": 0.0,
1487
- "train_loss": 0.0,
1488
- "train_runtime": 3.4239,
1489
- "train_samples_per_second": 17855.176,
1490
- "train_steps_per_second": 139.316
1491
  }
1492
  ],
1493
  "logging_steps": 10,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9984301412872841,
5
  "eval_steps": 500,
6
+ "global_step": 477,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0020931449502878076,
13
+ "grad_norm": 2.2216637005002386,
14
+ "learning_rate": 1.0416666666666666e-08,
15
+ "logits/chosen": -1.6314841508865356,
16
+ "logits/rejected": -1.353194236755371,
17
+ "logps/chosen": -368.29144287109375,
18
+ "logps/rejected": -370.7230224609375,
19
  "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
 
25
  },
26
  {
27
  "epoch": 0.020931449502878074,
28
+ "grad_norm": 2.1524494849038778,
29
+ "learning_rate": 1.0416666666666667e-07,
30
+ "logits/chosen": -1.2345138788223267,
31
+ "logits/rejected": -1.203288197517395,
32
+ "logps/chosen": -314.1005554199219,
33
+ "logps/rejected": -286.3752136230469,
34
+ "loss": 0.693,
35
+ "rewards/accuracies": 0.4930555522441864,
36
+ "rewards/chosen": -2.686006700969301e-05,
37
+ "rewards/margins": 0.0007661848794668913,
38
+ "rewards/rejected": -0.0007930449792183936,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.04186289900575615,
43
+ "grad_norm": 2.0655596511164083,
44
+ "learning_rate": 2.0833333333333333e-07,
45
+ "logits/chosen": -1.304213523864746,
46
+ "logits/rejected": -1.1859759092330933,
47
+ "logps/chosen": -320.55169677734375,
48
+ "logps/rejected": -283.34149169921875,
49
  "loss": 0.6931,
50
+ "rewards/accuracies": 0.518750011920929,
51
+ "rewards/chosen": -0.00017411960288882256,
52
+ "rewards/margins": 5.0039543566526845e-05,
53
+ "rewards/rejected": -0.0002241591428173706,
54
  "step": 20
55
  },
56
  {
57
  "epoch": 0.06279434850863422,
58
+ "grad_norm": 2.12111028976462,
59
+ "learning_rate": 3.1249999999999997e-07,
60
+ "logits/chosen": -1.3263781070709229,
61
+ "logits/rejected": -1.2141128778457642,
62
+ "logps/chosen": -294.7076110839844,
63
+ "logps/rejected": -251.52481079101562,
64
+ "loss": 0.6931,
65
+ "rewards/accuracies": 0.4937500059604645,
66
+ "rewards/chosen": 0.0012247291160747409,
67
+ "rewards/margins": -0.0003233866882510483,
68
+ "rewards/rejected": 0.0015481156297028065,
69
  "step": 30
70
  },
71
  {
72
  "epoch": 0.0837257980115123,
73
+ "grad_norm": 1.9880970924710342,
74
+ "learning_rate": 4.1666666666666667e-07,
75
+ "logits/chosen": -1.2525466680526733,
76
+ "logits/rejected": -1.1410564184188843,
77
+ "logps/chosen": -270.8800354003906,
78
+ "logps/rejected": -266.47222900390625,
79
+ "loss": 0.6922,
80
+ "rewards/accuracies": 0.5625,
81
+ "rewards/chosen": 0.005721802823245525,
82
+ "rewards/margins": 0.0018067390192300081,
83
+ "rewards/rejected": 0.003915064036846161,
84
  "step": 40
85
  },
86
  {
87
  "epoch": 0.10465724751439037,
88
+ "grad_norm": 2.1837707951870677,
89
+ "learning_rate": 4.999731868769026e-07,
90
+ "logits/chosen": -1.2781437635421753,
91
+ "logits/rejected": -1.1634522676467896,
92
+ "logps/chosen": -299.9478454589844,
93
+ "logps/rejected": -279.74188232421875,
94
+ "loss": 0.6902,
95
+ "rewards/accuracies": 0.59375,
96
+ "rewards/chosen": 0.014337332919239998,
97
+ "rewards/margins": 0.005479713901877403,
98
+ "rewards/rejected": 0.00885761994868517,
99
  "step": 50
100
  },
101
  {
102
  "epoch": 0.12558869701726844,
103
+ "grad_norm": 2.3039683597630227,
104
+ "learning_rate": 4.990353313429303e-07,
105
+ "logits/chosen": -1.3196525573730469,
106
+ "logits/rejected": -1.2014460563659668,
107
+ "logps/chosen": -259.46942138671875,
108
+ "logps/rejected": -252.1083984375,
109
+ "loss": 0.6885,
110
+ "rewards/accuracies": 0.731249988079071,
111
+ "rewards/chosen": 0.027976389974355698,
112
+ "rewards/margins": 0.010958237573504448,
113
+ "rewards/rejected": 0.0170181542634964,
114
  "step": 60
115
  },
116
  {
117
  "epoch": 0.14652014652014653,
118
+ "grad_norm": 2.025942457662755,
119
+ "learning_rate": 4.967625656594781e-07,
120
+ "logits/chosen": -1.246511697769165,
121
+ "logits/rejected": -1.1918096542358398,
122
+ "logps/chosen": -304.3538513183594,
123
+ "logps/rejected": -286.70849609375,
124
+ "loss": 0.6857,
125
+ "rewards/accuracies": 0.65625,
126
+ "rewards/chosen": 0.034880757331848145,
127
+ "rewards/margins": 0.013793361373245716,
128
+ "rewards/rejected": 0.021087396889925003,
129
  "step": 70
130
  },
131
  {
132
  "epoch": 0.1674515960230246,
133
+ "grad_norm": 2.139816541586556,
134
+ "learning_rate": 4.93167072587771e-07,
135
+ "logits/chosen": -1.3581794500350952,
136
+ "logits/rejected": -1.1709249019622803,
137
+ "logps/chosen": -337.73638916015625,
138
+ "logps/rejected": -256.8132629394531,
139
+ "loss": 0.6816,
140
+ "rewards/accuracies": 0.737500011920929,
141
+ "rewards/chosen": 0.04202472046017647,
142
+ "rewards/margins": 0.029817480593919754,
143
+ "rewards/rejected": 0.01220723893493414,
144
  "step": 80
145
  },
146
  {
147
  "epoch": 0.18838304552590268,
148
+ "grad_norm": 2.184463703029387,
149
+ "learning_rate": 4.882681251368548e-07,
150
+ "logits/chosen": -1.2384573221206665,
151
+ "logits/rejected": -1.1635406017303467,
152
+ "logps/chosen": -260.00115966796875,
153
+ "logps/rejected": -258.2013244628906,
154
+ "loss": 0.6781,
155
+ "rewards/accuracies": 0.6499999761581421,
156
+ "rewards/chosen": 0.025884713977575302,
157
+ "rewards/margins": 0.02733892761170864,
158
+ "rewards/rejected": -0.0014542154967784882,
159
  "step": 90
160
  },
161
  {
162
  "epoch": 0.20931449502878074,
163
+ "grad_norm": 2.302037128469088,
164
+ "learning_rate": 4.820919832540181e-07,
165
+ "logits/chosen": -1.304317831993103,
166
+ "logits/rejected": -1.1416256427764893,
167
+ "logps/chosen": -312.20867919921875,
168
+ "logps/rejected": -292.9761962890625,
169
+ "loss": 0.6731,
170
+ "rewards/accuracies": 0.7437499761581421,
171
+ "rewards/chosen": 0.016404878348112106,
172
+ "rewards/margins": 0.0562920868396759,
173
+ "rewards/rejected": -0.0398872084915638,
174
  "step": 100
175
  },
176
  {
177
  "epoch": 0.2302459445316588,
178
+ "grad_norm": 2.476640494327824,
179
+ "learning_rate": 4.7467175306295647e-07,
180
+ "logits/chosen": -1.3065650463104248,
181
+ "logits/rejected": -1.2025437355041504,
182
+ "logps/chosen": -299.5341796875,
183
+ "logps/rejected": -286.65081787109375,
184
+ "loss": 0.6719,
185
+ "rewards/accuracies": 0.637499988079071,
186
+ "rewards/chosen": -0.019125858321785927,
187
+ "rewards/margins": 0.05537647008895874,
188
+ "rewards/rejected": -0.07450231909751892,
189
  "step": 110
190
  },
191
  {
192
  "epoch": 0.25117739403453687,
193
+ "grad_norm": 2.42503756687085,
194
+ "learning_rate": 4.6604720940421207e-07,
195
+ "logits/chosen": -1.3212846517562866,
196
+ "logits/rejected": -1.3139406442642212,
197
+ "logps/chosen": -273.289306640625,
198
+ "logps/rejected": -277.60711669921875,
199
+ "loss": 0.6601,
200
+ "rewards/accuracies": 0.65625,
201
+ "rewards/chosen": -0.03048776648938656,
202
+ "rewards/margins": 0.06633703410625458,
203
+ "rewards/rejected": -0.09682480990886688,
204
  "step": 120
205
  },
206
  {
207
  "epoch": 0.272108843537415,
208
+ "grad_norm": 2.7217854249308493,
209
+ "learning_rate": 4.5626458262912735e-07,
210
+ "logits/chosen": -1.3054877519607544,
211
+ "logits/rejected": -1.209935188293457,
212
+ "logps/chosen": -295.87884521484375,
213
+ "logps/rejected": -290.7591247558594,
214
+ "loss": 0.652,
215
+ "rewards/accuracies": 0.6499999761581421,
216
+ "rewards/chosen": -0.06190527230501175,
217
+ "rewards/margins": 0.07534609735012054,
218
+ "rewards/rejected": -0.1372513771057129,
219
  "step": 130
220
  },
221
  {
222
  "epoch": 0.29304029304029305,
223
+ "grad_norm": 2.884195278457084,
224
+ "learning_rate": 4.453763107901675e-07,
225
+ "logits/chosen": -1.3432964086532593,
226
+ "logits/rejected": -1.202289342880249,
227
+ "logps/chosen": -339.2923278808594,
228
+ "logps/rejected": -292.0403137207031,
229
+ "loss": 0.6479,
230
+ "rewards/accuracies": 0.7124999761581421,
231
+ "rewards/chosen": -0.05155748128890991,
232
+ "rewards/margins": 0.12817394733428955,
233
+ "rewards/rejected": -0.17973145842552185,
234
  "step": 140
235
  },
236
  {
237
  "epoch": 0.3139717425431711,
238
+ "grad_norm": 3.293629728295891,
239
+ "learning_rate": 4.3344075855595097e-07,
240
+ "logits/chosen": -1.3512450456619263,
241
+ "logits/rejected": -1.267796516418457,
242
+ "logps/chosen": -301.23553466796875,
243
+ "logps/rejected": -276.87689208984375,
244
+ "loss": 0.6454,
245
+ "rewards/accuracies": 0.731249988079071,
246
+ "rewards/chosen": -0.09927480667829514,
247
+ "rewards/margins": 0.12617069482803345,
248
+ "rewards/rejected": -0.22544550895690918,
249
  "step": 150
250
  },
251
  {
252
  "epoch": 0.3349031920460492,
253
+ "grad_norm": 5.329039698823607,
254
+ "learning_rate": 4.2052190435769554e-07,
255
+ "logits/chosen": -1.3824660778045654,
256
+ "logits/rejected": -1.1998783349990845,
257
+ "logps/chosen": -293.38714599609375,
258
+ "logps/rejected": -284.198486328125,
259
+ "loss": 0.6335,
260
+ "rewards/accuracies": 0.668749988079071,
261
+ "rewards/chosen": -0.07924506813287735,
262
+ "rewards/margins": 0.18151573836803436,
263
+ "rewards/rejected": -0.2607608139514923,
264
  "step": 160
265
  },
266
  {
267
  "epoch": 0.35583464154892724,
268
+ "grad_norm": 4.73498829155891,
269
+ "learning_rate": 4.0668899744407567e-07,
270
+ "logits/chosen": -1.3166488409042358,
271
+ "logits/rejected": -1.225121021270752,
272
+ "logps/chosen": -283.9803161621094,
273
+ "logps/rejected": -269.71551513671875,
274
+ "loss": 0.6341,
275
+ "rewards/accuracies": 0.7124999761581421,
276
+ "rewards/chosen": -0.157345250248909,
277
+ "rewards/margins": 0.14062240719795227,
278
+ "rewards/rejected": -0.2979676425457001,
279
  "step": 170
280
  },
281
  {
282
  "epoch": 0.37676609105180536,
283
+ "grad_norm": 4.148844444439802,
284
+ "learning_rate": 3.920161866827889e-07,
285
+ "logits/chosen": -1.2676044702529907,
286
+ "logits/rejected": -1.2061588764190674,
287
+ "logps/chosen": -293.41998291015625,
288
+ "logps/rejected": -282.3377380371094,
289
+ "loss": 0.6255,
290
+ "rewards/accuracies": 0.6937500238418579,
291
+ "rewards/chosen": -0.14930489659309387,
292
+ "rewards/margins": 0.13889411091804504,
293
+ "rewards/rejected": -0.2881990075111389,
294
  "step": 180
295
  },
296
  {
297
  "epoch": 0.3976975405546834,
298
+ "grad_norm": 4.210268144972537,
299
+ "learning_rate": 3.765821230985757e-07,
300
+ "logits/chosen": -1.270572543144226,
301
+ "logits/rejected": -1.255990982055664,
302
+ "logps/chosen": -278.5187683105469,
303
+ "logps/rejected": -289.1994934082031,
304
+ "loss": 0.6236,
305
+ "rewards/accuracies": 0.6499999761581421,
306
+ "rewards/chosen": -0.05700838565826416,
307
+ "rewards/margins": 0.1534106433391571,
308
+ "rewards/rejected": -0.21041902899742126,
309
  "step": 190
310
  },
311
  {
312
  "epoch": 0.4186289900575615,
313
+ "grad_norm": 4.513968949295029,
314
+ "learning_rate": 3.604695382782159e-07,
315
+ "logits/chosen": -1.2794849872589111,
316
+ "logits/rejected": -1.2373203039169312,
317
+ "logps/chosen": -275.9900817871094,
318
+ "logps/rejected": -305.5406188964844,
319
+ "loss": 0.6285,
320
+ "rewards/accuracies": 0.643750011920929,
321
+ "rewards/chosen": -0.1595189869403839,
322
+ "rewards/margins": 0.13092303276062012,
323
+ "rewards/rejected": -0.29044201970100403,
324
  "step": 200
325
  },
326
  {
327
  "epoch": 0.43956043956043955,
328
+ "grad_norm": 4.130260296784633,
329
+ "learning_rate": 3.4376480090239047e-07,
330
+ "logits/chosen": -1.3395261764526367,
331
+ "logits/rejected": -1.160781741142273,
332
+ "logps/chosen": -336.36309814453125,
333
+ "logps/rejected": -299.7681884765625,
334
+ "loss": 0.6274,
335
+ "rewards/accuracies": 0.706250011920929,
336
+ "rewards/chosen": -0.1335216909646988,
337
+ "rewards/margins": 0.22881793975830078,
338
+ "rewards/rejected": -0.36233964562416077,
339
  "step": 210
340
  },
341
  {
342
  "epoch": 0.4604918890633176,
343
+ "grad_norm": 4.2020468567803135,
344
+ "learning_rate": 3.265574537815398e-07,
345
+ "logits/chosen": -1.2917451858520508,
346
+ "logits/rejected": -1.2045356035232544,
347
+ "logps/chosen": -264.0050354003906,
348
+ "logps/rejected": -286.49566650390625,
349
+ "loss": 0.6234,
350
+ "rewards/accuracies": 0.6937500238418579,
351
+ "rewards/chosen": -0.21042123436927795,
352
+ "rewards/margins": 0.1836375892162323,
353
+ "rewards/rejected": -0.39405879378318787,
354
  "step": 220
355
  },
356
  {
357
  "epoch": 0.48142333856619574,
358
+ "grad_norm": 5.462895375975161,
359
+ "learning_rate": 3.0893973387735683e-07,
360
+ "logits/chosen": -1.33823561668396,
361
+ "logits/rejected": -1.204892873764038,
362
+ "logps/chosen": -285.23309326171875,
363
+ "logps/rejected": -287.7167663574219,
364
+ "loss": 0.6225,
365
+ "rewards/accuracies": 0.7437499761581421,
366
+ "rewards/chosen": -0.1863180696964264,
367
+ "rewards/margins": 0.2673104703426361,
368
+ "rewards/rejected": -0.4536285400390625,
369
  "step": 230
370
  },
371
  {
372
  "epoch": 0.5023547880690737,
373
+ "grad_norm": 4.249954784852311,
374
+ "learning_rate": 2.910060778827554e-07,
375
+ "logits/chosen": -1.311199426651001,
376
+ "logits/rejected": -1.22099769115448,
377
+ "logps/chosen": -313.4962463378906,
378
+ "logps/rejected": -319.8878479003906,
379
+ "loss": 0.606,
380
  "rewards/accuracies": 0.699999988079071,
381
+ "rewards/chosen": -0.2013070285320282,
382
+ "rewards/margins": 0.27799052000045776,
383
+ "rewards/rejected": -0.47929757833480835,
384
  "step": 240
385
  },
386
  {
387
  "epoch": 0.5232862375719518,
388
+ "grad_norm": 6.044167355639739,
389
+ "learning_rate": 2.7285261601056697e-07,
390
+ "logits/chosen": -1.3467825651168823,
391
+ "logits/rejected": -1.194777011871338,
392
+ "logps/chosen": -313.8829040527344,
393
+ "logps/rejected": -296.2748107910156,
394
+ "loss": 0.612,
395
+ "rewards/accuracies": 0.706250011920929,
396
+ "rewards/chosen": -0.2433101385831833,
397
+ "rewards/margins": 0.253059446811676,
398
+ "rewards/rejected": -0.49636954069137573,
399
  "step": 250
400
  },
401
  {
402
  "epoch": 0.54421768707483,
403
+ "grad_norm": 4.939384686255436,
404
+ "learning_rate": 2.5457665670441937e-07,
405
+ "logits/chosen": -1.2902387380599976,
406
+ "logits/rejected": -1.2375991344451904,
407
+ "logps/chosen": -289.4366455078125,
408
+ "logps/rejected": -296.7834167480469,
409
+ "loss": 0.608,
410
+ "rewards/accuracies": 0.6499999761581421,
411
+ "rewards/chosen": -0.23678669333457947,
412
+ "rewards/margins": 0.2222580909729004,
413
+ "rewards/rejected": -0.45904478430747986,
414
  "step": 260
415
  },
416
  {
417
  "epoch": 0.565149136577708,
418
+ "grad_norm": 3.639946353207058,
419
+ "learning_rate": 2.3627616503391812e-07,
420
+ "logits/chosen": -1.321746826171875,
421
+ "logits/rejected": -1.22855806350708,
422
+ "logps/chosen": -322.65325927734375,
423
+ "logps/rejected": -311.2021179199219,
424
+ "loss": 0.6071,
425
+ "rewards/accuracies": 0.6499999761581421,
426
+ "rewards/chosen": -0.13227801024913788,
427
+ "rewards/margins": 0.22432568669319153,
428
+ "rewards/rejected": -0.3566036820411682,
429
  "step": 270
430
  },
431
  {
432
  "epoch": 0.5860805860805861,
433
+ "grad_norm": 4.275254606059205,
434
+ "learning_rate": 2.1804923757009882e-07,
435
+ "logits/chosen": -1.275221347808838,
436
+ "logits/rejected": -1.1522352695465088,
437
+ "logps/chosen": -280.47003173828125,
438
+ "logps/rejected": -272.01348876953125,
439
+ "loss": 0.6143,
440
  "rewards/accuracies": 0.6625000238418579,
441
+ "rewards/chosen": -0.12610167264938354,
442
+ "rewards/margins": 0.22788269817829132,
443
+ "rewards/rejected": -0.35398435592651367,
444
  "step": 280
445
  },
446
  {
447
  "epoch": 0.6070120355834642,
448
+ "grad_norm": 5.423999562212257,
449
+ "learning_rate": 1.9999357655598891e-07,
450
+ "logits/chosen": -1.2647807598114014,
451
+ "logits/rejected": -1.2297732830047607,
452
+ "logps/chosen": -280.0606384277344,
453
+ "logps/rejected": -303.05706787109375,
454
+ "loss": 0.606,
455
+ "rewards/accuracies": 0.706250011920929,
456
+ "rewards/chosen": -0.24480751156806946,
457
+ "rewards/margins": 0.22092871367931366,
458
+ "rewards/rejected": -0.4657362103462219,
459
  "step": 290
460
  },
461
  {
462
  "epoch": 0.6279434850863422,
463
+ "grad_norm": 4.555810704174,
464
+ "learning_rate": 1.8220596619089573e-07,
465
+ "logits/chosen": -1.3479591608047485,
466
+ "logits/rejected": -1.2260494232177734,
467
+ "logps/chosen": -363.6427307128906,
468
+ "logps/rejected": -332.61077880859375,
469
+ "loss": 0.6015,
470
+ "rewards/accuracies": 0.6312500238418579,
471
+ "rewards/chosen": -0.21298007667064667,
472
+ "rewards/margins": 0.23558974266052246,
473
+ "rewards/rejected": -0.4485698342323303,
474
  "step": 300
475
  },
476
  {
477
  "epoch": 0.6488749345892203,
478
+ "grad_norm": 4.921819098251592,
479
+ "learning_rate": 1.647817538357072e-07,
480
+ "logits/chosen": -1.3201913833618164,
481
+ "logits/rejected": -1.2364635467529297,
482
+ "logps/chosen": -336.68865966796875,
483
+ "logps/rejected": -303.8711242675781,
484
+ "loss": 0.5976,
485
+ "rewards/accuracies": 0.6812499761581421,
486
+ "rewards/chosen": -0.18723243474960327,
487
+ "rewards/margins": 0.2934957444667816,
488
+ "rewards/rejected": -0.4807282090187073,
489
  "step": 310
490
  },
491
  {
492
  "epoch": 0.6698063840920984,
493
+ "grad_norm": 5.4800197473848025,
494
+ "learning_rate": 1.478143389201113e-07,
495
+ "logits/chosen": -1.2567358016967773,
496
+ "logits/rejected": -1.1461724042892456,
497
+ "logps/chosen": -284.4916076660156,
498
+ "logps/rejected": -278.87237548828125,
499
+ "loss": 0.5938,
500
+ "rewards/accuracies": 0.75,
501
+ "rewards/chosen": -0.20529761910438538,
502
+ "rewards/margins": 0.2834058701992035,
503
+ "rewards/rejected": -0.48870348930358887,
504
  "step": 320
505
  },
506
  {
507
  "epoch": 0.6907378335949764,
508
+ "grad_norm": 5.489304392569405,
509
+ "learning_rate": 1.3139467229135998e-07,
510
+ "logits/chosen": -1.3123420476913452,
511
+ "logits/rejected": -1.279762625694275,
512
+ "logps/chosen": -300.2234802246094,
513
+ "logps/rejected": -313.7867126464844,
514
+ "loss": 0.6092,
515
+ "rewards/accuracies": 0.706250011920929,
516
+ "rewards/chosen": -0.14779067039489746,
517
+ "rewards/margins": 0.2438393384218216,
518
+ "rewards/rejected": -0.39162999391555786,
519
  "step": 330
520
  },
521
  {
522
  "epoch": 0.7116692830978545,
523
+ "grad_norm": 7.6710695980682,
524
+ "learning_rate": 1.1561076868822755e-07,
525
+ "logits/chosen": -1.3397032022476196,
526
+ "logits/rejected": -1.2899221181869507,
527
+ "logps/chosen": -317.85455322265625,
528
+ "logps/rejected": -308.87054443359375,
529
+ "loss": 0.5991,
530
+ "rewards/accuracies": 0.7250000238418579,
531
+ "rewards/chosen": -0.12762676179409027,
532
+ "rewards/margins": 0.22320690751075745,
533
+ "rewards/rejected": -0.3508336842060089,
534
  "step": 340
535
  },
536
  {
537
  "epoch": 0.7326007326007326,
538
+ "grad_norm": 4.866645044943069,
539
+ "learning_rate": 1.0054723495346482e-07,
540
+ "logits/chosen": -1.2954719066619873,
541
+ "logits/rejected": -1.1627123355865479,
542
+ "logps/chosen": -287.058837890625,
543
+ "logps/rejected": -284.2725524902344,
544
+ "loss": 0.5907,
545
+ "rewards/accuracies": 0.6937500238418579,
546
+ "rewards/chosen": -0.17578962445259094,
547
+ "rewards/margins": 0.2943018078804016,
548
+ "rewards/rejected": -0.47009140253067017,
549
  "step": 350
550
  },
551
  {
552
  "epoch": 0.7535321821036107,
553
+ "grad_norm": 4.349425498711494,
554
+ "learning_rate": 8.628481651367875e-08,
555
+ "logits/chosen": -1.2945648431777954,
556
+ "logits/rejected": -1.2246530055999756,
557
+ "logps/chosen": -341.4446716308594,
558
+ "logps/rejected": -321.95196533203125,
559
+ "loss": 0.6128,
560
+ "rewards/accuracies": 0.7250000238418579,
561
+ "rewards/chosen": -0.1850513368844986,
562
+ "rewards/margins": 0.2748066782951355,
563
+ "rewards/rejected": -0.4598580300807953,
564
  "step": 360
565
  },
566
  {
567
  "epoch": 0.7744636316064888,
568
+ "grad_norm": 4.798940209140212,
569
+ "learning_rate": 7.289996455765748e-08,
570
+ "logits/chosen": -1.2601999044418335,
571
+ "logits/rejected": -1.1612764596939087,
572
+ "logps/chosen": -292.4561767578125,
573
+ "logps/rejected": -281.00152587890625,
574
+ "loss": 0.5979,
575
+ "rewards/accuracies": 0.6937500238418579,
576
+ "rewards/chosen": -0.17593416571617126,
577
+ "rewards/margins": 0.2297048270702362,
578
+ "rewards/rejected": -0.40563899278640747,
579
  "step": 370
580
  },
581
  {
582
  "epoch": 0.7953950811093669,
583
+ "grad_norm": 6.704056490224638,
584
+ "learning_rate": 6.046442623320145e-08,
585
+ "logits/chosen": -1.3166625499725342,
586
+ "logits/rejected": -1.2426071166992188,
587
+ "logps/chosen": -280.49658203125,
588
+ "logps/rejected": -327.62945556640625,
589
+ "loss": 0.5965,
590
+ "rewards/accuracies": 0.7250000238418579,
591
+ "rewards/chosen": -0.27102166414260864,
592
+ "rewards/margins": 0.3087245225906372,
593
+ "rewards/rejected": -0.5797461271286011,
594
  "step": 380
595
  },
596
  {
597
  "epoch": 0.8163265306122449,
598
+ "grad_norm": 4.8192929971991605,
599
+ "learning_rate": 4.904486005914027e-08,
600
+ "logits/chosen": -1.4123605489730835,
601
+ "logits/rejected": -1.334108591079712,
602
+ "logps/chosen": -365.5756530761719,
603
+ "logps/rejected": -360.2679443359375,
604
+ "loss": 0.5969,
605
  "rewards/accuracies": 0.731249988079071,
606
+ "rewards/chosen": -0.1903412640094757,
607
+ "rewards/margins": 0.3084810674190521,
608
+ "rewards/rejected": -0.4988223612308502,
609
  "step": 390
610
  },
611
  {
612
  "epoch": 0.837257980115123,
613
+ "grad_norm": 5.625190550048412,
614
+ "learning_rate": 3.8702478614051345e-08,
615
+ "logits/chosen": -1.3394266366958618,
616
+ "logits/rejected": -1.2082809209823608,
617
+ "logps/chosen": -290.7579040527344,
618
+ "logps/rejected": -297.6111145019531,
619
+ "loss": 0.6015,
620
+ "rewards/accuracies": 0.731249988079071,
621
+ "rewards/chosen": -0.20424337685108185,
622
+ "rewards/margins": 0.30908313393592834,
623
+ "rewards/rejected": -0.5133264660835266,
624
  "step": 400
625
  },
626
  {
627
  "epoch": 0.858189429618001,
628
+ "grad_norm": 6.001911832374207,
629
+ "learning_rate": 2.9492720416985e-08,
630
+ "logits/chosen": -1.3413333892822266,
631
+ "logits/rejected": -1.1819798946380615,
632
+ "logps/chosen": -345.768798828125,
633
+ "logps/rejected": -324.0213317871094,
634
+ "loss": 0.6113,
635
+ "rewards/accuracies": 0.7124999761581421,
636
+ "rewards/chosen": -0.2449842393398285,
637
+ "rewards/margins": 0.26237279176712036,
638
+ "rewards/rejected": -0.5073570013046265,
639
  "step": 410
640
  },
641
  {
642
  "epoch": 0.8791208791208791,
643
+ "grad_norm": 7.098479038891308,
644
+ "learning_rate": 2.1464952759020856e-08,
645
+ "logits/chosen": -1.3054345846176147,
646
+ "logits/rejected": -1.212334394454956,
647
+ "logps/chosen": -300.0458984375,
648
+ "logps/rejected": -334.25799560546875,
649
+ "loss": 0.5909,
650
+ "rewards/accuracies": 0.668749988079071,
651
+ "rewards/chosen": -0.3056102991104126,
652
+ "rewards/margins": 0.30079561471939087,
653
+ "rewards/rejected": -0.6064059138298035,
654
  "step": 420
655
  },
656
  {
657
  "epoch": 0.9000523286237572,
658
+ "grad_norm": 5.986649846414336,
659
+ "learning_rate": 1.4662207078575684e-08,
660
+ "logits/chosen": -1.2815589904785156,
661
+ "logits/rejected": -1.230791449546814,
662
+ "logps/chosen": -327.59326171875,
663
+ "logps/rejected": -328.6330261230469,
664
+ "loss": 0.5866,
665
+ "rewards/accuracies": 0.699999988079071,
666
+ "rewards/chosen": -0.2670283317565918,
667
+ "rewards/margins": 0.29276418685913086,
668
+ "rewards/rejected": -0.5597925186157227,
669
  "step": 430
670
  },
671
  {
672
  "epoch": 0.9209837781266352,
673
+ "grad_norm": 6.667255378058642,
674
+ "learning_rate": 9.12094829893642e-09,
675
+ "logits/chosen": -1.3328527212142944,
676
+ "logits/rejected": -1.2644946575164795,
677
+ "logps/chosen": -298.36688232421875,
678
+ "logps/rejected": -278.701416015625,
679
+ "loss": 0.5928,
680
+ "rewards/accuracies": 0.6812499761581421,
681
+ "rewards/chosen": -0.23760783672332764,
682
+ "rewards/margins": 0.25196802616119385,
683
+ "rewards/rejected": -0.4895758628845215,
684
  "step": 440
685
  },
686
  {
687
  "epoch": 0.9419152276295133,
688
+ "grad_norm": 6.876184304010807,
689
+ "learning_rate": 4.8708793644441086e-09,
690
+ "logits/chosen": -1.387279748916626,
691
+ "logits/rejected": -1.2943140268325806,
692
+ "logps/chosen": -305.9947509765625,
693
+ "logps/rejected": -321.4436340332031,
694
+ "loss": 0.5988,
695
+ "rewards/accuracies": 0.699999988079071,
696
+ "rewards/chosen": -0.2561241090297699,
697
+ "rewards/margins": 0.28780630230903625,
698
+ "rewards/rejected": -0.5439304113388062,
699
  "step": 450
700
  },
701
  {
702
  "epoch": 0.9628466771323915,
703
+ "grad_norm": 5.120319319432807,
704
+ "learning_rate": 1.9347820230782295e-09,
705
+ "logits/chosen": -1.3500282764434814,
706
+ "logits/rejected": -1.2091432809829712,
707
+ "logps/chosen": -310.1429443359375,
708
+ "logps/rejected": -289.270263671875,
709
+ "loss": 0.6001,
710
+ "rewards/accuracies": 0.699999988079071,
711
+ "rewards/chosen": -0.25589779019355774,
712
+ "rewards/margins": 0.28265419602394104,
713
+ "rewards/rejected": -0.5385519862174988,
714
  "step": 460
715
  },
716
  {
717
  "epoch": 0.9837781266352695,
718
+ "grad_norm": 4.959973504548126,
719
+ "learning_rate": 3.2839470889836627e-10,
720
+ "logits/chosen": -1.307715892791748,
721
+ "logits/rejected": -1.2506450414657593,
722
+ "logps/chosen": -334.43817138671875,
723
+ "logps/rejected": -338.33966064453125,
724
+ "loss": 0.5873,
725
+ "rewards/accuracies": 0.6937500238418579,
726
+ "rewards/chosen": -0.23161163926124573,
727
+ "rewards/margins": 0.2939731180667877,
728
+ "rewards/rejected": -0.5255848169326782,
729
  "step": 470
730
  },
731
  {
732
  "epoch": 0.9984301412872841,
733
+ "eval_logits/chosen": -1.2491042613983154,
734
+ "eval_logits/rejected": -1.1529250144958496,
735
+ "eval_logps/chosen": -301.5587463378906,
736
+ "eval_logps/rejected": -332.839599609375,
737
+ "eval_loss": 0.5996649861335754,
738
+ "eval_rewards/accuracies": 0.70703125,
739
+ "eval_rewards/chosen": -0.20619867742061615,
740
+ "eval_rewards/margins": 0.3225747346878052,
741
+ "eval_rewards/rejected": -0.5287734866142273,
742
+ "eval_runtime": 144.7492,
743
+ "eval_samples_per_second": 13.817,
744
+ "eval_steps_per_second": 0.221,
745
  "step": 477
746
  },
747
  {
748
+ "epoch": 0.9984301412872841,
749
+ "step": 477,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
750
  "total_flos": 0.0,
751
+ "train_loss": 0.6286907725863986,
752
+ "train_runtime": 7498.289,
753
+ "train_samples_per_second": 8.153,
754
+ "train_steps_per_second": 0.064
755
  }
756
  ],
757
  "logging_steps": 10,