li-muyang committed on
Commit
87560e5
·
verified ·
1 Parent(s): a55fce1

Model save

Browse files
README.md CHANGED
@@ -17,15 +17,15 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  This model was trained from scratch on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
- - Logits/chosen: -1.8431
21
- - Logits/rejected: -1.7970
22
- - Logps/chosen: -488.8500
23
- - Logps/rejected: -619.1251
24
- - Loss: 0.5119
25
- - Rewards/accuracies: 0.7461
26
- - Rewards/chosen: -2.1151
27
- - Rewards/margins: 1.3443
28
- - Rewards/rejected: -3.4594
29
 
30
  ## Model description
31
 
@@ -62,8 +62,8 @@ The following hyperparameters were used during training:
62
 
63
  | Training Loss | Epoch | Step | Logits/chosen | Logits/rejected | Logps/chosen | Logps/rejected | Validation Loss | Rewards/accuracies | Rewards/chosen | Rewards/margins | Rewards/rejected |
64
  |:-------------:|:------:|:----:|:-------------:|:---------------:|:------------:|:--------------:|:---------------:|:------------------:|:--------------:|:---------------:|:----------------:|
65
- | 0.4926 | 0.9984 | 477 | -2.1760 | -2.1455 | -412.9510 | -506.2944 | 0.5082 | 0.7578 | -1.3561 | 0.9750 | -2.3311 |
66
- | 0.3138 | 1.9969 | 954 | -1.8431 | -1.7970 | -488.8500 | -619.1251 | 0.5119 | 0.7461 | -2.1151 | 1.3443 | -3.4594 |
67
 
68
 
69
  ### Framework versions
 
17
 
18
  This model was trained from scratch on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Logits/chosen: -0.5950
21
+ - Logits/rejected: -0.4472
22
+ - Logps/chosen: -426.8990
23
+ - Logps/rejected: -545.3427
24
+ - Loss: 0.5225
25
+ - Rewards/accuracies: 0.7734
26
+ - Rewards/chosen: -1.5476
27
+ - Rewards/margins: 1.1972
28
+ - Rewards/rejected: -2.7448
29
 
30
  ## Model description
31
 
 
62
 
63
  | Training Loss | Epoch | Step | Logits/chosen | Logits/rejected | Logps/chosen | Logps/rejected | Validation Loss | Rewards/accuracies | Rewards/chosen | Rewards/margins | Rewards/rejected |
64
  |:-------------:|:------:|:----:|:-------------:|:---------------:|:------------:|:--------------:|:---------------:|:------------------:|:--------------:|:---------------:|:----------------:|
65
+ | 0.501 | 0.9984 | 477 | -1.2732 | -1.1615 | -346.5501 | -435.7064 | 0.5185 | 0.7773 | -0.7441 | 0.9044 | -1.6484 |
66
+ | 0.3187 | 1.9969 | 954 | -0.5950 | -0.4472 | -426.8990 | -545.3427 | 0.5225 | 0.7734 | -1.5476 | 1.1972 | -2.7448 |
67
 
68
 
69
  ### Framework versions
all_results.json CHANGED
@@ -1,22 +1,22 @@
1
  {
2
  "epoch": 1.9968602825745683,
3
- "eval_logits/chosen": -1.843076467514038,
4
- "eval_logits/rejected": -1.7970439195632935,
5
- "eval_logps/chosen": -488.8499755859375,
6
- "eval_logps/rejected": -619.1251220703125,
7
- "eval_loss": 1.217253565788269,
8
- "eval_rewards/accuracies": 0.74609375,
9
- "eval_rewards/chosen": -10.575380325317383,
10
- "eval_rewards/margins": 6.721622467041016,
11
- "eval_rewards/rejected": -17.2970027923584,
12
- "eval_runtime": 180.4143,
13
  "eval_samples": 2000,
14
- "eval_samples_per_second": 11.086,
15
- "eval_steps_per_second": 0.177,
16
  "total_flos": 0.0,
17
  "train_loss": 0.0,
18
- "train_runtime": 3.478,
19
  "train_samples": 61134,
20
- "train_samples_per_second": 17577.112,
21
- "train_steps_per_second": 137.146
22
  }
 
1
  {
2
  "epoch": 1.9968602825745683,
3
+ "eval_logits/chosen": -0.5949550271034241,
4
+ "eval_logits/rejected": -0.44720327854156494,
5
+ "eval_logps/chosen": -426.89898681640625,
6
+ "eval_logps/rejected": -545.3427124023438,
7
+ "eval_loss": 1.1966235637664795,
8
+ "eval_rewards/accuracies": 0.7734375,
9
+ "eval_rewards/chosen": -7.737778663635254,
10
+ "eval_rewards/margins": 5.986159324645996,
11
+ "eval_rewards/rejected": -13.723937034606934,
12
+ "eval_runtime": 182.4829,
13
  "eval_samples": 2000,
14
+ "eval_samples_per_second": 10.96,
15
+ "eval_steps_per_second": 0.175,
16
  "total_flos": 0.0,
17
  "train_loss": 0.0,
18
+ "train_runtime": 3.4239,
19
  "train_samples": 61134,
20
+ "train_samples_per_second": 17855.176,
21
+ "train_steps_per_second": 139.316
22
  }
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "data/sft/zephyr-7b-sft-1e-every25/checkpoint-200",
3
  "architectures": [
4
  "MistralForCausalLM"
5
  ],
@@ -22,6 +22,6 @@
22
  "tie_word_embeddings": false,
23
  "torch_dtype": "bfloat16",
24
  "transformers_version": "4.45.2",
25
- "use_cache": false,
26
  "vocab_size": 32000
27
  }
 
1
  {
2
+ "_name_or_path": "data/sft/zephyr-7b-sft-1e-every25/checkpoint-700",
3
  "architectures": [
4
  "MistralForCausalLM"
5
  ],
 
22
  "tie_word_embeddings": false,
23
  "torch_dtype": "bfloat16",
24
  "transformers_version": "4.45.2",
25
+ "use_cache": true,
26
  "vocab_size": 32000
27
  }
eval_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
  "epoch": 1.9968602825745683,
3
- "eval_logits/chosen": -1.843076467514038,
4
- "eval_logits/rejected": -1.7970439195632935,
5
- "eval_logps/chosen": -488.8499755859375,
6
- "eval_logps/rejected": -619.1251220703125,
7
- "eval_loss": 1.217253565788269,
8
- "eval_rewards/accuracies": 0.74609375,
9
- "eval_rewards/chosen": -10.575380325317383,
10
- "eval_rewards/margins": 6.721622467041016,
11
- "eval_rewards/rejected": -17.2970027923584,
12
- "eval_runtime": 180.4143,
13
  "eval_samples": 2000,
14
- "eval_samples_per_second": 11.086,
15
- "eval_steps_per_second": 0.177
16
  }
 
1
  {
2
  "epoch": 1.9968602825745683,
3
+ "eval_logits/chosen": -0.5949550271034241,
4
+ "eval_logits/rejected": -0.44720327854156494,
5
+ "eval_logps/chosen": -426.89898681640625,
6
+ "eval_logps/rejected": -545.3427124023438,
7
+ "eval_loss": 1.1966235637664795,
8
+ "eval_rewards/accuracies": 0.7734375,
9
+ "eval_rewards/chosen": -7.737778663635254,
10
+ "eval_rewards/margins": 5.986159324645996,
11
+ "eval_rewards/rejected": -13.723937034606934,
12
+ "eval_runtime": 182.4829,
13
  "eval_samples": 2000,
14
+ "eval_samples_per_second": 10.96,
15
+ "eval_steps_per_second": 0.175
16
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d409451aee7e67f7af769c603992848bb56686f2128725dc784dd7116cf33841
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4972678ffd391d087c4bac63c1b36a29f720341b4209786c908dd86d831d6743
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8587b5070be5d2d68540351a78d713ee93922409df181e41978f6c67a7753634
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d64a5b766f7d7253c75c391e10b48930d0ae1069fdf00413a63935fe368678d
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:21db7468292c3c37b9fa7faed58f9e587e4299595d99e8b7b0c2873fc4b5238a
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8980262882ff0e180af996a8e4e6254086d7ec8b69eab295bc7820f8b92788a9
3
  size 4540516344
train_results.json CHANGED
@@ -2,8 +2,8 @@
2
  "epoch": 1.9968602825745683,
3
  "total_flos": 0.0,
4
  "train_loss": 0.0,
5
- "train_runtime": 3.478,
6
  "train_samples": 61134,
7
- "train_samples_per_second": 17577.112,
8
- "train_steps_per_second": 137.146
9
  }
 
2
  "epoch": 1.9968602825745683,
3
  "total_flos": 0.0,
4
  "train_loss": 0.0,
5
+ "train_runtime": 3.4239,
6
  "train_samples": 61134,
7
+ "train_samples_per_second": 17855.176,
8
+ "train_steps_per_second": 139.316
9
  }
trainer_state.json CHANGED
@@ -10,12 +10,12 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0020931449502878076,
13
- "grad_norm": 8.648654168519636,
14
  "learning_rate": 5.208333333333333e-09,
15
- "logits/chosen": -2.911508798599243,
16
- "logits/rejected": -2.8825056552886963,
17
- "logps/chosen": -328.824462890625,
18
- "logps/rejected": -375.2520751953125,
19
  "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
@@ -25,1459 +25,1459 @@
25
  },
26
  {
27
  "epoch": 0.020931449502878074,
28
- "grad_norm": 8.529629402090206,
29
  "learning_rate": 5.208333333333333e-08,
30
- "logits/chosen": -2.738391876220703,
31
- "logits/rejected": -2.715987205505371,
32
- "logps/chosen": -320.3747863769531,
33
- "logps/rejected": -286.63323974609375,
34
- "loss": 0.6932,
35
- "rewards/accuracies": 0.4375,
36
- "rewards/chosen": -1.6565963960601948e-05,
37
- "rewards/margins": -6.0656500863842666e-05,
38
- "rewards/rejected": 4.4090484152548015e-05,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.04186289900575615,
43
- "grad_norm": 8.121160296339491,
44
  "learning_rate": 1.0416666666666667e-07,
45
- "logits/chosen": -2.812523365020752,
46
- "logits/rejected": -2.765321731567383,
47
- "logps/chosen": -326.799072265625,
48
- "logps/rejected": -291.5594177246094,
49
- "loss": 0.693,
50
- "rewards/accuracies": 0.543749988079071,
51
- "rewards/chosen": 0.0013127574929967523,
52
- "rewards/margins": 0.0006053571123629808,
53
- "rewards/rejected": 0.0007074003806337714,
54
  "step": 20
55
  },
56
  {
57
  "epoch": 0.06279434850863422,
58
- "grad_norm": 7.841014680458238,
59
  "learning_rate": 1.5624999999999999e-07,
60
- "logits/chosen": -2.791351318359375,
61
- "logits/rejected": -2.737368583679199,
62
- "logps/chosen": -304.5863952636719,
63
- "logps/rejected": -255.7802734375,
64
- "loss": 0.6923,
65
  "rewards/accuracies": 0.6187499761581421,
66
- "rewards/chosen": 0.005904044955968857,
67
- "rewards/margins": 0.0021464950405061245,
68
- "rewards/rejected": 0.003757549449801445,
69
  "step": 30
70
  },
71
  {
72
  "epoch": 0.0837257980115123,
73
- "grad_norm": 7.780689883117447,
74
  "learning_rate": 2.0833333333333333e-07,
75
- "logits/chosen": -2.7076315879821777,
76
- "logits/rejected": -2.685647487640381,
77
- "logps/chosen": -276.40045166015625,
78
- "logps/rejected": -266.44720458984375,
79
- "loss": 0.69,
80
- "rewards/accuracies": 0.6812499761581421,
81
- "rewards/chosen": 0.015561707317829132,
82
- "rewards/margins": 0.00809282623231411,
83
- "rewards/rejected": 0.007468880619853735,
84
  "step": 40
85
  },
86
  {
87
  "epoch": 0.10465724751439037,
88
- "grad_norm": 7.837366185969384,
89
  "learning_rate": 2.604166666666667e-07,
90
- "logits/chosen": -2.7266910076141357,
91
- "logits/rejected": -2.665973424911499,
92
- "logps/chosen": -287.90960693359375,
93
- "logps/rejected": -280.13787841796875,
94
- "loss": 0.6871,
95
- "rewards/accuracies": 0.7124999761581421,
96
- "rewards/chosen": 0.026710817590355873,
97
- "rewards/margins": 0.01235121302306652,
98
- "rewards/rejected": 0.014359605498611927,
99
  "step": 50
100
  },
101
  {
102
  "epoch": 0.12558869701726844,
103
- "grad_norm": 8.68562739775634,
104
  "learning_rate": 3.1249999999999997e-07,
105
- "logits/chosen": -2.738069534301758,
106
- "logits/rejected": -2.694655656814575,
107
- "logps/chosen": -261.654296875,
108
- "logps/rejected": -249.44241333007812,
109
- "loss": 0.6811,
110
- "rewards/accuracies": 0.71875,
111
- "rewards/chosen": 0.039315465837717056,
112
- "rewards/margins": 0.02459501288831234,
113
- "rewards/rejected": 0.014720454812049866,
114
  "step": 60
115
  },
116
  {
117
  "epoch": 0.14652014652014653,
118
- "grad_norm": 8.17805459442737,
119
  "learning_rate": 3.645833333333333e-07,
120
- "logits/chosen": -2.6594691276550293,
121
- "logits/rejected": -2.6348719596862793,
122
- "logps/chosen": -304.16754150390625,
123
- "logps/rejected": -292.7433166503906,
124
- "loss": 0.6713,
125
- "rewards/accuracies": 0.699999988079071,
126
- "rewards/chosen": 0.03804076835513115,
127
- "rewards/margins": 0.0464973971247673,
128
- "rewards/rejected": -0.008456626906991005,
129
  "step": 70
130
  },
131
  {
132
  "epoch": 0.1674515960230246,
133
- "grad_norm": 9.10750617456122,
134
  "learning_rate": 4.1666666666666667e-07,
135
- "logits/chosen": -2.713014602661133,
136
- "logits/rejected": -2.63926362991333,
137
- "logps/chosen": -343.3531494140625,
138
- "logps/rejected": -272.9491271972656,
139
- "loss": 0.6603,
140
- "rewards/accuracies": 0.737500011920929,
141
- "rewards/chosen": 0.01621154509484768,
142
- "rewards/margins": 0.09821438044309616,
143
- "rewards/rejected": -0.08200283348560333,
144
  "step": 80
145
  },
146
  {
147
  "epoch": 0.18838304552590268,
148
- "grad_norm": 10.344063716824344,
149
  "learning_rate": 4.6874999999999996e-07,
150
- "logits/chosen": -2.6566672325134277,
151
- "logits/rejected": -2.616140127182007,
152
- "logps/chosen": -273.27459716796875,
153
- "logps/rejected": -285.70361328125,
154
- "loss": 0.6362,
155
- "rewards/accuracies": 0.6937500238418579,
156
- "rewards/chosen": -0.09348011016845703,
157
- "rewards/margins": 0.12727037072181702,
158
- "rewards/rejected": -0.22075048089027405,
159
  "step": 90
160
  },
161
  {
162
  "epoch": 0.20931449502878074,
163
- "grad_norm": 18.46434787727296,
164
  "learning_rate": 4.999731868769026e-07,
165
- "logits/chosen": -2.62548828125,
166
- "logits/rejected": -2.604613780975342,
167
- "logps/chosen": -326.0329895019531,
168
- "logps/rejected": -328.9161071777344,
169
- "loss": 0.6209,
170
- "rewards/accuracies": 0.706250011920929,
171
- "rewards/chosen": -0.15337222814559937,
172
- "rewards/margins": 0.27119967341423035,
173
- "rewards/rejected": -0.4245719313621521,
174
  "step": 100
175
  },
176
  {
177
  "epoch": 0.2302459445316588,
178
- "grad_norm": 12.853014931259976,
179
  "learning_rate": 4.996716052911017e-07,
180
- "logits/chosen": -2.6524977684020996,
181
- "logits/rejected": -2.613800287246704,
182
- "logps/chosen": -333.86810302734375,
183
- "logps/rejected": -339.0367431640625,
184
- "loss": 0.6155,
185
- "rewards/accuracies": 0.65625,
186
- "rewards/chosen": -0.2943059504032135,
187
- "rewards/margins": 0.2476470172405243,
188
- "rewards/rejected": -0.5419529676437378,
189
  "step": 110
190
  },
191
  {
192
  "epoch": 0.25117739403453687,
193
- "grad_norm": 13.434978559765035,
194
  "learning_rate": 4.990353313429303e-07,
195
- "logits/chosen": -2.5527138710021973,
196
- "logits/rejected": -2.5590224266052246,
197
- "logps/chosen": -309.571044921875,
198
- "logps/rejected": -332.9461364746094,
199
- "loss": 0.598,
200
- "rewards/accuracies": 0.737500011920929,
201
- "rewards/chosen": -0.2963547110557556,
202
- "rewards/margins": 0.33014291524887085,
203
- "rewards/rejected": -0.6264976263046265,
204
  "step": 120
205
  },
206
  {
207
  "epoch": 0.272108843537415,
208
- "grad_norm": 13.77722186805988,
209
  "learning_rate": 4.980652179769217e-07,
210
- "logits/chosen": -2.5915229320526123,
211
- "logits/rejected": -2.5797364711761475,
212
- "logps/chosen": -358.6594543457031,
213
- "logps/rejected": -371.49652099609375,
214
- "loss": 0.588,
215
- "rewards/accuracies": 0.6812499761581421,
216
- "rewards/chosen": -0.6360726356506348,
217
- "rewards/margins": 0.286227822303772,
218
- "rewards/rejected": -0.9223003387451172,
219
  "step": 130
220
  },
221
  {
222
  "epoch": 0.29304029304029305,
223
- "grad_norm": 19.98956871690929,
224
  "learning_rate": 4.967625656594781e-07,
225
- "logits/chosen": -2.6256630420684814,
226
- "logits/rejected": -2.6000983715057373,
227
- "logps/chosen": -368.37274169921875,
228
- "logps/rejected": -357.54522705078125,
229
- "loss": 0.5846,
230
- "rewards/accuracies": 0.75,
231
- "rewards/chosen": -0.3657643496990204,
232
- "rewards/margins": 0.411146879196167,
233
- "rewards/rejected": -0.776911199092865,
234
  "step": 140
235
  },
236
  {
237
  "epoch": 0.3139717425431711,
238
- "grad_norm": 18.241862609597362,
239
  "learning_rate": 4.951291206355559e-07,
240
- "logits/chosen": -2.5957682132720947,
241
- "logits/rejected": -2.570385456085205,
242
- "logps/chosen": -348.45361328125,
243
- "logps/rejected": -350.3853759765625,
244
- "loss": 0.5898,
245
- "rewards/accuracies": 0.706250011920929,
246
- "rewards/chosen": -0.5664342641830444,
247
- "rewards/margins": 0.37359124422073364,
248
- "rewards/rejected": -0.9400255084037781,
249
  "step": 150
250
  },
251
  {
252
  "epoch": 0.3349031920460492,
253
- "grad_norm": 16.695399210826476,
254
  "learning_rate": 4.93167072587771e-07,
255
- "logits/chosen": -2.6461105346679688,
256
- "logits/rejected": -2.6081748008728027,
257
- "logps/chosen": -334.31024169921875,
258
- "logps/rejected": -353.1805419921875,
259
- "loss": 0.5763,
260
- "rewards/accuracies": 0.7562500238418579,
261
- "rewards/chosen": -0.45854535698890686,
262
- "rewards/margins": 0.46823421120643616,
263
- "rewards/rejected": -0.926779568195343,
264
  "step": 160
265
  },
266
  {
267
  "epoch": 0.35583464154892724,
268
- "grad_norm": 20.770623175746024,
269
  "learning_rate": 4.908790517010636e-07,
270
- "logits/chosen": -2.5199551582336426,
271
- "logits/rejected": -2.5082385540008545,
272
- "logps/chosen": -325.3274230957031,
273
- "logps/rejected": -337.1798400878906,
274
- "loss": 0.5817,
275
- "rewards/accuracies": 0.71875,
276
- "rewards/chosen": -0.5404187440872192,
277
- "rewards/margins": 0.3684747815132141,
278
- "rewards/rejected": -0.9088934659957886,
279
  "step": 170
280
  },
281
  {
282
  "epoch": 0.37676609105180536,
283
- "grad_norm": 17.458454310533863,
284
  "learning_rate": 4.882681251368548e-07,
285
- "logits/chosen": -2.5944693088531494,
286
- "logits/rejected": -2.57043719291687,
287
- "logps/chosen": -339.6058044433594,
288
- "logps/rejected": -346.19708251953125,
289
- "loss": 0.5499,
290
- "rewards/accuracies": 0.71875,
291
- "rewards/chosen": -0.5662221312522888,
292
- "rewards/margins": 0.3738660216331482,
293
- "rewards/rejected": -0.9400882720947266,
294
  "step": 180
295
  },
296
  {
297
  "epoch": 0.3976975405546834,
298
- "grad_norm": 21.162626385312407,
299
  "learning_rate": 4.853377929214243e-07,
300
- "logits/chosen": -2.5164248943328857,
301
- "logits/rejected": -2.490086793899536,
302
- "logps/chosen": -326.11981201171875,
303
- "logps/rejected": -356.9112548828125,
304
- "loss": 0.567,
305
- "rewards/accuracies": 0.699999988079071,
306
- "rewards/chosen": -0.5691932439804077,
307
- "rewards/margins": 0.3675684630870819,
308
- "rewards/rejected": -0.9367616772651672,
309
  "step": 190
310
  },
311
  {
312
  "epoch": 0.4186289900575615,
313
- "grad_norm": 81.07404869193526,
314
  "learning_rate": 4.820919832540181e-07,
315
- "logits/chosen": -2.495347499847412,
316
- "logits/rejected": -2.5034618377685547,
317
- "logps/chosen": -325.27166748046875,
318
- "logps/rejected": -383.38934326171875,
319
- "loss": 0.5841,
320
- "rewards/accuracies": 0.71875,
321
- "rewards/chosen": -0.6413096189498901,
322
- "rewards/margins": 0.3838884234428406,
323
- "rewards/rejected": -1.0251981019973755,
324
  "step": 200
325
  },
326
  {
327
  "epoch": 0.43956043956043955,
328
- "grad_norm": 20.383161365084906,
329
  "learning_rate": 4.785350472409791e-07,
330
- "logits/chosen": -2.5574824810028076,
331
- "logits/rejected": -2.5132758617401123,
332
- "logps/chosen": -382.79547119140625,
333
- "logps/rejected": -362.0451354980469,
334
- "loss": 0.5673,
335
- "rewards/accuracies": 0.6937500238418579,
336
- "rewards/chosen": -0.5895588397979736,
337
- "rewards/margins": 0.43525028228759766,
338
- "rewards/rejected": -1.0248091220855713,
339
  "step": 210
340
  },
341
  {
342
  "epoch": 0.4604918890633176,
343
- "grad_norm": 19.259994881877645,
344
  "learning_rate": 4.7467175306295647e-07,
345
- "logits/chosen": -2.493474245071411,
346
- "logits/rejected": -2.5184781551361084,
347
- "logps/chosen": -318.1018981933594,
348
- "logps/rejected": -364.78167724609375,
349
- "loss": 0.561,
350
- "rewards/accuracies": 0.768750011920929,
351
- "rewards/chosen": -0.6697767376899719,
352
- "rewards/margins": 0.4676804542541504,
353
- "rewards/rejected": -1.137457251548767,
354
  "step": 220
355
  },
356
  {
357
  "epoch": 0.48142333856619574,
358
- "grad_norm": 19.775692922777765,
359
  "learning_rate": 4.70507279583015e-07,
360
- "logits/chosen": -2.6063382625579834,
361
- "logits/rejected": -2.5750064849853516,
362
- "logps/chosen": -334.16009521484375,
363
- "logps/rejected": -381.5287780761719,
364
- "loss": 0.5634,
365
- "rewards/accuracies": 0.7749999761581421,
366
- "rewards/chosen": -0.6287840008735657,
367
- "rewards/margins": 0.7023510932922363,
368
- "rewards/rejected": -1.3311350345611572,
369
  "step": 230
370
  },
371
  {
372
  "epoch": 0.5023547880690737,
373
- "grad_norm": 26.289584097515366,
374
  "learning_rate": 4.6604720940421207e-07,
375
- "logits/chosen": -2.5913660526275635,
376
- "logits/rejected": -2.5510830879211426,
377
- "logps/chosen": -363.0348205566406,
378
- "logps/rejected": -385.59173583984375,
379
- "loss": 0.5304,
380
  "rewards/accuracies": 0.699999988079071,
381
- "rewards/chosen": -0.599415123462677,
382
- "rewards/margins": 0.555901050567627,
383
- "rewards/rejected": -1.1553161144256592,
384
  "step": 240
385
  },
386
  {
387
  "epoch": 0.5232862375719518,
388
- "grad_norm": 20.96954894885707,
389
  "learning_rate": 4.612975213859487e-07,
390
- "logits/chosen": -2.5373878479003906,
391
- "logits/rejected": -2.4917821884155273,
392
- "logps/chosen": -379.81591796875,
393
- "logps/rejected": -413.7845764160156,
394
- "loss": 0.5326,
395
- "rewards/accuracies": 0.8062499761581421,
396
- "rewards/chosen": -0.8831035494804382,
397
- "rewards/margins": 0.783009946346283,
398
- "rewards/rejected": -1.6661134958267212,
399
  "step": 250
400
  },
401
  {
402
  "epoch": 0.54421768707483,
403
- "grad_norm": 20.35055226575624,
404
  "learning_rate": 4.5626458262912735e-07,
405
- "logits/chosen": -2.5308146476745605,
406
- "logits/rejected": -2.5432188510894775,
407
- "logps/chosen": -362.18731689453125,
408
- "logps/rejected": -401.601318359375,
409
- "loss": 0.527,
410
- "rewards/accuracies": 0.7124999761581421,
411
- "rewards/chosen": -0.8506525754928589,
412
- "rewards/margins": 0.6240705847740173,
413
- "rewards/rejected": -1.474723219871521,
414
  "step": 260
415
  },
416
  {
417
  "epoch": 0.565149136577708,
418
- "grad_norm": 21.42532206002976,
419
  "learning_rate": 4.5095513994085974e-07,
420
- "logits/chosen": -2.396043300628662,
421
- "logits/rejected": -2.3887431621551514,
422
- "logps/chosen": -395.9293212890625,
423
- "logps/rejected": -445.9908142089844,
424
- "loss": 0.5364,
425
  "rewards/accuracies": 0.7437499761581421,
426
- "rewards/chosen": -0.8952659368515015,
427
- "rewards/margins": 0.8105502128601074,
428
- "rewards/rejected": -1.7058160305023193,
429
  "step": 270
430
  },
431
  {
432
  "epoch": 0.5860805860805861,
433
- "grad_norm": 24.442795762265213,
434
  "learning_rate": 4.453763107901675e-07,
435
- "logits/chosen": -2.251249074935913,
436
- "logits/rejected": -2.2444510459899902,
437
- "logps/chosen": -361.8575744628906,
438
- "logps/rejected": -387.69476318359375,
439
- "loss": 0.5385,
440
- "rewards/accuracies": 0.6937500238418579,
441
- "rewards/chosen": -0.8830081820487976,
442
- "rewards/margins": 0.6050563454627991,
443
- "rewards/rejected": -1.4880646467208862,
444
  "step": 280
445
  },
446
  {
447
  "epoch": 0.6070120355834642,
448
- "grad_norm": 21.25843011088378,
449
  "learning_rate": 4.395355737667985e-07,
450
- "logits/chosen": -2.3289453983306885,
451
- "logits/rejected": -2.2574009895324707,
452
- "logps/chosen": -364.3363342285156,
453
- "logps/rejected": -418.8705139160156,
454
- "loss": 0.5286,
455
- "rewards/accuracies": 0.7562500238418579,
456
- "rewards/chosen": -1.0685086250305176,
457
- "rewards/margins": 0.6174401044845581,
458
- "rewards/rejected": -1.6859489679336548,
459
  "step": 290
460
  },
461
  {
462
  "epoch": 0.6279434850863422,
463
- "grad_norm": 21.179507487381557,
464
  "learning_rate": 4.3344075855595097e-07,
465
- "logits/chosen": -2.2611992359161377,
466
- "logits/rejected": -2.218019962310791,
467
- "logps/chosen": -428.0699768066406,
468
- "logps/rejected": -442.8394470214844,
469
- "loss": 0.5118,
470
- "rewards/accuracies": 0.731249988079071,
471
- "rewards/chosen": -0.9603120684623718,
472
- "rewards/margins": 0.6408740878105164,
473
- "rewards/rejected": -1.6011863946914673,
474
  "step": 300
475
  },
476
  {
477
  "epoch": 0.6488749345892203,
478
- "grad_norm": 26.830608117395943,
479
  "learning_rate": 4.271000354423425e-07,
480
- "logits/chosen": -2.3035359382629395,
481
- "logits/rejected": -2.2721774578094482,
482
- "logps/chosen": -399.39703369140625,
483
- "logps/rejected": -419.53594970703125,
484
- "loss": 0.5142,
485
- "rewards/accuracies": 0.762499988079071,
486
- "rewards/chosen": -0.8477271199226379,
487
- "rewards/margins": 0.7570197582244873,
488
- "rewards/rejected": -1.6047470569610596,
489
  "step": 310
490
  },
491
  {
492
  "epoch": 0.6698063840920984,
493
- "grad_norm": 25.48809105017836,
494
  "learning_rate": 4.2052190435769554e-07,
495
- "logits/chosen": -2.2605085372924805,
496
- "logits/rejected": -2.199134349822998,
497
- "logps/chosen": -384.046142578125,
498
- "logps/rejected": -435.5370178222656,
499
- "loss": 0.5029,
500
  "rewards/accuracies": 0.831250011920929,
501
- "rewards/chosen": -1.1322637796401978,
502
- "rewards/margins": 0.8748885989189148,
503
- "rewards/rejected": -2.007152557373047,
504
  "step": 320
505
  },
506
  {
507
  "epoch": 0.6907378335949764,
508
- "grad_norm": 25.806873877392455,
509
  "learning_rate": 4.137151834863213e-07,
510
- "logits/chosen": -2.2592532634735107,
511
- "logits/rejected": -2.2413222789764404,
512
- "logps/chosen": -370.25860595703125,
513
- "logps/rejected": -438.299072265625,
514
- "loss": 0.5145,
515
- "rewards/accuracies": 0.7562500238418579,
516
- "rewards/chosen": -0.8921791911125183,
517
- "rewards/margins": 0.7555935382843018,
518
- "rewards/rejected": -1.6477725505828857,
519
  "step": 330
520
  },
521
  {
522
  "epoch": 0.7116692830978545,
523
- "grad_norm": 31.495534978222874,
524
  "learning_rate": 4.0668899744407567e-07,
525
- "logits/chosen": -2.151144504547119,
526
- "logits/rejected": -2.1227824687957764,
527
- "logps/chosen": -416.178955078125,
528
- "logps/rejected": -469.052490234375,
529
- "loss": 0.5129,
530
- "rewards/accuracies": 0.7250000238418579,
531
- "rewards/chosen": -1.136067271232605,
532
- "rewards/margins": 0.7855342626571655,
533
- "rewards/rejected": -1.9216015338897705,
534
  "step": 340
535
  },
536
  {
537
  "epoch": 0.7326007326007326,
538
- "grad_norm": 22.383341966309303,
539
  "learning_rate": 3.994527650465352e-07,
540
- "logits/chosen": -2.245666027069092,
541
- "logits/rejected": -2.1971018314361572,
542
- "logps/chosen": -376.11187744140625,
543
- "logps/rejected": -433.7845764160156,
544
- "loss": 0.4804,
545
- "rewards/accuracies": 0.8187500238418579,
546
- "rewards/chosen": -1.0256567001342773,
547
- "rewards/margins": 0.8312705159187317,
548
- "rewards/rejected": -1.856927514076233,
549
  "step": 350
550
  },
551
  {
552
  "epoch": 0.7535321821036107,
553
- "grad_norm": 22.50493532567032,
554
  "learning_rate": 3.920161866827889e-07,
555
- "logits/chosen": -2.2463080883026123,
556
- "logits/rejected": -2.190325975418091,
557
- "logps/chosen": -426.4788513183594,
558
- "logps/rejected": -468.1583557128906,
559
- "loss": 0.5312,
560
- "rewards/accuracies": 0.768750011920929,
561
- "rewards/chosen": -1.0429120063781738,
562
- "rewards/margins": 0.8944908976554871,
563
- "rewards/rejected": -1.9374029636383057,
564
  "step": 360
565
  },
566
  {
567
  "epoch": 0.7744636316064888,
568
- "grad_norm": 31.530089384956717,
569
  "learning_rate": 3.8438923131177237e-07,
570
- "logits/chosen": -2.1957874298095703,
571
- "logits/rejected": -2.1412596702575684,
572
- "logps/chosen": -406.10504150390625,
573
- "logps/rejected": -453.65252685546875,
574
- "loss": 0.5133,
575
- "rewards/accuracies": 0.762499988079071,
576
- "rewards/chosen": -1.265554666519165,
577
- "rewards/margins": 0.8195264935493469,
578
- "rewards/rejected": -2.085081100463867,
579
  "step": 370
580
  },
581
  {
582
  "epoch": 0.7953950811093669,
583
- "grad_norm": 22.98180976919898,
584
  "learning_rate": 3.765821230985757e-07,
585
- "logits/chosen": -2.0363900661468506,
586
- "logits/rejected": -2.024770736694336,
587
- "logps/chosen": -385.59698486328125,
588
- "logps/rejected": -494.7015075683594,
589
- "loss": 0.5061,
590
- "rewards/accuracies": 0.737500011920929,
591
- "rewards/chosen": -1.2817010879516602,
592
- "rewards/margins": 0.9372233152389526,
593
- "rewards/rejected": -2.2189245223999023,
594
  "step": 380
595
  },
596
  {
597
  "epoch": 0.8163265306122449,
598
- "grad_norm": 25.12613071409362,
599
  "learning_rate": 3.6860532770864005e-07,
600
- "logits/chosen": -2.099245548248291,
601
- "logits/rejected": -2.04612135887146,
602
- "logps/chosen": -457.05682373046875,
603
- "logps/rejected": -488.36944580078125,
604
- "loss": 0.5015,
605
  "rewards/accuracies": 0.731249988079071,
606
- "rewards/chosen": -1.019736647605896,
607
- "rewards/margins": 0.7141124606132507,
608
- "rewards/rejected": -1.7338489294052124,
609
  "step": 390
610
  },
611
  {
612
  "epoch": 0.837257980115123,
613
- "grad_norm": 23.959018744941538,
614
  "learning_rate": 3.604695382782159e-07,
615
- "logits/chosen": -1.991755485534668,
616
- "logits/rejected": -1.9470754861831665,
617
- "logps/chosen": -408.3104248046875,
618
- "logps/rejected": -473.34637451171875,
619
- "loss": 0.5099,
620
- "rewards/accuracies": 0.78125,
621
- "rewards/chosen": -1.4101927280426025,
622
- "rewards/margins": 0.8306114077568054,
623
- "rewards/rejected": -2.2408041954040527,
624
  "step": 400
625
  },
626
  {
627
  "epoch": 0.858189429618001,
628
- "grad_norm": 21.1631977830508,
629
  "learning_rate": 3.5218566107988867e-07,
630
- "logits/chosen": -2.091287136077881,
631
- "logits/rejected": -2.0397820472717285,
632
- "logps/chosen": -457.28131103515625,
633
- "logps/rejected": -507.02667236328125,
634
- "loss": 0.5122,
635
- "rewards/accuracies": 0.7437499761581421,
636
- "rewards/chosen": -1.4175649881362915,
637
- "rewards/margins": 0.8866469264030457,
638
- "rewards/rejected": -2.3042116165161133,
639
  "step": 410
640
  },
641
  {
642
  "epoch": 0.8791208791208791,
643
- "grad_norm": 27.242164262823913,
644
  "learning_rate": 3.4376480090239047e-07,
645
- "logits/chosen": -2.0190649032592773,
646
- "logits/rejected": -2.000903844833374,
647
- "logps/chosen": -379.5609436035156,
648
- "logps/rejected": -469.435302734375,
649
- "loss": 0.5039,
650
- "rewards/accuracies": 0.793749988079071,
651
- "rewards/chosen": -1.0910499095916748,
652
- "rewards/margins": 0.9324447512626648,
653
- "rewards/rejected": -2.0234947204589844,
654
  "step": 420
655
  },
656
  {
657
  "epoch": 0.9000523286237572,
658
- "grad_norm": 30.81437560389435,
659
  "learning_rate": 3.3521824616429284e-07,
660
- "logits/chosen": -2.035524606704712,
661
- "logits/rejected": -1.9534003734588623,
662
- "logps/chosen": -431.333740234375,
663
- "logps/rejected": -497.05035400390625,
664
- "loss": 0.4921,
665
  "rewards/accuracies": 0.7562500238418579,
666
- "rewards/chosen": -1.2844098806381226,
667
- "rewards/margins": 0.9379785656929016,
668
- "rewards/rejected": -2.222388505935669,
669
  "step": 430
670
  },
671
  {
672
  "epoch": 0.9209837781266352,
673
- "grad_norm": 27.873966573342,
674
  "learning_rate": 3.265574537815398e-07,
675
- "logits/chosen": -2.1473166942596436,
676
- "logits/rejected": -2.0772416591644287,
677
- "logps/chosen": -405.3023986816406,
678
- "logps/rejected": -452.0018615722656,
679
- "loss": 0.4995,
680
- "rewards/accuracies": 0.7124999761581421,
681
- "rewards/chosen": -1.3173587322235107,
682
- "rewards/margins": 0.8507854342460632,
683
- "rewards/rejected": -2.1681439876556396,
684
  "step": 440
685
  },
686
  {
687
  "epoch": 0.9419152276295133,
688
- "grad_norm": 27.057353974975,
689
  "learning_rate": 3.1779403380910425e-07,
690
- "logits/chosen": -1.9614715576171875,
691
- "logits/rejected": -1.947115182876587,
692
- "logps/chosen": -397.4813232421875,
693
- "logps/rejected": -475.9383850097656,
694
- "loss": 0.5059,
695
- "rewards/accuracies": 0.7875000238418579,
696
- "rewards/chosen": -1.1171411275863647,
697
- "rewards/margins": 0.9366900324821472,
698
- "rewards/rejected": -2.053831100463867,
699
  "step": 450
700
  },
701
  {
702
  "epoch": 0.9628466771323915,
703
- "grad_norm": 23.906392544335443,
704
  "learning_rate": 3.0893973387735683e-07,
705
- "logits/chosen": -2.1360177993774414,
706
- "logits/rejected": -2.067448139190674,
707
- "logps/chosen": -395.38140869140625,
708
- "logps/rejected": -425.06060791015625,
709
- "loss": 0.5181,
710
- "rewards/accuracies": 0.762499988079071,
711
- "rewards/chosen": -1.0835450887680054,
712
- "rewards/margins": 0.7977073192596436,
713
- "rewards/rejected": -1.8812522888183594,
714
  "step": 460
715
  },
716
  {
717
  "epoch": 0.9837781266352695,
718
- "grad_norm": 23.415439117206944,
719
  "learning_rate": 3.000064234440111e-07,
720
- "logits/chosen": -2.1340832710266113,
721
- "logits/rejected": -2.099747657775879,
722
- "logps/chosen": -433.472900390625,
723
- "logps/rejected": -478.5433654785156,
724
- "loss": 0.4926,
725
  "rewards/accuracies": 0.706250011920929,
726
- "rewards/chosen": -1.1599546670913696,
727
- "rewards/margins": 0.8136328458786011,
728
- "rewards/rejected": -1.9735876321792603,
729
  "step": 470
730
  },
731
  {
732
  "epoch": 0.9984301412872841,
733
- "eval_logits/chosen": -2.1760072708129883,
734
- "eval_logits/rejected": -2.14554762840271,
735
- "eval_logps/chosen": -412.9510192871094,
736
- "eval_logps/rejected": -506.2944030761719,
737
- "eval_loss": 0.5081629753112793,
738
- "eval_rewards/accuracies": 0.7578125,
739
- "eval_rewards/chosen": -1.3560869693756104,
740
- "eval_rewards/margins": 0.9750058054924011,
741
- "eval_rewards/rejected": -2.331092596054077,
742
- "eval_runtime": 169.9844,
743
- "eval_samples_per_second": 11.766,
744
- "eval_steps_per_second": 0.188,
745
  "step": 477
746
  },
747
  {
748
  "epoch": 1.0047095761381475,
749
- "grad_norm": 23.808566042067955,
750
  "learning_rate": 2.910060778827554e-07,
751
- "logits/chosen": -2.1567416191101074,
752
- "logits/rejected": -2.1457903385162354,
753
- "logps/chosen": -434.89910888671875,
754
- "logps/rejected": -504.3150329589844,
755
- "loss": 0.4859,
756
- "rewards/accuracies": 0.7749999761581421,
757
- "rewards/chosen": -1.2960957288742065,
758
- "rewards/margins": 0.8546605110168457,
759
- "rewards/rejected": -2.1507561206817627,
760
  "step": 480
761
  },
762
  {
763
  "epoch": 1.0256410256410255,
764
- "grad_norm": 21.238305952154406,
765
  "learning_rate": 2.8195076242990116e-07,
766
- "logits/chosen": -2.1224186420440674,
767
- "logits/rejected": -2.067232131958008,
768
- "logps/chosen": -452.2091369628906,
769
- "logps/rejected": -532.13623046875,
770
- "loss": 0.3595,
771
- "rewards/accuracies": 0.8187500238418579,
772
- "rewards/chosen": -1.5007661581039429,
773
- "rewards/margins": 1.2642806768417358,
774
- "rewards/rejected": -2.7650468349456787,
775
  "step": 490
776
  },
777
  {
778
  "epoch": 1.0465724751439036,
779
- "grad_norm": 29.689018536225188,
780
  "learning_rate": 2.7285261601056697e-07,
781
- "logits/chosen": -2.0156354904174805,
782
- "logits/rejected": -1.971776008605957,
783
- "logps/chosen": -468.71783447265625,
784
- "logps/rejected": -602.7720947265625,
785
- "loss": 0.339,
786
  "rewards/accuracies": 0.862500011920929,
787
- "rewards/chosen": -1.8369992971420288,
788
- "rewards/margins": 1.511438012123108,
789
- "rewards/rejected": -3.348437547683716,
790
  "step": 500
791
  },
792
  {
793
  "epoch": 1.0675039246467817,
794
- "grad_norm": 29.243062750459888,
795
  "learning_rate": 2.6372383496608186e-07,
796
- "logits/chosen": -1.928108811378479,
797
- "logits/rejected": -1.8696941137313843,
798
- "logps/chosen": -471.1832580566406,
799
- "logps/rejected": -599.2332763671875,
800
- "loss": 0.3513,
801
- "rewards/accuracies": 0.8687499761581421,
802
- "rewards/chosen": -1.8683710098266602,
803
- "rewards/margins": 1.7321943044662476,
804
- "rewards/rejected": -3.6005654335021973,
805
  "step": 510
806
  },
807
  {
808
  "epoch": 1.08843537414966,
809
- "grad_norm": 25.479631786694963,
810
  "learning_rate": 2.5457665670441937e-07,
811
- "logits/chosen": -1.9516363143920898,
812
- "logits/rejected": -1.8393253087997437,
813
- "logps/chosen": -469.7832946777344,
814
- "logps/rejected": -591.2280883789062,
815
- "loss": 0.3432,
816
- "rewards/accuracies": 0.875,
817
- "rewards/chosen": -1.8596889972686768,
818
- "rewards/margins": 1.560598373413086,
819
- "rewards/rejected": -3.4202873706817627,
820
  "step": 520
821
  },
822
  {
823
  "epoch": 1.109366823652538,
824
- "grad_norm": 41.52483663106515,
825
  "learning_rate": 2.454233432955807e-07,
826
- "logits/chosen": -1.983980417251587,
827
- "logits/rejected": -1.8693246841430664,
828
- "logps/chosen": -452.64208984375,
829
- "logps/rejected": -564.1638793945312,
830
- "loss": 0.3472,
831
- "rewards/accuracies": 0.8374999761581421,
832
- "rewards/chosen": -1.847670316696167,
833
- "rewards/margins": 1.559227705001831,
834
- "rewards/rejected": -3.406898021697998,
835
  "step": 530
836
  },
837
  {
838
  "epoch": 1.130298273155416,
839
- "grad_norm": 26.257090591710913,
840
  "learning_rate": 2.3627616503391812e-07,
841
- "logits/chosen": -1.9792553186416626,
842
- "logits/rejected": -1.9432990550994873,
843
- "logps/chosen": -432.3143615722656,
844
- "logps/rejected": -584.3980102539062,
845
- "loss": 0.3457,
846
- "rewards/accuracies": 0.8812500238418579,
847
- "rewards/chosen": -1.673628568649292,
848
- "rewards/margins": 1.6501662731170654,
849
- "rewards/rejected": -3.3237948417663574,
850
  "step": 540
851
  },
852
  {
853
  "epoch": 1.1512297226582942,
854
- "grad_norm": 25.747556328035525,
855
  "learning_rate": 2.2714738398943308e-07,
856
- "logits/chosen": -1.9460952281951904,
857
- "logits/rejected": -1.875335693359375,
858
- "logps/chosen": -468.60198974609375,
859
- "logps/rejected": -596.0556640625,
860
- "loss": 0.3378,
861
- "rewards/accuracies": 0.856249988079071,
862
- "rewards/chosen": -1.7640159130096436,
863
- "rewards/margins": 1.6453392505645752,
864
- "rewards/rejected": -3.409355640411377,
865
  "step": 550
866
  },
867
  {
868
  "epoch": 1.1721611721611722,
869
- "grad_norm": 41.43400202902694,
870
  "learning_rate": 2.1804923757009882e-07,
871
- "logits/chosen": -1.9045295715332031,
872
- "logits/rejected": -1.8370736837387085,
873
- "logps/chosen": -432.0585021972656,
874
- "logps/rejected": -571.2145385742188,
875
- "loss": 0.3533,
876
  "rewards/accuracies": 0.925000011920929,
877
- "rewards/chosen": -1.6177046298980713,
878
- "rewards/margins": 1.5667366981506348,
879
- "rewards/rejected": -3.184441328048706,
880
  "step": 560
881
  },
882
  {
883
  "epoch": 1.1930926216640503,
884
- "grad_norm": 34.75832087075438,
885
  "learning_rate": 2.089939221172446e-07,
886
- "logits/chosen": -1.939172387123108,
887
- "logits/rejected": -1.8915830850601196,
888
- "logps/chosen": -472.9189453125,
889
- "logps/rejected": -638.5263061523438,
890
- "loss": 0.3399,
891
- "rewards/accuracies": 0.918749988079071,
892
- "rewards/chosen": -1.6979265213012695,
893
- "rewards/margins": 1.673963189125061,
894
- "rewards/rejected": -3.371889591217041,
895
  "step": 570
896
  },
897
  {
898
  "epoch": 1.2140240711669283,
899
- "grad_norm": 30.420080915117918,
900
  "learning_rate": 1.9999357655598891e-07,
901
- "logits/chosen": -1.8901792764663696,
902
- "logits/rejected": -1.8162235021591187,
903
- "logps/chosen": -475.49859619140625,
904
- "logps/rejected": -622.9483032226562,
905
- "loss": 0.3291,
906
- "rewards/accuracies": 0.875,
907
- "rewards/chosen": -1.903842568397522,
908
- "rewards/margins": 1.5753123760223389,
909
- "rewards/rejected": -3.4791553020477295,
910
  "step": 580
911
  },
912
  {
913
  "epoch": 1.2349555206698064,
914
- "grad_norm": 22.335542729275428,
915
  "learning_rate": 1.9106026612264315e-07,
916
- "logits/chosen": -1.867323637008667,
917
- "logits/rejected": -1.8158063888549805,
918
- "logps/chosen": -455.6680603027344,
919
- "logps/rejected": -624.3533935546875,
920
- "loss": 0.3306,
921
- "rewards/accuracies": 0.887499988079071,
922
- "rewards/chosen": -1.6538959741592407,
923
- "rewards/margins": 1.7916101217269897,
924
- "rewards/rejected": -3.4455063343048096,
925
  "step": 590
926
  },
927
  {
928
  "epoch": 1.2558869701726845,
929
- "grad_norm": 36.90747690001929,
930
  "learning_rate": 1.8220596619089573e-07,
931
- "logits/chosen": -1.8775126934051514,
932
- "logits/rejected": -1.8093585968017578,
933
- "logps/chosen": -513.2244873046875,
934
- "logps/rejected": -664.3628540039062,
935
- "loss": 0.3203,
936
  "rewards/accuracies": 0.8374999761581421,
937
- "rewards/chosen": -1.8860762119293213,
938
- "rewards/margins": 1.6223599910736084,
939
- "rewards/rejected": -3.5084362030029297,
940
  "step": 600
941
  },
942
  {
943
  "epoch": 1.2768184196755625,
944
- "grad_norm": 33.78718364461237,
945
  "learning_rate": 1.7344254621846017e-07,
946
- "logits/chosen": -1.8397912979125977,
947
- "logits/rejected": -1.718785047531128,
948
- "logps/chosen": -515.6451416015625,
949
- "logps/rejected": -648.025390625,
950
- "loss": 0.3529,
951
- "rewards/accuracies": 0.8812500238418579,
952
- "rewards/chosen": -2.0395894050598145,
953
- "rewards/margins": 1.8018648624420166,
954
- "rewards/rejected": -3.841454267501831,
955
  "step": 610
956
  },
957
  {
958
  "epoch": 1.2977498691784406,
959
- "grad_norm": 31.52253218044683,
960
  "learning_rate": 1.647817538357072e-07,
961
- "logits/chosen": -1.8304967880249023,
962
- "logits/rejected": -1.7391868829727173,
963
- "logps/chosen": -446.7728576660156,
964
- "logps/rejected": -589.5924682617188,
965
- "loss": 0.3282,
966
  "rewards/accuracies": 0.824999988079071,
967
- "rewards/chosen": -1.8400465250015259,
968
- "rewards/margins": 1.6443830728530884,
969
- "rewards/rejected": -3.4844298362731934,
970
  "step": 620
971
  },
972
  {
973
  "epoch": 1.3186813186813187,
974
- "grad_norm": 27.590908392591455,
975
  "learning_rate": 1.562351990976095e-07,
976
- "logits/chosen": -1.8083171844482422,
977
- "logits/rejected": -1.7326056957244873,
978
- "logps/chosen": -467.4588928222656,
979
- "logps/rejected": -620.0399169921875,
980
- "loss": 0.3287,
981
- "rewards/accuracies": 0.893750011920929,
982
- "rewards/chosen": -1.8651363849639893,
983
- "rewards/margins": 1.7772023677825928,
984
- "rewards/rejected": -3.642338514328003,
985
  "step": 630
986
  },
987
  {
988
  "epoch": 1.3396127681841967,
989
- "grad_norm": 32.562595361242664,
990
  "learning_rate": 1.478143389201113e-07,
991
- "logits/chosen": -1.8179775476455688,
992
- "logits/rejected": -1.7229331731796265,
993
- "logps/chosen": -464.788818359375,
994
- "logps/rejected": -596.3846435546875,
995
- "loss": 0.3404,
996
- "rewards/accuracies": 0.84375,
997
- "rewards/chosen": -1.8788200616836548,
998
- "rewards/margins": 1.667773962020874,
999
- "rewards/rejected": -3.5465941429138184,
1000
  "step": 640
1001
  },
1002
  {
1003
  "epoch": 1.3605442176870748,
1004
- "grad_norm": 25.048068766086473,
1005
  "learning_rate": 1.3953046172178413e-07,
1006
- "logits/chosen": -1.831610083580017,
1007
- "logits/rejected": -1.7702150344848633,
1008
- "logps/chosen": -464.4568786621094,
1009
- "logps/rejected": -618.4747924804688,
1010
- "loss": 0.3413,
1011
- "rewards/accuracies": 0.90625,
1012
- "rewards/chosen": -1.7352968454360962,
1013
- "rewards/margins": 1.7301437854766846,
1014
- "rewards/rejected": -3.4654407501220703,
1015
  "step": 650
1016
  },
1017
  {
1018
  "epoch": 1.3814756671899528,
1019
- "grad_norm": 34.73968044504535,
1020
  "learning_rate": 1.3139467229135998e-07,
1021
- "logits/chosen": -1.850510835647583,
1022
- "logits/rejected": -1.768938660621643,
1023
- "logps/chosen": -475.8479919433594,
1024
- "logps/rejected": -614.0346069335938,
1025
- "loss": 0.3299,
1026
  "rewards/accuracies": 0.8687499761581421,
1027
- "rewards/chosen": -1.8521826267242432,
1028
- "rewards/margins": 1.6953092813491821,
1029
- "rewards/rejected": -3.5474917888641357,
1030
  "step": 660
1031
  },
1032
  {
1033
  "epoch": 1.402407116692831,
1034
- "grad_norm": 32.80513573876802,
1035
  "learning_rate": 1.2341787690142435e-07,
1036
- "logits/chosen": -1.828988790512085,
1037
- "logits/rejected": -1.7655560970306396,
1038
- "logps/chosen": -496.58599853515625,
1039
- "logps/rejected": -668.3373413085938,
1040
- "loss": 0.3336,
1041
  "rewards/accuracies": 0.887499988079071,
1042
- "rewards/chosen": -2.0432398319244385,
1043
- "rewards/margins": 1.6564356088638306,
1044
- "rewards/rejected": -3.6996757984161377,
1045
  "step": 670
1046
  },
1047
  {
1048
  "epoch": 1.423338566195709,
1049
- "grad_norm": 23.481598085104388,
1050
  "learning_rate": 1.1561076868822755e-07,
1051
- "logits/chosen": -1.82489812374115,
1052
- "logits/rejected": -1.736617088317871,
1053
- "logps/chosen": -481.0087890625,
1054
- "logps/rejected": -625.2978515625,
1055
- "loss": 0.3322,
1056
- "rewards/accuracies": 0.856249988079071,
1057
- "rewards/chosen": -1.9523290395736694,
1058
- "rewards/margins": 1.745566725730896,
1059
- "rewards/rejected": -3.6978955268859863,
1060
  "step": 680
1061
  },
1062
  {
1063
  "epoch": 1.4442700156985873,
1064
- "grad_norm": 25.58609752457987,
1065
  "learning_rate": 1.0798381331721107e-07,
1066
- "logits/chosen": -1.7810049057006836,
1067
- "logits/rejected": -1.7480462789535522,
1068
- "logps/chosen": -467.09161376953125,
1069
- "logps/rejected": -561.844482421875,
1070
- "loss": 0.3345,
1071
- "rewards/accuracies": 0.8187500238418579,
1072
- "rewards/chosen": -1.8833472728729248,
1073
- "rewards/margins": 1.3580461740493774,
1074
- "rewards/rejected": -3.241393566131592,
1075
  "step": 690
1076
  },
1077
  {
1078
  "epoch": 1.4652014652014653,
1079
- "grad_norm": 24.893301883943476,
1080
  "learning_rate": 1.0054723495346482e-07,
1081
- "logits/chosen": -1.8719736337661743,
1082
- "logits/rejected": -1.7901394367218018,
1083
- "logps/chosen": -495.5224609375,
1084
- "logps/rejected": -657.6851806640625,
1085
- "loss": 0.325,
1086
- "rewards/accuracies": 0.856249988079071,
1087
- "rewards/chosen": -1.8440284729003906,
1088
- "rewards/margins": 1.9203050136566162,
1089
- "rewards/rejected": -3.7643332481384277,
1090
  "step": 700
1091
  },
1092
  {
1093
  "epoch": 1.4861329147043434,
1094
- "grad_norm": 31.496542283071072,
1095
  "learning_rate": 9.331100255592436e-08,
1096
- "logits/chosen": -1.8443610668182373,
1097
- "logits/rejected": -1.8108571767807007,
1098
- "logps/chosen": -477.08660888671875,
1099
- "logps/rejected": -641.9728393554688,
1100
- "loss": 0.3221,
1101
- "rewards/accuracies": 0.875,
1102
- "rewards/chosen": -1.9340569972991943,
1103
- "rewards/margins": 1.6980880498886108,
1104
- "rewards/rejected": -3.6321449279785156,
1105
  "step": 710
1106
  },
1107
  {
1108
  "epoch": 1.5070643642072215,
1109
- "grad_norm": 32.42007210036511,
1110
  "learning_rate": 8.628481651367875e-08,
1111
- "logits/chosen": -1.8817088603973389,
1112
- "logits/rejected": -1.7917859554290771,
1113
- "logps/chosen": -490.68963623046875,
1114
- "logps/rejected": -666.5413818359375,
1115
- "loss": 0.3147,
1116
- "rewards/accuracies": 0.887499988079071,
1117
- "rewards/chosen": -2.0003697872161865,
1118
- "rewards/margins": 1.905077338218689,
1119
- "rewards/rejected": -3.905447483062744,
1120
  "step": 720
1121
  },
1122
  {
1123
  "epoch": 1.5279958137100995,
1124
- "grad_norm": 31.861142046155326,
1125
  "learning_rate": 7.947809564230445e-08,
1126
- "logits/chosen": -1.943153738975525,
1127
- "logits/rejected": -1.8634049892425537,
1128
- "logps/chosen": -467.35162353515625,
1129
- "logps/rejected": -635.6701049804688,
1130
- "loss": 0.311,
1131
- "rewards/accuracies": 0.862500011920929,
1132
- "rewards/chosen": -1.7533117532730103,
1133
- "rewards/margins": 1.6374591588974,
1134
- "rewards/rejected": -3.3907711505889893,
1135
  "step": 730
1136
  },
1137
  {
1138
  "epoch": 1.5489272632129776,
1139
- "grad_norm": 31.38323843580912,
1140
  "learning_rate": 7.289996455765748e-08,
1141
- "logits/chosen": -1.9120324850082397,
1142
- "logits/rejected": -1.8584543466567993,
1143
- "logps/chosen": -483.58868408203125,
1144
- "logps/rejected": -613.0440673828125,
1145
- "loss": 0.3397,
1146
- "rewards/accuracies": 0.90625,
1147
- "rewards/chosen": -1.846814513206482,
1148
- "rewards/margins": 1.7059099674224854,
1149
- "rewards/rejected": -3.5527243614196777,
1150
  "step": 740
1151
  },
1152
  {
1153
  "epoch": 1.5698587127158556,
1154
- "grad_norm": 23.167192529713482,
1155
  "learning_rate": 6.655924144404906e-08,
1156
- "logits/chosen": -1.7680591344833374,
1157
- "logits/rejected": -1.7400696277618408,
1158
- "logps/chosen": -473.86248779296875,
1159
- "logps/rejected": -644.2675170898438,
1160
- "loss": 0.313,
1161
- "rewards/accuracies": 0.9125000238418579,
1162
- "rewards/chosen": -1.8474416732788086,
1163
- "rewards/margins": 1.7949087619781494,
1164
- "rewards/rejected": -3.642350673675537,
1165
  "step": 750
1166
  },
1167
  {
1168
  "epoch": 1.5907901622187337,
1169
- "grad_norm": 33.669973383827035,
1170
  "learning_rate": 6.046442623320145e-08,
1171
- "logits/chosen": -1.8686587810516357,
1172
- "logits/rejected": -1.7762336730957031,
1173
- "logps/chosen": -554.06591796875,
1174
- "logps/rejected": -700.1908569335938,
1175
- "loss": 0.3261,
1176
- "rewards/accuracies": 0.8687499761581421,
1177
- "rewards/chosen": -2.0161635875701904,
1178
- "rewards/margins": 1.855608582496643,
1179
- "rewards/rejected": -3.871772050857544,
1180
  "step": 760
1181
  },
1182
  {
1183
  "epoch": 1.6117216117216118,
1184
- "grad_norm": 30.7976860953366,
1185
  "learning_rate": 5.4623689209832484e-08,
1186
- "logits/chosen": -1.9193071126937866,
1187
- "logits/rejected": -1.7936471700668335,
1188
- "logps/chosen": -525.6559448242188,
1189
- "logps/rejected": -623.5465087890625,
1190
- "loss": 0.3316,
1191
  "rewards/accuracies": 0.8500000238418579,
1192
- "rewards/chosen": -2.0721254348754883,
1193
- "rewards/margins": 1.649618148803711,
1194
- "rewards/rejected": -3.72174334526062,
1195
  "step": 770
1196
  },
1197
  {
1198
  "epoch": 1.6326530612244898,
1199
- "grad_norm": 25.816934052591822,
1200
  "learning_rate": 4.904486005914027e-08,
1201
- "logits/chosen": -1.8367938995361328,
1202
- "logits/rejected": -1.7610937356948853,
1203
- "logps/chosen": -509.4143981933594,
1204
- "logps/rejected": -686.8128662109375,
1205
- "loss": 0.3352,
1206
- "rewards/accuracies": 0.887499988079071,
1207
- "rewards/chosen": -1.8448041677474976,
1208
- "rewards/margins": 1.9604244232177734,
1209
- "rewards/rejected": -3.8052284717559814,
1210
  "step": 780
1211
  },
1212
  {
1213
  "epoch": 1.653584510727368,
1214
- "grad_norm": 22.36666573974123,
1215
  "learning_rate": 4.373541737087263e-08,
1216
- "logits/chosen": -1.8902270793914795,
1217
- "logits/rejected": -1.7814185619354248,
1218
- "logps/chosen": -464.0545959472656,
1219
- "logps/rejected": -618.5113525390625,
1220
- "loss": 0.2998,
1221
- "rewards/accuracies": 0.875,
1222
- "rewards/chosen": -1.7587671279907227,
1223
- "rewards/margins": 1.7177417278289795,
1224
- "rewards/rejected": -3.4765090942382812,
1225
  "step": 790
1226
  },
1227
  {
1228
  "epoch": 1.674515960230246,
1229
- "grad_norm": 40.0165337984474,
1230
  "learning_rate": 3.8702478614051345e-08,
1231
- "logits/chosen": -1.8753862380981445,
1232
- "logits/rejected": -1.803836464881897,
1233
- "logps/chosen": -516.0680541992188,
1234
- "logps/rejected": -638.5137329101562,
1235
- "loss": 0.3013,
1236
- "rewards/accuracies": 0.90625,
1237
- "rewards/chosen": -1.8721412420272827,
1238
- "rewards/margins": 1.763323187828064,
1239
- "rewards/rejected": -3.635464906692505,
1240
  "step": 800
1241
  },
1242
  {
1243
  "epoch": 1.695447409733124,
1244
- "grad_norm": 27.862986136186375,
1245
  "learning_rate": 3.3952790595787986e-08,
1246
- "logits/chosen": -1.8482224941253662,
1247
- "logits/rejected": -1.7838103771209717,
1248
- "logps/chosen": -477.73760986328125,
1249
- "logps/rejected": -661.8113403320312,
1250
- "loss": 0.3248,
1251
  "rewards/accuracies": 0.887499988079071,
1252
- "rewards/chosen": -1.789380431175232,
1253
- "rewards/margins": 2.0652589797973633,
1254
- "rewards/rejected": -3.8546395301818848,
1255
  "step": 810
1256
  },
1257
  {
1258
  "epoch": 1.716378859236002,
1259
- "grad_norm": 34.28604882984395,
1260
  "learning_rate": 2.9492720416985e-08,
1261
- "logits/chosen": -1.898856520652771,
1262
- "logits/rejected": -1.8373451232910156,
1263
- "logps/chosen": -471.712890625,
1264
- "logps/rejected": -652.3070068359375,
1265
- "loss": 0.322,
1266
- "rewards/accuracies": 0.8687499761581421,
1267
- "rewards/chosen": -1.9994008541107178,
1268
- "rewards/margins": 1.8078997135162354,
1269
- "rewards/rejected": -3.807300567626953,
1270
  "step": 820
1271
  },
1272
  {
1273
  "epoch": 1.7373103087388801,
1274
- "grad_norm": 25.18301550597887,
1275
  "learning_rate": 2.5328246937043525e-08,
1276
- "logits/chosen": -1.8851368427276611,
1277
- "logits/rejected": -1.8264014720916748,
1278
- "logps/chosen": -510.86798095703125,
1279
- "logps/rejected": -643.9739379882812,
1280
- "loss": 0.3037,
1281
- "rewards/accuracies": 0.8812500238418579,
1282
- "rewards/chosen": -1.9238808155059814,
1283
- "rewards/margins": 1.7123737335205078,
1284
- "rewards/rejected": -3.636254072189331,
1285
  "step": 830
1286
  },
1287
  {
1288
  "epoch": 1.7582417582417582,
1289
- "grad_norm": 39.715305232221894,
1290
  "learning_rate": 2.1464952759020856e-08,
1291
- "logits/chosen": -1.7454639673233032,
1292
- "logits/rejected": -1.719659447669983,
1293
- "logps/chosen": -476.7300720214844,
1294
- "logps/rejected": -662.8555908203125,
1295
- "loss": 0.3174,
1296
  "rewards/accuracies": 0.887499988079071,
1297
- "rewards/chosen": -2.1026253700256348,
1298
- "rewards/margins": 1.8342987298965454,
1299
- "rewards/rejected": -3.9369239807128906,
1300
  "step": 840
1301
  },
1302
  {
1303
  "epoch": 1.7791732077446363,
1304
- "grad_norm": 26.712746983305596,
1305
  "learning_rate": 1.7908016745981856e-08,
1306
- "logits/chosen": -1.8332322835922241,
1307
- "logits/rejected": -1.7802534103393555,
1308
- "logps/chosen": -459.216064453125,
1309
- "logps/rejected": -635.7386474609375,
1310
- "loss": 0.3118,
1311
- "rewards/accuracies": 0.856249988079071,
1312
- "rewards/chosen": -1.981248140335083,
1313
- "rewards/margins": 1.7714271545410156,
1314
- "rewards/rejected": -3.7526748180389404,
1315
  "step": 850
1316
  },
1317
  {
1318
  "epoch": 1.8001046572475143,
1319
- "grad_norm": 37.61857911792585,
1320
  "learning_rate": 1.4662207078575684e-08,
1321
- "logits/chosen": -1.8996174335479736,
1322
- "logits/rejected": -1.843186616897583,
1323
- "logps/chosen": -505.267822265625,
1324
- "logps/rejected": -616.5883178710938,
1325
- "loss": 0.3276,
1326
  "rewards/accuracies": 0.84375,
1327
- "rewards/chosen": -1.9595191478729248,
1328
- "rewards/margins": 1.6286653280258179,
1329
- "rewards/rejected": -3.588184356689453,
1330
  "step": 860
1331
  },
1332
  {
1333
  "epoch": 1.8210361067503924,
1334
- "grad_norm": 36.465175683499524,
1335
  "learning_rate": 1.1731874863145142e-08,
1336
- "logits/chosen": -1.8656623363494873,
1337
- "logits/rejected": -1.7836997509002686,
1338
- "logps/chosen": -524.4869995117188,
1339
- "logps/rejected": -655.26025390625,
1340
- "loss": 0.3114,
1341
  "rewards/accuracies": 0.8999999761581421,
1342
- "rewards/chosen": -1.984437346458435,
1343
- "rewards/margins": 1.840402603149414,
1344
- "rewards/rejected": -3.8248400688171387,
1345
  "step": 870
1346
  },
1347
  {
1348
  "epoch": 1.8419675562532705,
1349
- "grad_norm": 31.317239416373415,
1350
  "learning_rate": 9.12094829893642e-09,
1351
- "logits/chosen": -1.8914934396743774,
1352
- "logits/rejected": -1.8087772130966187,
1353
- "logps/chosen": -499.74639892578125,
1354
- "logps/rejected": -632.9876708984375,
1355
- "loss": 0.3185,
1356
  "rewards/accuracies": 0.8812500238418579,
1357
- "rewards/chosen": -2.0976243019104004,
1358
- "rewards/margins": 1.6914640665054321,
1359
- "rewards/rejected": -3.789088010787964,
1360
  "step": 880
1361
  },
1362
  {
1363
  "epoch": 1.8628990057561485,
1364
- "grad_norm": 25.15346806696213,
1365
  "learning_rate": 6.832927412229017e-09,
1366
- "logits/chosen": -1.954000473022461,
1367
- "logits/rejected": -1.8538668155670166,
1368
- "logps/chosen": -519.0631713867188,
1369
- "logps/rejected": -677.8694458007812,
1370
- "loss": 0.2969,
1371
  "rewards/accuracies": 0.9125000238418579,
1372
- "rewards/chosen": -1.8931020498275757,
1373
- "rewards/margins": 2.0227460861206055,
1374
- "rewards/rejected": -3.9158482551574707,
1375
  "step": 890
1376
  },
1377
  {
1378
  "epoch": 1.8838304552590266,
1379
- "grad_norm": 47.22034878743445,
1380
  "learning_rate": 4.8708793644441086e-09,
1381
- "logits/chosen": -1.7922518253326416,
1382
- "logits/rejected": -1.7178666591644287,
1383
- "logps/chosen": -470.116455078125,
1384
- "logps/rejected": -621.2272338867188,
1385
- "loss": 0.3031,
1386
- "rewards/accuracies": 0.90625,
1387
- "rewards/chosen": -1.822932481765747,
1388
- "rewards/margins": 1.9056673049926758,
1389
- "rewards/rejected": -3.7285995483398438,
1390
  "step": 900
1391
  },
1392
  {
1393
  "epoch": 1.9047619047619047,
1394
- "grad_norm": 33.30456524388558,
1395
  "learning_rate": 3.2374343405217884e-09,
1396
- "logits/chosen": -1.8453195095062256,
1397
- "logits/rejected": -1.783247947692871,
1398
- "logps/chosen": -464.78131103515625,
1399
- "logps/rejected": -612.1450805664062,
1400
- "loss": 0.3202,
1401
  "rewards/accuracies": 0.887499988079071,
1402
- "rewards/chosen": -1.9610507488250732,
1403
- "rewards/margins": 1.8240289688110352,
1404
- "rewards/rejected": -3.7850799560546875,
1405
  "step": 910
1406
  },
1407
  {
1408
  "epoch": 1.9256933542647827,
1409
- "grad_norm": 34.393381899703904,
1410
  "learning_rate": 1.9347820230782295e-09,
1411
- "logits/chosen": -1.8662745952606201,
1412
- "logits/rejected": -1.8287159204483032,
1413
- "logps/chosen": -490.3973083496094,
1414
- "logps/rejected": -676.7599487304688,
1415
- "loss": 0.3221,
1416
- "rewards/accuracies": 0.8999999761581421,
1417
- "rewards/chosen": -1.9537601470947266,
1418
- "rewards/margins": 1.8785762786865234,
1419
- "rewards/rejected": -3.832336902618408,
1420
  "step": 920
1421
  },
1422
  {
1423
  "epoch": 1.9466248037676608,
1424
- "grad_norm": 48.439087658815986,
1425
  "learning_rate": 9.64668657069706e-10,
1426
- "logits/chosen": -1.8716627359390259,
1427
- "logits/rejected": -1.8195035457611084,
1428
- "logps/chosen": -469.0355529785156,
1429
- "logps/rejected": -609.7899780273438,
1430
- "loss": 0.3118,
1431
- "rewards/accuracies": 0.862500011920929,
1432
- "rewards/chosen": -1.8933093547821045,
1433
- "rewards/margins": 1.6561877727508545,
1434
- "rewards/rejected": -3.549497127532959,
1435
  "step": 930
1436
  },
1437
  {
1438
  "epoch": 1.9675562532705388,
1439
- "grad_norm": 33.514570074363135,
1440
  "learning_rate": 3.2839470889836627e-10,
1441
- "logits/chosen": -1.8711134195327759,
1442
- "logits/rejected": -1.7610361576080322,
1443
- "logps/chosen": -504.5443420410156,
1444
- "logps/rejected": -611.5713500976562,
1445
- "loss": 0.3216,
1446
- "rewards/accuracies": 0.862500011920929,
1447
- "rewards/chosen": -1.9376933574676514,
1448
- "rewards/margins": 1.6980724334716797,
1449
- "rewards/rejected": -3.635765790939331,
1450
  "step": 940
1451
  },
1452
  {
1453
  "epoch": 1.988487702773417,
1454
- "grad_norm": 33.13760490600229,
1455
  "learning_rate": 2.6813123097352287e-11,
1456
- "logits/chosen": -1.8425813913345337,
1457
- "logits/rejected": -1.7561390399932861,
1458
- "logps/chosen": -512.4994506835938,
1459
- "logps/rejected": -666.7471313476562,
1460
- "loss": 0.3138,
1461
- "rewards/accuracies": 0.9125000238418579,
1462
- "rewards/chosen": -2.029484748840332,
1463
- "rewards/margins": 1.9451849460601807,
1464
- "rewards/rejected": -3.9746696949005127,
1465
  "step": 950
1466
  },
1467
  {
1468
  "epoch": 1.9968602825745683,
1469
- "eval_logits/chosen": -1.843076467514038,
1470
- "eval_logits/rejected": -1.7970439195632935,
1471
- "eval_logps/chosen": -488.8499755859375,
1472
- "eval_logps/rejected": -619.1251220703125,
1473
- "eval_loss": 0.5119243860244751,
1474
- "eval_rewards/accuracies": 0.74609375,
1475
- "eval_rewards/chosen": -2.1150760650634766,
1476
- "eval_rewards/margins": 1.3443243503570557,
1477
- "eval_rewards/rejected": -3.4594006538391113,
1478
- "eval_runtime": 169.1222,
1479
- "eval_samples_per_second": 11.826,
1480
- "eval_steps_per_second": 0.189,
1481
  "step": 954
1482
  },
1483
  {
@@ -1485,9 +1485,9 @@
1485
  "step": 954,
1486
  "total_flos": 0.0,
1487
  "train_loss": 0.0,
1488
- "train_runtime": 3.478,
1489
- "train_samples_per_second": 17577.112,
1490
- "train_steps_per_second": 137.146
1491
  }
1492
  ],
1493
  "logging_steps": 10,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0020931449502878076,
13
+ "grad_norm": 14.88607346284462,
14
  "learning_rate": 5.208333333333333e-09,
15
+ "logits/chosen": -2.925722122192383,
16
+ "logits/rejected": -2.8885936737060547,
17
+ "logps/chosen": -321.0921630859375,
18
+ "logps/rejected": -365.8306884765625,
19
  "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
 
25
  },
26
  {
27
  "epoch": 0.020931449502878074,
28
+ "grad_norm": 10.30635291782621,
29
  "learning_rate": 5.208333333333333e-08,
30
+ "logits/chosen": -2.7202770709991455,
31
+ "logits/rejected": -2.695319175720215,
32
+ "logps/chosen": -321.58056640625,
33
+ "logps/rejected": -289.4584045410156,
34
+ "loss": 0.6931,
35
+ "rewards/accuracies": 0.4513888955116272,
36
+ "rewards/chosen": 0.00041189632611349225,
37
+ "rewards/margins": -6.186102837091312e-05,
38
+ "rewards/rejected": 0.00047375739086419344,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.04186289900575615,
43
+ "grad_norm": 11.851618937043359,
44
  "learning_rate": 1.0416666666666667e-07,
45
+ "logits/chosen": -2.7979576587677,
46
+ "logits/rejected": -2.743467092514038,
47
+ "logps/chosen": -317.7387390136719,
48
+ "logps/rejected": -289.9251708984375,
49
+ "loss": 0.6931,
50
+ "rewards/accuracies": 0.53125,
51
+ "rewards/chosen": 0.0007057279581204057,
52
+ "rewards/margins": 0.00038364241481758654,
53
+ "rewards/rejected": 0.0003220855724066496,
54
  "step": 20
55
  },
56
  {
57
  "epoch": 0.06279434850863422,
58
+ "grad_norm": 7.845409123408187,
59
  "learning_rate": 1.5624999999999999e-07,
60
+ "logits/chosen": -2.7853832244873047,
61
+ "logits/rejected": -2.7237634658813477,
62
+ "logps/chosen": -300.7030029296875,
63
+ "logps/rejected": -254.2164306640625,
64
+ "loss": 0.6924,
65
  "rewards/accuracies": 0.6187499761581421,
66
+ "rewards/chosen": 0.004664666019380093,
67
+ "rewards/margins": 0.002464447868987918,
68
+ "rewards/rejected": 0.002200217917561531,
69
  "step": 30
70
  },
71
  {
72
  "epoch": 0.0837257980115123,
73
+ "grad_norm": 7.717127331852517,
74
  "learning_rate": 2.0833333333333333e-07,
75
+ "logits/chosen": -2.707097291946411,
76
+ "logits/rejected": -2.6731085777282715,
77
+ "logps/chosen": -278.5015869140625,
78
+ "logps/rejected": -266.61273193359375,
79
+ "loss": 0.6904,
80
+ "rewards/accuracies": 0.7124999761581421,
81
+ "rewards/chosen": 0.012720887549221516,
82
+ "rewards/margins": 0.005492700729519129,
83
+ "rewards/rejected": 0.007228186819702387,
84
  "step": 40
85
  },
86
  {
87
  "epoch": 0.10465724751439037,
88
+ "grad_norm": 15.224224641480427,
89
  "learning_rate": 2.604166666666667e-07,
90
+ "logits/chosen": -2.734222888946533,
91
+ "logits/rejected": -2.6672184467315674,
92
+ "logps/chosen": -284.1479187011719,
93
+ "logps/rejected": -280.54168701171875,
94
+ "loss": 0.6873,
95
+ "rewards/accuracies": 0.699999988079071,
96
+ "rewards/chosen": 0.023878615349531174,
97
+ "rewards/margins": 0.010178199037909508,
98
+ "rewards/rejected": 0.013700416311621666,
99
  "step": 50
100
  },
101
  {
102
  "epoch": 0.12558869701726844,
103
+ "grad_norm": 9.724530192766299,
104
  "learning_rate": 3.1249999999999997e-07,
105
+ "logits/chosen": -2.7641100883483887,
106
+ "logits/rejected": -2.710592746734619,
107
+ "logps/chosen": -257.2303161621094,
108
+ "logps/rejected": -249.379638671875,
109
+ "loss": 0.6824,
110
+ "rewards/accuracies": 0.737500011920929,
111
+ "rewards/chosen": 0.041937388479709625,
112
+ "rewards/margins": 0.02456718310713768,
113
+ "rewards/rejected": 0.017370199784636497,
114
  "step": 60
115
  },
116
  {
117
  "epoch": 0.14652014652014653,
118
+ "grad_norm": 7.609142075748871,
119
  "learning_rate": 3.645833333333333e-07,
120
+ "logits/chosen": -2.7005503177642822,
121
+ "logits/rejected": -2.670154094696045,
122
+ "logps/chosen": -302.48907470703125,
123
+ "logps/rejected": -288.536865234375,
124
+ "loss": 0.6728,
125
+ "rewards/accuracies": 0.6812499761581421,
126
+ "rewards/chosen": 0.037350092083215714,
127
+ "rewards/margins": 0.03656148537993431,
128
+ "rewards/rejected": 0.0007886036182753742,
129
  "step": 70
130
  },
131
  {
132
  "epoch": 0.1674515960230246,
133
+ "grad_norm": 11.507263661586103,
134
  "learning_rate": 4.1666666666666667e-07,
135
+ "logits/chosen": -2.795855760574341,
136
+ "logits/rejected": -2.709174156188965,
137
+ "logps/chosen": -344.7085876464844,
138
+ "logps/rejected": -274.75372314453125,
139
+ "loss": 0.6558,
140
+ "rewards/accuracies": 0.7437499761581421,
141
+ "rewards/chosen": 0.013917540200054646,
142
+ "rewards/margins": 0.09853295236825943,
143
+ "rewards/rejected": -0.08461540192365646,
144
  "step": 80
145
  },
146
  {
147
  "epoch": 0.18838304552590268,
148
+ "grad_norm": 12.522918295649193,
149
  "learning_rate": 4.6874999999999996e-07,
150
+ "logits/chosen": -2.7532315254211426,
151
+ "logits/rejected": -2.712498188018799,
152
+ "logps/chosen": -264.0057678222656,
153
+ "logps/rejected": -278.46209716796875,
154
+ "loss": 0.6358,
155
+ "rewards/accuracies": 0.65625,
156
+ "rewards/chosen": -0.041521135717630386,
157
+ "rewards/margins": 0.12843890488147736,
158
+ "rewards/rejected": -0.16996005177497864,
159
  "step": 90
160
  },
161
  {
162
  "epoch": 0.20931449502878074,
163
+ "grad_norm": 16.714062222107753,
164
  "learning_rate": 4.999731868769026e-07,
165
+ "logits/chosen": -2.7503976821899414,
166
+ "logits/rejected": -2.7277534008026123,
167
+ "logps/chosen": -317.9760437011719,
168
+ "logps/rejected": -314.3436584472656,
169
+ "loss": 0.6267,
170
+ "rewards/accuracies": 0.762499988079071,
171
+ "rewards/chosen": -0.06901533901691437,
172
+ "rewards/margins": 0.24417515099048615,
173
+ "rewards/rejected": -0.3131905198097229,
174
  "step": 100
175
  },
176
  {
177
  "epoch": 0.2302459445316588,
178
+ "grad_norm": 11.040271216087397,
179
  "learning_rate": 4.996716052911017e-07,
180
+ "logits/chosen": -2.790177583694458,
181
+ "logits/rejected": -2.747727394104004,
182
+ "logps/chosen": -327.3770751953125,
183
+ "logps/rejected": -331.28594970703125,
184
+ "loss": 0.6264,
185
+ "rewards/accuracies": 0.675000011920929,
186
+ "rewards/chosen": -0.28078389167785645,
187
+ "rewards/margins": 0.21150124073028564,
188
+ "rewards/rejected": -0.4922851622104645,
189
  "step": 110
190
  },
191
  {
192
  "epoch": 0.25117739403453687,
193
+ "grad_norm": 12.319376619341371,
194
  "learning_rate": 4.990353313429303e-07,
195
+ "logits/chosen": -2.7004141807556152,
196
+ "logits/rejected": -2.7081363201141357,
197
+ "logps/chosen": -292.29193115234375,
198
+ "logps/rejected": -308.19244384765625,
199
+ "loss": 0.6059,
200
+ "rewards/accuracies": 0.731249988079071,
201
+ "rewards/chosen": -0.08327662199735641,
202
+ "rewards/margins": 0.2638171315193176,
203
+ "rewards/rejected": -0.34709376096725464,
204
  "step": 120
205
  },
206
  {
207
  "epoch": 0.272108843537415,
208
+ "grad_norm": 24.07326364555429,
209
  "learning_rate": 4.980652179769217e-07,
210
+ "logits/chosen": -2.7517778873443604,
211
+ "logits/rejected": -2.735368013381958,
212
+ "logps/chosen": -326.88323974609375,
213
+ "logps/rejected": -333.8082275390625,
214
+ "loss": 0.5854,
215
+ "rewards/accuracies": 0.6625000238418579,
216
+ "rewards/chosen": -0.27743759751319885,
217
+ "rewards/margins": 0.24996426701545715,
218
+ "rewards/rejected": -0.5274018049240112,
219
  "step": 130
220
  },
221
  {
222
  "epoch": 0.29304029304029305,
223
+ "grad_norm": 23.35517515247709,
224
  "learning_rate": 4.967625656594781e-07,
225
+ "logits/chosen": -2.771867275238037,
226
+ "logits/rejected": -2.7476229667663574,
227
+ "logps/chosen": -339.9781799316406,
228
+ "logps/rejected": -328.70538330078125,
229
+ "loss": 0.5878,
230
+ "rewards/accuracies": 0.737500011920929,
231
+ "rewards/chosen": -0.13580283522605896,
232
+ "rewards/margins": 0.391187846660614,
233
+ "rewards/rejected": -0.5269905924797058,
234
  "step": 140
235
  },
236
  {
237
  "epoch": 0.3139717425431711,
238
+ "grad_norm": 20.972969810387227,
239
  "learning_rate": 4.951291206355559e-07,
240
+ "logits/chosen": -2.748628616333008,
241
+ "logits/rejected": -2.7197518348693848,
242
+ "logps/chosen": -327.03125,
243
+ "logps/rejected": -318.48638916015625,
244
+ "loss": 0.5967,
245
+ "rewards/accuracies": 0.6625000238418579,
246
+ "rewards/chosen": -0.3390689790248871,
247
+ "rewards/margins": 0.2791776657104492,
248
+ "rewards/rejected": -0.6182466745376587,
249
  "step": 150
250
  },
251
  {
252
  "epoch": 0.3349031920460492,
253
+ "grad_norm": 13.425598261254686,
254
  "learning_rate": 4.93167072587771e-07,
255
+ "logits/chosen": -2.788029193878174,
256
+ "logits/rejected": -2.746370553970337,
257
+ "logps/chosen": -324.72052001953125,
258
+ "logps/rejected": -337.3094787597656,
259
+ "loss": 0.5846,
260
+ "rewards/accuracies": 0.731249988079071,
261
+ "rewards/chosen": -0.3970801830291748,
262
+ "rewards/margins": 0.40131622552871704,
263
+ "rewards/rejected": -0.7983964681625366,
264
  "step": 160
265
  },
266
  {
267
  "epoch": 0.35583464154892724,
268
+ "grad_norm": 17.272372981195964,
269
  "learning_rate": 4.908790517010636e-07,
270
+ "logits/chosen": -2.6719164848327637,
271
+ "logits/rejected": -2.6612837314605713,
272
+ "logps/chosen": -315.0084228515625,
273
+ "logps/rejected": -320.8651428222656,
274
+ "loss": 0.5888,
275
+ "rewards/accuracies": 0.6937500238418579,
276
+ "rewards/chosen": -0.4511398375034332,
277
+ "rewards/margins": 0.3194407820701599,
278
+ "rewards/rejected": -0.7705805897712708,
279
  "step": 170
280
  },
281
  {
282
  "epoch": 0.37676609105180536,
283
+ "grad_norm": 14.426317833201416,
284
  "learning_rate": 4.882681251368548e-07,
285
+ "logits/chosen": -2.7663211822509766,
286
+ "logits/rejected": -2.7407591342926025,
287
+ "logps/chosen": -322.9417419433594,
288
+ "logps/rejected": -330.96038818359375,
289
+ "loss": 0.5596,
290
+ "rewards/accuracies": 0.731249988079071,
291
+ "rewards/chosen": -0.4329908490180969,
292
+ "rewards/margins": 0.358435720205307,
293
+ "rewards/rejected": -0.7914265394210815,
294
  "step": 180
295
  },
296
  {
297
  "epoch": 0.3976975405546834,
298
+ "grad_norm": 16.424447211666443,
299
  "learning_rate": 4.853377929214243e-07,
300
+ "logits/chosen": -2.7122840881347656,
301
+ "logits/rejected": -2.6895980834960938,
302
+ "logps/chosen": -305.7542419433594,
303
+ "logps/rejected": -333.2830810546875,
304
+ "loss": 0.5689,
305
+ "rewards/accuracies": 0.6812499761581421,
306
+ "rewards/chosen": -0.4040308892726898,
307
+ "rewards/margins": 0.32926663756370544,
308
+ "rewards/rejected": -0.7332974672317505,
309
  "step": 190
310
  },
311
  {
312
  "epoch": 0.4186289900575615,
313
+ "grad_norm": 18.075431248646595,
314
  "learning_rate": 4.820919832540181e-07,
315
+ "logits/chosen": -2.670199155807495,
316
+ "logits/rejected": -2.6688408851623535,
317
+ "logps/chosen": -299.871826171875,
318
+ "logps/rejected": -358.0787048339844,
319
+ "loss": 0.5766,
320
+ "rewards/accuracies": 0.7437499761581421,
321
+ "rewards/chosen": -0.37743473052978516,
322
+ "rewards/margins": 0.4126531183719635,
323
+ "rewards/rejected": -0.7900878190994263,
324
  "step": 200
325
  },
326
  {
327
  "epoch": 0.43956043956043955,
328
+ "grad_norm": 16.199236277149858,
329
  "learning_rate": 4.785350472409791e-07,
330
+ "logits/chosen": -2.718061685562134,
331
+ "logits/rejected": -2.6642110347747803,
332
+ "logps/chosen": -362.8133544921875,
333
+ "logps/rejected": -342.171875,
334
+ "loss": 0.5765,
335
+ "rewards/accuracies": 0.699999988079071,
336
+ "rewards/chosen": -0.42062854766845703,
337
+ "rewards/margins": 0.43933743238449097,
338
+ "rewards/rejected": -0.859965980052948,
339
  "step": 210
340
  },
341
  {
342
  "epoch": 0.4604918890633176,
343
+ "grad_norm": 17.93992307211224,
344
  "learning_rate": 4.7467175306295647e-07,
345
+ "logits/chosen": -2.6234521865844727,
346
+ "logits/rejected": -2.640157699584961,
347
+ "logps/chosen": -310.472412109375,
348
+ "logps/rejected": -345.089111328125,
349
+ "loss": 0.5611,
350
+ "rewards/accuracies": 0.7250000238418579,
351
+ "rewards/chosen": -0.47760000824928284,
352
+ "rewards/margins": 0.43446213006973267,
353
+ "rewards/rejected": -0.9120620489120483,
354
  "step": 220
355
  },
356
  {
357
  "epoch": 0.48142333856619574,
358
+ "grad_norm": 20.29472356294206,
359
  "learning_rate": 4.70507279583015e-07,
360
+ "logits/chosen": -2.7133331298828125,
361
+ "logits/rejected": -2.6721091270446777,
362
+ "logps/chosen": -318.923095703125,
363
+ "logps/rejected": -366.59710693359375,
364
+ "loss": 0.5624,
365
+ "rewards/accuracies": 0.768750011920929,
366
+ "rewards/chosen": -0.4767213761806488,
367
+ "rewards/margins": 0.6975632905960083,
368
+ "rewards/rejected": -1.1742846965789795,
369
  "step": 230
370
  },
371
  {
372
  "epoch": 0.5023547880690737,
373
+ "grad_norm": 35.1028217518017,
374
  "learning_rate": 4.6604720940421207e-07,
375
+ "logits/chosen": -2.640972137451172,
376
+ "logits/rejected": -2.5848052501678467,
377
+ "logps/chosen": -344.928955078125,
378
+ "logps/rejected": -369.1497497558594,
379
+ "loss": 0.5347,
380
  "rewards/accuracies": 0.699999988079071,
381
+ "rewards/chosen": -0.47312647104263306,
382
+ "rewards/margins": 0.495448499917984,
383
+ "rewards/rejected": -0.9685748815536499,
384
  "step": 240
385
  },
386
  {
387
  "epoch": 0.5232862375719518,
388
+ "grad_norm": 19.843272670557887,
389
  "learning_rate": 4.612975213859487e-07,
390
+ "logits/chosen": -2.5373644828796387,
391
+ "logits/rejected": -2.4579813480377197,
392
+ "logps/chosen": -347.16748046875,
393
+ "logps/rejected": -379.2513732910156,
394
+ "loss": 0.5437,
395
+ "rewards/accuracies": 0.800000011920929,
396
+ "rewards/chosen": -0.566421389579773,
397
+ "rewards/margins": 0.6545469164848328,
398
+ "rewards/rejected": -1.220968246459961,
399
  "step": 250
400
  },
401
  {
402
  "epoch": 0.54421768707483,
403
+ "grad_norm": 23.320833527504977,
404
  "learning_rate": 4.5626458262912735e-07,
405
+ "logits/chosen": -2.3933730125427246,
406
+ "logits/rejected": -2.3900551795959473,
407
+ "logps/chosen": -352.5663146972656,
408
+ "logps/rejected": -384.38397216796875,
409
+ "loss": 0.5419,
410
+ "rewards/accuracies": 0.699999988079071,
411
+ "rewards/chosen": -0.8031826019287109,
412
+ "rewards/margins": 0.5431042909622192,
413
+ "rewards/rejected": -1.3462870121002197,
414
  "step": 260
415
  },
416
  {
417
  "epoch": 0.565149136577708,
418
+ "grad_norm": 19.483357541000405,
419
  "learning_rate": 4.5095513994085974e-07,
420
+ "logits/chosen": -2.200190305709839,
421
+ "logits/rejected": -2.142879009246826,
422
+ "logps/chosen": -368.5080871582031,
423
+ "logps/rejected": -412.9193420410156,
424
+ "loss": 0.5489,
425
  "rewards/accuracies": 0.7437499761581421,
426
+ "rewards/chosen": -0.6709359884262085,
427
+ "rewards/margins": 0.7138775587081909,
428
+ "rewards/rejected": -1.3848135471343994,
429
  "step": 270
430
  },
431
  {
432
  "epoch": 0.5860805860805861,
433
+ "grad_norm": 23.630987101691453,
434
  "learning_rate": 4.453763107901675e-07,
435
+ "logits/chosen": -2.013388156890869,
436
+ "logits/rejected": -1.9315751791000366,
437
+ "logps/chosen": -335.5148620605469,
438
+ "logps/rejected": -353.8980712890625,
439
+ "loss": 0.5513,
440
+ "rewards/accuracies": 0.6625000238418579,
441
+ "rewards/chosen": -0.537543773651123,
442
+ "rewards/margins": 0.6034060716629028,
443
+ "rewards/rejected": -1.1409497261047363,
444
  "step": 280
445
  },
446
  {
447
  "epoch": 0.6070120355834642,
448
+ "grad_norm": 21.50926745793373,
449
  "learning_rate": 4.395355737667985e-07,
450
+ "logits/chosen": -2.042461395263672,
451
+ "logits/rejected": -1.9405343532562256,
452
+ "logps/chosen": -329.45513916015625,
453
+ "logps/rejected": -386.4042663574219,
454
+ "loss": 0.546,
455
+ "rewards/accuracies": 0.731249988079071,
456
+ "rewards/chosen": -0.737878680229187,
457
+ "rewards/margins": 0.6161252856254578,
458
+ "rewards/rejected": -1.3540040254592896,
459
  "step": 290
460
  },
461
  {
462
  "epoch": 0.6279434850863422,
463
+ "grad_norm": 21.13385612640552,
464
  "learning_rate": 4.3344075855595097e-07,
465
+ "logits/chosen": -2.0211853981018066,
466
+ "logits/rejected": -1.9133087396621704,
467
+ "logps/chosen": -419.280029296875,
468
+ "logps/rejected": -442.6297912597656,
469
+ "loss": 0.5279,
470
+ "rewards/accuracies": 0.71875,
471
+ "rewards/chosen": -0.8050423860549927,
472
+ "rewards/margins": 0.6596104502677917,
473
+ "rewards/rejected": -1.4646527767181396,
474
  "step": 300
475
  },
476
  {
477
  "epoch": 0.6488749345892203,
478
+ "grad_norm": 19.779873187639623,
479
  "learning_rate": 4.271000354423425e-07,
480
+ "logits/chosen": -2.02839994430542,
481
+ "logits/rejected": -1.9323558807373047,
482
+ "logps/chosen": -383.2649841308594,
483
+ "logps/rejected": -405.40740966796875,
484
+ "loss": 0.5219,
485
+ "rewards/accuracies": 0.7562500238418579,
486
+ "rewards/chosen": -0.7099379897117615,
487
+ "rewards/margins": 0.7654498815536499,
488
+ "rewards/rejected": -1.4753879308700562,
489
  "step": 310
490
  },
491
  {
492
  "epoch": 0.6698063840920984,
493
+ "grad_norm": 21.08095676735262,
494
  "learning_rate": 4.2052190435769554e-07,
495
+ "logits/chosen": -1.9793522357940674,
496
+ "logits/rejected": -1.860769271850586,
497
+ "logps/chosen": -344.1609191894531,
498
+ "logps/rejected": -391.5399475097656,
499
+ "loss": 0.5066,
500
  "rewards/accuracies": 0.831250011920929,
501
+ "rewards/chosen": -0.7778881192207336,
502
+ "rewards/margins": 0.8001760244369507,
503
+ "rewards/rejected": -1.5780640840530396,
504
  "step": 320
505
  },
506
  {
507
  "epoch": 0.6907378335949764,
508
+ "grad_norm": 27.57188250299686,
509
  "learning_rate": 4.137151834863213e-07,
510
+ "logits/chosen": -2.071329116821289,
511
+ "logits/rejected": -2.0509676933288574,
512
+ "logps/chosen": -345.35186767578125,
513
+ "logps/rejected": -415.62261962890625,
514
+ "loss": 0.5115,
515
+ "rewards/accuracies": 0.762499988079071,
516
+ "rewards/chosen": -0.6397430300712585,
517
+ "rewards/margins": 0.7728389501571655,
518
+ "rewards/rejected": -1.4125821590423584,
519
  "step": 330
520
  },
521
  {
522
  "epoch": 0.7116692830978545,
523
+ "grad_norm": 30.579996976984805,
524
  "learning_rate": 4.0668899744407567e-07,
525
+ "logits/chosen": -1.8872754573822021,
526
+ "logits/rejected": -1.8190815448760986,
527
+ "logps/chosen": -379.37225341796875,
528
+ "logps/rejected": -436.101318359375,
529
+ "loss": 0.52,
530
+ "rewards/accuracies": 0.706250011920929,
531
+ "rewards/chosen": -0.7692102789878845,
532
+ "rewards/margins": 0.7424911260604858,
533
+ "rewards/rejected": -1.5117013454437256,
534
  "step": 340
535
  },
536
  {
537
  "epoch": 0.7326007326007326,
538
+ "grad_norm": 23.99831270999009,
539
  "learning_rate": 3.994527650465352e-07,
540
+ "logits/chosen": -1.8584909439086914,
541
+ "logits/rejected": -1.7157665491104126,
542
+ "logps/chosen": -362.5115661621094,
543
+ "logps/rejected": -413.8670349121094,
544
+ "loss": 0.5019,
545
+ "rewards/accuracies": 0.78125,
546
+ "rewards/chosen": -0.8885606527328491,
547
+ "rewards/margins": 0.7002802491188049,
548
+ "rewards/rejected": -1.5888408422470093,
549
  "step": 350
550
  },
551
  {
552
  "epoch": 0.7535321821036107,
553
+ "grad_norm": 20.569792166483495,
554
  "learning_rate": 3.920161866827889e-07,
555
+ "logits/chosen": -1.8123042583465576,
556
+ "logits/rejected": -1.6628021001815796,
557
+ "logps/chosen": -398.8226623535156,
558
+ "logps/rejected": -431.47088623046875,
559
+ "loss": 0.542,
560
+ "rewards/accuracies": 0.7437499761581421,
561
+ "rewards/chosen": -0.8019789457321167,
562
+ "rewards/margins": 0.7802454233169556,
563
+ "rewards/rejected": -1.5822242498397827,
564
  "step": 360
565
  },
566
  {
567
  "epoch": 0.7744636316064888,
568
+ "grad_norm": 21.1660600636486,
569
  "learning_rate": 3.8438923131177237e-07,
570
+ "logits/chosen": -1.702923059463501,
571
+ "logits/rejected": -1.573188066482544,
572
+ "logps/chosen": -352.84765625,
573
+ "logps/rejected": -393.1349182128906,
574
+ "loss": 0.5227,
575
+ "rewards/accuracies": 0.737500011920929,
576
+ "rewards/chosen": -0.7422314882278442,
577
+ "rewards/margins": 0.7614965438842773,
578
+ "rewards/rejected": -1.5037281513214111,
579
  "step": 370
580
  },
581
  {
582
  "epoch": 0.7953950811093669,
583
+ "grad_norm": 17.183773141452058,
584
  "learning_rate": 3.765821230985757e-07,
585
+ "logits/chosen": -1.6204278469085693,
586
+ "logits/rejected": -1.5805397033691406,
587
+ "logps/chosen": -352.36553955078125,
588
+ "logps/rejected": -456.62127685546875,
589
+ "loss": 0.5167,
590
+ "rewards/accuracies": 0.731249988079071,
591
+ "rewards/chosen": -0.8929269909858704,
592
+ "rewards/margins": 0.8663182258605957,
593
+ "rewards/rejected": -1.7592451572418213,
594
  "step": 380
595
  },
596
  {
597
  "epoch": 0.8163265306122449,
598
+ "grad_norm": 23.349904894629407,
599
  "learning_rate": 3.6860532770864005e-07,
600
+ "logits/chosen": -1.9042911529541016,
601
+ "logits/rejected": -1.7993557453155518,
602
+ "logps/chosen": -437.3020935058594,
603
+ "logps/rejected": -478.8271484375,
604
+ "loss": 0.5087,
605
  "rewards/accuracies": 0.731249988079071,
606
+ "rewards/chosen": -0.8284949064254761,
607
+ "rewards/margins": 0.7516407370567322,
608
+ "rewards/rejected": -1.580135703086853,
609
  "step": 390
610
  },
611
  {
612
  "epoch": 0.837257980115123,
613
+ "grad_norm": 23.752316920168525,
614
  "learning_rate": 3.604695382782159e-07,
615
+ "logits/chosen": -1.7125256061553955,
616
+ "logits/rejected": -1.591506838798523,
617
+ "logps/chosen": -373.38818359375,
618
+ "logps/rejected": -433.75799560546875,
619
+ "loss": 0.5247,
620
+ "rewards/accuracies": 0.768750011920929,
621
+ "rewards/chosen": -1.0498483180999756,
622
+ "rewards/margins": 0.8007469177246094,
623
+ "rewards/rejected": -1.850595235824585,
624
  "step": 400
625
  },
626
  {
627
  "epoch": 0.858189429618001,
628
+ "grad_norm": 19.666172854109288,
629
  "learning_rate": 3.5218566107988867e-07,
630
+ "logits/chosen": -2.0926361083984375,
631
+ "logits/rejected": -2.002737283706665,
632
+ "logps/chosen": -392.9161682128906,
633
+ "logps/rejected": -434.8521423339844,
634
+ "loss": 0.5204,
635
+ "rewards/accuracies": 0.7250000238418579,
636
+ "rewards/chosen": -0.8088946342468262,
637
+ "rewards/margins": 0.8107136487960815,
638
+ "rewards/rejected": -1.6196085214614868,
639
  "step": 410
640
  },
641
  {
642
  "epoch": 0.8791208791208791,
643
+ "grad_norm": 21.227558543972002,
644
  "learning_rate": 3.4376480090239047e-07,
645
+ "logits/chosen": -2.0091476440429688,
646
+ "logits/rejected": -1.9699198007583618,
647
+ "logps/chosen": -343.46112060546875,
648
+ "logps/rejected": -425.591796875,
649
+ "loss": 0.5084,
650
+ "rewards/accuracies": 0.762499988079071,
651
+ "rewards/chosen": -0.7525590658187866,
652
+ "rewards/margins": 0.8109620213508606,
653
+ "rewards/rejected": -1.563521146774292,
654
  "step": 420
655
  },
656
  {
657
  "epoch": 0.9000523286237572,
658
+ "grad_norm": 28.76341659471802,
659
  "learning_rate": 3.3521824616429284e-07,
660
+ "logits/chosen": -1.5924561023712158,
661
+ "logits/rejected": -1.4453504085540771,
662
+ "logps/chosen": -398.48284912109375,
663
+ "logps/rejected": -460.5140686035156,
664
+ "loss": 0.4912,
665
  "rewards/accuracies": 0.7562500238418579,
666
+ "rewards/chosen": -0.9409183263778687,
667
+ "rewards/margins": 0.8444429636001587,
668
+ "rewards/rejected": -1.7853610515594482,
669
  "step": 430
670
  },
671
  {
672
  "epoch": 0.9209837781266352,
673
+ "grad_norm": 24.356244505358386,
674
  "learning_rate": 3.265574537815398e-07,
675
+ "logits/chosen": -1.3780838251113892,
676
+ "logits/rejected": -1.1997829675674438,
677
+ "logps/chosen": -373.2276916503906,
678
+ "logps/rejected": -418.62335205078125,
679
+ "loss": 0.5069,
680
+ "rewards/accuracies": 0.737500011920929,
681
+ "rewards/chosen": -1.034752607345581,
682
+ "rewards/margins": 0.7990537881851196,
683
+ "rewards/rejected": -1.8338062763214111,
684
  "step": 440
685
  },
686
  {
687
  "epoch": 0.9419152276295133,
688
+ "grad_norm": 21.508083750235922,
689
  "learning_rate": 3.1779403380910425e-07,
690
+ "logits/chosen": -1.2375565767288208,
691
+ "logits/rejected": -1.1287505626678467,
692
+ "logps/chosen": -379.8437194824219,
693
+ "logps/rejected": -447.7130432128906,
694
+ "loss": 0.5131,
695
+ "rewards/accuracies": 0.737500011920929,
696
+ "rewards/chosen": -0.929580807685852,
697
+ "rewards/margins": 0.8013578653335571,
698
+ "rewards/rejected": -1.7309386730194092,
699
  "step": 450
700
  },
701
  {
702
  "epoch": 0.9628466771323915,
703
+ "grad_norm": 29.86693166076459,
704
  "learning_rate": 3.0893973387735683e-07,
705
+ "logits/chosen": -1.4521411657333374,
706
+ "logits/rejected": -1.2153605222702026,
707
+ "logps/chosen": -377.4960021972656,
708
+ "logps/rejected": -401.8866882324219,
709
+ "loss": 0.5157,
710
+ "rewards/accuracies": 0.75,
711
+ "rewards/chosen": -0.9293259382247925,
712
+ "rewards/margins": 0.7097651362419128,
713
+ "rewards/rejected": -1.63909113407135,
714
  "step": 460
715
  },
716
  {
717
  "epoch": 0.9837781266352695,
718
+ "grad_norm": 18.514726211898278,
719
  "learning_rate": 3.000064234440111e-07,
720
+ "logits/chosen": -1.4076189994812012,
721
+ "logits/rejected": -1.2628891468048096,
722
+ "logps/chosen": -399.5198059082031,
723
+ "logps/rejected": -439.87969970703125,
724
+ "loss": 0.501,
725
  "rewards/accuracies": 0.706250011920929,
726
+ "rewards/chosen": -0.7778880000114441,
727
+ "rewards/margins": 0.7691278457641602,
728
+ "rewards/rejected": -1.5470157861709595,
729
  "step": 470
730
  },
731
  {
732
  "epoch": 0.9984301412872841,
733
+ "eval_logits/chosen": -1.273218035697937,
734
+ "eval_logits/rejected": -1.1614912748336792,
735
+ "eval_logps/chosen": -346.5500793457031,
736
+ "eval_logps/rejected": -435.70635986328125,
737
+ "eval_loss": 0.5185136795043945,
738
+ "eval_rewards/accuracies": 0.77734375,
739
+ "eval_rewards/chosen": -0.7440664172172546,
740
+ "eval_rewards/margins": 0.9043572545051575,
741
+ "eval_rewards/rejected": -1.6484237909317017,
742
+ "eval_runtime": 171.5848,
743
+ "eval_samples_per_second": 11.656,
744
+ "eval_steps_per_second": 0.186,
745
  "step": 477
746
  },
747
  {
748
  "epoch": 1.0047095761381475,
749
+ "grad_norm": 21.17560575026596,
750
  "learning_rate": 2.910060778827554e-07,
751
+ "logits/chosen": -1.3738365173339844,
752
+ "logits/rejected": -1.2765980958938599,
753
+ "logps/chosen": -379.2554931640625,
754
+ "logps/rejected": -449.7948303222656,
755
+ "loss": 0.4798,
756
+ "rewards/accuracies": 0.7875000238418579,
757
+ "rewards/chosen": -0.723404586315155,
758
+ "rewards/margins": 0.8685447573661804,
759
+ "rewards/rejected": -1.591949224472046,
760
  "step": 480
761
  },
762
  {
763
  "epoch": 1.0256410256410255,
764
+ "grad_norm": 16.51017448411662,
765
  "learning_rate": 2.8195076242990116e-07,
766
+ "logits/chosen": -1.2624908685684204,
767
+ "logits/rejected": -1.0200715065002441,
768
+ "logps/chosen": -391.1546325683594,
769
+ "logps/rejected": -462.6971130371094,
770
+ "loss": 0.3721,
771
+ "rewards/accuracies": 0.8374999761581421,
772
+ "rewards/chosen": -0.8392894864082336,
773
+ "rewards/margins": 1.1826223134994507,
774
+ "rewards/rejected": -2.021911859512329,
775
  "step": 490
776
  },
777
  {
778
  "epoch": 1.0465724751439036,
779
+ "grad_norm": 28.692357308634083,
780
  "learning_rate": 2.7285261601056697e-07,
781
+ "logits/chosen": -1.04556405544281,
782
+ "logits/rejected": -0.8983597755432129,
783
+ "logps/chosen": -385.15594482421875,
784
+ "logps/rejected": -492.3553771972656,
785
+ "loss": 0.3623,
786
  "rewards/accuracies": 0.862500011920929,
787
+ "rewards/chosen": -1.0549384355545044,
788
+ "rewards/margins": 1.216042399406433,
789
+ "rewards/rejected": -2.2709805965423584,
790
  "step": 500
791
  },
792
  {
793
  "epoch": 1.0675039246467817,
794
+ "grad_norm": 21.922390152982405,
795
  "learning_rate": 2.6372383496608186e-07,
796
+ "logits/chosen": -0.9019845128059387,
797
+ "logits/rejected": -0.6276781558990479,
798
+ "logps/chosen": -385.09625244140625,
799
+ "logps/rejected": -487.7792053222656,
800
+ "loss": 0.36,
801
+ "rewards/accuracies": 0.893750011920929,
802
+ "rewards/chosen": -0.9738361239433289,
803
+ "rewards/margins": 1.5190062522888184,
804
+ "rewards/rejected": -2.492842197418213,
805
  "step": 510
806
  },
807
  {
808
  "epoch": 1.08843537414966,
809
+ "grad_norm": 23.056390516413284,
810
  "learning_rate": 2.5457665670441937e-07,
811
+ "logits/chosen": -0.8516889810562134,
812
+ "logits/rejected": -0.5428006052970886,
813
+ "logps/chosen": -382.0596923828125,
814
+ "logps/rejected": -487.93701171875,
815
+ "loss": 0.3563,
816
+ "rewards/accuracies": 0.8687499761581421,
817
+ "rewards/chosen": -0.9770641326904297,
818
+ "rewards/margins": 1.3976478576660156,
819
+ "rewards/rejected": -2.3747119903564453,
820
  "step": 520
821
  },
822
  {
823
  "epoch": 1.109366823652538,
824
+ "grad_norm": 24.839002506991555,
825
  "learning_rate": 2.454233432955807e-07,
826
+ "logits/chosen": -0.893288254737854,
827
+ "logits/rejected": -0.55360347032547,
828
+ "logps/chosen": -373.3586120605469,
829
+ "logps/rejected": -470.93048095703125,
830
+ "loss": 0.3595,
831
+ "rewards/accuracies": 0.856249988079071,
832
+ "rewards/chosen": -1.079138159751892,
833
+ "rewards/margins": 1.3878281116485596,
834
+ "rewards/rejected": -2.466966152191162,
835
  "step": 530
836
  },
837
  {
838
  "epoch": 1.130298273155416,
839
+ "grad_norm": 24.9976173018764,
840
  "learning_rate": 2.3627616503391812e-07,
841
+ "logits/chosen": -0.6603145003318787,
842
+ "logits/rejected": -0.45139390230178833,
843
+ "logps/chosen": -381.8931884765625,
844
+ "logps/rejected": -511.0888671875,
845
+ "loss": 0.3615,
846
+ "rewards/accuracies": 0.862500011920929,
847
+ "rewards/chosen": -1.2078765630722046,
848
+ "rewards/margins": 1.415185809135437,
849
+ "rewards/rejected": -2.6230626106262207,
850
  "step": 540
851
  },
852
  {
853
  "epoch": 1.1512297226582942,
854
+ "grad_norm": 23.70399953245455,
855
  "learning_rate": 2.2714738398943308e-07,
856
+ "logits/chosen": -0.7952468395233154,
857
+ "logits/rejected": -0.5057519674301147,
858
+ "logps/chosen": -405.1830139160156,
859
+ "logps/rejected": -512.9744873046875,
860
+ "loss": 0.3432,
861
+ "rewards/accuracies": 0.8374999761581421,
862
+ "rewards/chosen": -1.1684848070144653,
863
+ "rewards/margins": 1.4340193271636963,
864
+ "rewards/rejected": -2.6025044918060303,
865
  "step": 550
866
  },
867
  {
868
  "epoch": 1.1721611721611722,
869
+ "grad_norm": 26.9881287231271,
870
  "learning_rate": 2.1804923757009882e-07,
871
+ "logits/chosen": -0.7067749500274658,
872
+ "logits/rejected": -0.4858238697052002,
873
+ "logps/chosen": -358.0526428222656,
874
+ "logps/rejected": -488.3414001464844,
875
+ "loss": 0.3631,
876
  "rewards/accuracies": 0.925000011920929,
877
+ "rewards/chosen": -0.9131903648376465,
878
+ "rewards/margins": 1.4550573825836182,
879
+ "rewards/rejected": -2.3682477474212646,
880
  "step": 560
881
  },
882
  {
883
  "epoch": 1.1930926216640503,
884
+ "grad_norm": 26.437360887515176,
885
  "learning_rate": 2.089939221172446e-07,
886
+ "logits/chosen": -0.6216100454330444,
887
+ "logits/rejected": -0.4619014263153076,
888
+ "logps/chosen": -427.541015625,
889
+ "logps/rejected": -578.9843139648438,
890
+ "loss": 0.3526,
891
+ "rewards/accuracies": 0.8999999761581421,
892
+ "rewards/chosen": -1.259556770324707,
893
+ "rewards/margins": 1.5523223876953125,
894
+ "rewards/rejected": -2.8118791580200195,
895
  "step": 570
896
  },
897
  {
898
  "epoch": 1.2140240711669283,
899
+ "grad_norm": 24.08017206287918,
900
  "learning_rate": 1.9999357655598891e-07,
901
+ "logits/chosen": -0.596352219581604,
902
+ "logits/rejected": -0.3721233308315277,
903
+ "logps/chosen": -412.52301025390625,
904
+ "logps/rejected": -562.3777465820312,
905
+ "loss": 0.3344,
906
+ "rewards/accuracies": 0.887499988079071,
907
+ "rewards/chosen": -1.310210943222046,
908
+ "rewards/margins": 1.5689070224761963,
909
+ "rewards/rejected": -2.879117965698242,
910
  "step": 580
911
  },
912
  {
913
  "epoch": 1.2349555206698064,
914
+ "grad_norm": 19.74419678906988,
915
  "learning_rate": 1.9106026612264315e-07,
916
+ "logits/chosen": -0.8253545761108398,
917
+ "logits/rejected": -0.6318912506103516,
918
+ "logps/chosen": -399.89910888671875,
919
+ "logps/rejected": -515.3082275390625,
920
+ "loss": 0.3424,
921
+ "rewards/accuracies": 0.893750011920929,
922
+ "rewards/chosen": -1.0740442276000977,
923
+ "rewards/margins": 1.4926326274871826,
924
+ "rewards/rejected": -2.5666770935058594,
925
  "step": 590
926
  },
927
  {
928
  "epoch": 1.2558869701726845,
929
+ "grad_norm": 30.297500591228765,
930
  "learning_rate": 1.8220596619089573e-07,
931
+ "logits/chosen": -0.7947873473167419,
932
+ "logits/rejected": -0.5936676859855652,
933
+ "logps/chosen": -452.333740234375,
934
+ "logps/rejected": -589.5086059570312,
935
+ "loss": 0.3349,
936
  "rewards/accuracies": 0.8374999761581421,
937
+ "rewards/chosen": -1.3032448291778564,
938
+ "rewards/margins": 1.449398398399353,
939
+ "rewards/rejected": -2.752643346786499,
940
  "step": 600
941
  },
942
  {
943
  "epoch": 1.2768184196755625,
944
+ "grad_norm": 25.095514098513064,
945
  "learning_rate": 1.7344254621846017e-07,
946
+ "logits/chosen": -0.5728591680526733,
947
+ "logits/rejected": -0.2157384604215622,
948
+ "logps/chosen": -446.3045959472656,
949
+ "logps/rejected": -558.5189819335938,
950
+ "loss": 0.3583,
951
+ "rewards/accuracies": 0.856249988079071,
952
+ "rewards/chosen": -1.3666396141052246,
953
+ "rewards/margins": 1.6065607070922852,
954
+ "rewards/rejected": -2.9732003211975098,
955
  "step": 610
956
  },
957
  {
958
  "epoch": 1.2977498691784406,
959
+ "grad_norm": 26.711706547263283,
960
  "learning_rate": 1.647817538357072e-07,
961
+ "logits/chosen": -0.6483644247055054,
962
+ "logits/rejected": -0.436624675989151,
963
+ "logps/chosen": -375.31463623046875,
964
+ "logps/rejected": -502.0403747558594,
965
+ "loss": 0.3359,
966
  "rewards/accuracies": 0.824999988079071,
967
+ "rewards/chosen": -1.1623198986053467,
968
+ "rewards/margins": 1.4659651517868042,
969
+ "rewards/rejected": -2.6282851696014404,
970
  "step": 620
971
  },
972
  {
973
  "epoch": 1.3186813186813187,
974
+ "grad_norm": 29.34167044151856,
975
  "learning_rate": 1.562351990976095e-07,
976
+ "logits/chosen": -0.5509764552116394,
977
+ "logits/rejected": -0.2896868884563446,
978
+ "logps/chosen": -404.48773193359375,
979
+ "logps/rejected": -547.499267578125,
980
+ "loss": 0.3416,
981
+ "rewards/accuracies": 0.887499988079071,
982
+ "rewards/chosen": -1.2804601192474365,
983
+ "rewards/margins": 1.6254936456680298,
984
+ "rewards/rejected": -2.905953884124756,
985
  "step": 630
986
  },
987
  {
988
  "epoch": 1.3396127681841967,
989
+ "grad_norm": 23.153596326850824,
990
  "learning_rate": 1.478143389201113e-07,
991
+ "logits/chosen": -0.5090593099594116,
992
+ "logits/rejected": -0.1941351443529129,
993
+ "logps/chosen": -416.89141845703125,
994
+ "logps/rejected": -544.4210815429688,
995
+ "loss": 0.3508,
996
+ "rewards/accuracies": 0.8687499761581421,
997
+ "rewards/chosen": -1.4043817520141602,
998
+ "rewards/margins": 1.578391194343567,
999
+ "rewards/rejected": -2.9827733039855957,
1000
  "step": 640
1001
  },
1002
  {
1003
  "epoch": 1.3605442176870748,
1004
+ "grad_norm": 37.7718208645946,
1005
  "learning_rate": 1.3953046172178413e-07,
1006
+ "logits/chosen": -0.4676644206047058,
1007
+ "logits/rejected": -0.29953527450561523,
1008
+ "logps/chosen": -402.21417236328125,
1009
+ "logps/rejected": -548.4046020507812,
1010
+ "loss": 0.3546,
1011
+ "rewards/accuracies": 0.925000011920929,
1012
+ "rewards/chosen": -1.1596171855926514,
1013
+ "rewards/margins": 1.624544382095337,
1014
+ "rewards/rejected": -2.7841618061065674,
1015
  "step": 650
1016
  },
1017
  {
1018
  "epoch": 1.3814756671899528,
1019
+ "grad_norm": 32.318559995180884,
1020
  "learning_rate": 1.3139467229135998e-07,
1021
+ "logits/chosen": -0.625167191028595,
1022
+ "logits/rejected": -0.40474313497543335,
1023
+ "logps/chosen": -410.2447204589844,
1024
+ "logps/rejected": -531.7794799804688,
1025
+ "loss": 0.3438,
1026
  "rewards/accuracies": 0.8687499761581421,
1027
+ "rewards/chosen": -1.2046838998794556,
1028
+ "rewards/margins": 1.518923044204712,
1029
+ "rewards/rejected": -2.723607063293457,
1030
  "step": 660
1031
  },
1032
  {
1033
  "epoch": 1.402407116692831,
1034
+ "grad_norm": 26.63444910378978,
1035
  "learning_rate": 1.2341787690142435e-07,
1036
+ "logits/chosen": -0.5080076456069946,
1037
+ "logits/rejected": -0.39028996229171753,
1038
+ "logps/chosen": -417.2349548339844,
1039
+ "logps/rejected": -571.6380615234375,
1040
+ "loss": 0.3373,
1041
  "rewards/accuracies": 0.887499988079071,
1042
+ "rewards/chosen": -1.290380597114563,
1043
+ "rewards/margins": 1.4690136909484863,
1044
+ "rewards/rejected": -2.7593941688537598,
1045
  "step": 670
1046
  },
1047
  {
1048
  "epoch": 1.423338566195709,
1049
+ "grad_norm": 20.19948420893599,
1050
  "learning_rate": 1.1561076868822755e-07,
1051
+ "logits/chosen": -0.6434948444366455,
1052
+ "logits/rejected": -0.3340326249599457,
1053
+ "logps/chosen": -425.06585693359375,
1054
+ "logps/rejected": -558.2105712890625,
1055
+ "loss": 0.3408,
1056
+ "rewards/accuracies": 0.8687499761581421,
1057
+ "rewards/chosen": -1.3932050466537476,
1058
+ "rewards/margins": 1.6028707027435303,
1059
+ "rewards/rejected": -2.9960758686065674,
1060
  "step": 680
1061
  },
1062
  {
1063
  "epoch": 1.4442700156985873,
1064
+ "grad_norm": 23.405895905191862,
1065
  "learning_rate": 1.0798381331721107e-07,
1066
+ "logits/chosen": -0.45825833082199097,
1067
+ "logits/rejected": -0.26569095253944397,
1068
+ "logps/chosen": -413.29705810546875,
1069
+ "logps/rejected": -516.1429443359375,
1070
+ "loss": 0.3387,
1071
+ "rewards/accuracies": 0.84375,
1072
+ "rewards/chosen": -1.2925164699554443,
1073
+ "rewards/margins": 1.4324119091033936,
1074
+ "rewards/rejected": -2.724928379058838,
1075
  "step": 690
1076
  },
1077
  {
1078
  "epoch": 1.4652014652014653,
1079
+ "grad_norm": 26.00052056036154,
1080
  "learning_rate": 1.0054723495346482e-07,
1081
+ "logits/chosen": -0.5345702767372131,
1082
+ "logits/rejected": -0.28709763288497925,
1083
+ "logps/chosen": -439.5155334472656,
1084
+ "logps/rejected": -595.0430908203125,
1085
+ "loss": 0.3385,
1086
+ "rewards/accuracies": 0.893750011920929,
1087
+ "rewards/chosen": -1.335257649421692,
1088
+ "rewards/margins": 1.8328659534454346,
1089
+ "rewards/rejected": -3.168123483657837,
1090
  "step": 700
1091
  },
1092
  {
1093
  "epoch": 1.4861329147043434,
1094
+ "grad_norm": 29.88922989167434,
1095
  "learning_rate": 9.331100255592436e-08,
1096
+ "logits/chosen": -0.44970911741256714,
1097
+ "logits/rejected": -0.3438431918621063,
1098
+ "logps/chosen": -440.60009765625,
1099
+ "logps/rejected": -598.9261474609375,
1100
+ "loss": 0.3343,
1101
+ "rewards/accuracies": 0.8687499761581421,
1102
+ "rewards/chosen": -1.5944675207138062,
1103
+ "rewards/margins": 1.555371642112732,
1104
+ "rewards/rejected": -3.149839401245117,
1105
  "step": 710
1106
  },
1107
  {
1108
  "epoch": 1.5070643642072215,
1109
+ "grad_norm": 35.80679957440786,
1110
  "learning_rate": 8.628481651367875e-08,
1111
+ "logits/chosen": -0.6270388960838318,
1112
+ "logits/rejected": -0.3709457218647003,
1113
+ "logps/chosen": -430.08734130859375,
1114
+ "logps/rejected": -598.91357421875,
1115
+ "loss": 0.324,
1116
+ "rewards/accuracies": 0.918749988079071,
1117
+ "rewards/chosen": -1.4218626022338867,
1118
+ "rewards/margins": 1.803938865661621,
1119
+ "rewards/rejected": -3.2258009910583496,
1120
  "step": 720
1121
  },
1122
  {
1123
  "epoch": 1.5279958137100995,
1124
+ "grad_norm": 35.40691778150228,
1125
  "learning_rate": 7.947809564230445e-08,
1126
+ "logits/chosen": -0.7805054187774658,
1127
+ "logits/rejected": -0.5504633188247681,
1128
+ "logps/chosen": -417.5584411621094,
1129
+ "logps/rejected": -570.6063232421875,
1130
+ "loss": 0.3251,
1131
+ "rewards/accuracies": 0.8500000238418579,
1132
+ "rewards/chosen": -1.2576138973236084,
1133
+ "rewards/margins": 1.5007960796356201,
1134
+ "rewards/rejected": -2.7584102153778076,
1135
  "step": 730
1136
  },
1137
  {
1138
  "epoch": 1.5489272632129776,
1139
+ "grad_norm": 22.542061727173216,
1140
  "learning_rate": 7.289996455765748e-08,
1141
+ "logits/chosen": -0.7156350612640381,
1142
+ "logits/rejected": -0.5093780755996704,
1143
+ "logps/chosen": -440.34375,
1144
+ "logps/rejected": -552.8305053710938,
1145
+ "loss": 0.3487,
1146
+ "rewards/accuracies": 0.875,
1147
+ "rewards/chosen": -1.445813536643982,
1148
+ "rewards/margins": 1.539942741394043,
1149
+ "rewards/rejected": -2.9857563972473145,
1150
  "step": 740
1151
  },
1152
  {
1153
  "epoch": 1.5698587127158556,
1154
+ "grad_norm": 22.391994910590135,
1155
  "learning_rate": 6.655924144404906e-08,
1156
+ "logits/chosen": -0.5188966393470764,
1157
+ "logits/rejected": -0.31423279643058777,
1158
+ "logps/chosen": -427.5140686035156,
1159
+ "logps/rejected": -574.3653564453125,
1160
+ "loss": 0.3262,
1161
+ "rewards/accuracies": 0.8812500238418579,
1162
+ "rewards/chosen": -1.3562893867492676,
1163
+ "rewards/margins": 1.616431474685669,
1164
+ "rewards/rejected": -2.9727206230163574,
1165
  "step": 750
1166
  },
1167
  {
1168
  "epoch": 1.5907901622187337,
1169
+ "grad_norm": 27.067458253461155,
1170
  "learning_rate": 6.046442623320145e-08,
1171
+ "logits/chosen": -0.7770091891288757,
1172
+ "logits/rejected": -0.4757114350795746,
1173
+ "logps/chosen": -492.76416015625,
1174
+ "logps/rejected": -625.2634887695312,
1175
+ "loss": 0.3424,
1176
+ "rewards/accuracies": 0.8500000238418579,
1177
+ "rewards/chosen": -1.4647096395492554,
1178
+ "rewards/margins": 1.647743821144104,
1179
+ "rewards/rejected": -3.1124536991119385,
1180
  "step": 760
1181
  },
1182
  {
1183
  "epoch": 1.6117216117216118,
1184
+ "grad_norm": 27.917693830137047,
1185
  "learning_rate": 5.4623689209832484e-08,
1186
+ "logits/chosen": -0.8214691281318665,
1187
+ "logits/rejected": -0.43942517042160034,
1188
+ "logps/chosen": -453.11199951171875,
1189
+ "logps/rejected": -539.7423706054688,
1190
+ "loss": 0.3324,
1191
  "rewards/accuracies": 0.8500000238418579,
1192
+ "rewards/chosen": -1.3807549476623535,
1193
+ "rewards/margins": 1.52043879032135,
1194
+ "rewards/rejected": -2.901193618774414,
1195
  "step": 770
1196
  },
1197
  {
1198
  "epoch": 1.6326530612244898,
1199
+ "grad_norm": 35.7523647379527,
1200
  "learning_rate": 4.904486005914027e-08,
1201
+ "logits/chosen": -0.7380484342575073,
1202
+ "logits/rejected": -0.5423134565353394,
1203
+ "logps/chosen": -448.4818420410156,
1204
+ "logps/rejected": -601.2604370117188,
1205
+ "loss": 0.3395,
1206
+ "rewards/accuracies": 0.8687499761581421,
1207
+ "rewards/chosen": -1.1892478466033936,
1208
+ "rewards/margins": 1.734458565711975,
1209
+ "rewards/rejected": -2.923706293106079,
1210
  "step": 780
1211
  },
1212
  {
1213
  "epoch": 1.653584510727368,
1214
+ "grad_norm": 31.011854985332448,
1215
  "learning_rate": 4.373541737087263e-08,
1216
+ "logits/chosen": -0.7779333591461182,
1217
+ "logits/rejected": -0.49003076553344727,
1218
+ "logps/chosen": -406.6073303222656,
1219
+ "logps/rejected": -552.443359375,
1220
+ "loss": 0.3239,
1221
+ "rewards/accuracies": 0.8687499761581421,
1222
+ "rewards/chosen": -1.2308647632598877,
1223
+ "rewards/margins": 1.5439412593841553,
1224
+ "rewards/rejected": -2.774806261062622,
1225
  "step": 790
1226
  },
1227
  {
1228
  "epoch": 1.674515960230246,
1229
+ "grad_norm": 25.052300316679634,
1230
  "learning_rate": 3.8702478614051345e-08,
1231
+ "logits/chosen": -0.7519603371620178,
1232
+ "logits/rejected": -0.4932466447353363,
1233
+ "logps/chosen": -459.8077087402344,
1234
+ "logps/rejected": -562.577392578125,
1235
+ "loss": 0.3121,
1236
+ "rewards/accuracies": 0.893750011920929,
1237
+ "rewards/chosen": -1.291195273399353,
1238
+ "rewards/margins": 1.5607550144195557,
1239
+ "rewards/rejected": -2.851950168609619,
1240
  "step": 800
1241
  },
1242
  {
1243
  "epoch": 1.695447409733124,
1244
+ "grad_norm": 28.051358555998323,
1245
  "learning_rate": 3.3952790595787986e-08,
1246
+ "logits/chosen": -0.7064075469970703,
1247
+ "logits/rejected": -0.5052956938743591,
1248
+ "logps/chosen": -427.84832763671875,
1249
+ "logps/rejected": -587.9678955078125,
1250
+ "loss": 0.3363,
1251
  "rewards/accuracies": 0.887499988079071,
1252
+ "rewards/chosen": -1.3051536083221436,
1253
+ "rewards/margins": 1.7959152460098267,
1254
+ "rewards/rejected": -3.1010687351226807,
1255
  "step": 810
1256
  },
1257
  {
1258
  "epoch": 1.716378859236002,
1259
+ "grad_norm": 31.67321567378694,
1260
  "learning_rate": 2.9492720416985e-08,
1261
+ "logits/chosen": -0.8305649757385254,
1262
+ "logits/rejected": -0.7014783620834351,
1263
+ "logps/chosen": -413.65576171875,
1264
+ "logps/rejected": -569.78759765625,
1265
+ "loss": 0.3315,
1266
+ "rewards/accuracies": 0.862500011920929,
1267
+ "rewards/chosen": -1.4015345573425293,
1268
+ "rewards/margins": 1.6014522314071655,
1269
+ "rewards/rejected": -3.002986431121826,
1270
  "step": 820
1271
  },
1272
  {
1273
  "epoch": 1.7373103087388801,
1274
+ "grad_norm": 21.644645673450903,
1275
  "learning_rate": 2.5328246937043525e-08,
1276
+ "logits/chosen": -0.8004710078239441,
1277
+ "logits/rejected": -0.6543309092521667,
1278
+ "logps/chosen": -454.3511657714844,
1279
+ "logps/rejected": -579.9097900390625,
1280
+ "loss": 0.3224,
1281
+ "rewards/accuracies": 0.862500011920929,
1282
+ "rewards/chosen": -1.3592346906661987,
1283
+ "rewards/margins": 1.5871307849884033,
1284
+ "rewards/rejected": -2.9463653564453125,
1285
  "step": 830
1286
  },
1287
  {
1288
  "epoch": 1.7582417582417582,
1289
+ "grad_norm": 29.695508251939298,
1290
  "learning_rate": 2.1464952759020856e-08,
1291
+ "logits/chosen": -0.5647540092468262,
1292
+ "logits/rejected": -0.4591120779514313,
1293
+ "logps/chosen": -411.2245178222656,
1294
+ "logps/rejected": -579.0653076171875,
1295
+ "loss": 0.3265,
1296
  "rewards/accuracies": 0.887499988079071,
1297
+ "rewards/chosen": -1.4194848537445068,
1298
+ "rewards/margins": 1.6787960529327393,
1299
+ "rewards/rejected": -3.098280906677246,
1300
  "step": 840
1301
  },
1302
  {
1303
  "epoch": 1.7791732077446363,
1304
+ "grad_norm": 30.24532074338387,
1305
  "learning_rate": 1.7908016745981856e-08,
1306
+ "logits/chosen": -0.631384551525116,
1307
+ "logits/rejected": -0.5165312886238098,
1308
+ "logps/chosen": -408.71173095703125,
1309
+ "logps/rejected": -562.2457275390625,
1310
+ "loss": 0.3275,
1311
+ "rewards/accuracies": 0.8500000238418579,
1312
+ "rewards/chosen": -1.459183931350708,
1313
+ "rewards/margins": 1.5402036905288696,
1314
+ "rewards/rejected": -2.999387264251709,
1315
  "step": 850
1316
  },
1317
  {
1318
  "epoch": 1.8001046572475143,
1319
+ "grad_norm": 28.810585688980133,
1320
  "learning_rate": 1.4662207078575684e-08,
1321
+ "logits/chosen": -0.8192375302314758,
1322
+ "logits/rejected": -0.6066277623176575,
1323
+ "logps/chosen": -447.3929748535156,
1324
+ "logps/rejected": -533.9404296875,
1325
+ "loss": 0.3479,
1326
  "rewards/accuracies": 0.84375,
1327
+ "rewards/chosen": -1.381157636642456,
1328
+ "rewards/margins": 1.4070510864257812,
1329
+ "rewards/rejected": -2.788208484649658,
1330
  "step": 860
1331
  },
1332
  {
1333
  "epoch": 1.8210361067503924,
1334
+ "grad_norm": 24.013363968760547,
1335
  "learning_rate": 1.1731874863145142e-08,
1336
+ "logits/chosen": -0.7617680430412292,
1337
+ "logits/rejected": -0.47873860597610474,
1338
+ "logps/chosen": -454.9310607910156,
1339
+ "logps/rejected": -573.6890869140625,
1340
+ "loss": 0.331,
1341
  "rewards/accuracies": 0.8999999761581421,
1342
+ "rewards/chosen": -1.3413336277008057,
1343
+ "rewards/margins": 1.7013572454452515,
1344
+ "rewards/rejected": -3.0426909923553467,
1345
  "step": 870
1346
  },
1347
  {
1348
  "epoch": 1.8419675562532705,
1349
+ "grad_norm": 32.07599972860766,
1350
  "learning_rate": 9.12094829893642e-09,
1351
+ "logits/chosen": -0.7640255689620972,
1352
+ "logits/rejected": -0.4442223012447357,
1353
+ "logps/chosen": -438.59490966796875,
1354
+ "logps/rejected": -557.7069091796875,
1355
+ "loss": 0.3402,
1356
  "rewards/accuracies": 0.8812500238418579,
1357
+ "rewards/chosen": -1.409976840019226,
1358
+ "rewards/margins": 1.640062928199768,
1359
+ "rewards/rejected": -3.050039529800415,
1360
  "step": 880
1361
  },
1362
  {
1363
  "epoch": 1.8628990057561485,
1364
+ "grad_norm": 24.432518139204586,
1365
  "learning_rate": 6.832927412229017e-09,
1366
+ "logits/chosen": -0.8613092303276062,
1367
+ "logits/rejected": -0.6306430101394653,
1368
+ "logps/chosen": -456.99072265625,
1369
+ "logps/rejected": -594.9552001953125,
1370
+ "loss": 0.3223,
1371
  "rewards/accuracies": 0.9125000238418579,
1372
+ "rewards/chosen": -1.3222033977508545,
1373
+ "rewards/margins": 1.7843767404556274,
1374
+ "rewards/rejected": -3.1065802574157715,
1375
  "step": 890
1376
  },
1377
  {
1378
  "epoch": 1.8838304552590266,
1379
+ "grad_norm": 27.645379277195858,
1380
  "learning_rate": 4.8708793644441086e-09,
1381
+ "logits/chosen": -0.7418426275253296,
1382
+ "logits/rejected": -0.4674626886844635,
1383
+ "logps/chosen": -423.3577575683594,
1384
+ "logps/rejected": -554.7745361328125,
1385
+ "loss": 0.3093,
1386
+ "rewards/accuracies": 0.925000011920929,
1387
+ "rewards/chosen": -1.3512873649597168,
1388
+ "rewards/margins": 1.6987825632095337,
1389
+ "rewards/rejected": -3.050069808959961,
1390
  "step": 900
1391
  },
1392
  {
1393
  "epoch": 1.9047619047619047,
1394
+ "grad_norm": 28.923783797405456,
1395
  "learning_rate": 3.2374343405217884e-09,
1396
+ "logits/chosen": -0.6597197651863098,
1397
+ "logits/rejected": -0.4865621030330658,
1398
+ "logps/chosen": -408.0905456542969,
1399
+ "logps/rejected": -535.84521484375,
1400
+ "loss": 0.3367,
1401
  "rewards/accuracies": 0.887499988079071,
1402
+ "rewards/chosen": -1.4176795482635498,
1403
+ "rewards/margins": 1.6123201847076416,
1404
+ "rewards/rejected": -3.0299999713897705,
1405
  "step": 910
1406
  },
1407
  {
1408
  "epoch": 1.9256933542647827,
1409
+ "grad_norm": 28.331424470243782,
1410
  "learning_rate": 1.9347820230782295e-09,
1411
+ "logits/chosen": -0.8034309148788452,
1412
+ "logits/rejected": -0.6734964847564697,
1413
+ "logps/chosen": -424.65673828125,
1414
+ "logps/rejected": -592.0272216796875,
1415
+ "loss": 0.3345,
1416
+ "rewards/accuracies": 0.8812500238418579,
1417
+ "rewards/chosen": -1.302234172821045,
1418
+ "rewards/margins": 1.7047960758209229,
1419
+ "rewards/rejected": -3.0070300102233887,
1420
  "step": 920
1421
  },
1422
  {
1423
  "epoch": 1.9466248037676608,
1424
+ "grad_norm": 31.096436385559773,
1425
  "learning_rate": 9.64668657069706e-10,
1426
+ "logits/chosen": -0.8711435198783875,
1427
+ "logits/rejected": -0.6492515206336975,
1428
+ "logps/chosen": -412.20159912109375,
1429
+ "logps/rejected": -539.3062744140625,
1430
+ "loss": 0.3326,
1431
+ "rewards/accuracies": 0.856249988079071,
1432
+ "rewards/chosen": -1.3435027599334717,
1433
+ "rewards/margins": 1.5116230249404907,
1434
+ "rewards/rejected": -2.855125904083252,
1435
  "step": 930
1436
  },
1437
  {
1438
  "epoch": 1.9675562532705388,
1439
+ "grad_norm": 39.27145853425959,
1440
  "learning_rate": 3.2839470889836627e-10,
1441
+ "logits/chosen": -0.8457353711128235,
1442
+ "logits/rejected": -0.46064504981040955,
1443
+ "logps/chosen": -439.29815673828125,
1444
+ "logps/rejected": -539.5534057617188,
1445
+ "loss": 0.3418,
1446
+ "rewards/accuracies": 0.8812500238418579,
1447
+ "rewards/chosen": -1.3375287055969238,
1448
+ "rewards/margins": 1.5794165134429932,
1449
+ "rewards/rejected": -2.916945457458496,
1450
  "step": 940
1451
  },
1452
  {
1453
  "epoch": 1.988487702773417,
1454
+ "grad_norm": 30.98924464228554,
1455
  "learning_rate": 2.6813123097352287e-11,
1456
+ "logits/chosen": -0.7241233587265015,
1457
+ "logits/rejected": -0.4993128180503845,
1458
+ "logps/chosen": -443.7752380371094,
1459
+ "logps/rejected": -581.8499755859375,
1460
+ "loss": 0.3187,
1461
+ "rewards/accuracies": 0.893750011920929,
1462
+ "rewards/chosen": -1.385292649269104,
1463
+ "rewards/margins": 1.7712970972061157,
1464
+ "rewards/rejected": -3.1565897464752197,
1465
  "step": 950
1466
  },
1467
  {
1468
  "epoch": 1.9968602825745683,
1469
+ "eval_logits/chosen": -0.5949550271034241,
1470
+ "eval_logits/rejected": -0.44720327854156494,
1471
+ "eval_logps/chosen": -426.89898681640625,
1472
+ "eval_logps/rejected": -545.3427124023438,
1473
+ "eval_loss": 0.5225037932395935,
1474
+ "eval_rewards/accuracies": 0.7734375,
1475
+ "eval_rewards/chosen": -1.547555685043335,
1476
+ "eval_rewards/margins": 1.1972318887710571,
1477
+ "eval_rewards/rejected": -2.7447874546051025,
1478
+ "eval_runtime": 170.0564,
1479
+ "eval_samples_per_second": 11.761,
1480
+ "eval_steps_per_second": 0.188,
1481
  "step": 954
1482
  },
1483
  {
 
1485
  "step": 954,
1486
  "total_flos": 0.0,
1487
  "train_loss": 0.0,
1488
+ "train_runtime": 3.4239,
1489
+ "train_samples_per_second": 17855.176,
1490
+ "train_steps_per_second": 139.316
1491
  }
1492
  ],
1493
  "logging_steps": 10,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c6d2d89a16be19cf14d1d8c8ab9be0c7839e54c975f498671e2e6605832100e
3
  size 7672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b011e9731aa70abf6ae9d2221721c467a75e1cd362b37eee4df8b0758ba30727
3
  size 7672