chentianqi commited on
Commit
b746a4a
·
verified ·
1 Parent(s): 10729f8

End of training

Browse files
README.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ datasets: open-r1/OpenR1-Math-220k
3
+ library_name: transformers
4
+ model_name: Qwen2.5-1.5B-Open-R1-Distill-ScaleTuning-lightkernel
5
+ tags:
6
+ - generated_from_trainer
7
+ - open-r1
8
+ - trl
9
+ - sft
10
+ licence: license
11
+ ---
12
+
13
+ # Model Card for Qwen2.5-1.5B-Open-R1-Distill-ScaleTuning-lightkernel
14
+
15
+ This model is a fine-tuned version of [Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4) on the [open-r1/OpenR1-Math-220k](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k) dataset.
16
+ It has been trained using [TRL](https://github.com/huggingface/trl).
17
+
18
+ ## Quick start
19
+
20
+ ```python
21
+ from transformers import pipeline
22
+
23
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
24
+ generator = pipeline("text-generation", model="chentianqi/Qwen2.5-1.5B-Open-R1-Distill-ScaleTuning-lightkernel", device="cuda")
25
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
26
+ print(output["generated_text"])
27
+ ```
28
+
29
+ ## Training procedure
30
+
31
+
32
+
33
+
34
+ This model was trained with SFT.
35
+
36
+ ### Framework versions
37
+
38
+ - TRL: 0.16.0.dev0
39
+ - Transformers: 4.49.0
40
+ - Pytorch: 2.5.1+cu124
41
+ - Datasets: 3.4.1
42
+ - Tokenizers: 0.21.1
43
+
44
+ ## Citations
45
+
46
+
47
+
48
+ Cite TRL as:
49
+
50
+ ```bibtex
51
+ @misc{vonwerra2022trl,
52
+ title = {{TRL: Transformer Reinforcement Learning}},
53
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
54
+ year = 2020,
55
+ journal = {GitHub repository},
56
+ publisher = {GitHub},
57
+ howpublished = {\url{https://github.com/huggingface/trl}}
58
+ }
59
+ ```
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:44e7873ccee54adb7ad4707879a988aecc28898395b3d1fa0e4d5ac2952b6094
3
  size 20497672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:910c251d3f52345339ea72eacbe2ea8eb8354abb65c1cf3e96332d0f06135a91
3
  size 20497672
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 1.3060333048430592e+16,
3
+ "train_loss": 0.6017354608175174,
4
+ "train_runtime": 40505.6177,
5
+ "train_samples": 93733,
6
+ "train_samples_per_second": 2.54,
7
+ "train_steps_per_second": 0.02
8
+ }
config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": true,
3
+ "_name_or_path": "/home/chentianqi/model/Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4",
4
+ "architectures": [
5
+ "Qwen2ForCausalLM"
6
+ ],
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 151643,
9
+ "eos_token_id": 151645,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 1536,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 8960,
14
+ "max_position_embeddings": 32768,
15
+ "max_window_layers": 21,
16
+ "model_type": "qwen2",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 28,
19
+ "num_key_value_heads": 2,
20
+ "quantization_config": {
21
+ "backend": "auto",
22
+ "batch_size": 1,
23
+ "bits": 4,
24
+ "block_name_to_quantize": null,
25
+ "cache_block_outputs": true,
26
+ "checkpoint_format": "gptq",
27
+ "damp_percent": 0.01,
28
+ "dataset": null,
29
+ "desc_act": false,
30
+ "exllama_config": {
31
+ "version": 1
32
+ },
33
+ "group_size": 128,
34
+ "max_input_length": null,
35
+ "meta": null,
36
+ "model_seqlen": null,
37
+ "module_name_preceding_first_block": null,
38
+ "modules_in_block_to_quantize": null,
39
+ "pad_token_id": null,
40
+ "quant_method": "gptq",
41
+ "sym": true,
42
+ "tokenizer": null,
43
+ "true_sequential": true,
44
+ "use_cuda_fp16": false,
45
+ "use_exllama": true
46
+ },
47
+ "rms_norm_eps": 1e-06,
48
+ "rope_scaling": null,
49
+ "rope_theta": 1000000.0,
50
+ "sliding_window": 32768,
51
+ "tie_word_embeddings": true,
52
+ "torch_dtype": "bfloat16",
53
+ "transformers_version": "4.49.0",
54
+ "use_cache": true,
55
+ "use_sliding_window": false,
56
+ "vocab_size": 151936
57
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 1.3060333048430592e+16,
3
+ "train_loss": 0.6017354608175174,
4
+ "train_runtime": 40505.6177,
5
+ "train_samples": 93733,
6
+ "train_samples_per_second": 2.54,
7
+ "train_steps_per_second": 0.02
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 500,
6
+ "global_step": 804,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.018656716417910446,
13
+ "grad_norm": 3.679038169484767,
14
+ "learning_rate": 6.0975609756097564e-06,
15
+ "loss": 0.8918,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.03731343283582089,
20
+ "grad_norm": 5.026239257110004,
21
+ "learning_rate": 1.2195121951219513e-05,
22
+ "loss": 0.8762,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.055970149253731345,
27
+ "grad_norm": 2.3394173415240784,
28
+ "learning_rate": 1.8292682926829268e-05,
29
+ "loss": 0.8351,
30
+ "step": 15
31
+ },
32
+ {
33
+ "epoch": 0.07462686567164178,
34
+ "grad_norm": 1.1189651624014834,
35
+ "learning_rate": 2.4390243902439026e-05,
36
+ "loss": 0.7859,
37
+ "step": 20
38
+ },
39
+ {
40
+ "epoch": 0.09328358208955224,
41
+ "grad_norm": 0.8786176397893852,
42
+ "learning_rate": 3.048780487804878e-05,
43
+ "loss": 0.7439,
44
+ "step": 25
45
+ },
46
+ {
47
+ "epoch": 0.11194029850746269,
48
+ "grad_norm": 0.6562223167500421,
49
+ "learning_rate": 3.6585365853658535e-05,
50
+ "loss": 0.7259,
51
+ "step": 30
52
+ },
53
+ {
54
+ "epoch": 0.13059701492537312,
55
+ "grad_norm": 0.5310123696364398,
56
+ "learning_rate": 4.26829268292683e-05,
57
+ "loss": 0.701,
58
+ "step": 35
59
+ },
60
+ {
61
+ "epoch": 0.14925373134328357,
62
+ "grad_norm": 0.46613789682502904,
63
+ "learning_rate": 4.878048780487805e-05,
64
+ "loss": 0.6863,
65
+ "step": 40
66
+ },
67
+ {
68
+ "epoch": 0.16791044776119404,
69
+ "grad_norm": 0.4418945671753766,
70
+ "learning_rate": 4.999694850011677e-05,
71
+ "loss": 0.6823,
72
+ "step": 45
73
+ },
74
+ {
75
+ "epoch": 0.1865671641791045,
76
+ "grad_norm": 0.40933598345210676,
77
+ "learning_rate": 4.998455320039942e-05,
78
+ "loss": 0.661,
79
+ "step": 50
80
+ },
81
+ {
82
+ "epoch": 0.20522388059701493,
83
+ "grad_norm": 0.4205832806859922,
84
+ "learning_rate": 4.9962628631365625e-05,
85
+ "loss": 0.6615,
86
+ "step": 55
87
+ },
88
+ {
89
+ "epoch": 0.22388059701492538,
90
+ "grad_norm": 0.38681078578931943,
91
+ "learning_rate": 4.9931184084955565e-05,
92
+ "loss": 0.6552,
93
+ "step": 60
94
+ },
95
+ {
96
+ "epoch": 0.24253731343283583,
97
+ "grad_norm": 0.38590365166796287,
98
+ "learning_rate": 4.989023288780946e-05,
99
+ "loss": 0.6441,
100
+ "step": 65
101
+ },
102
+ {
103
+ "epoch": 0.26119402985074625,
104
+ "grad_norm": 0.35607227076559017,
105
+ "learning_rate": 4.9839792395619594e-05,
106
+ "loss": 0.6397,
107
+ "step": 70
108
+ },
109
+ {
110
+ "epoch": 0.2798507462686567,
111
+ "grad_norm": 0.4028728870784642,
112
+ "learning_rate": 4.977988398577472e-05,
113
+ "loss": 0.6374,
114
+ "step": 75
115
+ },
116
+ {
117
+ "epoch": 0.29850746268656714,
118
+ "grad_norm": 0.3769441310675025,
119
+ "learning_rate": 4.971053304830001e-05,
120
+ "loss": 0.636,
121
+ "step": 80
122
+ },
123
+ {
124
+ "epoch": 0.31716417910447764,
125
+ "grad_norm": 0.38288329325808956,
126
+ "learning_rate": 4.96317689750964e-05,
127
+ "loss": 0.6315,
128
+ "step": 85
129
+ },
130
+ {
131
+ "epoch": 0.3358208955223881,
132
+ "grad_norm": 0.398945804229678,
133
+ "learning_rate": 4.954362514748392e-05,
134
+ "loss": 0.6343,
135
+ "step": 90
136
+ },
137
+ {
138
+ "epoch": 0.35447761194029853,
139
+ "grad_norm": 0.39656777599536375,
140
+ "learning_rate": 4.9446138922054206e-05,
141
+ "loss": 0.6324,
142
+ "step": 95
143
+ },
144
+ {
145
+ "epoch": 0.373134328358209,
146
+ "grad_norm": 0.41566588973303703,
147
+ "learning_rate": 4.933935161483824e-05,
148
+ "loss": 0.6279,
149
+ "step": 100
150
+ },
151
+ {
152
+ "epoch": 0.3917910447761194,
153
+ "grad_norm": 0.40776908605390205,
154
+ "learning_rate": 4.922330848379606e-05,
155
+ "loss": 0.6243,
156
+ "step": 105
157
+ },
158
+ {
159
+ "epoch": 0.41044776119402987,
160
+ "grad_norm": 0.36377249343235496,
161
+ "learning_rate": 4.909805870963577e-05,
162
+ "loss": 0.6316,
163
+ "step": 110
164
+ },
165
+ {
166
+ "epoch": 0.4291044776119403,
167
+ "grad_norm": 0.4088424282993371,
168
+ "learning_rate": 4.89636553749701e-05,
169
+ "loss": 0.6218,
170
+ "step": 115
171
+ },
172
+ {
173
+ "epoch": 0.44776119402985076,
174
+ "grad_norm": 0.4530001034479991,
175
+ "learning_rate": 4.882015544181922e-05,
176
+ "loss": 0.6233,
177
+ "step": 120
178
+ },
179
+ {
180
+ "epoch": 0.4664179104477612,
181
+ "grad_norm": 0.390599205811657,
182
+ "learning_rate": 4.866761972746946e-05,
183
+ "loss": 0.6203,
184
+ "step": 125
185
+ },
186
+ {
187
+ "epoch": 0.48507462686567165,
188
+ "grad_norm": 0.40143376973435857,
189
+ "learning_rate": 4.850611287869809e-05,
190
+ "loss": 0.6191,
191
+ "step": 130
192
+ },
193
+ {
194
+ "epoch": 0.503731343283582,
195
+ "grad_norm": 0.38289121019799516,
196
+ "learning_rate": 4.833570334437505e-05,
197
+ "loss": 0.6157,
198
+ "step": 135
199
+ },
200
+ {
201
+ "epoch": 0.5223880597014925,
202
+ "grad_norm": 0.35624951740987876,
203
+ "learning_rate": 4.8156463346453454e-05,
204
+ "loss": 0.6217,
205
+ "step": 140
206
+ },
207
+ {
208
+ "epoch": 0.5410447761194029,
209
+ "grad_norm": 0.38768155956960526,
210
+ "learning_rate": 4.7968468849360844e-05,
211
+ "loss": 0.6136,
212
+ "step": 145
213
+ },
214
+ {
215
+ "epoch": 0.5597014925373134,
216
+ "grad_norm": 0.43150760988607983,
217
+ "learning_rate": 4.777179952780443e-05,
218
+ "loss": 0.6123,
219
+ "step": 150
220
+ },
221
+ {
222
+ "epoch": 0.5783582089552238,
223
+ "grad_norm": 0.38123867806776124,
224
+ "learning_rate": 4.756653873300381e-05,
225
+ "loss": 0.6121,
226
+ "step": 155
227
+ },
228
+ {
229
+ "epoch": 0.5970149253731343,
230
+ "grad_norm": 0.3873396799901745,
231
+ "learning_rate": 4.735277345736555e-05,
232
+ "loss": 0.6101,
233
+ "step": 160
234
+ },
235
+ {
236
+ "epoch": 0.6156716417910447,
237
+ "grad_norm": 0.4652661550873779,
238
+ "learning_rate": 4.713059429761462e-05,
239
+ "loss": 0.6091,
240
+ "step": 165
241
+ },
242
+ {
243
+ "epoch": 0.6343283582089553,
244
+ "grad_norm": 0.460574163755368,
245
+ "learning_rate": 4.690009541639818e-05,
246
+ "loss": 0.6093,
247
+ "step": 170
248
+ },
249
+ {
250
+ "epoch": 0.6529850746268657,
251
+ "grad_norm": 0.42147449415040555,
252
+ "learning_rate": 4.666137450237816e-05,
253
+ "loss": 0.6094,
254
+ "step": 175
255
+ },
256
+ {
257
+ "epoch": 0.6716417910447762,
258
+ "grad_norm": 0.41549765227071744,
259
+ "learning_rate": 4.641453272882943e-05,
260
+ "loss": 0.6008,
261
+ "step": 180
262
+ },
263
+ {
264
+ "epoch": 0.6902985074626866,
265
+ "grad_norm": 0.4263198940412901,
266
+ "learning_rate": 4.615967471076114e-05,
267
+ "loss": 0.6043,
268
+ "step": 185
269
+ },
270
+ {
271
+ "epoch": 0.7089552238805971,
272
+ "grad_norm": 0.41334703868436146,
273
+ "learning_rate": 4.5896908460579396e-05,
274
+ "loss": 0.6081,
275
+ "step": 190
276
+ },
277
+ {
278
+ "epoch": 0.7276119402985075,
279
+ "grad_norm": 0.40449119343462997,
280
+ "learning_rate": 4.562634534231012e-05,
281
+ "loss": 0.611,
282
+ "step": 195
283
+ },
284
+ {
285
+ "epoch": 0.746268656716418,
286
+ "grad_norm": 0.4322927322961092,
287
+ "learning_rate": 4.5348100024401387e-05,
288
+ "loss": 0.6052,
289
+ "step": 200
290
+ },
291
+ {
292
+ "epoch": 0.7649253731343284,
293
+ "grad_norm": 0.3998193681510812,
294
+ "learning_rate": 4.5062290431125306e-05,
295
+ "loss": 0.602,
296
+ "step": 205
297
+ },
298
+ {
299
+ "epoch": 0.7835820895522388,
300
+ "grad_norm": 0.3852462917147824,
301
+ "learning_rate": 4.476903769260014e-05,
302
+ "loss": 0.6073,
303
+ "step": 210
304
+ },
305
+ {
306
+ "epoch": 0.8022388059701493,
307
+ "grad_norm": 0.3607133278854917,
308
+ "learning_rate": 4.4468466093453555e-05,
309
+ "loss": 0.601,
310
+ "step": 215
311
+ },
312
+ {
313
+ "epoch": 0.8208955223880597,
314
+ "grad_norm": 0.384005014576649,
315
+ "learning_rate": 4.416070302014912e-05,
316
+ "loss": 0.601,
317
+ "step": 220
318
+ },
319
+ {
320
+ "epoch": 0.8395522388059702,
321
+ "grad_norm": 0.4067545567905266,
322
+ "learning_rate": 4.384587890699813e-05,
323
+ "loss": 0.6077,
324
+ "step": 225
325
+ },
326
+ {
327
+ "epoch": 0.8582089552238806,
328
+ "grad_norm": 0.4518428868611041,
329
+ "learning_rate": 4.352412718087967e-05,
330
+ "loss": 0.6036,
331
+ "step": 230
332
+ },
333
+ {
334
+ "epoch": 0.8768656716417911,
335
+ "grad_norm": 0.4403878189269791,
336
+ "learning_rate": 4.31955842046925e-05,
337
+ "loss": 0.6036,
338
+ "step": 235
339
+ },
340
+ {
341
+ "epoch": 0.8955223880597015,
342
+ "grad_norm": 0.3839440206537794,
343
+ "learning_rate": 4.2860389219562457e-05,
344
+ "loss": 0.5991,
345
+ "step": 240
346
+ },
347
+ {
348
+ "epoch": 0.914179104477612,
349
+ "grad_norm": 0.38470392150378924,
350
+ "learning_rate": 4.25186842858302e-05,
351
+ "loss": 0.6029,
352
+ "step": 245
353
+ },
354
+ {
355
+ "epoch": 0.9328358208955224,
356
+ "grad_norm": 0.3685287159565594,
357
+ "learning_rate": 4.217061422284397e-05,
358
+ "loss": 0.5994,
359
+ "step": 250
360
+ },
361
+ {
362
+ "epoch": 0.9514925373134329,
363
+ "grad_norm": 0.41527152296142383,
364
+ "learning_rate": 4.181632654758317e-05,
365
+ "loss": 0.5947,
366
+ "step": 255
367
+ },
368
+ {
369
+ "epoch": 0.9701492537313433,
370
+ "grad_norm": 0.40422034593241946,
371
+ "learning_rate": 4.145597141213857e-05,
372
+ "loss": 0.5984,
373
+ "step": 260
374
+ },
375
+ {
376
+ "epoch": 0.9888059701492538,
377
+ "grad_norm": 0.39843553796794084,
378
+ "learning_rate": 4.1089701540075746e-05,
379
+ "loss": 0.6021,
380
+ "step": 265
381
+ },
382
+ {
383
+ "epoch": 1.007462686567164,
384
+ "grad_norm": 0.3956036245521926,
385
+ "learning_rate": 4.07176721617087e-05,
386
+ "loss": 0.5928,
387
+ "step": 270
388
+ },
389
+ {
390
+ "epoch": 1.0261194029850746,
391
+ "grad_norm": 0.4166017237270764,
392
+ "learning_rate": 4.034004094831106e-05,
393
+ "loss": 0.5949,
394
+ "step": 275
395
+ },
396
+ {
397
+ "epoch": 1.044776119402985,
398
+ "grad_norm": 0.40419260207040375,
399
+ "learning_rate": 3.995696794529279e-05,
400
+ "loss": 0.5957,
401
+ "step": 280
402
+ },
403
+ {
404
+ "epoch": 1.0634328358208955,
405
+ "grad_norm": 0.38414601131349857,
406
+ "learning_rate": 3.9568615504370675e-05,
407
+ "loss": 0.596,
408
+ "step": 285
409
+ },
410
+ {
411
+ "epoch": 1.0820895522388059,
412
+ "grad_norm": 0.42937062188634684,
413
+ "learning_rate": 3.9175148214761445e-05,
414
+ "loss": 0.5854,
415
+ "step": 290
416
+ },
417
+ {
418
+ "epoch": 1.1007462686567164,
419
+ "grad_norm": 0.43039344431563725,
420
+ "learning_rate": 3.877673283342647e-05,
421
+ "loss": 0.5937,
422
+ "step": 295
423
+ },
424
+ {
425
+ "epoch": 1.1194029850746268,
426
+ "grad_norm": 0.44611677130222577,
427
+ "learning_rate": 3.8373538214397895e-05,
428
+ "loss": 0.5888,
429
+ "step": 300
430
+ },
431
+ {
432
+ "epoch": 1.1380597014925373,
433
+ "grad_norm": 0.36740897698634617,
434
+ "learning_rate": 3.796573523721588e-05,
435
+ "loss": 0.5916,
436
+ "step": 305
437
+ },
438
+ {
439
+ "epoch": 1.1567164179104479,
440
+ "grad_norm": 0.3520969327962709,
441
+ "learning_rate": 3.755349673450747e-05,
442
+ "loss": 0.5896,
443
+ "step": 310
444
+ },
445
+ {
446
+ "epoch": 1.1753731343283582,
447
+ "grad_norm": 0.4320109755196434,
448
+ "learning_rate": 3.713699741873769e-05,
449
+ "loss": 0.5898,
450
+ "step": 315
451
+ },
452
+ {
453
+ "epoch": 1.1940298507462686,
454
+ "grad_norm": 0.3751910728794563,
455
+ "learning_rate": 3.6716413808163996e-05,
456
+ "loss": 0.5917,
457
+ "step": 320
458
+ },
459
+ {
460
+ "epoch": 1.212686567164179,
461
+ "grad_norm": 0.37242919604692815,
462
+ "learning_rate": 3.6291924152025287e-05,
463
+ "loss": 0.5899,
464
+ "step": 325
465
+ },
466
+ {
467
+ "epoch": 1.2313432835820897,
468
+ "grad_norm": 0.43957155355695143,
469
+ "learning_rate": 3.5863708354997426e-05,
470
+ "loss": 0.587,
471
+ "step": 330
472
+ },
473
+ {
474
+ "epoch": 1.25,
475
+ "grad_norm": 0.39912947385365477,
476
+ "learning_rate": 3.5431947900947086e-05,
477
+ "loss": 0.5922,
478
+ "step": 335
479
+ },
480
+ {
481
+ "epoch": 1.2686567164179103,
482
+ "grad_norm": 0.3971406740720117,
483
+ "learning_rate": 3.499682577601638e-05,
484
+ "loss": 0.5833,
485
+ "step": 340
486
+ },
487
+ {
488
+ "epoch": 1.287313432835821,
489
+ "grad_norm": 0.4227860368740872,
490
+ "learning_rate": 3.455852639107071e-05,
491
+ "loss": 0.5894,
492
+ "step": 345
493
+ },
494
+ {
495
+ "epoch": 1.3059701492537314,
496
+ "grad_norm": 0.3682350519851987,
497
+ "learning_rate": 3.4117235503542874e-05,
498
+ "loss": 0.587,
499
+ "step": 350
500
+ },
501
+ {
502
+ "epoch": 1.3246268656716418,
503
+ "grad_norm": 0.3920988151030277,
504
+ "learning_rate": 3.3673140138706474e-05,
505
+ "loss": 0.5913,
506
+ "step": 355
507
+ },
508
+ {
509
+ "epoch": 1.3432835820895521,
510
+ "grad_norm": 0.3554571528020441,
511
+ "learning_rate": 3.322642851041199e-05,
512
+ "loss": 0.5851,
513
+ "step": 360
514
+ },
515
+ {
516
+ "epoch": 1.3619402985074627,
517
+ "grad_norm": 0.3698532741751772,
518
+ "learning_rate": 3.277728994131904e-05,
519
+ "loss": 0.5824,
520
+ "step": 365
521
+ },
522
+ {
523
+ "epoch": 1.3805970149253732,
524
+ "grad_norm": 0.4227494867891386,
525
+ "learning_rate": 3.232591478265887e-05,
526
+ "loss": 0.5932,
527
+ "step": 370
528
+ },
529
+ {
530
+ "epoch": 1.3992537313432836,
531
+ "grad_norm": 0.3844556086274232,
532
+ "learning_rate": 3.187249433356076e-05,
533
+ "loss": 0.5874,
534
+ "step": 375
535
+ },
536
+ {
537
+ "epoch": 1.417910447761194,
538
+ "grad_norm": 0.38521953470020937,
539
+ "learning_rate": 3.141722075997681e-05,
540
+ "loss": 0.5865,
541
+ "step": 380
542
+ },
543
+ {
544
+ "epoch": 1.4365671641791045,
545
+ "grad_norm": 0.3350120161906321,
546
+ "learning_rate": 3.096028701323926e-05,
547
+ "loss": 0.5806,
548
+ "step": 385
549
+ },
550
+ {
551
+ "epoch": 1.455223880597015,
552
+ "grad_norm": 0.3373119468257974,
553
+ "learning_rate": 3.050188674828507e-05,
554
+ "loss": 0.5831,
555
+ "step": 390
556
+ },
557
+ {
558
+ "epoch": 1.4738805970149254,
559
+ "grad_norm": 0.3466289424501227,
560
+ "learning_rate": 3.00422142415822e-05,
561
+ "loss": 0.5792,
562
+ "step": 395
563
+ },
564
+ {
565
+ "epoch": 1.4925373134328357,
566
+ "grad_norm": 0.3895078262928191,
567
+ "learning_rate": 2.958146430879254e-05,
568
+ "loss": 0.5837,
569
+ "step": 400
570
+ },
571
+ {
572
+ "epoch": 1.5111940298507462,
573
+ "grad_norm": 0.35008152039998003,
574
+ "learning_rate": 2.9119832222206262e-05,
575
+ "loss": 0.5779,
576
+ "step": 405
577
+ },
578
+ {
579
+ "epoch": 1.5298507462686568,
580
+ "grad_norm": 0.4148666068648647,
581
+ "learning_rate": 2.8657513627982702e-05,
582
+ "loss": 0.5823,
583
+ "step": 410
584
+ },
585
+ {
586
+ "epoch": 1.5485074626865671,
587
+ "grad_norm": 0.36672078601069563,
588
+ "learning_rate": 2.8194704463232792e-05,
589
+ "loss": 0.5878,
590
+ "step": 415
591
+ },
592
+ {
593
+ "epoch": 1.5671641791044775,
594
+ "grad_norm": 0.36305996654514194,
595
+ "learning_rate": 2.7731600872978102e-05,
596
+ "loss": 0.584,
597
+ "step": 420
598
+ },
599
+ {
600
+ "epoch": 1.585820895522388,
601
+ "grad_norm": 0.35704474129568975,
602
+ "learning_rate": 2.726839912702191e-05,
603
+ "loss": 0.5865,
604
+ "step": 425
605
+ },
606
+ {
607
+ "epoch": 1.6044776119402986,
608
+ "grad_norm": 0.3741518662245682,
609
+ "learning_rate": 2.6805295536767224e-05,
610
+ "loss": 0.585,
611
+ "step": 430
612
+ },
613
+ {
614
+ "epoch": 1.623134328358209,
615
+ "grad_norm": 0.32077101033201805,
616
+ "learning_rate": 2.6342486372017306e-05,
617
+ "loss": 0.5842,
618
+ "step": 435
619
+ },
620
+ {
621
+ "epoch": 1.6417910447761193,
622
+ "grad_norm": 0.3178314837454159,
623
+ "learning_rate": 2.5880167777793746e-05,
624
+ "loss": 0.5844,
625
+ "step": 440
626
+ },
627
+ {
628
+ "epoch": 1.6604477611940298,
629
+ "grad_norm": 0.3529301003062152,
630
+ "learning_rate": 2.5418535691207464e-05,
631
+ "loss": 0.5868,
632
+ "step": 445
633
+ },
634
+ {
635
+ "epoch": 1.6791044776119404,
636
+ "grad_norm": 0.34752849287500914,
637
+ "learning_rate": 2.49577857584178e-05,
638
+ "loss": 0.5782,
639
+ "step": 450
640
+ },
641
+ {
642
+ "epoch": 1.6977611940298507,
643
+ "grad_norm": 0.3230360844634613,
644
+ "learning_rate": 2.4498113251714936e-05,
645
+ "loss": 0.5873,
646
+ "step": 455
647
+ },
648
+ {
649
+ "epoch": 1.716417910447761,
650
+ "grad_norm": 0.32484263533625507,
651
+ "learning_rate": 2.4039712986760755e-05,
652
+ "loss": 0.5876,
653
+ "step": 460
654
+ },
655
+ {
656
+ "epoch": 1.7350746268656716,
657
+ "grad_norm": 0.33639950466603447,
658
+ "learning_rate": 2.35827792400232e-05,
659
+ "loss": 0.5872,
660
+ "step": 465
661
+ },
662
+ {
663
+ "epoch": 1.7537313432835822,
664
+ "grad_norm": 0.3416585845114808,
665
+ "learning_rate": 2.3127505666439243e-05,
666
+ "loss": 0.5853,
667
+ "step": 470
668
+ },
669
+ {
670
+ "epoch": 1.7723880597014925,
671
+ "grad_norm": 0.34586493676409896,
672
+ "learning_rate": 2.267408521734113e-05,
673
+ "loss": 0.5825,
674
+ "step": 475
675
+ },
676
+ {
677
+ "epoch": 1.7910447761194028,
678
+ "grad_norm": 0.3320503170217977,
679
+ "learning_rate": 2.2222710058680963e-05,
680
+ "loss": 0.5795,
681
+ "step": 480
682
+ },
683
+ {
684
+ "epoch": 1.8097014925373134,
685
+ "grad_norm": 0.3189202128526884,
686
+ "learning_rate": 2.1773571489588017e-05,
687
+ "loss": 0.5813,
688
+ "step": 485
689
+ },
690
+ {
691
+ "epoch": 1.828358208955224,
692
+ "grad_norm": 0.30999362090745874,
693
+ "learning_rate": 2.132685986129353e-05,
694
+ "loss": 0.582,
695
+ "step": 490
696
+ },
697
+ {
698
+ "epoch": 1.8470149253731343,
699
+ "grad_norm": 0.3208193396105105,
700
+ "learning_rate": 2.088276449645714e-05,
701
+ "loss": 0.5885,
702
+ "step": 495
703
+ },
704
+ {
705
+ "epoch": 1.8656716417910446,
706
+ "grad_norm": 0.33440700938533485,
707
+ "learning_rate": 2.0441473608929303e-05,
708
+ "loss": 0.5898,
709
+ "step": 500
710
+ },
711
+ {
712
+ "epoch": 1.8843283582089554,
713
+ "grad_norm": 0.3346425107943208,
714
+ "learning_rate": 2.0003174223983623e-05,
715
+ "loss": 0.5843,
716
+ "step": 505
717
+ },
718
+ {
719
+ "epoch": 1.9029850746268657,
720
+ "grad_norm": 0.3303009280184891,
721
+ "learning_rate": 1.9568052099052912e-05,
722
+ "loss": 0.5791,
723
+ "step": 510
724
+ },
725
+ {
726
+ "epoch": 1.921641791044776,
727
+ "grad_norm": 0.38141375892882895,
728
+ "learning_rate": 1.913629164500258e-05,
729
+ "loss": 0.5875,
730
+ "step": 515
731
+ },
732
+ {
733
+ "epoch": 1.9402985074626866,
734
+ "grad_norm": 0.36039154207921786,
735
+ "learning_rate": 1.8708075847974722e-05,
736
+ "loss": 0.5872,
737
+ "step": 520
738
+ },
739
+ {
740
+ "epoch": 1.9589552238805972,
741
+ "grad_norm": 0.3298358812601559,
742
+ "learning_rate": 1.8283586191836006e-05,
743
+ "loss": 0.5816,
744
+ "step": 525
745
+ },
746
+ {
747
+ "epoch": 1.9776119402985075,
748
+ "grad_norm": 0.34046592191940217,
749
+ "learning_rate": 1.786300258126231e-05,
750
+ "loss": 0.5874,
751
+ "step": 530
752
+ },
753
+ {
754
+ "epoch": 1.9962686567164178,
755
+ "grad_norm": 0.3195322344116163,
756
+ "learning_rate": 1.744650326549254e-05,
757
+ "loss": 0.5769,
758
+ "step": 535
759
+ },
760
+ {
761
+ "epoch": 2.014925373134328,
762
+ "grad_norm": 0.3204173767070315,
763
+ "learning_rate": 1.703426476278413e-05,
764
+ "loss": 0.5802,
765
+ "step": 540
766
+ },
767
+ {
768
+ "epoch": 2.033582089552239,
769
+ "grad_norm": 0.3154688814717341,
770
+ "learning_rate": 1.6626461785602114e-05,
771
+ "loss": 0.5778,
772
+ "step": 545
773
+ },
774
+ {
775
+ "epoch": 2.0522388059701493,
776
+ "grad_norm": 0.31510495279535944,
777
+ "learning_rate": 1.622326716657353e-05,
778
+ "loss": 0.5783,
779
+ "step": 550
780
+ },
781
+ {
782
+ "epoch": 2.0708955223880596,
783
+ "grad_norm": 0.31737263080458217,
784
+ "learning_rate": 1.582485178523856e-05,
785
+ "loss": 0.5751,
786
+ "step": 555
787
+ },
788
+ {
789
+ "epoch": 2.08955223880597,
790
+ "grad_norm": 0.33300368390424534,
791
+ "learning_rate": 1.5431384495629337e-05,
792
+ "loss": 0.5775,
793
+ "step": 560
794
+ },
795
+ {
796
+ "epoch": 2.1082089552238807,
797
+ "grad_norm": 0.32160995299568407,
798
+ "learning_rate": 1.504303205470723e-05,
799
+ "loss": 0.579,
800
+ "step": 565
801
+ },
802
+ {
803
+ "epoch": 2.126865671641791,
804
+ "grad_norm": 0.319837551567213,
805
+ "learning_rate": 1.4659959051688944e-05,
806
+ "loss": 0.5821,
807
+ "step": 570
808
+ },
809
+ {
810
+ "epoch": 2.1455223880597014,
811
+ "grad_norm": 0.3038052998476957,
812
+ "learning_rate": 1.4282327838291304e-05,
813
+ "loss": 0.5737,
814
+ "step": 575
815
+ },
816
+ {
817
+ "epoch": 2.1641791044776117,
818
+ "grad_norm": 0.33813938650371755,
819
+ "learning_rate": 1.391029845992426e-05,
820
+ "loss": 0.5782,
821
+ "step": 580
822
+ },
823
+ {
824
+ "epoch": 2.1828358208955225,
825
+ "grad_norm": 0.3225796594795952,
826
+ "learning_rate": 1.3544028587861441e-05,
827
+ "loss": 0.5782,
828
+ "step": 585
829
+ },
830
+ {
831
+ "epoch": 2.201492537313433,
832
+ "grad_norm": 0.3239207141788903,
833
+ "learning_rate": 1.3183673452416833e-05,
834
+ "loss": 0.5739,
835
+ "step": 590
836
+ },
837
+ {
838
+ "epoch": 2.220149253731343,
839
+ "grad_norm": 0.33531079821647236,
840
+ "learning_rate": 1.2829385777156036e-05,
841
+ "loss": 0.5728,
842
+ "step": 595
843
+ },
844
+ {
845
+ "epoch": 2.2388059701492535,
846
+ "grad_norm": 0.32137769720589743,
847
+ "learning_rate": 1.2481315714169812e-05,
848
+ "loss": 0.578,
849
+ "step": 600
850
+ },
851
+ {
852
+ "epoch": 2.2574626865671643,
853
+ "grad_norm": 0.34070696397399053,
854
+ "learning_rate": 1.2139610780437552e-05,
855
+ "loss": 0.5818,
856
+ "step": 605
857
+ },
858
+ {
859
+ "epoch": 2.2761194029850746,
860
+ "grad_norm": 0.3006860010554388,
861
+ "learning_rate": 1.1804415795307511e-05,
862
+ "loss": 0.5769,
863
+ "step": 610
864
+ },
865
+ {
866
+ "epoch": 2.294776119402985,
867
+ "grad_norm": 0.31337655622296023,
868
+ "learning_rate": 1.1475872819120328e-05,
869
+ "loss": 0.5776,
870
+ "step": 615
871
+ },
872
+ {
873
+ "epoch": 2.3134328358208958,
874
+ "grad_norm": 0.2981116652881978,
875
+ "learning_rate": 1.1154121093001874e-05,
876
+ "loss": 0.575,
877
+ "step": 620
878
+ },
879
+ {
880
+ "epoch": 2.332089552238806,
881
+ "grad_norm": 0.2938206447451833,
882
+ "learning_rate": 1.083929697985089e-05,
883
+ "loss": 0.564,
884
+ "step": 625
885
+ },
886
+ {
887
+ "epoch": 2.3507462686567164,
888
+ "grad_norm": 0.3213396976554827,
889
+ "learning_rate": 1.0531533906546454e-05,
890
+ "loss": 0.5782,
891
+ "step": 630
892
+ },
893
+ {
894
+ "epoch": 2.3694029850746268,
895
+ "grad_norm": 0.3430403927031693,
896
+ "learning_rate": 1.023096230739987e-05,
897
+ "loss": 0.5722,
898
+ "step": 635
899
+ },
900
+ {
901
+ "epoch": 2.388059701492537,
902
+ "grad_norm": 0.30576766014646795,
903
+ "learning_rate": 9.937709568874698e-06,
904
+ "loss": 0.5809,
905
+ "step": 640
906
+ },
907
+ {
908
+ "epoch": 2.406716417910448,
909
+ "grad_norm": 0.29816034162735167,
910
+ "learning_rate": 9.651899975598627e-06,
911
+ "loss": 0.5766,
912
+ "step": 645
913
+ },
914
+ {
915
+ "epoch": 2.425373134328358,
916
+ "grad_norm": 0.31152970177387024,
917
+ "learning_rate": 9.373654657689884e-06,
918
+ "loss": 0.5761,
919
+ "step": 650
920
+ },
921
+ {
922
+ "epoch": 2.4440298507462686,
923
+ "grad_norm": 0.30215892233066055,
924
+ "learning_rate": 9.103091539420603e-06,
925
+ "loss": 0.5746,
926
+ "step": 655
927
+ },
928
+ {
929
+ "epoch": 2.4626865671641793,
930
+ "grad_norm": 0.30791593132566586,
931
+ "learning_rate": 8.840325289238862e-06,
932
+ "loss": 0.5749,
933
+ "step": 660
934
+ },
935
+ {
936
+ "epoch": 2.4813432835820897,
937
+ "grad_norm": 0.29887643889544613,
938
+ "learning_rate": 8.585467271170572e-06,
939
+ "loss": 0.5777,
940
+ "step": 665
941
+ },
942
+ {
943
+ "epoch": 2.5,
944
+ "grad_norm": 0.29735670437763606,
945
+ "learning_rate": 8.338625497621846e-06,
946
+ "loss": 0.5749,
947
+ "step": 670
948
+ },
949
+ {
950
+ "epoch": 2.5186567164179103,
951
+ "grad_norm": 0.3079893045559778,
952
+ "learning_rate": 8.099904583601826e-06,
953
+ "loss": 0.5775,
954
+ "step": 675
955
+ },
956
+ {
957
+ "epoch": 2.5373134328358207,
958
+ "grad_norm": 0.3019628677258252,
959
+ "learning_rate": 7.869405702385388e-06,
960
+ "loss": 0.5731,
961
+ "step": 680
962
+ },
963
+ {
964
+ "epoch": 2.5559701492537314,
965
+ "grad_norm": 0.30551348308932164,
966
+ "learning_rate": 7.647226542634454e-06,
967
+ "loss": 0.5786,
968
+ "step": 685
969
+ },
970
+ {
971
+ "epoch": 2.574626865671642,
972
+ "grad_norm": 0.28720419068403863,
973
+ "learning_rate": 7.433461266996197e-06,
974
+ "loss": 0.5744,
975
+ "step": 690
976
+ },
977
+ {
978
+ "epoch": 2.593283582089552,
979
+ "grad_norm": 0.2923821314628744,
980
+ "learning_rate": 7.228200472195573e-06,
981
+ "loss": 0.5747,
982
+ "step": 695
983
+ },
984
+ {
985
+ "epoch": 2.611940298507463,
986
+ "grad_norm": 0.3050845250930222,
987
+ "learning_rate": 7.031531150639156e-06,
988
+ "loss": 0.5767,
989
+ "step": 700
990
+ },
991
+ {
992
+ "epoch": 2.6305970149253732,
993
+ "grad_norm": 0.28662100492424564,
994
+ "learning_rate": 6.843536653546554e-06,
995
+ "loss": 0.5723,
996
+ "step": 705
997
+ },
998
+ {
999
+ "epoch": 2.6492537313432836,
1000
+ "grad_norm": 0.28381082471561775,
1001
+ "learning_rate": 6.664296655624957e-06,
1002
+ "loss": 0.5765,
1003
+ "step": 710
1004
+ },
1005
+ {
1006
+ "epoch": 2.667910447761194,
1007
+ "grad_norm": 0.3054028997442964,
1008
+ "learning_rate": 6.49388712130192e-06,
1009
+ "loss": 0.581,
1010
+ "step": 715
1011
+ },
1012
+ {
1013
+ "epoch": 2.6865671641791042,
1014
+ "grad_norm": 0.29433829621662605,
1015
+ "learning_rate": 6.332380272530536e-06,
1016
+ "loss": 0.5785,
1017
+ "step": 720
1018
+ },
1019
+ {
1020
+ "epoch": 2.705223880597015,
1021
+ "grad_norm": 0.295224905137171,
1022
+ "learning_rate": 6.17984455818078e-06,
1023
+ "loss": 0.5688,
1024
+ "step": 725
1025
+ },
1026
+ {
1027
+ "epoch": 2.7238805970149254,
1028
+ "grad_norm": 0.2869765774620476,
1029
+ "learning_rate": 6.036344625029903e-06,
1030
+ "loss": 0.5729,
1031
+ "step": 730
1032
+ },
1033
+ {
1034
+ "epoch": 2.7425373134328357,
1035
+ "grad_norm": 0.31435646418449753,
1036
+ "learning_rate": 5.901941290364234e-06,
1037
+ "loss": 0.5735,
1038
+ "step": 735
1039
+ },
1040
+ {
1041
+ "epoch": 2.7611940298507465,
1042
+ "grad_norm": 0.2968674038357889,
1043
+ "learning_rate": 5.776691516203942e-06,
1044
+ "loss": 0.5786,
1045
+ "step": 740
1046
+ },
1047
+ {
1048
+ "epoch": 2.779850746268657,
1049
+ "grad_norm": 0.3033607414001532,
1050
+ "learning_rate": 5.660648385161759e-06,
1051
+ "loss": 0.5765,
1052
+ "step": 745
1053
+ },
1054
+ {
1055
+ "epoch": 2.798507462686567,
1056
+ "grad_norm": 0.2954603289858981,
1057
+ "learning_rate": 5.5538610779457975e-06,
1058
+ "loss": 0.5753,
1059
+ "step": 750
1060
+ },
1061
+ {
1062
+ "epoch": 2.8171641791044775,
1063
+ "grad_norm": 0.2826973145479748,
1064
+ "learning_rate": 5.456374852516083e-06,
1065
+ "loss": 0.5765,
1066
+ "step": 755
1067
+ },
1068
+ {
1069
+ "epoch": 2.835820895522388,
1070
+ "grad_norm": 0.29096478354623156,
1071
+ "learning_rate": 5.368231024903606e-06,
1072
+ "loss": 0.5729,
1073
+ "step": 760
1074
+ },
1075
+ {
1076
+ "epoch": 2.8544776119402986,
1077
+ "grad_norm": 0.2985469166556133,
1078
+ "learning_rate": 5.289466951699997e-06,
1079
+ "loss": 0.5765,
1080
+ "step": 765
1081
+ },
1082
+ {
1083
+ "epoch": 2.873134328358209,
1084
+ "grad_norm": 0.2830027158917109,
1085
+ "learning_rate": 5.2201160142252795e-06,
1086
+ "loss": 0.5775,
1087
+ "step": 770
1088
+ },
1089
+ {
1090
+ "epoch": 2.8917910447761193,
1091
+ "grad_norm": 0.28578279002331924,
1092
+ "learning_rate": 5.1602076043804036e-06,
1093
+ "loss": 0.5715,
1094
+ "step": 775
1095
+ },
1096
+ {
1097
+ "epoch": 2.91044776119403,
1098
+ "grad_norm": 0.28729246907339345,
1099
+ "learning_rate": 5.1097671121905425e-06,
1100
+ "loss": 0.5776,
1101
+ "step": 780
1102
+ },
1103
+ {
1104
+ "epoch": 2.9291044776119404,
1105
+ "grad_norm": 0.28782792746123514,
1106
+ "learning_rate": 5.0688159150444395e-06,
1107
+ "loss": 0.5699,
1108
+ "step": 785
1109
+ },
1110
+ {
1111
+ "epoch": 2.9477611940298507,
1112
+ "grad_norm": 0.28496009364917135,
1113
+ "learning_rate": 5.0373713686343774e-06,
1114
+ "loss": 0.5807,
1115
+ "step": 790
1116
+ },
1117
+ {
1118
+ "epoch": 2.966417910447761,
1119
+ "grad_norm": 0.29694942599247387,
1120
+ "learning_rate": 5.015446799600588e-06,
1121
+ "loss": 0.5726,
1122
+ "step": 795
1123
+ },
1124
+ {
1125
+ "epoch": 2.9850746268656714,
1126
+ "grad_norm": 0.2882907454258871,
1127
+ "learning_rate": 5.003051499883236e-06,
1128
+ "loss": 0.5826,
1129
+ "step": 800
1130
+ },
1131
+ {
1132
+ "epoch": 3.0,
1133
+ "step": 804,
1134
+ "total_flos": 1.3060333048430592e+16,
1135
+ "train_loss": 0.6017354608175174,
1136
+ "train_runtime": 40505.6177,
1137
+ "train_samples_per_second": 2.54,
1138
+ "train_steps_per_second": 0.02
1139
+ }
1140
+ ],
1141
+ "logging_steps": 5,
1142
+ "max_steps": 804,
1143
+ "num_input_tokens_seen": 0,
1144
+ "num_train_epochs": 3,
1145
+ "save_steps": 500,
1146
+ "stateful_callbacks": {
1147
+ "TrainerControl": {
1148
+ "args": {
1149
+ "should_epoch_stop": false,
1150
+ "should_evaluate": false,
1151
+ "should_log": false,
1152
+ "should_save": true,
1153
+ "should_training_stop": true
1154
+ },
1155
+ "attributes": {}
1156
+ }
1157
+ },
1158
+ "total_flos": 1.3060333048430592e+16,
1159
+ "train_batch_size": 16,
1160
+ "trial_name": null,
1161
+ "trial_params": null
1162
+ }