Model save

Browse files

Files changed (6) hide show

README.md +58 -0
all_results.json +9 -0
config.json +1 -1
generation_config.json +14 -0
train_results.json +9 -0
trainer_state.json +2002 -0

README.md ADDED Viewed

	@@ -0,0 +1,58 @@

+---
+base_model: Qwen/Qwen2.5-1.5B-Instruct
+library_name: transformers
+model_name: Qwen2.5-Argunaut-1-1.5B-SFT-dev0
+tags:
+- generated_from_trainer
+- trl
+- sft
+licence: license
+---
+# Model Card for Qwen2.5-Argunaut-1-1.5B-SFT-dev0
+This model is a fine-tuned version of [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="DebateLabKIT/Qwen2.5-Argunaut-1-1.5B-SFT-dev0", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/ggbetz/argunauts-training/runs/sai3aev5)
+This model was trained with supervised fine-tuning (SFT).
+### Framework versions
+- TRL: 0.14.0
+- Transformers: 4.46.3
+- PyTorch: 2.4.1
+- Datasets: 3.1.0
+- Tokenizers: 0.20.3
+## Citations
+Cite TRL as:
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 1.0,
+    "total_flos": 1267133463920640.0,
+    "train_loss": 0.6670485469077131,
+    "train_runtime": 18244.3099,
+    "train_samples": 1000000,
+    "train_samples_per_second": 9.746,
+    "train_steps_per_second": 0.076
+}

config.json CHANGED Viewed

@@ -23,7 +23,7 @@
   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.46.3",
-  "use_cache": false,
   "use_sliding_window": false,
   "vocab_size": 151936
 }

   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.46.3",
+  "use_cache": true,
   "use_sliding_window": false,
   "vocab_size": 151936
 }

generation_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.1,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "4.46.3"
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 1.0,
+    "total_flos": 1267133463920640.0,
+    "train_loss": 0.6670485469077131,
+    "train_runtime": 18244.3099,
+    "train_samples": 1000000,
+    "train_samples_per_second": 9.746,
+    "train_steps_per_second": 0.076
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2002 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1390,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0035971223021582736,
+      "grad_norm": 5.020955964884568,
+      "learning_rate": 1.7985611510791368e-07,
+      "loss": 1.1944,
+      "step": 5
+    },
+    {
+      "epoch": 0.007194244604316547,
+      "grad_norm": 4.938024747588621,
+      "learning_rate": 3.5971223021582736e-07,
+      "loss": 1.213,
+      "step": 10
+    },
+    {
+      "epoch": 0.01079136690647482,
+      "grad_norm": 4.729140166656345,
+      "learning_rate": 5.39568345323741e-07,
+      "loss": 1.2154,
+      "step": 15
+    },
+    {
+      "epoch": 0.014388489208633094,
+      "grad_norm": 4.486827019767407,
+      "learning_rate": 7.194244604316547e-07,
+      "loss": 1.1966,
+      "step": 20
+    },
+    {
+      "epoch": 0.017985611510791366,
+      "grad_norm": 2.8354701196618293,
+      "learning_rate": 8.992805755395684e-07,
+      "loss": 1.1617,
+      "step": 25
+    },
+    {
+      "epoch": 0.02158273381294964,
+      "grad_norm": 1.502509909859451,
+      "learning_rate": 1.079136690647482e-06,
+      "loss": 1.1554,
+      "step": 30
+    },
+    {
+      "epoch": 0.025179856115107913,
+      "grad_norm": 0.8955286838566213,
+      "learning_rate": 1.2589928057553958e-06,
+      "loss": 1.103,
+      "step": 35
+    },
+    {
+      "epoch": 0.02877697841726619,
+      "grad_norm": 0.7456236862794698,
+      "learning_rate": 1.4388489208633094e-06,
+      "loss": 1.1106,
+      "step": 40
+    },
+    {
+      "epoch": 0.03237410071942446,
+      "grad_norm": 0.7017287601950278,
+      "learning_rate": 1.618705035971223e-06,
+      "loss": 1.0751,
+      "step": 45
+    },
+    {
+      "epoch": 0.03597122302158273,
+      "grad_norm": 0.5721443296844856,
+      "learning_rate": 1.7985611510791368e-06,
+      "loss": 1.0696,
+      "step": 50
+    },
+    {
+      "epoch": 0.039568345323741004,
+      "grad_norm": 0.4464262507331005,
+      "learning_rate": 1.9784172661870504e-06,
+      "loss": 1.0304,
+      "step": 55
+    },
+    {
+      "epoch": 0.04316546762589928,
+      "grad_norm": 0.4092857312926078,
+      "learning_rate": 2.158273381294964e-06,
+      "loss": 1.0121,
+      "step": 60
+    },
+    {
+      "epoch": 0.046762589928057555,
+      "grad_norm": 0.3578204568994334,
+      "learning_rate": 2.3381294964028776e-06,
+      "loss": 0.9918,
+      "step": 65
+    },
+    {
+      "epoch": 0.050359712230215826,
+      "grad_norm": 0.343624490661988,
+      "learning_rate": 2.5179856115107916e-06,
+      "loss": 0.9676,
+      "step": 70
+    },
+    {
+      "epoch": 0.0539568345323741,
+      "grad_norm": 0.3052531774113653,
+      "learning_rate": 2.6978417266187052e-06,
+      "loss": 0.9529,
+      "step": 75
+    },
+    {
+      "epoch": 0.05755395683453238,
+      "grad_norm": 0.29429542067049574,
+      "learning_rate": 2.877697841726619e-06,
+      "loss": 0.9197,
+      "step": 80
+    },
+    {
+      "epoch": 0.06115107913669065,
+      "grad_norm": 0.27942488562810386,
+      "learning_rate": 3.0575539568345324e-06,
+      "loss": 0.8649,
+      "step": 85
+    },
+    {
+      "epoch": 0.06474820143884892,
+      "grad_norm": 0.2754960077654889,
+      "learning_rate": 3.237410071942446e-06,
+      "loss": 0.8803,
+      "step": 90
+    },
+    {
+      "epoch": 0.0683453237410072,
+      "grad_norm": 0.2637478480095364,
+      "learning_rate": 3.4172661870503596e-06,
+      "loss": 0.833,
+      "step": 95
+    },
+    {
+      "epoch": 0.07194244604316546,
+      "grad_norm": 0.2588869176654825,
+      "learning_rate": 3.5971223021582737e-06,
+      "loss": 0.824,
+      "step": 100
+    },
+    {
+      "epoch": 0.07553956834532374,
+      "grad_norm": 0.25809976308965515,
+      "learning_rate": 3.7769784172661873e-06,
+      "loss": 0.8134,
+      "step": 105
+    },
+    {
+      "epoch": 0.07913669064748201,
+      "grad_norm": 0.2280622185227855,
+      "learning_rate": 3.956834532374101e-06,
+      "loss": 0.8173,
+      "step": 110
+    },
+    {
+      "epoch": 0.08273381294964029,
+      "grad_norm": 0.23779953523720807,
+      "learning_rate": 4.1366906474820145e-06,
+      "loss": 0.7979,
+      "step": 115
+    },
+    {
+      "epoch": 0.08633093525179857,
+      "grad_norm": 0.22506106711683146,
+      "learning_rate": 4.316546762589928e-06,
+      "loss": 0.7681,
+      "step": 120
+    },
+    {
+      "epoch": 0.08992805755395683,
+      "grad_norm": 0.2175874711615906,
+      "learning_rate": 4.496402877697842e-06,
+      "loss": 0.7916,
+      "step": 125
+    },
+    {
+      "epoch": 0.09352517985611511,
+      "grad_norm": 0.2118293459237304,
+      "learning_rate": 4.676258992805755e-06,
+      "loss": 0.7798,
+      "step": 130
+    },
+    {
+      "epoch": 0.09712230215827339,
+      "grad_norm": 0.22010121805042218,
+      "learning_rate": 4.856115107913669e-06,
+      "loss": 0.7218,
+      "step": 135
+    },
+    {
+      "epoch": 0.10071942446043165,
+      "grad_norm": 0.20337355284622044,
+      "learning_rate": 4.999992116938572e-06,
+      "loss": 0.7329,
+      "step": 140
+    },
+    {
+      "epoch": 0.10431654676258993,
+      "grad_norm": 0.19294611128703623,
+      "learning_rate": 4.999716215008542e-06,
+      "loss": 0.7343,
+      "step": 145
+    },
+    {
+      "epoch": 0.1079136690647482,
+      "grad_norm": 0.18569152586574716,
+      "learning_rate": 4.999046209719832e-06,
+      "loss": 0.7078,
+      "step": 150
+    },
+    {
+      "epoch": 0.11151079136690648,
+      "grad_norm": 0.19184024245666095,
+      "learning_rate": 4.997982206704965e-06,
+      "loss": 0.7279,
+      "step": 155
+    },
+    {
+      "epoch": 0.11510791366906475,
+      "grad_norm": 0.1943879874060985,
+      "learning_rate": 4.996524373713848e-06,
+      "loss": 0.735,
+      "step": 160
+    },
+    {
+      "epoch": 0.11870503597122302,
+      "grad_norm": 0.188675637921693,
+      "learning_rate": 4.994672940587324e-06,
+      "loss": 0.7113,
+      "step": 165
+    },
+    {
+      "epoch": 0.1223021582733813,
+      "grad_norm": 0.18883128374108282,
+      "learning_rate": 4.992428199220931e-06,
+      "loss": 0.718,
+      "step": 170
+    },
+    {
+      "epoch": 0.12589928057553956,
+      "grad_norm": 0.18714736466453813,
+      "learning_rate": 4.989790503518888e-06,
+      "loss": 0.7153,
+      "step": 175
+    },
+    {
+      "epoch": 0.12949640287769784,
+      "grad_norm": 0.18374854258433984,
+      "learning_rate": 4.986760269338294e-06,
+      "loss": 0.7244,
+      "step": 180
+    },
+    {
+      "epoch": 0.13309352517985612,
+      "grad_norm": 0.18743762170823544,
+      "learning_rate": 4.983337974423567e-06,
+      "loss": 0.7239,
+      "step": 185
+    },
+    {
+      "epoch": 0.1366906474820144,
+      "grad_norm": 0.1849033708626242,
+      "learning_rate": 4.979524158331123e-06,
+      "loss": 0.7115,
+      "step": 190
+    },
+    {
+      "epoch": 0.14028776978417265,
+      "grad_norm": 0.18641479765213534,
+      "learning_rate": 4.975319422344308e-06,
+      "loss": 0.705,
+      "step": 195
+    },
+    {
+      "epoch": 0.14388489208633093,
+      "grad_norm": 0.18977732323761207,
+      "learning_rate": 4.970724429378602e-06,
+      "loss": 0.7008,
+      "step": 200
+    },
+    {
+      "epoch": 0.1474820143884892,
+      "grad_norm": 0.18827613594876644,
+      "learning_rate": 4.9657399038771045e-06,
+      "loss": 0.6916,
+      "step": 205
+    },
+    {
+      "epoch": 0.1510791366906475,
+      "grad_norm": 0.1770962322034755,
+      "learning_rate": 4.960366631696317e-06,
+      "loss": 0.6861,
+      "step": 210
+    },
+    {
+      "epoch": 0.15467625899280577,
+      "grad_norm": 0.18186648674203168,
+      "learning_rate": 4.954605459982248e-06,
+      "loss": 0.6758,
+      "step": 215
+    },
+    {
+      "epoch": 0.15827338129496402,
+      "grad_norm": 0.1766656970929977,
+      "learning_rate": 4.9484572970368516e-06,
+      "loss": 0.6944,
+      "step": 220
+    },
+    {
+      "epoch": 0.1618705035971223,
+      "grad_norm": 0.18242864024285693,
+      "learning_rate": 4.941923112174826e-06,
+      "loss": 0.703,
+      "step": 225
+    },
+    {
+      "epoch": 0.16546762589928057,
+      "grad_norm": 0.18130495443681138,
+      "learning_rate": 4.935003935570789e-06,
+      "loss": 0.6894,
+      "step": 230
+    },
+    {
+      "epoch": 0.16906474820143885,
+      "grad_norm": 0.17185494495136486,
+      "learning_rate": 4.9277008580968665e-06,
+      "loss": 0.6786,
+      "step": 235
+    },
+    {
+      "epoch": 0.17266187050359713,
+      "grad_norm": 0.17745476735656734,
+      "learning_rate": 4.920015031150702e-06,
+      "loss": 0.6885,
+      "step": 240
+    },
+    {
+      "epoch": 0.17625899280575538,
+      "grad_norm": 0.17746918416706206,
+      "learning_rate": 4.911947666473932e-06,
+      "loss": 0.6566,
+      "step": 245
+    },
+    {
+      "epoch": 0.17985611510791366,
+      "grad_norm": 0.17890622279080073,
+      "learning_rate": 4.903500035961139e-06,
+      "loss": 0.6607,
+      "step": 250
+    },
+    {
+      "epoch": 0.18345323741007194,
+      "grad_norm": 0.17911194474517148,
+      "learning_rate": 4.894673471459331e-06,
+      "loss": 0.6955,
+      "step": 255
+    },
+    {
+      "epoch": 0.18705035971223022,
+      "grad_norm": 0.17868419774364908,
+      "learning_rate": 4.885469364557956e-06,
+      "loss": 0.6888,
+      "step": 260
+    },
+    {
+      "epoch": 0.1906474820143885,
+      "grad_norm": 0.18451377818125733,
+      "learning_rate": 4.8758891663695165e-06,
+      "loss": 0.6595,
+      "step": 265
+    },
+    {
+      "epoch": 0.19424460431654678,
+      "grad_norm": 0.17946177717246753,
+      "learning_rate": 4.865934387300776e-06,
+      "loss": 0.684,
+      "step": 270
+    },
+    {
+      "epoch": 0.19784172661870503,
+      "grad_norm": 0.19613942764348846,
+      "learning_rate": 4.8556065968146385e-06,
+      "loss": 0.6819,
+      "step": 275
+    },
+    {
+      "epoch": 0.2014388489208633,
+      "grad_norm": 0.17900557510693368,
+      "learning_rate": 4.844907423182699e-06,
+      "loss": 0.6779,
+      "step": 280
+    },
+    {
+      "epoch": 0.20503597122302158,
+      "grad_norm": 0.17655099392118984,
+      "learning_rate": 4.833838553228547e-06,
+      "loss": 0.6598,
+      "step": 285
+    },
+    {
+      "epoch": 0.20863309352517986,
+      "grad_norm": 0.17383906870606797,
+      "learning_rate": 4.822401732061802e-06,
+      "loss": 0.6707,
+      "step": 290
+    },
+    {
+      "epoch": 0.21223021582733814,
+      "grad_norm": 0.18068285893162356,
+      "learning_rate": 4.810598762803e-06,
+      "loss": 0.6699,
+      "step": 295
+    },
+    {
+      "epoch": 0.2158273381294964,
+      "grad_norm": 0.17183382610494244,
+      "learning_rate": 4.798431506299303e-06,
+      "loss": 0.6578,
+      "step": 300
+    },
+    {
+      "epoch": 0.21942446043165467,
+      "grad_norm": 0.18275661567359278,
+      "learning_rate": 4.785901880831124e-06,
+      "loss": 0.627,
+      "step": 305
+    },
+    {
+      "epoch": 0.22302158273381295,
+      "grad_norm": 0.17659347495851926,
+      "learning_rate": 4.773011861809694e-06,
+      "loss": 0.6695,
+      "step": 310
+    },
+    {
+      "epoch": 0.22661870503597123,
+      "grad_norm": 0.17029323062013704,
+      "learning_rate": 4.759763481465611e-06,
+      "loss": 0.6686,
+      "step": 315
+    },
+    {
+      "epoch": 0.2302158273381295,
+      "grad_norm": 0.17470042602053984,
+      "learning_rate": 4.746158828528457e-06,
+      "loss": 0.6387,
+      "step": 320
+    },
+    {
+      "epoch": 0.23381294964028776,
+      "grad_norm": 0.1717697065536718,
+      "learning_rate": 4.73220004789747e-06,
+      "loss": 0.6454,
+      "step": 325
+    },
+    {
+      "epoch": 0.23741007194244604,
+      "grad_norm": 0.17271020357114406,
+      "learning_rate": 4.717889340303399e-06,
+      "loss": 0.6409,
+      "step": 330
+    },
+    {
+      "epoch": 0.24100719424460432,
+      "grad_norm": 0.17563690998151346,
+      "learning_rate": 4.703228961961524e-06,
+      "loss": 0.654,
+      "step": 335
+    },
+    {
+      "epoch": 0.2446043165467626,
+      "grad_norm": 0.17294921658141785,
+      "learning_rate": 4.6882212242159555e-06,
+      "loss": 0.6427,
+      "step": 340
+    },
+    {
+      "epoch": 0.24820143884892087,
+      "grad_norm": 0.17486817689111733,
+      "learning_rate": 4.672868493175219e-06,
+      "loss": 0.648,
+      "step": 345
+    },
+    {
+      "epoch": 0.2517985611510791,
+      "grad_norm": 0.17267103530941283,
+      "learning_rate": 4.657173189339222e-06,
+      "loss": 0.643,
+      "step": 350
+    },
+    {
+      "epoch": 0.25539568345323743,
+      "grad_norm": 0.17335584700693588,
+      "learning_rate": 4.64113778721764e-06,
+      "loss": 0.6486,
+      "step": 355
+    },
+    {
+      "epoch": 0.2589928057553957,
+      "grad_norm": 0.16915911050050117,
+      "learning_rate": 4.624764814939785e-06,
+      "loss": 0.6618,
+      "step": 360
+    },
+    {
+      "epoch": 0.26258992805755393,
+      "grad_norm": 0.1718246313112584,
+      "learning_rate": 4.608056853856021e-06,
+      "loss": 0.6727,
+      "step": 365
+    },
+    {
+      "epoch": 0.26618705035971224,
+      "grad_norm": 0.17570002762242526,
+      "learning_rate": 4.591016538130796e-06,
+      "loss": 0.6409,
+      "step": 370
+    },
+    {
+      "epoch": 0.2697841726618705,
+      "grad_norm": 0.17190700381660223,
+      "learning_rate": 4.573646554327336e-06,
+      "loss": 0.6429,
+      "step": 375
+    },
+    {
+      "epoch": 0.2733812949640288,
+      "grad_norm": 0.1710399115293668,
+      "learning_rate": 4.555949640984087e-06,
+      "loss": 0.633,
+      "step": 380
+    },
+    {
+      "epoch": 0.27697841726618705,
+      "grad_norm": 0.17294539558410943,
+      "learning_rate": 4.537928588182955e-06,
+      "loss": 0.6506,
+      "step": 385
+    },
+    {
+      "epoch": 0.2805755395683453,
+      "grad_norm": 0.17037231572109038,
+      "learning_rate": 4.519586237109431e-06,
+      "loss": 0.6518,
+      "step": 390
+    },
+    {
+      "epoch": 0.2841726618705036,
+      "grad_norm": 0.18232442328302445,
+      "learning_rate": 4.500925479604645e-06,
+      "loss": 0.6303,
+      "step": 395
+    },
+    {
+      "epoch": 0.28776978417266186,
+      "grad_norm": 0.1735924546035651,
+      "learning_rate": 4.481949257709442e-06,
+      "loss": 0.6471,
+      "step": 400
+    },
+    {
+      "epoch": 0.29136690647482016,
+      "grad_norm": 0.1785525245588453,
+      "learning_rate": 4.462660563200545e-06,
+      "loss": 0.664,
+      "step": 405
+    },
+    {
+      "epoch": 0.2949640287769784,
+      "grad_norm": 0.17152892007141038,
+      "learning_rate": 4.44306243711887e-06,
+      "loss": 0.651,
+      "step": 410
+    },
+    {
+      "epoch": 0.29856115107913667,
+      "grad_norm": 0.18187119520437112,
+      "learning_rate": 4.423157969290081e-06,
+      "loss": 0.6772,
+      "step": 415
+    },
+    {
+      "epoch": 0.302158273381295,
+      "grad_norm": 0.17859792680665468,
+      "learning_rate": 4.402950297837449e-06,
+      "loss": 0.6814,
+      "step": 420
+    },
+    {
+      "epoch": 0.3057553956834532,
+      "grad_norm": 0.1663311727852307,
+      "learning_rate": 4.382442608687097e-06,
+      "loss": 0.6315,
+      "step": 425
+    },
+    {
+      "epoch": 0.30935251798561153,
+      "grad_norm": 0.1701214320556193,
+      "learning_rate": 4.361638135065711e-06,
+      "loss": 0.6539,
+      "step": 430
+    },
+    {
+      "epoch": 0.3129496402877698,
+      "grad_norm": 0.17523250297062218,
+      "learning_rate": 4.34054015699079e-06,
+      "loss": 0.6667,
+      "step": 435
+    },
+    {
+      "epoch": 0.31654676258992803,
+      "grad_norm": 0.19076203006305673,
+      "learning_rate": 4.3191520007535235e-06,
+      "loss": 0.6398,
+      "step": 440
+    },
+    {
+      "epoch": 0.32014388489208634,
+      "grad_norm": 0.17133961624985056,
+      "learning_rate": 4.297477038394368e-06,
+      "loss": 0.6535,
+      "step": 445
+    },
+    {
+      "epoch": 0.3237410071942446,
+      "grad_norm": 0.16928107408174564,
+      "learning_rate": 4.275518687171418e-06,
+      "loss": 0.6531,
+      "step": 450
+    },
+    {
+      "epoch": 0.3273381294964029,
+      "grad_norm": 0.16431975083654335,
+      "learning_rate": 4.2532804090216374e-06,
+      "loss": 0.6542,
+      "step": 455
+    },
+    {
+      "epoch": 0.33093525179856115,
+      "grad_norm": 0.1720650129283096,
+      "learning_rate": 4.230765710015058e-06,
+      "loss": 0.6582,
+      "step": 460
+    },
+    {
+      "epoch": 0.3345323741007194,
+      "grad_norm": 0.16337467117780707,
+      "learning_rate": 4.2079781398020155e-06,
+      "loss": 0.6221,
+      "step": 465
+    },
+    {
+      "epoch": 0.3381294964028777,
+      "grad_norm": 0.17958123058890538,
+      "learning_rate": 4.184921291053511e-06,
+      "loss": 0.6578,
+      "step": 470
+    },
+    {
+      "epoch": 0.34172661870503596,
+      "grad_norm": 0.17044537787660088,
+      "learning_rate": 4.161598798894795e-06,
+      "loss": 0.6591,
+      "step": 475
+    },
+    {
+      "epoch": 0.34532374100719426,
+      "grad_norm": 0.162676186545111,
+      "learning_rate": 4.1380143403322546e-06,
+      "loss": 0.6384,
+      "step": 480
+    },
+    {
+      "epoch": 0.3489208633093525,
+      "grad_norm": 0.1694294650638159,
+      "learning_rate": 4.114171633673705e-06,
+      "loss": 0.6238,
+      "step": 485
+    },
+    {
+      "epoch": 0.35251798561151076,
+      "grad_norm": 0.17043437687962124,
+      "learning_rate": 4.090074437942155e-06,
+      "loss": 0.6362,
+      "step": 490
+    },
+    {
+      "epoch": 0.35611510791366907,
+      "grad_norm": 0.17325913850748673,
+      "learning_rate": 4.065726552283173e-06,
+      "loss": 0.6567,
+      "step": 495
+    },
+    {
+      "epoch": 0.3597122302158273,
+      "grad_norm": 0.19654711855055748,
+      "learning_rate": 4.0411318153659056e-06,
+      "loss": 0.6605,
+      "step": 500
+    },
+    {
+      "epoch": 0.3597122302158273,
+      "eval_runtime": 14.2753,
+      "eval_samples_per_second": 70.051,
+      "eval_steps_per_second": 2.242,
+      "step": 500
+    },
+    {
+      "epoch": 0.36330935251798563,
+      "grad_norm": 0.181643640264244,
+      "learning_rate": 4.016294104777883e-06,
+      "loss": 0.6593,
+      "step": 505
+    },
+    {
+      "epoch": 0.3669064748201439,
+      "grad_norm": 0.17145091906098897,
+      "learning_rate": 3.99121733641368e-06,
+      "loss": 0.6434,
+      "step": 510
+    },
+    {
+      "epoch": 0.37050359712230213,
+      "grad_norm": 0.18571422852307243,
+      "learning_rate": 3.96590546385754e-06,
+      "loss": 0.6351,
+      "step": 515
+    },
+    {
+      "epoch": 0.37410071942446044,
+      "grad_norm": 0.1655963338752092,
+      "learning_rate": 3.9403624777600526e-06,
+      "loss": 0.6241,
+      "step": 520
+    },
+    {
+      "epoch": 0.3776978417266187,
+      "grad_norm": 0.21928102436797317,
+      "learning_rate": 3.914592405208993e-06,
+      "loss": 0.623,
+      "step": 525
+    },
+    {
+      "epoch": 0.381294964028777,
+      "grad_norm": 0.17321888081042625,
+      "learning_rate": 3.888599309094413e-06,
+      "loss": 0.6289,
+      "step": 530
+    },
+    {
+      "epoch": 0.38489208633093525,
+      "grad_norm": 0.17012813283361522,
+      "learning_rate": 3.862387287468095e-06,
+      "loss": 0.6268,
+      "step": 535
+    },
+    {
+      "epoch": 0.38848920863309355,
+      "grad_norm": 0.1736732174250609,
+      "learning_rate": 3.835960472897444e-06,
+      "loss": 0.638,
+      "step": 540
+    },
+    {
+      "epoch": 0.3920863309352518,
+      "grad_norm": 0.23704552857732844,
+      "learning_rate": 3.809323031813963e-06,
+      "loss": 0.6244,
+      "step": 545
+    },
+    {
+      "epoch": 0.39568345323741005,
+      "grad_norm": 0.16584229684045446,
+      "learning_rate": 3.7824791638563674e-06,
+      "loss": 0.6195,
+      "step": 550
+    },
+    {
+      "epoch": 0.39928057553956836,
+      "grad_norm": 0.18088337678779626,
+      "learning_rate": 3.75543310120848e-06,
+      "loss": 0.6216,
+      "step": 555
+    },
+    {
+      "epoch": 0.4028776978417266,
+      "grad_norm": 0.1670474782849822,
+      "learning_rate": 3.728189107931981e-06,
+      "loss": 0.629,
+      "step": 560
+    },
+    {
+      "epoch": 0.4064748201438849,
+      "grad_norm": 0.16881903430684905,
+      "learning_rate": 3.7007514792941462e-06,
+      "loss": 0.6159,
+      "step": 565
+    },
+    {
+      "epoch": 0.41007194244604317,
+      "grad_norm": 0.166561835843837,
+      "learning_rate": 3.6731245410906537e-06,
+      "loss": 0.6182,
+      "step": 570
+    },
+    {
+      "epoch": 0.4136690647482014,
+      "grad_norm": 0.17513847235031135,
+      "learning_rate": 3.6453126489635845e-06,
+      "loss": 0.6469,
+      "step": 575
+    },
+    {
+      "epoch": 0.4172661870503597,
+      "grad_norm": 0.17220800442112816,
+      "learning_rate": 3.6173201877147134e-06,
+      "loss": 0.6342,
+      "step": 580
+    },
+    {
+      "epoch": 0.420863309352518,
+      "grad_norm": 0.167483163354845,
+      "learning_rate": 3.5891515706142083e-06,
+      "loss": 0.6248,
+      "step": 585
+    },
+    {
+      "epoch": 0.4244604316546763,
+      "grad_norm": 0.16391025566890444,
+      "learning_rate": 3.560811238704832e-06,
+      "loss": 0.6285,
+      "step": 590
+    },
+    {
+      "epoch": 0.42805755395683454,
+      "grad_norm": 0.17954405708333362,
+      "learning_rate": 3.532303660101776e-06,
+      "loss": 0.6653,
+      "step": 595
+    },
+    {
+      "epoch": 0.4316546762589928,
+      "grad_norm": 0.1748136517289543,
+      "learning_rate": 3.503633329288215e-06,
+      "loss": 0.61,
+      "step": 600
+    },
+    {
+      "epoch": 0.4352517985611511,
+      "grad_norm": 0.16312734641366394,
+      "learning_rate": 3.474804766406718e-06,
+      "loss": 0.6507,
+      "step": 605
+    },
+    {
+      "epoch": 0.43884892086330934,
+      "grad_norm": 0.16678530089886806,
+      "learning_rate": 3.445822516546598e-06,
+      "loss": 0.6391,
+      "step": 610
+    },
+    {
+      "epoch": 0.44244604316546765,
+      "grad_norm": 0.16772897874751536,
+      "learning_rate": 3.416691149027341e-06,
+      "loss": 0.6488,
+      "step": 615
+    },
+    {
+      "epoch": 0.4460431654676259,
+      "grad_norm": 0.17568340970424068,
+      "learning_rate": 3.3874152566782127e-06,
+      "loss": 0.6542,
+      "step": 620
+    },
+    {
+      "epoch": 0.44964028776978415,
+      "grad_norm": 0.16904932894961983,
+      "learning_rate": 3.357999455114148e-06,
+      "loss": 0.6308,
+      "step": 625
+    },
+    {
+      "epoch": 0.45323741007194246,
+      "grad_norm": 0.17964693226103262,
+      "learning_rate": 3.3284483820080694e-06,
+      "loss": 0.658,
+      "step": 630
+    },
+    {
+      "epoch": 0.4568345323741007,
+      "grad_norm": 0.16972600698098864,
+      "learning_rate": 3.2987666963597006e-06,
+      "loss": 0.666,
+      "step": 635
+    },
+    {
+      "epoch": 0.460431654676259,
+      "grad_norm": 0.16863410876839938,
+      "learning_rate": 3.2689590777610443e-06,
+      "loss": 0.6158,
+      "step": 640
+    },
+    {
+      "epoch": 0.46402877697841727,
+      "grad_norm": 0.16752405859925734,
+      "learning_rate": 3.239030225658595e-06,
+      "loss": 0.6421,
+      "step": 645
+    },
+    {
+      "epoch": 0.4676258992805755,
+      "grad_norm": 0.16737880285691936,
+      "learning_rate": 3.208984858612429e-06,
+      "loss": 0.6567,
+      "step": 650
+    },
+    {
+      "epoch": 0.4712230215827338,
+      "grad_norm": 0.1622627800832757,
+      "learning_rate": 3.178827713552281e-06,
+      "loss": 0.6222,
+      "step": 655
+    },
+    {
+      "epoch": 0.4748201438848921,
+      "grad_norm": 0.1637301148248093,
+      "learning_rate": 3.148563545030722e-06,
+      "loss": 0.6253,
+      "step": 660
+    },
+    {
+      "epoch": 0.4784172661870504,
+      "grad_norm": 0.1658915803907661,
+      "learning_rate": 3.1181971244735594e-06,
+      "loss": 0.6277,
+      "step": 665
+    },
+    {
+      "epoch": 0.48201438848920863,
+      "grad_norm": 0.17005292953461104,
+      "learning_rate": 3.0877332394275806e-06,
+      "loss": 0.638,
+      "step": 670
+    },
+    {
+      "epoch": 0.4856115107913669,
+      "grad_norm": 0.17306211053376286,
+      "learning_rate": 3.05717669280575e-06,
+      "loss": 0.6317,
+      "step": 675
+    },
+    {
+      "epoch": 0.4892086330935252,
+      "grad_norm": 0.16464061138395658,
+      "learning_rate": 3.026532302129984e-06,
+      "loss": 0.6135,
+      "step": 680
+    },
+    {
+      "epoch": 0.49280575539568344,
+      "grad_norm": 0.16817670931577094,
+      "learning_rate": 2.9958048987716266e-06,
+      "loss": 0.6255,
+      "step": 685
+    },
+    {
+      "epoch": 0.49640287769784175,
+      "grad_norm": 0.16195389526156515,
+      "learning_rate": 2.96499932718974e-06,
+      "loss": 0.6024,
+      "step": 690
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.1645172149025666,
+      "learning_rate": 2.9341204441673267e-06,
+      "loss": 0.6345,
+      "step": 695
+    },
+    {
+      "epoch": 0.5035971223021583,
+      "grad_norm": 0.22649243976417918,
+      "learning_rate": 2.903173118045616e-06,
+      "loss": 0.6312,
+      "step": 700
+    },
+    {
+      "epoch": 0.5071942446043165,
+      "grad_norm": 0.17150190400496662,
+      "learning_rate": 2.8721622279565248e-06,
+      "loss": 0.642,
+      "step": 705
+    },
+    {
+      "epoch": 0.5107913669064749,
+      "grad_norm": 0.16890621997495983,
+      "learning_rate": 2.8410926630534164e-06,
+      "loss": 0.6499,
+      "step": 710
+    },
+    {
+      "epoch": 0.5143884892086331,
+      "grad_norm": 0.16876468012144627,
+      "learning_rate": 2.8099693217402807e-06,
+      "loss": 0.63,
+      "step": 715
+    },
+    {
+      "epoch": 0.5179856115107914,
+      "grad_norm": 0.16959393931472438,
+      "learning_rate": 2.7787971108994557e-06,
+      "loss": 0.6181,
+      "step": 720
+    },
+    {
+      "epoch": 0.5215827338129496,
+      "grad_norm": 0.17314847147328546,
+      "learning_rate": 2.7475809451180103e-06,
+      "loss": 0.6584,
+      "step": 725
+    },
+    {
+      "epoch": 0.5251798561151079,
+      "grad_norm": 0.1687033450249176,
+      "learning_rate": 2.7163257459129184e-06,
+      "loss": 0.6308,
+      "step": 730
+    },
+    {
+      "epoch": 0.5287769784172662,
+      "grad_norm": 0.1658057380008524,
+      "learning_rate": 2.685036440955133e-06,
+      "loss": 0.6529,
+      "step": 735
+    },
+    {
+      "epoch": 0.5323741007194245,
+      "grad_norm": 0.16991236487195122,
+      "learning_rate": 2.6537179632926953e-06,
+      "loss": 0.6177,
+      "step": 740
+    },
+    {
+      "epoch": 0.5359712230215827,
+      "grad_norm": 0.16035267630476027,
+      "learning_rate": 2.6223752505729884e-06,
+      "loss": 0.6461,
+      "step": 745
+    },
+    {
+      "epoch": 0.539568345323741,
+      "grad_norm": 0.16126651583706925,
+      "learning_rate": 2.5910132442642815e-06,
+      "loss": 0.6,
+      "step": 750
+    },
+    {
+      "epoch": 0.5431654676258992,
+      "grad_norm": 0.1623864590113543,
+      "learning_rate": 2.5596368888766537e-06,
+      "loss": 0.5975,
+      "step": 755
+    },
+    {
+      "epoch": 0.5467625899280576,
+      "grad_norm": 0.165811450449987,
+      "learning_rate": 2.52825113118245e-06,
+      "loss": 0.6292,
+      "step": 760
+    },
+    {
+      "epoch": 0.5503597122302158,
+      "grad_norm": 0.16532836548055116,
+      "learning_rate": 2.496860919436374e-06,
+      "loss": 0.6084,
+      "step": 765
+    },
+    {
+      "epoch": 0.5539568345323741,
+      "grad_norm": 0.16785794412521857,
+      "learning_rate": 2.4654712025953543e-06,
+      "loss": 0.6087,
+      "step": 770
+    },
+    {
+      "epoch": 0.5575539568345323,
+      "grad_norm": 0.1635851401173821,
+      "learning_rate": 2.4340869295382924e-06,
+      "loss": 0.6328,
+      "step": 775
+    },
+    {
+      "epoch": 0.5611510791366906,
+      "grad_norm": 0.1600778829109294,
+      "learning_rate": 2.402713048285825e-06,
+      "loss": 0.6033,
+      "step": 780
+    },
+    {
+      "epoch": 0.564748201438849,
+      "grad_norm": 0.18588185959536366,
+      "learning_rate": 2.3713545052202268e-06,
+      "loss": 0.6158,
+      "step": 785
+    },
+    {
+      "epoch": 0.5683453237410072,
+      "grad_norm": 0.16421497309958943,
+      "learning_rate": 2.3400162443055655e-06,
+      "loss": 0.604,
+      "step": 790
+    },
+    {
+      "epoch": 0.5719424460431655,
+      "grad_norm": 0.160265450599092,
+      "learning_rate": 2.30870320630824e-06,
+      "loss": 0.612,
+      "step": 795
+    },
+    {
+      "epoch": 0.5755395683453237,
+      "grad_norm": 0.16954798697965026,
+      "learning_rate": 2.277420328018023e-06,
+      "loss": 0.6366,
+      "step": 800
+    },
+    {
+      "epoch": 0.579136690647482,
+      "grad_norm": 0.1611435816780583,
+      "learning_rate": 2.24617254146973e-06,
+      "loss": 0.6349,
+      "step": 805
+    },
+    {
+      "epoch": 0.5827338129496403,
+      "grad_norm": 0.17546659615139334,
+      "learning_rate": 2.214964773165641e-06,
+      "loss": 0.5949,
+      "step": 810
+    },
+    {
+      "epoch": 0.5863309352517986,
+      "grad_norm": 0.16049134388976258,
+      "learning_rate": 2.183801943298789e-06,
+      "loss": 0.6067,
+      "step": 815
+    },
+    {
+      "epoch": 0.5899280575539568,
+      "grad_norm": 0.16957473530909392,
+      "learning_rate": 2.1526889649772477e-06,
+      "loss": 0.5934,
+      "step": 820
+    },
+    {
+      "epoch": 0.5935251798561151,
+      "grad_norm": 0.16581052754385936,
+      "learning_rate": 2.121630743449532e-06,
+      "loss": 0.6171,
+      "step": 825
+    },
+    {
+      "epoch": 0.5971223021582733,
+      "grad_norm": 0.16120322904660017,
+      "learning_rate": 2.090632175331244e-06,
+      "loss": 0.6163,
+      "step": 830
+    },
+    {
+      "epoch": 0.6007194244604317,
+      "grad_norm": 0.1581771997504381,
+      "learning_rate": 2.059698147833075e-06,
+      "loss": 0.6197,
+      "step": 835
+    },
+    {
+      "epoch": 0.60431654676259,
+      "grad_norm": 0.16362498684959523,
+      "learning_rate": 2.0288335379902895e-06,
+      "loss": 0.6207,
+      "step": 840
+    },
+    {
+      "epoch": 0.6079136690647482,
+      "grad_norm": 0.16660059907607108,
+      "learning_rate": 1.9980432118938204e-06,
+      "loss": 0.6354,
+      "step": 845
+    },
+    {
+      "epoch": 0.6115107913669064,
+      "grad_norm": 0.1803995538499011,
+      "learning_rate": 1.9673320239230783e-06,
+      "loss": 0.6041,
+      "step": 850
+    },
+    {
+      "epoch": 0.6151079136690647,
+      "grad_norm": 0.20020108799514952,
+      "learning_rate": 1.9367048159806175e-06,
+      "loss": 0.6236,
+      "step": 855
+    },
+    {
+      "epoch": 0.6187050359712231,
+      "grad_norm": 0.16682386811464148,
+      "learning_rate": 1.9061664167287672e-06,
+      "loss": 0.6242,
+      "step": 860
+    },
+    {
+      "epoch": 0.6223021582733813,
+      "grad_norm": 0.1637761383770433,
+      "learning_rate": 1.875721640828344e-06,
+      "loss": 0.6039,
+      "step": 865
+    },
+    {
+      "epoch": 0.6258992805755396,
+      "grad_norm": 0.1624753055536377,
+      "learning_rate": 1.8453752881795772e-06,
+      "loss": 0.612,
+      "step": 870
+    },
+    {
+      "epoch": 0.6294964028776978,
+      "grad_norm": 0.16504744819300535,
+      "learning_rate": 1.8151321431653627e-06,
+      "loss": 0.6071,
+      "step": 875
+    },
+    {
+      "epoch": 0.6330935251798561,
+      "grad_norm": 0.16453571982489182,
+      "learning_rate": 1.7849969738969592e-06,
+      "loss": 0.6163,
+      "step": 880
+    },
+    {
+      "epoch": 0.6366906474820144,
+      "grad_norm": 0.185104019176102,
+      "learning_rate": 1.754974531462251e-06,
+      "loss": 0.6329,
+      "step": 885
+    },
+    {
+      "epoch": 0.6402877697841727,
+      "grad_norm": 0.1677831386628234,
+      "learning_rate": 1.725069549176695e-06,
+      "loss": 0.6046,
+      "step": 890
+    },
+    {
+      "epoch": 0.6438848920863309,
+      "grad_norm": 0.1585666383654991,
+      "learning_rate": 1.6952867418370707e-06,
+      "loss": 0.6002,
+      "step": 895
+    },
+    {
+      "epoch": 0.6474820143884892,
+      "grad_norm": 0.16489693303482242,
+      "learning_rate": 1.665630804978149e-06,
+      "loss": 0.6376,
+      "step": 900
+    },
+    {
+      "epoch": 0.6510791366906474,
+      "grad_norm": 0.16805125828505416,
+      "learning_rate": 1.6361064141323953e-06,
+      "loss": 0.6365,
+      "step": 905
+    },
+    {
+      "epoch": 0.6546762589928058,
+      "grad_norm": 0.181566998160182,
+      "learning_rate": 1.6067182240928332e-06,
+      "loss": 0.6112,
+      "step": 910
+    },
+    {
+      "epoch": 0.658273381294964,
+      "grad_norm": 0.1694206935808062,
+      "learning_rate": 1.5774708681791692e-06,
+      "loss": 0.6187,
+      "step": 915
+    },
+    {
+      "epoch": 0.6618705035971223,
+      "grad_norm": 0.16461060571783928,
+      "learning_rate": 1.548368957507308e-06,
+      "loss": 0.5782,
+      "step": 920
+    },
+    {
+      "epoch": 0.6654676258992805,
+      "grad_norm": 0.1669683477972491,
+      "learning_rate": 1.5194170802623692e-06,
+      "loss": 0.6145,
+      "step": 925
+    },
+    {
+      "epoch": 0.6690647482014388,
+      "grad_norm": 0.16862830642330334,
+      "learning_rate": 1.4906198009753159e-06,
+      "loss": 0.6353,
+      "step": 930
+    },
+    {
+      "epoch": 0.6726618705035972,
+      "grad_norm": 0.17094178135804486,
+      "learning_rate": 1.4619816598033148e-06,
+      "loss": 0.597,
+      "step": 935
+    },
+    {
+      "epoch": 0.6762589928057554,
+      "grad_norm": 0.1594405984136636,
+      "learning_rate": 1.4335071718139379e-06,
+      "loss": 0.6141,
+      "step": 940
+    },
+    {
+      "epoch": 0.6798561151079137,
+      "grad_norm": 0.16333147484898983,
+      "learning_rate": 1.4052008262733205e-06,
+      "loss": 0.6048,
+      "step": 945
+    },
+    {
+      "epoch": 0.6834532374100719,
+      "grad_norm": 0.15979882139296125,
+      "learning_rate": 1.3770670859383895e-06,
+      "loss": 0.6217,
+      "step": 950
+    },
+    {
+      "epoch": 0.6870503597122302,
+      "grad_norm": 0.16504220868699096,
+      "learning_rate": 1.3491103863532626e-06,
+      "loss": 0.6052,
+      "step": 955
+    },
+    {
+      "epoch": 0.6906474820143885,
+      "grad_norm": 0.17583165256569067,
+      "learning_rate": 1.321335135149952e-06,
+      "loss": 0.6309,
+      "step": 960
+    },
+    {
+      "epoch": 0.6942446043165468,
+      "grad_norm": 0.18291087991427973,
+      "learning_rate": 1.2937457113534498e-06,
+      "loss": 0.6152,
+      "step": 965
+    },
+    {
+      "epoch": 0.697841726618705,
+      "grad_norm": 0.15817294651209343,
+      "learning_rate": 1.266346464691346e-06,
+      "loss": 0.6186,
+      "step": 970
+    },
+    {
+      "epoch": 0.7014388489208633,
+      "grad_norm": 0.16487935933779904,
+      "learning_rate": 1.2391417149080458e-06,
+      "loss": 0.6206,
+      "step": 975
+    },
+    {
+      "epoch": 0.7050359712230215,
+      "grad_norm": 0.16501096888220573,
+      "learning_rate": 1.212135751083724e-06,
+      "loss": 0.642,
+      "step": 980
+    },
+    {
+      "epoch": 0.7086330935251799,
+      "grad_norm": 0.16345813795019531,
+      "learning_rate": 1.1853328309581139e-06,
+      "loss": 0.6105,
+      "step": 985
+    },
+    {
+      "epoch": 0.7122302158273381,
+      "grad_norm": 0.16982887708707609,
+      "learning_rate": 1.1587371802592302e-06,
+      "loss": 0.6302,
+      "step": 990
+    },
+    {
+      "epoch": 0.7158273381294964,
+      "grad_norm": 0.15843844245859565,
+      "learning_rate": 1.1323529920371518e-06,
+      "loss": 0.5946,
+      "step": 995
+    },
+    {
+      "epoch": 0.7194244604316546,
+      "grad_norm": 0.16408266793537263,
+      "learning_rate": 1.10618442600294e-06,
+      "loss": 0.6108,
+      "step": 1000
+    },
+    {
+      "epoch": 0.7194244604316546,
+      "eval_runtime": 12.2588,
+      "eval_samples_per_second": 81.574,
+      "eval_steps_per_second": 2.61,
+      "step": 1000
+    },
+    {
+      "epoch": 0.7230215827338129,
+      "grad_norm": 0.16405449199837346,
+      "learning_rate": 1.0802356078728293e-06,
+      "loss": 0.6247,
+      "step": 1005
+    },
+    {
+      "epoch": 0.7266187050359713,
+      "grad_norm": 0.17021168722767338,
+      "learning_rate": 1.0545106287177645e-06,
+      "loss": 0.6146,
+      "step": 1010
+    },
+    {
+      "epoch": 0.7302158273381295,
+      "grad_norm": 0.1663335063999724,
+      "learning_rate": 1.029013544318407e-06,
+      "loss": 0.6128,
+      "step": 1015
+    },
+    {
+      "epoch": 0.7338129496402878,
+      "grad_norm": 0.16192750797174188,
+      "learning_rate": 1.0037483745257073e-06,
+      "loss": 0.6282,
+      "step": 1020
+    },
+    {
+      "epoch": 0.737410071942446,
+      "grad_norm": 0.1676690522330883,
+      "learning_rate": 9.78719102627132e-07,
+      "loss": 0.6405,
+      "step": 1025
+    },
+    {
+      "epoch": 0.7410071942446043,
+      "grad_norm": 0.16103562098112373,
+      "learning_rate": 9.53929674718668e-07,
+      "loss": 0.6092,
+      "step": 1030
+    },
+    {
+      "epoch": 0.7446043165467626,
+      "grad_norm": 0.15629957097839647,
+      "learning_rate": 9.293839990826778e-07,
+      "loss": 0.6288,
+      "step": 1035
+    },
+    {
+      "epoch": 0.7482014388489209,
+      "grad_norm": 0.1649719012991602,
+      "learning_rate": 9.050859455717292e-07,
+      "loss": 0.6487,
+      "step": 1040
+    },
+    {
+      "epoch": 0.7517985611510791,
+      "grad_norm": 0.16621814957301698,
+      "learning_rate": 8.810393449984706e-07,
+      "loss": 0.6579,
+      "step": 1045
+    },
+    {
+      "epoch": 0.7553956834532374,
+      "grad_norm": 0.15499205580375533,
+      "learning_rate": 8.572479885316745e-07,
+      "loss": 0.6113,
+      "step": 1050
+    },
+    {
+      "epoch": 0.7589928057553957,
+      "grad_norm": 0.16101254180295285,
+      "learning_rate": 8.337156270985197e-07,
+      "loss": 0.6414,
+      "step": 1055
+    },
+    {
+      "epoch": 0.762589928057554,
+      "grad_norm": 0.16549776608782082,
+      "learning_rate": 8.104459707932238e-07,
+      "loss": 0.6123,
+      "step": 1060
+    },
+    {
+      "epoch": 0.7661870503597122,
+      "grad_norm": 0.17260224506613606,
+      "learning_rate": 7.874426882921171e-07,
+      "loss": 0.6274,
+      "step": 1065
+    },
+    {
+      "epoch": 0.7697841726618705,
+      "grad_norm": 0.16949981491490823,
+      "learning_rate": 7.647094062752347e-07,
+      "loss": 0.6253,
+      "step": 1070
+    },
+    {
+      "epoch": 0.7733812949640287,
+      "grad_norm": 0.16631605946143985,
+      "learning_rate": 7.422497088545436e-07,
+      "loss": 0.6233,
+      "step": 1075
+    },
+    {
+      "epoch": 0.7769784172661871,
+      "grad_norm": 0.16504960475084282,
+      "learning_rate": 7.200671370088682e-07,
+      "loss": 0.5962,
+      "step": 1080
+    },
+    {
+      "epoch": 0.7805755395683454,
+      "grad_norm": 0.15889073547280716,
+      "learning_rate": 6.981651880256285e-07,
+      "loss": 0.6094,
+      "step": 1085
+    },
+    {
+      "epoch": 0.7841726618705036,
+      "grad_norm": 0.15921735320612743,
+      "learning_rate": 6.765473149494545e-07,
+      "loss": 0.6271,
+      "step": 1090
+    },
+    {
+      "epoch": 0.7877697841726619,
+      "grad_norm": 0.16149400411033712,
+      "learning_rate": 6.552169260377872e-07,
+      "loss": 0.6194,
+      "step": 1095
+    },
+    {
+      "epoch": 0.7913669064748201,
+      "grad_norm": 0.17519197732403996,
+      "learning_rate": 6.341773842235307e-07,
+      "loss": 0.5996,
+      "step": 1100
+    },
+    {
+      "epoch": 0.7949640287769785,
+      "grad_norm": 0.16208379962765676,
+      "learning_rate": 6.134320065848564e-07,
+      "loss": 0.606,
+      "step": 1105
+    },
+    {
+      "epoch": 0.7985611510791367,
+      "grad_norm": 0.17454197491728474,
+      "learning_rate": 5.929840638222384e-07,
+      "loss": 0.5922,
+      "step": 1110
+    },
+    {
+      "epoch": 0.802158273381295,
+      "grad_norm": 0.1646154768218277,
+      "learning_rate": 5.728367797427906e-07,
+      "loss": 0.6243,
+      "step": 1115
+    },
+    {
+      "epoch": 0.8057553956834532,
+      "grad_norm": 0.16381184894925108,
+      "learning_rate": 5.529933307520102e-07,
+      "loss": 0.591,
+      "step": 1120
+    },
+    {
+      "epoch": 0.8093525179856115,
+      "grad_norm": 0.16087653845215716,
+      "learning_rate": 5.334568453529831e-07,
+      "loss": 0.6032,
+      "step": 1125
+    },
+    {
+      "epoch": 0.8129496402877698,
+      "grad_norm": 0.169499547501472,
+      "learning_rate": 5.142304036531503e-07,
+      "loss": 0.6272,
+      "step": 1130
+    },
+    {
+      "epoch": 0.8165467625899281,
+      "grad_norm": 0.16073871102203974,
+      "learning_rate": 4.953170368786985e-07,
+      "loss": 0.6038,
+      "step": 1135
+    },
+    {
+      "epoch": 0.8201438848920863,
+      "grad_norm": 0.1718082418985466,
+      "learning_rate": 4.767197268966589e-07,
+      "loss": 0.6329,
+      "step": 1140
+    },
+    {
+      "epoch": 0.8237410071942446,
+      "grad_norm": 0.16600964986098235,
+      "learning_rate": 4.5844140574478944e-07,
+      "loss": 0.6059,
+      "step": 1145
+    },
+    {
+      "epoch": 0.8273381294964028,
+      "grad_norm": 0.16551060707936915,
+      "learning_rate": 4.404849551693102e-07,
+      "loss": 0.6178,
+      "step": 1150
+    },
+    {
+      "epoch": 0.8309352517985612,
+      "grad_norm": 0.16058229980877273,
+      "learning_rate": 4.228532061705742e-07,
+      "loss": 0.6162,
+      "step": 1155
+    },
+    {
+      "epoch": 0.8345323741007195,
+      "grad_norm": 0.1650296995501976,
+      "learning_rate": 4.055489385567266e-07,
+      "loss": 0.6252,
+      "step": 1160
+    },
+    {
+      "epoch": 0.8381294964028777,
+      "grad_norm": 0.16480409428538245,
+      "learning_rate": 3.8857488050544903e-07,
+      "loss": 0.6203,
+      "step": 1165
+    },
+    {
+      "epoch": 0.841726618705036,
+      "grad_norm": 0.16232443768326824,
+      "learning_rate": 3.7193370813383425e-07,
+      "loss": 0.6349,
+      "step": 1170
+    },
+    {
+      "epoch": 0.8453237410071942,
+      "grad_norm": 0.1584897308647271,
+      "learning_rate": 3.556280450764699e-07,
+      "loss": 0.584,
+      "step": 1175
+    },
+    {
+      "epoch": 0.8489208633093526,
+      "grad_norm": 0.15664466526812762,
+      "learning_rate": 3.396604620718025e-07,
+      "loss": 0.5986,
+      "step": 1180
+    },
+    {
+      "epoch": 0.8525179856115108,
+      "grad_norm": 4.159707417477735,
+      "learning_rate": 3.240334765568329e-07,
+      "loss": 0.6446,
+      "step": 1185
+    },
+    {
+      "epoch": 0.8561151079136691,
+      "grad_norm": 0.15382670213595884,
+      "learning_rate": 3.0874955227022053e-07,
+      "loss": 0.6192,
+      "step": 1190
+    },
+    {
+      "epoch": 0.8597122302158273,
+      "grad_norm": 0.16333556616379255,
+      "learning_rate": 2.938110988638521e-07,
+      "loss": 0.5958,
+      "step": 1195
+    },
+    {
+      "epoch": 0.8633093525179856,
+      "grad_norm": 0.1590368907083947,
+      "learning_rate": 2.7922047152293934e-07,
+      "loss": 0.6197,
+      "step": 1200
+    },
+    {
+      "epoch": 0.8669064748201439,
+      "grad_norm": 0.17020833161122917,
+      "learning_rate": 2.6497997059470065e-07,
+      "loss": 0.5887,
+      "step": 1205
+    },
+    {
+      "epoch": 0.8705035971223022,
+      "grad_norm": 0.16037027743271673,
+      "learning_rate": 2.5109184122568797e-07,
+      "loss": 0.5971,
+      "step": 1210
+    },
+    {
+      "epoch": 0.8741007194244604,
+      "grad_norm": 0.1633515403590847,
+      "learning_rate": 2.3755827300782436e-07,
+      "loss": 0.6147,
+      "step": 1215
+    },
+    {
+      "epoch": 0.8776978417266187,
+      "grad_norm": 0.15724633669451,
+      "learning_rate": 2.2438139963318812e-07,
+      "loss": 0.5837,
+      "step": 1220
+    },
+    {
+      "epoch": 0.8812949640287769,
+      "grad_norm": 0.1606820436871867,
+      "learning_rate": 2.1156329855762243e-07,
+      "loss": 0.6084,
+      "step": 1225
+    },
+    {
+      "epoch": 0.8848920863309353,
+      "grad_norm": 0.1665829512742088,
+      "learning_rate": 1.9910599067319984e-07,
+      "loss": 0.5792,
+      "step": 1230
+    },
+    {
+      "epoch": 0.8884892086330936,
+      "grad_norm": 0.1617341090573433,
+      "learning_rate": 1.8701143998961502e-07,
+      "loss": 0.6127,
+      "step": 1235
+    },
+    {
+      "epoch": 0.8920863309352518,
+      "grad_norm": 0.16444482030532112,
+      "learning_rate": 1.752815533245364e-07,
+      "loss": 0.6068,
+      "step": 1240
+    },
+    {
+      "epoch": 0.89568345323741,
+      "grad_norm": 0.16828126570381266,
+      "learning_rate": 1.6391818000298043e-07,
+      "loss": 0.6125,
+      "step": 1245
+    },
+    {
+      "epoch": 0.8992805755395683,
+      "grad_norm": 0.1629957454088181,
+      "learning_rate": 1.529231115657498e-07,
+      "loss": 0.6381,
+      "step": 1250
+    },
+    {
+      "epoch": 0.9028776978417267,
+      "grad_norm": 0.16556649184516456,
+      "learning_rate": 1.4229808148697732e-07,
+      "loss": 0.5998,
+      "step": 1255
+    },
+    {
+      "epoch": 0.9064748201438849,
+      "grad_norm": 0.16203099377866306,
+      "learning_rate": 1.320447649008308e-07,
+      "loss": 0.6137,
+      "step": 1260
+    },
+    {
+      "epoch": 0.9100719424460432,
+      "grad_norm": 0.16902179185053035,
+      "learning_rate": 1.2216477833741025e-07,
+      "loss": 0.5974,
+      "step": 1265
+    },
+    {
+      "epoch": 0.9136690647482014,
+      "grad_norm": 0.16479092334567338,
+      "learning_rate": 1.1265967946788913e-07,
+      "loss": 0.6203,
+      "step": 1270
+    },
+    {
+      "epoch": 0.9172661870503597,
+      "grad_norm": 0.15887060324938326,
+      "learning_rate": 1.0353096685893044e-07,
+      "loss": 0.5961,
+      "step": 1275
+    },
+    {
+      "epoch": 0.920863309352518,
+      "grad_norm": 0.15717928347327234,
+      "learning_rate": 9.478007973642733e-08,
+      "loss": 0.5939,
+      "step": 1280
+    },
+    {
+      "epoch": 0.9244604316546763,
+      "grad_norm": 0.16321773056840833,
+      "learning_rate": 8.640839775859222e-08,
+      "loss": 0.5993,
+      "step": 1285
+    },
+    {
+      "epoch": 0.9280575539568345,
+      "grad_norm": 0.15772198437541257,
+      "learning_rate": 7.841724079844321e-08,
+      "loss": 0.6423,
+      "step": 1290
+    },
+    {
+      "epoch": 0.9316546762589928,
+      "grad_norm": 0.16917268204241048,
+      "learning_rate": 7.080786873571388e-08,
+      "loss": 0.6566,
+      "step": 1295
+    },
+    {
+      "epoch": 0.935251798561151,
+      "grad_norm": 0.16079832958818444,
+      "learning_rate": 6.358148125822e-08,
+      "loss": 0.6154,
+      "step": 1300
+    },
+    {
+      "epoch": 0.9388489208633094,
+      "grad_norm": 0.15308750470330593,
+      "learning_rate": 5.673921767271967e-08,
+      "loss": 0.6068,
+      "step": 1305
+    },
+    {
+      "epoch": 0.9424460431654677,
+      "grad_norm": 0.1675136373579217,
+      "learning_rate": 5.028215672528924e-08,
+      "loss": 0.6031,
+      "step": 1310
+    },
+    {
+      "epoch": 0.9460431654676259,
+      "grad_norm": 0.15773902663760664,
+      "learning_rate": 4.421131643125104e-08,
+      "loss": 0.5913,
+      "step": 1315
+    },
+    {
+      "epoch": 0.9496402877697842,
+      "grad_norm": 0.15738955399333943,
+      "learning_rate": 3.852765391467117e-08,
+      "loss": 0.6246,
+      "step": 1320
+    },
+    {
+      "epoch": 0.9532374100719424,
+      "grad_norm": 0.1673309275263978,
+      "learning_rate": 3.323206525746303e-08,
+      "loss": 0.6113,
+      "step": 1325
+    },
+    {
+      "epoch": 0.9568345323741008,
+      "grad_norm": 0.16639511680642716,
+      "learning_rate": 2.832538535810947e-08,
+      "loss": 0.6129,
+      "step": 1330
+    },
+    {
+      "epoch": 0.960431654676259,
+      "grad_norm": 0.15467986329263658,
+      "learning_rate": 2.3808387800034194e-08,
+      "loss": 0.6135,
+      "step": 1335
+    },
+    {
+      "epoch": 0.9640287769784173,
+      "grad_norm": 0.15556453745576904,
+      "learning_rate": 1.9681784729639608e-08,
+      "loss": 0.5911,
+      "step": 1340
+    },
+    {
+      "epoch": 0.9676258992805755,
+      "grad_norm": 0.17175964565211488,
+      "learning_rate": 1.5946226744029402e-08,
+      "loss": 0.6353,
+      "step": 1345
+    },
+    {
+      "epoch": 0.9712230215827338,
+      "grad_norm": 0.15611077883391353,
+      "learning_rate": 1.2602302788436715e-08,
+      "loss": 0.5965,
+      "step": 1350
+    },
+    {
+      "epoch": 0.9748201438848921,
+      "grad_norm": 0.15835204960049692,
+      "learning_rate": 9.650540063370628e-09,
+      "loss": 0.602,
+      "step": 1355
+    },
+    {
+      "epoch": 0.9784172661870504,
+      "grad_norm": 0.16328945271739753,
+      "learning_rate": 7.091403941499597e-09,
+      "loss": 0.5979,
+      "step": 1360
+    },
+    {
+      "epoch": 0.9820143884892086,
+      "grad_norm": 0.1583518161160204,
+      "learning_rate": 4.9252978942793125e-09,
+      "loss": 0.5995,
+      "step": 1365
+    },
+    {
+      "epoch": 0.9856115107913669,
+      "grad_norm": 0.16148337835743226,
+      "learning_rate": 3.152563428343025e-09,
+      "loss": 0.5697,
+      "step": 1370
+    },
+    {
+      "epoch": 0.9892086330935251,
+      "grad_norm": 0.16118815430555725,
+      "learning_rate": 1.7734800316596135e-09,
+      "loss": 0.6051,
+      "step": 1375
+    },
+    {
+      "epoch": 0.9928057553956835,
+      "grad_norm": 0.1572343438101448,
+      "learning_rate": 7.882651294685573e-10,
+      "loss": 0.6255,
+      "step": 1380
+    },
+    {
+      "epoch": 0.9964028776978417,
+      "grad_norm": 0.16657064617868036,
+      "learning_rate": 1.9707405000346513e-10,
+      "loss": 0.6444,
+      "step": 1385
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.15928289801803977,
+      "learning_rate": 0.0,
+      "loss": 0.5923,
+      "step": 1390
+    },
+    {
+      "epoch": 1.0,
+      "step": 1390,
+      "total_flos": 1267133463920640.0,
+      "train_loss": 0.6670485469077131,
+      "train_runtime": 18244.3099,
+      "train_samples_per_second": 9.746,
+      "train_steps_per_second": 0.076
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1390,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1267133463920640.0,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}