YuchenLi01 committed on
Commit
6cd5617
·
verified ·
1 Parent(s): 4926477

Model save

Browse files
README.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: alignment-handbook/zephyr-7b-sft-full
3
+ library_name: transformers
4
+ model_name: ultrafeedbackSkyworkAgree_alignmentZephyr7BSftFull_sdpo_score_ebs64_lr5e-07_1
5
+ tags:
6
+ - generated_from_trainer
7
+ - trl
8
+ - dpo
9
+ licence: license
10
+ ---
11
+
12
+ # Model Card for ultrafeedbackSkyworkAgree_alignmentZephyr7BSftFull_sdpo_score_ebs64_lr5e-07_1
13
+
14
+ This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="YuchenLi01/ultrafeedbackSkyworkAgree_alignmentZephyr7BSftFull_sdpo_score_ebs64_lr5e-07_1", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/yuchenl4/lmpref/runs/ultrafeedbackSkyworkAgree_alignmentZephyr7BSftFull_sdpo_score_ebs64_lr5e-07_1try1S4eJmGXVUhdSrsIAqeWu49ao5UELq4jSJBSvoOVdLYLRFy)
31
+
32
+ This model was trained with DPO, a method introduced in [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://huggingface.co/papers/2305.18290).
33
+
34
+ ### Framework versions
35
+
36
+ - TRL: 0.12.0
37
+ - Transformers: 4.46.3
38
+ - PyTorch: 2.3.0
39
+ - Datasets: 3.1.0
40
+ - Tokenizers: 0.20.3
41
+
42
+ ## Citations
43
+
44
+ Cite DPO as:
45
+
46
+ ```bibtex
47
+ @inproceedings{rafailov2023direct,
48
+ title = {{Direct Preference Optimization: Your Language Model is Secretly a Reward Model}},
49
+ author = {Rafael Rafailov and Archit Sharma and Eric Mitchell and Christopher D. Manning and Stefano Ermon and Chelsea Finn},
50
+ year = 2023,
51
+ booktitle = {Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023},
52
+ url = {http://papers.nips.cc/paper_files/paper/2023/hash/a85b405ed65c6477a4fe8302b5e06ce7-Abstract-Conference.html},
53
+ editor = {Alice Oh and Tristan Naumann and Amir Globerson and Kate Saenko and Moritz Hardt and Sergey Levine},
54
+ }
55
+ ```
56
+
57
+ Cite TRL as:
58
+
59
+ ```bibtex
60
+ @misc{vonwerra2022trl,
61
+ title = {{TRL: Transformer Reinforcement Learning}},
62
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
63
+ year = 2020,
64
+ journal = {GitHub repository},
65
+ publisher = {GitHub},
66
+ howpublished = {\url{https://github.com/huggingface/trl}}
67
+ }
68
+ ```
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.4091535410406212,
5
+ "train_runtime": 30868.5922,
6
+ "train_samples": 45608,
7
+ "train_samples_per_second": 1.477,
8
+ "train_steps_per_second": 0.023
9
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.46.3"
6
+ }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a934bf1cab1daa9c509357ab7c28f132f8deeaf27d98dfd47b66f71f40ea95dc
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cc3c1b33def2af680389b5ce6915b6a7adf0179e5a76ab7bc3936de6b99d9dd
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ddffc17bd8e3cee92a27ab3a17c7fbc72b98f7dcf3e5e72a290f6eb01d9ce7aa
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bfa92f6339d97133e57d049aa167be3745a90b323fb7f5ef9cef720c12a8f10
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ee01b5ec17faeb33e77f8bea9bd2595a28496b8ee7754de27635dd860bba1390
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:947c2594c61ea8af6f42287d3775e3006a40a61d533e8347c22a20ec6863992a
3
  size 4540516344
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.4091535410406212,
5
+ "train_runtime": 30868.5922,
6
+ "train_samples": 45608,
7
+ "train_samples_per_second": 1.477,
8
+ "train_steps_per_second": 0.023
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,2546 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 8,
6
+ "global_step": 713,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.001402524544179523,
13
+ "grad_norm": 51.46674233848396,
14
+ "learning_rate": 6.9444444444444435e-09,
15
+ "logits/chosen": -3.328125,
16
+ "logits/rejected": -3.46875,
17
+ "logps/chosen": -468.0,
18
+ "logps/rejected": -300.0,
19
+ "loss": 0.6914,
20
+ "rewards/accuracies": 0.0,
21
+ "rewards/chosen": 0.0,
22
+ "rewards/margins": 0.0,
23
+ "rewards/rejected": 0.0,
24
+ "step": 1
25
+ },
26
+ {
27
+ "epoch": 0.011220196353436185,
28
+ "eval_logits/chosen": -3.234375,
29
+ "eval_logits/rejected": -3.28125,
30
+ "eval_logps/chosen": -322.0,
31
+ "eval_logps/rejected": -276.0,
32
+ "eval_loss": 0.6924511194229126,
33
+ "eval_rewards/accuracies": 0.21808511018753052,
34
+ "eval_rewards/chosen": 0.001068115234375,
35
+ "eval_rewards/margins": -0.0003376007080078125,
36
+ "eval_rewards/rejected": 0.00139617919921875,
37
+ "eval_runtime": 64.5741,
38
+ "eval_samples_per_second": 23.028,
39
+ "eval_steps_per_second": 0.728,
40
+ "step": 8
41
+ },
42
+ {
43
+ "epoch": 0.014025245441795231,
44
+ "grad_norm": 53.200557685453425,
45
+ "learning_rate": 6.944444444444444e-08,
46
+ "logits/chosen": -3.140625,
47
+ "logits/rejected": -3.21875,
48
+ "logps/chosen": -306.0,
49
+ "logps/rejected": -264.0,
50
+ "loss": 0.694,
51
+ "rewards/accuracies": 0.1666666716337204,
52
+ "rewards/chosen": -0.004241943359375,
53
+ "rewards/margins": -0.00970458984375,
54
+ "rewards/rejected": 0.0054931640625,
55
+ "step": 10
56
+ },
57
+ {
58
+ "epoch": 0.02244039270687237,
59
+ "eval_logits/chosen": -3.234375,
60
+ "eval_logits/rejected": -3.28125,
61
+ "eval_logps/chosen": -322.0,
62
+ "eval_logps/rejected": -276.0,
63
+ "eval_loss": 0.6895634531974792,
64
+ "eval_rewards/accuracies": 0.2606382966041565,
65
+ "eval_rewards/chosen": -0.01318359375,
66
+ "eval_rewards/margins": -0.004058837890625,
67
+ "eval_rewards/rejected": -0.0091552734375,
68
+ "eval_runtime": 65.8066,
69
+ "eval_samples_per_second": 22.597,
70
+ "eval_steps_per_second": 0.714,
71
+ "step": 16
72
+ },
73
+ {
74
+ "epoch": 0.028050490883590462,
75
+ "grad_norm": 61.01600920733608,
76
+ "learning_rate": 1.3888888888888888e-07,
77
+ "logits/chosen": -3.171875,
78
+ "logits/rejected": -3.234375,
79
+ "logps/chosen": -314.0,
80
+ "logps/rejected": -294.0,
81
+ "loss": 0.6909,
82
+ "rewards/accuracies": 0.3125,
83
+ "rewards/chosen": -0.006256103515625,
84
+ "rewards/margins": 0.0089111328125,
85
+ "rewards/rejected": -0.01513671875,
86
+ "step": 20
87
+ },
88
+ {
89
+ "epoch": 0.033660589060308554,
90
+ "eval_logits/chosen": -3.234375,
91
+ "eval_logits/rejected": -3.265625,
92
+ "eval_logps/chosen": -322.0,
93
+ "eval_logps/rejected": -276.0,
94
+ "eval_loss": 0.6792454719543457,
95
+ "eval_rewards/accuracies": 0.48404255509376526,
96
+ "eval_rewards/chosen": -0.04248046875,
97
+ "eval_rewards/margins": 0.027587890625,
98
+ "eval_rewards/rejected": -0.0703125,
99
+ "eval_runtime": 65.5822,
100
+ "eval_samples_per_second": 22.674,
101
+ "eval_steps_per_second": 0.717,
102
+ "step": 24
103
+ },
104
+ {
105
+ "epoch": 0.04207573632538569,
106
+ "grad_norm": 48.23881467629644,
107
+ "learning_rate": 2.0833333333333333e-07,
108
+ "logits/chosen": -3.09375,
109
+ "logits/rejected": -3.25,
110
+ "logps/chosen": -352.0,
111
+ "logps/rejected": -280.0,
112
+ "loss": 0.6754,
113
+ "rewards/accuracies": 0.6499999761581421,
114
+ "rewards/chosen": -0.038330078125,
115
+ "rewards/margins": 0.051513671875,
116
+ "rewards/rejected": -0.08984375,
117
+ "step": 30
118
+ },
119
+ {
120
+ "epoch": 0.04488078541374474,
121
+ "eval_logits/chosen": -3.21875,
122
+ "eval_logits/rejected": -3.25,
123
+ "eval_logps/chosen": -324.0,
124
+ "eval_logps/rejected": -280.0,
125
+ "eval_loss": 0.6562117338180542,
126
+ "eval_rewards/accuracies": 0.6117021441459656,
127
+ "eval_rewards/chosen": -0.111328125,
128
+ "eval_rewards/margins": 0.08544921875,
129
+ "eval_rewards/rejected": -0.197265625,
130
+ "eval_runtime": 65.2452,
131
+ "eval_samples_per_second": 22.791,
132
+ "eval_steps_per_second": 0.72,
133
+ "step": 32
134
+ },
135
+ {
136
+ "epoch": 0.056100981767180924,
137
+ "grad_norm": 47.782407457061694,
138
+ "learning_rate": 2.7777777777777776e-07,
139
+ "logits/chosen": -3.109375,
140
+ "logits/rejected": -3.25,
141
+ "logps/chosen": -346.0,
142
+ "logps/rejected": -276.0,
143
+ "loss": 0.6441,
144
+ "rewards/accuracies": 0.6875,
145
+ "rewards/chosen": -0.1259765625,
146
+ "rewards/margins": 0.1298828125,
147
+ "rewards/rejected": -0.255859375,
148
+ "step": 40
149
+ },
150
+ {
151
+ "epoch": 0.056100981767180924,
152
+ "eval_logits/chosen": -3.203125,
153
+ "eval_logits/rejected": -3.234375,
154
+ "eval_logps/chosen": -326.0,
155
+ "eval_logps/rejected": -284.0,
156
+ "eval_loss": 0.6230681538581848,
157
+ "eval_rewards/accuracies": 0.664893627166748,
158
+ "eval_rewards/chosen": -0.212890625,
159
+ "eval_rewards/margins": 0.1767578125,
160
+ "eval_rewards/rejected": -0.390625,
161
+ "eval_runtime": 65.4966,
162
+ "eval_samples_per_second": 22.703,
163
+ "eval_steps_per_second": 0.718,
164
+ "step": 40
165
+ },
166
+ {
167
+ "epoch": 0.06732117812061711,
168
+ "eval_logits/chosen": -3.1875,
169
+ "eval_logits/rejected": -3.21875,
170
+ "eval_logps/chosen": -328.0,
171
+ "eval_logps/rejected": -288.0,
172
+ "eval_loss": 0.5846173167228699,
173
+ "eval_rewards/accuracies": 0.6914893388748169,
174
+ "eval_rewards/chosen": -0.328125,
175
+ "eval_rewards/margins": 0.328125,
176
+ "eval_rewards/rejected": -0.65625,
177
+ "eval_runtime": 65.9802,
178
+ "eval_samples_per_second": 22.537,
179
+ "eval_steps_per_second": 0.712,
180
+ "step": 48
181
+ },
182
+ {
183
+ "epoch": 0.07012622720897616,
184
+ "grad_norm": 39.07068999911395,
185
+ "learning_rate": 3.472222222222222e-07,
186
+ "logits/chosen": -3.09375,
187
+ "logits/rejected": -3.078125,
188
+ "logps/chosen": -344.0,
189
+ "logps/rejected": -316.0,
190
+ "loss": 0.5952,
191
+ "rewards/accuracies": 0.762499988079071,
192
+ "rewards/chosen": -0.33203125,
193
+ "rewards/margins": 0.2578125,
194
+ "rewards/rejected": -0.58984375,
195
+ "step": 50
196
+ },
197
+ {
198
+ "epoch": 0.0785413744740533,
199
+ "eval_logits/chosen": -3.15625,
200
+ "eval_logits/rejected": -3.1875,
201
+ "eval_logps/chosen": -328.0,
202
+ "eval_logps/rejected": -292.0,
203
+ "eval_loss": 0.5460087060928345,
204
+ "eval_rewards/accuracies": 0.728723406791687,
205
+ "eval_rewards/chosen": -0.326171875,
206
+ "eval_rewards/margins": 0.498046875,
207
+ "eval_rewards/rejected": -0.82421875,
208
+ "eval_runtime": 65.6815,
209
+ "eval_samples_per_second": 22.64,
210
+ "eval_steps_per_second": 0.716,
211
+ "step": 56
212
+ },
213
+ {
214
+ "epoch": 0.08415147265077139,
215
+ "grad_norm": 35.960821270018556,
216
+ "learning_rate": 4.1666666666666667e-07,
217
+ "logits/chosen": -3.046875,
218
+ "logits/rejected": -3.109375,
219
+ "logps/chosen": -346.0,
220
+ "logps/rejected": -292.0,
221
+ "loss": 0.5601,
222
+ "rewards/accuracies": 0.7749999761581421,
223
+ "rewards/chosen": -0.337890625,
224
+ "rewards/margins": 0.48828125,
225
+ "rewards/rejected": -0.828125,
226
+ "step": 60
227
+ },
228
+ {
229
+ "epoch": 0.08976157082748948,
230
+ "eval_logits/chosen": -3.140625,
231
+ "eval_logits/rejected": -3.171875,
232
+ "eval_logps/chosen": -324.0,
233
+ "eval_logps/rejected": -292.0,
234
+ "eval_loss": 0.5130031704902649,
235
+ "eval_rewards/accuracies": 0.7553191781044006,
236
+ "eval_rewards/chosen": -0.10791015625,
237
+ "eval_rewards/margins": 0.68359375,
238
+ "eval_rewards/rejected": -0.7890625,
239
+ "eval_runtime": 65.377,
240
+ "eval_samples_per_second": 22.745,
241
+ "eval_steps_per_second": 0.719,
242
+ "step": 64
243
+ },
244
+ {
245
+ "epoch": 0.09817671809256662,
246
+ "grad_norm": 45.987879679409986,
247
+ "learning_rate": 4.861111111111111e-07,
248
+ "logits/chosen": -2.921875,
249
+ "logits/rejected": -2.96875,
250
+ "logps/chosen": -340.0,
251
+ "logps/rejected": -318.0,
252
+ "loss": 0.5168,
253
+ "rewards/accuracies": 0.699999988079071,
254
+ "rewards/chosen": -0.384765625,
255
+ "rewards/margins": 0.515625,
256
+ "rewards/rejected": -0.8984375,
257
+ "step": 70
258
+ },
259
+ {
260
+ "epoch": 0.10098176718092566,
261
+ "eval_logits/chosen": -3.125,
262
+ "eval_logits/rejected": -3.15625,
263
+ "eval_logps/chosen": -330.0,
264
+ "eval_logps/rejected": -302.0,
265
+ "eval_loss": 0.48529163002967834,
266
+ "eval_rewards/accuracies": 0.7553191781044006,
267
+ "eval_rewards/chosen": -0.3671875,
268
+ "eval_rewards/margins": 0.9375,
269
+ "eval_rewards/rejected": -1.3046875,
270
+ "eval_runtime": 65.3763,
271
+ "eval_samples_per_second": 22.745,
272
+ "eval_steps_per_second": 0.719,
273
+ "step": 72
274
+ },
275
+ {
276
+ "epoch": 0.11220196353436185,
277
+ "grad_norm": 50.85101203918117,
278
+ "learning_rate": 4.998078598898921e-07,
279
+ "logits/chosen": -3.046875,
280
+ "logits/rejected": -3.09375,
281
+ "logps/chosen": -322.0,
282
+ "logps/rejected": -316.0,
283
+ "loss": 0.4973,
284
+ "rewards/accuracies": 0.824999988079071,
285
+ "rewards/chosen": -0.5703125,
286
+ "rewards/margins": 1.1015625,
287
+ "rewards/rejected": -1.671875,
288
+ "step": 80
289
+ },
290
+ {
291
+ "epoch": 0.11220196353436185,
292
+ "eval_logits/chosen": -3.125,
293
+ "eval_logits/rejected": -3.15625,
294
+ "eval_logps/chosen": -324.0,
295
+ "eval_logps/rejected": -298.0,
296
+ "eval_loss": 0.4733535349369049,
297
+ "eval_rewards/accuracies": 0.792553186416626,
298
+ "eval_rewards/chosen": -0.11181640625,
299
+ "eval_rewards/margins": 1.0703125,
300
+ "eval_rewards/rejected": -1.1796875,
301
+ "eval_runtime": 65.986,
302
+ "eval_samples_per_second": 22.535,
303
+ "eval_steps_per_second": 0.712,
304
+ "step": 80
305
+ },
306
+ {
307
+ "epoch": 0.12342215988779803,
308
+ "eval_logits/chosen": -3.125,
309
+ "eval_logits/rejected": -3.15625,
310
+ "eval_logps/chosen": -324.0,
311
+ "eval_logps/rejected": -300.0,
312
+ "eval_loss": 0.46471452713012695,
313
+ "eval_rewards/accuracies": 0.8031914830207825,
314
+ "eval_rewards/chosen": -0.1474609375,
315
+ "eval_rewards/margins": 1.1015625,
316
+ "eval_rewards/rejected": -1.25,
317
+ "eval_runtime": 65.8761,
318
+ "eval_samples_per_second": 22.573,
319
+ "eval_steps_per_second": 0.713,
320
+ "step": 88
321
+ },
322
+ {
323
+ "epoch": 0.12622720897615708,
324
+ "grad_norm": 43.69982337922237,
325
+ "learning_rate": 4.990277968429684e-07,
326
+ "logits/chosen": -3.046875,
327
+ "logits/rejected": -2.953125,
328
+ "logps/chosen": -354.0,
329
+ "logps/rejected": -312.0,
330
+ "loss": 0.4756,
331
+ "rewards/accuracies": 0.7749999761581421,
332
+ "rewards/chosen": -0.11376953125,
333
+ "rewards/margins": 1.1484375,
334
+ "rewards/rejected": -1.2578125,
335
+ "step": 90
336
+ },
337
+ {
338
+ "epoch": 0.13464235624123422,
339
+ "eval_logits/chosen": -3.109375,
340
+ "eval_logits/rejected": -3.140625,
341
+ "eval_logps/chosen": -346.0,
342
+ "eval_logps/rejected": -326.0,
343
+ "eval_loss": 0.4726095497608185,
344
+ "eval_rewards/accuracies": 0.7659574747085571,
345
+ "eval_rewards/chosen": -1.2109375,
346
+ "eval_rewards/margins": 1.28125,
347
+ "eval_rewards/rejected": -2.5,
348
+ "eval_runtime": 66.1434,
349
+ "eval_samples_per_second": 22.481,
350
+ "eval_steps_per_second": 0.711,
351
+ "step": 96
352
+ },
353
+ {
354
+ "epoch": 0.1402524544179523,
355
+ "grad_norm": 65.26391444387855,
356
+ "learning_rate": 4.976496740424417e-07,
357
+ "logits/chosen": -3.078125,
358
+ "logits/rejected": -3.109375,
359
+ "logps/chosen": -330.0,
360
+ "logps/rejected": -302.0,
361
+ "loss": 0.4548,
362
+ "rewards/accuracies": 0.8374999761581421,
363
+ "rewards/chosen": -0.98828125,
364
+ "rewards/margins": 1.3515625,
365
+ "rewards/rejected": -2.34375,
366
+ "step": 100
367
+ },
368
+ {
369
+ "epoch": 0.1458625525946704,
370
+ "eval_logits/chosen": -3.078125,
371
+ "eval_logits/rejected": -3.109375,
372
+ "eval_logps/chosen": -334.0,
373
+ "eval_logps/rejected": -316.0,
374
+ "eval_loss": 0.44555220007896423,
375
+ "eval_rewards/accuracies": 0.7872340679168701,
376
+ "eval_rewards/chosen": -0.6328125,
377
+ "eval_rewards/margins": 1.3828125,
378
+ "eval_rewards/rejected": -2.015625,
379
+ "eval_runtime": 65.9643,
380
+ "eval_samples_per_second": 22.542,
381
+ "eval_steps_per_second": 0.713,
382
+ "step": 104
383
+ },
384
+ {
385
+ "epoch": 0.15427769985974754,
386
+ "grad_norm": 43.09963797811936,
387
+ "learning_rate": 4.956768011581281e-07,
388
+ "logits/chosen": -2.984375,
389
+ "logits/rejected": -3.140625,
390
+ "logps/chosen": -356.0,
391
+ "logps/rejected": -326.0,
392
+ "loss": 0.451,
393
+ "rewards/accuracies": 0.862500011920929,
394
+ "rewards/chosen": -0.53515625,
395
+ "rewards/margins": 1.71875,
396
+ "rewards/rejected": -2.25,
397
+ "step": 110
398
+ },
399
+ {
400
+ "epoch": 0.1570827489481066,
401
+ "eval_logits/chosen": -3.046875,
402
+ "eval_logits/rejected": -3.078125,
403
+ "eval_logps/chosen": -326.0,
404
+ "eval_logps/rejected": -306.0,
405
+ "eval_loss": 0.4476098418235779,
406
+ "eval_rewards/accuracies": 0.813829779624939,
407
+ "eval_rewards/chosen": -0.25390625,
408
+ "eval_rewards/margins": 1.2734375,
409
+ "eval_rewards/rejected": -1.53125,
410
+ "eval_runtime": 65.5408,
411
+ "eval_samples_per_second": 22.688,
412
+ "eval_steps_per_second": 0.717,
413
+ "step": 112
414
+ },
415
+ {
416
+ "epoch": 0.16830294530154277,
417
+ "grad_norm": 35.62825840397104,
418
+ "learning_rate": 4.931139161987398e-07,
419
+ "logits/chosen": -2.953125,
420
+ "logits/rejected": -2.9375,
421
+ "logps/chosen": -366.0,
422
+ "logps/rejected": -360.0,
423
+ "loss": 0.4475,
424
+ "rewards/accuracies": 0.8125,
425
+ "rewards/chosen": -0.5625,
426
+ "rewards/margins": 1.4453125,
427
+ "rewards/rejected": -2.015625,
428
+ "step": 120
429
+ },
430
+ {
431
+ "epoch": 0.16830294530154277,
432
+ "eval_logits/chosen": -3.03125,
433
+ "eval_logits/rejected": -3.0625,
434
+ "eval_logps/chosen": -344.0,
435
+ "eval_logps/rejected": -330.0,
436
+ "eval_loss": 0.4360468089580536,
437
+ "eval_rewards/accuracies": 0.8191489577293396,
438
+ "eval_rewards/chosen": -1.15625,
439
+ "eval_rewards/margins": 1.546875,
440
+ "eval_rewards/rejected": -2.703125,
441
+ "eval_runtime": 67.0228,
442
+ "eval_samples_per_second": 22.186,
443
+ "eval_steps_per_second": 0.701,
444
+ "step": 120
445
+ },
446
+ {
447
+ "epoch": 0.17952314165497896,
448
+ "eval_logits/chosen": -3.03125,
449
+ "eval_logits/rejected": -3.078125,
450
+ "eval_logps/chosen": -338.0,
451
+ "eval_logps/rejected": -322.0,
452
+ "eval_loss": 0.4318736791610718,
453
+ "eval_rewards/accuracies": 0.8404255509376526,
454
+ "eval_rewards/chosen": -0.77734375,
455
+ "eval_rewards/margins": 1.578125,
456
+ "eval_rewards/rejected": -2.359375,
457
+ "eval_runtime": 65.8922,
458
+ "eval_samples_per_second": 22.567,
459
+ "eval_steps_per_second": 0.713,
460
+ "step": 128
461
+ },
462
+ {
463
+ "epoch": 0.182328190743338,
464
+ "grad_norm": 53.386726765592485,
465
+ "learning_rate": 4.89967174133187e-07,
466
+ "logits/chosen": -2.859375,
467
+ "logits/rejected": -2.984375,
468
+ "logps/chosen": -392.0,
469
+ "logps/rejected": -338.0,
470
+ "loss": 0.46,
471
+ "rewards/accuracies": 0.762499988079071,
472
+ "rewards/chosen": -1.2421875,
473
+ "rewards/margins": 1.609375,
474
+ "rewards/rejected": -2.859375,
475
+ "step": 130
476
+ },
477
+ {
478
+ "epoch": 0.19074333800841514,
479
+ "eval_logits/chosen": -3.046875,
480
+ "eval_logits/rejected": -3.078125,
481
+ "eval_logps/chosen": -334.0,
482
+ "eval_logps/rejected": -318.0,
483
+ "eval_loss": 0.427120566368103,
484
+ "eval_rewards/accuracies": 0.835106372833252,
485
+ "eval_rewards/chosen": -0.58203125,
486
+ "eval_rewards/margins": 1.53125,
487
+ "eval_rewards/rejected": -2.109375,
488
+ "eval_runtime": 65.8368,
489
+ "eval_samples_per_second": 22.586,
490
+ "eval_steps_per_second": 0.714,
491
+ "step": 136
492
+ },
493
+ {
494
+ "epoch": 0.19635343618513323,
495
+ "grad_norm": 75.83519043403058,
496
+ "learning_rate": 4.862441321089378e-07,
497
+ "logits/chosen": -3.03125,
498
+ "logits/rejected": -3.140625,
499
+ "logps/chosen": -372.0,
500
+ "logps/rejected": -322.0,
501
+ "loss": 0.4198,
502
+ "rewards/accuracies": 0.8500000238418579,
503
+ "rewards/chosen": -0.5859375,
504
+ "rewards/margins": 1.5546875,
505
+ "rewards/rejected": -2.140625,
506
+ "step": 140
507
+ },
508
+ {
509
+ "epoch": 0.20196353436185133,
510
+ "eval_logits/chosen": -3.046875,
511
+ "eval_logits/rejected": -3.078125,
512
+ "eval_logps/chosen": -336.0,
513
+ "eval_logps/rejected": -322.0,
514
+ "eval_loss": 0.4199545085430145,
515
+ "eval_rewards/accuracies": 0.8244680762290955,
516
+ "eval_rewards/chosen": -0.67578125,
517
+ "eval_rewards/margins": 1.6953125,
518
+ "eval_rewards/rejected": -2.375,
519
+ "eval_runtime": 65.5367,
520
+ "eval_samples_per_second": 22.69,
521
+ "eval_steps_per_second": 0.717,
522
+ "step": 144
523
+ },
524
+ {
525
+ "epoch": 0.21037868162692847,
526
+ "grad_norm": 53.861403761625574,
527
+ "learning_rate": 4.819537313029364e-07,
528
+ "logits/chosen": -3.03125,
529
+ "logits/rejected": -3.046875,
530
+ "logps/chosen": -314.0,
531
+ "logps/rejected": -322.0,
532
+ "loss": 0.3987,
533
+ "rewards/accuracies": 0.8999999761581421,
534
+ "rewards/chosen": -0.7890625,
535
+ "rewards/margins": 1.78125,
536
+ "rewards/rejected": -2.5625,
537
+ "step": 150
538
+ },
539
+ {
540
+ "epoch": 0.2131837307152875,
541
+ "eval_logits/chosen": -3.03125,
542
+ "eval_logits/rejected": -3.0625,
543
+ "eval_logps/chosen": -338.0,
544
+ "eval_logps/rejected": -326.0,
545
+ "eval_loss": 0.417427122592926,
546
+ "eval_rewards/accuracies": 0.8457446694374084,
547
+ "eval_rewards/chosen": -0.79296875,
548
+ "eval_rewards/margins": 1.75,
549
+ "eval_rewards/rejected": -2.546875,
550
+ "eval_runtime": 65.4524,
551
+ "eval_samples_per_second": 22.719,
552
+ "eval_steps_per_second": 0.718,
553
+ "step": 152
554
+ },
555
+ {
556
+ "epoch": 0.2244039270687237,
557
+ "grad_norm": 76.00085050984279,
558
+ "learning_rate": 4.771062754486677e-07,
559
+ "logits/chosen": -3.046875,
560
+ "logits/rejected": -3.0625,
561
+ "logps/chosen": -350.0,
562
+ "logps/rejected": -316.0,
563
+ "loss": 0.4084,
564
+ "rewards/accuracies": 0.9125000238418579,
565
+ "rewards/chosen": -0.69921875,
566
+ "rewards/margins": 1.921875,
567
+ "rewards/rejected": -2.625,
568
+ "step": 160
569
+ },
570
+ {
571
+ "epoch": 0.2244039270687237,
572
+ "eval_logits/chosen": -3.0625,
573
+ "eval_logits/rejected": -3.09375,
574
+ "eval_logps/chosen": -330.0,
575
+ "eval_logps/rejected": -320.0,
576
+ "eval_loss": 0.4198690950870514,
577
+ "eval_rewards/accuracies": 0.8404255509376526,
578
+ "eval_rewards/chosen": -0.451171875,
579
+ "eval_rewards/margins": 1.828125,
580
+ "eval_rewards/rejected": -2.28125,
581
+ "eval_runtime": 65.9805,
582
+ "eval_samples_per_second": 22.537,
583
+ "eval_steps_per_second": 0.712,
584
+ "step": 160
585
+ },
586
+ {
587
+ "epoch": 0.23562412342215988,
588
+ "eval_logits/chosen": -3.0625,
589
+ "eval_logits/rejected": -3.09375,
590
+ "eval_logps/chosen": -340.0,
591
+ "eval_logps/rejected": -332.0,
592
+ "eval_loss": 0.41553688049316406,
593
+ "eval_rewards/accuracies": 0.835106372833252,
594
+ "eval_rewards/chosen": -0.9296875,
595
+ "eval_rewards/margins": 1.9296875,
596
+ "eval_rewards/rejected": -2.859375,
597
+ "eval_runtime": 66.758,
598
+ "eval_samples_per_second": 22.274,
599
+ "eval_steps_per_second": 0.704,
600
+ "step": 168
601
+ },
602
+ {
603
+ "epoch": 0.23842917251051893,
604
+ "grad_norm": 81.43616859600893,
605
+ "learning_rate": 4.717134060909331e-07,
606
+ "logits/chosen": -2.9375,
607
+ "logits/rejected": -2.75,
608
+ "logps/chosen": -300.0,
609
+ "logps/rejected": -308.0,
610
+ "loss": 0.4194,
611
+ "rewards/accuracies": 0.887499988079071,
612
+ "rewards/chosen": -1.0,
613
+ "rewards/margins": 2.296875,
614
+ "rewards/rejected": -3.296875,
615
+ "step": 170
616
+ },
617
+ {
618
+ "epoch": 0.24684431977559607,
619
+ "eval_logits/chosen": -3.0625,
620
+ "eval_logits/rejected": -3.109375,
621
+ "eval_logps/chosen": -326.0,
622
+ "eval_logps/rejected": -312.0,
623
+ "eval_loss": 0.4193520247936249,
624
+ "eval_rewards/accuracies": 0.813829779624939,
625
+ "eval_rewards/chosen": -0.2197265625,
626
+ "eval_rewards/margins": 1.6484375,
627
+ "eval_rewards/rejected": -1.8671875,
628
+ "eval_runtime": 67.0437,
629
+ "eval_samples_per_second": 22.18,
630
+ "eval_steps_per_second": 0.701,
631
+ "step": 176
632
+ },
633
+ {
634
+ "epoch": 0.25245441795231416,
635
+ "grad_norm": 41.336420518994295,
636
+ "learning_rate": 4.6578807462777004e-07,
637
+ "logits/chosen": -3.078125,
638
+ "logits/rejected": -3.015625,
639
+ "logps/chosen": -298.0,
640
+ "logps/rejected": -352.0,
641
+ "loss": 0.4338,
642
+ "rewards/accuracies": 0.875,
643
+ "rewards/chosen": -0.6015625,
644
+ "rewards/margins": 1.5234375,
645
+ "rewards/rejected": -2.125,
646
+ "step": 180
647
+ },
648
+ {
649
+ "epoch": 0.25806451612903225,
650
+ "eval_logits/chosen": -3.0625,
651
+ "eval_logits/rejected": -3.109375,
652
+ "eval_logps/chosen": -322.0,
653
+ "eval_logps/rejected": -306.0,
654
+ "eval_loss": 0.4271917939186096,
655
+ "eval_rewards/accuracies": 0.8191489577293396,
656
+ "eval_rewards/chosen": -0.0283203125,
657
+ "eval_rewards/margins": 1.5234375,
658
+ "eval_rewards/rejected": -1.546875,
659
+ "eval_runtime": 65.8831,
660
+ "eval_samples_per_second": 22.57,
661
+ "eval_steps_per_second": 0.713,
662
+ "step": 184
663
+ },
664
+ {
665
+ "epoch": 0.2664796633941094,
666
+ "grad_norm": 32.121143393496304,
667
+ "learning_rate": 4.593445112066553e-07,
668
+ "logits/chosen": -3.015625,
669
+ "logits/rejected": -3.09375,
670
+ "logps/chosen": -394.0,
671
+ "logps/rejected": -354.0,
672
+ "loss": 0.415,
673
+ "rewards/accuracies": 0.875,
674
+ "rewards/chosen": 0.0849609375,
675
+ "rewards/margins": 1.84375,
676
+ "rewards/rejected": -1.7578125,
677
+ "step": 190
678
+ },
679
+ {
680
+ "epoch": 0.26928471248246844,
681
+ "eval_logits/chosen": -3.0625,
682
+ "eval_logits/rejected": -3.109375,
683
+ "eval_logps/chosen": -338.0,
684
+ "eval_logps/rejected": -326.0,
685
+ "eval_loss": 0.4112021028995514,
686
+ "eval_rewards/accuracies": 0.8297872543334961,
687
+ "eval_rewards/chosen": -0.84765625,
688
+ "eval_rewards/margins": 1.7265625,
689
+ "eval_rewards/rejected": -2.578125,
690
+ "eval_runtime": 65.4321,
691
+ "eval_samples_per_second": 22.726,
692
+ "eval_steps_per_second": 0.718,
693
+ "step": 192
694
+ },
695
+ {
696
+ "epoch": 0.2805049088359046,
697
+ "grad_norm": 128.88104657311357,
698
+ "learning_rate": 4.523981905496914e-07,
699
+ "logits/chosen": -2.859375,
700
+ "logits/rejected": -2.9375,
701
+ "logps/chosen": -332.0,
702
+ "logps/rejected": -342.0,
703
+ "loss": 0.4038,
704
+ "rewards/accuracies": 0.925000011920929,
705
+ "rewards/chosen": -0.8203125,
706
+ "rewards/margins": 2.15625,
707
+ "rewards/rejected": -2.984375,
708
+ "step": 200
709
+ },
710
+ {
711
+ "epoch": 0.2805049088359046,
712
+ "eval_logits/chosen": -3.078125,
713
+ "eval_logits/rejected": -3.109375,
714
+ "eval_logps/chosen": -344.0,
715
+ "eval_logps/rejected": -334.0,
716
+ "eval_loss": 0.4111826717853546,
717
+ "eval_rewards/accuracies": 0.792553186416626,
718
+ "eval_rewards/chosen": -1.109375,
719
+ "eval_rewards/margins": 1.8125,
720
+ "eval_rewards/rejected": -2.921875,
721
+ "eval_runtime": 65.5119,
722
+ "eval_samples_per_second": 22.698,
723
+ "eval_steps_per_second": 0.717,
724
+ "step": 200
725
+ },
726
+ {
727
+ "epoch": 0.2917251051893408,
728
+ "eval_logits/chosen": -3.09375,
729
+ "eval_logits/rejected": -3.125,
730
+ "eval_logps/chosen": -326.0,
731
+ "eval_logps/rejected": -314.0,
732
+ "eval_loss": 0.41739681363105774,
733
+ "eval_rewards/accuracies": 0.8191489577293396,
734
+ "eval_rewards/chosen": -0.173828125,
735
+ "eval_rewards/margins": 1.7421875,
736
+ "eval_rewards/rejected": -1.9140625,
737
+ "eval_runtime": 65.6569,
738
+ "eval_samples_per_second": 22.648,
739
+ "eval_steps_per_second": 0.716,
740
+ "step": 208
741
+ },
742
+ {
743
+ "epoch": 0.29453015427769985,
744
+ "grad_norm": 33.6284095972861,
745
+ "learning_rate": 4.4496579478984995e-07,
746
+ "logits/chosen": -3.0,
747
+ "logits/rejected": -2.96875,
748
+ "logps/chosen": -308.0,
749
+ "logps/rejected": -296.0,
750
+ "loss": 0.4119,
751
+ "rewards/accuracies": 0.824999988079071,
752
+ "rewards/chosen": -0.91015625,
753
+ "rewards/margins": 1.84375,
754
+ "rewards/rejected": -2.75,
755
+ "step": 210
756
+ },
757
+ {
758
+ "epoch": 0.302945301542777,
759
+ "eval_logits/chosen": -3.125,
760
+ "eval_logits/rejected": -3.15625,
761
+ "eval_logps/chosen": -330.0,
762
+ "eval_logps/rejected": -320.0,
763
+ "eval_loss": 0.4090508818626404,
764
+ "eval_rewards/accuracies": 0.8191489577293396,
765
+ "eval_rewards/chosen": -0.45703125,
766
+ "eval_rewards/margins": 1.796875,
767
+ "eval_rewards/rejected": -2.25,
768
+ "eval_runtime": 65.408,
769
+ "eval_samples_per_second": 22.734,
770
+ "eval_steps_per_second": 0.719,
771
+ "step": 216
772
+ },
773
+ {
774
+ "epoch": 0.3085553997194951,
775
+ "grad_norm": 50.722207221810784,
776
+ "learning_rate": 4.370651734075229e-07,
777
+ "logits/chosen": -2.890625,
778
+ "logits/rejected": -2.859375,
779
+ "logps/chosen": -352.0,
780
+ "logps/rejected": -356.0,
781
+ "loss": 0.3826,
782
+ "rewards/accuracies": 0.8999999761581421,
783
+ "rewards/chosen": -0.62109375,
784
+ "rewards/margins": 1.953125,
785
+ "rewards/rejected": -2.578125,
786
+ "step": 220
787
+ },
788
+ {
789
+ "epoch": 0.3141654978962132,
790
+ "eval_logits/chosen": -3.109375,
791
+ "eval_logits/rejected": -3.140625,
792
+ "eval_logps/chosen": -352.0,
793
+ "eval_logps/rejected": -342.0,
794
+ "eval_loss": 0.41070857644081116,
795
+ "eval_rewards/accuracies": 0.8031914830207825,
796
+ "eval_rewards/chosen": -1.5,
797
+ "eval_rewards/margins": 1.8125,
798
+ "eval_rewards/rejected": -3.3125,
799
+ "eval_runtime": 65.3108,
800
+ "eval_samples_per_second": 22.768,
801
+ "eval_steps_per_second": 0.72,
802
+ "step": 224
803
+ },
804
+ {
805
+ "epoch": 0.3225806451612903,
806
+ "grad_norm": 30.84942010137543,
807
+ "learning_rate": 4.2871530036359783e-07,
808
+ "logits/chosen": -3.15625,
809
+ "logits/rejected": -3.15625,
810
+ "logps/chosen": -348.0,
811
+ "logps/rejected": -336.0,
812
+ "loss": 0.3878,
813
+ "rewards/accuracies": 0.887499988079071,
814
+ "rewards/chosen": -1.671875,
815
+ "rewards/margins": 2.125,
816
+ "rewards/rejected": -3.796875,
817
+ "step": 230
818
+ },
819
+ {
820
+ "epoch": 0.32538569424964936,
821
+ "eval_logits/chosen": -3.109375,
822
+ "eval_logits/rejected": -3.140625,
823
+ "eval_logps/chosen": -346.0,
824
+ "eval_logps/rejected": -338.0,
825
+ "eval_loss": 0.409851610660553,
826
+ "eval_rewards/accuracies": 0.8244680762290955,
827
+ "eval_rewards/chosen": -1.2265625,
828
+ "eval_rewards/margins": 1.8984375,
829
+ "eval_rewards/rejected": -3.125,
830
+ "eval_runtime": 65.7782,
831
+ "eval_samples_per_second": 22.606,
832
+ "eval_steps_per_second": 0.715,
833
+ "step": 232
834
+ },
835
+ {
836
+ "epoch": 0.33660589060308554,
837
+ "grad_norm": 69.24941711385972,
838
+ "learning_rate": 4.1993622853200526e-07,
839
+ "logits/chosen": -3.078125,
840
+ "logits/rejected": -3.078125,
841
+ "logps/chosen": -316.0,
842
+ "logps/rejected": -336.0,
843
+ "loss": 0.3852,
844
+ "rewards/accuracies": 0.824999988079071,
845
+ "rewards/chosen": -1.3203125,
846
+ "rewards/margins": 1.953125,
847
+ "rewards/rejected": -3.265625,
848
+ "step": 240
849
+ },
850
+ {
851
+ "epoch": 0.33660589060308554,
852
+ "eval_logits/chosen": -3.125,
853
+ "eval_logits/rejected": -3.15625,
854
+ "eval_logps/chosen": -336.0,
855
+ "eval_logps/rejected": -328.0,
856
+ "eval_loss": 0.40991342067718506,
857
+ "eval_rewards/accuracies": 0.8244680762290955,
858
+ "eval_rewards/chosen": -0.69140625,
859
+ "eval_rewards/margins": 1.9453125,
860
+ "eval_rewards/rejected": -2.640625,
861
+ "eval_runtime": 65.801,
862
+ "eval_samples_per_second": 22.598,
863
+ "eval_steps_per_second": 0.714,
864
+ "step": 240
865
+ },
866
+ {
867
+ "epoch": 0.34782608695652173,
868
+ "eval_logits/chosen": -3.109375,
869
+ "eval_logits/rejected": -3.140625,
870
+ "eval_logps/chosen": -340.0,
871
+ "eval_logps/rejected": -330.0,
872
+ "eval_loss": 0.4066612720489502,
873
+ "eval_rewards/accuracies": 0.835106372833252,
874
+ "eval_rewards/chosen": -0.87890625,
875
+ "eval_rewards/margins": 1.8828125,
876
+ "eval_rewards/rejected": -2.765625,
877
+ "eval_runtime": 65.3251,
878
+ "eval_samples_per_second": 22.763,
879
+ "eval_steps_per_second": 0.719,
880
+ "step": 248
881
+ },
882
+ {
883
+ "epoch": 0.3506311360448808,
884
+ "grad_norm": 47.10180307111659,
885
+ "learning_rate": 4.107490415411714e-07,
886
+ "logits/chosen": -3.03125,
887
+ "logits/rejected": -3.078125,
888
+ "logps/chosen": -422.0,
889
+ "logps/rejected": -310.0,
890
+ "loss": 0.4338,
891
+ "rewards/accuracies": 0.800000011920929,
892
+ "rewards/chosen": -1.09375,
893
+ "rewards/margins": 1.515625,
894
+ "rewards/rejected": -2.609375,
895
+ "step": 250
896
+ },
897
+ {
898
+ "epoch": 0.3590462833099579,
899
+ "eval_logits/chosen": -3.09375,
900
+ "eval_logits/rejected": -3.140625,
901
+ "eval_logps/chosen": -342.0,
902
+ "eval_logps/rejected": -334.0,
903
+ "eval_loss": 0.4039236903190613,
904
+ "eval_rewards/accuracies": 0.8244680762290955,
905
+ "eval_rewards/chosen": -1.0390625,
906
+ "eval_rewards/margins": 1.9296875,
907
+ "eval_rewards/rejected": -2.96875,
908
+ "eval_runtime": 65.7057,
909
+ "eval_samples_per_second": 22.631,
910
+ "eval_steps_per_second": 0.715,
911
+ "step": 256
912
+ },
913
+ {
914
+ "epoch": 0.364656381486676,
915
+ "grad_norm": 37.27672395240747,
916
+ "learning_rate": 4.01175803140034e-07,
917
+ "logits/chosen": -3.09375,
918
+ "logits/rejected": -3.140625,
919
+ "logps/chosen": -370.0,
920
+ "logps/rejected": -332.0,
921
+ "loss": 0.3974,
922
+ "rewards/accuracies": 0.887499988079071,
923
+ "rewards/chosen": -1.328125,
924
+ "rewards/margins": 2.265625,
925
+ "rewards/rejected": -3.59375,
926
+ "step": 260
927
+ },
928
+ {
929
+ "epoch": 0.3702664796633941,
930
+ "eval_logits/chosen": -3.09375,
931
+ "eval_logits/rejected": -3.125,
932
+ "eval_logps/chosen": -336.0,
933
+ "eval_logps/rejected": -328.0,
934
+ "eval_loss": 0.40224307775497437,
935
+ "eval_rewards/accuracies": 0.8191489577293396,
936
+ "eval_rewards/chosen": -0.72265625,
937
+ "eval_rewards/margins": 1.875,
938
+ "eval_rewards/rejected": -2.59375,
939
+ "eval_runtime": 65.6767,
940
+ "eval_samples_per_second": 22.641,
941
+ "eval_steps_per_second": 0.716,
942
+ "step": 264
943
+ },
944
+ {
945
+ "epoch": 0.37868162692847124,
946
+ "grad_norm": 32.86123128156331,
947
+ "learning_rate": 3.9123950421022135e-07,
948
+ "logits/chosen": -3.09375,
949
+ "logits/rejected": -3.015625,
950
+ "logps/chosen": -326.0,
951
+ "logps/rejected": -328.0,
952
+ "loss": 0.4019,
953
+ "rewards/accuracies": 0.8500000238418579,
954
+ "rewards/chosen": -0.921875,
955
+ "rewards/margins": 1.90625,
956
+ "rewards/rejected": -2.828125,
957
+ "step": 270
958
+ },
959
+ {
960
+ "epoch": 0.3814866760168303,
961
+ "eval_logits/chosen": -3.09375,
962
+ "eval_logits/rejected": -3.125,
963
+ "eval_logps/chosen": -332.0,
964
+ "eval_logps/rejected": -320.0,
965
+ "eval_loss": 0.40509188175201416,
966
+ "eval_rewards/accuracies": 0.8457446694374084,
967
+ "eval_rewards/chosen": -0.5546875,
968
+ "eval_rewards/margins": 1.7109375,
969
+ "eval_rewards/rejected": -2.265625,
970
+ "eval_runtime": 65.6219,
971
+ "eval_samples_per_second": 22.66,
972
+ "eval_steps_per_second": 0.716,
973
+ "step": 272
974
+ },
975
+ {
976
+ "epoch": 0.39270687237026647,
977
+ "grad_norm": 34.91842459517312,
978
+ "learning_rate": 3.8096400755164976e-07,
979
+ "logits/chosen": -3.0625,
980
+ "logits/rejected": -3.03125,
981
+ "logps/chosen": -340.0,
982
+ "logps/rejected": -328.0,
983
+ "loss": 0.3903,
984
+ "rewards/accuracies": 0.887499988079071,
985
+ "rewards/chosen": -0.984375,
986
+ "rewards/margins": 1.7265625,
987
+ "rewards/rejected": -2.703125,
988
+ "step": 280
989
+ },
990
+ {
991
+ "epoch": 0.39270687237026647,
992
+ "eval_logits/chosen": -3.09375,
993
+ "eval_logits/rejected": -3.125,
994
+ "eval_logps/chosen": -338.0,
995
+ "eval_logps/rejected": -330.0,
996
+ "eval_loss": 0.4004907011985779,
997
+ "eval_rewards/accuracies": 0.8617021441459656,
998
+ "eval_rewards/chosen": -0.84765625,
999
+ "eval_rewards/margins": 1.875,
1000
+ "eval_rewards/rejected": -2.71875,
1001
+ "eval_runtime": 66.0203,
1002
+ "eval_samples_per_second": 22.523,
1003
+ "eval_steps_per_second": 0.712,
1004
+ "step": 280
1005
+ },
1006
+ {
1007
+ "epoch": 0.40392706872370265,
1008
+ "eval_logits/chosen": -3.078125,
1009
+ "eval_logits/rejected": -3.109375,
1010
+ "eval_logps/chosen": -348.0,
1011
+ "eval_logps/rejected": -340.0,
1012
+ "eval_loss": 0.3978319466114044,
1013
+ "eval_rewards/accuracies": 0.8563829660415649,
1014
+ "eval_rewards/chosen": -1.265625,
1015
+ "eval_rewards/margins": 2.0,
1016
+ "eval_rewards/rejected": -3.265625,
1017
+ "eval_runtime": 65.7904,
1018
+ "eval_samples_per_second": 22.602,
1019
+ "eval_steps_per_second": 0.714,
1020
+ "step": 288
1021
+ },
1022
+ {
1023
+ "epoch": 0.4067321178120617,
1024
+ "grad_norm": 48.315535907518516,
1025
+ "learning_rate": 3.7037399057414135e-07,
1026
+ "logits/chosen": -2.890625,
1027
+ "logits/rejected": -2.96875,
1028
+ "logps/chosen": -314.0,
1029
+ "logps/rejected": -332.0,
1030
+ "loss": 0.3948,
1031
+ "rewards/accuracies": 0.9375,
1032
+ "rewards/chosen": -1.2421875,
1033
+ "rewards/margins": 2.15625,
1034
+ "rewards/rejected": -3.40625,
1035
+ "step": 290
1036
+ },
1037
+ {
1038
+ "epoch": 0.41514726507713884,
1039
+ "eval_logits/chosen": -3.09375,
1040
+ "eval_logits/rejected": -3.125,
1041
+ "eval_logps/chosen": -334.0,
1042
+ "eval_logps/rejected": -326.0,
1043
+ "eval_loss": 0.39696547389030457,
1044
+ "eval_rewards/accuracies": 0.8297872543334961,
1045
+ "eval_rewards/chosen": -0.65234375,
1046
+ "eval_rewards/margins": 1.90625,
1047
+ "eval_rewards/rejected": -2.5625,
1048
+ "eval_runtime": 65.6246,
1049
+ "eval_samples_per_second": 22.659,
1050
+ "eval_steps_per_second": 0.716,
1051
+ "step": 296
1052
+ },
1053
+ {
1054
+ "epoch": 0.42075736325385693,
1055
+ "grad_norm": 31.56207798451172,
1056
+ "learning_rate": 3.594948860326918e-07,
1057
+ "logits/chosen": -3.078125,
1058
+ "logits/rejected": -3.09375,
1059
+ "logps/chosen": -304.0,
1060
+ "logps/rejected": -314.0,
1061
+ "loss": 0.3894,
1062
+ "rewards/accuracies": 0.887499988079071,
1063
+ "rewards/chosen": -0.8203125,
1064
+ "rewards/margins": 1.9453125,
1065
+ "rewards/rejected": -2.765625,
1066
+ "step": 300
1067
+ },
1068
+ {
1069
+ "epoch": 0.426367461430575,
1070
+ "eval_logits/chosen": -3.078125,
1071
+ "eval_logits/rejected": -3.109375,
1072
+ "eval_logps/chosen": -344.0,
1073
+ "eval_logps/rejected": -334.0,
1074
+ "eval_loss": 0.39327114820480347,
1075
+ "eval_rewards/accuracies": 0.8404255509376526,
1076
+ "eval_rewards/chosen": -1.109375,
1077
+ "eval_rewards/margins": 1.8671875,
1078
+ "eval_rewards/rejected": -2.984375,
1079
+ "eval_runtime": 66.1549,
1080
+ "eval_samples_per_second": 22.478,
1081
+ "eval_steps_per_second": 0.71,
1082
+ "step": 304
1083
+ },
1084
+ {
1085
+ "epoch": 0.43478260869565216,
1086
+ "grad_norm": 33.392648021583874,
1087
+ "learning_rate": 3.4835282094871775e-07,
1088
+ "logits/chosen": -3.046875,
1089
+ "logits/rejected": -3.078125,
1090
+ "logps/chosen": -380.0,
1091
+ "logps/rejected": -360.0,
1092
+ "loss": 0.3824,
1093
+ "rewards/accuracies": 0.9125000238418579,
1094
+ "rewards/chosen": -1.2421875,
1095
+ "rewards/margins": 2.0,
1096
+ "rewards/rejected": -3.25,
1097
+ "step": 310
1098
+ },
1099
+ {
1100
+ "epoch": 0.4375876577840112,
1101
+ "eval_logits/chosen": -3.078125,
1102
+ "eval_logits/rejected": -3.109375,
1103
+ "eval_logps/chosen": -350.0,
1104
+ "eval_logps/rejected": -342.0,
1105
+ "eval_loss": 0.39230719208717346,
1106
+ "eval_rewards/accuracies": 0.835106372833252,
1107
+ "eval_rewards/chosen": -1.4140625,
1108
+ "eval_rewards/margins": 1.9296875,
1109
+ "eval_rewards/rejected": -3.34375,
1110
+ "eval_runtime": 65.8562,
1111
+ "eval_samples_per_second": 22.58,
1112
+ "eval_steps_per_second": 0.714,
1113
+ "step": 312
1114
+ },
1115
+ {
1116
+ "epoch": 0.4488078541374474,
1117
+ "grad_norm": 92.67447234773125,
1118
+ "learning_rate": 3.369745538639694e-07,
1119
+ "logits/chosen": -3.0,
1120
+ "logits/rejected": -2.96875,
1121
+ "logps/chosen": -378.0,
1122
+ "logps/rejected": -316.0,
1123
+ "loss": 0.422,
1124
+ "rewards/accuracies": 0.8500000238418579,
1125
+ "rewards/chosen": -1.3046875,
1126
+ "rewards/margins": 2.1875,
1127
+ "rewards/rejected": -3.484375,
1128
+ "step": 320
1129
+ },
1130
+ {
1131
+ "epoch": 0.4488078541374474,
1132
+ "eval_logits/chosen": -3.09375,
1133
+ "eval_logits/rejected": -3.125,
1134
+ "eval_logps/chosen": -336.0,
1135
+ "eval_logps/rejected": -332.0,
1136
+ "eval_loss": 0.38914069533348083,
1137
+ "eval_rewards/accuracies": 0.835106372833252,
1138
+ "eval_rewards/chosen": -0.74609375,
1139
+ "eval_rewards/margins": 2.046875,
1140
+ "eval_rewards/rejected": -2.796875,
1141
+ "eval_runtime": 65.8877,
1142
+ "eval_samples_per_second": 22.569,
1143
+ "eval_steps_per_second": 0.713,
1144
+ "step": 320
1145
+ },
1146
+ {
1147
+ "epoch": 0.4600280504908836,
1148
+ "eval_logits/chosen": -3.09375,
1149
+ "eval_logits/rejected": -3.125,
1150
+ "eval_logps/chosen": -340.0,
1151
+ "eval_logps/rejected": -336.0,
1152
+ "eval_loss": 0.3886280059814453,
1153
+ "eval_rewards/accuracies": 0.835106372833252,
1154
+ "eval_rewards/chosen": -0.9296875,
1155
+ "eval_rewards/margins": 2.078125,
1156
+ "eval_rewards/rejected": -3.0,
1157
+ "eval_runtime": 65.7911,
1158
+ "eval_samples_per_second": 22.602,
1159
+ "eval_steps_per_second": 0.714,
1160
+ "step": 328
1161
+ },
1162
+ {
1163
+ "epoch": 0.4628330995792426,
1164
+ "grad_norm": 44.34559171874966,
1165
+ "learning_rate": 3.2538741057779675e-07,
1166
+ "logits/chosen": -2.953125,
1167
+ "logits/rejected": -2.84375,
1168
+ "logps/chosen": -376.0,
1169
+ "logps/rejected": -360.0,
1170
+ "loss": 0.3761,
1171
+ "rewards/accuracies": 0.9125000238418579,
1172
+ "rewards/chosen": -0.87890625,
1173
+ "rewards/margins": 2.46875,
1174
+ "rewards/rejected": -3.34375,
1175
+ "step": 330
1176
+ },
1177
+ {
1178
+ "epoch": 0.47124824684431976,
1179
+ "eval_logits/chosen": -3.09375,
1180
+ "eval_logits/rejected": -3.125,
1181
+ "eval_logps/chosen": -336.0,
1182
+ "eval_logps/rejected": -330.0,
1183
+ "eval_loss": 0.38978397846221924,
1184
+ "eval_rewards/accuracies": 0.8191489577293396,
1185
+ "eval_rewards/chosen": -0.6796875,
1186
+ "eval_rewards/margins": 2.0625,
1187
+ "eval_rewards/rejected": -2.75,
1188
+ "eval_runtime": 66.1797,
1189
+ "eval_samples_per_second": 22.469,
1190
+ "eval_steps_per_second": 0.71,
1191
+ "step": 336
1192
+ },
1193
+ {
1194
+ "epoch": 0.47685834502103785,
1195
+ "grad_norm": 35.10773780257282,
1196
+ "learning_rate": 3.136192185221032e-07,
1197
+ "logits/chosen": -3.03125,
1198
+ "logits/rejected": -2.96875,
1199
+ "logps/chosen": -350.0,
1200
+ "logps/rejected": -366.0,
1201
+ "loss": 0.3534,
1202
+ "rewards/accuracies": 0.875,
1203
+ "rewards/chosen": -0.9609375,
1204
+ "rewards/margins": 2.28125,
1205
+ "rewards/rejected": -3.25,
1206
+ "step": 340
1207
+ },
1208
+ {
1209
+ "epoch": 0.48246844319775595,
1210
+ "eval_logits/chosen": -3.078125,
1211
+ "eval_logits/rejected": -3.109375,
1212
+ "eval_logps/chosen": -340.0,
1213
+ "eval_logps/rejected": -336.0,
1214
+ "eval_loss": 0.3868924677371979,
1215
+ "eval_rewards/accuracies": 0.8563829660415649,
1216
+ "eval_rewards/chosen": -0.953125,
1217
+ "eval_rewards/margins": 2.078125,
1218
+ "eval_rewards/rejected": -3.03125,
1219
+ "eval_runtime": 66.0426,
1220
+ "eval_samples_per_second": 22.516,
1221
+ "eval_steps_per_second": 0.712,
1222
+ "step": 344
1223
+ },
1224
+ {
1225
+ "epoch": 0.4908835904628331,
1226
+ "grad_norm": 31.140190663491566,
1227
+ "learning_rate": 3.016982399315888e-07,
1228
+ "logits/chosen": -3.078125,
1229
+ "logits/rejected": -3.09375,
1230
+ "logps/chosen": -374.0,
1231
+ "logps/rejected": -332.0,
1232
+ "loss": 0.3601,
1233
+ "rewards/accuracies": 0.925000011920929,
1234
+ "rewards/chosen": -1.234375,
1235
+ "rewards/margins": 2.421875,
1236
+ "rewards/rejected": -3.65625,
1237
+ "step": 350
1238
+ },
1239
+ {
1240
+ "epoch": 0.49368863955119213,
1241
+ "eval_logits/chosen": -3.0625,
1242
+ "eval_logits/rejected": -3.09375,
1243
+ "eval_logps/chosen": -342.0,
1244
+ "eval_logps/rejected": -338.0,
1245
+ "eval_loss": 0.38676464557647705,
1246
+ "eval_rewards/accuracies": 0.8510638475418091,
1247
+ "eval_rewards/chosen": -1.0078125,
1248
+ "eval_rewards/margins": 2.140625,
1249
+ "eval_rewards/rejected": -3.15625,
1250
+ "eval_runtime": 65.739,
1251
+ "eval_samples_per_second": 22.62,
1252
+ "eval_steps_per_second": 0.715,
1253
+ "step": 352
1254
+ },
1255
+ {
1256
+ "epoch": 0.5049088359046283,
1257
+ "grad_norm": 47.07813704000649,
1258
+ "learning_rate": 2.896531039697801e-07,
1259
+ "logits/chosen": -3.03125,
1260
+ "logits/rejected": -3.078125,
1261
+ "logps/chosen": -334.0,
1262
+ "logps/rejected": -330.0,
1263
+ "loss": 0.3996,
1264
+ "rewards/accuracies": 0.8999999761581421,
1265
+ "rewards/chosen": -1.40625,
1266
+ "rewards/margins": 2.265625,
1267
+ "rewards/rejected": -3.671875,
1268
+ "step": 360
1269
+ },
1270
+ {
1271
+ "epoch": 0.5049088359046283,
1272
+ "eval_logits/chosen": -3.03125,
1273
+ "eval_logits/rejected": -3.0625,
1274
+ "eval_logps/chosen": -348.0,
1275
+ "eval_logps/rejected": -344.0,
1276
+ "eval_loss": 0.3909456133842468,
1277
+ "eval_rewards/accuracies": 0.8404255509376526,
1278
+ "eval_rewards/chosen": -1.2734375,
1279
+ "eval_rewards/margins": 2.171875,
1280
+ "eval_rewards/rejected": -3.4375,
1281
+ "eval_runtime": 66.2809,
1282
+ "eval_samples_per_second": 22.435,
1283
+ "eval_steps_per_second": 0.709,
1284
+ "step": 360
1285
+ },
1286
+ {
1287
+ "epoch": 0.5161290322580645,
1288
+ "eval_logits/chosen": -3.03125,
1289
+ "eval_logits/rejected": -3.046875,
1290
+ "eval_logps/chosen": -350.0,
1291
+ "eval_logps/rejected": -344.0,
1292
+ "eval_loss": 0.3894420564174652,
1293
+ "eval_rewards/accuracies": 0.8297872543334961,
1294
+ "eval_rewards/chosen": -1.375,
1295
+ "eval_rewards/margins": 2.109375,
1296
+ "eval_rewards/rejected": -3.484375,
1297
+ "eval_runtime": 65.5848,
1298
+ "eval_samples_per_second": 22.673,
1299
+ "eval_steps_per_second": 0.717,
1300
+ "step": 368
1301
+ },
1302
+ {
1303
+ "epoch": 0.5189340813464236,
1304
+ "grad_norm": 33.796854317256766,
1305
+ "learning_rate": 2.7751273797385324e-07,
1306
+ "logits/chosen": -2.921875,
1307
+ "logits/rejected": -2.796875,
1308
+ "logps/chosen": -364.0,
1309
+ "logps/rejected": -338.0,
1310
+ "loss": 0.3591,
1311
+ "rewards/accuracies": 0.8374999761581421,
1312
+ "rewards/chosen": -1.4921875,
1313
+ "rewards/margins": 2.390625,
1314
+ "rewards/rejected": -3.890625,
1315
+ "step": 370
1316
+ },
1317
+ {
1318
+ "epoch": 0.5273492286115007,
1319
+ "eval_logits/chosen": -3.046875,
1320
+ "eval_logits/rejected": -3.078125,
1321
+ "eval_logps/chosen": -338.0,
1322
+ "eval_logps/rejected": -334.0,
1323
+ "eval_loss": 0.38463667035102844,
1324
+ "eval_rewards/accuracies": 0.8404255509376526,
1325
+ "eval_rewards/chosen": -0.8203125,
1326
+ "eval_rewards/margins": 2.15625,
1327
+ "eval_rewards/rejected": -2.984375,
1328
+ "eval_runtime": 65.7596,
1329
+ "eval_samples_per_second": 22.613,
1330
+ "eval_steps_per_second": 0.715,
1331
+ "step": 376
1332
+ },
1333
+ {
1334
+ "epoch": 0.5329593267882188,
1335
+ "grad_norm": 40.02993475158031,
1336
+ "learning_rate": 2.6530629798336756e-07,
1337
+ "logits/chosen": -3.0,
1338
+ "logits/rejected": -2.96875,
1339
+ "logps/chosen": -364.0,
1340
+ "logps/rejected": -372.0,
1341
+ "loss": 0.3726,
1342
+ "rewards/accuracies": 0.875,
1343
+ "rewards/chosen": -0.90625,
1344
+ "rewards/margins": 2.34375,
1345
+ "rewards/rejected": -3.25,
1346
+ "step": 380
1347
+ },
1348
+ {
1349
+ "epoch": 0.5385694249649369,
1350
+ "eval_logits/chosen": -3.0625,
1351
+ "eval_logits/rejected": -3.09375,
1352
+ "eval_logps/chosen": -340.0,
1353
+ "eval_logps/rejected": -334.0,
1354
+ "eval_loss": 0.3832222521305084,
1355
+ "eval_rewards/accuracies": 0.8404255509376526,
1356
+ "eval_rewards/chosen": -0.87109375,
1357
+ "eval_rewards/margins": 2.078125,
1358
+ "eval_rewards/rejected": -2.953125,
1359
+ "eval_runtime": 65.913,
1360
+ "eval_samples_per_second": 22.56,
1361
+ "eval_steps_per_second": 0.713,
1362
+ "step": 384
1363
+ },
1364
+ {
1365
+ "epoch": 0.5469845722300141,
1366
+ "grad_norm": 24.220998890998306,
1367
+ "learning_rate": 2.5306309871975437e-07,
1368
+ "logits/chosen": -2.96875,
1369
+ "logits/rejected": -2.96875,
1370
+ "logps/chosen": -358.0,
1371
+ "logps/rejected": -366.0,
1372
+ "loss": 0.371,
1373
+ "rewards/accuracies": 0.875,
1374
+ "rewards/chosen": -0.953125,
1375
+ "rewards/margins": 2.265625,
1376
+ "rewards/rejected": -3.21875,
1377
+ "step": 390
1378
+ },
1379
+ {
1380
+ "epoch": 0.5497896213183731,
1381
+ "eval_logits/chosen": -3.078125,
1382
+ "eval_logits/rejected": -3.109375,
1383
+ "eval_logps/chosen": -340.0,
1384
+ "eval_logps/rejected": -334.0,
1385
+ "eval_loss": 0.38178983330726624,
1386
+ "eval_rewards/accuracies": 0.835106372833252,
1387
+ "eval_rewards/chosen": -0.8984375,
1388
+ "eval_rewards/margins": 2.046875,
1389
+ "eval_rewards/rejected": -2.953125,
1390
+ "eval_runtime": 65.9539,
1391
+ "eval_samples_per_second": 22.546,
1392
+ "eval_steps_per_second": 0.713,
1393
+ "step": 392
1394
+ },
1395
+ {
1396
+ "epoch": 0.5610098176718092,
1397
+ "grad_norm": 40.61458230374078,
1398
+ "learning_rate": 2.408125431847175e-07,
1399
+ "logits/chosen": -2.828125,
1400
+ "logits/rejected": -2.984375,
1401
+ "logps/chosen": -326.0,
1402
+ "logps/rejected": -360.0,
1403
+ "loss": 0.3546,
1404
+ "rewards/accuracies": 0.9624999761581421,
1405
+ "rewards/chosen": -1.0390625,
1406
+ "rewards/margins": 2.3125,
1407
+ "rewards/rejected": -3.34375,
1408
+ "step": 400
1409
+ },
1410
+ {
1411
+ "epoch": 0.5610098176718092,
1412
+ "eval_logits/chosen": -3.0625,
1413
+ "eval_logits/rejected": -3.09375,
1414
+ "eval_logps/chosen": -344.0,
1415
+ "eval_logps/rejected": -338.0,
1416
+ "eval_loss": 0.3821606934070587,
1417
+ "eval_rewards/accuracies": 0.835106372833252,
1418
+ "eval_rewards/chosen": -1.1328125,
1419
+ "eval_rewards/margins": 2.03125,
1420
+ "eval_rewards/rejected": -3.171875,
1421
+ "eval_runtime": 65.6255,
1422
+ "eval_samples_per_second": 22.659,
1423
+ "eval_steps_per_second": 0.716,
1424
+ "step": 400
1425
+ },
1426
+ {
1427
+ "epoch": 0.5722300140252454,
1428
+ "eval_logits/chosen": -3.078125,
1429
+ "eval_logits/rejected": -3.109375,
1430
+ "eval_logps/chosen": -336.0,
1431
+ "eval_logps/rejected": -332.0,
1432
+ "eval_loss": 0.3827318847179413,
1433
+ "eval_rewards/accuracies": 0.835106372833252,
1434
+ "eval_rewards/chosen": -0.72265625,
1435
+ "eval_rewards/margins": 2.09375,
1436
+ "eval_rewards/rejected": -2.8125,
1437
+ "eval_runtime": 66.6876,
1438
+ "eval_samples_per_second": 22.298,
1439
+ "eval_steps_per_second": 0.705,
1440
+ "step": 408
1441
+ },
1442
+ {
1443
+ "epoch": 0.5750350631136045,
1444
+ "grad_norm": 37.20301916256947,
1445
+ "learning_rate": 2.2858405204662287e-07,
1446
+ "logits/chosen": -3.0,
1447
+ "logits/rejected": -3.015625,
1448
+ "logps/chosen": -336.0,
1449
+ "logps/rejected": -318.0,
1450
+ "loss": 0.3815,
1451
+ "rewards/accuracies": 0.925000011920929,
1452
+ "rewards/chosen": -1.2421875,
1453
+ "rewards/margins": 2.515625,
1454
+ "rewards/rejected": -3.765625,
1455
+ "step": 410
1456
+ },
1457
+ {
1458
+ "epoch": 0.5834502103786816,
1459
+ "eval_logits/chosen": -3.09375,
1460
+ "eval_logits/rejected": -3.125,
1461
+ "eval_logps/chosen": -344.0,
1462
+ "eval_logps/rejected": -338.0,
1463
+ "eval_loss": 0.38075175881385803,
1464
+ "eval_rewards/accuracies": 0.835106372833252,
1465
+ "eval_rewards/chosen": -1.1171875,
1466
+ "eval_rewards/margins": 2.015625,
1467
+ "eval_rewards/rejected": -3.125,
1468
+ "eval_runtime": 66.0902,
1469
+ "eval_samples_per_second": 22.5,
1470
+ "eval_steps_per_second": 0.711,
1471
+ "step": 416
1472
+ },
1473
+ {
1474
+ "epoch": 0.5890603085553997,
1475
+ "grad_norm": 27.9879490973163,
1476
+ "learning_rate": 2.164069929844592e-07,
1477
+ "logits/chosen": -2.9375,
1478
+ "logits/rejected": -3.09375,
1479
+ "logps/chosen": -350.0,
1480
+ "logps/rejected": -324.0,
1481
+ "loss": 0.36,
1482
+ "rewards/accuracies": 0.862500011920929,
1483
+ "rewards/chosen": -0.97265625,
1484
+ "rewards/margins": 2.03125,
1485
+ "rewards/rejected": -3.015625,
1486
+ "step": 420
1487
+ },
1488
+ {
1489
+ "epoch": 0.5946704067321178,
1490
+ "eval_logits/chosen": -3.09375,
1491
+ "eval_logits/rejected": -3.125,
1492
+ "eval_logps/chosen": -338.0,
1493
+ "eval_logps/rejected": -334.0,
1494
+ "eval_loss": 0.37844687700271606,
1495
+ "eval_rewards/accuracies": 0.8457446694374084,
1496
+ "eval_rewards/chosen": -0.84375,
1497
+ "eval_rewards/margins": 2.078125,
1498
+ "eval_rewards/rejected": -2.921875,
1499
+ "eval_runtime": 66.0062,
1500
+ "eval_samples_per_second": 22.528,
1501
+ "eval_steps_per_second": 0.712,
1502
+ "step": 424
1503
+ },
1504
+ {
1505
+ "epoch": 0.603085553997195,
1506
+ "grad_norm": 41.322997758051834,
1507
+ "learning_rate": 2.0431061015905793e-07,
1508
+ "logits/chosen": -3.09375,
1509
+ "logits/rejected": -3.109375,
1510
+ "logps/chosen": -318.0,
1511
+ "logps/rejected": -360.0,
1512
+ "loss": 0.3522,
1513
+ "rewards/accuracies": 0.9375,
1514
+ "rewards/chosen": -1.046875,
1515
+ "rewards/margins": 2.6875,
1516
+ "rewards/rejected": -3.734375,
1517
+ "step": 430
1518
+ },
1519
+ {
1520
+ "epoch": 0.605890603085554,
1521
+ "eval_logits/chosen": -3.09375,
1522
+ "eval_logits/rejected": -3.109375,
1523
+ "eval_logps/chosen": -342.0,
1524
+ "eval_logps/rejected": -338.0,
1525
+ "eval_loss": 0.37742072343826294,
1526
+ "eval_rewards/accuracies": 0.835106372833252,
1527
+ "eval_rewards/chosen": -1.03125,
1528
+ "eval_rewards/margins": 2.109375,
1529
+ "eval_rewards/rejected": -3.140625,
1530
+ "eval_runtime": 65.9149,
1531
+ "eval_samples_per_second": 22.559,
1532
+ "eval_steps_per_second": 0.713,
1533
+ "step": 432
1534
+ },
1535
+ {
1536
+ "epoch": 0.6171107994389902,
1537
+ "grad_norm": 31.204018506373746,
1538
+ "learning_rate": 1.923239539809505e-07,
1539
+ "logits/chosen": -3.0,
1540
+ "logits/rejected": -3.046875,
1541
+ "logps/chosen": -338.0,
1542
+ "logps/rejected": -334.0,
1543
+ "loss": 0.3323,
1544
+ "rewards/accuracies": 0.875,
1545
+ "rewards/chosen": -1.1953125,
1546
+ "rewards/margins": 2.15625,
1547
+ "rewards/rejected": -3.359375,
1548
+ "step": 440
1549
+ },
1550
+ {
1551
+ "epoch": 0.6171107994389902,
1552
+ "eval_logits/chosen": -3.078125,
1553
+ "eval_logits/rejected": -3.109375,
1554
+ "eval_logps/chosen": -344.0,
1555
+ "eval_logps/rejected": -342.0,
1556
+ "eval_loss": 0.37543419003486633,
1557
+ "eval_rewards/accuracies": 0.8457446694374084,
1558
+ "eval_rewards/chosen": -1.125,
1559
+ "eval_rewards/margins": 2.171875,
1560
+ "eval_rewards/rejected": -3.296875,
1561
+ "eval_runtime": 66.2693,
1562
+ "eval_samples_per_second": 22.439,
1563
+ "eval_steps_per_second": 0.709,
1564
+ "step": 440
1565
+ },
1566
+ {
1567
+ "epoch": 0.6283309957924264,
1568
+ "eval_logits/chosen": -3.078125,
1569
+ "eval_logits/rejected": -3.109375,
1570
+ "eval_logps/chosen": -346.0,
1571
+ "eval_logps/rejected": -342.0,
1572
+ "eval_loss": 0.3757575452327728,
1573
+ "eval_rewards/accuracies": 0.8297872543334961,
1574
+ "eval_rewards/chosen": -1.1953125,
1575
+ "eval_rewards/margins": 2.171875,
1576
+ "eval_rewards/rejected": -3.375,
1577
+ "eval_runtime": 66.0335,
1578
+ "eval_samples_per_second": 22.519,
1579
+ "eval_steps_per_second": 0.712,
1580
+ "step": 448
1581
+ },
1582
+ {
1583
+ "epoch": 0.6311360448807855,
1584
+ "grad_norm": 29.323680763100512,
1585
+ "learning_rate": 1.8047581134353296e-07,
1586
+ "logits/chosen": -2.96875,
1587
+ "logits/rejected": -2.984375,
1588
+ "logps/chosen": -394.0,
1589
+ "logps/rejected": -380.0,
1590
+ "loss": 0.3768,
1591
+ "rewards/accuracies": 0.887499988079071,
1592
+ "rewards/chosen": -1.4375,
1593
+ "rewards/margins": 2.234375,
1594
+ "rewards/rejected": -3.671875,
1595
+ "step": 450
1596
+ },
1597
+ {
1598
+ "epoch": 0.6395511921458625,
1599
+ "eval_logits/chosen": -3.078125,
1600
+ "eval_logits/rejected": -3.109375,
1601
+ "eval_logps/chosen": -344.0,
1602
+ "eval_logps/rejected": -340.0,
1603
+ "eval_loss": 0.3760261535644531,
1604
+ "eval_rewards/accuracies": 0.8457446694374084,
1605
+ "eval_rewards/chosen": -1.078125,
1606
+ "eval_rewards/margins": 2.1875,
1607
+ "eval_rewards/rejected": -3.265625,
1608
+ "eval_runtime": 66.2078,
1609
+ "eval_samples_per_second": 22.46,
1610
+ "eval_steps_per_second": 0.71,
1611
+ "step": 456
1612
+ },
1613
+ {
1614
+ "epoch": 0.6451612903225806,
1615
+ "grad_norm": 36.95205536065495,
1616
+ "learning_rate": 1.687946364890877e-07,
1617
+ "logits/chosen": -2.96875,
1618
+ "logits/rejected": -2.96875,
1619
+ "logps/chosen": -338.0,
1620
+ "logps/rejected": -332.0,
1621
+ "loss": 0.3637,
1622
+ "rewards/accuracies": 0.9375,
1623
+ "rewards/chosen": -1.2734375,
1624
+ "rewards/margins": 2.40625,
1625
+ "rewards/rejected": -3.6875,
1626
+ "step": 460
1627
+ },
1628
+ {
1629
+ "epoch": 0.6507713884992987,
1630
+ "eval_logits/chosen": -3.078125,
1631
+ "eval_logits/rejected": -3.09375,
1632
+ "eval_logps/chosen": -342.0,
1633
+ "eval_logps/rejected": -340.0,
1634
+ "eval_loss": 0.3759002089500427,
1635
+ "eval_rewards/accuracies": 0.835106372833252,
1636
+ "eval_rewards/chosen": -1.0625,
1637
+ "eval_rewards/margins": 2.21875,
1638
+ "eval_rewards/rejected": -3.28125,
1639
+ "eval_runtime": 65.6388,
1640
+ "eval_samples_per_second": 22.654,
1641
+ "eval_steps_per_second": 0.716,
1642
+ "step": 464
1643
+ },
1644
+ {
1645
+ "epoch": 0.6591865357643759,
1646
+ "grad_norm": 50.750866736233036,
1647
+ "learning_rate": 1.573084826736921e-07,
1648
+ "logits/chosen": -2.984375,
1649
+ "logits/rejected": -3.046875,
1650
+ "logps/chosen": -332.0,
1651
+ "logps/rejected": -340.0,
1652
+ "loss": 0.3901,
1653
+ "rewards/accuracies": 0.875,
1654
+ "rewards/chosen": -1.484375,
1655
+ "rewards/margins": 2.21875,
1656
+ "rewards/rejected": -3.703125,
1657
+ "step": 470
1658
+ },
1659
+ {
1660
+ "epoch": 0.6619915848527349,
1661
+ "eval_logits/chosen": -3.0625,
1662
+ "eval_logits/rejected": -3.09375,
1663
+ "eval_logps/chosen": -350.0,
1664
+ "eval_logps/rejected": -346.0,
1665
+ "eval_loss": 0.3759775161743164,
1666
+ "eval_rewards/accuracies": 0.8457446694374084,
1667
+ "eval_rewards/chosen": -1.4375,
1668
+ "eval_rewards/margins": 2.109375,
1669
+ "eval_rewards/rejected": -3.546875,
1670
+ "eval_runtime": 66.2324,
1671
+ "eval_samples_per_second": 22.451,
1672
+ "eval_steps_per_second": 0.71,
1673
+ "step": 472
1674
+ },
1675
+ {
1676
+ "epoch": 0.6732117812061711,
1677
+ "grad_norm": 46.86048650983821,
1678
+ "learning_rate": 1.460449347951278e-07,
1679
+ "logits/chosen": -2.875,
1680
+ "logits/rejected": -3.0,
1681
+ "logps/chosen": -328.0,
1682
+ "logps/rejected": -324.0,
1683
+ "loss": 0.3555,
1684
+ "rewards/accuracies": 0.875,
1685
+ "rewards/chosen": -1.390625,
1686
+ "rewards/margins": 2.453125,
1687
+ "rewards/rejected": -3.84375,
1688
+ "step": 480
1689
+ },
1690
+ {
1691
+ "epoch": 0.6732117812061711,
1692
+ "eval_logits/chosen": -3.078125,
1693
+ "eval_logits/rejected": -3.09375,
1694
+ "eval_logps/chosen": -350.0,
1695
+ "eval_logps/rejected": -346.0,
1696
+ "eval_loss": 0.3772374987602234,
1697
+ "eval_rewards/accuracies": 0.8404255509376526,
1698
+ "eval_rewards/chosen": -1.3671875,
1699
+ "eval_rewards/margins": 2.140625,
1700
+ "eval_rewards/rejected": -3.5,
1701
+ "eval_runtime": 66.1888,
1702
+ "eval_samples_per_second": 22.466,
1703
+ "eval_steps_per_second": 0.71,
1704
+ "step": 480
1705
+ },
1706
+ {
1707
+ "epoch": 0.6844319775596073,
1708
+ "eval_logits/chosen": -3.078125,
1709
+ "eval_logits/rejected": -3.109375,
1710
+ "eval_logps/chosen": -344.0,
1711
+ "eval_logps/rejected": -340.0,
1712
+ "eval_loss": 0.37612324953079224,
1713
+ "eval_rewards/accuracies": 0.8563829660415649,
1714
+ "eval_rewards/chosen": -1.0859375,
1715
+ "eval_rewards/margins": 2.171875,
1716
+ "eval_rewards/rejected": -3.25,
1717
+ "eval_runtime": 66.0573,
1718
+ "eval_samples_per_second": 22.511,
1719
+ "eval_steps_per_second": 0.712,
1720
+ "step": 488
1721
+ },
1722
+ {
1723
+ "epoch": 0.6872370266479664,
1724
+ "grad_norm": 38.26875632825401,
1725
+ "learning_rate": 1.3503104314558838e-07,
1726
+ "logits/chosen": -3.046875,
1727
+ "logits/rejected": -3.0625,
1728
+ "logps/chosen": -382.0,
1729
+ "logps/rejected": -350.0,
1730
+ "loss": 0.373,
1731
+ "rewards/accuracies": 0.9125000238418579,
1732
+ "rewards/chosen": -1.40625,
1733
+ "rewards/margins": 2.15625,
1734
+ "rewards/rejected": -3.5625,
1735
+ "step": 490
1736
+ },
1737
+ {
1738
+ "epoch": 0.6956521739130435,
1739
+ "eval_logits/chosen": -3.078125,
1740
+ "eval_logits/rejected": -3.109375,
1741
+ "eval_logps/chosen": -340.0,
1742
+ "eval_logps/rejected": -334.0,
1743
+ "eval_loss": 0.37762168049812317,
1744
+ "eval_rewards/accuracies": 0.8457446694374084,
1745
+ "eval_rewards/chosen": -0.90625,
1746
+ "eval_rewards/margins": 2.046875,
1747
+ "eval_rewards/rejected": -2.953125,
1748
+ "eval_runtime": 65.9608,
1749
+ "eval_samples_per_second": 22.544,
1750
+ "eval_steps_per_second": 0.713,
1751
+ "step": 496
1752
+ },
1753
+ {
1754
+ "epoch": 0.7012622720897616,
1755
+ "grad_norm": 37.30046756155309,
1756
+ "learning_rate": 1.2429325844828475e-07,
1757
+ "logits/chosen": -2.90625,
1758
+ "logits/rejected": -3.015625,
1759
+ "logps/chosen": -336.0,
1760
+ "logps/rejected": -328.0,
1761
+ "loss": 0.3617,
1762
+ "rewards/accuracies": 0.9375,
1763
+ "rewards/chosen": -0.9296875,
1764
+ "rewards/margins": 2.59375,
1765
+ "rewards/rejected": -3.53125,
1766
+ "step": 500
1767
+ },
1768
+ {
1769
+ "epoch": 0.7068723702664796,
1770
+ "eval_logits/chosen": -3.078125,
1771
+ "eval_logits/rejected": -3.109375,
1772
+ "eval_logps/chosen": -340.0,
1773
+ "eval_logps/rejected": -334.0,
1774
+ "eval_loss": 0.3774646520614624,
1775
+ "eval_rewards/accuracies": 0.8404255509376526,
1776
+ "eval_rewards/chosen": -0.94921875,
1777
+ "eval_rewards/margins": 2.0,
1778
+ "eval_rewards/rejected": -2.953125,
1779
+ "eval_runtime": 65.763,
1780
+ "eval_samples_per_second": 22.612,
1781
+ "eval_steps_per_second": 0.715,
1782
+ "step": 504
1783
+ },
1784
+ {
1785
+ "epoch": 0.7152875175315568,
1786
+ "grad_norm": 28.616464108884514,
1787
+ "learning_rate": 1.1385736833396248e-07,
1788
+ "logits/chosen": -2.984375,
1789
+ "logits/rejected": -3.0625,
1790
+ "logps/chosen": -392.0,
1791
+ "logps/rejected": -338.0,
1792
+ "loss": 0.3665,
1793
+ "rewards/accuracies": 0.8374999761581421,
1794
+ "rewards/chosen": -1.2578125,
1795
+ "rewards/margins": 1.90625,
1796
+ "rewards/rejected": -3.15625,
1797
+ "step": 510
1798
+ },
1799
+ {
1800
+ "epoch": 0.7180925666199158,
1801
+ "eval_logits/chosen": -3.078125,
1802
+ "eval_logits/rejected": -3.109375,
1803
+ "eval_logps/chosen": -338.0,
1804
+ "eval_logps/rejected": -334.0,
1805
+ "eval_loss": 0.3764929175376892,
1806
+ "eval_rewards/accuracies": 0.8457446694374084,
1807
+ "eval_rewards/chosen": -0.8359375,
1808
+ "eval_rewards/margins": 2.0625,
1809
+ "eval_rewards/rejected": -2.90625,
1810
+ "eval_runtime": 65.6772,
1811
+ "eval_samples_per_second": 22.641,
1812
+ "eval_steps_per_second": 0.716,
1813
+ "step": 512
1814
+ },
1815
+ {
1816
+ "epoch": 0.729312762973352,
1817
+ "grad_norm": 26.70960973383744,
1818
+ "learning_rate": 1.0374843540988668e-07,
1819
+ "logits/chosen": -2.984375,
1820
+ "logits/rejected": -2.9375,
1821
+ "logps/chosen": -314.0,
1822
+ "logps/rejected": -318.0,
1823
+ "loss": 0.3612,
1824
+ "rewards/accuracies": 0.9375,
1825
+ "rewards/chosen": -1.0546875,
1826
+ "rewards/margins": 2.390625,
1827
+ "rewards/rejected": -3.4375,
1828
+ "step": 520
1829
+ },
1830
+ {
1831
+ "epoch": 0.729312762973352,
1832
+ "eval_logits/chosen": -3.078125,
1833
+ "eval_logits/rejected": -3.109375,
1834
+ "eval_logps/chosen": -340.0,
1835
+ "eval_logps/rejected": -336.0,
1836
+ "eval_loss": 0.3756672441959381,
1837
+ "eval_rewards/accuracies": 0.835106372833252,
1838
+ "eval_rewards/chosen": -0.9296875,
1839
+ "eval_rewards/margins": 2.125,
1840
+ "eval_rewards/rejected": -3.0625,
1841
+ "eval_runtime": 66.0986,
1842
+ "eval_samples_per_second": 22.497,
1843
+ "eval_steps_per_second": 0.711,
1844
+ "step": 520
1845
+ },
1846
+ {
1847
+ "epoch": 0.7405329593267882,
1848
+ "eval_logits/chosen": -3.078125,
1849
+ "eval_logits/rejected": -3.109375,
1850
+ "eval_logps/chosen": -340.0,
1851
+ "eval_logps/rejected": -338.0,
1852
+ "eval_loss": 0.3768278658390045,
1853
+ "eval_rewards/accuracies": 0.835106372833252,
1854
+ "eval_rewards/chosen": -0.9375,
1855
+ "eval_rewards/margins": 2.1875,
1856
+ "eval_rewards/rejected": -3.109375,
1857
+ "eval_runtime": 66.3604,
1858
+ "eval_samples_per_second": 22.408,
1859
+ "eval_steps_per_second": 0.708,
1860
+ "step": 528
1861
+ },
1862
+ {
1863
+ "epoch": 0.7433380084151473,
1864
+ "grad_norm": 44.52616987508156,
1865
+ "learning_rate": 9.39907370700287e-08,
1866
+ "logits/chosen": -3.078125,
1867
+ "logits/rejected": -3.0625,
1868
+ "logps/chosen": -328.0,
1869
+ "logps/rejected": -320.0,
1870
+ "loss": 0.3761,
1871
+ "rewards/accuracies": 0.862500011920929,
1872
+ "rewards/chosen": -1.109375,
1873
+ "rewards/margins": 2.1875,
1874
+ "rewards/rejected": -3.28125,
1875
+ "step": 530
1876
+ },
1877
+ {
1878
+ "epoch": 0.7517531556802244,
1879
+ "eval_logits/chosen": -3.09375,
1880
+ "eval_logits/rejected": -3.109375,
1881
+ "eval_logps/chosen": -340.0,
1882
+ "eval_logps/rejected": -336.0,
1883
+ "eval_loss": 0.3774166703224182,
1884
+ "eval_rewards/accuracies": 0.8457446694374084,
1885
+ "eval_rewards/chosen": -0.9453125,
1886
+ "eval_rewards/margins": 2.125,
1887
+ "eval_rewards/rejected": -3.0625,
1888
+ "eval_runtime": 66.0445,
1889
+ "eval_samples_per_second": 22.515,
1890
+ "eval_steps_per_second": 0.712,
1891
+ "step": 536
1892
+ },
1893
+ {
1894
+ "epoch": 0.7573632538569425,
1895
+ "grad_norm": 31.39879813498721,
1896
+ "learning_rate": 8.460770719100316e-08,
1897
+ "logits/chosen": -3.0,
1898
+ "logits/rejected": -3.0,
1899
+ "logps/chosen": -326.0,
1900
+ "logps/rejected": -346.0,
1901
+ "loss": 0.3852,
1902
+ "rewards/accuracies": 0.887499988079071,
1903
+ "rewards/chosen": -1.015625,
1904
+ "rewards/margins": 2.046875,
1905
+ "rewards/rejected": -3.0625,
1906
+ "step": 540
1907
+ },
1908
+ {
1909
+ "epoch": 0.7629733520336606,
1910
+ "eval_logits/chosen": -3.09375,
1911
+ "eval_logits/rejected": -3.125,
1912
+ "eval_logps/chosen": -340.0,
1913
+ "eval_logps/rejected": -336.0,
1914
+ "eval_loss": 0.37610381841659546,
1915
+ "eval_rewards/accuracies": 0.835106372833252,
1916
+ "eval_rewards/chosen": -0.94140625,
1917
+ "eval_rewards/margins": 2.0625,
1918
+ "eval_rewards/rejected": -3.0,
1919
+ "eval_runtime": 66.8835,
1920
+ "eval_samples_per_second": 22.233,
1921
+ "eval_steps_per_second": 0.703,
1922
+ "step": 544
1923
+ },
1924
+ {
1925
+ "epoch": 0.7713884992987378,
1926
+ "grad_norm": 33.163388614762034,
1927
+ "learning_rate": 7.562187985377877e-08,
1928
+ "logits/chosen": -3.03125,
1929
+ "logits/rejected": -3.0625,
1930
+ "logps/chosen": -302.0,
1931
+ "logps/rejected": -336.0,
1932
+ "loss": 0.3887,
1933
+ "rewards/accuracies": 0.862500011920929,
1934
+ "rewards/chosen": -1.0,
1935
+ "rewards/margins": 2.234375,
1936
+ "rewards/rejected": -3.234375,
1937
+ "step": 550
1938
+ },
1939
+ {
1940
+ "epoch": 0.7741935483870968,
1941
+ "eval_logits/chosen": -3.09375,
1942
+ "eval_logits/rejected": -3.125,
1943
+ "eval_logps/chosen": -340.0,
1944
+ "eval_logps/rejected": -334.0,
1945
+ "eval_loss": 0.3767978847026825,
1946
+ "eval_rewards/accuracies": 0.8191489577293396,
1947
+ "eval_rewards/chosen": -0.8828125,
1948
+ "eval_rewards/margins": 2.03125,
1949
+ "eval_rewards/rejected": -2.921875,
1950
+ "eval_runtime": 66.2121,
1951
+ "eval_samples_per_second": 22.458,
1952
+ "eval_steps_per_second": 0.71,
1953
+ "step": 552
1954
+ },
1955
+ {
1956
+ "epoch": 0.7854137447405329,
1957
+ "grad_norm": 33.17915364920084,
1958
+ "learning_rate": 6.705483522631811e-08,
1959
+ "logits/chosen": -3.203125,
1960
+ "logits/rejected": -3.109375,
1961
+ "logps/chosen": -284.0,
1962
+ "logps/rejected": -296.0,
1963
+ "loss": 0.3653,
1964
+ "rewards/accuracies": 0.875,
1965
+ "rewards/chosen": -1.2578125,
1966
+ "rewards/margins": 1.953125,
1967
+ "rewards/rejected": -3.21875,
1968
+ "step": 560
1969
+ },
1970
+ {
1971
+ "epoch": 0.7854137447405329,
1972
+ "eval_logits/chosen": -3.09375,
1973
+ "eval_logits/rejected": -3.125,
1974
+ "eval_logps/chosen": -338.0,
1975
+ "eval_logps/rejected": -334.0,
1976
+ "eval_loss": 0.37570077180862427,
1977
+ "eval_rewards/accuracies": 0.8244680762290955,
1978
+ "eval_rewards/chosen": -0.84765625,
1979
+ "eval_rewards/margins": 2.078125,
1980
+ "eval_rewards/rejected": -2.921875,
1981
+ "eval_runtime": 66.2859,
1982
+ "eval_samples_per_second": 22.433,
1983
+ "eval_steps_per_second": 0.709,
1984
+ "step": 560
1985
+ },
1986
+ {
1987
+ "epoch": 0.7966339410939691,
1988
+ "eval_logits/chosen": -3.09375,
1989
+ "eval_logits/rejected": -3.125,
1990
+ "eval_logps/chosen": -336.0,
1991
+ "eval_logps/rejected": -332.0,
1992
+ "eval_loss": 0.37514832615852356,
1993
+ "eval_rewards/accuracies": 0.8297872543334961,
1994
+ "eval_rewards/chosen": -0.76171875,
1995
+ "eval_rewards/margins": 2.109375,
1996
+ "eval_rewards/rejected": -2.875,
1997
+ "eval_runtime": 66.2321,
1998
+ "eval_samples_per_second": 22.451,
1999
+ "eval_steps_per_second": 0.71,
2000
+ "step": 568
2001
+ },
2002
+ {
2003
+ "epoch": 0.7994389901823282,
2004
+ "grad_norm": 39.45560582368654,
2005
+ "learning_rate": 5.8927147737114546e-08,
2006
+ "logits/chosen": -3.09375,
2007
+ "logits/rejected": -3.046875,
2008
+ "logps/chosen": -322.0,
2009
+ "logps/rejected": -306.0,
2010
+ "loss": 0.3595,
2011
+ "rewards/accuracies": 0.8374999761581421,
2012
+ "rewards/chosen": -1.171875,
2013
+ "rewards/margins": 2.171875,
2014
+ "rewards/rejected": -3.34375,
2015
+ "step": 570
2016
+ },
2017
+ {
2018
+ "epoch": 0.8078541374474053,
2019
+ "eval_logits/chosen": -3.09375,
2020
+ "eval_logits/rejected": -3.125,
2021
+ "eval_logps/chosen": -338.0,
2022
+ "eval_logps/rejected": -334.0,
2023
+ "eval_loss": 0.3752020299434662,
2024
+ "eval_rewards/accuracies": 0.8297872543334961,
2025
+ "eval_rewards/chosen": -0.78125,
2026
+ "eval_rewards/margins": 2.109375,
2027
+ "eval_rewards/rejected": -2.890625,
2028
+ "eval_runtime": 66.0782,
2029
+ "eval_samples_per_second": 22.504,
2030
+ "eval_steps_per_second": 0.711,
2031
+ "step": 576
2032
+ },
2033
+ {
2034
+ "epoch": 0.8134642356241234,
2035
+ "grad_norm": 56.20015949598879,
2036
+ "learning_rate": 5.125833666409107e-08,
2037
+ "logits/chosen": -2.9375,
2038
+ "logits/rejected": -3.0,
2039
+ "logps/chosen": -316.0,
2040
+ "logps/rejected": -304.0,
2041
+ "loss": 0.3618,
2042
+ "rewards/accuracies": 0.887499988079071,
2043
+ "rewards/chosen": -1.0078125,
2044
+ "rewards/margins": 2.21875,
2045
+ "rewards/rejected": -3.234375,
2046
+ "step": 580
2047
+ },
2048
+ {
2049
+ "epoch": 0.8190743338008415,
2050
+ "eval_logits/chosen": -3.09375,
2051
+ "eval_logits/rejected": -3.125,
2052
+ "eval_logps/chosen": -340.0,
2053
+ "eval_logps/rejected": -336.0,
2054
+ "eval_loss": 0.3744948208332062,
2055
+ "eval_rewards/accuracies": 0.835106372833252,
2056
+ "eval_rewards/chosen": -0.91015625,
2057
+ "eval_rewards/margins": 2.09375,
2058
+ "eval_rewards/rejected": -3.0,
2059
+ "eval_runtime": 66.6042,
2060
+ "eval_samples_per_second": 22.326,
2061
+ "eval_steps_per_second": 0.706,
2062
+ "step": 584
2063
+ },
2064
+ {
2065
+ "epoch": 0.8274894810659187,
2066
+ "grad_norm": 46.520579632902326,
2067
+ "learning_rate": 4.4066819257526423e-08,
2068
+ "logits/chosen": -3.0625,
2069
+ "logits/rejected": -3.015625,
2070
+ "logps/chosen": -326.0,
2071
+ "logps/rejected": -402.0,
2072
+ "loss": 0.3668,
2073
+ "rewards/accuracies": 0.8999999761581421,
2074
+ "rewards/chosen": -0.953125,
2075
+ "rewards/margins": 2.40625,
2076
+ "rewards/rejected": -3.375,
2077
+ "step": 590
2078
+ },
2079
+ {
2080
+ "epoch": 0.8302945301542777,
2081
+ "eval_logits/chosen": -3.09375,
2082
+ "eval_logits/rejected": -3.125,
2083
+ "eval_logps/chosen": -340.0,
2084
+ "eval_logps/rejected": -336.0,
2085
+ "eval_loss": 0.3740004003047943,
2086
+ "eval_rewards/accuracies": 0.8404255509376526,
2087
+ "eval_rewards/chosen": -0.921875,
2088
+ "eval_rewards/margins": 2.09375,
2089
+ "eval_rewards/rejected": -3.015625,
2090
+ "eval_runtime": 66.3956,
2091
+ "eval_samples_per_second": 22.396,
2092
+ "eval_steps_per_second": 0.708,
2093
+ "step": 592
2094
+ },
2095
+ {
2096
+ "epoch": 0.8415147265077139,
2097
+ "grad_norm": 47.468270807569716,
2098
+ "learning_rate": 3.736986650958562e-08,
2099
+ "logits/chosen": -3.03125,
2100
+ "logits/rejected": -3.15625,
2101
+ "logps/chosen": -408.0,
2102
+ "logps/rejected": -378.0,
2103
+ "loss": 0.3709,
2104
+ "rewards/accuracies": 0.925000011920929,
2105
+ "rewards/chosen": -0.72265625,
2106
+ "rewards/margins": 2.921875,
2107
+ "rewards/rejected": -3.640625,
2108
+ "step": 600
2109
+ },
2110
+ {
2111
+ "epoch": 0.8415147265077139,
2112
+ "eval_logits/chosen": -3.09375,
2113
+ "eval_logits/rejected": -3.125,
2114
+ "eval_logps/chosen": -342.0,
2115
+ "eval_logps/rejected": -336.0,
2116
+ "eval_loss": 0.374103307723999,
2117
+ "eval_rewards/accuracies": 0.8297872543334961,
2118
+ "eval_rewards/chosen": -0.97265625,
2119
+ "eval_rewards/margins": 2.0625,
2120
+ "eval_rewards/rejected": -3.046875,
2121
+ "eval_runtime": 66.1384,
2122
+ "eval_samples_per_second": 22.483,
2123
+ "eval_steps_per_second": 0.711,
2124
+ "step": 600
2125
+ },
2126
+ {
2127
+ "epoch": 0.85273492286115,
2128
+ "eval_logits/chosen": -3.09375,
2129
+ "eval_logits/rejected": -3.125,
2130
+ "eval_logps/chosen": -342.0,
2131
+ "eval_logps/rejected": -338.0,
2132
+ "eval_loss": 0.3734557628631592,
2133
+ "eval_rewards/accuracies": 0.835106372833252,
2134
+ "eval_rewards/chosen": -1.015625,
2135
+ "eval_rewards/margins": 2.078125,
2136
+ "eval_rewards/rejected": -3.09375,
2137
+ "eval_runtime": 66.2586,
2138
+ "eval_samples_per_second": 22.442,
2139
+ "eval_steps_per_second": 0.709,
2140
+ "step": 608
2141
+ },
2142
+ {
2143
+ "epoch": 0.8555399719495091,
2144
+ "grad_norm": 56.20453228147611,
2145
+ "learning_rate": 3.118356167668065e-08,
2146
+ "logits/chosen": -3.109375,
2147
+ "logits/rejected": -3.078125,
2148
+ "logps/chosen": -320.0,
2149
+ "logps/rejected": -330.0,
2150
+ "loss": 0.3574,
2151
+ "rewards/accuracies": 0.925000011920929,
2152
+ "rewards/chosen": -1.015625,
2153
+ "rewards/margins": 2.671875,
2154
+ "rewards/rejected": -3.6875,
2155
+ "step": 610
2156
+ },
2157
+ {
2158
+ "epoch": 0.8639551192145862,
2159
+ "eval_logits/chosen": -3.09375,
2160
+ "eval_logits/rejected": -3.125,
2161
+ "eval_logps/chosen": -342.0,
2162
+ "eval_logps/rejected": -338.0,
2163
+ "eval_loss": 0.3729269802570343,
2164
+ "eval_rewards/accuracies": 0.8297872543334961,
2165
+ "eval_rewards/chosen": -1.0390625,
2166
+ "eval_rewards/margins": 2.09375,
2167
+ "eval_rewards/rejected": -3.125,
2168
+ "eval_runtime": 66.4655,
2169
+ "eval_samples_per_second": 22.372,
2170
+ "eval_steps_per_second": 0.707,
2171
+ "step": 616
2172
+ },
2173
+ {
2174
+ "epoch": 0.8695652173913043,
2175
+ "grad_norm": 40.32752366240338,
2176
+ "learning_rate": 2.552276165427056e-08,
2177
+ "logits/chosen": -3.046875,
2178
+ "logits/rejected": -3.03125,
2179
+ "logps/chosen": -368.0,
2180
+ "logps/rejected": -394.0,
2181
+ "loss": 0.3611,
2182
+ "rewards/accuracies": 0.949999988079071,
2183
+ "rewards/chosen": -1.2421875,
2184
+ "rewards/margins": 2.5,
2185
+ "rewards/rejected": -3.734375,
2186
+ "step": 620
2187
+ },
2188
+ {
2189
+ "epoch": 0.8751753155680224,
2190
+ "eval_logits/chosen": -3.09375,
2191
+ "eval_logits/rejected": -3.125,
2192
+ "eval_logps/chosen": -342.0,
2193
+ "eval_logps/rejected": -338.0,
2194
+ "eval_loss": 0.3727641999721527,
2195
+ "eval_rewards/accuracies": 0.835106372833252,
2196
+ "eval_rewards/chosen": -1.0625,
2197
+ "eval_rewards/margins": 2.09375,
2198
+ "eval_rewards/rejected": -3.15625,
2199
+ "eval_runtime": 66.0542,
2200
+ "eval_samples_per_second": 22.512,
2201
+ "eval_steps_per_second": 0.712,
2202
+ "step": 624
2203
+ },
2204
+ {
2205
+ "epoch": 0.8835904628330996,
2206
+ "grad_norm": 39.39615195808181,
2207
+ "learning_rate": 2.040106129686356e-08,
2208
+ "logits/chosen": -3.0,
2209
+ "logits/rejected": -3.078125,
2210
+ "logps/chosen": -374.0,
2211
+ "logps/rejected": -348.0,
2212
+ "loss": 0.3413,
2213
+ "rewards/accuracies": 0.887499988079071,
2214
+ "rewards/chosen": -0.953125,
2215
+ "rewards/margins": 2.53125,
2216
+ "rewards/rejected": -3.484375,
2217
+ "step": 630
2218
+ },
2219
+ {
2220
+ "epoch": 0.8863955119214586,
2221
+ "eval_logits/chosen": -3.09375,
2222
+ "eval_logits/rejected": -3.125,
2223
+ "eval_logps/chosen": -344.0,
2224
+ "eval_logps/rejected": -340.0,
2225
+ "eval_loss": 0.3731161057949066,
2226
+ "eval_rewards/accuracies": 0.835106372833252,
2227
+ "eval_rewards/chosen": -1.1015625,
2228
+ "eval_rewards/margins": 2.09375,
2229
+ "eval_rewards/rejected": -3.203125,
2230
+ "eval_runtime": 66.117,
2231
+ "eval_samples_per_second": 22.49,
2232
+ "eval_steps_per_second": 0.711,
2233
+ "step": 632
2234
+ },
2235
+ {
2236
+ "epoch": 0.8976157082748948,
2237
+ "grad_norm": 31.05579865660738,
2238
+ "learning_rate": 1.583076076890963e-08,
2239
+ "logits/chosen": -3.109375,
2240
+ "logits/rejected": -3.109375,
2241
+ "logps/chosen": -360.0,
2242
+ "logps/rejected": -328.0,
2243
+ "loss": 0.3556,
2244
+ "rewards/accuracies": 0.8374999761581421,
2245
+ "rewards/chosen": -1.171875,
2246
+ "rewards/margins": 2.21875,
2247
+ "rewards/rejected": -3.390625,
2248
+ "step": 640
2249
+ },
2250
+ {
2251
+ "epoch": 0.8976157082748948,
2252
+ "eval_logits/chosen": -3.09375,
2253
+ "eval_logits/rejected": -3.125,
2254
+ "eval_logps/chosen": -344.0,
2255
+ "eval_logps/rejected": -340.0,
2256
+ "eval_loss": 0.372738242149353,
2257
+ "eval_rewards/accuracies": 0.835106372833252,
2258
+ "eval_rewards/chosen": -1.1015625,
2259
+ "eval_rewards/margins": 2.109375,
2260
+ "eval_rewards/rejected": -3.21875,
2261
+ "eval_runtime": 66.4596,
2262
+ "eval_samples_per_second": 22.374,
2263
+ "eval_steps_per_second": 0.707,
2264
+ "step": 640
2265
+ },
2266
+ {
2267
+ "epoch": 0.908835904628331,
2268
+ "eval_logits/chosen": -3.09375,
2269
+ "eval_logits/rejected": -3.125,
2270
+ "eval_logps/chosen": -344.0,
2271
+ "eval_logps/rejected": -340.0,
2272
+ "eval_loss": 0.37327107787132263,
2273
+ "eval_rewards/accuracies": 0.835106372833252,
2274
+ "eval_rewards/chosen": -1.0625,
2275
+ "eval_rewards/margins": 2.125,
2276
+ "eval_rewards/rejected": -3.1875,
2277
+ "eval_runtime": 67.9699,
2278
+ "eval_samples_per_second": 21.877,
2279
+ "eval_steps_per_second": 0.691,
2280
+ "step": 648
2281
+ },
2282
+ {
2283
+ "epoch": 0.9116409537166901,
2284
+ "grad_norm": 42.559333930514235,
2285
+ "learning_rate": 1.1822836004992343e-08,
2286
+ "logits/chosen": -3.046875,
2287
+ "logits/rejected": -2.96875,
2288
+ "logps/chosen": -296.0,
2289
+ "logps/rejected": -324.0,
2290
+ "loss": 0.3569,
2291
+ "rewards/accuracies": 0.9375,
2292
+ "rewards/chosen": -1.4375,
2293
+ "rewards/margins": 2.3125,
2294
+ "rewards/rejected": -3.75,
2295
+ "step": 650
2296
+ },
2297
+ {
2298
+ "epoch": 0.9200561009817672,
2299
+ "eval_logits/chosen": -3.09375,
2300
+ "eval_logits/rejected": -3.125,
2301
+ "eval_logps/chosen": -342.0,
2302
+ "eval_logps/rejected": -338.0,
2303
+ "eval_loss": 0.3728487491607666,
2304
+ "eval_rewards/accuracies": 0.835106372833252,
2305
+ "eval_rewards/chosen": -1.03125,
2306
+ "eval_rewards/margins": 2.140625,
2307
+ "eval_rewards/rejected": -3.171875,
2308
+ "eval_runtime": 66.1242,
2309
+ "eval_samples_per_second": 22.488,
2310
+ "eval_steps_per_second": 0.711,
2311
+ "step": 656
2312
+ },
2313
+ {
2314
+ "epoch": 0.9256661991584852,
2315
+ "grad_norm": 33.71718914979295,
2316
+ "learning_rate": 8.386912350262566e-09,
2317
+ "logits/chosen": -2.96875,
2318
+ "logits/rejected": -3.0,
2319
+ "logps/chosen": -352.0,
2320
+ "logps/rejected": -362.0,
2321
+ "loss": 0.3563,
2322
+ "rewards/accuracies": 0.862500011920929,
2323
+ "rewards/chosen": -1.2265625,
2324
+ "rewards/margins": 2.03125,
2325
+ "rewards/rejected": -3.265625,
2326
+ "step": 660
2327
+ },
2328
+ {
2329
+ "epoch": 0.9312762973352033,
2330
+ "eval_logits/chosen": -3.09375,
2331
+ "eval_logits/rejected": -3.125,
2332
+ "eval_logps/chosen": -342.0,
2333
+ "eval_logps/rejected": -338.0,
2334
+ "eval_loss": 0.37276628613471985,
2335
+ "eval_rewards/accuracies": 0.835106372833252,
2336
+ "eval_rewards/chosen": -1.015625,
2337
+ "eval_rewards/margins": 2.140625,
2338
+ "eval_rewards/rejected": -3.15625,
2339
+ "eval_runtime": 66.0656,
2340
+ "eval_samples_per_second": 22.508,
2341
+ "eval_steps_per_second": 0.711,
2342
+ "step": 664
2343
+ },
2344
+ {
2345
+ "epoch": 0.9396914446002805,
2346
+ "grad_norm": 52.307264892507675,
2347
+ "learning_rate": 5.5312414444183276e-09,
2348
+ "logits/chosen": -3.046875,
2349
+ "logits/rejected": -2.859375,
2350
+ "logps/chosen": -354.0,
2351
+ "logps/rejected": -370.0,
2352
+ "loss": 0.3694,
2353
+ "rewards/accuracies": 0.8500000238418579,
2354
+ "rewards/chosen": -1.109375,
2355
+ "rewards/margins": 2.3125,
2356
+ "rewards/rejected": -3.421875,
2357
+ "step": 670
2358
+ },
2359
+ {
2360
+ "epoch": 0.9424964936886395,
2361
+ "eval_logits/chosen": -3.09375,
2362
+ "eval_logits/rejected": -3.125,
2363
+ "eval_logps/chosen": -342.0,
2364
+ "eval_logps/rejected": -338.0,
2365
+ "eval_loss": 0.3723788857460022,
2366
+ "eval_rewards/accuracies": 0.8404255509376526,
2367
+ "eval_rewards/chosen": -1.0078125,
2368
+ "eval_rewards/margins": 2.140625,
2369
+ "eval_rewards/rejected": -3.15625,
2370
+ "eval_runtime": 65.8691,
2371
+ "eval_samples_per_second": 22.575,
2372
+ "eval_steps_per_second": 0.714,
2373
+ "step": 672
2374
+ },
2375
+ {
2376
+ "epoch": 0.9537166900420757,
2377
+ "grad_norm": 39.5471805424214,
2378
+ "learning_rate": 3.262681404746004e-09,
2379
+ "logits/chosen": -3.09375,
2380
+ "logits/rejected": -3.015625,
2381
+ "logps/chosen": -368.0,
2382
+ "logps/rejected": -368.0,
2383
+ "loss": 0.3447,
2384
+ "rewards/accuracies": 0.887499988079071,
2385
+ "rewards/chosen": -1.0546875,
2386
+ "rewards/margins": 2.671875,
2387
+ "rewards/rejected": -3.71875,
2388
+ "step": 680
2389
+ },
2390
+ {
2391
+ "epoch": 0.9537166900420757,
2392
+ "eval_logits/chosen": -3.09375,
2393
+ "eval_logits/rejected": -3.125,
2394
+ "eval_logps/chosen": -342.0,
2395
+ "eval_logps/rejected": -338.0,
2396
+ "eval_loss": 0.37281739711761475,
2397
+ "eval_rewards/accuracies": 0.835106372833252,
2398
+ "eval_rewards/chosen": -1.015625,
2399
+ "eval_rewards/margins": 2.140625,
2400
+ "eval_rewards/rejected": -3.15625,
2401
+ "eval_runtime": 66.1705,
2402
+ "eval_samples_per_second": 22.472,
2403
+ "eval_steps_per_second": 0.71,
2404
+ "step": 680
2405
+ },
2406
+ {
2407
+ "epoch": 0.9649368863955119,
2408
+ "eval_logits/chosen": -3.09375,
2409
+ "eval_logits/rejected": -3.125,
2410
+ "eval_logps/chosen": -342.0,
2411
+ "eval_logps/rejected": -338.0,
2412
+ "eval_loss": 0.3726153075695038,
2413
+ "eval_rewards/accuracies": 0.835106372833252,
2414
+ "eval_rewards/chosen": -1.0078125,
2415
+ "eval_rewards/margins": 2.140625,
2416
+ "eval_rewards/rejected": -3.15625,
2417
+ "eval_runtime": 66.3119,
2418
+ "eval_samples_per_second": 22.424,
2419
+ "eval_steps_per_second": 0.709,
2420
+ "step": 688
2421
+ },
2422
+ {
2423
+ "epoch": 0.967741935483871,
2424
+ "grad_norm": 40.36229007754571,
2425
+ "learning_rate": 1.5866803558146624e-09,
2426
+ "logits/chosen": -2.984375,
2427
+ "logits/rejected": -2.96875,
2428
+ "logps/chosen": -322.0,
2429
+ "logps/rejected": -372.0,
2430
+ "loss": 0.3766,
2431
+ "rewards/accuracies": 0.949999988079071,
2432
+ "rewards/chosen": -1.359375,
2433
+ "rewards/margins": 2.046875,
2434
+ "rewards/rejected": -3.40625,
2435
+ "step": 690
2436
+ },
2437
+ {
2438
+ "epoch": 0.9761570827489481,
2439
+ "eval_logits/chosen": -3.09375,
2440
+ "eval_logits/rejected": -3.125,
2441
+ "eval_logps/chosen": -342.0,
2442
+ "eval_logps/rejected": -338.0,
2443
+ "eval_loss": 0.3727646470069885,
2444
+ "eval_rewards/accuracies": 0.835106372833252,
2445
+ "eval_rewards/chosen": -1.0078125,
2446
+ "eval_rewards/margins": 2.140625,
2447
+ "eval_rewards/rejected": -3.140625,
2448
+ "eval_runtime": 65.8839,
2449
+ "eval_samples_per_second": 22.57,
2450
+ "eval_steps_per_second": 0.713,
2451
+ "step": 696
2452
+ },
2453
+ {
2454
+ "epoch": 0.9817671809256662,
2455
+ "grad_norm": 41.008248288595254,
2456
+ "learning_rate": 5.07263345378317e-10,
2457
+ "logits/chosen": -2.96875,
2458
+ "logits/rejected": -3.015625,
2459
+ "logps/chosen": -346.0,
2460
+ "logps/rejected": -358.0,
2461
+ "loss": 0.3699,
2462
+ "rewards/accuracies": 0.9125000238418579,
2463
+ "rewards/chosen": -0.8984375,
2464
+ "rewards/margins": 2.578125,
2465
+ "rewards/rejected": -3.484375,
2466
+ "step": 700
2467
+ },
2468
+ {
2469
+ "epoch": 0.9873772791023843,
2470
+ "eval_logits/chosen": -3.09375,
2471
+ "eval_logits/rejected": -3.125,
2472
+ "eval_logps/chosen": -342.0,
2473
+ "eval_logps/rejected": -338.0,
2474
+ "eval_loss": 0.372467964887619,
2475
+ "eval_rewards/accuracies": 0.835106372833252,
2476
+ "eval_rewards/chosen": -1.015625,
2477
+ "eval_rewards/margins": 2.125,
2478
+ "eval_rewards/rejected": -3.140625,
2479
+ "eval_runtime": 65.8027,
2480
+ "eval_samples_per_second": 22.598,
2481
+ "eval_steps_per_second": 0.714,
2482
+ "step": 704
2483
+ },
2484
+ {
2485
+ "epoch": 0.9957924263674615,
2486
+ "grad_norm": 50.076560093112214,
2487
+ "learning_rate": 2.7022677908467016e-11,
2488
+ "logits/chosen": -3.03125,
2489
+ "logits/rejected": -3.015625,
2490
+ "logps/chosen": -394.0,
2491
+ "logps/rejected": -354.0,
2492
+ "loss": 0.3758,
2493
+ "rewards/accuracies": 0.8500000238418579,
2494
+ "rewards/chosen": -1.1015625,
2495
+ "rewards/margins": 1.9296875,
2496
+ "rewards/rejected": -3.03125,
2497
+ "step": 710
2498
+ },
2499
+ {
2500
+ "epoch": 0.9985974754558204,
2501
+ "eval_logits/chosen": -3.09375,
2502
+ "eval_logits/rejected": -3.125,
2503
+ "eval_logps/chosen": -342.0,
2504
+ "eval_logps/rejected": -338.0,
2505
+ "eval_loss": 0.37290096282958984,
2506
+ "eval_rewards/accuracies": 0.835106372833252,
2507
+ "eval_rewards/chosen": -1.0078125,
2508
+ "eval_rewards/margins": 2.140625,
2509
+ "eval_rewards/rejected": -3.140625,
2510
+ "eval_runtime": 66.4035,
2511
+ "eval_samples_per_second": 22.393,
2512
+ "eval_steps_per_second": 0.708,
2513
+ "step": 712
2514
+ },
2515
+ {
2516
+ "epoch": 1.0,
2517
+ "step": 713,
2518
+ "total_flos": 0.0,
2519
+ "train_loss": 0.4091535410406212,
2520
+ "train_runtime": 30868.5922,
2521
+ "train_samples_per_second": 1.477,
2522
+ "train_steps_per_second": 0.023
2523
+ }
2524
+ ],
2525
+ "logging_steps": 10,
2526
+ "max_steps": 713,
2527
+ "num_input_tokens_seen": 0,
2528
+ "num_train_epochs": 1,
2529
+ "save_steps": 8,
2530
+ "stateful_callbacks": {
2531
+ "TrainerControl": {
2532
+ "args": {
2533
+ "should_epoch_stop": false,
2534
+ "should_evaluate": false,
2535
+ "should_log": false,
2536
+ "should_save": true,
2537
+ "should_training_stop": true
2538
+ },
2539
+ "attributes": {}
2540
+ }
2541
+ },
2542
+ "total_flos": 0.0,
2543
+ "train_batch_size": 4,
2544
+ "trial_name": null,
2545
+ "trial_params": null
2546
+ }