diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,31284 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 485.0,
+      "completions/mean_length": 262.625,
+      "completions/mean_terminated_length": 179.5,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "epoch": 0.0008,
+      "grad_norm": 5.598288059234619,
+      "kl": 0.0005154609680175781,
+      "learning_rate": 1.5873015873015872e-08,
+      "loss": 0.0537,
+      "num_tokens": 15100.0,
+      "reward": 0.04846250265836716,
+      "reward_std": 0.06843117624521255,
+      "rewards/bleu_reward_func/mean": 0.04846250265836716,
+      "rewards/bleu_reward_func/std": 0.07639143615961075,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 306.0,
+      "completions/mean_length": 248.09375,
+      "completions/mean_terminated_length": 128.13636779785156,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.0016,
+      "grad_norm": 7.323095321655273,
+      "kl": 0.0005979537963867188,
+      "learning_rate": 3.1746031746031744e-08,
+      "loss": 0.2393,
+      "num_tokens": 31479.0,
+      "reward": 0.03515050560235977,
+      "reward_std": 0.0315697155892849,
+      "rewards/bleu_reward_func/mean": 0.03515050560235977,
+      "rewards/bleu_reward_func/std": 0.048244670033454895,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 503.0,
+      "completions/mean_length": 258.34375,
+      "completions/mean_terminated_length": 159.0869598388672,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "epoch": 0.0024,
+      "grad_norm": 5.801818370819092,
+      "kl": 0.0008335113525390625,
+      "learning_rate": 4.7619047619047613e-08,
+      "loss": 0.2227,
+      "num_tokens": 47330.0,
+      "reward": 0.0770750418305397,
+      "reward_std": 0.05211775749921799,
+      "rewards/bleu_reward_func/mean": 0.0770750418305397,
+      "rewards/bleu_reward_func/std": 0.07082299888134003,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 285.59375,
+      "completions/mean_terminated_length": 197.0,
+      "completions/min_length": 6.0,
+      "completions/min_terminated_length": 6.0,
+      "epoch": 0.0032,
+      "grad_norm": 6.7342329025268555,
+      "kl": 0.0007953643798828125,
+      "learning_rate": 6.349206349206349e-08,
+      "loss": 0.1714,
+      "num_tokens": 62101.0,
+      "reward": 0.05630416050553322,
+      "reward_std": 0.0387054979801178,
+      "rewards/bleu_reward_func/mean": 0.05630416050553322,
+      "rewards/bleu_reward_func/std": 0.05173136293888092,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 479.0,
+      "completions/mean_length": 312.75,
+      "completions/mean_terminated_length": 208.38095092773438,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "epoch": 0.004,
+      "grad_norm": 4.261541843414307,
+      "kl": 0.0007123947143554688,
+      "learning_rate": 7.936507936507936e-08,
+      "loss": 0.0096,
+      "num_tokens": 74629.0,
+      "reward": 0.03661263734102249,
+      "reward_std": 0.02765350043773651,
+      "rewards/bleu_reward_func/mean": 0.03661263734102249,
+      "rewards/bleu_reward_func/std": 0.05122661218047142,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 472.0,
+      "completions/mean_length": 269.0625,
+      "completions/mean_terminated_length": 80.11111450195312,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "epoch": 0.0048,
+      "grad_norm": 25.550201416015625,
+      "kl": 0.000881195068359375,
+      "learning_rate": 9.523809523809523e-08,
+      "loss": -0.1788,
+      "num_tokens": 91711.0,
+      "reward": 0.01917407289147377,
+      "reward_std": 0.014019257389008999,
+      "rewards/bleu_reward_func/mean": 0.01917407289147377,
+      "rewards/bleu_reward_func/std": 0.024173468351364136,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 499.0,
+      "completions/mean_length": 377.8125,
+      "completions/mean_terminated_length": 259.4117736816406,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "epoch": 0.0056,
+      "grad_norm": 3.7816717624664307,
+      "kl": 0.0007390975952148438,
+      "learning_rate": 1.111111111111111e-07,
+      "loss": -0.2289,
+      "num_tokens": 107369.0,
+      "reward": 0.02209433726966381,
+      "reward_std": 0.011734157800674438,
+      "rewards/bleu_reward_func/mean": 0.02209433726966381,
+      "rewards/bleu_reward_func/std": 0.023080473765730858,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 493.0,
+      "completions/mean_length": 283.5625,
+      "completions/mean_terminated_length": 146.5,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "epoch": 0.0064,
+      "grad_norm": 4.340329647064209,
+      "kl": 0.000911712646484375,
+      "learning_rate": 1.2698412698412698e-07,
+      "loss": -0.0252,
+      "num_tokens": 125275.0,
+      "reward": 0.03392016887664795,
+      "reward_std": 0.04013249650597572,
+      "rewards/bleu_reward_func/mean": 0.03392016887664795,
+      "rewards/bleu_reward_func/std": 0.05353143438696861,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 490.0,
+      "completions/mean_length": 282.09375,
+      "completions/mean_terminated_length": 192.13043212890625,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "epoch": 0.0072,
+      "grad_norm": 5.671853542327881,
+      "kl": 0.000640869140625,
+      "learning_rate": 1.4285714285714285e-07,
+      "loss": -0.4792,
+      "num_tokens": 142190.0,
+      "reward": 0.02354184165596962,
+      "reward_std": 0.015565130859613419,
+      "rewards/bleu_reward_func/mean": 0.02354184165596962,
+      "rewards/bleu_reward_func/std": 0.02305246703326702,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 421.625,
+      "completions/mean_terminated_length": 359.78948974609375,
+      "completions/min_length": 86.0,
+      "completions/min_terminated_length": 86.0,
+      "epoch": 0.008,
+      "grad_norm": 3.240866184234619,
+      "kl": 0.0006823539733886719,
+      "learning_rate": 1.5873015873015872e-07,
+      "loss": -0.0021,
+      "num_tokens": 158282.0,
+      "reward": 0.02482026070356369,
+      "reward_std": 0.0131409652531147,
+      "rewards/bleu_reward_func/mean": 0.02482026070356369,
+      "rewards/bleu_reward_func/std": 0.015270248055458069,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 486.0,
+      "completions/mean_length": 345.90625,
+      "completions/mean_terminated_length": 199.35293579101562,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "epoch": 0.0088,
+      "grad_norm": 3.652275800704956,
+      "kl": 0.0006260871887207031,
+      "learning_rate": 1.7460317460317458e-07,
+      "loss": -0.2852,
+      "num_tokens": 177455.0,
+      "reward": 0.03390186280012131,
+      "reward_std": 0.016770539805293083,
+      "rewards/bleu_reward_func/mean": 0.03390186280012131,
+      "rewards/bleu_reward_func/std": 0.04328485205769539,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 438.0,
+      "completions/mean_length": 275.46875,
+      "completions/mean_terminated_length": 196.625,
+      "completions/min_length": 8.0,
+      "completions/min_terminated_length": 8.0,
+      "epoch": 0.0096,
+      "grad_norm": 73.21807098388672,
+      "kl": 0.0032701492309570312,
+      "learning_rate": 1.9047619047619045e-07,
+      "loss": 0.0661,
+      "num_tokens": 189486.0,
+      "reward": 0.022345196455717087,
+      "reward_std": 0.019753258675336838,
+      "rewards/bleu_reward_func/mean": 0.022345196455717087,
+      "rewards/bleu_reward_func/std": 0.020975911989808083,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 408.0,
+      "completions/mean_length": 309.90625,
+      "completions/mean_terminated_length": 188.65000915527344,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "epoch": 0.0104,
+      "grad_norm": 5.3057379722595215,
+      "kl": 0.0005865097045898438,
+      "learning_rate": 2.0634920634920632e-07,
+      "loss": -0.1972,
+      "num_tokens": 203691.0,
+      "reward": 0.031099505722522736,
+      "reward_std": 0.04415294528007507,
+      "rewards/bleu_reward_func/mean": 0.031099505722522736,
+      "rewards/bleu_reward_func/std": 0.05319083109498024,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 495.0,
+      "completions/mean_length": 211.5625,
+      "completions/mean_terminated_length": 127.43999481201172,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "epoch": 0.0112,
+      "grad_norm": 10.997786521911621,
+      "kl": 0.0007891654968261719,
+      "learning_rate": 2.222222222222222e-07,
+      "loss": 0.005,
+      "num_tokens": 220117.0,
+      "reward": 0.07334433495998383,
+      "reward_std": 0.05255947634577751,
+      "rewards/bleu_reward_func/mean": 0.07334433495998383,
+      "rewards/bleu_reward_func/std": 0.11127088218927383,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 481.0,
+      "completions/mean_length": 322.71875,
+      "completions/mean_terminated_length": 175.5,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "epoch": 0.012,
+      "grad_norm": 4.558916091918945,
+      "kl": 0.000732421875,
+      "learning_rate": 2.3809523809523806e-07,
+      "loss": -0.1845,
+      "num_tokens": 232508.0,
+      "reward": 0.01538888644427061,
+      "reward_std": 0.012768322601914406,
+      "rewards/bleu_reward_func/mean": 0.01538888644427061,
+      "rewards/bleu_reward_func/std": 0.01415330171585083,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 397.0,
+      "completions/mean_length": 266.78125,
+      "completions/mean_terminated_length": 138.33334350585938,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "epoch": 0.0128,
+      "grad_norm": 4.418691158294678,
+      "kl": 0.0009031295776367188,
+      "learning_rate": 2.5396825396825396e-07,
+      "loss": 0.2931,
+      "num_tokens": 246325.0,
+      "reward": 0.04519380256533623,
+      "reward_std": 0.047629594802856445,
+      "rewards/bleu_reward_func/mean": 0.04519380256533623,
+      "rewards/bleu_reward_func/std": 0.09796681255102158,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 498.0,
+      "completions/mean_length": 375.375,
+      "completions/mean_terminated_length": 238.75,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "epoch": 0.0136,
+      "grad_norm": 4.360379219055176,
+      "kl": 0.0007829666137695312,
+      "learning_rate": 2.698412698412698e-07,
+      "loss": 0.0392,
+      "num_tokens": 262393.0,
+      "reward": 0.02785748988389969,
+      "reward_std": 0.02370397374033928,
+      "rewards/bleu_reward_func/mean": 0.02785748988389969,
+      "rewards/bleu_reward_func/std": 0.031648874282836914,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 341.625,
+      "completions/mean_terminated_length": 171.25,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "epoch": 0.0144,
+      "grad_norm": 4.028530597686768,
+      "kl": 0.0006041526794433594,
+      "learning_rate": 2.857142857142857e-07,
+      "loss": -0.0867,
+      "num_tokens": 276509.0,
+      "reward": 0.03313319757580757,
+      "reward_std": 0.026780985295772552,
+      "rewards/bleu_reward_func/mean": 0.03313319757580757,
+      "rewards/bleu_reward_func/std": 0.03177988529205322,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 323.78125,
+      "completions/mean_terminated_length": 250.13043212890625,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.0152,
+      "grad_norm": 6.029330253601074,
+      "kl": 0.0007543563842773438,
+      "learning_rate": 3.0158730158730156e-07,
+      "loss": 0.2177,
+      "num_tokens": 288774.0,
+      "reward": 0.04934918135404587,
+      "reward_std": 0.035659849643707275,
+      "rewards/bleu_reward_func/mean": 0.04934918135404587,
+      "rewards/bleu_reward_func/std": 0.046043358743190765,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 495.0,
+      "completions/mean_length": 365.5625,
+      "completions/mean_terminated_length": 251.6666717529297,
+      "completions/min_length": 8.0,
+      "completions/min_terminated_length": 8.0,
+      "epoch": 0.016,
+      "grad_norm": 3.2459499835968018,
+      "kl": 0.0007076263427734375,
+      "learning_rate": 3.1746031746031743e-07,
+      "loss": -0.1034,
+      "num_tokens": 302384.0,
+      "reward": 0.045273810625076294,
+      "reward_std": 0.033148057758808136,
+      "rewards/bleu_reward_func/mean": 0.045273810625076294,
+      "rewards/bleu_reward_func/std": 0.05641715228557587,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 470.0,
+      "completions/mean_length": 309.4375,
+      "completions/mean_terminated_length": 170.84210205078125,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "epoch": 0.0168,
+      "grad_norm": 3.209543228149414,
+      "kl": 0.0006561279296875,
+      "learning_rate": 3.333333333333333e-07,
+      "loss": -0.0094,
+      "num_tokens": 317406.0,
+      "reward": 0.10972930490970612,
+      "reward_std": 0.09467534720897675,
+      "rewards/bleu_reward_func/mean": 0.10972930490970612,
+      "rewards/bleu_reward_func/std": 0.1834246814250946,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 420.0,
+      "completions/mean_length": 276.4375,
+      "completions/mean_terminated_length": 169.3636474609375,
+      "completions/min_length": 39.0,
+      "completions/min_terminated_length": 39.0,
+      "epoch": 0.0176,
+      "grad_norm": 6.837025165557861,
+      "kl": 0.0008249282836914062,
+      "learning_rate": 3.4920634920634917e-07,
+      "loss": 0.192,
+      "num_tokens": 331436.0,
+      "reward": 0.08987575769424438,
+      "reward_std": 0.03435216099023819,
+      "rewards/bleu_reward_func/mean": 0.08987575769424438,
+      "rewards/bleu_reward_func/std": 0.13043095171451569,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 453.0,
+      "completions/mean_length": 392.46875,
+      "completions/mean_terminated_length": 272.9375,
+      "completions/min_length": 35.0,
+      "completions/min_terminated_length": 35.0,
+      "epoch": 0.0184,
+      "grad_norm": 6.737916946411133,
+      "kl": 0.00086212158203125,
+      "learning_rate": 3.6507936507936504e-07,
+      "loss": -0.0441,
+      "num_tokens": 349715.0,
+      "reward": 0.027110569179058075,
+      "reward_std": 0.01938316598534584,
+      "rewards/bleu_reward_func/mean": 0.027110569179058075,
+      "rewards/bleu_reward_func/std": 0.021934401243925095,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 500.0,
+      "completions/mean_length": 274.65625,
+      "completions/mean_terminated_length": 230.70370483398438,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "epoch": 0.0192,
+      "grad_norm": 10.491765022277832,
+      "kl": 0.0007648468017578125,
+      "learning_rate": 3.809523809523809e-07,
+      "loss": 0.269,
+      "num_tokens": 360336.0,
+      "reward": 0.03281049802899361,
+      "reward_std": 0.023013217374682426,
+      "rewards/bleu_reward_func/mean": 0.03281049802899361,
+      "rewards/bleu_reward_func/std": 0.026025522500276566,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 418.0,
+      "completions/mean_length": 269.59375,
+      "completions/mean_terminated_length": 103.7368392944336,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.02,
+      "grad_norm": 5.670685291290283,
+      "kl": 0.0008592605590820312,
+      "learning_rate": 3.968253968253968e-07,
+      "loss": 0.2917,
+      "num_tokens": 374179.0,
+      "reward": 0.04281582683324814,
+      "reward_std": 0.0440773144364357,
+      "rewards/bleu_reward_func/mean": 0.04281582683324814,
+      "rewards/bleu_reward_func/std": 0.0797559842467308,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 499.0,
+      "completions/mean_length": 296.96875,
+      "completions/mean_terminated_length": 184.33334350585938,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "epoch": 0.0208,
+      "grad_norm": 6.63213586807251,
+      "kl": 0.0011281967163085938,
+      "learning_rate": 4.1269841269841265e-07,
+      "loss": 0.0991,
+      "num_tokens": 386458.0,
+      "reward": 0.07768785208463669,
+      "reward_std": 0.08760131150484085,
+      "rewards/bleu_reward_func/mean": 0.07768785208463669,
+      "rewards/bleu_reward_func/std": 0.12583571672439575,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 447.0,
+      "completions/mean_length": 280.4375,
+      "completions/mean_terminated_length": 175.18182373046875,
+      "completions/min_length": 46.0,
+      "completions/min_terminated_length": 46.0,
+      "epoch": 0.0216,
+      "grad_norm": 5.1314802169799805,
+      "kl": 0.0008611679077148438,
+      "learning_rate": 4.285714285714285e-07,
+      "loss": 0.2129,
+      "num_tokens": 399600.0,
+      "reward": 0.034803349524736404,
+      "reward_std": 0.033125463873147964,
+      "rewards/bleu_reward_func/mean": 0.034803349524736404,
+      "rewards/bleu_reward_func/std": 0.04297792166471481,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 405.0,
+      "completions/mean_length": 403.625,
+      "completions/mean_terminated_length": 196.72727966308594,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "epoch": 0.0224,
+      "grad_norm": 2.8215885162353516,
+      "kl": 0.0007638931274414062,
+      "learning_rate": 4.444444444444444e-07,
+      "loss": -0.2953,
+      "num_tokens": 415372.0,
+      "reward": 0.02452818863093853,
+      "reward_std": 0.018821807578206062,
+      "rewards/bleu_reward_func/mean": 0.02452818863093853,
+      "rewards/bleu_reward_func/std": 0.03300207853317261,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 458.0,
+      "completions/mean_length": 213.84375,
+      "completions/mean_terminated_length": 114.45833587646484,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "epoch": 0.0232,
+      "grad_norm": 7.012094020843506,
+      "kl": 0.0006189346313476562,
+      "learning_rate": 4.6031746031746025e-07,
+      "loss": 0.119,
+      "num_tokens": 428351.0,
+      "reward": 0.055403269827365875,
+      "reward_std": 0.06412488222122192,
+      "rewards/bleu_reward_func/mean": 0.055403269827365875,
+      "rewards/bleu_reward_func/std": 0.07173087447881699,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 496.0,
+      "completions/mean_length": 374.28125,
+      "completions/mean_terminated_length": 291.6499938964844,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "epoch": 0.024,
+      "grad_norm": 5.251861095428467,
+      "kl": 0.0007734298706054688,
+      "learning_rate": 4.761904761904761e-07,
+      "loss": 0.0396,
+      "num_tokens": 443856.0,
+      "reward": 0.033150382339954376,
+      "reward_std": 0.029685020446777344,
+      "rewards/bleu_reward_func/mean": 0.033150382339954376,
+      "rewards/bleu_reward_func/std": 0.04449395835399628,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 482.0,
+      "completions/mean_length": 352.6875,
+      "completions/mean_terminated_length": 212.11764526367188,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.0248,
+      "grad_norm": 2.8306992053985596,
+      "kl": 0.0007534027099609375,
+      "learning_rate": 4.92063492063492e-07,
+      "loss": 0.0386,
+      "num_tokens": 458846.0,
+      "reward": 0.07098191231489182,
+      "reward_std": 0.07976502180099487,
+      "rewards/bleu_reward_func/mean": 0.07098191231489182,
+      "rewards/bleu_reward_func/std": 0.13301755487918854,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 476.0,
+      "completions/mean_length": 314.96875,
+      "completions/mean_terminated_length": 249.2916717529297,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "epoch": 0.0256,
+      "grad_norm": 4.18798303604126,
+      "kl": 0.0013475418090820312,
+      "learning_rate": 5.079365079365079e-07,
+      "loss": 0.1184,
+      "num_tokens": 475693.0,
+      "reward": 0.06003670394420624,
+      "reward_std": 0.04762943834066391,
+      "rewards/bleu_reward_func/mean": 0.06003670394420624,
+      "rewards/bleu_reward_func/std": 0.06799852848052979,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 433.0,
+      "completions/mean_length": 246.09375,
+      "completions/mean_terminated_length": 157.45834350585938,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "epoch": 0.0264,
+      "grad_norm": 7.522784233093262,
+      "kl": 0.0013055801391601562,
+      "learning_rate": 5.238095238095238e-07,
+      "loss": 0.2574,
+      "num_tokens": 489632.0,
+      "reward": 0.035463202744722366,
+      "reward_std": 0.02683849260210991,
+      "rewards/bleu_reward_func/mean": 0.035463202744722366,
+      "rewards/bleu_reward_func/std": 0.05300255864858627,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 387.0,
+      "completions/mean_length": 281.40625,
+      "completions/mean_terminated_length": 191.17391967773438,
+      "completions/min_length": 8.0,
+      "completions/min_terminated_length": 8.0,
+      "epoch": 0.0272,
+      "grad_norm": 4.312166213989258,
+      "kl": 0.0016279220581054688,
+      "learning_rate": 5.396825396825396e-07,
+      "loss": 0.0276,
+      "num_tokens": 503221.0,
+      "reward": 0.036928486078977585,
+      "reward_std": 0.030746515840291977,
+      "rewards/bleu_reward_func/mean": 0.036928486078977585,
+      "rewards/bleu_reward_func/std": 0.041675370186567307,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 447.0,
+      "completions/mean_length": 363.0,
+      "completions/mean_terminated_length": 231.5294189453125,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "epoch": 0.028,
+      "grad_norm": 5.934871196746826,
+      "kl": 0.001247406005859375,
+      "learning_rate": 5.555555555555555e-07,
+      "loss": 0.0007,
+      "num_tokens": 519629.0,
+      "reward": 0.02279968000948429,
+      "reward_std": 0.0171576626598835,
+      "rewards/bleu_reward_func/mean": 0.02279968000948429,
+      "rewards/bleu_reward_func/std": 0.02809896320104599,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 471.0,
+      "completions/mean_length": 245.125,
+      "completions/mean_terminated_length": 227.33334350585938,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "epoch": 0.0288,
+      "grad_norm": 6.152184963226318,
+      "kl": 0.00135040283203125,
+      "learning_rate": 5.714285714285714e-07,
+      "loss": 0.1277,
+      "num_tokens": 531009.0,
+      "reward": 0.08614860475063324,
+      "reward_std": 0.05592390149831772,
+      "rewards/bleu_reward_func/mean": 0.08614860475063324,
+      "rewards/bleu_reward_func/std": 0.07292494177818298,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 477.0,
+      "completions/mean_length": 315.46875,
+      "completions/mean_terminated_length": 226.13636779785156,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.0296,
+      "grad_norm": 4.999361991882324,
+      "kl": 0.0010061264038085938,
+      "learning_rate": 5.873015873015873e-07,
+      "loss": -0.1945,
+      "num_tokens": 553904.0,
+      "reward": 0.022978566586971283,
+      "reward_std": 0.0320000983774662,
+      "rewards/bleu_reward_func/mean": 0.022978566586971283,
+      "rewards/bleu_reward_func/std": 0.05384916067123413,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 501.0,
+      "completions/mean_length": 255.125,
+      "completions/mean_terminated_length": 195.84616088867188,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "epoch": 0.0304,
+      "grad_norm": 12.462119102478027,
+      "kl": 0.00140380859375,
+      "learning_rate": 6.031746031746031e-07,
+      "loss": -0.0499,
+      "num_tokens": 569980.0,
+      "reward": 0.06601191312074661,
+      "reward_std": 0.06571432948112488,
+      "rewards/bleu_reward_func/mean": 0.06601191312074661,
+      "rewards/bleu_reward_func/std": 0.11037519574165344,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 501.0,
+      "completions/mean_length": 350.78125,
+      "completions/mean_terminated_length": 287.6956481933594,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "epoch": 0.0312,
+      "grad_norm": 4.318572044372559,
+      "kl": 0.0013608932495117188,
+      "learning_rate": 6.19047619047619e-07,
+      "loss": 0.1581,
+      "num_tokens": 584165.0,
+      "reward": 0.03686396777629852,
+      "reward_std": 0.00873212143778801,
+      "rewards/bleu_reward_func/mean": 0.03686396777629852,
+      "rewards/bleu_reward_func/std": 0.03987700119614601,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 467.0,
+      "completions/mean_length": 395.40625,
+      "completions/mean_terminated_length": 315.631591796875,
+      "completions/min_length": 60.0,
+      "completions/min_terminated_length": 60.0,
+      "epoch": 0.032,
+      "grad_norm": 2.8483028411865234,
+      "kl": 0.0015163421630859375,
+      "learning_rate": 6.349206349206349e-07,
+      "loss": 0.2409,
+      "num_tokens": 599602.0,
+      "reward": 0.012605215422809124,
+      "reward_std": 0.007717709057033062,
+      "rewards/bleu_reward_func/mean": 0.012605215422809124,
+      "rewards/bleu_reward_func/std": 0.008546828292310238,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 401.0,
+      "completions/mean_length": 306.46875,
+      "completions/mean_terminated_length": 125.11764526367188,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.0328,
+      "grad_norm": 15.546673774719238,
+      "kl": 0.0033998489379882812,
+      "learning_rate": 6.507936507936507e-07,
+      "loss": 0.2262,
+      "num_tokens": 617761.0,
+      "reward": 0.037311654537916183,
+      "reward_std": 0.04001215100288391,
+      "rewards/bleu_reward_func/mean": 0.037311654537916183,
+      "rewards/bleu_reward_func/std": 0.05116492509841919,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 394.0,
+      "completions/mean_length": 308.375,
+      "completions/mean_terminated_length": 77.60000610351562,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "epoch": 0.0336,
+      "grad_norm": 9.35763168334961,
+      "kl": 0.0030651092529296875,
+      "learning_rate": 6.666666666666666e-07,
+      "loss": 0.1783,
+      "num_tokens": 637541.0,
+      "reward": 0.06710080057382584,
+      "reward_std": 0.0418785884976387,
+      "rewards/bleu_reward_func/mean": 0.06710080057382584,
+      "rewards/bleu_reward_func/std": 0.09365852922201157,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 336.59375,
+      "completions/mean_terminated_length": 216.57894897460938,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.0344,
+      "grad_norm": 4.267389297485352,
+      "kl": 0.002620697021484375,
+      "learning_rate": 6.825396825396826e-07,
+      "loss": -0.0163,
+      "num_tokens": 650776.0,
+      "reward": 0.04351692646741867,
+      "reward_std": 0.03509015589952469,
+      "rewards/bleu_reward_func/mean": 0.04351692646741867,
+      "rewards/bleu_reward_func/std": 0.052853576838970184,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 387.0,
+      "completions/mean_length": 240.5,
+      "completions/mean_terminated_length": 177.84616088867188,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.0352,
+      "grad_norm": 112.21627807617188,
+      "kl": 0.0047245025634765625,
+      "learning_rate": 6.984126984126983e-07,
+      "loss": 0.0521,
+      "num_tokens": 665576.0,
+      "reward": 0.05281548202037811,
+      "reward_std": 0.034495480358600616,
+      "rewards/bleu_reward_func/mean": 0.05281548202037811,
+      "rewards/bleu_reward_func/std": 0.0704483836889267,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 477.0,
+      "completions/mean_length": 335.46875,
+      "completions/mean_terminated_length": 310.25,
+      "completions/min_length": 81.0,
+      "completions/min_terminated_length": 81.0,
+      "epoch": 0.036,
+      "grad_norm": 30.739818572998047,
+      "kl": 0.00278472900390625,
+      "learning_rate": 7.142857142857143e-07,
+      "loss": -0.1091,
+      "num_tokens": 678375.0,
+      "reward": 0.04049266129732132,
+      "reward_std": 0.020605597645044327,
+      "rewards/bleu_reward_func/mean": 0.04049266129732132,
+      "rewards/bleu_reward_func/std": 0.04322003573179245,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 456.0,
+      "completions/mean_length": 253.28125,
+      "completions/mean_terminated_length": 205.37037658691406,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.0368,
+      "grad_norm": 3.857532501220703,
+      "kl": 0.002582550048828125,
+      "learning_rate": 7.301587301587301e-07,
+      "loss": 0.1863,
+      "num_tokens": 693632.0,
+      "reward": 0.03602021187543869,
+      "reward_std": 0.03167928382754326,
+      "rewards/bleu_reward_func/mean": 0.03602021187543869,
+      "rewards/bleu_reward_func/std": 0.060269005596637726,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 457.0,
+      "completions/mean_length": 379.125,
+      "completions/mean_terminated_length": 208.2857208251953,
+      "completions/min_length": 42.0,
+      "completions/min_terminated_length": 42.0,
+      "epoch": 0.0376,
+      "grad_norm": 44.705684661865234,
+      "kl": 0.002960205078125,
+      "learning_rate": 7.46031746031746e-07,
+      "loss": -0.2537,
+      "num_tokens": 712244.0,
+      "reward": 0.009683560580015182,
+      "reward_std": 0.007736856117844582,
+      "rewards/bleu_reward_func/mean": 0.009683560580015182,
+      "rewards/bleu_reward_func/std": 0.010262547992169857,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 491.0,
+      "completions/mean_length": 209.46875,
+      "completions/mean_terminated_length": 153.44444274902344,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "epoch": 0.0384,
+      "grad_norm": 4.927426815032959,
+      "kl": 0.010406494140625,
+      "learning_rate": 7.619047619047618e-07,
+      "loss": 0.2249,
+      "num_tokens": 722779.0,
+      "reward": 0.06434739381074905,
+      "reward_std": 0.062096044421195984,
+      "rewards/bleu_reward_func/mean": 0.06434739381074905,
+      "rewards/bleu_reward_func/std": 0.07261113822460175,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 463.0,
+      "completions/mean_length": 319.53125,
+      "completions/mean_terminated_length": 187.84210205078125,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.0392,
+      "grad_norm": 6.036177158355713,
+      "kl": 0.0051422119140625,
+      "learning_rate": 7.777777777777778e-07,
+      "loss": 0.2132,
+      "num_tokens": 735892.0,
+      "reward": 0.0316137932240963,
+      "reward_std": 0.028243713080883026,
+      "rewards/bleu_reward_func/mean": 0.0316137932240963,
+      "rewards/bleu_reward_func/std": 0.032289810478687286,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 489.0,
+      "completions/mean_length": 329.0625,
+      "completions/mean_terminated_length": 233.23809814453125,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "epoch": 0.04,
+      "grad_norm": 6.000904560089111,
+      "kl": 0.0041103363037109375,
+      "learning_rate": 7.936507936507936e-07,
+      "loss": 0.164,
+      "num_tokens": 748926.0,
+      "reward": 0.031059542670845985,
+      "reward_std": 0.02046222612261772,
+      "rewards/bleu_reward_func/mean": 0.031059542670845985,
+      "rewards/bleu_reward_func/std": 0.029215287417173386,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 460.0,
+      "completions/mean_length": 240.03125,
+      "completions/mean_terminated_length": 201.17857360839844,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "epoch": 0.0408,
+      "grad_norm": 4.45400333404541,
+      "kl": 0.00446319580078125,
+      "learning_rate": 8.095238095238095e-07,
+      "loss": 0.095,
+      "num_tokens": 763935.0,
+      "reward": 0.06022896245121956,
+      "reward_std": 0.04401791840791702,
+      "rewards/bleu_reward_func/mean": 0.06022896245121956,
+      "rewards/bleu_reward_func/std": 0.06288844347000122,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 488.0,
+      "completions/mean_length": 308.875,
+      "completions/mean_terminated_length": 169.89474487304688,
+      "completions/min_length": 6.0,
+      "completions/min_terminated_length": 6.0,
+      "epoch": 0.0416,
+      "grad_norm": 6.896392822265625,
+      "kl": 0.00627899169921875,
+      "learning_rate": 8.253968253968253e-07,
+      "loss": 0.1302,
+      "num_tokens": 781619.0,
+      "reward": 0.02847466617822647,
+      "reward_std": 0.024918708950281143,
+      "rewards/bleu_reward_func/mean": 0.02847466617822647,
+      "rewards/bleu_reward_func/std": 0.03209677338600159,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 472.0,
+      "completions/mean_length": 309.9375,
+      "completions/mean_terminated_length": 263.3077087402344,
+      "completions/min_length": 50.0,
+      "completions/min_terminated_length": 50.0,
+      "epoch": 0.0424,
+      "grad_norm": 5.093250751495361,
+      "kl": 0.004283905029296875,
+      "learning_rate": 8.412698412698413e-07,
+      "loss": -0.0777,
+      "num_tokens": 795977.0,
+      "reward": 0.07096201926469803,
+      "reward_std": 0.06636855751276016,
+      "rewards/bleu_reward_func/mean": 0.07096201926469803,
+      "rewards/bleu_reward_func/std": 0.09039857983589172,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 484.0,
+      "completions/mean_length": 386.4375,
+      "completions/mean_terminated_length": 202.92308044433594,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.0432,
+      "grad_norm": 3.2264180183410645,
+      "kl": 0.0028362274169921875,
+      "learning_rate": 8.57142857142857e-07,
+      "loss": -0.0863,
+      "num_tokens": 814135.0,
+      "reward": 0.014086933806538582,
+      "reward_std": 0.013363949954509735,
+      "rewards/bleu_reward_func/mean": 0.014086933806538582,
+      "rewards/bleu_reward_func/std": 0.01598522998392582,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 456.0,
+      "completions/mean_length": 244.59375,
+      "completions/mean_terminated_length": 195.07408142089844,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.044,
+      "grad_norm": 8.706979751586914,
+      "kl": 0.00571441650390625,
+      "learning_rate": 8.73015873015873e-07,
+      "loss": 0.1609,
+      "num_tokens": 827226.0,
+      "reward": 0.0647934228181839,
+      "reward_std": 0.0345802828669548,
+      "rewards/bleu_reward_func/mean": 0.0647934228181839,
+      "rewards/bleu_reward_func/std": 0.04030924290418625,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 463.0,
+      "completions/mean_length": 126.0625,
+      "completions/mean_terminated_length": 113.61289978027344,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.0448,
+      "grad_norm": 8.598736763000488,
+      "kl": 0.0142974853515625,
+      "learning_rate": 8.888888888888888e-07,
+      "loss": 0.1419,
+      "num_tokens": 834268.0,
+      "reward": 0.04880748316645622,
+      "reward_std": 0.042880259454250336,
+      "rewards/bleu_reward_func/mean": 0.04880748316645622,
+      "rewards/bleu_reward_func/std": 0.05060458555817604,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 479.0,
+      "completions/mean_length": 302.03125,
+      "completions/mean_terminated_length": 219.86956787109375,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "epoch": 0.0456,
+      "grad_norm": 6.926377296447754,
+      "kl": 0.009532928466796875,
+      "learning_rate": 9.047619047619047e-07,
+      "loss": -0.0374,
+      "num_tokens": 851701.0,
+      "reward": 0.06913506239652634,
+      "reward_std": 0.04138587415218353,
+      "rewards/bleu_reward_func/mean": 0.06913506239652634,
+      "rewards/bleu_reward_func/std": 0.0750163346529007,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 416.0,
+      "completions/mean_length": 211.40625,
+      "completions/mean_terminated_length": 127.23999786376953,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.0464,
+      "grad_norm": 7.853041648864746,
+      "kl": 0.0272064208984375,
+      "learning_rate": 9.206349206349205e-07,
+      "loss": 0.4023,
+      "num_tokens": 865434.0,
+      "reward": 0.12499310076236725,
+      "reward_std": 0.08980046212673187,
+      "rewards/bleu_reward_func/mean": 0.12499310076236725,
+      "rewards/bleu_reward_func/std": 0.13493874669075012,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 390.65625,
+      "completions/mean_terminated_length": 283.5882263183594,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "epoch": 0.0472,
+      "grad_norm": 2.4230539798736572,
+      "kl": 0.003765106201171875,
+      "learning_rate": 9.365079365079365e-07,
+      "loss": -0.0092,
+      "num_tokens": 884815.0,
+      "reward": 0.021261584013700485,
+      "reward_std": 0.027461236342787743,
+      "rewards/bleu_reward_func/mean": 0.021261584013700485,
+      "rewards/bleu_reward_func/std": 0.03110821731388569,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 495.0,
+      "completions/mean_length": 281.59375,
+      "completions/mean_terminated_length": 191.43478393554688,
+      "completions/min_length": 35.0,
+      "completions/min_terminated_length": 35.0,
+      "epoch": 0.048,
+      "grad_norm": 2.9367215633392334,
+      "kl": 0.00628662109375,
+      "learning_rate": 9.523809523809522e-07,
+      "loss": 0.2021,
+      "num_tokens": 896810.0,
+      "reward": 0.023613639175891876,
+      "reward_std": 0.02252291887998581,
+      "rewards/bleu_reward_func/mean": 0.023613639175891876,
+      "rewards/bleu_reward_func/std": 0.041281431913375854,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 272.34375,
+      "completions/mean_terminated_length": 227.9629669189453,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.0488,
+      "grad_norm": 3.998281717300415,
+      "kl": 0.0076904296875,
+      "learning_rate": 9.682539682539682e-07,
+      "loss": -0.1513,
+      "num_tokens": 907349.0,
+      "reward": 0.07193129509687424,
+      "reward_std": 0.05195175111293793,
+      "rewards/bleu_reward_func/mean": 0.07193129509687424,
+      "rewards/bleu_reward_func/std": 0.07358168065547943,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 335.0,
+      "completions/max_terminated_length": 335.0,
+      "completions/mean_length": 82.25,
+      "completions/mean_terminated_length": 82.25,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.0496,
+      "grad_norm": 12.274957656860352,
+      "kl": 0.049835205078125,
+      "learning_rate": 9.84126984126984e-07,
+      "loss": 0.0211,
+      "num_tokens": 916605.0,
+      "reward": 0.1968570053577423,
+      "reward_std": 0.09575757384300232,
+      "rewards/bleu_reward_func/mean": 0.1968570053577423,
+      "rewards/bleu_reward_func/std": 0.14971531927585602,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 353.0,
+      "completions/mean_length": 209.71875,
+      "completions/mean_terminated_length": 178.44827270507812,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.0504,
+      "grad_norm": 4.69417142868042,
+      "kl": 0.0074615478515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0102,
+      "num_tokens": 924772.0,
+      "reward": 0.026346374303102493,
+      "reward_std": 0.015668006613850594,
+      "rewards/bleu_reward_func/mean": 0.026346374303102493,
+      "rewards/bleu_reward_func/std": 0.016677534207701683,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 367.0,
+      "completions/mean_length": 147.34375,
+      "completions/mean_terminated_length": 95.25000762939453,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "epoch": 0.0512,
+      "grad_norm": 7.854241371154785,
+      "kl": 0.019744873046875,
+      "learning_rate": 1e-06,
+      "loss": 0.1402,
+      "num_tokens": 932879.0,
+      "reward": 0.039189111441373825,
+      "reward_std": 0.034408073872327805,
+      "rewards/bleu_reward_func/mean": 0.039189111441373825,
+      "rewards/bleu_reward_func/std": 0.06643246859312057,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 473.0,
+      "completions/mean_length": 217.53125,
+      "completions/mean_terminated_length": 163.0,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.052,
+      "grad_norm": 5.94617223739624,
+      "kl": 0.012752532958984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0629,
+      "num_tokens": 944744.0,
+      "reward": 0.0992283821105957,
+      "reward_std": 0.04174066707491875,
+      "rewards/bleu_reward_func/mean": 0.0992283821105957,
+      "rewards/bleu_reward_func/std": 0.14538165926933289,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 360.84375,
+      "completions/mean_terminated_length": 281.66668701171875,
+      "completions/min_length": 57.0,
+      "completions/min_terminated_length": 57.0,
+      "epoch": 0.0528,
+      "grad_norm": 2.7164971828460693,
+      "kl": 0.00780487060546875,
+      "learning_rate": 1e-06,
+      "loss": -0.1815,
+      "num_tokens": 959043.0,
+      "reward": 0.03164489567279816,
+      "reward_std": 0.024089161306619644,
+      "rewards/bleu_reward_func/mean": 0.03164489567279816,
+      "rewards/bleu_reward_func/std": 0.03230883181095123,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 174.375,
+      "completions/mean_terminated_length": 79.83999633789062,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.0536,
+      "grad_norm": 8.954367637634277,
+      "kl": 0.040679931640625,
+      "learning_rate": 1e-06,
+      "loss": 0.4022,
+      "num_tokens": 970487.0,
+      "reward": 0.1188623458147049,
+      "reward_std": 0.06528393179178238,
+      "rewards/bleu_reward_func/mean": 0.1188623458147049,
+      "rewards/bleu_reward_func/std": 0.10126637667417526,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 458.0,
+      "completions/mean_length": 249.78125,
+      "completions/mean_terminated_length": 112.42857360839844,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.0544,
+      "grad_norm": 8.929741859436035,
+      "kl": 0.01055908203125,
+      "learning_rate": 1e-06,
+      "loss": -0.3676,
+      "num_tokens": 980432.0,
+      "reward": 0.04680415242910385,
+      "reward_std": 0.015473801642656326,
+      "rewards/bleu_reward_func/mean": 0.04680415242910385,
+      "rewards/bleu_reward_func/std": 0.05666949972510338,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 481.0,
+      "completions/mean_length": 200.28125,
+      "completions/mean_terminated_length": 155.75,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.0552,
+      "grad_norm": 19.934701919555664,
+      "kl": 0.036396026611328125,
+      "learning_rate": 1e-06,
+      "loss": 0.1105,
+      "num_tokens": 988713.0,
+      "reward": 0.03349726274609566,
+      "reward_std": 0.007375569082796574,
+      "rewards/bleu_reward_func/mean": 0.03349726274609566,
+      "rewards/bleu_reward_func/std": 0.0360921286046505,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 349.0,
+      "completions/mean_length": 295.1875,
+      "completions/mean_terminated_length": 146.84210205078125,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.056,
+      "grad_norm": 3.6616406440734863,
+      "kl": 0.0112762451171875,
+      "learning_rate": 1e-06,
+      "loss": 0.081,
+      "num_tokens": 1002319.0,
+      "reward": 0.016106903553009033,
+      "reward_std": 0.008415726944804192,
+      "rewards/bleu_reward_func/mean": 0.016106903553009033,
+      "rewards/bleu_reward_func/std": 0.012413726188242435,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 501.0,
+      "completions/mean_length": 270.1875,
+      "completions/mean_terminated_length": 235.6428680419922,
+      "completions/min_length": 27.0,
+      "completions/min_terminated_length": 27.0,
+      "epoch": 0.0568,
+      "grad_norm": 11.310477256774902,
+      "kl": 0.02431488037109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0216,
+      "num_tokens": 1014117.0,
+      "reward": 0.09336908906698227,
+      "reward_std": 0.04001408815383911,
+      "rewards/bleu_reward_func/mean": 0.09336908906698227,
+      "rewards/bleu_reward_func/std": 0.04507448151707649,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 298.96875,
+      "completions/mean_terminated_length": 187.38095092773438,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.0576,
+      "grad_norm": 11.831945419311523,
+      "kl": 0.0236358642578125,
+      "learning_rate": 1e-06,
+      "loss": 0.2398,
+      "num_tokens": 1029324.0,
+      "reward": 0.06671467423439026,
+      "reward_std": 0.07224421948194504,
+      "rewards/bleu_reward_func/mean": 0.06671467423439026,
+      "rewards/bleu_reward_func/std": 0.09839192777872086,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 262.28125,
+      "completions/mean_terminated_length": 245.6333465576172,
+      "completions/min_length": 70.0,
+      "completions/min_terminated_length": 70.0,
+      "epoch": 0.0584,
+      "grad_norm": 3.4266369342803955,
+      "kl": 0.01332855224609375,
+      "learning_rate": 1e-06,
+      "loss": -0.117,
+      "num_tokens": 1040677.0,
+      "reward": 0.048909105360507965,
+      "reward_std": 0.01749919354915619,
+      "rewards/bleu_reward_func/mean": 0.048909105360507965,
+      "rewards/bleu_reward_func/std": 0.046220190823078156,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 443.0,
+      "completions/mean_length": 315.78125,
+      "completions/mean_terminated_length": 213.0,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "epoch": 0.0592,
+      "grad_norm": 3.334998369216919,
+      "kl": 0.030864715576171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0762,
+      "num_tokens": 1057350.0,
+      "reward": 0.06654933840036392,
+      "reward_std": 0.030867960304021835,
+      "rewards/bleu_reward_func/mean": 0.06654933840036392,
+      "rewards/bleu_reward_func/std": 0.04364337399601936,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 510.0,
+      "completions/mean_length": 287.4375,
+      "completions/mean_terminated_length": 235.61538696289062,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.06,
+      "grad_norm": 12.321810722351074,
+      "kl": 0.05252838134765625,
+      "learning_rate": 1e-06,
+      "loss": 0.1111,
+      "num_tokens": 1072668.0,
+      "reward": 0.07815341651439667,
+      "reward_std": 0.05233295261859894,
+      "rewards/bleu_reward_func/mean": 0.07815341651439667,
+      "rewards/bleu_reward_func/std": 0.0646696388721466,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 503.0,
+      "completions/mean_length": 306.3125,
+      "completions/mean_terminated_length": 182.90000915527344,
+      "completions/min_length": 30.0,
+      "completions/min_terminated_length": 30.0,
+      "epoch": 0.0608,
+      "grad_norm": 3.883251905441284,
+      "kl": 0.0373382568359375,
+      "learning_rate": 1e-06,
+      "loss": 0.1517,
+      "num_tokens": 1086694.0,
+      "reward": 0.06417744606733322,
+      "reward_std": 0.034075379371643066,
+      "rewards/bleu_reward_func/mean": 0.06417744606733322,
+      "rewards/bleu_reward_func/std": 0.049788232892751694,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 415.0,
+      "completions/max_terminated_length": 415.0,
+      "completions/mean_length": 196.4375,
+      "completions/mean_terminated_length": 196.4375,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.0616,
+      "grad_norm": 6.460638523101807,
+      "kl": 0.05291748046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0849,
+      "num_tokens": 1097476.0,
+      "reward": 0.08122064173221588,
+      "reward_std": 0.03298315033316612,
+      "rewards/bleu_reward_func/mean": 0.08122064173221588,
+      "rewards/bleu_reward_func/std": 0.047924816608428955,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 448.0,
+      "completions/mean_length": 199.15625,
+      "completions/mean_terminated_length": 166.79310607910156,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "epoch": 0.0624,
+      "grad_norm": 6.63805627822876,
+      "kl": 0.07642364501953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0967,
+      "num_tokens": 1108793.0,
+      "reward": 0.07887591421604156,
+      "reward_std": 0.05435461550951004,
+      "rewards/bleu_reward_func/mean": 0.07887591421604156,
+      "rewards/bleu_reward_func/std": 0.10201766341924667,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 480.0,
+      "completions/mean_length": 193.0,
+      "completions/mean_terminated_length": 147.42857360839844,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "epoch": 0.0632,
+      "grad_norm": 14.087907791137695,
+      "kl": 0.0762939453125,
+      "learning_rate": 1e-06,
+      "loss": -0.056,
+      "num_tokens": 1118721.0,
+      "reward": 0.035933416336774826,
+      "reward_std": 0.02187356725335121,
+      "rewards/bleu_reward_func/mean": 0.035933416336774826,
+      "rewards/bleu_reward_func/std": 0.025764403864741325,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 482.0,
+      "completions/mean_length": 192.9375,
+      "completions/mean_terminated_length": 133.8518524169922,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.064,
+      "grad_norm": 6.712767124176025,
+      "kl": 0.07209014892578125,
+      "learning_rate": 1e-06,
+      "loss": 0.2799,
+      "num_tokens": 1130095.0,
+      "reward": 0.04000134766101837,
+      "reward_std": 0.014790613204240799,
+      "rewards/bleu_reward_func/mean": 0.04000134766101837,
+      "rewards/bleu_reward_func/std": 0.028310615569353104,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 395.5625,
+      "completions/mean_terminated_length": 315.8947448730469,
+      "completions/min_length": 82.0,
+      "completions/min_terminated_length": 82.0,
+      "epoch": 0.0648,
+      "grad_norm": 3.1348772048950195,
+      "kl": 0.012363433837890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0477,
+      "num_tokens": 1145009.0,
+      "reward": 0.05394501984119415,
+      "reward_std": 0.019456665962934494,
+      "rewards/bleu_reward_func/mean": 0.05394501984119415,
+      "rewards/bleu_reward_func/std": 0.05528007075190544,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 297.25,
+      "completions/mean_terminated_length": 213.21739196777344,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "epoch": 0.0656,
+      "grad_norm": 4.045035362243652,
+      "kl": 0.02558135986328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0948,
+      "num_tokens": 1156665.0,
+      "reward": 0.08088956773281097,
+      "reward_std": 0.031020794063806534,
+      "rewards/bleu_reward_func/mean": 0.08088956773281097,
+      "rewards/bleu_reward_func/std": 0.04719265177845955,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 462.0,
+      "completions/mean_length": 252.65625,
+      "completions/mean_terminated_length": 204.629638671875,
+      "completions/min_length": 32.0,
+      "completions/min_terminated_length": 32.0,
+      "epoch": 0.0664,
+      "grad_norm": 7.020449161529541,
+      "kl": 0.022491455078125,
+      "learning_rate": 1e-06,
+      "loss": -0.2084,
+      "num_tokens": 1170686.0,
+      "reward": 0.048978567123413086,
+      "reward_std": 0.014538805931806564,
+      "rewards/bleu_reward_func/mean": 0.048978567123413086,
+      "rewards/bleu_reward_func/std": 0.03447263315320015,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 279.53125,
+      "completions/mean_terminated_length": 157.76190185546875,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.0672,
+      "grad_norm": 6.09721040725708,
+      "kl": 0.02556610107421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0979,
+      "num_tokens": 1181927.0,
+      "reward": 0.07267215847969055,
+      "reward_std": 0.029872559010982513,
+      "rewards/bleu_reward_func/mean": 0.07267215847969055,
+      "rewards/bleu_reward_func/std": 0.05035723000764847,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 281.1875,
+      "completions/mean_terminated_length": 257.3103332519531,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.068,
+      "grad_norm": 5.706907272338867,
+      "kl": 0.030059814453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0967,
+      "num_tokens": 1198797.0,
+      "reward": 0.05050581321120262,
+      "reward_std": 0.023779014125466347,
+      "rewards/bleu_reward_func/mean": 0.05050581321120262,
+      "rewards/bleu_reward_func/std": 0.03608938306570053,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 163.65625,
+      "completions/mean_terminated_length": 163.65625,
+      "completions/min_length": 3.0,
+      "completions/min_terminated_length": 3.0,
+      "epoch": 0.0688,
+      "grad_norm": 7.1789960861206055,
+      "kl": 0.0573577880859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0245,
+      "num_tokens": 1207106.0,
+      "reward": 0.07873363792896271,
+      "reward_std": 0.0395892933011055,
+      "rewards/bleu_reward_func/mean": 0.07873363792896271,
+      "rewards/bleu_reward_func/std": 0.0705900639295578,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 425.0,
+      "completions/mean_length": 304.6875,
+      "completions/mean_terminated_length": 223.56521606445312,
+      "completions/min_length": 67.0,
+      "completions/min_terminated_length": 67.0,
+      "epoch": 0.0696,
+      "grad_norm": 3.9068655967712402,
+      "kl": 0.01955413818359375,
+      "learning_rate": 1e-06,
+      "loss": -0.2361,
+      "num_tokens": 1219760.0,
+      "reward": 0.03426438570022583,
+      "reward_std": 0.021733341738581657,
+      "rewards/bleu_reward_func/mean": 0.03426438570022583,
+      "rewards/bleu_reward_func/std": 0.031944356858730316,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 496.0,
+      "completions/max_terminated_length": 496.0,
+      "completions/mean_length": 137.1875,
+      "completions/mean_terminated_length": 137.1875,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.0704,
+      "grad_norm": 7.929437160491943,
+      "kl": 0.1163482666015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0079,
+      "num_tokens": 1230702.0,
+      "reward": 0.13437795639038086,
+      "reward_std": 0.04989761859178543,
+      "rewards/bleu_reward_func/mean": 0.13437795639038086,
+      "rewards/bleu_reward_func/std": 0.08757011592388153,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 428.0,
+      "completions/mean_length": 248.1875,
+      "completions/mean_terminated_length": 160.25,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.0712,
+      "grad_norm": 3.5676372051239014,
+      "kl": 0.029571533203125,
+      "learning_rate": 1e-06,
+      "loss": 0.085,
+      "num_tokens": 1241748.0,
+      "reward": 0.06261839717626572,
+      "reward_std": 0.05303023010492325,
+      "rewards/bleu_reward_func/mean": 0.06261839717626572,
+      "rewards/bleu_reward_func/std": 0.07371754199266434,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 487.0,
+      "completions/mean_length": 199.21875,
+      "completions/mean_terminated_length": 178.36666870117188,
+      "completions/min_length": 5.0,
+      "completions/min_terminated_length": 5.0,
+      "epoch": 0.072,
+      "grad_norm": 13.081062316894531,
+      "kl": 0.0998382568359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0303,
+      "num_tokens": 1254971.0,
+      "reward": 0.09151400625705719,
+      "reward_std": 0.049102533608675,
+      "rewards/bleu_reward_func/mean": 0.09151400625705719,
+      "rewards/bleu_reward_func/std": 0.08098553121089935,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 458.0,
+      "completions/mean_length": 277.8125,
+      "completions/mean_terminated_length": 244.35714721679688,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "epoch": 0.0728,
+      "grad_norm": 4.541591167449951,
+      "kl": 0.0150604248046875,
+      "learning_rate": 1e-06,
+      "loss": -0.2243,
+      "num_tokens": 1268229.0,
+      "reward": 0.029024727642536163,
+      "reward_std": 0.02233259379863739,
+      "rewards/bleu_reward_func/mean": 0.029024727642536163,
+      "rewards/bleu_reward_func/std": 0.0296621173620224,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 461.0,
+      "completions/mean_length": 175.875,
+      "completions/mean_terminated_length": 81.75999450683594,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.0736,
+      "grad_norm": 12.45702075958252,
+      "kl": 0.09152984619140625,
+      "learning_rate": 1e-06,
+      "loss": 0.3301,
+      "num_tokens": 1279753.0,
+      "reward": 0.06008782982826233,
+      "reward_std": 0.03770461678504944,
+      "rewards/bleu_reward_func/mean": 0.06008782982826233,
+      "rewards/bleu_reward_func/std": 0.056894708424806595,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 239.0,
+      "completions/max_terminated_length": 239.0,
+      "completions/mean_length": 71.03125,
+      "completions/mean_terminated_length": 71.03125,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "epoch": 0.0744,
+      "grad_norm": 8.271183967590332,
+      "kl": 0.0682220458984375,
+      "learning_rate": 1e-06,
+      "loss": 0.2167,
+      "num_tokens": 1288762.0,
+      "reward": 0.17779187858104706,
+      "reward_std": 0.02900426834821701,
+      "rewards/bleu_reward_func/mean": 0.17779187858104706,
+      "rewards/bleu_reward_func/std": 0.1678331196308136,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 501.0,
+      "completions/mean_length": 132.28125,
+      "completions/mean_terminated_length": 78.03572082519531,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.0752,
+      "grad_norm": 45.396934509277344,
+      "kl": 0.140625,
+      "learning_rate": 1e-06,
+      "loss": 0.1526,
+      "num_tokens": 1299835.0,
+      "reward": 0.1527654230594635,
+      "reward_std": 0.061802513897418976,
+      "rewards/bleu_reward_func/mean": 0.1527654230594635,
+      "rewards/bleu_reward_func/std": 0.10723396390676498,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 500.0,
+      "completions/mean_length": 350.8125,
+      "completions/mean_terminated_length": 266.3809509277344,
+      "completions/min_length": 40.0,
+      "completions/min_terminated_length": 40.0,
+      "epoch": 0.076,
+      "grad_norm": 4.4763383865356445,
+      "kl": 0.03443145751953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0315,
+      "num_tokens": 1314877.0,
+      "reward": 0.08366496115922928,
+      "reward_std": 0.023002739995718002,
+      "rewards/bleu_reward_func/mean": 0.08366496115922928,
+      "rewards/bleu_reward_func/std": 0.07334847003221512,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 237.75,
+      "completions/mean_terminated_length": 209.37930297851562,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.0768,
+      "grad_norm": 13.181612968444824,
+      "kl": 0.0660247802734375,
+      "learning_rate": 1e-06,
+      "loss": -0.1104,
+      "num_tokens": 1326757.0,
+      "reward": 0.04618287831544876,
+      "reward_std": 0.022957133129239082,
+      "rewards/bleu_reward_func/mean": 0.04618287831544876,
+      "rewards/bleu_reward_func/std": 0.03049774467945099,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 326.71875,
+      "completions/mean_terminated_length": 254.21739196777344,
+      "completions/min_length": 67.0,
+      "completions/min_terminated_length": 67.0,
+      "epoch": 0.0776,
+      "grad_norm": 5.014129161834717,
+      "kl": 0.022705078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0227,
+      "num_tokens": 1340716.0,
+      "reward": 0.08603382110595703,
+      "reward_std": 0.022703565657138824,
+      "rewards/bleu_reward_func/mean": 0.08603382110595703,
+      "rewards/bleu_reward_func/std": 0.09760169684886932,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 205.03125,
+      "completions/mean_terminated_length": 195.1290283203125,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "epoch": 0.0784,
+      "grad_norm": 10.452898025512695,
+      "kl": 0.101318359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0956,
+      "num_tokens": 1354373.0,
+      "reward": 0.07816646993160248,
+      "reward_std": 0.03450850397348404,
+      "rewards/bleu_reward_func/mean": 0.07816646993160248,
+      "rewards/bleu_reward_func/std": 0.05475042015314102,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 480.0,
+      "completions/mean_length": 240.4375,
+      "completions/mean_terminated_length": 190.1481475830078,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.0792,
+      "grad_norm": 8.82993221282959,
+      "kl": 0.0435791015625,
+      "learning_rate": 1e-06,
+      "loss": -0.1898,
+      "num_tokens": 1365875.0,
+      "reward": 0.027829378843307495,
+      "reward_std": 0.016982190310955048,
+      "rewards/bleu_reward_func/mean": 0.027829378843307495,
+      "rewards/bleu_reward_func/std": 0.019511230289936066,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 466.0,
+      "completions/mean_length": 210.21875,
+      "completions/mean_terminated_length": 140.57693481445312,
+      "completions/min_length": 3.0,
+      "completions/min_terminated_length": 3.0,
+      "epoch": 0.08,
+      "grad_norm": 14.261658668518066,
+      "kl": 0.1817169189453125,
+      "learning_rate": 1e-06,
+      "loss": -0.4193,
+      "num_tokens": 1375586.0,
+      "reward": 0.0430663600564003,
+      "reward_std": 0.023313239216804504,
+      "rewards/bleu_reward_func/mean": 0.0430663600564003,
+      "rewards/bleu_reward_func/std": 0.0409073531627655,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 493.0,
+      "completions/mean_length": 210.90625,
+      "completions/mean_terminated_length": 201.19354248046875,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.0808,
+      "grad_norm": 7.960334300994873,
+      "kl": 0.0833740234375,
+      "learning_rate": 1e-06,
+      "loss": 0.1121,
+      "num_tokens": 1384975.0,
+      "reward": 0.0974574014544487,
+      "reward_std": 0.03397291898727417,
+      "rewards/bleu_reward_func/mean": 0.0974574014544487,
+      "rewards/bleu_reward_func/std": 0.10795393586158752,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 459.0,
+      "completions/mean_length": 306.21875,
+      "completions/mean_terminated_length": 225.69566345214844,
+      "completions/min_length": 51.0,
+      "completions/min_terminated_length": 51.0,
+      "epoch": 0.0816,
+      "grad_norm": 3.8322501182556152,
+      "kl": 0.0701446533203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0153,
+      "num_tokens": 1403126.0,
+      "reward": 0.07732782512903214,
+      "reward_std": 0.038768649101257324,
+      "rewards/bleu_reward_func/mean": 0.07732782512903214,
+      "rewards/bleu_reward_func/std": 0.06468553096055984,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 490.0,
+      "completions/mean_length": 167.8125,
+      "completions/mean_terminated_length": 144.86666870117188,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "epoch": 0.0824,
+      "grad_norm": 6.311352729797363,
+      "kl": 0.06109619140625,
+      "learning_rate": 1e-06,
+      "loss": 0.1029,
+      "num_tokens": 1417360.0,
+      "reward": 0.23947298526763916,
+      "reward_std": 0.10021178424358368,
+      "rewards/bleu_reward_func/mean": 0.23947298526763916,
+      "rewards/bleu_reward_func/std": 0.40957576036453247,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 502.0,
+      "completions/mean_length": 319.375,
+      "completions/mean_terminated_length": 126.75,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.0832,
+      "grad_norm": 6.871039867401123,
+      "kl": 0.06162261962890625,
+      "learning_rate": 1e-06,
+      "loss": 0.3071,
+      "num_tokens": 1431932.0,
+      "reward": 0.11237628757953644,
+      "reward_std": 0.05608592554926872,
+      "rewards/bleu_reward_func/mean": 0.11237628757953644,
+      "rewards/bleu_reward_func/std": 0.1758151650428772,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 293.0,
+      "completions/mean_length": 206.34375,
+      "completions/mean_terminated_length": 104.45833587646484,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.084,
+      "grad_norm": 13.681912422180176,
+      "kl": 0.063812255859375,
+      "learning_rate": 1e-06,
+      "loss": 0.3711,
+      "num_tokens": 1440791.0,
+      "reward": 0.13408097624778748,
+      "reward_std": 0.07736363261938095,
+      "rewards/bleu_reward_func/mean": 0.13408097624778748,
+      "rewards/bleu_reward_func/std": 0.10995227843523026,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 239.21875,
+      "completions/mean_terminated_length": 188.70370483398438,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.0848,
+      "grad_norm": 6.377567291259766,
+      "kl": 0.212432861328125,
+      "learning_rate": 1e-06,
+      "loss": 0.1742,
+      "num_tokens": 1452430.0,
+      "reward": 0.09214982390403748,
+      "reward_std": 0.037541188299655914,
+      "rewards/bleu_reward_func/mean": 0.09214982390403748,
+      "rewards/bleu_reward_func/std": 0.06507368385791779,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 416.0,
+      "completions/mean_length": 382.84375,
+      "completions/mean_terminated_length": 236.4666748046875,
+      "completions/min_length": 71.0,
+      "completions/min_terminated_length": 71.0,
+      "epoch": 0.0856,
+      "grad_norm": 2.956113338470459,
+      "kl": 0.015625,
+      "learning_rate": 1e-06,
+      "loss": 0.019,
+      "num_tokens": 1470353.0,
+      "reward": 0.029356852173805237,
+      "reward_std": 0.020268836989998817,
+      "rewards/bleu_reward_func/mean": 0.029356852173805237,
+      "rewards/bleu_reward_func/std": 0.031047984957695007,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 500.0,
+      "completions/mean_length": 346.5,
+      "completions/mean_terminated_length": 247.1999969482422,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.0864,
+      "grad_norm": 5.237264156341553,
+      "kl": 0.0384063720703125,
+      "learning_rate": 1e-06,
+      "loss": -0.2289,
+      "num_tokens": 1483353.0,
+      "reward": 0.06388352811336517,
+      "reward_std": 0.03146419674158096,
+      "rewards/bleu_reward_func/mean": 0.06388352811336517,
+      "rewards/bleu_reward_func/std": 0.0666789561510086,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 499.0,
+      "completions/mean_length": 326.1875,
+      "completions/mean_terminated_length": 199.05262756347656,
+      "completions/min_length": 31.0,
+      "completions/min_terminated_length": 31.0,
+      "epoch": 0.0872,
+      "grad_norm": 5.559842109680176,
+      "kl": 0.03765106201171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0076,
+      "num_tokens": 1497415.0,
+      "reward": 0.2991971969604492,
+      "reward_std": 0.10907518863677979,
+      "rewards/bleu_reward_func/mean": 0.2991971969604492,
+      "rewards/bleu_reward_func/std": 0.36222296953201294,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 238.0,
+      "completions/max_terminated_length": 238.0,
+      "completions/mean_length": 98.53125,
+      "completions/mean_terminated_length": 98.53125,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.088,
+      "grad_norm": 8.213761329650879,
+      "kl": 0.078704833984375,
+      "learning_rate": 1e-06,
+      "loss": 0.1858,
+      "num_tokens": 1504528.0,
+      "reward": 0.048464857041835785,
+      "reward_std": 0.0210396908223629,
+      "rewards/bleu_reward_func/mean": 0.048464857041835785,
+      "rewards/bleu_reward_func/std": 0.03311728686094284,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 399.0,
+      "completions/max_terminated_length": 399.0,
+      "completions/mean_length": 118.59375,
+      "completions/mean_terminated_length": 118.59375,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.0888,
+      "grad_norm": 7.166593074798584,
+      "kl": 0.096771240234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0566,
+      "num_tokens": 1517035.0,
+      "reward": 0.09873979538679123,
+      "reward_std": 0.03707325458526611,
+      "rewards/bleu_reward_func/mean": 0.09873979538679123,
+      "rewards/bleu_reward_func/std": 0.13200855255126953,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 496.0,
+      "completions/mean_length": 401.5,
+      "completions/mean_terminated_length": 240.00001525878906,
+      "completions/min_length": 112.0,
+      "completions/min_terminated_length": 112.0,
+      "epoch": 0.0896,
+      "grad_norm": 2.8505043983459473,
+      "kl": 0.02176666259765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0153,
+      "num_tokens": 1534363.0,
+      "reward": 0.11044108867645264,
+      "reward_std": 0.03410620242357254,
+      "rewards/bleu_reward_func/mean": 0.11044108867645264,
+      "rewards/bleu_reward_func/std": 0.16289857029914856,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 482.0,
+      "completions/max_terminated_length": 482.0,
+      "completions/mean_length": 125.96875,
+      "completions/mean_terminated_length": 125.96875,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "epoch": 0.0904,
+      "grad_norm": 7.475080490112305,
+      "kl": 0.100982666015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0271,
+      "num_tokens": 1546258.0,
+      "reward": 0.12119434028863907,
+      "reward_std": 0.03986787050962448,
+      "rewards/bleu_reward_func/mean": 0.12119434028863907,
+      "rewards/bleu_reward_func/std": 0.10625314712524414,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 470.0,
+      "completions/mean_length": 332.09375,
+      "completions/mean_terminated_length": 250.3181915283203,
+      "completions/min_length": 31.0,
+      "completions/min_terminated_length": 31.0,
+      "epoch": 0.0912,
+      "grad_norm": 3.1941514015197754,
+      "kl": 0.01666259765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0363,
+      "num_tokens": 1559133.0,
+      "reward": 0.05715271458029747,
+      "reward_std": 0.04336331784725189,
+      "rewards/bleu_reward_func/mean": 0.05715271458029747,
+      "rewards/bleu_reward_func/std": 0.05400845408439636,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 476.0,
+      "completions/mean_length": 182.03125,
+      "completions/mean_terminated_length": 160.03334045410156,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.092,
+      "grad_norm": 8.439948081970215,
+      "kl": 0.0611572265625,
+      "learning_rate": 1e-06,
+      "loss": -0.2392,
+      "num_tokens": 1569774.0,
+      "reward": 0.0502852126955986,
+      "reward_std": 0.01610748842358589,
+      "rewards/bleu_reward_func/mean": 0.0502852126955986,
+      "rewards/bleu_reward_func/std": 0.040807489305734634,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 408.0,
+      "completions/max_terminated_length": 408.0,
+      "completions/mean_length": 163.84375,
+      "completions/mean_terminated_length": 163.84375,
+      "completions/min_length": 34.0,
+      "completions/min_terminated_length": 34.0,
+      "epoch": 0.0928,
+      "grad_norm": 4.541551113128662,
+      "kl": 0.069915771484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0109,
+      "num_tokens": 1580281.0,
+      "reward": 0.03980318829417229,
+      "reward_std": 0.01563824526965618,
+      "rewards/bleu_reward_func/mean": 0.03980318829417229,
+      "rewards/bleu_reward_func/std": 0.023048467934131622,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 458.0,
+      "completions/mean_length": 288.875,
+      "completions/mean_terminated_length": 226.39999389648438,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.0936,
+      "grad_norm": 8.210314750671387,
+      "kl": 0.09372711181640625,
+      "learning_rate": 1e-06,
+      "loss": 0.122,
+      "num_tokens": 1593285.0,
+      "reward": 0.0629456490278244,
+      "reward_std": 0.015063179656863213,
+      "rewards/bleu_reward_func/mean": 0.0629456490278244,
+      "rewards/bleu_reward_func/std": 0.03602227941155434,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 501.0,
+      "completions/mean_length": 350.0,
+      "completions/mean_terminated_length": 265.1428527832031,
+      "completions/min_length": 68.0,
+      "completions/min_terminated_length": 68.0,
+      "epoch": 0.0944,
+      "grad_norm": 2.992391586303711,
+      "kl": 0.02304840087890625,
+      "learning_rate": 1e-06,
+      "loss": -0.1711,
+      "num_tokens": 1606645.0,
+      "reward": 0.022465957328677177,
+      "reward_std": 0.016872048377990723,
+      "rewards/bleu_reward_func/mean": 0.022465957328677177,
+      "rewards/bleu_reward_func/std": 0.023790787905454636,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 285.375,
+      "completions/mean_terminated_length": 233.07693481445312,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "epoch": 0.0952,
+      "grad_norm": 4.377739429473877,
+      "kl": 0.0274658203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0351,
+      "num_tokens": 1617737.0,
+      "reward": 0.053562991321086884,
+      "reward_std": 0.025934984907507896,
+      "rewards/bleu_reward_func/mean": 0.053562991321086884,
+      "rewards/bleu_reward_func/std": 0.03459456190466881,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 277.0,
+      "completions/mean_length": 285.25,
+      "completions/mean_terminated_length": 85.17646789550781,
+      "completions/min_length": 37.0,
+      "completions/min_terminated_length": 37.0,
+      "epoch": 0.096,
+      "grad_norm": 5.66406774520874,
+      "kl": 0.032745361328125,
+      "learning_rate": 1e-06,
+      "loss": 0.1042,
+      "num_tokens": 1631409.0,
+      "reward": 0.05639251321554184,
+      "reward_std": 0.025049947202205658,
+      "rewards/bleu_reward_func/mean": 0.05639251321554184,
+      "rewards/bleu_reward_func/std": 0.04031047970056534,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 467.0,
+      "completions/mean_length": 391.25,
+      "completions/mean_terminated_length": 254.40000915527344,
+      "completions/min_length": 81.0,
+      "completions/min_terminated_length": 81.0,
+      "epoch": 0.0968,
+      "grad_norm": 2.7011678218841553,
+      "kl": 0.019012451171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0595,
+      "num_tokens": 1647665.0,
+      "reward": 0.1355845332145691,
+      "reward_std": 0.03834523260593414,
+      "rewards/bleu_reward_func/mean": 0.1355845332145691,
+      "rewards/bleu_reward_func/std": 0.17731845378875732,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 360.09375,
+      "completions/mean_terminated_length": 241.94444274902344,
+      "completions/min_length": 36.0,
+      "completions/min_terminated_length": 36.0,
+      "epoch": 0.0976,
+      "grad_norm": 3.2975172996520996,
+      "kl": 0.0248870849609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0281,
+      "num_tokens": 1661164.0,
+      "reward": 0.02182621881365776,
+      "reward_std": 0.010437489487230778,
+      "rewards/bleu_reward_func/mean": 0.02182621881365776,
+      "rewards/bleu_reward_func/std": 0.019065655767917633,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 497.0,
+      "completions/mean_length": 329.03125,
+      "completions/mean_terminated_length": 295.1481628417969,
+      "completions/min_length": 79.0,
+      "completions/min_terminated_length": 79.0,
+      "epoch": 0.0984,
+      "grad_norm": 3.011232376098633,
+      "kl": 0.023040771484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0609,
+      "num_tokens": 1674269.0,
+      "reward": 0.05195554345846176,
+      "reward_std": 0.020864665508270264,
+      "rewards/bleu_reward_func/mean": 0.05195554345846176,
+      "rewards/bleu_reward_func/std": 0.027087198570370674,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 451.0,
+      "completions/mean_length": 301.0,
+      "completions/mean_terminated_length": 114.82353210449219,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.0992,
+      "grad_norm": 8.828636169433594,
+      "kl": 0.0798492431640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0267,
+      "num_tokens": 1691797.0,
+      "reward": 0.1420682966709137,
+      "reward_std": 0.04143287241458893,
+      "rewards/bleu_reward_func/mean": 0.1420682966709137,
+      "rewards/bleu_reward_func/std": 0.07349839806556702,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 432.0,
+      "completions/max_terminated_length": 432.0,
+      "completions/mean_length": 167.28125,
+      "completions/mean_terminated_length": 167.28125,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "epoch": 0.1,
+      "grad_norm": 7.268754482269287,
+      "kl": 0.104766845703125,
+      "learning_rate": 1e-06,
+      "loss": 0.2657,
+      "num_tokens": 1702150.0,
+      "reward": 0.16663971543312073,
+      "reward_std": 0.05392443761229515,
+      "rewards/bleu_reward_func/mean": 0.16663971543312073,
+      "rewards/bleu_reward_func/std": 0.09980462491512299,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 432.0,
+      "completions/mean_length": 367.8125,
+      "completions/mean_terminated_length": 223.625,
+      "completions/min_length": 77.0,
+      "completions/min_terminated_length": 77.0,
+      "epoch": 0.1008,
+      "grad_norm": 2.9197561740875244,
+      "kl": 0.0296630859375,
+      "learning_rate": 1e-06,
+      "loss": 0.069,
+      "num_tokens": 1720080.0,
+      "reward": 0.05814104527235031,
+      "reward_std": 0.023808015510439873,
+      "rewards/bleu_reward_func/mean": 0.05814104527235031,
+      "rewards/bleu_reward_func/std": 0.06258071959018707,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 222.0,
+      "completions/mean_length": 311.03125,
+      "completions/mean_terminated_length": 110.0625,
+      "completions/min_length": 44.0,
+      "completions/min_terminated_length": 44.0,
+      "epoch": 0.1016,
+      "grad_norm": 3.6131699085235596,
+      "kl": 0.040496826171875,
+      "learning_rate": 1e-06,
+      "loss": -0.1103,
+      "num_tokens": 1736129.0,
+      "reward": 0.1627029925584793,
+      "reward_std": 0.048266101628541946,
+      "rewards/bleu_reward_func/mean": 0.1627029925584793,
+      "rewards/bleu_reward_func/std": 0.2640880048274994,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 182.1875,
+      "completions/mean_terminated_length": 89.83999633789062,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.1024,
+      "grad_norm": 5.8553900718688965,
+      "kl": 0.0695953369140625,
+      "learning_rate": 1e-06,
+      "loss": 0.2073,
+      "num_tokens": 1744807.0,
+      "reward": 0.05680542066693306,
+      "reward_std": 0.02900797501206398,
+      "rewards/bleu_reward_func/mean": 0.05680542066693306,
+      "rewards/bleu_reward_func/std": 0.062428779900074005,
+      "step": 128
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 434.0,
+      "completions/mean_length": 304.375,
+      "completions/mean_terminated_length": 235.1666717529297,
+      "completions/min_length": 56.0,
+      "completions/min_terminated_length": 56.0,
+      "epoch": 0.1032,
+      "grad_norm": 2.625256299972534,
+      "kl": 0.014190673828125,
+      "learning_rate": 1e-06,
+      "loss": -0.3256,
+      "num_tokens": 1759979.0,
+      "reward": 0.07073010504245758,
+      "reward_std": 0.0585593655705452,
+      "rewards/bleu_reward_func/mean": 0.07073010504245758,
+      "rewards/bleu_reward_func/std": 0.0830271914601326,
+      "step": 129
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 502.0,
+      "completions/mean_length": 269.21875,
+      "completions/mean_terminated_length": 188.2916717529297,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "epoch": 0.104,
+      "grad_norm": 7.594178199768066,
+      "kl": 0.019775390625,
+      "learning_rate": 1e-06,
+      "loss": 0.103,
+      "num_tokens": 1772810.0,
+      "reward": 0.03343900665640831,
+      "reward_std": 0.008691318333148956,
+      "rewards/bleu_reward_func/mean": 0.03343900665640831,
+      "rewards/bleu_reward_func/std": 0.027092551812529564,
+      "step": 130
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 307.375,
+      "completions/mean_terminated_length": 286.2069091796875,
+      "completions/min_length": 114.0,
+      "completions/min_terminated_length": 114.0,
+      "epoch": 0.1048,
+      "grad_norm": 3.5237069129943848,
+      "kl": 0.0251312255859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0902,
+      "num_tokens": 1786094.0,
+      "reward": 0.03853389620780945,
+      "reward_std": 0.016378795728087425,
+      "rewards/bleu_reward_func/mean": 0.03853389620780945,
+      "rewards/bleu_reward_func/std": 0.02983209490776062,
+      "step": 131
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 244.03125,
+      "completions/mean_terminated_length": 194.40740966796875,
+      "completions/min_length": 28.0,
+      "completions/min_terminated_length": 28.0,
+      "epoch": 0.1056,
+      "grad_norm": 3.428116798400879,
+      "kl": 0.072906494140625,
+      "learning_rate": 1e-06,
+      "loss": 0.1144,
+      "num_tokens": 1797551.0,
+      "reward": 0.1538739800453186,
+      "reward_std": 0.03595956414937973,
+      "rewards/bleu_reward_func/mean": 0.1538739800453186,
+      "rewards/bleu_reward_func/std": 0.21548843383789062,
+      "step": 132
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 494.0,
+      "completions/mean_length": 220.46875,
+      "completions/mean_terminated_length": 211.06451416015625,
+      "completions/min_length": 30.0,
+      "completions/min_terminated_length": 30.0,
+      "epoch": 0.1064,
+      "grad_norm": 4.208179950714111,
+      "kl": 0.058685302734375,
+      "learning_rate": 1e-06,
+      "loss": -0.1489,
+      "num_tokens": 1808726.0,
+      "reward": 0.18491268157958984,
+      "reward_std": 0.0416969433426857,
+      "rewards/bleu_reward_func/mean": 0.18491268157958984,
+      "rewards/bleu_reward_func/std": 0.2198871225118637,
+      "step": 133
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 402.0,
+      "completions/max_terminated_length": 402.0,
+      "completions/mean_length": 117.5625,
+      "completions/mean_terminated_length": 117.5625,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.1072,
+      "grad_norm": 9.312064170837402,
+      "kl": 0.2574310302734375,
+      "learning_rate": 1e-06,
+      "loss": -0.1699,
+      "num_tokens": 1816152.0,
+      "reward": 0.09744147956371307,
+      "reward_std": 0.03963543474674225,
+      "rewards/bleu_reward_func/mean": 0.09744147956371307,
+      "rewards/bleu_reward_func/std": 0.07821591198444366,
+      "step": 134
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 503.0,
+      "completions/mean_length": 284.5625,
+      "completions/mean_terminated_length": 165.42857360839844,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "epoch": 0.108,
+      "grad_norm": 2.491009473800659,
+      "kl": 0.5361785888671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0755,
+      "num_tokens": 1830010.0,
+      "reward": 0.07847163081169128,
+      "reward_std": 0.07447989284992218,
+      "rewards/bleu_reward_func/mean": 0.07847163081169128,
+      "rewards/bleu_reward_func/std": 0.1269197165966034,
+      "step": 135
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 491.0,
+      "completions/mean_length": 208.28125,
+      "completions/mean_terminated_length": 164.8928680419922,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.1088,
+      "grad_norm": 9.84536075592041,
+      "kl": 0.137481689453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0643,
+      "num_tokens": 1843019.0,
+      "reward": 0.23385955393314362,
+      "reward_std": 0.07621090114116669,
+      "rewards/bleu_reward_func/mean": 0.23385955393314362,
+      "rewards/bleu_reward_func/std": 0.2127569168806076,
+      "step": 136
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 485.0,
+      "completions/mean_length": 243.46875,
+      "completions/mean_terminated_length": 138.3913116455078,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "epoch": 0.1096,
+      "grad_norm": 6.7033257484436035,
+      "kl": 0.051727294921875,
+      "learning_rate": 1e-06,
+      "loss": 0.1285,
+      "num_tokens": 1856450.0,
+      "reward": 0.04753299057483673,
+      "reward_std": 0.016634728759527206,
+      "rewards/bleu_reward_func/mean": 0.04753299057483673,
+      "rewards/bleu_reward_func/std": 0.030512619763612747,
+      "step": 137
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 491.0,
+      "completions/mean_length": 352.40625,
+      "completions/mean_terminated_length": 171.53334045410156,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "epoch": 0.1104,
+      "grad_norm": 3.812756061553955,
+      "kl": 0.0177764892578125,
+      "learning_rate": 1e-06,
+      "loss": 0.1601,
+      "num_tokens": 1870943.0,
+      "reward": 0.04067971557378769,
+      "reward_std": 0.026344479992985725,
+      "rewards/bleu_reward_func/mean": 0.04067971557378769,
+      "rewards/bleu_reward_func/std": 0.06328170746564865,
+      "step": 138
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 498.0,
+      "completions/mean_length": 287.46875,
+      "completions/mean_terminated_length": 264.2413635253906,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.1112,
+      "grad_norm": 4.916464805603027,
+      "kl": 0.06890869140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0108,
+      "num_tokens": 1883774.0,
+      "reward": 0.1910865753889084,
+      "reward_std": 0.09566200524568558,
+      "rewards/bleu_reward_func/mean": 0.1910865753889084,
+      "rewards/bleu_reward_func/std": 0.2485995888710022,
+      "step": 139
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 446.0,
+      "completions/mean_length": 312.6875,
+      "completions/mean_terminated_length": 157.6666717529297,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.112,
+      "grad_norm": 5.02897310256958,
+      "kl": 0.081298828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0347,
+      "num_tokens": 1900964.0,
+      "reward": 0.1273106336593628,
+      "reward_std": 0.037408363074064255,
+      "rewards/bleu_reward_func/mean": 0.1273106336593628,
+      "rewards/bleu_reward_func/std": 0.1255699247121811,
+      "step": 140
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 421.0,
+      "completions/mean_length": 287.40625,
+      "completions/mean_terminated_length": 199.52174377441406,
+      "completions/min_length": 34.0,
+      "completions/min_terminated_length": 34.0,
+      "epoch": 0.1128,
+      "grad_norm": 3.5728607177734375,
+      "kl": 0.0290985107421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0802,
+      "num_tokens": 1914153.0,
+      "reward": 0.1449739634990692,
+      "reward_std": 0.05561315268278122,
+      "rewards/bleu_reward_func/mean": 0.1449739634990692,
+      "rewards/bleu_reward_func/std": 0.10589203238487244,
+      "step": 141
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 503.0,
+      "completions/mean_length": 171.6875,
+      "completions/mean_terminated_length": 149.00001525878906,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.1136,
+      "grad_norm": 6.168550491333008,
+      "kl": 0.076751708984375,
+      "learning_rate": 1e-06,
+      "loss": 0.1377,
+      "num_tokens": 1924551.0,
+      "reward": 0.07935678958892822,
+      "reward_std": 0.044586654752492905,
+      "rewards/bleu_reward_func/mean": 0.07935678958892822,
+      "rewards/bleu_reward_func/std": 0.11080160737037659,
+      "step": 142
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 485.0,
+      "completions/mean_length": 188.28125,
+      "completions/mean_terminated_length": 166.70001220703125,
+      "completions/min_length": 8.0,
+      "completions/min_terminated_length": 8.0,
+      "epoch": 0.1144,
+      "grad_norm": 5.237224102020264,
+      "kl": 0.042633056640625,
+      "learning_rate": 1e-06,
+      "loss": 0.1969,
+      "num_tokens": 1936192.0,
+      "reward": 0.07339954376220703,
+      "reward_std": 0.04980514198541641,
+      "rewards/bleu_reward_func/mean": 0.07339954376220703,
+      "rewards/bleu_reward_func/std": 0.06703697144985199,
+      "step": 143
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 340.0,
+      "completions/mean_length": 168.03125,
+      "completions/mean_terminated_length": 53.375,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.1152,
+      "grad_norm": 6.578721523284912,
+      "kl": 0.148895263671875,
+      "learning_rate": 1e-06,
+      "loss": 0.2196,
+      "num_tokens": 1946361.0,
+      "reward": 0.2388084977865219,
+      "reward_std": 0.05400132015347481,
+      "rewards/bleu_reward_func/mean": 0.2388084977865219,
+      "rewards/bleu_reward_func/std": 0.2556310296058655,
+      "step": 144
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 502.0,
+      "completions/mean_length": 254.90625,
+      "completions/mean_terminated_length": 218.17857360839844,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.116,
+      "grad_norm": 5.644160270690918,
+      "kl": 0.10284423828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0093,
+      "num_tokens": 1958990.0,
+      "reward": 0.05691784247756004,
+      "reward_std": 0.045338764786720276,
+      "rewards/bleu_reward_func/mean": 0.05691784247756004,
+      "rewards/bleu_reward_func/std": 0.051530975848436356,
+      "step": 145
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 445.0,
+      "completions/mean_length": 138.125,
+      "completions/mean_terminated_length": 126.06451416015625,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.1168,
+      "grad_norm": 6.659230709075928,
+      "kl": 0.07525634765625,
+      "learning_rate": 1e-06,
+      "loss": 0.3905,
+      "num_tokens": 1966178.0,
+      "reward": 0.08115407824516296,
+      "reward_std": 0.05008203536272049,
+      "rewards/bleu_reward_func/mean": 0.08115407824516296,
+      "rewards/bleu_reward_func/std": 0.060907039791345596,
+      "step": 146
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 446.0,
+      "completions/mean_length": 152.25,
+      "completions/mean_terminated_length": 69.23077392578125,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.1176,
+      "grad_norm": 10.029218673706055,
+      "kl": 0.1614227294921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0338,
+      "num_tokens": 1977194.0,
+      "reward": 0.23646463453769684,
+      "reward_std": 0.09375543892383575,
+      "rewards/bleu_reward_func/mean": 0.23646463453769684,
+      "rewards/bleu_reward_func/std": 0.27427393198013306,
+      "step": 147
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 502.0,
+      "completions/mean_length": 300.84375,
+      "completions/mean_terminated_length": 174.15000915527344,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "epoch": 0.1184,
+      "grad_norm": 7.6274027824401855,
+      "kl": 0.0694122314453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0195,
+      "num_tokens": 1991693.0,
+      "reward": 0.1271597295999527,
+      "reward_std": 0.03925805538892746,
+      "rewards/bleu_reward_func/mean": 0.1271597295999527,
+      "rewards/bleu_reward_func/std": 0.20968182384967804,
+      "step": 148
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 442.0,
+      "completions/mean_length": 235.0625,
+      "completions/mean_terminated_length": 142.75,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.1192,
+      "grad_norm": 5.1624908447265625,
+      "kl": 0.074005126953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0749,
+      "num_tokens": 2002359.0,
+      "reward": 0.0867965817451477,
+      "reward_std": 0.03743039071559906,
+      "rewards/bleu_reward_func/mean": 0.0867965817451477,
+      "rewards/bleu_reward_func/std": 0.06982331722974777,
+      "step": 149
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 383.0,
+      "completions/mean_length": 180.125,
+      "completions/mean_terminated_length": 69.5,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.12,
+      "grad_norm": 5.687462329864502,
+      "kl": 0.079742431640625,
+      "learning_rate": 1e-06,
+      "loss": 0.1774,
+      "num_tokens": 2012139.0,
+      "reward": 0.08913667500019073,
+      "reward_std": 0.03803376108407974,
+      "rewards/bleu_reward_func/mean": 0.08913667500019073,
+      "rewards/bleu_reward_func/std": 0.07373686879873276,
+      "step": 150
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 486.0,
+      "completions/mean_length": 246.6875,
+      "completions/mean_terminated_length": 158.25,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.1208,
+      "grad_norm": 6.3145341873168945,
+      "kl": 0.168792724609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0474,
+      "num_tokens": 2024961.0,
+      "reward": 0.09886027127504349,
+      "reward_std": 0.09059572219848633,
+      "rewards/bleu_reward_func/mean": 0.09886027127504349,
+      "rewards/bleu_reward_func/std": 0.20261086523532867,
+      "step": 151
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 470.0,
+      "completions/mean_length": 278.375,
+      "completions/mean_terminated_length": 200.5,
+      "completions/min_length": 34.0,
+      "completions/min_terminated_length": 34.0,
+      "epoch": 0.1216,
+      "grad_norm": 6.056458473205566,
+      "kl": 0.04815673828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0367,
+      "num_tokens": 2038621.0,
+      "reward": 0.040941424667835236,
+      "reward_std": 0.024181999266147614,
+      "rewards/bleu_reward_func/mean": 0.040941424667835236,
+      "rewards/bleu_reward_func/std": 0.031022800132632256,
+      "step": 152
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 408.0,
+      "completions/mean_length": 174.4375,
+      "completions/mean_terminated_length": 79.91999816894531,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.1224,
+      "grad_norm": 11.584298133850098,
+      "kl": 0.10552978515625,
+      "learning_rate": 1e-06,
+      "loss": -0.3199,
+      "num_tokens": 2048947.0,
+      "reward": 0.057592377066612244,
+      "reward_std": 0.02831832319498062,
+      "rewards/bleu_reward_func/mean": 0.057592377066612244,
+      "rewards/bleu_reward_func/std": 0.0929059162735939,
+      "step": 153
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 285.71875,
+      "completions/mean_terminated_length": 197.17391967773438,
+      "completions/min_length": 38.0,
+      "completions/min_terminated_length": 38.0,
+      "epoch": 0.1232,
+      "grad_norm": 3.8603882789611816,
+      "kl": 0.042999267578125,
+      "learning_rate": 1e-06,
+      "loss": -0.1903,
+      "num_tokens": 2059978.0,
+      "reward": 0.0238445196300745,
+      "reward_std": 0.016163241118192673,
+      "rewards/bleu_reward_func/mean": 0.0238445196300745,
+      "rewards/bleu_reward_func/std": 0.020820245146751404,
+      "step": 154
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 262.0,
+      "completions/max_terminated_length": 262.0,
+      "completions/mean_length": 110.90625,
+      "completions/mean_terminated_length": 110.90625,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.124,
+      "grad_norm": 9.231768608093262,
+      "kl": 0.134063720703125,
+      "learning_rate": 1e-06,
+      "loss": -0.12,
+      "num_tokens": 2068223.0,
+      "reward": 0.093255415558815,
+      "reward_std": 0.04695024713873863,
+      "rewards/bleu_reward_func/mean": 0.093255415558815,
+      "rewards/bleu_reward_func/std": 0.07957140356302261,
+      "step": 155
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 490.0,
+      "completions/mean_length": 196.875,
+      "completions/mean_terminated_length": 151.85714721679688,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.1248,
+      "grad_norm": 6.661685466766357,
+      "kl": 0.11859130859375,
+      "learning_rate": 1e-06,
+      "loss": -0.1122,
+      "num_tokens": 2079491.0,
+      "reward": 0.10441941022872925,
+      "reward_std": 0.06782116740942001,
+      "rewards/bleu_reward_func/mean": 0.10441941022872925,
+      "rewards/bleu_reward_func/std": 0.1558544933795929,
+      "step": 156
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 497.0,
+      "completions/mean_length": 417.84375,
+      "completions/mean_terminated_length": 353.4210510253906,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 0.1256,
+      "grad_norm": 2.281557559967041,
+      "kl": 0.020050048828125,
+      "learning_rate": 1e-06,
+      "loss": -0.049,
+      "num_tokens": 2094998.0,
+      "reward": 0.03994186595082283,
+      "reward_std": 0.020151065662503242,
+      "rewards/bleu_reward_func/mean": 0.03994186595082283,
+      "rewards/bleu_reward_func/std": 0.03798232972621918,
+      "step": 157
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 490.0,
+      "completions/mean_length": 186.75,
+      "completions/mean_terminated_length": 126.51851654052734,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.1264,
+      "grad_norm": 5.349869251251221,
+      "kl": 0.052093505859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0712,
+      "num_tokens": 2104006.0,
+      "reward": 0.060490936040878296,
+      "reward_std": 0.039247751235961914,
+      "rewards/bleu_reward_func/mean": 0.060490936040878296,
+      "rewards/bleu_reward_func/std": 0.06767360866069794,
+      "step": 158
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 503.0,
+      "completions/mean_length": 312.375,
+      "completions/mean_terminated_length": 266.3077087402344,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.1272,
+      "grad_norm": 6.974966526031494,
+      "kl": 0.0902099609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0785,
+      "num_tokens": 2121570.0,
+      "reward": 0.21717938780784607,
+      "reward_std": 0.08217764645814896,
+      "rewards/bleu_reward_func/mean": 0.21717938780784607,
+      "rewards/bleu_reward_func/std": 0.1689896285533905,
+      "step": 159
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 350.25,
+      "completions/mean_terminated_length": 239.57894897460938,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.128,
+      "grad_norm": 8.79633617401123,
+      "kl": 0.10308837890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0746,
+      "num_tokens": 2134322.0,
+      "reward": 0.027915209531784058,
+      "reward_std": 0.008189969696104527,
+      "rewards/bleu_reward_func/mean": 0.027915209531784058,
+      "rewards/bleu_reward_func/std": 0.021798407658934593,
+      "step": 160
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 432.0,
+      "completions/max_terminated_length": 432.0,
+      "completions/mean_length": 98.40625,
+      "completions/mean_terminated_length": 98.40625,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "epoch": 0.1288,
+      "grad_norm": 9.201173782348633,
+      "kl": 0.112060546875,
+      "learning_rate": 1e-06,
+      "loss": 0.1926,
+      "num_tokens": 2139903.0,
+      "reward": 0.08629470318555832,
+      "reward_std": 0.0329008549451828,
+      "rewards/bleu_reward_func/mean": 0.08629470318555832,
+      "rewards/bleu_reward_func/std": 0.04737285524606705,
+      "step": 161
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 426.0,
+      "completions/mean_length": 408.75,
+      "completions/mean_terminated_length": 211.63636779785156,
+      "completions/min_length": 51.0,
+      "completions/min_terminated_length": 51.0,
+      "epoch": 0.1296,
+      "grad_norm": 3.5992963314056396,
+      "kl": 0.034759521484375,
+      "learning_rate": 1e-06,
+      "loss": 0.2439,
+      "num_tokens": 2158343.0,
+      "reward": 0.07093626260757446,
+      "reward_std": 0.04270578920841217,
+      "rewards/bleu_reward_func/mean": 0.07093626260757446,
+      "rewards/bleu_reward_func/std": 0.09919130057096481,
+      "step": 162
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 396.0,
+      "completions/mean_length": 171.4375,
+      "completions/mean_terminated_length": 92.84616088867188,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "epoch": 0.1304,
+      "grad_norm": 8.71653938293457,
+      "kl": 0.069793701171875,
+      "learning_rate": 1e-06,
+      "loss": -0.2102,
+      "num_tokens": 2166957.0,
+      "reward": 0.045812517404556274,
+      "reward_std": 0.0257731880992651,
+      "rewards/bleu_reward_func/mean": 0.045812517404556274,
+      "rewards/bleu_reward_func/std": 0.033692970871925354,
+      "step": 163
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 435.65625,
+      "completions/mean_terminated_length": 383.4210510253906,
+      "completions/min_length": 69.0,
+      "completions/min_terminated_length": 69.0,
+      "epoch": 0.1312,
+      "grad_norm": 2.308507204055786,
+      "kl": 0.020599365234375,
+      "learning_rate": 1e-06,
+      "loss": -0.142,
+      "num_tokens": 2182538.0,
+      "reward": 0.05681996047496796,
+      "reward_std": 0.022751763463020325,
+      "rewards/bleu_reward_func/mean": 0.05681996047496796,
+      "rewards/bleu_reward_func/std": 0.034446995705366135,
+      "step": 164
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 488.0,
+      "completions/mean_length": 422.1875,
+      "completions/mean_terminated_length": 368.3000183105469,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.132,
+      "grad_norm": 2.0656521320343018,
+      "kl": 0.0328369140625,
+      "learning_rate": 1e-06,
+      "loss": -0.1159,
+      "num_tokens": 2198440.0,
+      "reward": 0.08400298655033112,
+      "reward_std": 0.03193335980176926,
+      "rewards/bleu_reward_func/mean": 0.08400298655033112,
+      "rewards/bleu_reward_func/std": 0.05056838318705559,
+      "step": 165
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 468.0,
+      "completions/mean_length": 265.25,
+      "completions/mean_terminated_length": 117.20000457763672,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.1328,
+      "grad_norm": 7.659345626831055,
+      "kl": 0.12249755859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0683,
+      "num_tokens": 2212440.0,
+      "reward": 0.17941661179065704,
+      "reward_std": 0.040813662111759186,
+      "rewards/bleu_reward_func/mean": 0.17941661179065704,
+      "rewards/bleu_reward_func/std": 0.2576500475406647,
+      "step": 166
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 410.0,
+      "completions/max_terminated_length": 410.0,
+      "completions/mean_length": 171.625,
+      "completions/mean_terminated_length": 171.625,
+      "completions/min_length": 32.0,
+      "completions/min_terminated_length": 32.0,
+      "epoch": 0.1336,
+      "grad_norm": 4.253745079040527,
+      "kl": 0.03948974609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0369,
+      "num_tokens": 2220868.0,
+      "reward": 0.15958541631698608,
+      "reward_std": 0.08837255835533142,
+      "rewards/bleu_reward_func/mean": 0.15958541631698608,
+      "rewards/bleu_reward_func/std": 0.2750999629497528,
+      "step": 167
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 502.0,
+      "completions/mean_length": 258.09375,
+      "completions/mean_terminated_length": 158.7391357421875,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.1344,
+      "grad_norm": 7.613649845123291,
+      "kl": 0.230072021484375,
+      "learning_rate": 1e-06,
+      "loss": 0.2337,
+      "num_tokens": 2232063.0,
+      "reward": 0.0643484890460968,
+      "reward_std": 0.04164566472172737,
+      "rewards/bleu_reward_func/mean": 0.0643484890460968,
+      "rewards/bleu_reward_func/std": 0.07561130821704865,
+      "step": 168
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 504.0,
+      "completions/mean_length": 312.75,
+      "completions/mean_terminated_length": 193.1999969482422,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.1352,
+      "grad_norm": 7.345302104949951,
+      "kl": 0.19305419921875,
+      "learning_rate": 1e-06,
+      "loss": -0.2036,
+      "num_tokens": 2248271.0,
+      "reward": 0.04911228269338608,
+      "reward_std": 0.018512040376663208,
+      "rewards/bleu_reward_func/mean": 0.04911228269338608,
+      "rewards/bleu_reward_func/std": 0.05713532865047455,
+      "step": 169
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 468.0,
+      "completions/mean_length": 327.0,
+      "completions/mean_terminated_length": 230.09524536132812,
+      "completions/min_length": 38.0,
+      "completions/min_terminated_length": 38.0,
+      "epoch": 0.136,
+      "grad_norm": 5.079345226287842,
+      "kl": 0.02252197265625,
+      "learning_rate": 1e-06,
+      "loss": 0.1003,
+      "num_tokens": 2261791.0,
+      "reward": 0.03408445790410042,
+      "reward_std": 0.007548983674496412,
+      "rewards/bleu_reward_func/mean": 0.03408445790410042,
+      "rewards/bleu_reward_func/std": 0.030450724065303802,
+      "step": 170
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 204.0,
+      "completions/mean_length": 173.3125,
+      "completions/mean_terminated_length": 78.47999572753906,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "epoch": 0.1368,
+      "grad_norm": 7.255119800567627,
+      "kl": 0.098876953125,
+      "learning_rate": 1e-06,
+      "loss": -0.1893,
+      "num_tokens": 2271041.0,
+      "reward": 0.08309763669967651,
+      "reward_std": 0.05162087082862854,
+      "rewards/bleu_reward_func/mean": 0.08309763669967651,
+      "rewards/bleu_reward_func/std": 0.08563226461410522,
+      "step": 171
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 452.0,
+      "completions/mean_length": 251.59375,
+      "completions/mean_terminated_length": 164.7916717529297,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.1376,
+      "grad_norm": 9.955636024475098,
+      "kl": 0.094970703125,
+      "learning_rate": 1e-06,
+      "loss": 0.4501,
+      "num_tokens": 2281108.0,
+      "reward": 0.09667688608169556,
+      "reward_std": 0.047036267817020416,
+      "rewards/bleu_reward_func/mean": 0.09667688608169556,
+      "rewards/bleu_reward_func/std": 0.05911566689610481,
+      "step": 172
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 498.0,
+      "completions/mean_length": 259.625,
+      "completions/mean_terminated_length": 251.48385620117188,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.1384,
+      "grad_norm": 2.710672616958618,
+      "kl": 0.05035400390625,
+      "learning_rate": 1e-06,
+      "loss": -0.1131,
+      "num_tokens": 2292040.0,
+      "reward": 0.01771564967930317,
+      "reward_std": 0.0045564379543066025,
+      "rewards/bleu_reward_func/mean": 0.01771564967930317,
+      "rewards/bleu_reward_func/std": 0.009397609159350395,
+      "step": 173
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 279.15625,
+      "completions/mean_terminated_length": 119.84210968017578,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.1392,
+      "grad_norm": 5.8446946144104,
+      "kl": 0.10295867919921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0422,
+      "num_tokens": 2306453.0,
+      "reward": 0.10576937347650528,
+      "reward_std": 0.040997594594955444,
+      "rewards/bleu_reward_func/mean": 0.10576937347650528,
+      "rewards/bleu_reward_func/std": 0.15739315748214722,
+      "step": 174
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 499.0,
+      "completions/mean_length": 406.375,
+      "completions/mean_terminated_length": 270.5714416503906,
+      "completions/min_length": 102.0,
+      "completions/min_terminated_length": 102.0,
+      "epoch": 0.14,
+      "grad_norm": 2.8740079402923584,
+      "kl": 0.022064208984375,
+      "learning_rate": 1e-06,
+      "loss": -0.001,
+      "num_tokens": 2321945.0,
+      "reward": 0.07392336428165436,
+      "reward_std": 0.027644775807857513,
+      "rewards/bleu_reward_func/mean": 0.07392336428165436,
+      "rewards/bleu_reward_func/std": 0.079840287566185,
+      "step": 175
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 454.0,
+      "completions/mean_length": 384.6875,
+      "completions/mean_terminated_length": 308.3000183105469,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.1408,
+      "grad_norm": 2.571645498275757,
+      "kl": 0.0183868408203125,
+      "learning_rate": 1e-06,
+      "loss": -0.106,
+      "num_tokens": 2338695.0,
+      "reward": 0.043530724942684174,
+      "reward_std": 0.02269122190773487,
+      "rewards/bleu_reward_func/mean": 0.043530724942684174,
+      "rewards/bleu_reward_func/std": 0.029228538274765015,
+      "step": 176
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 504.0,
+      "completions/mean_length": 276.71875,
+      "completions/mean_terminated_length": 184.6521759033203,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.1416,
+      "grad_norm": 6.338461399078369,
+      "kl": 0.0661468505859375,
+      "learning_rate": 1e-06,
+      "loss": 0.2588,
+      "num_tokens": 2349054.0,
+      "reward": 0.04008907824754715,
+      "reward_std": 0.03199386969208717,
+      "rewards/bleu_reward_func/mean": 0.04008907824754715,
+      "rewards/bleu_reward_func/std": 0.05116712674498558,
+      "step": 177
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 491.0,
+      "completions/mean_length": 367.84375,
+      "completions/mean_terminated_length": 182.50001525878906,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "epoch": 0.1424,
+      "grad_norm": 7.893227577209473,
+      "kl": 0.1038818359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0007,
+      "num_tokens": 2365297.0,
+      "reward": 0.05835431069135666,
+      "reward_std": 0.01447733398526907,
+      "rewards/bleu_reward_func/mean": 0.05835431069135666,
+      "rewards/bleu_reward_func/std": 0.05388018116354942,
+      "step": 178
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 471.0,
+      "completions/mean_length": 106.9375,
+      "completions/mean_terminated_length": 79.93333435058594,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.1432,
+      "grad_norm": 13.133338928222656,
+      "kl": 0.378204345703125,
+      "learning_rate": 1e-06,
+      "loss": 0.1078,
+      "num_tokens": 2377279.0,
+      "reward": 0.27373576164245605,
+      "reward_std": 0.10149600356817245,
+      "rewards/bleu_reward_func/mean": 0.27373576164245605,
+      "rewards/bleu_reward_func/std": 0.21089527010917664,
+      "step": 179
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 492.0,
+      "completions/mean_length": 384.875,
+      "completions/mean_terminated_length": 318.28570556640625,
+      "completions/min_length": 125.0,
+      "completions/min_terminated_length": 125.0,
+      "epoch": 0.144,
+      "grad_norm": 2.6368002891540527,
+      "kl": 0.02276611328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0261,
+      "num_tokens": 2393507.0,
+      "reward": 0.06703202426433563,
+      "reward_std": 0.02514977753162384,
+      "rewards/bleu_reward_func/mean": 0.06703202426433563,
+      "rewards/bleu_reward_func/std": 0.05334871634840965,
+      "step": 180
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 408.0,
+      "completions/mean_length": 216.15625,
+      "completions/mean_terminated_length": 147.88462829589844,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.1448,
+      "grad_norm": 26.23644256591797,
+      "kl": 0.052398681640625,
+      "learning_rate": 1e-06,
+      "loss": 0.2092,
+      "num_tokens": 2403920.0,
+      "reward": 0.07073464244604111,
+      "reward_std": 0.0369129553437233,
+      "rewards/bleu_reward_func/mean": 0.07073464244604111,
+      "rewards/bleu_reward_func/std": 0.04567345231771469,
+      "step": 181
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 490.0,
+      "completions/mean_length": 285.09375,
+      "completions/mean_terminated_length": 166.23809814453125,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.1456,
+      "grad_norm": 12.993139266967773,
+      "kl": 0.135498046875,
+      "learning_rate": 1e-06,
+      "loss": 0.241,
+      "num_tokens": 2416907.0,
+      "reward": 0.05180336907505989,
+      "reward_std": 0.024485625326633453,
+      "rewards/bleu_reward_func/mean": 0.05180336907505989,
+      "rewards/bleu_reward_func/std": 0.03925548121333122,
+      "step": 182
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 318.0,
+      "completions/mean_length": 196.96875,
+      "completions/mean_terminated_length": 91.95833587646484,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.1464,
+      "grad_norm": 8.714866638183594,
+      "kl": 0.2222137451171875,
+      "learning_rate": 1e-06,
+      "loss": 0.1099,
+      "num_tokens": 2428778.0,
+      "reward": 0.08364134281873703,
+      "reward_std": 0.042949263006448746,
+      "rewards/bleu_reward_func/mean": 0.08364134281873703,
+      "rewards/bleu_reward_func/std": 0.09259536862373352,
+      "step": 183
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 314.09375,
+      "completions/mean_terminated_length": 293.6206970214844,
+      "completions/min_length": 35.0,
+      "completions/min_terminated_length": 35.0,
+      "epoch": 0.1472,
+      "grad_norm": 2.9456305503845215,
+      "kl": 0.029815673828125,
+      "learning_rate": 1e-06,
+      "loss": -0.172,
+      "num_tokens": 2441829.0,
+      "reward": 0.11525549739599228,
+      "reward_std": 0.056866977363824844,
+      "rewards/bleu_reward_func/mean": 0.11525549739599228,
+      "rewards/bleu_reward_func/std": 0.10229503363370895,
+      "step": 184
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 308.0,
+      "completions/mean_length": 310.21875,
+      "completions/mean_terminated_length": 108.4375,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.148,
+      "grad_norm": 4.448924541473389,
+      "kl": 0.118255615234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0768,
+      "num_tokens": 2456612.0,
+      "reward": 0.1624433547258377,
+      "reward_std": 0.045910030603408813,
+      "rewards/bleu_reward_func/mean": 0.1624433547258377,
+      "rewards/bleu_reward_func/std": 0.19173115491867065,
+      "step": 185
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 135.0,
+      "completions/mean_length": 134.09375,
+      "completions/mean_terminated_length": 28.279998779296875,
+      "completions/min_length": 3.0,
+      "completions/min_terminated_length": 3.0,
+      "epoch": 0.1488,
+      "grad_norm": 13.645524024963379,
+      "kl": 0.1475830078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0664,
+      "num_tokens": 2464839.0,
+      "reward": 0.05004946142435074,
+      "reward_std": 0.03280433267354965,
+      "rewards/bleu_reward_func/mean": 0.05004946142435074,
+      "rewards/bleu_reward_func/std": 0.05075250193476677,
+      "step": 186
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 503.0,
+      "completions/mean_length": 383.28125,
+      "completions/mean_terminated_length": 332.9130554199219,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 0.1496,
+      "grad_norm": 2.3289942741394043,
+      "kl": 0.0212860107421875,
+      "learning_rate": 1e-06,
+      "loss": 0.005,
+      "num_tokens": 2478608.0,
+      "reward": 0.03798733651638031,
+      "reward_std": 0.014268442057073116,
+      "rewards/bleu_reward_func/mean": 0.03798733651638031,
+      "rewards/bleu_reward_func/std": 0.03045865148305893,
+      "step": 187
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 448.0,
+      "completions/mean_length": 355.9375,
+      "completions/mean_terminated_length": 303.91668701171875,
+      "completions/min_length": 40.0,
+      "completions/min_terminated_length": 40.0,
+      "epoch": 0.1504,
+      "grad_norm": 3.192103862762451,
+      "kl": 0.0250701904296875,
+      "learning_rate": 1e-06,
+      "loss": 0.167,
+      "num_tokens": 2492518.0,
+      "reward": 0.02103330008685589,
+      "reward_std": 0.0090586943551898,
+      "rewards/bleu_reward_func/mean": 0.02103330008685589,
+      "rewards/bleu_reward_func/std": 0.01017869170755148,
+      "step": 188
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 350.0,
+      "completions/max_terminated_length": 350.0,
+      "completions/mean_length": 60.375,
+      "completions/mean_terminated_length": 60.375,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.1512,
+      "grad_norm": 10.600973129272461,
+      "kl": 0.31732177734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0974,
+      "num_tokens": 2503642.0,
+      "reward": 0.2223111093044281,
+      "reward_std": 0.05318839102983475,
+      "rewards/bleu_reward_func/mean": 0.2223111093044281,
+      "rewards/bleu_reward_func/std": 0.1549021303653717,
+      "step": 189
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 494.0,
+      "completions/mean_length": 468.3125,
+      "completions/mean_terminated_length": 384.9090881347656,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 0.152,
+      "grad_norm": 2.152480363845825,
+      "kl": 0.02301025390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0043,
+      "num_tokens": 2521540.0,
+      "reward": 0.04742058366537094,
+      "reward_std": 0.0165211483836174,
+      "rewards/bleu_reward_func/mean": 0.04742058366537094,
+      "rewards/bleu_reward_func/std": 0.038380105048418045,
+      "step": 190
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 485.0,
+      "completions/mean_length": 295.5625,
+      "completions/mean_terminated_length": 147.4736785888672,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.1528,
+      "grad_norm": 3.0485126972198486,
+      "kl": 0.0381622314453125,
+      "learning_rate": 1e-06,
+      "loss": 0.5548,
+      "num_tokens": 2536702.0,
+      "reward": 0.059137165546417236,
+      "reward_std": 0.029524236917495728,
+      "rewards/bleu_reward_func/mean": 0.059137165546417236,
+      "rewards/bleu_reward_func/std": 0.04191603511571884,
+      "step": 191
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 479.0,
+      "completions/mean_length": 305.4375,
+      "completions/mean_terminated_length": 211.5454559326172,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "epoch": 0.1536,
+      "grad_norm": 3.727417230606079,
+      "kl": 0.06072998046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0741,
+      "num_tokens": 2553892.0,
+      "reward": 0.06053918972611427,
+      "reward_std": 0.025174250826239586,
+      "rewards/bleu_reward_func/mean": 0.06053918972611427,
+      "rewards/bleu_reward_func/std": 0.03798559308052063,
+      "step": 192
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 327.0625,
+      "completions/mean_terminated_length": 216.10000610351562,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.1544,
+      "grad_norm": 22.730863571166992,
+      "kl": 0.1234130859375,
+      "learning_rate": 1e-06,
+      "loss": -0.1216,
+      "num_tokens": 2569950.0,
+      "reward": 0.14068183302879333,
+      "reward_std": 0.05201031640172005,
+      "rewards/bleu_reward_func/mean": 0.14068183302879333,
+      "rewards/bleu_reward_func/std": 0.1718810796737671,
+      "step": 193
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 349.0,
+      "completions/mean_length": 189.71875,
+      "completions/mean_terminated_length": 82.29167175292969,
+      "completions/min_length": 27.0,
+      "completions/min_terminated_length": 27.0,
+      "epoch": 0.1552,
+      "grad_norm": 5.675025939941406,
+      "kl": 0.063934326171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0105,
+      "num_tokens": 2579693.0,
+      "reward": 0.08947663754224777,
+      "reward_std": 0.029948215931653976,
+      "rewards/bleu_reward_func/mean": 0.08947663754224777,
+      "rewards/bleu_reward_func/std": 0.06868135929107666,
+      "step": 194
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 410.0,
+      "completions/mean_length": 210.5625,
+      "completions/mean_terminated_length": 167.5,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.156,
+      "grad_norm": 6.797698974609375,
+      "kl": 0.17242431640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0415,
+      "num_tokens": 2592967.0,
+      "reward": 0.16623055934906006,
+      "reward_std": 0.08808746933937073,
+      "rewards/bleu_reward_func/mean": 0.16623055934906006,
+      "rewards/bleu_reward_func/std": 0.17983676493167877,
+      "step": 195
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 277.0,
+      "completions/max_terminated_length": 277.0,
+      "completions/mean_length": 119.09375,
+      "completions/mean_terminated_length": 119.09375,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.1568,
+      "grad_norm": 5.494751453399658,
+      "kl": 0.05902099609375,
+      "learning_rate": 1e-06,
+      "loss": 0.2727,
+      "num_tokens": 2602042.0,
+      "reward": 0.17185799777507782,
+      "reward_std": 0.10617370158433914,
+      "rewards/bleu_reward_func/mean": 0.17185799777507782,
+      "rewards/bleu_reward_func/std": 0.16121239960193634,
+      "step": 196
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 368.5,
+      "completions/mean_terminated_length": 293.3333435058594,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.1576,
+      "grad_norm": 4.1480302810668945,
+      "kl": 0.0312652587890625,
+      "learning_rate": 1e-06,
+      "loss": -0.1879,
+      "num_tokens": 2615826.0,
+      "reward": 0.051297686994075775,
+      "reward_std": 0.018504546955227852,
+      "rewards/bleu_reward_func/mean": 0.051297686994075775,
+      "rewards/bleu_reward_func/std": 0.034977275878190994,
+      "step": 197
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 490.0,
+      "completions/mean_length": 267.53125,
+      "completions/mean_terminated_length": 251.2333526611328,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "epoch": 0.1584,
+      "grad_norm": 4.113983631134033,
+      "kl": 0.027557373046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0749,
+      "num_tokens": 2626275.0,
+      "reward": 0.054141815751791,
+      "reward_std": 0.02476467750966549,
+      "rewards/bleu_reward_func/mean": 0.054141815751791,
+      "rewards/bleu_reward_func/std": 0.07109448313713074,
+      "step": 198
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 470.0,
+      "completions/max_terminated_length": 470.0,
+      "completions/mean_length": 112.96875,
+      "completions/mean_terminated_length": 112.96875,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.1592,
+      "grad_norm": 7.432074546813965,
+      "kl": 0.1759033203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0648,
+      "num_tokens": 2633962.0,
+      "reward": 0.16682901978492737,
+      "reward_std": 0.07138749957084656,
+      "rewards/bleu_reward_func/mean": 0.16682901978492737,
+      "rewards/bleu_reward_func/std": 0.15276572108268738,
+      "step": 199
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 495.0,
+      "completions/mean_length": 451.78125,
+      "completions/mean_terminated_length": 404.9444580078125,
+      "completions/min_length": 289.0,
+      "completions/min_terminated_length": 289.0,
+      "epoch": 0.16,
+      "grad_norm": 2.110192060470581,
+      "kl": 0.022369384765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0268,
+      "num_tokens": 2653971.0,
+      "reward": 0.11942745745182037,
+      "reward_std": 0.02005620300769806,
+      "rewards/bleu_reward_func/mean": 0.11942745745182037,
+      "rewards/bleu_reward_func/std": 0.09454692155122757,
+      "step": 200
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 349.0,
+      "completions/mean_length": 185.9375,
+      "completions/mean_terminated_length": 110.69231414794922,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.1608,
+      "grad_norm": 6.2729973793029785,
+      "kl": 0.063720703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0659,
+      "num_tokens": 2664601.0,
+      "reward": 0.03557516261935234,
+      "reward_std": 0.021523961797356606,
+      "rewards/bleu_reward_func/mean": 0.03557516261935234,
+      "rewards/bleu_reward_func/std": 0.02618589997291565,
+      "step": 201
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 292.5,
+      "completions/mean_terminated_length": 121.77777862548828,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.1616,
+      "grad_norm": 5.936282157897949,
+      "kl": 0.0358734130859375,
+      "learning_rate": 1e-06,
+      "loss": -0.2742,
+      "num_tokens": 2679849.0,
+      "reward": 0.038136985152959824,
+      "reward_std": 0.022807471454143524,
+      "rewards/bleu_reward_func/mean": 0.038136985152959824,
+      "rewards/bleu_reward_func/std": 0.061121899634599686,
+      "step": 202
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 510.0,
+      "completions/mean_length": 230.96875,
+      "completions/mean_terminated_length": 221.90321350097656,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.1624,
+      "grad_norm": 8.785550117492676,
+      "kl": 0.1523590087890625,
+      "learning_rate": 1e-06,
+      "loss": -0.2049,
+      "num_tokens": 2693024.0,
+      "reward": 0.1289938986301422,
+      "reward_std": 0.045512765645980835,
+      "rewards/bleu_reward_func/mean": 0.1289938986301422,
+      "rewards/bleu_reward_func/std": 0.09638386219739914,
+      "step": 203
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 475.0,
+      "completions/mean_length": 361.8125,
+      "completions/mean_terminated_length": 168.71429443359375,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.1632,
+      "grad_norm": 6.617871284484863,
+      "kl": 0.0502777099609375,
+      "learning_rate": 1e-06,
+      "loss": 0.1304,
+      "num_tokens": 2710354.0,
+      "reward": 0.033049020916223526,
+      "reward_std": 0.017362549901008606,
+      "rewards/bleu_reward_func/mean": 0.033049020916223526,
+      "rewards/bleu_reward_func/std": 0.026102159172296524,
+      "step": 204
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 456.0,
+      "completions/mean_length": 281.4375,
+      "completions/mean_terminated_length": 176.63636779785156,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.164,
+      "grad_norm": 3.961705446243286,
+      "kl": 0.104736328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0209,
+      "num_tokens": 2724680.0,
+      "reward": 0.1750263273715973,
+      "reward_std": 0.02830299735069275,
+      "rewards/bleu_reward_func/mean": 0.1750263273715973,
+      "rewards/bleu_reward_func/std": 0.13747908174991608,
+      "step": 205
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 287.28125,
+      "completions/mean_terminated_length": 255.1785888671875,
+      "completions/min_length": 53.0,
+      "completions/min_terminated_length": 53.0,
+      "epoch": 0.1648,
+      "grad_norm": 3.098118305206299,
+      "kl": 0.020477294921875,
+      "learning_rate": 1e-06,
+      "loss": -0.1863,
+      "num_tokens": 2736209.0,
+      "reward": 0.06041261553764343,
+      "reward_std": 0.033261410892009735,
+      "rewards/bleu_reward_func/mean": 0.06041261553764343,
+      "rewards/bleu_reward_func/std": 0.046081364154815674,
+      "step": 206
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 429.0,
+      "completions/mean_length": 230.1875,
+      "completions/mean_terminated_length": 136.25,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.1656,
+      "grad_norm": 6.539205551147461,
+      "kl": 0.0445404052734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0949,
+      "num_tokens": 2749503.0,
+      "reward": 0.039952248334884644,
+      "reward_std": 0.05510722100734711,
+      "rewards/bleu_reward_func/mean": 0.039952248334884644,
+      "rewards/bleu_reward_func/std": 0.08833327889442444,
+      "step": 207
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 453.0,
+      "completions/mean_length": 385.8125,
+      "completions/mean_terminated_length": 223.57144165039062,
+      "completions/min_length": 62.0,
+      "completions/min_terminated_length": 62.0,
+      "epoch": 0.1664,
+      "grad_norm": 3.262167453765869,
+      "kl": 0.0330657958984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0025,
+      "num_tokens": 2763289.0,
+      "reward": 0.06319095194339752,
+      "reward_std": 0.021728292107582092,
+      "rewards/bleu_reward_func/mean": 0.06319095194339752,
+      "rewards/bleu_reward_func/std": 0.03750937059521675,
+      "step": 208
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 498.0,
+      "completions/mean_length": 182.71875,
+      "completions/mean_terminated_length": 172.09677124023438,
+      "completions/min_length": 6.0,
+      "completions/min_terminated_length": 6.0,
+      "epoch": 0.1672,
+      "grad_norm": 6.667765140533447,
+      "kl": 0.113616943359375,
+      "learning_rate": 1e-06,
+      "loss": 0.3285,
+      "num_tokens": 2773104.0,
+      "reward": 0.1846303939819336,
+      "reward_std": 0.16774994134902954,
+      "rewards/bleu_reward_func/mean": 0.1846303939819336,
+      "rewards/bleu_reward_func/std": 0.20520828664302826,
+      "step": 209
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 449.0,
+      "completions/mean_length": 204.6875,
+      "completions/mean_terminated_length": 147.7777862548828,
+      "completions/min_length": 34.0,
+      "completions/min_terminated_length": 34.0,
+      "epoch": 0.168,
+      "grad_norm": 5.794483661651611,
+      "kl": 0.09796142578125,
+      "learning_rate": 1e-06,
+      "loss": 0.1363,
+      "num_tokens": 2787670.0,
+      "reward": 0.09086121618747711,
+      "reward_std": 0.052026841789484024,
+      "rewards/bleu_reward_func/mean": 0.09086121618747711,
+      "rewards/bleu_reward_func/std": 0.09278357774019241,
+      "step": 210
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 367.0,
+      "completions/mean_length": 399.8125,
+      "completions/mean_terminated_length": 113.11111450195312,
+      "completions/min_length": 37.0,
+      "completions/min_terminated_length": 37.0,
+      "epoch": 0.1688,
+      "grad_norm": 11.576338768005371,
+      "kl": 0.0574493408203125,
+      "learning_rate": 1e-06,
+      "loss": 0.1204,
+      "num_tokens": 2805864.0,
+      "reward": 0.023652518168091774,
+      "reward_std": 0.01210303045809269,
+      "rewards/bleu_reward_func/mean": 0.023652518168091774,
+      "rewards/bleu_reward_func/std": 0.02501726523041725,
+      "step": 211
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 401.0,
+      "completions/mean_length": 157.6875,
+      "completions/mean_terminated_length": 58.47999954223633,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.1696,
+      "grad_norm": 10.228835105895996,
+      "kl": 0.1674041748046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0582,
+      "num_tokens": 2816758.0,
+      "reward": 0.13800185918807983,
+      "reward_std": 0.047296687960624695,
+      "rewards/bleu_reward_func/mean": 0.13800185918807983,
+      "rewards/bleu_reward_func/std": 0.0863277018070221,
+      "step": 212
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 402.15625,
+      "completions/mean_terminated_length": 359.1739196777344,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 0.1704,
+      "grad_norm": 2.593717336654663,
+      "kl": 0.01934814453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0091,
+      "num_tokens": 2833347.0,
+      "reward": 0.05193600431084633,
+      "reward_std": 0.018484318628907204,
+      "rewards/bleu_reward_func/mean": 0.05193600431084633,
+      "rewards/bleu_reward_func/std": 0.04251272976398468,
+      "step": 213
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 509.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 198.25,
+      "completions/mean_terminated_length": 198.25,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.1712,
+      "grad_norm": 6.071621894836426,
+      "kl": 0.057373046875,
+      "learning_rate": 1e-06,
+      "loss": -0.1354,
+      "num_tokens": 2841195.0,
+      "reward": 0.062206219881772995,
+      "reward_std": 0.03749649226665497,
+      "rewards/bleu_reward_func/mean": 0.062206219881772995,
+      "rewards/bleu_reward_func/std": 0.0528765432536602,
+      "step": 214
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 491.0,
+      "completions/mean_length": 246.40625,
+      "completions/mean_terminated_length": 218.9310302734375,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.172,
+      "grad_norm": 4.833486557006836,
+      "kl": 0.068695068359375,
+      "learning_rate": 1e-06,
+      "loss": 0.1631,
+      "num_tokens": 2851224.0,
+      "reward": 0.06542235612869263,
+      "reward_std": 0.03771442174911499,
+      "rewards/bleu_reward_func/mean": 0.06542235612869263,
+      "rewards/bleu_reward_func/std": 0.0579860620200634,
+      "step": 215
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 494.0,
+      "completions/mean_length": 465.15625,
+      "completions/mean_terminated_length": 412.0666809082031,
+      "completions/min_length": 309.0,
+      "completions/min_terminated_length": 309.0,
+      "epoch": 0.1728,
+      "grad_norm": 2.1820828914642334,
+      "kl": 0.0188446044921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0277,
+      "num_tokens": 2870765.0,
+      "reward": 0.06440776586532593,
+      "reward_std": 0.013088207691907883,
+      "rewards/bleu_reward_func/mean": 0.06440776586532593,
+      "rewards/bleu_reward_func/std": 0.06307429075241089,
+      "step": 216
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 224.0625,
+      "completions/mean_terminated_length": 170.74073791503906,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.1736,
+      "grad_norm": 9.596390724182129,
+      "kl": 0.2492218017578125,
+      "learning_rate": 1e-06,
+      "loss": 0.1621,
+      "num_tokens": 2882151.0,
+      "reward": 0.15283548831939697,
+      "reward_std": 0.08103044331073761,
+      "rewards/bleu_reward_func/mean": 0.15283548831939697,
+      "rewards/bleu_reward_func/std": 0.13223250210285187,
+      "step": 217
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 480.0,
+      "completions/mean_length": 464.09375,
+      "completions/mean_terminated_length": 320.375,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.1744,
+      "grad_norm": 2.1536099910736084,
+      "kl": 0.02032470703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0402,
+      "num_tokens": 2900658.0,
+      "reward": 0.02000538259744644,
+      "reward_std": 0.008671639487147331,
+      "rewards/bleu_reward_func/mean": 0.02000538259744644,
+      "rewards/bleu_reward_func/std": 0.01867109164595604,
+      "step": 218
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 489.0,
+      "completions/mean_length": 155.5,
+      "completions/mean_terminated_length": 55.68000030517578,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.1752,
+      "grad_norm": 8.202893257141113,
+      "kl": 0.2074127197265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0157,
+      "num_tokens": 2909778.0,
+      "reward": 0.1296558678150177,
+      "reward_std": 0.04394569993019104,
+      "rewards/bleu_reward_func/mean": 0.1296558678150177,
+      "rewards/bleu_reward_func/std": 0.05605300888419151,
+      "step": 219
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 485.0,
+      "completions/mean_length": 350.40625,
+      "completions/mean_terminated_length": 188.8125,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.176,
+      "grad_norm": 5.133015155792236,
+      "kl": 0.0847930908203125,
+      "learning_rate": 1e-06,
+      "loss": 0.2562,
+      "num_tokens": 2926239.0,
+      "reward": 0.1607290506362915,
+      "reward_std": 0.12061528861522675,
+      "rewards/bleu_reward_func/mean": 0.1607290506362915,
+      "rewards/bleu_reward_func/std": 0.19297951459884644,
+      "step": 220
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 377.09375,
+      "completions/mean_terminated_length": 306.4285888671875,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "epoch": 0.1768,
+      "grad_norm": 2.917404890060425,
+      "kl": 0.02947998046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0902,
+      "num_tokens": 2940970.0,
+      "reward": 0.05033531412482262,
+      "reward_std": 0.015085380524396896,
+      "rewards/bleu_reward_func/mean": 0.05033531412482262,
+      "rewards/bleu_reward_func/std": 0.03601166605949402,
+      "step": 221
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 307.0,
+      "completions/max_terminated_length": 307.0,
+      "completions/mean_length": 98.625,
+      "completions/mean_terminated_length": 98.625,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.1776,
+      "grad_norm": 9.739842414855957,
+      "kl": 0.28759765625,
+      "learning_rate": 1e-06,
+      "loss": 0.1954,
+      "num_tokens": 2951942.0,
+      "reward": 0.18511344492435455,
+      "reward_std": 0.09618590772151947,
+      "rewards/bleu_reward_func/mean": 0.18511344492435455,
+      "rewards/bleu_reward_func/std": 0.13407698273658752,
+      "step": 222
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 469.0,
+      "completions/mean_length": 155.1875,
+      "completions/mean_terminated_length": 131.40000915527344,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "epoch": 0.1784,
+      "grad_norm": 5.931830883026123,
+      "kl": 0.082611083984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0924,
+      "num_tokens": 2959460.0,
+      "reward": 0.07271347939968109,
+      "reward_std": 0.05200031027197838,
+      "rewards/bleu_reward_func/mean": 0.07271347939968109,
+      "rewards/bleu_reward_func/std": 0.06765022873878479,
+      "step": 223
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 391.0,
+      "completions/mean_length": 130.71875,
+      "completions/mean_terminated_length": 105.30000305175781,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.1792,
+      "grad_norm": 7.8368730545043945,
+      "kl": 0.09417724609375,
+      "learning_rate": 1e-06,
+      "loss": 0.1685,
+      "num_tokens": 2966491.0,
+      "reward": 0.0899183601140976,
+      "reward_std": 0.05122753232717514,
+      "rewards/bleu_reward_func/mean": 0.0899183601140976,
+      "rewards/bleu_reward_func/std": 0.11120127141475677,
+      "step": 224
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 472.0,
+      "completions/mean_length": 367.5,
+      "completions/mean_terminated_length": 268.631591796875,
+      "completions/min_length": 33.0,
+      "completions/min_terminated_length": 33.0,
+      "epoch": 0.18,
+      "grad_norm": 3.555055618286133,
+      "kl": 0.0318603515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0636,
+      "num_tokens": 2982515.0,
+      "reward": 0.11773502081632614,
+      "reward_std": 0.046606093645095825,
+      "rewards/bleu_reward_func/mean": 0.11773502081632614,
+      "rewards/bleu_reward_func/std": 0.15673232078552246,
+      "step": 225
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 505.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 122.65625,
+      "completions/mean_terminated_length": 122.65625,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.1808,
+      "grad_norm": 10.452176094055176,
+      "kl": 0.1519775390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0183,
+      "num_tokens": 2991512.0,
+      "reward": 0.13446207344532013,
+      "reward_std": 0.060547836124897,
+      "rewards/bleu_reward_func/mean": 0.13446207344532013,
+      "rewards/bleu_reward_func/std": 0.07454977184534073,
+      "step": 226
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 496.0,
+      "completions/mean_length": 345.6875,
+      "completions/mean_terminated_length": 258.5714416503906,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "epoch": 0.1816,
+      "grad_norm": 2.964317560195923,
+      "kl": 0.038421630859375,
+      "learning_rate": 1e-06,
+      "loss": 0.1194,
+      "num_tokens": 3005678.0,
+      "reward": 0.14132392406463623,
+      "reward_std": 0.05001860111951828,
+      "rewards/bleu_reward_func/mean": 0.14132392406463623,
+      "rewards/bleu_reward_func/std": 0.08175285160541534,
+      "step": 227
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 449.0,
+      "completions/mean_length": 390.125,
+      "completions/mean_terminated_length": 122.0,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.1824,
+      "grad_norm": 5.049752235412598,
+      "kl": 0.04425048828125,
+      "learning_rate": 1e-06,
+      "loss": 0.1754,
+      "num_tokens": 3021434.0,
+      "reward": 0.04336467757821083,
+      "reward_std": 0.018742987886071205,
+      "rewards/bleu_reward_func/mean": 0.04336467757821083,
+      "rewards/bleu_reward_func/std": 0.03402964025735855,
+      "step": 228
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 483.0,
+      "completions/mean_length": 364.0625,
+      "completions/mean_terminated_length": 249.0,
+      "completions/min_length": 111.0,
+      "completions/min_terminated_length": 111.0,
+      "epoch": 0.1832,
+      "grad_norm": 3.026240348815918,
+      "kl": 0.0245361328125,
+      "learning_rate": 1e-06,
+      "loss": -0.2846,
+      "num_tokens": 3040956.0,
+      "reward": 0.028285246342420578,
+      "reward_std": 0.018473699688911438,
+      "rewards/bleu_reward_func/mean": 0.028285246342420578,
+      "rewards/bleu_reward_func/std": 0.02460222877562046,
+      "step": 229
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 370.1875,
+      "completions/mean_terminated_length": 330.47998046875,
+      "completions/min_length": 162.0,
+      "completions/min_terminated_length": 162.0,
+      "epoch": 0.184,
+      "grad_norm": 2.621922731399536,
+      "kl": 0.0340728759765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0699,
+      "num_tokens": 3058866.0,
+      "reward": 0.18184542655944824,
+      "reward_std": 0.06604617834091187,
+      "rewards/bleu_reward_func/mean": 0.18184542655944824,
+      "rewards/bleu_reward_func/std": 0.16794371604919434,
+      "step": 230
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 504.0,
+      "completions/mean_length": 213.71875,
+      "completions/mean_terminated_length": 97.0,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.1848,
+      "grad_norm": 5.496671676635742,
+      "kl": 0.11822509765625,
+      "learning_rate": 1e-06,
+      "loss": 0.4021,
+      "num_tokens": 3071713.0,
+      "reward": 0.22397759556770325,
+      "reward_std": 0.09391038119792938,
+      "rewards/bleu_reward_func/mean": 0.22397759556770325,
+      "rewards/bleu_reward_func/std": 0.19180122017860413,
+      "step": 231
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 267.0,
+      "completions/mean_length": 197.84375,
+      "completions/mean_terminated_length": 93.125,
+      "completions/min_length": 40.0,
+      "completions/min_terminated_length": 40.0,
+      "epoch": 0.1856,
+      "grad_norm": 3.808242082595825,
+      "kl": 0.04571533203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0821,
+      "num_tokens": 3079892.0,
+      "reward": 0.060666900128126144,
+      "reward_std": 0.029011715203523636,
+      "rewards/bleu_reward_func/mean": 0.060666900128126144,
+      "rewards/bleu_reward_func/std": 0.0762709304690361,
+      "step": 232
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 441.0,
+      "completions/mean_length": 160.40625,
+      "completions/mean_terminated_length": 61.959999084472656,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "epoch": 0.1864,
+      "grad_norm": 11.2310791015625,
+      "kl": 0.160797119140625,
+      "learning_rate": 1e-06,
+      "loss": 0.2881,
+      "num_tokens": 3087689.0,
+      "reward": 0.07089974731206894,
+      "reward_std": 0.03123306669294834,
+      "rewards/bleu_reward_func/mean": 0.07089974731206894,
+      "rewards/bleu_reward_func/std": 0.06456828862428665,
+      "step": 233
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 437.0,
+      "completions/max_terminated_length": 437.0,
+      "completions/mean_length": 66.875,
+      "completions/mean_terminated_length": 66.875,
+      "completions/min_length": 6.0,
+      "completions/min_terminated_length": 6.0,
+      "epoch": 0.1872,
+      "grad_norm": 13.989295959472656,
+      "kl": 0.3311767578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0331,
+      "num_tokens": 3093357.0,
+      "reward": 0.15325351059436798,
+      "reward_std": 0.0506255105137825,
+      "rewards/bleu_reward_func/mean": 0.15325351059436798,
+      "rewards/bleu_reward_func/std": 0.19497260451316833,
+      "step": 234
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 466.0,
+      "completions/mean_length": 260.75,
+      "completions/mean_terminated_length": 162.43478393554688,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.188,
+      "grad_norm": 7.557122230529785,
+      "kl": 0.173126220703125,
+      "learning_rate": 1e-06,
+      "loss": 0.1592,
+      "num_tokens": 3105749.0,
+      "reward": 0.20930011570453644,
+      "reward_std": 0.06161898747086525,
+      "rewards/bleu_reward_func/mean": 0.20930011570453644,
+      "rewards/bleu_reward_func/std": 0.2159973680973053,
+      "step": 235
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 404.9375,
+      "completions/mean_terminated_length": 310.4705810546875,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.1888,
+      "grad_norm": 4.613722324371338,
+      "kl": 0.025421142578125,
+      "learning_rate": 1e-06,
+      "loss": -0.1006,
+      "num_tokens": 3121795.0,
+      "reward": 0.02748030610382557,
+      "reward_std": 0.0075658103451132774,
+      "rewards/bleu_reward_func/mean": 0.02748030610382557,
+      "rewards/bleu_reward_func/std": 0.03438537195324898,
+      "step": 236
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 472.0,
+      "completions/mean_length": 389.09375,
+      "completions/mean_terminated_length": 266.1875,
+      "completions/min_length": 8.0,
+      "completions/min_terminated_length": 8.0,
+      "epoch": 0.1896,
+      "grad_norm": 2.435314178466797,
+      "kl": 0.0276031494140625,
+      "learning_rate": 1e-06,
+      "loss": -0.1746,
+      "num_tokens": 3140006.0,
+      "reward": 0.10853572189807892,
+      "reward_std": 0.05605427548289299,
+      "rewards/bleu_reward_func/mean": 0.10853572189807892,
+      "rewards/bleu_reward_func/std": 0.1485956311225891,
+      "step": 237
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 61.0,
+      "completions/mean_length": 149.78125,
+      "completions/mean_terminated_length": 29.041667938232422,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.1904,
+      "grad_norm": 8.839442253112793,
+      "kl": 0.2632598876953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0665,
+      "num_tokens": 3149743.0,
+      "reward": 0.13384486734867096,
+      "reward_std": 0.03735985979437828,
+      "rewards/bleu_reward_func/mean": 0.13384486734867096,
+      "rewards/bleu_reward_func/std": 0.17275770008563995,
+      "step": 238
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 261.625,
+      "completions/mean_terminated_length": 178.1666717529297,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.1912,
+      "grad_norm": 4.326257228851318,
+      "kl": 0.14703369140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0395,
+      "num_tokens": 3163195.0,
+      "reward": 0.16435688734054565,
+      "reward_std": 0.051772814244031906,
+      "rewards/bleu_reward_func/mean": 0.16435688734054565,
+      "rewards/bleu_reward_func/std": 0.13062016665935516,
+      "step": 239
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 359.4375,
+      "completions/mean_terminated_length": 255.05262756347656,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.192,
+      "grad_norm": 6.709453582763672,
+      "kl": 0.097503662109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0418,
+      "num_tokens": 3180209.0,
+      "reward": 0.10101380944252014,
+      "reward_std": 0.030364379286766052,
+      "rewards/bleu_reward_func/mean": 0.10101380944252014,
+      "rewards/bleu_reward_func/std": 0.08647928386926651,
+      "step": 240
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 412.0,
+      "completions/mean_length": 185.0,
+      "completions/mean_terminated_length": 93.43999481201172,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.1928,
+      "grad_norm": 9.118388175964355,
+      "kl": 0.14984130859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0512,
+      "num_tokens": 3189425.0,
+      "reward": 0.19255727529525757,
+      "reward_std": 0.03786986321210861,
+      "rewards/bleu_reward_func/mean": 0.19255727529525757,
+      "rewards/bleu_reward_func/std": 0.18927834928035736,
+      "step": 241
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 497.0,
+      "completions/mean_length": 162.4375,
+      "completions/mean_terminated_length": 81.76923370361328,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.1936,
+      "grad_norm": 7.5658745765686035,
+      "kl": 0.1087493896484375,
+      "learning_rate": 1e-06,
+      "loss": 0.11,
+      "num_tokens": 3196911.0,
+      "reward": 0.08898752182722092,
+      "reward_std": 0.01980067417025566,
+      "rewards/bleu_reward_func/mean": 0.08898752182722092,
+      "rewards/bleu_reward_func/std": 0.09810609370470047,
+      "step": 242
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 479.0,
+      "completions/mean_length": 238.8125,
+      "completions/mean_terminated_length": 175.7692413330078,
+      "completions/min_length": 43.0,
+      "completions/min_terminated_length": 43.0,
+      "epoch": 0.1944,
+      "grad_norm": 3.6591224670410156,
+      "kl": 0.068145751953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0677,
+      "num_tokens": 3212161.0,
+      "reward": 0.16356298327445984,
+      "reward_std": 0.08266205340623856,
+      "rewards/bleu_reward_func/mean": 0.16356298327445984,
+      "rewards/bleu_reward_func/std": 0.17177340388298035,
+      "step": 243
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 489.0,
+      "completions/mean_length": 369.0625,
+      "completions/mean_terminated_length": 294.19049072265625,
+      "completions/min_length": 35.0,
+      "completions/min_terminated_length": 35.0,
+      "epoch": 0.1952,
+      "grad_norm": 2.8115499019622803,
+      "kl": 0.032623291015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0943,
+      "num_tokens": 3228483.0,
+      "reward": 0.06906401365995407,
+      "reward_std": 0.025964463129639626,
+      "rewards/bleu_reward_func/mean": 0.06906401365995407,
+      "rewards/bleu_reward_func/std": 0.044564370065927505,
+      "step": 244
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 488.0,
+      "completions/mean_length": 253.875,
+      "completions/mean_terminated_length": 194.3076934814453,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.196,
+      "grad_norm": 6.153151512145996,
+      "kl": 0.072845458984375,
+      "learning_rate": 1e-06,
+      "loss": 0.1336,
+      "num_tokens": 3238447.0,
+      "reward": 0.05225534737110138,
+      "reward_std": 0.019162572920322418,
+      "rewards/bleu_reward_func/mean": 0.05225534737110138,
+      "rewards/bleu_reward_func/std": 0.04069560393691063,
+      "step": 245
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 357.6875,
+      "completions/mean_terminated_length": 237.6666717529297,
+      "completions/min_length": 28.0,
+      "completions/min_terminated_length": 28.0,
+      "epoch": 0.1968,
+      "grad_norm": 4.332682132720947,
+      "kl": 0.074615478515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0015,
+      "num_tokens": 3252661.0,
+      "reward": 0.06644366681575775,
+      "reward_std": 0.029834389686584473,
+      "rewards/bleu_reward_func/mean": 0.06644366681575775,
+      "rewards/bleu_reward_func/std": 0.0527600534260273,
+      "step": 246
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 433.0,
+      "completions/mean_length": 280.84375,
+      "completions/mean_terminated_length": 203.7916717529297,
+      "completions/min_length": 56.0,
+      "completions/min_terminated_length": 56.0,
+      "epoch": 0.1976,
+      "grad_norm": 3.9714043140411377,
+      "kl": 0.046600341796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0574,
+      "num_tokens": 3263392.0,
+      "reward": 0.04084426164627075,
+      "reward_std": 0.022724341601133347,
+      "rewards/bleu_reward_func/mean": 0.04084426164627075,
+      "rewards/bleu_reward_func/std": 0.03625248372554779,
+      "step": 247
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 246.1875,
+      "completions/mean_terminated_length": 171.75999450683594,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.1984,
+      "grad_norm": 7.287817478179932,
+      "kl": 0.128570556640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0058,
+      "num_tokens": 3274190.0,
+      "reward": 0.05778396502137184,
+      "reward_std": 0.020291190594434738,
+      "rewards/bleu_reward_func/mean": 0.05778396502137184,
+      "rewards/bleu_reward_func/std": 0.046611472964286804,
+      "step": 248
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 454.0,
+      "completions/mean_length": 317.21875,
+      "completions/mean_terminated_length": 183.94737243652344,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "epoch": 0.1992,
+      "grad_norm": 9.650996208190918,
+      "kl": 0.1335906982421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0956,
+      "num_tokens": 3288605.0,
+      "reward": 0.15271537005901337,
+      "reward_std": 0.0891089141368866,
+      "rewards/bleu_reward_func/mean": 0.15271537005901337,
+      "rewards/bleu_reward_func/std": 0.1993638128042221,
+      "step": 249
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 315.125,
+      "completions/mean_terminated_length": 238.0869598388672,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.2,
+      "grad_norm": 8.128390312194824,
+      "kl": 0.073394775390625,
+      "learning_rate": 1e-06,
+      "loss": 0.1129,
+      "num_tokens": 3303449.0,
+      "reward": 0.05582565814256668,
+      "reward_std": 0.04732588678598404,
+      "rewards/bleu_reward_func/mean": 0.05582565814256668,
+      "rewards/bleu_reward_func/std": 0.06975270062685013,
+      "step": 250
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 510.0,
+      "completions/mean_length": 397.34375,
+      "completions/mean_terminated_length": 282.6875,
+      "completions/min_length": 61.0,
+      "completions/min_terminated_length": 61.0,
+      "epoch": 0.2008,
+      "grad_norm": 2.3954126834869385,
+      "kl": 0.027587890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0758,
+      "num_tokens": 3322684.0,
+      "reward": 0.20381565392017365,
+      "reward_std": 0.06331950426101685,
+      "rewards/bleu_reward_func/mean": 0.20381565392017365,
+      "rewards/bleu_reward_func/std": 0.30689555406570435,
+      "step": 251
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 187.75,
+      "completions/mean_terminated_length": 177.29031372070312,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.2016,
+      "grad_norm": 10.365123748779297,
+      "kl": 0.403076171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0409,
+      "num_tokens": 3332612.0,
+      "reward": 0.09179520606994629,
+      "reward_std": 0.042515259236097336,
+      "rewards/bleu_reward_func/mean": 0.09179520606994629,
+      "rewards/bleu_reward_func/std": 0.06000783294439316,
+      "step": 252
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 256.4375,
+      "completions/mean_terminated_length": 184.87998962402344,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.2024,
+      "grad_norm": 5.642463207244873,
+      "kl": 0.1347198486328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0626,
+      "num_tokens": 3347706.0,
+      "reward": 0.12519359588623047,
+      "reward_std": 0.036009326577186584,
+      "rewards/bleu_reward_func/mean": 0.12519359588623047,
+      "rewards/bleu_reward_func/std": 0.1556256264448166,
+      "step": 253
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 469.0,
+      "completions/mean_length": 133.4375,
+      "completions/mean_terminated_length": 79.35714721679688,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.2032,
+      "grad_norm": 13.66992473602295,
+      "kl": 0.30645751953125,
+      "learning_rate": 1e-06,
+      "loss": -0.1315,
+      "num_tokens": 3353864.0,
+      "reward": 0.10196495056152344,
+      "reward_std": 0.05300650745630264,
+      "rewards/bleu_reward_func/mean": 0.10196495056152344,
+      "rewards/bleu_reward_func/std": 0.09023614972829819,
+      "step": 254
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 487.0,
+      "completions/mean_length": 267.6875,
+      "completions/mean_terminated_length": 232.7857208251953,
+      "completions/min_length": 38.0,
+      "completions/min_terminated_length": 38.0,
+      "epoch": 0.204,
+      "grad_norm": 24.040653228759766,
+      "kl": 0.062896728515625,
+      "learning_rate": 1e-06,
+      "loss": -0.1232,
+      "num_tokens": 3365758.0,
+      "reward": 0.03941156342625618,
+      "reward_std": 0.017305800691246986,
+      "rewards/bleu_reward_func/mean": 0.03941156342625618,
+      "rewards/bleu_reward_func/std": 0.02295033633708954,
+      "step": 255
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 486.0,
+      "completions/mean_length": 420.21875,
+      "completions/mean_terminated_length": 316.20001220703125,
+      "completions/min_length": 98.0,
+      "completions/min_terminated_length": 98.0,
+      "epoch": 0.2048,
+      "grad_norm": 2.9320602416992188,
+      "kl": 0.033233642578125,
+      "learning_rate": 1e-06,
+      "loss": -0.1371,
+      "num_tokens": 3382765.0,
+      "reward": 0.05339156836271286,
+      "reward_std": 0.02982841432094574,
+      "rewards/bleu_reward_func/mean": 0.05339156836271286,
+      "rewards/bleu_reward_func/std": 0.07343700528144836,
+      "step": 256
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 497.0,
+      "completions/mean_length": 247.8125,
+      "completions/mean_terminated_length": 173.83999633789062,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.2056,
+      "grad_norm": 8.614324569702148,
+      "kl": 0.1400146484375,
+      "learning_rate": 1e-06,
+      "loss": 0.1332,
+      "num_tokens": 3394391.0,
+      "reward": 0.06851230561733246,
+      "reward_std": 0.04152427613735199,
+      "rewards/bleu_reward_func/mean": 0.06851230561733246,
+      "rewards/bleu_reward_func/std": 0.056356508284807205,
+      "step": 257
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 503.0,
+      "completions/mean_length": 365.625,
+      "completions/mean_terminated_length": 199.73333740234375,
+      "completions/min_length": 30.0,
+      "completions/min_terminated_length": 30.0,
+      "epoch": 0.2064,
+      "grad_norm": 6.318526744842529,
+      "kl": 0.099365234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0091,
+      "num_tokens": 3411195.0,
+      "reward": 0.08351869136095047,
+      "reward_std": 0.012093533761799335,
+      "rewards/bleu_reward_func/mean": 0.08351869136095047,
+      "rewards/bleu_reward_func/std": 0.08073550462722778,
+      "step": 258
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 348.0,
+      "completions/mean_length": 134.65625,
+      "completions/mean_terminated_length": 109.50000762939453,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.2072,
+      "grad_norm": 7.35445499420166,
+      "kl": 0.2371826171875,
+      "learning_rate": 1e-06,
+      "loss": 0.4692,
+      "num_tokens": 3419136.0,
+      "reward": 0.15089674293994904,
+      "reward_std": 0.06239618360996246,
+      "rewards/bleu_reward_func/mean": 0.15089674293994904,
+      "rewards/bleu_reward_func/std": 0.09912555664777756,
+      "step": 259
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 242.46875,
+      "completions/mean_terminated_length": 180.2692413330078,
+      "completions/min_length": 35.0,
+      "completions/min_terminated_length": 35.0,
+      "epoch": 0.208,
+      "grad_norm": 4.740394592285156,
+      "kl": 0.08489990234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0594,
+      "num_tokens": 3432127.0,
+      "reward": 0.05275239422917366,
+      "reward_std": 0.050225820392370224,
+      "rewards/bleu_reward_func/mean": 0.05275239422917366,
+      "rewards/bleu_reward_func/std": 0.07898835092782974,
+      "step": 260
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 462.0,
+      "completions/mean_length": 331.84375,
+      "completions/mean_terminated_length": 223.75,
+      "completions/min_length": 44.0,
+      "completions/min_terminated_length": 44.0,
+      "epoch": 0.2088,
+      "grad_norm": 3.1740782260894775,
+      "kl": 0.05401611328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0923,
+      "num_tokens": 3446746.0,
+      "reward": 0.12386887520551682,
+      "reward_std": 0.031204696744680405,
+      "rewards/bleu_reward_func/mean": 0.12386887520551682,
+      "rewards/bleu_reward_func/std": 0.1644604653120041,
+      "step": 261
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 495.0,
+      "completions/mean_length": 329.5,
+      "completions/mean_terminated_length": 268.66668701171875,
+      "completions/min_length": 38.0,
+      "completions/min_terminated_length": 38.0,
+      "epoch": 0.2096,
+      "grad_norm": 2.937896728515625,
+      "kl": 0.033477783203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0197,
+      "num_tokens": 3460890.0,
+      "reward": 0.05950773134827614,
+      "reward_std": 0.017293047159910202,
+      "rewards/bleu_reward_func/mean": 0.05950773134827614,
+      "rewards/bleu_reward_func/std": 0.04094443470239639,
+      "step": 262
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 455.0,
+      "completions/max_terminated_length": 455.0,
+      "completions/mean_length": 135.40625,
+      "completions/mean_terminated_length": 135.40625,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.2104,
+      "grad_norm": 8.865147590637207,
+      "kl": 0.2249755859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0363,
+      "num_tokens": 3475103.0,
+      "reward": 0.20508863031864166,
+      "reward_std": 0.040958937257528305,
+      "rewards/bleu_reward_func/mean": 0.20508863031864166,
+      "rewards/bleu_reward_func/std": 0.14616157114505768,
+      "step": 263
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 267.8125,
+      "completions/mean_terminated_length": 186.4166717529297,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.2112,
+      "grad_norm": 9.684611320495605,
+      "kl": 0.241973876953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0071,
+      "num_tokens": 3487049.0,
+      "reward": 0.098166324198246,
+      "reward_std": 0.040819209069013596,
+      "rewards/bleu_reward_func/mean": 0.098166324198246,
+      "rewards/bleu_reward_func/std": 0.08471043407917023,
+      "step": 264
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 421.0,
+      "completions/mean_length": 260.875,
+      "completions/mean_terminated_length": 129.3333282470703,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.212,
+      "grad_norm": 10.798442840576172,
+      "kl": 0.1309814453125,
+      "learning_rate": 1e-06,
+      "loss": 0.3087,
+      "num_tokens": 3501029.0,
+      "reward": 0.12524467706680298,
+      "reward_std": 0.05395754426717758,
+      "rewards/bleu_reward_func/mean": 0.12524467706680298,
+      "rewards/bleu_reward_func/std": 0.1178852915763855,
+      "step": 265
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 477.0,
+      "completions/mean_length": 205.71875,
+      "completions/mean_terminated_length": 103.625,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.2128,
+      "grad_norm": 6.346302032470703,
+      "kl": 0.132049560546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0884,
+      "num_tokens": 3511372.0,
+      "reward": 0.10632273554801941,
+      "reward_std": 0.041688427329063416,
+      "rewards/bleu_reward_func/mean": 0.10632273554801941,
+      "rewards/bleu_reward_func/std": 0.09963962435722351,
+      "step": 266
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 231.4375,
+      "completions/mean_terminated_length": 137.9166717529297,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.2136,
+      "grad_norm": 9.553611755371094,
+      "kl": 0.2503204345703125,
+      "learning_rate": 1e-06,
+      "loss": -0.085,
+      "num_tokens": 3524570.0,
+      "reward": 0.08381873369216919,
+      "reward_std": 0.026928268373012543,
+      "rewards/bleu_reward_func/mean": 0.08381873369216919,
+      "rewards/bleu_reward_func/std": 0.06075910106301308,
+      "step": 267
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 504.0,
+      "completions/mean_length": 266.8125,
+      "completions/mean_terminated_length": 198.1599884033203,
+      "completions/min_length": 28.0,
+      "completions/min_terminated_length": 28.0,
+      "epoch": 0.2144,
+      "grad_norm": 5.0754289627075195,
+      "kl": 0.09002685546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0985,
+      "num_tokens": 3535156.0,
+      "reward": 0.04936995357275009,
+      "reward_std": 0.02683193050324917,
+      "rewards/bleu_reward_func/mean": 0.04936995357275009,
+      "rewards/bleu_reward_func/std": 0.05894342064857483,
+      "step": 268
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 423.0,
+      "completions/mean_length": 432.5,
+      "completions/mean_terminated_length": 300.0,
+      "completions/min_length": 174.0,
+      "completions/min_terminated_length": 174.0,
+      "epoch": 0.2152,
+      "grad_norm": 2.118546724319458,
+      "kl": 0.0207061767578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0481,
+      "num_tokens": 3555804.0,
+      "reward": 0.05241474509239197,
+      "reward_std": 0.019338509067893028,
+      "rewards/bleu_reward_func/mean": 0.05241474509239197,
+      "rewards/bleu_reward_func/std": 0.06824250519275665,
+      "step": 269
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 272.625,
+      "completions/mean_terminated_length": 192.83334350585938,
+      "completions/min_length": 41.0,
+      "completions/min_terminated_length": 41.0,
+      "epoch": 0.216,
+      "grad_norm": 3.938976526260376,
+      "kl": 0.042999267578125,
+      "learning_rate": 1e-06,
+      "loss": -0.1052,
+      "num_tokens": 3572328.0,
+      "reward": 0.21750634908676147,
+      "reward_std": 0.06779822707176208,
+      "rewards/bleu_reward_func/mean": 0.21750634908676147,
+      "rewards/bleu_reward_func/std": 0.28914642333984375,
+      "step": 270
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 475.0,
+      "completions/mean_length": 293.3125,
+      "completions/mean_terminated_length": 178.76190185546875,
+      "completions/min_length": 40.0,
+      "completions/min_terminated_length": 40.0,
+      "epoch": 0.2168,
+      "grad_norm": 3.788853645324707,
+      "kl": 0.041900634765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0279,
+      "num_tokens": 3587050.0,
+      "reward": 0.04385410249233246,
+      "reward_std": 0.030311163514852524,
+      "rewards/bleu_reward_func/mean": 0.04385410249233246,
+      "rewards/bleu_reward_func/std": 0.047958169132471085,
+      "step": 271
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 500.0,
+      "completions/mean_length": 232.25,
+      "completions/mean_terminated_length": 139.0,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.2176,
+      "grad_norm": 12.908583641052246,
+      "kl": 0.464263916015625,
+      "learning_rate": 1e-06,
+      "loss": 0.2404,
+      "num_tokens": 3598874.0,
+      "reward": 0.1504618227481842,
+      "reward_std": 0.04004389047622681,
+      "rewards/bleu_reward_func/mean": 0.1504618227481842,
+      "rewards/bleu_reward_func/std": 0.16537794470787048,
+      "step": 272
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 475.0,
+      "completions/mean_length": 209.03125,
+      "completions/mean_terminated_length": 199.258056640625,
+      "completions/min_length": 30.0,
+      "completions/min_terminated_length": 30.0,
+      "epoch": 0.2184,
+      "grad_norm": 6.985334873199463,
+      "kl": 0.16595458984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0408,
+      "num_tokens": 3610515.0,
+      "reward": 0.21218228340148926,
+      "reward_std": 0.09676108509302139,
+      "rewards/bleu_reward_func/mean": 0.21218228340148926,
+      "rewards/bleu_reward_func/std": 0.22182048857212067,
+      "step": 273
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 465.0,
+      "completions/mean_length": 333.9375,
+      "completions/mean_terminated_length": 195.44444274902344,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "epoch": 0.2192,
+      "grad_norm": 3.482099771499634,
+      "kl": 0.03741455078125,
+      "learning_rate": 1e-06,
+      "loss": -0.1819,
+      "num_tokens": 3623921.0,
+      "reward": 0.11982771754264832,
+      "reward_std": 0.063297338783741,
+      "rewards/bleu_reward_func/mean": 0.11982771754264832,
+      "rewards/bleu_reward_func/std": 0.09915972501039505,
+      "step": 274
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 496.0,
+      "completions/mean_length": 292.90625,
+      "completions/mean_terminated_length": 261.6071472167969,
+      "completions/min_length": 58.0,
+      "completions/min_terminated_length": 58.0,
+      "epoch": 0.22,
+      "grad_norm": 4.1525559425354,
+      "kl": 0.041351318359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0478,
+      "num_tokens": 3634918.0,
+      "reward": 0.05305434763431549,
+      "reward_std": 0.019571729004383087,
+      "rewards/bleu_reward_func/mean": 0.05305434763431549,
+      "rewards/bleu_reward_func/std": 0.04326590150594711,
+      "step": 275
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 488.0,
+      "completions/mean_length": 229.59375,
+      "completions/mean_terminated_length": 135.45834350585938,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.2208,
+      "grad_norm": 14.463852882385254,
+      "kl": 0.257415771484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0433,
+      "num_tokens": 3647289.0,
+      "reward": 0.23456689715385437,
+      "reward_std": 0.08336643874645233,
+      "rewards/bleu_reward_func/mean": 0.23456689715385437,
+      "rewards/bleu_reward_func/std": 0.2258531004190445,
+      "step": 276
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 438.0,
+      "completions/mean_length": 193.375,
+      "completions/mean_terminated_length": 87.16667175292969,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.2216,
+      "grad_norm": 21.709369659423828,
+      "kl": 0.1391448974609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0033,
+      "num_tokens": 3656709.0,
+      "reward": 0.16775630414485931,
+      "reward_std": 0.03647792339324951,
+      "rewards/bleu_reward_func/mean": 0.16775630414485931,
+      "rewards/bleu_reward_func/std": 0.15713484585285187,
+      "step": 277
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 468.0,
+      "completions/mean_length": 219.78125,
+      "completions/mean_terminated_length": 122.375,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.2224,
+      "grad_norm": 7.275771141052246,
+      "kl": 0.22491455078125,
+      "learning_rate": 1e-06,
+      "loss": 0.072,
+      "num_tokens": 3670158.0,
+      "reward": 0.1231408566236496,
+      "reward_std": 0.022272268310189247,
+      "rewards/bleu_reward_func/mean": 0.1231408566236496,
+      "rewards/bleu_reward_func/std": 0.1077708899974823,
+      "step": 278
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 464.0,
+      "completions/mean_length": 339.46875,
+      "completions/mean_terminated_length": 261.04547119140625,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.2232,
+      "grad_norm": 3.146303176879883,
+      "kl": 0.0390625,
+      "learning_rate": 1e-06,
+      "loss": -0.1333,
+      "num_tokens": 3683925.0,
+      "reward": 0.0675458312034607,
+      "reward_std": 0.017428681254386902,
+      "rewards/bleu_reward_func/mean": 0.0675458312034607,
+      "rewards/bleu_reward_func/std": 0.05334463343024254,
+      "step": 279
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 484.0,
+      "completions/mean_length": 321.9375,
+      "completions/mean_terminated_length": 268.7200012207031,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.224,
+      "grad_norm": 8.726150512695312,
+      "kl": 0.156707763671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0175,
+      "num_tokens": 3699747.0,
+      "reward": 0.11248552799224854,
+      "reward_std": 0.03111671656370163,
+      "rewards/bleu_reward_func/mean": 0.11248552799224854,
+      "rewards/bleu_reward_func/std": 0.08908119797706604,
+      "step": 280
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 493.0,
+      "completions/mean_length": 297.375,
+      "completions/mean_terminated_length": 247.84616088867188,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.2248,
+      "grad_norm": 3.081026077270508,
+      "kl": 0.021881103515625,
+      "learning_rate": 1e-06,
+      "loss": -0.1217,
+      "num_tokens": 3712759.0,
+      "reward": 0.09313205629587173,
+      "reward_std": 0.03823218122124672,
+      "rewards/bleu_reward_func/mean": 0.09313205629587173,
+      "rewards/bleu_reward_func/std": 0.06713149696588516,
+      "step": 281
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 490.0,
+      "completions/mean_length": 316.46875,
+      "completions/mean_terminated_length": 251.2916717529297,
+      "completions/min_length": 50.0,
+      "completions/min_terminated_length": 50.0,
+      "epoch": 0.2256,
+      "grad_norm": 3.1275222301483154,
+      "kl": 0.035675048828125,
+      "learning_rate": 1e-06,
+      "loss": -0.1645,
+      "num_tokens": 3725598.0,
+      "reward": 0.032498396933078766,
+      "reward_std": 0.018658628687262535,
+      "rewards/bleu_reward_func/mean": 0.032498396933078766,
+      "rewards/bleu_reward_func/std": 0.019405974075198174,
+      "step": 282
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 481.0,
+      "completions/mean_length": 194.71875,
+      "completions/mean_terminated_length": 173.56668090820312,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.2264,
+      "grad_norm": 4.276275634765625,
+      "kl": 0.08013916015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0746,
+      "num_tokens": 3736861.0,
+      "reward": 0.12694165110588074,
+      "reward_std": 0.04432743415236473,
+      "rewards/bleu_reward_func/mean": 0.12694165110588074,
+      "rewards/bleu_reward_func/std": 0.13188457489013672,
+      "step": 283
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 501.0,
+      "completions/mean_length": 268.375,
+      "completions/mean_terminated_length": 101.68421173095703,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.2272,
+      "grad_norm": 7.712943077087402,
+      "kl": 0.271392822265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0787,
+      "num_tokens": 3751545.0,
+      "reward": 0.1655203104019165,
+      "reward_std": 0.08383054286241531,
+      "rewards/bleu_reward_func/mean": 0.1655203104019165,
+      "rewards/bleu_reward_func/std": 0.1525241732597351,
+      "step": 284
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 451.0,
+      "completions/mean_length": 342.125,
+      "completions/mean_terminated_length": 225.89474487304688,
+      "completions/min_length": 49.0,
+      "completions/min_terminated_length": 49.0,
+      "epoch": 0.228,
+      "grad_norm": 3.6280434131622314,
+      "kl": 0.046173095703125,
+      "learning_rate": 1e-06,
+      "loss": -0.1072,
+      "num_tokens": 3765781.0,
+      "reward": 0.042814724147319794,
+      "reward_std": 0.026553209871053696,
+      "rewards/bleu_reward_func/mean": 0.042814724147319794,
+      "rewards/bleu_reward_func/std": 0.03911494091153145,
+      "step": 285
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 489.0,
+      "completions/mean_length": 208.96875,
+      "completions/mean_terminated_length": 199.19354248046875,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.2288,
+      "grad_norm": 7.985737323760986,
+      "kl": 0.12091064453125,
+      "learning_rate": 1e-06,
+      "loss": -0.1044,
+      "num_tokens": 3779220.0,
+      "reward": 0.11331084370613098,
+      "reward_std": 0.025679122656583786,
+      "rewards/bleu_reward_func/mean": 0.11331084370613098,
+      "rewards/bleu_reward_func/std": 0.16165612637996674,
+      "step": 286
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 487.0,
+      "completions/mean_length": 351.3125,
+      "completions/mean_terminated_length": 209.5294189453125,
+      "completions/min_length": 31.0,
+      "completions/min_terminated_length": 31.0,
+      "epoch": 0.2296,
+      "grad_norm": 3.912679433822632,
+      "kl": 0.026214599609375,
+      "learning_rate": 1e-06,
+      "loss": 0.1208,
+      "num_tokens": 3794550.0,
+      "reward": 0.01693039759993553,
+      "reward_std": 0.0203933697193861,
+      "rewards/bleu_reward_func/mean": 0.01693039759993553,
+      "rewards/bleu_reward_func/std": 0.02536601759493351,
+      "step": 287
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 453.0,
+      "completions/mean_length": 274.65625,
+      "completions/mean_terminated_length": 240.75001525878906,
+      "completions/min_length": 33.0,
+      "completions/min_terminated_length": 33.0,
+      "epoch": 0.2304,
+      "grad_norm": 6.236807346343994,
+      "kl": 0.1007537841796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0844,
+      "num_tokens": 3808595.0,
+      "reward": 0.13739125430583954,
+      "reward_std": 0.042728863656520844,
+      "rewards/bleu_reward_func/mean": 0.13739125430583954,
+      "rewards/bleu_reward_func/std": 0.09978168457746506,
+      "step": 288
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 443.0,
+      "completions/mean_length": 260.15625,
+      "completions/mean_terminated_length": 87.84210968017578,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "epoch": 0.2312,
+      "grad_norm": 12.087539672851562,
+      "kl": 0.2603912353515625,
+      "learning_rate": 1e-06,
+      "loss": 0.1979,
+      "num_tokens": 3821552.0,
+      "reward": 0.1537414938211441,
+      "reward_std": 0.04864966496825218,
+      "rewards/bleu_reward_func/mean": 0.1537414938211441,
+      "rewards/bleu_reward_func/std": 0.08011970669031143,
+      "step": 289
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 274.0,
+      "completions/mean_length": 214.40625,
+      "completions/mean_terminated_length": 79.13636779785156,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.232,
+      "grad_norm": 6.511635780334473,
+      "kl": 0.12432861328125,
+      "learning_rate": 1e-06,
+      "loss": 0.2609,
+      "num_tokens": 3838117.0,
+      "reward": 0.19495005905628204,
+      "reward_std": 0.09461250901222229,
+      "rewards/bleu_reward_func/mean": 0.19495005905628204,
+      "rewards/bleu_reward_func/std": 0.20672400295734406,
+      "step": 290
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 504.0,
+      "completions/mean_length": 322.78125,
+      "completions/mean_terminated_length": 223.6666717529297,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.2328,
+      "grad_norm": 4.590160369873047,
+      "kl": 0.127716064453125,
+      "learning_rate": 1e-06,
+      "loss": -0.1225,
+      "num_tokens": 3853958.0,
+      "reward": 0.1360878348350525,
+      "reward_std": 0.03053300268948078,
+      "rewards/bleu_reward_func/mean": 0.1360878348350525,
+      "rewards/bleu_reward_func/std": 0.17878462374210358,
+      "step": 291
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 457.0,
+      "completions/mean_length": 462.6875,
+      "completions/mean_terminated_length": 390.6153869628906,
+      "completions/min_length": 315.0,
+      "completions/min_terminated_length": 315.0,
+      "epoch": 0.2336,
+      "grad_norm": 2.3334367275238037,
+      "kl": 0.030426025390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0405,
+      "num_tokens": 3875596.0,
+      "reward": 0.06421424448490143,
+      "reward_std": 0.02072659507393837,
+      "rewards/bleu_reward_func/mean": 0.06421424448490143,
+      "rewards/bleu_reward_func/std": 0.02574257366359234,
+      "step": 292
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 490.0,
+      "completions/mean_length": 325.03125,
+      "completions/mean_terminated_length": 251.86956787109375,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.2344,
+      "grad_norm": 3.4702064990997314,
+      "kl": 0.03289794921875,
+      "learning_rate": 1e-06,
+      "loss": 0.1122,
+      "num_tokens": 3892021.0,
+      "reward": 0.04875369742512703,
+      "reward_std": 0.020287783816456795,
+      "rewards/bleu_reward_func/mean": 0.04875369742512703,
+      "rewards/bleu_reward_func/std": 0.0285445898771286,
+      "step": 293
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 368.0,
+      "completions/mean_length": 107.375,
+      "completions/mean_terminated_length": 94.32257843017578,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.2352,
+      "grad_norm": 25.415559768676758,
+      "kl": 0.23876953125,
+      "learning_rate": 1e-06,
+      "loss": 0.1099,
+      "num_tokens": 3903457.0,
+      "reward": 0.12372880429029465,
+      "reward_std": 0.02668173238635063,
+      "rewards/bleu_reward_func/mean": 0.12372880429029465,
+      "rewards/bleu_reward_func/std": 0.12391357123851776,
+      "step": 294
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 489.0,
+      "completions/mean_length": 244.09375,
+      "completions/mean_terminated_length": 122.31818389892578,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.236,
+      "grad_norm": 8.319884300231934,
+      "kl": 0.14251708984375,
+      "learning_rate": 1e-06,
+      "loss": -0.034,
+      "num_tokens": 3917028.0,
+      "reward": 0.16006486117839813,
+      "reward_std": 0.02584708109498024,
+      "rewards/bleu_reward_func/mean": 0.16006486117839813,
+      "rewards/bleu_reward_func/std": 0.1484500914812088,
+      "step": 295
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 502.0,
+      "completions/mean_length": 278.875,
+      "completions/mean_terminated_length": 139.0,
+      "completions/min_length": 27.0,
+      "completions/min_terminated_length": 27.0,
+      "epoch": 0.2368,
+      "grad_norm": 4.291149616241455,
+      "kl": 0.131500244140625,
+      "learning_rate": 1e-06,
+      "loss": -0.192,
+      "num_tokens": 3929400.0,
+      "reward": 0.09954051673412323,
+      "reward_std": 0.03838299959897995,
+      "rewards/bleu_reward_func/mean": 0.09954051673412323,
+      "rewards/bleu_reward_func/std": 0.13533763587474823,
+      "step": 296
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 486.0,
+      "completions/mean_length": 288.9375,
+      "completions/mean_terminated_length": 187.5454559326172,
+      "completions/min_length": 3.0,
+      "completions/min_terminated_length": 3.0,
+      "epoch": 0.2376,
+      "grad_norm": 5.0546417236328125,
+      "kl": 0.10858154296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0687,
+      "num_tokens": 3942222.0,
+      "reward": 0.16907253861427307,
+      "reward_std": 0.03968513384461403,
+      "rewards/bleu_reward_func/mean": 0.16907253861427307,
+      "rewards/bleu_reward_func/std": 0.10800375789403915,
+      "step": 297
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 309.1875,
+      "completions/mean_terminated_length": 151.44444274902344,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.2384,
+      "grad_norm": 8.339001655578613,
+      "kl": 0.1490631103515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0121,
+      "num_tokens": 3954316.0,
+      "reward": 0.06681232899427414,
+      "reward_std": 0.015474791638553143,
+      "rewards/bleu_reward_func/mean": 0.06681232899427414,
+      "rewards/bleu_reward_func/std": 0.06617429107427597,
+      "step": 298
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 498.0,
+      "completions/mean_length": 285.59375,
+      "completions/mean_terminated_length": 109.5,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.2392,
+      "grad_norm": 3.8715662956237793,
+      "kl": 0.050140380859375,
+      "learning_rate": 1e-06,
+      "loss": 0.1729,
+      "num_tokens": 3967087.0,
+      "reward": 0.1066230833530426,
+      "reward_std": 0.08889298141002655,
+      "rewards/bleu_reward_func/mean": 0.1066230833530426,
+      "rewards/bleu_reward_func/std": 0.14223438501358032,
+      "step": 299
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 361.0,
+      "completions/max_terminated_length": 361.0,
+      "completions/mean_length": 120.9375,
+      "completions/mean_terminated_length": 120.9375,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.24,
+      "grad_norm": 9.271559715270996,
+      "kl": 0.223388671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0531,
+      "num_tokens": 3977797.0,
+      "reward": 0.09239183366298676,
+      "reward_std": 0.04012807458639145,
+      "rewards/bleu_reward_func/mean": 0.09239183366298676,
+      "rewards/bleu_reward_func/std": 0.07950045168399811,
+      "step": 300
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 492.0,
+      "completions/mean_length": 270.0,
+      "completions/mean_terminated_length": 202.239990234375,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "epoch": 0.2408,
+      "grad_norm": 8.53159236907959,
+      "kl": 0.18048095703125,
+      "learning_rate": 1e-06,
+      "loss": -0.1823,
+      "num_tokens": 3988157.0,
+      "reward": 0.04499006271362305,
+      "reward_std": 0.015048853121697903,
+      "rewards/bleu_reward_func/mean": 0.04499006271362305,
+      "rewards/bleu_reward_func/std": 0.036676883697509766,
+      "step": 301
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 510.0,
+      "completions/mean_length": 224.0625,
+      "completions/mean_terminated_length": 157.61538696289062,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.2416,
+      "grad_norm": 6.0366997718811035,
+      "kl": 0.099029541015625,
+      "learning_rate": 1e-06,
+      "loss": -0.1824,
+      "num_tokens": 4000135.0,
+      "reward": 0.1630059778690338,
+      "reward_std": 0.04720958322286606,
+      "rewards/bleu_reward_func/mean": 0.1630059778690338,
+      "rewards/bleu_reward_func/std": 0.1834760457277298,
+      "step": 302
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 486.0,
+      "completions/mean_length": 169.3125,
+      "completions/mean_terminated_length": 90.23077392578125,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.2424,
+      "grad_norm": 9.543852806091309,
+      "kl": 0.35198974609375,
+      "learning_rate": 1e-06,
+      "loss": -0.2399,
+      "num_tokens": 4009009.0,
+      "reward": 0.06052142754197121,
+      "reward_std": 0.026765264570713043,
+      "rewards/bleu_reward_func/mean": 0.06052142754197121,
+      "rewards/bleu_reward_func/std": 0.052253786474466324,
+      "step": 303
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 365.78125,
+      "completions/mean_terminated_length": 265.7368469238281,
+      "completions/min_length": 50.0,
+      "completions/min_terminated_length": 50.0,
+      "epoch": 0.2432,
+      "grad_norm": 3.007157564163208,
+      "kl": 0.0393524169921875,
+      "learning_rate": 1e-06,
+      "loss": 0.032,
+      "num_tokens": 4023690.0,
+      "reward": 0.025675857439637184,
+      "reward_std": 0.013720525428652763,
+      "rewards/bleu_reward_func/mean": 0.025675857439637184,
+      "rewards/bleu_reward_func/std": 0.022033939138054848,
+      "step": 304
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 506.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 158.6875,
+      "completions/mean_terminated_length": 158.6875,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.244,
+      "grad_norm": 7.10622501373291,
+      "kl": 0.21661376953125,
+      "learning_rate": 1e-06,
+      "loss": 0.166,
+      "num_tokens": 4033848.0,
+      "reward": 0.19492439925670624,
+      "reward_std": 0.0628402829170227,
+      "rewards/bleu_reward_func/mean": 0.19492439925670624,
+      "rewards/bleu_reward_func/std": 0.22491495311260223,
+      "step": 305
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 484.0,
+      "completions/mean_length": 345.53125,
+      "completions/mean_terminated_length": 290.04168701171875,
+      "completions/min_length": 42.0,
+      "completions/min_terminated_length": 42.0,
+      "epoch": 0.2448,
+      "grad_norm": 4.572328090667725,
+      "kl": 0.099700927734375,
+      "learning_rate": 1e-06,
+      "loss": 0.1016,
+      "num_tokens": 4047897.0,
+      "reward": 0.12647973001003265,
+      "reward_std": 0.03362637385725975,
+      "rewards/bleu_reward_func/mean": 0.12647973001003265,
+      "rewards/bleu_reward_func/std": 0.08024211972951889,
+      "step": 306
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 497.0,
+      "completions/mean_length": 217.6875,
+      "completions/mean_terminated_length": 175.6428680419922,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.2456,
+      "grad_norm": 7.489211082458496,
+      "kl": 0.17584228515625,
+      "learning_rate": 1e-06,
+      "loss": -0.1361,
+      "num_tokens": 4062471.0,
+      "reward": 0.15859398245811462,
+      "reward_std": 0.059820279479026794,
+      "rewards/bleu_reward_func/mean": 0.15859398245811462,
+      "rewards/bleu_reward_func/std": 0.11927466094493866,
+      "step": 307
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 304.0625,
+      "completions/mean_terminated_length": 209.5454559326172,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.2464,
+      "grad_norm": 6.605251789093018,
+      "kl": 0.15716552734375,
+      "learning_rate": 1e-06,
+      "loss": 0.1835,
+      "num_tokens": 4079553.0,
+      "reward": 0.048189468681812286,
+      "reward_std": 0.01783904619514942,
+      "rewards/bleu_reward_func/mean": 0.048189468681812286,
+      "rewards/bleu_reward_func/std": 0.037260618060827255,
+      "step": 308
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 422.0,
+      "completions/mean_length": 150.46875,
+      "completions/mean_terminated_length": 67.03846740722656,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.2472,
+      "grad_norm": 20.150175094604492,
+      "kl": 0.31695556640625,
+      "learning_rate": 1e-06,
+      "loss": -0.2639,
+      "num_tokens": 4089536.0,
+      "reward": 0.19017143547534943,
+      "reward_std": 0.06138678267598152,
+      "rewards/bleu_reward_func/mean": 0.19017143547534943,
+      "rewards/bleu_reward_func/std": 0.25128865242004395,
+      "step": 309
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 478.0,
+      "completions/mean_length": 171.3125,
+      "completions/mean_terminated_length": 136.0689697265625,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.248,
+      "grad_norm": 6.626379013061523,
+      "kl": 0.103729248046875,
+      "learning_rate": 1e-06,
+      "loss": -0.1912,
+      "num_tokens": 4100146.0,
+      "reward": 0.08903198689222336,
+      "reward_std": 0.029232412576675415,
+      "rewards/bleu_reward_func/mean": 0.08903198689222336,
+      "rewards/bleu_reward_func/std": 0.09126507490873337,
+      "step": 310
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 481.0,
+      "completions/mean_length": 216.46875,
+      "completions/mean_terminated_length": 117.95833587646484,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.2488,
+      "grad_norm": 5.2524285316467285,
+      "kl": 0.0589141845703125,
+      "learning_rate": 1e-06,
+      "loss": 0.3318,
+      "num_tokens": 4112841.0,
+      "reward": 0.07349678874015808,
+      "reward_std": 0.05337782949209213,
+      "rewards/bleu_reward_func/mean": 0.07349678874015808,
+      "rewards/bleu_reward_func/std": 0.10531707108020782,
+      "step": 311
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 489.0,
+      "completions/mean_length": 285.0,
+      "completions/mean_terminated_length": 196.17391967773438,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.2496,
+      "grad_norm": 5.209020137786865,
+      "kl": 0.11212158203125,
+      "learning_rate": 1e-06,
+      "loss": -0.1362,
+      "num_tokens": 4125369.0,
+      "reward": 0.1321243941783905,
+      "reward_std": 0.035379908978939056,
+      "rewards/bleu_reward_func/mean": 0.1321243941783905,
+      "rewards/bleu_reward_func/std": 0.12779219448566437,
+      "step": 312
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 397.1875,
+      "completions/mean_terminated_length": 205.83334350585938,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.2504,
+      "grad_norm": 2.491729974746704,
+      "kl": 0.029266357421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0819,
+      "num_tokens": 4142671.0,
+      "reward": 0.021221335977315903,
+      "reward_std": 0.008927191607654095,
+      "rewards/bleu_reward_func/mean": 0.021221335977315903,
+      "rewards/bleu_reward_func/std": 0.01940017379820347,
+      "step": 313
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 479.0,
+      "completions/mean_length": 276.8125,
+      "completions/mean_terminated_length": 222.53846740722656,
+      "completions/min_length": 73.0,
+      "completions/min_terminated_length": 73.0,
+      "epoch": 0.2512,
+      "grad_norm": 3.1302947998046875,
+      "kl": 0.030731201171875,
+      "learning_rate": 1e-06,
+      "loss": -0.09,
+      "num_tokens": 4158681.0,
+      "reward": 0.18806447088718414,
+      "reward_std": 0.04276939481496811,
+      "rewards/bleu_reward_func/mean": 0.18806447088718414,
+      "rewards/bleu_reward_func/std": 0.2711097002029419,
+      "step": 314
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 202.71875,
+      "completions/mean_terminated_length": 182.10000610351562,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.252,
+      "grad_norm": 9.11577320098877,
+      "kl": 0.321502685546875,
+      "learning_rate": 1e-06,
+      "loss": 0.2469,
+      "num_tokens": 4168304.0,
+      "reward": 0.17324072122573853,
+      "reward_std": 0.07514998316764832,
+      "rewards/bleu_reward_func/mean": 0.17324072122573853,
+      "rewards/bleu_reward_func/std": 0.15059800446033478,
+      "step": 315
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 457.0,
+      "completions/mean_length": 310.65625,
+      "completions/mean_terminated_length": 133.0,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.2528,
+      "grad_norm": 4.476902961730957,
+      "kl": 0.22100830078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0695,
+      "num_tokens": 4183237.0,
+      "reward": 0.11044389009475708,
+      "reward_std": 0.04662460461258888,
+      "rewards/bleu_reward_func/mean": 0.11044389009475708,
+      "rewards/bleu_reward_func/std": 0.13189704716205597,
+      "step": 316
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 326.40625,
+      "completions/mean_terminated_length": 264.54168701171875,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.2536,
+      "grad_norm": 4.724470138549805,
+      "kl": 0.039764404296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0133,
+      "num_tokens": 4196786.0,
+      "reward": 0.1738719940185547,
+      "reward_std": 0.06735121458768845,
+      "rewards/bleu_reward_func/mean": 0.1738719940185547,
+      "rewards/bleu_reward_func/std": 0.15234871208667755,
+      "step": 317
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 264.65625,
+      "completions/mean_terminated_length": 116.25,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.2544,
+      "grad_norm": 7.755268096923828,
+      "kl": 0.23388671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0091,
+      "num_tokens": 4211319.0,
+      "reward": 0.16174694895744324,
+      "reward_std": 0.04472574219107628,
+      "rewards/bleu_reward_func/mean": 0.16174694895744324,
+      "rewards/bleu_reward_func/std": 0.13533204793930054,
+      "step": 318
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 326.0625,
+      "completions/mean_terminated_length": 228.6666717529297,
+      "completions/min_length": 46.0,
+      "completions/min_terminated_length": 46.0,
+      "epoch": 0.2552,
+      "grad_norm": 3.5100746154785156,
+      "kl": 0.0638427734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0174,
+      "num_tokens": 4224641.0,
+      "reward": 0.14605101943016052,
+      "reward_std": 0.039064351469278336,
+      "rewards/bleu_reward_func/mean": 0.14605101943016052,
+      "rewards/bleu_reward_func/std": 0.1437525898218155,
+      "step": 319
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 465.0,
+      "completions/mean_length": 387.5,
+      "completions/mean_terminated_length": 180.0,
+      "completions/min_length": 39.0,
+      "completions/min_terminated_length": 39.0,
+      "epoch": 0.256,
+      "grad_norm": 3.499901056289673,
+      "kl": 0.03240966796875,
+      "learning_rate": 1e-06,
+      "loss": -0.101,
+      "num_tokens": 4244041.0,
+      "reward": 0.038129642605781555,
+      "reward_std": 0.0157744400203228,
+      "rewards/bleu_reward_func/mean": 0.038129642605781555,
+      "rewards/bleu_reward_func/std": 0.030961766839027405,
+      "step": 320
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 485.0,
+      "completions/mean_length": 235.75,
+      "completions/mean_terminated_length": 207.1724090576172,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.2568,
+      "grad_norm": 6.800954341888428,
+      "kl": 0.172210693359375,
+      "learning_rate": 1e-06,
+      "loss": -0.2682,
+      "num_tokens": 4257425.0,
+      "reward": 0.08078090846538544,
+      "reward_std": 0.0318281352519989,
+      "rewards/bleu_reward_func/mean": 0.08078090846538544,
+      "rewards/bleu_reward_func/std": 0.060885149985551834,
+      "step": 321
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 52.0,
+      "completions/mean_length": 153.5,
+      "completions/mean_terminated_length": 34.0,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.2576,
+      "grad_norm": 6.995741367340088,
+      "kl": 0.197662353515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0321,
+      "num_tokens": 4270729.0,
+      "reward": 0.3046156167984009,
+      "reward_std": 0.045112840831279755,
+      "rewards/bleu_reward_func/mean": 0.3046156167984009,
+      "rewards/bleu_reward_func/std": 0.17106564342975616,
+      "step": 322
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 485.0,
+      "completions/mean_length": 309.0625,
+      "completions/mean_terminated_length": 216.8181915283203,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.2584,
+      "grad_norm": 8.159075736999512,
+      "kl": 0.11962890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0599,
+      "num_tokens": 4286907.0,
+      "reward": 0.11749087274074554,
+      "reward_std": 0.04918123036623001,
+      "rewards/bleu_reward_func/mean": 0.11749087274074554,
+      "rewards/bleu_reward_func/std": 0.12518151104450226,
+      "step": 323
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 424.0,
+      "completions/max_terminated_length": 424.0,
+      "completions/mean_length": 156.90625,
+      "completions/mean_terminated_length": 156.90625,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.2592,
+      "grad_norm": 7.079853057861328,
+      "kl": 0.09991455078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0397,
+      "num_tokens": 4295536.0,
+      "reward": 0.11096417158842087,
+      "reward_std": 0.04051455110311508,
+      "rewards/bleu_reward_func/mean": 0.11096417158842087,
+      "rewards/bleu_reward_func/std": 0.1420901119709015,
+      "step": 324
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 464.0,
+      "completions/mean_length": 150.875,
+      "completions/mean_terminated_length": 49.7599983215332,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.26,
+      "grad_norm": 8.065258026123047,
+      "kl": 0.167816162109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0243,
+      "num_tokens": 4306404.0,
+      "reward": 0.13756218552589417,
+      "reward_std": 0.02154640108346939,
+      "rewards/bleu_reward_func/mean": 0.13756218552589417,
+      "rewards/bleu_reward_func/std": 0.14523112773895264,
+      "step": 325
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 445.0,
+      "completions/mean_length": 385.75,
+      "completions/mean_terminated_length": 310.0,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 0.2608,
+      "grad_norm": 2.441365957260132,
+      "kl": 0.019775390625,
+      "learning_rate": 1e-06,
+      "loss": 0.024,
+      "num_tokens": 4323836.0,
+      "reward": 0.023768192157149315,
+      "reward_std": 0.009069718420505524,
+      "rewards/bleu_reward_func/mean": 0.023768192157149315,
+      "rewards/bleu_reward_func/std": 0.029040560126304626,
+      "step": 326
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 319.46875,
+      "completions/mean_terminated_length": 203.9499969482422,
+      "completions/min_length": 33.0,
+      "completions/min_terminated_length": 33.0,
+      "epoch": 0.2616,
+      "grad_norm": 5.7556071281433105,
+      "kl": 0.091705322265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0042,
+      "num_tokens": 4338667.0,
+      "reward": 0.07871399819850922,
+      "reward_std": 0.03653344139456749,
+      "rewards/bleu_reward_func/mean": 0.07871399819850922,
+      "rewards/bleu_reward_func/std": 0.06572794169187546,
+      "step": 327
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 442.0,
+      "completions/mean_length": 264.84375,
+      "completions/mean_terminated_length": 152.5,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.2624,
+      "grad_norm": 6.231250286102295,
+      "kl": 0.1138916015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0458,
+      "num_tokens": 4351270.0,
+      "reward": 0.16190959513187408,
+      "reward_std": 0.02650507725775242,
+      "rewards/bleu_reward_func/mean": 0.16190959513187408,
+      "rewards/bleu_reward_func/std": 0.15018552541732788,
+      "step": 328
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 493.0,
+      "completions/mean_length": 350.625,
+      "completions/mean_terminated_length": 277.2727355957031,
+      "completions/min_length": 65.0,
+      "completions/min_terminated_length": 65.0,
+      "epoch": 0.2632,
+      "grad_norm": 2.828697681427002,
+      "kl": 0.02972412109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0748,
+      "num_tokens": 4366018.0,
+      "reward": 0.07461819052696228,
+      "reward_std": 0.034676797688007355,
+      "rewards/bleu_reward_func/mean": 0.07461819052696228,
+      "rewards/bleu_reward_func/std": 0.10171358287334442,
+      "step": 329
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 447.0,
+      "completions/mean_length": 330.96875,
+      "completions/mean_terminated_length": 171.23529052734375,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.264,
+      "grad_norm": 7.326402187347412,
+      "kl": 0.0977630615234375,
+      "learning_rate": 1e-06,
+      "loss": 0.2768,
+      "num_tokens": 4378353.0,
+      "reward": 0.07485680282115936,
+      "reward_std": 0.04837151616811752,
+      "rewards/bleu_reward_func/mean": 0.07485680282115936,
+      "rewards/bleu_reward_func/std": 0.04874453693628311,
+      "step": 330
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 194.0,
+      "completions/max_terminated_length": 194.0,
+      "completions/mean_length": 65.40625,
+      "completions/mean_terminated_length": 65.40625,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.2648,
+      "grad_norm": 12.08074951171875,
+      "kl": 0.335693359375,
+      "learning_rate": 1e-06,
+      "loss": 0.1573,
+      "num_tokens": 4384062.0,
+      "reward": 0.19588544964790344,
+      "reward_std": 0.09824244678020477,
+      "rewards/bleu_reward_func/mean": 0.19588544964790344,
+      "rewards/bleu_reward_func/std": 0.16972649097442627,
+      "step": 331
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 270.65625,
+      "completions/mean_terminated_length": 203.0800018310547,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "epoch": 0.2656,
+      "grad_norm": 4.561427593231201,
+      "kl": 0.039154052734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0917,
+      "num_tokens": 4394427.0,
+      "reward": 0.06531640887260437,
+      "reward_std": 0.018873782828450203,
+      "rewards/bleu_reward_func/mean": 0.06531640887260437,
+      "rewards/bleu_reward_func/std": 0.059104837477207184,
+      "step": 332
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 200.25,
+      "completions/mean_terminated_length": 155.71429443359375,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.2664,
+      "grad_norm": 6.5239057540893555,
+      "kl": 0.169952392578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0545,
+      "num_tokens": 4409739.0,
+      "reward": 0.23698079586029053,
+      "reward_std": 0.08829502761363983,
+      "rewards/bleu_reward_func/mean": 0.23698079586029053,
+      "rewards/bleu_reward_func/std": 0.2539888322353363,
+      "step": 333
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 142.1875,
+      "completions/mean_terminated_length": 103.93103790283203,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.2672,
+      "grad_norm": 6.988838195800781,
+      "kl": 0.192413330078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0242,
+      "num_tokens": 4420033.0,
+      "reward": 0.18931233882904053,
+      "reward_std": 0.06329823285341263,
+      "rewards/bleu_reward_func/mean": 0.18931233882904053,
+      "rewards/bleu_reward_func/std": 0.16267651319503784,
+      "step": 334
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 455.0,
+      "completions/mean_length": 149.71875,
+      "completions/mean_terminated_length": 66.11538696289062,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.268,
+      "grad_norm": 16.95305061340332,
+      "kl": 0.3260498046875,
+      "learning_rate": 1e-06,
+      "loss": 0.3727,
+      "num_tokens": 4429376.0,
+      "reward": 0.11154920607805252,
+      "reward_std": 0.06479852646589279,
+      "rewards/bleu_reward_func/mean": 0.11154920607805252,
+      "rewards/bleu_reward_func/std": 0.07707681506872177,
+      "step": 335
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 216.125,
+      "completions/mean_terminated_length": 185.51724243164062,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.2688,
+      "grad_norm": 12.891951560974121,
+      "kl": 0.154815673828125,
+      "learning_rate": 1e-06,
+      "loss": 0.1506,
+      "num_tokens": 4438340.0,
+      "reward": 0.11881305277347565,
+      "reward_std": 0.04300341382622719,
+      "rewards/bleu_reward_func/mean": 0.11881305277347565,
+      "rewards/bleu_reward_func/std": 0.11628168076276779,
+      "step": 336
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 490.0,
+      "completions/mean_length": 259.375,
+      "completions/mean_terminated_length": 127.04762268066406,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.2696,
+      "grad_norm": 8.3147554397583,
+      "kl": 0.0980682373046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0185,
+      "num_tokens": 4451328.0,
+      "reward": 0.11791149526834488,
+      "reward_std": 0.02945806086063385,
+      "rewards/bleu_reward_func/mean": 0.11791149526834488,
+      "rewards/bleu_reward_func/std": 0.06387177854776382,
+      "step": 337
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 397.0,
+      "completions/mean_length": 159.28125,
+      "completions/mean_terminated_length": 60.52000045776367,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.2704,
+      "grad_norm": 6.874416828155518,
+      "kl": 0.235107421875,
+      "learning_rate": 1e-06,
+      "loss": 0.1172,
+      "num_tokens": 4461785.0,
+      "reward": 0.18331755697727203,
+      "reward_std": 0.05733542889356613,
+      "rewards/bleu_reward_func/mean": 0.18331755697727203,
+      "rewards/bleu_reward_func/std": 0.17218343913555145,
+      "step": 338
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 206.1875,
+      "completions/mean_terminated_length": 120.55999755859375,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.2712,
+      "grad_norm": 7.444963455200195,
+      "kl": 0.08843994140625,
+      "learning_rate": 1e-06,
+      "loss": 0.3417,
+      "num_tokens": 4471031.0,
+      "reward": 0.08221863210201263,
+      "reward_std": 0.030037853866815567,
+      "rewards/bleu_reward_func/mean": 0.08221863210201263,
+      "rewards/bleu_reward_func/std": 0.05527469143271446,
+      "step": 339
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 485.0,
+      "completions/mean_length": 256.84375,
+      "completions/mean_terminated_length": 197.9615478515625,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.272,
+      "grad_norm": 6855.86328125,
+      "kl": 1.03955078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0411,
+      "num_tokens": 4485834.0,
+      "reward": 0.13405509293079376,
+      "reward_std": 0.03707335144281387,
+      "rewards/bleu_reward_func/mean": 0.13405509293079376,
+      "rewards/bleu_reward_func/std": 0.15687085688114166,
+      "step": 340
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 388.0,
+      "completions/mean_length": 211.0,
+      "completions/mean_terminated_length": 141.53846740722656,
+      "completions/min_length": 36.0,
+      "completions/min_terminated_length": 36.0,
+      "epoch": 0.2728,
+      "grad_norm": 8.717283248901367,
+      "kl": 0.128326416015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0381,
+      "num_tokens": 4496202.0,
+      "reward": 0.0755915641784668,
+      "reward_std": 0.029588045552372932,
+      "rewards/bleu_reward_func/mean": 0.0755915641784668,
+      "rewards/bleu_reward_func/std": 0.05914263799786568,
+      "step": 341
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 416.0,
+      "completions/mean_length": 213.15625,
+      "completions/mean_terminated_length": 129.47999572753906,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.2736,
+      "grad_norm": 9.269394874572754,
+      "kl": 0.21234130859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0787,
+      "num_tokens": 4505447.0,
+      "reward": 0.11310072988271713,
+      "reward_std": 0.035067904740571976,
+      "rewards/bleu_reward_func/mean": 0.11310072988271713,
+      "rewards/bleu_reward_func/std": 0.10819036513566971,
+      "step": 342
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 353.90625,
+      "completions/mean_terminated_length": 174.73333740234375,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.2744,
+      "grad_norm": 6.147165298461914,
+      "kl": 0.0384521484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0413,
+      "num_tokens": 4519052.0,
+      "reward": 0.06785966455936432,
+      "reward_std": 0.039666250348091125,
+      "rewards/bleu_reward_func/mean": 0.06785966455936432,
+      "rewards/bleu_reward_func/std": 0.059012189507484436,
+      "step": 343
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 370.84375,
+      "completions/mean_terminated_length": 261.0555725097656,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.2752,
+      "grad_norm": 6.257096767425537,
+      "kl": 0.170440673828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0397,
+      "num_tokens": 4533975.0,
+      "reward": 0.05020497739315033,
+      "reward_std": 0.009127253666520119,
+      "rewards/bleu_reward_func/mean": 0.05020497739315033,
+      "rewards/bleu_reward_func/std": 0.04745229333639145,
+      "step": 344
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 463.0,
+      "completions/mean_length": 260.1875,
+      "completions/mean_terminated_length": 64.33333587646484,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.276,
+      "grad_norm": 8.694131851196289,
+      "kl": 0.40167236328125,
+      "learning_rate": 1e-06,
+      "loss": 0.124,
+      "num_tokens": 4548765.0,
+      "reward": 0.17815490067005157,
+      "reward_std": 0.04761611297726631,
+      "rewards/bleu_reward_func/mean": 0.17815490067005157,
+      "rewards/bleu_reward_func/std": 0.22018791735172272,
+      "step": 345
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 491.0,
+      "completions/mean_length": 334.78125,
+      "completions/mean_terminated_length": 275.7083435058594,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.2768,
+      "grad_norm": 6.1226325035095215,
+      "kl": 0.105133056640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0251,
+      "num_tokens": 4565158.0,
+      "reward": 0.09645688533782959,
+      "reward_std": 0.0746307447552681,
+      "rewards/bleu_reward_func/mean": 0.09645688533782959,
+      "rewards/bleu_reward_func/std": 0.1715475171804428,
+      "step": 346
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 388.0,
+      "completions/max_terminated_length": 388.0,
+      "completions/mean_length": 98.46875,
+      "completions/mean_terminated_length": 98.46875,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.2776,
+      "grad_norm": 8.647904396057129,
+      "kl": 0.328857421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0264,
+      "num_tokens": 4576637.0,
+      "reward": 0.3595752716064453,
+      "reward_std": 0.09626303613185883,
+      "rewards/bleu_reward_func/mean": 0.3595752716064453,
+      "rewards/bleu_reward_func/std": 0.293544203042984,
+      "step": 347
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 239.0,
+      "completions/mean_length": 191.28125,
+      "completions/mean_terminated_length": 84.375,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.2784,
+      "grad_norm": 7.7827630043029785,
+      "kl": 0.24041748046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0475,
+      "num_tokens": 4586230.0,
+      "reward": 0.2051679939031601,
+      "reward_std": 0.029646433889865875,
+      "rewards/bleu_reward_func/mean": 0.2051679939031601,
+      "rewards/bleu_reward_func/std": 0.20678655803203583,
+      "step": 348
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 498.0,
+      "completions/mean_length": 320.5,
+      "completions/mean_terminated_length": 256.66668701171875,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.2792,
+      "grad_norm": 8.593353271484375,
+      "kl": 0.149017333984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0651,
+      "num_tokens": 4603070.0,
+      "reward": 0.1438911259174347,
+      "reward_std": 0.06431536376476288,
+      "rewards/bleu_reward_func/mean": 0.1438911259174347,
+      "rewards/bleu_reward_func/std": 0.22814705967903137,
+      "step": 349
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 469.0,
+      "completions/mean_length": 381.96875,
+      "completions/mean_terminated_length": 191.92308044433594,
+      "completions/min_length": 30.0,
+      "completions/min_terminated_length": 30.0,
+      "epoch": 0.28,
+      "grad_norm": 2.2874648571014404,
+      "kl": 0.023284912109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0246,
+      "num_tokens": 4619757.0,
+      "reward": 0.19660863280296326,
+      "reward_std": 0.08571420609951019,
+      "rewards/bleu_reward_func/mean": 0.19660863280296326,
+      "rewards/bleu_reward_func/std": 0.2662343680858612,
+      "step": 350
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 380.53125,
+      "completions/mean_terminated_length": 290.5789489746094,
+      "completions/min_length": 109.0,
+      "completions/min_terminated_length": 109.0,
+      "epoch": 0.2808,
+      "grad_norm": 2.8600640296936035,
+      "kl": 0.02679443359375,
+      "learning_rate": 1e-06,
+      "loss": 0.082,
+      "num_tokens": 4636806.0,
+      "reward": 0.05401962995529175,
+      "reward_std": 0.019372381269931793,
+      "rewards/bleu_reward_func/mean": 0.05401962995529175,
+      "rewards/bleu_reward_func/std": 0.026677841320633888,
+      "step": 351
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 328.125,
+      "completions/mean_terminated_length": 244.5454559326172,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.2816,
+      "grad_norm": 6.117258548736572,
+      "kl": 0.10986328125,
+      "learning_rate": 1e-06,
+      "loss": 0.162,
+      "num_tokens": 4649338.0,
+      "reward": 0.12430500984191895,
+      "reward_std": 0.046015314757823944,
+      "rewards/bleu_reward_func/mean": 0.12430500984191895,
+      "rewards/bleu_reward_func/std": 0.11290674656629562,
+      "step": 352
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 487.0,
+      "completions/mean_length": 335.15625,
+      "completions/mean_terminated_length": 179.11764526367188,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.2824,
+      "grad_norm": 9.883430480957031,
+      "kl": 0.134429931640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0897,
+      "num_tokens": 4664823.0,
+      "reward": 0.10318648815155029,
+      "reward_std": 0.040948014706373215,
+      "rewards/bleu_reward_func/mean": 0.10318648815155029,
+      "rewards/bleu_reward_func/std": 0.098084457218647,
+      "step": 353
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 425.0,
+      "completions/mean_length": 213.96875,
+      "completions/mean_terminated_length": 204.35482788085938,
+      "completions/min_length": 47.0,
+      "completions/min_terminated_length": 47.0,
+      "epoch": 0.2832,
+      "grad_norm": 3.7569406032562256,
+      "kl": 0.0567626953125,
+      "learning_rate": 1e-06,
+      "loss": 0.276,
+      "num_tokens": 4673198.0,
+      "reward": 0.02880779653787613,
+      "reward_std": 0.02136135660111904,
+      "rewards/bleu_reward_func/mean": 0.02880779653787613,
+      "rewards/bleu_reward_func/std": 0.031262028962373734,
+      "step": 354
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 503.0,
+      "completions/mean_length": 382.34375,
+      "completions/mean_terminated_length": 134.8181915283203,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.284,
+      "grad_norm": 5.402606010437012,
+      "kl": 0.059539794921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0167,
+      "num_tokens": 4690033.0,
+      "reward": 0.11326944082975388,
+      "reward_std": 0.04008851572871208,
+      "rewards/bleu_reward_func/mean": 0.11326944082975388,
+      "rewards/bleu_reward_func/std": 0.1632446050643921,
+      "step": 355
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 184.0,
+      "completions/max_terminated_length": 184.0,
+      "completions/mean_length": 49.5,
+      "completions/mean_terminated_length": 49.5,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "epoch": 0.2848,
+      "grad_norm": 15.920856475830078,
+      "kl": 0.2000732421875,
+      "learning_rate": 1e-06,
+      "loss": 0.1343,
+      "num_tokens": 4696953.0,
+      "reward": 0.1625998467206955,
+      "reward_std": 0.10141640901565552,
+      "rewards/bleu_reward_func/mean": 0.1625998467206955,
+      "rewards/bleu_reward_func/std": 0.12067051976919174,
+      "step": 356
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 328.0,
+      "completions/max_terminated_length": 328.0,
+      "completions/mean_length": 103.71875,
+      "completions/mean_terminated_length": 103.71875,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.2856,
+      "grad_norm": 32.86006546020508,
+      "kl": 0.153564453125,
+      "learning_rate": 1e-06,
+      "loss": 0.1497,
+      "num_tokens": 4705000.0,
+      "reward": 0.05853947252035141,
+      "reward_std": 0.014492938295006752,
+      "rewards/bleu_reward_func/mean": 0.05853947252035141,
+      "rewards/bleu_reward_func/std": 0.02192818373441696,
+      "step": 357
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 430.0,
+      "completions/max_terminated_length": 430.0,
+      "completions/mean_length": 110.96875,
+      "completions/mean_terminated_length": 110.96875,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "epoch": 0.2864,
+      "grad_norm": 8.785351753234863,
+      "kl": 0.1767578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0291,
+      "num_tokens": 4713815.0,
+      "reward": 0.256367951631546,
+      "reward_std": 0.06547890603542328,
+      "rewards/bleu_reward_func/mean": 0.256367951631546,
+      "rewards/bleu_reward_func/std": 0.2225809097290039,
+      "step": 358
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 387.0,
+      "completions/mean_length": 333.21875,
+      "completions/mean_terminated_length": 175.47059631347656,
+      "completions/min_length": 36.0,
+      "completions/min_terminated_length": 36.0,
+      "epoch": 0.2872,
+      "grad_norm": 3.714874744415283,
+      "kl": 0.0813140869140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0176,
+      "num_tokens": 4732606.0,
+      "reward": 0.08705547451972961,
+      "reward_std": 0.02976841665804386,
+      "rewards/bleu_reward_func/mean": 0.08705547451972961,
+      "rewards/bleu_reward_func/std": 0.041370097547769547,
+      "step": 359
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 474.0,
+      "completions/mean_length": 159.0,
+      "completions/mean_terminated_length": 147.61289978027344,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.288,
+      "grad_norm": 7.568475723266602,
+      "kl": 0.069976806640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0023,
+      "num_tokens": 4742086.0,
+      "reward": 0.05895683914422989,
+      "reward_std": 0.036796510219573975,
+      "rewards/bleu_reward_func/mean": 0.05895683914422989,
+      "rewards/bleu_reward_func/std": 0.06153297796845436,
+      "step": 360
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 401.0,
+      "completions/mean_length": 303.34375,
+      "completions/mean_terminated_length": 221.69566345214844,
+      "completions/min_length": 52.0,
+      "completions/min_terminated_length": 52.0,
+      "epoch": 0.2888,
+      "grad_norm": 3.495642900466919,
+      "kl": 0.033843994140625,
+      "learning_rate": 1e-06,
+      "loss": 0.1195,
+      "num_tokens": 4755153.0,
+      "reward": 0.024642691016197205,
+      "reward_std": 0.00707631791010499,
+      "rewards/bleu_reward_func/mean": 0.024642691016197205,
+      "rewards/bleu_reward_func/std": 0.01350654847919941,
+      "step": 361
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 487.0,
+      "completions/mean_length": 349.59375,
+      "completions/mean_terminated_length": 238.4736785888672,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "epoch": 0.2896,
+      "grad_norm": 3.2497663497924805,
+      "kl": 0.032928466796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0909,
+      "num_tokens": 4768724.0,
+      "reward": 0.06024404242634773,
+      "reward_std": 0.029051221907138824,
+      "rewards/bleu_reward_func/mean": 0.06024404242634773,
+      "rewards/bleu_reward_func/std": 0.05113474279642105,
+      "step": 362
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 490.0,
+      "completions/mean_length": 308.21875,
+      "completions/mean_terminated_length": 201.4761962890625,
+      "completions/min_length": 29.0,
+      "completions/min_terminated_length": 29.0,
+      "epoch": 0.2904,
+      "grad_norm": 3.932180643081665,
+      "kl": 0.0535736083984375,
+      "learning_rate": 1e-06,
+      "loss": -0.1216,
+      "num_tokens": 4784611.0,
+      "reward": 0.10957776010036469,
+      "reward_std": 0.018995165824890137,
+      "rewards/bleu_reward_func/mean": 0.10957776010036469,
+      "rewards/bleu_reward_func/std": 0.12744034826755524,
+      "step": 363
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 467.0,
+      "completions/mean_length": 315.15625,
+      "completions/mean_terminated_length": 260.0400085449219,
+      "completions/min_length": 95.0,
+      "completions/min_terminated_length": 95.0,
+      "epoch": 0.2912,
+      "grad_norm": 3.873363971710205,
+      "kl": 0.04693603515625,
+      "learning_rate": 1e-06,
+      "loss": 0.1157,
+      "num_tokens": 4798600.0,
+      "reward": 0.06850136816501617,
+      "reward_std": 0.03206296265125275,
+      "rewards/bleu_reward_func/mean": 0.06850136816501617,
+      "rewards/bleu_reward_func/std": 0.06299194693565369,
+      "step": 364
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 301.28125,
+      "completions/mean_terminated_length": 205.5,
+      "completions/min_length": 37.0,
+      "completions/min_terminated_length": 37.0,
+      "epoch": 0.292,
+      "grad_norm": 3.491849184036255,
+      "kl": 0.050079345703125,
+      "learning_rate": 1e-06,
+      "loss": 0.1634,
+      "num_tokens": 4812193.0,
+      "reward": 0.0632539913058281,
+      "reward_std": 0.04620906710624695,
+      "rewards/bleu_reward_func/mean": 0.0632539913058281,
+      "rewards/bleu_reward_func/std": 0.08490858227014542,
+      "step": 365
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 446.0,
+      "completions/mean_length": 320.40625,
+      "completions/mean_terminated_length": 266.7599792480469,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.2928,
+      "grad_norm": 10.243452072143555,
+      "kl": 0.1219482421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0432,
+      "num_tokens": 4824134.0,
+      "reward": 0.0788659006357193,
+      "reward_std": 0.019495027139782906,
+      "rewards/bleu_reward_func/mean": 0.0788659006357193,
+      "rewards/bleu_reward_func/std": 0.05461956560611725,
+      "step": 366
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 499.0,
+      "completions/mean_length": 357.40625,
+      "completions/mean_terminated_length": 296.9130554199219,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "epoch": 0.2936,
+      "grad_norm": 2.715989351272583,
+      "kl": 0.036712646484375,
+      "learning_rate": 1e-06,
+      "loss": -0.1141,
+      "num_tokens": 4839219.0,
+      "reward": 0.1387082040309906,
+      "reward_std": 0.025043122470378876,
+      "rewards/bleu_reward_func/mean": 0.1387082040309906,
+      "rewards/bleu_reward_func/std": 0.14657536149024963,
+      "step": 367
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 477.0,
+      "completions/mean_length": 418.0,
+      "completions/mean_terminated_length": 261.3333435058594,
+      "completions/min_length": 116.0,
+      "completions/min_terminated_length": 116.0,
+      "epoch": 0.2944,
+      "grad_norm": 2.414018154144287,
+      "kl": 0.029937744140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0066,
+      "num_tokens": 4857699.0,
+      "reward": 0.06751300394535065,
+      "reward_std": 0.05967854708433151,
+      "rewards/bleu_reward_func/mean": 0.06751300394535065,
+      "rewards/bleu_reward_func/std": 0.08448994904756546,
+      "step": 368
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 414.0,
+      "completions/max_terminated_length": 414.0,
+      "completions/mean_length": 50.09375,
+      "completions/mean_terminated_length": 50.09375,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.2952,
+      "grad_norm": 12.07331657409668,
+      "kl": 0.331298828125,
+      "learning_rate": 1e-06,
+      "loss": -0.167,
+      "num_tokens": 4865766.0,
+      "reward": 0.2235146164894104,
+      "reward_std": 0.06765347719192505,
+      "rewards/bleu_reward_func/mean": 0.2235146164894104,
+      "rewards/bleu_reward_func/std": 0.15006797015666962,
+      "step": 369
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 476.0,
+      "completions/mean_length": 300.125,
+      "completions/mean_terminated_length": 155.15789794921875,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.296,
+      "grad_norm": 6.003938674926758,
+      "kl": 0.062225341796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0295,
+      "num_tokens": 4883786.0,
+      "reward": 0.09686341136693954,
+      "reward_std": 0.04255010187625885,
+      "rewards/bleu_reward_func/mean": 0.09686341136693954,
+      "rewards/bleu_reward_func/std": 0.11752825975418091,
+      "step": 370
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 484.0,
+      "completions/mean_length": 381.84375,
+      "completions/mean_terminated_length": 338.4583435058594,
+      "completions/min_length": 72.0,
+      "completions/min_terminated_length": 72.0,
+      "epoch": 0.2968,
+      "grad_norm": 2.782743215560913,
+      "kl": 0.041107177734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0931,
+      "num_tokens": 4898397.0,
+      "reward": 0.06518180668354034,
+      "reward_std": 0.017261603847146034,
+      "rewards/bleu_reward_func/mean": 0.06518180668354034,
+      "rewards/bleu_reward_func/std": 0.07592527568340302,
+      "step": 371
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 484.0,
+      "completions/mean_length": 191.71875,
+      "completions/mean_terminated_length": 170.36666870117188,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.2976,
+      "grad_norm": 6.834630489349365,
+      "kl": 0.155029296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0674,
+      "num_tokens": 4907564.0,
+      "reward": 0.0751166045665741,
+      "reward_std": 0.03539106994867325,
+      "rewards/bleu_reward_func/mean": 0.0751166045665741,
+      "rewards/bleu_reward_func/std": 0.03759034350514412,
+      "step": 372
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 471.0,
+      "completions/mean_length": 167.65625,
+      "completions/mean_terminated_length": 156.5483856201172,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.2984,
+      "grad_norm": 9.550743103027344,
+      "kl": 0.1636962890625,
+      "learning_rate": 1e-06,
+      "loss": 0.1533,
+      "num_tokens": 4916969.0,
+      "reward": 0.12691722810268402,
+      "reward_std": 0.019398069009184837,
+      "rewards/bleu_reward_func/mean": 0.12691722810268402,
+      "rewards/bleu_reward_func/std": 0.14723701775074005,
+      "step": 373
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 267.6875,
+      "completions/mean_terminated_length": 199.27999877929688,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "epoch": 0.2992,
+      "grad_norm": 5.277988433837891,
+      "kl": 0.143890380859375,
+      "learning_rate": 1e-06,
+      "loss": -0.2024,
+      "num_tokens": 4930367.0,
+      "reward": 0.21388903260231018,
+      "reward_std": 0.0590648353099823,
+      "rewards/bleu_reward_func/mean": 0.21388903260231018,
+      "rewards/bleu_reward_func/std": 0.2627076506614685,
+      "step": 374
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 494.0,
+      "completions/mean_length": 337.59375,
+      "completions/mean_terminated_length": 246.23809814453125,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.3,
+      "grad_norm": 10.797468185424805,
+      "kl": 0.12530517578125,
+      "learning_rate": 1e-06,
+      "loss": -0.06,
+      "num_tokens": 4948002.0,
+      "reward": 0.1380675733089447,
+      "reward_std": 0.049179110676050186,
+      "rewards/bleu_reward_func/mean": 0.1380675733089447,
+      "rewards/bleu_reward_func/std": 0.14962899684906006,
+      "step": 375
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 493.0,
+      "completions/mean_length": 250.375,
+      "completions/mean_terminated_length": 163.1666717529297,
+      "completions/min_length": 28.0,
+      "completions/min_terminated_length": 28.0,
+      "epoch": 0.3008,
+      "grad_norm": 6.259679794311523,
+      "kl": 0.1361083984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0324,
+      "num_tokens": 4963942.0,
+      "reward": 0.2779002785682678,
+      "reward_std": 0.049215167760849,
+      "rewards/bleu_reward_func/mean": 0.2779002785682678,
+      "rewards/bleu_reward_func/std": 0.247111514210701,
+      "step": 376
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 492.0,
+      "completions/mean_length": 228.96875,
+      "completions/mean_terminated_length": 188.5357208251953,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.3016,
+      "grad_norm": 9.751809120178223,
+      "kl": 0.12164306640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0784,
+      "num_tokens": 4974437.0,
+      "reward": 0.12611877918243408,
+      "reward_std": 0.05333450064063072,
+      "rewards/bleu_reward_func/mean": 0.12611877918243408,
+      "rewards/bleu_reward_func/std": 0.11847065389156342,
+      "step": 377
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 428.34375,
+      "completions/mean_terminated_length": 306.0769348144531,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.3024,
+      "grad_norm": 1.8198633193969727,
+      "kl": 0.025543212890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0668,
+      "num_tokens": 4993880.0,
+      "reward": 0.07207944989204407,
+      "reward_std": 0.019526129588484764,
+      "rewards/bleu_reward_func/mean": 0.07207944989204407,
+      "rewards/bleu_reward_func/std": 0.06778865307569504,
+      "step": 378
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 237.0,
+      "completions/max_terminated_length": 237.0,
+      "completions/mean_length": 63.40625,
+      "completions/mean_terminated_length": 63.40625,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "epoch": 0.3032,
+      "grad_norm": 6.933629512786865,
+      "kl": 0.13250732421875,
+      "learning_rate": 1e-06,
+      "loss": 0.3354,
+      "num_tokens": 5001909.0,
+      "reward": 0.12609761953353882,
+      "reward_std": 0.07611958682537079,
+      "rewards/bleu_reward_func/mean": 0.12609761953353882,
+      "rewards/bleu_reward_func/std": 0.09586605429649353,
+      "step": 379
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 205.25,
+      "completions/mean_terminated_length": 148.44444274902344,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.304,
+      "grad_norm": 8.042766571044922,
+      "kl": 0.155029296875,
+      "learning_rate": 1e-06,
+      "loss": 0.1587,
+      "num_tokens": 5012533.0,
+      "reward": 0.18430504202842712,
+      "reward_std": 0.09831003099679947,
+      "rewards/bleu_reward_func/mean": 0.18430504202842712,
+      "rewards/bleu_reward_func/std": 0.1858755648136139,
+      "step": 380
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 333.46875,
+      "completions/mean_terminated_length": 211.3157958984375,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.3048,
+      "grad_norm": 19.682376861572266,
+      "kl": 0.1099853515625,
+      "learning_rate": 1e-06,
+      "loss": 0.015,
+      "num_tokens": 5028972.0,
+      "reward": 0.09155917167663574,
+      "reward_std": 0.012800632044672966,
+      "rewards/bleu_reward_func/mean": 0.09155917167663574,
+      "rewards/bleu_reward_func/std": 0.1374584585428238,
+      "step": 381
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 301.6875,
+      "completions/mean_terminated_length": 253.1538543701172,
+      "completions/min_length": 28.0,
+      "completions/min_terminated_length": 28.0,
+      "epoch": 0.3056,
+      "grad_norm": 3.2053592205047607,
+      "kl": 0.051788330078125,
+      "learning_rate": 1e-06,
+      "loss": 0.2446,
+      "num_tokens": 5041090.0,
+      "reward": 0.06323020905256271,
+      "reward_std": 0.032996732741594315,
+      "rewards/bleu_reward_func/mean": 0.06323020905256271,
+      "rewards/bleu_reward_func/std": 0.05562639981508255,
+      "step": 382
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 468.0,
+      "completions/mean_length": 369.375,
+      "completions/mean_terminated_length": 160.92308044433594,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.3064,
+      "grad_norm": 7.353909492492676,
+      "kl": 0.07830810546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0282,
+      "num_tokens": 5057798.0,
+      "reward": 0.1571401059627533,
+      "reward_std": 0.02875007688999176,
+      "rewards/bleu_reward_func/mean": 0.1571401059627533,
+      "rewards/bleu_reward_func/std": 0.20372198522090912,
+      "step": 383
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 316.84375,
+      "completions/mean_terminated_length": 199.75,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.3072,
+      "grad_norm": 3.984431743621826,
+      "kl": 0.066986083984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0376,
+      "num_tokens": 5073449.0,
+      "reward": 0.03315318748354912,
+      "reward_std": 0.038507476449012756,
+      "rewards/bleu_reward_func/mean": 0.03315318748354912,
+      "rewards/bleu_reward_func/std": 0.06562887132167816,
+      "step": 384
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 465.0,
+      "completions/mean_length": 319.375,
+      "completions/mean_terminated_length": 126.75,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.308,
+      "grad_norm": 6.559665203094482,
+      "kl": 0.0677490234375,
+      "learning_rate": 1e-06,
+      "loss": 0.3157,
+      "num_tokens": 5087821.0,
+      "reward": 0.06230534613132477,
+      "reward_std": 0.03765605762600899,
+      "rewards/bleu_reward_func/mean": 0.06230534613132477,
+      "rewards/bleu_reward_func/std": 0.07213454693555832,
+      "step": 385
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 488.0,
+      "completions/mean_length": 319.15625,
+      "completions/mean_terminated_length": 265.1600036621094,
+      "completions/min_length": 43.0,
+      "completions/min_terminated_length": 43.0,
+      "epoch": 0.3088,
+      "grad_norm": 3.6326193809509277,
+      "kl": 0.05596923828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0929,
+      "num_tokens": 5100618.0,
+      "reward": 0.04398781806230545,
+      "reward_std": 0.02026546560227871,
+      "rewards/bleu_reward_func/mean": 0.04398781806230545,
+      "rewards/bleu_reward_func/std": 0.042056936770677567,
+      "step": 386
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 470.0,
+      "completions/mean_length": 326.0,
+      "completions/mean_terminated_length": 181.3333282470703,
+      "completions/min_length": 52.0,
+      "completions/min_terminated_length": 52.0,
+      "epoch": 0.3096,
+      "grad_norm": 4.189205646514893,
+      "kl": 0.0577392578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0616,
+      "num_tokens": 5118850.0,
+      "reward": 0.10049895197153091,
+      "reward_std": 0.035130538046360016,
+      "rewards/bleu_reward_func/mean": 0.10049895197153091,
+      "rewards/bleu_reward_func/std": 0.0897059291601181,
+      "step": 387
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 479.0,
+      "completions/mean_length": 357.125,
+      "completions/mean_terminated_length": 251.15789794921875,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.3104,
+      "grad_norm": 8.503087997436523,
+      "kl": 0.1228790283203125,
+      "learning_rate": 1e-06,
+      "loss": 0.1152,
+      "num_tokens": 5131574.0,
+      "reward": 0.10157294571399689,
+      "reward_std": 0.05235150083899498,
+      "rewards/bleu_reward_func/mean": 0.10157294571399689,
+      "rewards/bleu_reward_func/std": 0.11832693964242935,
+      "step": 388
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 490.0,
+      "completions/mean_length": 261.0625,
+      "completions/mean_terminated_length": 244.33334350585938,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.3112,
+      "grad_norm": 7.511518478393555,
+      "kl": 0.0782470703125,
+      "learning_rate": 1e-06,
+      "loss": 0.1551,
+      "num_tokens": 5142288.0,
+      "reward": 0.05309104174375534,
+      "reward_std": 0.0195770300924778,
+      "rewards/bleu_reward_func/mean": 0.05309104174375534,
+      "rewards/bleu_reward_func/std": 0.03859832510352135,
+      "step": 389
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 136.0,
+      "completions/max_terminated_length": 136.0,
+      "completions/mean_length": 77.8125,
+      "completions/mean_terminated_length": 77.8125,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "epoch": 0.312,
+      "grad_norm": 7.358268737792969,
+      "kl": 0.1142578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0692,
+      "num_tokens": 5147226.0,
+      "reward": 0.2647009789943695,
+      "reward_std": 0.0788542777299881,
+      "rewards/bleu_reward_func/mean": 0.2647009789943695,
+      "rewards/bleu_reward_func/std": 0.3669854998588562,
+      "step": 390
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 376.0,
+      "completions/mean_length": 331.1875,
+      "completions/mean_terminated_length": 126.26667022705078,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.3128,
+      "grad_norm": 6.546727180480957,
+      "kl": 0.131866455078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0216,
+      "num_tokens": 5162552.0,
+      "reward": 0.06478870660066605,
+      "reward_std": 0.016362179070711136,
+      "rewards/bleu_reward_func/mean": 0.06478870660066605,
+      "rewards/bleu_reward_func/std": 0.07661883533000946,
+      "step": 391
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 305.5,
+      "completions/mean_terminated_length": 211.63636779785156,
+      "completions/min_length": 27.0,
+      "completions/min_terminated_length": 27.0,
+      "epoch": 0.3136,
+      "grad_norm": 3.7394042015075684,
+      "kl": 0.0411224365234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0472,
+      "num_tokens": 5174632.0,
+      "reward": 0.07655475288629532,
+      "reward_std": 0.04063459113240242,
+      "rewards/bleu_reward_func/mean": 0.07655475288629532,
+      "rewards/bleu_reward_func/std": 0.05244217440485954,
+      "step": 392
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 464.0,
+      "completions/mean_length": 299.4375,
+      "completions/mean_terminated_length": 239.9199981689453,
+      "completions/min_length": 33.0,
+      "completions/min_terminated_length": 33.0,
+      "epoch": 0.3144,
+      "grad_norm": 3.130519151687622,
+      "kl": 0.036407470703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0573,
+      "num_tokens": 5189038.0,
+      "reward": 0.08177624642848969,
+      "reward_std": 0.03700428456068039,
+      "rewards/bleu_reward_func/mean": 0.08177624642848969,
+      "rewards/bleu_reward_func/std": 0.07332108914852142,
+      "step": 393
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 175.0,
+      "completions/mean_terminated_length": 152.53334045410156,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "epoch": 0.3152,
+      "grad_norm": 6.235530853271484,
+      "kl": 0.119140625,
+      "learning_rate": 1e-06,
+      "loss": -0.1315,
+      "num_tokens": 5199614.0,
+      "reward": 0.08668357878923416,
+      "reward_std": 0.029862932860851288,
+      "rewards/bleu_reward_func/mean": 0.08668357878923416,
+      "rewards/bleu_reward_func/std": 0.04458598420023918,
+      "step": 394
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 449.0,
+      "completions/mean_length": 303.21875,
+      "completions/mean_terminated_length": 140.8333282470703,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.316,
+      "grad_norm": 3.7761735916137695,
+      "kl": 0.051483154296875,
+      "learning_rate": 1e-06,
+      "loss": 0.4336,
+      "num_tokens": 5216893.0,
+      "reward": 0.04373088479042053,
+      "reward_std": 0.025996902957558632,
+      "rewards/bleu_reward_func/mean": 0.04373088479042053,
+      "rewards/bleu_reward_func/std": 0.035521000623703,
+      "step": 395
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 473.0,
+      "completions/mean_length": 102.375,
+      "completions/mean_terminated_length": 89.16128540039062,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "epoch": 0.3168,
+      "grad_norm": 9.422346115112305,
+      "kl": 0.20269775390625,
+      "learning_rate": 1e-06,
+      "loss": -0.3887,
+      "num_tokens": 5222225.0,
+      "reward": 0.0936415046453476,
+      "reward_std": 0.07821927219629288,
+      "rewards/bleu_reward_func/mean": 0.0936415046453476,
+      "rewards/bleu_reward_func/std": 0.1016775444149971,
+      "step": 396
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 187.875,
+      "completions/mean_terminated_length": 127.85185241699219,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "epoch": 0.3176,
+      "grad_norm": 16.8355712890625,
+      "kl": 0.111968994140625,
+      "learning_rate": 1e-06,
+      "loss": 0.2515,
+      "num_tokens": 5234477.0,
+      "reward": 0.2821354866027832,
+      "reward_std": 0.16070716083049774,
+      "rewards/bleu_reward_func/mean": 0.2821354866027832,
+      "rewards/bleu_reward_func/std": 0.34524035453796387,
+      "step": 397
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 487.0,
+      "completions/mean_length": 222.625,
+      "completions/mean_terminated_length": 169.0370330810547,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.3184,
+      "grad_norm": 4.937644004821777,
+      "kl": 0.0895843505859375,
+      "learning_rate": 1e-06,
+      "loss": 0.1443,
+      "num_tokens": 5243161.0,
+      "reward": 0.04823939502239227,
+      "reward_std": 0.020888181403279305,
+      "rewards/bleu_reward_func/mean": 0.04823939502239227,
+      "rewards/bleu_reward_func/std": 0.032690465450286865,
+      "step": 398
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 330.5625,
+      "completions/mean_terminated_length": 304.64288330078125,
+      "completions/min_length": 147.0,
+      "completions/min_terminated_length": 147.0,
+      "epoch": 0.3192,
+      "grad_norm": 2.7899651527404785,
+      "kl": 0.028900146484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0361,
+      "num_tokens": 5257211.0,
+      "reward": 0.10274805128574371,
+      "reward_std": 0.03329307958483696,
+      "rewards/bleu_reward_func/mean": 0.10274805128574371,
+      "rewards/bleu_reward_func/std": 0.08635566383600235,
+      "step": 399
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 173.875,
+      "completions/mean_terminated_length": 125.5714340209961,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.32,
+      "grad_norm": 9.990334510803223,
+      "kl": 0.19793701171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0415,
+      "num_tokens": 5265263.0,
+      "reward": 0.13340914249420166,
+      "reward_std": 0.06052035093307495,
+      "rewards/bleu_reward_func/mean": 0.13340914249420166,
+      "rewards/bleu_reward_func/std": 0.12332285940647125,
+      "step": 400
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 174.0,
+      "completions/mean_length": 299.625,
+      "completions/mean_terminated_length": 87.25,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.3208,
+      "grad_norm": 5.343194007873535,
+      "kl": 0.0823516845703125,
+      "learning_rate": 1e-06,
+      "loss": 0.1588,
+      "num_tokens": 5279491.0,
+      "reward": 0.04100114479660988,
+      "reward_std": 0.021917924284934998,
+      "rewards/bleu_reward_func/mean": 0.04100114479660988,
+      "rewards/bleu_reward_func/std": 0.059245530515909195,
+      "step": 401
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 481.0,
+      "completions/mean_length": 363.96875,
+      "completions/mean_terminated_length": 296.68182373046875,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.3216,
+      "grad_norm": 2.502444267272949,
+      "kl": 0.0249786376953125,
+      "learning_rate": 1e-06,
+      "loss": -0.1811,
+      "num_tokens": 5298618.0,
+      "reward": 0.06452260166406631,
+      "reward_std": 0.043596021831035614,
+      "rewards/bleu_reward_func/mean": 0.06452260166406631,
+      "rewards/bleu_reward_func/std": 0.0457596592605114,
+      "step": 402
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 504.0,
+      "completions/mean_length": 186.8125,
+      "completions/mean_terminated_length": 153.1724090576172,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "epoch": 0.3224,
+      "grad_norm": 6.430903434753418,
+      "kl": 0.0926513671875,
+      "learning_rate": 1e-06,
+      "loss": 0.1091,
+      "num_tokens": 5308788.0,
+      "reward": 0.1375400573015213,
+      "reward_std": 0.044691912829875946,
+      "rewards/bleu_reward_func/mean": 0.1375400573015213,
+      "rewards/bleu_reward_func/std": 0.1667727530002594,
+      "step": 403
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 476.0,
+      "completions/mean_length": 261.375,
+      "completions/mean_terminated_length": 163.30435180664062,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.3232,
+      "grad_norm": 7.348942279815674,
+      "kl": 0.114410400390625,
+      "learning_rate": 1e-06,
+      "loss": 0.124,
+      "num_tokens": 5325816.0,
+      "reward": 0.29955723881721497,
+      "reward_std": 0.09420829266309738,
+      "rewards/bleu_reward_func/mean": 0.29955723881721497,
+      "rewards/bleu_reward_func/std": 0.27135762572288513,
+      "step": 404
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 490.0,
+      "completions/mean_length": 313.125,
+      "completions/mean_terminated_length": 284.71429443359375,
+      "completions/min_length": 45.0,
+      "completions/min_terminated_length": 45.0,
+      "epoch": 0.324,
+      "grad_norm": 5.8108601570129395,
+      "kl": 0.0474853515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0256,
+      "num_tokens": 5339252.0,
+      "reward": 0.125982865691185,
+      "reward_std": 0.03331389278173447,
+      "rewards/bleu_reward_func/mean": 0.125982865691185,
+      "rewards/bleu_reward_func/std": 0.07514968514442444,
+      "step": 405
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 221.1875,
+      "completions/mean_terminated_length": 154.07693481445312,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.3248,
+      "grad_norm": 6.334237098693848,
+      "kl": 0.167236328125,
+      "learning_rate": 1e-06,
+      "loss": 0.2154,
+      "num_tokens": 5350538.0,
+      "reward": 0.12314164638519287,
+      "reward_std": 0.034954577684402466,
+      "rewards/bleu_reward_func/mean": 0.12314164638519287,
+      "rewards/bleu_reward_func/std": 0.11711690574884415,
+      "step": 406
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 182.96875,
+      "completions/mean_terminated_length": 90.83999633789062,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.3256,
+      "grad_norm": 6.83364200592041,
+      "kl": 0.09881591796875,
+      "learning_rate": 1e-06,
+      "loss": 0.2224,
+      "num_tokens": 5364465.0,
+      "reward": 0.23839128017425537,
+      "reward_std": 0.09448365867137909,
+      "rewards/bleu_reward_func/mean": 0.23839128017425537,
+      "rewards/bleu_reward_func/std": 0.17264093458652496,
+      "step": 407
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 413.0,
+      "completions/mean_length": 251.59375,
+      "completions/mean_terminated_length": 164.7916717529297,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.3264,
+      "grad_norm": 5.291790962219238,
+      "kl": 0.120269775390625,
+      "learning_rate": 1e-06,
+      "loss": 0.1051,
+      "num_tokens": 5379212.0,
+      "reward": 0.07936831563711166,
+      "reward_std": 0.026489000767469406,
+      "rewards/bleu_reward_func/mean": 0.07936831563711166,
+      "rewards/bleu_reward_func/std": 0.04656874015927315,
+      "step": 408
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 495.0,
+      "completions/mean_length": 395.90625,
+      "completions/mean_terminated_length": 279.8125,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.3272,
+      "grad_norm": 2.7209274768829346,
+      "kl": 0.033355712890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0145,
+      "num_tokens": 5395369.0,
+      "reward": 0.05327831208705902,
+      "reward_std": 0.020644793286919594,
+      "rewards/bleu_reward_func/mean": 0.05327831208705902,
+      "rewards/bleu_reward_func/std": 0.044744666665792465,
+      "step": 409
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 402.0,
+      "completions/mean_length": 279.125,
+      "completions/mean_terminated_length": 157.14285278320312,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.328,
+      "grad_norm": 6.835958003997803,
+      "kl": 0.17645263671875,
+      "learning_rate": 1e-06,
+      "loss": 0.1004,
+      "num_tokens": 5408861.0,
+      "reward": 0.15895725786685944,
+      "reward_std": 0.053282976150512695,
+      "rewards/bleu_reward_func/mean": 0.15895725786685944,
+      "rewards/bleu_reward_func/std": 0.1344875991344452,
+      "step": 410
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 510.0,
+      "completions/mean_length": 124.625,
+      "completions/mean_terminated_length": 112.1290283203125,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.3288,
+      "grad_norm": 7.9765801429748535,
+      "kl": 0.12908935546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0791,
+      "num_tokens": 5422569.0,
+      "reward": 0.29637736082077026,
+      "reward_std": 0.07562527060508728,
+      "rewards/bleu_reward_func/mean": 0.29637736082077026,
+      "rewards/bleu_reward_func/std": 0.1916900873184204,
+      "step": 411
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 420.0,
+      "completions/mean_length": 242.6875,
+      "completions/mean_terminated_length": 180.53846740722656,
+      "completions/min_length": 38.0,
+      "completions/min_terminated_length": 38.0,
+      "epoch": 0.3296,
+      "grad_norm": 5.444021701812744,
+      "kl": 0.079376220703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0428,
+      "num_tokens": 5432847.0,
+      "reward": 0.1152123510837555,
+      "reward_std": 0.07390551269054413,
+      "rewards/bleu_reward_func/mean": 0.1152123510837555,
+      "rewards/bleu_reward_func/std": 0.14451570808887482,
+      "step": 412
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 489.0,
+      "completions/mean_length": 329.4375,
+      "completions/mean_terminated_length": 204.5263214111328,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.3304,
+      "grad_norm": 14.007586479187012,
+      "kl": 0.074188232421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0903,
+      "num_tokens": 5451693.0,
+      "reward": 0.13860949873924255,
+      "reward_std": 0.032740939408540726,
+      "rewards/bleu_reward_func/mean": 0.13860949873924255,
+      "rewards/bleu_reward_func/std": 0.15230515599250793,
+      "step": 413
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 493.0,
+      "completions/mean_length": 370.0,
+      "completions/mean_terminated_length": 322.66668701171875,
+      "completions/min_length": 115.0,
+      "completions/min_terminated_length": 115.0,
+      "epoch": 0.3312,
+      "grad_norm": 2.6279470920562744,
+      "kl": 0.02838134765625,
+      "learning_rate": 1e-06,
+      "loss": 0.1011,
+      "num_tokens": 5465741.0,
+      "reward": 0.07638199627399445,
+      "reward_std": 0.018498672172427177,
+      "rewards/bleu_reward_func/mean": 0.07638199627399445,
+      "rewards/bleu_reward_func/std": 0.07297802716493607,
+      "step": 414
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 504.0,
+      "completions/mean_length": 433.96875,
+      "completions/mean_terminated_length": 403.4347839355469,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "epoch": 0.332,
+      "grad_norm": 2.4823691844940186,
+      "kl": 0.035186767578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0328,
+      "num_tokens": 5482924.0,
+      "reward": 0.06871578842401505,
+      "reward_std": 0.015666324645280838,
+      "rewards/bleu_reward_func/mean": 0.06871578842401505,
+      "rewards/bleu_reward_func/std": 0.03051225282251835,
+      "step": 415
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 432.0,
+      "completions/mean_length": 241.5,
+      "completions/mean_terminated_length": 179.07693481445312,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.3328,
+      "grad_norm": 6.543938159942627,
+      "kl": 0.16180419921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0852,
+      "num_tokens": 5494084.0,
+      "reward": 0.1368054300546646,
+      "reward_std": 0.05007235333323479,
+      "rewards/bleu_reward_func/mean": 0.1368054300546646,
+      "rewards/bleu_reward_func/std": 0.17140735685825348,
+      "step": 416
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 440.0,
+      "completions/mean_terminated_length": 390.7368469238281,
+      "completions/min_length": 45.0,
+      "completions/min_terminated_length": 45.0,
+      "epoch": 0.3336,
+      "grad_norm": 2.235297203063965,
+      "kl": 0.03033447265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0838,
+      "num_tokens": 5511716.0,
+      "reward": 0.038143888115882874,
+      "reward_std": 0.01655811443924904,
+      "rewards/bleu_reward_func/mean": 0.038143888115882874,
+      "rewards/bleu_reward_func/std": 0.024868454784154892,
+      "step": 417
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 366.0,
+      "completions/mean_length": 182.3125,
+      "completions/mean_terminated_length": 72.41667175292969,
+      "completions/min_length": 33.0,
+      "completions/min_terminated_length": 33.0,
+      "epoch": 0.3344,
+      "grad_norm": 7.744441509246826,
+      "kl": 0.191436767578125,
+      "learning_rate": 1e-06,
+      "loss": 0.3195,
+      "num_tokens": 5523022.0,
+      "reward": 0.31701600551605225,
+      "reward_std": 0.07194612175226212,
+      "rewards/bleu_reward_func/mean": 0.31701600551605225,
+      "rewards/bleu_reward_func/std": 0.3555218279361725,
+      "step": 418
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 503.0,
+      "completions/mean_length": 366.15625,
+      "completions/mean_terminated_length": 237.47059631347656,
+      "completions/min_length": 28.0,
+      "completions/min_terminated_length": 28.0,
+      "epoch": 0.3352,
+      "grad_norm": 6.1128129959106445,
+      "kl": 0.156463623046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0201,
+      "num_tokens": 5541259.0,
+      "reward": 0.08823719620704651,
+      "reward_std": 0.024577319622039795,
+      "rewards/bleu_reward_func/mean": 0.08823719620704651,
+      "rewards/bleu_reward_func/std": 0.06854464113712311,
+      "step": 419
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 463.0,
+      "completions/mean_length": 441.34375,
+      "completions/mean_terminated_length": 361.2666931152344,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 0.336,
+      "grad_norm": 1.8351125717163086,
+      "kl": 0.021087646484375,
+      "learning_rate": 1e-06,
+      "loss": 0.1005,
+      "num_tokens": 5559934.0,
+      "reward": 0.04894189164042473,
+      "reward_std": 0.02001025900244713,
+      "rewards/bleu_reward_func/mean": 0.04894189164042473,
+      "rewards/bleu_reward_func/std": 0.05484846979379654,
+      "step": 420
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 439.0,
+      "completions/mean_length": 326.625,
+      "completions/mean_terminated_length": 264.8333435058594,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.3368,
+      "grad_norm": 2.533376932144165,
+      "kl": 0.036773681640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0233,
+      "num_tokens": 5573530.0,
+      "reward": 0.040375903248786926,
+      "reward_std": 0.020407570526003838,
+      "rewards/bleu_reward_func/mean": 0.040375903248786926,
+      "rewards/bleu_reward_func/std": 0.03530384972691536,
+      "step": 421
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 495.0,
+      "completions/mean_length": 387.0,
+      "completions/mean_terminated_length": 312.0,
+      "completions/min_length": 124.0,
+      "completions/min_terminated_length": 124.0,
+      "epoch": 0.3376,
+      "grad_norm": 2.77024507522583,
+      "kl": 0.023895263671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0548,
+      "num_tokens": 5587906.0,
+      "reward": 0.07852312177419662,
+      "reward_std": 0.01865551620721817,
+      "rewards/bleu_reward_func/mean": 0.07852312177419662,
+      "rewards/bleu_reward_func/std": 0.01962001994252205,
+      "step": 422
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 491.0,
+      "completions/mean_length": 286.625,
+      "completions/mean_terminated_length": 223.51998901367188,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.3384,
+      "grad_norm": 10.24564266204834,
+      "kl": 0.1881103515625,
+      "learning_rate": 1e-06,
+      "loss": 0.3541,
+      "num_tokens": 5600862.0,
+      "reward": 0.1451932042837143,
+      "reward_std": 0.04526112228631973,
+      "rewards/bleu_reward_func/mean": 0.1451932042837143,
+      "rewards/bleu_reward_func/std": 0.11114869266748428,
+      "step": 423
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 485.0,
+      "completions/mean_length": 374.0,
+      "completions/mean_terminated_length": 217.60000610351562,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "epoch": 0.3392,
+      "grad_norm": 3.2997934818267822,
+      "kl": 0.03955078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0442,
+      "num_tokens": 5614662.0,
+      "reward": 0.029227450489997864,
+      "reward_std": 0.015134407207369804,
+      "rewards/bleu_reward_func/mean": 0.029227450489997864,
+      "rewards/bleu_reward_func/std": 0.03273903205990791,
+      "step": 424
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 375.0,
+      "completions/mean_length": 266.59375,
+      "completions/mean_terminated_length": 184.7916717529297,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.34,
+      "grad_norm": 3.9605484008789062,
+      "kl": 0.05718994140625,
+      "learning_rate": 1e-06,
+      "loss": 0.3365,
+      "num_tokens": 5625193.0,
+      "reward": 0.07731406390666962,
+      "reward_std": 0.04166540876030922,
+      "rewards/bleu_reward_func/mean": 0.07731406390666962,
+      "rewards/bleu_reward_func/std": 0.07211390882730484,
+      "step": 425
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 260.625,
+      "completions/mean_terminated_length": 88.63157653808594,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.3408,
+      "grad_norm": 11.257648468017578,
+      "kl": 0.2677154541015625,
+      "learning_rate": 1e-06,
+      "loss": 0.1169,
+      "num_tokens": 5640717.0,
+      "reward": 0.19435667991638184,
+      "reward_std": 0.055491410195827484,
+      "rewards/bleu_reward_func/mean": 0.19435667991638184,
+      "rewards/bleu_reward_func/std": 0.1956581324338913,
+      "step": 426
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 489.0,
+      "completions/mean_length": 376.9375,
+      "completions/mean_terminated_length": 179.53846740722656,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.3416,
+      "grad_norm": 25.653825759887695,
+      "kl": 0.1090087890625,
+      "learning_rate": 1e-06,
+      "loss": 0.024,
+      "num_tokens": 5655923.0,
+      "reward": 0.11750101298093796,
+      "reward_std": 0.0449095293879509,
+      "rewards/bleu_reward_func/mean": 0.11750101298093796,
+      "rewards/bleu_reward_func/std": 0.10332971811294556,
+      "step": 427
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 454.0,
+      "completions/mean_length": 313.0,
+      "completions/mean_terminated_length": 235.13043212890625,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "epoch": 0.3424,
+      "grad_norm": 3.291689157485962,
+      "kl": 0.05206298828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0722,
+      "num_tokens": 5672371.0,
+      "reward": 0.07329948246479034,
+      "reward_std": 0.04769134148955345,
+      "rewards/bleu_reward_func/mean": 0.07329948246479034,
+      "rewards/bleu_reward_func/std": 0.10588011890649796,
+      "step": 428
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 218.1875,
+      "completions/mean_terminated_length": 176.21429443359375,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.3432,
+      "grad_norm": 70.38253784179688,
+      "kl": 0.111907958984375,
+      "learning_rate": 1e-06,
+      "loss": 0.2102,
+      "num_tokens": 5685137.0,
+      "reward": 0.057677462697029114,
+      "reward_std": 0.02635624073445797,
+      "rewards/bleu_reward_func/mean": 0.057677462697029114,
+      "rewards/bleu_reward_func/std": 0.03576910123229027,
+      "step": 429
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 448.0,
+      "completions/mean_length": 276.34375,
+      "completions/mean_terminated_length": 184.13043212890625,
+      "completions/min_length": 29.0,
+      "completions/min_terminated_length": 29.0,
+      "epoch": 0.344,
+      "grad_norm": 6.003584861755371,
+      "kl": 0.0679931640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0754,
+      "num_tokens": 5699164.0,
+      "reward": 0.1447058618068695,
+      "reward_std": 0.02169397845864296,
+      "rewards/bleu_reward_func/mean": 0.1447058618068695,
+      "rewards/bleu_reward_func/std": 0.17934927344322205,
+      "step": 430
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 487.0,
+      "completions/mean_length": 190.0,
+      "completions/mean_terminated_length": 179.61289978027344,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.3448,
+      "grad_norm": 6.911223888397217,
+      "kl": 0.186920166015625,
+      "learning_rate": 1e-06,
+      "loss": -0.1198,
+      "num_tokens": 5707324.0,
+      "reward": 0.1218734011054039,
+      "reward_std": 0.029896825551986694,
+      "rewards/bleu_reward_func/mean": 0.1218734011054039,
+      "rewards/bleu_reward_func/std": 0.12784428894519806,
+      "step": 431
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 441.0,
+      "completions/max_terminated_length": 441.0,
+      "completions/mean_length": 161.3125,
+      "completions/mean_terminated_length": 161.3125,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.3456,
+      "grad_norm": 7.491186141967773,
+      "kl": 0.174072265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0073,
+      "num_tokens": 5715774.0,
+      "reward": 0.24741162359714508,
+      "reward_std": 0.06959841400384903,
+      "rewards/bleu_reward_func/mean": 0.24741162359714508,
+      "rewards/bleu_reward_func/std": 0.12952403724193573,
+      "step": 432
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 492.0,
+      "completions/mean_length": 360.375,
+      "completions/mean_terminated_length": 280.952392578125,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.3464,
+      "grad_norm": 2.933356523513794,
+      "kl": 0.05389404296875,
+      "learning_rate": 1e-06,
+      "loss": 0.1508,
+      "num_tokens": 5729490.0,
+      "reward": 0.047768086194992065,
+      "reward_std": 0.022835325449705124,
+      "rewards/bleu_reward_func/mean": 0.047768086194992065,
+      "rewards/bleu_reward_func/std": 0.03785131126642227,
+      "step": 433
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 486.0,
+      "completions/mean_length": 290.65625,
+      "completions/mean_terminated_length": 216.875,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.3472,
+      "grad_norm": 4.7709503173828125,
+      "kl": 0.202667236328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0613,
+      "num_tokens": 5744007.0,
+      "reward": 0.17955930531024933,
+      "reward_std": 0.04158224165439606,
+      "rewards/bleu_reward_func/mean": 0.17955930531024933,
+      "rewards/bleu_reward_func/std": 0.16465015709400177,
+      "step": 434
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 471.46875,
+      "completions/mean_terminated_length": 430.9375,
+      "completions/min_length": 318.0,
+      "completions/min_terminated_length": 318.0,
+      "epoch": 0.348,
+      "grad_norm": 2.0240750312805176,
+      "kl": 0.0249786376953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0115,
+      "num_tokens": 5764798.0,
+      "reward": 0.06078977510333061,
+      "reward_std": 0.014253700152039528,
+      "rewards/bleu_reward_func/mean": 0.06078977510333061,
+      "rewards/bleu_reward_func/std": 0.061424292623996735,
+      "step": 435
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 496.0,
+      "completions/mean_length": 297.5,
+      "completions/mean_terminated_length": 213.56521606445312,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.3488,
+      "grad_norm": 6.2941999435424805,
+      "kl": 0.14788818359375,
+      "learning_rate": 1e-06,
+      "loss": -0.117,
+      "num_tokens": 5778070.0,
+      "reward": 0.18015003204345703,
+      "reward_std": 0.04164495691657066,
+      "rewards/bleu_reward_func/mean": 0.18015003204345703,
+      "rewards/bleu_reward_func/std": 0.25248411297798157,
+      "step": 436
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 319.0625,
+      "completions/mean_terminated_length": 265.0400085449219,
+      "completions/min_length": 30.0,
+      "completions/min_terminated_length": 30.0,
+      "epoch": 0.3496,
+      "grad_norm": 6.8885884284973145,
+      "kl": 0.15325927734375,
+      "learning_rate": 1e-06,
+      "loss": 0.1008,
+      "num_tokens": 5794464.0,
+      "reward": 0.06313855201005936,
+      "reward_std": 0.01877717673778534,
+      "rewards/bleu_reward_func/mean": 0.06313855201005936,
+      "rewards/bleu_reward_func/std": 0.07749292254447937,
+      "step": 437
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 431.0,
+      "completions/mean_length": 174.96875,
+      "completions/mean_terminated_length": 126.8214340209961,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.3504,
+      "grad_norm": 6.638815402984619,
+      "kl": 0.21893310546875,
+      "learning_rate": 1e-06,
+      "loss": -0.1077,
+      "num_tokens": 5808279.0,
+      "reward": 0.1649433970451355,
+      "reward_std": 0.03847195580601692,
+      "rewards/bleu_reward_func/mean": 0.1649433970451355,
+      "rewards/bleu_reward_func/std": 0.1434909999370575,
+      "step": 438
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 487.0,
+      "completions/max_terminated_length": 487.0,
+      "completions/mean_length": 111.46875,
+      "completions/mean_terminated_length": 111.46875,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.3512,
+      "grad_norm": 8.658287048339844,
+      "kl": 0.380615234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0241,
+      "num_tokens": 5818006.0,
+      "reward": 0.16367265582084656,
+      "reward_std": 0.043664492666721344,
+      "rewards/bleu_reward_func/mean": 0.16367265582084656,
+      "rewards/bleu_reward_func/std": 0.09786061942577362,
+      "step": 439
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 484.0,
+      "completions/mean_length": 183.0625,
+      "completions/mean_terminated_length": 161.1333465576172,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "epoch": 0.352,
+      "grad_norm": 6.755293369293213,
+      "kl": 0.094757080078125,
+      "learning_rate": 1e-06,
+      "loss": -0.3775,
+      "num_tokens": 5827832.0,
+      "reward": 0.20365653932094574,
+      "reward_std": 0.022682592272758484,
+      "rewards/bleu_reward_func/mean": 0.20365653932094574,
+      "rewards/bleu_reward_func/std": 0.28341981768608093,
+      "step": 440
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 489.0,
+      "completions/mean_length": 374.15625,
+      "completions/mean_terminated_length": 196.92857360839844,
+      "completions/min_length": 5.0,
+      "completions/min_terminated_length": 5.0,
+      "epoch": 0.3528,
+      "grad_norm": 3.652517557144165,
+      "kl": 0.04571533203125,
+      "learning_rate": 1e-06,
+      "loss": 0.1243,
+      "num_tokens": 5846877.0,
+      "reward": 0.028015542775392532,
+      "reward_std": 0.017580918967723846,
+      "rewards/bleu_reward_func/mean": 0.028015542775392532,
+      "rewards/bleu_reward_func/std": 0.018063105642795563,
+      "step": 441
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 503.0,
+      "completions/mean_length": 247.96875,
+      "completions/mean_terminated_length": 174.0399932861328,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.3536,
+      "grad_norm": 5.749145984649658,
+      "kl": 0.2269287109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0053,
+      "num_tokens": 5857276.0,
+      "reward": 0.24086514115333557,
+      "reward_std": 0.11034538596868515,
+      "rewards/bleu_reward_func/mean": 0.24086514115333557,
+      "rewards/bleu_reward_func/std": 0.2930907607078552,
+      "step": 442
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 458.0,
+      "completions/mean_length": 180.9375,
+      "completions/mean_terminated_length": 88.23999786376953,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.3544,
+      "grad_norm": 6.519045352935791,
+      "kl": 0.3109130859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0523,
+      "num_tokens": 5866418.0,
+      "reward": 0.14787587523460388,
+      "reward_std": 0.08442827314138412,
+      "rewards/bleu_reward_func/mean": 0.14787587523460388,
+      "rewards/bleu_reward_func/std": 0.13120223581790924,
+      "step": 443
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 474.0,
+      "completions/mean_length": 290.6875,
+      "completions/mean_terminated_length": 239.61538696289062,
+      "completions/min_length": 50.0,
+      "completions/min_terminated_length": 50.0,
+      "epoch": 0.3552,
+      "grad_norm": 3.2144103050231934,
+      "kl": 0.042388916015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0012,
+      "num_tokens": 5878832.0,
+      "reward": 0.06585465371608734,
+      "reward_std": 0.03217202052474022,
+      "rewards/bleu_reward_func/mean": 0.06585465371608734,
+      "rewards/bleu_reward_func/std": 0.0564405731856823,
+      "step": 444
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 226.0625,
+      "completions/mean_terminated_length": 196.48275756835938,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.356,
+      "grad_norm": 7.220034122467041,
+      "kl": 0.255218505859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0212,
+      "num_tokens": 5894266.0,
+      "reward": 0.1998336911201477,
+      "reward_std": 0.05887780338525772,
+      "rewards/bleu_reward_func/mean": 0.1998336911201477,
+      "rewards/bleu_reward_func/std": 0.1896047741174698,
+      "step": 445
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 394.0,
+      "completions/mean_length": 194.46875,
+      "completions/mean_terminated_length": 173.3000030517578,
+      "completions/min_length": 37.0,
+      "completions/min_terminated_length": 37.0,
+      "epoch": 0.3568,
+      "grad_norm": 5.338675022125244,
+      "kl": 0.112518310546875,
+      "learning_rate": 1e-06,
+      "loss": 0.1041,
+      "num_tokens": 5902393.0,
+      "reward": 0.08252020180225372,
+      "reward_std": 0.041884347796440125,
+      "rewards/bleu_reward_func/mean": 0.08252020180225372,
+      "rewards/bleu_reward_func/std": 0.05604247748851776,
+      "step": 446
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 491.0,
+      "completions/mean_length": 266.96875,
+      "completions/mean_terminated_length": 231.96429443359375,
+      "completions/min_length": 36.0,
+      "completions/min_terminated_length": 36.0,
+      "epoch": 0.3576,
+      "grad_norm": 3.9111521244049072,
+      "kl": 0.048919677734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0005,
+      "num_tokens": 5914296.0,
+      "reward": 0.2005537748336792,
+      "reward_std": 0.03531679883599281,
+      "rewards/bleu_reward_func/mean": 0.2005537748336792,
+      "rewards/bleu_reward_func/std": 0.1125224232673645,
+      "step": 447
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 491.0,
+      "completions/mean_length": 274.125,
+      "completions/mean_terminated_length": 89.11111450195312,
+      "completions/min_length": 8.0,
+      "completions/min_terminated_length": 8.0,
+      "epoch": 0.3584,
+      "grad_norm": 18.863727569580078,
+      "kl": 0.269134521484375,
+      "learning_rate": 1e-06,
+      "loss": 0.4411,
+      "num_tokens": 5927172.0,
+      "reward": 0.12709318101406097,
+      "reward_std": 0.020968245342373848,
+      "rewards/bleu_reward_func/mean": 0.12709318101406097,
+      "rewards/bleu_reward_func/std": 0.14331206679344177,
+      "step": 448
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 483.0,
+      "completions/mean_length": 378.21875,
+      "completions/mean_terminated_length": 297.95001220703125,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "epoch": 0.3592,
+      "grad_norm": 2.759582996368408,
+      "kl": 0.02587890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0122,
+      "num_tokens": 5944491.0,
+      "reward": 0.04890431463718414,
+      "reward_std": 0.01871412619948387,
+      "rewards/bleu_reward_func/mean": 0.04890431463718414,
+      "rewards/bleu_reward_func/std": 0.05281543731689453,
+      "step": 449
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 482.0,
+      "completions/mean_length": 234.8125,
+      "completions/mean_terminated_length": 216.33334350585938,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "epoch": 0.36,
+      "grad_norm": 5.44106388092041,
+      "kl": 0.08453369140625,
+      "learning_rate": 1e-06,
+      "loss": -0.2079,
+      "num_tokens": 5954413.0,
+      "reward": 0.08892585337162018,
+      "reward_std": 0.05316928029060364,
+      "rewards/bleu_reward_func/mean": 0.08892585337162018,
+      "rewards/bleu_reward_func/std": 0.09096309542655945,
+      "step": 450
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 385.0,
+      "completions/mean_length": 182.59375,
+      "completions/mean_terminated_length": 90.36000061035156,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.3608,
+      "grad_norm": 10.483473777770996,
+      "kl": 0.30169677734375,
+      "learning_rate": 1e-06,
+      "loss": 0.1221,
+      "num_tokens": 5965776.0,
+      "reward": 0.2010711133480072,
+      "reward_std": 0.035105034708976746,
+      "rewards/bleu_reward_func/mean": 0.2010711133480072,
+      "rewards/bleu_reward_func/std": 0.20054543018341064,
+      "step": 451
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 510.0,
+      "completions/mean_length": 289.5,
+      "completions/mean_terminated_length": 266.4827575683594,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.3616,
+      "grad_norm": 8.551454544067383,
+      "kl": 0.21197509765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0288,
+      "num_tokens": 5976704.0,
+      "reward": 0.03945029526948929,
+      "reward_std": 0.011974655091762543,
+      "rewards/bleu_reward_func/mean": 0.03945029526948929,
+      "rewards/bleu_reward_func/std": 0.027504391968250275,
+      "step": 452
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 451.0,
+      "completions/max_terminated_length": 451.0,
+      "completions/mean_length": 241.90625,
+      "completions/mean_terminated_length": 241.90625,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "epoch": 0.3624,
+      "grad_norm": 5.853670597076416,
+      "kl": 0.209869384765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0173,
+      "num_tokens": 5987269.0,
+      "reward": 0.09715719521045685,
+      "reward_std": 0.009554330259561539,
+      "rewards/bleu_reward_func/mean": 0.09715719521045685,
+      "rewards/bleu_reward_func/std": 0.0827893614768982,
+      "step": 453
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 258.6875,
+      "completions/mean_terminated_length": 143.5454559326172,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.3632,
+      "grad_norm": 9.270416259765625,
+      "kl": 0.208038330078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0672,
+      "num_tokens": 6001411.0,
+      "reward": 0.1554635763168335,
+      "reward_std": 0.03311417996883392,
+      "rewards/bleu_reward_func/mean": 0.1554635763168335,
+      "rewards/bleu_reward_func/std": 0.1801016479730606,
+      "step": 454
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 496.0,
+      "completions/max_terminated_length": 496.0,
+      "completions/mean_length": 196.65625,
+      "completions/mean_terminated_length": 196.65625,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.364,
+      "grad_norm": 11.34135913848877,
+      "kl": 0.224945068359375,
+      "learning_rate": 1e-06,
+      "loss": 0.3216,
+      "num_tokens": 6012208.0,
+      "reward": 0.058545198291540146,
+      "reward_std": 0.017396699637174606,
+      "rewards/bleu_reward_func/mean": 0.058545198291540146,
+      "rewards/bleu_reward_func/std": 0.04106508567929268,
+      "step": 455
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 459.0,
+      "completions/max_terminated_length": 459.0,
+      "completions/mean_length": 137.8125,
+      "completions/mean_terminated_length": 137.8125,
+      "completions/min_length": 29.0,
+      "completions/min_terminated_length": 29.0,
+      "epoch": 0.3648,
+      "grad_norm": 5.569085597991943,
+      "kl": 0.173675537109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0886,
+      "num_tokens": 6020026.0,
+      "reward": 0.25735002756118774,
+      "reward_std": 0.08652571588754654,
+      "rewards/bleu_reward_func/mean": 0.25735002756118774,
+      "rewards/bleu_reward_func/std": 0.34091776609420776,
+      "step": 456
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 487.0,
+      "completions/mean_length": 261.0625,
+      "completions/mean_terminated_length": 162.86956787109375,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.3656,
+      "grad_norm": 10.537775993347168,
+      "kl": 0.191162109375,
+      "learning_rate": 1e-06,
+      "loss": 0.1318,
+      "num_tokens": 6031956.0,
+      "reward": 0.12902843952178955,
+      "reward_std": 0.049239080399274826,
+      "rewards/bleu_reward_func/mean": 0.12902843952178955,
+      "rewards/bleu_reward_func/std": 0.1560073047876358,
+      "step": 457
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 499.0,
+      "completions/mean_length": 267.65625,
+      "completions/mean_terminated_length": 156.59091186523438,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "epoch": 0.3664,
+      "grad_norm": 8.385242462158203,
+      "kl": 0.13568115234375,
+      "learning_rate": 1e-06,
+      "loss": 0.1967,
+      "num_tokens": 6048289.0,
+      "reward": 0.09441059827804565,
+      "reward_std": 0.02894745022058487,
+      "rewards/bleu_reward_func/mean": 0.09441059827804565,
+      "rewards/bleu_reward_func/std": 0.07357289642095566,
+      "step": 458
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 144.3125,
+      "completions/mean_terminated_length": 76.22222137451172,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.3672,
+      "grad_norm": 9.128081321716309,
+      "kl": 0.275970458984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0309,
+      "num_tokens": 6060955.0,
+      "reward": 0.23786574602127075,
+      "reward_std": 0.04663696512579918,
+      "rewards/bleu_reward_func/mean": 0.23786574602127075,
+      "rewards/bleu_reward_func/std": 0.15007296204566956,
+      "step": 459
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 293.25,
+      "completions/mean_terminated_length": 262.0,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.368,
+      "grad_norm": 10.163530349731445,
+      "kl": 0.1614837646484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0314,
+      "num_tokens": 6075291.0,
+      "reward": 0.11764833331108093,
+      "reward_std": 0.025302093476057053,
+      "rewards/bleu_reward_func/mean": 0.11764833331108093,
+      "rewards/bleu_reward_func/std": 0.054068438708782196,
+      "step": 460
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 465.0,
+      "completions/mean_length": 160.78125,
+      "completions/mean_terminated_length": 137.36666870117188,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.3688,
+      "grad_norm": 7.89539909362793,
+      "kl": 0.18206787109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0019,
+      "num_tokens": 6083980.0,
+      "reward": 0.0945214033126831,
+      "reward_std": 0.046040039509534836,
+      "rewards/bleu_reward_func/mean": 0.0945214033126831,
+      "rewards/bleu_reward_func/std": 0.08345890045166016,
+      "step": 461
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 493.0,
+      "completions/mean_length": 254.96875,
+      "completions/mean_terminated_length": 246.6774139404297,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.3696,
+      "grad_norm": 6.462737560272217,
+      "kl": 0.130950927734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0046,
+      "num_tokens": 6096555.0,
+      "reward": 0.04283145070075989,
+      "reward_std": 0.010249357670545578,
+      "rewards/bleu_reward_func/mean": 0.04283145070075989,
+      "rewards/bleu_reward_func/std": 0.038907162845134735,
+      "step": 462
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 208.0,
+      "completions/mean_length": 190.875,
+      "completions/mean_terminated_length": 83.83333587646484,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "epoch": 0.3704,
+      "grad_norm": 5.569899559020996,
+      "kl": 0.115325927734375,
+      "learning_rate": 1e-06,
+      "loss": 0.006,
+      "num_tokens": 6105487.0,
+      "reward": 0.13501238822937012,
+      "reward_std": 0.034556735306978226,
+      "rewards/bleu_reward_func/mean": 0.13501238822937012,
+      "rewards/bleu_reward_func/std": 0.09039971977472305,
+      "step": 463
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 454.0,
+      "completions/mean_length": 219.53125,
+      "completions/mean_terminated_length": 137.63999938964844,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.3712,
+      "grad_norm": 12.397187232971191,
+      "kl": 0.133880615234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0829,
+      "num_tokens": 6117800.0,
+      "reward": 0.18308544158935547,
+      "reward_std": 0.06162799149751663,
+      "rewards/bleu_reward_func/mean": 0.18308544158935547,
+      "rewards/bleu_reward_func/std": 0.15996244549751282,
+      "step": 464
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 459.0,
+      "completions/mean_length": 220.65625,
+      "completions/mean_terminated_length": 190.51724243164062,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.372,
+      "grad_norm": 7.391514301300049,
+      "kl": 0.11492919921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0768,
+      "num_tokens": 6128413.0,
+      "reward": 0.05292118340730667,
+      "reward_std": 0.04890108108520508,
+      "rewards/bleu_reward_func/mean": 0.05292118340730667,
+      "rewards/bleu_reward_func/std": 0.07255055755376816,
+      "step": 465
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 488.0,
+      "completions/max_terminated_length": 488.0,
+      "completions/mean_length": 194.15625,
+      "completions/mean_terminated_length": 194.15625,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.3728,
+      "grad_norm": 3.9842867851257324,
+      "kl": 0.050048828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0064,
+      "num_tokens": 6137562.0,
+      "reward": 0.04538443684577942,
+      "reward_std": 0.024577371776103973,
+      "rewards/bleu_reward_func/mean": 0.04538443684577942,
+      "rewards/bleu_reward_func/std": 0.03160402178764343,
+      "step": 466
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 368.0,
+      "completions/mean_terminated_length": 311.6521911621094,
+      "completions/min_length": 73.0,
+      "completions/min_terminated_length": 73.0,
+      "epoch": 0.3736,
+      "grad_norm": 2.496399402618408,
+      "kl": 0.025543212890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0006,
+      "num_tokens": 6152090.0,
+      "reward": 0.062375668436288834,
+      "reward_std": 0.031018512323498726,
+      "rewards/bleu_reward_func/mean": 0.062375668436288834,
+      "rewards/bleu_reward_func/std": 0.06766829639673233,
+      "step": 467
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 503.0,
+      "completions/mean_length": 341.59375,
+      "completions/mean_terminated_length": 302.2692565917969,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.3744,
+      "grad_norm": 4.490657329559326,
+      "kl": 0.042816162109375,
+      "learning_rate": 1e-06,
+      "loss": 0.2011,
+      "num_tokens": 6165917.0,
+      "reward": 0.06601699441671371,
+      "reward_std": 0.028723105788230896,
+      "rewards/bleu_reward_func/mean": 0.06601699441671371,
+      "rewards/bleu_reward_func/std": 0.039854664355516434,
+      "step": 468
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 484.0,
+      "completions/mean_length": 250.21875,
+      "completions/mean_terminated_length": 147.78260803222656,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.3752,
+      "grad_norm": 4.7490010261535645,
+      "kl": 0.1627197265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0409,
+      "num_tokens": 6178940.0,
+      "reward": 0.15887555480003357,
+      "reward_std": 0.018191883340477943,
+      "rewards/bleu_reward_func/mean": 0.15887555480003357,
+      "rewards/bleu_reward_func/std": 0.21522025763988495,
+      "step": 469
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 472.0,
+      "completions/mean_length": 396.125,
+      "completions/mean_terminated_length": 264.8000183105469,
+      "completions/min_length": 83.0,
+      "completions/min_terminated_length": 83.0,
+      "epoch": 0.376,
+      "grad_norm": 3.33166241645813,
+      "kl": 0.046051025390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0611,
+      "num_tokens": 6194232.0,
+      "reward": 0.0860922709107399,
+      "reward_std": 0.04104076325893402,
+      "rewards/bleu_reward_func/mean": 0.0860922709107399,
+      "rewards/bleu_reward_func/std": 0.13754135370254517,
+      "step": 470
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 493.0,
+      "completions/mean_length": 364.6875,
+      "completions/mean_terminated_length": 323.44000244140625,
+      "completions/min_length": 95.0,
+      "completions/min_terminated_length": 95.0,
+      "epoch": 0.3768,
+      "grad_norm": 2.6695375442504883,
+      "kl": 0.038360595703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0899,
+      "num_tokens": 6207750.0,
+      "reward": 0.05763555318117142,
+      "reward_std": 0.022492559626698494,
+      "rewards/bleu_reward_func/mean": 0.05763555318117142,
+      "rewards/bleu_reward_func/std": 0.034512683749198914,
+      "step": 471
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 439.0,
+      "completions/mean_length": 237.40625,
+      "completions/mean_terminated_length": 209.0,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "epoch": 0.3776,
+      "grad_norm": 4.532895565032959,
+      "kl": 0.088165283203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0908,
+      "num_tokens": 6220235.0,
+      "reward": 0.07317312806844711,
+      "reward_std": 0.02968096360564232,
+      "rewards/bleu_reward_func/mean": 0.07317312806844711,
+      "rewards/bleu_reward_func/std": 0.04997172951698303,
+      "step": 472
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 310.25,
+      "completions/mean_terminated_length": 263.69232177734375,
+      "completions/min_length": 27.0,
+      "completions/min_terminated_length": 27.0,
+      "epoch": 0.3784,
+      "grad_norm": 2.9320926666259766,
+      "kl": 0.04736328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0956,
+      "num_tokens": 6233691.0,
+      "reward": 0.07909499108791351,
+      "reward_std": 0.02384771592915058,
+      "rewards/bleu_reward_func/mean": 0.07909499108791351,
+      "rewards/bleu_reward_func/std": 0.08157114684581757,
+      "step": 473
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 486.0,
+      "completions/mean_length": 276.8125,
+      "completions/mean_terminated_length": 233.25926208496094,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.3792,
+      "grad_norm": 5.839748859405518,
+      "kl": 0.14556884765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0466,
+      "num_tokens": 6245669.0,
+      "reward": 0.10992265492677689,
+      "reward_std": 0.027910416945815086,
+      "rewards/bleu_reward_func/mean": 0.10992265492677689,
+      "rewards/bleu_reward_func/std": 0.11659030616283417,
+      "step": 474
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 455.0,
+      "completions/mean_length": 194.09375,
+      "completions/mean_terminated_length": 172.90000915527344,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.38,
+      "grad_norm": 22.791318893432617,
+      "kl": 0.30963134765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0749,
+      "num_tokens": 6255632.0,
+      "reward": 0.14596156775951385,
+      "reward_std": 0.0427117757499218,
+      "rewards/bleu_reward_func/mean": 0.14596156775951385,
+      "rewards/bleu_reward_func/std": 0.06039505451917648,
+      "step": 475
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 474.0,
+      "completions/max_terminated_length": 474.0,
+      "completions/mean_length": 147.03125,
+      "completions/mean_terminated_length": 147.03125,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.3808,
+      "grad_norm": 7.391219615936279,
+      "kl": 0.34600830078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0711,
+      "num_tokens": 6267817.0,
+      "reward": 0.155485600233078,
+      "reward_std": 0.03775210678577423,
+      "rewards/bleu_reward_func/mean": 0.155485600233078,
+      "rewards/bleu_reward_func/std": 0.14854131639003754,
+      "step": 476
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 502.0,
+      "completions/mean_length": 176.09375,
+      "completions/mean_terminated_length": 153.70001220703125,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.3816,
+      "grad_norm": 7.248151779174805,
+      "kl": 0.1455078125,
+      "learning_rate": 1e-06,
+      "loss": -0.1443,
+      "num_tokens": 6276772.0,
+      "reward": 0.08080196380615234,
+      "reward_std": 0.06804326176643372,
+      "rewards/bleu_reward_func/mean": 0.08080196380615234,
+      "rewards/bleu_reward_func/std": 0.11115432530641556,
+      "step": 477
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 230.03125,
+      "completions/mean_terminated_length": 151.0800018310547,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.3824,
+      "grad_norm": 8.359848022460938,
+      "kl": 0.151123046875,
+      "learning_rate": 1e-06,
+      "loss": 0.36,
+      "num_tokens": 6287581.0,
+      "reward": 0.06686853617429733,
+      "reward_std": 0.028161579743027687,
+      "rewards/bleu_reward_func/mean": 0.06686853617429733,
+      "rewards/bleu_reward_func/std": 0.054127294570207596,
+      "step": 478
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 470.0,
+      "completions/max_terminated_length": 470.0,
+      "completions/mean_length": 120.03125,
+      "completions/mean_terminated_length": 120.03125,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.3832,
+      "grad_norm": 7.5159101486206055,
+      "kl": 0.190155029296875,
+      "learning_rate": 1e-06,
+      "loss": 0.1074,
+      "num_tokens": 6297390.0,
+      "reward": 0.19040237367153168,
+      "reward_std": 0.05353376269340515,
+      "rewards/bleu_reward_func/mean": 0.19040237367153168,
+      "rewards/bleu_reward_func/std": 0.17947913706302643,
+      "step": 479
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 451.4375,
+      "completions/mean_terminated_length": 410.0,
+      "completions/min_length": 61.0,
+      "completions/min_terminated_length": 61.0,
+      "epoch": 0.384,
+      "grad_norm": 2.1026315689086914,
+      "kl": 0.0289306640625,
+      "learning_rate": 1e-06,
+      "loss": 0.046,
+      "num_tokens": 6314548.0,
+      "reward": 0.09041387587785721,
+      "reward_std": 0.04015309736132622,
+      "rewards/bleu_reward_func/mean": 0.09041387587785721,
+      "rewards/bleu_reward_func/std": 0.09059884399175644,
+      "step": 480
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 503.0,
+      "completions/mean_length": 319.5625,
+      "completions/mean_terminated_length": 204.10000610351562,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.3848,
+      "grad_norm": 5.199352264404297,
+      "kl": 0.172821044921875,
+      "learning_rate": 1e-06,
+      "loss": -0.062,
+      "num_tokens": 6333886.0,
+      "reward": 0.13319844007492065,
+      "reward_std": 0.03567848354578018,
+      "rewards/bleu_reward_func/mean": 0.13319844007492065,
+      "rewards/bleu_reward_func/std": 0.12437637895345688,
+      "step": 481
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 208.9375,
+      "completions/mean_terminated_length": 152.8148193359375,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.3856,
+      "grad_norm": 6.110198497772217,
+      "kl": 0.2414398193359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0203,
+      "num_tokens": 6346564.0,
+      "reward": 0.19878074526786804,
+      "reward_std": 0.043283406645059586,
+      "rewards/bleu_reward_func/mean": 0.19878074526786804,
+      "rewards/bleu_reward_func/std": 0.1821635365486145,
+      "step": 482
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 502.0,
+      "completions/mean_length": 237.53125,
+      "completions/mean_terminated_length": 186.70370483398438,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.3864,
+      "grad_norm": 8.788106918334961,
+      "kl": 0.17132568359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0653,
+      "num_tokens": 6358741.0,
+      "reward": 0.07478289306163788,
+      "reward_std": 0.019201520830392838,
+      "rewards/bleu_reward_func/mean": 0.07478289306163788,
+      "rewards/bleu_reward_func/std": 0.05620751157402992,
+      "step": 483
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 489.0,
+      "completions/mean_length": 202.4375,
+      "completions/mean_terminated_length": 115.75999450683594,
+      "completions/min_length": 27.0,
+      "completions/min_terminated_length": 27.0,
+      "epoch": 0.3872,
+      "grad_norm": 7.577336311340332,
+      "kl": 0.109588623046875,
+      "learning_rate": 1e-06,
+      "loss": 0.3355,
+      "num_tokens": 6371787.0,
+      "reward": 0.09253311157226562,
+      "reward_std": 0.03513386473059654,
+      "rewards/bleu_reward_func/mean": 0.09253311157226562,
+      "rewards/bleu_reward_func/std": 0.0667162612080574,
+      "step": 484
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 331.625,
+      "completions/mean_terminated_length": 281.1199951171875,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.388,
+      "grad_norm": 5.319460391998291,
+      "kl": 0.0972900390625,
+      "learning_rate": 1e-06,
+      "loss": 0.057,
+      "num_tokens": 6388231.0,
+      "reward": 0.16802164912223816,
+      "reward_std": 0.024459581822156906,
+      "rewards/bleu_reward_func/mean": 0.16802164912223816,
+      "rewards/bleu_reward_func/std": 0.17531749606132507,
+      "step": 485
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 475.0,
+      "completions/mean_length": 222.6875,
+      "completions/mean_terminated_length": 155.92308044433594,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.3888,
+      "grad_norm": 9.174544334411621,
+      "kl": 0.21746826171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0552,
+      "num_tokens": 6399885.0,
+      "reward": 0.20374764502048492,
+      "reward_std": 0.02469576895236969,
+      "rewards/bleu_reward_func/mean": 0.20374764502048492,
+      "rewards/bleu_reward_func/std": 0.17774522304534912,
+      "step": 486
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 465.0,
+      "completions/mean_length": 212.40625,
+      "completions/mean_terminated_length": 112.54167175292969,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.3896,
+      "grad_norm": 8.529189109802246,
+      "kl": 0.412689208984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0442,
+      "num_tokens": 6410746.0,
+      "reward": 0.13253280520439148,
+      "reward_std": 0.03401318937540054,
+      "rewards/bleu_reward_func/mean": 0.13253280520439148,
+      "rewards/bleu_reward_func/std": 0.09572894126176834,
+      "step": 487
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 343.40625,
+      "completions/mean_terminated_length": 277.4347839355469,
+      "completions/min_length": 32.0,
+      "completions/min_terminated_length": 32.0,
+      "epoch": 0.3904,
+      "grad_norm": 5.815334796905518,
+      "kl": 0.06719970703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0079,
+      "num_tokens": 6424639.0,
+      "reward": 0.14998552203178406,
+      "reward_std": 0.03536435216665268,
+      "rewards/bleu_reward_func/mean": 0.14998552203178406,
+      "rewards/bleu_reward_func/std": 0.08015048503875732,
+      "step": 488
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 501.0,
+      "completions/mean_length": 242.21875,
+      "completions/mean_terminated_length": 192.25926208496094,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.3912,
+      "grad_norm": 8.644153594970703,
+      "kl": 0.204193115234375,
+      "learning_rate": 1e-06,
+      "loss": 0.2233,
+      "num_tokens": 6437382.0,
+      "reward": 0.08585190027952194,
+      "reward_std": 0.032436732202768326,
+      "rewards/bleu_reward_func/mean": 0.08585190027952194,
+      "rewards/bleu_reward_func/std": 0.10239724069833755,
+      "step": 489
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 259.625,
+      "completions/mean_terminated_length": 108.20000457763672,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.392,
+      "grad_norm": 18.119718551635742,
+      "kl": 0.39862060546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0372,
+      "num_tokens": 6450634.0,
+      "reward": 0.07857100665569305,
+      "reward_std": 0.010440990328788757,
+      "rewards/bleu_reward_func/mean": 0.07857100665569305,
+      "rewards/bleu_reward_func/std": 0.06719467043876648,
+      "step": 490
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 417.0,
+      "completions/mean_length": 348.28125,
+      "completions/mean_terminated_length": 262.5238037109375,
+      "completions/min_length": 100.0,
+      "completions/min_terminated_length": 100.0,
+      "epoch": 0.3928,
+      "grad_norm": 2.811199903488159,
+      "kl": 0.03350830078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0223,
+      "num_tokens": 6464187.0,
+      "reward": 0.07400047779083252,
+      "reward_std": 0.021461695432662964,
+      "rewards/bleu_reward_func/mean": 0.07400047779083252,
+      "rewards/bleu_reward_func/std": 0.061210907995700836,
+      "step": 491
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 471.0,
+      "completions/mean_length": 225.53125,
+      "completions/mean_terminated_length": 172.48147583007812,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.3936,
+      "grad_norm": 8.102995872497559,
+      "kl": 0.17437744140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0621,
+      "num_tokens": 6477388.0,
+      "reward": 0.08205416798591614,
+      "reward_std": 0.02140321210026741,
+      "rewards/bleu_reward_func/mean": 0.08205416798591614,
+      "rewards/bleu_reward_func/std": 0.06504324823617935,
+      "step": 492
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 484.0,
+      "completions/mean_length": 248.40625,
+      "completions/mean_terminated_length": 239.90321350097656,
+      "completions/min_length": 31.0,
+      "completions/min_terminated_length": 31.0,
+      "epoch": 0.3944,
+      "grad_norm": 5.510415554046631,
+      "kl": 0.054595947265625,
+      "learning_rate": 1e-06,
+      "loss": 0.1424,
+      "num_tokens": 6490681.0,
+      "reward": 0.09917749464511871,
+      "reward_std": 0.03953540325164795,
+      "rewards/bleu_reward_func/mean": 0.09917749464511871,
+      "rewards/bleu_reward_func/std": 0.062214821577072144,
+      "step": 493
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 430.0,
+      "completions/mean_length": 347.65625,
+      "completions/mean_terminated_length": 202.64706420898438,
+      "completions/min_length": 71.0,
+      "completions/min_terminated_length": 71.0,
+      "epoch": 0.3952,
+      "grad_norm": 3.754049301147461,
+      "kl": 0.065643310546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0312,
+      "num_tokens": 6508054.0,
+      "reward": 0.04995376244187355,
+      "reward_std": 0.018671657890081406,
+      "rewards/bleu_reward_func/mean": 0.04995376244187355,
+      "rewards/bleu_reward_func/std": 0.021997425705194473,
+      "step": 494
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 448.0,
+      "completions/mean_length": 304.78125,
+      "completions/mean_terminated_length": 121.94117736816406,
+      "completions/min_length": 31.0,
+      "completions/min_terminated_length": 31.0,
+      "epoch": 0.396,
+      "grad_norm": 3.28092098236084,
+      "kl": 0.0880889892578125,
+      "learning_rate": 1e-06,
+      "loss": 0.2271,
+      "num_tokens": 6528167.0,
+      "reward": 0.21464568376541138,
+      "reward_std": 0.04326138645410538,
+      "rewards/bleu_reward_func/mean": 0.21464568376541138,
+      "rewards/bleu_reward_func/std": 0.2538887560367584,
+      "step": 495
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 493.0,
+      "completions/mean_length": 260.0,
+      "completions/mean_terminated_length": 161.3913116455078,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.3968,
+      "grad_norm": 7.667696475982666,
+      "kl": 0.27783203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0064,
+      "num_tokens": 6539503.0,
+      "reward": 0.14023897051811218,
+      "reward_std": 0.03843347355723381,
+      "rewards/bleu_reward_func/mean": 0.14023897051811218,
+      "rewards/bleu_reward_func/std": 0.12260077148675919,
+      "step": 496
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 312.78125,
+      "completions/mean_terminated_length": 275.8888854980469,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.3976,
+      "grad_norm": 5.650123596191406,
+      "kl": 0.122955322265625,
+      "learning_rate": 1e-06,
+      "loss": -0.218,
+      "num_tokens": 6556880.0,
+      "reward": 0.2068222463130951,
+      "reward_std": 0.08186712116003036,
+      "rewards/bleu_reward_func/mean": 0.2068222463130951,
+      "rewards/bleu_reward_func/std": 0.30478134751319885,
+      "step": 497
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 482.0,
+      "completions/max_terminated_length": 482.0,
+      "completions/mean_length": 115.5,
+      "completions/mean_terminated_length": 115.5,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.3984,
+      "grad_norm": 9.522087097167969,
+      "kl": 0.31427001953125,
+      "learning_rate": 1e-06,
+      "loss": 0.3453,
+      "num_tokens": 6564648.0,
+      "reward": 0.21922443807125092,
+      "reward_std": 0.07997345924377441,
+      "rewards/bleu_reward_func/mean": 0.21922443807125092,
+      "rewards/bleu_reward_func/std": 0.12106078118085861,
+      "step": 498
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 510.0,
+      "completions/mean_length": 283.46875,
+      "completions/mean_terminated_length": 194.04348754882812,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.3992,
+      "grad_norm": 4.472853183746338,
+      "kl": 0.10198974609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0606,
+      "num_tokens": 6577575.0,
+      "reward": 0.1807648241519928,
+      "reward_std": 0.04940491169691086,
+      "rewards/bleu_reward_func/mean": 0.1807648241519928,
+      "rewards/bleu_reward_func/std": 0.2276194989681244,
+      "step": 499
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 500.0,
+      "completions/mean_length": 257.6875,
+      "completions/mean_terminated_length": 210.59259033203125,
+      "completions/min_length": 34.0,
+      "completions/min_terminated_length": 34.0,
+      "epoch": 0.4,
+      "grad_norm": 3.429314136505127,
+      "kl": 0.120086669921875,
+      "learning_rate": 1e-06,
+      "loss": 0.1092,
+      "num_tokens": 6590341.0,
+      "reward": 0.13892096281051636,
+      "reward_std": 0.04246610775589943,
+      "rewards/bleu_reward_func/mean": 0.13892096281051636,
+      "rewards/bleu_reward_func/std": 0.12665794789791107,
+      "step": 500
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 422.0,
+      "completions/mean_length": 150.78125,
+      "completions/mean_terminated_length": 126.70000457763672,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.4008,
+      "grad_norm": 6.932479381561279,
+      "kl": 0.4072265625,
+      "learning_rate": 1e-06,
+      "loss": 0.1306,
+      "num_tokens": 6604182.0,
+      "reward": 0.13375571370124817,
+      "reward_std": 0.05735353007912636,
+      "rewards/bleu_reward_func/mean": 0.13375571370124817,
+      "rewards/bleu_reward_func/std": 0.14047691226005554,
+      "step": 501
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 483.0,
+      "completions/mean_length": 329.375,
+      "completions/mean_terminated_length": 257.9130554199219,
+      "completions/min_length": 27.0,
+      "completions/min_terminated_length": 27.0,
+      "epoch": 0.4016,
+      "grad_norm": 3.9977669715881348,
+      "kl": 0.0543212890625,
+      "learning_rate": 1e-06,
+      "loss": 0.1227,
+      "num_tokens": 6619994.0,
+      "reward": 0.08314976096153259,
+      "reward_std": 0.01850474253296852,
+      "rewards/bleu_reward_func/mean": 0.08314976096153259,
+      "rewards/bleu_reward_func/std": 0.03126469627022743,
+      "step": 502
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 501.0,
+      "completions/mean_length": 328.875,
+      "completions/mean_terminated_length": 245.63636779785156,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.4024,
+      "grad_norm": 8.637741088867188,
+      "kl": 0.294342041015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0294,
+      "num_tokens": 6632862.0,
+      "reward": 0.21461226046085358,
+      "reward_std": 0.05726875364780426,
+      "rewards/bleu_reward_func/mean": 0.21461226046085358,
+      "rewards/bleu_reward_func/std": 0.19377335906028748,
+      "step": 503
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 495.0,
+      "completions/mean_length": 345.71875,
+      "completions/mean_terminated_length": 245.9499969482422,
+      "completions/min_length": 30.0,
+      "completions/min_terminated_length": 30.0,
+      "epoch": 0.4032,
+      "grad_norm": 5.415818691253662,
+      "kl": 0.11663818359375,
+      "learning_rate": 1e-06,
+      "loss": -0.1008,
+      "num_tokens": 6649573.0,
+      "reward": 0.10018286108970642,
+      "reward_std": 0.025530360639095306,
+      "rewards/bleu_reward_func/mean": 0.10018286108970642,
+      "rewards/bleu_reward_func/std": 0.08217810094356537,
+      "step": 504
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 293.21875,
+      "completions/mean_terminated_length": 123.05555725097656,
+      "completions/min_length": 27.0,
+      "completions/min_terminated_length": 27.0,
+      "epoch": 0.404,
+      "grad_norm": 5.128636837005615,
+      "kl": 0.0567626953125,
+      "learning_rate": 1e-06,
+      "loss": 0.1747,
+      "num_tokens": 6661556.0,
+      "reward": 0.08193753659725189,
+      "reward_std": 0.036860473453998566,
+      "rewards/bleu_reward_func/mean": 0.08193753659725189,
+      "rewards/bleu_reward_func/std": 0.0639234408736229,
+      "step": 505
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 499.0,
+      "completions/mean_length": 383.1875,
+      "completions/mean_terminated_length": 254.375,
+      "completions/min_length": 77.0,
+      "completions/min_terminated_length": 77.0,
+      "epoch": 0.4048,
+      "grad_norm": 2.5557072162628174,
+      "kl": 0.03839111328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0743,
+      "num_tokens": 6678938.0,
+      "reward": 0.05591622740030289,
+      "reward_std": 0.017734069377183914,
+      "rewards/bleu_reward_func/mean": 0.05591622740030289,
+      "rewards/bleu_reward_func/std": 0.04607876017689705,
+      "step": 506
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 271.0,
+      "completions/mean_length": 293.625,
+      "completions/mean_terminated_length": 75.25,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.4056,
+      "grad_norm": 5.220037460327148,
+      "kl": 0.07464599609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0253,
+      "num_tokens": 6691822.0,
+      "reward": 0.029562367126345634,
+      "reward_std": 0.03146641328930855,
+      "rewards/bleu_reward_func/mean": 0.029562367126345634,
+      "rewards/bleu_reward_func/std": 0.04593721404671669,
+      "step": 507
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 453.96875,
+      "completions/mean_terminated_length": 343.18182373046875,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.4064,
+      "grad_norm": 2.189230442047119,
+      "kl": 0.026153564453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0992,
+      "num_tokens": 6709997.0,
+      "reward": 0.03725602477788925,
+      "reward_std": 0.02092660963535309,
+      "rewards/bleu_reward_func/mean": 0.03725602477788925,
+      "rewards/bleu_reward_func/std": 0.02429044619202614,
+      "step": 508
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 290.0,
+      "completions/mean_length": 106.34375,
+      "completions/mean_terminated_length": 79.30000305175781,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.4072,
+      "grad_norm": 6.567111015319824,
+      "kl": 0.233154296875,
+      "learning_rate": 1e-06,
+      "loss": 0.2905,
+      "num_tokens": 6718912.0,
+      "reward": 0.15163123607635498,
+      "reward_std": 0.039707012474536896,
+      "rewards/bleu_reward_func/mean": 0.15163123607635498,
+      "rewards/bleu_reward_func/std": 0.12998701632022858,
+      "step": 509
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 411.0,
+      "completions/mean_length": 119.03125,
+      "completions/mean_terminated_length": 106.3548355102539,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.408,
+      "grad_norm": 8.991604804992676,
+      "kl": 0.14703369140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0972,
+      "num_tokens": 6728609.0,
+      "reward": 0.23723718523979187,
+      "reward_std": 0.07665139436721802,
+      "rewards/bleu_reward_func/mean": 0.23723718523979187,
+      "rewards/bleu_reward_func/std": 0.27060666680336,
+      "step": 510
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 500.0,
+      "completions/mean_length": 288.375,
+      "completions/mean_terminated_length": 171.23809814453125,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.4088,
+      "grad_norm": 6.349617958068848,
+      "kl": 0.168121337890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0688,
+      "num_tokens": 6743789.0,
+      "reward": 0.1937231868505478,
+      "reward_std": 0.13082939386367798,
+      "rewards/bleu_reward_func/mean": 0.1937231868505478,
+      "rewards/bleu_reward_func/std": 0.25435397028923035,
+      "step": 511
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 486.0,
+      "completions/mean_length": 401.96875,
+      "completions/mean_terminated_length": 291.9375,
+      "completions/min_length": 56.0,
+      "completions/min_terminated_length": 56.0,
+      "epoch": 0.4096,
+      "grad_norm": 2.5390427112579346,
+      "kl": 0.03851318359375,
+      "learning_rate": 1e-06,
+      "loss": 0.1042,
+      "num_tokens": 6759732.0,
+      "reward": 0.029224077239632607,
+      "reward_std": 0.016936711966991425,
+      "rewards/bleu_reward_func/mean": 0.029224077239632607,
+      "rewards/bleu_reward_func/std": 0.022709792479872704,
+      "step": 512
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 451.0,
+      "completions/mean_length": 268.46875,
+      "completions/mean_terminated_length": 79.05555725097656,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "epoch": 0.4104,
+      "grad_norm": 3.7983713150024414,
+      "kl": 0.137786865234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0173,
+      "num_tokens": 6774051.0,
+      "reward": 0.20052862167358398,
+      "reward_std": 0.028155002743005753,
+      "rewards/bleu_reward_func/mean": 0.20052862167358398,
+      "rewards/bleu_reward_func/std": 0.2302575409412384,
+      "step": 513
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 388.71875,
+      "completions/mean_terminated_length": 360.2692565917969,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 0.4112,
+      "grad_norm": 2.4446346759796143,
+      "kl": 0.028076171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0783,
+      "num_tokens": 6790610.0,
+      "reward": 0.10578086227178574,
+      "reward_std": 0.029093941673636436,
+      "rewards/bleu_reward_func/mean": 0.10578086227178574,
+      "rewards/bleu_reward_func/std": 0.08641202747821808,
+      "step": 514
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 483.0,
+      "completions/mean_length": 275.34375,
+      "completions/mean_terminated_length": 259.5666809082031,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.412,
+      "grad_norm": 5.883263111114502,
+      "kl": 0.18634033203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0231,
+      "num_tokens": 6803965.0,
+      "reward": 0.1322258561849594,
+      "reward_std": 0.030806170776486397,
+      "rewards/bleu_reward_func/mean": 0.1322258561849594,
+      "rewards/bleu_reward_func/std": 0.16078709065914154,
+      "step": 515
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 484.0,
+      "completions/mean_length": 333.84375,
+      "completions/mean_terminated_length": 274.4583435058594,
+      "completions/min_length": 76.0,
+      "completions/min_terminated_length": 76.0,
+      "epoch": 0.4128,
+      "grad_norm": 3.016139507293701,
+      "kl": 0.03173828125,
+      "learning_rate": 1e-06,
+      "loss": 0.025,
+      "num_tokens": 6818840.0,
+      "reward": 0.09323176741600037,
+      "reward_std": 0.05342460051178932,
+      "rewards/bleu_reward_func/mean": 0.09323176741600037,
+      "rewards/bleu_reward_func/std": 0.06577997654676437,
+      "step": 516
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 442.0,
+      "completions/mean_length": 276.5625,
+      "completions/mean_terminated_length": 210.63999938964844,
+      "completions/min_length": 72.0,
+      "completions/min_terminated_length": 72.0,
+      "epoch": 0.4136,
+      "grad_norm": 4.685121059417725,
+      "kl": 0.050933837890625,
+      "learning_rate": 1e-06,
+      "loss": -0.1784,
+      "num_tokens": 6830770.0,
+      "reward": 0.03872024267911911,
+      "reward_std": 0.016178004443645477,
+      "rewards/bleu_reward_func/mean": 0.03872024267911911,
+      "rewards/bleu_reward_func/std": 0.025313377380371094,
+      "step": 517
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 496.0,
+      "completions/mean_length": 362.875,
+      "completions/mean_terminated_length": 231.2941131591797,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "epoch": 0.4144,
+      "grad_norm": 4.639893054962158,
+      "kl": 0.122711181640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0545,
+      "num_tokens": 6848302.0,
+      "reward": 0.07996964454650879,
+      "reward_std": 0.01709877885878086,
+      "rewards/bleu_reward_func/mean": 0.07996964454650879,
+      "rewards/bleu_reward_func/std": 0.10056579113006592,
+      "step": 518
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 500.0,
+      "completions/mean_length": 294.625,
+      "completions/mean_terminated_length": 263.5714416503906,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.4152,
+      "grad_norm": 9.085565567016602,
+      "kl": 0.1670379638671875,
+      "learning_rate": 1e-06,
+      "loss": -0.1226,
+      "num_tokens": 6859826.0,
+      "reward": 0.10505213588476181,
+      "reward_std": 0.05224030464887619,
+      "rewards/bleu_reward_func/mean": 0.10505213588476181,
+      "rewards/bleu_reward_func/std": 0.0725407749414444,
+      "step": 519
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 497.0,
+      "completions/mean_length": 258.3125,
+      "completions/mean_terminated_length": 173.75,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "epoch": 0.416,
+      "grad_norm": 7.134840965270996,
+      "kl": 0.175750732421875,
+      "learning_rate": 1e-06,
+      "loss": 0.1168,
+      "num_tokens": 6873516.0,
+      "reward": 0.21853026747703552,
+      "reward_std": 0.06429094821214676,
+      "rewards/bleu_reward_func/mean": 0.21853026747703552,
+      "rewards/bleu_reward_func/std": 0.14174966514110565,
+      "step": 520
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 477.0,
+      "completions/mean_length": 328.65625,
+      "completions/mean_terminated_length": 256.9130554199219,
+      "completions/min_length": 28.0,
+      "completions/min_terminated_length": 28.0,
+      "epoch": 0.4168,
+      "grad_norm": 6.0497517585754395,
+      "kl": 0.131439208984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0008,
+      "num_tokens": 6887761.0,
+      "reward": 0.0685054138302803,
+      "reward_std": 0.012891553342342377,
+      "rewards/bleu_reward_func/mean": 0.0685054138302803,
+      "rewards/bleu_reward_func/std": 0.057060711085796356,
+      "step": 521
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 491.0,
+      "completions/mean_length": 267.0625,
+      "completions/mean_terminated_length": 221.70370483398438,
+      "completions/min_length": 38.0,
+      "completions/min_terminated_length": 38.0,
+      "epoch": 0.4176,
+      "grad_norm": 4.475714683532715,
+      "kl": 0.047637939453125,
+      "learning_rate": 1e-06,
+      "loss": 0.1563,
+      "num_tokens": 6901251.0,
+      "reward": 0.18483126163482666,
+      "reward_std": 0.02913127839565277,
+      "rewards/bleu_reward_func/mean": 0.18483126163482666,
+      "rewards/bleu_reward_func/std": 0.16543246805667877,
+      "step": 522
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 228.53125,
+      "completions/mean_terminated_length": 163.11538696289062,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "epoch": 0.4184,
+      "grad_norm": 7.8729143142700195,
+      "kl": 0.297882080078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0428,
+      "num_tokens": 6912844.0,
+      "reward": 0.1846814900636673,
+      "reward_std": 0.10159599035978317,
+      "rewards/bleu_reward_func/mean": 0.1846814900636673,
+      "rewards/bleu_reward_func/std": 0.2030598670244217,
+      "step": 523
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 456.0,
+      "completions/mean_length": 251.09375,
+      "completions/mean_terminated_length": 233.70001220703125,
+      "completions/min_length": 31.0,
+      "completions/min_terminated_length": 31.0,
+      "epoch": 0.4192,
+      "grad_norm": 4.603794097900391,
+      "kl": 0.08197021484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0057,
+      "num_tokens": 6923895.0,
+      "reward": 0.11323156207799911,
+      "reward_std": 0.03932211175560951,
+      "rewards/bleu_reward_func/mean": 0.11323156207799911,
+      "rewards/bleu_reward_func/std": 0.08274988830089569,
+      "step": 524
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 288.0,
+      "completions/max_terminated_length": 288.0,
+      "completions/mean_length": 76.625,
+      "completions/mean_terminated_length": 76.625,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "epoch": 0.42,
+      "grad_norm": 8.708818435668945,
+      "kl": 0.1953125,
+      "learning_rate": 1e-06,
+      "loss": 0.3306,
+      "num_tokens": 6934091.0,
+      "reward": 0.18468719720840454,
+      "reward_std": 0.0689420998096466,
+      "rewards/bleu_reward_func/mean": 0.18468719720840454,
+      "rewards/bleu_reward_func/std": 0.12529541552066803,
+      "step": 525
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 481.0,
+      "completions/mean_length": 256.5,
+      "completions/mean_terminated_length": 220.00001525878906,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.4208,
+      "grad_norm": 15.188727378845215,
+      "kl": 0.10748291015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0496,
+      "num_tokens": 6946923.0,
+      "reward": 0.09780866652727127,
+      "reward_std": 0.029562484472990036,
+      "rewards/bleu_reward_func/mean": 0.09780866652727127,
+      "rewards/bleu_reward_func/std": 0.09735672175884247,
+      "step": 526
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 483.0,
+      "completions/mean_length": 246.75,
+      "completions/mean_terminated_length": 107.80952453613281,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.4216,
+      "grad_norm": 5.919389247894287,
+      "kl": 0.1287841796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0944,
+      "num_tokens": 6958675.0,
+      "reward": 0.049182113260030746,
+      "reward_std": 0.03928225487470627,
+      "rewards/bleu_reward_func/mean": 0.049182113260030746,
+      "rewards/bleu_reward_func/std": 0.05703483149409294,
+      "step": 527
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 488.0,
+      "completions/mean_length": 298.59375,
+      "completions/mean_terminated_length": 268.1071472167969,
+      "completions/min_length": 38.0,
+      "completions/min_terminated_length": 38.0,
+      "epoch": 0.4224,
+      "grad_norm": 4.162198066711426,
+      "kl": 0.048370361328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0255,
+      "num_tokens": 6973038.0,
+      "reward": 0.19552364945411682,
+      "reward_std": 0.05411393195390701,
+      "rewards/bleu_reward_func/mean": 0.19552364945411682,
+      "rewards/bleu_reward_func/std": 0.11564817279577255,
+      "step": 528
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 343.125,
+      "completions/mean_terminated_length": 266.3636474609375,
+      "completions/min_length": 33.0,
+      "completions/min_terminated_length": 33.0,
+      "epoch": 0.4232,
+      "grad_norm": 7.422494411468506,
+      "kl": 0.06976318359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0526,
+      "num_tokens": 6988034.0,
+      "reward": 0.03407738357782364,
+      "reward_std": 0.010626979172229767,
+      "rewards/bleu_reward_func/mean": 0.03407738357782364,
+      "rewards/bleu_reward_func/std": 0.027887288480997086,
+      "step": 529
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 249.0,
+      "completions/max_terminated_length": 249.0,
+      "completions/mean_length": 53.53125,
+      "completions/mean_terminated_length": 53.53125,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.424,
+      "grad_norm": 13.498769760131836,
+      "kl": 0.46209716796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0087,
+      "num_tokens": 6995419.0,
+      "reward": 0.24595381319522858,
+      "reward_std": 0.09870806336402893,
+      "rewards/bleu_reward_func/mean": 0.24595381319522858,
+      "rewards/bleu_reward_func/std": 0.1663571149110794,
+      "step": 530
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 472.0,
+      "completions/mean_length": 392.5,
+      "completions/mean_terminated_length": 287.058837890625,
+      "completions/min_length": 46.0,
+      "completions/min_terminated_length": 46.0,
+      "epoch": 0.4248,
+      "grad_norm": 2.2019829750061035,
+      "kl": 0.0286407470703125,
+      "learning_rate": 1e-06,
+      "loss": -0.2369,
+      "num_tokens": 7014707.0,
+      "reward": 0.12730640172958374,
+      "reward_std": 0.03398028016090393,
+      "rewards/bleu_reward_func/mean": 0.12730640172958374,
+      "rewards/bleu_reward_func/std": 0.20578297972679138,
+      "step": 531
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 499.0,
+      "completions/mean_length": 317.15625,
+      "completions/mean_terminated_length": 262.6000061035156,
+      "completions/min_length": 35.0,
+      "completions/min_terminated_length": 35.0,
+      "epoch": 0.4256,
+      "grad_norm": 5.298976421356201,
+      "kl": 0.087127685546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0481,
+      "num_tokens": 7030000.0,
+      "reward": 0.06116287037730217,
+      "reward_std": 0.04584234952926636,
+      "rewards/bleu_reward_func/mean": 0.06116287037730217,
+      "rewards/bleu_reward_func/std": 0.07913482189178467,
+      "step": 532
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 503.0,
+      "completions/mean_length": 321.34375,
+      "completions/mean_terminated_length": 234.68182373046875,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.4264,
+      "grad_norm": 4.799532413482666,
+      "kl": 0.1138916015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0807,
+      "num_tokens": 7046643.0,
+      "reward": 0.08829933404922485,
+      "reward_std": 0.03609791770577431,
+      "rewards/bleu_reward_func/mean": 0.08829933404922485,
+      "rewards/bleu_reward_func/std": 0.10983619093894958,
+      "step": 533
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 466.0,
+      "completions/mean_length": 196.84375,
+      "completions/mean_terminated_length": 151.82144165039062,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.4272,
+      "grad_norm": 15.142457008361816,
+      "kl": 0.25970458984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0027,
+      "num_tokens": 7057718.0,
+      "reward": 0.10968612134456635,
+      "reward_std": 0.05676144361495972,
+      "rewards/bleu_reward_func/mean": 0.10968612134456635,
+      "rewards/bleu_reward_func/std": 0.1397821009159088,
+      "step": 534
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 391.0,
+      "completions/mean_length": 244.9375,
+      "completions/mean_terminated_length": 195.48147583007812,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.428,
+      "grad_norm": 6.9789934158325195,
+      "kl": 0.092926025390625,
+      "learning_rate": 1e-06,
+      "loss": 0.1057,
+      "num_tokens": 7073980.0,
+      "reward": 0.19463014602661133,
+      "reward_std": 0.09179598838090897,
+      "rewards/bleu_reward_func/mean": 0.19463014602661133,
+      "rewards/bleu_reward_func/std": 0.1903815120458603,
+      "step": 535
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 146.5,
+      "completions/mean_terminated_length": 122.13333892822266,
+      "completions/min_length": 8.0,
+      "completions/min_terminated_length": 8.0,
+      "epoch": 0.4288,
+      "grad_norm": 10.111763000488281,
+      "kl": 0.143310546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0902,
+      "num_tokens": 7085228.0,
+      "reward": 0.15931251645088196,
+      "reward_std": 0.06651220470666885,
+      "rewards/bleu_reward_func/mean": 0.15931251645088196,
+      "rewards/bleu_reward_func/std": 0.10370245575904846,
+      "step": 536
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 245.09375,
+      "completions/mean_terminated_length": 183.5,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.4296,
+      "grad_norm": 11.313093185424805,
+      "kl": 0.135894775390625,
+      "learning_rate": 1e-06,
+      "loss": 0.2524,
+      "num_tokens": 7098583.0,
+      "reward": 0.08501166105270386,
+      "reward_std": 0.03819301724433899,
+      "rewards/bleu_reward_func/mean": 0.08501166105270386,
+      "rewards/bleu_reward_func/std": 0.0931810513138771,
+      "step": 537
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 502.0,
+      "completions/mean_length": 261.5,
+      "completions/mean_terminated_length": 244.80001831054688,
+      "completions/min_length": 38.0,
+      "completions/min_terminated_length": 38.0,
+      "epoch": 0.4304,
+      "grad_norm": 4.153919696807861,
+      "kl": 0.050628662109375,
+      "learning_rate": 1e-06,
+      "loss": -0.1541,
+      "num_tokens": 7108663.0,
+      "reward": 0.06835095584392548,
+      "reward_std": 0.042577650398015976,
+      "rewards/bleu_reward_func/mean": 0.06835095584392548,
+      "rewards/bleu_reward_func/std": 0.05704295262694359,
+      "step": 538
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 477.0,
+      "completions/mean_length": 226.375,
+      "completions/mean_terminated_length": 173.48147583007812,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.4312,
+      "grad_norm": 13.284401893615723,
+      "kl": 0.14434814453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0842,
+      "num_tokens": 7118427.0,
+      "reward": 0.08002069592475891,
+      "reward_std": 0.029213791713118553,
+      "rewards/bleu_reward_func/mean": 0.08002069592475891,
+      "rewards/bleu_reward_func/std": 0.03687189891934395,
+      "step": 539
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 391.0,
+      "completions/mean_length": 123.65625,
+      "completions/mean_terminated_length": 83.48275756835938,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "epoch": 0.432,
+      "grad_norm": 23.51561164855957,
+      "kl": 0.1839599609375,
+      "learning_rate": 1e-06,
+      "loss": 0.4993,
+      "num_tokens": 7128856.0,
+      "reward": 0.2179010808467865,
+      "reward_std": 0.08272600173950195,
+      "rewards/bleu_reward_func/mean": 0.2179010808467865,
+      "rewards/bleu_reward_func/std": 0.26301127672195435,
+      "step": 540
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 452.0,
+      "completions/mean_length": 332.40625,
+      "completions/mean_terminated_length": 173.94117736816406,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "epoch": 0.4328,
+      "grad_norm": 10.934617042541504,
+      "kl": 0.10113525390625,
+      "learning_rate": 1e-06,
+      "loss": -0.1254,
+      "num_tokens": 7141661.0,
+      "reward": 0.06413869559764862,
+      "reward_std": 0.05120678246021271,
+      "rewards/bleu_reward_func/mean": 0.06413869559764862,
+      "rewards/bleu_reward_func/std": 0.09179537743330002,
+      "step": 541
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 482.0,
+      "completions/mean_length": 294.375,
+      "completions/mean_terminated_length": 221.83334350585938,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.4336,
+      "grad_norm": 3.505484104156494,
+      "kl": 0.06707763671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0835,
+      "num_tokens": 7153161.0,
+      "reward": 0.09516981989145279,
+      "reward_std": 0.044140610843896866,
+      "rewards/bleu_reward_func/mean": 0.09516981989145279,
+      "rewards/bleu_reward_func/std": 0.049775656312704086,
+      "step": 542
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 478.0,
+      "completions/mean_length": 326.25,
+      "completions/mean_terminated_length": 214.8000030517578,
+      "completions/min_length": 30.0,
+      "completions/min_terminated_length": 30.0,
+      "epoch": 0.4344,
+      "grad_norm": 7.354869842529297,
+      "kl": 0.10260009765625,
+      "learning_rate": 1e-06,
+      "loss": -0.1239,
+      "num_tokens": 7167649.0,
+      "reward": 0.03533574938774109,
+      "reward_std": 0.014214935712516308,
+      "rewards/bleu_reward_func/mean": 0.03533574938774109,
+      "rewards/bleu_reward_func/std": 0.027195338159799576,
+      "step": 543
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 501.0,
+      "completions/mean_length": 225.78125,
+      "completions/mean_terminated_length": 159.73077392578125,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.4352,
+      "grad_norm": 5.206735610961914,
+      "kl": 0.09149169921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0428,
+      "num_tokens": 7181226.0,
+      "reward": 0.22954684495925903,
+      "reward_std": 0.06006891652941704,
+      "rewards/bleu_reward_func/mean": 0.22954684495925903,
+      "rewards/bleu_reward_func/std": 0.11863149702548981,
+      "step": 544
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 499.0,
+      "completions/mean_length": 250.75,
+      "completions/mean_terminated_length": 190.4615478515625,
+      "completions/min_length": 32.0,
+      "completions/min_terminated_length": 32.0,
+      "epoch": 0.436,
+      "grad_norm": 5.510367393493652,
+      "kl": 0.05084228515625,
+      "learning_rate": 1e-06,
+      "loss": 0.1971,
+      "num_tokens": 7191810.0,
+      "reward": 0.08453569561243057,
+      "reward_std": 0.050511520355939865,
+      "rewards/bleu_reward_func/mean": 0.08453569561243057,
+      "rewards/bleu_reward_func/std": 0.07364515960216522,
+      "step": 545
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 388.34375,
+      "completions/mean_terminated_length": 314.1499938964844,
+      "completions/min_length": 34.0,
+      "completions/min_terminated_length": 34.0,
+      "epoch": 0.4368,
+      "grad_norm": 3.61635160446167,
+      "kl": 0.045135498046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0817,
+      "num_tokens": 7206789.0,
+      "reward": 0.050152119249105453,
+      "reward_std": 0.03165213763713837,
+      "rewards/bleu_reward_func/mean": 0.050152119249105453,
+      "rewards/bleu_reward_func/std": 0.05620579421520233,
+      "step": 546
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 160.0,
+      "completions/max_terminated_length": 160.0,
+      "completions/mean_length": 57.5,
+      "completions/mean_terminated_length": 57.5,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.4376,
+      "grad_norm": 8.828208923339844,
+      "kl": 0.25537109375,
+      "learning_rate": 1e-06,
+      "loss": 0.103,
+      "num_tokens": 7213949.0,
+      "reward": 0.20786888897418976,
+      "reward_std": 0.06727642565965652,
+      "rewards/bleu_reward_func/mean": 0.20786888897418976,
+      "rewards/bleu_reward_func/std": 0.1706974357366562,
+      "step": 547
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 275.125,
+      "completions/mean_terminated_length": 196.1666717529297,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.4384,
+      "grad_norm": 13.268147468566895,
+      "kl": 0.058074951171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0694,
+      "num_tokens": 7227041.0,
+      "reward": 0.05118046700954437,
+      "reward_std": 0.02497515268623829,
+      "rewards/bleu_reward_func/mean": 0.05118046700954437,
+      "rewards/bleu_reward_func/std": 0.035916514694690704,
+      "step": 548
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 487.0,
+      "completions/mean_length": 351.3125,
+      "completions/mean_terminated_length": 278.2727355957031,
+      "completions/min_length": 56.0,
+      "completions/min_terminated_length": 56.0,
+      "epoch": 0.4392,
+      "grad_norm": 13.135753631591797,
+      "kl": 0.077850341796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0555,
+      "num_tokens": 7244763.0,
+      "reward": 0.07840518653392792,
+      "reward_std": 0.022635504603385925,
+      "rewards/bleu_reward_func/mean": 0.07840518653392792,
+      "rewards/bleu_reward_func/std": 0.06580173969268799,
+      "step": 549
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 497.0,
+      "completions/mean_length": 201.1875,
+      "completions/mean_terminated_length": 156.7857208251953,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.44,
+      "grad_norm": 7.055432319641113,
+      "kl": 0.25830078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0546,
+      "num_tokens": 7256785.0,
+      "reward": 0.253431499004364,
+      "reward_std": 0.028121720999479294,
+      "rewards/bleu_reward_func/mean": 0.253431499004364,
+      "rewards/bleu_reward_func/std": 0.20365522801876068,
+      "step": 550
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 474.0,
+      "completions/mean_length": 336.8125,
+      "completions/mean_terminated_length": 287.7599792480469,
+      "completions/min_length": 110.0,
+      "completions/min_terminated_length": 110.0,
+      "epoch": 0.4408,
+      "grad_norm": 3.6197187900543213,
+      "kl": 0.046844482421875,
+      "learning_rate": 1e-06,
+      "loss": 0.1119,
+      "num_tokens": 7268987.0,
+      "reward": 0.061213478446006775,
+      "reward_std": 0.01489005982875824,
+      "rewards/bleu_reward_func/mean": 0.061213478446006775,
+      "rewards/bleu_reward_func/std": 0.038935884833335876,
+      "step": 551
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 386.21875,
+      "completions/mean_terminated_length": 351.0,
+      "completions/min_length": 3.0,
+      "completions/min_terminated_length": 3.0,
+      "epoch": 0.4416,
+      "grad_norm": 2.6066031455993652,
+      "kl": 0.034149169921875,
+      "learning_rate": 1e-06,
+      "loss": -0.1341,
+      "num_tokens": 7287306.0,
+      "reward": 0.11066319048404694,
+      "reward_std": 0.105903759598732,
+      "rewards/bleu_reward_func/mean": 0.11066319048404694,
+      "rewards/bleu_reward_func/std": 0.16723419725894928,
+      "step": 552
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 476.0,
+      "completions/mean_length": 263.28125,
+      "completions/mean_terminated_length": 237.55172729492188,
+      "completions/min_length": 8.0,
+      "completions/min_terminated_length": 8.0,
+      "epoch": 0.4424,
+      "grad_norm": 9.281195640563965,
+      "kl": 0.226043701171875,
+      "learning_rate": 1e-06,
+      "loss": -0.1571,
+      "num_tokens": 7299899.0,
+      "reward": 0.06672249734401703,
+      "reward_std": 0.03525693714618683,
+      "rewards/bleu_reward_func/mean": 0.06672249734401703,
+      "rewards/bleu_reward_func/std": 0.0810592845082283,
+      "step": 553
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 489.0,
+      "completions/mean_length": 233.84375,
+      "completions/mean_terminated_length": 182.3333282470703,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.4432,
+      "grad_norm": 7.543862342834473,
+      "kl": 0.308807373046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0082,
+      "num_tokens": 7313638.0,
+      "reward": 0.30825120210647583,
+      "reward_std": 0.07663644850254059,
+      "rewards/bleu_reward_func/mean": 0.30825120210647583,
+      "rewards/bleu_reward_func/std": 0.1689450740814209,
+      "step": 554
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 474.0,
+      "completions/mean_length": 246.40625,
+      "completions/mean_terminated_length": 218.9310302734375,
+      "completions/min_length": 35.0,
+      "completions/min_terminated_length": 35.0,
+      "epoch": 0.444,
+      "grad_norm": 4.433272361755371,
+      "kl": 0.104339599609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0582,
+      "num_tokens": 7324939.0,
+      "reward": 0.19763408601284027,
+      "reward_std": 0.028635632246732712,
+      "rewards/bleu_reward_func/mean": 0.19763408601284027,
+      "rewards/bleu_reward_func/std": 0.18309614062309265,
+      "step": 555
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 280.0,
+      "completions/mean_length": 62.96875,
+      "completions/mean_terminated_length": 48.48386764526367,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.4448,
+      "grad_norm": 10.322346687316895,
+      "kl": 0.1849365234375,
+      "learning_rate": 1e-06,
+      "loss": 0.432,
+      "num_tokens": 7330802.0,
+      "reward": 0.19210518896579742,
+      "reward_std": 0.03121430240571499,
+      "rewards/bleu_reward_func/mean": 0.19210518896579742,
+      "rewards/bleu_reward_func/std": 0.16853223741054535,
+      "step": 556
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 202.78125,
+      "completions/mean_terminated_length": 182.1666717529297,
+      "completions/min_length": 29.0,
+      "completions/min_terminated_length": 29.0,
+      "epoch": 0.4456,
+      "grad_norm": 9.519190788269043,
+      "kl": 0.171844482421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0451,
+      "num_tokens": 7339683.0,
+      "reward": 0.170665442943573,
+      "reward_std": 0.06568457931280136,
+      "rewards/bleu_reward_func/mean": 0.170665442943573,
+      "rewards/bleu_reward_func/std": 0.1584860235452652,
+      "step": 557
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 495.0,
+      "completions/mean_length": 256.75,
+      "completions/mean_terminated_length": 123.04762268066406,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.4464,
+      "grad_norm": 5.70733118057251,
+      "kl": 0.177093505859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0177,
+      "num_tokens": 7354059.0,
+      "reward": 0.11887075752019882,
+      "reward_std": 0.037268251180648804,
+      "rewards/bleu_reward_func/mean": 0.11887075752019882,
+      "rewards/bleu_reward_func/std": 0.09704269468784332,
+      "step": 558
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 398.0,
+      "completions/mean_length": 237.28125,
+      "completions/mean_terminated_length": 145.70834350585938,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.4472,
+      "grad_norm": 10.510950088500977,
+      "kl": 0.141387939453125,
+      "learning_rate": 1e-06,
+      "loss": 0.2298,
+      "num_tokens": 7369252.0,
+      "reward": 0.11686157435178757,
+      "reward_std": 0.06300412118434906,
+      "rewards/bleu_reward_func/mean": 0.11686157435178757,
+      "rewards/bleu_reward_func/std": 0.10008818656206131,
+      "step": 559
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 464.0,
+      "completions/mean_length": 294.21875,
+      "completions/mean_terminated_length": 209.0,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.448,
+      "grad_norm": 12.427102088928223,
+      "kl": 0.15325927734375,
+      "learning_rate": 1e-06,
+      "loss": -0.1463,
+      "num_tokens": 7384539.0,
+      "reward": 0.10454531759023666,
+      "reward_std": 0.032633934170007706,
+      "rewards/bleu_reward_func/mean": 0.10454531759023666,
+      "rewards/bleu_reward_func/std": 0.09093461185693741,
+      "step": 560
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 471.0,
+      "completions/mean_length": 175.84375,
+      "completions/mean_terminated_length": 63.79166793823242,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.4488,
+      "grad_norm": 18.402080535888672,
+      "kl": 0.187530517578125,
+      "learning_rate": 1e-06,
+      "loss": 0.6927,
+      "num_tokens": 7394846.0,
+      "reward": 0.21487680077552795,
+      "reward_std": 0.08058933913707733,
+      "rewards/bleu_reward_func/mean": 0.21487680077552795,
+      "rewards/bleu_reward_func/std": 0.20088493824005127,
+      "step": 561
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 504.0,
+      "completions/mean_length": 425.21875,
+      "completions/mean_terminated_length": 379.76190185546875,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "epoch": 0.4496,
+      "grad_norm": 2.861811637878418,
+      "kl": 0.030670166015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0581,
+      "num_tokens": 7414069.0,
+      "reward": 0.09261719137430191,
+      "reward_std": 0.046390384435653687,
+      "rewards/bleu_reward_func/mean": 0.09261719137430191,
+      "rewards/bleu_reward_func/std": 0.14345434308052063,
+      "step": 562
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 372.53125,
+      "completions/mean_terminated_length": 277.1052551269531,
+      "completions/min_length": 58.0,
+      "completions/min_terminated_length": 58.0,
+      "epoch": 0.4504,
+      "grad_norm": 3.737154960632324,
+      "kl": 0.07086181640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0085,
+      "num_tokens": 7433038.0,
+      "reward": 0.13954411447048187,
+      "reward_std": 0.09964635223150253,
+      "rewards/bleu_reward_func/mean": 0.13954411447048187,
+      "rewards/bleu_reward_func/std": 0.2269161492586136,
+      "step": 563
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 490.0,
+      "completions/mean_length": 217.5,
+      "completions/mean_terminated_length": 175.42857360839844,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "epoch": 0.4512,
+      "grad_norm": 9.693827629089355,
+      "kl": 0.2703857421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0569,
+      "num_tokens": 7442590.0,
+      "reward": 0.08689892292022705,
+      "reward_std": 0.046516068279743195,
+      "rewards/bleu_reward_func/mean": 0.08689892292022705,
+      "rewards/bleu_reward_func/std": 0.09460947662591934,
+      "step": 564
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 489.0,
+      "completions/mean_length": 344.3125,
+      "completions/mean_terminated_length": 320.3571472167969,
+      "completions/min_length": 102.0,
+      "completions/min_terminated_length": 102.0,
+      "epoch": 0.452,
+      "grad_norm": 3.287851333618164,
+      "kl": 0.03594970703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0888,
+      "num_tokens": 7455856.0,
+      "reward": 0.09177221357822418,
+      "reward_std": 0.02658715285360813,
+      "rewards/bleu_reward_func/mean": 0.09177221357822418,
+      "rewards/bleu_reward_func/std": 0.04939228668808937,
+      "step": 565
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 257.0,
+      "completions/mean_terminated_length": 141.09091186523438,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.4528,
+      "grad_norm": 5.329028129577637,
+      "kl": 0.21087646484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0318,
+      "num_tokens": 7471128.0,
+      "reward": 0.30248120427131653,
+      "reward_std": 0.045193642377853394,
+      "rewards/bleu_reward_func/mean": 0.30248120427131653,
+      "rewards/bleu_reward_func/std": 0.09429154545068741,
+      "step": 566
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 396.5625,
+      "completions/mean_terminated_length": 306.77777099609375,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.4536,
+      "grad_norm": 2.8240556716918945,
+      "kl": 0.029327392578125,
+      "learning_rate": 1e-06,
+      "loss": -0.006,
+      "num_tokens": 7486778.0,
+      "reward": 0.046158432960510254,
+      "reward_std": 0.012592589482665062,
+      "rewards/bleu_reward_func/mean": 0.046158432960510254,
+      "rewards/bleu_reward_func/std": 0.0691753551363945,
+      "step": 567
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 463.0,
+      "completions/mean_length": 221.5,
+      "completions/mean_terminated_length": 180.00001525878906,
+      "completions/min_length": 33.0,
+      "completions/min_terminated_length": 33.0,
+      "epoch": 0.4544,
+      "grad_norm": 5.554408073425293,
+      "kl": 0.077850341796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0114,
+      "num_tokens": 7501522.0,
+      "reward": 0.19211658835411072,
+      "reward_std": 0.052228912711143494,
+      "rewards/bleu_reward_func/mean": 0.19211658835411072,
+      "rewards/bleu_reward_func/std": 0.12220965325832367,
+      "step": 568
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 370.0,
+      "completions/mean_length": 282.90625,
+      "completions/mean_terminated_length": 104.72222137451172,
+      "completions/min_length": 8.0,
+      "completions/min_terminated_length": 8.0,
+      "epoch": 0.4552,
+      "grad_norm": 8.932016372680664,
+      "kl": 0.1943359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0056,
+      "num_tokens": 7515511.0,
+      "reward": 0.08466814458370209,
+      "reward_std": 0.03040888160467148,
+      "rewards/bleu_reward_func/mean": 0.08466814458370209,
+      "rewards/bleu_reward_func/std": 0.07005324959754944,
+      "step": 569
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 450.0,
+      "completions/mean_terminated_length": 388.0,
+      "completions/min_length": 85.0,
+      "completions/min_terminated_length": 85.0,
+      "epoch": 0.456,
+      "grad_norm": 1.950373888015747,
+      "kl": 0.0325927734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0603,
+      "num_tokens": 7535791.0,
+      "reward": 0.06426975131034851,
+      "reward_std": 0.02304723486304283,
+      "rewards/bleu_reward_func/mean": 0.06426975131034851,
+      "rewards/bleu_reward_func/std": 0.04708797112107277,
+      "step": 570
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 480.0,
+      "completions/mean_length": 200.03125,
+      "completions/mean_terminated_length": 167.7586212158203,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.4568,
+      "grad_norm": 8.692915916442871,
+      "kl": 0.300445556640625,
+      "learning_rate": 1e-06,
+      "loss": 0.179,
+      "num_tokens": 7548720.0,
+      "reward": 0.16858291625976562,
+      "reward_std": 0.04772442951798439,
+      "rewards/bleu_reward_func/mean": 0.16858291625976562,
+      "rewards/bleu_reward_func/std": 0.187880739569664,
+      "step": 571
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 471.0,
+      "completions/mean_length": 221.4375,
+      "completions/mean_terminated_length": 154.38462829589844,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.4576,
+      "grad_norm": 5.559481143951416,
+      "kl": 0.2091064453125,
+      "learning_rate": 1e-06,
+      "loss": 0.1461,
+      "num_tokens": 7559926.0,
+      "reward": 0.2749570608139038,
+      "reward_std": 0.07935648411512375,
+      "rewards/bleu_reward_func/mean": 0.2749570608139038,
+      "rewards/bleu_reward_func/std": 0.20695801079273224,
+      "step": 572
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 298.25,
+      "completions/mean_terminated_length": 227.0,
+      "completions/min_length": 39.0,
+      "completions/min_terminated_length": 39.0,
+      "epoch": 0.4584,
+      "grad_norm": 4.713781833648682,
+      "kl": 0.10894775390625,
+      "learning_rate": 1e-06,
+      "loss": -0.1979,
+      "num_tokens": 7573214.0,
+      "reward": 0.11424913257360458,
+      "reward_std": 0.0238350722938776,
+      "rewards/bleu_reward_func/mean": 0.11424913257360458,
+      "rewards/bleu_reward_func/std": 0.1513095498085022,
+      "step": 573
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 481.0,
+      "completions/mean_length": 301.375,
+      "completions/mean_terminated_length": 231.1666717529297,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.4592,
+      "grad_norm": 6.019801616668701,
+      "kl": 0.117706298828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0467,
+      "num_tokens": 7584922.0,
+      "reward": 0.12773753702640533,
+      "reward_std": 0.03902646526694298,
+      "rewards/bleu_reward_func/mean": 0.12773753702640533,
+      "rewards/bleu_reward_func/std": 0.08676618337631226,
+      "step": 574
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 310.0,
+      "completions/mean_length": 261.3125,
+      "completions/mean_terminated_length": 40.11764907836914,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.46,
+      "grad_norm": 20.097490310668945,
+      "kl": 0.32666015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0115,
+      "num_tokens": 7599564.0,
+      "reward": 0.1775631606578827,
+      "reward_std": 0.05471285060048103,
+      "rewards/bleu_reward_func/mean": 0.1775631606578827,
+      "rewards/bleu_reward_func/std": 0.1462731659412384,
+      "step": 575
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 483.0,
+      "completions/mean_length": 224.59375,
+      "completions/mean_terminated_length": 93.95455169677734,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.4608,
+      "grad_norm": 12.845754623413086,
+      "kl": 0.2415771484375,
+      "learning_rate": 1e-06,
+      "loss": 0.1681,
+      "num_tokens": 7609343.0,
+      "reward": 0.10711174458265305,
+      "reward_std": 0.03790780156850815,
+      "rewards/bleu_reward_func/mean": 0.10711174458265305,
+      "rewards/bleu_reward_func/std": 0.11842114478349686,
+      "step": 576
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 471.0,
+      "completions/mean_length": 429.15625,
+      "completions/mean_terminated_length": 217.44444274902344,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "epoch": 0.4616,
+      "grad_norm": 3.16619873046875,
+      "kl": 0.031341552734375,
+      "learning_rate": 1e-06,
+      "loss": -0.1864,
+      "num_tokens": 7628708.0,
+      "reward": 0.12277669459581375,
+      "reward_std": 0.030532412230968475,
+      "rewards/bleu_reward_func/mean": 0.12277669459581375,
+      "rewards/bleu_reward_func/std": 0.14895910024642944,
+      "step": 577
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 452.0,
+      "completions/mean_length": 190.625,
+      "completions/mean_terminated_length": 100.63999938964844,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.4624,
+      "grad_norm": 6.808165550231934,
+      "kl": 0.2611083984375,
+      "learning_rate": 1e-06,
+      "loss": -0.043,
+      "num_tokens": 7637504.0,
+      "reward": 0.06198694184422493,
+      "reward_std": 0.018319500610232353,
+      "rewards/bleu_reward_func/mean": 0.06198694184422493,
+      "rewards/bleu_reward_func/std": 0.05399094894528389,
+      "step": 578
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 422.0,
+      "completions/mean_length": 306.65625,
+      "completions/mean_terminated_length": 125.47058868408203,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.4632,
+      "grad_norm": 6.689248085021973,
+      "kl": 0.165435791015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0301,
+      "num_tokens": 7651797.0,
+      "reward": 0.07045552134513855,
+      "reward_std": 0.018690217286348343,
+      "rewards/bleu_reward_func/mean": 0.07045552134513855,
+      "rewards/bleu_reward_func/std": 0.07137548923492432,
+      "step": 579
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 497.0,
+      "completions/mean_length": 332.6875,
+      "completions/mean_terminated_length": 193.22222900390625,
+      "completions/min_length": 64.0,
+      "completions/min_terminated_length": 64.0,
+      "epoch": 0.464,
+      "grad_norm": 4.129982948303223,
+      "kl": 0.050750732421875,
+      "learning_rate": 1e-06,
+      "loss": 0.2464,
+      "num_tokens": 7667787.0,
+      "reward": 0.09383320808410645,
+      "reward_std": 0.046889662742614746,
+      "rewards/bleu_reward_func/mean": 0.09383320808410645,
+      "rewards/bleu_reward_func/std": 0.10615876317024231,
+      "step": 580
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 438.0,
+      "completions/mean_length": 145.375,
+      "completions/mean_terminated_length": 42.71999740600586,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.4648,
+      "grad_norm": 9.511686325073242,
+      "kl": 0.385894775390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0501,
+      "num_tokens": 7679535.0,
+      "reward": 0.08031031489372253,
+      "reward_std": 0.036660827696323395,
+      "rewards/bleu_reward_func/mean": 0.08031031489372253,
+      "rewards/bleu_reward_func/std": 0.07939815521240234,
+      "step": 581
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 294.0,
+      "completions/mean_terminated_length": 271.4482727050781,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.4656,
+      "grad_norm": 7.2719950675964355,
+      "kl": 0.149932861328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0885,
+      "num_tokens": 7690815.0,
+      "reward": 0.11769823729991913,
+      "reward_std": 0.02824997529387474,
+      "rewards/bleu_reward_func/mean": 0.11769823729991913,
+      "rewards/bleu_reward_func/std": 0.12788043916225433,
+      "step": 582
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 453.0,
+      "completions/max_terminated_length": 453.0,
+      "completions/mean_length": 120.15625,
+      "completions/mean_terminated_length": 120.15625,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.4664,
+      "grad_norm": 11.640869140625,
+      "kl": 0.19036865234375,
+      "learning_rate": 1e-06,
+      "loss": 0.1164,
+      "num_tokens": 7698172.0,
+      "reward": 0.052216824144124985,
+      "reward_std": 0.015741443261504173,
+      "rewards/bleu_reward_func/mean": 0.052216824144124985,
+      "rewards/bleu_reward_func/std": 0.01899011991918087,
+      "step": 583
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 476.0,
+      "completions/mean_length": 236.46875,
+      "completions/mean_terminated_length": 144.625,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.4672,
+      "grad_norm": 7.708470821380615,
+      "kl": 0.25323486328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0057,
+      "num_tokens": 7711683.0,
+      "reward": 0.20987500250339508,
+      "reward_std": 0.050422437489032745,
+      "rewards/bleu_reward_func/mean": 0.20987500250339508,
+      "rewards/bleu_reward_func/std": 0.21432380378246307,
+      "step": 584
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 503.0,
+      "completions/mean_length": 255.09375,
+      "completions/mean_terminated_length": 120.52381134033203,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.468,
+      "grad_norm": 8.178823471069336,
+      "kl": 0.181243896484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0026,
+      "num_tokens": 7721638.0,
+      "reward": 0.11023026704788208,
+      "reward_std": 0.03732236102223396,
+      "rewards/bleu_reward_func/mean": 0.11023026704788208,
+      "rewards/bleu_reward_func/std": 0.06018221378326416,
+      "step": 585
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 497.0,
+      "completions/mean_length": 209.40625,
+      "completions/mean_terminated_length": 124.68000030517578,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.4688,
+      "grad_norm": 6.163815498352051,
+      "kl": 0.057861328125,
+      "learning_rate": 1e-06,
+      "loss": -0.382,
+      "num_tokens": 7731483.0,
+      "reward": 0.022579330950975418,
+      "reward_std": 0.024172717705368996,
+      "rewards/bleu_reward_func/mean": 0.022579330950975418,
+      "rewards/bleu_reward_func/std": 0.03154170513153076,
+      "step": 586
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 493.0,
+      "completions/mean_length": 179.28125,
+      "completions/mean_terminated_length": 131.75,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.4696,
+      "grad_norm": 24.94843864440918,
+      "kl": 0.303466796875,
+      "learning_rate": 1e-06,
+      "loss": -0.149,
+      "num_tokens": 7740828.0,
+      "reward": 0.062010329216718674,
+      "reward_std": 0.030193448066711426,
+      "rewards/bleu_reward_func/mean": 0.062010329216718674,
+      "rewards/bleu_reward_func/std": 0.04090145602822304,
+      "step": 587
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 494.0,
+      "completions/mean_length": 372.0625,
+      "completions/mean_terminated_length": 276.3157958984375,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.4704,
+      "grad_norm": 2.5675299167633057,
+      "kl": 0.03485107421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0333,
+      "num_tokens": 7757862.0,
+      "reward": 0.037547022104263306,
+      "reward_std": 0.01179808471351862,
+      "rewards/bleu_reward_func/mean": 0.037547022104263306,
+      "rewards/bleu_reward_func/std": 0.03366583213210106,
+      "step": 588
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 487.0,
+      "completions/mean_length": 268.875,
+      "completions/mean_terminated_length": 223.8518524169922,
+      "completions/min_length": 28.0,
+      "completions/min_terminated_length": 28.0,
+      "epoch": 0.4712,
+      "grad_norm": 9.238602638244629,
+      "kl": 0.09326171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0178,
+      "num_tokens": 7769098.0,
+      "reward": 0.0967094898223877,
+      "reward_std": 0.041084855794906616,
+      "rewards/bleu_reward_func/mean": 0.0967094898223877,
+      "rewards/bleu_reward_func/std": 0.10235904902219772,
+      "step": 589
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 475.0,
+      "completions/mean_length": 271.71875,
+      "completions/mean_terminated_length": 107.31578826904297,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.472,
+      "grad_norm": 12.115531921386719,
+      "kl": 0.196563720703125,
+      "learning_rate": 1e-06,
+      "loss": 0.3242,
+      "num_tokens": 7782097.0,
+      "reward": 0.19325336813926697,
+      "reward_std": 0.0921676903963089,
+      "rewards/bleu_reward_func/mean": 0.19325336813926697,
+      "rewards/bleu_reward_func/std": 0.24508582055568695,
+      "step": 590
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 501.0,
+      "completions/mean_length": 197.59375,
+      "completions/mean_terminated_length": 176.6333465576172,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.4728,
+      "grad_norm": 9.336063385009766,
+      "kl": 0.17633056640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0424,
+      "num_tokens": 7793588.0,
+      "reward": 0.14900264143943787,
+      "reward_std": 0.06498396396636963,
+      "rewards/bleu_reward_func/mean": 0.14900264143943787,
+      "rewards/bleu_reward_func/std": 0.13959822058677673,
+      "step": 591
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 396.0,
+      "completions/mean_length": 213.96875,
+      "completions/mean_terminated_length": 145.1923065185547,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.4736,
+      "grad_norm": 40.333492279052734,
+      "kl": 0.1762237548828125,
+      "learning_rate": 1e-06,
+      "loss": 0.034,
+      "num_tokens": 7809987.0,
+      "reward": 0.11991982161998749,
+      "reward_std": 0.024838652461767197,
+      "rewards/bleu_reward_func/mean": 0.11991982161998749,
+      "rewards/bleu_reward_func/std": 0.13350419700145721,
+      "step": 592
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 499.0,
+      "completions/mean_length": 349.46875,
+      "completions/mean_terminated_length": 295.29168701171875,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.4744,
+      "grad_norm": 6.914525032043457,
+      "kl": 0.02703857421875,
+      "learning_rate": 1e-06,
+      "loss": 0.1437,
+      "num_tokens": 7826610.0,
+      "reward": 0.02816709131002426,
+      "reward_std": 0.015584287233650684,
+      "rewards/bleu_reward_func/mean": 0.02816709131002426,
+      "rewards/bleu_reward_func/std": 0.027631772682070732,
+      "step": 593
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 444.0,
+      "completions/mean_length": 189.28125,
+      "completions/mean_terminated_length": 178.87095642089844,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.4752,
+      "grad_norm": 26.64930534362793,
+      "kl": 0.619964599609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0753,
+      "num_tokens": 7834363.0,
+      "reward": 0.1504502296447754,
+      "reward_std": 0.061798207461833954,
+      "rewards/bleu_reward_func/mean": 0.1504502296447754,
+      "rewards/bleu_reward_func/std": 0.1269664466381073,
+      "step": 594
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 493.0,
+      "completions/mean_length": 317.125,
+      "completions/mean_terminated_length": 200.1999969482422,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.476,
+      "grad_norm": 8.9805908203125,
+      "kl": 0.294097900390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0704,
+      "num_tokens": 7849831.0,
+      "reward": 0.13049980998039246,
+      "reward_std": 0.02749776840209961,
+      "rewards/bleu_reward_func/mean": 0.13049980998039246,
+      "rewards/bleu_reward_func/std": 0.109443299472332,
+      "step": 595
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 466.0,
+      "completions/mean_length": 272.125,
+      "completions/mean_terminated_length": 192.1666717529297,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.4768,
+      "grad_norm": 4.8097429275512695,
+      "kl": 0.08526611328125,
+      "learning_rate": 1e-06,
+      "loss": -0.1088,
+      "num_tokens": 7861347.0,
+      "reward": 0.0661308616399765,
+      "reward_std": 0.02051004208624363,
+      "rewards/bleu_reward_func/mean": 0.0661308616399765,
+      "rewards/bleu_reward_func/std": 0.05708196386694908,
+      "step": 596
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 468.0,
+      "completions/mean_length": 206.125,
+      "completions/mean_terminated_length": 120.47999572753906,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.4776,
+      "grad_norm": 7.351487636566162,
+      "kl": 0.334259033203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0519,
+      "num_tokens": 7871295.0,
+      "reward": 0.16019710898399353,
+      "reward_std": 0.03656643629074097,
+      "rewards/bleu_reward_func/mean": 0.16019710898399353,
+      "rewards/bleu_reward_func/std": 0.19289268553256989,
+      "step": 597
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 480.0,
+      "completions/mean_length": 349.3125,
+      "completions/mean_terminated_length": 186.625,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "epoch": 0.4784,
+      "grad_norm": 4.930379867553711,
+      "kl": 0.181121826171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0202,
+      "num_tokens": 7887577.0,
+      "reward": 0.1285662055015564,
+      "reward_std": 0.03015293926000595,
+      "rewards/bleu_reward_func/mean": 0.1285662055015564,
+      "rewards/bleu_reward_func/std": 0.08600351959466934,
+      "step": 598
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 175.875,
+      "completions/mean_terminated_length": 153.4666748046875,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "epoch": 0.4792,
+      "grad_norm": 15.110285758972168,
+      "kl": 0.34112548828125,
+      "learning_rate": 1e-06,
+      "loss": -0.1173,
+      "num_tokens": 7898453.0,
+      "reward": 0.09940430521965027,
+      "reward_std": 0.046547506004571915,
+      "rewards/bleu_reward_func/mean": 0.09940430521965027,
+      "rewards/bleu_reward_func/std": 0.05020095780491829,
+      "step": 599
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 484.0,
+      "completions/mean_length": 334.03125,
+      "completions/mean_terminated_length": 315.6206970214844,
+      "completions/min_length": 41.0,
+      "completions/min_terminated_length": 41.0,
+      "epoch": 0.48,
+      "grad_norm": 3.0496630668640137,
+      "kl": 0.03216552734375,
+      "learning_rate": 1e-06,
+      "loss": -0.1382,
+      "num_tokens": 7911190.0,
+      "reward": 0.08442967385053635,
+      "reward_std": 0.027117565274238586,
+      "rewards/bleu_reward_func/mean": 0.08442967385053635,
+      "rewards/bleu_reward_func/std": 0.07272256910800934,
+      "step": 600
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 389.0,
+      "completions/mean_length": 132.125,
+      "completions/mean_terminated_length": 119.87096405029297,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.4808,
+      "grad_norm": 9.77161693572998,
+      "kl": 0.3096923828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0712,
+      "num_tokens": 7923114.0,
+      "reward": 0.19400227069854736,
+      "reward_std": 0.08562377095222473,
+      "rewards/bleu_reward_func/mean": 0.19400227069854736,
+      "rewards/bleu_reward_func/std": 0.18403199315071106,
+      "step": 601
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 433.0,
+      "completions/mean_length": 270.53125,
+      "completions/mean_terminated_length": 105.31578826904297,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.4816,
+      "grad_norm": 5.418551445007324,
+      "kl": 0.132049560546875,
+      "learning_rate": 1e-06,
+      "loss": 0.057,
+      "num_tokens": 7937747.0,
+      "reward": 0.15049128234386444,
+      "reward_std": 0.024429049342870712,
+      "rewards/bleu_reward_func/mean": 0.15049128234386444,
+      "rewards/bleu_reward_func/std": 0.1750853955745697,
+      "step": 602
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 492.0,
+      "completions/mean_length": 295.46875,
+      "completions/mean_terminated_length": 104.4117660522461,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "epoch": 0.4824,
+      "grad_norm": 4.260025501251221,
+      "kl": 0.074981689453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0337,
+      "num_tokens": 7951266.0,
+      "reward": 0.0902150496840477,
+      "reward_std": 0.0313844196498394,
+      "rewards/bleu_reward_func/mean": 0.0902150496840477,
+      "rewards/bleu_reward_func/std": 0.09559616446495056,
+      "step": 603
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 480.0,
+      "completions/mean_length": 359.625,
+      "completions/mean_terminated_length": 255.36842346191406,
+      "completions/min_length": 49.0,
+      "completions/min_terminated_length": 49.0,
+      "epoch": 0.4832,
+      "grad_norm": 3.981938362121582,
+      "kl": 0.05792236328125,
+      "learning_rate": 1e-06,
+      "loss": 0.068,
+      "num_tokens": 7967510.0,
+      "reward": 0.15250109136104584,
+      "reward_std": 0.050509147346019745,
+      "rewards/bleu_reward_func/mean": 0.15250109136104584,
+      "rewards/bleu_reward_func/std": 0.2119276374578476,
+      "step": 604
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 243.625,
+      "completions/mean_terminated_length": 205.2857208251953,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.484,
+      "grad_norm": 4.506271839141846,
+      "kl": 0.133819580078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0768,
+      "num_tokens": 7979194.0,
+      "reward": 0.049993276596069336,
+      "reward_std": 0.01375819742679596,
+      "rewards/bleu_reward_func/mean": 0.049993276596069336,
+      "rewards/bleu_reward_func/std": 0.019665135070681572,
+      "step": 605
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 394.0,
+      "completions/mean_length": 184.90625,
+      "completions/mean_terminated_length": 124.33333587646484,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.4848,
+      "grad_norm": 13.16498851776123,
+      "kl": 0.163360595703125,
+      "learning_rate": 1e-06,
+      "loss": 0.2145,
+      "num_tokens": 7992215.0,
+      "reward": 0.16849525272846222,
+      "reward_std": 0.041973263025283813,
+      "rewards/bleu_reward_func/mean": 0.16849525272846222,
+      "rewards/bleu_reward_func/std": 0.11670318245887756,
+      "step": 606
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 468.0,
+      "completions/max_terminated_length": 468.0,
+      "completions/mean_length": 278.3125,
+      "completions/mean_terminated_length": 278.3125,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.4856,
+      "grad_norm": 9.305381774902344,
+      "kl": 0.1671142578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0542,
+      "num_tokens": 8003561.0,
+      "reward": 0.14806249737739563,
+      "reward_std": 0.04475884884595871,
+      "rewards/bleu_reward_func/mean": 0.14806249737739563,
+      "rewards/bleu_reward_func/std": 0.10317616909742355,
+      "step": 607
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 487.0,
+      "completions/mean_length": 392.3125,
+      "completions/mean_terminated_length": 320.5,
+      "completions/min_length": 102.0,
+      "completions/min_terminated_length": 102.0,
+      "epoch": 0.4864,
+      "grad_norm": 8.050078392028809,
+      "kl": 0.0286865234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0652,
+      "num_tokens": 8020771.0,
+      "reward": 0.09127211570739746,
+      "reward_std": 0.02751500904560089,
+      "rewards/bleu_reward_func/mean": 0.09127211570739746,
+      "rewards/bleu_reward_func/std": 0.04517889395356178,
+      "step": 608
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 293.0,
+      "completions/mean_terminated_length": 285.93548583984375,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "epoch": 0.4872,
+      "grad_norm": 9.551247596740723,
+      "kl": 0.05633544921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0629,
+      "num_tokens": 8033227.0,
+      "reward": 0.07062816619873047,
+      "reward_std": 0.032938919961452484,
+      "rewards/bleu_reward_func/mean": 0.07062816619873047,
+      "rewards/bleu_reward_func/std": 0.05320809781551361,
+      "step": 609
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 461.0,
+      "completions/mean_length": 246.25,
+      "completions/mean_terminated_length": 184.92308044433594,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.488,
+      "grad_norm": 28.349098205566406,
+      "kl": 0.06927490234375,
+      "learning_rate": 1e-06,
+      "loss": 0.1921,
+      "num_tokens": 8048579.0,
+      "reward": 0.12640823423862457,
+      "reward_std": 0.028129609301686287,
+      "rewards/bleu_reward_func/mean": 0.12640823423862457,
+      "rewards/bleu_reward_func/std": 0.12890547513961792,
+      "step": 610
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 510.0,
+      "completions/mean_length": 334.28125,
+      "completions/mean_terminated_length": 241.1904754638672,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.4888,
+      "grad_norm": 6.840433597564697,
+      "kl": 0.054046630859375,
+      "learning_rate": 1e-06,
+      "loss": 0.1175,
+      "num_tokens": 8063660.0,
+      "reward": 0.06585729867219925,
+      "reward_std": 0.016829343512654305,
+      "rewards/bleu_reward_func/mean": 0.06585729867219925,
+      "rewards/bleu_reward_func/std": 0.027104271575808525,
+      "step": 611
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 469.4375,
+      "completions/mean_terminated_length": 443.8999938964844,
+      "completions/min_length": 359.0,
+      "completions/min_terminated_length": 359.0,
+      "epoch": 0.4896,
+      "grad_norm": 2.304220199584961,
+      "kl": 0.029449462890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0018,
+      "num_tokens": 8081922.0,
+      "reward": 0.029903851449489594,
+      "reward_std": 0.007851570844650269,
+      "rewards/bleu_reward_func/mean": 0.029903851449489594,
+      "rewards/bleu_reward_func/std": 0.017835307866334915,
+      "step": 612
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 483.0,
+      "completions/mean_length": 197.75,
+      "completions/mean_terminated_length": 187.61289978027344,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.4904,
+      "grad_norm": 5.66249418258667,
+      "kl": 0.0855712890625,
+      "learning_rate": 1e-06,
+      "loss": 0.1286,
+      "num_tokens": 8091010.0,
+      "reward": 0.2724965810775757,
+      "reward_std": 0.06183997541666031,
+      "rewards/bleu_reward_func/mean": 0.2724965810775757,
+      "rewards/bleu_reward_func/std": 0.2708708643913269,
+      "step": 613
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 485.0,
+      "completions/mean_length": 328.71875,
+      "completions/mean_terminated_length": 186.1666717529297,
+      "completions/min_length": 70.0,
+      "completions/min_terminated_length": 70.0,
+      "epoch": 0.4912,
+      "grad_norm": 3.228691816329956,
+      "kl": 0.0655670166015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0338,
+      "num_tokens": 8105809.0,
+      "reward": 0.12389599531888962,
+      "reward_std": 0.07396578788757324,
+      "rewards/bleu_reward_func/mean": 0.12389599531888962,
+      "rewards/bleu_reward_func/std": 0.18483103811740875,
+      "step": 614
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 317.59375,
+      "completions/mean_terminated_length": 215.76190185546875,
+      "completions/min_length": 35.0,
+      "completions/min_terminated_length": 35.0,
+      "epoch": 0.492,
+      "grad_norm": 3.5793278217315674,
+      "kl": 0.044097900390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0314,
+      "num_tokens": 8118620.0,
+      "reward": 0.08205534517765045,
+      "reward_std": 0.032849013805389404,
+      "rewards/bleu_reward_func/mean": 0.08205534517765045,
+      "rewards/bleu_reward_func/std": 0.05394502356648445,
+      "step": 615
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 449.0,
+      "completions/max_terminated_length": 449.0,
+      "completions/mean_length": 180.9375,
+      "completions/mean_terminated_length": 180.9375,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.4928,
+      "grad_norm": 7.083518028259277,
+      "kl": 0.40167236328125,
+      "learning_rate": 1e-06,
+      "loss": -0.1081,
+      "num_tokens": 8128922.0,
+      "reward": 0.12701216340065002,
+      "reward_std": 0.03847620263695717,
+      "rewards/bleu_reward_func/mean": 0.12701216340065002,
+      "rewards/bleu_reward_func/std": 0.08405326306819916,
+      "step": 616
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 473.0,
+      "completions/mean_length": 214.25,
+      "completions/mean_terminated_length": 145.53846740722656,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.4936,
+      "grad_norm": 6.532368183135986,
+      "kl": 0.239410400390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0347,
+      "num_tokens": 8143162.0,
+      "reward": 0.11757355183362961,
+      "reward_std": 0.02820819616317749,
+      "rewards/bleu_reward_func/mean": 0.11757355183362961,
+      "rewards/bleu_reward_func/std": 0.10728771984577179,
+      "step": 617
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 364.375,
+      "completions/mean_terminated_length": 297.2727355957031,
+      "completions/min_length": 142.0,
+      "completions/min_terminated_length": 142.0,
+      "epoch": 0.4944,
+      "grad_norm": 2.549912929534912,
+      "kl": 0.044189453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0332,
+      "num_tokens": 8158198.0,
+      "reward": 0.04174516722559929,
+      "reward_std": 0.011650302447378635,
+      "rewards/bleu_reward_func/mean": 0.04174516722559929,
+      "rewards/bleu_reward_func/std": 0.03221089020371437,
+      "step": 618
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 441.0,
+      "completions/mean_length": 311.84375,
+      "completions/mean_terminated_length": 207.0,
+      "completions/min_length": 49.0,
+      "completions/min_terminated_length": 49.0,
+      "epoch": 0.4952,
+      "grad_norm": 3.869034767150879,
+      "kl": 0.05462646484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0031,
+      "num_tokens": 8171945.0,
+      "reward": 0.028928130865097046,
+      "reward_std": 0.012434298172593117,
+      "rewards/bleu_reward_func/mean": 0.028928130865097046,
+      "rewards/bleu_reward_func/std": 0.025789210572838783,
+      "step": 619
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 483.0,
+      "completions/mean_length": 243.90625,
+      "completions/mean_terminated_length": 205.60714721679688,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.496,
+      "grad_norm": 6.46402645111084,
+      "kl": 0.304168701171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0145,
+      "num_tokens": 8181710.0,
+      "reward": 0.13130733370780945,
+      "reward_std": 0.018212325870990753,
+      "rewards/bleu_reward_func/mean": 0.13130733370780945,
+      "rewards/bleu_reward_func/std": 0.09850703179836273,
+      "step": 620
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 495.0,
+      "completions/mean_length": 242.0,
+      "completions/mean_terminated_length": 203.42857360839844,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.4968,
+      "grad_norm": 5.473258018493652,
+      "kl": 0.16949462890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0354,
+      "num_tokens": 8193118.0,
+      "reward": 0.2502046227455139,
+      "reward_std": 0.03522457554936409,
+      "rewards/bleu_reward_func/mean": 0.2502046227455139,
+      "rewards/bleu_reward_func/std": 0.2565787732601166,
+      "step": 621
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 494.0,
+      "completions/mean_length": 236.71875,
+      "completions/mean_terminated_length": 144.95834350585938,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.4976,
+      "grad_norm": 30.692569732666016,
+      "kl": 0.29986572265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0477,
+      "num_tokens": 8207549.0,
+      "reward": 0.22225125133991241,
+      "reward_std": 0.033524345606565475,
+      "rewards/bleu_reward_func/mean": 0.22225125133991241,
+      "rewards/bleu_reward_func/std": 0.19432197511196136,
+      "step": 622
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 495.0,
+      "completions/mean_length": 120.96875,
+      "completions/mean_terminated_length": 80.51724243164062,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.4984,
+      "grad_norm": 7.908195495605469,
+      "kl": 0.380523681640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0756,
+      "num_tokens": 8218324.0,
+      "reward": 0.23351526260375977,
+      "reward_std": 0.05452558770775795,
+      "rewards/bleu_reward_func/mean": 0.23351526260375977,
+      "rewards/bleu_reward_func/std": 0.1365489512681961,
+      "step": 623
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 415.0,
+      "completions/mean_length": 172.5625,
+      "completions/mean_terminated_length": 137.44827270507812,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.4992,
+      "grad_norm": 8.18444538116455,
+      "kl": 0.296051025390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0021,
+      "num_tokens": 8228742.0,
+      "reward": 0.14677512645721436,
+      "reward_std": 0.04820986092090607,
+      "rewards/bleu_reward_func/mean": 0.14677512645721436,
+      "rewards/bleu_reward_func/std": 0.15982075035572052,
+      "step": 624
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 494.0,
+      "completions/mean_length": 333.65625,
+      "completions/mean_terminated_length": 263.86956787109375,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.5,
+      "grad_norm": 15.107973098754883,
+      "kl": 0.160400390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0015,
+      "num_tokens": 8245059.0,
+      "reward": 0.13298457860946655,
+      "reward_std": 0.018914809450507164,
+      "rewards/bleu_reward_func/mean": 0.13298457860946655,
+      "rewards/bleu_reward_func/std": 0.07686522603034973,
+      "step": 625
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 459.0,
+      "completions/mean_length": 299.53125,
+      "completions/mean_terminated_length": 260.1851806640625,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.5008,
+      "grad_norm": 131.54771423339844,
+      "kl": 0.102081298828125,
+      "learning_rate": 1e-06,
+      "loss": 0.1782,
+      "num_tokens": 8256620.0,
+      "reward": 0.1039574146270752,
+      "reward_std": 0.03130800276994705,
+      "rewards/bleu_reward_func/mean": 0.1039574146270752,
+      "rewards/bleu_reward_func/std": 0.05177094042301178,
+      "step": 626
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 486.0,
+      "completions/mean_length": 202.15625,
+      "completions/mean_terminated_length": 181.50001525878906,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "epoch": 0.5016,
+      "grad_norm": 8.109158515930176,
+      "kl": 0.213104248046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0456,
+      "num_tokens": 8266585.0,
+      "reward": 0.23561137914657593,
+      "reward_std": 0.03910698741674423,
+      "rewards/bleu_reward_func/mean": 0.23561137914657593,
+      "rewards/bleu_reward_func/std": 0.1352909654378891,
+      "step": 627
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 259.5625,
+      "completions/mean_terminated_length": 144.8181915283203,
+      "completions/min_length": 28.0,
+      "completions/min_terminated_length": 28.0,
+      "epoch": 0.5024,
+      "grad_norm": 5.162299633026123,
+      "kl": 0.147857666015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0329,
+      "num_tokens": 8278827.0,
+      "reward": 0.1635468751192093,
+      "reward_std": 0.04077983647584915,
+      "rewards/bleu_reward_func/mean": 0.1635468751192093,
+      "rewards/bleu_reward_func/std": 0.1520238220691681,
+      "step": 628
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 501.0,
+      "completions/mean_length": 354.84375,
+      "completions/mean_terminated_length": 302.4583435058594,
+      "completions/min_length": 127.0,
+      "completions/min_terminated_length": 127.0,
+      "epoch": 0.5032,
+      "grad_norm": 3.0983669757843018,
+      "kl": 0.0435791015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0095,
+      "num_tokens": 8293502.0,
+      "reward": 0.05737052857875824,
+      "reward_std": 0.021961018443107605,
+      "rewards/bleu_reward_func/mean": 0.05737052857875824,
+      "rewards/bleu_reward_func/std": 0.03505769371986389,
+      "step": 629
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 216.25,
+      "completions/mean_terminated_length": 185.65516662597656,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.504,
+      "grad_norm": 6.821644306182861,
+      "kl": 0.0950927734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0482,
+      "num_tokens": 8302182.0,
+      "reward": 0.07243853062391281,
+      "reward_std": 0.06683069467544556,
+      "rewards/bleu_reward_func/mean": 0.07243853062391281,
+      "rewards/bleu_reward_func/std": 0.10312769562005997,
+      "step": 630
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 497.0,
+      "completions/mean_length": 352.34375,
+      "completions/mean_terminated_length": 329.5357360839844,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.5048,
+      "grad_norm": 3.5717883110046387,
+      "kl": 0.036041259765625,
+      "learning_rate": 1e-06,
+      "loss": 0.1298,
+      "num_tokens": 8318753.0,
+      "reward": 0.026654381304979324,
+      "reward_std": 0.024883870035409927,
+      "rewards/bleu_reward_func/mean": 0.026654381304979324,
+      "rewards/bleu_reward_func/std": 0.03104417398571968,
+      "step": 631
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 499.0,
+      "completions/max_terminated_length": 499.0,
+      "completions/mean_length": 183.09375,
+      "completions/mean_terminated_length": 183.09375,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.5056,
+      "grad_norm": 6.945363998413086,
+      "kl": 0.234039306640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0696,
+      "num_tokens": 8329596.0,
+      "reward": 0.18802031874656677,
+      "reward_std": 0.06351514160633087,
+      "rewards/bleu_reward_func/mean": 0.18802031874656677,
+      "rewards/bleu_reward_func/std": 0.16961929202079773,
+      "step": 632
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 485.0,
+      "completions/mean_length": 157.9375,
+      "completions/mean_terminated_length": 134.33334350585938,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.5064,
+      "grad_norm": 99.53560638427734,
+      "kl": 0.190399169921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0621,
+      "num_tokens": 8343090.0,
+      "reward": 0.06075248867273331,
+      "reward_std": 0.018952492624521255,
+      "rewards/bleu_reward_func/mean": 0.06075248867273331,
+      "rewards/bleu_reward_func/std": 0.057455144822597504,
+      "step": 633
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 291.71875,
+      "completions/mean_terminated_length": 268.9310302734375,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.5072,
+      "grad_norm": 6.928218364715576,
+      "kl": 0.155853271484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0112,
+      "num_tokens": 8354305.0,
+      "reward": 0.15043729543685913,
+      "reward_std": 0.04871266707777977,
+      "rewards/bleu_reward_func/mean": 0.15043729543685913,
+      "rewards/bleu_reward_func/std": 0.17611344158649445,
+      "step": 634
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 498.0,
+      "completions/mean_length": 198.5625,
+      "completions/mean_terminated_length": 177.6666717529297,
+      "completions/min_length": 44.0,
+      "completions/min_terminated_length": 44.0,
+      "epoch": 0.508,
+      "grad_norm": 4.697340488433838,
+      "kl": 0.094085693359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0737,
+      "num_tokens": 8364091.0,
+      "reward": 0.06601180136203766,
+      "reward_std": 0.02227596938610077,
+      "rewards/bleu_reward_func/mean": 0.06601180136203766,
+      "rewards/bleu_reward_func/std": 0.043257758021354675,
+      "step": 635
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 410.78125,
+      "completions/mean_terminated_length": 332.0555725097656,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.5088,
+      "grad_norm": 77.46025085449219,
+      "kl": 0.05377197265625,
+      "learning_rate": 1e-06,
+      "loss": -0.1447,
+      "num_tokens": 8381724.0,
+      "reward": 0.03671726584434509,
+      "reward_std": 0.015178699977695942,
+      "rewards/bleu_reward_func/mean": 0.03671726584434509,
+      "rewards/bleu_reward_func/std": 0.03225603699684143,
+      "step": 636
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 408.0,
+      "completions/mean_length": 333.25,
+      "completions/mean_terminated_length": 154.5,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.5096,
+      "grad_norm": 6.136791706085205,
+      "kl": 0.124664306640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0018,
+      "num_tokens": 8397476.0,
+      "reward": 0.16264644265174866,
+      "reward_std": 0.03679278865456581,
+      "rewards/bleu_reward_func/mean": 0.16264644265174866,
+      "rewards/bleu_reward_func/std": 0.16195148229599,
+      "step": 637
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 502.0,
+      "completions/mean_length": 221.125,
+      "completions/mean_terminated_length": 179.57144165039062,
+      "completions/min_length": 46.0,
+      "completions/min_terminated_length": 46.0,
+      "epoch": 0.5104,
+      "grad_norm": 4.400974750518799,
+      "kl": 0.058013916015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0244,
+      "num_tokens": 8406544.0,
+      "reward": 0.059636689722537994,
+      "reward_std": 0.024229735136032104,
+      "rewards/bleu_reward_func/mean": 0.059636689722537994,
+      "rewards/bleu_reward_func/std": 0.04718983918428421,
+      "step": 638
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 499.0,
+      "completions/mean_length": 287.09375,
+      "completions/mean_terminated_length": 235.19232177734375,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.5112,
+      "grad_norm": 10.916999816894531,
+      "kl": 0.118011474609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0766,
+      "num_tokens": 8422323.0,
+      "reward": 0.10468322038650513,
+      "reward_std": 0.021623361855745316,
+      "rewards/bleu_reward_func/mean": 0.10468322038650513,
+      "rewards/bleu_reward_func/std": 0.08197237551212311,
+      "step": 639
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 461.0,
+      "completions/mean_length": 336.28125,
+      "completions/mean_terminated_length": 311.1785888671875,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 0.512,
+      "grad_norm": 2.1528663635253906,
+      "kl": 0.033172607421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0722,
+      "num_tokens": 8437100.0,
+      "reward": 0.10049737989902496,
+      "reward_std": 0.03208357095718384,
+      "rewards/bleu_reward_func/mean": 0.10049737989902496,
+      "rewards/bleu_reward_func/std": 0.0739847868680954,
+      "step": 640
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 451.0,
+      "completions/mean_length": 361.375,
+      "completions/mean_terminated_length": 190.6666717529297,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "epoch": 0.5128,
+      "grad_norm": 4.406239986419678,
+      "kl": 0.163421630859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0712,
+      "num_tokens": 8450184.0,
+      "reward": 0.06270510703325272,
+      "reward_std": 0.01890096440911293,
+      "rewards/bleu_reward_func/mean": 0.06270510703325272,
+      "rewards/bleu_reward_func/std": 0.04299367591738701,
+      "step": 641
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 344.75,
+      "completions/mean_terminated_length": 268.727294921875,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.5136,
+      "grad_norm": 3.8396599292755127,
+      "kl": 0.21343994140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0071,
+      "num_tokens": 8464952.0,
+      "reward": 0.12920798361301422,
+      "reward_std": 0.025508491322398186,
+      "rewards/bleu_reward_func/mean": 0.12920798361301422,
+      "rewards/bleu_reward_func/std": 0.11343086510896683,
+      "step": 642
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 443.0,
+      "completions/mean_length": 243.0625,
+      "completions/mean_terminated_length": 204.6428680419922,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.5144,
+      "grad_norm": 9.709904670715332,
+      "kl": 0.127349853515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0569,
+      "num_tokens": 8481010.0,
+      "reward": 0.08636625856161118,
+      "reward_std": 0.023468628525733948,
+      "rewards/bleu_reward_func/mean": 0.08636625856161118,
+      "rewards/bleu_reward_func/std": 0.11172276735305786,
+      "step": 643
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 430.0,
+      "completions/mean_length": 319.8125,
+      "completions/mean_terminated_length": 150.23529052734375,
+      "completions/min_length": 38.0,
+      "completions/min_terminated_length": 38.0,
+      "epoch": 0.5152,
+      "grad_norm": 4.022066116333008,
+      "kl": 0.095428466796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0192,
+      "num_tokens": 8495668.0,
+      "reward": 0.13581258058547974,
+      "reward_std": 0.049042053520679474,
+      "rewards/bleu_reward_func/mean": 0.13581258058547974,
+      "rewards/bleu_reward_func/std": 0.10865607112646103,
+      "step": 644
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 421.0,
+      "completions/mean_length": 200.40625,
+      "completions/mean_terminated_length": 113.15999603271484,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.516,
+      "grad_norm": 14.371395111083984,
+      "kl": 0.188232421875,
+      "learning_rate": 1e-06,
+      "loss": 0.1384,
+      "num_tokens": 8510193.0,
+      "reward": 0.16988492012023926,
+      "reward_std": 0.02835988998413086,
+      "rewards/bleu_reward_func/mean": 0.16988492012023926,
+      "rewards/bleu_reward_func/std": 0.22432467341423035,
+      "step": 645
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 350.625,
+      "completions/mean_terminated_length": 189.25,
+      "completions/min_length": 30.0,
+      "completions/min_terminated_length": 30.0,
+      "epoch": 0.5168,
+      "grad_norm": 3.603522539138794,
+      "kl": 0.106658935546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0683,
+      "num_tokens": 8527613.0,
+      "reward": 0.04950277507305145,
+      "reward_std": 0.02557562291622162,
+      "rewards/bleu_reward_func/mean": 0.04950277507305145,
+      "rewards/bleu_reward_func/std": 0.036064986139535904,
+      "step": 646
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 463.0,
+      "completions/mean_length": 240.71875,
+      "completions/mean_terminated_length": 190.48147583007812,
+      "completions/min_length": 88.0,
+      "completions/min_terminated_length": 88.0,
+      "epoch": 0.5176,
+      "grad_norm": 2.1311533451080322,
+      "kl": 0.040771484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0675,
+      "num_tokens": 8540012.0,
+      "reward": 0.4242175817489624,
+      "reward_std": 0.05443207919597626,
+      "rewards/bleu_reward_func/mean": 0.4242175817489624,
+      "rewards/bleu_reward_func/std": 0.3835957646369934,
+      "step": 647
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 488.0,
+      "completions/mean_length": 162.78125,
+      "completions/mean_terminated_length": 82.19231414794922,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.5184,
+      "grad_norm": 7.217045783996582,
+      "kl": 0.238250732421875,
+      "learning_rate": 1e-06,
+      "loss": -0.2492,
+      "num_tokens": 8550285.0,
+      "reward": 0.15483121573925018,
+      "reward_std": 0.04074571654200554,
+      "rewards/bleu_reward_func/mean": 0.15483121573925018,
+      "rewards/bleu_reward_func/std": 0.1628112941980362,
+      "step": 648
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 226.03125,
+      "completions/mean_terminated_length": 196.44827270507812,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.5192,
+      "grad_norm": 9.426239013671875,
+      "kl": 0.326446533203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0078,
+      "num_tokens": 8562006.0,
+      "reward": 0.22631683945655823,
+      "reward_std": 0.046764910221099854,
+      "rewards/bleu_reward_func/mean": 0.22631683945655823,
+      "rewards/bleu_reward_func/std": 0.24870522320270538,
+      "step": 649
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 501.0,
+      "completions/mean_length": 408.4375,
+      "completions/mean_terminated_length": 373.91668701171875,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 0.52,
+      "grad_norm": 2.5924437046051025,
+      "kl": 0.03192138671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0517,
+      "num_tokens": 8577388.0,
+      "reward": 0.050332337617874146,
+      "reward_std": 0.013445645570755005,
+      "rewards/bleu_reward_func/mean": 0.050332337617874146,
+      "rewards/bleu_reward_func/std": 0.04263650253415108,
+      "step": 650
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 490.0,
+      "completions/mean_length": 323.65625,
+      "completions/mean_terminated_length": 296.75,
+      "completions/min_length": 57.0,
+      "completions/min_terminated_length": 57.0,
+      "epoch": 0.5208,
+      "grad_norm": 6.97017765045166,
+      "kl": 0.062255859375,
+      "learning_rate": 1e-06,
+      "loss": 0.075,
+      "num_tokens": 8589889.0,
+      "reward": 0.0671861320734024,
+      "reward_std": 0.020000552758574486,
+      "rewards/bleu_reward_func/mean": 0.0671861320734024,
+      "rewards/bleu_reward_func/std": 0.027637863531708717,
+      "step": 651
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 500.0,
+      "completions/mean_length": 202.6875,
+      "completions/mean_terminated_length": 170.6896514892578,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.5216,
+      "grad_norm": 5.9939727783203125,
+      "kl": 0.30169677734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0126,
+      "num_tokens": 8603471.0,
+      "reward": 0.23086336255073547,
+      "reward_std": 0.03887036070227623,
+      "rewards/bleu_reward_func/mean": 0.23086336255073547,
+      "rewards/bleu_reward_func/std": 0.1954699456691742,
+      "step": 652
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 334.09375,
+      "completions/mean_terminated_length": 264.478271484375,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "epoch": 0.5224,
+      "grad_norm": 10.721747398376465,
+      "kl": 0.11480712890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0181,
+      "num_tokens": 8618802.0,
+      "reward": 0.12768197059631348,
+      "reward_std": 0.018044453114271164,
+      "rewards/bleu_reward_func/mean": 0.12768197059631348,
+      "rewards/bleu_reward_func/std": 0.18208470940589905,
+      "step": 653
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 498.0,
+      "completions/mean_length": 254.28125,
+      "completions/mean_terminated_length": 182.1199951171875,
+      "completions/min_length": 6.0,
+      "completions/min_terminated_length": 6.0,
+      "epoch": 0.5232,
+      "grad_norm": 21.832870483398438,
+      "kl": 0.224090576171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0275,
+      "num_tokens": 8633395.0,
+      "reward": 0.23750805854797363,
+      "reward_std": 0.10584703087806702,
+      "rewards/bleu_reward_func/mean": 0.23750805854797363,
+      "rewards/bleu_reward_func/std": 0.24472850561141968,
+      "step": 654
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 488.0,
+      "completions/mean_length": 293.375,
+      "completions/mean_terminated_length": 207.8260955810547,
+      "completions/min_length": 8.0,
+      "completions/min_terminated_length": 8.0,
+      "epoch": 0.524,
+      "grad_norm": 14.025049209594727,
+      "kl": 0.1319580078125,
+      "learning_rate": 1e-06,
+      "loss": 0.2269,
+      "num_tokens": 8647583.0,
+      "reward": 0.08922699838876724,
+      "reward_std": 0.022407300770282745,
+      "rewards/bleu_reward_func/mean": 0.08922699838876724,
+      "rewards/bleu_reward_func/std": 0.05691966786980629,
+      "step": 655
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 417.34375,
+      "completions/mean_terminated_length": 374.3182067871094,
+      "completions/min_length": 143.0,
+      "completions/min_terminated_length": 143.0,
+      "epoch": 0.5248,
+      "grad_norm": 2.506775379180908,
+      "kl": 0.030517578125,
+      "learning_rate": 1e-06,
+      "loss": -0.1182,
+      "num_tokens": 8664458.0,
+      "reward": 0.056899260729551315,
+      "reward_std": 0.024433575570583344,
+      "rewards/bleu_reward_func/mean": 0.056899260729551315,
+      "rewards/bleu_reward_func/std": 0.043169718235731125,
+      "step": 656
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 251.78125,
+      "completions/mean_terminated_length": 214.60714721679688,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.5256,
+      "grad_norm": 7.267916202545166,
+      "kl": 0.123870849609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0057,
+      "num_tokens": 8673803.0,
+      "reward": 0.10382385551929474,
+      "reward_std": 0.051886267960071564,
+      "rewards/bleu_reward_func/mean": 0.10382385551929474,
+      "rewards/bleu_reward_func/std": 0.06761174649000168,
+      "step": 657
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 288.0,
+      "completions/mean_length": 227.34375,
+      "completions/mean_terminated_length": 132.45834350585938,
+      "completions/min_length": 27.0,
+      "completions/min_terminated_length": 27.0,
+      "epoch": 0.5264,
+      "grad_norm": 5.546152114868164,
+      "kl": 0.098236083984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0623,
+      "num_tokens": 8685270.0,
+      "reward": 0.1565043181180954,
+      "reward_std": 0.08428065478801727,
+      "rewards/bleu_reward_func/mean": 0.1565043181180954,
+      "rewards/bleu_reward_func/std": 0.17227834463119507,
+      "step": 658
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 414.0,
+      "completions/mean_length": 152.96875,
+      "completions/mean_terminated_length": 86.48148345947266,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.5272,
+      "grad_norm": 14.263039588928223,
+      "kl": 0.30914306640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0075,
+      "num_tokens": 8696005.0,
+      "reward": 0.24965888261795044,
+      "reward_std": 0.051375266164541245,
+      "rewards/bleu_reward_func/mean": 0.24965888261795044,
+      "rewards/bleu_reward_func/std": 0.21870571374893188,
+      "step": 659
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 165.0,
+      "completions/max_terminated_length": 165.0,
+      "completions/mean_length": 32.5625,
+      "completions/mean_terminated_length": 32.5625,
+      "completions/min_length": 6.0,
+      "completions/min_terminated_length": 6.0,
+      "epoch": 0.528,
+      "grad_norm": 442.6045837402344,
+      "kl": 0.5015869140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0936,
+      "num_tokens": 8704767.0,
+      "reward": 0.13235034048557281,
+      "reward_std": 0.07672514766454697,
+      "rewards/bleu_reward_func/mean": 0.13235034048557281,
+      "rewards/bleu_reward_func/std": 0.13803941011428833,
+      "step": 660
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 431.0,
+      "completions/mean_length": 252.4375,
+      "completions/mean_terminated_length": 74.84210205078125,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.5288,
+      "grad_norm": 5.065097332000732,
+      "kl": 0.18255615234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0464,
+      "num_tokens": 8721005.0,
+      "reward": 0.2883501648902893,
+      "reward_std": 0.022871889173984528,
+      "rewards/bleu_reward_func/mean": 0.2883501648902893,
+      "rewards/bleu_reward_func/std": 0.23920658230781555,
+      "step": 661
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 402.0,
+      "completions/mean_length": 178.625,
+      "completions/mean_terminated_length": 144.13792419433594,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.5296,
+      "grad_norm": 4.1915202140808105,
+      "kl": 0.152069091796875,
+      "learning_rate": 1e-06,
+      "loss": 0.1483,
+      "num_tokens": 8729545.0,
+      "reward": 0.09845434874296188,
+      "reward_std": 0.049190133810043335,
+      "rewards/bleu_reward_func/mean": 0.09845434874296188,
+      "rewards/bleu_reward_func/std": 0.06372099369764328,
+      "step": 662
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 293.9375,
+      "completions/mean_terminated_length": 279.4000244140625,
+      "completions/min_length": 90.0,
+      "completions/min_terminated_length": 90.0,
+      "epoch": 0.5304,
+      "grad_norm": 10.675251007080078,
+      "kl": 0.0498046875,
+      "learning_rate": 1e-06,
+      "loss": 0.1798,
+      "num_tokens": 8741895.0,
+      "reward": 0.09067553281784058,
+      "reward_std": 0.036186374723911285,
+      "rewards/bleu_reward_func/mean": 0.09067553281784058,
+      "rewards/bleu_reward_func/std": 0.057389046996831894,
+      "step": 663
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 491.0,
+      "completions/mean_length": 157.75,
+      "completions/mean_terminated_length": 146.32257080078125,
+      "completions/min_length": 33.0,
+      "completions/min_terminated_length": 33.0,
+      "epoch": 0.5312,
+      "grad_norm": 42.93239212036133,
+      "kl": 0.20751953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0697,
+      "num_tokens": 8752255.0,
+      "reward": 0.27352431416511536,
+      "reward_std": 0.07531043887138367,
+      "rewards/bleu_reward_func/mean": 0.27352431416511536,
+      "rewards/bleu_reward_func/std": 0.13157765567302704,
+      "step": 664
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 497.0,
+      "completions/mean_length": 285.625,
+      "completions/mean_terminated_length": 222.239990234375,
+      "completions/min_length": 42.0,
+      "completions/min_terminated_length": 42.0,
+      "epoch": 0.532,
+      "grad_norm": 5.36870813369751,
+      "kl": 0.075042724609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0052,
+      "num_tokens": 8764259.0,
+      "reward": 0.024750784039497375,
+      "reward_std": 0.022341227158904076,
+      "rewards/bleu_reward_func/mean": 0.024750784039497375,
+      "rewards/bleu_reward_func/std": 0.03164950758218765,
+      "step": 665
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 498.0,
+      "completions/max_terminated_length": 498.0,
+      "completions/mean_length": 231.0,
+      "completions/mean_terminated_length": 231.0,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.5328,
+      "grad_norm": 5.825709342956543,
+      "kl": 0.2059326171875,
+      "learning_rate": 1e-06,
+      "loss": -0.126,
+      "num_tokens": 8776043.0,
+      "reward": 0.09888751804828644,
+      "reward_std": 0.027325943112373352,
+      "rewards/bleu_reward_func/mean": 0.09888751804828644,
+      "rewards/bleu_reward_func/std": 0.06260307133197784,
+      "step": 666
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 480.0,
+      "completions/mean_length": 154.09375,
+      "completions/mean_terminated_length": 71.5,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.5336,
+      "grad_norm": 13.691543579101562,
+      "kl": 0.3173828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0016,
+      "num_tokens": 8787118.0,
+      "reward": 0.16891013085842133,
+      "reward_std": 0.033737391233444214,
+      "rewards/bleu_reward_func/mean": 0.16891013085842133,
+      "rewards/bleu_reward_func/std": 0.1783466339111328,
+      "step": 667
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 458.0,
+      "completions/mean_length": 258.71875,
+      "completions/mean_terminated_length": 174.2916717529297,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.5344,
+      "grad_norm": 5.329594612121582,
+      "kl": 0.25360107421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0877,
+      "num_tokens": 8801157.0,
+      "reward": 0.071600541472435,
+      "reward_std": 0.021211300045251846,
+      "rewards/bleu_reward_func/mean": 0.071600541472435,
+      "rewards/bleu_reward_func/std": 0.054277434945106506,
+      "step": 668
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 463.0,
+      "completions/mean_length": 300.71875,
+      "completions/mean_terminated_length": 136.38888549804688,
+      "completions/min_length": 57.0,
+      "completions/min_terminated_length": 57.0,
+      "epoch": 0.5352,
+      "grad_norm": 10.509814262390137,
+      "kl": 0.1150054931640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0738,
+      "num_tokens": 8815244.0,
+      "reward": 0.09365338832139969,
+      "reward_std": 0.023113342002034187,
+      "rewards/bleu_reward_func/mean": 0.09365338832139969,
+      "rewards/bleu_reward_func/std": 0.0734696164727211,
+      "step": 669
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 496.0,
+      "completions/mean_length": 253.9375,
+      "completions/mean_terminated_length": 245.61289978027344,
+      "completions/min_length": 27.0,
+      "completions/min_terminated_length": 27.0,
+      "epoch": 0.536,
+      "grad_norm": 23.27628517150879,
+      "kl": 0.17572021484375,
+      "learning_rate": 1e-06,
+      "loss": -0.1243,
+      "num_tokens": 8826930.0,
+      "reward": 0.10506822168827057,
+      "reward_std": 0.023658432066440582,
+      "rewards/bleu_reward_func/mean": 0.10506822168827057,
+      "rewards/bleu_reward_func/std": 0.06695646047592163,
+      "step": 670
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 482.0,
+      "completions/mean_length": 447.71875,
+      "completions/mean_terminated_length": 374.86669921875,
+      "completions/min_length": 187.0,
+      "completions/min_terminated_length": 187.0,
+      "epoch": 0.5368,
+      "grad_norm": 3.2729108333587646,
+      "kl": 0.028778076171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0068,
+      "num_tokens": 8843889.0,
+      "reward": 0.025088129565119743,
+      "reward_std": 0.00651167519390583,
+      "rewards/bleu_reward_func/mean": 0.025088129565119743,
+      "rewards/bleu_reward_func/std": 0.02992870658636093,
+      "step": 671
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 488.0,
+      "completions/mean_length": 220.0,
+      "completions/mean_terminated_length": 210.5806427001953,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.5376,
+      "grad_norm": 245.07106018066406,
+      "kl": 0.28985595703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0867,
+      "num_tokens": 8855401.0,
+      "reward": 0.09645279496908188,
+      "reward_std": 0.0731353610754013,
+      "rewards/bleu_reward_func/mean": 0.09645279496908188,
+      "rewards/bleu_reward_func/std": 0.09792789071798325,
+      "step": 672
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 497.0,
+      "completions/mean_length": 257.0625,
+      "completions/mean_terminated_length": 230.6896514892578,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.5384,
+      "grad_norm": 4.84860897064209,
+      "kl": 0.12005615234375,
+      "learning_rate": 1e-06,
+      "loss": -0.1854,
+      "num_tokens": 8867371.0,
+      "reward": 0.11370459198951721,
+      "reward_std": 0.06978605687618256,
+      "rewards/bleu_reward_func/mean": 0.11370459198951721,
+      "rewards/bleu_reward_func/std": 0.18471869826316833,
+      "step": 673
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 463.0,
+      "completions/mean_length": 237.78125,
+      "completions/mean_terminated_length": 161.0,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.5392,
+      "grad_norm": 33.883243560791016,
+      "kl": 0.1602783203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0528,
+      "num_tokens": 8882588.0,
+      "reward": 0.20929288864135742,
+      "reward_std": 0.04879160225391388,
+      "rewards/bleu_reward_func/mean": 0.20929288864135742,
+      "rewards/bleu_reward_func/std": 0.17186923325061798,
+      "step": 674
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 473.0,
+      "completions/mean_length": 274.5,
+      "completions/mean_terminated_length": 240.57144165039062,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.54,
+      "grad_norm": 6.54965877532959,
+      "kl": 0.08441162109375,
+      "learning_rate": 1e-06,
+      "loss": -0.037,
+      "num_tokens": 8894508.0,
+      "reward": 0.04714702442288399,
+      "reward_std": 0.010213707573711872,
+      "rewards/bleu_reward_func/mean": 0.04714702442288399,
+      "rewards/bleu_reward_func/std": 0.04436042159795761,
+      "step": 675
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 497.0,
+      "completions/mean_length": 183.375,
+      "completions/mean_terminated_length": 91.36000061035156,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.5408,
+      "grad_norm": 11.63567066192627,
+      "kl": 0.16424560546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0095,
+      "num_tokens": 8902704.0,
+      "reward": 0.18900102376937866,
+      "reward_std": 0.045921262353658676,
+      "rewards/bleu_reward_func/mean": 0.18900102376937866,
+      "rewards/bleu_reward_func/std": 0.25213196873664856,
+      "step": 676
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 496.0,
+      "completions/mean_length": 181.96875,
+      "completions/mean_terminated_length": 105.80769348144531,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.5416,
+      "grad_norm": 180.27212524414062,
+      "kl": 0.203125,
+      "learning_rate": 1e-06,
+      "loss": 0.5786,
+      "num_tokens": 8917647.0,
+      "reward": 0.11598189175128937,
+      "reward_std": 0.0453701987862587,
+      "rewards/bleu_reward_func/mean": 0.11598189175128937,
+      "rewards/bleu_reward_func/std": 0.12164945900440216,
+      "step": 677
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 258.0,
+      "completions/mean_length": 187.59375,
+      "completions/mean_terminated_length": 79.45833587646484,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "epoch": 0.5424,
+      "grad_norm": 13.031902313232422,
+      "kl": 0.232452392578125,
+      "learning_rate": 1e-06,
+      "loss": 0.1064,
+      "num_tokens": 8927434.0,
+      "reward": 0.2563853859901428,
+      "reward_std": 0.021821634843945503,
+      "rewards/bleu_reward_func/mean": 0.2563853859901428,
+      "rewards/bleu_reward_func/std": 0.25126466155052185,
+      "step": 678
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 501.0,
+      "completions/mean_length": 157.53125,
+      "completions/mean_terminated_length": 133.90000915527344,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.5432,
+      "grad_norm": 37.64163589477539,
+      "kl": 0.17242431640625,
+      "learning_rate": 1e-06,
+      "loss": -0.1013,
+      "num_tokens": 8939683.0,
+      "reward": 0.22671283781528473,
+      "reward_std": 0.05255472660064697,
+      "rewards/bleu_reward_func/mean": 0.22671283781528473,
+      "rewards/bleu_reward_func/std": 0.22751960158348083,
+      "step": 679
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 486.0,
+      "completions/mean_length": 263.71875,
+      "completions/mean_terminated_length": 238.03448486328125,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.544,
+      "grad_norm": 129.8694610595703,
+      "kl": 0.1982421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0067,
+      "num_tokens": 8949442.0,
+      "reward": 0.08898493647575378,
+      "reward_std": 0.0456019788980484,
+      "rewards/bleu_reward_func/mean": 0.08898493647575378,
+      "rewards/bleu_reward_func/std": 0.10032162815332413,
+      "step": 680
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 510.1875,
+      "completions/mean_terminated_length": 483.0,
+      "completions/min_length": 457.0,
+      "completions/min_terminated_length": 457.0,
+      "epoch": 0.5448,
+      "grad_norm": 16.652488708496094,
+      "kl": 0.063751220703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0038,
+      "num_tokens": 8968384.0,
+      "reward": 0.04759781062602997,
+      "reward_std": 0.009598957374691963,
+      "rewards/bleu_reward_func/mean": 0.04759781062602997,
+      "rewards/bleu_reward_func/std": 0.050501517951488495,
+      "step": 681
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 503.0,
+      "completions/mean_length": 374.875,
+      "completions/mean_terminated_length": 253.88235473632812,
+      "completions/min_length": 28.0,
+      "completions/min_terminated_length": 28.0,
+      "epoch": 0.5456,
+      "grad_norm": 130.1652069091797,
+      "kl": 0.032989501953125,
+      "learning_rate": 1e-06,
+      "loss": -0.213,
+      "num_tokens": 8984364.0,
+      "reward": 0.07015404105186462,
+      "reward_std": 0.037277355790138245,
+      "rewards/bleu_reward_func/mean": 0.07015404105186462,
+      "rewards/bleu_reward_func/std": 0.10696472972631454,
+      "step": 682
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 492.0,
+      "completions/max_terminated_length": 492.0,
+      "completions/mean_length": 221.59375,
+      "completions/mean_terminated_length": 221.59375,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.5464,
+      "grad_norm": 15.168272018432617,
+      "kl": 0.25604248046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0072,
+      "num_tokens": 8993615.0,
+      "reward": 0.05222689360380173,
+      "reward_std": 0.015750503167510033,
+      "rewards/bleu_reward_func/mean": 0.05222689360380173,
+      "rewards/bleu_reward_func/std": 0.03590291365981102,
+      "step": 683
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 385.0,
+      "completions/mean_length": 231.0625,
+      "completions/mean_terminated_length": 121.13043975830078,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.5472,
+      "grad_norm": 35.623695373535156,
+      "kl": 0.1175537109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0464,
+      "num_tokens": 9004377.0,
+      "reward": 0.04063406586647034,
+      "reward_std": 0.028225397691130638,
+      "rewards/bleu_reward_func/mean": 0.04063406586647034,
+      "rewards/bleu_reward_func/std": 0.05525263398885727,
+      "step": 684
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 482.0,
+      "completions/mean_length": 361.78125,
+      "completions/mean_terminated_length": 259.0,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "epoch": 0.548,
+      "grad_norm": 593.3710327148438,
+      "kl": 0.0638427734375,
+      "learning_rate": 1e-06,
+      "loss": 0.1608,
+      "num_tokens": 9020330.0,
+      "reward": 0.1717539131641388,
+      "reward_std": 0.0751393511891365,
+      "rewards/bleu_reward_func/mean": 0.1717539131641388,
+      "rewards/bleu_reward_func/std": 0.25347769260406494,
+      "step": 685
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 462.0,
+      "completions/max_terminated_length": 462.0,
+      "completions/mean_length": 175.59375,
+      "completions/mean_terminated_length": 175.59375,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.5488,
+      "grad_norm": 7.423861980438232,
+      "kl": 0.170654296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0565,
+      "num_tokens": 9028509.0,
+      "reward": 0.091148242354393,
+      "reward_std": 0.017926650121808052,
+      "rewards/bleu_reward_func/mean": 0.091148242354393,
+      "rewards/bleu_reward_func/std": 0.07815965265035629,
+      "step": 686
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 479.0,
+      "completions/mean_length": 455.15625,
+      "completions/mean_terminated_length": 398.3125,
+      "completions/min_length": 28.0,
+      "completions/min_terminated_length": 28.0,
+      "epoch": 0.5496,
+      "grad_norm": 2.612523317337036,
+      "kl": 0.033416748046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0994,
+      "num_tokens": 9046090.0,
+      "reward": 0.04202251136302948,
+      "reward_std": 0.015885071828961372,
+      "rewards/bleu_reward_func/mean": 0.04202251136302948,
+      "rewards/bleu_reward_func/std": 0.03677666559815407,
+      "step": 687
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 448.9375,
+      "completions/mean_terminated_length": 399.8888854980469,
+      "completions/min_length": 97.0,
+      "completions/min_terminated_length": 97.0,
+      "epoch": 0.5504,
+      "grad_norm": 14.944954872131348,
+      "kl": 0.056488037109375,
+      "learning_rate": 1e-06,
+      "loss": 0.1249,
+      "num_tokens": 9064360.0,
+      "reward": 0.03943703696131706,
+      "reward_std": 0.024654783308506012,
+      "rewards/bleu_reward_func/mean": 0.03943703696131706,
+      "rewards/bleu_reward_func/std": 0.03771531209349632,
+      "step": 688
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 510.0,
+      "completions/mean_length": 165.3125,
+      "completions/mean_terminated_length": 85.30769348144531,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "epoch": 0.5512,
+      "grad_norm": 66.34259033203125,
+      "kl": 0.408477783203125,
+      "learning_rate": 1e-06,
+      "loss": -0.1375,
+      "num_tokens": 9078698.0,
+      "reward": 0.20156420767307281,
+      "reward_std": 0.07818345725536346,
+      "rewards/bleu_reward_func/mean": 0.20156420767307281,
+      "rewards/bleu_reward_func/std": 0.23512743413448334,
+      "step": 689
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 471.0,
+      "completions/mean_length": 224.0,
+      "completions/mean_terminated_length": 170.6666717529297,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.552,
+      "grad_norm": 45.66501998901367,
+      "kl": 1.2672119140625,
+      "learning_rate": 1e-06,
+      "loss": 0.3161,
+      "num_tokens": 9087402.0,
+      "reward": 0.12671013176441193,
+      "reward_std": 0.03653056174516678,
+      "rewards/bleu_reward_func/mean": 0.12671013176441193,
+      "rewards/bleu_reward_func/std": 0.0971146747469902,
+      "step": 690
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 442.0,
+      "completions/mean_length": 237.125,
+      "completions/mean_terminated_length": 93.14286041259766,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "epoch": 0.5528,
+      "grad_norm": 497.3811950683594,
+      "kl": 0.163970947265625,
+      "learning_rate": 1e-06,
+      "loss": 0.069,
+      "num_tokens": 9099254.0,
+      "reward": 0.0354698970913887,
+      "reward_std": 0.02819395810365677,
+      "rewards/bleu_reward_func/mean": 0.0354698970913887,
+      "rewards/bleu_reward_func/std": 0.030991537496447563,
+      "step": 691
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 504.0,
+      "completions/mean_length": 287.65625,
+      "completions/mean_terminated_length": 246.11111450195312,
+      "completions/min_length": 36.0,
+      "completions/min_terminated_length": 36.0,
+      "epoch": 0.5536,
+      "grad_norm": 6.150320053100586,
+      "kl": 0.086273193359375,
+      "learning_rate": 1e-06,
+      "loss": 0.1272,
+      "num_tokens": 9110347.0,
+      "reward": 0.06792090833187103,
+      "reward_std": 0.02885974571108818,
+      "rewards/bleu_reward_func/mean": 0.06792090833187103,
+      "rewards/bleu_reward_func/std": 0.06222621724009514,
+      "step": 692
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 491.0,
+      "completions/mean_length": 244.84375,
+      "completions/mean_terminated_length": 140.30435180664062,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.5544,
+      "grad_norm": 36.0806999206543,
+      "kl": 0.23052978515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0802,
+      "num_tokens": 9121382.0,
+      "reward": 0.09483693540096283,
+      "reward_std": 0.05147245526313782,
+      "rewards/bleu_reward_func/mean": 0.09483693540096283,
+      "rewards/bleu_reward_func/std": 0.08640998601913452,
+      "step": 693
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 361.0,
+      "completions/mean_length": 155.0,
+      "completions/mean_terminated_length": 131.20001220703125,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "epoch": 0.5552,
+      "grad_norm": 55.443695068359375,
+      "kl": 0.318450927734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0617,
+      "num_tokens": 9132678.0,
+      "reward": 0.10359849035739899,
+      "reward_std": 0.07937172800302505,
+      "rewards/bleu_reward_func/mean": 0.10359849035739899,
+      "rewards/bleu_reward_func/std": 0.13979652523994446,
+      "step": 694
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 305.0,
+      "completions/mean_length": 236.65625,
+      "completions/mean_terminated_length": 71.45000457763672,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.556,
+      "grad_norm": 146.36070251464844,
+      "kl": 0.099609375,
+      "learning_rate": 1e-06,
+      "loss": 0.1483,
+      "num_tokens": 9145227.0,
+      "reward": 0.17079538106918335,
+      "reward_std": 0.05914284288883209,
+      "rewards/bleu_reward_func/mean": 0.17079538106918335,
+      "rewards/bleu_reward_func/std": 0.23325958847999573,
+      "step": 695
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 472.0,
+      "completions/mean_length": 228.5625,
+      "completions/mean_terminated_length": 134.08334350585938,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.5568,
+      "grad_norm": 298.8739013671875,
+      "kl": 0.1568603515625,
+      "learning_rate": 1e-06,
+      "loss": -0.1223,
+      "num_tokens": 9160869.0,
+      "reward": 0.037290386855602264,
+      "reward_std": 0.014398223720490932,
+      "rewards/bleu_reward_func/mean": 0.037290386855602264,
+      "rewards/bleu_reward_func/std": 0.03690984100103378,
+      "step": 696
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 229.3125,
+      "completions/mean_terminated_length": 164.07693481445312,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.5576,
+      "grad_norm": 220.50381469726562,
+      "kl": 0.365478515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0947,
+      "num_tokens": 9170959.0,
+      "reward": 0.11910027265548706,
+      "reward_std": 0.04049726575613022,
+      "rewards/bleu_reward_func/mean": 0.11910027265548706,
+      "rewards/bleu_reward_func/std": 0.14281374216079712,
+      "step": 697
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 473.0,
+      "completions/mean_length": 216.15625,
+      "completions/mean_terminated_length": 161.37037658691406,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.5584,
+      "grad_norm": 11.570438385009766,
+      "kl": 0.205352783203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0257,
+      "num_tokens": 9181980.0,
+      "reward": 0.08099336922168732,
+      "reward_std": 0.04509742930531502,
+      "rewards/bleu_reward_func/mean": 0.08099336922168732,
+      "rewards/bleu_reward_func/std": 0.10288692265748978,
+      "step": 698
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 463.0,
+      "completions/mean_length": 349.96875,
+      "completions/mean_terminated_length": 304.6000061035156,
+      "completions/min_length": 96.0,
+      "completions/min_terminated_length": 96.0,
+      "epoch": 0.5592,
+      "grad_norm": 37.448883056640625,
+      "kl": 0.09320068359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0422,
+      "num_tokens": 9197723.0,
+      "reward": 0.05653442069888115,
+      "reward_std": 0.026268266141414642,
+      "rewards/bleu_reward_func/mean": 0.05653442069888115,
+      "rewards/bleu_reward_func/std": 0.04277388006448746,
+      "step": 699
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 503.0,
+      "completions/mean_length": 255.03125,
+      "completions/mean_terminated_length": 195.73077392578125,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "epoch": 0.56,
+      "grad_norm": 340.22479248046875,
+      "kl": 0.41278076171875,
+      "learning_rate": 1e-06,
+      "loss": -0.2392,
+      "num_tokens": 9210148.0,
+      "reward": 0.06149371713399887,
+      "reward_std": 0.023687850683927536,
+      "rewards/bleu_reward_func/mean": 0.06149371713399887,
+      "rewards/bleu_reward_func/std": 0.03754807263612747,
+      "step": 700
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 497.0,
+      "completions/mean_length": 282.84375,
+      "completions/mean_terminated_length": 145.35000610351562,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.5608,
+      "grad_norm": 404.3658752441406,
+      "kl": 0.29132080078125,
+      "learning_rate": 1e-06,
+      "loss": 0.148,
+      "num_tokens": 9222815.0,
+      "reward": 0.05664993077516556,
+      "reward_std": 0.01803937554359436,
+      "rewards/bleu_reward_func/mean": 0.05664993077516556,
+      "rewards/bleu_reward_func/std": 0.02324024587869644,
+      "step": 701
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 205.0,
+      "completions/mean_length": 116.40625,
+      "completions/mean_terminated_length": 59.892860412597656,
+      "completions/min_length": 8.0,
+      "completions/min_terminated_length": 8.0,
+      "epoch": 0.5616,
+      "grad_norm": 43.716766357421875,
+      "kl": 0.216552734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0484,
+      "num_tokens": 9231460.0,
+      "reward": 0.1364556849002838,
+      "reward_std": 0.09380181133747101,
+      "rewards/bleu_reward_func/mean": 0.1364556849002838,
+      "rewards/bleu_reward_func/std": 0.21690192818641663,
+      "step": 702
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 477.0,
+      "completions/mean_length": 305.65625,
+      "completions/mean_terminated_length": 211.8636474609375,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "epoch": 0.5624,
+      "grad_norm": 745.7041625976562,
+      "kl": 0.14068603515625,
+      "learning_rate": 1e-06,
+      "loss": 0.1945,
+      "num_tokens": 9245265.0,
+      "reward": 0.04224216938018799,
+      "reward_std": 0.016487902030348778,
+      "rewards/bleu_reward_func/mean": 0.04224216938018799,
+      "rewards/bleu_reward_func/std": 0.025008555501699448,
+      "step": 703
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 408.0,
+      "completions/mean_length": 115.0625,
+      "completions/mean_terminated_length": 102.25806427001953,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "epoch": 0.5632,
+      "grad_norm": 216.922607421875,
+      "kl": 0.3056640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0737,
+      "num_tokens": 9253899.0,
+      "reward": 0.08648309111595154,
+      "reward_std": 0.03777506947517395,
+      "rewards/bleu_reward_func/mean": 0.08648309111595154,
+      "rewards/bleu_reward_func/std": 0.05951961874961853,
+      "step": 704
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 461.0,
+      "completions/mean_length": 253.71875,
+      "completions/mean_terminated_length": 98.75,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.564,
+      "grad_norm": 162.15737915039062,
+      "kl": 0.1361083984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0623,
+      "num_tokens": 9269314.0,
+      "reward": 0.2322496771812439,
+      "reward_std": 0.045732706785202026,
+      "rewards/bleu_reward_func/mean": 0.2322496771812439,
+      "rewards/bleu_reward_func/std": 0.25273510813713074,
+      "step": 705
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 504.0,
+      "completions/mean_length": 340.0625,
+      "completions/mean_terminated_length": 272.7826232910156,
+      "completions/min_length": 40.0,
+      "completions/min_terminated_length": 40.0,
+      "epoch": 0.5648,
+      "grad_norm": 4.662173271179199,
+      "kl": 0.035003662109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0266,
+      "num_tokens": 9282188.0,
+      "reward": 0.04237870126962662,
+      "reward_std": 0.01780301332473755,
+      "rewards/bleu_reward_func/mean": 0.04237870126962662,
+      "rewards/bleu_reward_func/std": 0.04967799782752991,
+      "step": 706
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 438.0,
+      "completions/mean_length": 349.6875,
+      "completions/mean_terminated_length": 275.9090881347656,
+      "completions/min_length": 88.0,
+      "completions/min_terminated_length": 88.0,
+      "epoch": 0.5656,
+      "grad_norm": 2.4642882347106934,
+      "kl": 0.04644775390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0446,
+      "num_tokens": 9298362.0,
+      "reward": 0.0549091175198555,
+      "reward_std": 0.03525715321302414,
+      "rewards/bleu_reward_func/mean": 0.0549091175198555,
+      "rewards/bleu_reward_func/std": 0.051221489906311035,
+      "step": 707
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 510.0,
+      "completions/mean_length": 249.875,
+      "completions/mean_terminated_length": 232.40000915527344,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "epoch": 0.5664,
+      "grad_norm": 11.45730972290039,
+      "kl": 0.16558837890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0823,
+      "num_tokens": 9308934.0,
+      "reward": 0.10365931689739227,
+      "reward_std": 0.028398117050528526,
+      "rewards/bleu_reward_func/mean": 0.10365931689739227,
+      "rewards/bleu_reward_func/std": 0.06428122520446777,
+      "step": 708
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 452.0,
+      "completions/mean_length": 254.0625,
+      "completions/mean_terminated_length": 99.30000305175781,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.5672,
+      "grad_norm": 7.227019786834717,
+      "kl": 0.154449462890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0107,
+      "num_tokens": 9322640.0,
+      "reward": 0.18866762518882751,
+      "reward_std": 0.044271718710660934,
+      "rewards/bleu_reward_func/mean": 0.18866762518882751,
+      "rewards/bleu_reward_func/std": 0.1287185698747635,
+      "step": 709
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 304.96875,
+      "completions/mean_terminated_length": 283.5517272949219,
+      "completions/min_length": 35.0,
+      "completions/min_terminated_length": 35.0,
+      "epoch": 0.568,
+      "grad_norm": 5.722259998321533,
+      "kl": 0.07122802734375,
+      "learning_rate": 1e-06,
+      "loss": -0.1128,
+      "num_tokens": 9337639.0,
+      "reward": 0.0695868507027626,
+      "reward_std": 0.022387558594346046,
+      "rewards/bleu_reward_func/mean": 0.0695868507027626,
+      "rewards/bleu_reward_func/std": 0.065777987241745,
+      "step": 710
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 387.0,
+      "completions/mean_length": 150.09375,
+      "completions/mean_terminated_length": 125.9666748046875,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.5688,
+      "grad_norm": 5.365989685058594,
+      "kl": 0.0738525390625,
+      "learning_rate": 1e-06,
+      "loss": 0.3885,
+      "num_tokens": 9346522.0,
+      "reward": 0.2348867505788803,
+      "reward_std": 0.09850712865591049,
+      "rewards/bleu_reward_func/mean": 0.2348867505788803,
+      "rewards/bleu_reward_func/std": 0.302653044462204,
+      "step": 711
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 492.0,
+      "completions/mean_length": 342.65625,
+      "completions/mean_terminated_length": 265.68182373046875,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.5696,
+      "grad_norm": 7.032015800476074,
+      "kl": 0.060089111328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0338,
+      "num_tokens": 9359935.0,
+      "reward": 0.053069278597831726,
+      "reward_std": 0.015607406385242939,
+      "rewards/bleu_reward_func/mean": 0.053069278597831726,
+      "rewards/bleu_reward_func/std": 0.0380670465528965,
+      "step": 712
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 372.0,
+      "completions/mean_length": 166.46875,
+      "completions/mean_terminated_length": 69.72000122070312,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.5704,
+      "grad_norm": 7.543166160583496,
+      "kl": 0.26068115234375,
+      "learning_rate": 1e-06,
+      "loss": 0.3241,
+      "num_tokens": 9367886.0,
+      "reward": 0.17086198925971985,
+      "reward_std": 0.059704020619392395,
+      "rewards/bleu_reward_func/mean": 0.17086198925971985,
+      "rewards/bleu_reward_func/std": 0.13924144208431244,
+      "step": 713
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 496.0,
+      "completions/mean_length": 306.3125,
+      "completions/mean_terminated_length": 248.72000122070312,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.5712,
+      "grad_norm": 3.74664568901062,
+      "kl": 0.086090087890625,
+      "learning_rate": 1e-06,
+      "loss": -0.2145,
+      "num_tokens": 9381256.0,
+      "reward": 0.026702141389250755,
+      "reward_std": 0.010159555822610855,
+      "rewards/bleu_reward_func/mean": 0.026702141389250755,
+      "rewards/bleu_reward_func/std": 0.025492098182439804,
+      "step": 714
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 374.0,
+      "completions/mean_length": 202.40625,
+      "completions/mean_terminated_length": 145.07408142089844,
+      "completions/min_length": 42.0,
+      "completions/min_terminated_length": 42.0,
+      "epoch": 0.572,
+      "grad_norm": 3.670616865158081,
+      "kl": 0.0810394287109375,
+      "learning_rate": 1e-06,
+      "loss": 0.3116,
+      "num_tokens": 9394869.0,
+      "reward": 0.09095513820648193,
+      "reward_std": 0.0500517264008522,
+      "rewards/bleu_reward_func/mean": 0.09095513820648193,
+      "rewards/bleu_reward_func/std": 0.07213761657476425,
+      "step": 715
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 432.0,
+      "completions/mean_length": 221.5625,
+      "completions/mean_terminated_length": 202.20001220703125,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.5728,
+      "grad_norm": 9.221464157104492,
+      "kl": 0.10400390625,
+      "learning_rate": 1e-06,
+      "loss": 0.3058,
+      "num_tokens": 9405335.0,
+      "reward": 0.12374541163444519,
+      "reward_std": 0.040764160454273224,
+      "rewards/bleu_reward_func/mean": 0.12374541163444519,
+      "rewards/bleu_reward_func/std": 0.13386160135269165,
+      "step": 716
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 347.1875,
+      "completions/mean_terminated_length": 234.42105102539062,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.5736,
+      "grad_norm": 12.991151809692383,
+      "kl": 0.157958984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0424,
+      "num_tokens": 9420789.0,
+      "reward": 0.07029742747545242,
+      "reward_std": 0.012854170054197311,
+      "rewards/bleu_reward_func/mean": 0.07029742747545242,
+      "rewards/bleu_reward_func/std": 0.041719451546669006,
+      "step": 717
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 495.0,
+      "completions/mean_length": 503.75,
+      "completions/mean_terminated_length": 474.2857360839844,
+      "completions/min_length": 408.0,
+      "completions/min_terminated_length": 408.0,
+      "epoch": 0.5744,
+      "grad_norm": 2.107853889465332,
+      "kl": 0.028350830078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0047,
+      "num_tokens": 9439789.0,
+      "reward": 0.05256051570177078,
+      "reward_std": 0.010154004208743572,
+      "rewards/bleu_reward_func/mean": 0.05256051570177078,
+      "rewards/bleu_reward_func/std": 0.03522626310586929,
+      "step": 718
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 496.0,
+      "completions/mean_length": 244.15625,
+      "completions/mean_terminated_length": 235.51612854003906,
+      "completions/min_length": 73.0,
+      "completions/min_terminated_length": 73.0,
+      "epoch": 0.5752,
+      "grad_norm": 3.158909559249878,
+      "kl": 0.0923919677734375,
+      "learning_rate": 1e-06,
+      "loss": -0.1019,
+      "num_tokens": 9451978.0,
+      "reward": 0.12841522693634033,
+      "reward_std": 0.05657704174518585,
+      "rewards/bleu_reward_func/mean": 0.12841522693634033,
+      "rewards/bleu_reward_func/std": 0.07523242384195328,
+      "step": 719
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 494.0,
+      "completions/mean_length": 150.4375,
+      "completions/mean_terminated_length": 83.48148345947266,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.576,
+      "grad_norm": 8.099937438964844,
+      "kl": 0.214874267578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0129,
+      "num_tokens": 9461736.0,
+      "reward": 0.08868992328643799,
+      "reward_std": 0.017071515321731567,
+      "rewards/bleu_reward_func/mean": 0.08868992328643799,
+      "rewards/bleu_reward_func/std": 0.08577441424131393,
+      "step": 720
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 504.0,
+      "completions/mean_length": 447.65625,
+      "completions/mean_terminated_length": 383.3125,
+      "completions/min_length": 89.0,
+      "completions/min_terminated_length": 89.0,
+      "epoch": 0.5768,
+      "grad_norm": 2.4854142665863037,
+      "kl": 0.045318603515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0504,
+      "num_tokens": 9477381.0,
+      "reward": 0.0682106539607048,
+      "reward_std": 0.022257793694734573,
+      "rewards/bleu_reward_func/mean": 0.0682106539607048,
+      "rewards/bleu_reward_func/std": 0.05095710977911949,
+      "step": 721
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 350.0,
+      "completions/mean_length": 167.84375,
+      "completions/mean_terminated_length": 156.74192810058594,
+      "completions/min_length": 69.0,
+      "completions/min_terminated_length": 69.0,
+      "epoch": 0.5776,
+      "grad_norm": 4.308876037597656,
+      "kl": 0.07659912109375,
+      "learning_rate": 1e-06,
+      "loss": 0.1774,
+      "num_tokens": 9489008.0,
+      "reward": 0.0951995924115181,
+      "reward_std": 0.033833228051662445,
+      "rewards/bleu_reward_func/mean": 0.0951995924115181,
+      "rewards/bleu_reward_func/std": 0.0729941874742508,
+      "step": 722
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 478.0,
+      "completions/mean_length": 179.34375,
+      "completions/mean_terminated_length": 144.9310302734375,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.5784,
+      "grad_norm": 12.132597923278809,
+      "kl": 0.33624267578125,
+      "learning_rate": 1e-06,
+      "loss": 0.2153,
+      "num_tokens": 9498059.0,
+      "reward": 0.10619839280843735,
+      "reward_std": 0.04761648178100586,
+      "rewards/bleu_reward_func/mean": 0.10619839280843735,
+      "rewards/bleu_reward_func/std": 0.0809776559472084,
+      "step": 723
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 374.78125,
+      "completions/mean_terminated_length": 268.0555725097656,
+      "completions/min_length": 39.0,
+      "completions/min_terminated_length": 39.0,
+      "epoch": 0.5792,
+      "grad_norm": 7.476963996887207,
+      "kl": 0.1087646484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0108,
+      "num_tokens": 9517972.0,
+      "reward": 0.17816489934921265,
+      "reward_std": 0.016055870801210403,
+      "rewards/bleu_reward_func/mean": 0.17816489934921265,
+      "rewards/bleu_reward_func/std": 0.266427606344223,
+      "step": 724
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 503.0,
+      "completions/mean_length": 220.125,
+      "completions/mean_terminated_length": 152.7692413330078,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.58,
+      "grad_norm": 5.399383544921875,
+      "kl": 0.13983154296875,
+      "learning_rate": 1e-06,
+      "loss": 0.1212,
+      "num_tokens": 9530760.0,
+      "reward": 0.07149016857147217,
+      "reward_std": 0.023819390684366226,
+      "rewards/bleu_reward_func/mean": 0.07149016857147217,
+      "rewards/bleu_reward_func/std": 0.053011875599622726,
+      "step": 725
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 475.0,
+      "completions/mean_length": 500.3125,
+      "completions/mean_terminated_length": 418.5,
+      "completions/min_length": 355.0,
+      "completions/min_terminated_length": 355.0,
+      "epoch": 0.5808,
+      "grad_norm": 2.243359088897705,
+      "kl": 0.028045654296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0175,
+      "num_tokens": 9550330.0,
+      "reward": 0.04440176486968994,
+      "reward_std": 0.00922885537147522,
+      "rewards/bleu_reward_func/mean": 0.04440176486968994,
+      "rewards/bleu_reward_func/std": 0.03932040557265282,
+      "step": 726
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 498.0,
+      "completions/mean_length": 154.65625,
+      "completions/mean_terminated_length": 103.60714721679688,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.5816,
+      "grad_norm": 54.60096740722656,
+      "kl": 0.27252197265625,
+      "learning_rate": 1e-06,
+      "loss": 0.1714,
+      "num_tokens": 9557863.0,
+      "reward": 0.34987902641296387,
+      "reward_std": 0.09637948125600815,
+      "rewards/bleu_reward_func/mean": 0.34987902641296387,
+      "rewards/bleu_reward_func/std": 0.30998000502586365,
+      "step": 727
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 400.0,
+      "completions/mean_length": 394.21875,
+      "completions/mean_terminated_length": 197.9166717529297,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "epoch": 0.5824,
+      "grad_norm": 3.5557267665863037,
+      "kl": 0.04107666015625,
+      "learning_rate": 1e-06,
+      "loss": -0.2035,
+      "num_tokens": 9576342.0,
+      "reward": 0.03708350285887718,
+      "reward_std": 0.013400746509432793,
+      "rewards/bleu_reward_func/mean": 0.03708350285887718,
+      "rewards/bleu_reward_func/std": 0.030460968613624573,
+      "step": 728
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 492.0,
+      "completions/mean_length": 287.53125,
+      "completions/mean_terminated_length": 272.5666809082031,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.5832,
+      "grad_norm": 5.337757110595703,
+      "kl": 0.106048583984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0648,
+      "num_tokens": 9587719.0,
+      "reward": 0.08226186782121658,
+      "reward_std": 0.016267672181129456,
+      "rewards/bleu_reward_func/mean": 0.08226186782121658,
+      "rewards/bleu_reward_func/std": 0.047058336436748505,
+      "step": 729
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 497.0,
+      "completions/mean_length": 353.46875,
+      "completions/mean_terminated_length": 291.4347839355469,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.584,
+      "grad_norm": 172.38458251953125,
+      "kl": 0.142059326171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0203,
+      "num_tokens": 9604270.0,
+      "reward": 0.08777523040771484,
+      "reward_std": 0.028989041224122047,
+      "rewards/bleu_reward_func/mean": 0.08777523040771484,
+      "rewards/bleu_reward_func/std": 0.053535155951976776,
+      "step": 730
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 412.0,
+      "completions/max_terminated_length": 412.0,
+      "completions/mean_length": 123.5625,
+      "completions/mean_terminated_length": 123.5625,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.5848,
+      "grad_norm": 14.68234920501709,
+      "kl": 0.1837158203125,
+      "learning_rate": 1e-06,
+      "loss": 0.053,
+      "num_tokens": 9613376.0,
+      "reward": 0.09781108796596527,
+      "reward_std": 0.03509049117565155,
+      "rewards/bleu_reward_func/mean": 0.09781108796596527,
+      "rewards/bleu_reward_func/std": 0.07531887292861938,
+      "step": 731
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 499.0,
+      "completions/mean_length": 147.53125,
+      "completions/mean_terminated_length": 63.42308044433594,
+      "completions/min_length": 6.0,
+      "completions/min_terminated_length": 6.0,
+      "epoch": 0.5856,
+      "grad_norm": 471.59912109375,
+      "kl": 0.191680908203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0896,
+      "num_tokens": 9624305.0,
+      "reward": 0.08778894692659378,
+      "reward_std": 0.025603748857975006,
+      "rewards/bleu_reward_func/mean": 0.08778894692659378,
+      "rewards/bleu_reward_func/std": 0.06823020428419113,
+      "step": 732
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 254.0,
+      "completions/mean_terminated_length": 194.4615478515625,
+      "completions/min_length": 87.0,
+      "completions/min_terminated_length": 87.0,
+      "epoch": 0.5864,
+      "grad_norm": 3.397786855697632,
+      "kl": 0.03460693359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0596,
+      "num_tokens": 9634593.0,
+      "reward": 0.08798034489154816,
+      "reward_std": 0.02149152383208275,
+      "rewards/bleu_reward_func/mean": 0.08798034489154816,
+      "rewards/bleu_reward_func/std": 0.07060196995735168,
+      "step": 733
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 300.75,
+      "completions/mean_terminated_length": 261.629638671875,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "epoch": 0.5872,
+      "grad_norm": 11.675055503845215,
+      "kl": 0.0538330078125,
+      "learning_rate": 1e-06,
+      "loss": 0.1186,
+      "num_tokens": 9647729.0,
+      "reward": 0.20255795121192932,
+      "reward_std": 0.044919952750205994,
+      "rewards/bleu_reward_func/mean": 0.20255795121192932,
+      "rewards/bleu_reward_func/std": 0.23513151705265045,
+      "step": 734
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 381.0,
+      "completions/max_terminated_length": 381.0,
+      "completions/mean_length": 100.875,
+      "completions/mean_terminated_length": 100.875,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.588,
+      "grad_norm": 7.969452381134033,
+      "kl": 0.2791748046875,
+      "learning_rate": 1e-06,
+      "loss": 0.1783,
+      "num_tokens": 9655565.0,
+      "reward": 0.1732563078403473,
+      "reward_std": 0.06255275756120682,
+      "rewards/bleu_reward_func/mean": 0.1732563078403473,
+      "rewards/bleu_reward_func/std": 0.14761896431446075,
+      "step": 735
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 504.0,
+      "completions/mean_length": 255.78125,
+      "completions/mean_terminated_length": 196.6538543701172,
+      "completions/min_length": 33.0,
+      "completions/min_terminated_length": 33.0,
+      "epoch": 0.5888,
+      "grad_norm": 37.51567840576172,
+      "kl": 0.139739990234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0247,
+      "num_tokens": 9672070.0,
+      "reward": 0.3068300187587738,
+      "reward_std": 0.018469596281647682,
+      "rewards/bleu_reward_func/mean": 0.3068300187587738,
+      "rewards/bleu_reward_func/std": 0.29021966457366943,
+      "step": 736
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 477.0,
+      "completions/mean_length": 367.15625,
+      "completions/mean_terminated_length": 301.31817626953125,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 0.5896,
+      "grad_norm": 2.4598867893218994,
+      "kl": 0.0330657958984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0194,
+      "num_tokens": 9688219.0,
+      "reward": 0.05701170861721039,
+      "reward_std": 0.020281650125980377,
+      "rewards/bleu_reward_func/mean": 0.05701170861721039,
+      "rewards/bleu_reward_func/std": 0.055385053157806396,
+      "step": 737
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 486.0,
+      "completions/mean_length": 318.46875,
+      "completions/mean_terminated_length": 230.5,
+      "completions/min_length": 48.0,
+      "completions/min_terminated_length": 48.0,
+      "epoch": 0.5904,
+      "grad_norm": 4.623442649841309,
+      "kl": 0.045806884765625,
+      "learning_rate": 1e-06,
+      "loss": -0.2136,
+      "num_tokens": 9701082.0,
+      "reward": 0.05631488561630249,
+      "reward_std": 0.022235814481973648,
+      "rewards/bleu_reward_func/mean": 0.05631488561630249,
+      "rewards/bleu_reward_func/std": 0.0748782679438591,
+      "step": 738
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 429.0,
+      "completions/mean_length": 195.40625,
+      "completions/mean_terminated_length": 122.34616088867188,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.5912,
+      "grad_norm": 6.426390171051025,
+      "kl": 0.21075439453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0043,
+      "num_tokens": 9712175.0,
+      "reward": 0.250314861536026,
+      "reward_std": 0.043683700263500214,
+      "rewards/bleu_reward_func/mean": 0.250314861536026,
+      "rewards/bleu_reward_func/std": 0.27451202273368835,
+      "step": 739
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 504.0,
+      "completions/mean_length": 314.375,
+      "completions/mean_terminated_length": 195.8000030517578,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "epoch": 0.592,
+      "grad_norm": 4.511030197143555,
+      "kl": 0.0513916015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0376,
+      "num_tokens": 9727491.0,
+      "reward": 0.24225017428398132,
+      "reward_std": 0.0391615591943264,
+      "rewards/bleu_reward_func/mean": 0.24225017428398132,
+      "rewards/bleu_reward_func/std": 0.23075063526630402,
+      "step": 740
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 291.34375,
+      "completions/mean_terminated_length": 191.0454559326172,
+      "completions/min_length": 27.0,
+      "completions/min_terminated_length": 27.0,
+      "epoch": 0.5928,
+      "grad_norm": 5.867874622344971,
+      "kl": 0.070037841796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0656,
+      "num_tokens": 9740590.0,
+      "reward": 0.08597154170274734,
+      "reward_std": 0.053836189210414886,
+      "rewards/bleu_reward_func/mean": 0.08597154170274734,
+      "rewards/bleu_reward_func/std": 0.12926995754241943,
+      "step": 741
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 414.0,
+      "completions/mean_length": 205.34375,
+      "completions/mean_terminated_length": 148.55555725097656,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.5936,
+      "grad_norm": 6.713605880737305,
+      "kl": 0.12249755859375,
+      "learning_rate": 1e-06,
+      "loss": 0.1915,
+      "num_tokens": 9751033.0,
+      "reward": 0.19744500517845154,
+      "reward_std": 0.041014768183231354,
+      "rewards/bleu_reward_func/mean": 0.19744500517845154,
+      "rewards/bleu_reward_func/std": 0.1538456529378891,
+      "step": 742
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 249.46875,
+      "completions/mean_terminated_length": 200.8518524169922,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.5944,
+      "grad_norm": 5.988234043121338,
+      "kl": 0.22430419921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0382,
+      "num_tokens": 9764368.0,
+      "reward": 0.2869833707809448,
+      "reward_std": 0.07026369869709015,
+      "rewards/bleu_reward_func/mean": 0.2869833707809448,
+      "rewards/bleu_reward_func/std": 0.2287815362215042,
+      "step": 743
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 464.0,
+      "completions/mean_length": 265.3125,
+      "completions/mean_terminated_length": 196.239990234375,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "epoch": 0.5952,
+      "grad_norm": 8.931143760681152,
+      "kl": 0.234375,
+      "learning_rate": 1e-06,
+      "loss": -0.1984,
+      "num_tokens": 9774882.0,
+      "reward": 0.07221800833940506,
+      "reward_std": 0.031748898327350616,
+      "rewards/bleu_reward_func/mean": 0.07221800833940506,
+      "rewards/bleu_reward_func/std": 0.06019110977649689,
+      "step": 744
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 491.0,
+      "completions/mean_length": 410.3125,
+      "completions/mean_terminated_length": 279.5714416503906,
+      "completions/min_length": 75.0,
+      "completions/min_terminated_length": 75.0,
+      "epoch": 0.596,
+      "grad_norm": 2.886622190475464,
+      "kl": 0.031707763671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0903,
+      "num_tokens": 9790372.0,
+      "reward": 0.030976204201579094,
+      "reward_std": 0.015047797001898289,
+      "rewards/bleu_reward_func/mean": 0.030976204201579094,
+      "rewards/bleu_reward_func/std": 0.033486902713775635,
+      "step": 745
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 481.0,
+      "completions/mean_length": 241.84375,
+      "completions/mean_terminated_length": 166.1999969482422,
+      "completions/min_length": 43.0,
+      "completions/min_terminated_length": 43.0,
+      "epoch": 0.5968,
+      "grad_norm": 4.601830005645752,
+      "kl": 0.069915771484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0366,
+      "num_tokens": 9801439.0,
+      "reward": 0.10595028847455978,
+      "reward_std": 0.024186890572309494,
+      "rewards/bleu_reward_func/mean": 0.10595028847455978,
+      "rewards/bleu_reward_func/std": 0.10705985873937607,
+      "step": 746
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 490.0,
+      "completions/mean_length": 311.53125,
+      "completions/mean_terminated_length": 255.39999389648438,
+      "completions/min_length": 67.0,
+      "completions/min_terminated_length": 67.0,
+      "epoch": 0.5976,
+      "grad_norm": 2.84419322013855,
+      "kl": 0.05322265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0815,
+      "num_tokens": 9816096.0,
+      "reward": 0.02818489633500576,
+      "reward_std": 0.008401873521506786,
+      "rewards/bleu_reward_func/mean": 0.02818489633500576,
+      "rewards/bleu_reward_func/std": 0.02303573302924633,
+      "step": 747
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 343.0,
+      "completions/max_terminated_length": 343.0,
+      "completions/mean_length": 100.78125,
+      "completions/mean_terminated_length": 100.78125,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.5984,
+      "grad_norm": 8.280403137207031,
+      "kl": 0.14306640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0722,
+      "num_tokens": 9821761.0,
+      "reward": 0.07066097855567932,
+      "reward_std": 0.026687482371926308,
+      "rewards/bleu_reward_func/mean": 0.07066097855567932,
+      "rewards/bleu_reward_func/std": 0.04903886467218399,
+      "step": 748
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 396.0,
+      "completions/max_terminated_length": 396.0,
+      "completions/mean_length": 129.1875,
+      "completions/mean_terminated_length": 129.1875,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.5992,
+      "grad_norm": 6.650774955749512,
+      "kl": 0.1456298828125,
+      "learning_rate": 1e-06,
+      "loss": 0.1458,
+      "num_tokens": 9829943.0,
+      "reward": 0.19511400163173676,
+      "reward_std": 0.03503159433603287,
+      "rewards/bleu_reward_func/mean": 0.19511400163173676,
+      "rewards/bleu_reward_func/std": 0.21101784706115723,
+      "step": 749
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 469.0,
+      "completions/mean_length": 258.1875,
+      "completions/mean_terminated_length": 105.9000015258789,
+      "completions/min_length": 42.0,
+      "completions/min_terminated_length": 42.0,
+      "epoch": 0.6,
+      "grad_norm": 5.664450645446777,
+      "kl": 0.081695556640625,
+      "learning_rate": 1e-06,
+      "loss": 0.008,
+      "num_tokens": 9841813.0,
+      "reward": 0.20738312602043152,
+      "reward_std": 0.07332950830459595,
+      "rewards/bleu_reward_func/mean": 0.20738312602043152,
+      "rewards/bleu_reward_func/std": 0.2197185456752777,
+      "step": 750
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 421.0,
+      "completions/mean_length": 160.03125,
+      "completions/mean_terminated_length": 136.56668090820312,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.6008,
+      "grad_norm": 69.39850616455078,
+      "kl": 0.24407958984375,
+      "learning_rate": 1e-06,
+      "loss": 0.016,
+      "num_tokens": 9852910.0,
+      "reward": 0.2434358447790146,
+      "reward_std": 0.10366295278072357,
+      "rewards/bleu_reward_func/mean": 0.2434358447790146,
+      "rewards/bleu_reward_func/std": 0.18655826151371002,
+      "step": 751
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 316.875,
+      "completions/mean_terminated_length": 251.83334350585938,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.6016,
+      "grad_norm": 4.908503532409668,
+      "kl": 0.0460205078125,
+      "learning_rate": 1e-06,
+      "loss": 0.1812,
+      "num_tokens": 9865266.0,
+      "reward": 0.05490465834736824,
+      "reward_std": 0.02047915570437908,
+      "rewards/bleu_reward_func/mean": 0.05490465834736824,
+      "rewards/bleu_reward_func/std": 0.04037528112530708,
+      "step": 752
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 491.0,
+      "completions/mean_length": 269.09375,
+      "completions/mean_terminated_length": 252.90000915527344,
+      "completions/min_length": 76.0,
+      "completions/min_terminated_length": 76.0,
+      "epoch": 0.6024,
+      "grad_norm": 2.893157958984375,
+      "kl": 0.048187255859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0419,
+      "num_tokens": 9876613.0,
+      "reward": 0.05797014757990837,
+      "reward_std": 0.029720589518547058,
+      "rewards/bleu_reward_func/mean": 0.05797014757990837,
+      "rewards/bleu_reward_func/std": 0.07483170926570892,
+      "step": 753
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 434.21875,
+      "completions/mean_terminated_length": 285.727294921875,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.6032,
+      "grad_norm": 3.0334763526916504,
+      "kl": 0.042449951171875,
+      "learning_rate": 1e-06,
+      "loss": -0.167,
+      "num_tokens": 9894044.0,
+      "reward": 0.0762338861823082,
+      "reward_std": 0.02360478974878788,
+      "rewards/bleu_reward_func/mean": 0.0762338861823082,
+      "rewards/bleu_reward_func/std": 0.0673457533121109,
+      "step": 754
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 423.21875,
+      "completions/mean_terminated_length": 309.0714416503906,
+      "completions/min_length": 87.0,
+      "completions/min_terminated_length": 87.0,
+      "epoch": 0.604,
+      "grad_norm": 2.2641522884368896,
+      "kl": 0.03399658203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0917,
+      "num_tokens": 9911107.0,
+      "reward": 0.03881996497511864,
+      "reward_std": 0.012424922548234463,
+      "rewards/bleu_reward_func/mean": 0.03881996497511864,
+      "rewards/bleu_reward_func/std": 0.02825937233865261,
+      "step": 755
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 431.0,
+      "completions/mean_length": 196.53125,
+      "completions/mean_terminated_length": 108.19999694824219,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.6048,
+      "grad_norm": 9.265530586242676,
+      "kl": 0.2891845703125,
+      "learning_rate": 1e-06,
+      "loss": 0.1189,
+      "num_tokens": 9924868.0,
+      "reward": 0.2168477475643158,
+      "reward_std": 0.07323689758777618,
+      "rewards/bleu_reward_func/mean": 0.2168477475643158,
+      "rewards/bleu_reward_func/std": 0.17768503725528717,
+      "step": 756
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 447.0,
+      "completions/mean_length": 163.53125,
+      "completions/mean_terminated_length": 152.29031372070312,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.6056,
+      "grad_norm": 44.89513397216797,
+      "kl": 0.31927490234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0321,
+      "num_tokens": 9935765.0,
+      "reward": 0.08691335469484329,
+      "reward_std": 0.03311008960008621,
+      "rewards/bleu_reward_func/mean": 0.08691335469484329,
+      "rewards/bleu_reward_func/std": 0.08314234763383865,
+      "step": 757
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 425.0,
+      "completions/max_terminated_length": 425.0,
+      "completions/mean_length": 97.375,
+      "completions/mean_terminated_length": 97.375,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.6064,
+      "grad_norm": 8.500920295715332,
+      "kl": 0.15838623046875,
+      "learning_rate": 1e-06,
+      "loss": 0.1392,
+      "num_tokens": 9944529.0,
+      "reward": 0.25747808814048767,
+      "reward_std": 0.048998236656188965,
+      "rewards/bleu_reward_func/mean": 0.25747808814048767,
+      "rewards/bleu_reward_func/std": 0.22997993230819702,
+      "step": 758
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 498.0,
+      "completions/mean_length": 357.78125,
+      "completions/mean_terminated_length": 265.25,
+      "completions/min_length": 77.0,
+      "completions/min_terminated_length": 77.0,
+      "epoch": 0.6072,
+      "grad_norm": 3.4296295642852783,
+      "kl": 0.05389404296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0162,
+      "num_tokens": 9961058.0,
+      "reward": 0.07074315845966339,
+      "reward_std": 0.0662379041314125,
+      "rewards/bleu_reward_func/mean": 0.07074315845966339,
+      "rewards/bleu_reward_func/std": 0.1079547107219696,
+      "step": 759
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 207.375,
+      "completions/mean_terminated_length": 175.86207580566406,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.608,
+      "grad_norm": 6.224173545837402,
+      "kl": 0.1185302734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0424,
+      "num_tokens": 9972646.0,
+      "reward": 0.09982403367757797,
+      "reward_std": 0.06204840913414955,
+      "rewards/bleu_reward_func/mean": 0.09982403367757797,
+      "rewards/bleu_reward_func/std": 0.10973682999610901,
+      "step": 760
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 452.0,
+      "completions/mean_length": 152.8125,
+      "completions/mean_terminated_length": 128.86666870117188,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "epoch": 0.6088,
+      "grad_norm": 12.398276329040527,
+      "kl": 0.240966796875,
+      "learning_rate": 1e-06,
+      "loss": 0.198,
+      "num_tokens": 9985136.0,
+      "reward": 0.059197958558797836,
+      "reward_std": 0.02872345596551895,
+      "rewards/bleu_reward_func/mean": 0.059197958558797836,
+      "rewards/bleu_reward_func/std": 0.04119112715125084,
+      "step": 761
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 493.0,
+      "completions/mean_length": 377.6875,
+      "completions/mean_terminated_length": 273.22222900390625,
+      "completions/min_length": 56.0,
+      "completions/min_terminated_length": 56.0,
+      "epoch": 0.6096,
+      "grad_norm": 3.9670443534851074,
+      "kl": 0.046600341796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0648,
+      "num_tokens": 10003078.0,
+      "reward": 0.0413321927189827,
+      "reward_std": 0.015110660344362259,
+      "rewards/bleu_reward_func/mean": 0.0413321927189827,
+      "rewards/bleu_reward_func/std": 0.032528944313526154,
+      "step": 762
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 498.0,
+      "completions/mean_length": 472.0,
+      "completions/mean_terminated_length": 413.5384826660156,
+      "completions/min_length": 283.0,
+      "completions/min_terminated_length": 283.0,
+      "epoch": 0.6104,
+      "grad_norm": 2.092273712158203,
+      "kl": 0.0268096923828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0147,
+      "num_tokens": 10020230.0,
+      "reward": 0.08549900352954865,
+      "reward_std": 0.03175706788897514,
+      "rewards/bleu_reward_func/mean": 0.08549900352954865,
+      "rewards/bleu_reward_func/std": 0.042792484164237976,
+      "step": 763
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 446.0,
+      "completions/mean_length": 150.6875,
+      "completions/mean_terminated_length": 83.77777862548828,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.6112,
+      "grad_norm": 9.28490924835205,
+      "kl": 0.265869140625,
+      "learning_rate": 1e-06,
+      "loss": 0.2918,
+      "num_tokens": 10031172.0,
+      "reward": 0.21292155981063843,
+      "reward_std": 0.06925603747367859,
+      "rewards/bleu_reward_func/mean": 0.21292155981063843,
+      "rewards/bleu_reward_func/std": 0.18994402885437012,
+      "step": 764
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 328.3125,
+      "completions/mean_terminated_length": 218.10000610351562,
+      "completions/min_length": 56.0,
+      "completions/min_terminated_length": 56.0,
+      "epoch": 0.612,
+      "grad_norm": 21.738191604614258,
+      "kl": 0.1495361328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0895,
+      "num_tokens": 10046358.0,
+      "reward": 0.09622834622859955,
+      "reward_std": 0.04176661744713783,
+      "rewards/bleu_reward_func/mean": 0.09622834622859955,
+      "rewards/bleu_reward_func/std": 0.09296616911888123,
+      "step": 765
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 502.0,
+      "completions/mean_length": 288.21875,
+      "completions/mean_terminated_length": 246.7777862548828,
+      "completions/min_length": 8.0,
+      "completions/min_terminated_length": 8.0,
+      "epoch": 0.6128,
+      "grad_norm": 5.178511142730713,
+      "kl": 0.1473388671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0688,
+      "num_tokens": 10059877.0,
+      "reward": 0.088385209441185,
+      "reward_std": 0.016962474212050438,
+      "rewards/bleu_reward_func/mean": 0.088385209441185,
+      "rewards/bleu_reward_func/std": 0.08985943347215652,
+      "step": 766
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 504.0,
+      "completions/mean_length": 311.46875,
+      "completions/mean_terminated_length": 244.625,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.6136,
+      "grad_norm": 11.883882522583008,
+      "kl": 0.2354583740234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0238,
+      "num_tokens": 10073076.0,
+      "reward": 0.18446165323257446,
+      "reward_std": 0.10309243947267532,
+      "rewards/bleu_reward_func/mean": 0.18446165323257446,
+      "rewards/bleu_reward_func/std": 0.30338525772094727,
+      "step": 767
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 468.0,
+      "completions/mean_length": 275.59375,
+      "completions/mean_terminated_length": 168.13636779785156,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.6144,
+      "grad_norm": 5.438653469085693,
+      "kl": 0.081451416015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0639,
+      "num_tokens": 10085871.0,
+      "reward": 0.06602545082569122,
+      "reward_std": 0.030349329113960266,
+      "rewards/bleu_reward_func/mean": 0.06602545082569122,
+      "rewards/bleu_reward_func/std": 0.04767395555973053,
+      "step": 768
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 467.0,
+      "completions/max_terminated_length": 467.0,
+      "completions/mean_length": 136.09375,
+      "completions/mean_terminated_length": 136.09375,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.6152,
+      "grad_norm": 5.674292087554932,
+      "kl": 0.26715087890625,
+      "learning_rate": 1e-06,
+      "loss": 0.2437,
+      "num_tokens": 10094858.0,
+      "reward": 0.07597756385803223,
+      "reward_std": 0.027629435062408447,
+      "rewards/bleu_reward_func/mean": 0.07597756385803223,
+      "rewards/bleu_reward_func/std": 0.054181892424821854,
+      "step": 769
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 401.0,
+      "completions/mean_length": 107.9375,
+      "completions/mean_terminated_length": 94.9032211303711,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.616,
+      "grad_norm": 9.938776969909668,
+      "kl": 0.16571044921875,
+      "learning_rate": 1e-06,
+      "loss": 0.082,
+      "num_tokens": 10106672.0,
+      "reward": 0.3735446035861969,
+      "reward_std": 0.03898521885275841,
+      "rewards/bleu_reward_func/mean": 0.3735446035861969,
+      "rewards/bleu_reward_func/std": 0.30306297540664673,
+      "step": 770
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 338.6875,
+      "completions/mean_terminated_length": 280.91668701171875,
+      "completions/min_length": 43.0,
+      "completions/min_terminated_length": 43.0,
+      "epoch": 0.6168,
+      "grad_norm": 3.914602041244507,
+      "kl": 0.05010986328125,
+      "learning_rate": 1e-06,
+      "loss": -0.1112,
+      "num_tokens": 10120510.0,
+      "reward": 0.09634806215763092,
+      "reward_std": 0.04157658666372299,
+      "rewards/bleu_reward_func/mean": 0.09634806215763092,
+      "rewards/bleu_reward_func/std": 0.08702099323272705,
+      "step": 771
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 270.875,
+      "completions/mean_terminated_length": 215.23077392578125,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.6176,
+      "grad_norm": 5.461522102355957,
+      "kl": 0.3319091796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0344,
+      "num_tokens": 10132138.0,
+      "reward": 0.18050828576087952,
+      "reward_std": 0.033299222588539124,
+      "rewards/bleu_reward_func/mean": 0.18050828576087952,
+      "rewards/bleu_reward_func/std": 0.21068614721298218,
+      "step": 772
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 310.0,
+      "completions/max_terminated_length": 310.0,
+      "completions/mean_length": 131.375,
+      "completions/mean_terminated_length": 131.375,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.6184,
+      "grad_norm": 8.162845611572266,
+      "kl": 0.2459716796875,
+      "learning_rate": 1e-06,
+      "loss": 0.1064,
+      "num_tokens": 10142558.0,
+      "reward": 0.1030242070555687,
+      "reward_std": 0.05847536772489548,
+      "rewards/bleu_reward_func/mean": 0.1030242070555687,
+      "rewards/bleu_reward_func/std": 0.14961844682693481,
+      "step": 773
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 492.0,
+      "completions/max_terminated_length": 492.0,
+      "completions/mean_length": 169.9375,
+      "completions/mean_terminated_length": 169.9375,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.6192,
+      "grad_norm": 43.1834602355957,
+      "kl": 0.22735595703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0649,
+      "num_tokens": 10151012.0,
+      "reward": 0.04478512331843376,
+      "reward_std": 0.012456279247999191,
+      "rewards/bleu_reward_func/mean": 0.04478512331843376,
+      "rewards/bleu_reward_func/std": 0.04301442950963974,
+      "step": 774
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 495.0,
+      "completions/mean_length": 249.3125,
+      "completions/mean_terminated_length": 188.69232177734375,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.62,
+      "grad_norm": 8.241740226745605,
+      "kl": 0.188079833984375,
+      "learning_rate": 1e-06,
+      "loss": 0.1066,
+      "num_tokens": 10163446.0,
+      "reward": 0.15149368345737457,
+      "reward_std": 0.028546612709760666,
+      "rewards/bleu_reward_func/mean": 0.15149368345737457,
+      "rewards/bleu_reward_func/std": 0.14032159745693207,
+      "step": 775
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 495.0,
+      "completions/mean_length": 235.125,
+      "completions/mean_terminated_length": 126.78260803222656,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.6208,
+      "grad_norm": 8.033968925476074,
+      "kl": 0.22894287109375,
+      "learning_rate": 1e-06,
+      "loss": 0.098,
+      "num_tokens": 10174522.0,
+      "reward": 0.22483249008655548,
+      "reward_std": 0.04489654302597046,
+      "rewards/bleu_reward_func/mean": 0.22483249008655548,
+      "rewards/bleu_reward_func/std": 0.24184906482696533,
+      "step": 776
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 478.0,
+      "completions/mean_length": 273.0625,
+      "completions/mean_terminated_length": 179.56521606445312,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.6216,
+      "grad_norm": 6.315563201904297,
+      "kl": 0.09613037109375,
+      "learning_rate": 1e-06,
+      "loss": 0.1547,
+      "num_tokens": 10188964.0,
+      "reward": 0.06417440623044968,
+      "reward_std": 0.01652311347424984,
+      "rewards/bleu_reward_func/mean": 0.06417440623044968,
+      "rewards/bleu_reward_func/std": 0.05222758278250694,
+      "step": 777
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 350.40625,
+      "completions/mean_terminated_length": 188.8125,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.6224,
+      "grad_norm": 4.945059776306152,
+      "kl": 0.08636474609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0638,
+      "num_tokens": 10203657.0,
+      "reward": 0.07400526106357574,
+      "reward_std": 0.03621644526720047,
+      "rewards/bleu_reward_func/mean": 0.07400526106357574,
+      "rewards/bleu_reward_func/std": 0.05740804970264435,
+      "step": 778
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 319.1875,
+      "completions/mean_terminated_length": 243.7391357421875,
+      "completions/min_length": 36.0,
+      "completions/min_terminated_length": 36.0,
+      "epoch": 0.6232,
+      "grad_norm": 4.364378929138184,
+      "kl": 0.07342529296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0132,
+      "num_tokens": 10219919.0,
+      "reward": 0.24302205443382263,
+      "reward_std": 0.05576051399111748,
+      "rewards/bleu_reward_func/mean": 0.24302205443382263,
+      "rewards/bleu_reward_func/std": 0.21575042605400085,
+      "step": 779
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 491.0,
+      "completions/mean_length": 298.75,
+      "completions/mean_terminated_length": 201.8181915283203,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "epoch": 0.624,
+      "grad_norm": 3.959169864654541,
+      "kl": 0.06689453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0405,
+      "num_tokens": 10234119.0,
+      "reward": 0.09677430242300034,
+      "reward_std": 0.02671782858669758,
+      "rewards/bleu_reward_func/mean": 0.09677430242300034,
+      "rewards/bleu_reward_func/std": 0.06890682131052017,
+      "step": 780
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 499.0,
+      "completions/mean_length": 211.40625,
+      "completions/mean_terminated_length": 168.46429443359375,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.6248,
+      "grad_norm": 10.254522323608398,
+      "kl": 0.30865478515625,
+      "learning_rate": 1e-06,
+      "loss": -0.044,
+      "num_tokens": 10247388.0,
+      "reward": 0.2194344401359558,
+      "reward_std": 0.04920031875371933,
+      "rewards/bleu_reward_func/mean": 0.2194344401359558,
+      "rewards/bleu_reward_func/std": 0.15552020072937012,
+      "step": 781
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 510.0,
+      "completions/mean_length": 201.46875,
+      "completions/mean_terminated_length": 180.7666778564453,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "epoch": 0.6256,
+      "grad_norm": 7.234709739685059,
+      "kl": 0.1651611328125,
+      "learning_rate": 1e-06,
+      "loss": 0.2297,
+      "num_tokens": 10256947.0,
+      "reward": 0.11007180064916611,
+      "reward_std": 0.07193183898925781,
+      "rewards/bleu_reward_func/mean": 0.11007180064916611,
+      "rewards/bleu_reward_func/std": 0.13098347187042236,
+      "step": 782
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 482.0,
+      "completions/mean_length": 288.15625,
+      "completions/mean_terminated_length": 246.70370483398438,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.6264,
+      "grad_norm": 3.6756701469421387,
+      "kl": 0.050689697265625,
+      "learning_rate": 1e-06,
+      "loss": -0.2297,
+      "num_tokens": 10271504.0,
+      "reward": 0.07084184139966965,
+      "reward_std": 0.03263479843735695,
+      "rewards/bleu_reward_func/mean": 0.07084184139966965,
+      "rewards/bleu_reward_func/std": 0.07953313738107681,
+      "step": 783
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 370.0,
+      "completions/mean_length": 127.375,
+      "completions/mean_terminated_length": 114.96773529052734,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.6272,
+      "grad_norm": 7.3130879402160645,
+      "kl": 0.143096923828125,
+      "learning_rate": 1e-06,
+      "loss": -0.1119,
+      "num_tokens": 10281236.0,
+      "reward": 0.17116650938987732,
+      "reward_std": 0.040961284190416336,
+      "rewards/bleu_reward_func/mean": 0.17116650938987732,
+      "rewards/bleu_reward_func/std": 0.16110415756702423,
+      "step": 784
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 458.0,
+      "completions/mean_length": 198.125,
+      "completions/mean_terminated_length": 140.0,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.628,
+      "grad_norm": 5.703468322753906,
+      "kl": 0.168975830078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0673,
+      "num_tokens": 10293288.0,
+      "reward": 0.14155232906341553,
+      "reward_std": 0.059418316930532455,
+      "rewards/bleu_reward_func/mean": 0.14155232906341553,
+      "rewards/bleu_reward_func/std": 0.142944797873497,
+      "step": 785
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 475.0,
+      "completions/mean_length": 312.5,
+      "completions/mean_terminated_length": 266.4615478515625,
+      "completions/min_length": 83.0,
+      "completions/min_terminated_length": 83.0,
+      "epoch": 0.6288,
+      "grad_norm": 2.4137492179870605,
+      "kl": 0.023956298828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0405,
+      "num_tokens": 10309744.0,
+      "reward": 0.16134724020957947,
+      "reward_std": 0.01978662982583046,
+      "rewards/bleu_reward_func/mean": 0.16134724020957947,
+      "rewards/bleu_reward_func/std": 0.16176313161849976,
+      "step": 786
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 475.0,
+      "completions/mean_length": 281.34375,
+      "completions/mean_terminated_length": 228.11538696289062,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "epoch": 0.6296,
+      "grad_norm": 5.432137489318848,
+      "kl": 0.126953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0834,
+      "num_tokens": 10322867.0,
+      "reward": 0.13262051343917847,
+      "reward_std": 0.054468683898448944,
+      "rewards/bleu_reward_func/mean": 0.13262051343917847,
+      "rewards/bleu_reward_func/std": 0.1454581618309021,
+      "step": 787
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 434.0,
+      "completions/mean_length": 157.8125,
+      "completions/mean_terminated_length": 121.17241668701172,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.6304,
+      "grad_norm": 8.820817947387695,
+      "kl": 0.384033203125,
+      "learning_rate": 1e-06,
+      "loss": 0.1057,
+      "num_tokens": 10330085.0,
+      "reward": 0.14398705959320068,
+      "reward_std": 0.05267474800348282,
+      "rewards/bleu_reward_func/mean": 0.14398705959320068,
+      "rewards/bleu_reward_func/std": 0.12204661965370178,
+      "step": 788
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 492.0,
+      "completions/mean_length": 313.875,
+      "completions/mean_terminated_length": 223.8181915283203,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.6312,
+      "grad_norm": 6.252136707305908,
+      "kl": 0.202178955078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0958,
+      "num_tokens": 10344937.0,
+      "reward": 0.08566081523895264,
+      "reward_std": 0.0418044775724411,
+      "rewards/bleu_reward_func/mean": 0.08566081523895264,
+      "rewards/bleu_reward_func/std": 0.1277945637702942,
+      "step": 789
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 504.0,
+      "completions/mean_length": 476.40625,
+      "completions/mean_terminated_length": 408.4545593261719,
+      "completions/min_length": 270.0,
+      "completions/min_terminated_length": 270.0,
+      "epoch": 0.632,
+      "grad_norm": 2.1677629947662354,
+      "kl": 0.03436279296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0004,
+      "num_tokens": 10362222.0,
+      "reward": 0.04695405811071396,
+      "reward_std": 0.013839447870850563,
+      "rewards/bleu_reward_func/mean": 0.04695405811071396,
+      "rewards/bleu_reward_func/std": 0.03280064836144447,
+      "step": 790
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 404.46875,
+      "completions/mean_terminated_length": 296.9375,
+      "completions/min_length": 122.0,
+      "completions/min_terminated_length": 122.0,
+      "epoch": 0.6328,
+      "grad_norm": 2.3538496494293213,
+      "kl": 0.0289459228515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0297,
+      "num_tokens": 10382845.0,
+      "reward": 0.08459493517875671,
+      "reward_std": 0.029446884989738464,
+      "rewards/bleu_reward_func/mean": 0.08459493517875671,
+      "rewards/bleu_reward_func/std": 0.051741067320108414,
+      "step": 791
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 418.0,
+      "completions/mean_length": 133.0625,
+      "completions/mean_terminated_length": 107.80000305175781,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.6336,
+      "grad_norm": 6.885672092437744,
+      "kl": 0.20733642578125,
+      "learning_rate": 1e-06,
+      "loss": 0.1098,
+      "num_tokens": 10391199.0,
+      "reward": 0.10581733286380768,
+      "reward_std": 0.034825149923563004,
+      "rewards/bleu_reward_func/mean": 0.10581733286380768,
+      "rewards/bleu_reward_func/std": 0.10278832167387009,
+      "step": 792
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 95.0,
+      "completions/mean_length": 154.4375,
+      "completions/mean_terminated_length": 35.25,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.6344,
+      "grad_norm": 8.279248237609863,
+      "kl": 0.2750244140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0444,
+      "num_tokens": 10399789.0,
+      "reward": 0.1634266972541809,
+      "reward_std": 0.029335156083106995,
+      "rewards/bleu_reward_func/mean": 0.1634266972541809,
+      "rewards/bleu_reward_func/std": 0.1743723601102829,
+      "step": 793
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 477.0,
+      "completions/mean_length": 244.96875,
+      "completions/mean_terminated_length": 123.59091186523438,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.6352,
+      "grad_norm": 5.577760219573975,
+      "kl": 0.230865478515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0138,
+      "num_tokens": 10413116.0,
+      "reward": 0.18318259716033936,
+      "reward_std": 0.02782328985631466,
+      "rewards/bleu_reward_func/mean": 0.18318259716033936,
+      "rewards/bleu_reward_func/std": 0.14704957604408264,
+      "step": 794
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 491.0,
+      "completions/mean_length": 354.78125,
+      "completions/mean_terminated_length": 344.3000183105469,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 0.636,
+      "grad_norm": 2.591658115386963,
+      "kl": 0.0296630859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0201,
+      "num_tokens": 10426797.0,
+      "reward": 0.06094507500529289,
+      "reward_std": 0.02977069467306137,
+      "rewards/bleu_reward_func/mean": 0.06094507500529289,
+      "rewards/bleu_reward_func/std": 0.03347548097372055,
+      "step": 795
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 209.25,
+      "completions/mean_terminated_length": 189.06668090820312,
+      "completions/min_length": 27.0,
+      "completions/min_terminated_length": 27.0,
+      "epoch": 0.6368,
+      "grad_norm": 7.372705936431885,
+      "kl": 0.226715087890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0166,
+      "num_tokens": 10435821.0,
+      "reward": 0.17854920029640198,
+      "reward_std": 0.039038486778736115,
+      "rewards/bleu_reward_func/mean": 0.17854920029640198,
+      "rewards/bleu_reward_func/std": 0.11250942945480347,
+      "step": 796
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 510.0,
+      "completions/max_terminated_length": 510.0,
+      "completions/mean_length": 139.90625,
+      "completions/mean_terminated_length": 139.90625,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.6376,
+      "grad_norm": 7.399951457977295,
+      "kl": 0.2593994140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0215,
+      "num_tokens": 10444706.0,
+      "reward": 0.20415213704109192,
+      "reward_std": 0.05372469127178192,
+      "rewards/bleu_reward_func/mean": 0.20415213704109192,
+      "rewards/bleu_reward_func/std": 0.15420135855674744,
+      "step": 797
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 253.3125,
+      "completions/mean_terminated_length": 193.61538696289062,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.6384,
+      "grad_norm": 5.452202320098877,
+      "kl": 0.207672119140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0658,
+      "num_tokens": 10457300.0,
+      "reward": 0.18789556622505188,
+      "reward_std": 0.06054109334945679,
+      "rewards/bleu_reward_func/mean": 0.18789556622505188,
+      "rewards/bleu_reward_func/std": 0.18226853013038635,
+      "step": 798
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 436.0,
+      "completions/mean_length": 164.53125,
+      "completions/mean_terminated_length": 100.18518829345703,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.6392,
+      "grad_norm": 9.581258773803711,
+      "kl": 0.312286376953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0913,
+      "num_tokens": 10465053.0,
+      "reward": 0.14276297390460968,
+      "reward_std": 0.028537599369883537,
+      "rewards/bleu_reward_func/mean": 0.14276297390460968,
+      "rewards/bleu_reward_func/std": 0.10928227007389069,
+      "step": 799
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 313.0,
+      "completions/max_terminated_length": 313.0,
+      "completions/mean_length": 56.59375,
+      "completions/mean_terminated_length": 56.59375,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.64,
+      "grad_norm": 7.995264053344727,
+      "kl": 0.2779541015625,
+      "learning_rate": 1e-06,
+      "loss": 0.3759,
+      "num_tokens": 10477608.0,
+      "reward": 0.34325188398361206,
+      "reward_std": 0.07241753488779068,
+      "rewards/bleu_reward_func/mean": 0.34325188398361206,
+      "rewards/bleu_reward_func/std": 0.20597775280475616,
+      "step": 800
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 245.0,
+      "completions/mean_length": 191.5625,
+      "completions/mean_terminated_length": 117.61538696289062,
+      "completions/min_length": 8.0,
+      "completions/min_terminated_length": 8.0,
+      "epoch": 0.6408,
+      "grad_norm": 4.229004859924316,
+      "kl": 0.0904541015625,
+      "learning_rate": 1e-06,
+      "loss": 0.1193,
+      "num_tokens": 10485490.0,
+      "reward": 0.11370354145765305,
+      "reward_std": 0.061382561922073364,
+      "rewards/bleu_reward_func/mean": 0.11370354145765305,
+      "rewards/bleu_reward_func/std": 0.15154796838760376,
+      "step": 801
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 439.0,
+      "completions/mean_length": 293.0,
+      "completions/mean_terminated_length": 99.76470947265625,
+      "completions/min_length": 27.0,
+      "completions/min_terminated_length": 27.0,
+      "epoch": 0.6416,
+      "grad_norm": 4.936343193054199,
+      "kl": 0.227630615234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0723,
+      "num_tokens": 10497930.0,
+      "reward": 0.15342603623867035,
+      "reward_std": 0.018828846514225006,
+      "rewards/bleu_reward_func/mean": 0.15342603623867035,
+      "rewards/bleu_reward_func/std": 0.22573818266391754,
+      "step": 802
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 357.0,
+      "completions/mean_length": 206.9375,
+      "completions/mean_terminated_length": 163.35714721679688,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.6424,
+      "grad_norm": 4.5400471687316895,
+      "kl": 0.1302490234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0265,
+      "num_tokens": 10509096.0,
+      "reward": 0.042201556265354156,
+      "reward_std": 0.01641710475087166,
+      "rewards/bleu_reward_func/mean": 0.042201556265354156,
+      "rewards/bleu_reward_func/std": 0.026252396404743195,
+      "step": 803
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 487.0,
+      "completions/mean_length": 339.8125,
+      "completions/mean_terminated_length": 222.0,
+      "completions/min_length": 38.0,
+      "completions/min_terminated_length": 38.0,
+      "epoch": 0.6432,
+      "grad_norm": 4.132330417633057,
+      "kl": 0.042144775390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0638,
+      "num_tokens": 10522810.0,
+      "reward": 0.05155924707651138,
+      "reward_std": 0.017338326200842857,
+      "rewards/bleu_reward_func/mean": 0.05155924707651138,
+      "rewards/bleu_reward_func/std": 0.03961692750453949,
+      "step": 804
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 375.0,
+      "completions/mean_length": 251.0,
+      "completions/mean_terminated_length": 132.3636474609375,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.644,
+      "grad_norm": 6.286128044128418,
+      "kl": 0.150299072265625,
+      "learning_rate": 1e-06,
+      "loss": 0.1961,
+      "num_tokens": 10533090.0,
+      "reward": 0.03828435763716698,
+      "reward_std": 0.01768323965370655,
+      "rewards/bleu_reward_func/mean": 0.03828435763716698,
+      "rewards/bleu_reward_func/std": 0.035699598491191864,
+      "step": 805
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 472.0,
+      "completions/mean_length": 286.03125,
+      "completions/mean_terminated_length": 253.75001525878906,
+      "completions/min_length": 63.0,
+      "completions/min_terminated_length": 63.0,
+      "epoch": 0.6448,
+      "grad_norm": 3.333425283432007,
+      "kl": 0.0326385498046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0689,
+      "num_tokens": 10544131.0,
+      "reward": 0.11853313446044922,
+      "reward_std": 0.06690388172864914,
+      "rewards/bleu_reward_func/mean": 0.11853313446044922,
+      "rewards/bleu_reward_func/std": 0.14521227777004242,
+      "step": 806
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 428.0,
+      "completions/mean_length": 176.6875,
+      "completions/mean_terminated_length": 165.87095642089844,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.6456,
+      "grad_norm": 6.8076090812683105,
+      "kl": 0.24957275390625,
+      "learning_rate": 1e-06,
+      "loss": -0.1496,
+      "num_tokens": 10554641.0,
+      "reward": 0.12172487378120422,
+      "reward_std": 0.05724428966641426,
+      "rewards/bleu_reward_func/mean": 0.12172487378120422,
+      "rewards/bleu_reward_func/std": 0.11496427655220032,
+      "step": 807
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 146.875,
+      "completions/mean_terminated_length": 62.615386962890625,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.6464,
+      "grad_norm": 9.55725383758545,
+      "kl": 0.30224609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0709,
+      "num_tokens": 10563829.0,
+      "reward": 0.20068883895874023,
+      "reward_std": 0.06663694977760315,
+      "rewards/bleu_reward_func/mean": 0.20068883895874023,
+      "rewards/bleu_reward_func/std": 0.13896267116069794,
+      "step": 808
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 176.0,
+      "completions/mean_length": 141.34375,
+      "completions/mean_terminated_length": 37.55999755859375,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.6472,
+      "grad_norm": 26.54884147644043,
+      "kl": 0.41717529296875,
+      "learning_rate": 1e-06,
+      "loss": -0.1795,
+      "num_tokens": 10574624.0,
+      "reward": 0.22808396816253662,
+      "reward_std": 0.06877206265926361,
+      "rewards/bleu_reward_func/mean": 0.22808396816253662,
+      "rewards/bleu_reward_func/std": 0.21049334108829498,
+      "step": 809
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 308.0,
+      "completions/max_terminated_length": 308.0,
+      "completions/mean_length": 111.21875,
+      "completions/mean_terminated_length": 111.21875,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.648,
+      "grad_norm": 13.591133117675781,
+      "kl": 0.272674560546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0292,
+      "num_tokens": 10583671.0,
+      "reward": 0.2966269850730896,
+      "reward_std": 0.015265233814716339,
+      "rewards/bleu_reward_func/mean": 0.2966269850730896,
+      "rewards/bleu_reward_func/std": 0.24745707213878632,
+      "step": 810
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 473.0,
+      "completions/mean_length": 94.34375,
+      "completions/mean_terminated_length": 80.87096405029297,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.6488,
+      "grad_norm": 10.449114799499512,
+      "kl": 0.2784423828125,
+      "learning_rate": 1e-06,
+      "loss": -0.2645,
+      "num_tokens": 10592586.0,
+      "reward": 0.23048871755599976,
+      "reward_std": 0.05683053284883499,
+      "rewards/bleu_reward_func/mean": 0.23048871755599976,
+      "rewards/bleu_reward_func/std": 0.304109662771225,
+      "step": 811
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 262.0,
+      "completions/mean_length": 184.65625,
+      "completions/mean_terminated_length": 75.54167175292969,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.6496,
+      "grad_norm": 6.2868170738220215,
+      "kl": 0.202392578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0856,
+      "num_tokens": 10604447.0,
+      "reward": 0.06996900588274002,
+      "reward_std": 0.01753135770559311,
+      "rewards/bleu_reward_func/mean": 0.06996900588274002,
+      "rewards/bleu_reward_func/std": 0.07089151442050934,
+      "step": 812
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 171.0,
+      "completions/max_terminated_length": 171.0,
+      "completions/mean_length": 46.96875,
+      "completions/mean_terminated_length": 46.96875,
+      "completions/min_length": 5.0,
+      "completions/min_terminated_length": 5.0,
+      "epoch": 0.6504,
+      "grad_norm": 15.99052619934082,
+      "kl": 0.55419921875,
+      "learning_rate": 1e-06,
+      "loss": 0.1537,
+      "num_tokens": 10612710.0,
+      "reward": 0.08658448606729507,
+      "reward_std": 0.03601383790373802,
+      "rewards/bleu_reward_func/mean": 0.08658448606729507,
+      "rewards/bleu_reward_func/std": 0.05530841648578644,
+      "step": 813
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 484.0,
+      "completions/mean_length": 355.0,
+      "completions/mean_terminated_length": 283.6363830566406,
+      "completions/min_length": 32.0,
+      "completions/min_terminated_length": 32.0,
+      "epoch": 0.6512,
+      "grad_norm": 3.1796348094940186,
+      "kl": 0.059417724609375,
+      "learning_rate": 1e-06,
+      "loss": 0.1311,
+      "num_tokens": 10625326.0,
+      "reward": 0.11449694633483887,
+      "reward_std": 0.027395280078053474,
+      "rewards/bleu_reward_func/mean": 0.11449694633483887,
+      "rewards/bleu_reward_func/std": 0.05288613215088844,
+      "step": 814
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 499.0,
+      "completions/mean_length": 128.96875,
+      "completions/mean_terminated_length": 103.43334197998047,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.652,
+      "grad_norm": 6.432667255401611,
+      "kl": 0.1822509765625,
+      "learning_rate": 1e-06,
+      "loss": 0.1721,
+      "num_tokens": 10635421.0,
+      "reward": 0.18185263872146606,
+      "reward_std": 0.0783199891448021,
+      "rewards/bleu_reward_func/mean": 0.18185263872146606,
+      "rewards/bleu_reward_func/std": 0.18959355354309082,
+      "step": 815
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 493.0,
+      "completions/mean_length": 242.9375,
+      "completions/mean_terminated_length": 204.50001525878906,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.6528,
+      "grad_norm": 4.343508720397949,
+      "kl": 0.06640625,
+      "learning_rate": 1e-06,
+      "loss": 0.1592,
+      "num_tokens": 10647283.0,
+      "reward": 0.10118309408426285,
+      "reward_std": 0.026538610458374023,
+      "rewards/bleu_reward_func/mean": 0.10118309408426285,
+      "rewards/bleu_reward_func/std": 0.08866976201534271,
+      "step": 816
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 456.0,
+      "completions/mean_length": 217.5625,
+      "completions/mean_terminated_length": 175.50001525878906,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "epoch": 0.6536,
+      "grad_norm": 4.102277755737305,
+      "kl": 0.141204833984375,
+      "learning_rate": 1e-06,
+      "loss": 0.2738,
+      "num_tokens": 10660189.0,
+      "reward": 0.23801954090595245,
+      "reward_std": 0.07484984397888184,
+      "rewards/bleu_reward_func/mean": 0.23801954090595245,
+      "rewards/bleu_reward_func/std": 0.168580561876297,
+      "step": 817
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 485.0,
+      "completions/mean_length": 180.625,
+      "completions/mean_terminated_length": 169.93548583984375,
+      "completions/min_length": 39.0,
+      "completions/min_terminated_length": 39.0,
+      "epoch": 0.6544,
+      "grad_norm": 5.659482002258301,
+      "kl": 0.08270263671875,
+      "learning_rate": 1e-06,
+      "loss": 0.1325,
+      "num_tokens": 10668393.0,
+      "reward": 0.08136270940303802,
+      "reward_std": 0.02622528187930584,
+      "rewards/bleu_reward_func/mean": 0.08136270940303802,
+      "rewards/bleu_reward_func/std": 0.03544744476675987,
+      "step": 818
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 289.375,
+      "completions/mean_terminated_length": 227.0399932861328,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "epoch": 0.6552,
+      "grad_norm": 3.059807300567627,
+      "kl": 0.076446533203125,
+      "learning_rate": 1e-06,
+      "loss": -0.016,
+      "num_tokens": 10682749.0,
+      "reward": 0.07544586062431335,
+      "reward_std": 0.0220788661390543,
+      "rewards/bleu_reward_func/mean": 0.07544586062431335,
+      "rewards/bleu_reward_func/std": 0.04309820756316185,
+      "step": 819
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 407.0,
+      "completions/mean_length": 271.4375,
+      "completions/mean_terminated_length": 191.25,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "epoch": 0.656,
+      "grad_norm": 4.126210689544678,
+      "kl": 0.06744384765625,
+      "learning_rate": 1e-06,
+      "loss": 0.2091,
+      "num_tokens": 10693011.0,
+      "reward": 0.12100762873888016,
+      "reward_std": 0.040202461183071136,
+      "rewards/bleu_reward_func/mean": 0.12100762873888016,
+      "rewards/bleu_reward_func/std": 0.09315716475248337,
+      "step": 820
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 478.0,
+      "completions/mean_length": 360.34375,
+      "completions/mean_terminated_length": 226.5294189453125,
+      "completions/min_length": 38.0,
+      "completions/min_terminated_length": 38.0,
+      "epoch": 0.6568,
+      "grad_norm": 3.169628620147705,
+      "kl": 0.030731201171875,
+      "learning_rate": 1e-06,
+      "loss": -0.1334,
+      "num_tokens": 10706846.0,
+      "reward": 0.041019197553396225,
+      "reward_std": 0.012767975218594074,
+      "rewards/bleu_reward_func/mean": 0.041019197553396225,
+      "rewards/bleu_reward_func/std": 0.050586286932229996,
+      "step": 821
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 378.0,
+      "completions/mean_length": 264.46875,
+      "completions/mean_terminated_length": 46.05882263183594,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.6576,
+      "grad_norm": 6.150150775909424,
+      "kl": 0.174896240234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0093,
+      "num_tokens": 10720189.0,
+      "reward": 0.10611159354448318,
+      "reward_std": 0.044405680149793625,
+      "rewards/bleu_reward_func/mean": 0.10611159354448318,
+      "rewards/bleu_reward_func/std": 0.10892455279827118,
+      "step": 822
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 494.0,
+      "completions/mean_length": 266.84375,
+      "completions/mean_terminated_length": 210.2692413330078,
+      "completions/min_length": 29.0,
+      "completions/min_terminated_length": 29.0,
+      "epoch": 0.6584,
+      "grad_norm": 6.753479957580566,
+      "kl": 0.20147705078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0273,
+      "num_tokens": 10731600.0,
+      "reward": 0.10170187056064606,
+      "reward_std": 0.03044716641306877,
+      "rewards/bleu_reward_func/mean": 0.10170187056064606,
+      "rewards/bleu_reward_func/std": 0.05836126208305359,
+      "step": 823
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 510.0,
+      "completions/mean_length": 392.28125,
+      "completions/mean_terminated_length": 299.1666564941406,
+      "completions/min_length": 44.0,
+      "completions/min_terminated_length": 44.0,
+      "epoch": 0.6592,
+      "grad_norm": 2.5884182453155518,
+      "kl": 0.0380859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0465,
+      "num_tokens": 10748121.0,
+      "reward": 0.03844967484474182,
+      "reward_std": 0.012901275418698788,
+      "rewards/bleu_reward_func/mean": 0.03844967484474182,
+      "rewards/bleu_reward_func/std": 0.032823171466588974,
+      "step": 824
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 491.0,
+      "completions/mean_length": 270.5,
+      "completions/mean_terminated_length": 214.7692413330078,
+      "completions/min_length": 69.0,
+      "completions/min_terminated_length": 69.0,
+      "epoch": 0.66,
+      "grad_norm": 4.168792247772217,
+      "kl": 0.083251953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0346,
+      "num_tokens": 10763353.0,
+      "reward": 0.08619528263807297,
+      "reward_std": 0.02499576285481453,
+      "rewards/bleu_reward_func/mean": 0.08619528263807297,
+      "rewards/bleu_reward_func/std": 0.10102304071187973,
+      "step": 825
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 510.0,
+      "completions/mean_length": 264.78125,
+      "completions/mean_terminated_length": 195.55999755859375,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.6608,
+      "grad_norm": 10.431943893432617,
+      "kl": 0.154876708984375,
+      "learning_rate": 1e-06,
+      "loss": 0.1006,
+      "num_tokens": 10776642.0,
+      "reward": 0.06286956369876862,
+      "reward_std": 0.027797410264611244,
+      "rewards/bleu_reward_func/mean": 0.06286956369876862,
+      "rewards/bleu_reward_func/std": 0.07537111639976501,
+      "step": 826
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 476.0,
+      "completions/mean_length": 231.1875,
+      "completions/mean_terminated_length": 202.13792419433594,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.6616,
+      "grad_norm": 7.486932754516602,
+      "kl": 0.15045166015625,
+      "learning_rate": 1e-06,
+      "loss": 0.2166,
+      "num_tokens": 10788656.0,
+      "reward": 0.10479411482810974,
+      "reward_std": 0.029287472367286682,
+      "rewards/bleu_reward_func/mean": 0.10479411482810974,
+      "rewards/bleu_reward_func/std": 0.07098822295665741,
+      "step": 827
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 92.0,
+      "completions/mean_length": 165.3125,
+      "completions/mean_terminated_length": 49.75,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.6624,
+      "grad_norm": 7.1387104988098145,
+      "kl": 0.23382568359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0727,
+      "num_tokens": 10795930.0,
+      "reward": 0.11342652887105942,
+      "reward_std": 0.027198534458875656,
+      "rewards/bleu_reward_func/mean": 0.11342652887105942,
+      "rewards/bleu_reward_func/std": 0.11971734464168549,
+      "step": 828
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 464.0,
+      "completions/max_terminated_length": 464.0,
+      "completions/mean_length": 108.96875,
+      "completions/mean_terminated_length": 108.96875,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.6632,
+      "grad_norm": 8.27657413482666,
+      "kl": 0.36053466796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0108,
+      "num_tokens": 10804241.0,
+      "reward": 0.28853511810302734,
+      "reward_std": 0.07218953967094421,
+      "rewards/bleu_reward_func/mean": 0.28853511810302734,
+      "rewards/bleu_reward_func/std": 0.20515379309654236,
+      "step": 829
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 143.0,
+      "completions/max_terminated_length": 143.0,
+      "completions/mean_length": 51.875,
+      "completions/mean_terminated_length": 51.875,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "epoch": 0.664,
+      "grad_norm": 8.845324516296387,
+      "kl": 0.4793701171875,
+      "learning_rate": 1e-06,
+      "loss": 0.1651,
+      "num_tokens": 10813957.0,
+      "reward": 0.2881876826286316,
+      "reward_std": 0.06279260665178299,
+      "rewards/bleu_reward_func/mean": 0.2881876826286316,
+      "rewards/bleu_reward_func/std": 0.23817574977874756,
+      "step": 830
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 482.0,
+      "completions/mean_length": 268.46875,
+      "completions/mean_terminated_length": 157.77273559570312,
+      "completions/min_length": 30.0,
+      "completions/min_terminated_length": 30.0,
+      "epoch": 0.6648,
+      "grad_norm": 5.753482818603516,
+      "kl": 0.143280029296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0154,
+      "num_tokens": 10826356.0,
+      "reward": 0.13365639746189117,
+      "reward_std": 0.023316586390137672,
+      "rewards/bleu_reward_func/mean": 0.13365639746189117,
+      "rewards/bleu_reward_func/std": 0.20613247156143188,
+      "step": 831
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 444.0,
+      "completions/mean_length": 194.125,
+      "completions/mean_terminated_length": 172.933349609375,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.6656,
+      "grad_norm": 5.661513805389404,
+      "kl": 0.106048583984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0722,
+      "num_tokens": 10836128.0,
+      "reward": 0.08124659210443497,
+      "reward_std": 0.016117524355649948,
+      "rewards/bleu_reward_func/mean": 0.08124659210443497,
+      "rewards/bleu_reward_func/std": 0.08725257217884064,
+      "step": 832
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 60.0,
+      "completions/max_terminated_length": 60.0,
+      "completions/mean_length": 26.625,
+      "completions/mean_terminated_length": 26.625,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.6664,
+      "grad_norm": 12.99825382232666,
+      "kl": 0.453857421875,
+      "learning_rate": 1e-06,
+      "loss": 0.1057,
+      "num_tokens": 10842116.0,
+      "reward": 0.3153986930847168,
+      "reward_std": 0.0658825933933258,
+      "rewards/bleu_reward_func/mean": 0.3153986930847168,
+      "rewards/bleu_reward_func/std": 0.17146961390972137,
+      "step": 833
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 500.0,
+      "completions/mean_length": 185.5625,
+      "completions/mean_terminated_length": 94.15999603271484,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.6672,
+      "grad_norm": 9.406866073608398,
+      "kl": 0.144378662109375,
+      "learning_rate": 1e-06,
+      "loss": 0.2779,
+      "num_tokens": 10852030.0,
+      "reward": 0.13975293934345245,
+      "reward_std": 0.04399016499519348,
+      "rewards/bleu_reward_func/mean": 0.13975293934345245,
+      "rewards/bleu_reward_func/std": 0.17490676045417786,
+      "step": 834
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 484.0,
+      "completions/mean_length": 355.125,
+      "completions/mean_terminated_length": 198.25,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.668,
+      "grad_norm": 8.956555366516113,
+      "kl": 0.220977783203125,
+      "learning_rate": 1e-06,
+      "loss": -0.047,
+      "num_tokens": 10869930.0,
+      "reward": 0.04670516401529312,
+      "reward_std": 0.01496485248208046,
+      "rewards/bleu_reward_func/mean": 0.04670516401529312,
+      "rewards/bleu_reward_func/std": 0.03720833733677864,
+      "step": 835
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 495.0,
+      "completions/mean_length": 370.71875,
+      "completions/mean_terminated_length": 331.1600036621094,
+      "completions/min_length": 151.0,
+      "completions/min_terminated_length": 151.0,
+      "epoch": 0.6688,
+      "grad_norm": 2.452960729598999,
+      "kl": 0.048583984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0511,
+      "num_tokens": 10884969.0,
+      "reward": 0.05606111139059067,
+      "reward_std": 0.01513909362256527,
+      "rewards/bleu_reward_func/mean": 0.05606111139059067,
+      "rewards/bleu_reward_func/std": 0.05016703903675079,
+      "step": 836
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 340.0,
+      "completions/mean_length": 200.40625,
+      "completions/mean_terminated_length": 96.54167175292969,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.6696,
+      "grad_norm": 4.966017723083496,
+      "kl": 0.11663818359375,
+      "learning_rate": 1e-06,
+      "loss": 0.2335,
+      "num_tokens": 10897982.0,
+      "reward": 0.0694584771990776,
+      "reward_std": 0.04167729243636131,
+      "rewards/bleu_reward_func/mean": 0.0694584771990776,
+      "rewards/bleu_reward_func/std": 0.06985452026128769,
+      "step": 837
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 397.0,
+      "completions/mean_length": 150.125,
+      "completions/mean_terminated_length": 126.00000762939453,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.6704,
+      "grad_norm": 6.741494178771973,
+      "kl": 0.11309814453125,
+      "learning_rate": 1e-06,
+      "loss": -0.1029,
+      "num_tokens": 10911978.0,
+      "reward": 0.2410159707069397,
+      "reward_std": 0.056731171905994415,
+      "rewards/bleu_reward_func/mean": 0.2410159707069397,
+      "rewards/bleu_reward_func/std": 0.20536428689956665,
+      "step": 838
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 492.0,
+      "completions/mean_length": 198.8125,
+      "completions/mean_terminated_length": 140.8148193359375,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.6712,
+      "grad_norm": 5.741243839263916,
+      "kl": 0.2135009765625,
+      "learning_rate": 1e-06,
+      "loss": 0.1006,
+      "num_tokens": 10925892.0,
+      "reward": 0.2018284797668457,
+      "reward_std": 0.04848968982696533,
+      "rewards/bleu_reward_func/mean": 0.2018284797668457,
+      "rewards/bleu_reward_func/std": 0.19715876877307892,
+      "step": 839
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 401.0,
+      "completions/max_terminated_length": 401.0,
+      "completions/mean_length": 158.5,
+      "completions/mean_terminated_length": 158.5,
+      "completions/min_length": 33.0,
+      "completions/min_terminated_length": 33.0,
+      "epoch": 0.672,
+      "grad_norm": 6.41207218170166,
+      "kl": 0.09161376953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0922,
+      "num_tokens": 10934300.0,
+      "reward": 0.0522538498044014,
+      "reward_std": 0.021779239177703857,
+      "rewards/bleu_reward_func/mean": 0.0522538498044014,
+      "rewards/bleu_reward_func/std": 0.02408943697810173,
+      "step": 840
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 389.8125,
+      "completions/mean_terminated_length": 316.5,
+      "completions/min_length": 145.0,
+      "completions/min_terminated_length": 145.0,
+      "epoch": 0.6728,
+      "grad_norm": 2.996119976043701,
+      "kl": 0.06011962890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0296,
+      "num_tokens": 10951166.0,
+      "reward": 0.08128196746110916,
+      "reward_std": 0.01865270733833313,
+      "rewards/bleu_reward_func/mean": 0.08128196746110916,
+      "rewards/bleu_reward_func/std": 0.05130209028720856,
+      "step": 841
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 482.0,
+      "completions/mean_length": 211.25,
+      "completions/mean_terminated_length": 180.13792419433594,
+      "completions/min_length": 38.0,
+      "completions/min_terminated_length": 38.0,
+      "epoch": 0.6736,
+      "grad_norm": 4.3674750328063965,
+      "kl": 0.07989501953125,
+      "learning_rate": 1e-06,
+      "loss": 0.2159,
+      "num_tokens": 10962038.0,
+      "reward": 0.035381607711315155,
+      "reward_std": 0.015435540117323399,
+      "rewards/bleu_reward_func/mean": 0.035381607711315155,
+      "rewards/bleu_reward_func/std": 0.02227640338242054,
+      "step": 842
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 494.0,
+      "completions/mean_length": 326.15625,
+      "completions/mean_terminated_length": 241.68182373046875,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.6744,
+      "grad_norm": 7.144293308258057,
+      "kl": 0.1136932373046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0113,
+      "num_tokens": 10980947.0,
+      "reward": 0.24271616339683533,
+      "reward_std": 0.03907809406518936,
+      "rewards/bleu_reward_func/mean": 0.24271616339683533,
+      "rewards/bleu_reward_func/std": 0.21944448351860046,
+      "step": 843
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 503.0,
+      "completions/mean_length": 318.6875,
+      "completions/mean_terminated_length": 274.0769348144531,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "epoch": 0.6752,
+      "grad_norm": 3.7767539024353027,
+      "kl": 0.080078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0971,
+      "num_tokens": 10993977.0,
+      "reward": 0.034242644906044006,
+      "reward_std": 0.01977381855249405,
+      "rewards/bleu_reward_func/mean": 0.034242644906044006,
+      "rewards/bleu_reward_func/std": 0.024919696152210236,
+      "step": 844
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 454.0,
+      "completions/mean_length": 265.375,
+      "completions/mean_terminated_length": 73.55555725097656,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.676,
+      "grad_norm": 8.71445083618164,
+      "kl": 0.34588623046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0431,
+      "num_tokens": 11007565.0,
+      "reward": 0.23305484652519226,
+      "reward_std": 0.05401034653186798,
+      "rewards/bleu_reward_func/mean": 0.23305484652519226,
+      "rewards/bleu_reward_func/std": 0.2091369926929474,
+      "step": 845
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 363.375,
+      "completions/mean_terminated_length": 305.2174072265625,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "epoch": 0.6768,
+      "grad_norm": 5.406923770904541,
+      "kl": 0.061279296875,
+      "learning_rate": 1e-06,
+      "loss": -0.1012,
+      "num_tokens": 11023473.0,
+      "reward": 0.15702804923057556,
+      "reward_std": 0.02012755163013935,
+      "rewards/bleu_reward_func/mean": 0.15702804923057556,
+      "rewards/bleu_reward_func/std": 0.16683605313301086,
+      "step": 846
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 158.0,
+      "completions/mean_length": 84.125,
+      "completions/mean_terminated_length": 70.32257843017578,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.6776,
+      "grad_norm": 7.79757022857666,
+      "kl": 0.094970703125,
+      "learning_rate": 1e-06,
+      "loss": 0.1681,
+      "num_tokens": 11033277.0,
+      "reward": 0.10883745551109314,
+      "reward_std": 0.06399966031312943,
+      "rewards/bleu_reward_func/mean": 0.10883745551109314,
+      "rewards/bleu_reward_func/std": 0.11935968697071075,
+      "step": 847
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 282.375,
+      "completions/mean_terminated_length": 239.8518524169922,
+      "completions/min_length": 28.0,
+      "completions/min_terminated_length": 28.0,
+      "epoch": 0.6784,
+      "grad_norm": 7.137828350067139,
+      "kl": 0.18231201171875,
+      "learning_rate": 1e-06,
+      "loss": -0.1134,
+      "num_tokens": 11047977.0,
+      "reward": 0.07731978595256805,
+      "reward_std": 0.026035165414214134,
+      "rewards/bleu_reward_func/mean": 0.07731978595256805,
+      "rewards/bleu_reward_func/std": 0.08138881623744965,
+      "step": 848
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 427.0,
+      "completions/mean_length": 180.21875,
+      "completions/mean_terminated_length": 118.77777862548828,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.6792,
+      "grad_norm": 10.036446571350098,
+      "kl": 0.2659912109375,
+      "learning_rate": 1e-06,
+      "loss": 0.1133,
+      "num_tokens": 11058320.0,
+      "reward": 0.13975116610527039,
+      "reward_std": 0.02090391516685486,
+      "rewards/bleu_reward_func/mean": 0.13975116610527039,
+      "rewards/bleu_reward_func/std": 0.15142837166786194,
+      "step": 849
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 478.0,
+      "completions/mean_length": 369.6875,
+      "completions/mean_terminated_length": 284.3000183105469,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.68,
+      "grad_norm": 3.8194141387939453,
+      "kl": 0.038970947265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0524,
+      "num_tokens": 11073662.0,
+      "reward": 0.08185985684394836,
+      "reward_std": 0.033635906875133514,
+      "rewards/bleu_reward_func/mean": 0.08185985684394836,
+      "rewards/bleu_reward_func/std": 0.06655923277139664,
+      "step": 850
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 54.0,
+      "completions/mean_length": 106.5,
+      "completions/mean_terminated_length": 31.407407760620117,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.6808,
+      "grad_norm": 6.564956188201904,
+      "kl": 0.1971435546875,
+      "learning_rate": 1e-06,
+      "loss": 0.1071,
+      "num_tokens": 11085022.0,
+      "reward": 0.22551283240318298,
+      "reward_std": 0.04716075211763382,
+      "rewards/bleu_reward_func/mean": 0.22551283240318298,
+      "rewards/bleu_reward_func/std": 0.16660061478614807,
+      "step": 851
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 417.0,
+      "completions/mean_length": 154.375,
+      "completions/mean_terminated_length": 130.53334045410156,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.6816,
+      "grad_norm": 15.921568870544434,
+      "kl": 0.28778076171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0597,
+      "num_tokens": 11095378.0,
+      "reward": 0.1252121478319168,
+      "reward_std": 0.05527370423078537,
+      "rewards/bleu_reward_func/mean": 0.1252121478319168,
+      "rewards/bleu_reward_func/std": 0.12923383712768555,
+      "step": 852
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 501.0,
+      "completions/mean_length": 305.9375,
+      "completions/mean_terminated_length": 212.27273559570312,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.6824,
+      "grad_norm": 8.1792631149292,
+      "kl": 0.1992645263671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0667,
+      "num_tokens": 11109616.0,
+      "reward": 0.0659586489200592,
+      "reward_std": 0.027510065585374832,
+      "rewards/bleu_reward_func/mean": 0.0659586489200592,
+      "rewards/bleu_reward_func/std": 0.08249466121196747,
+      "step": 853
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 499.0,
+      "completions/mean_length": 341.125,
+      "completions/mean_terminated_length": 224.2105255126953,
+      "completions/min_length": 35.0,
+      "completions/min_terminated_length": 35.0,
+      "epoch": 0.6832,
+      "grad_norm": 2.265425443649292,
+      "kl": 0.039276123046875,
+      "learning_rate": 1e-06,
+      "loss": 0.2352,
+      "num_tokens": 11123036.0,
+      "reward": 0.10829215496778488,
+      "reward_std": 0.10021056979894638,
+      "rewards/bleu_reward_func/mean": 0.10829215496778488,
+      "rewards/bleu_reward_func/std": 0.15350966155529022,
+      "step": 854
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 436.0,
+      "completions/mean_length": 398.5625,
+      "completions/mean_terminated_length": 285.125,
+      "completions/min_length": 34.0,
+      "completions/min_terminated_length": 34.0,
+      "epoch": 0.684,
+      "grad_norm": 2.628075361251831,
+      "kl": 0.047515869140625,
+      "learning_rate": 1e-06,
+      "loss": 0.1379,
+      "num_tokens": 11139950.0,
+      "reward": 0.04525969177484512,
+      "reward_std": 0.025323685258626938,
+      "rewards/bleu_reward_func/mean": 0.04525969177484512,
+      "rewards/bleu_reward_func/std": 0.04984954744577408,
+      "step": 855
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 459.0,
+      "completions/mean_length": 489.0625,
+      "completions/mean_terminated_length": 389.66668701171875,
+      "completions/min_length": 325.0,
+      "completions/min_terminated_length": 325.0,
+      "epoch": 0.6848,
+      "grad_norm": 2.320620059967041,
+      "kl": 0.035888671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0345,
+      "num_tokens": 11158672.0,
+      "reward": 0.016138827428221703,
+      "reward_std": 0.0038068746216595173,
+      "rewards/bleu_reward_func/mean": 0.016138827428221703,
+      "rewards/bleu_reward_func/std": 0.016928784549236298,
+      "step": 856
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 453.0,
+      "completions/mean_length": 317.84375,
+      "completions/mean_terminated_length": 241.86956787109375,
+      "completions/min_length": 28.0,
+      "completions/min_terminated_length": 28.0,
+      "epoch": 0.6856,
+      "grad_norm": 3.551910638809204,
+      "kl": 0.109039306640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0289,
+      "num_tokens": 11173291.0,
+      "reward": 0.20694701373577118,
+      "reward_std": 0.014496378600597382,
+      "rewards/bleu_reward_func/mean": 0.20694701373577118,
+      "rewards/bleu_reward_func/std": 0.2963625490665436,
+      "step": 857
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 453.0,
+      "completions/mean_length": 147.125,
+      "completions/mean_terminated_length": 62.92308044433594,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.6864,
+      "grad_norm": 15.02341079711914,
+      "kl": 0.70330810546875,
+      "learning_rate": 1e-06,
+      "loss": 0.2886,
+      "num_tokens": 11182215.0,
+      "reward": 0.19951725006103516,
+      "reward_std": 0.052443791180849075,
+      "rewards/bleu_reward_func/mean": 0.19951725006103516,
+      "rewards/bleu_reward_func/std": 0.19433696568012238,
+      "step": 858
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 471.0,
+      "completions/mean_length": 284.125,
+      "completions/mean_terminated_length": 128.2105255126953,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.6872,
+      "grad_norm": 8.070085525512695,
+      "kl": 0.110443115234375,
+      "learning_rate": 1e-06,
+      "loss": 0.1827,
+      "num_tokens": 11194667.0,
+      "reward": 0.04423338174819946,
+      "reward_std": 0.017294086515903473,
+      "rewards/bleu_reward_func/mean": 0.04423338174819946,
+      "rewards/bleu_reward_func/std": 0.047055598348379135,
+      "step": 859
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 500.0,
+      "completions/mean_length": 329.71875,
+      "completions/mean_terminated_length": 287.65386962890625,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "epoch": 0.688,
+      "grad_norm": 2.8106324672698975,
+      "kl": 0.039154052734375,
+      "learning_rate": 1e-06,
+      "loss": -0.2058,
+      "num_tokens": 11207642.0,
+      "reward": 0.06786108016967773,
+      "reward_std": 0.0352618470788002,
+      "rewards/bleu_reward_func/mean": 0.06786108016967773,
+      "rewards/bleu_reward_func/std": 0.04090343415737152,
+      "step": 860
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 368.53125,
+      "completions/mean_terminated_length": 320.7083435058594,
+      "completions/min_length": 54.0,
+      "completions/min_terminated_length": 54.0,
+      "epoch": 0.6888,
+      "grad_norm": 2.837979793548584,
+      "kl": 0.052001953125,
+      "learning_rate": 1e-06,
+      "loss": 0.107,
+      "num_tokens": 11221547.0,
+      "reward": 0.07621696591377258,
+      "reward_std": 0.029543904587626457,
+      "rewards/bleu_reward_func/mean": 0.07621696591377258,
+      "rewards/bleu_reward_func/std": 0.04072652757167816,
+      "step": 861
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 481.0,
+      "completions/mean_length": 398.59375,
+      "completions/mean_terminated_length": 321.0,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 0.6896,
+      "grad_norm": 2.941206693649292,
+      "kl": 0.046844482421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0139,
+      "num_tokens": 11237182.0,
+      "reward": 0.026391834020614624,
+      "reward_std": 0.016949903219938278,
+      "rewards/bleu_reward_func/mean": 0.026391834020614624,
+      "rewards/bleu_reward_func/std": 0.03409172222018242,
+      "step": 862
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 484.0,
+      "completions/mean_length": 357.40625,
+      "completions/mean_terminated_length": 251.63157653808594,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.6904,
+      "grad_norm": 2.633358955383301,
+      "kl": 0.055328369140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0911,
+      "num_tokens": 11251547.0,
+      "reward": 0.042053550481796265,
+      "reward_std": 0.021867552772164345,
+      "rewards/bleu_reward_func/mean": 0.042053550481796265,
+      "rewards/bleu_reward_func/std": 0.029616717249155045,
+      "step": 863
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 168.3125,
+      "completions/mean_terminated_length": 119.21429443359375,
+      "completions/min_length": 33.0,
+      "completions/min_terminated_length": 33.0,
+      "epoch": 0.6912,
+      "grad_norm": 4.309210777282715,
+      "kl": 0.13665771484375,
+      "learning_rate": 1e-06,
+      "loss": -0.1275,
+      "num_tokens": 11261245.0,
+      "reward": 0.1768188774585724,
+      "reward_std": 0.030298635363578796,
+      "rewards/bleu_reward_func/mean": 0.1768188774585724,
+      "rewards/bleu_reward_func/std": 0.12399855256080627,
+      "step": 864
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 494.0,
+      "completions/mean_length": 252.78125,
+      "completions/mean_terminated_length": 151.3478240966797,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.692,
+      "grad_norm": 8.154788970947266,
+      "kl": 0.23638916015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0438,
+      "num_tokens": 11275478.0,
+      "reward": 0.07366465032100677,
+      "reward_std": 0.029438909143209457,
+      "rewards/bleu_reward_func/mean": 0.07366465032100677,
+      "rewards/bleu_reward_func/std": 0.05699191242456436,
+      "step": 865
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 463.0,
+      "completions/mean_length": 322.21875,
+      "completions/mean_terminated_length": 235.95455932617188,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.6928,
+      "grad_norm": 2.933178663253784,
+      "kl": 0.04754638671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0639,
+      "num_tokens": 11287317.0,
+      "reward": 0.041873324662446976,
+      "reward_std": 0.02685678005218506,
+      "rewards/bleu_reward_func/mean": 0.041873324662446976,
+      "rewards/bleu_reward_func/std": 0.039241958409547806,
+      "step": 866
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 455.0,
+      "completions/mean_length": 208.1875,
+      "completions/mean_terminated_length": 89.30435180664062,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.6936,
+      "grad_norm": 6.312671661376953,
+      "kl": 0.1842041015625,
+      "learning_rate": 1e-06,
+      "loss": 0.1617,
+      "num_tokens": 11297835.0,
+      "reward": 0.103369802236557,
+      "reward_std": 0.04473632201552391,
+      "rewards/bleu_reward_func/mean": 0.103369802236557,
+      "rewards/bleu_reward_func/std": 0.10830661654472351,
+      "step": 867
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 496.0,
+      "completions/mean_length": 274.375,
+      "completions/mean_terminated_length": 230.37037658691406,
+      "completions/min_length": 64.0,
+      "completions/min_terminated_length": 64.0,
+      "epoch": 0.6944,
+      "grad_norm": 3.08828067779541,
+      "kl": 0.090667724609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0115,
+      "num_tokens": 11312311.0,
+      "reward": 0.16378189623355865,
+      "reward_std": 0.0222244244068861,
+      "rewards/bleu_reward_func/mean": 0.16378189623355865,
+      "rewards/bleu_reward_func/std": 0.19553562998771667,
+      "step": 868
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 446.0,
+      "completions/mean_length": 216.15625,
+      "completions/mean_terminated_length": 117.54167175292969,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.6952,
+      "grad_norm": 5.62147331237793,
+      "kl": 0.15509033203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0272,
+      "num_tokens": 11325348.0,
+      "reward": 0.059518001973629,
+      "reward_std": 0.028110869228839874,
+      "rewards/bleu_reward_func/mean": 0.059518001973629,
+      "rewards/bleu_reward_func/std": 0.048489734530448914,
+      "step": 869
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 472.0,
+      "completions/mean_length": 404.46875,
+      "completions/mean_terminated_length": 296.9375,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 0.696,
+      "grad_norm": 3.3346071243286133,
+      "kl": 0.038360595703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0205,
+      "num_tokens": 11343635.0,
+      "reward": 0.07933641970157623,
+      "reward_std": 0.021958988159894943,
+      "rewards/bleu_reward_func/mean": 0.07933641970157623,
+      "rewards/bleu_reward_func/std": 0.06096653267741203,
+      "step": 870
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 464.0,
+      "completions/mean_length": 236.6875,
+      "completions/mean_terminated_length": 185.70370483398438,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.6968,
+      "grad_norm": 4.446465015411377,
+      "kl": 0.17974853515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0515,
+      "num_tokens": 11353817.0,
+      "reward": 0.11160654574632645,
+      "reward_std": 0.039265286177396774,
+      "rewards/bleu_reward_func/mean": 0.11160654574632645,
+      "rewards/bleu_reward_func/std": 0.08857923746109009,
+      "step": 871
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 502.0,
+      "completions/mean_length": 260.125,
+      "completions/mean_terminated_length": 161.56521606445312,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.6976,
+      "grad_norm": 7.414572715759277,
+      "kl": 0.107696533203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0123,
+      "num_tokens": 11367725.0,
+      "reward": 0.1780683994293213,
+      "reward_std": 0.015433109365403652,
+      "rewards/bleu_reward_func/mean": 0.1780683994293213,
+      "rewards/bleu_reward_func/std": 0.2229662984609604,
+      "step": 872
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 497.0,
+      "completions/mean_length": 408.0625,
+      "completions/mean_terminated_length": 360.8182067871094,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "epoch": 0.6984,
+      "grad_norm": 3.227613925933838,
+      "kl": 0.0611572265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0681,
+      "num_tokens": 11385423.0,
+      "reward": 0.0366949737071991,
+      "reward_std": 0.01884927786886692,
+      "rewards/bleu_reward_func/mean": 0.0366949737071991,
+      "rewards/bleu_reward_func/std": 0.028229771181941032,
+      "step": 873
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 481.0,
+      "completions/mean_length": 222.21875,
+      "completions/mean_terminated_length": 168.55555725097656,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.6992,
+      "grad_norm": 7.29306697845459,
+      "kl": 0.142730712890625,
+      "learning_rate": 1e-06,
+      "loss": 0.11,
+      "num_tokens": 11397022.0,
+      "reward": 0.046667180955410004,
+      "reward_std": 0.020207617431879044,
+      "rewards/bleu_reward_func/mean": 0.046667180955410004,
+      "rewards/bleu_reward_func/std": 0.02555895410478115,
+      "step": 874
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 504.0,
+      "completions/mean_length": 268.5,
+      "completions/mean_terminated_length": 173.21739196777344,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.7,
+      "grad_norm": 3.604617118835449,
+      "kl": 0.06298828125,
+      "learning_rate": 1e-06,
+      "loss": -0.3505,
+      "num_tokens": 11410046.0,
+      "reward": 0.07897455990314484,
+      "reward_std": 0.014880911447107792,
+      "rewards/bleu_reward_func/mean": 0.07897455990314484,
+      "rewards/bleu_reward_func/std": 0.08343996107578278,
+      "step": 875
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 325.46875,
+      "completions/mean_terminated_length": 227.76190185546875,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.7008,
+      "grad_norm": 8.727375030517578,
+      "kl": 0.133270263671875,
+      "learning_rate": 1e-06,
+      "loss": 0.3117,
+      "num_tokens": 11422725.0,
+      "reward": 0.07061035186052322,
+      "reward_std": 0.0419192910194397,
+      "rewards/bleu_reward_func/mean": 0.07061035186052322,
+      "rewards/bleu_reward_func/std": 0.07667659968137741,
+      "step": 876
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 468.0,
+      "completions/mean_length": 181.375,
+      "completions/mean_terminated_length": 120.14814758300781,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.7016,
+      "grad_norm": 7.317707061767578,
+      "kl": 0.21905517578125,
+      "learning_rate": 1e-06,
+      "loss": 0.4234,
+      "num_tokens": 11431937.0,
+      "reward": 0.10765747725963593,
+      "reward_std": 0.052248626947402954,
+      "rewards/bleu_reward_func/mean": 0.10765747725963593,
+      "rewards/bleu_reward_func/std": 0.05436404421925545,
+      "step": 877
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 501.0,
+      "completions/mean_length": 220.75,
+      "completions/mean_terminated_length": 211.35482788085938,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.7024,
+      "grad_norm": 4.363486289978027,
+      "kl": 0.214019775390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0009,
+      "num_tokens": 11443057.0,
+      "reward": 0.30547034740448,
+      "reward_std": 0.024015674367547035,
+      "rewards/bleu_reward_func/mean": 0.30547034740448,
+      "rewards/bleu_reward_func/std": 0.2281493991613388,
+      "step": 878
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 377.0,
+      "completions/mean_length": 205.46875,
+      "completions/mean_terminated_length": 119.63999938964844,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.7032,
+      "grad_norm": 8.484012603759766,
+      "kl": 0.189361572265625,
+      "learning_rate": 1e-06,
+      "loss": 0.1579,
+      "num_tokens": 11451232.0,
+      "reward": 0.0824245885014534,
+      "reward_std": 0.04487679526209831,
+      "rewards/bleu_reward_func/mean": 0.0824245885014534,
+      "rewards/bleu_reward_func/std": 0.07150331139564514,
+      "step": 879
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 449.0,
+      "completions/mean_length": 141.375,
+      "completions/mean_terminated_length": 129.4193572998047,
+      "completions/min_length": 27.0,
+      "completions/min_terminated_length": 27.0,
+      "epoch": 0.704,
+      "grad_norm": 5.958530902862549,
+      "kl": 0.1085205078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0313,
+      "num_tokens": 11462604.0,
+      "reward": 0.04005417972803116,
+      "reward_std": 0.024934137240052223,
+      "rewards/bleu_reward_func/mean": 0.04005417972803116,
+      "rewards/bleu_reward_func/std": 0.03826345130801201,
+      "step": 880
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 500.0,
+      "completions/mean_length": 283.03125,
+      "completions/mean_terminated_length": 126.36842346191406,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.7048,
+      "grad_norm": 6.075742244720459,
+      "kl": 0.3231201171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0444,
+      "num_tokens": 11477461.0,
+      "reward": 0.10366402566432953,
+      "reward_std": 0.055370062589645386,
+      "rewards/bleu_reward_func/mean": 0.10366402566432953,
+      "rewards/bleu_reward_func/std": 0.11003145575523376,
+      "step": 881
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 361.0,
+      "completions/mean_terminated_length": 227.76470947265625,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.7056,
+      "grad_norm": 5.96100378036499,
+      "kl": 0.19482421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0353,
+      "num_tokens": 11493365.0,
+      "reward": 0.1235186904668808,
+      "reward_std": 0.038026995956897736,
+      "rewards/bleu_reward_func/mean": 0.1235186904668808,
+      "rewards/bleu_reward_func/std": 0.05816841870546341,
+      "step": 882
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 358.0,
+      "completions/mean_length": 172.09375,
+      "completions/mean_terminated_length": 109.14814758300781,
+      "completions/min_length": 3.0,
+      "completions/min_terminated_length": 3.0,
+      "epoch": 0.7064,
+      "grad_norm": 6.117469787597656,
+      "kl": 0.201904296875,
+      "learning_rate": 1e-06,
+      "loss": 0.067,
+      "num_tokens": 11501456.0,
+      "reward": 0.15373189747333527,
+      "reward_std": 0.05197744071483612,
+      "rewards/bleu_reward_func/mean": 0.15373189747333527,
+      "rewards/bleu_reward_func/std": 0.10633216798305511,
+      "step": 883
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 510.0,
+      "completions/mean_length": 263.15625,
+      "completions/mean_terminated_length": 193.47999572753906,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.7072,
+      "grad_norm": 4.437005519866943,
+      "kl": 0.12554931640625,
+      "learning_rate": 1e-06,
+      "loss": 0.2681,
+      "num_tokens": 11517437.0,
+      "reward": 0.29476526379585266,
+      "reward_std": 0.13803553581237793,
+      "rewards/bleu_reward_func/mean": 0.29476526379585266,
+      "rewards/bleu_reward_func/std": 0.32065168023109436,
+      "step": 884
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 501.0,
+      "completions/mean_length": 252.75,
+      "completions/mean_terminated_length": 134.90908813476562,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.708,
+      "grad_norm": 8.795902252197266,
+      "kl": 0.3934326171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0341,
+      "num_tokens": 11529885.0,
+      "reward": 0.13262969255447388,
+      "reward_std": 0.037800293415784836,
+      "rewards/bleu_reward_func/mean": 0.13262969255447388,
+      "rewards/bleu_reward_func/std": 0.11564164608716965,
+      "step": 885
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 500.0,
+      "completions/mean_length": 355.6875,
+      "completions/mean_terminated_length": 199.375,
+      "completions/min_length": 39.0,
+      "completions/min_terminated_length": 39.0,
+      "epoch": 0.7088,
+      "grad_norm": 5.155429840087891,
+      "kl": 0.0558319091796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0544,
+      "num_tokens": 11546155.0,
+      "reward": 0.10861489176750183,
+      "reward_std": 0.035860445350408554,
+      "rewards/bleu_reward_func/mean": 0.10861489176750183,
+      "rewards/bleu_reward_func/std": 0.08613201975822449,
+      "step": 886
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 491.0,
+      "completions/mean_length": 282.84375,
+      "completions/mean_terminated_length": 126.0526351928711,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "epoch": 0.7096,
+      "grad_norm": 4.782761096954346,
+      "kl": 0.133056640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0206,
+      "num_tokens": 11561390.0,
+      "reward": 0.0671025738120079,
+      "reward_std": 0.018492672592401505,
+      "rewards/bleu_reward_func/mean": 0.0671025738120079,
+      "rewards/bleu_reward_func/std": 0.06450604647397995,
+      "step": 887
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 492.0,
+      "completions/mean_length": 331.6875,
+      "completions/mean_terminated_length": 261.13043212890625,
+      "completions/min_length": 36.0,
+      "completions/min_terminated_length": 36.0,
+      "epoch": 0.7104,
+      "grad_norm": 2.8767964839935303,
+      "kl": 0.06182861328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0002,
+      "num_tokens": 11577356.0,
+      "reward": 0.1093081682920456,
+      "reward_std": 0.07805053889751434,
+      "rewards/bleu_reward_func/mean": 0.1093081682920456,
+      "rewards/bleu_reward_func/std": 0.17048169672489166,
+      "step": 888
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 491.0,
+      "completions/mean_length": 397.375,
+      "completions/mean_terminated_length": 359.16668701171875,
+      "completions/min_length": 141.0,
+      "completions/min_terminated_length": 141.0,
+      "epoch": 0.7112,
+      "grad_norm": 2.2424142360687256,
+      "kl": 0.0458984375,
+      "learning_rate": 1e-06,
+      "loss": -0.015,
+      "num_tokens": 11591432.0,
+      "reward": 0.06777183711528778,
+      "reward_std": 0.019787484779953957,
+      "rewards/bleu_reward_func/mean": 0.06777183711528778,
+      "rewards/bleu_reward_func/std": 0.041765324771404266,
+      "step": 889
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 496.0,
+      "completions/mean_length": 292.59375,
+      "completions/mean_terminated_length": 160.9499969482422,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.712,
+      "grad_norm": 4.390679359436035,
+      "kl": 0.0987548828125,
+      "learning_rate": 1e-06,
+      "loss": 0.2246,
+      "num_tokens": 11603515.0,
+      "reward": 0.06538625806570053,
+      "reward_std": 0.03718053176999092,
+      "rewards/bleu_reward_func/mean": 0.06538625806570053,
+      "rewards/bleu_reward_func/std": 0.0816822499036789,
+      "step": 890
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 424.0,
+      "completions/mean_length": 188.875,
+      "completions/mean_terminated_length": 155.44827270507812,
+      "completions/min_length": 29.0,
+      "completions/min_terminated_length": 29.0,
+      "epoch": 0.7128,
+      "grad_norm": 5.5804290771484375,
+      "kl": 0.123260498046875,
+      "learning_rate": 1e-06,
+      "loss": -0.2822,
+      "num_tokens": 11612927.0,
+      "reward": 0.0781329870223999,
+      "reward_std": 0.049637503921985626,
+      "rewards/bleu_reward_func/mean": 0.0781329870223999,
+      "rewards/bleu_reward_func/std": 0.08602513372898102,
+      "step": 891
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 499.0,
+      "completions/mean_length": 332.96875,
+      "completions/mean_terminated_length": 282.8399963378906,
+      "completions/min_length": 38.0,
+      "completions/min_terminated_length": 38.0,
+      "epoch": 0.7136,
+      "grad_norm": 2.8855247497558594,
+      "kl": 0.0643310546875,
+      "learning_rate": 1e-06,
+      "loss": 0.1736,
+      "num_tokens": 11625662.0,
+      "reward": 0.03828759491443634,
+      "reward_std": 0.024871867150068283,
+      "rewards/bleu_reward_func/mean": 0.03828759491443634,
+      "rewards/bleu_reward_func/std": 0.03181852772831917,
+      "step": 892
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 503.0,
+      "completions/mean_length": 322.96875,
+      "completions/mean_terminated_length": 223.952392578125,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.7144,
+      "grad_norm": 8.224991798400879,
+      "kl": 0.19036865234375,
+      "learning_rate": 1e-06,
+      "loss": 0.1601,
+      "num_tokens": 11642029.0,
+      "reward": 0.03835766017436981,
+      "reward_std": 0.013130895793437958,
+      "rewards/bleu_reward_func/mean": 0.03835766017436981,
+      "rewards/bleu_reward_func/std": 0.024478256702423096,
+      "step": 893
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 469.0,
+      "completions/mean_length": 205.46875,
+      "completions/mean_terminated_length": 148.70370483398438,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.7152,
+      "grad_norm": 6.305431842803955,
+      "kl": 0.14776611328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0135,
+      "num_tokens": 11657452.0,
+      "reward": 0.11420266330242157,
+      "reward_std": 0.04108916223049164,
+      "rewards/bleu_reward_func/mean": 0.11420266330242157,
+      "rewards/bleu_reward_func/std": 0.06337518244981766,
+      "step": 894
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 477.0,
+      "completions/mean_length": 336.71875,
+      "completions/mean_terminated_length": 200.38888549804688,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.716,
+      "grad_norm": 4.193768501281738,
+      "kl": 0.13232421875,
+      "learning_rate": 1e-06,
+      "loss": 0.3397,
+      "num_tokens": 11672971.0,
+      "reward": 0.07947193086147308,
+      "reward_std": 0.04811304062604904,
+      "rewards/bleu_reward_func/mean": 0.07947193086147308,
+      "rewards/bleu_reward_func/std": 0.10142233967781067,
+      "step": 895
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 497.0,
+      "completions/mean_length": 123.09375,
+      "completions/mean_terminated_length": 97.16667175292969,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.7168,
+      "grad_norm": 9.166439056396484,
+      "kl": 0.28271484375,
+      "learning_rate": 1e-06,
+      "loss": -0.1089,
+      "num_tokens": 11684334.0,
+      "reward": 0.27329233288764954,
+      "reward_std": 0.059711530804634094,
+      "rewards/bleu_reward_func/mean": 0.27329233288764954,
+      "rewards/bleu_reward_func/std": 0.1879579871892929,
+      "step": 896
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 463.0,
+      "completions/mean_length": 246.0,
+      "completions/mean_terminated_length": 157.33334350585938,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.7176,
+      "grad_norm": 4.767898082733154,
+      "kl": 0.112060546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0037,
+      "num_tokens": 11700486.0,
+      "reward": 0.07844039797782898,
+      "reward_std": 0.034808725118637085,
+      "rewards/bleu_reward_func/mean": 0.07844039797782898,
+      "rewards/bleu_reward_func/std": 0.0884510949254036,
+      "step": 897
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 495.0,
+      "completions/mean_length": 291.25,
+      "completions/mean_terminated_length": 140.2105255126953,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.7184,
+      "grad_norm": 5.6404266357421875,
+      "kl": 0.126312255859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0453,
+      "num_tokens": 11712438.0,
+      "reward": 0.07097087800502777,
+      "reward_std": 0.03667715564370155,
+      "rewards/bleu_reward_func/mean": 0.07097087800502777,
+      "rewards/bleu_reward_func/std": 0.08086320012807846,
+      "step": 898
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 264.6875,
+      "completions/mean_terminated_length": 229.35714721679688,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "epoch": 0.7192,
+      "grad_norm": 4.8742499351501465,
+      "kl": 0.10357666015625,
+      "learning_rate": 1e-06,
+      "loss": 0.1854,
+      "num_tokens": 11725396.0,
+      "reward": 0.21620362997055054,
+      "reward_std": 0.07608456909656525,
+      "rewards/bleu_reward_func/mean": 0.21620362997055054,
+      "rewards/bleu_reward_func/std": 0.2514094114303589,
+      "step": 899
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 502.0,
+      "completions/mean_length": 329.46875,
+      "completions/mean_terminated_length": 268.625,
+      "completions/min_length": 39.0,
+      "completions/min_terminated_length": 39.0,
+      "epoch": 0.72,
+      "grad_norm": 4.756425857543945,
+      "kl": 0.10198974609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0469,
+      "num_tokens": 11738307.0,
+      "reward": 0.053835704922676086,
+      "reward_std": 0.012895071879029274,
+      "rewards/bleu_reward_func/mean": 0.053835704922676086,
+      "rewards/bleu_reward_func/std": 0.03540419042110443,
+      "step": 900
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 472.0,
+      "completions/mean_length": 401.65625,
+      "completions/mean_terminated_length": 259.7857360839844,
+      "completions/min_length": 63.0,
+      "completions/min_terminated_length": 63.0,
+      "epoch": 0.7208,
+      "grad_norm": 3.1605112552642822,
+      "kl": 0.04974365234375,
+      "learning_rate": 1e-06,
+      "loss": 0.2297,
+      "num_tokens": 11756496.0,
+      "reward": 0.24388237297534943,
+      "reward_std": 0.1161736249923706,
+      "rewards/bleu_reward_func/mean": 0.24388237297534943,
+      "rewards/bleu_reward_func/std": 0.3413524627685547,
+      "step": 901
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 271.59375,
+      "completions/mean_terminated_length": 246.72413635253906,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.7216,
+      "grad_norm": 4.188157081604004,
+      "kl": 0.1815185546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0606,
+      "num_tokens": 11768691.0,
+      "reward": 0.12793870270252228,
+      "reward_std": 0.04022746905684471,
+      "rewards/bleu_reward_func/mean": 0.12793870270252228,
+      "rewards/bleu_reward_func/std": 0.15937677025794983,
+      "step": 902
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 282.65625,
+      "completions/mean_terminated_length": 178.4091033935547,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "epoch": 0.7224,
+      "grad_norm": 4.868112087249756,
+      "kl": 0.1282958984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0395,
+      "num_tokens": 11780800.0,
+      "reward": 0.09620735794305801,
+      "reward_std": 0.021982625126838684,
+      "rewards/bleu_reward_func/mean": 0.09620735794305801,
+      "rewards/bleu_reward_func/std": 0.07161340862512589,
+      "step": 903
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 489.0,
+      "completions/mean_length": 277.09375,
+      "completions/mean_terminated_length": 222.88462829589844,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.7232,
+      "grad_norm": 7.519046783447266,
+      "kl": 0.212158203125,
+      "learning_rate": 1e-06,
+      "loss": -0.1487,
+      "num_tokens": 11795403.0,
+      "reward": 0.07747071981430054,
+      "reward_std": 0.03376290947198868,
+      "rewards/bleu_reward_func/mean": 0.07747071981430054,
+      "rewards/bleu_reward_func/std": 0.055931881070137024,
+      "step": 904
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 253.4375,
+      "completions/mean_terminated_length": 167.25,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.724,
+      "grad_norm": 8.574625015258789,
+      "kl": 0.1490478515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0058,
+      "num_tokens": 11806985.0,
+      "reward": 0.06105317175388336,
+      "reward_std": 0.019554441794753075,
+      "rewards/bleu_reward_func/mean": 0.06105317175388336,
+      "rewards/bleu_reward_func/std": 0.038146011531353,
+      "step": 905
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 407.0,
+      "completions/mean_length": 72.1875,
+      "completions/mean_terminated_length": 58.0,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.7248,
+      "grad_norm": 10.871319770812988,
+      "kl": 0.32177734375,
+      "learning_rate": 1e-06,
+      "loss": 0.2813,
+      "num_tokens": 11812335.0,
+      "reward": 0.09286689758300781,
+      "reward_std": 0.02634507045149803,
+      "rewards/bleu_reward_func/mean": 0.09286689758300781,
+      "rewards/bleu_reward_func/std": 0.04922043904662132,
+      "step": 906
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 499.0,
+      "completions/mean_length": 239.3125,
+      "completions/mean_terminated_length": 200.35714721679688,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.7256,
+      "grad_norm": 23.129505157470703,
+      "kl": 0.1500244140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0482,
+      "num_tokens": 11828945.0,
+      "reward": 0.07308115810155869,
+      "reward_std": 0.014882557094097137,
+      "rewards/bleu_reward_func/mean": 0.07308115810155869,
+      "rewards/bleu_reward_func/std": 0.08316269516944885,
+      "step": 907
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 497.0,
+      "completions/mean_length": 429.8125,
+      "completions/mean_terminated_length": 292.8333435058594,
+      "completions/min_length": 58.0,
+      "completions/min_terminated_length": 58.0,
+      "epoch": 0.7264,
+      "grad_norm": 11.226503372192383,
+      "kl": 0.059967041015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0269,
+      "num_tokens": 11848283.0,
+      "reward": 0.04945487529039383,
+      "reward_std": 0.016689039766788483,
+      "rewards/bleu_reward_func/mean": 0.04945487529039383,
+      "rewards/bleu_reward_func/std": 0.04881744086742401,
+      "step": 908
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 472.0,
+      "completions/mean_length": 225.09375,
+      "completions/mean_terminated_length": 129.45834350585938,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.7272,
+      "grad_norm": 8.629831314086914,
+      "kl": 0.2176055908203125,
+      "learning_rate": 1e-06,
+      "loss": 0.5547,
+      "num_tokens": 11858854.0,
+      "reward": 0.14837728440761566,
+      "reward_std": 0.06372867524623871,
+      "rewards/bleu_reward_func/mean": 0.14837728440761566,
+      "rewards/bleu_reward_func/std": 0.18777750432491302,
+      "step": 909
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 262.0,
+      "completions/max_terminated_length": 262.0,
+      "completions/mean_length": 74.25,
+      "completions/mean_terminated_length": 74.25,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.728,
+      "grad_norm": 13.281477928161621,
+      "kl": 0.438720703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0534,
+      "num_tokens": 11869046.0,
+      "reward": 0.20467601716518402,
+      "reward_std": 0.04131526127457619,
+      "rewards/bleu_reward_func/mean": 0.20467601716518402,
+      "rewards/bleu_reward_func/std": 0.14035604894161224,
+      "step": 910
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 358.0,
+      "completions/mean_length": 341.625,
+      "completions/mean_terminated_length": 148.53334045410156,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.7288,
+      "grad_norm": 6.93421745300293,
+      "kl": 0.177642822265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0368,
+      "num_tokens": 11883586.0,
+      "reward": 0.18407613039016724,
+      "reward_std": 0.020998071879148483,
+      "rewards/bleu_reward_func/mean": 0.18407613039016724,
+      "rewards/bleu_reward_func/std": 0.2021336704492569,
+      "step": 911
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 502.0,
+      "completions/mean_length": 222.1875,
+      "completions/mean_terminated_length": 168.51852416992188,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.7296,
+      "grad_norm": 4.430690765380859,
+      "kl": 0.18560791015625,
+      "learning_rate": 1e-06,
+      "loss": 0.1092,
+      "num_tokens": 11894448.0,
+      "reward": 0.1439959555864334,
+      "reward_std": 0.04086273908615112,
+      "rewards/bleu_reward_func/mean": 0.1439959555864334,
+      "rewards/bleu_reward_func/std": 0.1705217957496643,
+      "step": 912
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 493.0,
+      "completions/mean_length": 397.25,
+      "completions/mean_terminated_length": 345.0909118652344,
+      "completions/min_length": 158.0,
+      "completions/min_terminated_length": 158.0,
+      "epoch": 0.7304,
+      "grad_norm": 2.7031519412994385,
+      "kl": 0.05963134765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0584,
+      "num_tokens": 11909408.0,
+      "reward": 0.09079495072364807,
+      "reward_std": 0.021243298426270485,
+      "rewards/bleu_reward_func/mean": 0.09079495072364807,
+      "rewards/bleu_reward_func/std": 0.1052529513835907,
+      "step": 913
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 467.0,
+      "completions/mean_length": 282.9375,
+      "completions/mean_terminated_length": 126.21052551269531,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.7312,
+      "grad_norm": 6.302535057067871,
+      "kl": 0.17462158203125,
+      "learning_rate": 1e-06,
+      "loss": -0.1643,
+      "num_tokens": 11923182.0,
+      "reward": 0.10389965772628784,
+      "reward_std": 0.03838275372982025,
+      "rewards/bleu_reward_func/mean": 0.10389965772628784,
+      "rewards/bleu_reward_func/std": 0.10838860273361206,
+      "step": 914
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 411.4375,
+      "completions/mean_terminated_length": 282.14288330078125,
+      "completions/min_length": 117.0,
+      "completions/min_terminated_length": 117.0,
+      "epoch": 0.732,
+      "grad_norm": 2.603992223739624,
+      "kl": 0.052398681640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0025,
+      "num_tokens": 11939596.0,
+      "reward": 0.06734529137611389,
+      "reward_std": 0.0207513514906168,
+      "rewards/bleu_reward_func/mean": 0.06734529137611389,
+      "rewards/bleu_reward_func/std": 0.05821956321597099,
+      "step": 915
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 447.0,
+      "completions/max_terminated_length": 447.0,
+      "completions/mean_length": 111.90625,
+      "completions/mean_terminated_length": 111.90625,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "epoch": 0.7328,
+      "grad_norm": 15.395035743713379,
+      "kl": 0.2227783203125,
+      "learning_rate": 1e-06,
+      "loss": 0.3837,
+      "num_tokens": 11947113.0,
+      "reward": 0.24101027846336365,
+      "reward_std": 0.07465855032205582,
+      "rewards/bleu_reward_func/mean": 0.24101027846336365,
+      "rewards/bleu_reward_func/std": 0.17581383883953094,
+      "step": 916
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 463.0,
+      "completions/mean_length": 312.90625,
+      "completions/mean_terminated_length": 176.68421936035156,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.7336,
+      "grad_norm": 8.670394897460938,
+      "kl": 0.054595947265625,
+      "learning_rate": 1e-06,
+      "loss": -0.1736,
+      "num_tokens": 11965846.0,
+      "reward": 0.2349693477153778,
+      "reward_std": 0.042255695909261703,
+      "rewards/bleu_reward_func/mean": 0.2349693477153778,
+      "rewards/bleu_reward_func/std": 0.37363162636756897,
+      "step": 917
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 485.0,
+      "completions/mean_length": 384.34375,
+      "completions/mean_terminated_length": 317.4761962890625,
+      "completions/min_length": 74.0,
+      "completions/min_terminated_length": 74.0,
+      "epoch": 0.7344,
+      "grad_norm": 2.5599606037139893,
+      "kl": 0.06549072265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0089,
+      "num_tokens": 11980945.0,
+      "reward": 0.061901748180389404,
+      "reward_std": 0.02856561914086342,
+      "rewards/bleu_reward_func/mean": 0.061901748180389404,
+      "rewards/bleu_reward_func/std": 0.04196527600288391,
+      "step": 918
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 424.0,
+      "completions/mean_length": 203.9375,
+      "completions/mean_terminated_length": 146.88888549804688,
+      "completions/min_length": 28.0,
+      "completions/min_terminated_length": 28.0,
+      "epoch": 0.7352,
+      "grad_norm": 4.163308143615723,
+      "kl": 0.08990478515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0627,
+      "num_tokens": 11994255.0,
+      "reward": 0.06285493075847626,
+      "reward_std": 0.027241935953497887,
+      "rewards/bleu_reward_func/mean": 0.06285493075847626,
+      "rewards/bleu_reward_func/std": 0.03245123475790024,
+      "step": 919
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 502.0,
+      "completions/mean_length": 445.90625,
+      "completions/mean_terminated_length": 406.25,
+      "completions/min_length": 236.0,
+      "completions/min_terminated_length": 236.0,
+      "epoch": 0.736,
+      "grad_norm": 2.6394991874694824,
+      "kl": 0.0596923828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0646,
+      "num_tokens": 12010980.0,
+      "reward": 0.05676237493753433,
+      "reward_std": 0.014085400849580765,
+      "rewards/bleu_reward_func/mean": 0.05676237493753433,
+      "rewards/bleu_reward_func/std": 0.03611414507031441,
+      "step": 920
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 286.0,
+      "completions/mean_length": 106.0625,
+      "completions/mean_terminated_length": 92.96774291992188,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "epoch": 0.7368,
+      "grad_norm": 8.88719654083252,
+      "kl": 0.19818115234375,
+      "learning_rate": 1e-06,
+      "loss": -0.1221,
+      "num_tokens": 12017206.0,
+      "reward": 0.08727812767028809,
+      "reward_std": 0.05162365734577179,
+      "rewards/bleu_reward_func/mean": 0.08727812767028809,
+      "rewards/bleu_reward_func/std": 0.07182831317186356,
+      "step": 921
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 483.0,
+      "completions/mean_length": 195.25,
+      "completions/mean_terminated_length": 162.48275756835938,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.7376,
+      "grad_norm": 7.483645439147949,
+      "kl": 0.206787109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0919,
+      "num_tokens": 12028214.0,
+      "reward": 0.18070882558822632,
+      "reward_std": 0.04944847524166107,
+      "rewards/bleu_reward_func/mean": 0.18070882558822632,
+      "rewards/bleu_reward_func/std": 0.19004972279071808,
+      "step": 922
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 470.0,
+      "completions/mean_length": 285.75,
+      "completions/mean_terminated_length": 130.94737243652344,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.7384,
+      "grad_norm": 4.784849643707275,
+      "kl": 0.07586669921875,
+      "learning_rate": 1e-06,
+      "loss": 0.131,
+      "num_tokens": 12042278.0,
+      "reward": 0.05333679914474487,
+      "reward_std": 0.03152618184685707,
+      "rewards/bleu_reward_func/mean": 0.05333679914474487,
+      "rewards/bleu_reward_func/std": 0.055619917809963226,
+      "step": 923
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 51.0,
+      "completions/mean_length": 300.0625,
+      "completions/mean_terminated_length": 27.571430206298828,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.7392,
+      "grad_norm": 5.440861701965332,
+      "kl": 0.145477294921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0245,
+      "num_tokens": 12056416.0,
+      "reward": 0.14792697131633759,
+      "reward_std": 0.02701294980943203,
+      "rewards/bleu_reward_func/mean": 0.14792697131633759,
+      "rewards/bleu_reward_func/std": 0.15142259001731873,
+      "step": 924
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 487.0,
+      "completions/mean_length": 286.15625,
+      "completions/mean_terminated_length": 271.1000061035156,
+      "completions/min_length": 41.0,
+      "completions/min_terminated_length": 41.0,
+      "epoch": 0.74,
+      "grad_norm": 3.949329137802124,
+      "kl": 0.085693359375,
+      "learning_rate": 1e-06,
+      "loss": 0.051,
+      "num_tokens": 12069725.0,
+      "reward": 0.07858790457248688,
+      "reward_std": 0.02233020029962063,
+      "rewards/bleu_reward_func/mean": 0.07858790457248688,
+      "rewards/bleu_reward_func/std": 0.07242675125598907,
+      "step": 925
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 484.0,
+      "completions/mean_length": 128.75,
+      "completions/mean_terminated_length": 89.10344696044922,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.7408,
+      "grad_norm": 11.58963394165039,
+      "kl": 0.43695068359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0795,
+      "num_tokens": 12080085.0,
+      "reward": 0.11042273789644241,
+      "reward_std": 0.017381731420755386,
+      "rewards/bleu_reward_func/mean": 0.11042273789644241,
+      "rewards/bleu_reward_func/std": 0.04870026186108589,
+      "step": 926
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 295.40625,
+      "completions/mean_terminated_length": 126.94444274902344,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.7416,
+      "grad_norm": 9.117962837219238,
+      "kl": 0.22723388671875,
+      "learning_rate": 1e-06,
+      "loss": -0.4999,
+      "num_tokens": 12092682.0,
+      "reward": 0.03860364854335785,
+      "reward_std": 0.019805099815130234,
+      "rewards/bleu_reward_func/mean": 0.03860364854335785,
+      "rewards/bleu_reward_func/std": 0.024968957528471947,
+      "step": 927
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 405.0,
+      "completions/mean_length": 297.71875,
+      "completions/mean_terminated_length": 200.3181915283203,
+      "completions/min_length": 51.0,
+      "completions/min_terminated_length": 51.0,
+      "epoch": 0.7424,
+      "grad_norm": 3.2410967350006104,
+      "kl": 0.1171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0166,
+      "num_tokens": 12107465.0,
+      "reward": 0.16988132894039154,
+      "reward_std": 0.03467182815074921,
+      "rewards/bleu_reward_func/mean": 0.16988132894039154,
+      "rewards/bleu_reward_func/std": 0.1373591423034668,
+      "step": 928
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 483.0,
+      "completions/mean_length": 355.4375,
+      "completions/mean_terminated_length": 233.6666717529297,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.7432,
+      "grad_norm": 14.617587089538574,
+      "kl": 0.15447998046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0284,
+      "num_tokens": 12121815.0,
+      "reward": 0.14590570330619812,
+      "reward_std": 0.026923291385173798,
+      "rewards/bleu_reward_func/mean": 0.14590570330619812,
+      "rewards/bleu_reward_func/std": 0.2141415923833847,
+      "step": 929
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 483.0,
+      "completions/mean_length": 262.78125,
+      "completions/mean_terminated_length": 227.1785888671875,
+      "completions/min_length": 48.0,
+      "completions/min_terminated_length": 48.0,
+      "epoch": 0.744,
+      "grad_norm": 3.273005962371826,
+      "kl": 0.1136474609375,
+      "learning_rate": 1e-06,
+      "loss": -0.2043,
+      "num_tokens": 12136200.0,
+      "reward": 0.08558979630470276,
+      "reward_std": 0.03035646863281727,
+      "rewards/bleu_reward_func/mean": 0.08558979630470276,
+      "rewards/bleu_reward_func/std": 0.0643271803855896,
+      "step": 930
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 494.0,
+      "completions/mean_length": 285.78125,
+      "completions/mean_terminated_length": 150.0500030517578,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.7448,
+      "grad_norm": 6.722025394439697,
+      "kl": 0.117340087890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0744,
+      "num_tokens": 12147369.0,
+      "reward": 0.071571946144104,
+      "reward_std": 0.020615192130208015,
+      "rewards/bleu_reward_func/mean": 0.071571946144104,
+      "rewards/bleu_reward_func/std": 0.06541716307401657,
+      "step": 931
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 510.0,
+      "completions/mean_length": 391.03125,
+      "completions/mean_terminated_length": 357.1600036621094,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "epoch": 0.7456,
+      "grad_norm": 2.371354103088379,
+      "kl": 0.039031982421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0517,
+      "num_tokens": 12165498.0,
+      "reward": 0.07511453330516815,
+      "reward_std": 0.020994337275624275,
+      "rewards/bleu_reward_func/mean": 0.07511453330516815,
+      "rewards/bleu_reward_func/std": 0.043336208909749985,
+      "step": 932
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 444.0,
+      "completions/mean_length": 321.59375,
+      "completions/mean_terminated_length": 235.0454559326172,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.7464,
+      "grad_norm": 2.6573996543884277,
+      "kl": 0.0595703125,
+      "learning_rate": 1e-06,
+      "loss": 0.2007,
+      "num_tokens": 12178413.0,
+      "reward": 0.07766060531139374,
+      "reward_std": 0.030490310862660408,
+      "rewards/bleu_reward_func/mean": 0.07766060531139374,
+      "rewards/bleu_reward_func/std": 0.05291305482387543,
+      "step": 933
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 510.0,
+      "completions/mean_length": 389.9375,
+      "completions/mean_terminated_length": 326.0,
+      "completions/min_length": 79.0,
+      "completions/min_terminated_length": 79.0,
+      "epoch": 0.7472,
+      "grad_norm": 2.6120660305023193,
+      "kl": 0.0733642578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0789,
+      "num_tokens": 12194507.0,
+      "reward": 0.10922634601593018,
+      "reward_std": 0.0321655347943306,
+      "rewards/bleu_reward_func/mean": 0.10922634601593018,
+      "rewards/bleu_reward_func/std": 0.10983148962259293,
+      "step": 934
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 499.0,
+      "completions/mean_length": 228.21875,
+      "completions/mean_terminated_length": 198.86207580566406,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.748,
+      "grad_norm": 7.4974236488342285,
+      "kl": 0.24505615234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0333,
+      "num_tokens": 12206842.0,
+      "reward": 0.16729718446731567,
+      "reward_std": 0.050741568207740784,
+      "rewards/bleu_reward_func/mean": 0.16729718446731567,
+      "rewards/bleu_reward_func/std": 0.2129126340150833,
+      "step": 935
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 444.0,
+      "completions/mean_length": 248.5625,
+      "completions/mean_terminated_length": 90.5,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.7488,
+      "grad_norm": 8.694432258605957,
+      "kl": 0.124114990234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0072,
+      "num_tokens": 12216596.0,
+      "reward": 0.10160160809755325,
+      "reward_std": 0.03439757227897644,
+      "rewards/bleu_reward_func/mean": 0.10160160809755325,
+      "rewards/bleu_reward_func/std": 0.06317181140184402,
+      "step": 936
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 504.0,
+      "completions/mean_length": 173.75,
+      "completions/mean_terminated_length": 79.04000091552734,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.7496,
+      "grad_norm": 7.92829704284668,
+      "kl": 0.2811279296875,
+      "learning_rate": 1e-06,
+      "loss": -0.2142,
+      "num_tokens": 12226292.0,
+      "reward": 0.11500123143196106,
+      "reward_std": 0.030234824866056442,
+      "rewards/bleu_reward_func/mean": 0.11500123143196106,
+      "rewards/bleu_reward_func/std": 0.12273158878087997,
+      "step": 937
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 499.0,
+      "completions/mean_length": 234.875,
+      "completions/mean_terminated_length": 170.92308044433594,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.7504,
+      "grad_norm": 8.183011054992676,
+      "kl": 0.261383056640625,
+      "learning_rate": 1e-06,
+      "loss": 0.017,
+      "num_tokens": 12238216.0,
+      "reward": 0.39511436223983765,
+      "reward_std": 0.1106102392077446,
+      "rewards/bleu_reward_func/mean": 0.39511436223983765,
+      "rewards/bleu_reward_func/std": 0.3091021776199341,
+      "step": 938
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 487.0,
+      "completions/mean_length": 315.6875,
+      "completions/mean_terminated_length": 163.0,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.7512,
+      "grad_norm": 4.409646511077881,
+      "kl": 0.092132568359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0114,
+      "num_tokens": 12254990.0,
+      "reward": 0.1955508440732956,
+      "reward_std": 0.016137830913066864,
+      "rewards/bleu_reward_func/mean": 0.1955508440732956,
+      "rewards/bleu_reward_func/std": 0.26973703503608704,
+      "step": 939
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 424.0,
+      "completions/mean_length": 276.875,
+      "completions/mean_terminated_length": 69.4117660522461,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.752,
+      "grad_norm": 7.45650577545166,
+      "kl": 0.2735595703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0045,
+      "num_tokens": 12268242.0,
+      "reward": 0.07165145874023438,
+      "reward_std": 0.020489612594246864,
+      "rewards/bleu_reward_func/mean": 0.07165145874023438,
+      "rewards/bleu_reward_func/std": 0.04259462654590607,
+      "step": 940
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 482.0,
+      "completions/mean_length": 150.8125,
+      "completions/mean_terminated_length": 113.44827270507812,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.7528,
+      "grad_norm": 10.180867195129395,
+      "kl": 0.4688720703125,
+      "learning_rate": 1e-06,
+      "loss": 0.122,
+      "num_tokens": 12276916.0,
+      "reward": 0.15257704257965088,
+      "reward_std": 0.051439568400382996,
+      "rewards/bleu_reward_func/mean": 0.15257704257965088,
+      "rewards/bleu_reward_func/std": 0.11688338220119476,
+      "step": 941
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 270.65625,
+      "completions/mean_terminated_length": 176.21739196777344,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.7536,
+      "grad_norm": 5.519646167755127,
+      "kl": 0.07073974609375,
+      "learning_rate": 1e-06,
+      "loss": 0.437,
+      "num_tokens": 12291593.0,
+      "reward": 0.06806058436632156,
+      "reward_std": 0.05050808936357498,
+      "rewards/bleu_reward_func/mean": 0.06806058436632156,
+      "rewards/bleu_reward_func/std": 0.06130353361368179,
+      "step": 942
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 510.0,
+      "completions/mean_length": 405.625,
+      "completions/mean_terminated_length": 375.8399963378906,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "epoch": 0.7544,
+      "grad_norm": 1.8502905368804932,
+      "kl": 0.0356292724609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0076,
+      "num_tokens": 12310893.0,
+      "reward": 0.21372246742248535,
+      "reward_std": 0.0709368884563446,
+      "rewards/bleu_reward_func/mean": 0.21372246742248535,
+      "rewards/bleu_reward_func/std": 0.1763986349105835,
+      "step": 943
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 300.0,
+      "completions/mean_length": 245.625,
+      "completions/mean_terminated_length": 141.3913116455078,
+      "completions/min_length": 73.0,
+      "completions/min_terminated_length": 73.0,
+      "epoch": 0.7552,
+      "grad_norm": 3.3902478218078613,
+      "kl": 0.05377197265625,
+      "learning_rate": 1e-06,
+      "loss": 0.076,
+      "num_tokens": 12323369.0,
+      "reward": 0.0433628112077713,
+      "reward_std": 0.03261272981762886,
+      "rewards/bleu_reward_func/mean": 0.0433628112077713,
+      "rewards/bleu_reward_func/std": 0.0436432845890522,
+      "step": 944
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 405.0,
+      "completions/mean_length": 270.0625,
+      "completions/mean_terminated_length": 189.4166717529297,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.756,
+      "grad_norm": 5.770748615264893,
+      "kl": 0.088531494140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0047,
+      "num_tokens": 12333803.0,
+      "reward": 0.10272043943405151,
+      "reward_std": 0.03215545043349266,
+      "rewards/bleu_reward_func/mean": 0.10272043943405151,
+      "rewards/bleu_reward_func/std": 0.11694183200597763,
+      "step": 945
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 490.0,
+      "completions/mean_length": 224.25,
+      "completions/mean_terminated_length": 194.48275756835938,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.7568,
+      "grad_norm": 7.7613067626953125,
+      "kl": 0.20135498046875,
+      "learning_rate": 1e-06,
+      "loss": 0.1591,
+      "num_tokens": 12344915.0,
+      "reward": 0.08291373401880264,
+      "reward_std": 0.024335253983736038,
+      "rewards/bleu_reward_func/mean": 0.08291373401880264,
+      "rewards/bleu_reward_func/std": 0.03890189528465271,
+      "step": 946
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 472.0,
+      "completions/mean_length": 455.6875,
+      "completions/mean_terminated_length": 373.3846435546875,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 0.7576,
+      "grad_norm": 2.599454879760742,
+      "kl": 0.0421142578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0235,
+      "num_tokens": 12362161.0,
+      "reward": 0.03875226154923439,
+      "reward_std": 0.020147912204265594,
+      "rewards/bleu_reward_func/mean": 0.03875226154923439,
+      "rewards/bleu_reward_func/std": 0.023408547043800354,
+      "step": 947
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 426.0,
+      "completions/mean_length": 197.875,
+      "completions/mean_terminated_length": 125.3846206665039,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.7584,
+      "grad_norm": 6.522151947021484,
+      "kl": 0.1690673828125,
+      "learning_rate": 1e-06,
+      "loss": 0.1959,
+      "num_tokens": 12375709.0,
+      "reward": 0.2021377682685852,
+      "reward_std": 0.0921662300825119,
+      "rewards/bleu_reward_func/mean": 0.2021377682685852,
+      "rewards/bleu_reward_func/std": 0.28283461928367615,
+      "step": 948
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 410.0,
+      "completions/mean_length": 256.21875,
+      "completions/mean_terminated_length": 122.23809814453125,
+      "completions/min_length": 45.0,
+      "completions/min_terminated_length": 45.0,
+      "epoch": 0.7592,
+      "grad_norm": 5.671032905578613,
+      "kl": 0.1173095703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0303,
+      "num_tokens": 12385764.0,
+      "reward": 0.0564446821808815,
+      "reward_std": 0.02071106806397438,
+      "rewards/bleu_reward_func/mean": 0.0564446821808815,
+      "rewards/bleu_reward_func/std": 0.030088067054748535,
+      "step": 949
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 492.0,
+      "completions/mean_length": 255.59375,
+      "completions/mean_terminated_length": 229.0689697265625,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.76,
+      "grad_norm": 6.897347927093506,
+      "kl": 0.2220458984375,
+      "learning_rate": 1e-06,
+      "loss": 0.1234,
+      "num_tokens": 12396999.0,
+      "reward": 0.09963001310825348,
+      "reward_std": 0.05010713264346123,
+      "rewards/bleu_reward_func/mean": 0.09963001310825348,
+      "rewards/bleu_reward_func/std": 0.08052106946706772,
+      "step": 950
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 510.0,
+      "completions/mean_length": 456.5,
+      "completions/mean_terminated_length": 418.52630615234375,
+      "completions/min_length": 254.0,
+      "completions/min_terminated_length": 254.0,
+      "epoch": 0.7608,
+      "grad_norm": 2.444167137145996,
+      "kl": 0.04534912109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0097,
+      "num_tokens": 12418039.0,
+      "reward": 0.03447666019201279,
+      "reward_std": 0.01355208083987236,
+      "rewards/bleu_reward_func/mean": 0.03447666019201279,
+      "rewards/bleu_reward_func/std": 0.022434458136558533,
+      "step": 951
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 470.0,
+      "completions/mean_length": 279.5625,
+      "completions/mean_terminated_length": 120.52631378173828,
+      "completions/min_length": 28.0,
+      "completions/min_terminated_length": 28.0,
+      "epoch": 0.7616,
+      "grad_norm": 8.821101188659668,
+      "kl": 0.091888427734375,
+      "learning_rate": 1e-06,
+      "loss": 0.1929,
+      "num_tokens": 12431177.0,
+      "reward": 0.11506980657577515,
+      "reward_std": 0.033062804490327835,
+      "rewards/bleu_reward_func/mean": 0.11506980657577515,
+      "rewards/bleu_reward_func/std": 0.0943976491689682,
+      "step": 952
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 456.0,
+      "completions/mean_length": 269.78125,
+      "completions/mean_terminated_length": 201.95999145507812,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.7624,
+      "grad_norm": 6.004775524139404,
+      "kl": 0.07818603515625,
+      "learning_rate": 1e-06,
+      "loss": -0.1255,
+      "num_tokens": 12445970.0,
+      "reward": 0.09985020756721497,
+      "reward_std": 0.0198547150939703,
+      "rewards/bleu_reward_func/mean": 0.09985020756721497,
+      "rewards/bleu_reward_func/std": 0.08852815628051758,
+      "step": 953
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 478.0,
+      "completions/mean_length": 318.8125,
+      "completions/mean_terminated_length": 231.0,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.7632,
+      "grad_norm": 6.8956804275512695,
+      "kl": 0.175567626953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0679,
+      "num_tokens": 12458004.0,
+      "reward": 0.1692689061164856,
+      "reward_std": 0.03958010673522949,
+      "rewards/bleu_reward_func/mean": 0.1692689061164856,
+      "rewards/bleu_reward_func/std": 0.13855873048305511,
+      "step": 954
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 480.0,
+      "completions/mean_length": 250.9375,
+      "completions/mean_terminated_length": 148.78260803222656,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.764,
+      "grad_norm": 6.749716758728027,
+      "kl": 0.293182373046875,
+      "learning_rate": 1e-06,
+      "loss": 0.021,
+      "num_tokens": 12472626.0,
+      "reward": 0.1224028617143631,
+      "reward_std": 0.027801956981420517,
+      "rewards/bleu_reward_func/mean": 0.1224028617143631,
+      "rewards/bleu_reward_func/std": 0.07426659762859344,
+      "step": 955
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 493.0,
+      "completions/mean_length": 147.6875,
+      "completions/mean_terminated_length": 123.40000915527344,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.7648,
+      "grad_norm": 9.662991523742676,
+      "kl": 0.46905517578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0171,
+      "num_tokens": 12481928.0,
+      "reward": 0.23031684756278992,
+      "reward_std": 0.0920054167509079,
+      "rewards/bleu_reward_func/mean": 0.23031684756278992,
+      "rewards/bleu_reward_func/std": 0.16612249612808228,
+      "step": 956
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 478.0,
+      "completions/mean_length": 258.65625,
+      "completions/mean_terminated_length": 106.6500015258789,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.7656,
+      "grad_norm": 10.25383472442627,
+      "kl": 0.2713623046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0254,
+      "num_tokens": 12493805.0,
+      "reward": 0.15187731385231018,
+      "reward_std": 0.025371436029672623,
+      "rewards/bleu_reward_func/mean": 0.15187731385231018,
+      "rewards/bleu_reward_func/std": 0.12905065715312958,
+      "step": 957
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 140.0,
+      "completions/mean_length": 85.40625,
+      "completions/mean_terminated_length": 71.64515686035156,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.7664,
+      "grad_norm": 8.22080135345459,
+      "kl": 0.393310546875,
+      "learning_rate": 1e-06,
+      "loss": 0.3247,
+      "num_tokens": 12502818.0,
+      "reward": 0.29921823740005493,
+      "reward_std": 0.11694261431694031,
+      "rewards/bleu_reward_func/mean": 0.29921823740005493,
+      "rewards/bleu_reward_func/std": 0.25639036297798157,
+      "step": 958
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 481.0,
+      "completions/mean_length": 386.09375,
+      "completions/mean_terminated_length": 260.1875,
+      "completions/min_length": 49.0,
+      "completions/min_terminated_length": 49.0,
+      "epoch": 0.7672,
+      "grad_norm": 2.847195863723755,
+      "kl": 0.0467529296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0852,
+      "num_tokens": 12520317.0,
+      "reward": 0.039544593542814255,
+      "reward_std": 0.016648683696985245,
+      "rewards/bleu_reward_func/mean": 0.039544593542814255,
+      "rewards/bleu_reward_func/std": 0.034897446632385254,
+      "step": 959
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 452.0,
+      "completions/max_terminated_length": 452.0,
+      "completions/mean_length": 111.4375,
+      "completions/mean_terminated_length": 111.4375,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.768,
+      "grad_norm": 8.353513717651367,
+      "kl": 0.22430419921875,
+      "learning_rate": 1e-06,
+      "loss": 0.2469,
+      "num_tokens": 12530027.0,
+      "reward": 0.26215416193008423,
+      "reward_std": 0.032358862459659576,
+      "rewards/bleu_reward_func/mean": 0.26215416193008423,
+      "rewards/bleu_reward_func/std": 0.22925570607185364,
+      "step": 960
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 456.6875,
+      "completions/mean_terminated_length": 315.3333435058594,
+      "completions/min_length": 41.0,
+      "completions/min_terminated_length": 41.0,
+      "epoch": 0.7688,
+      "grad_norm": 2.2755558490753174,
+      "kl": 0.033355712890625,
+      "learning_rate": 1e-06,
+      "loss": 0.1802,
+      "num_tokens": 12548593.0,
+      "reward": 0.02548890933394432,
+      "reward_std": 0.01250866986811161,
+      "rewards/bleu_reward_func/mean": 0.02548890933394432,
+      "rewards/bleu_reward_func/std": 0.0143959391862154,
+      "step": 961
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 480.0,
+      "completions/mean_length": 359.28125,
+      "completions/mean_terminated_length": 206.5625,
+      "completions/min_length": 27.0,
+      "completions/min_terminated_length": 27.0,
+      "epoch": 0.7696,
+      "grad_norm": 5.7301130294799805,
+      "kl": 0.165802001953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0633,
+      "num_tokens": 12566850.0,
+      "reward": 0.09463217854499817,
+      "reward_std": 0.021320462226867676,
+      "rewards/bleu_reward_func/mean": 0.09463217854499817,
+      "rewards/bleu_reward_func/std": 0.10299301147460938,
+      "step": 962
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 388.125,
+      "completions/mean_terminated_length": 313.8000183105469,
+      "completions/min_length": 48.0,
+      "completions/min_terminated_length": 48.0,
+      "epoch": 0.7704,
+      "grad_norm": 2.5917232036590576,
+      "kl": 0.07171630859375,
+      "learning_rate": 1e-06,
+      "loss": 0.1643,
+      "num_tokens": 12581438.0,
+      "reward": 0.06743638217449188,
+      "reward_std": 0.041416820138692856,
+      "rewards/bleu_reward_func/mean": 0.06743638217449188,
+      "rewards/bleu_reward_func/std": 0.0745474174618721,
+      "step": 963
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 173.53125,
+      "completions/mean_terminated_length": 162.61289978027344,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.7712,
+      "grad_norm": 5.022093772888184,
+      "kl": 0.110015869140625,
+      "learning_rate": 1e-06,
+      "loss": 0.1948,
+      "num_tokens": 12593087.0,
+      "reward": 0.14474597573280334,
+      "reward_std": 0.039374105632305145,
+      "rewards/bleu_reward_func/mean": 0.14474597573280334,
+      "rewards/bleu_reward_func/std": 0.0781283900141716,
+      "step": 964
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 345.0,
+      "completions/max_terminated_length": 345.0,
+      "completions/mean_length": 129.53125,
+      "completions/mean_terminated_length": 129.53125,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.772,
+      "grad_norm": 5.708756446838379,
+      "kl": 0.185546875,
+      "learning_rate": 1e-06,
+      "loss": -0.112,
+      "num_tokens": 12604360.0,
+      "reward": 0.17942924797534943,
+      "reward_std": 0.04769964888691902,
+      "rewards/bleu_reward_func/mean": 0.17942924797534943,
+      "rewards/bleu_reward_func/std": 0.20441435277462006,
+      "step": 965
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 459.0,
+      "completions/mean_length": 143.15625,
+      "completions/mean_terminated_length": 39.87999725341797,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.7728,
+      "grad_norm": 10.111825942993164,
+      "kl": 0.39947509765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0058,
+      "num_tokens": 12613941.0,
+      "reward": 0.09816907346248627,
+      "reward_std": 0.009084422141313553,
+      "rewards/bleu_reward_func/mean": 0.09816907346248627,
+      "rewards/bleu_reward_func/std": 0.06435907632112503,
+      "step": 966
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 481.0,
+      "completions/mean_length": 348.875,
+      "completions/mean_terminated_length": 110.46154022216797,
+      "completions/min_length": 27.0,
+      "completions/min_terminated_length": 27.0,
+      "epoch": 0.7736,
+      "grad_norm": 4.259431838989258,
+      "kl": 0.144195556640625,
+      "learning_rate": 1e-06,
+      "loss": 0.1364,
+      "num_tokens": 12628633.0,
+      "reward": 0.11497487127780914,
+      "reward_std": 0.0397370383143425,
+      "rewards/bleu_reward_func/mean": 0.11497487127780914,
+      "rewards/bleu_reward_func/std": 0.08479689061641693,
+      "step": 967
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 363.0,
+      "completions/mean_length": 319.125,
+      "completions/mean_terminated_length": 100.53334045410156,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "epoch": 0.7744,
+      "grad_norm": 5.853184700012207,
+      "kl": 0.2406005859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0855,
+      "num_tokens": 12642293.0,
+      "reward": 0.1797194480895996,
+      "reward_std": 0.06623274832963943,
+      "rewards/bleu_reward_func/mean": 0.1797194480895996,
+      "rewards/bleu_reward_func/std": 0.20514002442359924,
+      "step": 968
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 276.65625,
+      "completions/mean_terminated_length": 198.20834350585938,
+      "completions/min_length": 28.0,
+      "completions/min_terminated_length": 28.0,
+      "epoch": 0.7752,
+      "grad_norm": 3.861785650253296,
+      "kl": 0.048126220703125,
+      "learning_rate": 1e-06,
+      "loss": 0.3173,
+      "num_tokens": 12653674.0,
+      "reward": 0.06044634059071541,
+      "reward_std": 0.02236868627369404,
+      "rewards/bleu_reward_func/mean": 0.06044634059071541,
+      "rewards/bleu_reward_func/std": 0.058306269347667694,
+      "step": 969
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 364.4375,
+      "completions/mean_terminated_length": 287.1428527832031,
+      "completions/min_length": 29.0,
+      "completions/min_terminated_length": 29.0,
+      "epoch": 0.776,
+      "grad_norm": 2.8329427242279053,
+      "kl": 0.037445068359375,
+      "learning_rate": 1e-06,
+      "loss": 0.1258,
+      "num_tokens": 12668128.0,
+      "reward": 0.11927121132612228,
+      "reward_std": 0.0374884158372879,
+      "rewards/bleu_reward_func/mean": 0.11927121132612228,
+      "rewards/bleu_reward_func/std": 0.10864724963903427,
+      "step": 970
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 247.3125,
+      "completions/mean_terminated_length": 198.29629516601562,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.7768,
+      "grad_norm": 14.522860527038574,
+      "kl": 0.3580322265625,
+      "learning_rate": 1e-06,
+      "loss": 0.327,
+      "num_tokens": 12680266.0,
+      "reward": 0.11901617795228958,
+      "reward_std": 0.06829790771007538,
+      "rewards/bleu_reward_func/mean": 0.11901617795228958,
+      "rewards/bleu_reward_func/std": 0.0924401804804802,
+      "step": 971
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 384.0,
+      "completions/mean_length": 270.71875,
+      "completions/mean_terminated_length": 83.05555725097656,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.7776,
+      "grad_norm": 5.036475658416748,
+      "kl": 0.187713623046875,
+      "learning_rate": 1e-06,
+      "loss": 0.1832,
+      "num_tokens": 12692713.0,
+      "reward": 0.13872118294239044,
+      "reward_std": 0.0687481164932251,
+      "rewards/bleu_reward_func/mean": 0.13872118294239044,
+      "rewards/bleu_reward_func/std": 0.14044460654258728,
+      "step": 972
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 385.0,
+      "completions/mean_length": 244.90625,
+      "completions/mean_terminated_length": 155.875,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.7784,
+      "grad_norm": 7.775391578674316,
+      "kl": 0.136444091796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0436,
+      "num_tokens": 12706702.0,
+      "reward": 0.11104710400104523,
+      "reward_std": 0.03642675280570984,
+      "rewards/bleu_reward_func/mean": 0.11104710400104523,
+      "rewards/bleu_reward_func/std": 0.09262983500957489,
+      "step": 973
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 293.96875,
+      "completions/mean_terminated_length": 194.8636474609375,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.7792,
+      "grad_norm": 36.50672149658203,
+      "kl": 1.153228759765625,
+      "learning_rate": 1e-06,
+      "loss": 0.1761,
+      "num_tokens": 12721461.0,
+      "reward": 0.2013707458972931,
+      "reward_std": 0.10103052109479904,
+      "rewards/bleu_reward_func/mean": 0.2013707458972931,
+      "rewards/bleu_reward_func/std": 0.16323500871658325,
+      "step": 974
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 500.0,
+      "completions/mean_length": 164.5,
+      "completions/mean_terminated_length": 128.55172729492188,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.78,
+      "grad_norm": 9.472455024719238,
+      "kl": 0.2548828125,
+      "learning_rate": 1e-06,
+      "loss": 0.6259,
+      "num_tokens": 12733317.0,
+      "reward": 0.07841520756483078,
+      "reward_std": 0.029445767402648926,
+      "rewards/bleu_reward_func/mean": 0.07841520756483078,
+      "rewards/bleu_reward_func/std": 0.0620783306658268,
+      "step": 975
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 480.0,
+      "completions/mean_length": 247.09375,
+      "completions/mean_terminated_length": 229.433349609375,
+      "completions/min_length": 27.0,
+      "completions/min_terminated_length": 27.0,
+      "epoch": 0.7808,
+      "grad_norm": 4.399056911468506,
+      "kl": 0.10150146484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0484,
+      "num_tokens": 12746712.0,
+      "reward": 0.20041930675506592,
+      "reward_std": 0.03039298765361309,
+      "rewards/bleu_reward_func/mean": 0.20041930675506592,
+      "rewards/bleu_reward_func/std": 0.2551174759864807,
+      "step": 976
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 462.0,
+      "completions/mean_length": 382.375,
+      "completions/mean_terminated_length": 293.6842041015625,
+      "completions/min_length": 74.0,
+      "completions/min_terminated_length": 74.0,
+      "epoch": 0.7816,
+      "grad_norm": 2.5359840393066406,
+      "kl": 0.05230712890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0699,
+      "num_tokens": 12762020.0,
+      "reward": 0.047146447002887726,
+      "reward_std": 0.022996241226792336,
+      "rewards/bleu_reward_func/mean": 0.047146447002887726,
+      "rewards/bleu_reward_func/std": 0.04002131521701813,
+      "step": 977
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 486.0,
+      "completions/mean_length": 329.90625,
+      "completions/mean_terminated_length": 234.52381896972656,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.7824,
+      "grad_norm": 6.270121097564697,
+      "kl": 0.1246337890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0807,
+      "num_tokens": 12778233.0,
+      "reward": 0.1260184496641159,
+      "reward_std": 0.029545176774263382,
+      "rewards/bleu_reward_func/mean": 0.1260184496641159,
+      "rewards/bleu_reward_func/std": 0.12758195400238037,
+      "step": 978
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 315.0,
+      "completions/mean_length": 163.6875,
+      "completions/mean_terminated_length": 127.6551742553711,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.7832,
+      "grad_norm": 11.418375015258789,
+      "kl": 0.1888427734375,
+      "learning_rate": 1e-06,
+      "loss": 0.4229,
+      "num_tokens": 12788943.0,
+      "reward": 0.22336477041244507,
+      "reward_std": 0.0984843298792839,
+      "rewards/bleu_reward_func/mean": 0.22336477041244507,
+      "rewards/bleu_reward_func/std": 0.1921825110912323,
+      "step": 979
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 479.0,
+      "completions/mean_length": 141.6875,
+      "completions/mean_terminated_length": 117.00000762939453,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.784,
+      "grad_norm": 7.894710540771484,
+      "kl": 0.4378662109375,
+      "learning_rate": 1e-06,
+      "loss": 0.2326,
+      "num_tokens": 12797693.0,
+      "reward": 0.1561349630355835,
+      "reward_std": 0.05494026839733124,
+      "rewards/bleu_reward_func/mean": 0.1561349630355835,
+      "rewards/bleu_reward_func/std": 0.09167517721652985,
+      "step": 980
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 492.0,
+      "completions/mean_length": 255.34375,
+      "completions/mean_terminated_length": 169.7916717529297,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "epoch": 0.7848,
+      "grad_norm": 19.572107315063477,
+      "kl": 0.288604736328125,
+      "learning_rate": 1e-06,
+      "loss": 0.151,
+      "num_tokens": 12812768.0,
+      "reward": 0.06323938816785812,
+      "reward_std": 0.017744949087500572,
+      "rewards/bleu_reward_func/mean": 0.06323938816785812,
+      "rewards/bleu_reward_func/std": 0.07885830849409103,
+      "step": 981
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 443.0,
+      "completions/mean_length": 281.65625,
+      "completions/mean_terminated_length": 176.9545440673828,
+      "completions/min_length": 6.0,
+      "completions/min_terminated_length": 6.0,
+      "epoch": 0.7856,
+      "grad_norm": 8.172053337097168,
+      "kl": 0.24444580078125,
+      "learning_rate": 1e-06,
+      "loss": 0.21,
+      "num_tokens": 12829453.0,
+      "reward": 0.0720784068107605,
+      "reward_std": 0.03868547081947327,
+      "rewards/bleu_reward_func/mean": 0.0720784068107605,
+      "rewards/bleu_reward_func/std": 0.05159585550427437,
+      "step": 982
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 474.0,
+      "completions/mean_length": 235.84375,
+      "completions/mean_terminated_length": 110.31818389892578,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.7864,
+      "grad_norm": 8.912215232849121,
+      "kl": 0.32025146484375,
+      "learning_rate": 1e-06,
+      "loss": 0.12,
+      "num_tokens": 12843376.0,
+      "reward": 0.1997315138578415,
+      "reward_std": 0.030267415568232536,
+      "rewards/bleu_reward_func/mean": 0.1997315138578415,
+      "rewards/bleu_reward_func/std": 0.17783835530281067,
+      "step": 983
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 449.0,
+      "completions/mean_length": 378.0,
+      "completions/mean_terminated_length": 317.0909118652344,
+      "completions/min_length": 86.0,
+      "completions/min_terminated_length": 86.0,
+      "epoch": 0.7872,
+      "grad_norm": 2.6129956245422363,
+      "kl": 0.036346435546875,
+      "learning_rate": 1e-06,
+      "loss": -0.1504,
+      "num_tokens": 12859368.0,
+      "reward": 0.07119783759117126,
+      "reward_std": 0.018479108810424805,
+      "rewards/bleu_reward_func/mean": 0.07119783759117126,
+      "rewards/bleu_reward_func/std": 0.06165986508131027,
+      "step": 984
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 434.0,
+      "completions/mean_length": 220.03125,
+      "completions/mean_terminated_length": 152.6538543701172,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.788,
+      "grad_norm": 5.786254405975342,
+      "kl": 0.15057373046875,
+      "learning_rate": 1e-06,
+      "loss": -0.1709,
+      "num_tokens": 12869665.0,
+      "reward": 0.14459165930747986,
+      "reward_std": 0.03573929890990257,
+      "rewards/bleu_reward_func/mean": 0.14459165930747986,
+      "rewards/bleu_reward_func/std": 0.13286592066287994,
+      "step": 985
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 499.0,
+      "completions/mean_length": 357.125,
+      "completions/mean_terminated_length": 220.47059631347656,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.7888,
+      "grad_norm": 8.607572555541992,
+      "kl": 0.24078369140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0188,
+      "num_tokens": 12885501.0,
+      "reward": 0.09036614745855331,
+      "reward_std": 0.031877610832452774,
+      "rewards/bleu_reward_func/mean": 0.09036614745855331,
+      "rewards/bleu_reward_func/std": 0.05137631297111511,
+      "step": 986
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 381.75,
+      "completions/mean_terminated_length": 280.4444580078125,
+      "completions/min_length": 33.0,
+      "completions/min_terminated_length": 33.0,
+      "epoch": 0.7896,
+      "grad_norm": 2.421383857727051,
+      "kl": 0.046905517578125,
+      "learning_rate": 1e-06,
+      "loss": 0.1548,
+      "num_tokens": 12899517.0,
+      "reward": 0.06479343771934509,
+      "reward_std": 0.01870723068714142,
+      "rewards/bleu_reward_func/mean": 0.06479343771934509,
+      "rewards/bleu_reward_func/std": 0.039773859083652496,
+      "step": 987
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 501.0,
+      "completions/mean_length": 272.3125,
+      "completions/mean_terminated_length": 238.07144165039062,
+      "completions/min_length": 38.0,
+      "completions/min_terminated_length": 38.0,
+      "epoch": 0.7904,
+      "grad_norm": 3.447153329849243,
+      "kl": 0.06671142578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0649,
+      "num_tokens": 12910175.0,
+      "reward": 0.0866774171590805,
+      "reward_std": 0.07288840413093567,
+      "rewards/bleu_reward_func/mean": 0.0866774171590805,
+      "rewards/bleu_reward_func/std": 0.10417941212654114,
+      "step": 988
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 499.0,
+      "completions/mean_length": 156.625,
+      "completions/mean_terminated_length": 74.61538696289062,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.7912,
+      "grad_norm": 11.11733341217041,
+      "kl": 0.34686279296875,
+      "learning_rate": 1e-06,
+      "loss": 0.4111,
+      "num_tokens": 12920075.0,
+      "reward": 0.18872088193893433,
+      "reward_std": 0.05310884118080139,
+      "rewards/bleu_reward_func/mean": 0.18872088193893433,
+      "rewards/bleu_reward_func/std": 0.10052233934402466,
+      "step": 989
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 443.0,
+      "completions/mean_length": 251.15625,
+      "completions/mean_terminated_length": 94.6500015258789,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.792,
+      "grad_norm": 4.343538761138916,
+      "kl": 0.0811767578125,
+      "learning_rate": 1e-06,
+      "loss": 0.255,
+      "num_tokens": 12932672.0,
+      "reward": 0.2421235740184784,
+      "reward_std": 0.03650471195578575,
+      "rewards/bleu_reward_func/mean": 0.2421235740184784,
+      "rewards/bleu_reward_func/std": 0.35968947410583496,
+      "step": 990
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 365.34375,
+      "completions/mean_terminated_length": 307.9565124511719,
+      "completions/min_length": 80.0,
+      "completions/min_terminated_length": 80.0,
+      "epoch": 0.7928,
+      "grad_norm": 4.527655601501465,
+      "kl": 0.109619140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0782,
+      "num_tokens": 12947603.0,
+      "reward": 0.05564543977379799,
+      "reward_std": 0.033515315502882004,
+      "rewards/bleu_reward_func/mean": 0.05564543977379799,
+      "rewards/bleu_reward_func/std": 0.03913462907075882,
+      "step": 991
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 489.0,
+      "completions/mean_length": 128.46875,
+      "completions/mean_terminated_length": 57.4444465637207,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.7936,
+      "grad_norm": 9.973651885986328,
+      "kl": 0.1922607421875,
+      "learning_rate": 1e-06,
+      "loss": 0.9373,
+      "num_tokens": 12958538.0,
+      "reward": 0.3186902403831482,
+      "reward_std": 0.10882419347763062,
+      "rewards/bleu_reward_func/mean": 0.3186902403831482,
+      "rewards/bleu_reward_func/std": 0.2608534097671509,
+      "step": 992
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 484.0,
+      "completions/mean_length": 328.375,
+      "completions/mean_terminated_length": 202.73684692382812,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.7944,
+      "grad_norm": 7.580811023712158,
+      "kl": 0.290863037109375,
+      "learning_rate": 1e-06,
+      "loss": 0.1547,
+      "num_tokens": 12975334.0,
+      "reward": 0.16742870211601257,
+      "reward_std": 0.03473435714840889,
+      "rewards/bleu_reward_func/mean": 0.16742870211601257,
+      "rewards/bleu_reward_func/std": 0.1612749844789505,
+      "step": 993
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 491.0,
+      "completions/mean_length": 119.6875,
+      "completions/mean_terminated_length": 47.03703689575195,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.7952,
+      "grad_norm": 20.384323120117188,
+      "kl": 0.74737548828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0224,
+      "num_tokens": 12988020.0,
+      "reward": 0.1509585976600647,
+      "reward_std": 0.0387745276093483,
+      "rewards/bleu_reward_func/mean": 0.1509585976600647,
+      "rewards/bleu_reward_func/std": 0.13122804462909698,
+      "step": 994
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 499.0,
+      "completions/mean_length": 346.84375,
+      "completions/mean_terminated_length": 181.6875,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.796,
+      "grad_norm": 4.161476135253906,
+      "kl": 0.05682373046875,
+      "learning_rate": 1e-06,
+      "loss": -0.1853,
+      "num_tokens": 13003199.0,
+      "reward": 0.026108039543032646,
+      "reward_std": 0.02537854015827179,
+      "rewards/bleu_reward_func/mean": 0.026108039543032646,
+      "rewards/bleu_reward_func/std": 0.03443064168095589,
+      "step": 995
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 239.0,
+      "completions/mean_length": 181.0625,
+      "completions/mean_terminated_length": 70.75,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.7968,
+      "grad_norm": 14.441998481750488,
+      "kl": 0.575714111328125,
+      "learning_rate": 1e-06,
+      "loss": 0.1517,
+      "num_tokens": 13012889.0,
+      "reward": 0.10783781111240387,
+      "reward_std": 0.053533561527729034,
+      "rewards/bleu_reward_func/mean": 0.10783781111240387,
+      "rewards/bleu_reward_func/std": 0.09023794531822205,
+      "step": 996
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 444.0,
+      "completions/mean_length": 371.75,
+      "completions/mean_terminated_length": 166.7692413330078,
+      "completions/min_length": 28.0,
+      "completions/min_terminated_length": 28.0,
+      "epoch": 0.7976,
+      "grad_norm": 3.627946615219116,
+      "kl": 0.089813232421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0463,
+      "num_tokens": 13027321.0,
+      "reward": 0.07810983061790466,
+      "reward_std": 0.03820539265871048,
+      "rewards/bleu_reward_func/mean": 0.07810983061790466,
+      "rewards/bleu_reward_func/std": 0.07255319505929947,
+      "step": 997
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 410.0,
+      "completions/mean_length": 299.96875,
+      "completions/mean_terminated_length": 203.59091186523438,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.7984,
+      "grad_norm": 5.938675403594971,
+      "kl": 0.163330078125,
+      "learning_rate": 1e-06,
+      "loss": 0.1322,
+      "num_tokens": 13038888.0,
+      "reward": 0.0998745784163475,
+      "reward_std": 0.12165166437625885,
+      "rewards/bleu_reward_func/mean": 0.0998745784163475,
+      "rewards/bleu_reward_func/std": 0.2023635059595108,
+      "step": 998
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 299.25,
+      "completions/mean_terminated_length": 171.60000610351562,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.7992,
+      "grad_norm": 5.83504581451416,
+      "kl": 0.288848876953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0509,
+      "num_tokens": 13051904.0,
+      "reward": 0.10337799787521362,
+      "reward_std": 0.029087794944643974,
+      "rewards/bleu_reward_func/mean": 0.10337799787521362,
+      "rewards/bleu_reward_func/std": 0.07911896705627441,
+      "step": 999
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 492.0,
+      "completions/mean_length": 345.96875,
+      "completions/mean_terminated_length": 322.25,
+      "completions/min_length": 91.0,
+      "completions/min_terminated_length": 91.0,
+      "epoch": 0.8,
+      "grad_norm": 2.916576385498047,
+      "kl": 0.078857421875,
+      "learning_rate": 1e-06,
+      "loss": 0.05,
+      "num_tokens": 13065431.0,
+      "reward": 0.05868455022573471,
+      "reward_std": 0.017369702458381653,
+      "rewards/bleu_reward_func/mean": 0.05868455022573471,
+      "rewards/bleu_reward_func/std": 0.04672805219888687,
+      "step": 1000
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 461.0,
+      "completions/mean_length": 266.8125,
+      "completions/mean_terminated_length": 250.4666748046875,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "epoch": 0.8008,
+      "grad_norm": 5.116244316101074,
+      "kl": 0.20050048828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0918,
+      "num_tokens": 13079225.0,
+      "reward": 0.13771796226501465,
+      "reward_std": 0.04302237555384636,
+      "rewards/bleu_reward_func/mean": 0.13771796226501465,
+      "rewards/bleu_reward_func/std": 0.10432249307632446,
+      "step": 1001
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 316.1875,
+      "completions/mean_terminated_length": 239.56521606445312,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.8016,
+      "grad_norm": 3.050908088684082,
+      "kl": 0.0596923828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0323,
+      "num_tokens": 13093775.0,
+      "reward": 0.07833529263734818,
+      "reward_std": 0.02821630984544754,
+      "rewards/bleu_reward_func/mean": 0.07833529263734818,
+      "rewards/bleu_reward_func/std": 0.06890382617712021,
+      "step": 1002
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 380.0,
+      "completions/mean_length": 180.0,
+      "completions/mean_terminated_length": 132.57144165039062,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.8024,
+      "grad_norm": 6.102024078369141,
+      "kl": 0.28985595703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0137,
+      "num_tokens": 13104447.0,
+      "reward": 0.27597150206565857,
+      "reward_std": 0.04547630250453949,
+      "rewards/bleu_reward_func/mean": 0.27597150206565857,
+      "rewards/bleu_reward_func/std": 0.2288428395986557,
+      "step": 1003
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 476.0,
+      "completions/mean_length": 391.25,
+      "completions/mean_terminated_length": 254.40000915527344,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.8032,
+      "grad_norm": 3.2652981281280518,
+      "kl": 0.084625244140625,
+      "learning_rate": 1e-06,
+      "loss": 0.1886,
+      "num_tokens": 13119655.0,
+      "reward": 0.13283666968345642,
+      "reward_std": 0.029899559915065765,
+      "rewards/bleu_reward_func/mean": 0.13283666968345642,
+      "rewards/bleu_reward_func/std": 0.1536840796470642,
+      "step": 1004
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 456.0,
+      "completions/mean_length": 203.8125,
+      "completions/mean_terminated_length": 183.2666778564453,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.804,
+      "grad_norm": 6.098532199859619,
+      "kl": 0.1204833984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0735,
+      "num_tokens": 13128825.0,
+      "reward": 0.21581590175628662,
+      "reward_std": 0.10097475349903107,
+      "rewards/bleu_reward_func/mean": 0.21581590175628662,
+      "rewards/bleu_reward_func/std": 0.2611050307750702,
+      "step": 1005
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 471.0,
+      "completions/mean_length": 358.1875,
+      "completions/mean_terminated_length": 238.55555725097656,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.8048,
+      "grad_norm": 7.441494464874268,
+      "kl": 0.191802978515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0313,
+      "num_tokens": 13142175.0,
+      "reward": 0.20881031453609467,
+      "reward_std": 0.03906787186861038,
+      "rewards/bleu_reward_func/mean": 0.20881031453609467,
+      "rewards/bleu_reward_func/std": 0.17651565372943878,
+      "step": 1006
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 307.65625,
+      "completions/mean_terminated_length": 260.5,
+      "completions/min_length": 32.0,
+      "completions/min_terminated_length": 32.0,
+      "epoch": 0.8056,
+      "grad_norm": 3.559936761856079,
+      "kl": 0.0572509765625,
+      "learning_rate": 1e-06,
+      "loss": 0.1028,
+      "num_tokens": 13153852.0,
+      "reward": 0.04283101111650467,
+      "reward_std": 0.04057364910840988,
+      "rewards/bleu_reward_func/mean": 0.04283101111650467,
+      "rewards/bleu_reward_func/std": 0.059128936380147934,
+      "step": 1007
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 497.0,
+      "completions/mean_length": 336.875,
+      "completions/mean_terminated_length": 245.1428680419922,
+      "completions/min_length": 118.0,
+      "completions/min_terminated_length": 118.0,
+      "epoch": 0.8064,
+      "grad_norm": 4.735692024230957,
+      "kl": 0.069091796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0972,
+      "num_tokens": 13171048.0,
+      "reward": 0.09638670086860657,
+      "reward_std": 0.019614677876234055,
+      "rewards/bleu_reward_func/mean": 0.09638670086860657,
+      "rewards/bleu_reward_func/std": 0.09023009240627289,
+      "step": 1008
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 481.0,
+      "completions/mean_length": 329.59375,
+      "completions/mean_terminated_length": 204.7894744873047,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "epoch": 0.8072,
+      "grad_norm": 7.47179651260376,
+      "kl": 0.1318359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0877,
+      "num_tokens": 13187195.0,
+      "reward": 0.036128196865320206,
+      "reward_std": 0.020984536036849022,
+      "rewards/bleu_reward_func/mean": 0.036128196865320206,
+      "rewards/bleu_reward_func/std": 0.038413140922784805,
+      "step": 1009
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 252.0,
+      "completions/max_terminated_length": 252.0,
+      "completions/mean_length": 53.5625,
+      "completions/mean_terminated_length": 53.5625,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.808,
+      "grad_norm": 10.647790908813477,
+      "kl": 0.482666015625,
+      "learning_rate": 1e-06,
+      "loss": -0.052,
+      "num_tokens": 13197629.0,
+      "reward": 0.34467193484306335,
+      "reward_std": 0.09173881262540817,
+      "rewards/bleu_reward_func/mean": 0.34467193484306335,
+      "rewards/bleu_reward_func/std": 0.23519103229045868,
+      "step": 1010
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 481.0,
+      "completions/mean_length": 191.6875,
+      "completions/mean_terminated_length": 181.35482788085938,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.8088,
+      "grad_norm": 6.100401878356934,
+      "kl": 0.141632080078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0525,
+      "num_tokens": 13210699.0,
+      "reward": 0.23702389001846313,
+      "reward_std": 0.03205852955579758,
+      "rewards/bleu_reward_func/mean": 0.23702389001846313,
+      "rewards/bleu_reward_func/std": 0.1315021812915802,
+      "step": 1011
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 374.4375,
+      "completions/mean_terminated_length": 218.53334045410156,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.8096,
+      "grad_norm": 8.772051811218262,
+      "kl": 0.103546142578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0866,
+      "num_tokens": 13228945.0,
+      "reward": 0.13881856203079224,
+      "reward_std": 0.044034168124198914,
+      "rewards/bleu_reward_func/mean": 0.13881856203079224,
+      "rewards/bleu_reward_func/std": 0.0626484826207161,
+      "step": 1012
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 479.0,
+      "completions/mean_length": 147.96875,
+      "completions/mean_terminated_length": 95.96428680419922,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.8104,
+      "grad_norm": 14.525429725646973,
+      "kl": 0.4837646484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0162,
+      "num_tokens": 13242400.0,
+      "reward": 0.20639193058013916,
+      "reward_std": 0.05956702679395676,
+      "rewards/bleu_reward_func/mean": 0.20639193058013916,
+      "rewards/bleu_reward_func/std": 0.12604379653930664,
+      "step": 1013
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 510.0,
+      "completions/max_terminated_length": 510.0,
+      "completions/mean_length": 149.90625,
+      "completions/mean_terminated_length": 149.90625,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "epoch": 0.8112,
+      "grad_norm": 22.3626766204834,
+      "kl": 0.2891845703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0612,
+      "num_tokens": 13252157.0,
+      "reward": 0.24893034994602203,
+      "reward_std": 0.03380701690912247,
+      "rewards/bleu_reward_func/mean": 0.24893034994602203,
+      "rewards/bleu_reward_func/std": 0.20013003051280975,
+      "step": 1014
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 28.0,
+      "completions/mean_length": 266.6875,
+      "completions/mean_terminated_length": 21.375,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.812,
+      "grad_norm": 7.276104927062988,
+      "kl": 0.26300048828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0225,
+      "num_tokens": 13263955.0,
+      "reward": 0.14417850971221924,
+      "reward_std": 0.037887826561927795,
+      "rewards/bleu_reward_func/mean": 0.14417850971221924,
+      "rewards/bleu_reward_func/std": 0.1605955958366394,
+      "step": 1015
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 470.0,
+      "completions/mean_length": 226.28125,
+      "completions/mean_terminated_length": 146.27999877929688,
+      "completions/min_length": 51.0,
+      "completions/min_terminated_length": 51.0,
+      "epoch": 0.8128,
+      "grad_norm": 5.462210655212402,
+      "kl": 0.13653564453125,
+      "learning_rate": 1e-06,
+      "loss": 0.1195,
+      "num_tokens": 13273692.0,
+      "reward": 0.08337672054767609,
+      "reward_std": 0.02062853053212166,
+      "rewards/bleu_reward_func/mean": 0.08337672054767609,
+      "rewards/bleu_reward_func/std": 0.048102062195539474,
+      "step": 1016
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 502.0,
+      "completions/mean_length": 354.0625,
+      "completions/mean_terminated_length": 271.3333435058594,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "epoch": 0.8136,
+      "grad_norm": 5.125792980194092,
+      "kl": 0.22271728515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0213,
+      "num_tokens": 13289478.0,
+      "reward": 0.19775965809822083,
+      "reward_std": 0.04682963341474533,
+      "rewards/bleu_reward_func/mean": 0.19775965809822083,
+      "rewards/bleu_reward_func/std": 0.22582097351551056,
+      "step": 1017
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 492.0,
+      "completions/mean_length": 183.65625,
+      "completions/mean_terminated_length": 107.8846206665039,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.8144,
+      "grad_norm": 7.784894943237305,
+      "kl": 0.38018798828125,
+      "learning_rate": 1e-06,
+      "loss": -0.2707,
+      "num_tokens": 13299747.0,
+      "reward": 0.06794524192810059,
+      "reward_std": 0.039994340389966965,
+      "rewards/bleu_reward_func/mean": 0.06794524192810059,
+      "rewards/bleu_reward_func/std": 0.0657891035079956,
+      "step": 1018
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 476.0,
+      "completions/mean_length": 232.6875,
+      "completions/mean_terminated_length": 203.79310607910156,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.8152,
+      "grad_norm": 6.757852077484131,
+      "kl": 0.23583984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0289,
+      "num_tokens": 13308865.0,
+      "reward": 0.1327855885028839,
+      "reward_std": 0.043607283383607864,
+      "rewards/bleu_reward_func/mean": 0.1327855885028839,
+      "rewards/bleu_reward_func/std": 0.1713796854019165,
+      "step": 1019
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 483.0,
+      "completions/mean_length": 116.03125,
+      "completions/mean_terminated_length": 75.06896209716797,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.816,
+      "grad_norm": 6.1786370277404785,
+      "kl": 0.21380615234375,
+      "learning_rate": 1e-06,
+      "loss": 0.1686,
+      "num_tokens": 13318914.0,
+      "reward": 0.23704446852207184,
+      "reward_std": 0.057613980025053024,
+      "rewards/bleu_reward_func/mean": 0.23704446852207184,
+      "rewards/bleu_reward_func/std": 0.21550458669662476,
+      "step": 1020
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 188.34375,
+      "completions/mean_terminated_length": 166.7666778564453,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.8168,
+      "grad_norm": 7.113982200622559,
+      "kl": 0.258514404296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0406,
+      "num_tokens": 13329205.0,
+      "reward": 0.08146457374095917,
+      "reward_std": 0.02083425223827362,
+      "rewards/bleu_reward_func/mean": 0.08146457374095917,
+      "rewards/bleu_reward_func/std": 0.0736912190914154,
+      "step": 1021
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 500.0,
+      "completions/mean_length": 248.1875,
+      "completions/mean_terminated_length": 144.95652770996094,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.8176,
+      "grad_norm": 10.716026306152344,
+      "kl": 0.3465576171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0356,
+      "num_tokens": 13345867.0,
+      "reward": 0.1497185379266739,
+      "reward_std": 0.016201931983232498,
+      "rewards/bleu_reward_func/mean": 0.1497185379266739,
+      "rewards/bleu_reward_func/std": 0.17363472282886505,
+      "step": 1022
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 466.0,
+      "completions/mean_length": 306.59375,
+      "completions/mean_terminated_length": 238.125,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.8184,
+      "grad_norm": 5.428062915802002,
+      "kl": 0.152679443359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0033,
+      "num_tokens": 13360854.0,
+      "reward": 0.1825982928276062,
+      "reward_std": 0.057225678116083145,
+      "rewards/bleu_reward_func/mean": 0.1825982928276062,
+      "rewards/bleu_reward_func/std": 0.1867101639509201,
+      "step": 1023
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 504.0,
+      "completions/max_terminated_length": 504.0,
+      "completions/mean_length": 103.6875,
+      "completions/mean_terminated_length": 103.6875,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.8192,
+      "grad_norm": 9.348381042480469,
+      "kl": 0.17828369140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0549,
+      "num_tokens": 13369980.0,
+      "reward": 0.11240973323583603,
+      "reward_std": 0.026126563549041748,
+      "rewards/bleu_reward_func/mean": 0.11240973323583603,
+      "rewards/bleu_reward_func/std": 0.11270570009946823,
+      "step": 1024
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 492.0,
+      "completions/mean_length": 225.875,
+      "completions/mean_terminated_length": 159.84616088867188,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.82,
+      "grad_norm": 7.571503162384033,
+      "kl": 0.237548828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0009,
+      "num_tokens": 13384584.0,
+      "reward": 0.2324497401714325,
+      "reward_std": 0.0470973402261734,
+      "rewards/bleu_reward_func/mean": 0.2324497401714325,
+      "rewards/bleu_reward_func/std": 0.1243894025683403,
+      "step": 1025
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 139.34375,
+      "completions/mean_terminated_length": 114.50000762939453,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.8208,
+      "grad_norm": 8.227386474609375,
+      "kl": 0.25714111328125,
+      "learning_rate": 1e-06,
+      "loss": 0.1704,
+      "num_tokens": 13391771.0,
+      "reward": 0.12480157613754272,
+      "reward_std": 0.04330623894929886,
+      "rewards/bleu_reward_func/mean": 0.12480157613754272,
+      "rewards/bleu_reward_func/std": 0.103439562022686,
+      "step": 1026
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 497.0,
+      "completions/mean_length": 214.3125,
+      "completions/mean_terminated_length": 130.95999145507812,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.8216,
+      "grad_norm": 5.542022228240967,
+      "kl": 0.24603271484375,
+      "learning_rate": 1e-06,
+      "loss": -0.1391,
+      "num_tokens": 13403757.0,
+      "reward": 0.25766974687576294,
+      "reward_std": 0.03755660355091095,
+      "rewards/bleu_reward_func/mean": 0.25766974687576294,
+      "rewards/bleu_reward_func/std": 0.22421182692050934,
+      "step": 1027
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 459.53125,
+      "completions/mean_terminated_length": 413.23529052734375,
+      "completions/min_length": 77.0,
+      "completions/min_terminated_length": 77.0,
+      "epoch": 0.8224,
+      "grad_norm": 2.1804089546203613,
+      "kl": 0.04327392578125,
+      "learning_rate": 1e-06,
+      "loss": -0.1423,
+      "num_tokens": 13421078.0,
+      "reward": 0.07269357144832611,
+      "reward_std": 0.02826325222849846,
+      "rewards/bleu_reward_func/mean": 0.07269357144832611,
+      "rewards/bleu_reward_func/std": 0.034365471452474594,
+      "step": 1028
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 496.0,
+      "completions/mean_length": 233.90625,
+      "completions/mean_terminated_length": 194.17857360839844,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.8232,
+      "grad_norm": 5.306719779968262,
+      "kl": 0.1806640625,
+      "learning_rate": 1e-06,
+      "loss": -0.4029,
+      "num_tokens": 13432323.0,
+      "reward": 0.18320006132125854,
+      "reward_std": 0.08323986828327179,
+      "rewards/bleu_reward_func/mean": 0.18320006132125854,
+      "rewards/bleu_reward_func/std": 0.2284490317106247,
+      "step": 1029
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 258.4375,
+      "completions/mean_terminated_length": 187.44000244140625,
+      "completions/min_length": 27.0,
+      "completions/min_terminated_length": 27.0,
+      "epoch": 0.824,
+      "grad_norm": 3.841357707977295,
+      "kl": 0.070404052734375,
+      "learning_rate": 1e-06,
+      "loss": 0.1459,
+      "num_tokens": 13444169.0,
+      "reward": 0.09387044608592987,
+      "reward_std": 0.07637906074523926,
+      "rewards/bleu_reward_func/mean": 0.09387044608592987,
+      "rewards/bleu_reward_func/std": 0.10294011980295181,
+      "step": 1030
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 121.59375,
+      "completions/mean_terminated_length": 109.0,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.8248,
+      "grad_norm": 10.732538223266602,
+      "kl": 0.598388671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0236,
+      "num_tokens": 13450364.0,
+      "reward": 0.18597961962223053,
+      "reward_std": 0.03610639274120331,
+      "rewards/bleu_reward_func/mean": 0.18597961962223053,
+      "rewards/bleu_reward_func/std": 0.14241203665733337,
+      "step": 1031
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 174.15625,
+      "completions/mean_terminated_length": 151.6333465576172,
+      "completions/min_length": 35.0,
+      "completions/min_terminated_length": 35.0,
+      "epoch": 0.8256,
+      "grad_norm": 5.849789619445801,
+      "kl": 0.2237548828125,
+      "learning_rate": 1e-06,
+      "loss": -0.1461,
+      "num_tokens": 13458489.0,
+      "reward": 0.10662397742271423,
+      "reward_std": 0.044935449957847595,
+      "rewards/bleu_reward_func/mean": 0.10662397742271423,
+      "rewards/bleu_reward_func/std": 0.08882930874824524,
+      "step": 1032
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 472.0,
+      "completions/mean_length": 365.75,
+      "completions/mean_terminated_length": 278.0,
+      "completions/min_length": 96.0,
+      "completions/min_terminated_length": 96.0,
+      "epoch": 0.8264,
+      "grad_norm": 2.3378233909606934,
+      "kl": 0.040679931640625,
+      "learning_rate": 1e-06,
+      "loss": 0.018,
+      "num_tokens": 13477497.0,
+      "reward": 0.12139745056629181,
+      "reward_std": 0.030839571729302406,
+      "rewards/bleu_reward_func/mean": 0.12139745056629181,
+      "rewards/bleu_reward_func/std": 0.087521493434906,
+      "step": 1033
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 307.8125,
+      "completions/mean_terminated_length": 260.69232177734375,
+      "completions/min_length": 34.0,
+      "completions/min_terminated_length": 34.0,
+      "epoch": 0.8272,
+      "grad_norm": 5.579449653625488,
+      "kl": 0.14544677734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0832,
+      "num_tokens": 13491595.0,
+      "reward": 0.06439976394176483,
+      "reward_std": 0.01632755994796753,
+      "rewards/bleu_reward_func/mean": 0.06439976394176483,
+      "rewards/bleu_reward_func/std": 0.025089839473366737,
+      "step": 1034
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 227.90625,
+      "completions/mean_terminated_length": 162.34616088867188,
+      "completions/min_length": 32.0,
+      "completions/min_terminated_length": 32.0,
+      "epoch": 0.828,
+      "grad_norm": 6.95845890045166,
+      "kl": 0.08489990234375,
+      "learning_rate": 1e-06,
+      "loss": 0.1636,
+      "num_tokens": 13502304.0,
+      "reward": 0.15672987699508667,
+      "reward_std": 0.07095484435558319,
+      "rewards/bleu_reward_func/mean": 0.15672987699508667,
+      "rewards/bleu_reward_func/std": 0.1326054334640503,
+      "step": 1035
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 162.59375,
+      "completions/mean_terminated_length": 139.3000030517578,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.8288,
+      "grad_norm": 7.602659702301025,
+      "kl": 0.2142333984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0952,
+      "num_tokens": 13512379.0,
+      "reward": 0.10166750848293304,
+      "reward_std": 0.022390395402908325,
+      "rewards/bleu_reward_func/mean": 0.10166750848293304,
+      "rewards/bleu_reward_func/std": 0.09791414439678192,
+      "step": 1036
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 482.0,
+      "completions/mean_length": 352.84375,
+      "completions/mean_terminated_length": 257.3500061035156,
+      "completions/min_length": 88.0,
+      "completions/min_terminated_length": 88.0,
+      "epoch": 0.8296,
+      "grad_norm": 3.4908225536346436,
+      "kl": 0.0816650390625,
+      "learning_rate": 1e-06,
+      "loss": 0.1513,
+      "num_tokens": 13526246.0,
+      "reward": 0.10761404037475586,
+      "reward_std": 0.02660614624619484,
+      "rewards/bleu_reward_func/mean": 0.10761404037475586,
+      "rewards/bleu_reward_func/std": 0.08269859850406647,
+      "step": 1037
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 154.03125,
+      "completions/mean_terminated_length": 87.74073791503906,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.8304,
+      "grad_norm": 8.316597938537598,
+      "kl": 0.3048095703125,
+      "learning_rate": 1e-06,
+      "loss": 0.108,
+      "num_tokens": 13538935.0,
+      "reward": 0.14819365739822388,
+      "reward_std": 0.07058853656053543,
+      "rewards/bleu_reward_func/mean": 0.14819365739822388,
+      "rewards/bleu_reward_func/std": 0.1550559103488922,
+      "step": 1038
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 485.0,
+      "completions/mean_length": 237.75,
+      "completions/mean_terminated_length": 209.37930297851562,
+      "completions/min_length": 31.0,
+      "completions/min_terminated_length": 31.0,
+      "epoch": 0.8312,
+      "grad_norm": 4.724348545074463,
+      "kl": 0.1529541015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0635,
+      "num_tokens": 13551415.0,
+      "reward": 0.16776956617832184,
+      "reward_std": 0.026334762573242188,
+      "rewards/bleu_reward_func/mean": 0.16776956617832184,
+      "rewards/bleu_reward_func/std": 0.18577900528907776,
+      "step": 1039
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 402.0,
+      "completions/max_terminated_length": 402.0,
+      "completions/mean_length": 93.375,
+      "completions/mean_terminated_length": 93.375,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.832,
+      "grad_norm": 8.702837944030762,
+      "kl": 0.33355712890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0333,
+      "num_tokens": 13560747.0,
+      "reward": 0.1746351718902588,
+      "reward_std": 0.039413660764694214,
+      "rewards/bleu_reward_func/mean": 0.1746351718902588,
+      "rewards/bleu_reward_func/std": 0.13439369201660156,
+      "step": 1040
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 297.0,
+      "completions/mean_length": 177.90625,
+      "completions/mean_terminated_length": 66.54167175292969,
+      "completions/min_length": 5.0,
+      "completions/min_terminated_length": 5.0,
+      "epoch": 0.8328,
+      "grad_norm": 14.271418571472168,
+      "kl": 0.298431396484375,
+      "learning_rate": 1e-06,
+      "loss": -0.356,
+      "num_tokens": 13572328.0,
+      "reward": 0.0881040021777153,
+      "reward_std": 0.0392255075275898,
+      "rewards/bleu_reward_func/mean": 0.0881040021777153,
+      "rewards/bleu_reward_func/std": 0.086721271276474,
+      "step": 1041
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 243.34375,
+      "completions/mean_terminated_length": 138.21739196777344,
+      "completions/min_length": 37.0,
+      "completions/min_terminated_length": 37.0,
+      "epoch": 0.8336,
+      "grad_norm": 3.9084980487823486,
+      "kl": 0.07525634765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0813,
+      "num_tokens": 13586859.0,
+      "reward": 0.06463417410850525,
+      "reward_std": 0.022750139236450195,
+      "rewards/bleu_reward_func/mean": 0.06463417410850525,
+      "rewards/bleu_reward_func/std": 0.05624645948410034,
+      "step": 1042
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 328.96875,
+      "completions/mean_terminated_length": 267.9583435058594,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.8344,
+      "grad_norm": 9.608210563659668,
+      "kl": 0.2337646484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0092,
+      "num_tokens": 13600978.0,
+      "reward": 0.06980200856924057,
+      "reward_std": 0.015845034271478653,
+      "rewards/bleu_reward_func/mean": 0.06980200856924057,
+      "rewards/bleu_reward_func/std": 0.03303433954715729,
+      "step": 1043
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 442.0,
+      "completions/mean_length": 305.8125,
+      "completions/mean_terminated_length": 145.44444274902344,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.8352,
+      "grad_norm": 4.128615379333496,
+      "kl": 0.06011962890625,
+      "learning_rate": 1e-06,
+      "loss": 0.1566,
+      "num_tokens": 13613068.0,
+      "reward": 0.04747869074344635,
+      "reward_std": 0.013655820861458778,
+      "rewards/bleu_reward_func/mean": 0.04747869074344635,
+      "rewards/bleu_reward_func/std": 0.028707411140203476,
+      "step": 1044
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 426.53125,
+      "completions/mean_terminated_length": 368.0526428222656,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "epoch": 0.836,
+      "grad_norm": 2.515371799468994,
+      "kl": 0.043975830078125,
+      "learning_rate": 1e-06,
+      "loss": -0.065,
+      "num_tokens": 13629173.0,
+      "reward": 0.028738608583807945,
+      "reward_std": 0.012511001899838448,
+      "rewards/bleu_reward_func/mean": 0.028738608583807945,
+      "rewards/bleu_reward_func/std": 0.014564147219061852,
+      "step": 1045
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 206.03125,
+      "completions/mean_terminated_length": 135.42308044433594,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.8368,
+      "grad_norm": 8.021496772766113,
+      "kl": 0.273529052734375,
+      "learning_rate": 1e-06,
+      "loss": 0.1291,
+      "num_tokens": 13637374.0,
+      "reward": 0.11877701431512833,
+      "reward_std": 0.04857534170150757,
+      "rewards/bleu_reward_func/mean": 0.11877701431512833,
+      "rewards/bleu_reward_func/std": 0.08409105986356735,
+      "step": 1046
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 471.0,
+      "completions/mean_length": 251.65625,
+      "completions/mean_terminated_length": 214.46429443359375,
+      "completions/min_length": 45.0,
+      "completions/min_terminated_length": 45.0,
+      "epoch": 0.8376,
+      "grad_norm": 3.2490315437316895,
+      "kl": 0.06829833984375,
+      "learning_rate": 1e-06,
+      "loss": -0.1358,
+      "num_tokens": 13648067.0,
+      "reward": 0.08158313482999802,
+      "reward_std": 0.02561478689312935,
+      "rewards/bleu_reward_func/mean": 0.08158313482999802,
+      "rewards/bleu_reward_func/std": 0.05671805888414383,
+      "step": 1047
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 495.0,
+      "completions/mean_length": 269.125,
+      "completions/mean_terminated_length": 201.1199951171875,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.8384,
+      "grad_norm": 5.032691955566406,
+      "kl": 0.1785888671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0164,
+      "num_tokens": 13660719.0,
+      "reward": 0.18114086985588074,
+      "reward_std": 0.03815930336713791,
+      "rewards/bleu_reward_func/mean": 0.18114086985588074,
+      "rewards/bleu_reward_func/std": 0.14998804032802582,
+      "step": 1048
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 355.0,
+      "completions/mean_length": 87.25,
+      "completions/mean_terminated_length": 73.54838562011719,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.8392,
+      "grad_norm": 8.731785774230957,
+      "kl": 0.384765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0594,
+      "num_tokens": 13670767.0,
+      "reward": 0.23021195828914642,
+      "reward_std": 0.09217022359371185,
+      "rewards/bleu_reward_func/mean": 0.23021195828914642,
+      "rewards/bleu_reward_func/std": 0.18223723769187927,
+      "step": 1049
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 454.0,
+      "completions/mean_length": 269.25,
+      "completions/mean_terminated_length": 80.44444274902344,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.84,
+      "grad_norm": 8.531782150268555,
+      "kl": 0.29345703125,
+      "learning_rate": 1e-06,
+      "loss": 0.2277,
+      "num_tokens": 13685927.0,
+      "reward": 0.13349372148513794,
+      "reward_std": 0.053998030722141266,
+      "rewards/bleu_reward_func/mean": 0.13349372148513794,
+      "rewards/bleu_reward_func/std": 0.15102460980415344,
+      "step": 1050
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 500.0,
+      "completions/mean_length": 375.4375,
+      "completions/mean_terminated_length": 238.875,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.8408,
+      "grad_norm": 7.057468414306641,
+      "kl": 0.099639892578125,
+      "learning_rate": 1e-06,
+      "loss": 0.1437,
+      "num_tokens": 13702781.0,
+      "reward": 0.02950356900691986,
+      "reward_std": 0.010849589481949806,
+      "rewards/bleu_reward_func/mean": 0.02950356900691986,
+      "rewards/bleu_reward_func/std": 0.02092377282679081,
+      "step": 1051
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 487.0,
+      "completions/mean_length": 237.84375,
+      "completions/mean_terminated_length": 187.07408142089844,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.8416,
+      "grad_norm": 8.259405136108398,
+      "kl": 0.57330322265625,
+      "learning_rate": 1e-06,
+      "loss": 0.1046,
+      "num_tokens": 13713560.0,
+      "reward": 0.1828644871711731,
+      "reward_std": 0.04976918175816536,
+      "rewards/bleu_reward_func/mean": 0.1828644871711731,
+      "rewards/bleu_reward_func/std": 0.13261918723583221,
+      "step": 1052
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 488.0,
+      "completions/mean_length": 154.09375,
+      "completions/mean_terminated_length": 87.81481170654297,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.8424,
+      "grad_norm": 9.288348197937012,
+      "kl": 0.424560546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0227,
+      "num_tokens": 13720387.0,
+      "reward": 0.1263371855020523,
+      "reward_std": 0.031269170343875885,
+      "rewards/bleu_reward_func/mean": 0.1263371855020523,
+      "rewards/bleu_reward_func/std": 0.1025131419301033,
+      "step": 1053
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 94.0,
+      "completions/mean_length": 160.15625,
+      "completions/mean_terminated_length": 42.875,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.8432,
+      "grad_norm": 7.418272972106934,
+      "kl": 0.32086181640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0571,
+      "num_tokens": 13731528.0,
+      "reward": 0.2602638304233551,
+      "reward_std": 0.07646072655916214,
+      "rewards/bleu_reward_func/mean": 0.2602638304233551,
+      "rewards/bleu_reward_func/std": 0.2308470755815506,
+      "step": 1054
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 453.0,
+      "completions/mean_length": 183.46875,
+      "completions/mean_terminated_length": 122.62963104248047,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "epoch": 0.844,
+      "grad_norm": 8.014601707458496,
+      "kl": 0.3927001953125,
+      "learning_rate": 1e-06,
+      "loss": 0.1042,
+      "num_tokens": 13742447.0,
+      "reward": 0.07197493314743042,
+      "reward_std": 0.012622429989278316,
+      "rewards/bleu_reward_func/mean": 0.07197493314743042,
+      "rewards/bleu_reward_func/std": 0.04327724501490593,
+      "step": 1055
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 492.0,
+      "completions/mean_length": 240.59375,
+      "completions/mean_terminated_length": 222.50001525878906,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.8448,
+      "grad_norm": 5.102974891662598,
+      "kl": 0.1640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0185,
+      "num_tokens": 13755202.0,
+      "reward": 0.08292236924171448,
+      "reward_std": 0.023967744782567024,
+      "rewards/bleu_reward_func/mean": 0.08292236924171448,
+      "rewards/bleu_reward_func/std": 0.046691060066223145,
+      "step": 1056
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 379.0,
+      "completions/mean_length": 133.0625,
+      "completions/mean_terminated_length": 45.615386962890625,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.8456,
+      "grad_norm": 12.413713455200195,
+      "kl": 0.4334716796875,
+      "learning_rate": 1e-06,
+      "loss": 0.1467,
+      "num_tokens": 13763268.0,
+      "reward": 0.09796961396932602,
+      "reward_std": 0.02291642501950264,
+      "rewards/bleu_reward_func/mean": 0.09796961396932602,
+      "rewards/bleu_reward_func/std": 0.04426925256848335,
+      "step": 1057
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 489.0,
+      "completions/mean_length": 155.71875,
+      "completions/mean_terminated_length": 89.74073791503906,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.8464,
+      "grad_norm": 8.978612899780273,
+      "kl": 0.228515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0356,
+      "num_tokens": 13771987.0,
+      "reward": 0.10166356712579727,
+      "reward_std": 0.055922288447618484,
+      "rewards/bleu_reward_func/mean": 0.10166356712579727,
+      "rewards/bleu_reward_func/std": 0.07498722523450851,
+      "step": 1058
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 510.0,
+      "completions/mean_length": 408.84375,
+      "completions/mean_terminated_length": 276.21429443359375,
+      "completions/min_length": 31.0,
+      "completions/min_terminated_length": 31.0,
+      "epoch": 0.8472,
+      "grad_norm": 1.850431203842163,
+      "kl": 0.046905517578125,
+      "learning_rate": 1e-06,
+      "loss": 0.1476,
+      "num_tokens": 13791446.0,
+      "reward": 0.07564342021942139,
+      "reward_std": 0.015303988009691238,
+      "rewards/bleu_reward_func/mean": 0.07564342021942139,
+      "rewards/bleu_reward_func/std": 0.09736621379852295,
+      "step": 1059
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 493.0,
+      "completions/mean_length": 285.71875,
+      "completions/mean_terminated_length": 197.17391967773438,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "epoch": 0.848,
+      "grad_norm": 4.632063388824463,
+      "kl": 0.21221923828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0453,
+      "num_tokens": 13803221.0,
+      "reward": 0.12073882669210434,
+      "reward_std": 0.03981417790055275,
+      "rewards/bleu_reward_func/mean": 0.12073882669210434,
+      "rewards/bleu_reward_func/std": 0.07346338778734207,
+      "step": 1060
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 457.0,
+      "completions/mean_length": 329.1875,
+      "completions/mean_terminated_length": 246.09091186523438,
+      "completions/min_length": 34.0,
+      "completions/min_terminated_length": 34.0,
+      "epoch": 0.8488,
+      "grad_norm": 4.053706645965576,
+      "kl": 0.078857421875,
+      "learning_rate": 1e-06,
+      "loss": 0.2911,
+      "num_tokens": 13818331.0,
+      "reward": 0.06476722657680511,
+      "reward_std": 0.030476348474621773,
+      "rewards/bleu_reward_func/mean": 0.06476722657680511,
+      "rewards/bleu_reward_func/std": 0.06862985342741013,
+      "step": 1061
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 430.0,
+      "completions/mean_length": 156.5625,
+      "completions/mean_terminated_length": 57.03999710083008,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.8496,
+      "grad_norm": 8.067891120910645,
+      "kl": 0.1793212890625,
+      "learning_rate": 1e-06,
+      "loss": 0.195,
+      "num_tokens": 13826957.0,
+      "reward": 0.20933812856674194,
+      "reward_std": 0.051397278904914856,
+      "rewards/bleu_reward_func/mean": 0.20933812856674194,
+      "rewards/bleu_reward_func/std": 0.2901296019554138,
+      "step": 1062
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 468.0,
+      "completions/mean_length": 227.09375,
+      "completions/mean_terminated_length": 115.60869598388672,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.8504,
+      "grad_norm": 7.58860445022583,
+      "kl": 0.165283203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0274,
+      "num_tokens": 13838440.0,
+      "reward": 0.11480045318603516,
+      "reward_std": 0.027013186365365982,
+      "rewards/bleu_reward_func/mean": 0.11480045318603516,
+      "rewards/bleu_reward_func/std": 0.11019645631313324,
+      "step": 1063
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 474.0,
+      "completions/mean_length": 189.1875,
+      "completions/mean_terminated_length": 114.69231414794922,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.8512,
+      "grad_norm": 5.411109447479248,
+      "kl": 0.1951904296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0807,
+      "num_tokens": 13849406.0,
+      "reward": 0.1512412428855896,
+      "reward_std": 0.062053047120571136,
+      "rewards/bleu_reward_func/mean": 0.1512412428855896,
+      "rewards/bleu_reward_func/std": 0.12469635158777237,
+      "step": 1064
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 487.0,
+      "completions/mean_length": 259.5625,
+      "completions/mean_terminated_length": 127.33333587646484,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.852,
+      "grad_norm": 6.397778034210205,
+      "kl": 0.24896240234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0087,
+      "num_tokens": 13864056.0,
+      "reward": 0.1552116870880127,
+      "reward_std": 0.027821514755487442,
+      "rewards/bleu_reward_func/mean": 0.1552116870880127,
+      "rewards/bleu_reward_func/std": 0.10892557352781296,
+      "step": 1065
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 412.0,
+      "completions/max_terminated_length": 412.0,
+      "completions/mean_length": 139.8125,
+      "completions/mean_terminated_length": 139.8125,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "epoch": 0.8528,
+      "grad_norm": 8.31270694732666,
+      "kl": 0.20196533203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0519,
+      "num_tokens": 13871490.0,
+      "reward": 0.13230201601982117,
+      "reward_std": 0.04370046779513359,
+      "rewards/bleu_reward_func/mean": 0.13230201601982117,
+      "rewards/bleu_reward_func/std": 0.08659063279628754,
+      "step": 1066
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 487.0,
+      "completions/mean_length": 169.0625,
+      "completions/mean_terminated_length": 73.04000091552734,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.8536,
+      "grad_norm": 6.236849784851074,
+      "kl": 0.39727783203125,
+      "learning_rate": 1e-06,
+      "loss": 0.1366,
+      "num_tokens": 13885556.0,
+      "reward": 0.17620697617530823,
+      "reward_std": 0.024748487398028374,
+      "rewards/bleu_reward_func/mean": 0.17620697617530823,
+      "rewards/bleu_reward_func/std": 0.10503184050321579,
+      "step": 1067
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 228.75,
+      "completions/mean_terminated_length": 176.29629516601562,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.8544,
+      "grad_norm": 6.749774932861328,
+      "kl": 0.181121826171875,
+      "learning_rate": 1e-06,
+      "loss": -0.1016,
+      "num_tokens": 13894804.0,
+      "reward": 0.17489100992679596,
+      "reward_std": 0.042406514286994934,
+      "rewards/bleu_reward_func/mean": 0.17489100992679596,
+      "rewards/bleu_reward_func/std": 0.14329132437705994,
+      "step": 1068
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 475.0,
+      "completions/mean_length": 246.6875,
+      "completions/mean_terminated_length": 126.09091186523438,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.8552,
+      "grad_norm": 6.8404459953308105,
+      "kl": 0.31402587890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0134,
+      "num_tokens": 13907514.0,
+      "reward": 0.16271845996379852,
+      "reward_std": 0.04602063074707985,
+      "rewards/bleu_reward_func/mean": 0.16271845996379852,
+      "rewards/bleu_reward_func/std": 0.12579885125160217,
+      "step": 1069
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 490.0,
+      "completions/mean_length": 269.78125,
+      "completions/mean_terminated_length": 124.45000457763672,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.856,
+      "grad_norm": 4.438868522644043,
+      "kl": 0.177520751953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0265,
+      "num_tokens": 13919603.0,
+      "reward": 0.13932910561561584,
+      "reward_std": 0.01856398582458496,
+      "rewards/bleu_reward_func/mean": 0.13932910561561584,
+      "rewards/bleu_reward_func/std": 0.14700213074684143,
+      "step": 1070
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 149.9375,
+      "completions/mean_terminated_length": 82.8888931274414,
+      "completions/min_length": 29.0,
+      "completions/min_terminated_length": 29.0,
+      "epoch": 0.8568,
+      "grad_norm": 7.284147262573242,
+      "kl": 0.3516845703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0398,
+      "num_tokens": 13927953.0,
+      "reward": 0.2614789605140686,
+      "reward_std": 0.07057315111160278,
+      "rewards/bleu_reward_func/mean": 0.2614789605140686,
+      "rewards/bleu_reward_func/std": 0.1882997453212738,
+      "step": 1071
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 243.78125,
+      "completions/mean_terminated_length": 168.67999267578125,
+      "completions/min_length": 35.0,
+      "completions/min_terminated_length": 35.0,
+      "epoch": 0.8576,
+      "grad_norm": 5.096418380737305,
+      "kl": 0.16357421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0371,
+      "num_tokens": 13942570.0,
+      "reward": 0.07684318721294403,
+      "reward_std": 0.019258558750152588,
+      "rewards/bleu_reward_func/mean": 0.07684318721294403,
+      "rewards/bleu_reward_func/std": 0.03753623366355896,
+      "step": 1072
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 433.0,
+      "completions/mean_length": 271.625,
+      "completions/mean_terminated_length": 107.15789794921875,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.8584,
+      "grad_norm": 3.9537734985351562,
+      "kl": 0.081634521484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0181,
+      "num_tokens": 13956350.0,
+      "reward": 0.1140797883272171,
+      "reward_std": 0.023730140179395676,
+      "rewards/bleu_reward_func/mean": 0.1140797883272171,
+      "rewards/bleu_reward_func/std": 0.1426122635602951,
+      "step": 1073
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 461.0,
+      "completions/mean_length": 323.4375,
+      "completions/mean_terminated_length": 237.72727966308594,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.8592,
+      "grad_norm": 4.239878177642822,
+      "kl": 0.10797119140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0029,
+      "num_tokens": 13971620.0,
+      "reward": 0.10953356325626373,
+      "reward_std": 0.07727043330669403,
+      "rewards/bleu_reward_func/mean": 0.10953356325626373,
+      "rewards/bleu_reward_func/std": 0.1143500879406929,
+      "step": 1074
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 495.0,
+      "completions/mean_length": 447.5625,
+      "completions/mean_terminated_length": 383.125,
+      "completions/min_length": 73.0,
+      "completions/min_terminated_length": 73.0,
+      "epoch": 0.86,
+      "grad_norm": 2.2002503871917725,
+      "kl": 0.049072265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0472,
+      "num_tokens": 13990150.0,
+      "reward": 0.08343654125928879,
+      "reward_std": 0.02118324115872383,
+      "rewards/bleu_reward_func/mean": 0.08343654125928879,
+      "rewards/bleu_reward_func/std": 0.08093992620706558,
+      "step": 1075
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 503.0,
+      "completions/mean_length": 472.125,
+      "completions/mean_terminated_length": 420.8571472167969,
+      "completions/min_length": 280.0,
+      "completions/min_terminated_length": 280.0,
+      "epoch": 0.8608,
+      "grad_norm": 2.0650570392608643,
+      "kl": 0.032440185546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0088,
+      "num_tokens": 14011170.0,
+      "reward": 0.04162130132317543,
+      "reward_std": 0.010478474199771881,
+      "rewards/bleu_reward_func/mean": 0.04162130132317543,
+      "rewards/bleu_reward_func/std": 0.017952080816030502,
+      "step": 1076
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 421.0,
+      "completions/mean_length": 242.78125,
+      "completions/mean_terminated_length": 58.578948974609375,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.8616,
+      "grad_norm": 4.682583808898926,
+      "kl": 0.316986083984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0201,
+      "num_tokens": 14025299.0,
+      "reward": 0.2864699065685272,
+      "reward_std": 0.03820549696683884,
+      "rewards/bleu_reward_func/mean": 0.2864699065685272,
+      "rewards/bleu_reward_func/std": 0.26346415281295776,
+      "step": 1077
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 122.0,
+      "completions/mean_length": 165.625,
+      "completions/mean_terminated_length": 50.16666793823242,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.8624,
+      "grad_norm": 7.724006652832031,
+      "kl": 0.23388671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0182,
+      "num_tokens": 14037399.0,
+      "reward": 0.1795015037059784,
+      "reward_std": 0.07820923626422882,
+      "rewards/bleu_reward_func/mean": 0.1795015037059784,
+      "rewards/bleu_reward_func/std": 0.15337687730789185,
+      "step": 1078
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 172.0,
+      "completions/mean_length": 276.09375,
+      "completions/mean_terminated_length": 40.1875,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.8632,
+      "grad_norm": 8.850776672363281,
+      "kl": 0.24285888671875,
+      "learning_rate": 1e-06,
+      "loss": -0.1453,
+      "num_tokens": 14052562.0,
+      "reward": 0.09584256261587143,
+      "reward_std": 0.01827467978000641,
+      "rewards/bleu_reward_func/mean": 0.09584256261587143,
+      "rewards/bleu_reward_func/std": 0.10066576302051544,
+      "step": 1079
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 174.0,
+      "completions/mean_length": 203.1875,
+      "completions/mean_terminated_length": 82.34782409667969,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.864,
+      "grad_norm": 4.735725402832031,
+      "kl": 0.0994873046875,
+      "learning_rate": 1e-06,
+      "loss": 0.2218,
+      "num_tokens": 14061568.0,
+      "reward": 0.27030178904533386,
+      "reward_std": 0.057654060423374176,
+      "rewards/bleu_reward_func/mean": 0.27030178904533386,
+      "rewards/bleu_reward_func/std": 0.16439270973205566,
+      "step": 1080
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 392.0,
+      "completions/mean_length": 237.875,
+      "completions/mean_terminated_length": 174.61538696289062,
+      "completions/min_length": 31.0,
+      "completions/min_terminated_length": 31.0,
+      "epoch": 0.8648,
+      "grad_norm": 4.542383193969727,
+      "kl": 0.0601806640625,
+      "learning_rate": 1e-06,
+      "loss": -0.1576,
+      "num_tokens": 14071236.0,
+      "reward": 0.05943232774734497,
+      "reward_std": 0.03797609731554985,
+      "rewards/bleu_reward_func/mean": 0.05943232774734497,
+      "rewards/bleu_reward_func/std": 0.07494883239269257,
+      "step": 1081
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 495.0,
+      "completions/mean_length": 214.625,
+      "completions/mean_terminated_length": 146.0,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.8656,
+      "grad_norm": 9.019255638122559,
+      "kl": 0.340179443359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0456,
+      "num_tokens": 14080680.0,
+      "reward": 0.09385368227958679,
+      "reward_std": 0.041810497641563416,
+      "rewards/bleu_reward_func/mean": 0.09385368227958679,
+      "rewards/bleu_reward_func/std": 0.04523298144340515,
+      "step": 1082
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 451.0,
+      "completions/mean_length": 111.3125,
+      "completions/mean_terminated_length": 54.07143020629883,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "epoch": 0.8664,
+      "grad_norm": 5.487109184265137,
+      "kl": 0.3203125,
+      "learning_rate": 1e-06,
+      "loss": 0.195,
+      "num_tokens": 14093178.0,
+      "reward": 0.2254057228565216,
+      "reward_std": 0.0354473814368248,
+      "rewards/bleu_reward_func/mean": 0.2254057228565216,
+      "rewards/bleu_reward_func/std": 0.15529486536979675,
+      "step": 1083
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 473.03125,
+      "completions/mean_terminated_length": 408.0833435058594,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "epoch": 0.8672,
+      "grad_norm": 2.1760456562042236,
+      "kl": 0.05029296875,
+      "learning_rate": 1e-06,
+      "loss": -0.1127,
+      "num_tokens": 14112195.0,
+      "reward": 0.11573594808578491,
+      "reward_std": 0.034562353044748306,
+      "rewards/bleu_reward_func/mean": 0.11573594808578491,
+      "rewards/bleu_reward_func/std": 0.0883888527750969,
+      "step": 1084
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 471.0,
+      "completions/mean_length": 248.8125,
+      "completions/mean_terminated_length": 110.95238494873047,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "epoch": 0.868,
+      "grad_norm": 20.237722396850586,
+      "kl": 0.27996826171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0279,
+      "num_tokens": 14125485.0,
+      "reward": 0.06478704512119293,
+      "reward_std": 0.01746372878551483,
+      "rewards/bleu_reward_func/mean": 0.06478704512119293,
+      "rewards/bleu_reward_func/std": 0.04760226234793663,
+      "step": 1085
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 491.0,
+      "completions/mean_length": 254.21875,
+      "completions/mean_terminated_length": 194.73077392578125,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.8688,
+      "grad_norm": 4.712401390075684,
+      "kl": 0.119384765625,
+      "learning_rate": 1e-06,
+      "loss": 0.2418,
+      "num_tokens": 14136628.0,
+      "reward": 0.07501716911792755,
+      "reward_std": 0.022581705823540688,
+      "rewards/bleu_reward_func/mean": 0.07501716911792755,
+      "rewards/bleu_reward_func/std": 0.045875921845436096,
+      "step": 1086
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 214.4375,
+      "completions/mean_terminated_length": 145.7692413330078,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.8696,
+      "grad_norm": 7.036587715148926,
+      "kl": 0.40423583984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0659,
+      "num_tokens": 14152290.0,
+      "reward": 0.24361515045166016,
+      "reward_std": 0.05023983493447304,
+      "rewards/bleu_reward_func/mean": 0.24361515045166016,
+      "rewards/bleu_reward_func/std": 0.2318515181541443,
+      "step": 1087
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 481.0,
+      "completions/mean_length": 310.15625,
+      "completions/mean_terminated_length": 281.3214416503906,
+      "completions/min_length": 68.0,
+      "completions/min_terminated_length": 68.0,
+      "epoch": 0.8704,
+      "grad_norm": 2.8048856258392334,
+      "kl": 0.06768798828125,
+      "learning_rate": 1e-06,
+      "loss": -0.1012,
+      "num_tokens": 14164015.0,
+      "reward": 0.06632909178733826,
+      "reward_std": 0.022660713642835617,
+      "rewards/bleu_reward_func/mean": 0.06632909178733826,
+      "rewards/bleu_reward_func/std": 0.05323861911892891,
+      "step": 1088
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 246.0,
+      "completions/mean_length": 72.03125,
+      "completions/mean_terminated_length": 57.838706970214844,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.8712,
+      "grad_norm": 9.539746284484863,
+      "kl": 0.28515625,
+      "learning_rate": 1e-06,
+      "loss": 0.2369,
+      "num_tokens": 14175360.0,
+      "reward": 0.2565135359764099,
+      "reward_std": 0.06622748076915741,
+      "rewards/bleu_reward_func/mean": 0.2565135359764099,
+      "rewards/bleu_reward_func/std": 0.2024916261434555,
+      "step": 1089
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 502.0,
+      "completions/mean_length": 253.8125,
+      "completions/mean_terminated_length": 136.4545440673828,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.872,
+      "grad_norm": 8.92574691772461,
+      "kl": 0.2467041015625,
+      "learning_rate": 1e-06,
+      "loss": 0.2116,
+      "num_tokens": 14187954.0,
+      "reward": 0.10993756353855133,
+      "reward_std": 0.041833557188510895,
+      "rewards/bleu_reward_func/mean": 0.10993756353855133,
+      "rewards/bleu_reward_func/std": 0.13020949065685272,
+      "step": 1090
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 484.0,
+      "completions/mean_length": 193.625,
+      "completions/mean_terminated_length": 104.47999572753906,
+      "completions/min_length": 28.0,
+      "completions/min_terminated_length": 28.0,
+      "epoch": 0.8728,
+      "grad_norm": 5.538515090942383,
+      "kl": 0.0902099609375,
+      "learning_rate": 1e-06,
+      "loss": -0.1573,
+      "num_tokens": 14196270.0,
+      "reward": 0.05786508321762085,
+      "reward_std": 0.02371850796043873,
+      "rewards/bleu_reward_func/mean": 0.05786508321762085,
+      "rewards/bleu_reward_func/std": 0.03462414816021919,
+      "step": 1091
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 498.0,
+      "completions/mean_length": 223.09375,
+      "completions/mean_terminated_length": 156.42308044433594,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.8736,
+      "grad_norm": 5.6724443435668945,
+      "kl": 0.179168701171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0128,
+      "num_tokens": 14208633.0,
+      "reward": 0.107764333486557,
+      "reward_std": 0.021315133199095726,
+      "rewards/bleu_reward_func/mean": 0.107764333486557,
+      "rewards/bleu_reward_func/std": 0.038448914885520935,
+      "step": 1092
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 502.0,
+      "completions/mean_length": 166.78125,
+      "completions/mean_terminated_length": 143.7666778564453,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.8744,
+      "grad_norm": 5.278230667114258,
+      "kl": 0.2412109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0013,
+      "num_tokens": 14218242.0,
+      "reward": 0.4157559275627136,
+      "reward_std": 0.037054967135190964,
+      "rewards/bleu_reward_func/mean": 0.4157559275627136,
+      "rewards/bleu_reward_func/std": 0.2559570372104645,
+      "step": 1093
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 313.09375,
+      "completions/mean_terminated_length": 114.1875,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "epoch": 0.8752,
+      "grad_norm": 3.892812490463257,
+      "kl": 0.13092041015625,
+      "learning_rate": 1e-06,
+      "loss": 0.1517,
+      "num_tokens": 14232805.0,
+      "reward": 0.12693487107753754,
+      "reward_std": 0.04035983234643936,
+      "rewards/bleu_reward_func/mean": 0.12693487107753754,
+      "rewards/bleu_reward_func/std": 0.12727496027946472,
+      "step": 1094
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 486.0,
+      "completions/mean_length": 205.1875,
+      "completions/mean_terminated_length": 161.35714721679688,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.876,
+      "grad_norm": 8.024927139282227,
+      "kl": 0.350433349609375,
+      "learning_rate": 1e-06,
+      "loss": 0.2352,
+      "num_tokens": 14244755.0,
+      "reward": 0.20502734184265137,
+      "reward_std": 0.09303287416696548,
+      "rewards/bleu_reward_func/mean": 0.20502734184265137,
+      "rewards/bleu_reward_func/std": 0.241799458861351,
+      "step": 1095
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 501.0,
+      "completions/mean_length": 361.1875,
+      "completions/mean_terminated_length": 282.19049072265625,
+      "completions/min_length": 31.0,
+      "completions/min_terminated_length": 31.0,
+      "epoch": 0.8768,
+      "grad_norm": 2.944213390350342,
+      "kl": 0.068603515625,
+      "learning_rate": 1e-06,
+      "loss": -0.106,
+      "num_tokens": 14257801.0,
+      "reward": 0.04917216673493385,
+      "reward_std": 0.011258791200816631,
+      "rewards/bleu_reward_func/mean": 0.04917216673493385,
+      "rewards/bleu_reward_func/std": 0.038949303328990936,
+      "step": 1096
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 400.0,
+      "completions/mean_length": 234.21875,
+      "completions/mean_terminated_length": 67.55000305175781,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.8776,
+      "grad_norm": 6.089639663696289,
+      "kl": 0.22784423828125,
+      "learning_rate": 1e-06,
+      "loss": 0.1116,
+      "num_tokens": 14267648.0,
+      "reward": 0.10858422517776489,
+      "reward_std": 0.04780227690935135,
+      "rewards/bleu_reward_func/mean": 0.10858422517776489,
+      "rewards/bleu_reward_func/std": 0.07767506688833237,
+      "step": 1097
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 414.0,
+      "completions/mean_length": 383.21875,
+      "completions/mean_terminated_length": 217.6428680419922,
+      "completions/min_length": 31.0,
+      "completions/min_terminated_length": 31.0,
+      "epoch": 0.8784,
+      "grad_norm": 2.4417130947113037,
+      "kl": 0.039581298828125,
+      "learning_rate": 1e-06,
+      "loss": 0.101,
+      "num_tokens": 14284503.0,
+      "reward": 0.1223280131816864,
+      "reward_std": 0.058498185127973557,
+      "rewards/bleu_reward_func/mean": 0.1223280131816864,
+      "rewards/bleu_reward_func/std": 0.08976288139820099,
+      "step": 1098
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 390.0,
+      "completions/mean_length": 200.5,
+      "completions/mean_terminated_length": 78.60869598388672,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.8792,
+      "grad_norm": 6.311417102813721,
+      "kl": 0.34259033203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0608,
+      "num_tokens": 14294455.0,
+      "reward": 0.12745052576065063,
+      "reward_std": 0.048099152743816376,
+      "rewards/bleu_reward_func/mean": 0.12745052576065063,
+      "rewards/bleu_reward_func/std": 0.12118762731552124,
+      "step": 1099
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 432.0,
+      "completions/mean_length": 294.375,
+      "completions/mean_terminated_length": 244.1538543701172,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.88,
+      "grad_norm": 5.328085422515869,
+      "kl": 0.184539794921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0179,
+      "num_tokens": 14311835.0,
+      "reward": 0.2192595899105072,
+      "reward_std": 0.042960211634635925,
+      "rewards/bleu_reward_func/mean": 0.2192595899105072,
+      "rewards/bleu_reward_func/std": 0.13524703681468964,
+      "step": 1100
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 324.625,
+      "completions/mean_terminated_length": 262.16668701171875,
+      "completions/min_length": 121.0,
+      "completions/min_terminated_length": 121.0,
+      "epoch": 0.8808,
+      "grad_norm": 2.6405608654022217,
+      "kl": 0.0440673828125,
+      "learning_rate": 1e-06,
+      "loss": 0.1205,
+      "num_tokens": 14327975.0,
+      "reward": 0.14465495944023132,
+      "reward_std": 0.04526882618665695,
+      "rewards/bleu_reward_func/mean": 0.14465495944023132,
+      "rewards/bleu_reward_func/std": 0.1039966493844986,
+      "step": 1101
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 466.0,
+      "completions/mean_length": 336.125,
+      "completions/mean_terminated_length": 180.94117736816406,
+      "completions/min_length": 60.0,
+      "completions/min_terminated_length": 60.0,
+      "epoch": 0.8816,
+      "grad_norm": 3.4212841987609863,
+      "kl": 0.0875244140625,
+      "learning_rate": 1e-06,
+      "loss": 0.1697,
+      "num_tokens": 14343595.0,
+      "reward": 0.07303881645202637,
+      "reward_std": 0.023782189935445786,
+      "rewards/bleu_reward_func/mean": 0.07303881645202637,
+      "rewards/bleu_reward_func/std": 0.10003393888473511,
+      "step": 1102
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 453.0,
+      "completions/mean_length": 226.0,
+      "completions/mean_terminated_length": 145.9199981689453,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.8824,
+      "grad_norm": 7.256896018981934,
+      "kl": 0.2523193359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0389,
+      "num_tokens": 14357595.0,
+      "reward": 0.14129553735256195,
+      "reward_std": 0.05766978859901428,
+      "rewards/bleu_reward_func/mean": 0.14129553735256195,
+      "rewards/bleu_reward_func/std": 0.13982893526554108,
+      "step": 1103
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 479.0,
+      "completions/mean_length": 375.65625,
+      "completions/mean_terminated_length": 313.68182373046875,
+      "completions/min_length": 34.0,
+      "completions/min_terminated_length": 34.0,
+      "epoch": 0.8832,
+      "grad_norm": 2.429725170135498,
+      "kl": 0.0618896484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0492,
+      "num_tokens": 14372368.0,
+      "reward": 0.0918026864528656,
+      "reward_std": 0.019557196646928787,
+      "rewards/bleu_reward_func/mean": 0.0918026864528656,
+      "rewards/bleu_reward_func/std": 0.0768144428730011,
+      "step": 1104
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 121.0,
+      "completions/mean_length": 81.0625,
+      "completions/mean_terminated_length": 36.482757568359375,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.884,
+      "grad_norm": 10.26905345916748,
+      "kl": 0.4246826171875,
+      "learning_rate": 1e-06,
+      "loss": 0.4653,
+      "num_tokens": 14379298.0,
+      "reward": 0.1903451383113861,
+      "reward_std": 0.0727916806936264,
+      "rewards/bleu_reward_func/mean": 0.1903451383113861,
+      "rewards/bleu_reward_func/std": 0.18681129813194275,
+      "step": 1105
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 307.0,
+      "completions/mean_length": 218.09375,
+      "completions/mean_terminated_length": 103.08695983886719,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.8848,
+      "grad_norm": 6.238147735595703,
+      "kl": 0.2109375,
+      "learning_rate": 1e-06,
+      "loss": 0.1147,
+      "num_tokens": 14390325.0,
+      "reward": 0.10796605050563812,
+      "reward_std": 0.028253143653273582,
+      "rewards/bleu_reward_func/mean": 0.10796605050563812,
+      "rewards/bleu_reward_func/std": 0.08980042487382889,
+      "step": 1106
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 392.0,
+      "completions/mean_length": 334.3125,
+      "completions/mean_terminated_length": 177.5294189453125,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.8856,
+      "grad_norm": 5.977288246154785,
+      "kl": 0.12933349609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0113,
+      "num_tokens": 14405719.0,
+      "reward": 0.08215081691741943,
+      "reward_std": 0.012335095554590225,
+      "rewards/bleu_reward_func/mean": 0.08215081691741943,
+      "rewards/bleu_reward_func/std": 0.0935206413269043,
+      "step": 1107
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 368.25,
+      "completions/mean_terminated_length": 224.5,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.8864,
+      "grad_norm": 3.1431217193603516,
+      "kl": 0.046478271484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0529,
+      "num_tokens": 14420959.0,
+      "reward": 0.07733479142189026,
+      "reward_std": 0.04769892990589142,
+      "rewards/bleu_reward_func/mean": 0.07733479142189026,
+      "rewards/bleu_reward_func/std": 0.07268877327442169,
+      "step": 1108
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 498.0,
+      "completions/mean_length": 205.0625,
+      "completions/mean_terminated_length": 148.22222900390625,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.8872,
+      "grad_norm": 8.718172073364258,
+      "kl": 0.27215576171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0245,
+      "num_tokens": 14431297.0,
+      "reward": 0.08119820058345795,
+      "reward_std": 0.02793770469725132,
+      "rewards/bleu_reward_func/mean": 0.08119820058345795,
+      "rewards/bleu_reward_func/std": 0.046033825725317,
+      "step": 1109
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 218.59375,
+      "completions/mean_terminated_length": 188.2413787841797,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.888,
+      "grad_norm": 7.532926082611084,
+      "kl": 0.197265625,
+      "learning_rate": 1e-06,
+      "loss": 0.2,
+      "num_tokens": 14440556.0,
+      "reward": 0.16878977417945862,
+      "reward_std": 0.06408128887414932,
+      "rewards/bleu_reward_func/mean": 0.16878977417945862,
+      "rewards/bleu_reward_func/std": 0.17819638550281525,
+      "step": 1110
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 501.0,
+      "completions/mean_length": 254.875,
+      "completions/mean_terminated_length": 169.1666717529297,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.8888,
+      "grad_norm": 5.752809047698975,
+      "kl": 0.151947021484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0323,
+      "num_tokens": 14453832.0,
+      "reward": 0.08754751831293106,
+      "reward_std": 0.041982948780059814,
+      "rewards/bleu_reward_func/mean": 0.08754751831293106,
+      "rewards/bleu_reward_func/std": 0.08986286073923111,
+      "step": 1111
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 491.0,
+      "completions/mean_length": 431.9375,
+      "completions/mean_terminated_length": 298.5,
+      "completions/min_length": 33.0,
+      "completions/min_terminated_length": 33.0,
+      "epoch": 0.8896,
+      "grad_norm": 2.506591320037842,
+      "kl": 0.04913330078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0059,
+      "num_tokens": 14470966.0,
+      "reward": 0.05948667228221893,
+      "reward_std": 0.031033214181661606,
+      "rewards/bleu_reward_func/mean": 0.05948667228221893,
+      "rewards/bleu_reward_func/std": 0.04187482222914696,
+      "step": 1112
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 498.0,
+      "completions/mean_length": 458.125,
+      "completions/mean_terminated_length": 404.25,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 0.8904,
+      "grad_norm": 2.1371476650238037,
+      "kl": 0.05169677734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0275,
+      "num_tokens": 14488386.0,
+      "reward": 0.07635128498077393,
+      "reward_std": 0.02666424587368965,
+      "rewards/bleu_reward_func/mean": 0.07635128498077393,
+      "rewards/bleu_reward_func/std": 0.06230180338025093,
+      "step": 1113
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 381.3125,
+      "completions/mean_terminated_length": 321.9090881347656,
+      "completions/min_length": 109.0,
+      "completions/min_terminated_length": 109.0,
+      "epoch": 0.8912,
+      "grad_norm": 2.5617504119873047,
+      "kl": 0.05352783203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0071,
+      "num_tokens": 14502500.0,
+      "reward": 0.1177460253238678,
+      "reward_std": 0.02880302257835865,
+      "rewards/bleu_reward_func/mean": 0.1177460253238678,
+      "rewards/bleu_reward_func/std": 0.06036384403705597,
+      "step": 1114
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 487.0,
+      "completions/mean_length": 354.875,
+      "completions/mean_terminated_length": 293.39129638671875,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.892,
+      "grad_norm": 6.915249824523926,
+      "kl": 0.162353515625,
+      "learning_rate": 1e-06,
+      "loss": -0.1012,
+      "num_tokens": 14515272.0,
+      "reward": 0.08914826065301895,
+      "reward_std": 0.03028780408203602,
+      "rewards/bleu_reward_func/mean": 0.08914826065301895,
+      "rewards/bleu_reward_func/std": 0.042083028703927994,
+      "step": 1115
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 472.0,
+      "completions/mean_length": 145.96875,
+      "completions/mean_terminated_length": 61.500003814697266,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.8928,
+      "grad_norm": 8.211282730102539,
+      "kl": 0.33642578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0331,
+      "num_tokens": 14523199.0,
+      "reward": 0.13895554840564728,
+      "reward_std": 0.06001996994018555,
+      "rewards/bleu_reward_func/mean": 0.13895554840564728,
+      "rewards/bleu_reward_func/std": 0.10717976838350296,
+      "step": 1116
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 269.8125,
+      "completions/mean_terminated_length": 175.04348754882812,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.8936,
+      "grad_norm": 9.093502044677734,
+      "kl": 0.18402099609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0448,
+      "num_tokens": 14535921.0,
+      "reward": 0.08600494265556335,
+      "reward_std": 0.01430382952094078,
+      "rewards/bleu_reward_func/mean": 0.08600494265556335,
+      "rewards/bleu_reward_func/std": 0.03352402523159981,
+      "step": 1117
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 496.0,
+      "completions/mean_length": 220.65625,
+      "completions/mean_terminated_length": 166.70370483398438,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.8944,
+      "grad_norm": 5.133798122406006,
+      "kl": 0.23126220703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0611,
+      "num_tokens": 14546502.0,
+      "reward": 0.1281801015138626,
+      "reward_std": 0.033460669219493866,
+      "rewards/bleu_reward_func/mean": 0.1281801015138626,
+      "rewards/bleu_reward_func/std": 0.09999439865350723,
+      "step": 1118
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 414.90625,
+      "completions/mean_terminated_length": 382.54168701171875,
+      "completions/min_length": 261.0,
+      "completions/min_terminated_length": 261.0,
+      "epoch": 0.8952,
+      "grad_norm": 2.3336856365203857,
+      "kl": 0.05023193359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0381,
+      "num_tokens": 14563043.0,
+      "reward": 0.09009081870317459,
+      "reward_std": 0.024957649409770966,
+      "rewards/bleu_reward_func/mean": 0.09009081870317459,
+      "rewards/bleu_reward_func/std": 0.07389495521783829,
+      "step": 1119
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 440.0,
+      "completions/mean_length": 249.375,
+      "completions/mean_terminated_length": 188.7692413330078,
+      "completions/min_length": 40.0,
+      "completions/min_terminated_length": 40.0,
+      "epoch": 0.896,
+      "grad_norm": 4.001770496368408,
+      "kl": 0.06622314453125,
+      "learning_rate": 1e-06,
+      "loss": 0.4376,
+      "num_tokens": 14574703.0,
+      "reward": 0.12255299836397171,
+      "reward_std": 0.06612245738506317,
+      "rewards/bleu_reward_func/mean": 0.12255299836397171,
+      "rewards/bleu_reward_func/std": 0.1745522916316986,
+      "step": 1120
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 341.96875,
+      "completions/mean_terminated_length": 264.68182373046875,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.8968,
+      "grad_norm": 5.190872669219971,
+      "kl": 0.0662841796875,
+      "learning_rate": 1e-06,
+      "loss": -0.2989,
+      "num_tokens": 14590934.0,
+      "reward": 0.015649927780032158,
+      "reward_std": 0.008883368223905563,
+      "rewards/bleu_reward_func/mean": 0.015649927780032158,
+      "rewards/bleu_reward_func/std": 0.014249512925744057,
+      "step": 1121
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 444.0,
+      "completions/mean_length": 204.71875,
+      "completions/mean_terminated_length": 118.68000030517578,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.8976,
+      "grad_norm": 7.992037296295166,
+      "kl": 0.22357177734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0293,
+      "num_tokens": 14601957.0,
+      "reward": 0.14590373635292053,
+      "reward_std": 0.032411251217126846,
+      "rewards/bleu_reward_func/mean": 0.14590373635292053,
+      "rewards/bleu_reward_func/std": 0.1304609477519989,
+      "step": 1122
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 490.0,
+      "completions/mean_length": 298.4375,
+      "completions/mean_terminated_length": 132.3333282470703,
+      "completions/min_length": 31.0,
+      "completions/min_terminated_length": 31.0,
+      "epoch": 0.8984,
+      "grad_norm": 6.297399044036865,
+      "kl": 0.05999755859375,
+      "learning_rate": 1e-06,
+      "loss": 0.3021,
+      "num_tokens": 14616019.0,
+      "reward": 0.071531280875206,
+      "reward_std": 0.02597668580710888,
+      "rewards/bleu_reward_func/mean": 0.071531280875206,
+      "rewards/bleu_reward_func/std": 0.05073075741529465,
+      "step": 1123
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 456.0,
+      "completions/mean_length": 353.90625,
+      "completions/mean_terminated_length": 174.73333740234375,
+      "completions/min_length": 38.0,
+      "completions/min_terminated_length": 38.0,
+      "epoch": 0.8992,
+      "grad_norm": 2.9494781494140625,
+      "kl": 0.0675048828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0383,
+      "num_tokens": 14634248.0,
+      "reward": 0.20859137177467346,
+      "reward_std": 0.026030534878373146,
+      "rewards/bleu_reward_func/mean": 0.20859137177467346,
+      "rewards/bleu_reward_func/std": 0.25668489933013916,
+      "step": 1124
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 459.0,
+      "completions/mean_length": 337.03125,
+      "completions/mean_terminated_length": 200.94444274902344,
+      "completions/min_length": 28.0,
+      "completions/min_terminated_length": 28.0,
+      "epoch": 0.9,
+      "grad_norm": 3.6260087490081787,
+      "kl": 0.06976318359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0424,
+      "num_tokens": 14646505.0,
+      "reward": 0.03524015098810196,
+      "reward_std": 0.021195726469159126,
+      "rewards/bleu_reward_func/mean": 0.03524015098810196,
+      "rewards/bleu_reward_func/std": 0.04081031307578087,
+      "step": 1125
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 377.0,
+      "completions/mean_length": 228.375,
+      "completions/mean_terminated_length": 117.39130401611328,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.9008,
+      "grad_norm": 6.014510154724121,
+      "kl": 0.232421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0001,
+      "num_tokens": 14657525.0,
+      "reward": 0.06341119110584259,
+      "reward_std": 0.03255104646086693,
+      "rewards/bleu_reward_func/mean": 0.06341119110584259,
+      "rewards/bleu_reward_func/std": 0.04315832257270813,
+      "step": 1126
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 485.0,
+      "completions/mean_length": 238.21875,
+      "completions/mean_terminated_length": 175.03846740722656,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.9016,
+      "grad_norm": 4.443993091583252,
+      "kl": 0.07666015625,
+      "learning_rate": 1e-06,
+      "loss": 0.1492,
+      "num_tokens": 14669820.0,
+      "reward": 0.050140924751758575,
+      "reward_std": 0.02134326659142971,
+      "rewards/bleu_reward_func/mean": 0.050140924751758575,
+      "rewards/bleu_reward_func/std": 0.054666925221681595,
+      "step": 1127
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 206.8125,
+      "completions/mean_terminated_length": 121.36000061035156,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.9024,
+      "grad_norm": 6.487152099609375,
+      "kl": 0.30072021484375,
+      "learning_rate": 1e-06,
+      "loss": 0.1095,
+      "num_tokens": 14678878.0,
+      "reward": 0.20913344621658325,
+      "reward_std": 0.06204414367675781,
+      "rewards/bleu_reward_func/mean": 0.20913344621658325,
+      "rewards/bleu_reward_func/std": 0.15058699250221252,
+      "step": 1128
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 295.0,
+      "completions/mean_length": 168.5625,
+      "completions/mean_terminated_length": 104.96296691894531,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.9032,
+      "grad_norm": 5.987677097320557,
+      "kl": 0.161376953125,
+      "learning_rate": 1e-06,
+      "loss": 0.1159,
+      "num_tokens": 14690400.0,
+      "reward": 0.22108127176761627,
+      "reward_std": 0.03181886300444603,
+      "rewards/bleu_reward_func/mean": 0.22108127176761627,
+      "rewards/bleu_reward_func/std": 0.21734413504600525,
+      "step": 1129
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 474.0,
+      "completions/mean_length": 224.40625,
+      "completions/mean_terminated_length": 194.65516662597656,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.904,
+      "grad_norm": 4.591039180755615,
+      "kl": 0.1512451171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0318,
+      "num_tokens": 14703621.0,
+      "reward": 0.13979627192020416,
+      "reward_std": 0.024196792393922806,
+      "rewards/bleu_reward_func/mean": 0.13979627192020416,
+      "rewards/bleu_reward_func/std": 0.11370246112346649,
+      "step": 1130
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 476.0,
+      "completions/mean_length": 191.96875,
+      "completions/mean_terminated_length": 170.6333465576172,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.9048,
+      "grad_norm": 9.648924827575684,
+      "kl": 0.3162841796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0993,
+      "num_tokens": 14715324.0,
+      "reward": 0.24474243819713593,
+      "reward_std": 0.03903892636299133,
+      "rewards/bleu_reward_func/mean": 0.24474243819713593,
+      "rewards/bleu_reward_func/std": 0.111121766269207,
+      "step": 1131
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 311.0,
+      "completions/mean_length": 335.125,
+      "completions/mean_terminated_length": 158.25,
+      "completions/min_length": 39.0,
+      "completions/min_terminated_length": 39.0,
+      "epoch": 0.9056,
+      "grad_norm": 3.5864415168762207,
+      "kl": 0.05462646484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0438,
+      "num_tokens": 14728656.0,
+      "reward": 0.08658318221569061,
+      "reward_std": 0.03171471878886223,
+      "rewards/bleu_reward_func/mean": 0.08658318221569061,
+      "rewards/bleu_reward_func/std": 0.05243143439292908,
+      "step": 1132
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 444.0,
+      "completions/mean_length": 196.90625,
+      "completions/mean_terminated_length": 186.74192810058594,
+      "completions/min_length": 29.0,
+      "completions/min_terminated_length": 29.0,
+      "epoch": 0.9064,
+      "grad_norm": 5.391827583312988,
+      "kl": 0.149871826171875,
+      "learning_rate": 1e-06,
+      "loss": 0.209,
+      "num_tokens": 14743445.0,
+      "reward": 0.14488989114761353,
+      "reward_std": 0.05979035794734955,
+      "rewards/bleu_reward_func/mean": 0.14488989114761353,
+      "rewards/bleu_reward_func/std": 0.11484233289957047,
+      "step": 1133
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 40.0,
+      "completions/mean_length": 147.9375,
+      "completions/mean_terminated_length": 26.58333396911621,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.9072,
+      "grad_norm": 9.304245948791504,
+      "kl": 0.4073486328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0082,
+      "num_tokens": 14755539.0,
+      "reward": 0.2897959053516388,
+      "reward_std": 0.11407680809497833,
+      "rewards/bleu_reward_func/mean": 0.2897959053516388,
+      "rewards/bleu_reward_func/std": 0.3296494483947754,
+      "step": 1134
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 489.0,
+      "completions/mean_length": 302.0625,
+      "completions/mean_terminated_length": 219.9130401611328,
+      "completions/min_length": 45.0,
+      "completions/min_terminated_length": 45.0,
+      "epoch": 0.908,
+      "grad_norm": 4.041188716888428,
+      "kl": 0.08953857421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0784,
+      "num_tokens": 14771221.0,
+      "reward": 0.0867033302783966,
+      "reward_std": 0.03230535611510277,
+      "rewards/bleu_reward_func/mean": 0.0867033302783966,
+      "rewards/bleu_reward_func/std": 0.05712318420410156,
+      "step": 1135
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 488.0,
+      "completions/mean_length": 353.21875,
+      "completions/mean_terminated_length": 244.57894897460938,
+      "completions/min_length": 31.0,
+      "completions/min_terminated_length": 31.0,
+      "epoch": 0.9088,
+      "grad_norm": 3.4682111740112305,
+      "kl": 0.063140869140625,
+      "learning_rate": 1e-06,
+      "loss": -0.013,
+      "num_tokens": 14786884.0,
+      "reward": 0.2040112018585205,
+      "reward_std": 0.03282826021313667,
+      "rewards/bleu_reward_func/mean": 0.2040112018585205,
+      "rewards/bleu_reward_func/std": 0.2342006117105484,
+      "step": 1136
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 474.0,
+      "completions/mean_length": 323.46875,
+      "completions/mean_terminated_length": 176.8333282470703,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.9096,
+      "grad_norm": 9.80130672454834,
+      "kl": 0.296661376953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0879,
+      "num_tokens": 14807267.0,
+      "reward": 0.15428093075752258,
+      "reward_std": 0.047303371131420135,
+      "rewards/bleu_reward_func/mean": 0.15428093075752258,
+      "rewards/bleu_reward_func/std": 0.12975330650806427,
+      "step": 1137
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 388.0,
+      "completions/mean_length": 213.3125,
+      "completions/mean_terminated_length": 113.75,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.9104,
+      "grad_norm": 6.25737190246582,
+      "kl": 0.125823974609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0125,
+      "num_tokens": 14815933.0,
+      "reward": 0.0721752792596817,
+      "reward_std": 0.021741271018981934,
+      "rewards/bleu_reward_func/mean": 0.0721752792596817,
+      "rewards/bleu_reward_func/std": 0.05829243361949921,
+      "step": 1138
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 349.0,
+      "completions/max_terminated_length": 349.0,
+      "completions/mean_length": 107.125,
+      "completions/mean_terminated_length": 107.125,
+      "completions/min_length": 3.0,
+      "completions/min_terminated_length": 3.0,
+      "epoch": 0.9112,
+      "grad_norm": 25.477249145507812,
+      "kl": 0.55877685546875,
+      "learning_rate": 1e-06,
+      "loss": -0.2259,
+      "num_tokens": 14825321.0,
+      "reward": 0.1802026480436325,
+      "reward_std": 0.11938925087451935,
+      "rewards/bleu_reward_func/mean": 0.1802026480436325,
+      "rewards/bleu_reward_func/std": 0.1613418012857437,
+      "step": 1139
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 201.0,
+      "completions/max_terminated_length": 201.0,
+      "completions/mean_length": 105.3125,
+      "completions/mean_terminated_length": 105.3125,
+      "completions/min_length": 34.0,
+      "completions/min_terminated_length": 34.0,
+      "epoch": 0.912,
+      "grad_norm": 11.604246139526367,
+      "kl": 0.2550048828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0128,
+      "num_tokens": 14831115.0,
+      "reward": 0.19547826051712036,
+      "reward_std": 0.07176055759191513,
+      "rewards/bleu_reward_func/mean": 0.19547826051712036,
+      "rewards/bleu_reward_func/std": 0.11230416595935822,
+      "step": 1140
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 496.0,
+      "completions/mean_length": 255.59375,
+      "completions/mean_terminated_length": 208.11111450195312,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.9128,
+      "grad_norm": 6.936506271362305,
+      "kl": 0.14453125,
+      "learning_rate": 1e-06,
+      "loss": -0.1892,
+      "num_tokens": 14847286.0,
+      "reward": 0.07467533648014069,
+      "reward_std": 0.0442538745701313,
+      "rewards/bleu_reward_func/mean": 0.07467533648014069,
+      "rewards/bleu_reward_func/std": 0.0976758524775505,
+      "step": 1141
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 491.0,
+      "completions/mean_length": 397.8125,
+      "completions/mean_terminated_length": 309.0,
+      "completions/min_length": 33.0,
+      "completions/min_terminated_length": 33.0,
+      "epoch": 0.9136,
+      "grad_norm": 3.533027410507202,
+      "kl": 0.1064453125,
+      "learning_rate": 1e-06,
+      "loss": 0.2572,
+      "num_tokens": 14862600.0,
+      "reward": 0.07055975496768951,
+      "reward_std": 0.024620652198791504,
+      "rewards/bleu_reward_func/mean": 0.07055975496768951,
+      "rewards/bleu_reward_func/std": 0.04198145866394043,
+      "step": 1142
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 442.0,
+      "completions/mean_length": 374.34375,
+      "completions/mean_terminated_length": 144.9166717529297,
+      "completions/min_length": 32.0,
+      "completions/min_terminated_length": 32.0,
+      "epoch": 0.9144,
+      "grad_norm": 3.106947422027588,
+      "kl": 0.05267333984375,
+      "learning_rate": 1e-06,
+      "loss": 0.3129,
+      "num_tokens": 14879211.0,
+      "reward": 0.047079749405384064,
+      "reward_std": 0.01572955772280693,
+      "rewards/bleu_reward_func/mean": 0.047079749405384064,
+      "rewards/bleu_reward_func/std": 0.03182151913642883,
+      "step": 1143
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 489.0,
+      "completions/mean_length": 296.21875,
+      "completions/mean_terminated_length": 289.258056640625,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.9152,
+      "grad_norm": 9.539961814880371,
+      "kl": 0.26116943359375,
+      "learning_rate": 1e-06,
+      "loss": 0.3821,
+      "num_tokens": 14892074.0,
+      "reward": 0.04650323465466499,
+      "reward_std": 0.016895011067390442,
+      "rewards/bleu_reward_func/mean": 0.04650323465466499,
+      "rewards/bleu_reward_func/std": 0.020110802724957466,
+      "step": 1144
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 490.0,
+      "completions/mean_length": 237.75,
+      "completions/mean_terminated_length": 50.105262756347656,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.916,
+      "grad_norm": 10.258657455444336,
+      "kl": 0.3250732421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0619,
+      "num_tokens": 14904698.0,
+      "reward": 0.17951351404190063,
+      "reward_std": 0.07376629114151001,
+      "rewards/bleu_reward_func/mean": 0.17951351404190063,
+      "rewards/bleu_reward_func/std": 0.17956046760082245,
+      "step": 1145
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 286.78125,
+      "completions/mean_terminated_length": 151.65000915527344,
+      "completions/min_length": 31.0,
+      "completions/min_terminated_length": 31.0,
+      "epoch": 0.9168,
+      "grad_norm": 6.667459964752197,
+      "kl": 0.08001708984375,
+      "learning_rate": 1e-06,
+      "loss": 0.5727,
+      "num_tokens": 14917451.0,
+      "reward": 0.03467312082648277,
+      "reward_std": 0.01676066778600216,
+      "rewards/bleu_reward_func/mean": 0.03467312082648277,
+      "rewards/bleu_reward_func/std": 0.02004723809659481,
+      "step": 1146
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 29.0,
+      "completions/mean_length": 266.90625,
+      "completions/mean_terminated_length": 21.8125,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.9176,
+      "grad_norm": 7.602176189422607,
+      "kl": 0.4158935546875,
+      "learning_rate": 1e-06,
+      "loss": 0.038,
+      "num_tokens": 14932352.0,
+      "reward": 0.19471007585525513,
+      "reward_std": 0.04646201431751251,
+      "rewards/bleu_reward_func/mean": 0.19471007585525513,
+      "rewards/bleu_reward_func/std": 0.1760382354259491,
+      "step": 1147
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 508.0,
+      "completions/mean_length": 315.15625,
+      "completions/mean_terminated_length": 162.05555725097656,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.9184,
+      "grad_norm": 6.479866027832031,
+      "kl": 0.109619140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0595,
+      "num_tokens": 14947741.0,
+      "reward": 0.10786274820566177,
+      "reward_std": 0.036432720720767975,
+      "rewards/bleu_reward_func/mean": 0.10786274820566177,
+      "rewards/bleu_reward_func/std": 0.0789819210767746,
+      "step": 1148
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 472.5,
+      "completions/mean_terminated_length": 427.7333679199219,
+      "completions/min_length": 327.0,
+      "completions/min_terminated_length": 327.0,
+      "epoch": 0.9192,
+      "grad_norm": 2.4095027446746826,
+      "kl": 0.0462646484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0305,
+      "num_tokens": 14966173.0,
+      "reward": 0.1280764639377594,
+      "reward_std": 0.03749135136604309,
+      "rewards/bleu_reward_func/mean": 0.1280764639377594,
+      "rewards/bleu_reward_func/std": 0.05864708498120308,
+      "step": 1149
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 64.0,
+      "completions/mean_length": 255.90625,
+      "completions/mean_terminated_length": 29.941177368164062,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.92,
+      "grad_norm": 10.67159652709961,
+      "kl": 0.1556396484375,
+      "learning_rate": 1e-06,
+      "loss": 0.1007,
+      "num_tokens": 14979570.0,
+      "reward": 0.055816084146499634,
+      "reward_std": 0.020302332937717438,
+      "rewards/bleu_reward_func/mean": 0.055816084146499634,
+      "rewards/bleu_reward_func/std": 0.03035581111907959,
+      "step": 1150
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 500.0,
+      "completions/mean_length": 210.78125,
+      "completions/mean_terminated_length": 167.75,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.9208,
+      "grad_norm": 6.78237771987915,
+      "kl": 0.17840576171875,
+      "learning_rate": 1e-06,
+      "loss": 0.09,
+      "num_tokens": 14993923.0,
+      "reward": 0.07397650182247162,
+      "reward_std": 0.01999451220035553,
+      "rewards/bleu_reward_func/mean": 0.07397650182247162,
+      "rewards/bleu_reward_func/std": 0.0341508574783802,
+      "step": 1151
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 412.0,
+      "completions/mean_length": 227.34375,
+      "completions/mean_terminated_length": 115.95652770996094,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.9216,
+      "grad_norm": 9.095620155334473,
+      "kl": 0.483001708984375,
+      "learning_rate": 1e-06,
+      "loss": 0.2875,
+      "num_tokens": 15005734.0,
+      "reward": 0.1028270274400711,
+      "reward_std": 0.05264887586236,
+      "rewards/bleu_reward_func/mean": 0.1028270274400711,
+      "rewards/bleu_reward_func/std": 0.08930659294128418,
+      "step": 1152
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 436.0625,
+      "completions/mean_terminated_length": 269.0,
+      "completions/min_length": 32.0,
+      "completions/min_terminated_length": 32.0,
+      "epoch": 0.9224,
+      "grad_norm": 2.607079029083252,
+      "kl": 0.04913330078125,
+      "learning_rate": 1e-06,
+      "loss": 0.2182,
+      "num_tokens": 15021632.0,
+      "reward": 0.08679656684398651,
+      "reward_std": 0.05561990663409233,
+      "rewards/bleu_reward_func/mean": 0.08679656684398651,
+      "rewards/bleu_reward_func/std": 0.09985605627298355,
+      "step": 1153
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 463.0,
+      "completions/mean_length": 248.28125,
+      "completions/mean_terminated_length": 128.40908813476562,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.9232,
+      "grad_norm": 6.871771812438965,
+      "kl": 0.18536376953125,
+      "learning_rate": 1e-06,
+      "loss": 0.02,
+      "num_tokens": 15035081.0,
+      "reward": 0.19525909423828125,
+      "reward_std": 0.04538525268435478,
+      "rewards/bleu_reward_func/mean": 0.19525909423828125,
+      "rewards/bleu_reward_func/std": 0.18253828585147858,
+      "step": 1154
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 156.0,
+      "completions/mean_length": 285.65625,
+      "completions/mean_terminated_length": 59.3125,
+      "completions/min_length": 30.0,
+      "completions/min_terminated_length": 30.0,
+      "epoch": 0.924,
+      "grad_norm": 6.028687953948975,
+      "kl": 0.0938720703125,
+      "learning_rate": 1e-06,
+      "loss": 0.2695,
+      "num_tokens": 15049558.0,
+      "reward": 0.09561645239591599,
+      "reward_std": 0.0469173789024353,
+      "rewards/bleu_reward_func/mean": 0.09561645239591599,
+      "rewards/bleu_reward_func/std": 0.07949265837669373,
+      "step": 1155
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 348.0,
+      "completions/mean_length": 237.875,
+      "completions/mean_terminated_length": 161.1199951171875,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.9248,
+      "grad_norm": 18.29808807373047,
+      "kl": 0.55096435546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0911,
+      "num_tokens": 15060954.0,
+      "reward": 0.1312306672334671,
+      "reward_std": 0.05109435319900513,
+      "rewards/bleu_reward_func/mean": 0.1312306672334671,
+      "rewards/bleu_reward_func/std": 0.0968712568283081,
+      "step": 1156
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 461.0,
+      "completions/mean_length": 183.40625,
+      "completions/mean_terminated_length": 73.875,
+      "completions/min_length": 28.0,
+      "completions/min_terminated_length": 28.0,
+      "epoch": 0.9256,
+      "grad_norm": 8.553780555725098,
+      "kl": 0.2044677734375,
+      "learning_rate": 1e-06,
+      "loss": 0.3837,
+      "num_tokens": 15071151.0,
+      "reward": 0.08946996927261353,
+      "reward_std": 0.032011546194553375,
+      "rewards/bleu_reward_func/mean": 0.08946996927261353,
+      "rewards/bleu_reward_func/std": 0.08429201692342758,
+      "step": 1157
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 496.0,
+      "completions/mean_length": 187.40625,
+      "completions/mean_terminated_length": 153.8275909423828,
+      "completions/min_length": 38.0,
+      "completions/min_terminated_length": 38.0,
+      "epoch": 0.9264,
+      "grad_norm": 7.326318740844727,
+      "kl": 0.14752197265625,
+      "learning_rate": 1e-06,
+      "loss": 0.5061,
+      "num_tokens": 15080252.0,
+      "reward": 0.1023559644818306,
+      "reward_std": 0.045405931770801544,
+      "rewards/bleu_reward_func/mean": 0.1023559644818306,
+      "rewards/bleu_reward_func/std": 0.07145705074071884,
+      "step": 1158
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 304.4375,
+      "completions/mean_terminated_length": 246.3199920654297,
+      "completions/min_length": 40.0,
+      "completions/min_terminated_length": 40.0,
+      "epoch": 0.9272,
+      "grad_norm": 5.513801574707031,
+      "kl": 0.1817626953125,
+      "learning_rate": 1e-06,
+      "loss": 0.1149,
+      "num_tokens": 15095610.0,
+      "reward": 0.11080615222454071,
+      "reward_std": 0.043468981981277466,
+      "rewards/bleu_reward_func/mean": 0.11080615222454071,
+      "rewards/bleu_reward_func/std": 0.04713428020477295,
+      "step": 1159
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 347.0,
+      "completions/mean_length": 205.40625,
+      "completions/mean_terminated_length": 161.60714721679688,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.928,
+      "grad_norm": 7.627071857452393,
+      "kl": 0.318359375,
+      "learning_rate": 1e-06,
+      "loss": 0.1805,
+      "num_tokens": 15104167.0,
+      "reward": 0.0735570564866066,
+      "reward_std": 0.02132660523056984,
+      "rewards/bleu_reward_func/mean": 0.0735570564866066,
+      "rewards/bleu_reward_func/std": 0.05421363562345505,
+      "step": 1160
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 454.0,
+      "completions/mean_length": 256.25,
+      "completions/mean_terminated_length": 156.17391967773438,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.9288,
+      "grad_norm": 6.11322021484375,
+      "kl": 0.27099609375,
+      "learning_rate": 1e-06,
+      "loss": 0.1028,
+      "num_tokens": 15118823.0,
+      "reward": 0.1414988487958908,
+      "reward_std": 0.03222941979765892,
+      "rewards/bleu_reward_func/mean": 0.1414988487958908,
+      "rewards/bleu_reward_func/std": 0.15351709723472595,
+      "step": 1161
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 434.0,
+      "completions/mean_length": 417.8125,
+      "completions/mean_terminated_length": 238.0,
+      "completions/min_length": 35.0,
+      "completions/min_terminated_length": 35.0,
+      "epoch": 0.9296,
+      "grad_norm": 3.5942113399505615,
+      "kl": 0.08154296875,
+      "learning_rate": 1e-06,
+      "loss": 0.1689,
+      "num_tokens": 15139105.0,
+      "reward": 0.12588296830654144,
+      "reward_std": 0.04371759667992592,
+      "rewards/bleu_reward_func/mean": 0.12588296830654144,
+      "rewards/bleu_reward_func/std": 0.14081913232803345,
+      "step": 1162
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 503.0,
+      "completions/mean_length": 318.125,
+      "completions/mean_terminated_length": 273.3846130371094,
+      "completions/min_length": 59.0,
+      "completions/min_terminated_length": 59.0,
+      "epoch": 0.9304,
+      "grad_norm": 3.5201575756073,
+      "kl": 0.145751953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0396,
+      "num_tokens": 15151437.0,
+      "reward": 0.06649903953075409,
+      "reward_std": 0.024907082319259644,
+      "rewards/bleu_reward_func/mean": 0.06649903953075409,
+      "rewards/bleu_reward_func/std": 0.04641694948077202,
+      "step": 1163
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 484.0,
+      "completions/mean_length": 263.8125,
+      "completions/mean_terminated_length": 166.69564819335938,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.9312,
+      "grad_norm": 5.745899200439453,
+      "kl": 0.124755859375,
+      "learning_rate": 1e-06,
+      "loss": 0.1729,
+      "num_tokens": 15164791.0,
+      "reward": 0.33342817425727844,
+      "reward_std": 0.06225915253162384,
+      "rewards/bleu_reward_func/mean": 0.33342817425727844,
+      "rewards/bleu_reward_func/std": 0.3276880383491516,
+      "step": 1164
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 493.0,
+      "completions/mean_length": 280.1875,
+      "completions/mean_terminated_length": 272.70965576171875,
+      "completions/min_length": 59.0,
+      "completions/min_terminated_length": 59.0,
+      "epoch": 0.932,
+      "grad_norm": 7.059730052947998,
+      "kl": 0.154541015625,
+      "learning_rate": 1e-06,
+      "loss": 0.2383,
+      "num_tokens": 15176629.0,
+      "reward": 0.03160897642374039,
+      "reward_std": 0.010618302971124649,
+      "rewards/bleu_reward_func/mean": 0.03160897642374039,
+      "rewards/bleu_reward_func/std": 0.016612010076642036,
+      "step": 1165
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 295.59375,
+      "completions/mean_terminated_length": 281.16668701171875,
+      "completions/min_length": 32.0,
+      "completions/min_terminated_length": 32.0,
+      "epoch": 0.9328,
+      "grad_norm": 5.280132293701172,
+      "kl": 0.13616943359375,
+      "learning_rate": 1e-06,
+      "loss": 0.3116,
+      "num_tokens": 15188168.0,
+      "reward": 0.03221059590578079,
+      "reward_std": 0.02213170751929283,
+      "rewards/bleu_reward_func/mean": 0.03221059590578079,
+      "rewards/bleu_reward_func/std": 0.03985392674803734,
+      "step": 1166
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 478.0,
+      "completions/mean_length": 227.375,
+      "completions/mean_terminated_length": 132.5,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "epoch": 0.9336,
+      "grad_norm": 6.5651021003723145,
+      "kl": 0.142578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0658,
+      "num_tokens": 15201892.0,
+      "reward": 0.1807694286108017,
+      "reward_std": 0.1409318894147873,
+      "rewards/bleu_reward_func/mean": 0.1807694286108017,
+      "rewards/bleu_reward_func/std": 0.2797906994819641,
+      "step": 1167
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 220.5,
+      "completions/mean_terminated_length": 190.34483337402344,
+      "completions/min_length": 30.0,
+      "completions/min_terminated_length": 30.0,
+      "epoch": 0.9344,
+      "grad_norm": 7.170085430145264,
+      "kl": 0.400634765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0572,
+      "num_tokens": 15212724.0,
+      "reward": 0.1451285183429718,
+      "reward_std": 0.0673985704779625,
+      "rewards/bleu_reward_func/mean": 0.1451285183429718,
+      "rewards/bleu_reward_func/std": 0.12026475369930267,
+      "step": 1168
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 313.75,
+      "completions/mean_terminated_length": 247.6666717529297,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.9352,
+      "grad_norm": 5.603696823120117,
+      "kl": 0.152587890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0616,
+      "num_tokens": 15228428.0,
+      "reward": 0.253650963306427,
+      "reward_std": 0.03022560104727745,
+      "rewards/bleu_reward_func/mean": 0.253650963306427,
+      "rewards/bleu_reward_func/std": 0.3357136845588684,
+      "step": 1169
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 502.0,
+      "completions/mean_length": 255.125,
+      "completions/mean_terminated_length": 154.60870361328125,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.936,
+      "grad_norm": 6.110992431640625,
+      "kl": 0.1976318359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0162,
+      "num_tokens": 15242768.0,
+      "reward": 0.0904015377163887,
+      "reward_std": 0.025095967575907707,
+      "rewards/bleu_reward_func/mean": 0.0904015377163887,
+      "rewards/bleu_reward_func/std": 0.09678779542446136,
+      "step": 1170
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 476.0,
+      "completions/mean_length": 241.625,
+      "completions/mean_terminated_length": 213.65516662597656,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "epoch": 0.9368,
+      "grad_norm": 7.32260799407959,
+      "kl": 0.28955078125,
+      "learning_rate": 1e-06,
+      "loss": 0.2098,
+      "num_tokens": 15256260.0,
+      "reward": 0.09696318954229355,
+      "reward_std": 0.04769141972064972,
+      "rewards/bleu_reward_func/mean": 0.09696318954229355,
+      "rewards/bleu_reward_func/std": 0.07404191046953201,
+      "step": 1171
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 189.96875,
+      "completions/mean_terminated_length": 156.65516662597656,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.9376,
+      "grad_norm": 8.43685531616211,
+      "kl": 0.293212890625,
+      "learning_rate": 1e-06,
+      "loss": -0.1046,
+      "num_tokens": 15269899.0,
+      "reward": 0.09695740044116974,
+      "reward_std": 0.037594642490148544,
+      "rewards/bleu_reward_func/mean": 0.09695740044116974,
+      "rewards/bleu_reward_func/std": 0.05750608071684837,
+      "step": 1172
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 475.0,
+      "completions/mean_length": 343.03125,
+      "completions/mean_terminated_length": 125.78572082519531,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.9384,
+      "grad_norm": 5.582010269165039,
+      "kl": 0.410400390625,
+      "learning_rate": 1e-06,
+      "loss": -0.1385,
+      "num_tokens": 15287108.0,
+      "reward": 0.1451946198940277,
+      "reward_std": 0.03915205970406532,
+      "rewards/bleu_reward_func/mean": 0.1451946198940277,
+      "rewards/bleu_reward_func/std": 0.17974776029586792,
+      "step": 1173
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 501.0,
+      "completions/mean_length": 331.84375,
+      "completions/mean_terminated_length": 261.34783935546875,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.9392,
+      "grad_norm": 11.770851135253906,
+      "kl": 0.3424072265625,
+      "learning_rate": 1e-06,
+      "loss": 0.1869,
+      "num_tokens": 15304535.0,
+      "reward": 0.1413375586271286,
+      "reward_std": 0.06474150717258453,
+      "rewards/bleu_reward_func/mean": 0.1413375586271286,
+      "rewards/bleu_reward_func/std": 0.0782981589436531,
+      "step": 1174
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 459.0,
+      "completions/mean_length": 299.46875,
+      "completions/mean_terminated_length": 188.1428680419922,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.94,
+      "grad_norm": 5.467903137207031,
+      "kl": 0.198486328125,
+      "learning_rate": 1e-06,
+      "loss": -0.1137,
+      "num_tokens": 15317078.0,
+      "reward": 0.14337725937366486,
+      "reward_std": 0.03686724230647087,
+      "rewards/bleu_reward_func/mean": 0.14337725937366486,
+      "rewards/bleu_reward_func/std": 0.16096609830856323,
+      "step": 1175
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 484.0,
+      "completions/mean_length": 253.8125,
+      "completions/mean_terminated_length": 181.51998901367188,
+      "completions/min_length": 29.0,
+      "completions/min_terminated_length": 29.0,
+      "epoch": 0.9408,
+      "grad_norm": 8.620965957641602,
+      "kl": 0.379150390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0426,
+      "num_tokens": 15328768.0,
+      "reward": 0.1426527500152588,
+      "reward_std": 0.0550708994269371,
+      "rewards/bleu_reward_func/mean": 0.1426527500152588,
+      "rewards/bleu_reward_func/std": 0.12621666491031647,
+      "step": 1176
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 421.0,
+      "completions/mean_length": 193.21875,
+      "completions/mean_terminated_length": 86.95833587646484,
+      "completions/min_length": 6.0,
+      "completions/min_terminated_length": 6.0,
+      "epoch": 0.9416,
+      "grad_norm": 10.362127304077148,
+      "kl": 0.565673828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0348,
+      "num_tokens": 15340919.0,
+      "reward": 0.04140050709247589,
+      "reward_std": 0.019718483090400696,
+      "rewards/bleu_reward_func/mean": 0.04140050709247589,
+      "rewards/bleu_reward_func/std": 0.03685431182384491,
+      "step": 1177
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 349.5625,
+      "completions/mean_terminated_length": 223.22222900390625,
+      "completions/min_length": 34.0,
+      "completions/min_terminated_length": 34.0,
+      "epoch": 0.9424,
+      "grad_norm": 5.097819805145264,
+      "kl": 0.2220458984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0039,
+      "num_tokens": 15356545.0,
+      "reward": 0.14949634671211243,
+      "reward_std": 0.05013212561607361,
+      "rewards/bleu_reward_func/mean": 0.14949634671211243,
+      "rewards/bleu_reward_func/std": 0.19787877798080444,
+      "step": 1178
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 403.0,
+      "completions/mean_length": 160.03125,
+      "completions/mean_terminated_length": 123.62068939208984,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.9432,
+      "grad_norm": 10.772759437561035,
+      "kl": 0.300537109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0346,
+      "num_tokens": 15364554.0,
+      "reward": 0.1290975958108902,
+      "reward_std": 0.07744569331407547,
+      "rewards/bleu_reward_func/mean": 0.1290975958108902,
+      "rewards/bleu_reward_func/std": 0.1356077641248703,
+      "step": 1179
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 493.0,
+      "completions/mean_length": 378.1875,
+      "completions/mean_terminated_length": 122.72727966308594,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.944,
+      "grad_norm": 6.254019260406494,
+      "kl": 0.2237548828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0559,
+      "num_tokens": 15382792.0,
+      "reward": 0.11257205903530121,
+      "reward_std": 0.036544833332300186,
+      "rewards/bleu_reward_func/mean": 0.11257205903530121,
+      "rewards/bleu_reward_func/std": 0.10338166356086731,
+      "step": 1180
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 214.0,
+      "completions/mean_length": 185.75,
+      "completions/mean_terminated_length": 37.45454788208008,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.9448,
+      "grad_norm": 30.484220504760742,
+      "kl": 1.228515625,
+      "learning_rate": 1e-06,
+      "loss": 0.7897,
+      "num_tokens": 15391000.0,
+      "reward": 0.2840408384799957,
+      "reward_std": 0.15822984278202057,
+      "rewards/bleu_reward_func/mean": 0.2840408384799957,
+      "rewards/bleu_reward_func/std": 0.1896887719631195,
+      "step": 1181
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 500.0,
+      "completions/mean_length": 237.8125,
+      "completions/mean_terminated_length": 130.52174377441406,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.9456,
+      "grad_norm": 18.71758270263672,
+      "kl": 1.376953125,
+      "learning_rate": 1e-06,
+      "loss": 0.403,
+      "num_tokens": 15401594.0,
+      "reward": 0.16319331526756287,
+      "reward_std": 0.07705336064100266,
+      "rewards/bleu_reward_func/mean": 0.16319331526756287,
+      "rewards/bleu_reward_func/std": 0.21551194787025452,
+      "step": 1182
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 500.0,
+      "completions/mean_length": 301.6875,
+      "completions/mean_terminated_length": 175.5,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.9464,
+      "grad_norm": 10.93260383605957,
+      "kl": 0.89013671875,
+      "learning_rate": 1e-06,
+      "loss": 0.073,
+      "num_tokens": 15415696.0,
+      "reward": 0.1893438994884491,
+      "reward_std": 0.050289541482925415,
+      "rewards/bleu_reward_func/mean": 0.1893438994884491,
+      "rewards/bleu_reward_func/std": 0.2888805866241455,
+      "step": 1183
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 490.0,
+      "completions/mean_length": 409.75,
+      "completions/mean_terminated_length": 184.8000030517578,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.9472,
+      "grad_norm": 18.76006317138672,
+      "kl": 0.81097412109375,
+      "learning_rate": 1e-06,
+      "loss": 0.1983,
+      "num_tokens": 15436760.0,
+      "reward": 0.06023106724023819,
+      "reward_std": 0.03375660628080368,
+      "rewards/bleu_reward_func/mean": 0.06023106724023819,
+      "rewards/bleu_reward_func/std": 0.07748028635978699,
+      "step": 1184
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 458.0,
+      "completions/mean_length": 319.875,
+      "completions/mean_terminated_length": 188.42105102539062,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "epoch": 0.948,
+      "grad_norm": 8.845364570617676,
+      "kl": 1.5517578125,
+      "learning_rate": 1e-06,
+      "loss": 0.1722,
+      "num_tokens": 15449196.0,
+      "reward": 0.08687852323055267,
+      "reward_std": 0.02910173125565052,
+      "rewards/bleu_reward_func/mean": 0.08687852323055267,
+      "rewards/bleu_reward_func/std": 0.06549690663814545,
+      "step": 1185
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 245.0,
+      "completions/mean_length": 192.5625,
+      "completions/mean_terminated_length": 86.08333587646484,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "epoch": 0.9488,
+      "grad_norm": 17.081815719604492,
+      "kl": 0.970703125,
+      "learning_rate": 1e-06,
+      "loss": 0.1558,
+      "num_tokens": 15458006.0,
+      "reward": 0.10367533564567566,
+      "reward_std": 0.04477589949965477,
+      "rewards/bleu_reward_func/mean": 0.10367533564567566,
+      "rewards/bleu_reward_func/std": 0.1153038814663887,
+      "step": 1186
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 357.0,
+      "completions/mean_length": 165.3125,
+      "completions/mean_terminated_length": 115.78572082519531,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "epoch": 0.9496,
+      "grad_norm": 18.12542152404785,
+      "kl": 1.48876953125,
+      "learning_rate": 1e-06,
+      "loss": 0.1714,
+      "num_tokens": 15469048.0,
+      "reward": 0.14421464502811432,
+      "reward_std": 0.05387473851442337,
+      "rewards/bleu_reward_func/mean": 0.14421464502811432,
+      "rewards/bleu_reward_func/std": 0.11958708614110947,
+      "step": 1187
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 170.8125,
+      "completions/mean_terminated_length": 92.0769271850586,
+      "completions/min_length": 23.0,
+      "completions/min_terminated_length": 23.0,
+      "epoch": 0.9504,
+      "grad_norm": 19.755910873413086,
+      "kl": 1.64697265625,
+      "learning_rate": 1e-06,
+      "loss": 0.083,
+      "num_tokens": 15478610.0,
+      "reward": 0.11767937242984772,
+      "reward_std": 0.038736552000045776,
+      "rewards/bleu_reward_func/mean": 0.11767937242984772,
+      "rewards/bleu_reward_func/std": 0.14759457111358643,
+      "step": 1188
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 474.0,
+      "completions/mean_length": 303.34375,
+      "completions/mean_terminated_length": 233.7916717529297,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.9512,
+      "grad_norm": 7.415678024291992,
+      "kl": 0.464111328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0998,
+      "num_tokens": 15493469.0,
+      "reward": 0.13566911220550537,
+      "reward_std": 0.03367416933178902,
+      "rewards/bleu_reward_func/mean": 0.13566911220550537,
+      "rewards/bleu_reward_func/std": 0.16353026032447815,
+      "step": 1189
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 481.0,
+      "completions/mean_length": 371.59375,
+      "completions/mean_terminated_length": 307.7727355957031,
+      "completions/min_length": 75.0,
+      "completions/min_terminated_length": 75.0,
+      "epoch": 0.952,
+      "grad_norm": 4.499618053436279,
+      "kl": 0.21990966796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0327,
+      "num_tokens": 15510992.0,
+      "reward": 0.05285275727510452,
+      "reward_std": 0.026856746524572372,
+      "rewards/bleu_reward_func/mean": 0.05285275727510452,
+      "rewards/bleu_reward_func/std": 0.028620464727282524,
+      "step": 1190
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 456.0,
+      "completions/mean_length": 211.5,
+      "completions/mean_terminated_length": 155.8518524169922,
+      "completions/min_length": 29.0,
+      "completions/min_terminated_length": 29.0,
+      "epoch": 0.9528,
+      "grad_norm": 11.282941818237305,
+      "kl": 0.353515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0409,
+      "num_tokens": 15519336.0,
+      "reward": 0.09386638551950455,
+      "reward_std": 0.03402595967054367,
+      "rewards/bleu_reward_func/mean": 0.09386638551950455,
+      "rewards/bleu_reward_func/std": 0.08018817007541656,
+      "step": 1191
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 468.0,
+      "completions/max_terminated_length": 468.0,
+      "completions/mean_length": 53.5625,
+      "completions/mean_terminated_length": 53.5625,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.9536,
+      "grad_norm": 30.713850021362305,
+      "kl": 0.921142578125,
+      "learning_rate": 1e-06,
+      "loss": 0.5322,
+      "num_tokens": 15524298.0,
+      "reward": 0.1515873372554779,
+      "reward_std": 0.03387141600251198,
+      "rewards/bleu_reward_func/mean": 0.1515873372554779,
+      "rewards/bleu_reward_func/std": 0.14795146882534027,
+      "step": 1192
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 307.78125,
+      "completions/mean_terminated_length": 148.94444274902344,
+      "completions/min_length": 56.0,
+      "completions/min_terminated_length": 56.0,
+      "epoch": 0.9544,
+      "grad_norm": 8.172345161437988,
+      "kl": 0.3499755859375,
+      "learning_rate": 1e-06,
+      "loss": -0.1691,
+      "num_tokens": 15539683.0,
+      "reward": 0.06142358481884003,
+      "reward_std": 0.017768073827028275,
+      "rewards/bleu_reward_func/mean": 0.06142358481884003,
+      "rewards/bleu_reward_func/std": 0.02769811637699604,
+      "step": 1193
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 438.0,
+      "completions/mean_length": 59.1875,
+      "completions/mean_terminated_length": 44.58064270019531,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.9552,
+      "grad_norm": 10.542518615722656,
+      "kl": 0.6220703125,
+      "learning_rate": 1e-06,
+      "loss": -0.2357,
+      "num_tokens": 15546073.0,
+      "reward": 0.13228365778923035,
+      "reward_std": 0.05075054615736008,
+      "rewards/bleu_reward_func/mean": 0.13228365778923035,
+      "rewards/bleu_reward_func/std": 0.14825788140296936,
+      "step": 1194
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 351.75,
+      "completions/mean_terminated_length": 255.60000610351562,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.956,
+      "grad_norm": 14.762670516967773,
+      "kl": 0.54180908203125,
+      "learning_rate": 1e-06,
+      "loss": 0.1487,
+      "num_tokens": 15561137.0,
+      "reward": 0.10621648281812668,
+      "reward_std": 0.04206620901823044,
+      "rewards/bleu_reward_func/mean": 0.10621648281812668,
+      "rewards/bleu_reward_func/std": 0.05661296099424362,
+      "step": 1195
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 500.0,
+      "completions/mean_length": 363.1875,
+      "completions/mean_terminated_length": 285.23809814453125,
+      "completions/min_length": 40.0,
+      "completions/min_terminated_length": 40.0,
+      "epoch": 0.9568,
+      "grad_norm": 2.930482864379883,
+      "kl": 0.08624267578125,
+      "learning_rate": 1e-06,
+      "loss": -0.2447,
+      "num_tokens": 15575311.0,
+      "reward": 0.026056351140141487,
+      "reward_std": 0.01072642207145691,
+      "rewards/bleu_reward_func/mean": 0.026056351140141487,
+      "rewards/bleu_reward_func/std": 0.014662106521427631,
+      "step": 1196
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 488.0,
+      "completions/mean_length": 334.6875,
+      "completions/mean_terminated_length": 228.3000030517578,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.9576,
+      "grad_norm": 4.765532493591309,
+      "kl": 0.20489501953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0713,
+      "num_tokens": 15591997.0,
+      "reward": 0.18965111672878265,
+      "reward_std": 0.03347271308302879,
+      "rewards/bleu_reward_func/mean": 0.18965111672878265,
+      "rewards/bleu_reward_func/std": 0.24074162542819977,
+      "step": 1197
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 496.0,
+      "completions/mean_length": 206.71875,
+      "completions/mean_terminated_length": 186.36668395996094,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.9584,
+      "grad_norm": 9.595136642456055,
+      "kl": 0.4879150390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0063,
+      "num_tokens": 15601852.0,
+      "reward": 0.07087633013725281,
+      "reward_std": 0.024902882054448128,
+      "rewards/bleu_reward_func/mean": 0.07087633013725281,
+      "rewards/bleu_reward_func/std": 0.052057161927223206,
+      "step": 1198
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 477.0,
+      "completions/mean_length": 286.59375,
+      "completions/mean_terminated_length": 223.47999572753906,
+      "completions/min_length": 36.0,
+      "completions/min_terminated_length": 36.0,
+      "epoch": 0.9592,
+      "grad_norm": 5.445902347564697,
+      "kl": 0.289306640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0083,
+      "num_tokens": 15613743.0,
+      "reward": 0.10761390626430511,
+      "reward_std": 0.03891483694314957,
+      "rewards/bleu_reward_func/mean": 0.10761390626430511,
+      "rewards/bleu_reward_func/std": 0.10792107880115509,
+      "step": 1199
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 112.0,
+      "completions/max_terminated_length": 112.0,
+      "completions/mean_length": 29.75,
+      "completions/mean_terminated_length": 29.75,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.96,
+      "grad_norm": 16.023887634277344,
+      "kl": 1.11669921875,
+      "learning_rate": 1e-06,
+      "loss": 0.1468,
+      "num_tokens": 15619671.0,
+      "reward": 0.3471377491950989,
+      "reward_std": 0.05158979445695877,
+      "rewards/bleu_reward_func/mean": 0.3471377491950989,
+      "rewards/bleu_reward_func/std": 0.13238590955734253,
+      "step": 1200
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 451.0,
+      "completions/mean_length": 325.5,
+      "completions/mean_terminated_length": 213.60000610351562,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "epoch": 0.9608,
+      "grad_norm": 5.491018295288086,
+      "kl": 0.32122802734375,
+      "learning_rate": 1e-06,
+      "loss": -0.2311,
+      "num_tokens": 15634047.0,
+      "reward": 0.07880916446447372,
+      "reward_std": 0.030750975012779236,
+      "rewards/bleu_reward_func/mean": 0.07880916446447372,
+      "rewards/bleu_reward_func/std": 0.06850366294384003,
+      "step": 1201
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 510.0,
+      "completions/mean_length": 264.5625,
+      "completions/mean_terminated_length": 248.06668090820312,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.9616,
+      "grad_norm": 4.983512878417969,
+      "kl": 0.1092529296875,
+      "learning_rate": 1e-06,
+      "loss": -0.1154,
+      "num_tokens": 15647369.0,
+      "reward": 0.09226585179567337,
+      "reward_std": 0.04809027165174484,
+      "rewards/bleu_reward_func/mean": 0.09226585179567337,
+      "rewards/bleu_reward_func/std": 0.14570128917694092,
+      "step": 1202
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 419.0,
+      "completions/max_terminated_length": 419.0,
+      "completions/mean_length": 160.71875,
+      "completions/mean_terminated_length": 160.71875,
+      "completions/min_length": 43.0,
+      "completions/min_terminated_length": 43.0,
+      "epoch": 0.9624,
+      "grad_norm": 4.987871170043945,
+      "kl": 0.1322021484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0973,
+      "num_tokens": 15656384.0,
+      "reward": 0.036719270050525665,
+      "reward_std": 0.007080578710883856,
+      "rewards/bleu_reward_func/mean": 0.036719270050525665,
+      "rewards/bleu_reward_func/std": 0.018336299806833267,
+      "step": 1203
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 443.0,
+      "completions/mean_length": 238.03125,
+      "completions/mean_terminated_length": 146.70834350585938,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.9632,
+      "grad_norm": 4.710721492767334,
+      "kl": 0.24249267578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0163,
+      "num_tokens": 15666769.0,
+      "reward": 0.11304133385419846,
+      "reward_std": 0.03101547807455063,
+      "rewards/bleu_reward_func/mean": 0.11304133385419846,
+      "rewards/bleu_reward_func/std": 0.09199430793523788,
+      "step": 1204
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 417.0,
+      "completions/max_terminated_length": 417.0,
+      "completions/mean_length": 129.78125,
+      "completions/mean_terminated_length": 129.78125,
+      "completions/min_length": 5.0,
+      "completions/min_terminated_length": 5.0,
+      "epoch": 0.964,
+      "grad_norm": 9.284921646118164,
+      "kl": 0.5504150390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0268,
+      "num_tokens": 15674122.0,
+      "reward": 0.07591907680034637,
+      "reward_std": 0.03484845906496048,
+      "rewards/bleu_reward_func/mean": 0.07591907680034637,
+      "rewards/bleu_reward_func/std": 0.05640895664691925,
+      "step": 1205
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 482.0,
+      "completions/mean_length": 280.15625,
+      "completions/mean_terminated_length": 158.71429443359375,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.9648,
+      "grad_norm": 5.940964698791504,
+      "kl": 0.234375,
+      "learning_rate": 1e-06,
+      "loss": 0.2716,
+      "num_tokens": 15690119.0,
+      "reward": 0.19026660919189453,
+      "reward_std": 0.06492812931537628,
+      "rewards/bleu_reward_func/mean": 0.19026660919189453,
+      "rewards/bleu_reward_func/std": 0.1680937260389328,
+      "step": 1206
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 258.0,
+      "completions/mean_length": 190.96875,
+      "completions/mean_terminated_length": 131.51852416992188,
+      "completions/min_length": 29.0,
+      "completions/min_terminated_length": 29.0,
+      "epoch": 0.9656,
+      "grad_norm": 4.697986602783203,
+      "kl": 0.17156982421875,
+      "learning_rate": 1e-06,
+      "loss": 0.1839,
+      "num_tokens": 15700302.0,
+      "reward": 0.29912805557250977,
+      "reward_std": 0.05129002407193184,
+      "rewards/bleu_reward_func/mean": 0.29912805557250977,
+      "rewards/bleu_reward_func/std": 0.33928823471069336,
+      "step": 1207
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 403.34375,
+      "completions/mean_terminated_length": 346.4285888671875,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "epoch": 0.9664,
+      "grad_norm": 3.426858901977539,
+      "kl": 0.05426025390625,
+      "learning_rate": 1e-06,
+      "loss": -0.041,
+      "num_tokens": 15718081.0,
+      "reward": 0.10875709354877472,
+      "reward_std": 0.017351722344756126,
+      "rewards/bleu_reward_func/mean": 0.10875709354877472,
+      "rewards/bleu_reward_func/std": 0.1039399579167366,
+      "step": 1208
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 407.0,
+      "completions/mean_length": 236.59375,
+      "completions/mean_terminated_length": 111.40909576416016,
+      "completions/min_length": 35.0,
+      "completions/min_terminated_length": 35.0,
+      "epoch": 0.9672,
+      "grad_norm": 5.4925079345703125,
+      "kl": 0.2030029296875,
+      "learning_rate": 1e-06,
+      "loss": -0.1031,
+      "num_tokens": 15728028.0,
+      "reward": 0.04499085620045662,
+      "reward_std": 0.01842951774597168,
+      "rewards/bleu_reward_func/mean": 0.04499085620045662,
+      "rewards/bleu_reward_func/std": 0.030968643724918365,
+      "step": 1209
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 501.0,
+      "completions/mean_length": 140.625,
+      "completions/mean_terminated_length": 128.64515686035156,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "epoch": 0.968,
+      "grad_norm": 9.575289726257324,
+      "kl": 0.20196533203125,
+      "learning_rate": 1e-06,
+      "loss": 0.1409,
+      "num_tokens": 15735296.0,
+      "reward": 0.07829822599887848,
+      "reward_std": 0.03164747357368469,
+      "rewards/bleu_reward_func/mean": 0.07829822599887848,
+      "rewards/bleu_reward_func/std": 0.07768744975328445,
+      "step": 1210
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 492.0,
+      "completions/mean_length": 338.1875,
+      "completions/mean_terminated_length": 298.0769348144531,
+      "completions/min_length": 46.0,
+      "completions/min_terminated_length": 46.0,
+      "epoch": 0.9688,
+      "grad_norm": 2.519958019256592,
+      "kl": 0.042205810546875,
+      "learning_rate": 1e-06,
+      "loss": 0.2122,
+      "num_tokens": 15750286.0,
+      "reward": 0.11583074182271957,
+      "reward_std": 0.0300702303647995,
+      "rewards/bleu_reward_func/mean": 0.11583074182271957,
+      "rewards/bleu_reward_func/std": 0.0626337081193924,
+      "step": 1211
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 487.0,
+      "completions/mean_length": 202.1875,
+      "completions/mean_terminated_length": 98.91667175292969,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.9696,
+      "grad_norm": 7.297961235046387,
+      "kl": 0.3997802734375,
+      "learning_rate": 1e-06,
+      "loss": 0.1691,
+      "num_tokens": 15759468.0,
+      "reward": 0.14121456444263458,
+      "reward_std": 0.051856689155101776,
+      "rewards/bleu_reward_func/mean": 0.14121456444263458,
+      "rewards/bleu_reward_func/std": 0.1731884926557541,
+      "step": 1212
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 95.0,
+      "completions/mean_length": 183.1875,
+      "completions/mean_terminated_length": 33.727272033691406,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.9704,
+      "grad_norm": 6.750447750091553,
+      "kl": 0.354522705078125,
+      "learning_rate": 1e-06,
+      "loss": 0.3063,
+      "num_tokens": 15771298.0,
+      "reward": 0.20002232491970062,
+      "reward_std": 0.05593840777873993,
+      "rewards/bleu_reward_func/mean": 0.20002232491970062,
+      "rewards/bleu_reward_func/std": 0.1818784922361374,
+      "step": 1213
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 509.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 155.59375,
+      "completions/mean_terminated_length": 155.59375,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.9712,
+      "grad_norm": 8.60672378540039,
+      "kl": 0.29888916015625,
+      "learning_rate": 1e-06,
+      "loss": -0.1436,
+      "num_tokens": 15781949.0,
+      "reward": 0.07949218153953552,
+      "reward_std": 0.030030012130737305,
+      "rewards/bleu_reward_func/mean": 0.07949218153953552,
+      "rewards/bleu_reward_func/std": 0.05354390665888786,
+      "step": 1214
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 293.0,
+      "completions/mean_terminated_length": 193.4545440673828,
+      "completions/min_length": 33.0,
+      "completions/min_terminated_length": 33.0,
+      "epoch": 0.972,
+      "grad_norm": 4.160828590393066,
+      "kl": 0.09783935546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0156,
+      "num_tokens": 15795061.0,
+      "reward": 0.1600840538740158,
+      "reward_std": 0.05235850065946579,
+      "rewards/bleu_reward_func/mean": 0.1600840538740158,
+      "rewards/bleu_reward_func/std": 0.0939527377486229,
+      "step": 1215
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 251.0,
+      "completions/mean_length": 148.8125,
+      "completions/mean_terminated_length": 65.0,
+      "completions/min_length": 31.0,
+      "completions/min_terminated_length": 31.0,
+      "epoch": 0.9728,
+      "grad_norm": 5.613617420196533,
+      "kl": 0.1966552734375,
+      "learning_rate": 1e-06,
+      "loss": 0.011,
+      "num_tokens": 15804343.0,
+      "reward": 0.1325000822544098,
+      "reward_std": 0.054465532302856445,
+      "rewards/bleu_reward_func/mean": 0.1325000822544098,
+      "rewards/bleu_reward_func/std": 0.15841807425022125,
+      "step": 1216
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 389.0,
+      "completions/max_terminated_length": 389.0,
+      "completions/mean_length": 118.34375,
+      "completions/mean_terminated_length": 118.34375,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.9736,
+      "grad_norm": 7.5834503173828125,
+      "kl": 0.2967529296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0808,
+      "num_tokens": 15815658.0,
+      "reward": 0.243885338306427,
+      "reward_std": 0.05274055525660515,
+      "rewards/bleu_reward_func/mean": 0.243885338306427,
+      "rewards/bleu_reward_func/std": 0.14211414754390717,
+      "step": 1217
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 367.8125,
+      "completions/mean_terminated_length": 223.625,
+      "completions/min_length": 31.0,
+      "completions/min_terminated_length": 31.0,
+      "epoch": 0.9744,
+      "grad_norm": 3.8433821201324463,
+      "kl": 0.0970458984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0201,
+      "num_tokens": 15831204.0,
+      "reward": 0.10006400942802429,
+      "reward_std": 0.02605537325143814,
+      "rewards/bleu_reward_func/mean": 0.10006400942802429,
+      "rewards/bleu_reward_func/std": 0.1093517392873764,
+      "step": 1218
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 327.0,
+      "completions/mean_length": 191.90625,
+      "completions/mean_terminated_length": 132.62962341308594,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "epoch": 0.9752,
+      "grad_norm": 25.382192611694336,
+      "kl": 0.375518798828125,
+      "learning_rate": 1e-06,
+      "loss": 0.2354,
+      "num_tokens": 15842769.0,
+      "reward": 0.21496494114398956,
+      "reward_std": 0.08334603905677795,
+      "rewards/bleu_reward_func/mean": 0.21496494114398956,
+      "rewards/bleu_reward_func/std": 0.3287891745567322,
+      "step": 1219
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 405.0,
+      "completions/mean_length": 193.0,
+      "completions/mean_terminated_length": 68.17391204833984,
+      "completions/min_length": 30.0,
+      "completions/min_terminated_length": 30.0,
+      "epoch": 0.976,
+      "grad_norm": 8.422406196594238,
+      "kl": 0.158935546875,
+      "learning_rate": 1e-06,
+      "loss": 0.7192,
+      "num_tokens": 15852937.0,
+      "reward": 0.21234184503555298,
+      "reward_std": 0.10839352756738663,
+      "rewards/bleu_reward_func/mean": 0.21234184503555298,
+      "rewards/bleu_reward_func/std": 0.17191235721111298,
+      "step": 1220
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 458.0,
+      "completions/mean_length": 291.15625,
+      "completions/mean_terminated_length": 250.25926208496094,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.9768,
+      "grad_norm": 3.8467891216278076,
+      "kl": 0.101806640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0472,
+      "num_tokens": 15865934.0,
+      "reward": 0.13597853481769562,
+      "reward_std": 0.034184906631708145,
+      "rewards/bleu_reward_func/mean": 0.13597853481769562,
+      "rewards/bleu_reward_func/std": 0.0799630656838417,
+      "step": 1221
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 268.15625,
+      "completions/mean_terminated_length": 223.0,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "epoch": 0.9776,
+      "grad_norm": 5.5957794189453125,
+      "kl": 0.1470947265625,
+      "learning_rate": 1e-06,
+      "loss": 0.137,
+      "num_tokens": 15881691.0,
+      "reward": 0.11758720874786377,
+      "reward_std": 0.05352931469678879,
+      "rewards/bleu_reward_func/mean": 0.11758720874786377,
+      "rewards/bleu_reward_func/std": 0.08839549124240875,
+      "step": 1222
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 471.0,
+      "completions/mean_length": 364.28125,
+      "completions/mean_terminated_length": 174.35714721679688,
+      "completions/min_length": 45.0,
+      "completions/min_terminated_length": 45.0,
+      "epoch": 0.9784,
+      "grad_norm": 3.5430212020874023,
+      "kl": 0.0980224609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0829,
+      "num_tokens": 15901716.0,
+      "reward": 0.10303943604230881,
+      "reward_std": 0.02435348369181156,
+      "rewards/bleu_reward_func/mean": 0.10303943604230881,
+      "rewards/bleu_reward_func/std": 0.10681937634944916,
+      "step": 1223
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 499.0,
+      "completions/mean_length": 350.09375,
+      "completions/mean_terminated_length": 207.23529052734375,
+      "completions/min_length": 43.0,
+      "completions/min_terminated_length": 43.0,
+      "epoch": 0.9792,
+      "grad_norm": 3.817512035369873,
+      "kl": 0.06549072265625,
+      "learning_rate": 1e-06,
+      "loss": -0.2731,
+      "num_tokens": 15916239.0,
+      "reward": 0.025896022096276283,
+      "reward_std": 0.011423053219914436,
+      "rewards/bleu_reward_func/mean": 0.025896022096276283,
+      "rewards/bleu_reward_func/std": 0.01915143057703972,
+      "step": 1224
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 58.0,
+      "completions/mean_length": 148.59375,
+      "completions/mean_terminated_length": 27.45833396911621,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.98,
+      "grad_norm": 8.734639167785645,
+      "kl": 0.29150390625,
+      "learning_rate": 1e-06,
+      "loss": 0.1203,
+      "num_tokens": 15926002.0,
+      "reward": 0.2994440793991089,
+      "reward_std": 0.08188341557979584,
+      "rewards/bleu_reward_func/mean": 0.2994440793991089,
+      "rewards/bleu_reward_func/std": 0.17080959677696228,
+      "step": 1225
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 478.0,
+      "completions/mean_length": 146.875,
+      "completions/mean_terminated_length": 109.10344696044922,
+      "completions/min_length": 17.0,
+      "completions/min_terminated_length": 17.0,
+      "epoch": 0.9808,
+      "grad_norm": 6.3398637771606445,
+      "kl": 0.203369140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0948,
+      "num_tokens": 15934822.0,
+      "reward": 0.08409433811903,
+      "reward_std": 0.022457323968410492,
+      "rewards/bleu_reward_func/mean": 0.08409433811903,
+      "rewards/bleu_reward_func/std": 0.12822076678276062,
+      "step": 1226
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 467.0,
+      "completions/mean_length": 127.15625,
+      "completions/mean_terminated_length": 114.74193572998047,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.9816,
+      "grad_norm": 5.545539379119873,
+      "kl": 0.157470703125,
+      "learning_rate": 1e-06,
+      "loss": -0.038,
+      "num_tokens": 15941411.0,
+      "reward": 0.291486918926239,
+      "reward_std": 0.01802459917962551,
+      "rewards/bleu_reward_func/mean": 0.291486918926239,
+      "rewards/bleu_reward_func/std": 0.30036208033561707,
+      "step": 1227
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 388.0,
+      "completions/mean_length": 188.1875,
+      "completions/mean_terminated_length": 128.22222900390625,
+      "completions/min_length": 33.0,
+      "completions/min_terminated_length": 33.0,
+      "epoch": 0.9824,
+      "grad_norm": 4.385817050933838,
+      "kl": 0.12310791015625,
+      "learning_rate": 1e-06,
+      "loss": -0.2018,
+      "num_tokens": 15948961.0,
+      "reward": 0.050382573157548904,
+      "reward_std": 0.019635431468486786,
+      "rewards/bleu_reward_func/mean": 0.050382573157548904,
+      "rewards/bleu_reward_func/std": 0.0410877950489521,
+      "step": 1228
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 167.40625,
+      "completions/mean_terminated_length": 131.7586212158203,
+      "completions/min_length": 16.0,
+      "completions/min_terminated_length": 16.0,
+      "epoch": 0.9832,
+      "grad_norm": 6.975030422210693,
+      "kl": 0.20166015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0432,
+      "num_tokens": 15959414.0,
+      "reward": 0.10200367867946625,
+      "reward_std": 0.012780029326677322,
+      "rewards/bleu_reward_func/mean": 0.10200367867946625,
+      "rewards/bleu_reward_func/std": 0.07971282303333282,
+      "step": 1229
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 477.0,
+      "completions/mean_length": 172.25,
+      "completions/mean_terminated_length": 149.60000610351562,
+      "completions/min_length": 32.0,
+      "completions/min_terminated_length": 32.0,
+      "epoch": 0.984,
+      "grad_norm": 58.40047836303711,
+      "kl": 0.6114501953125,
+      "learning_rate": 1e-06,
+      "loss": 0.1056,
+      "num_tokens": 15970374.0,
+      "reward": 0.21096709370613098,
+      "reward_std": 0.05436326563358307,
+      "rewards/bleu_reward_func/mean": 0.21096709370613098,
+      "rewards/bleu_reward_func/std": 0.21129880845546722,
+      "step": 1230
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 99.0,
+      "completions/mean_length": 282.875,
+      "completions/mean_terminated_length": 53.75,
+      "completions/min_length": 29.0,
+      "completions/min_terminated_length": 29.0,
+      "epoch": 0.9848,
+      "grad_norm": 7.031259536743164,
+      "kl": 0.173828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0097,
+      "num_tokens": 15985058.0,
+      "reward": 0.13087573647499084,
+      "reward_std": 0.03848683089017868,
+      "rewards/bleu_reward_func/mean": 0.13087573647499084,
+      "rewards/bleu_reward_func/std": 0.07179337739944458,
+      "step": 1231
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 347.21875,
+      "completions/mean_terminated_length": 282.7391357421875,
+      "completions/min_length": 33.0,
+      "completions/min_terminated_length": 33.0,
+      "epoch": 0.9856,
+      "grad_norm": 13.313501358032227,
+      "kl": 0.432952880859375,
+      "learning_rate": 1e-06,
+      "loss": 0.1537,
+      "num_tokens": 16003825.0,
+      "reward": 0.06137411668896675,
+      "reward_std": 0.036481164395809174,
+      "rewards/bleu_reward_func/mean": 0.06137411668896675,
+      "rewards/bleu_reward_func/std": 0.05317319929599762,
+      "step": 1232
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 198.0,
+      "completions/mean_length": 114.03125,
+      "completions/mean_terminated_length": 87.50000762939453,
+      "completions/min_length": 27.0,
+      "completions/min_terminated_length": 27.0,
+      "epoch": 0.9864,
+      "grad_norm": 12.291428565979004,
+      "kl": 0.4300537109375,
+      "learning_rate": 1e-06,
+      "loss": 0.2836,
+      "num_tokens": 16012714.0,
+      "reward": 0.05898230895400047,
+      "reward_std": 0.024662408977746964,
+      "rewards/bleu_reward_func/mean": 0.05898230895400047,
+      "rewards/bleu_reward_func/std": 0.05822930857539177,
+      "step": 1233
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 255.46875,
+      "completions/mean_terminated_length": 155.0869598388672,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.9872,
+      "grad_norm": 6.375350475311279,
+      "kl": 0.20550537109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0603,
+      "num_tokens": 16023033.0,
+      "reward": 0.12795159220695496,
+      "reward_std": 0.034560851752758026,
+      "rewards/bleu_reward_func/mean": 0.12795159220695496,
+      "rewards/bleu_reward_func/std": 0.05310589075088501,
+      "step": 1234
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 335.375,
+      "completions/mean_terminated_length": 198.0,
+      "completions/min_length": 45.0,
+      "completions/min_terminated_length": 45.0,
+      "epoch": 0.988,
+      "grad_norm": 2.701765775680542,
+      "kl": 0.1783447265625,
+      "learning_rate": 1e-06,
+      "loss": 0.016,
+      "num_tokens": 16037525.0,
+      "reward": 0.08859970420598984,
+      "reward_std": 0.012333719059824944,
+      "rewards/bleu_reward_func/mean": 0.08859970420598984,
+      "rewards/bleu_reward_func/std": 0.05836745351552963,
+      "step": 1235
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 496.0,
+      "completions/mean_length": 368.5625,
+      "completions/mean_terminated_length": 282.5,
+      "completions/min_length": 30.0,
+      "completions/min_terminated_length": 30.0,
+      "epoch": 0.9888,
+      "grad_norm": 3.917327642440796,
+      "kl": 0.1302490234375,
+      "learning_rate": 1e-06,
+      "loss": -0.3326,
+      "num_tokens": 16051695.0,
+      "reward": 0.055808089673519135,
+      "reward_std": 0.020165979862213135,
+      "rewards/bleu_reward_func/mean": 0.055808089673519135,
+      "rewards/bleu_reward_func/std": 0.05799167603254318,
+      "step": 1236
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 496.0,
+      "completions/mean_length": 258.4375,
+      "completions/mean_terminated_length": 187.44000244140625,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.9896,
+      "grad_norm": 11.650849342346191,
+      "kl": 0.3966064453125,
+      "learning_rate": 1e-06,
+      "loss": -0.1616,
+      "num_tokens": 16063021.0,
+      "reward": 0.09570951759815216,
+      "reward_std": 0.041778795421123505,
+      "rewards/bleu_reward_func/mean": 0.09570951759815216,
+      "rewards/bleu_reward_func/std": 0.09836214780807495,
+      "step": 1237
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 463.0,
+      "completions/max_terminated_length": 463.0,
+      "completions/mean_length": 130.71875,
+      "completions/mean_terminated_length": 130.71875,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "epoch": 0.9904,
+      "grad_norm": 6.946449279785156,
+      "kl": 0.3614501953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0432,
+      "num_tokens": 16073188.0,
+      "reward": 0.19768103957176208,
+      "reward_std": 0.05326389521360397,
+      "rewards/bleu_reward_func/mean": 0.19768103957176208,
+      "rewards/bleu_reward_func/std": 0.14096693694591522,
+      "step": 1238
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 153.59375,
+      "completions/mean_terminated_length": 53.23999786376953,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "epoch": 0.9912,
+      "grad_norm": 15.68226432800293,
+      "kl": 0.60394287109375,
+      "learning_rate": 1e-06,
+      "loss": 0.1991,
+      "num_tokens": 16089367.0,
+      "reward": 0.19772392511367798,
+      "reward_std": 0.04295985400676727,
+      "rewards/bleu_reward_func/mean": 0.19772392511367798,
+      "rewards/bleu_reward_func/std": 0.15457068383693695,
+      "step": 1239
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 443.0,
+      "completions/mean_length": 202.1875,
+      "completions/mean_terminated_length": 181.53334045410156,
+      "completions/min_length": 43.0,
+      "completions/min_terminated_length": 43.0,
+      "epoch": 0.992,
+      "grad_norm": 5.087695121765137,
+      "kl": 0.21142578125,
+      "learning_rate": 1e-06,
+      "loss": -0.1565,
+      "num_tokens": 16097533.0,
+      "reward": 0.04568080976605415,
+      "reward_std": 0.0273725725710392,
+      "rewards/bleu_reward_func/mean": 0.04568080976605415,
+      "rewards/bleu_reward_func/std": 0.05127081274986267,
+      "step": 1240
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 149.0,
+      "completions/mean_length": 98.1875,
+      "completions/mean_terminated_length": 55.379310607910156,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "epoch": 0.9928,
+      "grad_norm": 9.800594329833984,
+      "kl": 0.535400390625,
+      "learning_rate": 1e-06,
+      "loss": -0.2267,
+      "num_tokens": 16104579.0,
+      "reward": 0.1168278306722641,
+      "reward_std": 0.04581147059798241,
+      "rewards/bleu_reward_func/mean": 0.1168278306722641,
+      "rewards/bleu_reward_func/std": 0.08855386078357697,
+      "step": 1241
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 301.0,
+      "completions/mean_length": 251.21875,
+      "completions/mean_terminated_length": 48.38888931274414,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "epoch": 0.9936,
+      "grad_norm": 6.915892124176025,
+      "kl": 0.300262451171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0052,
+      "num_tokens": 16117106.0,
+      "reward": 0.21942071616649628,
+      "reward_std": 0.06735092401504517,
+      "rewards/bleu_reward_func/mean": 0.21942071616649628,
+      "rewards/bleu_reward_func/std": 0.1295205056667328,
+      "step": 1242
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 505.0,
+      "completions/mean_length": 386.9375,
+      "completions/mean_terminated_length": 311.8999938964844,
+      "completions/min_length": 31.0,
+      "completions/min_terminated_length": 31.0,
+      "epoch": 0.9944,
+      "grad_norm": 11.362247467041016,
+      "kl": 0.95458984375,
+      "learning_rate": 1e-06,
+      "loss": -0.2592,
+      "num_tokens": 16131824.0,
+      "reward": 0.03232087939977646,
+      "reward_std": 0.018025288358330727,
+      "rewards/bleu_reward_func/mean": 0.03232087939977646,
+      "rewards/bleu_reward_func/std": 0.026756620034575462,
+      "step": 1243
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 229.40625,
+      "completions/mean_terminated_length": 220.29031372070312,
+      "completions/min_length": 33.0,
+      "completions/min_terminated_length": 33.0,
+      "epoch": 0.9952,
+      "grad_norm": 15.675792694091797,
+      "kl": 0.33203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0615,
+      "num_tokens": 16145325.0,
+      "reward": 0.08530285954475403,
+      "reward_std": 0.03364046663045883,
+      "rewards/bleu_reward_func/mean": 0.08530285954475403,
+      "rewards/bleu_reward_func/std": 0.06811228394508362,
+      "step": 1244
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 498.0,
+      "completions/mean_length": 426.5625,
+      "completions/mean_terminated_length": 381.8095397949219,
+      "completions/min_length": 21.0,
+      "completions/min_terminated_length": 21.0,
+      "epoch": 0.996,
+      "grad_norm": 2.0989584922790527,
+      "kl": 0.0555419921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0865,
+      "num_tokens": 16164487.0,
+      "reward": 0.10918224602937698,
+      "reward_std": 0.043439704924821854,
+      "rewards/bleu_reward_func/mean": 0.10918224602937698,
+      "rewards/bleu_reward_func/std": 0.09625791013240814,
+      "step": 1245
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 434.0,
+      "completions/mean_length": 127.90625,
+      "completions/mean_terminated_length": 115.51612854003906,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.9968,
+      "grad_norm": 8.359559059143066,
+      "kl": 0.6103515625,
+      "learning_rate": 1e-06,
+      "loss": 0.2382,
+      "num_tokens": 16172076.0,
+      "reward": 0.0722852572798729,
+      "reward_std": 0.0363241545855999,
+      "rewards/bleu_reward_func/mean": 0.0722852572798729,
+      "rewards/bleu_reward_func/std": 0.05653948336839676,
+      "step": 1246
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 129.0,
+      "completions/max_terminated_length": 129.0,
+      "completions/mean_length": 63.21875,
+      "completions/mean_terminated_length": 63.21875,
+      "completions/min_length": 27.0,
+      "completions/min_terminated_length": 27.0,
+      "epoch": 0.9976,
+      "grad_norm": 14.668207168579102,
+      "kl": 0.6405029296875,
+      "learning_rate": 1e-06,
+      "loss": 0.1996,
+      "num_tokens": 16178059.0,
+      "reward": 0.11710416525602341,
+      "reward_std": 0.044295113533735275,
+      "rewards/bleu_reward_func/mean": 0.11710416525602341,
+      "rewards/bleu_reward_func/std": 0.06186880171298981,
+      "step": 1247
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 474.0,
+      "completions/mean_length": 184.875,
+      "completions/mean_terminated_length": 174.32257080078125,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "epoch": 0.9984,
+      "grad_norm": 9.499897956848145,
+      "kl": 0.4058837890625,
+      "learning_rate": 1e-06,
+      "loss": -0.1047,
+      "num_tokens": 16190527.0,
+      "reward": 0.14165818691253662,
+      "reward_std": 0.042991265654563904,
+      "rewards/bleu_reward_func/mean": 0.14165818691253662,
+      "rewards/bleu_reward_func/std": 0.1511020064353943,
+      "step": 1248
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 464.0,
+      "completions/mean_length": 233.25,
+      "completions/mean_terminated_length": 168.92308044433594,
+      "completions/min_length": 32.0,
+      "completions/min_terminated_length": 32.0,
+      "epoch": 0.9992,
+      "grad_norm": 6.786505222320557,
+      "kl": 0.34844970703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0929,
+      "num_tokens": 16203199.0,
+      "reward": 0.14652788639068604,
+      "reward_std": 0.05133647471666336,
+      "rewards/bleu_reward_func/mean": 0.14652788639068604,
+      "rewards/bleu_reward_func/std": 0.18619418144226074,
+      "step": 1249
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 69.0,
+      "completions/max_terminated_length": 69.0,
+      "completions/mean_length": 39.75,
+      "completions/mean_terminated_length": 39.75,
+      "completions/min_length": 28.0,
+      "completions/min_terminated_length": 28.0,
+      "epoch": 1.0,
+      "grad_norm": 9.613625526428223,
+      "kl": 0.37677001953125,
+      "learning_rate": 1e-06,
+      "loss": 0.061,
+      "num_tokens": 16214113.0,
+      "reward": 0.10052811354398727,
+      "reward_std": 0.05825551599264145,
+      "rewards/bleu_reward_func/mean": 0.10052811354398727,
+      "rewards/bleu_reward_func/std": 0.10802065581083298,
+      "step": 1250
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1250,
+  "num_input_tokens_seen": 16214113,
+  "num_train_epochs": 1,
+  "save_steps": 250,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}