diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,31284 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 262.625, + "completions/mean_terminated_length": 179.5, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.0008, + "grad_norm": 5.598288059234619, + "kl": 0.0005154609680175781, + "learning_rate": 1.5873015873015872e-08, + "loss": 0.0537, + "num_tokens": 15100.0, + "reward": 0.04846250265836716, + "reward_std": 0.06843117624521255, + "rewards/bleu_reward_func/mean": 0.04846250265836716, + "rewards/bleu_reward_func/std": 0.07639143615961075, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 248.09375, + "completions/mean_terminated_length": 128.13636779785156, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.0016, + "grad_norm": 7.323095321655273, + "kl": 0.0005979537963867188, + "learning_rate": 3.1746031746031744e-08, + "loss": 0.2393, + "num_tokens": 31479.0, + "reward": 0.03515050560235977, + "reward_std": 0.0315697155892849, + "rewards/bleu_reward_func/mean": 0.03515050560235977, + "rewards/bleu_reward_func/std": 0.048244670033454895, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 258.34375, + "completions/mean_terminated_length": 159.0869598388672, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.0024, + "grad_norm": 5.801818370819092, + "kl": 0.0008335113525390625, + "learning_rate": 4.7619047619047613e-08, + "loss": 0.2227, + "num_tokens": 47330.0, + "reward": 0.0770750418305397, + "reward_std": 0.05211775749921799, + "rewards/bleu_reward_func/mean": 0.0770750418305397, + "rewards/bleu_reward_func/std": 0.07082299888134003, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 285.59375, + "completions/mean_terminated_length": 197.0, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.0032, + "grad_norm": 6.7342329025268555, + "kl": 0.0007953643798828125, + "learning_rate": 6.349206349206349e-08, + "loss": 0.1714, + "num_tokens": 62101.0, + "reward": 0.05630416050553322, + "reward_std": 0.0387054979801178, + "rewards/bleu_reward_func/mean": 0.05630416050553322, + "rewards/bleu_reward_func/std": 0.05173136293888092, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 312.75, + "completions/mean_terminated_length": 208.38095092773438, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.004, + "grad_norm": 4.261541843414307, + "kl": 0.0007123947143554688, + "learning_rate": 7.936507936507936e-08, + "loss": 0.0096, + "num_tokens": 74629.0, + "reward": 0.03661263734102249, + "reward_std": 0.02765350043773651, + "rewards/bleu_reward_func/mean": 0.03661263734102249, + "rewards/bleu_reward_func/std": 0.05122661218047142, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 269.0625, + "completions/mean_terminated_length": 80.11111450195312, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.0048, + "grad_norm": 25.550201416015625, + "kl": 0.000881195068359375, + "learning_rate": 9.523809523809523e-08, + "loss": -0.1788, + "num_tokens": 91711.0, + "reward": 0.01917407289147377, + "reward_std": 0.014019257389008999, + "rewards/bleu_reward_func/mean": 0.01917407289147377, + "rewards/bleu_reward_func/std": 0.024173468351364136, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 377.8125, + "completions/mean_terminated_length": 259.4117736816406, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.0056, + "grad_norm": 3.7816717624664307, + "kl": 0.0007390975952148438, + "learning_rate": 1.111111111111111e-07, + "loss": -0.2289, + "num_tokens": 107369.0, + "reward": 0.02209433726966381, + "reward_std": 0.011734157800674438, + "rewards/bleu_reward_func/mean": 0.02209433726966381, + "rewards/bleu_reward_func/std": 0.023080473765730858, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 283.5625, + "completions/mean_terminated_length": 146.5, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.0064, + "grad_norm": 4.340329647064209, + "kl": 0.000911712646484375, + "learning_rate": 1.2698412698412698e-07, + "loss": -0.0252, + "num_tokens": 125275.0, + "reward": 0.03392016887664795, + "reward_std": 0.04013249650597572, + "rewards/bleu_reward_func/mean": 0.03392016887664795, + "rewards/bleu_reward_func/std": 0.05353143438696861, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 282.09375, + "completions/mean_terminated_length": 192.13043212890625, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.0072, + "grad_norm": 5.671853542327881, + "kl": 0.000640869140625, + "learning_rate": 1.4285714285714285e-07, + "loss": -0.4792, + "num_tokens": 142190.0, + "reward": 0.02354184165596962, + "reward_std": 0.015565130859613419, + "rewards/bleu_reward_func/mean": 0.02354184165596962, + "rewards/bleu_reward_func/std": 0.02305246703326702, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 421.625, + "completions/mean_terminated_length": 359.78948974609375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.008, + "grad_norm": 3.240866184234619, + "kl": 0.0006823539733886719, + "learning_rate": 1.5873015873015872e-07, + "loss": -0.0021, + "num_tokens": 158282.0, + "reward": 0.02482026070356369, + "reward_std": 0.0131409652531147, + "rewards/bleu_reward_func/mean": 0.02482026070356369, + "rewards/bleu_reward_func/std": 0.015270248055458069, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 345.90625, + "completions/mean_terminated_length": 199.35293579101562, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.0088, + "grad_norm": 3.652275800704956, + "kl": 0.0006260871887207031, + "learning_rate": 1.7460317460317458e-07, + "loss": -0.2852, + "num_tokens": 177455.0, + "reward": 0.03390186280012131, + "reward_std": 0.016770539805293083, + "rewards/bleu_reward_func/mean": 0.03390186280012131, + "rewards/bleu_reward_func/std": 0.04328485205769539, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 275.46875, + "completions/mean_terminated_length": 196.625, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.0096, + "grad_norm": 73.21807098388672, + "kl": 0.0032701492309570312, + "learning_rate": 1.9047619047619045e-07, + "loss": 0.0661, + "num_tokens": 189486.0, + "reward": 0.022345196455717087, + "reward_std": 0.019753258675336838, + "rewards/bleu_reward_func/mean": 0.022345196455717087, + "rewards/bleu_reward_func/std": 0.020975911989808083, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 309.90625, + "completions/mean_terminated_length": 188.65000915527344, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.0104, + "grad_norm": 5.3057379722595215, + "kl": 0.0005865097045898438, + "learning_rate": 2.0634920634920632e-07, + "loss": -0.1972, + "num_tokens": 203691.0, + "reward": 0.031099505722522736, + "reward_std": 0.04415294528007507, + "rewards/bleu_reward_func/mean": 0.031099505722522736, + "rewards/bleu_reward_func/std": 0.05319083109498024, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 211.5625, + "completions/mean_terminated_length": 127.43999481201172, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.0112, + "grad_norm": 10.997786521911621, + "kl": 0.0007891654968261719, + "learning_rate": 2.222222222222222e-07, + "loss": 0.005, + "num_tokens": 220117.0, + "reward": 0.07334433495998383, + "reward_std": 0.05255947634577751, + "rewards/bleu_reward_func/mean": 0.07334433495998383, + "rewards/bleu_reward_func/std": 0.11127088218927383, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 322.71875, + "completions/mean_terminated_length": 175.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.012, + "grad_norm": 4.558916091918945, + "kl": 0.000732421875, + "learning_rate": 2.3809523809523806e-07, + "loss": -0.1845, + "num_tokens": 232508.0, + "reward": 0.01538888644427061, + "reward_std": 0.012768322601914406, + "rewards/bleu_reward_func/mean": 0.01538888644427061, + "rewards/bleu_reward_func/std": 0.01415330171585083, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 266.78125, + "completions/mean_terminated_length": 138.33334350585938, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.0128, + "grad_norm": 4.418691158294678, + "kl": 0.0009031295776367188, + "learning_rate": 2.5396825396825396e-07, + "loss": 0.2931, + "num_tokens": 246325.0, + "reward": 0.04519380256533623, + "reward_std": 0.047629594802856445, + "rewards/bleu_reward_func/mean": 0.04519380256533623, + "rewards/bleu_reward_func/std": 0.09796681255102158, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 375.375, + "completions/mean_terminated_length": 238.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.0136, + "grad_norm": 4.360379219055176, + "kl": 0.0007829666137695312, + "learning_rate": 2.698412698412698e-07, + "loss": 0.0392, + "num_tokens": 262393.0, + "reward": 0.02785748988389969, + "reward_std": 0.02370397374033928, + "rewards/bleu_reward_func/mean": 0.02785748988389969, + "rewards/bleu_reward_func/std": 0.031648874282836914, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 341.625, + "completions/mean_terminated_length": 171.25, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.0144, + "grad_norm": 4.028530597686768, + "kl": 0.0006041526794433594, + "learning_rate": 2.857142857142857e-07, + "loss": -0.0867, + "num_tokens": 276509.0, + "reward": 0.03313319757580757, + "reward_std": 0.026780985295772552, + "rewards/bleu_reward_func/mean": 0.03313319757580757, + "rewards/bleu_reward_func/std": 0.03177988529205322, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 323.78125, + "completions/mean_terminated_length": 250.13043212890625, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.0152, + "grad_norm": 6.029330253601074, + "kl": 0.0007543563842773438, + "learning_rate": 3.0158730158730156e-07, + "loss": 0.2177, + "num_tokens": 288774.0, + "reward": 0.04934918135404587, + "reward_std": 0.035659849643707275, + "rewards/bleu_reward_func/mean": 0.04934918135404587, + "rewards/bleu_reward_func/std": 0.046043358743190765, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 365.5625, + "completions/mean_terminated_length": 251.6666717529297, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.016, + "grad_norm": 3.2459499835968018, + "kl": 0.0007076263427734375, + "learning_rate": 3.1746031746031743e-07, + "loss": -0.1034, + "num_tokens": 302384.0, + "reward": 0.045273810625076294, + "reward_std": 0.033148057758808136, + "rewards/bleu_reward_func/mean": 0.045273810625076294, + "rewards/bleu_reward_func/std": 0.05641715228557587, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 309.4375, + "completions/mean_terminated_length": 170.84210205078125, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.0168, + "grad_norm": 3.209543228149414, + "kl": 0.0006561279296875, + "learning_rate": 3.333333333333333e-07, + "loss": -0.0094, + "num_tokens": 317406.0, + "reward": 0.10972930490970612, + "reward_std": 0.09467534720897675, + "rewards/bleu_reward_func/mean": 0.10972930490970612, + "rewards/bleu_reward_func/std": 0.1834246814250946, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 276.4375, + "completions/mean_terminated_length": 169.3636474609375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.0176, + "grad_norm": 6.837025165557861, + "kl": 0.0008249282836914062, + "learning_rate": 3.4920634920634917e-07, + "loss": 0.192, + "num_tokens": 331436.0, + "reward": 0.08987575769424438, + "reward_std": 0.03435216099023819, + "rewards/bleu_reward_func/mean": 0.08987575769424438, + "rewards/bleu_reward_func/std": 0.13043095171451569, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 392.46875, + "completions/mean_terminated_length": 272.9375, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.0184, + "grad_norm": 6.737916946411133, + "kl": 0.00086212158203125, + "learning_rate": 3.6507936507936504e-07, + "loss": -0.0441, + "num_tokens": 349715.0, + "reward": 0.027110569179058075, + "reward_std": 0.01938316598534584, + "rewards/bleu_reward_func/mean": 0.027110569179058075, + "rewards/bleu_reward_func/std": 0.021934401243925095, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 274.65625, + "completions/mean_terminated_length": 230.70370483398438, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.0192, + "grad_norm": 10.491765022277832, + "kl": 0.0007648468017578125, + "learning_rate": 3.809523809523809e-07, + "loss": 0.269, + "num_tokens": 360336.0, + "reward": 0.03281049802899361, + "reward_std": 0.023013217374682426, + "rewards/bleu_reward_func/mean": 0.03281049802899361, + "rewards/bleu_reward_func/std": 0.026025522500276566, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 269.59375, + "completions/mean_terminated_length": 103.7368392944336, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.02, + "grad_norm": 5.670685291290283, + "kl": 0.0008592605590820312, + "learning_rate": 3.968253968253968e-07, + "loss": 0.2917, + "num_tokens": 374179.0, + "reward": 0.04281582683324814, + "reward_std": 0.0440773144364357, + "rewards/bleu_reward_func/mean": 0.04281582683324814, + "rewards/bleu_reward_func/std": 0.0797559842467308, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 296.96875, + "completions/mean_terminated_length": 184.33334350585938, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.0208, + "grad_norm": 6.63213586807251, + "kl": 0.0011281967163085938, + "learning_rate": 4.1269841269841265e-07, + "loss": 0.0991, + "num_tokens": 386458.0, + "reward": 0.07768785208463669, + "reward_std": 0.08760131150484085, + "rewards/bleu_reward_func/mean": 0.07768785208463669, + "rewards/bleu_reward_func/std": 0.12583571672439575, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 280.4375, + "completions/mean_terminated_length": 175.18182373046875, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.0216, + "grad_norm": 5.1314802169799805, + "kl": 0.0008611679077148438, + "learning_rate": 4.285714285714285e-07, + "loss": 0.2129, + "num_tokens": 399600.0, + "reward": 0.034803349524736404, + "reward_std": 0.033125463873147964, + "rewards/bleu_reward_func/mean": 0.034803349524736404, + "rewards/bleu_reward_func/std": 0.04297792166471481, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 403.625, + "completions/mean_terminated_length": 196.72727966308594, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.0224, + "grad_norm": 2.8215885162353516, + "kl": 0.0007638931274414062, + "learning_rate": 4.444444444444444e-07, + "loss": -0.2953, + "num_tokens": 415372.0, + "reward": 0.02452818863093853, + "reward_std": 0.018821807578206062, + "rewards/bleu_reward_func/mean": 0.02452818863093853, + "rewards/bleu_reward_func/std": 0.03300207853317261, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 213.84375, + "completions/mean_terminated_length": 114.45833587646484, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.0232, + "grad_norm": 7.012094020843506, + "kl": 0.0006189346313476562, + "learning_rate": 4.6031746031746025e-07, + "loss": 0.119, + "num_tokens": 428351.0, + "reward": 0.055403269827365875, + "reward_std": 0.06412488222122192, + "rewards/bleu_reward_func/mean": 0.055403269827365875, + "rewards/bleu_reward_func/std": 0.07173087447881699, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 374.28125, + "completions/mean_terminated_length": 291.6499938964844, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.024, + "grad_norm": 5.251861095428467, + "kl": 0.0007734298706054688, + "learning_rate": 4.761904761904761e-07, + "loss": 0.0396, + "num_tokens": 443856.0, + "reward": 0.033150382339954376, + "reward_std": 0.029685020446777344, + "rewards/bleu_reward_func/mean": 0.033150382339954376, + "rewards/bleu_reward_func/std": 0.04449395835399628, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 352.6875, + "completions/mean_terminated_length": 212.11764526367188, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.0248, + "grad_norm": 2.8306992053985596, + "kl": 0.0007534027099609375, + "learning_rate": 4.92063492063492e-07, + "loss": 0.0386, + "num_tokens": 458846.0, + "reward": 0.07098191231489182, + "reward_std": 0.07976502180099487, + "rewards/bleu_reward_func/mean": 0.07098191231489182, + "rewards/bleu_reward_func/std": 0.13301755487918854, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 314.96875, + "completions/mean_terminated_length": 249.2916717529297, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.0256, + "grad_norm": 4.18798303604126, + "kl": 0.0013475418090820312, + "learning_rate": 5.079365079365079e-07, + "loss": 0.1184, + "num_tokens": 475693.0, + "reward": 0.06003670394420624, + "reward_std": 0.04762943834066391, + "rewards/bleu_reward_func/mean": 0.06003670394420624, + "rewards/bleu_reward_func/std": 0.06799852848052979, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 246.09375, + "completions/mean_terminated_length": 157.45834350585938, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.0264, + "grad_norm": 7.522784233093262, + "kl": 0.0013055801391601562, + "learning_rate": 5.238095238095238e-07, + "loss": 0.2574, + "num_tokens": 489632.0, + "reward": 0.035463202744722366, + "reward_std": 0.02683849260210991, + "rewards/bleu_reward_func/mean": 0.035463202744722366, + "rewards/bleu_reward_func/std": 0.05300255864858627, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 281.40625, + "completions/mean_terminated_length": 191.17391967773438, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.0272, + "grad_norm": 4.312166213989258, + "kl": 0.0016279220581054688, + "learning_rate": 5.396825396825396e-07, + "loss": 0.0276, + "num_tokens": 503221.0, + "reward": 0.036928486078977585, + "reward_std": 0.030746515840291977, + "rewards/bleu_reward_func/mean": 0.036928486078977585, + "rewards/bleu_reward_func/std": 0.041675370186567307, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 363.0, + "completions/mean_terminated_length": 231.5294189453125, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.028, + "grad_norm": 5.934871196746826, + "kl": 0.001247406005859375, + "learning_rate": 5.555555555555555e-07, + "loss": 0.0007, + "num_tokens": 519629.0, + "reward": 0.02279968000948429, + "reward_std": 0.0171576626598835, + "rewards/bleu_reward_func/mean": 0.02279968000948429, + "rewards/bleu_reward_func/std": 0.02809896320104599, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 245.125, + "completions/mean_terminated_length": 227.33334350585938, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.0288, + "grad_norm": 6.152184963226318, + "kl": 0.00135040283203125, + "learning_rate": 5.714285714285714e-07, + "loss": 0.1277, + "num_tokens": 531009.0, + "reward": 0.08614860475063324, + "reward_std": 0.05592390149831772, + "rewards/bleu_reward_func/mean": 0.08614860475063324, + "rewards/bleu_reward_func/std": 0.07292494177818298, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 315.46875, + "completions/mean_terminated_length": 226.13636779785156, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.0296, + "grad_norm": 4.999361991882324, + "kl": 0.0010061264038085938, + "learning_rate": 5.873015873015873e-07, + "loss": -0.1945, + "num_tokens": 553904.0, + "reward": 0.022978566586971283, + "reward_std": 0.0320000983774662, + "rewards/bleu_reward_func/mean": 0.022978566586971283, + "rewards/bleu_reward_func/std": 0.05384916067123413, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 255.125, + "completions/mean_terminated_length": 195.84616088867188, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.0304, + "grad_norm": 12.462119102478027, + "kl": 0.00140380859375, + "learning_rate": 6.031746031746031e-07, + "loss": -0.0499, + "num_tokens": 569980.0, + "reward": 0.06601191312074661, + "reward_std": 0.06571432948112488, + "rewards/bleu_reward_func/mean": 0.06601191312074661, + "rewards/bleu_reward_func/std": 0.11037519574165344, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 350.78125, + "completions/mean_terminated_length": 287.6956481933594, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.0312, + "grad_norm": 4.318572044372559, + "kl": 0.0013608932495117188, + "learning_rate": 6.19047619047619e-07, + "loss": 0.1581, + "num_tokens": 584165.0, + "reward": 0.03686396777629852, + "reward_std": 0.00873212143778801, + "rewards/bleu_reward_func/mean": 0.03686396777629852, + "rewards/bleu_reward_func/std": 0.03987700119614601, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 395.40625, + "completions/mean_terminated_length": 315.631591796875, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.032, + "grad_norm": 2.8483028411865234, + "kl": 0.0015163421630859375, + "learning_rate": 6.349206349206349e-07, + "loss": 0.2409, + "num_tokens": 599602.0, + "reward": 0.012605215422809124, + "reward_std": 0.007717709057033062, + "rewards/bleu_reward_func/mean": 0.012605215422809124, + "rewards/bleu_reward_func/std": 0.008546828292310238, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 306.46875, + "completions/mean_terminated_length": 125.11764526367188, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.0328, + "grad_norm": 15.546673774719238, + "kl": 0.0033998489379882812, + "learning_rate": 6.507936507936507e-07, + "loss": 0.2262, + "num_tokens": 617761.0, + "reward": 0.037311654537916183, + "reward_std": 0.04001215100288391, + "rewards/bleu_reward_func/mean": 0.037311654537916183, + "rewards/bleu_reward_func/std": 0.05116492509841919, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 308.375, + "completions/mean_terminated_length": 77.60000610351562, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.0336, + "grad_norm": 9.35763168334961, + "kl": 0.0030651092529296875, + "learning_rate": 6.666666666666666e-07, + "loss": 0.1783, + "num_tokens": 637541.0, + "reward": 0.06710080057382584, + "reward_std": 0.0418785884976387, + "rewards/bleu_reward_func/mean": 0.06710080057382584, + "rewards/bleu_reward_func/std": 0.09365852922201157, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 336.59375, + "completions/mean_terminated_length": 216.57894897460938, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.0344, + "grad_norm": 4.267389297485352, + "kl": 0.002620697021484375, + "learning_rate": 6.825396825396826e-07, + "loss": -0.0163, + "num_tokens": 650776.0, + "reward": 0.04351692646741867, + "reward_std": 0.03509015589952469, + "rewards/bleu_reward_func/mean": 0.04351692646741867, + "rewards/bleu_reward_func/std": 0.052853576838970184, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 240.5, + "completions/mean_terminated_length": 177.84616088867188, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.0352, + "grad_norm": 112.21627807617188, + "kl": 0.0047245025634765625, + "learning_rate": 6.984126984126983e-07, + "loss": 0.0521, + "num_tokens": 665576.0, + "reward": 0.05281548202037811, + "reward_std": 0.034495480358600616, + "rewards/bleu_reward_func/mean": 0.05281548202037811, + "rewards/bleu_reward_func/std": 0.0704483836889267, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 335.46875, + "completions/mean_terminated_length": 310.25, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.036, + "grad_norm": 30.739818572998047, + "kl": 0.00278472900390625, + "learning_rate": 7.142857142857143e-07, + "loss": -0.1091, + "num_tokens": 678375.0, + "reward": 0.04049266129732132, + "reward_std": 0.020605597645044327, + "rewards/bleu_reward_func/mean": 0.04049266129732132, + "rewards/bleu_reward_func/std": 0.04322003573179245, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 253.28125, + "completions/mean_terminated_length": 205.37037658691406, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.0368, + "grad_norm": 3.857532501220703, + "kl": 0.002582550048828125, + "learning_rate": 7.301587301587301e-07, + "loss": 0.1863, + "num_tokens": 693632.0, + "reward": 0.03602021187543869, + "reward_std": 0.03167928382754326, + "rewards/bleu_reward_func/mean": 0.03602021187543869, + "rewards/bleu_reward_func/std": 0.060269005596637726, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 379.125, + "completions/mean_terminated_length": 208.2857208251953, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.0376, + "grad_norm": 44.705684661865234, + "kl": 0.002960205078125, + "learning_rate": 7.46031746031746e-07, + "loss": -0.2537, + "num_tokens": 712244.0, + "reward": 0.009683560580015182, + "reward_std": 0.007736856117844582, + "rewards/bleu_reward_func/mean": 0.009683560580015182, + "rewards/bleu_reward_func/std": 0.010262547992169857, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 209.46875, + "completions/mean_terminated_length": 153.44444274902344, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.0384, + "grad_norm": 4.927426815032959, + "kl": 0.010406494140625, + "learning_rate": 7.619047619047618e-07, + "loss": 0.2249, + "num_tokens": 722779.0, + "reward": 0.06434739381074905, + "reward_std": 0.062096044421195984, + "rewards/bleu_reward_func/mean": 0.06434739381074905, + "rewards/bleu_reward_func/std": 0.07261113822460175, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 319.53125, + "completions/mean_terminated_length": 187.84210205078125, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.0392, + "grad_norm": 6.036177158355713, + "kl": 0.0051422119140625, + "learning_rate": 7.777777777777778e-07, + "loss": 0.2132, + "num_tokens": 735892.0, + "reward": 0.0316137932240963, + "reward_std": 0.028243713080883026, + "rewards/bleu_reward_func/mean": 0.0316137932240963, + "rewards/bleu_reward_func/std": 0.032289810478687286, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 329.0625, + "completions/mean_terminated_length": 233.23809814453125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.04, + "grad_norm": 6.000904560089111, + "kl": 0.0041103363037109375, + "learning_rate": 7.936507936507936e-07, + "loss": 0.164, + "num_tokens": 748926.0, + "reward": 0.031059542670845985, + "reward_std": 0.02046222612261772, + "rewards/bleu_reward_func/mean": 0.031059542670845985, + "rewards/bleu_reward_func/std": 0.029215287417173386, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 240.03125, + "completions/mean_terminated_length": 201.17857360839844, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.0408, + "grad_norm": 4.45400333404541, + "kl": 0.00446319580078125, + "learning_rate": 8.095238095238095e-07, + "loss": 0.095, + "num_tokens": 763935.0, + "reward": 0.06022896245121956, + "reward_std": 0.04401791840791702, + "rewards/bleu_reward_func/mean": 0.06022896245121956, + "rewards/bleu_reward_func/std": 0.06288844347000122, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 308.875, + "completions/mean_terminated_length": 169.89474487304688, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.0416, + "grad_norm": 6.896392822265625, + "kl": 0.00627899169921875, + "learning_rate": 8.253968253968253e-07, + "loss": 0.1302, + "num_tokens": 781619.0, + "reward": 0.02847466617822647, + "reward_std": 0.024918708950281143, + "rewards/bleu_reward_func/mean": 0.02847466617822647, + "rewards/bleu_reward_func/std": 0.03209677338600159, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 309.9375, + "completions/mean_terminated_length": 263.3077087402344, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.0424, + "grad_norm": 5.093250751495361, + "kl": 0.004283905029296875, + "learning_rate": 8.412698412698413e-07, + "loss": -0.0777, + "num_tokens": 795977.0, + "reward": 0.07096201926469803, + "reward_std": 0.06636855751276016, + "rewards/bleu_reward_func/mean": 0.07096201926469803, + "rewards/bleu_reward_func/std": 0.09039857983589172, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 386.4375, + "completions/mean_terminated_length": 202.92308044433594, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.0432, + "grad_norm": 3.2264180183410645, + "kl": 0.0028362274169921875, + "learning_rate": 8.57142857142857e-07, + "loss": -0.0863, + "num_tokens": 814135.0, + "reward": 0.014086933806538582, + "reward_std": 0.013363949954509735, + "rewards/bleu_reward_func/mean": 0.014086933806538582, + "rewards/bleu_reward_func/std": 0.01598522998392582, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 244.59375, + "completions/mean_terminated_length": 195.07408142089844, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.044, + "grad_norm": 8.706979751586914, + "kl": 0.00571441650390625, + "learning_rate": 8.73015873015873e-07, + "loss": 0.1609, + "num_tokens": 827226.0, + "reward": 0.0647934228181839, + "reward_std": 0.0345802828669548, + "rewards/bleu_reward_func/mean": 0.0647934228181839, + "rewards/bleu_reward_func/std": 0.04030924290418625, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 126.0625, + "completions/mean_terminated_length": 113.61289978027344, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.0448, + "grad_norm": 8.598736763000488, + "kl": 0.0142974853515625, + "learning_rate": 8.888888888888888e-07, + "loss": 0.1419, + "num_tokens": 834268.0, + "reward": 0.04880748316645622, + "reward_std": 0.042880259454250336, + "rewards/bleu_reward_func/mean": 0.04880748316645622, + "rewards/bleu_reward_func/std": 0.05060458555817604, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 302.03125, + "completions/mean_terminated_length": 219.86956787109375, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.0456, + "grad_norm": 6.926377296447754, + "kl": 0.009532928466796875, + "learning_rate": 9.047619047619047e-07, + "loss": -0.0374, + "num_tokens": 851701.0, + "reward": 0.06913506239652634, + "reward_std": 0.04138587415218353, + "rewards/bleu_reward_func/mean": 0.06913506239652634, + "rewards/bleu_reward_func/std": 0.0750163346529007, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 211.40625, + "completions/mean_terminated_length": 127.23999786376953, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.0464, + "grad_norm": 7.853041648864746, + "kl": 0.0272064208984375, + "learning_rate": 9.206349206349205e-07, + "loss": 0.4023, + "num_tokens": 865434.0, + "reward": 0.12499310076236725, + "reward_std": 0.08980046212673187, + "rewards/bleu_reward_func/mean": 0.12499310076236725, + "rewards/bleu_reward_func/std": 0.13493874669075012, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 390.65625, + "completions/mean_terminated_length": 283.5882263183594, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.0472, + "grad_norm": 2.4230539798736572, + "kl": 0.003765106201171875, + "learning_rate": 9.365079365079365e-07, + "loss": -0.0092, + "num_tokens": 884815.0, + "reward": 0.021261584013700485, + "reward_std": 0.027461236342787743, + "rewards/bleu_reward_func/mean": 0.021261584013700485, + "rewards/bleu_reward_func/std": 0.03110821731388569, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 281.59375, + "completions/mean_terminated_length": 191.43478393554688, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.048, + "grad_norm": 2.9367215633392334, + "kl": 0.00628662109375, + "learning_rate": 9.523809523809522e-07, + "loss": 0.2021, + "num_tokens": 896810.0, + "reward": 0.023613639175891876, + "reward_std": 0.02252291887998581, + "rewards/bleu_reward_func/mean": 0.023613639175891876, + "rewards/bleu_reward_func/std": 0.041281431913375854, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 227.9629669189453, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.0488, + "grad_norm": 3.998281717300415, + "kl": 0.0076904296875, + "learning_rate": 9.682539682539682e-07, + "loss": -0.1513, + "num_tokens": 907349.0, + "reward": 0.07193129509687424, + "reward_std": 0.05195175111293793, + "rewards/bleu_reward_func/mean": 0.07193129509687424, + "rewards/bleu_reward_func/std": 0.07358168065547943, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 82.25, + "completions/mean_terminated_length": 82.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.0496, + "grad_norm": 12.274957656860352, + "kl": 0.049835205078125, + "learning_rate": 9.84126984126984e-07, + "loss": 0.0211, + "num_tokens": 916605.0, + "reward": 0.1968570053577423, + "reward_std": 0.09575757384300232, + "rewards/bleu_reward_func/mean": 0.1968570053577423, + "rewards/bleu_reward_func/std": 0.14971531927585602, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 209.71875, + "completions/mean_terminated_length": 178.44827270507812, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.0504, + "grad_norm": 4.69417142868042, + "kl": 0.0074615478515625, + "learning_rate": 1e-06, + "loss": -0.0102, + "num_tokens": 924772.0, + "reward": 0.026346374303102493, + "reward_std": 0.015668006613850594, + "rewards/bleu_reward_func/mean": 0.026346374303102493, + "rewards/bleu_reward_func/std": 0.016677534207701683, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 147.34375, + "completions/mean_terminated_length": 95.25000762939453, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.0512, + "grad_norm": 7.854241371154785, + "kl": 0.019744873046875, + "learning_rate": 1e-06, + "loss": 0.1402, + "num_tokens": 932879.0, + "reward": 0.039189111441373825, + "reward_std": 0.034408073872327805, + "rewards/bleu_reward_func/mean": 0.039189111441373825, + "rewards/bleu_reward_func/std": 0.06643246859312057, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 217.53125, + "completions/mean_terminated_length": 163.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.052, + "grad_norm": 5.94617223739624, + "kl": 0.012752532958984375, + "learning_rate": 1e-06, + "loss": 0.0629, + "num_tokens": 944744.0, + "reward": 0.0992283821105957, + "reward_std": 0.04174066707491875, + "rewards/bleu_reward_func/mean": 0.0992283821105957, + "rewards/bleu_reward_func/std": 0.14538165926933289, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 360.84375, + "completions/mean_terminated_length": 281.66668701171875, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.0528, + "grad_norm": 2.7164971828460693, + "kl": 0.00780487060546875, + "learning_rate": 1e-06, + "loss": -0.1815, + "num_tokens": 959043.0, + "reward": 0.03164489567279816, + "reward_std": 0.024089161306619644, + "rewards/bleu_reward_func/mean": 0.03164489567279816, + "rewards/bleu_reward_func/std": 0.03230883181095123, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 174.375, + "completions/mean_terminated_length": 79.83999633789062, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.0536, + "grad_norm": 8.954367637634277, + "kl": 0.040679931640625, + "learning_rate": 1e-06, + "loss": 0.4022, + "num_tokens": 970487.0, + "reward": 0.1188623458147049, + "reward_std": 0.06528393179178238, + "rewards/bleu_reward_func/mean": 0.1188623458147049, + "rewards/bleu_reward_func/std": 0.10126637667417526, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 249.78125, + "completions/mean_terminated_length": 112.42857360839844, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.0544, + "grad_norm": 8.929741859436035, + "kl": 0.01055908203125, + "learning_rate": 1e-06, + "loss": -0.3676, + "num_tokens": 980432.0, + "reward": 0.04680415242910385, + "reward_std": 0.015473801642656326, + "rewards/bleu_reward_func/mean": 0.04680415242910385, + "rewards/bleu_reward_func/std": 0.05666949972510338, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 200.28125, + "completions/mean_terminated_length": 155.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.0552, + "grad_norm": 19.934701919555664, + "kl": 0.036396026611328125, + "learning_rate": 1e-06, + "loss": 0.1105, + "num_tokens": 988713.0, + "reward": 0.03349726274609566, + "reward_std": 0.007375569082796574, + "rewards/bleu_reward_func/mean": 0.03349726274609566, + "rewards/bleu_reward_func/std": 0.0360921286046505, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 295.1875, + "completions/mean_terminated_length": 146.84210205078125, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.056, + "grad_norm": 3.6616406440734863, + "kl": 0.0112762451171875, + "learning_rate": 1e-06, + "loss": 0.081, + "num_tokens": 1002319.0, + "reward": 0.016106903553009033, + "reward_std": 0.008415726944804192, + "rewards/bleu_reward_func/mean": 0.016106903553009033, + "rewards/bleu_reward_func/std": 0.012413726188242435, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 270.1875, + "completions/mean_terminated_length": 235.6428680419922, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.0568, + "grad_norm": 11.310477256774902, + "kl": 0.02431488037109375, + "learning_rate": 1e-06, + "loss": 0.0216, + "num_tokens": 1014117.0, + "reward": 0.09336908906698227, + "reward_std": 0.04001408815383911, + "rewards/bleu_reward_func/mean": 0.09336908906698227, + "rewards/bleu_reward_func/std": 0.04507448151707649, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 298.96875, + "completions/mean_terminated_length": 187.38095092773438, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.0576, + "grad_norm": 11.831945419311523, + "kl": 0.0236358642578125, + "learning_rate": 1e-06, + "loss": 0.2398, + "num_tokens": 1029324.0, + "reward": 0.06671467423439026, + "reward_std": 0.07224421948194504, + "rewards/bleu_reward_func/mean": 0.06671467423439026, + "rewards/bleu_reward_func/std": 0.09839192777872086, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 262.28125, + "completions/mean_terminated_length": 245.6333465576172, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.0584, + "grad_norm": 3.4266369342803955, + "kl": 0.01332855224609375, + "learning_rate": 1e-06, + "loss": -0.117, + "num_tokens": 1040677.0, + "reward": 0.048909105360507965, + "reward_std": 0.01749919354915619, + "rewards/bleu_reward_func/mean": 0.048909105360507965, + "rewards/bleu_reward_func/std": 0.046220190823078156, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 315.78125, + "completions/mean_terminated_length": 213.0, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.0592, + "grad_norm": 3.334998369216919, + "kl": 0.030864715576171875, + "learning_rate": 1e-06, + "loss": 0.0762, + "num_tokens": 1057350.0, + "reward": 0.06654933840036392, + "reward_std": 0.030867960304021835, + "rewards/bleu_reward_func/mean": 0.06654933840036392, + "rewards/bleu_reward_func/std": 0.04364337399601936, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 287.4375, + "completions/mean_terminated_length": 235.61538696289062, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.06, + "grad_norm": 12.321810722351074, + "kl": 0.05252838134765625, + "learning_rate": 1e-06, + "loss": 0.1111, + "num_tokens": 1072668.0, + "reward": 0.07815341651439667, + "reward_std": 0.05233295261859894, + "rewards/bleu_reward_func/mean": 0.07815341651439667, + "rewards/bleu_reward_func/std": 0.0646696388721466, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 306.3125, + "completions/mean_terminated_length": 182.90000915527344, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.0608, + "grad_norm": 3.883251905441284, + "kl": 0.0373382568359375, + "learning_rate": 1e-06, + "loss": 0.1517, + "num_tokens": 1086694.0, + "reward": 0.06417744606733322, + "reward_std": 0.034075379371643066, + "rewards/bleu_reward_func/mean": 0.06417744606733322, + "rewards/bleu_reward_func/std": 0.049788232892751694, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 196.4375, + "completions/mean_terminated_length": 196.4375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.0616, + "grad_norm": 6.460638523101807, + "kl": 0.05291748046875, + "learning_rate": 1e-06, + "loss": -0.0849, + "num_tokens": 1097476.0, + "reward": 0.08122064173221588, + "reward_std": 0.03298315033316612, + "rewards/bleu_reward_func/mean": 0.08122064173221588, + "rewards/bleu_reward_func/std": 0.047924816608428955, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 199.15625, + "completions/mean_terminated_length": 166.79310607910156, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.0624, + "grad_norm": 6.63805627822876, + "kl": 0.07642364501953125, + "learning_rate": 1e-06, + "loss": -0.0967, + "num_tokens": 1108793.0, + "reward": 0.07887591421604156, + "reward_std": 0.05435461550951004, + "rewards/bleu_reward_func/mean": 0.07887591421604156, + "rewards/bleu_reward_func/std": 0.10201766341924667, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 193.0, + "completions/mean_terminated_length": 147.42857360839844, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.0632, + "grad_norm": 14.087907791137695, + "kl": 0.0762939453125, + "learning_rate": 1e-06, + "loss": -0.056, + "num_tokens": 1118721.0, + "reward": 0.035933416336774826, + "reward_std": 0.02187356725335121, + "rewards/bleu_reward_func/mean": 0.035933416336774826, + "rewards/bleu_reward_func/std": 0.025764403864741325, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 192.9375, + "completions/mean_terminated_length": 133.8518524169922, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.064, + "grad_norm": 6.712767124176025, + "kl": 0.07209014892578125, + "learning_rate": 1e-06, + "loss": 0.2799, + "num_tokens": 1130095.0, + "reward": 0.04000134766101837, + "reward_std": 0.014790613204240799, + "rewards/bleu_reward_func/mean": 0.04000134766101837, + "rewards/bleu_reward_func/std": 0.028310615569353104, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 395.5625, + "completions/mean_terminated_length": 315.8947448730469, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.0648, + "grad_norm": 3.1348772048950195, + "kl": 0.012363433837890625, + "learning_rate": 1e-06, + "loss": -0.0477, + "num_tokens": 1145009.0, + "reward": 0.05394501984119415, + "reward_std": 0.019456665962934494, + "rewards/bleu_reward_func/mean": 0.05394501984119415, + "rewards/bleu_reward_func/std": 0.05528007075190544, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 297.25, + "completions/mean_terminated_length": 213.21739196777344, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.0656, + "grad_norm": 4.045035362243652, + "kl": 0.02558135986328125, + "learning_rate": 1e-06, + "loss": -0.0948, + "num_tokens": 1156665.0, + "reward": 0.08088956773281097, + "reward_std": 0.031020794063806534, + "rewards/bleu_reward_func/mean": 0.08088956773281097, + "rewards/bleu_reward_func/std": 0.04719265177845955, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 252.65625, + "completions/mean_terminated_length": 204.629638671875, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.0664, + "grad_norm": 7.020449161529541, + "kl": 0.022491455078125, + "learning_rate": 1e-06, + "loss": -0.2084, + "num_tokens": 1170686.0, + "reward": 0.048978567123413086, + "reward_std": 0.014538805931806564, + "rewards/bleu_reward_func/mean": 0.048978567123413086, + "rewards/bleu_reward_func/std": 0.03447263315320015, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 279.53125, + "completions/mean_terminated_length": 157.76190185546875, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.0672, + "grad_norm": 6.09721040725708, + "kl": 0.02556610107421875, + "learning_rate": 1e-06, + "loss": 0.0979, + "num_tokens": 1181927.0, + "reward": 0.07267215847969055, + "reward_std": 0.029872559010982513, + "rewards/bleu_reward_func/mean": 0.07267215847969055, + "rewards/bleu_reward_func/std": 0.05035723000764847, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 281.1875, + "completions/mean_terminated_length": 257.3103332519531, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.068, + "grad_norm": 5.706907272338867, + "kl": 0.030059814453125, + "learning_rate": 1e-06, + "loss": -0.0967, + "num_tokens": 1198797.0, + "reward": 0.05050581321120262, + "reward_std": 0.023779014125466347, + "rewards/bleu_reward_func/mean": 0.05050581321120262, + "rewards/bleu_reward_func/std": 0.03608938306570053, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 163.65625, + "completions/mean_terminated_length": 163.65625, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.0688, + "grad_norm": 7.1789960861206055, + "kl": 0.0573577880859375, + "learning_rate": 1e-06, + "loss": 0.0245, + "num_tokens": 1207106.0, + "reward": 0.07873363792896271, + "reward_std": 0.0395892933011055, + "rewards/bleu_reward_func/mean": 0.07873363792896271, + "rewards/bleu_reward_func/std": 0.0705900639295578, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 304.6875, + "completions/mean_terminated_length": 223.56521606445312, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.0696, + "grad_norm": 3.9068655967712402, + "kl": 0.01955413818359375, + "learning_rate": 1e-06, + "loss": -0.2361, + "num_tokens": 1219760.0, + "reward": 0.03426438570022583, + "reward_std": 0.021733341738581657, + "rewards/bleu_reward_func/mean": 0.03426438570022583, + "rewards/bleu_reward_func/std": 0.031944356858730316, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 137.1875, + "completions/mean_terminated_length": 137.1875, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.0704, + "grad_norm": 7.929437160491943, + "kl": 0.1163482666015625, + "learning_rate": 1e-06, + "loss": -0.0079, + "num_tokens": 1230702.0, + "reward": 0.13437795639038086, + "reward_std": 0.04989761859178543, + "rewards/bleu_reward_func/mean": 0.13437795639038086, + "rewards/bleu_reward_func/std": 0.08757011592388153, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 248.1875, + "completions/mean_terminated_length": 160.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.0712, + "grad_norm": 3.5676372051239014, + "kl": 0.029571533203125, + "learning_rate": 1e-06, + "loss": 0.085, + "num_tokens": 1241748.0, + "reward": 0.06261839717626572, + "reward_std": 0.05303023010492325, + "rewards/bleu_reward_func/mean": 0.06261839717626572, + "rewards/bleu_reward_func/std": 0.07371754199266434, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 199.21875, + "completions/mean_terminated_length": 178.36666870117188, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.072, + "grad_norm": 13.081062316894531, + "kl": 0.0998382568359375, + "learning_rate": 1e-06, + "loss": 0.0303, + "num_tokens": 1254971.0, + "reward": 0.09151400625705719, + "reward_std": 0.049102533608675, + "rewards/bleu_reward_func/mean": 0.09151400625705719, + "rewards/bleu_reward_func/std": 0.08098553121089935, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 277.8125, + "completions/mean_terminated_length": 244.35714721679688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.0728, + "grad_norm": 4.541591167449951, + "kl": 0.0150604248046875, + "learning_rate": 1e-06, + "loss": -0.2243, + "num_tokens": 1268229.0, + "reward": 0.029024727642536163, + "reward_std": 0.02233259379863739, + "rewards/bleu_reward_func/mean": 0.029024727642536163, + "rewards/bleu_reward_func/std": 0.0296621173620224, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 175.875, + "completions/mean_terminated_length": 81.75999450683594, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.0736, + "grad_norm": 12.45702075958252, + "kl": 0.09152984619140625, + "learning_rate": 1e-06, + "loss": 0.3301, + "num_tokens": 1279753.0, + "reward": 0.06008782982826233, + "reward_std": 0.03770461678504944, + "rewards/bleu_reward_func/mean": 0.06008782982826233, + "rewards/bleu_reward_func/std": 0.056894708424806595, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 71.03125, + "completions/mean_terminated_length": 71.03125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.0744, + "grad_norm": 8.271183967590332, + "kl": 0.0682220458984375, + "learning_rate": 1e-06, + "loss": 0.2167, + "num_tokens": 1288762.0, + "reward": 0.17779187858104706, + "reward_std": 0.02900426834821701, + "rewards/bleu_reward_func/mean": 0.17779187858104706, + "rewards/bleu_reward_func/std": 0.1678331196308136, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 132.28125, + "completions/mean_terminated_length": 78.03572082519531, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.0752, + "grad_norm": 45.396934509277344, + "kl": 0.140625, + "learning_rate": 1e-06, + "loss": 0.1526, + "num_tokens": 1299835.0, + "reward": 0.1527654230594635, + "reward_std": 0.061802513897418976, + "rewards/bleu_reward_func/mean": 0.1527654230594635, + "rewards/bleu_reward_func/std": 0.10723396390676498, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 350.8125, + "completions/mean_terminated_length": 266.3809509277344, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.076, + "grad_norm": 4.4763383865356445, + "kl": 0.03443145751953125, + "learning_rate": 1e-06, + "loss": 0.0315, + "num_tokens": 1314877.0, + "reward": 0.08366496115922928, + "reward_std": 0.023002739995718002, + "rewards/bleu_reward_func/mean": 0.08366496115922928, + "rewards/bleu_reward_func/std": 0.07334847003221512, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 237.75, + "completions/mean_terminated_length": 209.37930297851562, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.0768, + "grad_norm": 13.181612968444824, + "kl": 0.0660247802734375, + "learning_rate": 1e-06, + "loss": -0.1104, + "num_tokens": 1326757.0, + "reward": 0.04618287831544876, + "reward_std": 0.022957133129239082, + "rewards/bleu_reward_func/mean": 0.04618287831544876, + "rewards/bleu_reward_func/std": 0.03049774467945099, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 326.71875, + "completions/mean_terminated_length": 254.21739196777344, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.0776, + "grad_norm": 5.014129161834717, + "kl": 0.022705078125, + "learning_rate": 1e-06, + "loss": 0.0227, + "num_tokens": 1340716.0, + "reward": 0.08603382110595703, + "reward_std": 0.022703565657138824, + "rewards/bleu_reward_func/mean": 0.08603382110595703, + "rewards/bleu_reward_func/std": 0.09760169684886932, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 205.03125, + "completions/mean_terminated_length": 195.1290283203125, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.0784, + "grad_norm": 10.452898025512695, + "kl": 0.101318359375, + "learning_rate": 1e-06, + "loss": -0.0956, + "num_tokens": 1354373.0, + "reward": 0.07816646993160248, + "reward_std": 0.03450850397348404, + "rewards/bleu_reward_func/mean": 0.07816646993160248, + "rewards/bleu_reward_func/std": 0.05475042015314102, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 240.4375, + "completions/mean_terminated_length": 190.1481475830078, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.0792, + "grad_norm": 8.82993221282959, + "kl": 0.0435791015625, + "learning_rate": 1e-06, + "loss": -0.1898, + "num_tokens": 1365875.0, + "reward": 0.027829378843307495, + "reward_std": 0.016982190310955048, + "rewards/bleu_reward_func/mean": 0.027829378843307495, + "rewards/bleu_reward_func/std": 0.019511230289936066, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 210.21875, + "completions/mean_terminated_length": 140.57693481445312, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.08, + "grad_norm": 14.261658668518066, + "kl": 0.1817169189453125, + "learning_rate": 1e-06, + "loss": -0.4193, + "num_tokens": 1375586.0, + "reward": 0.0430663600564003, + "reward_std": 0.023313239216804504, + "rewards/bleu_reward_func/mean": 0.0430663600564003, + "rewards/bleu_reward_func/std": 0.0409073531627655, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 210.90625, + "completions/mean_terminated_length": 201.19354248046875, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.0808, + "grad_norm": 7.960334300994873, + "kl": 0.0833740234375, + "learning_rate": 1e-06, + "loss": 0.1121, + "num_tokens": 1384975.0, + "reward": 0.0974574014544487, + "reward_std": 0.03397291898727417, + "rewards/bleu_reward_func/mean": 0.0974574014544487, + "rewards/bleu_reward_func/std": 0.10795393586158752, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 306.21875, + "completions/mean_terminated_length": 225.69566345214844, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.0816, + "grad_norm": 3.8322501182556152, + "kl": 0.0701446533203125, + "learning_rate": 1e-06, + "loss": 0.0153, + "num_tokens": 1403126.0, + "reward": 0.07732782512903214, + "reward_std": 0.038768649101257324, + "rewards/bleu_reward_func/mean": 0.07732782512903214, + "rewards/bleu_reward_func/std": 0.06468553096055984, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 167.8125, + "completions/mean_terminated_length": 144.86666870117188, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.0824, + "grad_norm": 6.311352729797363, + "kl": 0.06109619140625, + "learning_rate": 1e-06, + "loss": 0.1029, + "num_tokens": 1417360.0, + "reward": 0.23947298526763916, + "reward_std": 0.10021178424358368, + "rewards/bleu_reward_func/mean": 0.23947298526763916, + "rewards/bleu_reward_func/std": 0.40957576036453247, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 319.375, + "completions/mean_terminated_length": 126.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.0832, + "grad_norm": 6.871039867401123, + "kl": 0.06162261962890625, + "learning_rate": 1e-06, + "loss": 0.3071, + "num_tokens": 1431932.0, + "reward": 0.11237628757953644, + "reward_std": 0.05608592554926872, + "rewards/bleu_reward_func/mean": 0.11237628757953644, + "rewards/bleu_reward_func/std": 0.1758151650428772, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 206.34375, + "completions/mean_terminated_length": 104.45833587646484, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.084, + "grad_norm": 13.681912422180176, + "kl": 0.063812255859375, + "learning_rate": 1e-06, + "loss": 0.3711, + "num_tokens": 1440791.0, + "reward": 0.13408097624778748, + "reward_std": 0.07736363261938095, + "rewards/bleu_reward_func/mean": 0.13408097624778748, + "rewards/bleu_reward_func/std": 0.10995227843523026, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 239.21875, + "completions/mean_terminated_length": 188.70370483398438, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.0848, + "grad_norm": 6.377567291259766, + "kl": 0.212432861328125, + "learning_rate": 1e-06, + "loss": 0.1742, + "num_tokens": 1452430.0, + "reward": 0.09214982390403748, + "reward_std": 0.037541188299655914, + "rewards/bleu_reward_func/mean": 0.09214982390403748, + "rewards/bleu_reward_func/std": 0.06507368385791779, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 382.84375, + "completions/mean_terminated_length": 236.4666748046875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.0856, + "grad_norm": 2.956113338470459, + "kl": 0.015625, + "learning_rate": 1e-06, + "loss": 0.019, + "num_tokens": 1470353.0, + "reward": 0.029356852173805237, + "reward_std": 0.020268836989998817, + "rewards/bleu_reward_func/mean": 0.029356852173805237, + "rewards/bleu_reward_func/std": 0.031047984957695007, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 346.5, + "completions/mean_terminated_length": 247.1999969482422, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.0864, + "grad_norm": 5.237264156341553, + "kl": 0.0384063720703125, + "learning_rate": 1e-06, + "loss": -0.2289, + "num_tokens": 1483353.0, + "reward": 0.06388352811336517, + "reward_std": 0.03146419674158096, + "rewards/bleu_reward_func/mean": 0.06388352811336517, + "rewards/bleu_reward_func/std": 0.0666789561510086, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 326.1875, + "completions/mean_terminated_length": 199.05262756347656, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.0872, + "grad_norm": 5.559842109680176, + "kl": 0.03765106201171875, + "learning_rate": 1e-06, + "loss": -0.0076, + "num_tokens": 1497415.0, + "reward": 0.2991971969604492, + "reward_std": 0.10907518863677979, + "rewards/bleu_reward_func/mean": 0.2991971969604492, + "rewards/bleu_reward_func/std": 0.36222296953201294, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 98.53125, + "completions/mean_terminated_length": 98.53125, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.088, + "grad_norm": 8.213761329650879, + "kl": 0.078704833984375, + "learning_rate": 1e-06, + "loss": 0.1858, + "num_tokens": 1504528.0, + "reward": 0.048464857041835785, + "reward_std": 0.0210396908223629, + "rewards/bleu_reward_func/mean": 0.048464857041835785, + "rewards/bleu_reward_func/std": 0.03311728686094284, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 118.59375, + "completions/mean_terminated_length": 118.59375, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.0888, + "grad_norm": 7.166593074798584, + "kl": 0.096771240234375, + "learning_rate": 1e-06, + "loss": -0.0566, + "num_tokens": 1517035.0, + "reward": 0.09873979538679123, + "reward_std": 0.03707325458526611, + "rewards/bleu_reward_func/mean": 0.09873979538679123, + "rewards/bleu_reward_func/std": 0.13200855255126953, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 401.5, + "completions/mean_terminated_length": 240.00001525878906, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.0896, + "grad_norm": 2.8505043983459473, + "kl": 0.02176666259765625, + "learning_rate": 1e-06, + "loss": -0.0153, + "num_tokens": 1534363.0, + "reward": 0.11044108867645264, + "reward_std": 0.03410620242357254, + "rewards/bleu_reward_func/mean": 0.11044108867645264, + "rewards/bleu_reward_func/std": 0.16289857029914856, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 125.96875, + "completions/mean_terminated_length": 125.96875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.0904, + "grad_norm": 7.475080490112305, + "kl": 0.100982666015625, + "learning_rate": 1e-06, + "loss": -0.0271, + "num_tokens": 1546258.0, + "reward": 0.12119434028863907, + "reward_std": 0.03986787050962448, + "rewards/bleu_reward_func/mean": 0.12119434028863907, + "rewards/bleu_reward_func/std": 0.10625314712524414, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 332.09375, + "completions/mean_terminated_length": 250.3181915283203, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.0912, + "grad_norm": 3.1941514015197754, + "kl": 0.01666259765625, + "learning_rate": 1e-06, + "loss": -0.0363, + "num_tokens": 1559133.0, + "reward": 0.05715271458029747, + "reward_std": 0.04336331784725189, + "rewards/bleu_reward_func/mean": 0.05715271458029747, + "rewards/bleu_reward_func/std": 0.05400845408439636, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 182.03125, + "completions/mean_terminated_length": 160.03334045410156, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.092, + "grad_norm": 8.439948081970215, + "kl": 0.0611572265625, + "learning_rate": 1e-06, + "loss": -0.2392, + "num_tokens": 1569774.0, + "reward": 0.0502852126955986, + "reward_std": 0.01610748842358589, + "rewards/bleu_reward_func/mean": 0.0502852126955986, + "rewards/bleu_reward_func/std": 0.040807489305734634, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 163.84375, + "completions/mean_terminated_length": 163.84375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.0928, + "grad_norm": 4.541551113128662, + "kl": 0.069915771484375, + "learning_rate": 1e-06, + "loss": -0.0109, + "num_tokens": 1580281.0, + "reward": 0.03980318829417229, + "reward_std": 0.01563824526965618, + "rewards/bleu_reward_func/mean": 0.03980318829417229, + "rewards/bleu_reward_func/std": 0.023048467934131622, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 288.875, + "completions/mean_terminated_length": 226.39999389648438, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.0936, + "grad_norm": 8.210314750671387, + "kl": 0.09372711181640625, + "learning_rate": 1e-06, + "loss": 0.122, + "num_tokens": 1593285.0, + "reward": 0.0629456490278244, + "reward_std": 0.015063179656863213, + "rewards/bleu_reward_func/mean": 0.0629456490278244, + "rewards/bleu_reward_func/std": 0.03602227941155434, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 350.0, + "completions/mean_terminated_length": 265.1428527832031, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.0944, + "grad_norm": 2.992391586303711, + "kl": 0.02304840087890625, + "learning_rate": 1e-06, + "loss": -0.1711, + "num_tokens": 1606645.0, + "reward": 0.022465957328677177, + "reward_std": 0.016872048377990723, + "rewards/bleu_reward_func/mean": 0.022465957328677177, + "rewards/bleu_reward_func/std": 0.023790787905454636, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 285.375, + "completions/mean_terminated_length": 233.07693481445312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.0952, + "grad_norm": 4.377739429473877, + "kl": 0.0274658203125, + "learning_rate": 1e-06, + "loss": -0.0351, + "num_tokens": 1617737.0, + "reward": 0.053562991321086884, + "reward_std": 0.025934984907507896, + "rewards/bleu_reward_func/mean": 0.053562991321086884, + "rewards/bleu_reward_func/std": 0.03459456190466881, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 285.25, + "completions/mean_terminated_length": 85.17646789550781, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.096, + "grad_norm": 5.66406774520874, + "kl": 0.032745361328125, + "learning_rate": 1e-06, + "loss": 0.1042, + "num_tokens": 1631409.0, + "reward": 0.05639251321554184, + "reward_std": 0.025049947202205658, + "rewards/bleu_reward_func/mean": 0.05639251321554184, + "rewards/bleu_reward_func/std": 0.04031047970056534, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 391.25, + "completions/mean_terminated_length": 254.40000915527344, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.0968, + "grad_norm": 2.7011678218841553, + "kl": 0.019012451171875, + "learning_rate": 1e-06, + "loss": -0.0595, + "num_tokens": 1647665.0, + "reward": 0.1355845332145691, + "reward_std": 0.03834523260593414, + "rewards/bleu_reward_func/mean": 0.1355845332145691, + "rewards/bleu_reward_func/std": 0.17731845378875732, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 360.09375, + "completions/mean_terminated_length": 241.94444274902344, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.0976, + "grad_norm": 3.2975172996520996, + "kl": 0.0248870849609375, + "learning_rate": 1e-06, + "loss": -0.0281, + "num_tokens": 1661164.0, + "reward": 0.02182621881365776, + "reward_std": 0.010437489487230778, + "rewards/bleu_reward_func/mean": 0.02182621881365776, + "rewards/bleu_reward_func/std": 0.019065655767917633, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 329.03125, + "completions/mean_terminated_length": 295.1481628417969, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.0984, + "grad_norm": 3.011232376098633, + "kl": 0.023040771484375, + "learning_rate": 1e-06, + "loss": -0.0609, + "num_tokens": 1674269.0, + "reward": 0.05195554345846176, + "reward_std": 0.020864665508270264, + "rewards/bleu_reward_func/mean": 0.05195554345846176, + "rewards/bleu_reward_func/std": 0.027087198570370674, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 301.0, + "completions/mean_terminated_length": 114.82353210449219, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.0992, + "grad_norm": 8.828636169433594, + "kl": 0.0798492431640625, + "learning_rate": 1e-06, + "loss": -0.0267, + "num_tokens": 1691797.0, + "reward": 0.1420682966709137, + "reward_std": 0.04143287241458893, + "rewards/bleu_reward_func/mean": 0.1420682966709137, + "rewards/bleu_reward_func/std": 0.07349839806556702, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 167.28125, + "completions/mean_terminated_length": 167.28125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.1, + "grad_norm": 7.268754482269287, + "kl": 0.104766845703125, + "learning_rate": 1e-06, + "loss": 0.2657, + "num_tokens": 1702150.0, + "reward": 0.16663971543312073, + "reward_std": 0.05392443761229515, + "rewards/bleu_reward_func/mean": 0.16663971543312073, + "rewards/bleu_reward_func/std": 0.09980462491512299, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 367.8125, + "completions/mean_terminated_length": 223.625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.1008, + "grad_norm": 2.9197561740875244, + "kl": 0.0296630859375, + "learning_rate": 1e-06, + "loss": 0.069, + "num_tokens": 1720080.0, + "reward": 0.05814104527235031, + "reward_std": 0.023808015510439873, + "rewards/bleu_reward_func/mean": 0.05814104527235031, + "rewards/bleu_reward_func/std": 0.06258071959018707, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 311.03125, + "completions/mean_terminated_length": 110.0625, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.1016, + "grad_norm": 3.6131699085235596, + "kl": 0.040496826171875, + "learning_rate": 1e-06, + "loss": -0.1103, + "num_tokens": 1736129.0, + "reward": 0.1627029925584793, + "reward_std": 0.048266101628541946, + "rewards/bleu_reward_func/mean": 0.1627029925584793, + "rewards/bleu_reward_func/std": 0.2640880048274994, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 182.1875, + "completions/mean_terminated_length": 89.83999633789062, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.1024, + "grad_norm": 5.8553900718688965, + "kl": 0.0695953369140625, + "learning_rate": 1e-06, + "loss": 0.2073, + "num_tokens": 1744807.0, + "reward": 0.05680542066693306, + "reward_std": 0.02900797501206398, + "rewards/bleu_reward_func/mean": 0.05680542066693306, + "rewards/bleu_reward_func/std": 0.062428779900074005, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 304.375, + "completions/mean_terminated_length": 235.1666717529297, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.1032, + "grad_norm": 2.625256299972534, + "kl": 0.014190673828125, + "learning_rate": 1e-06, + "loss": -0.3256, + "num_tokens": 1759979.0, + "reward": 0.07073010504245758, + "reward_std": 0.0585593655705452, + "rewards/bleu_reward_func/mean": 0.07073010504245758, + "rewards/bleu_reward_func/std": 0.0830271914601326, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 269.21875, + "completions/mean_terminated_length": 188.2916717529297, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.104, + "grad_norm": 7.594178199768066, + "kl": 0.019775390625, + "learning_rate": 1e-06, + "loss": 0.103, + "num_tokens": 1772810.0, + "reward": 0.03343900665640831, + "reward_std": 0.008691318333148956, + "rewards/bleu_reward_func/mean": 0.03343900665640831, + "rewards/bleu_reward_func/std": 0.027092551812529564, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 307.375, + "completions/mean_terminated_length": 286.2069091796875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.1048, + "grad_norm": 3.5237069129943848, + "kl": 0.0251312255859375, + "learning_rate": 1e-06, + "loss": 0.0902, + "num_tokens": 1786094.0, + "reward": 0.03853389620780945, + "reward_std": 0.016378795728087425, + "rewards/bleu_reward_func/mean": 0.03853389620780945, + "rewards/bleu_reward_func/std": 0.02983209490776062, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 244.03125, + "completions/mean_terminated_length": 194.40740966796875, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.1056, + "grad_norm": 3.428116798400879, + "kl": 0.072906494140625, + "learning_rate": 1e-06, + "loss": 0.1144, + "num_tokens": 1797551.0, + "reward": 0.1538739800453186, + "reward_std": 0.03595956414937973, + "rewards/bleu_reward_func/mean": 0.1538739800453186, + "rewards/bleu_reward_func/std": 0.21548843383789062, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 220.46875, + "completions/mean_terminated_length": 211.06451416015625, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.1064, + "grad_norm": 4.208179950714111, + "kl": 0.058685302734375, + "learning_rate": 1e-06, + "loss": -0.1489, + "num_tokens": 1808726.0, + "reward": 0.18491268157958984, + "reward_std": 0.0416969433426857, + "rewards/bleu_reward_func/mean": 0.18491268157958984, + "rewards/bleu_reward_func/std": 0.2198871225118637, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 117.5625, + "completions/mean_terminated_length": 117.5625, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.1072, + "grad_norm": 9.312064170837402, + "kl": 0.2574310302734375, + "learning_rate": 1e-06, + "loss": -0.1699, + "num_tokens": 1816152.0, + "reward": 0.09744147956371307, + "reward_std": 0.03963543474674225, + "rewards/bleu_reward_func/mean": 0.09744147956371307, + "rewards/bleu_reward_func/std": 0.07821591198444366, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 284.5625, + "completions/mean_terminated_length": 165.42857360839844, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.108, + "grad_norm": 2.491009473800659, + "kl": 0.5361785888671875, + "learning_rate": 1e-06, + "loss": 0.0755, + "num_tokens": 1830010.0, + "reward": 0.07847163081169128, + "reward_std": 0.07447989284992218, + "rewards/bleu_reward_func/mean": 0.07847163081169128, + "rewards/bleu_reward_func/std": 0.1269197165966034, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 208.28125, + "completions/mean_terminated_length": 164.8928680419922, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.1088, + "grad_norm": 9.84536075592041, + "kl": 0.137481689453125, + "learning_rate": 1e-06, + "loss": 0.0643, + "num_tokens": 1843019.0, + "reward": 0.23385955393314362, + "reward_std": 0.07621090114116669, + "rewards/bleu_reward_func/mean": 0.23385955393314362, + "rewards/bleu_reward_func/std": 0.2127569168806076, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 243.46875, + "completions/mean_terminated_length": 138.3913116455078, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.1096, + "grad_norm": 6.7033257484436035, + "kl": 0.051727294921875, + "learning_rate": 1e-06, + "loss": 0.1285, + "num_tokens": 1856450.0, + "reward": 0.04753299057483673, + "reward_std": 0.016634728759527206, + "rewards/bleu_reward_func/mean": 0.04753299057483673, + "rewards/bleu_reward_func/std": 0.030512619763612747, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 352.40625, + "completions/mean_terminated_length": 171.53334045410156, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.1104, + "grad_norm": 3.812756061553955, + "kl": 0.0177764892578125, + "learning_rate": 1e-06, + "loss": 0.1601, + "num_tokens": 1870943.0, + "reward": 0.04067971557378769, + "reward_std": 0.026344479992985725, + "rewards/bleu_reward_func/mean": 0.04067971557378769, + "rewards/bleu_reward_func/std": 0.06328170746564865, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 287.46875, + "completions/mean_terminated_length": 264.2413635253906, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.1112, + "grad_norm": 4.916464805603027, + "kl": 0.06890869140625, + "learning_rate": 1e-06, + "loss": 0.0108, + "num_tokens": 1883774.0, + "reward": 0.1910865753889084, + "reward_std": 0.09566200524568558, + "rewards/bleu_reward_func/mean": 0.1910865753889084, + "rewards/bleu_reward_func/std": 0.2485995888710022, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 312.6875, + "completions/mean_terminated_length": 157.6666717529297, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.112, + "grad_norm": 5.02897310256958, + "kl": 0.081298828125, + "learning_rate": 1e-06, + "loss": 0.0347, + "num_tokens": 1900964.0, + "reward": 0.1273106336593628, + "reward_std": 0.037408363074064255, + "rewards/bleu_reward_func/mean": 0.1273106336593628, + "rewards/bleu_reward_func/std": 0.1255699247121811, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 287.40625, + "completions/mean_terminated_length": 199.52174377441406, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.1128, + "grad_norm": 3.5728607177734375, + "kl": 0.0290985107421875, + "learning_rate": 1e-06, + "loss": -0.0802, + "num_tokens": 1914153.0, + "reward": 0.1449739634990692, + "reward_std": 0.05561315268278122, + "rewards/bleu_reward_func/mean": 0.1449739634990692, + "rewards/bleu_reward_func/std": 0.10589203238487244, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 171.6875, + "completions/mean_terminated_length": 149.00001525878906, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.1136, + "grad_norm": 6.168550491333008, + "kl": 0.076751708984375, + "learning_rate": 1e-06, + "loss": 0.1377, + "num_tokens": 1924551.0, + "reward": 0.07935678958892822, + "reward_std": 0.044586654752492905, + "rewards/bleu_reward_func/mean": 0.07935678958892822, + "rewards/bleu_reward_func/std": 0.11080160737037659, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 188.28125, + "completions/mean_terminated_length": 166.70001220703125, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.1144, + "grad_norm": 5.237224102020264, + "kl": 0.042633056640625, + "learning_rate": 1e-06, + "loss": 0.1969, + "num_tokens": 1936192.0, + "reward": 0.07339954376220703, + "reward_std": 0.04980514198541641, + "rewards/bleu_reward_func/mean": 0.07339954376220703, + "rewards/bleu_reward_func/std": 0.06703697144985199, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 168.03125, + "completions/mean_terminated_length": 53.375, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.1152, + "grad_norm": 6.578721523284912, + "kl": 0.148895263671875, + "learning_rate": 1e-06, + "loss": 0.2196, + "num_tokens": 1946361.0, + "reward": 0.2388084977865219, + "reward_std": 0.05400132015347481, + "rewards/bleu_reward_func/mean": 0.2388084977865219, + "rewards/bleu_reward_func/std": 0.2556310296058655, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 254.90625, + "completions/mean_terminated_length": 218.17857360839844, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.116, + "grad_norm": 5.644160270690918, + "kl": 0.10284423828125, + "learning_rate": 1e-06, + "loss": 0.0093, + "num_tokens": 1958990.0, + "reward": 0.05691784247756004, + "reward_std": 0.045338764786720276, + "rewards/bleu_reward_func/mean": 0.05691784247756004, + "rewards/bleu_reward_func/std": 0.051530975848436356, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 138.125, + "completions/mean_terminated_length": 126.06451416015625, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.1168, + "grad_norm": 6.659230709075928, + "kl": 0.07525634765625, + "learning_rate": 1e-06, + "loss": 0.3905, + "num_tokens": 1966178.0, + "reward": 0.08115407824516296, + "reward_std": 0.05008203536272049, + "rewards/bleu_reward_func/mean": 0.08115407824516296, + "rewards/bleu_reward_func/std": 0.060907039791345596, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 152.25, + "completions/mean_terminated_length": 69.23077392578125, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.1176, + "grad_norm": 10.029218673706055, + "kl": 0.1614227294921875, + "learning_rate": 1e-06, + "loss": 0.0338, + "num_tokens": 1977194.0, + "reward": 0.23646463453769684, + "reward_std": 0.09375543892383575, + "rewards/bleu_reward_func/mean": 0.23646463453769684, + "rewards/bleu_reward_func/std": 0.27427393198013306, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 300.84375, + "completions/mean_terminated_length": 174.15000915527344, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.1184, + "grad_norm": 7.6274027824401855, + "kl": 0.0694122314453125, + "learning_rate": 1e-06, + "loss": -0.0195, + "num_tokens": 1991693.0, + "reward": 0.1271597295999527, + "reward_std": 0.03925805538892746, + "rewards/bleu_reward_func/mean": 0.1271597295999527, + "rewards/bleu_reward_func/std": 0.20968182384967804, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 235.0625, + "completions/mean_terminated_length": 142.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.1192, + "grad_norm": 5.1624908447265625, + "kl": 0.074005126953125, + "learning_rate": 1e-06, + "loss": -0.0749, + "num_tokens": 2002359.0, + "reward": 0.0867965817451477, + "reward_std": 0.03743039071559906, + "rewards/bleu_reward_func/mean": 0.0867965817451477, + "rewards/bleu_reward_func/std": 0.06982331722974777, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 180.125, + "completions/mean_terminated_length": 69.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.12, + "grad_norm": 5.687462329864502, + "kl": 0.079742431640625, + "learning_rate": 1e-06, + "loss": 0.1774, + "num_tokens": 2012139.0, + "reward": 0.08913667500019073, + "reward_std": 0.03803376108407974, + "rewards/bleu_reward_func/mean": 0.08913667500019073, + "rewards/bleu_reward_func/std": 0.07373686879873276, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 246.6875, + "completions/mean_terminated_length": 158.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.1208, + "grad_norm": 6.3145341873168945, + "kl": 0.168792724609375, + "learning_rate": 1e-06, + "loss": 0.0474, + "num_tokens": 2024961.0, + "reward": 0.09886027127504349, + "reward_std": 0.09059572219848633, + "rewards/bleu_reward_func/mean": 0.09886027127504349, + "rewards/bleu_reward_func/std": 0.20261086523532867, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 278.375, + "completions/mean_terminated_length": 200.5, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.1216, + "grad_norm": 6.056458473205566, + "kl": 0.04815673828125, + "learning_rate": 1e-06, + "loss": 0.0367, + "num_tokens": 2038621.0, + "reward": 0.040941424667835236, + "reward_std": 0.024181999266147614, + "rewards/bleu_reward_func/mean": 0.040941424667835236, + "rewards/bleu_reward_func/std": 0.031022800132632256, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 174.4375, + "completions/mean_terminated_length": 79.91999816894531, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.1224, + "grad_norm": 11.584298133850098, + "kl": 0.10552978515625, + "learning_rate": 1e-06, + "loss": -0.3199, + "num_tokens": 2048947.0, + "reward": 0.057592377066612244, + "reward_std": 0.02831832319498062, + "rewards/bleu_reward_func/mean": 0.057592377066612244, + "rewards/bleu_reward_func/std": 0.0929059162735939, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 285.71875, + "completions/mean_terminated_length": 197.17391967773438, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.1232, + "grad_norm": 3.8603882789611816, + "kl": 0.042999267578125, + "learning_rate": 1e-06, + "loss": -0.1903, + "num_tokens": 2059978.0, + "reward": 0.0238445196300745, + "reward_std": 0.016163241118192673, + "rewards/bleu_reward_func/mean": 0.0238445196300745, + "rewards/bleu_reward_func/std": 0.020820245146751404, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 110.90625, + "completions/mean_terminated_length": 110.90625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.124, + "grad_norm": 9.231768608093262, + "kl": 0.134063720703125, + "learning_rate": 1e-06, + "loss": -0.12, + "num_tokens": 2068223.0, + "reward": 0.093255415558815, + "reward_std": 0.04695024713873863, + "rewards/bleu_reward_func/mean": 0.093255415558815, + "rewards/bleu_reward_func/std": 0.07957140356302261, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 196.875, + "completions/mean_terminated_length": 151.85714721679688, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.1248, + "grad_norm": 6.661685466766357, + "kl": 0.11859130859375, + "learning_rate": 1e-06, + "loss": -0.1122, + "num_tokens": 2079491.0, + "reward": 0.10441941022872925, + "reward_std": 0.06782116740942001, + "rewards/bleu_reward_func/mean": 0.10441941022872925, + "rewards/bleu_reward_func/std": 0.1558544933795929, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 417.84375, + "completions/mean_terminated_length": 353.4210510253906, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.1256, + "grad_norm": 2.281557559967041, + "kl": 0.020050048828125, + "learning_rate": 1e-06, + "loss": -0.049, + "num_tokens": 2094998.0, + "reward": 0.03994186595082283, + "reward_std": 0.020151065662503242, + "rewards/bleu_reward_func/mean": 0.03994186595082283, + "rewards/bleu_reward_func/std": 0.03798232972621918, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 186.75, + "completions/mean_terminated_length": 126.51851654052734, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.1264, + "grad_norm": 5.349869251251221, + "kl": 0.052093505859375, + "learning_rate": 1e-06, + "loss": 0.0712, + "num_tokens": 2104006.0, + "reward": 0.060490936040878296, + "reward_std": 0.039247751235961914, + "rewards/bleu_reward_func/mean": 0.060490936040878296, + "rewards/bleu_reward_func/std": 0.06767360866069794, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 312.375, + "completions/mean_terminated_length": 266.3077087402344, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.1272, + "grad_norm": 6.974966526031494, + "kl": 0.0902099609375, + "learning_rate": 1e-06, + "loss": -0.0785, + "num_tokens": 2121570.0, + "reward": 0.21717938780784607, + "reward_std": 0.08217764645814896, + "rewards/bleu_reward_func/mean": 0.21717938780784607, + "rewards/bleu_reward_func/std": 0.1689896285533905, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 350.25, + "completions/mean_terminated_length": 239.57894897460938, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.128, + "grad_norm": 8.79633617401123, + "kl": 0.10308837890625, + "learning_rate": 1e-06, + "loss": -0.0746, + "num_tokens": 2134322.0, + "reward": 0.027915209531784058, + "reward_std": 0.008189969696104527, + "rewards/bleu_reward_func/mean": 0.027915209531784058, + "rewards/bleu_reward_func/std": 0.021798407658934593, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 98.40625, + "completions/mean_terminated_length": 98.40625, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.1288, + "grad_norm": 9.201173782348633, + "kl": 0.112060546875, + "learning_rate": 1e-06, + "loss": 0.1926, + "num_tokens": 2139903.0, + "reward": 0.08629470318555832, + "reward_std": 0.0329008549451828, + "rewards/bleu_reward_func/mean": 0.08629470318555832, + "rewards/bleu_reward_func/std": 0.04737285524606705, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 408.75, + "completions/mean_terminated_length": 211.63636779785156, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.1296, + "grad_norm": 3.5992963314056396, + "kl": 0.034759521484375, + "learning_rate": 1e-06, + "loss": 0.2439, + "num_tokens": 2158343.0, + "reward": 0.07093626260757446, + "reward_std": 0.04270578920841217, + "rewards/bleu_reward_func/mean": 0.07093626260757446, + "rewards/bleu_reward_func/std": 0.09919130057096481, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 171.4375, + "completions/mean_terminated_length": 92.84616088867188, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.1304, + "grad_norm": 8.71653938293457, + "kl": 0.069793701171875, + "learning_rate": 1e-06, + "loss": -0.2102, + "num_tokens": 2166957.0, + "reward": 0.045812517404556274, + "reward_std": 0.0257731880992651, + "rewards/bleu_reward_func/mean": 0.045812517404556274, + "rewards/bleu_reward_func/std": 0.033692970871925354, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 435.65625, + "completions/mean_terminated_length": 383.4210510253906, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.1312, + "grad_norm": 2.308507204055786, + "kl": 0.020599365234375, + "learning_rate": 1e-06, + "loss": -0.142, + "num_tokens": 2182538.0, + "reward": 0.05681996047496796, + "reward_std": 0.022751763463020325, + "rewards/bleu_reward_func/mean": 0.05681996047496796, + "rewards/bleu_reward_func/std": 0.034446995705366135, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 422.1875, + "completions/mean_terminated_length": 368.3000183105469, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.132, + "grad_norm": 2.0656521320343018, + "kl": 0.0328369140625, + "learning_rate": 1e-06, + "loss": -0.1159, + "num_tokens": 2198440.0, + "reward": 0.08400298655033112, + "reward_std": 0.03193335980176926, + "rewards/bleu_reward_func/mean": 0.08400298655033112, + "rewards/bleu_reward_func/std": 0.05056838318705559, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 265.25, + "completions/mean_terminated_length": 117.20000457763672, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.1328, + "grad_norm": 7.659345626831055, + "kl": 0.12249755859375, + "learning_rate": 1e-06, + "loss": -0.0683, + "num_tokens": 2212440.0, + "reward": 0.17941661179065704, + "reward_std": 0.040813662111759186, + "rewards/bleu_reward_func/mean": 0.17941661179065704, + "rewards/bleu_reward_func/std": 0.2576500475406647, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 171.625, + "completions/mean_terminated_length": 171.625, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.1336, + "grad_norm": 4.253745079040527, + "kl": 0.03948974609375, + "learning_rate": 1e-06, + "loss": -0.0369, + "num_tokens": 2220868.0, + "reward": 0.15958541631698608, + "reward_std": 0.08837255835533142, + "rewards/bleu_reward_func/mean": 0.15958541631698608, + "rewards/bleu_reward_func/std": 0.2750999629497528, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 258.09375, + "completions/mean_terminated_length": 158.7391357421875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.1344, + "grad_norm": 7.613649845123291, + "kl": 0.230072021484375, + "learning_rate": 1e-06, + "loss": 0.2337, + "num_tokens": 2232063.0, + "reward": 0.0643484890460968, + "reward_std": 0.04164566472172737, + "rewards/bleu_reward_func/mean": 0.0643484890460968, + "rewards/bleu_reward_func/std": 0.07561130821704865, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 312.75, + "completions/mean_terminated_length": 193.1999969482422, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.1352, + "grad_norm": 7.345302104949951, + "kl": 0.19305419921875, + "learning_rate": 1e-06, + "loss": -0.2036, + "num_tokens": 2248271.0, + "reward": 0.04911228269338608, + "reward_std": 0.018512040376663208, + "rewards/bleu_reward_func/mean": 0.04911228269338608, + "rewards/bleu_reward_func/std": 0.05713532865047455, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 327.0, + "completions/mean_terminated_length": 230.09524536132812, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.136, + "grad_norm": 5.079345226287842, + "kl": 0.02252197265625, + "learning_rate": 1e-06, + "loss": 0.1003, + "num_tokens": 2261791.0, + "reward": 0.03408445790410042, + "reward_std": 0.007548983674496412, + "rewards/bleu_reward_func/mean": 0.03408445790410042, + "rewards/bleu_reward_func/std": 0.030450724065303802, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 173.3125, + "completions/mean_terminated_length": 78.47999572753906, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.1368, + "grad_norm": 7.255119800567627, + "kl": 0.098876953125, + "learning_rate": 1e-06, + "loss": -0.1893, + "num_tokens": 2271041.0, + "reward": 0.08309763669967651, + "reward_std": 0.05162087082862854, + "rewards/bleu_reward_func/mean": 0.08309763669967651, + "rewards/bleu_reward_func/std": 0.08563226461410522, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 251.59375, + "completions/mean_terminated_length": 164.7916717529297, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.1376, + "grad_norm": 9.955636024475098, + "kl": 0.094970703125, + "learning_rate": 1e-06, + "loss": 0.4501, + "num_tokens": 2281108.0, + "reward": 0.09667688608169556, + "reward_std": 0.047036267817020416, + "rewards/bleu_reward_func/mean": 0.09667688608169556, + "rewards/bleu_reward_func/std": 0.05911566689610481, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 259.625, + "completions/mean_terminated_length": 251.48385620117188, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.1384, + "grad_norm": 2.710672616958618, + "kl": 0.05035400390625, + "learning_rate": 1e-06, + "loss": -0.1131, + "num_tokens": 2292040.0, + "reward": 0.01771564967930317, + "reward_std": 0.0045564379543066025, + "rewards/bleu_reward_func/mean": 0.01771564967930317, + "rewards/bleu_reward_func/std": 0.009397609159350395, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 279.15625, + "completions/mean_terminated_length": 119.84210968017578, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.1392, + "grad_norm": 5.8446946144104, + "kl": 0.10295867919921875, + "learning_rate": 1e-06, + "loss": -0.0422, + "num_tokens": 2306453.0, + "reward": 0.10576937347650528, + "reward_std": 0.040997594594955444, + "rewards/bleu_reward_func/mean": 0.10576937347650528, + "rewards/bleu_reward_func/std": 0.15739315748214722, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 406.375, + "completions/mean_terminated_length": 270.5714416503906, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.14, + "grad_norm": 2.8740079402923584, + "kl": 0.022064208984375, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 2321945.0, + "reward": 0.07392336428165436, + "reward_std": 0.027644775807857513, + "rewards/bleu_reward_func/mean": 0.07392336428165436, + "rewards/bleu_reward_func/std": 0.079840287566185, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 384.6875, + "completions/mean_terminated_length": 308.3000183105469, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.1408, + "grad_norm": 2.571645498275757, + "kl": 0.0183868408203125, + "learning_rate": 1e-06, + "loss": -0.106, + "num_tokens": 2338695.0, + "reward": 0.043530724942684174, + "reward_std": 0.02269122190773487, + "rewards/bleu_reward_func/mean": 0.043530724942684174, + "rewards/bleu_reward_func/std": 0.029228538274765015, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 276.71875, + "completions/mean_terminated_length": 184.6521759033203, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.1416, + "grad_norm": 6.338461399078369, + "kl": 0.0661468505859375, + "learning_rate": 1e-06, + "loss": 0.2588, + "num_tokens": 2349054.0, + "reward": 0.04008907824754715, + "reward_std": 0.03199386969208717, + "rewards/bleu_reward_func/mean": 0.04008907824754715, + "rewards/bleu_reward_func/std": 0.05116712674498558, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 367.84375, + "completions/mean_terminated_length": 182.50001525878906, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.1424, + "grad_norm": 7.893227577209473, + "kl": 0.1038818359375, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 2365297.0, + "reward": 0.05835431069135666, + "reward_std": 0.01447733398526907, + "rewards/bleu_reward_func/mean": 0.05835431069135666, + "rewards/bleu_reward_func/std": 0.05388018116354942, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 106.9375, + "completions/mean_terminated_length": 79.93333435058594, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.1432, + "grad_norm": 13.133338928222656, + "kl": 0.378204345703125, + "learning_rate": 1e-06, + "loss": 0.1078, + "num_tokens": 2377279.0, + "reward": 0.27373576164245605, + "reward_std": 0.10149600356817245, + "rewards/bleu_reward_func/mean": 0.27373576164245605, + "rewards/bleu_reward_func/std": 0.21089527010917664, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 384.875, + "completions/mean_terminated_length": 318.28570556640625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.144, + "grad_norm": 2.6368002891540527, + "kl": 0.02276611328125, + "learning_rate": 1e-06, + "loss": 0.0261, + "num_tokens": 2393507.0, + "reward": 0.06703202426433563, + "reward_std": 0.02514977753162384, + "rewards/bleu_reward_func/mean": 0.06703202426433563, + "rewards/bleu_reward_func/std": 0.05334871634840965, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 216.15625, + "completions/mean_terminated_length": 147.88462829589844, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.1448, + "grad_norm": 26.23644256591797, + "kl": 0.052398681640625, + "learning_rate": 1e-06, + "loss": 0.2092, + "num_tokens": 2403920.0, + "reward": 0.07073464244604111, + "reward_std": 0.0369129553437233, + "rewards/bleu_reward_func/mean": 0.07073464244604111, + "rewards/bleu_reward_func/std": 0.04567345231771469, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 285.09375, + "completions/mean_terminated_length": 166.23809814453125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.1456, + "grad_norm": 12.993139266967773, + "kl": 0.135498046875, + "learning_rate": 1e-06, + "loss": 0.241, + "num_tokens": 2416907.0, + "reward": 0.05180336907505989, + "reward_std": 0.024485625326633453, + "rewards/bleu_reward_func/mean": 0.05180336907505989, + "rewards/bleu_reward_func/std": 0.03925548121333122, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 196.96875, + "completions/mean_terminated_length": 91.95833587646484, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.1464, + "grad_norm": 8.714866638183594, + "kl": 0.2222137451171875, + "learning_rate": 1e-06, + "loss": 0.1099, + "num_tokens": 2428778.0, + "reward": 0.08364134281873703, + "reward_std": 0.042949263006448746, + "rewards/bleu_reward_func/mean": 0.08364134281873703, + "rewards/bleu_reward_func/std": 0.09259536862373352, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 314.09375, + "completions/mean_terminated_length": 293.6206970214844, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.1472, + "grad_norm": 2.9456305503845215, + "kl": 0.029815673828125, + "learning_rate": 1e-06, + "loss": -0.172, + "num_tokens": 2441829.0, + "reward": 0.11525549739599228, + "reward_std": 0.056866977363824844, + "rewards/bleu_reward_func/mean": 0.11525549739599228, + "rewards/bleu_reward_func/std": 0.10229503363370895, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 310.21875, + "completions/mean_terminated_length": 108.4375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.148, + "grad_norm": 4.448924541473389, + "kl": 0.118255615234375, + "learning_rate": 1e-06, + "loss": -0.0768, + "num_tokens": 2456612.0, + "reward": 0.1624433547258377, + "reward_std": 0.045910030603408813, + "rewards/bleu_reward_func/mean": 0.1624433547258377, + "rewards/bleu_reward_func/std": 0.19173115491867065, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 135.0, + "completions/mean_length": 134.09375, + "completions/mean_terminated_length": 28.279998779296875, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.1488, + "grad_norm": 13.645524024963379, + "kl": 0.1475830078125, + "learning_rate": 1e-06, + "loss": 0.0664, + "num_tokens": 2464839.0, + "reward": 0.05004946142435074, + "reward_std": 0.03280433267354965, + "rewards/bleu_reward_func/mean": 0.05004946142435074, + "rewards/bleu_reward_func/std": 0.05075250193476677, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 383.28125, + "completions/mean_terminated_length": 332.9130554199219, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.1496, + "grad_norm": 2.3289942741394043, + "kl": 0.0212860107421875, + "learning_rate": 1e-06, + "loss": 0.005, + "num_tokens": 2478608.0, + "reward": 0.03798733651638031, + "reward_std": 0.014268442057073116, + "rewards/bleu_reward_func/mean": 0.03798733651638031, + "rewards/bleu_reward_func/std": 0.03045865148305893, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 355.9375, + "completions/mean_terminated_length": 303.91668701171875, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.1504, + "grad_norm": 3.192103862762451, + "kl": 0.0250701904296875, + "learning_rate": 1e-06, + "loss": 0.167, + "num_tokens": 2492518.0, + "reward": 0.02103330008685589, + "reward_std": 0.0090586943551898, + "rewards/bleu_reward_func/mean": 0.02103330008685589, + "rewards/bleu_reward_func/std": 0.01017869170755148, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 60.375, + "completions/mean_terminated_length": 60.375, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.1512, + "grad_norm": 10.600973129272461, + "kl": 0.31732177734375, + "learning_rate": 1e-06, + "loss": -0.0974, + "num_tokens": 2503642.0, + "reward": 0.2223111093044281, + "reward_std": 0.05318839102983475, + "rewards/bleu_reward_func/mean": 0.2223111093044281, + "rewards/bleu_reward_func/std": 0.1549021303653717, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 468.3125, + "completions/mean_terminated_length": 384.9090881347656, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.152, + "grad_norm": 2.152480363845825, + "kl": 0.02301025390625, + "learning_rate": 1e-06, + "loss": -0.0043, + "num_tokens": 2521540.0, + "reward": 0.04742058366537094, + "reward_std": 0.0165211483836174, + "rewards/bleu_reward_func/mean": 0.04742058366537094, + "rewards/bleu_reward_func/std": 0.038380105048418045, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 295.5625, + "completions/mean_terminated_length": 147.4736785888672, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.1528, + "grad_norm": 3.0485126972198486, + "kl": 0.0381622314453125, + "learning_rate": 1e-06, + "loss": 0.5548, + "num_tokens": 2536702.0, + "reward": 0.059137165546417236, + "reward_std": 0.029524236917495728, + "rewards/bleu_reward_func/mean": 0.059137165546417236, + "rewards/bleu_reward_func/std": 0.04191603511571884, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 305.4375, + "completions/mean_terminated_length": 211.5454559326172, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.1536, + "grad_norm": 3.727417230606079, + "kl": 0.06072998046875, + "learning_rate": 1e-06, + "loss": 0.0741, + "num_tokens": 2553892.0, + "reward": 0.06053918972611427, + "reward_std": 0.025174250826239586, + "rewards/bleu_reward_func/mean": 0.06053918972611427, + "rewards/bleu_reward_func/std": 0.03798559308052063, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 327.0625, + "completions/mean_terminated_length": 216.10000610351562, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.1544, + "grad_norm": 22.730863571166992, + "kl": 0.1234130859375, + "learning_rate": 1e-06, + "loss": -0.1216, + "num_tokens": 2569950.0, + "reward": 0.14068183302879333, + "reward_std": 0.05201031640172005, + "rewards/bleu_reward_func/mean": 0.14068183302879333, + "rewards/bleu_reward_func/std": 0.1718810796737671, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 189.71875, + "completions/mean_terminated_length": 82.29167175292969, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.1552, + "grad_norm": 5.675025939941406, + "kl": 0.063934326171875, + "learning_rate": 1e-06, + "loss": -0.0105, + "num_tokens": 2579693.0, + "reward": 0.08947663754224777, + "reward_std": 0.029948215931653976, + "rewards/bleu_reward_func/mean": 0.08947663754224777, + "rewards/bleu_reward_func/std": 0.06868135929107666, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 210.5625, + "completions/mean_terminated_length": 167.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.156, + "grad_norm": 6.797698974609375, + "kl": 0.17242431640625, + "learning_rate": 1e-06, + "loss": -0.0415, + "num_tokens": 2592967.0, + "reward": 0.16623055934906006, + "reward_std": 0.08808746933937073, + "rewards/bleu_reward_func/mean": 0.16623055934906006, + "rewards/bleu_reward_func/std": 0.17983676493167877, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 119.09375, + "completions/mean_terminated_length": 119.09375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.1568, + "grad_norm": 5.494751453399658, + "kl": 0.05902099609375, + "learning_rate": 1e-06, + "loss": 0.2727, + "num_tokens": 2602042.0, + "reward": 0.17185799777507782, + "reward_std": 0.10617370158433914, + "rewards/bleu_reward_func/mean": 0.17185799777507782, + "rewards/bleu_reward_func/std": 0.16121239960193634, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 368.5, + "completions/mean_terminated_length": 293.3333435058594, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.1576, + "grad_norm": 4.1480302810668945, + "kl": 0.0312652587890625, + "learning_rate": 1e-06, + "loss": -0.1879, + "num_tokens": 2615826.0, + "reward": 0.051297686994075775, + "reward_std": 0.018504546955227852, + "rewards/bleu_reward_func/mean": 0.051297686994075775, + "rewards/bleu_reward_func/std": 0.034977275878190994, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 267.53125, + "completions/mean_terminated_length": 251.2333526611328, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.1584, + "grad_norm": 4.113983631134033, + "kl": 0.027557373046875, + "learning_rate": 1e-06, + "loss": 0.0749, + "num_tokens": 2626275.0, + "reward": 0.054141815751791, + "reward_std": 0.02476467750966549, + "rewards/bleu_reward_func/mean": 0.054141815751791, + "rewards/bleu_reward_func/std": 0.07109448313713074, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 112.96875, + "completions/mean_terminated_length": 112.96875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.1592, + "grad_norm": 7.432074546813965, + "kl": 0.1759033203125, + "learning_rate": 1e-06, + "loss": 0.0648, + "num_tokens": 2633962.0, + "reward": 0.16682901978492737, + "reward_std": 0.07138749957084656, + "rewards/bleu_reward_func/mean": 0.16682901978492737, + "rewards/bleu_reward_func/std": 0.15276572108268738, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 451.78125, + "completions/mean_terminated_length": 404.9444580078125, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.16, + "grad_norm": 2.110192060470581, + "kl": 0.022369384765625, + "learning_rate": 1e-06, + "loss": -0.0268, + "num_tokens": 2653971.0, + "reward": 0.11942745745182037, + "reward_std": 0.02005620300769806, + "rewards/bleu_reward_func/mean": 0.11942745745182037, + "rewards/bleu_reward_func/std": 0.09454692155122757, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 185.9375, + "completions/mean_terminated_length": 110.69231414794922, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.1608, + "grad_norm": 6.2729973793029785, + "kl": 0.063720703125, + "learning_rate": 1e-06, + "loss": -0.0659, + "num_tokens": 2664601.0, + "reward": 0.03557516261935234, + "reward_std": 0.021523961797356606, + "rewards/bleu_reward_func/mean": 0.03557516261935234, + "rewards/bleu_reward_func/std": 0.02618589997291565, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 292.5, + "completions/mean_terminated_length": 121.77777862548828, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.1616, + "grad_norm": 5.936282157897949, + "kl": 0.0358734130859375, + "learning_rate": 1e-06, + "loss": -0.2742, + "num_tokens": 2679849.0, + "reward": 0.038136985152959824, + "reward_std": 0.022807471454143524, + "rewards/bleu_reward_func/mean": 0.038136985152959824, + "rewards/bleu_reward_func/std": 0.061121899634599686, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 230.96875, + "completions/mean_terminated_length": 221.90321350097656, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.1624, + "grad_norm": 8.785550117492676, + "kl": 0.1523590087890625, + "learning_rate": 1e-06, + "loss": -0.2049, + "num_tokens": 2693024.0, + "reward": 0.1289938986301422, + "reward_std": 0.045512765645980835, + "rewards/bleu_reward_func/mean": 0.1289938986301422, + "rewards/bleu_reward_func/std": 0.09638386219739914, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 361.8125, + "completions/mean_terminated_length": 168.71429443359375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.1632, + "grad_norm": 6.617871284484863, + "kl": 0.0502777099609375, + "learning_rate": 1e-06, + "loss": 0.1304, + "num_tokens": 2710354.0, + "reward": 0.033049020916223526, + "reward_std": 0.017362549901008606, + "rewards/bleu_reward_func/mean": 0.033049020916223526, + "rewards/bleu_reward_func/std": 0.026102159172296524, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 281.4375, + "completions/mean_terminated_length": 176.63636779785156, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.164, + "grad_norm": 3.961705446243286, + "kl": 0.104736328125, + "learning_rate": 1e-06, + "loss": -0.0209, + "num_tokens": 2724680.0, + "reward": 0.1750263273715973, + "reward_std": 0.02830299735069275, + "rewards/bleu_reward_func/mean": 0.1750263273715973, + "rewards/bleu_reward_func/std": 0.13747908174991608, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 287.28125, + "completions/mean_terminated_length": 255.1785888671875, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.1648, + "grad_norm": 3.098118305206299, + "kl": 0.020477294921875, + "learning_rate": 1e-06, + "loss": -0.1863, + "num_tokens": 2736209.0, + "reward": 0.06041261553764343, + "reward_std": 0.033261410892009735, + "rewards/bleu_reward_func/mean": 0.06041261553764343, + "rewards/bleu_reward_func/std": 0.046081364154815674, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 230.1875, + "completions/mean_terminated_length": 136.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.1656, + "grad_norm": 6.539205551147461, + "kl": 0.0445404052734375, + "learning_rate": 1e-06, + "loss": 0.0949, + "num_tokens": 2749503.0, + "reward": 0.039952248334884644, + "reward_std": 0.05510722100734711, + "rewards/bleu_reward_func/mean": 0.039952248334884644, + "rewards/bleu_reward_func/std": 0.08833327889442444, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 385.8125, + "completions/mean_terminated_length": 223.57144165039062, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.1664, + "grad_norm": 3.262167453765869, + "kl": 0.0330657958984375, + "learning_rate": 1e-06, + "loss": 0.0025, + "num_tokens": 2763289.0, + "reward": 0.06319095194339752, + "reward_std": 0.021728292107582092, + "rewards/bleu_reward_func/mean": 0.06319095194339752, + "rewards/bleu_reward_func/std": 0.03750937059521675, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 182.71875, + "completions/mean_terminated_length": 172.09677124023438, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.1672, + "grad_norm": 6.667765140533447, + "kl": 0.113616943359375, + "learning_rate": 1e-06, + "loss": 0.3285, + "num_tokens": 2773104.0, + "reward": 0.1846303939819336, + "reward_std": 0.16774994134902954, + "rewards/bleu_reward_func/mean": 0.1846303939819336, + "rewards/bleu_reward_func/std": 0.20520828664302826, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 204.6875, + "completions/mean_terminated_length": 147.7777862548828, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.168, + "grad_norm": 5.794483661651611, + "kl": 0.09796142578125, + "learning_rate": 1e-06, + "loss": 0.1363, + "num_tokens": 2787670.0, + "reward": 0.09086121618747711, + "reward_std": 0.052026841789484024, + "rewards/bleu_reward_func/mean": 0.09086121618747711, + "rewards/bleu_reward_func/std": 0.09278357774019241, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 399.8125, + "completions/mean_terminated_length": 113.11111450195312, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.1688, + "grad_norm": 11.576338768005371, + "kl": 0.0574493408203125, + "learning_rate": 1e-06, + "loss": 0.1204, + "num_tokens": 2805864.0, + "reward": 0.023652518168091774, + "reward_std": 0.01210303045809269, + "rewards/bleu_reward_func/mean": 0.023652518168091774, + "rewards/bleu_reward_func/std": 0.02501726523041725, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 157.6875, + "completions/mean_terminated_length": 58.47999954223633, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.1696, + "grad_norm": 10.228835105895996, + "kl": 0.1674041748046875, + "learning_rate": 1e-06, + "loss": 0.0582, + "num_tokens": 2816758.0, + "reward": 0.13800185918807983, + "reward_std": 0.047296687960624695, + "rewards/bleu_reward_func/mean": 0.13800185918807983, + "rewards/bleu_reward_func/std": 0.0863277018070221, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 402.15625, + "completions/mean_terminated_length": 359.1739196777344, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.1704, + "grad_norm": 2.593717336654663, + "kl": 0.01934814453125, + "learning_rate": 1e-06, + "loss": -0.0091, + "num_tokens": 2833347.0, + "reward": 0.05193600431084633, + "reward_std": 0.018484318628907204, + "rewards/bleu_reward_func/mean": 0.05193600431084633, + "rewards/bleu_reward_func/std": 0.04251272976398468, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 198.25, + "completions/mean_terminated_length": 198.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.1712, + "grad_norm": 6.071621894836426, + "kl": 0.057373046875, + "learning_rate": 1e-06, + "loss": -0.1354, + "num_tokens": 2841195.0, + "reward": 0.062206219881772995, + "reward_std": 0.03749649226665497, + "rewards/bleu_reward_func/mean": 0.062206219881772995, + "rewards/bleu_reward_func/std": 0.0528765432536602, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 246.40625, + "completions/mean_terminated_length": 218.9310302734375, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.172, + "grad_norm": 4.833486557006836, + "kl": 0.068695068359375, + "learning_rate": 1e-06, + "loss": 0.1631, + "num_tokens": 2851224.0, + "reward": 0.06542235612869263, + "reward_std": 0.03771442174911499, + "rewards/bleu_reward_func/mean": 0.06542235612869263, + "rewards/bleu_reward_func/std": 0.0579860620200634, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 465.15625, + "completions/mean_terminated_length": 412.0666809082031, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.1728, + "grad_norm": 2.1820828914642334, + "kl": 0.0188446044921875, + "learning_rate": 1e-06, + "loss": 0.0277, + "num_tokens": 2870765.0, + "reward": 0.06440776586532593, + "reward_std": 0.013088207691907883, + "rewards/bleu_reward_func/mean": 0.06440776586532593, + "rewards/bleu_reward_func/std": 0.06307429075241089, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 224.0625, + "completions/mean_terminated_length": 170.74073791503906, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.1736, + "grad_norm": 9.596390724182129, + "kl": 0.2492218017578125, + "learning_rate": 1e-06, + "loss": 0.1621, + "num_tokens": 2882151.0, + "reward": 0.15283548831939697, + "reward_std": 0.08103044331073761, + "rewards/bleu_reward_func/mean": 0.15283548831939697, + "rewards/bleu_reward_func/std": 0.13223250210285187, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 512.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 464.09375, + "completions/mean_terminated_length": 320.375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.1744, + "grad_norm": 2.1536099910736084, + "kl": 0.02032470703125, + "learning_rate": 1e-06, + "loss": 0.0402, + "num_tokens": 2900658.0, + "reward": 0.02000538259744644, + "reward_std": 0.008671639487147331, + "rewards/bleu_reward_func/mean": 0.02000538259744644, + "rewards/bleu_reward_func/std": 0.01867109164595604, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 155.5, + "completions/mean_terminated_length": 55.68000030517578, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.1752, + "grad_norm": 8.202893257141113, + "kl": 0.2074127197265625, + "learning_rate": 1e-06, + "loss": -0.0157, + "num_tokens": 2909778.0, + "reward": 0.1296558678150177, + "reward_std": 0.04394569993019104, + "rewards/bleu_reward_func/mean": 0.1296558678150177, + "rewards/bleu_reward_func/std": 0.05605300888419151, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 350.40625, + "completions/mean_terminated_length": 188.8125, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.176, + "grad_norm": 5.133015155792236, + "kl": 0.0847930908203125, + "learning_rate": 1e-06, + "loss": 0.2562, + "num_tokens": 2926239.0, + "reward": 0.1607290506362915, + "reward_std": 0.12061528861522675, + "rewards/bleu_reward_func/mean": 0.1607290506362915, + "rewards/bleu_reward_func/std": 0.19297951459884644, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 377.09375, + "completions/mean_terminated_length": 306.4285888671875, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.1768, + "grad_norm": 2.917404890060425, + "kl": 0.02947998046875, + "learning_rate": 1e-06, + "loss": 0.0902, + "num_tokens": 2940970.0, + "reward": 0.05033531412482262, + "reward_std": 0.015085380524396896, + "rewards/bleu_reward_func/mean": 0.05033531412482262, + "rewards/bleu_reward_func/std": 0.03601166605949402, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 98.625, + "completions/mean_terminated_length": 98.625, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.1776, + "grad_norm": 9.739842414855957, + "kl": 0.28759765625, + "learning_rate": 1e-06, + "loss": 0.1954, + "num_tokens": 2951942.0, + "reward": 0.18511344492435455, + "reward_std": 0.09618590772151947, + "rewards/bleu_reward_func/mean": 0.18511344492435455, + "rewards/bleu_reward_func/std": 0.13407698273658752, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 155.1875, + "completions/mean_terminated_length": 131.40000915527344, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.1784, + "grad_norm": 5.931830883026123, + "kl": 0.082611083984375, + "learning_rate": 1e-06, + "loss": 0.0924, + "num_tokens": 2959460.0, + "reward": 0.07271347939968109, + "reward_std": 0.05200031027197838, + "rewards/bleu_reward_func/mean": 0.07271347939968109, + "rewards/bleu_reward_func/std": 0.06765022873878479, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 130.71875, + "completions/mean_terminated_length": 105.30000305175781, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.1792, + "grad_norm": 7.8368730545043945, + "kl": 0.09417724609375, + "learning_rate": 1e-06, + "loss": 0.1685, + "num_tokens": 2966491.0, + "reward": 0.0899183601140976, + "reward_std": 0.05122753232717514, + "rewards/bleu_reward_func/mean": 0.0899183601140976, + "rewards/bleu_reward_func/std": 0.11120127141475677, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 367.5, + "completions/mean_terminated_length": 268.631591796875, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.18, + "grad_norm": 3.555055618286133, + "kl": 0.0318603515625, + "learning_rate": 1e-06, + "loss": -0.0636, + "num_tokens": 2982515.0, + "reward": 0.11773502081632614, + "reward_std": 0.046606093645095825, + "rewards/bleu_reward_func/mean": 0.11773502081632614, + "rewards/bleu_reward_func/std": 0.15673232078552246, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 122.65625, + "completions/mean_terminated_length": 122.65625, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.1808, + "grad_norm": 10.452176094055176, + "kl": 0.1519775390625, + "learning_rate": 1e-06, + "loss": 0.0183, + "num_tokens": 2991512.0, + "reward": 0.13446207344532013, + "reward_std": 0.060547836124897, + "rewards/bleu_reward_func/mean": 0.13446207344532013, + "rewards/bleu_reward_func/std": 0.07454977184534073, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 345.6875, + "completions/mean_terminated_length": 258.5714416503906, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.1816, + "grad_norm": 2.964317560195923, + "kl": 0.038421630859375, + "learning_rate": 1e-06, + "loss": 0.1194, + "num_tokens": 3005678.0, + "reward": 0.14132392406463623, + "reward_std": 0.05001860111951828, + "rewards/bleu_reward_func/mean": 0.14132392406463623, + "rewards/bleu_reward_func/std": 0.08175285160541534, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 390.125, + "completions/mean_terminated_length": 122.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.1824, + "grad_norm": 5.049752235412598, + "kl": 0.04425048828125, + "learning_rate": 1e-06, + "loss": 0.1754, + "num_tokens": 3021434.0, + "reward": 0.04336467757821083, + "reward_std": 0.018742987886071205, + "rewards/bleu_reward_func/mean": 0.04336467757821083, + "rewards/bleu_reward_func/std": 0.03402964025735855, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 364.0625, + "completions/mean_terminated_length": 249.0, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.1832, + "grad_norm": 3.026240348815918, + "kl": 0.0245361328125, + "learning_rate": 1e-06, + "loss": -0.2846, + "num_tokens": 3040956.0, + "reward": 0.028285246342420578, + "reward_std": 0.018473699688911438, + "rewards/bleu_reward_func/mean": 0.028285246342420578, + "rewards/bleu_reward_func/std": 0.02460222877562046, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 370.1875, + "completions/mean_terminated_length": 330.47998046875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.184, + "grad_norm": 2.621922731399536, + "kl": 0.0340728759765625, + "learning_rate": 1e-06, + "loss": -0.0699, + "num_tokens": 3058866.0, + "reward": 0.18184542655944824, + "reward_std": 0.06604617834091187, + "rewards/bleu_reward_func/mean": 0.18184542655944824, + "rewards/bleu_reward_func/std": 0.16794371604919434, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 213.71875, + "completions/mean_terminated_length": 97.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.1848, + "grad_norm": 5.496671676635742, + "kl": 0.11822509765625, + "learning_rate": 1e-06, + "loss": 0.4021, + "num_tokens": 3071713.0, + "reward": 0.22397759556770325, + "reward_std": 0.09391038119792938, + "rewards/bleu_reward_func/mean": 0.22397759556770325, + "rewards/bleu_reward_func/std": 0.19180122017860413, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 197.84375, + "completions/mean_terminated_length": 93.125, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.1856, + "grad_norm": 3.808242082595825, + "kl": 0.04571533203125, + "learning_rate": 1e-06, + "loss": 0.0821, + "num_tokens": 3079892.0, + "reward": 0.060666900128126144, + "reward_std": 0.029011715203523636, + "rewards/bleu_reward_func/mean": 0.060666900128126144, + "rewards/bleu_reward_func/std": 0.0762709304690361, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 160.40625, + "completions/mean_terminated_length": 61.959999084472656, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.1864, + "grad_norm": 11.2310791015625, + "kl": 0.160797119140625, + "learning_rate": 1e-06, + "loss": 0.2881, + "num_tokens": 3087689.0, + "reward": 0.07089974731206894, + "reward_std": 0.03123306669294834, + "rewards/bleu_reward_func/mean": 0.07089974731206894, + "rewards/bleu_reward_func/std": 0.06456828862428665, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 66.875, + "completions/mean_terminated_length": 66.875, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.1872, + "grad_norm": 13.989295959472656, + "kl": 0.3311767578125, + "learning_rate": 1e-06, + "loss": -0.0331, + "num_tokens": 3093357.0, + "reward": 0.15325351059436798, + "reward_std": 0.0506255105137825, + "rewards/bleu_reward_func/mean": 0.15325351059436798, + "rewards/bleu_reward_func/std": 0.19497260451316833, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 260.75, + "completions/mean_terminated_length": 162.43478393554688, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.188, + "grad_norm": 7.557122230529785, + "kl": 0.173126220703125, + "learning_rate": 1e-06, + "loss": 0.1592, + "num_tokens": 3105749.0, + "reward": 0.20930011570453644, + "reward_std": 0.06161898747086525, + "rewards/bleu_reward_func/mean": 0.20930011570453644, + "rewards/bleu_reward_func/std": 0.2159973680973053, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 404.9375, + "completions/mean_terminated_length": 310.4705810546875, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.1888, + "grad_norm": 4.613722324371338, + "kl": 0.025421142578125, + "learning_rate": 1e-06, + "loss": -0.1006, + "num_tokens": 3121795.0, + "reward": 0.02748030610382557, + "reward_std": 0.0075658103451132774, + "rewards/bleu_reward_func/mean": 0.02748030610382557, + "rewards/bleu_reward_func/std": 0.03438537195324898, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 389.09375, + "completions/mean_terminated_length": 266.1875, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.1896, + "grad_norm": 2.435314178466797, + "kl": 0.0276031494140625, + "learning_rate": 1e-06, + "loss": -0.1746, + "num_tokens": 3140006.0, + "reward": 0.10853572189807892, + "reward_std": 0.05605427548289299, + "rewards/bleu_reward_func/mean": 0.10853572189807892, + "rewards/bleu_reward_func/std": 0.1485956311225891, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 149.78125, + "completions/mean_terminated_length": 29.041667938232422, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.1904, + "grad_norm": 8.839442253112793, + "kl": 0.2632598876953125, + "learning_rate": 1e-06, + "loss": 0.0665, + "num_tokens": 3149743.0, + "reward": 0.13384486734867096, + "reward_std": 0.03735985979437828, + "rewards/bleu_reward_func/mean": 0.13384486734867096, + "rewards/bleu_reward_func/std": 0.17275770008563995, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 261.625, + "completions/mean_terminated_length": 178.1666717529297, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.1912, + "grad_norm": 4.326257228851318, + "kl": 0.14703369140625, + "learning_rate": 1e-06, + "loss": -0.0395, + "num_tokens": 3163195.0, + "reward": 0.16435688734054565, + "reward_std": 0.051772814244031906, + "rewards/bleu_reward_func/mean": 0.16435688734054565, + "rewards/bleu_reward_func/std": 0.13062016665935516, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 359.4375, + "completions/mean_terminated_length": 255.05262756347656, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.192, + "grad_norm": 6.709453582763672, + "kl": 0.097503662109375, + "learning_rate": 1e-06, + "loss": -0.0418, + "num_tokens": 3180209.0, + "reward": 0.10101380944252014, + "reward_std": 0.030364379286766052, + "rewards/bleu_reward_func/mean": 0.10101380944252014, + "rewards/bleu_reward_func/std": 0.08647928386926651, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 185.0, + "completions/mean_terminated_length": 93.43999481201172, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.1928, + "grad_norm": 9.118388175964355, + "kl": 0.14984130859375, + "learning_rate": 1e-06, + "loss": -0.0512, + "num_tokens": 3189425.0, + "reward": 0.19255727529525757, + "reward_std": 0.03786986321210861, + "rewards/bleu_reward_func/mean": 0.19255727529525757, + "rewards/bleu_reward_func/std": 0.18927834928035736, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 162.4375, + "completions/mean_terminated_length": 81.76923370361328, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.1936, + "grad_norm": 7.5658745765686035, + "kl": 0.1087493896484375, + "learning_rate": 1e-06, + "loss": 0.11, + "num_tokens": 3196911.0, + "reward": 0.08898752182722092, + "reward_std": 0.01980067417025566, + "rewards/bleu_reward_func/mean": 0.08898752182722092, + "rewards/bleu_reward_func/std": 0.09810609370470047, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 238.8125, + "completions/mean_terminated_length": 175.7692413330078, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.1944, + "grad_norm": 3.6591224670410156, + "kl": 0.068145751953125, + "learning_rate": 1e-06, + "loss": 0.0677, + "num_tokens": 3212161.0, + "reward": 0.16356298327445984, + "reward_std": 0.08266205340623856, + "rewards/bleu_reward_func/mean": 0.16356298327445984, + "rewards/bleu_reward_func/std": 0.17177340388298035, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 369.0625, + "completions/mean_terminated_length": 294.19049072265625, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.1952, + "grad_norm": 2.8115499019622803, + "kl": 0.032623291015625, + "learning_rate": 1e-06, + "loss": 0.0943, + "num_tokens": 3228483.0, + "reward": 0.06906401365995407, + "reward_std": 0.025964463129639626, + "rewards/bleu_reward_func/mean": 0.06906401365995407, + "rewards/bleu_reward_func/std": 0.044564370065927505, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 253.875, + "completions/mean_terminated_length": 194.3076934814453, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.196, + "grad_norm": 6.153151512145996, + "kl": 0.072845458984375, + "learning_rate": 1e-06, + "loss": 0.1336, + "num_tokens": 3238447.0, + "reward": 0.05225534737110138, + "reward_std": 0.019162572920322418, + "rewards/bleu_reward_func/mean": 0.05225534737110138, + "rewards/bleu_reward_func/std": 0.04069560393691063, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 357.6875, + "completions/mean_terminated_length": 237.6666717529297, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.1968, + "grad_norm": 4.332682132720947, + "kl": 0.074615478515625, + "learning_rate": 1e-06, + "loss": -0.0015, + "num_tokens": 3252661.0, + "reward": 0.06644366681575775, + "reward_std": 0.029834389686584473, + "rewards/bleu_reward_func/mean": 0.06644366681575775, + "rewards/bleu_reward_func/std": 0.0527600534260273, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 280.84375, + "completions/mean_terminated_length": 203.7916717529297, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.1976, + "grad_norm": 3.9714043140411377, + "kl": 0.046600341796875, + "learning_rate": 1e-06, + "loss": 0.0574, + "num_tokens": 3263392.0, + "reward": 0.04084426164627075, + "reward_std": 0.022724341601133347, + "rewards/bleu_reward_func/mean": 0.04084426164627075, + "rewards/bleu_reward_func/std": 0.03625248372554779, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 246.1875, + "completions/mean_terminated_length": 171.75999450683594, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.1984, + "grad_norm": 7.287817478179932, + "kl": 0.128570556640625, + "learning_rate": 1e-06, + "loss": 0.0058, + "num_tokens": 3274190.0, + "reward": 0.05778396502137184, + "reward_std": 0.020291190594434738, + "rewards/bleu_reward_func/mean": 0.05778396502137184, + "rewards/bleu_reward_func/std": 0.046611472964286804, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 317.21875, + "completions/mean_terminated_length": 183.94737243652344, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.1992, + "grad_norm": 9.650996208190918, + "kl": 0.1335906982421875, + "learning_rate": 1e-06, + "loss": -0.0956, + "num_tokens": 3288605.0, + "reward": 0.15271537005901337, + "reward_std": 0.0891089141368866, + "rewards/bleu_reward_func/mean": 0.15271537005901337, + "rewards/bleu_reward_func/std": 0.1993638128042221, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 315.125, + "completions/mean_terminated_length": 238.0869598388672, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.2, + "grad_norm": 8.128390312194824, + "kl": 0.073394775390625, + "learning_rate": 1e-06, + "loss": 0.1129, + "num_tokens": 3303449.0, + "reward": 0.05582565814256668, + "reward_std": 0.04732588678598404, + "rewards/bleu_reward_func/mean": 0.05582565814256668, + "rewards/bleu_reward_func/std": 0.06975270062685013, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 397.34375, + "completions/mean_terminated_length": 282.6875, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.2008, + "grad_norm": 2.3954126834869385, + "kl": 0.027587890625, + "learning_rate": 1e-06, + "loss": -0.0758, + "num_tokens": 3322684.0, + "reward": 0.20381565392017365, + "reward_std": 0.06331950426101685, + "rewards/bleu_reward_func/mean": 0.20381565392017365, + "rewards/bleu_reward_func/std": 0.30689555406570435, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 187.75, + "completions/mean_terminated_length": 177.29031372070312, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.2016, + "grad_norm": 10.365123748779297, + "kl": 0.403076171875, + "learning_rate": 1e-06, + "loss": -0.0409, + "num_tokens": 3332612.0, + "reward": 0.09179520606994629, + "reward_std": 0.042515259236097336, + "rewards/bleu_reward_func/mean": 0.09179520606994629, + "rewards/bleu_reward_func/std": 0.06000783294439316, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 256.4375, + "completions/mean_terminated_length": 184.87998962402344, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.2024, + "grad_norm": 5.642463207244873, + "kl": 0.1347198486328125, + "learning_rate": 1e-06, + "loss": -0.0626, + "num_tokens": 3347706.0, + "reward": 0.12519359588623047, + "reward_std": 0.036009326577186584, + "rewards/bleu_reward_func/mean": 0.12519359588623047, + "rewards/bleu_reward_func/std": 0.1556256264448166, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 133.4375, + "completions/mean_terminated_length": 79.35714721679688, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.2032, + "grad_norm": 13.66992473602295, + "kl": 0.30645751953125, + "learning_rate": 1e-06, + "loss": -0.1315, + "num_tokens": 3353864.0, + "reward": 0.10196495056152344, + "reward_std": 0.05300650745630264, + "rewards/bleu_reward_func/mean": 0.10196495056152344, + "rewards/bleu_reward_func/std": 0.09023614972829819, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 267.6875, + "completions/mean_terminated_length": 232.7857208251953, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.204, + "grad_norm": 24.040653228759766, + "kl": 0.062896728515625, + "learning_rate": 1e-06, + "loss": -0.1232, + "num_tokens": 3365758.0, + "reward": 0.03941156342625618, + "reward_std": 0.017305800691246986, + "rewards/bleu_reward_func/mean": 0.03941156342625618, + "rewards/bleu_reward_func/std": 0.02295033633708954, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 420.21875, + "completions/mean_terminated_length": 316.20001220703125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.2048, + "grad_norm": 2.9320602416992188, + "kl": 0.033233642578125, + "learning_rate": 1e-06, + "loss": -0.1371, + "num_tokens": 3382765.0, + "reward": 0.05339156836271286, + "reward_std": 0.02982841432094574, + "rewards/bleu_reward_func/mean": 0.05339156836271286, + "rewards/bleu_reward_func/std": 0.07343700528144836, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 247.8125, + "completions/mean_terminated_length": 173.83999633789062, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.2056, + "grad_norm": 8.614324569702148, + "kl": 0.1400146484375, + "learning_rate": 1e-06, + "loss": 0.1332, + "num_tokens": 3394391.0, + "reward": 0.06851230561733246, + "reward_std": 0.04152427613735199, + "rewards/bleu_reward_func/mean": 0.06851230561733246, + "rewards/bleu_reward_func/std": 0.056356508284807205, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 365.625, + "completions/mean_terminated_length": 199.73333740234375, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.2064, + "grad_norm": 6.318526744842529, + "kl": 0.099365234375, + "learning_rate": 1e-06, + "loss": -0.0091, + "num_tokens": 3411195.0, + "reward": 0.08351869136095047, + "reward_std": 0.012093533761799335, + "rewards/bleu_reward_func/mean": 0.08351869136095047, + "rewards/bleu_reward_func/std": 0.08073550462722778, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 134.65625, + "completions/mean_terminated_length": 109.50000762939453, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.2072, + "grad_norm": 7.35445499420166, + "kl": 0.2371826171875, + "learning_rate": 1e-06, + "loss": 0.4692, + "num_tokens": 3419136.0, + "reward": 0.15089674293994904, + "reward_std": 0.06239618360996246, + "rewards/bleu_reward_func/mean": 0.15089674293994904, + "rewards/bleu_reward_func/std": 0.09912555664777756, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 242.46875, + "completions/mean_terminated_length": 180.2692413330078, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.208, + "grad_norm": 4.740394592285156, + "kl": 0.08489990234375, + "learning_rate": 1e-06, + "loss": 0.0594, + "num_tokens": 3432127.0, + "reward": 0.05275239422917366, + "reward_std": 0.050225820392370224, + "rewards/bleu_reward_func/mean": 0.05275239422917366, + "rewards/bleu_reward_func/std": 0.07898835092782974, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 331.84375, + "completions/mean_terminated_length": 223.75, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.2088, + "grad_norm": 3.1740782260894775, + "kl": 0.05401611328125, + "learning_rate": 1e-06, + "loss": -0.0923, + "num_tokens": 3446746.0, + "reward": 0.12386887520551682, + "reward_std": 0.031204696744680405, + "rewards/bleu_reward_func/mean": 0.12386887520551682, + "rewards/bleu_reward_func/std": 0.1644604653120041, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 329.5, + "completions/mean_terminated_length": 268.66668701171875, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.2096, + "grad_norm": 2.937896728515625, + "kl": 0.033477783203125, + "learning_rate": 1e-06, + "loss": -0.0197, + "num_tokens": 3460890.0, + "reward": 0.05950773134827614, + "reward_std": 0.017293047159910202, + "rewards/bleu_reward_func/mean": 0.05950773134827614, + "rewards/bleu_reward_func/std": 0.04094443470239639, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 135.40625, + "completions/mean_terminated_length": 135.40625, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.2104, + "grad_norm": 8.865147590637207, + "kl": 0.2249755859375, + "learning_rate": 1e-06, + "loss": -0.0363, + "num_tokens": 3475103.0, + "reward": 0.20508863031864166, + "reward_std": 0.040958937257528305, + "rewards/bleu_reward_func/mean": 0.20508863031864166, + "rewards/bleu_reward_func/std": 0.14616157114505768, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 267.8125, + "completions/mean_terminated_length": 186.4166717529297, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.2112, + "grad_norm": 9.684611320495605, + "kl": 0.241973876953125, + "learning_rate": 1e-06, + "loss": 0.0071, + "num_tokens": 3487049.0, + "reward": 0.098166324198246, + "reward_std": 0.040819209069013596, + "rewards/bleu_reward_func/mean": 0.098166324198246, + "rewards/bleu_reward_func/std": 0.08471043407917023, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 260.875, + "completions/mean_terminated_length": 129.3333282470703, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.212, + "grad_norm": 10.798442840576172, + "kl": 0.1309814453125, + "learning_rate": 1e-06, + "loss": 0.3087, + "num_tokens": 3501029.0, + "reward": 0.12524467706680298, + "reward_std": 0.05395754426717758, + "rewards/bleu_reward_func/mean": 0.12524467706680298, + "rewards/bleu_reward_func/std": 0.1178852915763855, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 205.71875, + "completions/mean_terminated_length": 103.625, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.2128, + "grad_norm": 6.346302032470703, + "kl": 0.132049560546875, + "learning_rate": 1e-06, + "loss": 0.0884, + "num_tokens": 3511372.0, + "reward": 0.10632273554801941, + "reward_std": 0.041688427329063416, + "rewards/bleu_reward_func/mean": 0.10632273554801941, + "rewards/bleu_reward_func/std": 0.09963962435722351, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 231.4375, + "completions/mean_terminated_length": 137.9166717529297, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.2136, + "grad_norm": 9.553611755371094, + "kl": 0.2503204345703125, + "learning_rate": 1e-06, + "loss": -0.085, + "num_tokens": 3524570.0, + "reward": 0.08381873369216919, + "reward_std": 0.026928268373012543, + "rewards/bleu_reward_func/mean": 0.08381873369216919, + "rewards/bleu_reward_func/std": 0.06075910106301308, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 266.8125, + "completions/mean_terminated_length": 198.1599884033203, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.2144, + "grad_norm": 5.0754289627075195, + "kl": 0.09002685546875, + "learning_rate": 1e-06, + "loss": -0.0985, + "num_tokens": 3535156.0, + "reward": 0.04936995357275009, + "reward_std": 0.02683193050324917, + "rewards/bleu_reward_func/mean": 0.04936995357275009, + "rewards/bleu_reward_func/std": 0.05894342064857483, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 432.5, + "completions/mean_terminated_length": 300.0, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.2152, + "grad_norm": 2.118546724319458, + "kl": 0.0207061767578125, + "learning_rate": 1e-06, + "loss": -0.0481, + "num_tokens": 3555804.0, + "reward": 0.05241474509239197, + "reward_std": 0.019338509067893028, + "rewards/bleu_reward_func/mean": 0.05241474509239197, + "rewards/bleu_reward_func/std": 0.06824250519275665, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 192.83334350585938, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.216, + "grad_norm": 3.938976526260376, + "kl": 0.042999267578125, + "learning_rate": 1e-06, + "loss": -0.1052, + "num_tokens": 3572328.0, + "reward": 0.21750634908676147, + "reward_std": 0.06779822707176208, + "rewards/bleu_reward_func/mean": 0.21750634908676147, + "rewards/bleu_reward_func/std": 0.28914642333984375, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 293.3125, + "completions/mean_terminated_length": 178.76190185546875, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.2168, + "grad_norm": 3.788853645324707, + "kl": 0.041900634765625, + "learning_rate": 1e-06, + "loss": 0.0279, + "num_tokens": 3587050.0, + "reward": 0.04385410249233246, + "reward_std": 0.030311163514852524, + "rewards/bleu_reward_func/mean": 0.04385410249233246, + "rewards/bleu_reward_func/std": 0.047958169132471085, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 232.25, + "completions/mean_terminated_length": 139.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.2176, + "grad_norm": 12.908583641052246, + "kl": 0.464263916015625, + "learning_rate": 1e-06, + "loss": 0.2404, + "num_tokens": 3598874.0, + "reward": 0.1504618227481842, + "reward_std": 0.04004389047622681, + "rewards/bleu_reward_func/mean": 0.1504618227481842, + "rewards/bleu_reward_func/std": 0.16537794470787048, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 209.03125, + "completions/mean_terminated_length": 199.258056640625, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.2184, + "grad_norm": 6.985334873199463, + "kl": 0.16595458984375, + "learning_rate": 1e-06, + "loss": -0.0408, + "num_tokens": 3610515.0, + "reward": 0.21218228340148926, + "reward_std": 0.09676108509302139, + "rewards/bleu_reward_func/mean": 0.21218228340148926, + "rewards/bleu_reward_func/std": 0.22182048857212067, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 333.9375, + "completions/mean_terminated_length": 195.44444274902344, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.2192, + "grad_norm": 3.482099771499634, + "kl": 0.03741455078125, + "learning_rate": 1e-06, + "loss": -0.1819, + "num_tokens": 3623921.0, + "reward": 0.11982771754264832, + "reward_std": 0.063297338783741, + "rewards/bleu_reward_func/mean": 0.11982771754264832, + "rewards/bleu_reward_func/std": 0.09915972501039505, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 292.90625, + "completions/mean_terminated_length": 261.6071472167969, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.22, + "grad_norm": 4.1525559425354, + "kl": 0.041351318359375, + "learning_rate": 1e-06, + "loss": -0.0478, + "num_tokens": 3634918.0, + "reward": 0.05305434763431549, + "reward_std": 0.019571729004383087, + "rewards/bleu_reward_func/mean": 0.05305434763431549, + "rewards/bleu_reward_func/std": 0.04326590150594711, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 229.59375, + "completions/mean_terminated_length": 135.45834350585938, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.2208, + "grad_norm": 14.463852882385254, + "kl": 0.257415771484375, + "learning_rate": 1e-06, + "loss": -0.0433, + "num_tokens": 3647289.0, + "reward": 0.23456689715385437, + "reward_std": 0.08336643874645233, + "rewards/bleu_reward_func/mean": 0.23456689715385437, + "rewards/bleu_reward_func/std": 0.2258531004190445, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 193.375, + "completions/mean_terminated_length": 87.16667175292969, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.2216, + "grad_norm": 21.709369659423828, + "kl": 0.1391448974609375, + "learning_rate": 1e-06, + "loss": -0.0033, + "num_tokens": 3656709.0, + "reward": 0.16775630414485931, + "reward_std": 0.03647792339324951, + "rewards/bleu_reward_func/mean": 0.16775630414485931, + "rewards/bleu_reward_func/std": 0.15713484585285187, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 219.78125, + "completions/mean_terminated_length": 122.375, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.2224, + "grad_norm": 7.275771141052246, + "kl": 0.22491455078125, + "learning_rate": 1e-06, + "loss": 0.072, + "num_tokens": 3670158.0, + "reward": 0.1231408566236496, + "reward_std": 0.022272268310189247, + "rewards/bleu_reward_func/mean": 0.1231408566236496, + "rewards/bleu_reward_func/std": 0.1077708899974823, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 339.46875, + "completions/mean_terminated_length": 261.04547119140625, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.2232, + "grad_norm": 3.146303176879883, + "kl": 0.0390625, + "learning_rate": 1e-06, + "loss": -0.1333, + "num_tokens": 3683925.0, + "reward": 0.0675458312034607, + "reward_std": 0.017428681254386902, + "rewards/bleu_reward_func/mean": 0.0675458312034607, + "rewards/bleu_reward_func/std": 0.05334463343024254, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 321.9375, + "completions/mean_terminated_length": 268.7200012207031, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.224, + "grad_norm": 8.726150512695312, + "kl": 0.156707763671875, + "learning_rate": 1e-06, + "loss": 0.0175, + "num_tokens": 3699747.0, + "reward": 0.11248552799224854, + "reward_std": 0.03111671656370163, + "rewards/bleu_reward_func/mean": 0.11248552799224854, + "rewards/bleu_reward_func/std": 0.08908119797706604, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 297.375, + "completions/mean_terminated_length": 247.84616088867188, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.2248, + "grad_norm": 3.081026077270508, + "kl": 0.021881103515625, + "learning_rate": 1e-06, + "loss": -0.1217, + "num_tokens": 3712759.0, + "reward": 0.09313205629587173, + "reward_std": 0.03823218122124672, + "rewards/bleu_reward_func/mean": 0.09313205629587173, + "rewards/bleu_reward_func/std": 0.06713149696588516, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 316.46875, + "completions/mean_terminated_length": 251.2916717529297, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.2256, + "grad_norm": 3.1275222301483154, + "kl": 0.035675048828125, + "learning_rate": 1e-06, + "loss": -0.1645, + "num_tokens": 3725598.0, + "reward": 0.032498396933078766, + "reward_std": 0.018658628687262535, + "rewards/bleu_reward_func/mean": 0.032498396933078766, + "rewards/bleu_reward_func/std": 0.019405974075198174, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 194.71875, + "completions/mean_terminated_length": 173.56668090820312, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.2264, + "grad_norm": 4.276275634765625, + "kl": 0.08013916015625, + "learning_rate": 1e-06, + "loss": 0.0746, + "num_tokens": 3736861.0, + "reward": 0.12694165110588074, + "reward_std": 0.04432743415236473, + "rewards/bleu_reward_func/mean": 0.12694165110588074, + "rewards/bleu_reward_func/std": 0.13188457489013672, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 268.375, + "completions/mean_terminated_length": 101.68421173095703, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.2272, + "grad_norm": 7.712943077087402, + "kl": 0.271392822265625, + "learning_rate": 1e-06, + "loss": -0.0787, + "num_tokens": 3751545.0, + "reward": 0.1655203104019165, + "reward_std": 0.08383054286241531, + "rewards/bleu_reward_func/mean": 0.1655203104019165, + "rewards/bleu_reward_func/std": 0.1525241732597351, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 342.125, + "completions/mean_terminated_length": 225.89474487304688, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.228, + "grad_norm": 3.6280434131622314, + "kl": 0.046173095703125, + "learning_rate": 1e-06, + "loss": -0.1072, + "num_tokens": 3765781.0, + "reward": 0.042814724147319794, + "reward_std": 0.026553209871053696, + "rewards/bleu_reward_func/mean": 0.042814724147319794, + "rewards/bleu_reward_func/std": 0.03911494091153145, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 208.96875, + "completions/mean_terminated_length": 199.19354248046875, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.2288, + "grad_norm": 7.985737323760986, + "kl": 0.12091064453125, + "learning_rate": 1e-06, + "loss": -0.1044, + "num_tokens": 3779220.0, + "reward": 0.11331084370613098, + "reward_std": 0.025679122656583786, + "rewards/bleu_reward_func/mean": 0.11331084370613098, + "rewards/bleu_reward_func/std": 0.16165612637996674, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 351.3125, + "completions/mean_terminated_length": 209.5294189453125, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.2296, + "grad_norm": 3.912679433822632, + "kl": 0.026214599609375, + "learning_rate": 1e-06, + "loss": 0.1208, + "num_tokens": 3794550.0, + "reward": 0.01693039759993553, + "reward_std": 0.0203933697193861, + "rewards/bleu_reward_func/mean": 0.01693039759993553, + "rewards/bleu_reward_func/std": 0.02536601759493351, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 274.65625, + "completions/mean_terminated_length": 240.75001525878906, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.2304, + "grad_norm": 6.236807346343994, + "kl": 0.1007537841796875, + "learning_rate": 1e-06, + "loss": -0.0844, + "num_tokens": 3808595.0, + "reward": 0.13739125430583954, + "reward_std": 0.042728863656520844, + "rewards/bleu_reward_func/mean": 0.13739125430583954, + "rewards/bleu_reward_func/std": 0.09978168457746506, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 260.15625, + "completions/mean_terminated_length": 87.84210968017578, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.2312, + "grad_norm": 12.087539672851562, + "kl": 0.2603912353515625, + "learning_rate": 1e-06, + "loss": 0.1979, + "num_tokens": 3821552.0, + "reward": 0.1537414938211441, + "reward_std": 0.04864966496825218, + "rewards/bleu_reward_func/mean": 0.1537414938211441, + "rewards/bleu_reward_func/std": 0.08011970669031143, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 214.40625, + "completions/mean_terminated_length": 79.13636779785156, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.232, + "grad_norm": 6.511635780334473, + "kl": 0.12432861328125, + "learning_rate": 1e-06, + "loss": 0.2609, + "num_tokens": 3838117.0, + "reward": 0.19495005905628204, + "reward_std": 0.09461250901222229, + "rewards/bleu_reward_func/mean": 0.19495005905628204, + "rewards/bleu_reward_func/std": 0.20672400295734406, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 322.78125, + "completions/mean_terminated_length": 223.6666717529297, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.2328, + "grad_norm": 4.590160369873047, + "kl": 0.127716064453125, + "learning_rate": 1e-06, + "loss": -0.1225, + "num_tokens": 3853958.0, + "reward": 0.1360878348350525, + "reward_std": 0.03053300268948078, + "rewards/bleu_reward_func/mean": 0.1360878348350525, + "rewards/bleu_reward_func/std": 0.17878462374210358, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 462.6875, + "completions/mean_terminated_length": 390.6153869628906, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.2336, + "grad_norm": 2.3334367275238037, + "kl": 0.030426025390625, + "learning_rate": 1e-06, + "loss": -0.0405, + "num_tokens": 3875596.0, + "reward": 0.06421424448490143, + "reward_std": 0.02072659507393837, + "rewards/bleu_reward_func/mean": 0.06421424448490143, + "rewards/bleu_reward_func/std": 0.02574257366359234, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 325.03125, + "completions/mean_terminated_length": 251.86956787109375, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.2344, + "grad_norm": 3.4702064990997314, + "kl": 0.03289794921875, + "learning_rate": 1e-06, + "loss": 0.1122, + "num_tokens": 3892021.0, + "reward": 0.04875369742512703, + "reward_std": 0.020287783816456795, + "rewards/bleu_reward_func/mean": 0.04875369742512703, + "rewards/bleu_reward_func/std": 0.0285445898771286, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 107.375, + "completions/mean_terminated_length": 94.32257843017578, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.2352, + "grad_norm": 25.415559768676758, + "kl": 0.23876953125, + "learning_rate": 1e-06, + "loss": 0.1099, + "num_tokens": 3903457.0, + "reward": 0.12372880429029465, + "reward_std": 0.02668173238635063, + "rewards/bleu_reward_func/mean": 0.12372880429029465, + "rewards/bleu_reward_func/std": 0.12391357123851776, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 244.09375, + "completions/mean_terminated_length": 122.31818389892578, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.236, + "grad_norm": 8.319884300231934, + "kl": 0.14251708984375, + "learning_rate": 1e-06, + "loss": -0.034, + "num_tokens": 3917028.0, + "reward": 0.16006486117839813, + "reward_std": 0.02584708109498024, + "rewards/bleu_reward_func/mean": 0.16006486117839813, + "rewards/bleu_reward_func/std": 0.1484500914812088, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 278.875, + "completions/mean_terminated_length": 139.0, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.2368, + "grad_norm": 4.291149616241455, + "kl": 0.131500244140625, + "learning_rate": 1e-06, + "loss": -0.192, + "num_tokens": 3929400.0, + "reward": 0.09954051673412323, + "reward_std": 0.03838299959897995, + "rewards/bleu_reward_func/mean": 0.09954051673412323, + "rewards/bleu_reward_func/std": 0.13533763587474823, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 288.9375, + "completions/mean_terminated_length": 187.5454559326172, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.2376, + "grad_norm": 5.0546417236328125, + "kl": 0.10858154296875, + "learning_rate": 1e-06, + "loss": -0.0687, + "num_tokens": 3942222.0, + "reward": 0.16907253861427307, + "reward_std": 0.03968513384461403, + "rewards/bleu_reward_func/mean": 0.16907253861427307, + "rewards/bleu_reward_func/std": 0.10800375789403915, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 309.1875, + "completions/mean_terminated_length": 151.44444274902344, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.2384, + "grad_norm": 8.339001655578613, + "kl": 0.1490631103515625, + "learning_rate": 1e-06, + "loss": -0.0121, + "num_tokens": 3954316.0, + "reward": 0.06681232899427414, + "reward_std": 0.015474791638553143, + "rewards/bleu_reward_func/mean": 0.06681232899427414, + "rewards/bleu_reward_func/std": 0.06617429107427597, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 285.59375, + "completions/mean_terminated_length": 109.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.2392, + "grad_norm": 3.8715662956237793, + "kl": 0.050140380859375, + "learning_rate": 1e-06, + "loss": 0.1729, + "num_tokens": 3967087.0, + "reward": 0.1066230833530426, + "reward_std": 0.08889298141002655, + "rewards/bleu_reward_func/mean": 0.1066230833530426, + "rewards/bleu_reward_func/std": 0.14223438501358032, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 120.9375, + "completions/mean_terminated_length": 120.9375, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.24, + "grad_norm": 9.271559715270996, + "kl": 0.223388671875, + "learning_rate": 1e-06, + "loss": 0.0531, + "num_tokens": 3977797.0, + "reward": 0.09239183366298676, + "reward_std": 0.04012807458639145, + "rewards/bleu_reward_func/mean": 0.09239183366298676, + "rewards/bleu_reward_func/std": 0.07950045168399811, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 270.0, + "completions/mean_terminated_length": 202.239990234375, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.2408, + "grad_norm": 8.53159236907959, + "kl": 0.18048095703125, + "learning_rate": 1e-06, + "loss": -0.1823, + "num_tokens": 3988157.0, + "reward": 0.04499006271362305, + "reward_std": 0.015048853121697903, + "rewards/bleu_reward_func/mean": 0.04499006271362305, + "rewards/bleu_reward_func/std": 0.036676883697509766, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 224.0625, + "completions/mean_terminated_length": 157.61538696289062, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.2416, + "grad_norm": 6.0366997718811035, + "kl": 0.099029541015625, + "learning_rate": 1e-06, + "loss": -0.1824, + "num_tokens": 4000135.0, + "reward": 0.1630059778690338, + "reward_std": 0.04720958322286606, + "rewards/bleu_reward_func/mean": 0.1630059778690338, + "rewards/bleu_reward_func/std": 0.1834760457277298, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 169.3125, + "completions/mean_terminated_length": 90.23077392578125, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.2424, + "grad_norm": 9.543852806091309, + "kl": 0.35198974609375, + "learning_rate": 1e-06, + "loss": -0.2399, + "num_tokens": 4009009.0, + "reward": 0.06052142754197121, + "reward_std": 0.026765264570713043, + "rewards/bleu_reward_func/mean": 0.06052142754197121, + "rewards/bleu_reward_func/std": 0.052253786474466324, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 365.78125, + "completions/mean_terminated_length": 265.7368469238281, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.2432, + "grad_norm": 3.007157564163208, + "kl": 0.0393524169921875, + "learning_rate": 1e-06, + "loss": 0.032, + "num_tokens": 4023690.0, + "reward": 0.025675857439637184, + "reward_std": 0.013720525428652763, + "rewards/bleu_reward_func/mean": 0.025675857439637184, + "rewards/bleu_reward_func/std": 0.022033939138054848, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 158.6875, + "completions/mean_terminated_length": 158.6875, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.244, + "grad_norm": 7.10622501373291, + "kl": 0.21661376953125, + "learning_rate": 1e-06, + "loss": 0.166, + "num_tokens": 4033848.0, + "reward": 0.19492439925670624, + "reward_std": 0.0628402829170227, + "rewards/bleu_reward_func/mean": 0.19492439925670624, + "rewards/bleu_reward_func/std": 0.22491495311260223, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 345.53125, + "completions/mean_terminated_length": 290.04168701171875, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.2448, + "grad_norm": 4.572328090667725, + "kl": 0.099700927734375, + "learning_rate": 1e-06, + "loss": 0.1016, + "num_tokens": 4047897.0, + "reward": 0.12647973001003265, + "reward_std": 0.03362637385725975, + "rewards/bleu_reward_func/mean": 0.12647973001003265, + "rewards/bleu_reward_func/std": 0.08024211972951889, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 217.6875, + "completions/mean_terminated_length": 175.6428680419922, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.2456, + "grad_norm": 7.489211082458496, + "kl": 0.17584228515625, + "learning_rate": 1e-06, + "loss": -0.1361, + "num_tokens": 4062471.0, + "reward": 0.15859398245811462, + "reward_std": 0.059820279479026794, + "rewards/bleu_reward_func/mean": 0.15859398245811462, + "rewards/bleu_reward_func/std": 0.11927466094493866, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 304.0625, + "completions/mean_terminated_length": 209.5454559326172, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.2464, + "grad_norm": 6.605251789093018, + "kl": 0.15716552734375, + "learning_rate": 1e-06, + "loss": 0.1835, + "num_tokens": 4079553.0, + "reward": 0.048189468681812286, + "reward_std": 0.01783904619514942, + "rewards/bleu_reward_func/mean": 0.048189468681812286, + "rewards/bleu_reward_func/std": 0.037260618060827255, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 150.46875, + "completions/mean_terminated_length": 67.03846740722656, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.2472, + "grad_norm": 20.150175094604492, + "kl": 0.31695556640625, + "learning_rate": 1e-06, + "loss": -0.2639, + "num_tokens": 4089536.0, + "reward": 0.19017143547534943, + "reward_std": 0.06138678267598152, + "rewards/bleu_reward_func/mean": 0.19017143547534943, + "rewards/bleu_reward_func/std": 0.25128865242004395, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 171.3125, + "completions/mean_terminated_length": 136.0689697265625, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.248, + "grad_norm": 6.626379013061523, + "kl": 0.103729248046875, + "learning_rate": 1e-06, + "loss": -0.1912, + "num_tokens": 4100146.0, + "reward": 0.08903198689222336, + "reward_std": 0.029232412576675415, + "rewards/bleu_reward_func/mean": 0.08903198689222336, + "rewards/bleu_reward_func/std": 0.09126507490873337, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 216.46875, + "completions/mean_terminated_length": 117.95833587646484, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.2488, + "grad_norm": 5.2524285316467285, + "kl": 0.0589141845703125, + "learning_rate": 1e-06, + "loss": 0.3318, + "num_tokens": 4112841.0, + "reward": 0.07349678874015808, + "reward_std": 0.05337782949209213, + "rewards/bleu_reward_func/mean": 0.07349678874015808, + "rewards/bleu_reward_func/std": 0.10531707108020782, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 285.0, + "completions/mean_terminated_length": 196.17391967773438, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.2496, + "grad_norm": 5.209020137786865, + "kl": 0.11212158203125, + "learning_rate": 1e-06, + "loss": -0.1362, + "num_tokens": 4125369.0, + "reward": 0.1321243941783905, + "reward_std": 0.035379908978939056, + "rewards/bleu_reward_func/mean": 0.1321243941783905, + "rewards/bleu_reward_func/std": 0.12779219448566437, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 397.1875, + "completions/mean_terminated_length": 205.83334350585938, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.2504, + "grad_norm": 2.491729974746704, + "kl": 0.029266357421875, + "learning_rate": 1e-06, + "loss": -0.0819, + "num_tokens": 4142671.0, + "reward": 0.021221335977315903, + "reward_std": 0.008927191607654095, + "rewards/bleu_reward_func/mean": 0.021221335977315903, + "rewards/bleu_reward_func/std": 0.01940017379820347, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 276.8125, + "completions/mean_terminated_length": 222.53846740722656, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.2512, + "grad_norm": 3.1302947998046875, + "kl": 0.030731201171875, + "learning_rate": 1e-06, + "loss": -0.09, + "num_tokens": 4158681.0, + "reward": 0.18806447088718414, + "reward_std": 0.04276939481496811, + "rewards/bleu_reward_func/mean": 0.18806447088718414, + "rewards/bleu_reward_func/std": 0.2711097002029419, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 202.71875, + "completions/mean_terminated_length": 182.10000610351562, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.252, + "grad_norm": 9.11577320098877, + "kl": 0.321502685546875, + "learning_rate": 1e-06, + "loss": 0.2469, + "num_tokens": 4168304.0, + "reward": 0.17324072122573853, + "reward_std": 0.07514998316764832, + "rewards/bleu_reward_func/mean": 0.17324072122573853, + "rewards/bleu_reward_func/std": 0.15059800446033478, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 310.65625, + "completions/mean_terminated_length": 133.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.2528, + "grad_norm": 4.476902961730957, + "kl": 0.22100830078125, + "learning_rate": 1e-06, + "loss": -0.0695, + "num_tokens": 4183237.0, + "reward": 0.11044389009475708, + "reward_std": 0.04662460461258888, + "rewards/bleu_reward_func/mean": 0.11044389009475708, + "rewards/bleu_reward_func/std": 0.13189704716205597, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 326.40625, + "completions/mean_terminated_length": 264.54168701171875, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.2536, + "grad_norm": 4.724470138549805, + "kl": 0.039764404296875, + "learning_rate": 1e-06, + "loss": 0.0133, + "num_tokens": 4196786.0, + "reward": 0.1738719940185547, + "reward_std": 0.06735121458768845, + "rewards/bleu_reward_func/mean": 0.1738719940185547, + "rewards/bleu_reward_func/std": 0.15234871208667755, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 264.65625, + "completions/mean_terminated_length": 116.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.2544, + "grad_norm": 7.755268096923828, + "kl": 0.23388671875, + "learning_rate": 1e-06, + "loss": 0.0091, + "num_tokens": 4211319.0, + "reward": 0.16174694895744324, + "reward_std": 0.04472574219107628, + "rewards/bleu_reward_func/mean": 0.16174694895744324, + "rewards/bleu_reward_func/std": 0.13533204793930054, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 326.0625, + "completions/mean_terminated_length": 228.6666717529297, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.2552, + "grad_norm": 3.5100746154785156, + "kl": 0.0638427734375, + "learning_rate": 1e-06, + "loss": -0.0174, + "num_tokens": 4224641.0, + "reward": 0.14605101943016052, + "reward_std": 0.039064351469278336, + "rewards/bleu_reward_func/mean": 0.14605101943016052, + "rewards/bleu_reward_func/std": 0.1437525898218155, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 387.5, + "completions/mean_terminated_length": 180.0, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.256, + "grad_norm": 3.499901056289673, + "kl": 0.03240966796875, + "learning_rate": 1e-06, + "loss": -0.101, + "num_tokens": 4244041.0, + "reward": 0.038129642605781555, + "reward_std": 0.0157744400203228, + "rewards/bleu_reward_func/mean": 0.038129642605781555, + "rewards/bleu_reward_func/std": 0.030961766839027405, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 235.75, + "completions/mean_terminated_length": 207.1724090576172, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.2568, + "grad_norm": 6.800954341888428, + "kl": 0.172210693359375, + "learning_rate": 1e-06, + "loss": -0.2682, + "num_tokens": 4257425.0, + "reward": 0.08078090846538544, + "reward_std": 0.0318281352519989, + "rewards/bleu_reward_func/mean": 0.08078090846538544, + "rewards/bleu_reward_func/std": 0.060885149985551834, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 153.5, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.2576, + "grad_norm": 6.995741367340088, + "kl": 0.197662353515625, + "learning_rate": 1e-06, + "loss": 0.0321, + "num_tokens": 4270729.0, + "reward": 0.3046156167984009, + "reward_std": 0.045112840831279755, + "rewards/bleu_reward_func/mean": 0.3046156167984009, + "rewards/bleu_reward_func/std": 0.17106564342975616, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 309.0625, + "completions/mean_terminated_length": 216.8181915283203, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.2584, + "grad_norm": 8.159075736999512, + "kl": 0.11962890625, + "learning_rate": 1e-06, + "loss": 0.0599, + "num_tokens": 4286907.0, + "reward": 0.11749087274074554, + "reward_std": 0.04918123036623001, + "rewards/bleu_reward_func/mean": 0.11749087274074554, + "rewards/bleu_reward_func/std": 0.12518151104450226, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 156.90625, + "completions/mean_terminated_length": 156.90625, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.2592, + "grad_norm": 7.079853057861328, + "kl": 0.09991455078125, + "learning_rate": 1e-06, + "loss": 0.0397, + "num_tokens": 4295536.0, + "reward": 0.11096417158842087, + "reward_std": 0.04051455110311508, + "rewards/bleu_reward_func/mean": 0.11096417158842087, + "rewards/bleu_reward_func/std": 0.1420901119709015, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 150.875, + "completions/mean_terminated_length": 49.7599983215332, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.26, + "grad_norm": 8.065258026123047, + "kl": 0.167816162109375, + "learning_rate": 1e-06, + "loss": -0.0243, + "num_tokens": 4306404.0, + "reward": 0.13756218552589417, + "reward_std": 0.02154640108346939, + "rewards/bleu_reward_func/mean": 0.13756218552589417, + "rewards/bleu_reward_func/std": 0.14523112773895264, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 385.75, + "completions/mean_terminated_length": 310.0, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.2608, + "grad_norm": 2.441365957260132, + "kl": 0.019775390625, + "learning_rate": 1e-06, + "loss": 0.024, + "num_tokens": 4323836.0, + "reward": 0.023768192157149315, + "reward_std": 0.009069718420505524, + "rewards/bleu_reward_func/mean": 0.023768192157149315, + "rewards/bleu_reward_func/std": 0.029040560126304626, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 319.46875, + "completions/mean_terminated_length": 203.9499969482422, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.2616, + "grad_norm": 5.7556071281433105, + "kl": 0.091705322265625, + "learning_rate": 1e-06, + "loss": 0.0042, + "num_tokens": 4338667.0, + "reward": 0.07871399819850922, + "reward_std": 0.03653344139456749, + "rewards/bleu_reward_func/mean": 0.07871399819850922, + "rewards/bleu_reward_func/std": 0.06572794169187546, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 264.84375, + "completions/mean_terminated_length": 152.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.2624, + "grad_norm": 6.231250286102295, + "kl": 0.1138916015625, + "learning_rate": 1e-06, + "loss": -0.0458, + "num_tokens": 4351270.0, + "reward": 0.16190959513187408, + "reward_std": 0.02650507725775242, + "rewards/bleu_reward_func/mean": 0.16190959513187408, + "rewards/bleu_reward_func/std": 0.15018552541732788, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 350.625, + "completions/mean_terminated_length": 277.2727355957031, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.2632, + "grad_norm": 2.828697681427002, + "kl": 0.02972412109375, + "learning_rate": 1e-06, + "loss": 0.0748, + "num_tokens": 4366018.0, + "reward": 0.07461819052696228, + "reward_std": 0.034676797688007355, + "rewards/bleu_reward_func/mean": 0.07461819052696228, + "rewards/bleu_reward_func/std": 0.10171358287334442, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 330.96875, + "completions/mean_terminated_length": 171.23529052734375, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.264, + "grad_norm": 7.326402187347412, + "kl": 0.0977630615234375, + "learning_rate": 1e-06, + "loss": 0.2768, + "num_tokens": 4378353.0, + "reward": 0.07485680282115936, + "reward_std": 0.04837151616811752, + "rewards/bleu_reward_func/mean": 0.07485680282115936, + "rewards/bleu_reward_func/std": 0.04874453693628311, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 65.40625, + "completions/mean_terminated_length": 65.40625, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.2648, + "grad_norm": 12.08074951171875, + "kl": 0.335693359375, + "learning_rate": 1e-06, + "loss": 0.1573, + "num_tokens": 4384062.0, + "reward": 0.19588544964790344, + "reward_std": 0.09824244678020477, + "rewards/bleu_reward_func/mean": 0.19588544964790344, + "rewards/bleu_reward_func/std": 0.16972649097442627, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 203.0800018310547, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.2656, + "grad_norm": 4.561427593231201, + "kl": 0.039154052734375, + "learning_rate": 1e-06, + "loss": -0.0917, + "num_tokens": 4394427.0, + "reward": 0.06531640887260437, + "reward_std": 0.018873782828450203, + "rewards/bleu_reward_func/mean": 0.06531640887260437, + "rewards/bleu_reward_func/std": 0.059104837477207184, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 200.25, + "completions/mean_terminated_length": 155.71429443359375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.2664, + "grad_norm": 6.5239057540893555, + "kl": 0.169952392578125, + "learning_rate": 1e-06, + "loss": 0.0545, + "num_tokens": 4409739.0, + "reward": 0.23698079586029053, + "reward_std": 0.08829502761363983, + "rewards/bleu_reward_func/mean": 0.23698079586029053, + "rewards/bleu_reward_func/std": 0.2539888322353363, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 142.1875, + "completions/mean_terminated_length": 103.93103790283203, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.2672, + "grad_norm": 6.988838195800781, + "kl": 0.192413330078125, + "learning_rate": 1e-06, + "loss": 0.0242, + "num_tokens": 4420033.0, + "reward": 0.18931233882904053, + "reward_std": 0.06329823285341263, + "rewards/bleu_reward_func/mean": 0.18931233882904053, + "rewards/bleu_reward_func/std": 0.16267651319503784, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 149.71875, + "completions/mean_terminated_length": 66.11538696289062, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.268, + "grad_norm": 16.95305061340332, + "kl": 0.3260498046875, + "learning_rate": 1e-06, + "loss": 0.3727, + "num_tokens": 4429376.0, + "reward": 0.11154920607805252, + "reward_std": 0.06479852646589279, + "rewards/bleu_reward_func/mean": 0.11154920607805252, + "rewards/bleu_reward_func/std": 0.07707681506872177, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 216.125, + "completions/mean_terminated_length": 185.51724243164062, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.2688, + "grad_norm": 12.891951560974121, + "kl": 0.154815673828125, + "learning_rate": 1e-06, + "loss": 0.1506, + "num_tokens": 4438340.0, + "reward": 0.11881305277347565, + "reward_std": 0.04300341382622719, + "rewards/bleu_reward_func/mean": 0.11881305277347565, + "rewards/bleu_reward_func/std": 0.11628168076276779, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 259.375, + "completions/mean_terminated_length": 127.04762268066406, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.2696, + "grad_norm": 8.3147554397583, + "kl": 0.0980682373046875, + "learning_rate": 1e-06, + "loss": 0.0185, + "num_tokens": 4451328.0, + "reward": 0.11791149526834488, + "reward_std": 0.02945806086063385, + "rewards/bleu_reward_func/mean": 0.11791149526834488, + "rewards/bleu_reward_func/std": 0.06387177854776382, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 159.28125, + "completions/mean_terminated_length": 60.52000045776367, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.2704, + "grad_norm": 6.874416828155518, + "kl": 0.235107421875, + "learning_rate": 1e-06, + "loss": 0.1172, + "num_tokens": 4461785.0, + "reward": 0.18331755697727203, + "reward_std": 0.05733542889356613, + "rewards/bleu_reward_func/mean": 0.18331755697727203, + "rewards/bleu_reward_func/std": 0.17218343913555145, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 206.1875, + "completions/mean_terminated_length": 120.55999755859375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.2712, + "grad_norm": 7.444963455200195, + "kl": 0.08843994140625, + "learning_rate": 1e-06, + "loss": 0.3417, + "num_tokens": 4471031.0, + "reward": 0.08221863210201263, + "reward_std": 0.030037853866815567, + "rewards/bleu_reward_func/mean": 0.08221863210201263, + "rewards/bleu_reward_func/std": 0.05527469143271446, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 256.84375, + "completions/mean_terminated_length": 197.9615478515625, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.272, + "grad_norm": 6855.86328125, + "kl": 1.03955078125, + "learning_rate": 1e-06, + "loss": 0.0411, + "num_tokens": 4485834.0, + "reward": 0.13405509293079376, + "reward_std": 0.03707335144281387, + "rewards/bleu_reward_func/mean": 0.13405509293079376, + "rewards/bleu_reward_func/std": 0.15687085688114166, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 211.0, + "completions/mean_terminated_length": 141.53846740722656, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.2728, + "grad_norm": 8.717283248901367, + "kl": 0.128326416015625, + "learning_rate": 1e-06, + "loss": -0.0381, + "num_tokens": 4496202.0, + "reward": 0.0755915641784668, + "reward_std": 0.029588045552372932, + "rewards/bleu_reward_func/mean": 0.0755915641784668, + "rewards/bleu_reward_func/std": 0.05914263799786568, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 213.15625, + "completions/mean_terminated_length": 129.47999572753906, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.2736, + "grad_norm": 9.269394874572754, + "kl": 0.21234130859375, + "learning_rate": 1e-06, + "loss": -0.0787, + "num_tokens": 4505447.0, + "reward": 0.11310072988271713, + "reward_std": 0.035067904740571976, + "rewards/bleu_reward_func/mean": 0.11310072988271713, + "rewards/bleu_reward_func/std": 0.10819036513566971, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 353.90625, + "completions/mean_terminated_length": 174.73333740234375, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.2744, + "grad_norm": 6.147165298461914, + "kl": 0.0384521484375, + "learning_rate": 1e-06, + "loss": -0.0413, + "num_tokens": 4519052.0, + "reward": 0.06785966455936432, + "reward_std": 0.039666250348091125, + "rewards/bleu_reward_func/mean": 0.06785966455936432, + "rewards/bleu_reward_func/std": 0.059012189507484436, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 370.84375, + "completions/mean_terminated_length": 261.0555725097656, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.2752, + "grad_norm": 6.257096767425537, + "kl": 0.170440673828125, + "learning_rate": 1e-06, + "loss": 0.0397, + "num_tokens": 4533975.0, + "reward": 0.05020497739315033, + "reward_std": 0.009127253666520119, + "rewards/bleu_reward_func/mean": 0.05020497739315033, + "rewards/bleu_reward_func/std": 0.04745229333639145, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 260.1875, + "completions/mean_terminated_length": 64.33333587646484, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.276, + "grad_norm": 8.694131851196289, + "kl": 0.40167236328125, + "learning_rate": 1e-06, + "loss": 0.124, + "num_tokens": 4548765.0, + "reward": 0.17815490067005157, + "reward_std": 0.04761611297726631, + "rewards/bleu_reward_func/mean": 0.17815490067005157, + "rewards/bleu_reward_func/std": 0.22018791735172272, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 334.78125, + "completions/mean_terminated_length": 275.7083435058594, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.2768, + "grad_norm": 6.1226325035095215, + "kl": 0.105133056640625, + "learning_rate": 1e-06, + "loss": -0.0251, + "num_tokens": 4565158.0, + "reward": 0.09645688533782959, + "reward_std": 0.0746307447552681, + "rewards/bleu_reward_func/mean": 0.09645688533782959, + "rewards/bleu_reward_func/std": 0.1715475171804428, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 98.46875, + "completions/mean_terminated_length": 98.46875, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.2776, + "grad_norm": 8.647904396057129, + "kl": 0.328857421875, + "learning_rate": 1e-06, + "loss": -0.0264, + "num_tokens": 4576637.0, + "reward": 0.3595752716064453, + "reward_std": 0.09626303613185883, + "rewards/bleu_reward_func/mean": 0.3595752716064453, + "rewards/bleu_reward_func/std": 0.293544203042984, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 191.28125, + "completions/mean_terminated_length": 84.375, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.2784, + "grad_norm": 7.7827630043029785, + "kl": 0.24041748046875, + "learning_rate": 1e-06, + "loss": 0.0475, + "num_tokens": 4586230.0, + "reward": 0.2051679939031601, + "reward_std": 0.029646433889865875, + "rewards/bleu_reward_func/mean": 0.2051679939031601, + "rewards/bleu_reward_func/std": 0.20678655803203583, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 320.5, + "completions/mean_terminated_length": 256.66668701171875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.2792, + "grad_norm": 8.593353271484375, + "kl": 0.149017333984375, + "learning_rate": 1e-06, + "loss": -0.0651, + "num_tokens": 4603070.0, + "reward": 0.1438911259174347, + "reward_std": 0.06431536376476288, + "rewards/bleu_reward_func/mean": 0.1438911259174347, + "rewards/bleu_reward_func/std": 0.22814705967903137, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 381.96875, + "completions/mean_terminated_length": 191.92308044433594, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.28, + "grad_norm": 2.2874648571014404, + "kl": 0.023284912109375, + "learning_rate": 1e-06, + "loss": 0.0246, + "num_tokens": 4619757.0, + "reward": 0.19660863280296326, + "reward_std": 0.08571420609951019, + "rewards/bleu_reward_func/mean": 0.19660863280296326, + "rewards/bleu_reward_func/std": 0.2662343680858612, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 380.53125, + "completions/mean_terminated_length": 290.5789489746094, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.2808, + "grad_norm": 2.8600640296936035, + "kl": 0.02679443359375, + "learning_rate": 1e-06, + "loss": 0.082, + "num_tokens": 4636806.0, + "reward": 0.05401962995529175, + "reward_std": 0.019372381269931793, + "rewards/bleu_reward_func/mean": 0.05401962995529175, + "rewards/bleu_reward_func/std": 0.026677841320633888, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 328.125, + "completions/mean_terminated_length": 244.5454559326172, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.2816, + "grad_norm": 6.117258548736572, + "kl": 0.10986328125, + "learning_rate": 1e-06, + "loss": 0.162, + "num_tokens": 4649338.0, + "reward": 0.12430500984191895, + "reward_std": 0.046015314757823944, + "rewards/bleu_reward_func/mean": 0.12430500984191895, + "rewards/bleu_reward_func/std": 0.11290674656629562, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 335.15625, + "completions/mean_terminated_length": 179.11764526367188, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.2824, + "grad_norm": 9.883430480957031, + "kl": 0.134429931640625, + "learning_rate": 1e-06, + "loss": -0.0897, + "num_tokens": 4664823.0, + "reward": 0.10318648815155029, + "reward_std": 0.040948014706373215, + "rewards/bleu_reward_func/mean": 0.10318648815155029, + "rewards/bleu_reward_func/std": 0.098084457218647, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 213.96875, + "completions/mean_terminated_length": 204.35482788085938, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.2832, + "grad_norm": 3.7569406032562256, + "kl": 0.0567626953125, + "learning_rate": 1e-06, + "loss": 0.276, + "num_tokens": 4673198.0, + "reward": 0.02880779653787613, + "reward_std": 0.02136135660111904, + "rewards/bleu_reward_func/mean": 0.02880779653787613, + "rewards/bleu_reward_func/std": 0.031262028962373734, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 382.34375, + "completions/mean_terminated_length": 134.8181915283203, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.284, + "grad_norm": 5.402606010437012, + "kl": 0.059539794921875, + "learning_rate": 1e-06, + "loss": 0.0167, + "num_tokens": 4690033.0, + "reward": 0.11326944082975388, + "reward_std": 0.04008851572871208, + "rewards/bleu_reward_func/mean": 0.11326944082975388, + "rewards/bleu_reward_func/std": 0.1632446050643921, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 49.5, + "completions/mean_terminated_length": 49.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.2848, + "grad_norm": 15.920856475830078, + "kl": 0.2000732421875, + "learning_rate": 1e-06, + "loss": 0.1343, + "num_tokens": 4696953.0, + "reward": 0.1625998467206955, + "reward_std": 0.10141640901565552, + "rewards/bleu_reward_func/mean": 0.1625998467206955, + "rewards/bleu_reward_func/std": 0.12067051976919174, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 103.71875, + "completions/mean_terminated_length": 103.71875, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.2856, + "grad_norm": 32.86006546020508, + "kl": 0.153564453125, + "learning_rate": 1e-06, + "loss": 0.1497, + "num_tokens": 4705000.0, + "reward": 0.05853947252035141, + "reward_std": 0.014492938295006752, + "rewards/bleu_reward_func/mean": 0.05853947252035141, + "rewards/bleu_reward_func/std": 0.02192818373441696, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 110.96875, + "completions/mean_terminated_length": 110.96875, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.2864, + "grad_norm": 8.785351753234863, + "kl": 0.1767578125, + "learning_rate": 1e-06, + "loss": 0.0291, + "num_tokens": 4713815.0, + "reward": 0.256367951631546, + "reward_std": 0.06547890603542328, + "rewards/bleu_reward_func/mean": 0.256367951631546, + "rewards/bleu_reward_func/std": 0.2225809097290039, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 333.21875, + "completions/mean_terminated_length": 175.47059631347656, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.2872, + "grad_norm": 3.714874744415283, + "kl": 0.0813140869140625, + "learning_rate": 1e-06, + "loss": 0.0176, + "num_tokens": 4732606.0, + "reward": 0.08705547451972961, + "reward_std": 0.02976841665804386, + "rewards/bleu_reward_func/mean": 0.08705547451972961, + "rewards/bleu_reward_func/std": 0.041370097547769547, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 159.0, + "completions/mean_terminated_length": 147.61289978027344, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.288, + "grad_norm": 7.568475723266602, + "kl": 0.069976806640625, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 4742086.0, + "reward": 0.05895683914422989, + "reward_std": 0.036796510219573975, + "rewards/bleu_reward_func/mean": 0.05895683914422989, + "rewards/bleu_reward_func/std": 0.06153297796845436, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 303.34375, + "completions/mean_terminated_length": 221.69566345214844, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.2888, + "grad_norm": 3.495642900466919, + "kl": 0.033843994140625, + "learning_rate": 1e-06, + "loss": 0.1195, + "num_tokens": 4755153.0, + "reward": 0.024642691016197205, + "reward_std": 0.00707631791010499, + "rewards/bleu_reward_func/mean": 0.024642691016197205, + "rewards/bleu_reward_func/std": 0.01350654847919941, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 349.59375, + "completions/mean_terminated_length": 238.4736785888672, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.2896, + "grad_norm": 3.2497663497924805, + "kl": 0.032928466796875, + "learning_rate": 1e-06, + "loss": 0.0909, + "num_tokens": 4768724.0, + "reward": 0.06024404242634773, + "reward_std": 0.029051221907138824, + "rewards/bleu_reward_func/mean": 0.06024404242634773, + "rewards/bleu_reward_func/std": 0.05113474279642105, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 308.21875, + "completions/mean_terminated_length": 201.4761962890625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.2904, + "grad_norm": 3.932180643081665, + "kl": 0.0535736083984375, + "learning_rate": 1e-06, + "loss": -0.1216, + "num_tokens": 4784611.0, + "reward": 0.10957776010036469, + "reward_std": 0.018995165824890137, + "rewards/bleu_reward_func/mean": 0.10957776010036469, + "rewards/bleu_reward_func/std": 0.12744034826755524, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 315.15625, + "completions/mean_terminated_length": 260.0400085449219, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.2912, + "grad_norm": 3.873363971710205, + "kl": 0.04693603515625, + "learning_rate": 1e-06, + "loss": 0.1157, + "num_tokens": 4798600.0, + "reward": 0.06850136816501617, + "reward_std": 0.03206296265125275, + "rewards/bleu_reward_func/mean": 0.06850136816501617, + "rewards/bleu_reward_func/std": 0.06299194693565369, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 301.28125, + "completions/mean_terminated_length": 205.5, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.292, + "grad_norm": 3.491849184036255, + "kl": 0.050079345703125, + "learning_rate": 1e-06, + "loss": 0.1634, + "num_tokens": 4812193.0, + "reward": 0.0632539913058281, + "reward_std": 0.04620906710624695, + "rewards/bleu_reward_func/mean": 0.0632539913058281, + "rewards/bleu_reward_func/std": 0.08490858227014542, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 320.40625, + "completions/mean_terminated_length": 266.7599792480469, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.2928, + "grad_norm": 10.243452072143555, + "kl": 0.1219482421875, + "learning_rate": 1e-06, + "loss": 0.0432, + "num_tokens": 4824134.0, + "reward": 0.0788659006357193, + "reward_std": 0.019495027139782906, + "rewards/bleu_reward_func/mean": 0.0788659006357193, + "rewards/bleu_reward_func/std": 0.05461956560611725, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 357.40625, + "completions/mean_terminated_length": 296.9130554199219, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.2936, + "grad_norm": 2.715989351272583, + "kl": 0.036712646484375, + "learning_rate": 1e-06, + "loss": -0.1141, + "num_tokens": 4839219.0, + "reward": 0.1387082040309906, + "reward_std": 0.025043122470378876, + "rewards/bleu_reward_func/mean": 0.1387082040309906, + "rewards/bleu_reward_func/std": 0.14657536149024963, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 418.0, + "completions/mean_terminated_length": 261.3333435058594, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.2944, + "grad_norm": 2.414018154144287, + "kl": 0.029937744140625, + "learning_rate": 1e-06, + "loss": 0.0066, + "num_tokens": 4857699.0, + "reward": 0.06751300394535065, + "reward_std": 0.05967854708433151, + "rewards/bleu_reward_func/mean": 0.06751300394535065, + "rewards/bleu_reward_func/std": 0.08448994904756546, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 50.09375, + "completions/mean_terminated_length": 50.09375, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.2952, + "grad_norm": 12.07331657409668, + "kl": 0.331298828125, + "learning_rate": 1e-06, + "loss": -0.167, + "num_tokens": 4865766.0, + "reward": 0.2235146164894104, + "reward_std": 0.06765347719192505, + "rewards/bleu_reward_func/mean": 0.2235146164894104, + "rewards/bleu_reward_func/std": 0.15006797015666962, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 300.125, + "completions/mean_terminated_length": 155.15789794921875, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.296, + "grad_norm": 6.003938674926758, + "kl": 0.062225341796875, + "learning_rate": 1e-06, + "loss": 0.0295, + "num_tokens": 4883786.0, + "reward": 0.09686341136693954, + "reward_std": 0.04255010187625885, + "rewards/bleu_reward_func/mean": 0.09686341136693954, + "rewards/bleu_reward_func/std": 0.11752825975418091, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 381.84375, + "completions/mean_terminated_length": 338.4583435058594, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.2968, + "grad_norm": 2.782743215560913, + "kl": 0.041107177734375, + "learning_rate": 1e-06, + "loss": -0.0931, + "num_tokens": 4898397.0, + "reward": 0.06518180668354034, + "reward_std": 0.017261603847146034, + "rewards/bleu_reward_func/mean": 0.06518180668354034, + "rewards/bleu_reward_func/std": 0.07592527568340302, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 191.71875, + "completions/mean_terminated_length": 170.36666870117188, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.2976, + "grad_norm": 6.834630489349365, + "kl": 0.155029296875, + "learning_rate": 1e-06, + "loss": -0.0674, + "num_tokens": 4907564.0, + "reward": 0.0751166045665741, + "reward_std": 0.03539106994867325, + "rewards/bleu_reward_func/mean": 0.0751166045665741, + "rewards/bleu_reward_func/std": 0.03759034350514412, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 167.65625, + "completions/mean_terminated_length": 156.5483856201172, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.2984, + "grad_norm": 9.550743103027344, + "kl": 0.1636962890625, + "learning_rate": 1e-06, + "loss": 0.1533, + "num_tokens": 4916969.0, + "reward": 0.12691722810268402, + "reward_std": 0.019398069009184837, + "rewards/bleu_reward_func/mean": 0.12691722810268402, + "rewards/bleu_reward_func/std": 0.14723701775074005, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 267.6875, + "completions/mean_terminated_length": 199.27999877929688, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.2992, + "grad_norm": 5.277988433837891, + "kl": 0.143890380859375, + "learning_rate": 1e-06, + "loss": -0.2024, + "num_tokens": 4930367.0, + "reward": 0.21388903260231018, + "reward_std": 0.0590648353099823, + "rewards/bleu_reward_func/mean": 0.21388903260231018, + "rewards/bleu_reward_func/std": 0.2627076506614685, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 337.59375, + "completions/mean_terminated_length": 246.23809814453125, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.3, + "grad_norm": 10.797468185424805, + "kl": 0.12530517578125, + "learning_rate": 1e-06, + "loss": -0.06, + "num_tokens": 4948002.0, + "reward": 0.1380675733089447, + "reward_std": 0.049179110676050186, + "rewards/bleu_reward_func/mean": 0.1380675733089447, + "rewards/bleu_reward_func/std": 0.14962899684906006, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 250.375, + "completions/mean_terminated_length": 163.1666717529297, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.3008, + "grad_norm": 6.259679794311523, + "kl": 0.1361083984375, + "learning_rate": 1e-06, + "loss": -0.0324, + "num_tokens": 4963942.0, + "reward": 0.2779002785682678, + "reward_std": 0.049215167760849, + "rewards/bleu_reward_func/mean": 0.2779002785682678, + "rewards/bleu_reward_func/std": 0.247111514210701, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 228.96875, + "completions/mean_terminated_length": 188.5357208251953, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.3016, + "grad_norm": 9.751809120178223, + "kl": 0.12164306640625, + "learning_rate": 1e-06, + "loss": -0.0784, + "num_tokens": 4974437.0, + "reward": 0.12611877918243408, + "reward_std": 0.05333450064063072, + "rewards/bleu_reward_func/mean": 0.12611877918243408, + "rewards/bleu_reward_func/std": 0.11847065389156342, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 428.34375, + "completions/mean_terminated_length": 306.0769348144531, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.3024, + "grad_norm": 1.8198633193969727, + "kl": 0.025543212890625, + "learning_rate": 1e-06, + "loss": 0.0668, + "num_tokens": 4993880.0, + "reward": 0.07207944989204407, + "reward_std": 0.019526129588484764, + "rewards/bleu_reward_func/mean": 0.07207944989204407, + "rewards/bleu_reward_func/std": 0.06778865307569504, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 63.40625, + "completions/mean_terminated_length": 63.40625, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.3032, + "grad_norm": 6.933629512786865, + "kl": 0.13250732421875, + "learning_rate": 1e-06, + "loss": 0.3354, + "num_tokens": 5001909.0, + "reward": 0.12609761953353882, + "reward_std": 0.07611958682537079, + "rewards/bleu_reward_func/mean": 0.12609761953353882, + "rewards/bleu_reward_func/std": 0.09586605429649353, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 205.25, + "completions/mean_terminated_length": 148.44444274902344, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.304, + "grad_norm": 8.042766571044922, + "kl": 0.155029296875, + "learning_rate": 1e-06, + "loss": 0.1587, + "num_tokens": 5012533.0, + "reward": 0.18430504202842712, + "reward_std": 0.09831003099679947, + "rewards/bleu_reward_func/mean": 0.18430504202842712, + "rewards/bleu_reward_func/std": 0.1858755648136139, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 333.46875, + "completions/mean_terminated_length": 211.3157958984375, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.3048, + "grad_norm": 19.682376861572266, + "kl": 0.1099853515625, + "learning_rate": 1e-06, + "loss": 0.015, + "num_tokens": 5028972.0, + "reward": 0.09155917167663574, + "reward_std": 0.012800632044672966, + "rewards/bleu_reward_func/mean": 0.09155917167663574, + "rewards/bleu_reward_func/std": 0.1374584585428238, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 301.6875, + "completions/mean_terminated_length": 253.1538543701172, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.3056, + "grad_norm": 3.2053592205047607, + "kl": 0.051788330078125, + "learning_rate": 1e-06, + "loss": 0.2446, + "num_tokens": 5041090.0, + "reward": 0.06323020905256271, + "reward_std": 0.032996732741594315, + "rewards/bleu_reward_func/mean": 0.06323020905256271, + "rewards/bleu_reward_func/std": 0.05562639981508255, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 369.375, + "completions/mean_terminated_length": 160.92308044433594, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.3064, + "grad_norm": 7.353909492492676, + "kl": 0.07830810546875, + "learning_rate": 1e-06, + "loss": 0.0282, + "num_tokens": 5057798.0, + "reward": 0.1571401059627533, + "reward_std": 0.02875007688999176, + "rewards/bleu_reward_func/mean": 0.1571401059627533, + "rewards/bleu_reward_func/std": 0.20372198522090912, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 316.84375, + "completions/mean_terminated_length": 199.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.3072, + "grad_norm": 3.984431743621826, + "kl": 0.066986083984375, + "learning_rate": 1e-06, + "loss": 0.0376, + "num_tokens": 5073449.0, + "reward": 0.03315318748354912, + "reward_std": 0.038507476449012756, + "rewards/bleu_reward_func/mean": 0.03315318748354912, + "rewards/bleu_reward_func/std": 0.06562887132167816, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 319.375, + "completions/mean_terminated_length": 126.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.308, + "grad_norm": 6.559665203094482, + "kl": 0.0677490234375, + "learning_rate": 1e-06, + "loss": 0.3157, + "num_tokens": 5087821.0, + "reward": 0.06230534613132477, + "reward_std": 0.03765605762600899, + "rewards/bleu_reward_func/mean": 0.06230534613132477, + "rewards/bleu_reward_func/std": 0.07213454693555832, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 319.15625, + "completions/mean_terminated_length": 265.1600036621094, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.3088, + "grad_norm": 3.6326193809509277, + "kl": 0.05596923828125, + "learning_rate": 1e-06, + "loss": -0.0929, + "num_tokens": 5100618.0, + "reward": 0.04398781806230545, + "reward_std": 0.02026546560227871, + "rewards/bleu_reward_func/mean": 0.04398781806230545, + "rewards/bleu_reward_func/std": 0.042056936770677567, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 326.0, + "completions/mean_terminated_length": 181.3333282470703, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.3096, + "grad_norm": 4.189205646514893, + "kl": 0.0577392578125, + "learning_rate": 1e-06, + "loss": 0.0616, + "num_tokens": 5118850.0, + "reward": 0.10049895197153091, + "reward_std": 0.035130538046360016, + "rewards/bleu_reward_func/mean": 0.10049895197153091, + "rewards/bleu_reward_func/std": 0.0897059291601181, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 357.125, + "completions/mean_terminated_length": 251.15789794921875, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.3104, + "grad_norm": 8.503087997436523, + "kl": 0.1228790283203125, + "learning_rate": 1e-06, + "loss": 0.1152, + "num_tokens": 5131574.0, + "reward": 0.10157294571399689, + "reward_std": 0.05235150083899498, + "rewards/bleu_reward_func/mean": 0.10157294571399689, + "rewards/bleu_reward_func/std": 0.11832693964242935, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 261.0625, + "completions/mean_terminated_length": 244.33334350585938, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.3112, + "grad_norm": 7.511518478393555, + "kl": 0.0782470703125, + "learning_rate": 1e-06, + "loss": 0.1551, + "num_tokens": 5142288.0, + "reward": 0.05309104174375534, + "reward_std": 0.0195770300924778, + "rewards/bleu_reward_func/mean": 0.05309104174375534, + "rewards/bleu_reward_func/std": 0.03859832510352135, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 136.0, + "completions/max_terminated_length": 136.0, + "completions/mean_length": 77.8125, + "completions/mean_terminated_length": 77.8125, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.312, + "grad_norm": 7.358268737792969, + "kl": 0.1142578125, + "learning_rate": 1e-06, + "loss": 0.0692, + "num_tokens": 5147226.0, + "reward": 0.2647009789943695, + "reward_std": 0.0788542777299881, + "rewards/bleu_reward_func/mean": 0.2647009789943695, + "rewards/bleu_reward_func/std": 0.3669854998588562, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 331.1875, + "completions/mean_terminated_length": 126.26667022705078, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.3128, + "grad_norm": 6.546727180480957, + "kl": 0.131866455078125, + "learning_rate": 1e-06, + "loss": 0.0216, + "num_tokens": 5162552.0, + "reward": 0.06478870660066605, + "reward_std": 0.016362179070711136, + "rewards/bleu_reward_func/mean": 0.06478870660066605, + "rewards/bleu_reward_func/std": 0.07661883533000946, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 305.5, + "completions/mean_terminated_length": 211.63636779785156, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.3136, + "grad_norm": 3.7394042015075684, + "kl": 0.0411224365234375, + "learning_rate": 1e-06, + "loss": -0.0472, + "num_tokens": 5174632.0, + "reward": 0.07655475288629532, + "reward_std": 0.04063459113240242, + "rewards/bleu_reward_func/mean": 0.07655475288629532, + "rewards/bleu_reward_func/std": 0.05244217440485954, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 299.4375, + "completions/mean_terminated_length": 239.9199981689453, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.3144, + "grad_norm": 3.130519151687622, + "kl": 0.036407470703125, + "learning_rate": 1e-06, + "loss": 0.0573, + "num_tokens": 5189038.0, + "reward": 0.08177624642848969, + "reward_std": 0.03700428456068039, + "rewards/bleu_reward_func/mean": 0.08177624642848969, + "rewards/bleu_reward_func/std": 0.07332108914852142, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 175.0, + "completions/mean_terminated_length": 152.53334045410156, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3152, + "grad_norm": 6.235530853271484, + "kl": 0.119140625, + "learning_rate": 1e-06, + "loss": -0.1315, + "num_tokens": 5199614.0, + "reward": 0.08668357878923416, + "reward_std": 0.029862932860851288, + "rewards/bleu_reward_func/mean": 0.08668357878923416, + "rewards/bleu_reward_func/std": 0.04458598420023918, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 303.21875, + "completions/mean_terminated_length": 140.8333282470703, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.316, + "grad_norm": 3.7761735916137695, + "kl": 0.051483154296875, + "learning_rate": 1e-06, + "loss": 0.4336, + "num_tokens": 5216893.0, + "reward": 0.04373088479042053, + "reward_std": 0.025996902957558632, + "rewards/bleu_reward_func/mean": 0.04373088479042053, + "rewards/bleu_reward_func/std": 0.035521000623703, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 102.375, + "completions/mean_terminated_length": 89.16128540039062, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.3168, + "grad_norm": 9.422346115112305, + "kl": 0.20269775390625, + "learning_rate": 1e-06, + "loss": -0.3887, + "num_tokens": 5222225.0, + "reward": 0.0936415046453476, + "reward_std": 0.07821927219629288, + "rewards/bleu_reward_func/mean": 0.0936415046453476, + "rewards/bleu_reward_func/std": 0.1016775444149971, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 187.875, + "completions/mean_terminated_length": 127.85185241699219, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.3176, + "grad_norm": 16.8355712890625, + "kl": 0.111968994140625, + "learning_rate": 1e-06, + "loss": 0.2515, + "num_tokens": 5234477.0, + "reward": 0.2821354866027832, + "reward_std": 0.16070716083049774, + "rewards/bleu_reward_func/mean": 0.2821354866027832, + "rewards/bleu_reward_func/std": 0.34524035453796387, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 222.625, + "completions/mean_terminated_length": 169.0370330810547, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.3184, + "grad_norm": 4.937644004821777, + "kl": 0.0895843505859375, + "learning_rate": 1e-06, + "loss": 0.1443, + "num_tokens": 5243161.0, + "reward": 0.04823939502239227, + "reward_std": 0.020888181403279305, + "rewards/bleu_reward_func/mean": 0.04823939502239227, + "rewards/bleu_reward_func/std": 0.032690465450286865, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 330.5625, + "completions/mean_terminated_length": 304.64288330078125, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.3192, + "grad_norm": 2.7899651527404785, + "kl": 0.028900146484375, + "learning_rate": 1e-06, + "loss": -0.0361, + "num_tokens": 5257211.0, + "reward": 0.10274805128574371, + "reward_std": 0.03329307958483696, + "rewards/bleu_reward_func/mean": 0.10274805128574371, + "rewards/bleu_reward_func/std": 0.08635566383600235, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 173.875, + "completions/mean_terminated_length": 125.5714340209961, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.32, + "grad_norm": 9.990334510803223, + "kl": 0.19793701171875, + "learning_rate": 1e-06, + "loss": 0.0415, + "num_tokens": 5265263.0, + "reward": 0.13340914249420166, + "reward_std": 0.06052035093307495, + "rewards/bleu_reward_func/mean": 0.13340914249420166, + "rewards/bleu_reward_func/std": 0.12332285940647125, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 299.625, + "completions/mean_terminated_length": 87.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.3208, + "grad_norm": 5.343194007873535, + "kl": 0.0823516845703125, + "learning_rate": 1e-06, + "loss": 0.1588, + "num_tokens": 5279491.0, + "reward": 0.04100114479660988, + "reward_std": 0.021917924284934998, + "rewards/bleu_reward_func/mean": 0.04100114479660988, + "rewards/bleu_reward_func/std": 0.059245530515909195, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 363.96875, + "completions/mean_terminated_length": 296.68182373046875, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.3216, + "grad_norm": 2.502444267272949, + "kl": 0.0249786376953125, + "learning_rate": 1e-06, + "loss": -0.1811, + "num_tokens": 5298618.0, + "reward": 0.06452260166406631, + "reward_std": 0.043596021831035614, + "rewards/bleu_reward_func/mean": 0.06452260166406631, + "rewards/bleu_reward_func/std": 0.0457596592605114, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 186.8125, + "completions/mean_terminated_length": 153.1724090576172, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.3224, + "grad_norm": 6.430903434753418, + "kl": 0.0926513671875, + "learning_rate": 1e-06, + "loss": 0.1091, + "num_tokens": 5308788.0, + "reward": 0.1375400573015213, + "reward_std": 0.044691912829875946, + "rewards/bleu_reward_func/mean": 0.1375400573015213, + "rewards/bleu_reward_func/std": 0.1667727530002594, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 261.375, + "completions/mean_terminated_length": 163.30435180664062, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.3232, + "grad_norm": 7.348942279815674, + "kl": 0.114410400390625, + "learning_rate": 1e-06, + "loss": 0.124, + "num_tokens": 5325816.0, + "reward": 0.29955723881721497, + "reward_std": 0.09420829266309738, + "rewards/bleu_reward_func/mean": 0.29955723881721497, + "rewards/bleu_reward_func/std": 0.27135762572288513, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 313.125, + "completions/mean_terminated_length": 284.71429443359375, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.324, + "grad_norm": 5.8108601570129395, + "kl": 0.0474853515625, + "learning_rate": 1e-06, + "loss": -0.0256, + "num_tokens": 5339252.0, + "reward": 0.125982865691185, + "reward_std": 0.03331389278173447, + "rewards/bleu_reward_func/mean": 0.125982865691185, + "rewards/bleu_reward_func/std": 0.07514968514442444, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 221.1875, + "completions/mean_terminated_length": 154.07693481445312, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.3248, + "grad_norm": 6.334237098693848, + "kl": 0.167236328125, + "learning_rate": 1e-06, + "loss": 0.2154, + "num_tokens": 5350538.0, + "reward": 0.12314164638519287, + "reward_std": 0.034954577684402466, + "rewards/bleu_reward_func/mean": 0.12314164638519287, + "rewards/bleu_reward_func/std": 0.11711690574884415, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 182.96875, + "completions/mean_terminated_length": 90.83999633789062, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.3256, + "grad_norm": 6.83364200592041, + "kl": 0.09881591796875, + "learning_rate": 1e-06, + "loss": 0.2224, + "num_tokens": 5364465.0, + "reward": 0.23839128017425537, + "reward_std": 0.09448365867137909, + "rewards/bleu_reward_func/mean": 0.23839128017425537, + "rewards/bleu_reward_func/std": 0.17264093458652496, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 251.59375, + "completions/mean_terminated_length": 164.7916717529297, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.3264, + "grad_norm": 5.291790962219238, + "kl": 0.120269775390625, + "learning_rate": 1e-06, + "loss": 0.1051, + "num_tokens": 5379212.0, + "reward": 0.07936831563711166, + "reward_std": 0.026489000767469406, + "rewards/bleu_reward_func/mean": 0.07936831563711166, + "rewards/bleu_reward_func/std": 0.04656874015927315, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 395.90625, + "completions/mean_terminated_length": 279.8125, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.3272, + "grad_norm": 2.7209274768829346, + "kl": 0.033355712890625, + "learning_rate": 1e-06, + "loss": -0.0145, + "num_tokens": 5395369.0, + "reward": 0.05327831208705902, + "reward_std": 0.020644793286919594, + "rewards/bleu_reward_func/mean": 0.05327831208705902, + "rewards/bleu_reward_func/std": 0.044744666665792465, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 279.125, + "completions/mean_terminated_length": 157.14285278320312, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.328, + "grad_norm": 6.835958003997803, + "kl": 0.17645263671875, + "learning_rate": 1e-06, + "loss": 0.1004, + "num_tokens": 5408861.0, + "reward": 0.15895725786685944, + "reward_std": 0.053282976150512695, + "rewards/bleu_reward_func/mean": 0.15895725786685944, + "rewards/bleu_reward_func/std": 0.1344875991344452, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 124.625, + "completions/mean_terminated_length": 112.1290283203125, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.3288, + "grad_norm": 7.9765801429748535, + "kl": 0.12908935546875, + "learning_rate": 1e-06, + "loss": 0.0791, + "num_tokens": 5422569.0, + "reward": 0.29637736082077026, + "reward_std": 0.07562527060508728, + "rewards/bleu_reward_func/mean": 0.29637736082077026, + "rewards/bleu_reward_func/std": 0.1916900873184204, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 242.6875, + "completions/mean_terminated_length": 180.53846740722656, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.3296, + "grad_norm": 5.444021701812744, + "kl": 0.079376220703125, + "learning_rate": 1e-06, + "loss": -0.0428, + "num_tokens": 5432847.0, + "reward": 0.1152123510837555, + "reward_std": 0.07390551269054413, + "rewards/bleu_reward_func/mean": 0.1152123510837555, + "rewards/bleu_reward_func/std": 0.14451570808887482, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 329.4375, + "completions/mean_terminated_length": 204.5263214111328, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.3304, + "grad_norm": 14.007586479187012, + "kl": 0.074188232421875, + "learning_rate": 1e-06, + "loss": 0.0903, + "num_tokens": 5451693.0, + "reward": 0.13860949873924255, + "reward_std": 0.032740939408540726, + "rewards/bleu_reward_func/mean": 0.13860949873924255, + "rewards/bleu_reward_func/std": 0.15230515599250793, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 370.0, + "completions/mean_terminated_length": 322.66668701171875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.3312, + "grad_norm": 2.6279470920562744, + "kl": 0.02838134765625, + "learning_rate": 1e-06, + "loss": 0.1011, + "num_tokens": 5465741.0, + "reward": 0.07638199627399445, + "reward_std": 0.018498672172427177, + "rewards/bleu_reward_func/mean": 0.07638199627399445, + "rewards/bleu_reward_func/std": 0.07297802716493607, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 433.96875, + "completions/mean_terminated_length": 403.4347839355469, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.332, + "grad_norm": 2.4823691844940186, + "kl": 0.035186767578125, + "learning_rate": 1e-06, + "loss": -0.0328, + "num_tokens": 5482924.0, + "reward": 0.06871578842401505, + "reward_std": 0.015666324645280838, + "rewards/bleu_reward_func/mean": 0.06871578842401505, + "rewards/bleu_reward_func/std": 0.03051225282251835, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 241.5, + "completions/mean_terminated_length": 179.07693481445312, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.3328, + "grad_norm": 6.543938159942627, + "kl": 0.16180419921875, + "learning_rate": 1e-06, + "loss": 0.0852, + "num_tokens": 5494084.0, + "reward": 0.1368054300546646, + "reward_std": 0.05007235333323479, + "rewards/bleu_reward_func/mean": 0.1368054300546646, + "rewards/bleu_reward_func/std": 0.17140735685825348, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 440.0, + "completions/mean_terminated_length": 390.7368469238281, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.3336, + "grad_norm": 2.235297203063965, + "kl": 0.03033447265625, + "learning_rate": 1e-06, + "loss": -0.0838, + "num_tokens": 5511716.0, + "reward": 0.038143888115882874, + "reward_std": 0.01655811443924904, + "rewards/bleu_reward_func/mean": 0.038143888115882874, + "rewards/bleu_reward_func/std": 0.024868454784154892, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 182.3125, + "completions/mean_terminated_length": 72.41667175292969, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.3344, + "grad_norm": 7.744441509246826, + "kl": 0.191436767578125, + "learning_rate": 1e-06, + "loss": 0.3195, + "num_tokens": 5523022.0, + "reward": 0.31701600551605225, + "reward_std": 0.07194612175226212, + "rewards/bleu_reward_func/mean": 0.31701600551605225, + "rewards/bleu_reward_func/std": 0.3555218279361725, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 366.15625, + "completions/mean_terminated_length": 237.47059631347656, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.3352, + "grad_norm": 6.1128129959106445, + "kl": 0.156463623046875, + "learning_rate": 1e-06, + "loss": -0.0201, + "num_tokens": 5541259.0, + "reward": 0.08823719620704651, + "reward_std": 0.024577319622039795, + "rewards/bleu_reward_func/mean": 0.08823719620704651, + "rewards/bleu_reward_func/std": 0.06854464113712311, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 441.34375, + "completions/mean_terminated_length": 361.2666931152344, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.336, + "grad_norm": 1.8351125717163086, + "kl": 0.021087646484375, + "learning_rate": 1e-06, + "loss": 0.1005, + "num_tokens": 5559934.0, + "reward": 0.04894189164042473, + "reward_std": 0.02001025900244713, + "rewards/bleu_reward_func/mean": 0.04894189164042473, + "rewards/bleu_reward_func/std": 0.05484846979379654, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 326.625, + "completions/mean_terminated_length": 264.8333435058594, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.3368, + "grad_norm": 2.533376932144165, + "kl": 0.036773681640625, + "learning_rate": 1e-06, + "loss": -0.0233, + "num_tokens": 5573530.0, + "reward": 0.040375903248786926, + "reward_std": 0.020407570526003838, + "rewards/bleu_reward_func/mean": 0.040375903248786926, + "rewards/bleu_reward_func/std": 0.03530384972691536, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 387.0, + "completions/mean_terminated_length": 312.0, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.3376, + "grad_norm": 2.77024507522583, + "kl": 0.023895263671875, + "learning_rate": 1e-06, + "loss": -0.0548, + "num_tokens": 5587906.0, + "reward": 0.07852312177419662, + "reward_std": 0.01865551620721817, + "rewards/bleu_reward_func/mean": 0.07852312177419662, + "rewards/bleu_reward_func/std": 0.01962001994252205, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 286.625, + "completions/mean_terminated_length": 223.51998901367188, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.3384, + "grad_norm": 10.24564266204834, + "kl": 0.1881103515625, + "learning_rate": 1e-06, + "loss": 0.3541, + "num_tokens": 5600862.0, + "reward": 0.1451932042837143, + "reward_std": 0.04526112228631973, + "rewards/bleu_reward_func/mean": 0.1451932042837143, + "rewards/bleu_reward_func/std": 0.11114869266748428, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 374.0, + "completions/mean_terminated_length": 217.60000610351562, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3392, + "grad_norm": 3.2997934818267822, + "kl": 0.03955078125, + "learning_rate": 1e-06, + "loss": 0.0442, + "num_tokens": 5614662.0, + "reward": 0.029227450489997864, + "reward_std": 0.015134407207369804, + "rewards/bleu_reward_func/mean": 0.029227450489997864, + "rewards/bleu_reward_func/std": 0.03273903205990791, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 266.59375, + "completions/mean_terminated_length": 184.7916717529297, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.34, + "grad_norm": 3.9605484008789062, + "kl": 0.05718994140625, + "learning_rate": 1e-06, + "loss": 0.3365, + "num_tokens": 5625193.0, + "reward": 0.07731406390666962, + "reward_std": 0.04166540876030922, + "rewards/bleu_reward_func/mean": 0.07731406390666962, + "rewards/bleu_reward_func/std": 0.07211390882730484, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 260.625, + "completions/mean_terminated_length": 88.63157653808594, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.3408, + "grad_norm": 11.257648468017578, + "kl": 0.2677154541015625, + "learning_rate": 1e-06, + "loss": 0.1169, + "num_tokens": 5640717.0, + "reward": 0.19435667991638184, + "reward_std": 0.055491410195827484, + "rewards/bleu_reward_func/mean": 0.19435667991638184, + "rewards/bleu_reward_func/std": 0.1956581324338913, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 376.9375, + "completions/mean_terminated_length": 179.53846740722656, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.3416, + "grad_norm": 25.653825759887695, + "kl": 0.1090087890625, + "learning_rate": 1e-06, + "loss": 0.024, + "num_tokens": 5655923.0, + "reward": 0.11750101298093796, + "reward_std": 0.0449095293879509, + "rewards/bleu_reward_func/mean": 0.11750101298093796, + "rewards/bleu_reward_func/std": 0.10332971811294556, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 313.0, + "completions/mean_terminated_length": 235.13043212890625, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.3424, + "grad_norm": 3.291689157485962, + "kl": 0.05206298828125, + "learning_rate": 1e-06, + "loss": -0.0722, + "num_tokens": 5672371.0, + "reward": 0.07329948246479034, + "reward_std": 0.04769134148955345, + "rewards/bleu_reward_func/mean": 0.07329948246479034, + "rewards/bleu_reward_func/std": 0.10588011890649796, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 218.1875, + "completions/mean_terminated_length": 176.21429443359375, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.3432, + "grad_norm": 70.38253784179688, + "kl": 0.111907958984375, + "learning_rate": 1e-06, + "loss": 0.2102, + "num_tokens": 5685137.0, + "reward": 0.057677462697029114, + "reward_std": 0.02635624073445797, + "rewards/bleu_reward_func/mean": 0.057677462697029114, + "rewards/bleu_reward_func/std": 0.03576910123229027, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 276.34375, + "completions/mean_terminated_length": 184.13043212890625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.344, + "grad_norm": 6.003584861755371, + "kl": 0.0679931640625, + "learning_rate": 1e-06, + "loss": 0.0754, + "num_tokens": 5699164.0, + "reward": 0.1447058618068695, + "reward_std": 0.02169397845864296, + "rewards/bleu_reward_func/mean": 0.1447058618068695, + "rewards/bleu_reward_func/std": 0.17934927344322205, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 190.0, + "completions/mean_terminated_length": 179.61289978027344, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.3448, + "grad_norm": 6.911223888397217, + "kl": 0.186920166015625, + "learning_rate": 1e-06, + "loss": -0.1198, + "num_tokens": 5707324.0, + "reward": 0.1218734011054039, + "reward_std": 0.029896825551986694, + "rewards/bleu_reward_func/mean": 0.1218734011054039, + "rewards/bleu_reward_func/std": 0.12784428894519806, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 161.3125, + "completions/mean_terminated_length": 161.3125, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.3456, + "grad_norm": 7.491186141967773, + "kl": 0.174072265625, + "learning_rate": 1e-06, + "loss": -0.0073, + "num_tokens": 5715774.0, + "reward": 0.24741162359714508, + "reward_std": 0.06959841400384903, + "rewards/bleu_reward_func/mean": 0.24741162359714508, + "rewards/bleu_reward_func/std": 0.12952403724193573, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 360.375, + "completions/mean_terminated_length": 280.952392578125, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.3464, + "grad_norm": 2.933356523513794, + "kl": 0.05389404296875, + "learning_rate": 1e-06, + "loss": 0.1508, + "num_tokens": 5729490.0, + "reward": 0.047768086194992065, + "reward_std": 0.022835325449705124, + "rewards/bleu_reward_func/mean": 0.047768086194992065, + "rewards/bleu_reward_func/std": 0.03785131126642227, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 290.65625, + "completions/mean_terminated_length": 216.875, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.3472, + "grad_norm": 4.7709503173828125, + "kl": 0.202667236328125, + "learning_rate": 1e-06, + "loss": -0.0613, + "num_tokens": 5744007.0, + "reward": 0.17955930531024933, + "reward_std": 0.04158224165439606, + "rewards/bleu_reward_func/mean": 0.17955930531024933, + "rewards/bleu_reward_func/std": 0.16465015709400177, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 471.46875, + "completions/mean_terminated_length": 430.9375, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.348, + "grad_norm": 2.0240750312805176, + "kl": 0.0249786376953125, + "learning_rate": 1e-06, + "loss": -0.0115, + "num_tokens": 5764798.0, + "reward": 0.06078977510333061, + "reward_std": 0.014253700152039528, + "rewards/bleu_reward_func/mean": 0.06078977510333061, + "rewards/bleu_reward_func/std": 0.061424292623996735, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 297.5, + "completions/mean_terminated_length": 213.56521606445312, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.3488, + "grad_norm": 6.2941999435424805, + "kl": 0.14788818359375, + "learning_rate": 1e-06, + "loss": -0.117, + "num_tokens": 5778070.0, + "reward": 0.18015003204345703, + "reward_std": 0.04164495691657066, + "rewards/bleu_reward_func/mean": 0.18015003204345703, + "rewards/bleu_reward_func/std": 0.25248411297798157, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 319.0625, + "completions/mean_terminated_length": 265.0400085449219, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.3496, + "grad_norm": 6.8885884284973145, + "kl": 0.15325927734375, + "learning_rate": 1e-06, + "loss": 0.1008, + "num_tokens": 5794464.0, + "reward": 0.06313855201005936, + "reward_std": 0.01877717673778534, + "rewards/bleu_reward_func/mean": 0.06313855201005936, + "rewards/bleu_reward_func/std": 0.07749292254447937, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 174.96875, + "completions/mean_terminated_length": 126.8214340209961, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.3504, + "grad_norm": 6.638815402984619, + "kl": 0.21893310546875, + "learning_rate": 1e-06, + "loss": -0.1077, + "num_tokens": 5808279.0, + "reward": 0.1649433970451355, + "reward_std": 0.03847195580601692, + "rewards/bleu_reward_func/mean": 0.1649433970451355, + "rewards/bleu_reward_func/std": 0.1434909999370575, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 111.46875, + "completions/mean_terminated_length": 111.46875, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.3512, + "grad_norm": 8.658287048339844, + "kl": 0.380615234375, + "learning_rate": 1e-06, + "loss": -0.0241, + "num_tokens": 5818006.0, + "reward": 0.16367265582084656, + "reward_std": 0.043664492666721344, + "rewards/bleu_reward_func/mean": 0.16367265582084656, + "rewards/bleu_reward_func/std": 0.09786061942577362, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 183.0625, + "completions/mean_terminated_length": 161.1333465576172, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.352, + "grad_norm": 6.755293369293213, + "kl": 0.094757080078125, + "learning_rate": 1e-06, + "loss": -0.3775, + "num_tokens": 5827832.0, + "reward": 0.20365653932094574, + "reward_std": 0.022682592272758484, + "rewards/bleu_reward_func/mean": 0.20365653932094574, + "rewards/bleu_reward_func/std": 0.28341981768608093, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 374.15625, + "completions/mean_terminated_length": 196.92857360839844, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.3528, + "grad_norm": 3.652517557144165, + "kl": 0.04571533203125, + "learning_rate": 1e-06, + "loss": 0.1243, + "num_tokens": 5846877.0, + "reward": 0.028015542775392532, + "reward_std": 0.017580918967723846, + "rewards/bleu_reward_func/mean": 0.028015542775392532, + "rewards/bleu_reward_func/std": 0.018063105642795563, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 247.96875, + "completions/mean_terminated_length": 174.0399932861328, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.3536, + "grad_norm": 5.749145984649658, + "kl": 0.2269287109375, + "learning_rate": 1e-06, + "loss": 0.0053, + "num_tokens": 5857276.0, + "reward": 0.24086514115333557, + "reward_std": 0.11034538596868515, + "rewards/bleu_reward_func/mean": 0.24086514115333557, + "rewards/bleu_reward_func/std": 0.2930907607078552, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 180.9375, + "completions/mean_terminated_length": 88.23999786376953, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.3544, + "grad_norm": 6.519045352935791, + "kl": 0.3109130859375, + "learning_rate": 1e-06, + "loss": 0.0523, + "num_tokens": 5866418.0, + "reward": 0.14787587523460388, + "reward_std": 0.08442827314138412, + "rewards/bleu_reward_func/mean": 0.14787587523460388, + "rewards/bleu_reward_func/std": 0.13120223581790924, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 290.6875, + "completions/mean_terminated_length": 239.61538696289062, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.3552, + "grad_norm": 3.2144103050231934, + "kl": 0.042388916015625, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 5878832.0, + "reward": 0.06585465371608734, + "reward_std": 0.03217202052474022, + "rewards/bleu_reward_func/mean": 0.06585465371608734, + "rewards/bleu_reward_func/std": 0.0564405731856823, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 226.0625, + "completions/mean_terminated_length": 196.48275756835938, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.356, + "grad_norm": 7.220034122467041, + "kl": 0.255218505859375, + "learning_rate": 1e-06, + "loss": 0.0212, + "num_tokens": 5894266.0, + "reward": 0.1998336911201477, + "reward_std": 0.05887780338525772, + "rewards/bleu_reward_func/mean": 0.1998336911201477, + "rewards/bleu_reward_func/std": 0.1896047741174698, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 194.46875, + "completions/mean_terminated_length": 173.3000030517578, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.3568, + "grad_norm": 5.338675022125244, + "kl": 0.112518310546875, + "learning_rate": 1e-06, + "loss": 0.1041, + "num_tokens": 5902393.0, + "reward": 0.08252020180225372, + "reward_std": 0.041884347796440125, + "rewards/bleu_reward_func/mean": 0.08252020180225372, + "rewards/bleu_reward_func/std": 0.05604247748851776, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 266.96875, + "completions/mean_terminated_length": 231.96429443359375, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.3576, + "grad_norm": 3.9111521244049072, + "kl": 0.048919677734375, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 5914296.0, + "reward": 0.2005537748336792, + "reward_std": 0.03531679883599281, + "rewards/bleu_reward_func/mean": 0.2005537748336792, + "rewards/bleu_reward_func/std": 0.1125224232673645, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 274.125, + "completions/mean_terminated_length": 89.11111450195312, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.3584, + "grad_norm": 18.863727569580078, + "kl": 0.269134521484375, + "learning_rate": 1e-06, + "loss": 0.4411, + "num_tokens": 5927172.0, + "reward": 0.12709318101406097, + "reward_std": 0.020968245342373848, + "rewards/bleu_reward_func/mean": 0.12709318101406097, + "rewards/bleu_reward_func/std": 0.14331206679344177, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 378.21875, + "completions/mean_terminated_length": 297.95001220703125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.3592, + "grad_norm": 2.759582996368408, + "kl": 0.02587890625, + "learning_rate": 1e-06, + "loss": 0.0122, + "num_tokens": 5944491.0, + "reward": 0.04890431463718414, + "reward_std": 0.01871412619948387, + "rewards/bleu_reward_func/mean": 0.04890431463718414, + "rewards/bleu_reward_func/std": 0.05281543731689453, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 234.8125, + "completions/mean_terminated_length": 216.33334350585938, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.36, + "grad_norm": 5.44106388092041, + "kl": 0.08453369140625, + "learning_rate": 1e-06, + "loss": -0.2079, + "num_tokens": 5954413.0, + "reward": 0.08892585337162018, + "reward_std": 0.05316928029060364, + "rewards/bleu_reward_func/mean": 0.08892585337162018, + "rewards/bleu_reward_func/std": 0.09096309542655945, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 182.59375, + "completions/mean_terminated_length": 90.36000061035156, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.3608, + "grad_norm": 10.483473777770996, + "kl": 0.30169677734375, + "learning_rate": 1e-06, + "loss": 0.1221, + "num_tokens": 5965776.0, + "reward": 0.2010711133480072, + "reward_std": 0.035105034708976746, + "rewards/bleu_reward_func/mean": 0.2010711133480072, + "rewards/bleu_reward_func/std": 0.20054543018341064, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 289.5, + "completions/mean_terminated_length": 266.4827575683594, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.3616, + "grad_norm": 8.551454544067383, + "kl": 0.21197509765625, + "learning_rate": 1e-06, + "loss": 0.0288, + "num_tokens": 5976704.0, + "reward": 0.03945029526948929, + "reward_std": 0.011974655091762543, + "rewards/bleu_reward_func/mean": 0.03945029526948929, + "rewards/bleu_reward_func/std": 0.027504391968250275, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 241.90625, + "completions/mean_terminated_length": 241.90625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3624, + "grad_norm": 5.853670597076416, + "kl": 0.209869384765625, + "learning_rate": 1e-06, + "loss": -0.0173, + "num_tokens": 5987269.0, + "reward": 0.09715719521045685, + "reward_std": 0.009554330259561539, + "rewards/bleu_reward_func/mean": 0.09715719521045685, + "rewards/bleu_reward_func/std": 0.0827893614768982, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 258.6875, + "completions/mean_terminated_length": 143.5454559326172, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.3632, + "grad_norm": 9.270416259765625, + "kl": 0.208038330078125, + "learning_rate": 1e-06, + "loss": 0.0672, + "num_tokens": 6001411.0, + "reward": 0.1554635763168335, + "reward_std": 0.03311417996883392, + "rewards/bleu_reward_func/mean": 0.1554635763168335, + "rewards/bleu_reward_func/std": 0.1801016479730606, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 196.65625, + "completions/mean_terminated_length": 196.65625, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.364, + "grad_norm": 11.34135913848877, + "kl": 0.224945068359375, + "learning_rate": 1e-06, + "loss": 0.3216, + "num_tokens": 6012208.0, + "reward": 0.058545198291540146, + "reward_std": 0.017396699637174606, + "rewards/bleu_reward_func/mean": 0.058545198291540146, + "rewards/bleu_reward_func/std": 0.04106508567929268, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 137.8125, + "completions/mean_terminated_length": 137.8125, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.3648, + "grad_norm": 5.569085597991943, + "kl": 0.173675537109375, + "learning_rate": 1e-06, + "loss": -0.0886, + "num_tokens": 6020026.0, + "reward": 0.25735002756118774, + "reward_std": 0.08652571588754654, + "rewards/bleu_reward_func/mean": 0.25735002756118774, + "rewards/bleu_reward_func/std": 0.34091776609420776, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 261.0625, + "completions/mean_terminated_length": 162.86956787109375, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.3656, + "grad_norm": 10.537775993347168, + "kl": 0.191162109375, + "learning_rate": 1e-06, + "loss": 0.1318, + "num_tokens": 6031956.0, + "reward": 0.12902843952178955, + "reward_std": 0.049239080399274826, + "rewards/bleu_reward_func/mean": 0.12902843952178955, + "rewards/bleu_reward_func/std": 0.1560073047876358, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 267.65625, + "completions/mean_terminated_length": 156.59091186523438, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.3664, + "grad_norm": 8.385242462158203, + "kl": 0.13568115234375, + "learning_rate": 1e-06, + "loss": 0.1967, + "num_tokens": 6048289.0, + "reward": 0.09441059827804565, + "reward_std": 0.02894745022058487, + "rewards/bleu_reward_func/mean": 0.09441059827804565, + "rewards/bleu_reward_func/std": 0.07357289642095566, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 144.3125, + "completions/mean_terminated_length": 76.22222137451172, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.3672, + "grad_norm": 9.128081321716309, + "kl": 0.275970458984375, + "learning_rate": 1e-06, + "loss": -0.0309, + "num_tokens": 6060955.0, + "reward": 0.23786574602127075, + "reward_std": 0.04663696512579918, + "rewards/bleu_reward_func/mean": 0.23786574602127075, + "rewards/bleu_reward_func/std": 0.15007296204566956, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 293.25, + "completions/mean_terminated_length": 262.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.368, + "grad_norm": 10.163530349731445, + "kl": 0.1614837646484375, + "learning_rate": 1e-06, + "loss": 0.0314, + "num_tokens": 6075291.0, + "reward": 0.11764833331108093, + "reward_std": 0.025302093476057053, + "rewards/bleu_reward_func/mean": 0.11764833331108093, + "rewards/bleu_reward_func/std": 0.054068438708782196, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 160.78125, + "completions/mean_terminated_length": 137.36666870117188, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.3688, + "grad_norm": 7.89539909362793, + "kl": 0.18206787109375, + "learning_rate": 1e-06, + "loss": -0.0019, + "num_tokens": 6083980.0, + "reward": 0.0945214033126831, + "reward_std": 0.046040039509534836, + "rewards/bleu_reward_func/mean": 0.0945214033126831, + "rewards/bleu_reward_func/std": 0.08345890045166016, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 254.96875, + "completions/mean_terminated_length": 246.6774139404297, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.3696, + "grad_norm": 6.462737560272217, + "kl": 0.130950927734375, + "learning_rate": 1e-06, + "loss": 0.0046, + "num_tokens": 6096555.0, + "reward": 0.04283145070075989, + "reward_std": 0.010249357670545578, + "rewards/bleu_reward_func/mean": 0.04283145070075989, + "rewards/bleu_reward_func/std": 0.038907162845134735, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 190.875, + "completions/mean_terminated_length": 83.83333587646484, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3704, + "grad_norm": 5.569899559020996, + "kl": 0.115325927734375, + "learning_rate": 1e-06, + "loss": 0.006, + "num_tokens": 6105487.0, + "reward": 0.13501238822937012, + "reward_std": 0.034556735306978226, + "rewards/bleu_reward_func/mean": 0.13501238822937012, + "rewards/bleu_reward_func/std": 0.09039971977472305, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 219.53125, + "completions/mean_terminated_length": 137.63999938964844, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.3712, + "grad_norm": 12.397187232971191, + "kl": 0.133880615234375, + "learning_rate": 1e-06, + "loss": -0.0829, + "num_tokens": 6117800.0, + "reward": 0.18308544158935547, + "reward_std": 0.06162799149751663, + "rewards/bleu_reward_func/mean": 0.18308544158935547, + "rewards/bleu_reward_func/std": 0.15996244549751282, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 220.65625, + "completions/mean_terminated_length": 190.51724243164062, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.372, + "grad_norm": 7.391514301300049, + "kl": 0.11492919921875, + "learning_rate": 1e-06, + "loss": -0.0768, + "num_tokens": 6128413.0, + "reward": 0.05292118340730667, + "reward_std": 0.04890108108520508, + "rewards/bleu_reward_func/mean": 0.05292118340730667, + "rewards/bleu_reward_func/std": 0.07255055755376816, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 194.15625, + "completions/mean_terminated_length": 194.15625, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.3728, + "grad_norm": 3.9842867851257324, + "kl": 0.050048828125, + "learning_rate": 1e-06, + "loss": -0.0064, + "num_tokens": 6137562.0, + "reward": 0.04538443684577942, + "reward_std": 0.024577371776103973, + "rewards/bleu_reward_func/mean": 0.04538443684577942, + "rewards/bleu_reward_func/std": 0.03160402178764343, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 368.0, + "completions/mean_terminated_length": 311.6521911621094, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.3736, + "grad_norm": 2.496399402618408, + "kl": 0.025543212890625, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 6152090.0, + "reward": 0.062375668436288834, + "reward_std": 0.031018512323498726, + "rewards/bleu_reward_func/mean": 0.062375668436288834, + "rewards/bleu_reward_func/std": 0.06766829639673233, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 341.59375, + "completions/mean_terminated_length": 302.2692565917969, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.3744, + "grad_norm": 4.490657329559326, + "kl": 0.042816162109375, + "learning_rate": 1e-06, + "loss": 0.2011, + "num_tokens": 6165917.0, + "reward": 0.06601699441671371, + "reward_std": 0.028723105788230896, + "rewards/bleu_reward_func/mean": 0.06601699441671371, + "rewards/bleu_reward_func/std": 0.039854664355516434, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 250.21875, + "completions/mean_terminated_length": 147.78260803222656, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.3752, + "grad_norm": 4.7490010261535645, + "kl": 0.1627197265625, + "learning_rate": 1e-06, + "loss": -0.0409, + "num_tokens": 6178940.0, + "reward": 0.15887555480003357, + "reward_std": 0.018191883340477943, + "rewards/bleu_reward_func/mean": 0.15887555480003357, + "rewards/bleu_reward_func/std": 0.21522025763988495, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 396.125, + "completions/mean_terminated_length": 264.8000183105469, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.376, + "grad_norm": 3.33166241645813, + "kl": 0.046051025390625, + "learning_rate": 1e-06, + "loss": -0.0611, + "num_tokens": 6194232.0, + "reward": 0.0860922709107399, + "reward_std": 0.04104076325893402, + "rewards/bleu_reward_func/mean": 0.0860922709107399, + "rewards/bleu_reward_func/std": 0.13754135370254517, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 364.6875, + "completions/mean_terminated_length": 323.44000244140625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.3768, + "grad_norm": 2.6695375442504883, + "kl": 0.038360595703125, + "learning_rate": 1e-06, + "loss": -0.0899, + "num_tokens": 6207750.0, + "reward": 0.05763555318117142, + "reward_std": 0.022492559626698494, + "rewards/bleu_reward_func/mean": 0.05763555318117142, + "rewards/bleu_reward_func/std": 0.034512683749198914, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 237.40625, + "completions/mean_terminated_length": 209.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.3776, + "grad_norm": 4.532895565032959, + "kl": 0.088165283203125, + "learning_rate": 1e-06, + "loss": 0.0908, + "num_tokens": 6220235.0, + "reward": 0.07317312806844711, + "reward_std": 0.02968096360564232, + "rewards/bleu_reward_func/mean": 0.07317312806844711, + "rewards/bleu_reward_func/std": 0.04997172951698303, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 310.25, + "completions/mean_terminated_length": 263.69232177734375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.3784, + "grad_norm": 2.9320926666259766, + "kl": 0.04736328125, + "learning_rate": 1e-06, + "loss": -0.0956, + "num_tokens": 6233691.0, + "reward": 0.07909499108791351, + "reward_std": 0.02384771592915058, + "rewards/bleu_reward_func/mean": 0.07909499108791351, + "rewards/bleu_reward_func/std": 0.08157114684581757, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 276.8125, + "completions/mean_terminated_length": 233.25926208496094, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.3792, + "grad_norm": 5.839748859405518, + "kl": 0.14556884765625, + "learning_rate": 1e-06, + "loss": -0.0466, + "num_tokens": 6245669.0, + "reward": 0.10992265492677689, + "reward_std": 0.027910416945815086, + "rewards/bleu_reward_func/mean": 0.10992265492677689, + "rewards/bleu_reward_func/std": 0.11659030616283417, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 194.09375, + "completions/mean_terminated_length": 172.90000915527344, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.38, + "grad_norm": 22.791318893432617, + "kl": 0.30963134765625, + "learning_rate": 1e-06, + "loss": -0.0749, + "num_tokens": 6255632.0, + "reward": 0.14596156775951385, + "reward_std": 0.0427117757499218, + "rewards/bleu_reward_func/mean": 0.14596156775951385, + "rewards/bleu_reward_func/std": 0.06039505451917648, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 147.03125, + "completions/mean_terminated_length": 147.03125, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.3808, + "grad_norm": 7.391219615936279, + "kl": 0.34600830078125, + "learning_rate": 1e-06, + "loss": -0.0711, + "num_tokens": 6267817.0, + "reward": 0.155485600233078, + "reward_std": 0.03775210678577423, + "rewards/bleu_reward_func/mean": 0.155485600233078, + "rewards/bleu_reward_func/std": 0.14854131639003754, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 176.09375, + "completions/mean_terminated_length": 153.70001220703125, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.3816, + "grad_norm": 7.248151779174805, + "kl": 0.1455078125, + "learning_rate": 1e-06, + "loss": -0.1443, + "num_tokens": 6276772.0, + "reward": 0.08080196380615234, + "reward_std": 0.06804326176643372, + "rewards/bleu_reward_func/mean": 0.08080196380615234, + "rewards/bleu_reward_func/std": 0.11115432530641556, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 230.03125, + "completions/mean_terminated_length": 151.0800018310547, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.3824, + "grad_norm": 8.359848022460938, + "kl": 0.151123046875, + "learning_rate": 1e-06, + "loss": 0.36, + "num_tokens": 6287581.0, + "reward": 0.06686853617429733, + "reward_std": 0.028161579743027687, + "rewards/bleu_reward_func/mean": 0.06686853617429733, + "rewards/bleu_reward_func/std": 0.054127294570207596, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 120.03125, + "completions/mean_terminated_length": 120.03125, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.3832, + "grad_norm": 7.5159101486206055, + "kl": 0.190155029296875, + "learning_rate": 1e-06, + "loss": 0.1074, + "num_tokens": 6297390.0, + "reward": 0.19040237367153168, + "reward_std": 0.05353376269340515, + "rewards/bleu_reward_func/mean": 0.19040237367153168, + "rewards/bleu_reward_func/std": 0.17947913706302643, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 451.4375, + "completions/mean_terminated_length": 410.0, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.384, + "grad_norm": 2.1026315689086914, + "kl": 0.0289306640625, + "learning_rate": 1e-06, + "loss": 0.046, + "num_tokens": 6314548.0, + "reward": 0.09041387587785721, + "reward_std": 0.04015309736132622, + "rewards/bleu_reward_func/mean": 0.09041387587785721, + "rewards/bleu_reward_func/std": 0.09059884399175644, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 319.5625, + "completions/mean_terminated_length": 204.10000610351562, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.3848, + "grad_norm": 5.199352264404297, + "kl": 0.172821044921875, + "learning_rate": 1e-06, + "loss": -0.062, + "num_tokens": 6333886.0, + "reward": 0.13319844007492065, + "reward_std": 0.03567848354578018, + "rewards/bleu_reward_func/mean": 0.13319844007492065, + "rewards/bleu_reward_func/std": 0.12437637895345688, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 208.9375, + "completions/mean_terminated_length": 152.8148193359375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.3856, + "grad_norm": 6.110198497772217, + "kl": 0.2414398193359375, + "learning_rate": 1e-06, + "loss": -0.0203, + "num_tokens": 6346564.0, + "reward": 0.19878074526786804, + "reward_std": 0.043283406645059586, + "rewards/bleu_reward_func/mean": 0.19878074526786804, + "rewards/bleu_reward_func/std": 0.1821635365486145, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 237.53125, + "completions/mean_terminated_length": 186.70370483398438, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.3864, + "grad_norm": 8.788106918334961, + "kl": 0.17132568359375, + "learning_rate": 1e-06, + "loss": -0.0653, + "num_tokens": 6358741.0, + "reward": 0.07478289306163788, + "reward_std": 0.019201520830392838, + "rewards/bleu_reward_func/mean": 0.07478289306163788, + "rewards/bleu_reward_func/std": 0.05620751157402992, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 202.4375, + "completions/mean_terminated_length": 115.75999450683594, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.3872, + "grad_norm": 7.577336311340332, + "kl": 0.109588623046875, + "learning_rate": 1e-06, + "loss": 0.3355, + "num_tokens": 6371787.0, + "reward": 0.09253311157226562, + "reward_std": 0.03513386473059654, + "rewards/bleu_reward_func/mean": 0.09253311157226562, + "rewards/bleu_reward_func/std": 0.0667162612080574, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 331.625, + "completions/mean_terminated_length": 281.1199951171875, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.388, + "grad_norm": 5.319460391998291, + "kl": 0.0972900390625, + "learning_rate": 1e-06, + "loss": 0.057, + "num_tokens": 6388231.0, + "reward": 0.16802164912223816, + "reward_std": 0.024459581822156906, + "rewards/bleu_reward_func/mean": 0.16802164912223816, + "rewards/bleu_reward_func/std": 0.17531749606132507, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 222.6875, + "completions/mean_terminated_length": 155.92308044433594, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.3888, + "grad_norm": 9.174544334411621, + "kl": 0.21746826171875, + "learning_rate": 1e-06, + "loss": 0.0552, + "num_tokens": 6399885.0, + "reward": 0.20374764502048492, + "reward_std": 0.02469576895236969, + "rewards/bleu_reward_func/mean": 0.20374764502048492, + "rewards/bleu_reward_func/std": 0.17774522304534912, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 212.40625, + "completions/mean_terminated_length": 112.54167175292969, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.3896, + "grad_norm": 8.529189109802246, + "kl": 0.412689208984375, + "learning_rate": 1e-06, + "loss": -0.0442, + "num_tokens": 6410746.0, + "reward": 0.13253280520439148, + "reward_std": 0.03401318937540054, + "rewards/bleu_reward_func/mean": 0.13253280520439148, + "rewards/bleu_reward_func/std": 0.09572894126176834, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 343.40625, + "completions/mean_terminated_length": 277.4347839355469, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.3904, + "grad_norm": 5.815334796905518, + "kl": 0.06719970703125, + "learning_rate": 1e-06, + "loss": 0.0079, + "num_tokens": 6424639.0, + "reward": 0.14998552203178406, + "reward_std": 0.03536435216665268, + "rewards/bleu_reward_func/mean": 0.14998552203178406, + "rewards/bleu_reward_func/std": 0.08015048503875732, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 242.21875, + "completions/mean_terminated_length": 192.25926208496094, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.3912, + "grad_norm": 8.644153594970703, + "kl": 0.204193115234375, + "learning_rate": 1e-06, + "loss": 0.2233, + "num_tokens": 6437382.0, + "reward": 0.08585190027952194, + "reward_std": 0.032436732202768326, + "rewards/bleu_reward_func/mean": 0.08585190027952194, + "rewards/bleu_reward_func/std": 0.10239724069833755, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 259.625, + "completions/mean_terminated_length": 108.20000457763672, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.392, + "grad_norm": 18.119718551635742, + "kl": 0.39862060546875, + "learning_rate": 1e-06, + "loss": 0.0372, + "num_tokens": 6450634.0, + "reward": 0.07857100665569305, + "reward_std": 0.010440990328788757, + "rewards/bleu_reward_func/mean": 0.07857100665569305, + "rewards/bleu_reward_func/std": 0.06719467043876648, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 348.28125, + "completions/mean_terminated_length": 262.5238037109375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.3928, + "grad_norm": 2.811199903488159, + "kl": 0.03350830078125, + "learning_rate": 1e-06, + "loss": 0.0223, + "num_tokens": 6464187.0, + "reward": 0.07400047779083252, + "reward_std": 0.021461695432662964, + "rewards/bleu_reward_func/mean": 0.07400047779083252, + "rewards/bleu_reward_func/std": 0.061210907995700836, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 225.53125, + "completions/mean_terminated_length": 172.48147583007812, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.3936, + "grad_norm": 8.102995872497559, + "kl": 0.17437744140625, + "learning_rate": 1e-06, + "loss": -0.0621, + "num_tokens": 6477388.0, + "reward": 0.08205416798591614, + "reward_std": 0.02140321210026741, + "rewards/bleu_reward_func/mean": 0.08205416798591614, + "rewards/bleu_reward_func/std": 0.06504324823617935, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 248.40625, + "completions/mean_terminated_length": 239.90321350097656, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.3944, + "grad_norm": 5.510415554046631, + "kl": 0.054595947265625, + "learning_rate": 1e-06, + "loss": 0.1424, + "num_tokens": 6490681.0, + "reward": 0.09917749464511871, + "reward_std": 0.03953540325164795, + "rewards/bleu_reward_func/mean": 0.09917749464511871, + "rewards/bleu_reward_func/std": 0.062214821577072144, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 347.65625, + "completions/mean_terminated_length": 202.64706420898438, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.3952, + "grad_norm": 3.754049301147461, + "kl": 0.065643310546875, + "learning_rate": 1e-06, + "loss": -0.0312, + "num_tokens": 6508054.0, + "reward": 0.04995376244187355, + "reward_std": 0.018671657890081406, + "rewards/bleu_reward_func/mean": 0.04995376244187355, + "rewards/bleu_reward_func/std": 0.021997425705194473, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 304.78125, + "completions/mean_terminated_length": 121.94117736816406, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.396, + "grad_norm": 3.28092098236084, + "kl": 0.0880889892578125, + "learning_rate": 1e-06, + "loss": 0.2271, + "num_tokens": 6528167.0, + "reward": 0.21464568376541138, + "reward_std": 0.04326138645410538, + "rewards/bleu_reward_func/mean": 0.21464568376541138, + "rewards/bleu_reward_func/std": 0.2538887560367584, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 260.0, + "completions/mean_terminated_length": 161.3913116455078, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.3968, + "grad_norm": 7.667696475982666, + "kl": 0.27783203125, + "learning_rate": 1e-06, + "loss": -0.0064, + "num_tokens": 6539503.0, + "reward": 0.14023897051811218, + "reward_std": 0.03843347355723381, + "rewards/bleu_reward_func/mean": 0.14023897051811218, + "rewards/bleu_reward_func/std": 0.12260077148675919, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 312.78125, + "completions/mean_terminated_length": 275.8888854980469, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.3976, + "grad_norm": 5.650123596191406, + "kl": 0.122955322265625, + "learning_rate": 1e-06, + "loss": -0.218, + "num_tokens": 6556880.0, + "reward": 0.2068222463130951, + "reward_std": 0.08186712116003036, + "rewards/bleu_reward_func/mean": 0.2068222463130951, + "rewards/bleu_reward_func/std": 0.30478134751319885, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 115.5, + "completions/mean_terminated_length": 115.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.3984, + "grad_norm": 9.522087097167969, + "kl": 0.31427001953125, + "learning_rate": 1e-06, + "loss": 0.3453, + "num_tokens": 6564648.0, + "reward": 0.21922443807125092, + "reward_std": 0.07997345924377441, + "rewards/bleu_reward_func/mean": 0.21922443807125092, + "rewards/bleu_reward_func/std": 0.12106078118085861, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 283.46875, + "completions/mean_terminated_length": 194.04348754882812, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.3992, + "grad_norm": 4.472853183746338, + "kl": 0.10198974609375, + "learning_rate": 1e-06, + "loss": 0.0606, + "num_tokens": 6577575.0, + "reward": 0.1807648241519928, + "reward_std": 0.04940491169691086, + "rewards/bleu_reward_func/mean": 0.1807648241519928, + "rewards/bleu_reward_func/std": 0.2276194989681244, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 257.6875, + "completions/mean_terminated_length": 210.59259033203125, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.4, + "grad_norm": 3.429314136505127, + "kl": 0.120086669921875, + "learning_rate": 1e-06, + "loss": 0.1092, + "num_tokens": 6590341.0, + "reward": 0.13892096281051636, + "reward_std": 0.04246610775589943, + "rewards/bleu_reward_func/mean": 0.13892096281051636, + "rewards/bleu_reward_func/std": 0.12665794789791107, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 150.78125, + "completions/mean_terminated_length": 126.70000457763672, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.4008, + "grad_norm": 6.932479381561279, + "kl": 0.4072265625, + "learning_rate": 1e-06, + "loss": 0.1306, + "num_tokens": 6604182.0, + "reward": 0.13375571370124817, + "reward_std": 0.05735353007912636, + "rewards/bleu_reward_func/mean": 0.13375571370124817, + "rewards/bleu_reward_func/std": 0.14047691226005554, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 329.375, + "completions/mean_terminated_length": 257.9130554199219, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.4016, + "grad_norm": 3.9977669715881348, + "kl": 0.0543212890625, + "learning_rate": 1e-06, + "loss": 0.1227, + "num_tokens": 6619994.0, + "reward": 0.08314976096153259, + "reward_std": 0.01850474253296852, + "rewards/bleu_reward_func/mean": 0.08314976096153259, + "rewards/bleu_reward_func/std": 0.03126469627022743, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 328.875, + "completions/mean_terminated_length": 245.63636779785156, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.4024, + "grad_norm": 8.637741088867188, + "kl": 0.294342041015625, + "learning_rate": 1e-06, + "loss": -0.0294, + "num_tokens": 6632862.0, + "reward": 0.21461226046085358, + "reward_std": 0.05726875364780426, + "rewards/bleu_reward_func/mean": 0.21461226046085358, + "rewards/bleu_reward_func/std": 0.19377335906028748, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 345.71875, + "completions/mean_terminated_length": 245.9499969482422, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.4032, + "grad_norm": 5.415818691253662, + "kl": 0.11663818359375, + "learning_rate": 1e-06, + "loss": -0.1008, + "num_tokens": 6649573.0, + "reward": 0.10018286108970642, + "reward_std": 0.025530360639095306, + "rewards/bleu_reward_func/mean": 0.10018286108970642, + "rewards/bleu_reward_func/std": 0.08217810094356537, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 293.21875, + "completions/mean_terminated_length": 123.05555725097656, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.404, + "grad_norm": 5.128636837005615, + "kl": 0.0567626953125, + "learning_rate": 1e-06, + "loss": 0.1747, + "num_tokens": 6661556.0, + "reward": 0.08193753659725189, + "reward_std": 0.036860473453998566, + "rewards/bleu_reward_func/mean": 0.08193753659725189, + "rewards/bleu_reward_func/std": 0.0639234408736229, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 383.1875, + "completions/mean_terminated_length": 254.375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.4048, + "grad_norm": 2.5557072162628174, + "kl": 0.03839111328125, + "learning_rate": 1e-06, + "loss": -0.0743, + "num_tokens": 6678938.0, + "reward": 0.05591622740030289, + "reward_std": 0.017734069377183914, + "rewards/bleu_reward_func/mean": 0.05591622740030289, + "rewards/bleu_reward_func/std": 0.04607876017689705, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 293.625, + "completions/mean_terminated_length": 75.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.4056, + "grad_norm": 5.220037460327148, + "kl": 0.07464599609375, + "learning_rate": 1e-06, + "loss": 0.0253, + "num_tokens": 6691822.0, + "reward": 0.029562367126345634, + "reward_std": 0.03146641328930855, + "rewards/bleu_reward_func/mean": 0.029562367126345634, + "rewards/bleu_reward_func/std": 0.04593721404671669, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 453.96875, + "completions/mean_terminated_length": 343.18182373046875, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.4064, + "grad_norm": 2.189230442047119, + "kl": 0.026153564453125, + "learning_rate": 1e-06, + "loss": -0.0992, + "num_tokens": 6709997.0, + "reward": 0.03725602477788925, + "reward_std": 0.02092660963535309, + "rewards/bleu_reward_func/mean": 0.03725602477788925, + "rewards/bleu_reward_func/std": 0.02429044619202614, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 106.34375, + "completions/mean_terminated_length": 79.30000305175781, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.4072, + "grad_norm": 6.567111015319824, + "kl": 0.233154296875, + "learning_rate": 1e-06, + "loss": 0.2905, + "num_tokens": 6718912.0, + "reward": 0.15163123607635498, + "reward_std": 0.039707012474536896, + "rewards/bleu_reward_func/mean": 0.15163123607635498, + "rewards/bleu_reward_func/std": 0.12998701632022858, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 119.03125, + "completions/mean_terminated_length": 106.3548355102539, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.408, + "grad_norm": 8.991604804992676, + "kl": 0.14703369140625, + "learning_rate": 1e-06, + "loss": 0.0972, + "num_tokens": 6728609.0, + "reward": 0.23723718523979187, + "reward_std": 0.07665139436721802, + "rewards/bleu_reward_func/mean": 0.23723718523979187, + "rewards/bleu_reward_func/std": 0.27060666680336, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 288.375, + "completions/mean_terminated_length": 171.23809814453125, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.4088, + "grad_norm": 6.349617958068848, + "kl": 0.168121337890625, + "learning_rate": 1e-06, + "loss": 0.0688, + "num_tokens": 6743789.0, + "reward": 0.1937231868505478, + "reward_std": 0.13082939386367798, + "rewards/bleu_reward_func/mean": 0.1937231868505478, + "rewards/bleu_reward_func/std": 0.25435397028923035, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 401.96875, + "completions/mean_terminated_length": 291.9375, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.4096, + "grad_norm": 2.5390427112579346, + "kl": 0.03851318359375, + "learning_rate": 1e-06, + "loss": 0.1042, + "num_tokens": 6759732.0, + "reward": 0.029224077239632607, + "reward_std": 0.016936711966991425, + "rewards/bleu_reward_func/mean": 0.029224077239632607, + "rewards/bleu_reward_func/std": 0.022709792479872704, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 268.46875, + "completions/mean_terminated_length": 79.05555725097656, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4104, + "grad_norm": 3.7983713150024414, + "kl": 0.137786865234375, + "learning_rate": 1e-06, + "loss": -0.0173, + "num_tokens": 6774051.0, + "reward": 0.20052862167358398, + "reward_std": 0.028155002743005753, + "rewards/bleu_reward_func/mean": 0.20052862167358398, + "rewards/bleu_reward_func/std": 0.2302575409412384, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 388.71875, + "completions/mean_terminated_length": 360.2692565917969, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.4112, + "grad_norm": 2.4446346759796143, + "kl": 0.028076171875, + "learning_rate": 1e-06, + "loss": -0.0783, + "num_tokens": 6790610.0, + "reward": 0.10578086227178574, + "reward_std": 0.029093941673636436, + "rewards/bleu_reward_func/mean": 0.10578086227178574, + "rewards/bleu_reward_func/std": 0.08641202747821808, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 275.34375, + "completions/mean_terminated_length": 259.5666809082031, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.412, + "grad_norm": 5.883263111114502, + "kl": 0.18634033203125, + "learning_rate": 1e-06, + "loss": 0.0231, + "num_tokens": 6803965.0, + "reward": 0.1322258561849594, + "reward_std": 0.030806170776486397, + "rewards/bleu_reward_func/mean": 0.1322258561849594, + "rewards/bleu_reward_func/std": 0.16078709065914154, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 333.84375, + "completions/mean_terminated_length": 274.4583435058594, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.4128, + "grad_norm": 3.016139507293701, + "kl": 0.03173828125, + "learning_rate": 1e-06, + "loss": 0.025, + "num_tokens": 6818840.0, + "reward": 0.09323176741600037, + "reward_std": 0.05342460051178932, + "rewards/bleu_reward_func/mean": 0.09323176741600037, + "rewards/bleu_reward_func/std": 0.06577997654676437, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 276.5625, + "completions/mean_terminated_length": 210.63999938964844, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.4136, + "grad_norm": 4.685121059417725, + "kl": 0.050933837890625, + "learning_rate": 1e-06, + "loss": -0.1784, + "num_tokens": 6830770.0, + "reward": 0.03872024267911911, + "reward_std": 0.016178004443645477, + "rewards/bleu_reward_func/mean": 0.03872024267911911, + "rewards/bleu_reward_func/std": 0.025313377380371094, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 362.875, + "completions/mean_terminated_length": 231.2941131591797, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4144, + "grad_norm": 4.639893054962158, + "kl": 0.122711181640625, + "learning_rate": 1e-06, + "loss": 0.0545, + "num_tokens": 6848302.0, + "reward": 0.07996964454650879, + "reward_std": 0.01709877885878086, + "rewards/bleu_reward_func/mean": 0.07996964454650879, + "rewards/bleu_reward_func/std": 0.10056579113006592, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 294.625, + "completions/mean_terminated_length": 263.5714416503906, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.4152, + "grad_norm": 9.085565567016602, + "kl": 0.1670379638671875, + "learning_rate": 1e-06, + "loss": -0.1226, + "num_tokens": 6859826.0, + "reward": 0.10505213588476181, + "reward_std": 0.05224030464887619, + "rewards/bleu_reward_func/mean": 0.10505213588476181, + "rewards/bleu_reward_func/std": 0.0725407749414444, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 258.3125, + "completions/mean_terminated_length": 173.75, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.416, + "grad_norm": 7.134840965270996, + "kl": 0.175750732421875, + "learning_rate": 1e-06, + "loss": 0.1168, + "num_tokens": 6873516.0, + "reward": 0.21853026747703552, + "reward_std": 0.06429094821214676, + "rewards/bleu_reward_func/mean": 0.21853026747703552, + "rewards/bleu_reward_func/std": 0.14174966514110565, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 328.65625, + "completions/mean_terminated_length": 256.9130554199219, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.4168, + "grad_norm": 6.0497517585754395, + "kl": 0.131439208984375, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 6887761.0, + "reward": 0.0685054138302803, + "reward_std": 0.012891553342342377, + "rewards/bleu_reward_func/mean": 0.0685054138302803, + "rewards/bleu_reward_func/std": 0.057060711085796356, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 267.0625, + "completions/mean_terminated_length": 221.70370483398438, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.4176, + "grad_norm": 4.475714683532715, + "kl": 0.047637939453125, + "learning_rate": 1e-06, + "loss": 0.1563, + "num_tokens": 6901251.0, + "reward": 0.18483126163482666, + "reward_std": 0.02913127839565277, + "rewards/bleu_reward_func/mean": 0.18483126163482666, + "rewards/bleu_reward_func/std": 0.16543246805667877, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 228.53125, + "completions/mean_terminated_length": 163.11538696289062, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.4184, + "grad_norm": 7.8729143142700195, + "kl": 0.297882080078125, + "learning_rate": 1e-06, + "loss": 0.0428, + "num_tokens": 6912844.0, + "reward": 0.1846814900636673, + "reward_std": 0.10159599035978317, + "rewards/bleu_reward_func/mean": 0.1846814900636673, + "rewards/bleu_reward_func/std": 0.2030598670244217, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 251.09375, + "completions/mean_terminated_length": 233.70001220703125, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.4192, + "grad_norm": 4.603794097900391, + "kl": 0.08197021484375, + "learning_rate": 1e-06, + "loss": -0.0057, + "num_tokens": 6923895.0, + "reward": 0.11323156207799911, + "reward_std": 0.03932211175560951, + "rewards/bleu_reward_func/mean": 0.11323156207799911, + "rewards/bleu_reward_func/std": 0.08274988830089569, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 76.625, + "completions/mean_terminated_length": 76.625, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.42, + "grad_norm": 8.708818435668945, + "kl": 0.1953125, + "learning_rate": 1e-06, + "loss": 0.3306, + "num_tokens": 6934091.0, + "reward": 0.18468719720840454, + "reward_std": 0.0689420998096466, + "rewards/bleu_reward_func/mean": 0.18468719720840454, + "rewards/bleu_reward_func/std": 0.12529541552066803, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 256.5, + "completions/mean_terminated_length": 220.00001525878906, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.4208, + "grad_norm": 15.188727378845215, + "kl": 0.10748291015625, + "learning_rate": 1e-06, + "loss": -0.0496, + "num_tokens": 6946923.0, + "reward": 0.09780866652727127, + "reward_std": 0.029562484472990036, + "rewards/bleu_reward_func/mean": 0.09780866652727127, + "rewards/bleu_reward_func/std": 0.09735672175884247, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 246.75, + "completions/mean_terminated_length": 107.80952453613281, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.4216, + "grad_norm": 5.919389247894287, + "kl": 0.1287841796875, + "learning_rate": 1e-06, + "loss": 0.0944, + "num_tokens": 6958675.0, + "reward": 0.049182113260030746, + "reward_std": 0.03928225487470627, + "rewards/bleu_reward_func/mean": 0.049182113260030746, + "rewards/bleu_reward_func/std": 0.05703483149409294, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 298.59375, + "completions/mean_terminated_length": 268.1071472167969, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.4224, + "grad_norm": 4.162198066711426, + "kl": 0.048370361328125, + "learning_rate": 1e-06, + "loss": -0.0255, + "num_tokens": 6973038.0, + "reward": 0.19552364945411682, + "reward_std": 0.05411393195390701, + "rewards/bleu_reward_func/mean": 0.19552364945411682, + "rewards/bleu_reward_func/std": 0.11564817279577255, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 343.125, + "completions/mean_terminated_length": 266.3636474609375, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.4232, + "grad_norm": 7.422494411468506, + "kl": 0.06976318359375, + "learning_rate": 1e-06, + "loss": 0.0526, + "num_tokens": 6988034.0, + "reward": 0.03407738357782364, + "reward_std": 0.010626979172229767, + "rewards/bleu_reward_func/mean": 0.03407738357782364, + "rewards/bleu_reward_func/std": 0.027887288480997086, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 53.53125, + "completions/mean_terminated_length": 53.53125, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.424, + "grad_norm": 13.498769760131836, + "kl": 0.46209716796875, + "learning_rate": 1e-06, + "loss": 0.0087, + "num_tokens": 6995419.0, + "reward": 0.24595381319522858, + "reward_std": 0.09870806336402893, + "rewards/bleu_reward_func/mean": 0.24595381319522858, + "rewards/bleu_reward_func/std": 0.1663571149110794, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 392.5, + "completions/mean_terminated_length": 287.058837890625, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.4248, + "grad_norm": 2.2019829750061035, + "kl": 0.0286407470703125, + "learning_rate": 1e-06, + "loss": -0.2369, + "num_tokens": 7014707.0, + "reward": 0.12730640172958374, + "reward_std": 0.03398028016090393, + "rewards/bleu_reward_func/mean": 0.12730640172958374, + "rewards/bleu_reward_func/std": 0.20578297972679138, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 317.15625, + "completions/mean_terminated_length": 262.6000061035156, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.4256, + "grad_norm": 5.298976421356201, + "kl": 0.087127685546875, + "learning_rate": 1e-06, + "loss": -0.0481, + "num_tokens": 7030000.0, + "reward": 0.06116287037730217, + "reward_std": 0.04584234952926636, + "rewards/bleu_reward_func/mean": 0.06116287037730217, + "rewards/bleu_reward_func/std": 0.07913482189178467, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 321.34375, + "completions/mean_terminated_length": 234.68182373046875, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.4264, + "grad_norm": 4.799532413482666, + "kl": 0.1138916015625, + "learning_rate": 1e-06, + "loss": -0.0807, + "num_tokens": 7046643.0, + "reward": 0.08829933404922485, + "reward_std": 0.03609791770577431, + "rewards/bleu_reward_func/mean": 0.08829933404922485, + "rewards/bleu_reward_func/std": 0.10983619093894958, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 196.84375, + "completions/mean_terminated_length": 151.82144165039062, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.4272, + "grad_norm": 15.142457008361816, + "kl": 0.25970458984375, + "learning_rate": 1e-06, + "loss": -0.0027, + "num_tokens": 7057718.0, + "reward": 0.10968612134456635, + "reward_std": 0.05676144361495972, + "rewards/bleu_reward_func/mean": 0.10968612134456635, + "rewards/bleu_reward_func/std": 0.1397821009159088, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 244.9375, + "completions/mean_terminated_length": 195.48147583007812, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.428, + "grad_norm": 6.9789934158325195, + "kl": 0.092926025390625, + "learning_rate": 1e-06, + "loss": 0.1057, + "num_tokens": 7073980.0, + "reward": 0.19463014602661133, + "reward_std": 0.09179598838090897, + "rewards/bleu_reward_func/mean": 0.19463014602661133, + "rewards/bleu_reward_func/std": 0.1903815120458603, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 146.5, + "completions/mean_terminated_length": 122.13333892822266, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.4288, + "grad_norm": 10.111763000488281, + "kl": 0.143310546875, + "learning_rate": 1e-06, + "loss": -0.0902, + "num_tokens": 7085228.0, + "reward": 0.15931251645088196, + "reward_std": 0.06651220470666885, + "rewards/bleu_reward_func/mean": 0.15931251645088196, + "rewards/bleu_reward_func/std": 0.10370245575904846, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 245.09375, + "completions/mean_terminated_length": 183.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.4296, + "grad_norm": 11.313093185424805, + "kl": 0.135894775390625, + "learning_rate": 1e-06, + "loss": 0.2524, + "num_tokens": 7098583.0, + "reward": 0.08501166105270386, + "reward_std": 0.03819301724433899, + "rewards/bleu_reward_func/mean": 0.08501166105270386, + "rewards/bleu_reward_func/std": 0.0931810513138771, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 261.5, + "completions/mean_terminated_length": 244.80001831054688, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.4304, + "grad_norm": 4.153919696807861, + "kl": 0.050628662109375, + "learning_rate": 1e-06, + "loss": -0.1541, + "num_tokens": 7108663.0, + "reward": 0.06835095584392548, + "reward_std": 0.042577650398015976, + "rewards/bleu_reward_func/mean": 0.06835095584392548, + "rewards/bleu_reward_func/std": 0.05704295262694359, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 226.375, + "completions/mean_terminated_length": 173.48147583007812, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.4312, + "grad_norm": 13.284401893615723, + "kl": 0.14434814453125, + "learning_rate": 1e-06, + "loss": -0.0842, + "num_tokens": 7118427.0, + "reward": 0.08002069592475891, + "reward_std": 0.029213791713118553, + "rewards/bleu_reward_func/mean": 0.08002069592475891, + "rewards/bleu_reward_func/std": 0.03687189891934395, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 123.65625, + "completions/mean_terminated_length": 83.48275756835938, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.432, + "grad_norm": 23.51561164855957, + "kl": 0.1839599609375, + "learning_rate": 1e-06, + "loss": 0.4993, + "num_tokens": 7128856.0, + "reward": 0.2179010808467865, + "reward_std": 0.08272600173950195, + "rewards/bleu_reward_func/mean": 0.2179010808467865, + "rewards/bleu_reward_func/std": 0.26301127672195435, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 332.40625, + "completions/mean_terminated_length": 173.94117736816406, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.4328, + "grad_norm": 10.934617042541504, + "kl": 0.10113525390625, + "learning_rate": 1e-06, + "loss": -0.1254, + "num_tokens": 7141661.0, + "reward": 0.06413869559764862, + "reward_std": 0.05120678246021271, + "rewards/bleu_reward_func/mean": 0.06413869559764862, + "rewards/bleu_reward_func/std": 0.09179537743330002, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 294.375, + "completions/mean_terminated_length": 221.83334350585938, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.4336, + "grad_norm": 3.505484104156494, + "kl": 0.06707763671875, + "learning_rate": 1e-06, + "loss": 0.0835, + "num_tokens": 7153161.0, + "reward": 0.09516981989145279, + "reward_std": 0.044140610843896866, + "rewards/bleu_reward_func/mean": 0.09516981989145279, + "rewards/bleu_reward_func/std": 0.049775656312704086, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 326.25, + "completions/mean_terminated_length": 214.8000030517578, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.4344, + "grad_norm": 7.354869842529297, + "kl": 0.10260009765625, + "learning_rate": 1e-06, + "loss": -0.1239, + "num_tokens": 7167649.0, + "reward": 0.03533574938774109, + "reward_std": 0.014214935712516308, + "rewards/bleu_reward_func/mean": 0.03533574938774109, + "rewards/bleu_reward_func/std": 0.027195338159799576, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 225.78125, + "completions/mean_terminated_length": 159.73077392578125, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.4352, + "grad_norm": 5.206735610961914, + "kl": 0.09149169921875, + "learning_rate": 1e-06, + "loss": -0.0428, + "num_tokens": 7181226.0, + "reward": 0.22954684495925903, + "reward_std": 0.06006891652941704, + "rewards/bleu_reward_func/mean": 0.22954684495925903, + "rewards/bleu_reward_func/std": 0.11863149702548981, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 250.75, + "completions/mean_terminated_length": 190.4615478515625, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.436, + "grad_norm": 5.510367393493652, + "kl": 0.05084228515625, + "learning_rate": 1e-06, + "loss": 0.1971, + "num_tokens": 7191810.0, + "reward": 0.08453569561243057, + "reward_std": 0.050511520355939865, + "rewards/bleu_reward_func/mean": 0.08453569561243057, + "rewards/bleu_reward_func/std": 0.07364515960216522, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 388.34375, + "completions/mean_terminated_length": 314.1499938964844, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.4368, + "grad_norm": 3.61635160446167, + "kl": 0.045135498046875, + "learning_rate": 1e-06, + "loss": 0.0817, + "num_tokens": 7206789.0, + "reward": 0.050152119249105453, + "reward_std": 0.03165213763713837, + "rewards/bleu_reward_func/mean": 0.050152119249105453, + "rewards/bleu_reward_func/std": 0.05620579421520233, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 57.5, + "completions/mean_terminated_length": 57.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.4376, + "grad_norm": 8.828208923339844, + "kl": 0.25537109375, + "learning_rate": 1e-06, + "loss": 0.103, + "num_tokens": 7213949.0, + "reward": 0.20786888897418976, + "reward_std": 0.06727642565965652, + "rewards/bleu_reward_func/mean": 0.20786888897418976, + "rewards/bleu_reward_func/std": 0.1706974357366562, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 275.125, + "completions/mean_terminated_length": 196.1666717529297, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.4384, + "grad_norm": 13.268147468566895, + "kl": 0.058074951171875, + "learning_rate": 1e-06, + "loss": -0.0694, + "num_tokens": 7227041.0, + "reward": 0.05118046700954437, + "reward_std": 0.02497515268623829, + "rewards/bleu_reward_func/mean": 0.05118046700954437, + "rewards/bleu_reward_func/std": 0.035916514694690704, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 351.3125, + "completions/mean_terminated_length": 278.2727355957031, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.4392, + "grad_norm": 13.135753631591797, + "kl": 0.077850341796875, + "learning_rate": 1e-06, + "loss": 0.0555, + "num_tokens": 7244763.0, + "reward": 0.07840518653392792, + "reward_std": 0.022635504603385925, + "rewards/bleu_reward_func/mean": 0.07840518653392792, + "rewards/bleu_reward_func/std": 0.06580173969268799, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 201.1875, + "completions/mean_terminated_length": 156.7857208251953, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.44, + "grad_norm": 7.055432319641113, + "kl": 0.25830078125, + "learning_rate": 1e-06, + "loss": -0.0546, + "num_tokens": 7256785.0, + "reward": 0.253431499004364, + "reward_std": 0.028121720999479294, + "rewards/bleu_reward_func/mean": 0.253431499004364, + "rewards/bleu_reward_func/std": 0.20365522801876068, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 336.8125, + "completions/mean_terminated_length": 287.7599792480469, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.4408, + "grad_norm": 3.6197187900543213, + "kl": 0.046844482421875, + "learning_rate": 1e-06, + "loss": 0.1119, + "num_tokens": 7268987.0, + "reward": 0.061213478446006775, + "reward_std": 0.01489005982875824, + "rewards/bleu_reward_func/mean": 0.061213478446006775, + "rewards/bleu_reward_func/std": 0.038935884833335876, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 386.21875, + "completions/mean_terminated_length": 351.0, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.4416, + "grad_norm": 2.6066031455993652, + "kl": 0.034149169921875, + "learning_rate": 1e-06, + "loss": -0.1341, + "num_tokens": 7287306.0, + "reward": 0.11066319048404694, + "reward_std": 0.105903759598732, + "rewards/bleu_reward_func/mean": 0.11066319048404694, + "rewards/bleu_reward_func/std": 0.16723419725894928, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 263.28125, + "completions/mean_terminated_length": 237.55172729492188, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.4424, + "grad_norm": 9.281195640563965, + "kl": 0.226043701171875, + "learning_rate": 1e-06, + "loss": -0.1571, + "num_tokens": 7299899.0, + "reward": 0.06672249734401703, + "reward_std": 0.03525693714618683, + "rewards/bleu_reward_func/mean": 0.06672249734401703, + "rewards/bleu_reward_func/std": 0.0810592845082283, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 233.84375, + "completions/mean_terminated_length": 182.3333282470703, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.4432, + "grad_norm": 7.543862342834473, + "kl": 0.308807373046875, + "learning_rate": 1e-06, + "loss": 0.0082, + "num_tokens": 7313638.0, + "reward": 0.30825120210647583, + "reward_std": 0.07663644850254059, + "rewards/bleu_reward_func/mean": 0.30825120210647583, + "rewards/bleu_reward_func/std": 0.1689450740814209, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 246.40625, + "completions/mean_terminated_length": 218.9310302734375, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.444, + "grad_norm": 4.433272361755371, + "kl": 0.104339599609375, + "learning_rate": 1e-06, + "loss": 0.0582, + "num_tokens": 7324939.0, + "reward": 0.19763408601284027, + "reward_std": 0.028635632246732712, + "rewards/bleu_reward_func/mean": 0.19763408601284027, + "rewards/bleu_reward_func/std": 0.18309614062309265, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 62.96875, + "completions/mean_terminated_length": 48.48386764526367, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.4448, + "grad_norm": 10.322346687316895, + "kl": 0.1849365234375, + "learning_rate": 1e-06, + "loss": 0.432, + "num_tokens": 7330802.0, + "reward": 0.19210518896579742, + "reward_std": 0.03121430240571499, + "rewards/bleu_reward_func/mean": 0.19210518896579742, + "rewards/bleu_reward_func/std": 0.16853223741054535, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 202.78125, + "completions/mean_terminated_length": 182.1666717529297, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.4456, + "grad_norm": 9.519190788269043, + "kl": 0.171844482421875, + "learning_rate": 1e-06, + "loss": 0.0451, + "num_tokens": 7339683.0, + "reward": 0.170665442943573, + "reward_std": 0.06568457931280136, + "rewards/bleu_reward_func/mean": 0.170665442943573, + "rewards/bleu_reward_func/std": 0.1584860235452652, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 256.75, + "completions/mean_terminated_length": 123.04762268066406, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.4464, + "grad_norm": 5.70733118057251, + "kl": 0.177093505859375, + "learning_rate": 1e-06, + "loss": -0.0177, + "num_tokens": 7354059.0, + "reward": 0.11887075752019882, + "reward_std": 0.037268251180648804, + "rewards/bleu_reward_func/mean": 0.11887075752019882, + "rewards/bleu_reward_func/std": 0.09704269468784332, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 237.28125, + "completions/mean_terminated_length": 145.70834350585938, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.4472, + "grad_norm": 10.510950088500977, + "kl": 0.141387939453125, + "learning_rate": 1e-06, + "loss": 0.2298, + "num_tokens": 7369252.0, + "reward": 0.11686157435178757, + "reward_std": 0.06300412118434906, + "rewards/bleu_reward_func/mean": 0.11686157435178757, + "rewards/bleu_reward_func/std": 0.10008818656206131, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 294.21875, + "completions/mean_terminated_length": 209.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.448, + "grad_norm": 12.427102088928223, + "kl": 0.15325927734375, + "learning_rate": 1e-06, + "loss": -0.1463, + "num_tokens": 7384539.0, + "reward": 0.10454531759023666, + "reward_std": 0.032633934170007706, + "rewards/bleu_reward_func/mean": 0.10454531759023666, + "rewards/bleu_reward_func/std": 0.09093461185693741, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 175.84375, + "completions/mean_terminated_length": 63.79166793823242, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.4488, + "grad_norm": 18.402080535888672, + "kl": 0.187530517578125, + "learning_rate": 1e-06, + "loss": 0.6927, + "num_tokens": 7394846.0, + "reward": 0.21487680077552795, + "reward_std": 0.08058933913707733, + "rewards/bleu_reward_func/mean": 0.21487680077552795, + "rewards/bleu_reward_func/std": 0.20088493824005127, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 425.21875, + "completions/mean_terminated_length": 379.76190185546875, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.4496, + "grad_norm": 2.861811637878418, + "kl": 0.030670166015625, + "learning_rate": 1e-06, + "loss": -0.0581, + "num_tokens": 7414069.0, + "reward": 0.09261719137430191, + "reward_std": 0.046390384435653687, + "rewards/bleu_reward_func/mean": 0.09261719137430191, + "rewards/bleu_reward_func/std": 0.14345434308052063, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 372.53125, + "completions/mean_terminated_length": 277.1052551269531, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.4504, + "grad_norm": 3.737154960632324, + "kl": 0.07086181640625, + "learning_rate": 1e-06, + "loss": -0.0085, + "num_tokens": 7433038.0, + "reward": 0.13954411447048187, + "reward_std": 0.09964635223150253, + "rewards/bleu_reward_func/mean": 0.13954411447048187, + "rewards/bleu_reward_func/std": 0.2269161492586136, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 217.5, + "completions/mean_terminated_length": 175.42857360839844, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.4512, + "grad_norm": 9.693827629089355, + "kl": 0.2703857421875, + "learning_rate": 1e-06, + "loss": 0.0569, + "num_tokens": 7442590.0, + "reward": 0.08689892292022705, + "reward_std": 0.046516068279743195, + "rewards/bleu_reward_func/mean": 0.08689892292022705, + "rewards/bleu_reward_func/std": 0.09460947662591934, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 344.3125, + "completions/mean_terminated_length": 320.3571472167969, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.452, + "grad_norm": 3.287851333618164, + "kl": 0.03594970703125, + "learning_rate": 1e-06, + "loss": -0.0888, + "num_tokens": 7455856.0, + "reward": 0.09177221357822418, + "reward_std": 0.02658715285360813, + "rewards/bleu_reward_func/mean": 0.09177221357822418, + "rewards/bleu_reward_func/std": 0.04939228668808937, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 257.0, + "completions/mean_terminated_length": 141.09091186523438, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.4528, + "grad_norm": 5.329028129577637, + "kl": 0.21087646484375, + "learning_rate": 1e-06, + "loss": -0.0318, + "num_tokens": 7471128.0, + "reward": 0.30248120427131653, + "reward_std": 0.045193642377853394, + "rewards/bleu_reward_func/mean": 0.30248120427131653, + "rewards/bleu_reward_func/std": 0.09429154545068741, + "step": 566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 396.5625, + "completions/mean_terminated_length": 306.77777099609375, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.4536, + "grad_norm": 2.8240556716918945, + "kl": 0.029327392578125, + "learning_rate": 1e-06, + "loss": -0.006, + "num_tokens": 7486778.0, + "reward": 0.046158432960510254, + "reward_std": 0.012592589482665062, + "rewards/bleu_reward_func/mean": 0.046158432960510254, + "rewards/bleu_reward_func/std": 0.0691753551363945, + "step": 567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 221.5, + "completions/mean_terminated_length": 180.00001525878906, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.4544, + "grad_norm": 5.554408073425293, + "kl": 0.077850341796875, + "learning_rate": 1e-06, + "loss": -0.0114, + "num_tokens": 7501522.0, + "reward": 0.19211658835411072, + "reward_std": 0.052228912711143494, + "rewards/bleu_reward_func/mean": 0.19211658835411072, + "rewards/bleu_reward_func/std": 0.12220965325832367, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 282.90625, + "completions/mean_terminated_length": 104.72222137451172, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.4552, + "grad_norm": 8.932016372680664, + "kl": 0.1943359375, + "learning_rate": 1e-06, + "loss": 0.0056, + "num_tokens": 7515511.0, + "reward": 0.08466814458370209, + "reward_std": 0.03040888160467148, + "rewards/bleu_reward_func/mean": 0.08466814458370209, + "rewards/bleu_reward_func/std": 0.07005324959754944, + "step": 569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 450.0, + "completions/mean_terminated_length": 388.0, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.456, + "grad_norm": 1.950373888015747, + "kl": 0.0325927734375, + "learning_rate": 1e-06, + "loss": 0.0603, + "num_tokens": 7535791.0, + "reward": 0.06426975131034851, + "reward_std": 0.02304723486304283, + "rewards/bleu_reward_func/mean": 0.06426975131034851, + "rewards/bleu_reward_func/std": 0.04708797112107277, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 200.03125, + "completions/mean_terminated_length": 167.7586212158203, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.4568, + "grad_norm": 8.692915916442871, + "kl": 0.300445556640625, + "learning_rate": 1e-06, + "loss": 0.179, + "num_tokens": 7548720.0, + "reward": 0.16858291625976562, + "reward_std": 0.04772442951798439, + "rewards/bleu_reward_func/mean": 0.16858291625976562, + "rewards/bleu_reward_func/std": 0.187880739569664, + "step": 571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 221.4375, + "completions/mean_terminated_length": 154.38462829589844, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.4576, + "grad_norm": 5.559481143951416, + "kl": 0.2091064453125, + "learning_rate": 1e-06, + "loss": 0.1461, + "num_tokens": 7559926.0, + "reward": 0.2749570608139038, + "reward_std": 0.07935648411512375, + "rewards/bleu_reward_func/mean": 0.2749570608139038, + "rewards/bleu_reward_func/std": 0.20695801079273224, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 298.25, + "completions/mean_terminated_length": 227.0, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.4584, + "grad_norm": 4.713781833648682, + "kl": 0.10894775390625, + "learning_rate": 1e-06, + "loss": -0.1979, + "num_tokens": 7573214.0, + "reward": 0.11424913257360458, + "reward_std": 0.0238350722938776, + "rewards/bleu_reward_func/mean": 0.11424913257360458, + "rewards/bleu_reward_func/std": 0.1513095498085022, + "step": 573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 301.375, + "completions/mean_terminated_length": 231.1666717529297, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.4592, + "grad_norm": 6.019801616668701, + "kl": 0.117706298828125, + "learning_rate": 1e-06, + "loss": -0.0467, + "num_tokens": 7584922.0, + "reward": 0.12773753702640533, + "reward_std": 0.03902646526694298, + "rewards/bleu_reward_func/mean": 0.12773753702640533, + "rewards/bleu_reward_func/std": 0.08676618337631226, + "step": 574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 261.3125, + "completions/mean_terminated_length": 40.11764907836914, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.46, + "grad_norm": 20.097490310668945, + "kl": 0.32666015625, + "learning_rate": 1e-06, + "loss": 0.0115, + "num_tokens": 7599564.0, + "reward": 0.1775631606578827, + "reward_std": 0.05471285060048103, + "rewards/bleu_reward_func/mean": 0.1775631606578827, + "rewards/bleu_reward_func/std": 0.1462731659412384, + "step": 575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 224.59375, + "completions/mean_terminated_length": 93.95455169677734, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.4608, + "grad_norm": 12.845754623413086, + "kl": 0.2415771484375, + "learning_rate": 1e-06, + "loss": 0.1681, + "num_tokens": 7609343.0, + "reward": 0.10711174458265305, + "reward_std": 0.03790780156850815, + "rewards/bleu_reward_func/mean": 0.10711174458265305, + "rewards/bleu_reward_func/std": 0.11842114478349686, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 429.15625, + "completions/mean_terminated_length": 217.44444274902344, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.4616, + "grad_norm": 3.16619873046875, + "kl": 0.031341552734375, + "learning_rate": 1e-06, + "loss": -0.1864, + "num_tokens": 7628708.0, + "reward": 0.12277669459581375, + "reward_std": 0.030532412230968475, + "rewards/bleu_reward_func/mean": 0.12277669459581375, + "rewards/bleu_reward_func/std": 0.14895910024642944, + "step": 577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 190.625, + "completions/mean_terminated_length": 100.63999938964844, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.4624, + "grad_norm": 6.808165550231934, + "kl": 0.2611083984375, + "learning_rate": 1e-06, + "loss": -0.043, + "num_tokens": 7637504.0, + "reward": 0.06198694184422493, + "reward_std": 0.018319500610232353, + "rewards/bleu_reward_func/mean": 0.06198694184422493, + "rewards/bleu_reward_func/std": 0.05399094894528389, + "step": 578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 306.65625, + "completions/mean_terminated_length": 125.47058868408203, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.4632, + "grad_norm": 6.689248085021973, + "kl": 0.165435791015625, + "learning_rate": 1e-06, + "loss": 0.0301, + "num_tokens": 7651797.0, + "reward": 0.07045552134513855, + "reward_std": 0.018690217286348343, + "rewards/bleu_reward_func/mean": 0.07045552134513855, + "rewards/bleu_reward_func/std": 0.07137548923492432, + "step": 579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 332.6875, + "completions/mean_terminated_length": 193.22222900390625, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.464, + "grad_norm": 4.129982948303223, + "kl": 0.050750732421875, + "learning_rate": 1e-06, + "loss": 0.2464, + "num_tokens": 7667787.0, + "reward": 0.09383320808410645, + "reward_std": 0.046889662742614746, + "rewards/bleu_reward_func/mean": 0.09383320808410645, + "rewards/bleu_reward_func/std": 0.10615876317024231, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 145.375, + "completions/mean_terminated_length": 42.71999740600586, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.4648, + "grad_norm": 9.511686325073242, + "kl": 0.385894775390625, + "learning_rate": 1e-06, + "loss": 0.0501, + "num_tokens": 7679535.0, + "reward": 0.08031031489372253, + "reward_std": 0.036660827696323395, + "rewards/bleu_reward_func/mean": 0.08031031489372253, + "rewards/bleu_reward_func/std": 0.07939815521240234, + "step": 581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 294.0, + "completions/mean_terminated_length": 271.4482727050781, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.4656, + "grad_norm": 7.2719950675964355, + "kl": 0.149932861328125, + "learning_rate": 1e-06, + "loss": 0.0885, + "num_tokens": 7690815.0, + "reward": 0.11769823729991913, + "reward_std": 0.02824997529387474, + "rewards/bleu_reward_func/mean": 0.11769823729991913, + "rewards/bleu_reward_func/std": 0.12788043916225433, + "step": 582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 120.15625, + "completions/mean_terminated_length": 120.15625, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.4664, + "grad_norm": 11.640869140625, + "kl": 0.19036865234375, + "learning_rate": 1e-06, + "loss": 0.1164, + "num_tokens": 7698172.0, + "reward": 0.052216824144124985, + "reward_std": 0.015741443261504173, + "rewards/bleu_reward_func/mean": 0.052216824144124985, + "rewards/bleu_reward_func/std": 0.01899011991918087, + "step": 583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 236.46875, + "completions/mean_terminated_length": 144.625, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.4672, + "grad_norm": 7.708470821380615, + "kl": 0.25323486328125, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 7711683.0, + "reward": 0.20987500250339508, + "reward_std": 0.050422437489032745, + "rewards/bleu_reward_func/mean": 0.20987500250339508, + "rewards/bleu_reward_func/std": 0.21432380378246307, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 255.09375, + "completions/mean_terminated_length": 120.52381134033203, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.468, + "grad_norm": 8.178823471069336, + "kl": 0.181243896484375, + "learning_rate": 1e-06, + "loss": 0.0026, + "num_tokens": 7721638.0, + "reward": 0.11023026704788208, + "reward_std": 0.03732236102223396, + "rewards/bleu_reward_func/mean": 0.11023026704788208, + "rewards/bleu_reward_func/std": 0.06018221378326416, + "step": 585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 209.40625, + "completions/mean_terminated_length": 124.68000030517578, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.4688, + "grad_norm": 6.163815498352051, + "kl": 0.057861328125, + "learning_rate": 1e-06, + "loss": -0.382, + "num_tokens": 7731483.0, + "reward": 0.022579330950975418, + "reward_std": 0.024172717705368996, + "rewards/bleu_reward_func/mean": 0.022579330950975418, + "rewards/bleu_reward_func/std": 0.03154170513153076, + "step": 586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 179.28125, + "completions/mean_terminated_length": 131.75, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.4696, + "grad_norm": 24.94843864440918, + "kl": 0.303466796875, + "learning_rate": 1e-06, + "loss": -0.149, + "num_tokens": 7740828.0, + "reward": 0.062010329216718674, + "reward_std": 0.030193448066711426, + "rewards/bleu_reward_func/mean": 0.062010329216718674, + "rewards/bleu_reward_func/std": 0.04090145602822304, + "step": 587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 372.0625, + "completions/mean_terminated_length": 276.3157958984375, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.4704, + "grad_norm": 2.5675299167633057, + "kl": 0.03485107421875, + "learning_rate": 1e-06, + "loss": 0.0333, + "num_tokens": 7757862.0, + "reward": 0.037547022104263306, + "reward_std": 0.01179808471351862, + "rewards/bleu_reward_func/mean": 0.037547022104263306, + "rewards/bleu_reward_func/std": 0.03366583213210106, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 268.875, + "completions/mean_terminated_length": 223.8518524169922, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.4712, + "grad_norm": 9.238602638244629, + "kl": 0.09326171875, + "learning_rate": 1e-06, + "loss": 0.0178, + "num_tokens": 7769098.0, + "reward": 0.0967094898223877, + "reward_std": 0.041084855794906616, + "rewards/bleu_reward_func/mean": 0.0967094898223877, + "rewards/bleu_reward_func/std": 0.10235904902219772, + "step": 589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 271.71875, + "completions/mean_terminated_length": 107.31578826904297, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.472, + "grad_norm": 12.115531921386719, + "kl": 0.196563720703125, + "learning_rate": 1e-06, + "loss": 0.3242, + "num_tokens": 7782097.0, + "reward": 0.19325336813926697, + "reward_std": 0.0921676903963089, + "rewards/bleu_reward_func/mean": 0.19325336813926697, + "rewards/bleu_reward_func/std": 0.24508582055568695, + "step": 590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 197.59375, + "completions/mean_terminated_length": 176.6333465576172, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.4728, + "grad_norm": 9.336063385009766, + "kl": 0.17633056640625, + "learning_rate": 1e-06, + "loss": 0.0424, + "num_tokens": 7793588.0, + "reward": 0.14900264143943787, + "reward_std": 0.06498396396636963, + "rewards/bleu_reward_func/mean": 0.14900264143943787, + "rewards/bleu_reward_func/std": 0.13959822058677673, + "step": 591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 213.96875, + "completions/mean_terminated_length": 145.1923065185547, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.4736, + "grad_norm": 40.333492279052734, + "kl": 0.1762237548828125, + "learning_rate": 1e-06, + "loss": 0.034, + "num_tokens": 7809987.0, + "reward": 0.11991982161998749, + "reward_std": 0.024838652461767197, + "rewards/bleu_reward_func/mean": 0.11991982161998749, + "rewards/bleu_reward_func/std": 0.13350419700145721, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 349.46875, + "completions/mean_terminated_length": 295.29168701171875, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.4744, + "grad_norm": 6.914525032043457, + "kl": 0.02703857421875, + "learning_rate": 1e-06, + "loss": 0.1437, + "num_tokens": 7826610.0, + "reward": 0.02816709131002426, + "reward_std": 0.015584287233650684, + "rewards/bleu_reward_func/mean": 0.02816709131002426, + "rewards/bleu_reward_func/std": 0.027631772682070732, + "step": 593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 189.28125, + "completions/mean_terminated_length": 178.87095642089844, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.4752, + "grad_norm": 26.64930534362793, + "kl": 0.619964599609375, + "learning_rate": 1e-06, + "loss": 0.0753, + "num_tokens": 7834363.0, + "reward": 0.1504502296447754, + "reward_std": 0.061798207461833954, + "rewards/bleu_reward_func/mean": 0.1504502296447754, + "rewards/bleu_reward_func/std": 0.1269664466381073, + "step": 594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 317.125, + "completions/mean_terminated_length": 200.1999969482422, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.476, + "grad_norm": 8.9805908203125, + "kl": 0.294097900390625, + "learning_rate": 1e-06, + "loss": -0.0704, + "num_tokens": 7849831.0, + "reward": 0.13049980998039246, + "reward_std": 0.02749776840209961, + "rewards/bleu_reward_func/mean": 0.13049980998039246, + "rewards/bleu_reward_func/std": 0.109443299472332, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 272.125, + "completions/mean_terminated_length": 192.1666717529297, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.4768, + "grad_norm": 4.8097429275512695, + "kl": 0.08526611328125, + "learning_rate": 1e-06, + "loss": -0.1088, + "num_tokens": 7861347.0, + "reward": 0.0661308616399765, + "reward_std": 0.02051004208624363, + "rewards/bleu_reward_func/mean": 0.0661308616399765, + "rewards/bleu_reward_func/std": 0.05708196386694908, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 206.125, + "completions/mean_terminated_length": 120.47999572753906, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.4776, + "grad_norm": 7.351487636566162, + "kl": 0.334259033203125, + "learning_rate": 1e-06, + "loss": 0.0519, + "num_tokens": 7871295.0, + "reward": 0.16019710898399353, + "reward_std": 0.03656643629074097, + "rewards/bleu_reward_func/mean": 0.16019710898399353, + "rewards/bleu_reward_func/std": 0.19289268553256989, + "step": 597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 349.3125, + "completions/mean_terminated_length": 186.625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4784, + "grad_norm": 4.930379867553711, + "kl": 0.181121826171875, + "learning_rate": 1e-06, + "loss": -0.0202, + "num_tokens": 7887577.0, + "reward": 0.1285662055015564, + "reward_std": 0.03015293926000595, + "rewards/bleu_reward_func/mean": 0.1285662055015564, + "rewards/bleu_reward_func/std": 0.08600351959466934, + "step": 598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 175.875, + "completions/mean_terminated_length": 153.4666748046875, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.4792, + "grad_norm": 15.110285758972168, + "kl": 0.34112548828125, + "learning_rate": 1e-06, + "loss": -0.1173, + "num_tokens": 7898453.0, + "reward": 0.09940430521965027, + "reward_std": 0.046547506004571915, + "rewards/bleu_reward_func/mean": 0.09940430521965027, + "rewards/bleu_reward_func/std": 0.05020095780491829, + "step": 599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 334.03125, + "completions/mean_terminated_length": 315.6206970214844, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.48, + "grad_norm": 3.0496630668640137, + "kl": 0.03216552734375, + "learning_rate": 1e-06, + "loss": -0.1382, + "num_tokens": 7911190.0, + "reward": 0.08442967385053635, + "reward_std": 0.027117565274238586, + "rewards/bleu_reward_func/mean": 0.08442967385053635, + "rewards/bleu_reward_func/std": 0.07272256910800934, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 132.125, + "completions/mean_terminated_length": 119.87096405029297, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.4808, + "grad_norm": 9.77161693572998, + "kl": 0.3096923828125, + "learning_rate": 1e-06, + "loss": 0.0712, + "num_tokens": 7923114.0, + "reward": 0.19400227069854736, + "reward_std": 0.08562377095222473, + "rewards/bleu_reward_func/mean": 0.19400227069854736, + "rewards/bleu_reward_func/std": 0.18403199315071106, + "step": 601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 270.53125, + "completions/mean_terminated_length": 105.31578826904297, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.4816, + "grad_norm": 5.418551445007324, + "kl": 0.132049560546875, + "learning_rate": 1e-06, + "loss": 0.057, + "num_tokens": 7937747.0, + "reward": 0.15049128234386444, + "reward_std": 0.024429049342870712, + "rewards/bleu_reward_func/mean": 0.15049128234386444, + "rewards/bleu_reward_func/std": 0.1750853955745697, + "step": 602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 295.46875, + "completions/mean_terminated_length": 104.4117660522461, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4824, + "grad_norm": 4.260025501251221, + "kl": 0.074981689453125, + "learning_rate": 1e-06, + "loss": 0.0337, + "num_tokens": 7951266.0, + "reward": 0.0902150496840477, + "reward_std": 0.0313844196498394, + "rewards/bleu_reward_func/mean": 0.0902150496840477, + "rewards/bleu_reward_func/std": 0.09559616446495056, + "step": 603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 359.625, + "completions/mean_terminated_length": 255.36842346191406, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.4832, + "grad_norm": 3.981938362121582, + "kl": 0.05792236328125, + "learning_rate": 1e-06, + "loss": 0.068, + "num_tokens": 7967510.0, + "reward": 0.15250109136104584, + "reward_std": 0.050509147346019745, + "rewards/bleu_reward_func/mean": 0.15250109136104584, + "rewards/bleu_reward_func/std": 0.2119276374578476, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 243.625, + "completions/mean_terminated_length": 205.2857208251953, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.484, + "grad_norm": 4.506271839141846, + "kl": 0.133819580078125, + "learning_rate": 1e-06, + "loss": 0.0768, + "num_tokens": 7979194.0, + "reward": 0.049993276596069336, + "reward_std": 0.01375819742679596, + "rewards/bleu_reward_func/mean": 0.049993276596069336, + "rewards/bleu_reward_func/std": 0.019665135070681572, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 184.90625, + "completions/mean_terminated_length": 124.33333587646484, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.4848, + "grad_norm": 13.16498851776123, + "kl": 0.163360595703125, + "learning_rate": 1e-06, + "loss": 0.2145, + "num_tokens": 7992215.0, + "reward": 0.16849525272846222, + "reward_std": 0.041973263025283813, + "rewards/bleu_reward_func/mean": 0.16849525272846222, + "rewards/bleu_reward_func/std": 0.11670318245887756, + "step": 606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 278.3125, + "completions/mean_terminated_length": 278.3125, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.4856, + "grad_norm": 9.305381774902344, + "kl": 0.1671142578125, + "learning_rate": 1e-06, + "loss": 0.0542, + "num_tokens": 8003561.0, + "reward": 0.14806249737739563, + "reward_std": 0.04475884884595871, + "rewards/bleu_reward_func/mean": 0.14806249737739563, + "rewards/bleu_reward_func/std": 0.10317616909742355, + "step": 607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 392.3125, + "completions/mean_terminated_length": 320.5, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.4864, + "grad_norm": 8.050078392028809, + "kl": 0.0286865234375, + "learning_rate": 1e-06, + "loss": -0.0652, + "num_tokens": 8020771.0, + "reward": 0.09127211570739746, + "reward_std": 0.02751500904560089, + "rewards/bleu_reward_func/mean": 0.09127211570739746, + "rewards/bleu_reward_func/std": 0.04517889395356178, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 293.0, + "completions/mean_terminated_length": 285.93548583984375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.4872, + "grad_norm": 9.551247596740723, + "kl": 0.05633544921875, + "learning_rate": 1e-06, + "loss": 0.0629, + "num_tokens": 8033227.0, + "reward": 0.07062816619873047, + "reward_std": 0.032938919961452484, + "rewards/bleu_reward_func/mean": 0.07062816619873047, + "rewards/bleu_reward_func/std": 0.05320809781551361, + "step": 609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 246.25, + "completions/mean_terminated_length": 184.92308044433594, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.488, + "grad_norm": 28.349098205566406, + "kl": 0.06927490234375, + "learning_rate": 1e-06, + "loss": 0.1921, + "num_tokens": 8048579.0, + "reward": 0.12640823423862457, + "reward_std": 0.028129609301686287, + "rewards/bleu_reward_func/mean": 0.12640823423862457, + "rewards/bleu_reward_func/std": 0.12890547513961792, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 334.28125, + "completions/mean_terminated_length": 241.1904754638672, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.4888, + "grad_norm": 6.840433597564697, + "kl": 0.054046630859375, + "learning_rate": 1e-06, + "loss": 0.1175, + "num_tokens": 8063660.0, + "reward": 0.06585729867219925, + "reward_std": 0.016829343512654305, + "rewards/bleu_reward_func/mean": 0.06585729867219925, + "rewards/bleu_reward_func/std": 0.027104271575808525, + "step": 611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 469.4375, + "completions/mean_terminated_length": 443.8999938964844, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.4896, + "grad_norm": 2.304220199584961, + "kl": 0.029449462890625, + "learning_rate": 1e-06, + "loss": -0.0018, + "num_tokens": 8081922.0, + "reward": 0.029903851449489594, + "reward_std": 0.007851570844650269, + "rewards/bleu_reward_func/mean": 0.029903851449489594, + "rewards/bleu_reward_func/std": 0.017835307866334915, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 197.75, + "completions/mean_terminated_length": 187.61289978027344, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.4904, + "grad_norm": 5.66249418258667, + "kl": 0.0855712890625, + "learning_rate": 1e-06, + "loss": 0.1286, + "num_tokens": 8091010.0, + "reward": 0.2724965810775757, + "reward_std": 0.06183997541666031, + "rewards/bleu_reward_func/mean": 0.2724965810775757, + "rewards/bleu_reward_func/std": 0.2708708643913269, + "step": 613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 328.71875, + "completions/mean_terminated_length": 186.1666717529297, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.4912, + "grad_norm": 3.228691816329956, + "kl": 0.0655670166015625, + "learning_rate": 1e-06, + "loss": 0.0338, + "num_tokens": 8105809.0, + "reward": 0.12389599531888962, + "reward_std": 0.07396578788757324, + "rewards/bleu_reward_func/mean": 0.12389599531888962, + "rewards/bleu_reward_func/std": 0.18483103811740875, + "step": 614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 317.59375, + "completions/mean_terminated_length": 215.76190185546875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.492, + "grad_norm": 3.5793278217315674, + "kl": 0.044097900390625, + "learning_rate": 1e-06, + "loss": 0.0314, + "num_tokens": 8118620.0, + "reward": 0.08205534517765045, + "reward_std": 0.032849013805389404, + "rewards/bleu_reward_func/mean": 0.08205534517765045, + "rewards/bleu_reward_func/std": 0.05394502356648445, + "step": 615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 180.9375, + "completions/mean_terminated_length": 180.9375, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.4928, + "grad_norm": 7.083518028259277, + "kl": 0.40167236328125, + "learning_rate": 1e-06, + "loss": -0.1081, + "num_tokens": 8128922.0, + "reward": 0.12701216340065002, + "reward_std": 0.03847620263695717, + "rewards/bleu_reward_func/mean": 0.12701216340065002, + "rewards/bleu_reward_func/std": 0.08405326306819916, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 214.25, + "completions/mean_terminated_length": 145.53846740722656, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.4936, + "grad_norm": 6.532368183135986, + "kl": 0.239410400390625, + "learning_rate": 1e-06, + "loss": -0.0347, + "num_tokens": 8143162.0, + "reward": 0.11757355183362961, + "reward_std": 0.02820819616317749, + "rewards/bleu_reward_func/mean": 0.11757355183362961, + "rewards/bleu_reward_func/std": 0.10728771984577179, + "step": 617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 364.375, + "completions/mean_terminated_length": 297.2727355957031, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.4944, + "grad_norm": 2.549912929534912, + "kl": 0.044189453125, + "learning_rate": 1e-06, + "loss": 0.0332, + "num_tokens": 8158198.0, + "reward": 0.04174516722559929, + "reward_std": 0.011650302447378635, + "rewards/bleu_reward_func/mean": 0.04174516722559929, + "rewards/bleu_reward_func/std": 0.03221089020371437, + "step": 618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 311.84375, + "completions/mean_terminated_length": 207.0, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.4952, + "grad_norm": 3.869034767150879, + "kl": 0.05462646484375, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 8171945.0, + "reward": 0.028928130865097046, + "reward_std": 0.012434298172593117, + "rewards/bleu_reward_func/mean": 0.028928130865097046, + "rewards/bleu_reward_func/std": 0.025789210572838783, + "step": 619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 243.90625, + "completions/mean_terminated_length": 205.60714721679688, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.496, + "grad_norm": 6.46402645111084, + "kl": 0.304168701171875, + "learning_rate": 1e-06, + "loss": 0.0145, + "num_tokens": 8181710.0, + "reward": 0.13130733370780945, + "reward_std": 0.018212325870990753, + "rewards/bleu_reward_func/mean": 0.13130733370780945, + "rewards/bleu_reward_func/std": 0.09850703179836273, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 242.0, + "completions/mean_terminated_length": 203.42857360839844, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.4968, + "grad_norm": 5.473258018493652, + "kl": 0.16949462890625, + "learning_rate": 1e-06, + "loss": -0.0354, + "num_tokens": 8193118.0, + "reward": 0.2502046227455139, + "reward_std": 0.03522457554936409, + "rewards/bleu_reward_func/mean": 0.2502046227455139, + "rewards/bleu_reward_func/std": 0.2565787732601166, + "step": 621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 236.71875, + "completions/mean_terminated_length": 144.95834350585938, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.4976, + "grad_norm": 30.692569732666016, + "kl": 0.29986572265625, + "learning_rate": 1e-06, + "loss": -0.0477, + "num_tokens": 8207549.0, + "reward": 0.22225125133991241, + "reward_std": 0.033524345606565475, + "rewards/bleu_reward_func/mean": 0.22225125133991241, + "rewards/bleu_reward_func/std": 0.19432197511196136, + "step": 622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 120.96875, + "completions/mean_terminated_length": 80.51724243164062, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.4984, + "grad_norm": 7.908195495605469, + "kl": 0.380523681640625, + "learning_rate": 1e-06, + "loss": -0.0756, + "num_tokens": 8218324.0, + "reward": 0.23351526260375977, + "reward_std": 0.05452558770775795, + "rewards/bleu_reward_func/mean": 0.23351526260375977, + "rewards/bleu_reward_func/std": 0.1365489512681961, + "step": 623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 172.5625, + "completions/mean_terminated_length": 137.44827270507812, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.4992, + "grad_norm": 8.18444538116455, + "kl": 0.296051025390625, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 8228742.0, + "reward": 0.14677512645721436, + "reward_std": 0.04820986092090607, + "rewards/bleu_reward_func/mean": 0.14677512645721436, + "rewards/bleu_reward_func/std": 0.15982075035572052, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 333.65625, + "completions/mean_terminated_length": 263.86956787109375, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.5, + "grad_norm": 15.107973098754883, + "kl": 0.160400390625, + "learning_rate": 1e-06, + "loss": 0.0015, + "num_tokens": 8245059.0, + "reward": 0.13298457860946655, + "reward_std": 0.018914809450507164, + "rewards/bleu_reward_func/mean": 0.13298457860946655, + "rewards/bleu_reward_func/std": 0.07686522603034973, + "step": 625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 299.53125, + "completions/mean_terminated_length": 260.1851806640625, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.5008, + "grad_norm": 131.54771423339844, + "kl": 0.102081298828125, + "learning_rate": 1e-06, + "loss": 0.1782, + "num_tokens": 8256620.0, + "reward": 0.1039574146270752, + "reward_std": 0.03130800276994705, + "rewards/bleu_reward_func/mean": 0.1039574146270752, + "rewards/bleu_reward_func/std": 0.05177094042301178, + "step": 626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 202.15625, + "completions/mean_terminated_length": 181.50001525878906, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.5016, + "grad_norm": 8.109158515930176, + "kl": 0.213104248046875, + "learning_rate": 1e-06, + "loss": -0.0456, + "num_tokens": 8266585.0, + "reward": 0.23561137914657593, + "reward_std": 0.03910698741674423, + "rewards/bleu_reward_func/mean": 0.23561137914657593, + "rewards/bleu_reward_func/std": 0.1352909654378891, + "step": 627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 259.5625, + "completions/mean_terminated_length": 144.8181915283203, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.5024, + "grad_norm": 5.162299633026123, + "kl": 0.147857666015625, + "learning_rate": 1e-06, + "loss": -0.0329, + "num_tokens": 8278827.0, + "reward": 0.1635468751192093, + "reward_std": 0.04077983647584915, + "rewards/bleu_reward_func/mean": 0.1635468751192093, + "rewards/bleu_reward_func/std": 0.1520238220691681, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 354.84375, + "completions/mean_terminated_length": 302.4583435058594, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.5032, + "grad_norm": 3.0983669757843018, + "kl": 0.0435791015625, + "learning_rate": 1e-06, + "loss": 0.0095, + "num_tokens": 8293502.0, + "reward": 0.05737052857875824, + "reward_std": 0.021961018443107605, + "rewards/bleu_reward_func/mean": 0.05737052857875824, + "rewards/bleu_reward_func/std": 0.03505769371986389, + "step": 629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 216.25, + "completions/mean_terminated_length": 185.65516662597656, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.504, + "grad_norm": 6.821644306182861, + "kl": 0.0950927734375, + "learning_rate": 1e-06, + "loss": -0.0482, + "num_tokens": 8302182.0, + "reward": 0.07243853062391281, + "reward_std": 0.06683069467544556, + "rewards/bleu_reward_func/mean": 0.07243853062391281, + "rewards/bleu_reward_func/std": 0.10312769562005997, + "step": 630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 352.34375, + "completions/mean_terminated_length": 329.5357360839844, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.5048, + "grad_norm": 3.5717883110046387, + "kl": 0.036041259765625, + "learning_rate": 1e-06, + "loss": 0.1298, + "num_tokens": 8318753.0, + "reward": 0.026654381304979324, + "reward_std": 0.024883870035409927, + "rewards/bleu_reward_func/mean": 0.026654381304979324, + "rewards/bleu_reward_func/std": 0.03104417398571968, + "step": 631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 183.09375, + "completions/mean_terminated_length": 183.09375, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.5056, + "grad_norm": 6.945363998413086, + "kl": 0.234039306640625, + "learning_rate": 1e-06, + "loss": 0.0696, + "num_tokens": 8329596.0, + "reward": 0.18802031874656677, + "reward_std": 0.06351514160633087, + "rewards/bleu_reward_func/mean": 0.18802031874656677, + "rewards/bleu_reward_func/std": 0.16961929202079773, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 157.9375, + "completions/mean_terminated_length": 134.33334350585938, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.5064, + "grad_norm": 99.53560638427734, + "kl": 0.190399169921875, + "learning_rate": 1e-06, + "loss": 0.0621, + "num_tokens": 8343090.0, + "reward": 0.06075248867273331, + "reward_std": 0.018952492624521255, + "rewards/bleu_reward_func/mean": 0.06075248867273331, + "rewards/bleu_reward_func/std": 0.057455144822597504, + "step": 633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 291.71875, + "completions/mean_terminated_length": 268.9310302734375, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.5072, + "grad_norm": 6.928218364715576, + "kl": 0.155853271484375, + "learning_rate": 1e-06, + "loss": 0.0112, + "num_tokens": 8354305.0, + "reward": 0.15043729543685913, + "reward_std": 0.04871266707777977, + "rewards/bleu_reward_func/mean": 0.15043729543685913, + "rewards/bleu_reward_func/std": 0.17611344158649445, + "step": 634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 198.5625, + "completions/mean_terminated_length": 177.6666717529297, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.508, + "grad_norm": 4.697340488433838, + "kl": 0.094085693359375, + "learning_rate": 1e-06, + "loss": 0.0737, + "num_tokens": 8364091.0, + "reward": 0.06601180136203766, + "reward_std": 0.02227596938610077, + "rewards/bleu_reward_func/mean": 0.06601180136203766, + "rewards/bleu_reward_func/std": 0.043257758021354675, + "step": 635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 410.78125, + "completions/mean_terminated_length": 332.0555725097656, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.5088, + "grad_norm": 77.46025085449219, + "kl": 0.05377197265625, + "learning_rate": 1e-06, + "loss": -0.1447, + "num_tokens": 8381724.0, + "reward": 0.03671726584434509, + "reward_std": 0.015178699977695942, + "rewards/bleu_reward_func/mean": 0.03671726584434509, + "rewards/bleu_reward_func/std": 0.03225603699684143, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 333.25, + "completions/mean_terminated_length": 154.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.5096, + "grad_norm": 6.136791706085205, + "kl": 0.124664306640625, + "learning_rate": 1e-06, + "loss": -0.0018, + "num_tokens": 8397476.0, + "reward": 0.16264644265174866, + "reward_std": 0.03679278865456581, + "rewards/bleu_reward_func/mean": 0.16264644265174866, + "rewards/bleu_reward_func/std": 0.16195148229599, + "step": 637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 221.125, + "completions/mean_terminated_length": 179.57144165039062, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.5104, + "grad_norm": 4.400974750518799, + "kl": 0.058013916015625, + "learning_rate": 1e-06, + "loss": -0.0244, + "num_tokens": 8406544.0, + "reward": 0.059636689722537994, + "reward_std": 0.024229735136032104, + "rewards/bleu_reward_func/mean": 0.059636689722537994, + "rewards/bleu_reward_func/std": 0.04718983918428421, + "step": 638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 287.09375, + "completions/mean_terminated_length": 235.19232177734375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.5112, + "grad_norm": 10.916999816894531, + "kl": 0.118011474609375, + "learning_rate": 1e-06, + "loss": 0.0766, + "num_tokens": 8422323.0, + "reward": 0.10468322038650513, + "reward_std": 0.021623361855745316, + "rewards/bleu_reward_func/mean": 0.10468322038650513, + "rewards/bleu_reward_func/std": 0.08197237551212311, + "step": 639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 336.28125, + "completions/mean_terminated_length": 311.1785888671875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.512, + "grad_norm": 2.1528663635253906, + "kl": 0.033172607421875, + "learning_rate": 1e-06, + "loss": -0.0722, + "num_tokens": 8437100.0, + "reward": 0.10049737989902496, + "reward_std": 0.03208357095718384, + "rewards/bleu_reward_func/mean": 0.10049737989902496, + "rewards/bleu_reward_func/std": 0.0739847868680954, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 361.375, + "completions/mean_terminated_length": 190.6666717529297, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5128, + "grad_norm": 4.406239986419678, + "kl": 0.163421630859375, + "learning_rate": 1e-06, + "loss": 0.0712, + "num_tokens": 8450184.0, + "reward": 0.06270510703325272, + "reward_std": 0.01890096440911293, + "rewards/bleu_reward_func/mean": 0.06270510703325272, + "rewards/bleu_reward_func/std": 0.04299367591738701, + "step": 641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 344.75, + "completions/mean_terminated_length": 268.727294921875, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.5136, + "grad_norm": 3.8396599292755127, + "kl": 0.21343994140625, + "learning_rate": 1e-06, + "loss": -0.0071, + "num_tokens": 8464952.0, + "reward": 0.12920798361301422, + "reward_std": 0.025508491322398186, + "rewards/bleu_reward_func/mean": 0.12920798361301422, + "rewards/bleu_reward_func/std": 0.11343086510896683, + "step": 642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 243.0625, + "completions/mean_terminated_length": 204.6428680419922, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.5144, + "grad_norm": 9.709904670715332, + "kl": 0.127349853515625, + "learning_rate": 1e-06, + "loss": -0.0569, + "num_tokens": 8481010.0, + "reward": 0.08636625856161118, + "reward_std": 0.023468628525733948, + "rewards/bleu_reward_func/mean": 0.08636625856161118, + "rewards/bleu_reward_func/std": 0.11172276735305786, + "step": 643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 319.8125, + "completions/mean_terminated_length": 150.23529052734375, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.5152, + "grad_norm": 4.022066116333008, + "kl": 0.095428466796875, + "learning_rate": 1e-06, + "loss": 0.0192, + "num_tokens": 8495668.0, + "reward": 0.13581258058547974, + "reward_std": 0.049042053520679474, + "rewards/bleu_reward_func/mean": 0.13581258058547974, + "rewards/bleu_reward_func/std": 0.10865607112646103, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 200.40625, + "completions/mean_terminated_length": 113.15999603271484, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.516, + "grad_norm": 14.371395111083984, + "kl": 0.188232421875, + "learning_rate": 1e-06, + "loss": 0.1384, + "num_tokens": 8510193.0, + "reward": 0.16988492012023926, + "reward_std": 0.02835988998413086, + "rewards/bleu_reward_func/mean": 0.16988492012023926, + "rewards/bleu_reward_func/std": 0.22432467341423035, + "step": 645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 350.625, + "completions/mean_terminated_length": 189.25, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.5168, + "grad_norm": 3.603522539138794, + "kl": 0.106658935546875, + "learning_rate": 1e-06, + "loss": 0.0683, + "num_tokens": 8527613.0, + "reward": 0.04950277507305145, + "reward_std": 0.02557562291622162, + "rewards/bleu_reward_func/mean": 0.04950277507305145, + "rewards/bleu_reward_func/std": 0.036064986139535904, + "step": 646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 240.71875, + "completions/mean_terminated_length": 190.48147583007812, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.5176, + "grad_norm": 2.1311533451080322, + "kl": 0.040771484375, + "learning_rate": 1e-06, + "loss": -0.0675, + "num_tokens": 8540012.0, + "reward": 0.4242175817489624, + "reward_std": 0.05443207919597626, + "rewards/bleu_reward_func/mean": 0.4242175817489624, + "rewards/bleu_reward_func/std": 0.3835957646369934, + "step": 647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 162.78125, + "completions/mean_terminated_length": 82.19231414794922, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.5184, + "grad_norm": 7.217045783996582, + "kl": 0.238250732421875, + "learning_rate": 1e-06, + "loss": -0.2492, + "num_tokens": 8550285.0, + "reward": 0.15483121573925018, + "reward_std": 0.04074571654200554, + "rewards/bleu_reward_func/mean": 0.15483121573925018, + "rewards/bleu_reward_func/std": 0.1628112941980362, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 226.03125, + "completions/mean_terminated_length": 196.44827270507812, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.5192, + "grad_norm": 9.426239013671875, + "kl": 0.326446533203125, + "learning_rate": 1e-06, + "loss": -0.0078, + "num_tokens": 8562006.0, + "reward": 0.22631683945655823, + "reward_std": 0.046764910221099854, + "rewards/bleu_reward_func/mean": 0.22631683945655823, + "rewards/bleu_reward_func/std": 0.24870522320270538, + "step": 649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 408.4375, + "completions/mean_terminated_length": 373.91668701171875, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.52, + "grad_norm": 2.5924437046051025, + "kl": 0.03192138671875, + "learning_rate": 1e-06, + "loss": -0.0517, + "num_tokens": 8577388.0, + "reward": 0.050332337617874146, + "reward_std": 0.013445645570755005, + "rewards/bleu_reward_func/mean": 0.050332337617874146, + "rewards/bleu_reward_func/std": 0.04263650253415108, + "step": 650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 323.65625, + "completions/mean_terminated_length": 296.75, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.5208, + "grad_norm": 6.97017765045166, + "kl": 0.062255859375, + "learning_rate": 1e-06, + "loss": 0.075, + "num_tokens": 8589889.0, + "reward": 0.0671861320734024, + "reward_std": 0.020000552758574486, + "rewards/bleu_reward_func/mean": 0.0671861320734024, + "rewards/bleu_reward_func/std": 0.027637863531708717, + "step": 651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 202.6875, + "completions/mean_terminated_length": 170.6896514892578, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.5216, + "grad_norm": 5.9939727783203125, + "kl": 0.30169677734375, + "learning_rate": 1e-06, + "loss": 0.0126, + "num_tokens": 8603471.0, + "reward": 0.23086336255073547, + "reward_std": 0.03887036070227623, + "rewards/bleu_reward_func/mean": 0.23086336255073547, + "rewards/bleu_reward_func/std": 0.1954699456691742, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 334.09375, + "completions/mean_terminated_length": 264.478271484375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.5224, + "grad_norm": 10.721747398376465, + "kl": 0.11480712890625, + "learning_rate": 1e-06, + "loss": -0.0181, + "num_tokens": 8618802.0, + "reward": 0.12768197059631348, + "reward_std": 0.018044453114271164, + "rewards/bleu_reward_func/mean": 0.12768197059631348, + "rewards/bleu_reward_func/std": 0.18208470940589905, + "step": 653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 254.28125, + "completions/mean_terminated_length": 182.1199951171875, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.5232, + "grad_norm": 21.832870483398438, + "kl": 0.224090576171875, + "learning_rate": 1e-06, + "loss": 0.0275, + "num_tokens": 8633395.0, + "reward": 0.23750805854797363, + "reward_std": 0.10584703087806702, + "rewards/bleu_reward_func/mean": 0.23750805854797363, + "rewards/bleu_reward_func/std": 0.24472850561141968, + "step": 654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 293.375, + "completions/mean_terminated_length": 207.8260955810547, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.524, + "grad_norm": 14.025049209594727, + "kl": 0.1319580078125, + "learning_rate": 1e-06, + "loss": 0.2269, + "num_tokens": 8647583.0, + "reward": 0.08922699838876724, + "reward_std": 0.022407300770282745, + "rewards/bleu_reward_func/mean": 0.08922699838876724, + "rewards/bleu_reward_func/std": 0.05691966786980629, + "step": 655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 417.34375, + "completions/mean_terminated_length": 374.3182067871094, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.5248, + "grad_norm": 2.506775379180908, + "kl": 0.030517578125, + "learning_rate": 1e-06, + "loss": -0.1182, + "num_tokens": 8664458.0, + "reward": 0.056899260729551315, + "reward_std": 0.024433575570583344, + "rewards/bleu_reward_func/mean": 0.056899260729551315, + "rewards/bleu_reward_func/std": 0.043169718235731125, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 251.78125, + "completions/mean_terminated_length": 214.60714721679688, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.5256, + "grad_norm": 7.267916202545166, + "kl": 0.123870849609375, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 8673803.0, + "reward": 0.10382385551929474, + "reward_std": 0.051886267960071564, + "rewards/bleu_reward_func/mean": 0.10382385551929474, + "rewards/bleu_reward_func/std": 0.06761174649000168, + "step": 657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 227.34375, + "completions/mean_terminated_length": 132.45834350585938, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.5264, + "grad_norm": 5.546152114868164, + "kl": 0.098236083984375, + "learning_rate": 1e-06, + "loss": -0.0623, + "num_tokens": 8685270.0, + "reward": 0.1565043181180954, + "reward_std": 0.08428065478801727, + "rewards/bleu_reward_func/mean": 0.1565043181180954, + "rewards/bleu_reward_func/std": 0.17227834463119507, + "step": 658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 152.96875, + "completions/mean_terminated_length": 86.48148345947266, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.5272, + "grad_norm": 14.263039588928223, + "kl": 0.30914306640625, + "learning_rate": 1e-06, + "loss": 0.0075, + "num_tokens": 8696005.0, + "reward": 0.24965888261795044, + "reward_std": 0.051375266164541245, + "rewards/bleu_reward_func/mean": 0.24965888261795044, + "rewards/bleu_reward_func/std": 0.21870571374893188, + "step": 659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 165.0, + "completions/max_terminated_length": 165.0, + "completions/mean_length": 32.5625, + "completions/mean_terminated_length": 32.5625, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.528, + "grad_norm": 442.6045837402344, + "kl": 0.5015869140625, + "learning_rate": 1e-06, + "loss": 0.0936, + "num_tokens": 8704767.0, + "reward": 0.13235034048557281, + "reward_std": 0.07672514766454697, + "rewards/bleu_reward_func/mean": 0.13235034048557281, + "rewards/bleu_reward_func/std": 0.13803941011428833, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 252.4375, + "completions/mean_terminated_length": 74.84210205078125, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.5288, + "grad_norm": 5.065097332000732, + "kl": 0.18255615234375, + "learning_rate": 1e-06, + "loss": 0.0464, + "num_tokens": 8721005.0, + "reward": 0.2883501648902893, + "reward_std": 0.022871889173984528, + "rewards/bleu_reward_func/mean": 0.2883501648902893, + "rewards/bleu_reward_func/std": 0.23920658230781555, + "step": 661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 178.625, + "completions/mean_terminated_length": 144.13792419433594, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.5296, + "grad_norm": 4.1915202140808105, + "kl": 0.152069091796875, + "learning_rate": 1e-06, + "loss": 0.1483, + "num_tokens": 8729545.0, + "reward": 0.09845434874296188, + "reward_std": 0.049190133810043335, + "rewards/bleu_reward_func/mean": 0.09845434874296188, + "rewards/bleu_reward_func/std": 0.06372099369764328, + "step": 662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 293.9375, + "completions/mean_terminated_length": 279.4000244140625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.5304, + "grad_norm": 10.675251007080078, + "kl": 0.0498046875, + "learning_rate": 1e-06, + "loss": 0.1798, + "num_tokens": 8741895.0, + "reward": 0.09067553281784058, + "reward_std": 0.036186374723911285, + "rewards/bleu_reward_func/mean": 0.09067553281784058, + "rewards/bleu_reward_func/std": 0.057389046996831894, + "step": 663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 157.75, + "completions/mean_terminated_length": 146.32257080078125, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.5312, + "grad_norm": 42.93239212036133, + "kl": 0.20751953125, + "learning_rate": 1e-06, + "loss": 0.0697, + "num_tokens": 8752255.0, + "reward": 0.27352431416511536, + "reward_std": 0.07531043887138367, + "rewards/bleu_reward_func/mean": 0.27352431416511536, + "rewards/bleu_reward_func/std": 0.13157765567302704, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 285.625, + "completions/mean_terminated_length": 222.239990234375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.532, + "grad_norm": 5.36870813369751, + "kl": 0.075042724609375, + "learning_rate": 1e-06, + "loss": -0.0052, + "num_tokens": 8764259.0, + "reward": 0.024750784039497375, + "reward_std": 0.022341227158904076, + "rewards/bleu_reward_func/mean": 0.024750784039497375, + "rewards/bleu_reward_func/std": 0.03164950758218765, + "step": 665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 231.0, + "completions/mean_terminated_length": 231.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.5328, + "grad_norm": 5.825709342956543, + "kl": 0.2059326171875, + "learning_rate": 1e-06, + "loss": -0.126, + "num_tokens": 8776043.0, + "reward": 0.09888751804828644, + "reward_std": 0.027325943112373352, + "rewards/bleu_reward_func/mean": 0.09888751804828644, + "rewards/bleu_reward_func/std": 0.06260307133197784, + "step": 666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 154.09375, + "completions/mean_terminated_length": 71.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.5336, + "grad_norm": 13.691543579101562, + "kl": 0.3173828125, + "learning_rate": 1e-06, + "loss": -0.0016, + "num_tokens": 8787118.0, + "reward": 0.16891013085842133, + "reward_std": 0.033737391233444214, + "rewards/bleu_reward_func/mean": 0.16891013085842133, + "rewards/bleu_reward_func/std": 0.1783466339111328, + "step": 667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 258.71875, + "completions/mean_terminated_length": 174.2916717529297, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.5344, + "grad_norm": 5.329594612121582, + "kl": 0.25360107421875, + "learning_rate": 1e-06, + "loss": -0.0877, + "num_tokens": 8801157.0, + "reward": 0.071600541472435, + "reward_std": 0.021211300045251846, + "rewards/bleu_reward_func/mean": 0.071600541472435, + "rewards/bleu_reward_func/std": 0.054277434945106506, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 300.71875, + "completions/mean_terminated_length": 136.38888549804688, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.5352, + "grad_norm": 10.509814262390137, + "kl": 0.1150054931640625, + "learning_rate": 1e-06, + "loss": 0.0738, + "num_tokens": 8815244.0, + "reward": 0.09365338832139969, + "reward_std": 0.023113342002034187, + "rewards/bleu_reward_func/mean": 0.09365338832139969, + "rewards/bleu_reward_func/std": 0.0734696164727211, + "step": 669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 253.9375, + "completions/mean_terminated_length": 245.61289978027344, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.536, + "grad_norm": 23.27628517150879, + "kl": 0.17572021484375, + "learning_rate": 1e-06, + "loss": -0.1243, + "num_tokens": 8826930.0, + "reward": 0.10506822168827057, + "reward_std": 0.023658432066440582, + "rewards/bleu_reward_func/mean": 0.10506822168827057, + "rewards/bleu_reward_func/std": 0.06695646047592163, + "step": 670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 447.71875, + "completions/mean_terminated_length": 374.86669921875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.5368, + "grad_norm": 3.2729108333587646, + "kl": 0.028778076171875, + "learning_rate": 1e-06, + "loss": -0.0068, + "num_tokens": 8843889.0, + "reward": 0.025088129565119743, + "reward_std": 0.00651167519390583, + "rewards/bleu_reward_func/mean": 0.025088129565119743, + "rewards/bleu_reward_func/std": 0.02992870658636093, + "step": 671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 220.0, + "completions/mean_terminated_length": 210.5806427001953, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.5376, + "grad_norm": 245.07106018066406, + "kl": 0.28985595703125, + "learning_rate": 1e-06, + "loss": 0.0867, + "num_tokens": 8855401.0, + "reward": 0.09645279496908188, + "reward_std": 0.0731353610754013, + "rewards/bleu_reward_func/mean": 0.09645279496908188, + "rewards/bleu_reward_func/std": 0.09792789071798325, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 257.0625, + "completions/mean_terminated_length": 230.6896514892578, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.5384, + "grad_norm": 4.84860897064209, + "kl": 0.12005615234375, + "learning_rate": 1e-06, + "loss": -0.1854, + "num_tokens": 8867371.0, + "reward": 0.11370459198951721, + "reward_std": 0.06978605687618256, + "rewards/bleu_reward_func/mean": 0.11370459198951721, + "rewards/bleu_reward_func/std": 0.18471869826316833, + "step": 673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 237.78125, + "completions/mean_terminated_length": 161.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.5392, + "grad_norm": 33.883243560791016, + "kl": 0.1602783203125, + "learning_rate": 1e-06, + "loss": -0.0528, + "num_tokens": 8882588.0, + "reward": 0.20929288864135742, + "reward_std": 0.04879160225391388, + "rewards/bleu_reward_func/mean": 0.20929288864135742, + "rewards/bleu_reward_func/std": 0.17186923325061798, + "step": 674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 274.5, + "completions/mean_terminated_length": 240.57144165039062, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.54, + "grad_norm": 6.54965877532959, + "kl": 0.08441162109375, + "learning_rate": 1e-06, + "loss": -0.037, + "num_tokens": 8894508.0, + "reward": 0.04714702442288399, + "reward_std": 0.010213707573711872, + "rewards/bleu_reward_func/mean": 0.04714702442288399, + "rewards/bleu_reward_func/std": 0.04436042159795761, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 183.375, + "completions/mean_terminated_length": 91.36000061035156, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.5408, + "grad_norm": 11.63567066192627, + "kl": 0.16424560546875, + "learning_rate": 1e-06, + "loss": -0.0095, + "num_tokens": 8902704.0, + "reward": 0.18900102376937866, + "reward_std": 0.045921262353658676, + "rewards/bleu_reward_func/mean": 0.18900102376937866, + "rewards/bleu_reward_func/std": 0.25213196873664856, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 181.96875, + "completions/mean_terminated_length": 105.80769348144531, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.5416, + "grad_norm": 180.27212524414062, + "kl": 0.203125, + "learning_rate": 1e-06, + "loss": 0.5786, + "num_tokens": 8917647.0, + "reward": 0.11598189175128937, + "reward_std": 0.0453701987862587, + "rewards/bleu_reward_func/mean": 0.11598189175128937, + "rewards/bleu_reward_func/std": 0.12164945900440216, + "step": 677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 187.59375, + "completions/mean_terminated_length": 79.45833587646484, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5424, + "grad_norm": 13.031902313232422, + "kl": 0.232452392578125, + "learning_rate": 1e-06, + "loss": 0.1064, + "num_tokens": 8927434.0, + "reward": 0.2563853859901428, + "reward_std": 0.021821634843945503, + "rewards/bleu_reward_func/mean": 0.2563853859901428, + "rewards/bleu_reward_func/std": 0.25126466155052185, + "step": 678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 157.53125, + "completions/mean_terminated_length": 133.90000915527344, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.5432, + "grad_norm": 37.64163589477539, + "kl": 0.17242431640625, + "learning_rate": 1e-06, + "loss": -0.1013, + "num_tokens": 8939683.0, + "reward": 0.22671283781528473, + "reward_std": 0.05255472660064697, + "rewards/bleu_reward_func/mean": 0.22671283781528473, + "rewards/bleu_reward_func/std": 0.22751960158348083, + "step": 679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 263.71875, + "completions/mean_terminated_length": 238.03448486328125, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.544, + "grad_norm": 129.8694610595703, + "kl": 0.1982421875, + "learning_rate": 1e-06, + "loss": 0.0067, + "num_tokens": 8949442.0, + "reward": 0.08898493647575378, + "reward_std": 0.0456019788980484, + "rewards/bleu_reward_func/mean": 0.08898493647575378, + "rewards/bleu_reward_func/std": 0.10032162815332413, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 510.1875, + "completions/mean_terminated_length": 483.0, + "completions/min_length": 457.0, + "completions/min_terminated_length": 457.0, + "epoch": 0.5448, + "grad_norm": 16.652488708496094, + "kl": 0.063751220703125, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 8968384.0, + "reward": 0.04759781062602997, + "reward_std": 0.009598957374691963, + "rewards/bleu_reward_func/mean": 0.04759781062602997, + "rewards/bleu_reward_func/std": 0.050501517951488495, + "step": 681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 374.875, + "completions/mean_terminated_length": 253.88235473632812, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.5456, + "grad_norm": 130.1652069091797, + "kl": 0.032989501953125, + "learning_rate": 1e-06, + "loss": -0.213, + "num_tokens": 8984364.0, + "reward": 0.07015404105186462, + "reward_std": 0.037277355790138245, + "rewards/bleu_reward_func/mean": 0.07015404105186462, + "rewards/bleu_reward_func/std": 0.10696472972631454, + "step": 682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 221.59375, + "completions/mean_terminated_length": 221.59375, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.5464, + "grad_norm": 15.168272018432617, + "kl": 0.25604248046875, + "learning_rate": 1e-06, + "loss": 0.0072, + "num_tokens": 8993615.0, + "reward": 0.05222689360380173, + "reward_std": 0.015750503167510033, + "rewards/bleu_reward_func/mean": 0.05222689360380173, + "rewards/bleu_reward_func/std": 0.03590291365981102, + "step": 683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 231.0625, + "completions/mean_terminated_length": 121.13043975830078, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.5472, + "grad_norm": 35.623695373535156, + "kl": 0.1175537109375, + "learning_rate": 1e-06, + "loss": -0.0464, + "num_tokens": 9004377.0, + "reward": 0.04063406586647034, + "reward_std": 0.028225397691130638, + "rewards/bleu_reward_func/mean": 0.04063406586647034, + "rewards/bleu_reward_func/std": 0.05525263398885727, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 361.78125, + "completions/mean_terminated_length": 259.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.548, + "grad_norm": 593.3710327148438, + "kl": 0.0638427734375, + "learning_rate": 1e-06, + "loss": 0.1608, + "num_tokens": 9020330.0, + "reward": 0.1717539131641388, + "reward_std": 0.0751393511891365, + "rewards/bleu_reward_func/mean": 0.1717539131641388, + "rewards/bleu_reward_func/std": 0.25347769260406494, + "step": 685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 175.59375, + "completions/mean_terminated_length": 175.59375, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.5488, + "grad_norm": 7.423861980438232, + "kl": 0.170654296875, + "learning_rate": 1e-06, + "loss": -0.0565, + "num_tokens": 9028509.0, + "reward": 0.091148242354393, + "reward_std": 0.017926650121808052, + "rewards/bleu_reward_func/mean": 0.091148242354393, + "rewards/bleu_reward_func/std": 0.07815965265035629, + "step": 686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 455.15625, + "completions/mean_terminated_length": 398.3125, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.5496, + "grad_norm": 2.612523317337036, + "kl": 0.033416748046875, + "learning_rate": 1e-06, + "loss": -0.0994, + "num_tokens": 9046090.0, + "reward": 0.04202251136302948, + "reward_std": 0.015885071828961372, + "rewards/bleu_reward_func/mean": 0.04202251136302948, + "rewards/bleu_reward_func/std": 0.03677666559815407, + "step": 687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 448.9375, + "completions/mean_terminated_length": 399.8888854980469, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.5504, + "grad_norm": 14.944954872131348, + "kl": 0.056488037109375, + "learning_rate": 1e-06, + "loss": 0.1249, + "num_tokens": 9064360.0, + "reward": 0.03943703696131706, + "reward_std": 0.024654783308506012, + "rewards/bleu_reward_func/mean": 0.03943703696131706, + "rewards/bleu_reward_func/std": 0.03771531209349632, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 165.3125, + "completions/mean_terminated_length": 85.30769348144531, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.5512, + "grad_norm": 66.34259033203125, + "kl": 0.408477783203125, + "learning_rate": 1e-06, + "loss": -0.1375, + "num_tokens": 9078698.0, + "reward": 0.20156420767307281, + "reward_std": 0.07818345725536346, + "rewards/bleu_reward_func/mean": 0.20156420767307281, + "rewards/bleu_reward_func/std": 0.23512743413448334, + "step": 689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 224.0, + "completions/mean_terminated_length": 170.6666717529297, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.552, + "grad_norm": 45.66501998901367, + "kl": 1.2672119140625, + "learning_rate": 1e-06, + "loss": 0.3161, + "num_tokens": 9087402.0, + "reward": 0.12671013176441193, + "reward_std": 0.03653056174516678, + "rewards/bleu_reward_func/mean": 0.12671013176441193, + "rewards/bleu_reward_func/std": 0.0971146747469902, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 237.125, + "completions/mean_terminated_length": 93.14286041259766, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.5528, + "grad_norm": 497.3811950683594, + "kl": 0.163970947265625, + "learning_rate": 1e-06, + "loss": 0.069, + "num_tokens": 9099254.0, + "reward": 0.0354698970913887, + "reward_std": 0.02819395810365677, + "rewards/bleu_reward_func/mean": 0.0354698970913887, + "rewards/bleu_reward_func/std": 0.030991537496447563, + "step": 691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 287.65625, + "completions/mean_terminated_length": 246.11111450195312, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.5536, + "grad_norm": 6.150320053100586, + "kl": 0.086273193359375, + "learning_rate": 1e-06, + "loss": 0.1272, + "num_tokens": 9110347.0, + "reward": 0.06792090833187103, + "reward_std": 0.02885974571108818, + "rewards/bleu_reward_func/mean": 0.06792090833187103, + "rewards/bleu_reward_func/std": 0.06222621724009514, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 244.84375, + "completions/mean_terminated_length": 140.30435180664062, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.5544, + "grad_norm": 36.0806999206543, + "kl": 0.23052978515625, + "learning_rate": 1e-06, + "loss": 0.0802, + "num_tokens": 9121382.0, + "reward": 0.09483693540096283, + "reward_std": 0.05147245526313782, + "rewards/bleu_reward_func/mean": 0.09483693540096283, + "rewards/bleu_reward_func/std": 0.08640998601913452, + "step": 693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 155.0, + "completions/mean_terminated_length": 131.20001220703125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.5552, + "grad_norm": 55.443695068359375, + "kl": 0.318450927734375, + "learning_rate": 1e-06, + "loss": -0.0617, + "num_tokens": 9132678.0, + "reward": 0.10359849035739899, + "reward_std": 0.07937172800302505, + "rewards/bleu_reward_func/mean": 0.10359849035739899, + "rewards/bleu_reward_func/std": 0.13979652523994446, + "step": 694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 236.65625, + "completions/mean_terminated_length": 71.45000457763672, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.556, + "grad_norm": 146.36070251464844, + "kl": 0.099609375, + "learning_rate": 1e-06, + "loss": 0.1483, + "num_tokens": 9145227.0, + "reward": 0.17079538106918335, + "reward_std": 0.05914284288883209, + "rewards/bleu_reward_func/mean": 0.17079538106918335, + "rewards/bleu_reward_func/std": 0.23325958847999573, + "step": 695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 228.5625, + "completions/mean_terminated_length": 134.08334350585938, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.5568, + "grad_norm": 298.8739013671875, + "kl": 0.1568603515625, + "learning_rate": 1e-06, + "loss": -0.1223, + "num_tokens": 9160869.0, + "reward": 0.037290386855602264, + "reward_std": 0.014398223720490932, + "rewards/bleu_reward_func/mean": 0.037290386855602264, + "rewards/bleu_reward_func/std": 0.03690984100103378, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 229.3125, + "completions/mean_terminated_length": 164.07693481445312, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.5576, + "grad_norm": 220.50381469726562, + "kl": 0.365478515625, + "learning_rate": 1e-06, + "loss": 0.0947, + "num_tokens": 9170959.0, + "reward": 0.11910027265548706, + "reward_std": 0.04049726575613022, + "rewards/bleu_reward_func/mean": 0.11910027265548706, + "rewards/bleu_reward_func/std": 0.14281374216079712, + "step": 697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 216.15625, + "completions/mean_terminated_length": 161.37037658691406, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.5584, + "grad_norm": 11.570438385009766, + "kl": 0.205352783203125, + "learning_rate": 1e-06, + "loss": 0.0257, + "num_tokens": 9181980.0, + "reward": 0.08099336922168732, + "reward_std": 0.04509742930531502, + "rewards/bleu_reward_func/mean": 0.08099336922168732, + "rewards/bleu_reward_func/std": 0.10288692265748978, + "step": 698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 349.96875, + "completions/mean_terminated_length": 304.6000061035156, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.5592, + "grad_norm": 37.448883056640625, + "kl": 0.09320068359375, + "learning_rate": 1e-06, + "loss": 0.0422, + "num_tokens": 9197723.0, + "reward": 0.05653442069888115, + "reward_std": 0.026268266141414642, + "rewards/bleu_reward_func/mean": 0.05653442069888115, + "rewards/bleu_reward_func/std": 0.04277388006448746, + "step": 699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 255.03125, + "completions/mean_terminated_length": 195.73077392578125, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.56, + "grad_norm": 340.22479248046875, + "kl": 0.41278076171875, + "learning_rate": 1e-06, + "loss": -0.2392, + "num_tokens": 9210148.0, + "reward": 0.06149371713399887, + "reward_std": 0.023687850683927536, + "rewards/bleu_reward_func/mean": 0.06149371713399887, + "rewards/bleu_reward_func/std": 0.03754807263612747, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 282.84375, + "completions/mean_terminated_length": 145.35000610351562, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.5608, + "grad_norm": 404.3658752441406, + "kl": 0.29132080078125, + "learning_rate": 1e-06, + "loss": 0.148, + "num_tokens": 9222815.0, + "reward": 0.05664993077516556, + "reward_std": 0.01803937554359436, + "rewards/bleu_reward_func/mean": 0.05664993077516556, + "rewards/bleu_reward_func/std": 0.02324024587869644, + "step": 701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 116.40625, + "completions/mean_terminated_length": 59.892860412597656, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.5616, + "grad_norm": 43.716766357421875, + "kl": 0.216552734375, + "learning_rate": 1e-06, + "loss": -0.0484, + "num_tokens": 9231460.0, + "reward": 0.1364556849002838, + "reward_std": 0.09380181133747101, + "rewards/bleu_reward_func/mean": 0.1364556849002838, + "rewards/bleu_reward_func/std": 0.21690192818641663, + "step": 702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 305.65625, + "completions/mean_terminated_length": 211.8636474609375, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.5624, + "grad_norm": 745.7041625976562, + "kl": 0.14068603515625, + "learning_rate": 1e-06, + "loss": 0.1945, + "num_tokens": 9245265.0, + "reward": 0.04224216938018799, + "reward_std": 0.016487902030348778, + "rewards/bleu_reward_func/mean": 0.04224216938018799, + "rewards/bleu_reward_func/std": 0.025008555501699448, + "step": 703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 115.0625, + "completions/mean_terminated_length": 102.25806427001953, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.5632, + "grad_norm": 216.922607421875, + "kl": 0.3056640625, + "learning_rate": 1e-06, + "loss": 0.0737, + "num_tokens": 9253899.0, + "reward": 0.08648309111595154, + "reward_std": 0.03777506947517395, + "rewards/bleu_reward_func/mean": 0.08648309111595154, + "rewards/bleu_reward_func/std": 0.05951961874961853, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 253.71875, + "completions/mean_terminated_length": 98.75, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.564, + "grad_norm": 162.15737915039062, + "kl": 0.1361083984375, + "learning_rate": 1e-06, + "loss": -0.0623, + "num_tokens": 9269314.0, + "reward": 0.2322496771812439, + "reward_std": 0.045732706785202026, + "rewards/bleu_reward_func/mean": 0.2322496771812439, + "rewards/bleu_reward_func/std": 0.25273510813713074, + "step": 705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 340.0625, + "completions/mean_terminated_length": 272.7826232910156, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.5648, + "grad_norm": 4.662173271179199, + "kl": 0.035003662109375, + "learning_rate": 1e-06, + "loss": 0.0266, + "num_tokens": 9282188.0, + "reward": 0.04237870126962662, + "reward_std": 0.01780301332473755, + "rewards/bleu_reward_func/mean": 0.04237870126962662, + "rewards/bleu_reward_func/std": 0.04967799782752991, + "step": 706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 349.6875, + "completions/mean_terminated_length": 275.9090881347656, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.5656, + "grad_norm": 2.4642882347106934, + "kl": 0.04644775390625, + "learning_rate": 1e-06, + "loss": 0.0446, + "num_tokens": 9298362.0, + "reward": 0.0549091175198555, + "reward_std": 0.03525715321302414, + "rewards/bleu_reward_func/mean": 0.0549091175198555, + "rewards/bleu_reward_func/std": 0.051221489906311035, + "step": 707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 249.875, + "completions/mean_terminated_length": 232.40000915527344, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.5664, + "grad_norm": 11.45730972290039, + "kl": 0.16558837890625, + "learning_rate": 1e-06, + "loss": 0.0823, + "num_tokens": 9308934.0, + "reward": 0.10365931689739227, + "reward_std": 0.028398117050528526, + "rewards/bleu_reward_func/mean": 0.10365931689739227, + "rewards/bleu_reward_func/std": 0.06428122520446777, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 254.0625, + "completions/mean_terminated_length": 99.30000305175781, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.5672, + "grad_norm": 7.227019786834717, + "kl": 0.154449462890625, + "learning_rate": 1e-06, + "loss": 0.0107, + "num_tokens": 9322640.0, + "reward": 0.18866762518882751, + "reward_std": 0.044271718710660934, + "rewards/bleu_reward_func/mean": 0.18866762518882751, + "rewards/bleu_reward_func/std": 0.1287185698747635, + "step": 709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 304.96875, + "completions/mean_terminated_length": 283.5517272949219, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.568, + "grad_norm": 5.722259998321533, + "kl": 0.07122802734375, + "learning_rate": 1e-06, + "loss": -0.1128, + "num_tokens": 9337639.0, + "reward": 0.0695868507027626, + "reward_std": 0.022387558594346046, + "rewards/bleu_reward_func/mean": 0.0695868507027626, + "rewards/bleu_reward_func/std": 0.065777987241745, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 150.09375, + "completions/mean_terminated_length": 125.9666748046875, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.5688, + "grad_norm": 5.365989685058594, + "kl": 0.0738525390625, + "learning_rate": 1e-06, + "loss": 0.3885, + "num_tokens": 9346522.0, + "reward": 0.2348867505788803, + "reward_std": 0.09850712865591049, + "rewards/bleu_reward_func/mean": 0.2348867505788803, + "rewards/bleu_reward_func/std": 0.302653044462204, + "step": 711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 342.65625, + "completions/mean_terminated_length": 265.68182373046875, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.5696, + "grad_norm": 7.032015800476074, + "kl": 0.060089111328125, + "learning_rate": 1e-06, + "loss": -0.0338, + "num_tokens": 9359935.0, + "reward": 0.053069278597831726, + "reward_std": 0.015607406385242939, + "rewards/bleu_reward_func/mean": 0.053069278597831726, + "rewards/bleu_reward_func/std": 0.0380670465528965, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 166.46875, + "completions/mean_terminated_length": 69.72000122070312, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.5704, + "grad_norm": 7.543166160583496, + "kl": 0.26068115234375, + "learning_rate": 1e-06, + "loss": 0.3241, + "num_tokens": 9367886.0, + "reward": 0.17086198925971985, + "reward_std": 0.059704020619392395, + "rewards/bleu_reward_func/mean": 0.17086198925971985, + "rewards/bleu_reward_func/std": 0.13924144208431244, + "step": 713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 306.3125, + "completions/mean_terminated_length": 248.72000122070312, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.5712, + "grad_norm": 3.74664568901062, + "kl": 0.086090087890625, + "learning_rate": 1e-06, + "loss": -0.2145, + "num_tokens": 9381256.0, + "reward": 0.026702141389250755, + "reward_std": 0.010159555822610855, + "rewards/bleu_reward_func/mean": 0.026702141389250755, + "rewards/bleu_reward_func/std": 0.025492098182439804, + "step": 714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 202.40625, + "completions/mean_terminated_length": 145.07408142089844, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.572, + "grad_norm": 3.670616865158081, + "kl": 0.0810394287109375, + "learning_rate": 1e-06, + "loss": 0.3116, + "num_tokens": 9394869.0, + "reward": 0.09095513820648193, + "reward_std": 0.0500517264008522, + "rewards/bleu_reward_func/mean": 0.09095513820648193, + "rewards/bleu_reward_func/std": 0.07213761657476425, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 221.5625, + "completions/mean_terminated_length": 202.20001220703125, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.5728, + "grad_norm": 9.221464157104492, + "kl": 0.10400390625, + "learning_rate": 1e-06, + "loss": 0.3058, + "num_tokens": 9405335.0, + "reward": 0.12374541163444519, + "reward_std": 0.040764160454273224, + "rewards/bleu_reward_func/mean": 0.12374541163444519, + "rewards/bleu_reward_func/std": 0.13386160135269165, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 347.1875, + "completions/mean_terminated_length": 234.42105102539062, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.5736, + "grad_norm": 12.991151809692383, + "kl": 0.157958984375, + "learning_rate": 1e-06, + "loss": 0.0424, + "num_tokens": 9420789.0, + "reward": 0.07029742747545242, + "reward_std": 0.012854170054197311, + "rewards/bleu_reward_func/mean": 0.07029742747545242, + "rewards/bleu_reward_func/std": 0.041719451546669006, + "step": 717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 503.75, + "completions/mean_terminated_length": 474.2857360839844, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "epoch": 0.5744, + "grad_norm": 2.107853889465332, + "kl": 0.028350830078125, + "learning_rate": 1e-06, + "loss": -0.0047, + "num_tokens": 9439789.0, + "reward": 0.05256051570177078, + "reward_std": 0.010154004208743572, + "rewards/bleu_reward_func/mean": 0.05256051570177078, + "rewards/bleu_reward_func/std": 0.03522626310586929, + "step": 718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 244.15625, + "completions/mean_terminated_length": 235.51612854003906, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.5752, + "grad_norm": 3.158909559249878, + "kl": 0.0923919677734375, + "learning_rate": 1e-06, + "loss": -0.1019, + "num_tokens": 9451978.0, + "reward": 0.12841522693634033, + "reward_std": 0.05657704174518585, + "rewards/bleu_reward_func/mean": 0.12841522693634033, + "rewards/bleu_reward_func/std": 0.07523242384195328, + "step": 719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 150.4375, + "completions/mean_terminated_length": 83.48148345947266, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.576, + "grad_norm": 8.099937438964844, + "kl": 0.214874267578125, + "learning_rate": 1e-06, + "loss": 0.0129, + "num_tokens": 9461736.0, + "reward": 0.08868992328643799, + "reward_std": 0.017071515321731567, + "rewards/bleu_reward_func/mean": 0.08868992328643799, + "rewards/bleu_reward_func/std": 0.08577441424131393, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 447.65625, + "completions/mean_terminated_length": 383.3125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.5768, + "grad_norm": 2.4854142665863037, + "kl": 0.045318603515625, + "learning_rate": 1e-06, + "loss": -0.0504, + "num_tokens": 9477381.0, + "reward": 0.0682106539607048, + "reward_std": 0.022257793694734573, + "rewards/bleu_reward_func/mean": 0.0682106539607048, + "rewards/bleu_reward_func/std": 0.05095710977911949, + "step": 721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 167.84375, + "completions/mean_terminated_length": 156.74192810058594, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.5776, + "grad_norm": 4.308876037597656, + "kl": 0.07659912109375, + "learning_rate": 1e-06, + "loss": 0.1774, + "num_tokens": 9489008.0, + "reward": 0.0951995924115181, + "reward_std": 0.033833228051662445, + "rewards/bleu_reward_func/mean": 0.0951995924115181, + "rewards/bleu_reward_func/std": 0.0729941874742508, + "step": 722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 179.34375, + "completions/mean_terminated_length": 144.9310302734375, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.5784, + "grad_norm": 12.132597923278809, + "kl": 0.33624267578125, + "learning_rate": 1e-06, + "loss": 0.2153, + "num_tokens": 9498059.0, + "reward": 0.10619839280843735, + "reward_std": 0.04761648178100586, + "rewards/bleu_reward_func/mean": 0.10619839280843735, + "rewards/bleu_reward_func/std": 0.0809776559472084, + "step": 723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 374.78125, + "completions/mean_terminated_length": 268.0555725097656, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.5792, + "grad_norm": 7.476963996887207, + "kl": 0.1087646484375, + "learning_rate": 1e-06, + "loss": 0.0108, + "num_tokens": 9517972.0, + "reward": 0.17816489934921265, + "reward_std": 0.016055870801210403, + "rewards/bleu_reward_func/mean": 0.17816489934921265, + "rewards/bleu_reward_func/std": 0.266427606344223, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 220.125, + "completions/mean_terminated_length": 152.7692413330078, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.58, + "grad_norm": 5.399383544921875, + "kl": 0.13983154296875, + "learning_rate": 1e-06, + "loss": 0.1212, + "num_tokens": 9530760.0, + "reward": 0.07149016857147217, + "reward_std": 0.023819390684366226, + "rewards/bleu_reward_func/mean": 0.07149016857147217, + "rewards/bleu_reward_func/std": 0.053011875599622726, + "step": 725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 500.3125, + "completions/mean_terminated_length": 418.5, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.5808, + "grad_norm": 2.243359088897705, + "kl": 0.028045654296875, + "learning_rate": 1e-06, + "loss": -0.0175, + "num_tokens": 9550330.0, + "reward": 0.04440176486968994, + "reward_std": 0.00922885537147522, + "rewards/bleu_reward_func/mean": 0.04440176486968994, + "rewards/bleu_reward_func/std": 0.03932040557265282, + "step": 726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 154.65625, + "completions/mean_terminated_length": 103.60714721679688, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.5816, + "grad_norm": 54.60096740722656, + "kl": 0.27252197265625, + "learning_rate": 1e-06, + "loss": 0.1714, + "num_tokens": 9557863.0, + "reward": 0.34987902641296387, + "reward_std": 0.09637948125600815, + "rewards/bleu_reward_func/mean": 0.34987902641296387, + "rewards/bleu_reward_func/std": 0.30998000502586365, + "step": 727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 394.21875, + "completions/mean_terminated_length": 197.9166717529297, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.5824, + "grad_norm": 3.5557267665863037, + "kl": 0.04107666015625, + "learning_rate": 1e-06, + "loss": -0.2035, + "num_tokens": 9576342.0, + "reward": 0.03708350285887718, + "reward_std": 0.013400746509432793, + "rewards/bleu_reward_func/mean": 0.03708350285887718, + "rewards/bleu_reward_func/std": 0.030460968613624573, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 287.53125, + "completions/mean_terminated_length": 272.5666809082031, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.5832, + "grad_norm": 5.337757110595703, + "kl": 0.106048583984375, + "learning_rate": 1e-06, + "loss": -0.0648, + "num_tokens": 9587719.0, + "reward": 0.08226186782121658, + "reward_std": 0.016267672181129456, + "rewards/bleu_reward_func/mean": 0.08226186782121658, + "rewards/bleu_reward_func/std": 0.047058336436748505, + "step": 729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 353.46875, + "completions/mean_terminated_length": 291.4347839355469, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.584, + "grad_norm": 172.38458251953125, + "kl": 0.142059326171875, + "learning_rate": 1e-06, + "loss": -0.0203, + "num_tokens": 9604270.0, + "reward": 0.08777523040771484, + "reward_std": 0.028989041224122047, + "rewards/bleu_reward_func/mean": 0.08777523040771484, + "rewards/bleu_reward_func/std": 0.053535155951976776, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 123.5625, + "completions/mean_terminated_length": 123.5625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.5848, + "grad_norm": 14.68234920501709, + "kl": 0.1837158203125, + "learning_rate": 1e-06, + "loss": 0.053, + "num_tokens": 9613376.0, + "reward": 0.09781108796596527, + "reward_std": 0.03509049117565155, + "rewards/bleu_reward_func/mean": 0.09781108796596527, + "rewards/bleu_reward_func/std": 0.07531887292861938, + "step": 731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 147.53125, + "completions/mean_terminated_length": 63.42308044433594, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.5856, + "grad_norm": 471.59912109375, + "kl": 0.191680908203125, + "learning_rate": 1e-06, + "loss": 0.0896, + "num_tokens": 9624305.0, + "reward": 0.08778894692659378, + "reward_std": 0.025603748857975006, + "rewards/bleu_reward_func/mean": 0.08778894692659378, + "rewards/bleu_reward_func/std": 0.06823020428419113, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 254.0, + "completions/mean_terminated_length": 194.4615478515625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.5864, + "grad_norm": 3.397786855697632, + "kl": 0.03460693359375, + "learning_rate": 1e-06, + "loss": -0.0596, + "num_tokens": 9634593.0, + "reward": 0.08798034489154816, + "reward_std": 0.02149152383208275, + "rewards/bleu_reward_func/mean": 0.08798034489154816, + "rewards/bleu_reward_func/std": 0.07060196995735168, + "step": 733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 300.75, + "completions/mean_terminated_length": 261.629638671875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.5872, + "grad_norm": 11.675055503845215, + "kl": 0.0538330078125, + "learning_rate": 1e-06, + "loss": 0.1186, + "num_tokens": 9647729.0, + "reward": 0.20255795121192932, + "reward_std": 0.044919952750205994, + "rewards/bleu_reward_func/mean": 0.20255795121192932, + "rewards/bleu_reward_func/std": 0.23513151705265045, + "step": 734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 100.875, + "completions/mean_terminated_length": 100.875, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.588, + "grad_norm": 7.969452381134033, + "kl": 0.2791748046875, + "learning_rate": 1e-06, + "loss": 0.1783, + "num_tokens": 9655565.0, + "reward": 0.1732563078403473, + "reward_std": 0.06255275756120682, + "rewards/bleu_reward_func/mean": 0.1732563078403473, + "rewards/bleu_reward_func/std": 0.14761896431446075, + "step": 735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 255.78125, + "completions/mean_terminated_length": 196.6538543701172, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.5888, + "grad_norm": 37.51567840576172, + "kl": 0.139739990234375, + "learning_rate": 1e-06, + "loss": 0.0247, + "num_tokens": 9672070.0, + "reward": 0.3068300187587738, + "reward_std": 0.018469596281647682, + "rewards/bleu_reward_func/mean": 0.3068300187587738, + "rewards/bleu_reward_func/std": 0.29021966457366943, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 367.15625, + "completions/mean_terminated_length": 301.31817626953125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.5896, + "grad_norm": 2.4598867893218994, + "kl": 0.0330657958984375, + "learning_rate": 1e-06, + "loss": -0.0194, + "num_tokens": 9688219.0, + "reward": 0.05701170861721039, + "reward_std": 0.020281650125980377, + "rewards/bleu_reward_func/mean": 0.05701170861721039, + "rewards/bleu_reward_func/std": 0.055385053157806396, + "step": 737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 318.46875, + "completions/mean_terminated_length": 230.5, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.5904, + "grad_norm": 4.623442649841309, + "kl": 0.045806884765625, + "learning_rate": 1e-06, + "loss": -0.2136, + "num_tokens": 9701082.0, + "reward": 0.05631488561630249, + "reward_std": 0.022235814481973648, + "rewards/bleu_reward_func/mean": 0.05631488561630249, + "rewards/bleu_reward_func/std": 0.0748782679438591, + "step": 738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 195.40625, + "completions/mean_terminated_length": 122.34616088867188, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.5912, + "grad_norm": 6.426390171051025, + "kl": 0.21075439453125, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 9712175.0, + "reward": 0.250314861536026, + "reward_std": 0.043683700263500214, + "rewards/bleu_reward_func/mean": 0.250314861536026, + "rewards/bleu_reward_func/std": 0.27451202273368835, + "step": 739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 314.375, + "completions/mean_terminated_length": 195.8000030517578, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.592, + "grad_norm": 4.511030197143555, + "kl": 0.0513916015625, + "learning_rate": 1e-06, + "loss": 0.0376, + "num_tokens": 9727491.0, + "reward": 0.24225017428398132, + "reward_std": 0.0391615591943264, + "rewards/bleu_reward_func/mean": 0.24225017428398132, + "rewards/bleu_reward_func/std": 0.23075063526630402, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 291.34375, + "completions/mean_terminated_length": 191.0454559326172, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.5928, + "grad_norm": 5.867874622344971, + "kl": 0.070037841796875, + "learning_rate": 1e-06, + "loss": 0.0656, + "num_tokens": 9740590.0, + "reward": 0.08597154170274734, + "reward_std": 0.053836189210414886, + "rewards/bleu_reward_func/mean": 0.08597154170274734, + "rewards/bleu_reward_func/std": 0.12926995754241943, + "step": 741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 205.34375, + "completions/mean_terminated_length": 148.55555725097656, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.5936, + "grad_norm": 6.713605880737305, + "kl": 0.12249755859375, + "learning_rate": 1e-06, + "loss": 0.1915, + "num_tokens": 9751033.0, + "reward": 0.19744500517845154, + "reward_std": 0.041014768183231354, + "rewards/bleu_reward_func/mean": 0.19744500517845154, + "rewards/bleu_reward_func/std": 0.1538456529378891, + "step": 742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 249.46875, + "completions/mean_terminated_length": 200.8518524169922, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.5944, + "grad_norm": 5.988234043121338, + "kl": 0.22430419921875, + "learning_rate": 1e-06, + "loss": 0.0382, + "num_tokens": 9764368.0, + "reward": 0.2869833707809448, + "reward_std": 0.07026369869709015, + "rewards/bleu_reward_func/mean": 0.2869833707809448, + "rewards/bleu_reward_func/std": 0.2287815362215042, + "step": 743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 265.3125, + "completions/mean_terminated_length": 196.239990234375, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.5952, + "grad_norm": 8.931143760681152, + "kl": 0.234375, + "learning_rate": 1e-06, + "loss": -0.1984, + "num_tokens": 9774882.0, + "reward": 0.07221800833940506, + "reward_std": 0.031748898327350616, + "rewards/bleu_reward_func/mean": 0.07221800833940506, + "rewards/bleu_reward_func/std": 0.06019110977649689, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 410.3125, + "completions/mean_terminated_length": 279.5714416503906, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.596, + "grad_norm": 2.886622190475464, + "kl": 0.031707763671875, + "learning_rate": 1e-06, + "loss": -0.0903, + "num_tokens": 9790372.0, + "reward": 0.030976204201579094, + "reward_std": 0.015047797001898289, + "rewards/bleu_reward_func/mean": 0.030976204201579094, + "rewards/bleu_reward_func/std": 0.033486902713775635, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 241.84375, + "completions/mean_terminated_length": 166.1999969482422, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.5968, + "grad_norm": 4.601830005645752, + "kl": 0.069915771484375, + "learning_rate": 1e-06, + "loss": 0.0366, + "num_tokens": 9801439.0, + "reward": 0.10595028847455978, + "reward_std": 0.024186890572309494, + "rewards/bleu_reward_func/mean": 0.10595028847455978, + "rewards/bleu_reward_func/std": 0.10705985873937607, + "step": 746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 311.53125, + "completions/mean_terminated_length": 255.39999389648438, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.5976, + "grad_norm": 2.84419322013855, + "kl": 0.05322265625, + "learning_rate": 1e-06, + "loss": -0.0815, + "num_tokens": 9816096.0, + "reward": 0.02818489633500576, + "reward_std": 0.008401873521506786, + "rewards/bleu_reward_func/mean": 0.02818489633500576, + "rewards/bleu_reward_func/std": 0.02303573302924633, + "step": 747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 100.78125, + "completions/mean_terminated_length": 100.78125, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.5984, + "grad_norm": 8.280403137207031, + "kl": 0.14306640625, + "learning_rate": 1e-06, + "loss": 0.0722, + "num_tokens": 9821761.0, + "reward": 0.07066097855567932, + "reward_std": 0.026687482371926308, + "rewards/bleu_reward_func/mean": 0.07066097855567932, + "rewards/bleu_reward_func/std": 0.04903886467218399, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 129.1875, + "completions/mean_terminated_length": 129.1875, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.5992, + "grad_norm": 6.650774955749512, + "kl": 0.1456298828125, + "learning_rate": 1e-06, + "loss": 0.1458, + "num_tokens": 9829943.0, + "reward": 0.19511400163173676, + "reward_std": 0.03503159433603287, + "rewards/bleu_reward_func/mean": 0.19511400163173676, + "rewards/bleu_reward_func/std": 0.21101784706115723, + "step": 749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 258.1875, + "completions/mean_terminated_length": 105.9000015258789, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.6, + "grad_norm": 5.664450645446777, + "kl": 0.081695556640625, + "learning_rate": 1e-06, + "loss": 0.008, + "num_tokens": 9841813.0, + "reward": 0.20738312602043152, + "reward_std": 0.07332950830459595, + "rewards/bleu_reward_func/mean": 0.20738312602043152, + "rewards/bleu_reward_func/std": 0.2197185456752777, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 160.03125, + "completions/mean_terminated_length": 136.56668090820312, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.6008, + "grad_norm": 69.39850616455078, + "kl": 0.24407958984375, + "learning_rate": 1e-06, + "loss": 0.016, + "num_tokens": 9852910.0, + "reward": 0.2434358447790146, + "reward_std": 0.10366295278072357, + "rewards/bleu_reward_func/mean": 0.2434358447790146, + "rewards/bleu_reward_func/std": 0.18655826151371002, + "step": 751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 316.875, + "completions/mean_terminated_length": 251.83334350585938, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.6016, + "grad_norm": 4.908503532409668, + "kl": 0.0460205078125, + "learning_rate": 1e-06, + "loss": 0.1812, + "num_tokens": 9865266.0, + "reward": 0.05490465834736824, + "reward_std": 0.02047915570437908, + "rewards/bleu_reward_func/mean": 0.05490465834736824, + "rewards/bleu_reward_func/std": 0.04037528112530708, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 269.09375, + "completions/mean_terminated_length": 252.90000915527344, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.6024, + "grad_norm": 2.893157958984375, + "kl": 0.048187255859375, + "learning_rate": 1e-06, + "loss": -0.0419, + "num_tokens": 9876613.0, + "reward": 0.05797014757990837, + "reward_std": 0.029720589518547058, + "rewards/bleu_reward_func/mean": 0.05797014757990837, + "rewards/bleu_reward_func/std": 0.07483170926570892, + "step": 753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 434.21875, + "completions/mean_terminated_length": 285.727294921875, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.6032, + "grad_norm": 3.0334763526916504, + "kl": 0.042449951171875, + "learning_rate": 1e-06, + "loss": -0.167, + "num_tokens": 9894044.0, + "reward": 0.0762338861823082, + "reward_std": 0.02360478974878788, + "rewards/bleu_reward_func/mean": 0.0762338861823082, + "rewards/bleu_reward_func/std": 0.0673457533121109, + "step": 754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 423.21875, + "completions/mean_terminated_length": 309.0714416503906, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.604, + "grad_norm": 2.2641522884368896, + "kl": 0.03399658203125, + "learning_rate": 1e-06, + "loss": -0.0917, + "num_tokens": 9911107.0, + "reward": 0.03881996497511864, + "reward_std": 0.012424922548234463, + "rewards/bleu_reward_func/mean": 0.03881996497511864, + "rewards/bleu_reward_func/std": 0.02825937233865261, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 196.53125, + "completions/mean_terminated_length": 108.19999694824219, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.6048, + "grad_norm": 9.265530586242676, + "kl": 0.2891845703125, + "learning_rate": 1e-06, + "loss": 0.1189, + "num_tokens": 9924868.0, + "reward": 0.2168477475643158, + "reward_std": 0.07323689758777618, + "rewards/bleu_reward_func/mean": 0.2168477475643158, + "rewards/bleu_reward_func/std": 0.17768503725528717, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 163.53125, + "completions/mean_terminated_length": 152.29031372070312, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.6056, + "grad_norm": 44.89513397216797, + "kl": 0.31927490234375, + "learning_rate": 1e-06, + "loss": 0.0321, + "num_tokens": 9935765.0, + "reward": 0.08691335469484329, + "reward_std": 0.03311008960008621, + "rewards/bleu_reward_func/mean": 0.08691335469484329, + "rewards/bleu_reward_func/std": 0.08314234763383865, + "step": 757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 97.375, + "completions/mean_terminated_length": 97.375, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.6064, + "grad_norm": 8.500920295715332, + "kl": 0.15838623046875, + "learning_rate": 1e-06, + "loss": 0.1392, + "num_tokens": 9944529.0, + "reward": 0.25747808814048767, + "reward_std": 0.048998236656188965, + "rewards/bleu_reward_func/mean": 0.25747808814048767, + "rewards/bleu_reward_func/std": 0.22997993230819702, + "step": 758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 357.78125, + "completions/mean_terminated_length": 265.25, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.6072, + "grad_norm": 3.4296295642852783, + "kl": 0.05389404296875, + "learning_rate": 1e-06, + "loss": -0.0162, + "num_tokens": 9961058.0, + "reward": 0.07074315845966339, + "reward_std": 0.0662379041314125, + "rewards/bleu_reward_func/mean": 0.07074315845966339, + "rewards/bleu_reward_func/std": 0.1079547107219696, + "step": 759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 207.375, + "completions/mean_terminated_length": 175.86207580566406, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.608, + "grad_norm": 6.224173545837402, + "kl": 0.1185302734375, + "learning_rate": 1e-06, + "loss": 0.0424, + "num_tokens": 9972646.0, + "reward": 0.09982403367757797, + "reward_std": 0.06204840913414955, + "rewards/bleu_reward_func/mean": 0.09982403367757797, + "rewards/bleu_reward_func/std": 0.10973682999610901, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 152.8125, + "completions/mean_terminated_length": 128.86666870117188, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.6088, + "grad_norm": 12.398276329040527, + "kl": 0.240966796875, + "learning_rate": 1e-06, + "loss": 0.198, + "num_tokens": 9985136.0, + "reward": 0.059197958558797836, + "reward_std": 0.02872345596551895, + "rewards/bleu_reward_func/mean": 0.059197958558797836, + "rewards/bleu_reward_func/std": 0.04119112715125084, + "step": 761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 377.6875, + "completions/mean_terminated_length": 273.22222900390625, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.6096, + "grad_norm": 3.9670443534851074, + "kl": 0.046600341796875, + "learning_rate": 1e-06, + "loss": 0.0648, + "num_tokens": 10003078.0, + "reward": 0.0413321927189827, + "reward_std": 0.015110660344362259, + "rewards/bleu_reward_func/mean": 0.0413321927189827, + "rewards/bleu_reward_func/std": 0.032528944313526154, + "step": 762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 472.0, + "completions/mean_terminated_length": 413.5384826660156, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.6104, + "grad_norm": 2.092273712158203, + "kl": 0.0268096923828125, + "learning_rate": 1e-06, + "loss": 0.0147, + "num_tokens": 10020230.0, + "reward": 0.08549900352954865, + "reward_std": 0.03175706788897514, + "rewards/bleu_reward_func/mean": 0.08549900352954865, + "rewards/bleu_reward_func/std": 0.042792484164237976, + "step": 763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 150.6875, + "completions/mean_terminated_length": 83.77777862548828, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.6112, + "grad_norm": 9.28490924835205, + "kl": 0.265869140625, + "learning_rate": 1e-06, + "loss": 0.2918, + "num_tokens": 10031172.0, + "reward": 0.21292155981063843, + "reward_std": 0.06925603747367859, + "rewards/bleu_reward_func/mean": 0.21292155981063843, + "rewards/bleu_reward_func/std": 0.18994402885437012, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 328.3125, + "completions/mean_terminated_length": 218.10000610351562, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.612, + "grad_norm": 21.738191604614258, + "kl": 0.1495361328125, + "learning_rate": 1e-06, + "loss": -0.0895, + "num_tokens": 10046358.0, + "reward": 0.09622834622859955, + "reward_std": 0.04176661744713783, + "rewards/bleu_reward_func/mean": 0.09622834622859955, + "rewards/bleu_reward_func/std": 0.09296616911888123, + "step": 765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 288.21875, + "completions/mean_terminated_length": 246.7777862548828, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.6128, + "grad_norm": 5.178511142730713, + "kl": 0.1473388671875, + "learning_rate": 1e-06, + "loss": 0.0688, + "num_tokens": 10059877.0, + "reward": 0.088385209441185, + "reward_std": 0.016962474212050438, + "rewards/bleu_reward_func/mean": 0.088385209441185, + "rewards/bleu_reward_func/std": 0.08985943347215652, + "step": 766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 311.46875, + "completions/mean_terminated_length": 244.625, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.6136, + "grad_norm": 11.883882522583008, + "kl": 0.2354583740234375, + "learning_rate": 1e-06, + "loss": -0.0238, + "num_tokens": 10073076.0, + "reward": 0.18446165323257446, + "reward_std": 0.10309243947267532, + "rewards/bleu_reward_func/mean": 0.18446165323257446, + "rewards/bleu_reward_func/std": 0.30338525772094727, + "step": 767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 275.59375, + "completions/mean_terminated_length": 168.13636779785156, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.6144, + "grad_norm": 5.438653469085693, + "kl": 0.081451416015625, + "learning_rate": 1e-06, + "loss": 0.0639, + "num_tokens": 10085871.0, + "reward": 0.06602545082569122, + "reward_std": 0.030349329113960266, + "rewards/bleu_reward_func/mean": 0.06602545082569122, + "rewards/bleu_reward_func/std": 0.04767395555973053, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 136.09375, + "completions/mean_terminated_length": 136.09375, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.6152, + "grad_norm": 5.674292087554932, + "kl": 0.26715087890625, + "learning_rate": 1e-06, + "loss": 0.2437, + "num_tokens": 10094858.0, + "reward": 0.07597756385803223, + "reward_std": 0.027629435062408447, + "rewards/bleu_reward_func/mean": 0.07597756385803223, + "rewards/bleu_reward_func/std": 0.054181892424821854, + "step": 769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 107.9375, + "completions/mean_terminated_length": 94.9032211303711, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.616, + "grad_norm": 9.938776969909668, + "kl": 0.16571044921875, + "learning_rate": 1e-06, + "loss": 0.082, + "num_tokens": 10106672.0, + "reward": 0.3735446035861969, + "reward_std": 0.03898521885275841, + "rewards/bleu_reward_func/mean": 0.3735446035861969, + "rewards/bleu_reward_func/std": 0.30306297540664673, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 338.6875, + "completions/mean_terminated_length": 280.91668701171875, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.6168, + "grad_norm": 3.914602041244507, + "kl": 0.05010986328125, + "learning_rate": 1e-06, + "loss": -0.1112, + "num_tokens": 10120510.0, + "reward": 0.09634806215763092, + "reward_std": 0.04157658666372299, + "rewards/bleu_reward_func/mean": 0.09634806215763092, + "rewards/bleu_reward_func/std": 0.08702099323272705, + "step": 771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 270.875, + "completions/mean_terminated_length": 215.23077392578125, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.6176, + "grad_norm": 5.461522102355957, + "kl": 0.3319091796875, + "learning_rate": 1e-06, + "loss": 0.0344, + "num_tokens": 10132138.0, + "reward": 0.18050828576087952, + "reward_std": 0.033299222588539124, + "rewards/bleu_reward_func/mean": 0.18050828576087952, + "rewards/bleu_reward_func/std": 0.21068614721298218, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 131.375, + "completions/mean_terminated_length": 131.375, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.6184, + "grad_norm": 8.162845611572266, + "kl": 0.2459716796875, + "learning_rate": 1e-06, + "loss": 0.1064, + "num_tokens": 10142558.0, + "reward": 0.1030242070555687, + "reward_std": 0.05847536772489548, + "rewards/bleu_reward_func/mean": 0.1030242070555687, + "rewards/bleu_reward_func/std": 0.14961844682693481, + "step": 773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 169.9375, + "completions/mean_terminated_length": 169.9375, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.6192, + "grad_norm": 43.1834602355957, + "kl": 0.22735595703125, + "learning_rate": 1e-06, + "loss": -0.0649, + "num_tokens": 10151012.0, + "reward": 0.04478512331843376, + "reward_std": 0.012456279247999191, + "rewards/bleu_reward_func/mean": 0.04478512331843376, + "rewards/bleu_reward_func/std": 0.04301442950963974, + "step": 774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 249.3125, + "completions/mean_terminated_length": 188.69232177734375, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.62, + "grad_norm": 8.241740226745605, + "kl": 0.188079833984375, + "learning_rate": 1e-06, + "loss": 0.1066, + "num_tokens": 10163446.0, + "reward": 0.15149368345737457, + "reward_std": 0.028546612709760666, + "rewards/bleu_reward_func/mean": 0.15149368345737457, + "rewards/bleu_reward_func/std": 0.14032159745693207, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 235.125, + "completions/mean_terminated_length": 126.78260803222656, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.6208, + "grad_norm": 8.033968925476074, + "kl": 0.22894287109375, + "learning_rate": 1e-06, + "loss": 0.098, + "num_tokens": 10174522.0, + "reward": 0.22483249008655548, + "reward_std": 0.04489654302597046, + "rewards/bleu_reward_func/mean": 0.22483249008655548, + "rewards/bleu_reward_func/std": 0.24184906482696533, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 273.0625, + "completions/mean_terminated_length": 179.56521606445312, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.6216, + "grad_norm": 6.315563201904297, + "kl": 0.09613037109375, + "learning_rate": 1e-06, + "loss": 0.1547, + "num_tokens": 10188964.0, + "reward": 0.06417440623044968, + "reward_std": 0.01652311347424984, + "rewards/bleu_reward_func/mean": 0.06417440623044968, + "rewards/bleu_reward_func/std": 0.05222758278250694, + "step": 777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 350.40625, + "completions/mean_terminated_length": 188.8125, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.6224, + "grad_norm": 4.945059776306152, + "kl": 0.08636474609375, + "learning_rate": 1e-06, + "loss": 0.0638, + "num_tokens": 10203657.0, + "reward": 0.07400526106357574, + "reward_std": 0.03621644526720047, + "rewards/bleu_reward_func/mean": 0.07400526106357574, + "rewards/bleu_reward_func/std": 0.05740804970264435, + "step": 778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 319.1875, + "completions/mean_terminated_length": 243.7391357421875, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.6232, + "grad_norm": 4.364378929138184, + "kl": 0.07342529296875, + "learning_rate": 1e-06, + "loss": -0.0132, + "num_tokens": 10219919.0, + "reward": 0.24302205443382263, + "reward_std": 0.05576051399111748, + "rewards/bleu_reward_func/mean": 0.24302205443382263, + "rewards/bleu_reward_func/std": 0.21575042605400085, + "step": 779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 298.75, + "completions/mean_terminated_length": 201.8181915283203, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.624, + "grad_norm": 3.959169864654541, + "kl": 0.06689453125, + "learning_rate": 1e-06, + "loss": 0.0405, + "num_tokens": 10234119.0, + "reward": 0.09677430242300034, + "reward_std": 0.02671782858669758, + "rewards/bleu_reward_func/mean": 0.09677430242300034, + "rewards/bleu_reward_func/std": 0.06890682131052017, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 211.40625, + "completions/mean_terminated_length": 168.46429443359375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.6248, + "grad_norm": 10.254522323608398, + "kl": 0.30865478515625, + "learning_rate": 1e-06, + "loss": -0.044, + "num_tokens": 10247388.0, + "reward": 0.2194344401359558, + "reward_std": 0.04920031875371933, + "rewards/bleu_reward_func/mean": 0.2194344401359558, + "rewards/bleu_reward_func/std": 0.15552020072937012, + "step": 781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 201.46875, + "completions/mean_terminated_length": 180.7666778564453, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.6256, + "grad_norm": 7.234709739685059, + "kl": 0.1651611328125, + "learning_rate": 1e-06, + "loss": 0.2297, + "num_tokens": 10256947.0, + "reward": 0.11007180064916611, + "reward_std": 0.07193183898925781, + "rewards/bleu_reward_func/mean": 0.11007180064916611, + "rewards/bleu_reward_func/std": 0.13098347187042236, + "step": 782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 288.15625, + "completions/mean_terminated_length": 246.70370483398438, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.6264, + "grad_norm": 3.6756701469421387, + "kl": 0.050689697265625, + "learning_rate": 1e-06, + "loss": -0.2297, + "num_tokens": 10271504.0, + "reward": 0.07084184139966965, + "reward_std": 0.03263479843735695, + "rewards/bleu_reward_func/mean": 0.07084184139966965, + "rewards/bleu_reward_func/std": 0.07953313738107681, + "step": 783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 127.375, + "completions/mean_terminated_length": 114.96773529052734, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.6272, + "grad_norm": 7.3130879402160645, + "kl": 0.143096923828125, + "learning_rate": 1e-06, + "loss": -0.1119, + "num_tokens": 10281236.0, + "reward": 0.17116650938987732, + "reward_std": 0.040961284190416336, + "rewards/bleu_reward_func/mean": 0.17116650938987732, + "rewards/bleu_reward_func/std": 0.16110415756702423, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 198.125, + "completions/mean_terminated_length": 140.0, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.628, + "grad_norm": 5.703468322753906, + "kl": 0.168975830078125, + "learning_rate": 1e-06, + "loss": 0.0673, + "num_tokens": 10293288.0, + "reward": 0.14155232906341553, + "reward_std": 0.059418316930532455, + "rewards/bleu_reward_func/mean": 0.14155232906341553, + "rewards/bleu_reward_func/std": 0.142944797873497, + "step": 785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 312.5, + "completions/mean_terminated_length": 266.4615478515625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.6288, + "grad_norm": 2.4137492179870605, + "kl": 0.023956298828125, + "learning_rate": 1e-06, + "loss": 0.0405, + "num_tokens": 10309744.0, + "reward": 0.16134724020957947, + "reward_std": 0.01978662982583046, + "rewards/bleu_reward_func/mean": 0.16134724020957947, + "rewards/bleu_reward_func/std": 0.16176313161849976, + "step": 786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 281.34375, + "completions/mean_terminated_length": 228.11538696289062, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.6296, + "grad_norm": 5.432137489318848, + "kl": 0.126953125, + "learning_rate": 1e-06, + "loss": -0.0834, + "num_tokens": 10322867.0, + "reward": 0.13262051343917847, + "reward_std": 0.054468683898448944, + "rewards/bleu_reward_func/mean": 0.13262051343917847, + "rewards/bleu_reward_func/std": 0.1454581618309021, + "step": 787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 157.8125, + "completions/mean_terminated_length": 121.17241668701172, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.6304, + "grad_norm": 8.820817947387695, + "kl": 0.384033203125, + "learning_rate": 1e-06, + "loss": 0.1057, + "num_tokens": 10330085.0, + "reward": 0.14398705959320068, + "reward_std": 0.05267474800348282, + "rewards/bleu_reward_func/mean": 0.14398705959320068, + "rewards/bleu_reward_func/std": 0.12204661965370178, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 313.875, + "completions/mean_terminated_length": 223.8181915283203, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.6312, + "grad_norm": 6.252136707305908, + "kl": 0.202178955078125, + "learning_rate": 1e-06, + "loss": -0.0958, + "num_tokens": 10344937.0, + "reward": 0.08566081523895264, + "reward_std": 0.0418044775724411, + "rewards/bleu_reward_func/mean": 0.08566081523895264, + "rewards/bleu_reward_func/std": 0.1277945637702942, + "step": 789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 476.40625, + "completions/mean_terminated_length": 408.4545593261719, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.632, + "grad_norm": 2.1677629947662354, + "kl": 0.03436279296875, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 10362222.0, + "reward": 0.04695405811071396, + "reward_std": 0.013839447870850563, + "rewards/bleu_reward_func/mean": 0.04695405811071396, + "rewards/bleu_reward_func/std": 0.03280064836144447, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 404.46875, + "completions/mean_terminated_length": 296.9375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.6328, + "grad_norm": 2.3538496494293213, + "kl": 0.0289459228515625, + "learning_rate": 1e-06, + "loss": -0.0297, + "num_tokens": 10382845.0, + "reward": 0.08459493517875671, + "reward_std": 0.029446884989738464, + "rewards/bleu_reward_func/mean": 0.08459493517875671, + "rewards/bleu_reward_func/std": 0.051741067320108414, + "step": 791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 133.0625, + "completions/mean_terminated_length": 107.80000305175781, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.6336, + "grad_norm": 6.885672092437744, + "kl": 0.20733642578125, + "learning_rate": 1e-06, + "loss": 0.1098, + "num_tokens": 10391199.0, + "reward": 0.10581733286380768, + "reward_std": 0.034825149923563004, + "rewards/bleu_reward_func/mean": 0.10581733286380768, + "rewards/bleu_reward_func/std": 0.10278832167387009, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 95.0, + "completions/mean_length": 154.4375, + "completions/mean_terminated_length": 35.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.6344, + "grad_norm": 8.279248237609863, + "kl": 0.2750244140625, + "learning_rate": 1e-06, + "loss": -0.0444, + "num_tokens": 10399789.0, + "reward": 0.1634266972541809, + "reward_std": 0.029335156083106995, + "rewards/bleu_reward_func/mean": 0.1634266972541809, + "rewards/bleu_reward_func/std": 0.1743723601102829, + "step": 793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 244.96875, + "completions/mean_terminated_length": 123.59091186523438, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.6352, + "grad_norm": 5.577760219573975, + "kl": 0.230865478515625, + "learning_rate": 1e-06, + "loss": -0.0138, + "num_tokens": 10413116.0, + "reward": 0.18318259716033936, + "reward_std": 0.02782328985631466, + "rewards/bleu_reward_func/mean": 0.18318259716033936, + "rewards/bleu_reward_func/std": 0.14704957604408264, + "step": 794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 354.78125, + "completions/mean_terminated_length": 344.3000183105469, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.636, + "grad_norm": 2.591658115386963, + "kl": 0.0296630859375, + "learning_rate": 1e-06, + "loss": -0.0201, + "num_tokens": 10426797.0, + "reward": 0.06094507500529289, + "reward_std": 0.02977069467306137, + "rewards/bleu_reward_func/mean": 0.06094507500529289, + "rewards/bleu_reward_func/std": 0.03347548097372055, + "step": 795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 209.25, + "completions/mean_terminated_length": 189.06668090820312, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.6368, + "grad_norm": 7.372705936431885, + "kl": 0.226715087890625, + "learning_rate": 1e-06, + "loss": 0.0166, + "num_tokens": 10435821.0, + "reward": 0.17854920029640198, + "reward_std": 0.039038486778736115, + "rewards/bleu_reward_func/mean": 0.17854920029640198, + "rewards/bleu_reward_func/std": 0.11250942945480347, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 139.90625, + "completions/mean_terminated_length": 139.90625, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.6376, + "grad_norm": 7.399951457977295, + "kl": 0.2593994140625, + "learning_rate": 1e-06, + "loss": 0.0215, + "num_tokens": 10444706.0, + "reward": 0.20415213704109192, + "reward_std": 0.05372469127178192, + "rewards/bleu_reward_func/mean": 0.20415213704109192, + "rewards/bleu_reward_func/std": 0.15420135855674744, + "step": 797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 253.3125, + "completions/mean_terminated_length": 193.61538696289062, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.6384, + "grad_norm": 5.452202320098877, + "kl": 0.207672119140625, + "learning_rate": 1e-06, + "loss": 0.0658, + "num_tokens": 10457300.0, + "reward": 0.18789556622505188, + "reward_std": 0.06054109334945679, + "rewards/bleu_reward_func/mean": 0.18789556622505188, + "rewards/bleu_reward_func/std": 0.18226853013038635, + "step": 798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 164.53125, + "completions/mean_terminated_length": 100.18518829345703, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.6392, + "grad_norm": 9.581258773803711, + "kl": 0.312286376953125, + "learning_rate": 1e-06, + "loss": -0.0913, + "num_tokens": 10465053.0, + "reward": 0.14276297390460968, + "reward_std": 0.028537599369883537, + "rewards/bleu_reward_func/mean": 0.14276297390460968, + "rewards/bleu_reward_func/std": 0.10928227007389069, + "step": 799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 56.59375, + "completions/mean_terminated_length": 56.59375, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.64, + "grad_norm": 7.995264053344727, + "kl": 0.2779541015625, + "learning_rate": 1e-06, + "loss": 0.3759, + "num_tokens": 10477608.0, + "reward": 0.34325188398361206, + "reward_std": 0.07241753488779068, + "rewards/bleu_reward_func/mean": 0.34325188398361206, + "rewards/bleu_reward_func/std": 0.20597775280475616, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 191.5625, + "completions/mean_terminated_length": 117.61538696289062, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.6408, + "grad_norm": 4.229004859924316, + "kl": 0.0904541015625, + "learning_rate": 1e-06, + "loss": 0.1193, + "num_tokens": 10485490.0, + "reward": 0.11370354145765305, + "reward_std": 0.061382561922073364, + "rewards/bleu_reward_func/mean": 0.11370354145765305, + "rewards/bleu_reward_func/std": 0.15154796838760376, + "step": 801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 293.0, + "completions/mean_terminated_length": 99.76470947265625, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.6416, + "grad_norm": 4.936343193054199, + "kl": 0.227630615234375, + "learning_rate": 1e-06, + "loss": 0.0723, + "num_tokens": 10497930.0, + "reward": 0.15342603623867035, + "reward_std": 0.018828846514225006, + "rewards/bleu_reward_func/mean": 0.15342603623867035, + "rewards/bleu_reward_func/std": 0.22573818266391754, + "step": 802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 206.9375, + "completions/mean_terminated_length": 163.35714721679688, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.6424, + "grad_norm": 4.5400471687316895, + "kl": 0.1302490234375, + "learning_rate": 1e-06, + "loss": -0.0265, + "num_tokens": 10509096.0, + "reward": 0.042201556265354156, + "reward_std": 0.01641710475087166, + "rewards/bleu_reward_func/mean": 0.042201556265354156, + "rewards/bleu_reward_func/std": 0.026252396404743195, + "step": 803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 339.8125, + "completions/mean_terminated_length": 222.0, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.6432, + "grad_norm": 4.132330417633057, + "kl": 0.042144775390625, + "learning_rate": 1e-06, + "loss": -0.0638, + "num_tokens": 10522810.0, + "reward": 0.05155924707651138, + "reward_std": 0.017338326200842857, + "rewards/bleu_reward_func/mean": 0.05155924707651138, + "rewards/bleu_reward_func/std": 0.03961692750453949, + "step": 804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 251.0, + "completions/mean_terminated_length": 132.3636474609375, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.644, + "grad_norm": 6.286128044128418, + "kl": 0.150299072265625, + "learning_rate": 1e-06, + "loss": 0.1961, + "num_tokens": 10533090.0, + "reward": 0.03828435763716698, + "reward_std": 0.01768323965370655, + "rewards/bleu_reward_func/mean": 0.03828435763716698, + "rewards/bleu_reward_func/std": 0.035699598491191864, + "step": 805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 286.03125, + "completions/mean_terminated_length": 253.75001525878906, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.6448, + "grad_norm": 3.333425283432007, + "kl": 0.0326385498046875, + "learning_rate": 1e-06, + "loss": 0.0689, + "num_tokens": 10544131.0, + "reward": 0.11853313446044922, + "reward_std": 0.06690388172864914, + "rewards/bleu_reward_func/mean": 0.11853313446044922, + "rewards/bleu_reward_func/std": 0.14521227777004242, + "step": 806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 176.6875, + "completions/mean_terminated_length": 165.87095642089844, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.6456, + "grad_norm": 6.8076090812683105, + "kl": 0.24957275390625, + "learning_rate": 1e-06, + "loss": -0.1496, + "num_tokens": 10554641.0, + "reward": 0.12172487378120422, + "reward_std": 0.05724428966641426, + "rewards/bleu_reward_func/mean": 0.12172487378120422, + "rewards/bleu_reward_func/std": 0.11496427655220032, + "step": 807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 146.875, + "completions/mean_terminated_length": 62.615386962890625, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.6464, + "grad_norm": 9.55725383758545, + "kl": 0.30224609375, + "learning_rate": 1e-06, + "loss": 0.0709, + "num_tokens": 10563829.0, + "reward": 0.20068883895874023, + "reward_std": 0.06663694977760315, + "rewards/bleu_reward_func/mean": 0.20068883895874023, + "rewards/bleu_reward_func/std": 0.13896267116069794, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 141.34375, + "completions/mean_terminated_length": 37.55999755859375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.6472, + "grad_norm": 26.54884147644043, + "kl": 0.41717529296875, + "learning_rate": 1e-06, + "loss": -0.1795, + "num_tokens": 10574624.0, + "reward": 0.22808396816253662, + "reward_std": 0.06877206265926361, + "rewards/bleu_reward_func/mean": 0.22808396816253662, + "rewards/bleu_reward_func/std": 0.21049334108829498, + "step": 809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 111.21875, + "completions/mean_terminated_length": 111.21875, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.648, + "grad_norm": 13.591133117675781, + "kl": 0.272674560546875, + "learning_rate": 1e-06, + "loss": -0.0292, + "num_tokens": 10583671.0, + "reward": 0.2966269850730896, + "reward_std": 0.015265233814716339, + "rewards/bleu_reward_func/mean": 0.2966269850730896, + "rewards/bleu_reward_func/std": 0.24745707213878632, + "step": 810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 94.34375, + "completions/mean_terminated_length": 80.87096405029297, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.6488, + "grad_norm": 10.449114799499512, + "kl": 0.2784423828125, + "learning_rate": 1e-06, + "loss": -0.2645, + "num_tokens": 10592586.0, + "reward": 0.23048871755599976, + "reward_std": 0.05683053284883499, + "rewards/bleu_reward_func/mean": 0.23048871755599976, + "rewards/bleu_reward_func/std": 0.304109662771225, + "step": 811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 184.65625, + "completions/mean_terminated_length": 75.54167175292969, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.6496, + "grad_norm": 6.2868170738220215, + "kl": 0.202392578125, + "learning_rate": 1e-06, + "loss": 0.0856, + "num_tokens": 10604447.0, + "reward": 0.06996900588274002, + "reward_std": 0.01753135770559311, + "rewards/bleu_reward_func/mean": 0.06996900588274002, + "rewards/bleu_reward_func/std": 0.07089151442050934, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 46.96875, + "completions/mean_terminated_length": 46.96875, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.6504, + "grad_norm": 15.99052619934082, + "kl": 0.55419921875, + "learning_rate": 1e-06, + "loss": 0.1537, + "num_tokens": 10612710.0, + "reward": 0.08658448606729507, + "reward_std": 0.03601383790373802, + "rewards/bleu_reward_func/mean": 0.08658448606729507, + "rewards/bleu_reward_func/std": 0.05530841648578644, + "step": 813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 355.0, + "completions/mean_terminated_length": 283.6363830566406, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.6512, + "grad_norm": 3.1796348094940186, + "kl": 0.059417724609375, + "learning_rate": 1e-06, + "loss": 0.1311, + "num_tokens": 10625326.0, + "reward": 0.11449694633483887, + "reward_std": 0.027395280078053474, + "rewards/bleu_reward_func/mean": 0.11449694633483887, + "rewards/bleu_reward_func/std": 0.05288613215088844, + "step": 814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 128.96875, + "completions/mean_terminated_length": 103.43334197998047, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.652, + "grad_norm": 6.432667255401611, + "kl": 0.1822509765625, + "learning_rate": 1e-06, + "loss": 0.1721, + "num_tokens": 10635421.0, + "reward": 0.18185263872146606, + "reward_std": 0.0783199891448021, + "rewards/bleu_reward_func/mean": 0.18185263872146606, + "rewards/bleu_reward_func/std": 0.18959355354309082, + "step": 815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 242.9375, + "completions/mean_terminated_length": 204.50001525878906, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.6528, + "grad_norm": 4.343508720397949, + "kl": 0.06640625, + "learning_rate": 1e-06, + "loss": 0.1592, + "num_tokens": 10647283.0, + "reward": 0.10118309408426285, + "reward_std": 0.026538610458374023, + "rewards/bleu_reward_func/mean": 0.10118309408426285, + "rewards/bleu_reward_func/std": 0.08866976201534271, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 217.5625, + "completions/mean_terminated_length": 175.50001525878906, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6536, + "grad_norm": 4.102277755737305, + "kl": 0.141204833984375, + "learning_rate": 1e-06, + "loss": 0.2738, + "num_tokens": 10660189.0, + "reward": 0.23801954090595245, + "reward_std": 0.07484984397888184, + "rewards/bleu_reward_func/mean": 0.23801954090595245, + "rewards/bleu_reward_func/std": 0.168580561876297, + "step": 817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 180.625, + "completions/mean_terminated_length": 169.93548583984375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.6544, + "grad_norm": 5.659482002258301, + "kl": 0.08270263671875, + "learning_rate": 1e-06, + "loss": 0.1325, + "num_tokens": 10668393.0, + "reward": 0.08136270940303802, + "reward_std": 0.02622528187930584, + "rewards/bleu_reward_func/mean": 0.08136270940303802, + "rewards/bleu_reward_func/std": 0.03544744476675987, + "step": 818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 289.375, + "completions/mean_terminated_length": 227.0399932861328, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.6552, + "grad_norm": 3.059807300567627, + "kl": 0.076446533203125, + "learning_rate": 1e-06, + "loss": -0.016, + "num_tokens": 10682749.0, + "reward": 0.07544586062431335, + "reward_std": 0.0220788661390543, + "rewards/bleu_reward_func/mean": 0.07544586062431335, + "rewards/bleu_reward_func/std": 0.04309820756316185, + "step": 819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 191.25, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.656, + "grad_norm": 4.126210689544678, + "kl": 0.06744384765625, + "learning_rate": 1e-06, + "loss": 0.2091, + "num_tokens": 10693011.0, + "reward": 0.12100762873888016, + "reward_std": 0.040202461183071136, + "rewards/bleu_reward_func/mean": 0.12100762873888016, + "rewards/bleu_reward_func/std": 0.09315716475248337, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 360.34375, + "completions/mean_terminated_length": 226.5294189453125, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.6568, + "grad_norm": 3.169628620147705, + "kl": 0.030731201171875, + "learning_rate": 1e-06, + "loss": -0.1334, + "num_tokens": 10706846.0, + "reward": 0.041019197553396225, + "reward_std": 0.012767975218594074, + "rewards/bleu_reward_func/mean": 0.041019197553396225, + "rewards/bleu_reward_func/std": 0.050586286932229996, + "step": 821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 264.46875, + "completions/mean_terminated_length": 46.05882263183594, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.6576, + "grad_norm": 6.150150775909424, + "kl": 0.174896240234375, + "learning_rate": 1e-06, + "loss": 0.0093, + "num_tokens": 10720189.0, + "reward": 0.10611159354448318, + "reward_std": 0.044405680149793625, + "rewards/bleu_reward_func/mean": 0.10611159354448318, + "rewards/bleu_reward_func/std": 0.10892455279827118, + "step": 822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 266.84375, + "completions/mean_terminated_length": 210.2692413330078, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.6584, + "grad_norm": 6.753479957580566, + "kl": 0.20147705078125, + "learning_rate": 1e-06, + "loss": 0.0273, + "num_tokens": 10731600.0, + "reward": 0.10170187056064606, + "reward_std": 0.03044716641306877, + "rewards/bleu_reward_func/mean": 0.10170187056064606, + "rewards/bleu_reward_func/std": 0.05836126208305359, + "step": 823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 392.28125, + "completions/mean_terminated_length": 299.1666564941406, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.6592, + "grad_norm": 2.5884182453155518, + "kl": 0.0380859375, + "learning_rate": 1e-06, + "loss": -0.0465, + "num_tokens": 10748121.0, + "reward": 0.03844967484474182, + "reward_std": 0.012901275418698788, + "rewards/bleu_reward_func/mean": 0.03844967484474182, + "rewards/bleu_reward_func/std": 0.032823171466588974, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 270.5, + "completions/mean_terminated_length": 214.7692413330078, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.66, + "grad_norm": 4.168792247772217, + "kl": 0.083251953125, + "learning_rate": 1e-06, + "loss": -0.0346, + "num_tokens": 10763353.0, + "reward": 0.08619528263807297, + "reward_std": 0.02499576285481453, + "rewards/bleu_reward_func/mean": 0.08619528263807297, + "rewards/bleu_reward_func/std": 0.10102304071187973, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 264.78125, + "completions/mean_terminated_length": 195.55999755859375, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.6608, + "grad_norm": 10.431943893432617, + "kl": 0.154876708984375, + "learning_rate": 1e-06, + "loss": 0.1006, + "num_tokens": 10776642.0, + "reward": 0.06286956369876862, + "reward_std": 0.027797410264611244, + "rewards/bleu_reward_func/mean": 0.06286956369876862, + "rewards/bleu_reward_func/std": 0.07537111639976501, + "step": 826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 231.1875, + "completions/mean_terminated_length": 202.13792419433594, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.6616, + "grad_norm": 7.486932754516602, + "kl": 0.15045166015625, + "learning_rate": 1e-06, + "loss": 0.2166, + "num_tokens": 10788656.0, + "reward": 0.10479411482810974, + "reward_std": 0.029287472367286682, + "rewards/bleu_reward_func/mean": 0.10479411482810974, + "rewards/bleu_reward_func/std": 0.07098822295665741, + "step": 827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 92.0, + "completions/mean_length": 165.3125, + "completions/mean_terminated_length": 49.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.6624, + "grad_norm": 7.1387104988098145, + "kl": 0.23382568359375, + "learning_rate": 1e-06, + "loss": 0.0727, + "num_tokens": 10795930.0, + "reward": 0.11342652887105942, + "reward_std": 0.027198534458875656, + "rewards/bleu_reward_func/mean": 0.11342652887105942, + "rewards/bleu_reward_func/std": 0.11971734464168549, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 108.96875, + "completions/mean_terminated_length": 108.96875, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.6632, + "grad_norm": 8.27657413482666, + "kl": 0.36053466796875, + "learning_rate": 1e-06, + "loss": -0.0108, + "num_tokens": 10804241.0, + "reward": 0.28853511810302734, + "reward_std": 0.07218953967094421, + "rewards/bleu_reward_func/mean": 0.28853511810302734, + "rewards/bleu_reward_func/std": 0.20515379309654236, + "step": 829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 51.875, + "completions/mean_terminated_length": 51.875, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.664, + "grad_norm": 8.845324516296387, + "kl": 0.4793701171875, + "learning_rate": 1e-06, + "loss": 0.1651, + "num_tokens": 10813957.0, + "reward": 0.2881876826286316, + "reward_std": 0.06279260665178299, + "rewards/bleu_reward_func/mean": 0.2881876826286316, + "rewards/bleu_reward_func/std": 0.23817574977874756, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 268.46875, + "completions/mean_terminated_length": 157.77273559570312, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.6648, + "grad_norm": 5.753482818603516, + "kl": 0.143280029296875, + "learning_rate": 1e-06, + "loss": 0.0154, + "num_tokens": 10826356.0, + "reward": 0.13365639746189117, + "reward_std": 0.023316586390137672, + "rewards/bleu_reward_func/mean": 0.13365639746189117, + "rewards/bleu_reward_func/std": 0.20613247156143188, + "step": 831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 194.125, + "completions/mean_terminated_length": 172.933349609375, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.6656, + "grad_norm": 5.661513805389404, + "kl": 0.106048583984375, + "learning_rate": 1e-06, + "loss": 0.0722, + "num_tokens": 10836128.0, + "reward": 0.08124659210443497, + "reward_std": 0.016117524355649948, + "rewards/bleu_reward_func/mean": 0.08124659210443497, + "rewards/bleu_reward_func/std": 0.08725257217884064, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 26.625, + "completions/mean_terminated_length": 26.625, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.6664, + "grad_norm": 12.99825382232666, + "kl": 0.453857421875, + "learning_rate": 1e-06, + "loss": 0.1057, + "num_tokens": 10842116.0, + "reward": 0.3153986930847168, + "reward_std": 0.0658825933933258, + "rewards/bleu_reward_func/mean": 0.3153986930847168, + "rewards/bleu_reward_func/std": 0.17146961390972137, + "step": 833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 185.5625, + "completions/mean_terminated_length": 94.15999603271484, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.6672, + "grad_norm": 9.406866073608398, + "kl": 0.144378662109375, + "learning_rate": 1e-06, + "loss": 0.2779, + "num_tokens": 10852030.0, + "reward": 0.13975293934345245, + "reward_std": 0.04399016499519348, + "rewards/bleu_reward_func/mean": 0.13975293934345245, + "rewards/bleu_reward_func/std": 0.17490676045417786, + "step": 834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 355.125, + "completions/mean_terminated_length": 198.25, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.668, + "grad_norm": 8.956555366516113, + "kl": 0.220977783203125, + "learning_rate": 1e-06, + "loss": -0.047, + "num_tokens": 10869930.0, + "reward": 0.04670516401529312, + "reward_std": 0.01496485248208046, + "rewards/bleu_reward_func/mean": 0.04670516401529312, + "rewards/bleu_reward_func/std": 0.03720833733677864, + "step": 835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 370.71875, + "completions/mean_terminated_length": 331.1600036621094, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.6688, + "grad_norm": 2.452960729598999, + "kl": 0.048583984375, + "learning_rate": 1e-06, + "loss": -0.0511, + "num_tokens": 10884969.0, + "reward": 0.05606111139059067, + "reward_std": 0.01513909362256527, + "rewards/bleu_reward_func/mean": 0.05606111139059067, + "rewards/bleu_reward_func/std": 0.05016703903675079, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 200.40625, + "completions/mean_terminated_length": 96.54167175292969, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.6696, + "grad_norm": 4.966017723083496, + "kl": 0.11663818359375, + "learning_rate": 1e-06, + "loss": 0.2335, + "num_tokens": 10897982.0, + "reward": 0.0694584771990776, + "reward_std": 0.04167729243636131, + "rewards/bleu_reward_func/mean": 0.0694584771990776, + "rewards/bleu_reward_func/std": 0.06985452026128769, + "step": 837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 150.125, + "completions/mean_terminated_length": 126.00000762939453, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.6704, + "grad_norm": 6.741494178771973, + "kl": 0.11309814453125, + "learning_rate": 1e-06, + "loss": -0.1029, + "num_tokens": 10911978.0, + "reward": 0.2410159707069397, + "reward_std": 0.056731171905994415, + "rewards/bleu_reward_func/mean": 0.2410159707069397, + "rewards/bleu_reward_func/std": 0.20536428689956665, + "step": 838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 198.8125, + "completions/mean_terminated_length": 140.8148193359375, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.6712, + "grad_norm": 5.741243839263916, + "kl": 0.2135009765625, + "learning_rate": 1e-06, + "loss": 0.1006, + "num_tokens": 10925892.0, + "reward": 0.2018284797668457, + "reward_std": 0.04848968982696533, + "rewards/bleu_reward_func/mean": 0.2018284797668457, + "rewards/bleu_reward_func/std": 0.19715876877307892, + "step": 839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 158.5, + "completions/mean_terminated_length": 158.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.672, + "grad_norm": 6.41207218170166, + "kl": 0.09161376953125, + "learning_rate": 1e-06, + "loss": -0.0922, + "num_tokens": 10934300.0, + "reward": 0.0522538498044014, + "reward_std": 0.021779239177703857, + "rewards/bleu_reward_func/mean": 0.0522538498044014, + "rewards/bleu_reward_func/std": 0.02408943697810173, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 389.8125, + "completions/mean_terminated_length": 316.5, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.6728, + "grad_norm": 2.996119976043701, + "kl": 0.06011962890625, + "learning_rate": 1e-06, + "loss": 0.0296, + "num_tokens": 10951166.0, + "reward": 0.08128196746110916, + "reward_std": 0.01865270733833313, + "rewards/bleu_reward_func/mean": 0.08128196746110916, + "rewards/bleu_reward_func/std": 0.05130209028720856, + "step": 841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 211.25, + "completions/mean_terminated_length": 180.13792419433594, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.6736, + "grad_norm": 4.3674750328063965, + "kl": 0.07989501953125, + "learning_rate": 1e-06, + "loss": 0.2159, + "num_tokens": 10962038.0, + "reward": 0.035381607711315155, + "reward_std": 0.015435540117323399, + "rewards/bleu_reward_func/mean": 0.035381607711315155, + "rewards/bleu_reward_func/std": 0.02227640338242054, + "step": 842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 326.15625, + "completions/mean_terminated_length": 241.68182373046875, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.6744, + "grad_norm": 7.144293308258057, + "kl": 0.1136932373046875, + "learning_rate": 1e-06, + "loss": 0.0113, + "num_tokens": 10980947.0, + "reward": 0.24271616339683533, + "reward_std": 0.03907809406518936, + "rewards/bleu_reward_func/mean": 0.24271616339683533, + "rewards/bleu_reward_func/std": 0.21944448351860046, + "step": 843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 318.6875, + "completions/mean_terminated_length": 274.0769348144531, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.6752, + "grad_norm": 3.7767539024353027, + "kl": 0.080078125, + "learning_rate": 1e-06, + "loss": -0.0971, + "num_tokens": 10993977.0, + "reward": 0.034242644906044006, + "reward_std": 0.01977381855249405, + "rewards/bleu_reward_func/mean": 0.034242644906044006, + "rewards/bleu_reward_func/std": 0.024919696152210236, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 265.375, + "completions/mean_terminated_length": 73.55555725097656, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.676, + "grad_norm": 8.71445083618164, + "kl": 0.34588623046875, + "learning_rate": 1e-06, + "loss": 0.0431, + "num_tokens": 11007565.0, + "reward": 0.23305484652519226, + "reward_std": 0.05401034653186798, + "rewards/bleu_reward_func/mean": 0.23305484652519226, + "rewards/bleu_reward_func/std": 0.2091369926929474, + "step": 845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 363.375, + "completions/mean_terminated_length": 305.2174072265625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.6768, + "grad_norm": 5.406923770904541, + "kl": 0.061279296875, + "learning_rate": 1e-06, + "loss": -0.1012, + "num_tokens": 11023473.0, + "reward": 0.15702804923057556, + "reward_std": 0.02012755163013935, + "rewards/bleu_reward_func/mean": 0.15702804923057556, + "rewards/bleu_reward_func/std": 0.16683605313301086, + "step": 846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 84.125, + "completions/mean_terminated_length": 70.32257843017578, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.6776, + "grad_norm": 7.79757022857666, + "kl": 0.094970703125, + "learning_rate": 1e-06, + "loss": 0.1681, + "num_tokens": 11033277.0, + "reward": 0.10883745551109314, + "reward_std": 0.06399966031312943, + "rewards/bleu_reward_func/mean": 0.10883745551109314, + "rewards/bleu_reward_func/std": 0.11935968697071075, + "step": 847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 282.375, + "completions/mean_terminated_length": 239.8518524169922, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.6784, + "grad_norm": 7.137828350067139, + "kl": 0.18231201171875, + "learning_rate": 1e-06, + "loss": -0.1134, + "num_tokens": 11047977.0, + "reward": 0.07731978595256805, + "reward_std": 0.026035165414214134, + "rewards/bleu_reward_func/mean": 0.07731978595256805, + "rewards/bleu_reward_func/std": 0.08138881623744965, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 180.21875, + "completions/mean_terminated_length": 118.77777862548828, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.6792, + "grad_norm": 10.036446571350098, + "kl": 0.2659912109375, + "learning_rate": 1e-06, + "loss": 0.1133, + "num_tokens": 11058320.0, + "reward": 0.13975116610527039, + "reward_std": 0.02090391516685486, + "rewards/bleu_reward_func/mean": 0.13975116610527039, + "rewards/bleu_reward_func/std": 0.15142837166786194, + "step": 849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 369.6875, + "completions/mean_terminated_length": 284.3000183105469, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.68, + "grad_norm": 3.8194141387939453, + "kl": 0.038970947265625, + "learning_rate": 1e-06, + "loss": 0.0524, + "num_tokens": 11073662.0, + "reward": 0.08185985684394836, + "reward_std": 0.033635906875133514, + "rewards/bleu_reward_func/mean": 0.08185985684394836, + "rewards/bleu_reward_func/std": 0.06655923277139664, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 106.5, + "completions/mean_terminated_length": 31.407407760620117, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.6808, + "grad_norm": 6.564956188201904, + "kl": 0.1971435546875, + "learning_rate": 1e-06, + "loss": 0.1071, + "num_tokens": 11085022.0, + "reward": 0.22551283240318298, + "reward_std": 0.04716075211763382, + "rewards/bleu_reward_func/mean": 0.22551283240318298, + "rewards/bleu_reward_func/std": 0.16660061478614807, + "step": 851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 154.375, + "completions/mean_terminated_length": 130.53334045410156, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.6816, + "grad_norm": 15.921568870544434, + "kl": 0.28778076171875, + "learning_rate": 1e-06, + "loss": -0.0597, + "num_tokens": 11095378.0, + "reward": 0.1252121478319168, + "reward_std": 0.05527370423078537, + "rewards/bleu_reward_func/mean": 0.1252121478319168, + "rewards/bleu_reward_func/std": 0.12923383712768555, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 305.9375, + "completions/mean_terminated_length": 212.27273559570312, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.6824, + "grad_norm": 8.1792631149292, + "kl": 0.1992645263671875, + "learning_rate": 1e-06, + "loss": 0.0667, + "num_tokens": 11109616.0, + "reward": 0.0659586489200592, + "reward_std": 0.027510065585374832, + "rewards/bleu_reward_func/mean": 0.0659586489200592, + "rewards/bleu_reward_func/std": 0.08249466121196747, + "step": 853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 341.125, + "completions/mean_terminated_length": 224.2105255126953, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.6832, + "grad_norm": 2.265425443649292, + "kl": 0.039276123046875, + "learning_rate": 1e-06, + "loss": 0.2352, + "num_tokens": 11123036.0, + "reward": 0.10829215496778488, + "reward_std": 0.10021056979894638, + "rewards/bleu_reward_func/mean": 0.10829215496778488, + "rewards/bleu_reward_func/std": 0.15350966155529022, + "step": 854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 398.5625, + "completions/mean_terminated_length": 285.125, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.684, + "grad_norm": 2.628075361251831, + "kl": 0.047515869140625, + "learning_rate": 1e-06, + "loss": 0.1379, + "num_tokens": 11139950.0, + "reward": 0.04525969177484512, + "reward_std": 0.025323685258626938, + "rewards/bleu_reward_func/mean": 0.04525969177484512, + "rewards/bleu_reward_func/std": 0.04984954744577408, + "step": 855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 489.0625, + "completions/mean_terminated_length": 389.66668701171875, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.6848, + "grad_norm": 2.320620059967041, + "kl": 0.035888671875, + "learning_rate": 1e-06, + "loss": -0.0345, + "num_tokens": 11158672.0, + "reward": 0.016138827428221703, + "reward_std": 0.0038068746216595173, + "rewards/bleu_reward_func/mean": 0.016138827428221703, + "rewards/bleu_reward_func/std": 0.016928784549236298, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 317.84375, + "completions/mean_terminated_length": 241.86956787109375, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.6856, + "grad_norm": 3.551910638809204, + "kl": 0.109039306640625, + "learning_rate": 1e-06, + "loss": 0.0289, + "num_tokens": 11173291.0, + "reward": 0.20694701373577118, + "reward_std": 0.014496378600597382, + "rewards/bleu_reward_func/mean": 0.20694701373577118, + "rewards/bleu_reward_func/std": 0.2963625490665436, + "step": 857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 147.125, + "completions/mean_terminated_length": 62.92308044433594, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.6864, + "grad_norm": 15.02341079711914, + "kl": 0.70330810546875, + "learning_rate": 1e-06, + "loss": 0.2886, + "num_tokens": 11182215.0, + "reward": 0.19951725006103516, + "reward_std": 0.052443791180849075, + "rewards/bleu_reward_func/mean": 0.19951725006103516, + "rewards/bleu_reward_func/std": 0.19433696568012238, + "step": 858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 284.125, + "completions/mean_terminated_length": 128.2105255126953, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.6872, + "grad_norm": 8.070085525512695, + "kl": 0.110443115234375, + "learning_rate": 1e-06, + "loss": 0.1827, + "num_tokens": 11194667.0, + "reward": 0.04423338174819946, + "reward_std": 0.017294086515903473, + "rewards/bleu_reward_func/mean": 0.04423338174819946, + "rewards/bleu_reward_func/std": 0.047055598348379135, + "step": 859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 329.71875, + "completions/mean_terminated_length": 287.65386962890625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.688, + "grad_norm": 2.8106324672698975, + "kl": 0.039154052734375, + "learning_rate": 1e-06, + "loss": -0.2058, + "num_tokens": 11207642.0, + "reward": 0.06786108016967773, + "reward_std": 0.0352618470788002, + "rewards/bleu_reward_func/mean": 0.06786108016967773, + "rewards/bleu_reward_func/std": 0.04090343415737152, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 368.53125, + "completions/mean_terminated_length": 320.7083435058594, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.6888, + "grad_norm": 2.837979793548584, + "kl": 0.052001953125, + "learning_rate": 1e-06, + "loss": 0.107, + "num_tokens": 11221547.0, + "reward": 0.07621696591377258, + "reward_std": 0.029543904587626457, + "rewards/bleu_reward_func/mean": 0.07621696591377258, + "rewards/bleu_reward_func/std": 0.04072652757167816, + "step": 861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 398.59375, + "completions/mean_terminated_length": 321.0, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.6896, + "grad_norm": 2.941206693649292, + "kl": 0.046844482421875, + "learning_rate": 1e-06, + "loss": -0.0139, + "num_tokens": 11237182.0, + "reward": 0.026391834020614624, + "reward_std": 0.016949903219938278, + "rewards/bleu_reward_func/mean": 0.026391834020614624, + "rewards/bleu_reward_func/std": 0.03409172222018242, + "step": 862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 357.40625, + "completions/mean_terminated_length": 251.63157653808594, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.6904, + "grad_norm": 2.633358955383301, + "kl": 0.055328369140625, + "learning_rate": 1e-06, + "loss": -0.0911, + "num_tokens": 11251547.0, + "reward": 0.042053550481796265, + "reward_std": 0.021867552772164345, + "rewards/bleu_reward_func/mean": 0.042053550481796265, + "rewards/bleu_reward_func/std": 0.029616717249155045, + "step": 863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 168.3125, + "completions/mean_terminated_length": 119.21429443359375, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.6912, + "grad_norm": 4.309210777282715, + "kl": 0.13665771484375, + "learning_rate": 1e-06, + "loss": -0.1275, + "num_tokens": 11261245.0, + "reward": 0.1768188774585724, + "reward_std": 0.030298635363578796, + "rewards/bleu_reward_func/mean": 0.1768188774585724, + "rewards/bleu_reward_func/std": 0.12399855256080627, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 252.78125, + "completions/mean_terminated_length": 151.3478240966797, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.692, + "grad_norm": 8.154788970947266, + "kl": 0.23638916015625, + "learning_rate": 1e-06, + "loss": 0.0438, + "num_tokens": 11275478.0, + "reward": 0.07366465032100677, + "reward_std": 0.029438909143209457, + "rewards/bleu_reward_func/mean": 0.07366465032100677, + "rewards/bleu_reward_func/std": 0.05699191242456436, + "step": 865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 322.21875, + "completions/mean_terminated_length": 235.95455932617188, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.6928, + "grad_norm": 2.933178663253784, + "kl": 0.04754638671875, + "learning_rate": 1e-06, + "loss": 0.0639, + "num_tokens": 11287317.0, + "reward": 0.041873324662446976, + "reward_std": 0.02685678005218506, + "rewards/bleu_reward_func/mean": 0.041873324662446976, + "rewards/bleu_reward_func/std": 0.039241958409547806, + "step": 866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 208.1875, + "completions/mean_terminated_length": 89.30435180664062, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.6936, + "grad_norm": 6.312671661376953, + "kl": 0.1842041015625, + "learning_rate": 1e-06, + "loss": 0.1617, + "num_tokens": 11297835.0, + "reward": 0.103369802236557, + "reward_std": 0.04473632201552391, + "rewards/bleu_reward_func/mean": 0.103369802236557, + "rewards/bleu_reward_func/std": 0.10830661654472351, + "step": 867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 274.375, + "completions/mean_terminated_length": 230.37037658691406, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.6944, + "grad_norm": 3.08828067779541, + "kl": 0.090667724609375, + "learning_rate": 1e-06, + "loss": -0.0115, + "num_tokens": 11312311.0, + "reward": 0.16378189623355865, + "reward_std": 0.0222244244068861, + "rewards/bleu_reward_func/mean": 0.16378189623355865, + "rewards/bleu_reward_func/std": 0.19553562998771667, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 216.15625, + "completions/mean_terminated_length": 117.54167175292969, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.6952, + "grad_norm": 5.62147331237793, + "kl": 0.15509033203125, + "learning_rate": 1e-06, + "loss": -0.0272, + "num_tokens": 11325348.0, + "reward": 0.059518001973629, + "reward_std": 0.028110869228839874, + "rewards/bleu_reward_func/mean": 0.059518001973629, + "rewards/bleu_reward_func/std": 0.048489734530448914, + "step": 869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 404.46875, + "completions/mean_terminated_length": 296.9375, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.696, + "grad_norm": 3.3346071243286133, + "kl": 0.038360595703125, + "learning_rate": 1e-06, + "loss": -0.0205, + "num_tokens": 11343635.0, + "reward": 0.07933641970157623, + "reward_std": 0.021958988159894943, + "rewards/bleu_reward_func/mean": 0.07933641970157623, + "rewards/bleu_reward_func/std": 0.06096653267741203, + "step": 870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 236.6875, + "completions/mean_terminated_length": 185.70370483398438, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.6968, + "grad_norm": 4.446465015411377, + "kl": 0.17974853515625, + "learning_rate": 1e-06, + "loss": -0.0515, + "num_tokens": 11353817.0, + "reward": 0.11160654574632645, + "reward_std": 0.039265286177396774, + "rewards/bleu_reward_func/mean": 0.11160654574632645, + "rewards/bleu_reward_func/std": 0.08857923746109009, + "step": 871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 260.125, + "completions/mean_terminated_length": 161.56521606445312, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.6976, + "grad_norm": 7.414572715759277, + "kl": 0.107696533203125, + "learning_rate": 1e-06, + "loss": -0.0123, + "num_tokens": 11367725.0, + "reward": 0.1780683994293213, + "reward_std": 0.015433109365403652, + "rewards/bleu_reward_func/mean": 0.1780683994293213, + "rewards/bleu_reward_func/std": 0.2229662984609604, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 408.0625, + "completions/mean_terminated_length": 360.8182067871094, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.6984, + "grad_norm": 3.227613925933838, + "kl": 0.0611572265625, + "learning_rate": 1e-06, + "loss": -0.0681, + "num_tokens": 11385423.0, + "reward": 0.0366949737071991, + "reward_std": 0.01884927786886692, + "rewards/bleu_reward_func/mean": 0.0366949737071991, + "rewards/bleu_reward_func/std": 0.028229771181941032, + "step": 873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 222.21875, + "completions/mean_terminated_length": 168.55555725097656, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.6992, + "grad_norm": 7.29306697845459, + "kl": 0.142730712890625, + "learning_rate": 1e-06, + "loss": 0.11, + "num_tokens": 11397022.0, + "reward": 0.046667180955410004, + "reward_std": 0.020207617431879044, + "rewards/bleu_reward_func/mean": 0.046667180955410004, + "rewards/bleu_reward_func/std": 0.02555895410478115, + "step": 874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 268.5, + "completions/mean_terminated_length": 173.21739196777344, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.7, + "grad_norm": 3.604617118835449, + "kl": 0.06298828125, + "learning_rate": 1e-06, + "loss": -0.3505, + "num_tokens": 11410046.0, + "reward": 0.07897455990314484, + "reward_std": 0.014880911447107792, + "rewards/bleu_reward_func/mean": 0.07897455990314484, + "rewards/bleu_reward_func/std": 0.08343996107578278, + "step": 875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 325.46875, + "completions/mean_terminated_length": 227.76190185546875, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.7008, + "grad_norm": 8.727375030517578, + "kl": 0.133270263671875, + "learning_rate": 1e-06, + "loss": 0.3117, + "num_tokens": 11422725.0, + "reward": 0.07061035186052322, + "reward_std": 0.0419192910194397, + "rewards/bleu_reward_func/mean": 0.07061035186052322, + "rewards/bleu_reward_func/std": 0.07667659968137741, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 181.375, + "completions/mean_terminated_length": 120.14814758300781, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.7016, + "grad_norm": 7.317707061767578, + "kl": 0.21905517578125, + "learning_rate": 1e-06, + "loss": 0.4234, + "num_tokens": 11431937.0, + "reward": 0.10765747725963593, + "reward_std": 0.052248626947402954, + "rewards/bleu_reward_func/mean": 0.10765747725963593, + "rewards/bleu_reward_func/std": 0.05436404421925545, + "step": 877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 220.75, + "completions/mean_terminated_length": 211.35482788085938, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.7024, + "grad_norm": 4.363486289978027, + "kl": 0.214019775390625, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 11443057.0, + "reward": 0.30547034740448, + "reward_std": 0.024015674367547035, + "rewards/bleu_reward_func/mean": 0.30547034740448, + "rewards/bleu_reward_func/std": 0.2281493991613388, + "step": 878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 205.46875, + "completions/mean_terminated_length": 119.63999938964844, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.7032, + "grad_norm": 8.484012603759766, + "kl": 0.189361572265625, + "learning_rate": 1e-06, + "loss": 0.1579, + "num_tokens": 11451232.0, + "reward": 0.0824245885014534, + "reward_std": 0.04487679526209831, + "rewards/bleu_reward_func/mean": 0.0824245885014534, + "rewards/bleu_reward_func/std": 0.07150331139564514, + "step": 879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 141.375, + "completions/mean_terminated_length": 129.4193572998047, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.704, + "grad_norm": 5.958530902862549, + "kl": 0.1085205078125, + "learning_rate": 1e-06, + "loss": -0.0313, + "num_tokens": 11462604.0, + "reward": 0.04005417972803116, + "reward_std": 0.024934137240052223, + "rewards/bleu_reward_func/mean": 0.04005417972803116, + "rewards/bleu_reward_func/std": 0.03826345130801201, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 283.03125, + "completions/mean_terminated_length": 126.36842346191406, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.7048, + "grad_norm": 6.075742244720459, + "kl": 0.3231201171875, + "learning_rate": 1e-06, + "loss": 0.0444, + "num_tokens": 11477461.0, + "reward": 0.10366402566432953, + "reward_std": 0.055370062589645386, + "rewards/bleu_reward_func/mean": 0.10366402566432953, + "rewards/bleu_reward_func/std": 0.11003145575523376, + "step": 881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 361.0, + "completions/mean_terminated_length": 227.76470947265625, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.7056, + "grad_norm": 5.96100378036499, + "kl": 0.19482421875, + "learning_rate": 1e-06, + "loss": -0.0353, + "num_tokens": 11493365.0, + "reward": 0.1235186904668808, + "reward_std": 0.038026995956897736, + "rewards/bleu_reward_func/mean": 0.1235186904668808, + "rewards/bleu_reward_func/std": 0.05816841870546341, + "step": 882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 172.09375, + "completions/mean_terminated_length": 109.14814758300781, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.7064, + "grad_norm": 6.117469787597656, + "kl": 0.201904296875, + "learning_rate": 1e-06, + "loss": 0.067, + "num_tokens": 11501456.0, + "reward": 0.15373189747333527, + "reward_std": 0.05197744071483612, + "rewards/bleu_reward_func/mean": 0.15373189747333527, + "rewards/bleu_reward_func/std": 0.10633216798305511, + "step": 883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 263.15625, + "completions/mean_terminated_length": 193.47999572753906, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.7072, + "grad_norm": 4.437005519866943, + "kl": 0.12554931640625, + "learning_rate": 1e-06, + "loss": 0.2681, + "num_tokens": 11517437.0, + "reward": 0.29476526379585266, + "reward_std": 0.13803553581237793, + "rewards/bleu_reward_func/mean": 0.29476526379585266, + "rewards/bleu_reward_func/std": 0.32065168023109436, + "step": 884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 252.75, + "completions/mean_terminated_length": 134.90908813476562, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.708, + "grad_norm": 8.795902252197266, + "kl": 0.3934326171875, + "learning_rate": 1e-06, + "loss": -0.0341, + "num_tokens": 11529885.0, + "reward": 0.13262969255447388, + "reward_std": 0.037800293415784836, + "rewards/bleu_reward_func/mean": 0.13262969255447388, + "rewards/bleu_reward_func/std": 0.11564164608716965, + "step": 885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 355.6875, + "completions/mean_terminated_length": 199.375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.7088, + "grad_norm": 5.155429840087891, + "kl": 0.0558319091796875, + "learning_rate": 1e-06, + "loss": 0.0544, + "num_tokens": 11546155.0, + "reward": 0.10861489176750183, + "reward_std": 0.035860445350408554, + "rewards/bleu_reward_func/mean": 0.10861489176750183, + "rewards/bleu_reward_func/std": 0.08613201975822449, + "step": 886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 282.84375, + "completions/mean_terminated_length": 126.0526351928711, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.7096, + "grad_norm": 4.782761096954346, + "kl": 0.133056640625, + "learning_rate": 1e-06, + "loss": 0.0206, + "num_tokens": 11561390.0, + "reward": 0.0671025738120079, + "reward_std": 0.018492672592401505, + "rewards/bleu_reward_func/mean": 0.0671025738120079, + "rewards/bleu_reward_func/std": 0.06450604647397995, + "step": 887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 331.6875, + "completions/mean_terminated_length": 261.13043212890625, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.7104, + "grad_norm": 2.8767964839935303, + "kl": 0.06182861328125, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 11577356.0, + "reward": 0.1093081682920456, + "reward_std": 0.07805053889751434, + "rewards/bleu_reward_func/mean": 0.1093081682920456, + "rewards/bleu_reward_func/std": 0.17048169672489166, + "step": 888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 397.375, + "completions/mean_terminated_length": 359.16668701171875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.7112, + "grad_norm": 2.2424142360687256, + "kl": 0.0458984375, + "learning_rate": 1e-06, + "loss": -0.015, + "num_tokens": 11591432.0, + "reward": 0.06777183711528778, + "reward_std": 0.019787484779953957, + "rewards/bleu_reward_func/mean": 0.06777183711528778, + "rewards/bleu_reward_func/std": 0.041765324771404266, + "step": 889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 292.59375, + "completions/mean_terminated_length": 160.9499969482422, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.712, + "grad_norm": 4.390679359436035, + "kl": 0.0987548828125, + "learning_rate": 1e-06, + "loss": 0.2246, + "num_tokens": 11603515.0, + "reward": 0.06538625806570053, + "reward_std": 0.03718053176999092, + "rewards/bleu_reward_func/mean": 0.06538625806570053, + "rewards/bleu_reward_func/std": 0.0816822499036789, + "step": 890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 188.875, + "completions/mean_terminated_length": 155.44827270507812, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.7128, + "grad_norm": 5.5804290771484375, + "kl": 0.123260498046875, + "learning_rate": 1e-06, + "loss": -0.2822, + "num_tokens": 11612927.0, + "reward": 0.0781329870223999, + "reward_std": 0.049637503921985626, + "rewards/bleu_reward_func/mean": 0.0781329870223999, + "rewards/bleu_reward_func/std": 0.08602513372898102, + "step": 891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 332.96875, + "completions/mean_terminated_length": 282.8399963378906, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.7136, + "grad_norm": 2.8855247497558594, + "kl": 0.0643310546875, + "learning_rate": 1e-06, + "loss": 0.1736, + "num_tokens": 11625662.0, + "reward": 0.03828759491443634, + "reward_std": 0.024871867150068283, + "rewards/bleu_reward_func/mean": 0.03828759491443634, + "rewards/bleu_reward_func/std": 0.03181852772831917, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 322.96875, + "completions/mean_terminated_length": 223.952392578125, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.7144, + "grad_norm": 8.224991798400879, + "kl": 0.19036865234375, + "learning_rate": 1e-06, + "loss": 0.1601, + "num_tokens": 11642029.0, + "reward": 0.03835766017436981, + "reward_std": 0.013130895793437958, + "rewards/bleu_reward_func/mean": 0.03835766017436981, + "rewards/bleu_reward_func/std": 0.024478256702423096, + "step": 893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 205.46875, + "completions/mean_terminated_length": 148.70370483398438, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.7152, + "grad_norm": 6.305431842803955, + "kl": 0.14776611328125, + "learning_rate": 1e-06, + "loss": -0.0135, + "num_tokens": 11657452.0, + "reward": 0.11420266330242157, + "reward_std": 0.04108916223049164, + "rewards/bleu_reward_func/mean": 0.11420266330242157, + "rewards/bleu_reward_func/std": 0.06337518244981766, + "step": 894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 336.71875, + "completions/mean_terminated_length": 200.38888549804688, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.716, + "grad_norm": 4.193768501281738, + "kl": 0.13232421875, + "learning_rate": 1e-06, + "loss": 0.3397, + "num_tokens": 11672971.0, + "reward": 0.07947193086147308, + "reward_std": 0.04811304062604904, + "rewards/bleu_reward_func/mean": 0.07947193086147308, + "rewards/bleu_reward_func/std": 0.10142233967781067, + "step": 895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 123.09375, + "completions/mean_terminated_length": 97.16667175292969, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.7168, + "grad_norm": 9.166439056396484, + "kl": 0.28271484375, + "learning_rate": 1e-06, + "loss": -0.1089, + "num_tokens": 11684334.0, + "reward": 0.27329233288764954, + "reward_std": 0.059711530804634094, + "rewards/bleu_reward_func/mean": 0.27329233288764954, + "rewards/bleu_reward_func/std": 0.1879579871892929, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 246.0, + "completions/mean_terminated_length": 157.33334350585938, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.7176, + "grad_norm": 4.767898082733154, + "kl": 0.112060546875, + "learning_rate": 1e-06, + "loss": 0.0037, + "num_tokens": 11700486.0, + "reward": 0.07844039797782898, + "reward_std": 0.034808725118637085, + "rewards/bleu_reward_func/mean": 0.07844039797782898, + "rewards/bleu_reward_func/std": 0.0884510949254036, + "step": 897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 291.25, + "completions/mean_terminated_length": 140.2105255126953, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.7184, + "grad_norm": 5.6404266357421875, + "kl": 0.126312255859375, + "learning_rate": 1e-06, + "loss": 0.0453, + "num_tokens": 11712438.0, + "reward": 0.07097087800502777, + "reward_std": 0.03667715564370155, + "rewards/bleu_reward_func/mean": 0.07097087800502777, + "rewards/bleu_reward_func/std": 0.08086320012807846, + "step": 898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 264.6875, + "completions/mean_terminated_length": 229.35714721679688, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7192, + "grad_norm": 4.8742499351501465, + "kl": 0.10357666015625, + "learning_rate": 1e-06, + "loss": 0.1854, + "num_tokens": 11725396.0, + "reward": 0.21620362997055054, + "reward_std": 0.07608456909656525, + "rewards/bleu_reward_func/mean": 0.21620362997055054, + "rewards/bleu_reward_func/std": 0.2514094114303589, + "step": 899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 329.46875, + "completions/mean_terminated_length": 268.625, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.72, + "grad_norm": 4.756425857543945, + "kl": 0.10198974609375, + "learning_rate": 1e-06, + "loss": 0.0469, + "num_tokens": 11738307.0, + "reward": 0.053835704922676086, + "reward_std": 0.012895071879029274, + "rewards/bleu_reward_func/mean": 0.053835704922676086, + "rewards/bleu_reward_func/std": 0.03540419042110443, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 401.65625, + "completions/mean_terminated_length": 259.7857360839844, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.7208, + "grad_norm": 3.1605112552642822, + "kl": 0.04974365234375, + "learning_rate": 1e-06, + "loss": 0.2297, + "num_tokens": 11756496.0, + "reward": 0.24388237297534943, + "reward_std": 0.1161736249923706, + "rewards/bleu_reward_func/mean": 0.24388237297534943, + "rewards/bleu_reward_func/std": 0.3413524627685547, + "step": 901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 271.59375, + "completions/mean_terminated_length": 246.72413635253906, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.7216, + "grad_norm": 4.188157081604004, + "kl": 0.1815185546875, + "learning_rate": 1e-06, + "loss": -0.0606, + "num_tokens": 11768691.0, + "reward": 0.12793870270252228, + "reward_std": 0.04022746905684471, + "rewards/bleu_reward_func/mean": 0.12793870270252228, + "rewards/bleu_reward_func/std": 0.15937677025794983, + "step": 902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 282.65625, + "completions/mean_terminated_length": 178.4091033935547, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.7224, + "grad_norm": 4.868112087249756, + "kl": 0.1282958984375, + "learning_rate": 1e-06, + "loss": -0.0395, + "num_tokens": 11780800.0, + "reward": 0.09620735794305801, + "reward_std": 0.021982625126838684, + "rewards/bleu_reward_func/mean": 0.09620735794305801, + "rewards/bleu_reward_func/std": 0.07161340862512589, + "step": 903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 277.09375, + "completions/mean_terminated_length": 222.88462829589844, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.7232, + "grad_norm": 7.519046783447266, + "kl": 0.212158203125, + "learning_rate": 1e-06, + "loss": -0.1487, + "num_tokens": 11795403.0, + "reward": 0.07747071981430054, + "reward_std": 0.03376290947198868, + "rewards/bleu_reward_func/mean": 0.07747071981430054, + "rewards/bleu_reward_func/std": 0.055931881070137024, + "step": 904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 253.4375, + "completions/mean_terminated_length": 167.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.724, + "grad_norm": 8.574625015258789, + "kl": 0.1490478515625, + "learning_rate": 1e-06, + "loss": 0.0058, + "num_tokens": 11806985.0, + "reward": 0.06105317175388336, + "reward_std": 0.019554441794753075, + "rewards/bleu_reward_func/mean": 0.06105317175388336, + "rewards/bleu_reward_func/std": 0.038146011531353, + "step": 905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 72.1875, + "completions/mean_terminated_length": 58.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.7248, + "grad_norm": 10.871319770812988, + "kl": 0.32177734375, + "learning_rate": 1e-06, + "loss": 0.2813, + "num_tokens": 11812335.0, + "reward": 0.09286689758300781, + "reward_std": 0.02634507045149803, + "rewards/bleu_reward_func/mean": 0.09286689758300781, + "rewards/bleu_reward_func/std": 0.04922043904662132, + "step": 906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 239.3125, + "completions/mean_terminated_length": 200.35714721679688, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.7256, + "grad_norm": 23.129505157470703, + "kl": 0.1500244140625, + "learning_rate": 1e-06, + "loss": 0.0482, + "num_tokens": 11828945.0, + "reward": 0.07308115810155869, + "reward_std": 0.014882557094097137, + "rewards/bleu_reward_func/mean": 0.07308115810155869, + "rewards/bleu_reward_func/std": 0.08316269516944885, + "step": 907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 429.8125, + "completions/mean_terminated_length": 292.8333435058594, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.7264, + "grad_norm": 11.226503372192383, + "kl": 0.059967041015625, + "learning_rate": 1e-06, + "loss": 0.0269, + "num_tokens": 11848283.0, + "reward": 0.04945487529039383, + "reward_std": 0.016689039766788483, + "rewards/bleu_reward_func/mean": 0.04945487529039383, + "rewards/bleu_reward_func/std": 0.04881744086742401, + "step": 908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 225.09375, + "completions/mean_terminated_length": 129.45834350585938, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.7272, + "grad_norm": 8.629831314086914, + "kl": 0.2176055908203125, + "learning_rate": 1e-06, + "loss": 0.5547, + "num_tokens": 11858854.0, + "reward": 0.14837728440761566, + "reward_std": 0.06372867524623871, + "rewards/bleu_reward_func/mean": 0.14837728440761566, + "rewards/bleu_reward_func/std": 0.18777750432491302, + "step": 909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 74.25, + "completions/mean_terminated_length": 74.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.728, + "grad_norm": 13.281477928161621, + "kl": 0.438720703125, + "learning_rate": 1e-06, + "loss": 0.0534, + "num_tokens": 11869046.0, + "reward": 0.20467601716518402, + "reward_std": 0.04131526127457619, + "rewards/bleu_reward_func/mean": 0.20467601716518402, + "rewards/bleu_reward_func/std": 0.14035604894161224, + "step": 910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 341.625, + "completions/mean_terminated_length": 148.53334045410156, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.7288, + "grad_norm": 6.93421745300293, + "kl": 0.177642822265625, + "learning_rate": 1e-06, + "loss": 0.0368, + "num_tokens": 11883586.0, + "reward": 0.18407613039016724, + "reward_std": 0.020998071879148483, + "rewards/bleu_reward_func/mean": 0.18407613039016724, + "rewards/bleu_reward_func/std": 0.2021336704492569, + "step": 911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 222.1875, + "completions/mean_terminated_length": 168.51852416992188, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.7296, + "grad_norm": 4.430690765380859, + "kl": 0.18560791015625, + "learning_rate": 1e-06, + "loss": 0.1092, + "num_tokens": 11894448.0, + "reward": 0.1439959555864334, + "reward_std": 0.04086273908615112, + "rewards/bleu_reward_func/mean": 0.1439959555864334, + "rewards/bleu_reward_func/std": 0.1705217957496643, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 397.25, + "completions/mean_terminated_length": 345.0909118652344, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.7304, + "grad_norm": 2.7031519412994385, + "kl": 0.05963134765625, + "learning_rate": 1e-06, + "loss": 0.0584, + "num_tokens": 11909408.0, + "reward": 0.09079495072364807, + "reward_std": 0.021243298426270485, + "rewards/bleu_reward_func/mean": 0.09079495072364807, + "rewards/bleu_reward_func/std": 0.1052529513835907, + "step": 913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 282.9375, + "completions/mean_terminated_length": 126.21052551269531, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.7312, + "grad_norm": 6.302535057067871, + "kl": 0.17462158203125, + "learning_rate": 1e-06, + "loss": -0.1643, + "num_tokens": 11923182.0, + "reward": 0.10389965772628784, + "reward_std": 0.03838275372982025, + "rewards/bleu_reward_func/mean": 0.10389965772628784, + "rewards/bleu_reward_func/std": 0.10838860273361206, + "step": 914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 411.4375, + "completions/mean_terminated_length": 282.14288330078125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.732, + "grad_norm": 2.603992223739624, + "kl": 0.052398681640625, + "learning_rate": 1e-06, + "loss": -0.0025, + "num_tokens": 11939596.0, + "reward": 0.06734529137611389, + "reward_std": 0.0207513514906168, + "rewards/bleu_reward_func/mean": 0.06734529137611389, + "rewards/bleu_reward_func/std": 0.05821956321597099, + "step": 915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 111.90625, + "completions/mean_terminated_length": 111.90625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.7328, + "grad_norm": 15.395035743713379, + "kl": 0.2227783203125, + "learning_rate": 1e-06, + "loss": 0.3837, + "num_tokens": 11947113.0, + "reward": 0.24101027846336365, + "reward_std": 0.07465855032205582, + "rewards/bleu_reward_func/mean": 0.24101027846336365, + "rewards/bleu_reward_func/std": 0.17581383883953094, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 312.90625, + "completions/mean_terminated_length": 176.68421936035156, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.7336, + "grad_norm": 8.670394897460938, + "kl": 0.054595947265625, + "learning_rate": 1e-06, + "loss": -0.1736, + "num_tokens": 11965846.0, + "reward": 0.2349693477153778, + "reward_std": 0.042255695909261703, + "rewards/bleu_reward_func/mean": 0.2349693477153778, + "rewards/bleu_reward_func/std": 0.37363162636756897, + "step": 917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 384.34375, + "completions/mean_terminated_length": 317.4761962890625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.7344, + "grad_norm": 2.5599606037139893, + "kl": 0.06549072265625, + "learning_rate": 1e-06, + "loss": -0.0089, + "num_tokens": 11980945.0, + "reward": 0.061901748180389404, + "reward_std": 0.02856561914086342, + "rewards/bleu_reward_func/mean": 0.061901748180389404, + "rewards/bleu_reward_func/std": 0.04196527600288391, + "step": 918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 203.9375, + "completions/mean_terminated_length": 146.88888549804688, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.7352, + "grad_norm": 4.163308143615723, + "kl": 0.08990478515625, + "learning_rate": 1e-06, + "loss": -0.0627, + "num_tokens": 11994255.0, + "reward": 0.06285493075847626, + "reward_std": 0.027241935953497887, + "rewards/bleu_reward_func/mean": 0.06285493075847626, + "rewards/bleu_reward_func/std": 0.03245123475790024, + "step": 919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 445.90625, + "completions/mean_terminated_length": 406.25, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.736, + "grad_norm": 2.6394991874694824, + "kl": 0.0596923828125, + "learning_rate": 1e-06, + "loss": -0.0646, + "num_tokens": 12010980.0, + "reward": 0.05676237493753433, + "reward_std": 0.014085400849580765, + "rewards/bleu_reward_func/mean": 0.05676237493753433, + "rewards/bleu_reward_func/std": 0.03611414507031441, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 106.0625, + "completions/mean_terminated_length": 92.96774291992188, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.7368, + "grad_norm": 8.88719654083252, + "kl": 0.19818115234375, + "learning_rate": 1e-06, + "loss": -0.1221, + "num_tokens": 12017206.0, + "reward": 0.08727812767028809, + "reward_std": 0.05162365734577179, + "rewards/bleu_reward_func/mean": 0.08727812767028809, + "rewards/bleu_reward_func/std": 0.07182831317186356, + "step": 921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 195.25, + "completions/mean_terminated_length": 162.48275756835938, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.7376, + "grad_norm": 7.483645439147949, + "kl": 0.206787109375, + "learning_rate": 1e-06, + "loss": -0.0919, + "num_tokens": 12028214.0, + "reward": 0.18070882558822632, + "reward_std": 0.04944847524166107, + "rewards/bleu_reward_func/mean": 0.18070882558822632, + "rewards/bleu_reward_func/std": 0.19004972279071808, + "step": 922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 285.75, + "completions/mean_terminated_length": 130.94737243652344, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.7384, + "grad_norm": 4.784849643707275, + "kl": 0.07586669921875, + "learning_rate": 1e-06, + "loss": 0.131, + "num_tokens": 12042278.0, + "reward": 0.05333679914474487, + "reward_std": 0.03152618184685707, + "rewards/bleu_reward_func/mean": 0.05333679914474487, + "rewards/bleu_reward_func/std": 0.055619917809963226, + "step": 923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 300.0625, + "completions/mean_terminated_length": 27.571430206298828, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.7392, + "grad_norm": 5.440861701965332, + "kl": 0.145477294921875, + "learning_rate": 1e-06, + "loss": 0.0245, + "num_tokens": 12056416.0, + "reward": 0.14792697131633759, + "reward_std": 0.02701294980943203, + "rewards/bleu_reward_func/mean": 0.14792697131633759, + "rewards/bleu_reward_func/std": 0.15142259001731873, + "step": 924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 286.15625, + "completions/mean_terminated_length": 271.1000061035156, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.74, + "grad_norm": 3.949329137802124, + "kl": 0.085693359375, + "learning_rate": 1e-06, + "loss": 0.051, + "num_tokens": 12069725.0, + "reward": 0.07858790457248688, + "reward_std": 0.02233020029962063, + "rewards/bleu_reward_func/mean": 0.07858790457248688, + "rewards/bleu_reward_func/std": 0.07242675125598907, + "step": 925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 128.75, + "completions/mean_terminated_length": 89.10344696044922, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.7408, + "grad_norm": 11.58963394165039, + "kl": 0.43695068359375, + "learning_rate": 1e-06, + "loss": 0.0795, + "num_tokens": 12080085.0, + "reward": 0.11042273789644241, + "reward_std": 0.017381731420755386, + "rewards/bleu_reward_func/mean": 0.11042273789644241, + "rewards/bleu_reward_func/std": 0.04870026186108589, + "step": 926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 295.40625, + "completions/mean_terminated_length": 126.94444274902344, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.7416, + "grad_norm": 9.117962837219238, + "kl": 0.22723388671875, + "learning_rate": 1e-06, + "loss": -0.4999, + "num_tokens": 12092682.0, + "reward": 0.03860364854335785, + "reward_std": 0.019805099815130234, + "rewards/bleu_reward_func/mean": 0.03860364854335785, + "rewards/bleu_reward_func/std": 0.024968957528471947, + "step": 927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 297.71875, + "completions/mean_terminated_length": 200.3181915283203, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.7424, + "grad_norm": 3.2410967350006104, + "kl": 0.1171875, + "learning_rate": 1e-06, + "loss": -0.0166, + "num_tokens": 12107465.0, + "reward": 0.16988132894039154, + "reward_std": 0.03467182815074921, + "rewards/bleu_reward_func/mean": 0.16988132894039154, + "rewards/bleu_reward_func/std": 0.1373591423034668, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 355.4375, + "completions/mean_terminated_length": 233.6666717529297, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.7432, + "grad_norm": 14.617587089538574, + "kl": 0.15447998046875, + "learning_rate": 1e-06, + "loss": 0.0284, + "num_tokens": 12121815.0, + "reward": 0.14590570330619812, + "reward_std": 0.026923291385173798, + "rewards/bleu_reward_func/mean": 0.14590570330619812, + "rewards/bleu_reward_func/std": 0.2141415923833847, + "step": 929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 262.78125, + "completions/mean_terminated_length": 227.1785888671875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.744, + "grad_norm": 3.273005962371826, + "kl": 0.1136474609375, + "learning_rate": 1e-06, + "loss": -0.2043, + "num_tokens": 12136200.0, + "reward": 0.08558979630470276, + "reward_std": 0.03035646863281727, + "rewards/bleu_reward_func/mean": 0.08558979630470276, + "rewards/bleu_reward_func/std": 0.0643271803855896, + "step": 930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 285.78125, + "completions/mean_terminated_length": 150.0500030517578, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.7448, + "grad_norm": 6.722025394439697, + "kl": 0.117340087890625, + "learning_rate": 1e-06, + "loss": 0.0744, + "num_tokens": 12147369.0, + "reward": 0.071571946144104, + "reward_std": 0.020615192130208015, + "rewards/bleu_reward_func/mean": 0.071571946144104, + "rewards/bleu_reward_func/std": 0.06541716307401657, + "step": 931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 391.03125, + "completions/mean_terminated_length": 357.1600036621094, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.7456, + "grad_norm": 2.371354103088379, + "kl": 0.039031982421875, + "learning_rate": 1e-06, + "loss": -0.0517, + "num_tokens": 12165498.0, + "reward": 0.07511453330516815, + "reward_std": 0.020994337275624275, + "rewards/bleu_reward_func/mean": 0.07511453330516815, + "rewards/bleu_reward_func/std": 0.043336208909749985, + "step": 932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 321.59375, + "completions/mean_terminated_length": 235.0454559326172, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.7464, + "grad_norm": 2.6573996543884277, + "kl": 0.0595703125, + "learning_rate": 1e-06, + "loss": 0.2007, + "num_tokens": 12178413.0, + "reward": 0.07766060531139374, + "reward_std": 0.030490310862660408, + "rewards/bleu_reward_func/mean": 0.07766060531139374, + "rewards/bleu_reward_func/std": 0.05291305482387543, + "step": 933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 389.9375, + "completions/mean_terminated_length": 326.0, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.7472, + "grad_norm": 2.6120660305023193, + "kl": 0.0733642578125, + "learning_rate": 1e-06, + "loss": 0.0789, + "num_tokens": 12194507.0, + "reward": 0.10922634601593018, + "reward_std": 0.0321655347943306, + "rewards/bleu_reward_func/mean": 0.10922634601593018, + "rewards/bleu_reward_func/std": 0.10983148962259293, + "step": 934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 228.21875, + "completions/mean_terminated_length": 198.86207580566406, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.748, + "grad_norm": 7.4974236488342285, + "kl": 0.24505615234375, + "learning_rate": 1e-06, + "loss": -0.0333, + "num_tokens": 12206842.0, + "reward": 0.16729718446731567, + "reward_std": 0.050741568207740784, + "rewards/bleu_reward_func/mean": 0.16729718446731567, + "rewards/bleu_reward_func/std": 0.2129126340150833, + "step": 935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 248.5625, + "completions/mean_terminated_length": 90.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.7488, + "grad_norm": 8.694432258605957, + "kl": 0.124114990234375, + "learning_rate": 1e-06, + "loss": 0.0072, + "num_tokens": 12216596.0, + "reward": 0.10160160809755325, + "reward_std": 0.03439757227897644, + "rewards/bleu_reward_func/mean": 0.10160160809755325, + "rewards/bleu_reward_func/std": 0.06317181140184402, + "step": 936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 173.75, + "completions/mean_terminated_length": 79.04000091552734, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.7496, + "grad_norm": 7.92829704284668, + "kl": 0.2811279296875, + "learning_rate": 1e-06, + "loss": -0.2142, + "num_tokens": 12226292.0, + "reward": 0.11500123143196106, + "reward_std": 0.030234824866056442, + "rewards/bleu_reward_func/mean": 0.11500123143196106, + "rewards/bleu_reward_func/std": 0.12273158878087997, + "step": 937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 234.875, + "completions/mean_terminated_length": 170.92308044433594, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.7504, + "grad_norm": 8.183011054992676, + "kl": 0.261383056640625, + "learning_rate": 1e-06, + "loss": 0.017, + "num_tokens": 12238216.0, + "reward": 0.39511436223983765, + "reward_std": 0.1106102392077446, + "rewards/bleu_reward_func/mean": 0.39511436223983765, + "rewards/bleu_reward_func/std": 0.3091021776199341, + "step": 938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 315.6875, + "completions/mean_terminated_length": 163.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.7512, + "grad_norm": 4.409646511077881, + "kl": 0.092132568359375, + "learning_rate": 1e-06, + "loss": -0.0114, + "num_tokens": 12254990.0, + "reward": 0.1955508440732956, + "reward_std": 0.016137830913066864, + "rewards/bleu_reward_func/mean": 0.1955508440732956, + "rewards/bleu_reward_func/std": 0.26973703503608704, + "step": 939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 276.875, + "completions/mean_terminated_length": 69.4117660522461, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.752, + "grad_norm": 7.45650577545166, + "kl": 0.2735595703125, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 12268242.0, + "reward": 0.07165145874023438, + "reward_std": 0.020489612594246864, + "rewards/bleu_reward_func/mean": 0.07165145874023438, + "rewards/bleu_reward_func/std": 0.04259462654590607, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 150.8125, + "completions/mean_terminated_length": 113.44827270507812, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.7528, + "grad_norm": 10.180867195129395, + "kl": 0.4688720703125, + "learning_rate": 1e-06, + "loss": 0.122, + "num_tokens": 12276916.0, + "reward": 0.15257704257965088, + "reward_std": 0.051439568400382996, + "rewards/bleu_reward_func/mean": 0.15257704257965088, + "rewards/bleu_reward_func/std": 0.11688338220119476, + "step": 941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 176.21739196777344, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.7536, + "grad_norm": 5.519646167755127, + "kl": 0.07073974609375, + "learning_rate": 1e-06, + "loss": 0.437, + "num_tokens": 12291593.0, + "reward": 0.06806058436632156, + "reward_std": 0.05050808936357498, + "rewards/bleu_reward_func/mean": 0.06806058436632156, + "rewards/bleu_reward_func/std": 0.06130353361368179, + "step": 942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 405.625, + "completions/mean_terminated_length": 375.8399963378906, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.7544, + "grad_norm": 1.8502905368804932, + "kl": 0.0356292724609375, + "learning_rate": 1e-06, + "loss": 0.0076, + "num_tokens": 12310893.0, + "reward": 0.21372246742248535, + "reward_std": 0.0709368884563446, + "rewards/bleu_reward_func/mean": 0.21372246742248535, + "rewards/bleu_reward_func/std": 0.1763986349105835, + "step": 943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 245.625, + "completions/mean_terminated_length": 141.3913116455078, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.7552, + "grad_norm": 3.3902478218078613, + "kl": 0.05377197265625, + "learning_rate": 1e-06, + "loss": 0.076, + "num_tokens": 12323369.0, + "reward": 0.0433628112077713, + "reward_std": 0.03261272981762886, + "rewards/bleu_reward_func/mean": 0.0433628112077713, + "rewards/bleu_reward_func/std": 0.0436432845890522, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 270.0625, + "completions/mean_terminated_length": 189.4166717529297, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.756, + "grad_norm": 5.770748615264893, + "kl": 0.088531494140625, + "learning_rate": 1e-06, + "loss": 0.0047, + "num_tokens": 12333803.0, + "reward": 0.10272043943405151, + "reward_std": 0.03215545043349266, + "rewards/bleu_reward_func/mean": 0.10272043943405151, + "rewards/bleu_reward_func/std": 0.11694183200597763, + "step": 945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 224.25, + "completions/mean_terminated_length": 194.48275756835938, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.7568, + "grad_norm": 7.7613067626953125, + "kl": 0.20135498046875, + "learning_rate": 1e-06, + "loss": 0.1591, + "num_tokens": 12344915.0, + "reward": 0.08291373401880264, + "reward_std": 0.024335253983736038, + "rewards/bleu_reward_func/mean": 0.08291373401880264, + "rewards/bleu_reward_func/std": 0.03890189528465271, + "step": 946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 455.6875, + "completions/mean_terminated_length": 373.3846435546875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.7576, + "grad_norm": 2.599454879760742, + "kl": 0.0421142578125, + "learning_rate": 1e-06, + "loss": -0.0235, + "num_tokens": 12362161.0, + "reward": 0.03875226154923439, + "reward_std": 0.020147912204265594, + "rewards/bleu_reward_func/mean": 0.03875226154923439, + "rewards/bleu_reward_func/std": 0.023408547043800354, + "step": 947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 197.875, + "completions/mean_terminated_length": 125.3846206665039, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.7584, + "grad_norm": 6.522151947021484, + "kl": 0.1690673828125, + "learning_rate": 1e-06, + "loss": 0.1959, + "num_tokens": 12375709.0, + "reward": 0.2021377682685852, + "reward_std": 0.0921662300825119, + "rewards/bleu_reward_func/mean": 0.2021377682685852, + "rewards/bleu_reward_func/std": 0.28283461928367615, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 256.21875, + "completions/mean_terminated_length": 122.23809814453125, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.7592, + "grad_norm": 5.671032905578613, + "kl": 0.1173095703125, + "learning_rate": 1e-06, + "loss": -0.0303, + "num_tokens": 12385764.0, + "reward": 0.0564446821808815, + "reward_std": 0.02071106806397438, + "rewards/bleu_reward_func/mean": 0.0564446821808815, + "rewards/bleu_reward_func/std": 0.030088067054748535, + "step": 949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 255.59375, + "completions/mean_terminated_length": 229.0689697265625, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.76, + "grad_norm": 6.897347927093506, + "kl": 0.2220458984375, + "learning_rate": 1e-06, + "loss": 0.1234, + "num_tokens": 12396999.0, + "reward": 0.09963001310825348, + "reward_std": 0.05010713264346123, + "rewards/bleu_reward_func/mean": 0.09963001310825348, + "rewards/bleu_reward_func/std": 0.08052106946706772, + "step": 950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 456.5, + "completions/mean_terminated_length": 418.52630615234375, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.7608, + "grad_norm": 2.444167137145996, + "kl": 0.04534912109375, + "learning_rate": 1e-06, + "loss": -0.0097, + "num_tokens": 12418039.0, + "reward": 0.03447666019201279, + "reward_std": 0.01355208083987236, + "rewards/bleu_reward_func/mean": 0.03447666019201279, + "rewards/bleu_reward_func/std": 0.022434458136558533, + "step": 951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 279.5625, + "completions/mean_terminated_length": 120.52631378173828, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.7616, + "grad_norm": 8.821101188659668, + "kl": 0.091888427734375, + "learning_rate": 1e-06, + "loss": 0.1929, + "num_tokens": 12431177.0, + "reward": 0.11506980657577515, + "reward_std": 0.033062804490327835, + "rewards/bleu_reward_func/mean": 0.11506980657577515, + "rewards/bleu_reward_func/std": 0.0943976491689682, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 269.78125, + "completions/mean_terminated_length": 201.95999145507812, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.7624, + "grad_norm": 6.004775524139404, + "kl": 0.07818603515625, + "learning_rate": 1e-06, + "loss": -0.1255, + "num_tokens": 12445970.0, + "reward": 0.09985020756721497, + "reward_std": 0.0198547150939703, + "rewards/bleu_reward_func/mean": 0.09985020756721497, + "rewards/bleu_reward_func/std": 0.08852815628051758, + "step": 953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 318.8125, + "completions/mean_terminated_length": 231.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.7632, + "grad_norm": 6.8956804275512695, + "kl": 0.175567626953125, + "learning_rate": 1e-06, + "loss": 0.0679, + "num_tokens": 12458004.0, + "reward": 0.1692689061164856, + "reward_std": 0.03958010673522949, + "rewards/bleu_reward_func/mean": 0.1692689061164856, + "rewards/bleu_reward_func/std": 0.13855873048305511, + "step": 954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 250.9375, + "completions/mean_terminated_length": 148.78260803222656, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.764, + "grad_norm": 6.749716758728027, + "kl": 0.293182373046875, + "learning_rate": 1e-06, + "loss": 0.021, + "num_tokens": 12472626.0, + "reward": 0.1224028617143631, + "reward_std": 0.027801956981420517, + "rewards/bleu_reward_func/mean": 0.1224028617143631, + "rewards/bleu_reward_func/std": 0.07426659762859344, + "step": 955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 147.6875, + "completions/mean_terminated_length": 123.40000915527344, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.7648, + "grad_norm": 9.662991523742676, + "kl": 0.46905517578125, + "learning_rate": 1e-06, + "loss": 0.0171, + "num_tokens": 12481928.0, + "reward": 0.23031684756278992, + "reward_std": 0.0920054167509079, + "rewards/bleu_reward_func/mean": 0.23031684756278992, + "rewards/bleu_reward_func/std": 0.16612249612808228, + "step": 956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 258.65625, + "completions/mean_terminated_length": 106.6500015258789, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.7656, + "grad_norm": 10.25383472442627, + "kl": 0.2713623046875, + "learning_rate": 1e-06, + "loss": 0.0254, + "num_tokens": 12493805.0, + "reward": 0.15187731385231018, + "reward_std": 0.025371436029672623, + "rewards/bleu_reward_func/mean": 0.15187731385231018, + "rewards/bleu_reward_func/std": 0.12905065715312958, + "step": 957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 140.0, + "completions/mean_length": 85.40625, + "completions/mean_terminated_length": 71.64515686035156, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.7664, + "grad_norm": 8.22080135345459, + "kl": 0.393310546875, + "learning_rate": 1e-06, + "loss": 0.3247, + "num_tokens": 12502818.0, + "reward": 0.29921823740005493, + "reward_std": 0.11694261431694031, + "rewards/bleu_reward_func/mean": 0.29921823740005493, + "rewards/bleu_reward_func/std": 0.25639036297798157, + "step": 958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 386.09375, + "completions/mean_terminated_length": 260.1875, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.7672, + "grad_norm": 2.847195863723755, + "kl": 0.0467529296875, + "learning_rate": 1e-06, + "loss": 0.0852, + "num_tokens": 12520317.0, + "reward": 0.039544593542814255, + "reward_std": 0.016648683696985245, + "rewards/bleu_reward_func/mean": 0.039544593542814255, + "rewards/bleu_reward_func/std": 0.034897446632385254, + "step": 959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 111.4375, + "completions/mean_terminated_length": 111.4375, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.768, + "grad_norm": 8.353513717651367, + "kl": 0.22430419921875, + "learning_rate": 1e-06, + "loss": 0.2469, + "num_tokens": 12530027.0, + "reward": 0.26215416193008423, + "reward_std": 0.032358862459659576, + "rewards/bleu_reward_func/mean": 0.26215416193008423, + "rewards/bleu_reward_func/std": 0.22925570607185364, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 456.6875, + "completions/mean_terminated_length": 315.3333435058594, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.7688, + "grad_norm": 2.2755558490753174, + "kl": 0.033355712890625, + "learning_rate": 1e-06, + "loss": 0.1802, + "num_tokens": 12548593.0, + "reward": 0.02548890933394432, + "reward_std": 0.01250866986811161, + "rewards/bleu_reward_func/mean": 0.02548890933394432, + "rewards/bleu_reward_func/std": 0.0143959391862154, + "step": 961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 359.28125, + "completions/mean_terminated_length": 206.5625, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.7696, + "grad_norm": 5.7301130294799805, + "kl": 0.165802001953125, + "learning_rate": 1e-06, + "loss": 0.0633, + "num_tokens": 12566850.0, + "reward": 0.09463217854499817, + "reward_std": 0.021320462226867676, + "rewards/bleu_reward_func/mean": 0.09463217854499817, + "rewards/bleu_reward_func/std": 0.10299301147460938, + "step": 962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 388.125, + "completions/mean_terminated_length": 313.8000183105469, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.7704, + "grad_norm": 2.5917232036590576, + "kl": 0.07171630859375, + "learning_rate": 1e-06, + "loss": 0.1643, + "num_tokens": 12581438.0, + "reward": 0.06743638217449188, + "reward_std": 0.041416820138692856, + "rewards/bleu_reward_func/mean": 0.06743638217449188, + "rewards/bleu_reward_func/std": 0.0745474174618721, + "step": 963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 173.53125, + "completions/mean_terminated_length": 162.61289978027344, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.7712, + "grad_norm": 5.022093772888184, + "kl": 0.110015869140625, + "learning_rate": 1e-06, + "loss": 0.1948, + "num_tokens": 12593087.0, + "reward": 0.14474597573280334, + "reward_std": 0.039374105632305145, + "rewards/bleu_reward_func/mean": 0.14474597573280334, + "rewards/bleu_reward_func/std": 0.0781283900141716, + "step": 964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 129.53125, + "completions/mean_terminated_length": 129.53125, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.772, + "grad_norm": 5.708756446838379, + "kl": 0.185546875, + "learning_rate": 1e-06, + "loss": -0.112, + "num_tokens": 12604360.0, + "reward": 0.17942924797534943, + "reward_std": 0.04769964888691902, + "rewards/bleu_reward_func/mean": 0.17942924797534943, + "rewards/bleu_reward_func/std": 0.20441435277462006, + "step": 965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 143.15625, + "completions/mean_terminated_length": 39.87999725341797, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.7728, + "grad_norm": 10.111825942993164, + "kl": 0.39947509765625, + "learning_rate": 1e-06, + "loss": -0.0058, + "num_tokens": 12613941.0, + "reward": 0.09816907346248627, + "reward_std": 0.009084422141313553, + "rewards/bleu_reward_func/mean": 0.09816907346248627, + "rewards/bleu_reward_func/std": 0.06435907632112503, + "step": 966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 348.875, + "completions/mean_terminated_length": 110.46154022216797, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.7736, + "grad_norm": 4.259431838989258, + "kl": 0.144195556640625, + "learning_rate": 1e-06, + "loss": 0.1364, + "num_tokens": 12628633.0, + "reward": 0.11497487127780914, + "reward_std": 0.0397370383143425, + "rewards/bleu_reward_func/mean": 0.11497487127780914, + "rewards/bleu_reward_func/std": 0.08479689061641693, + "step": 967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 319.125, + "completions/mean_terminated_length": 100.53334045410156, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7744, + "grad_norm": 5.853184700012207, + "kl": 0.2406005859375, + "learning_rate": 1e-06, + "loss": 0.0855, + "num_tokens": 12642293.0, + "reward": 0.1797194480895996, + "reward_std": 0.06623274832963943, + "rewards/bleu_reward_func/mean": 0.1797194480895996, + "rewards/bleu_reward_func/std": 0.20514002442359924, + "step": 968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 276.65625, + "completions/mean_terminated_length": 198.20834350585938, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.7752, + "grad_norm": 3.861785650253296, + "kl": 0.048126220703125, + "learning_rate": 1e-06, + "loss": 0.3173, + "num_tokens": 12653674.0, + "reward": 0.06044634059071541, + "reward_std": 0.02236868627369404, + "rewards/bleu_reward_func/mean": 0.06044634059071541, + "rewards/bleu_reward_func/std": 0.058306269347667694, + "step": 969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 364.4375, + "completions/mean_terminated_length": 287.1428527832031, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.776, + "grad_norm": 2.8329427242279053, + "kl": 0.037445068359375, + "learning_rate": 1e-06, + "loss": 0.1258, + "num_tokens": 12668128.0, + "reward": 0.11927121132612228, + "reward_std": 0.0374884158372879, + "rewards/bleu_reward_func/mean": 0.11927121132612228, + "rewards/bleu_reward_func/std": 0.10864724963903427, + "step": 970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 247.3125, + "completions/mean_terminated_length": 198.29629516601562, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.7768, + "grad_norm": 14.522860527038574, + "kl": 0.3580322265625, + "learning_rate": 1e-06, + "loss": 0.327, + "num_tokens": 12680266.0, + "reward": 0.11901617795228958, + "reward_std": 0.06829790771007538, + "rewards/bleu_reward_func/mean": 0.11901617795228958, + "rewards/bleu_reward_func/std": 0.0924401804804802, + "step": 971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 83.05555725097656, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.7776, + "grad_norm": 5.036475658416748, + "kl": 0.187713623046875, + "learning_rate": 1e-06, + "loss": 0.1832, + "num_tokens": 12692713.0, + "reward": 0.13872118294239044, + "reward_std": 0.0687481164932251, + "rewards/bleu_reward_func/mean": 0.13872118294239044, + "rewards/bleu_reward_func/std": 0.14044460654258728, + "step": 972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 244.90625, + "completions/mean_terminated_length": 155.875, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.7784, + "grad_norm": 7.775391578674316, + "kl": 0.136444091796875, + "learning_rate": 1e-06, + "loss": 0.0436, + "num_tokens": 12706702.0, + "reward": 0.11104710400104523, + "reward_std": 0.03642675280570984, + "rewards/bleu_reward_func/mean": 0.11104710400104523, + "rewards/bleu_reward_func/std": 0.09262983500957489, + "step": 973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 293.96875, + "completions/mean_terminated_length": 194.8636474609375, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.7792, + "grad_norm": 36.50672149658203, + "kl": 1.153228759765625, + "learning_rate": 1e-06, + "loss": 0.1761, + "num_tokens": 12721461.0, + "reward": 0.2013707458972931, + "reward_std": 0.10103052109479904, + "rewards/bleu_reward_func/mean": 0.2013707458972931, + "rewards/bleu_reward_func/std": 0.16323500871658325, + "step": 974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 164.5, + "completions/mean_terminated_length": 128.55172729492188, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.78, + "grad_norm": 9.472455024719238, + "kl": 0.2548828125, + "learning_rate": 1e-06, + "loss": 0.6259, + "num_tokens": 12733317.0, + "reward": 0.07841520756483078, + "reward_std": 0.029445767402648926, + "rewards/bleu_reward_func/mean": 0.07841520756483078, + "rewards/bleu_reward_func/std": 0.0620783306658268, + "step": 975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 247.09375, + "completions/mean_terminated_length": 229.433349609375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.7808, + "grad_norm": 4.399056911468506, + "kl": 0.10150146484375, + "learning_rate": 1e-06, + "loss": 0.0484, + "num_tokens": 12746712.0, + "reward": 0.20041930675506592, + "reward_std": 0.03039298765361309, + "rewards/bleu_reward_func/mean": 0.20041930675506592, + "rewards/bleu_reward_func/std": 0.2551174759864807, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 382.375, + "completions/mean_terminated_length": 293.6842041015625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.7816, + "grad_norm": 2.5359840393066406, + "kl": 0.05230712890625, + "learning_rate": 1e-06, + "loss": 0.0699, + "num_tokens": 12762020.0, + "reward": 0.047146447002887726, + "reward_std": 0.022996241226792336, + "rewards/bleu_reward_func/mean": 0.047146447002887726, + "rewards/bleu_reward_func/std": 0.04002131521701813, + "step": 977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 329.90625, + "completions/mean_terminated_length": 234.52381896972656, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.7824, + "grad_norm": 6.270121097564697, + "kl": 0.1246337890625, + "learning_rate": 1e-06, + "loss": -0.0807, + "num_tokens": 12778233.0, + "reward": 0.1260184496641159, + "reward_std": 0.029545176774263382, + "rewards/bleu_reward_func/mean": 0.1260184496641159, + "rewards/bleu_reward_func/std": 0.12758195400238037, + "step": 978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 163.6875, + "completions/mean_terminated_length": 127.6551742553711, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.7832, + "grad_norm": 11.418375015258789, + "kl": 0.1888427734375, + "learning_rate": 1e-06, + "loss": 0.4229, + "num_tokens": 12788943.0, + "reward": 0.22336477041244507, + "reward_std": 0.0984843298792839, + "rewards/bleu_reward_func/mean": 0.22336477041244507, + "rewards/bleu_reward_func/std": 0.1921825110912323, + "step": 979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 141.6875, + "completions/mean_terminated_length": 117.00000762939453, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.784, + "grad_norm": 7.894710540771484, + "kl": 0.4378662109375, + "learning_rate": 1e-06, + "loss": 0.2326, + "num_tokens": 12797693.0, + "reward": 0.1561349630355835, + "reward_std": 0.05494026839733124, + "rewards/bleu_reward_func/mean": 0.1561349630355835, + "rewards/bleu_reward_func/std": 0.09167517721652985, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 255.34375, + "completions/mean_terminated_length": 169.7916717529297, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.7848, + "grad_norm": 19.572107315063477, + "kl": 0.288604736328125, + "learning_rate": 1e-06, + "loss": 0.151, + "num_tokens": 12812768.0, + "reward": 0.06323938816785812, + "reward_std": 0.017744949087500572, + "rewards/bleu_reward_func/mean": 0.06323938816785812, + "rewards/bleu_reward_func/std": 0.07885830849409103, + "step": 981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 281.65625, + "completions/mean_terminated_length": 176.9545440673828, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.7856, + "grad_norm": 8.172053337097168, + "kl": 0.24444580078125, + "learning_rate": 1e-06, + "loss": 0.21, + "num_tokens": 12829453.0, + "reward": 0.0720784068107605, + "reward_std": 0.03868547081947327, + "rewards/bleu_reward_func/mean": 0.0720784068107605, + "rewards/bleu_reward_func/std": 0.05159585550427437, + "step": 982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 235.84375, + "completions/mean_terminated_length": 110.31818389892578, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.7864, + "grad_norm": 8.912215232849121, + "kl": 0.32025146484375, + "learning_rate": 1e-06, + "loss": 0.12, + "num_tokens": 12843376.0, + "reward": 0.1997315138578415, + "reward_std": 0.030267415568232536, + "rewards/bleu_reward_func/mean": 0.1997315138578415, + "rewards/bleu_reward_func/std": 0.17783835530281067, + "step": 983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 378.0, + "completions/mean_terminated_length": 317.0909118652344, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.7872, + "grad_norm": 2.6129956245422363, + "kl": 0.036346435546875, + "learning_rate": 1e-06, + "loss": -0.1504, + "num_tokens": 12859368.0, + "reward": 0.07119783759117126, + "reward_std": 0.018479108810424805, + "rewards/bleu_reward_func/mean": 0.07119783759117126, + "rewards/bleu_reward_func/std": 0.06165986508131027, + "step": 984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 220.03125, + "completions/mean_terminated_length": 152.6538543701172, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.788, + "grad_norm": 5.786254405975342, + "kl": 0.15057373046875, + "learning_rate": 1e-06, + "loss": -0.1709, + "num_tokens": 12869665.0, + "reward": 0.14459165930747986, + "reward_std": 0.03573929890990257, + "rewards/bleu_reward_func/mean": 0.14459165930747986, + "rewards/bleu_reward_func/std": 0.13286592066287994, + "step": 985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 357.125, + "completions/mean_terminated_length": 220.47059631347656, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.7888, + "grad_norm": 8.607572555541992, + "kl": 0.24078369140625, + "learning_rate": 1e-06, + "loss": 0.0188, + "num_tokens": 12885501.0, + "reward": 0.09036614745855331, + "reward_std": 0.031877610832452774, + "rewards/bleu_reward_func/mean": 0.09036614745855331, + "rewards/bleu_reward_func/std": 0.05137631297111511, + "step": 986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 381.75, + "completions/mean_terminated_length": 280.4444580078125, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.7896, + "grad_norm": 2.421383857727051, + "kl": 0.046905517578125, + "learning_rate": 1e-06, + "loss": 0.1548, + "num_tokens": 12899517.0, + "reward": 0.06479343771934509, + "reward_std": 0.01870723068714142, + "rewards/bleu_reward_func/mean": 0.06479343771934509, + "rewards/bleu_reward_func/std": 0.039773859083652496, + "step": 987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 272.3125, + "completions/mean_terminated_length": 238.07144165039062, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.7904, + "grad_norm": 3.447153329849243, + "kl": 0.06671142578125, + "learning_rate": 1e-06, + "loss": -0.0649, + "num_tokens": 12910175.0, + "reward": 0.0866774171590805, + "reward_std": 0.07288840413093567, + "rewards/bleu_reward_func/mean": 0.0866774171590805, + "rewards/bleu_reward_func/std": 0.10417941212654114, + "step": 988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 156.625, + "completions/mean_terminated_length": 74.61538696289062, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.7912, + "grad_norm": 11.11733341217041, + "kl": 0.34686279296875, + "learning_rate": 1e-06, + "loss": 0.4111, + "num_tokens": 12920075.0, + "reward": 0.18872088193893433, + "reward_std": 0.05310884118080139, + "rewards/bleu_reward_func/mean": 0.18872088193893433, + "rewards/bleu_reward_func/std": 0.10052233934402466, + "step": 989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 251.15625, + "completions/mean_terminated_length": 94.6500015258789, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.792, + "grad_norm": 4.343538761138916, + "kl": 0.0811767578125, + "learning_rate": 1e-06, + "loss": 0.255, + "num_tokens": 12932672.0, + "reward": 0.2421235740184784, + "reward_std": 0.03650471195578575, + "rewards/bleu_reward_func/mean": 0.2421235740184784, + "rewards/bleu_reward_func/std": 0.35968947410583496, + "step": 990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 365.34375, + "completions/mean_terminated_length": 307.9565124511719, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.7928, + "grad_norm": 4.527655601501465, + "kl": 0.109619140625, + "learning_rate": 1e-06, + "loss": -0.0782, + "num_tokens": 12947603.0, + "reward": 0.05564543977379799, + "reward_std": 0.033515315502882004, + "rewards/bleu_reward_func/mean": 0.05564543977379799, + "rewards/bleu_reward_func/std": 0.03913462907075882, + "step": 991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 128.46875, + "completions/mean_terminated_length": 57.4444465637207, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.7936, + "grad_norm": 9.973651885986328, + "kl": 0.1922607421875, + "learning_rate": 1e-06, + "loss": 0.9373, + "num_tokens": 12958538.0, + "reward": 0.3186902403831482, + "reward_std": 0.10882419347763062, + "rewards/bleu_reward_func/mean": 0.3186902403831482, + "rewards/bleu_reward_func/std": 0.2608534097671509, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 328.375, + "completions/mean_terminated_length": 202.73684692382812, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.7944, + "grad_norm": 7.580811023712158, + "kl": 0.290863037109375, + "learning_rate": 1e-06, + "loss": 0.1547, + "num_tokens": 12975334.0, + "reward": 0.16742870211601257, + "reward_std": 0.03473435714840889, + "rewards/bleu_reward_func/mean": 0.16742870211601257, + "rewards/bleu_reward_func/std": 0.1612749844789505, + "step": 993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 119.6875, + "completions/mean_terminated_length": 47.03703689575195, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.7952, + "grad_norm": 20.384323120117188, + "kl": 0.74737548828125, + "learning_rate": 1e-06, + "loss": -0.0224, + "num_tokens": 12988020.0, + "reward": 0.1509585976600647, + "reward_std": 0.0387745276093483, + "rewards/bleu_reward_func/mean": 0.1509585976600647, + "rewards/bleu_reward_func/std": 0.13122804462909698, + "step": 994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 346.84375, + "completions/mean_terminated_length": 181.6875, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.796, + "grad_norm": 4.161476135253906, + "kl": 0.05682373046875, + "learning_rate": 1e-06, + "loss": -0.1853, + "num_tokens": 13003199.0, + "reward": 0.026108039543032646, + "reward_std": 0.02537854015827179, + "rewards/bleu_reward_func/mean": 0.026108039543032646, + "rewards/bleu_reward_func/std": 0.03443064168095589, + "step": 995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 181.0625, + "completions/mean_terminated_length": 70.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.7968, + "grad_norm": 14.441998481750488, + "kl": 0.575714111328125, + "learning_rate": 1e-06, + "loss": 0.1517, + "num_tokens": 13012889.0, + "reward": 0.10783781111240387, + "reward_std": 0.053533561527729034, + "rewards/bleu_reward_func/mean": 0.10783781111240387, + "rewards/bleu_reward_func/std": 0.09023794531822205, + "step": 996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 371.75, + "completions/mean_terminated_length": 166.7692413330078, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.7976, + "grad_norm": 3.627946615219116, + "kl": 0.089813232421875, + "learning_rate": 1e-06, + "loss": -0.0463, + "num_tokens": 13027321.0, + "reward": 0.07810983061790466, + "reward_std": 0.03820539265871048, + "rewards/bleu_reward_func/mean": 0.07810983061790466, + "rewards/bleu_reward_func/std": 0.07255319505929947, + "step": 997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 299.96875, + "completions/mean_terminated_length": 203.59091186523438, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.7984, + "grad_norm": 5.938675403594971, + "kl": 0.163330078125, + "learning_rate": 1e-06, + "loss": 0.1322, + "num_tokens": 13038888.0, + "reward": 0.0998745784163475, + "reward_std": 0.12165166437625885, + "rewards/bleu_reward_func/mean": 0.0998745784163475, + "rewards/bleu_reward_func/std": 0.2023635059595108, + "step": 998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 299.25, + "completions/mean_terminated_length": 171.60000610351562, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.7992, + "grad_norm": 5.83504581451416, + "kl": 0.288848876953125, + "learning_rate": 1e-06, + "loss": 0.0509, + "num_tokens": 13051904.0, + "reward": 0.10337799787521362, + "reward_std": 0.029087794944643974, + "rewards/bleu_reward_func/mean": 0.10337799787521362, + "rewards/bleu_reward_func/std": 0.07911896705627441, + "step": 999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 345.96875, + "completions/mean_terminated_length": 322.25, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.8, + "grad_norm": 2.916576385498047, + "kl": 0.078857421875, + "learning_rate": 1e-06, + "loss": 0.05, + "num_tokens": 13065431.0, + "reward": 0.05868455022573471, + "reward_std": 0.017369702458381653, + "rewards/bleu_reward_func/mean": 0.05868455022573471, + "rewards/bleu_reward_func/std": 0.04672805219888687, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 266.8125, + "completions/mean_terminated_length": 250.4666748046875, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.8008, + "grad_norm": 5.116244316101074, + "kl": 0.20050048828125, + "learning_rate": 1e-06, + "loss": 0.0918, + "num_tokens": 13079225.0, + "reward": 0.13771796226501465, + "reward_std": 0.04302237555384636, + "rewards/bleu_reward_func/mean": 0.13771796226501465, + "rewards/bleu_reward_func/std": 0.10432249307632446, + "step": 1001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 316.1875, + "completions/mean_terminated_length": 239.56521606445312, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.8016, + "grad_norm": 3.050908088684082, + "kl": 0.0596923828125, + "learning_rate": 1e-06, + "loss": -0.0323, + "num_tokens": 13093775.0, + "reward": 0.07833529263734818, + "reward_std": 0.02821630984544754, + "rewards/bleu_reward_func/mean": 0.07833529263734818, + "rewards/bleu_reward_func/std": 0.06890382617712021, + "step": 1002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 180.0, + "completions/mean_terminated_length": 132.57144165039062, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.8024, + "grad_norm": 6.102024078369141, + "kl": 0.28985595703125, + "learning_rate": 1e-06, + "loss": 0.0137, + "num_tokens": 13104447.0, + "reward": 0.27597150206565857, + "reward_std": 0.04547630250453949, + "rewards/bleu_reward_func/mean": 0.27597150206565857, + "rewards/bleu_reward_func/std": 0.2288428395986557, + "step": 1003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 391.25, + "completions/mean_terminated_length": 254.40000915527344, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.8032, + "grad_norm": 3.2652981281280518, + "kl": 0.084625244140625, + "learning_rate": 1e-06, + "loss": 0.1886, + "num_tokens": 13119655.0, + "reward": 0.13283666968345642, + "reward_std": 0.029899559915065765, + "rewards/bleu_reward_func/mean": 0.13283666968345642, + "rewards/bleu_reward_func/std": 0.1536840796470642, + "step": 1004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 203.8125, + "completions/mean_terminated_length": 183.2666778564453, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.804, + "grad_norm": 6.098532199859619, + "kl": 0.1204833984375, + "learning_rate": 1e-06, + "loss": 0.0735, + "num_tokens": 13128825.0, + "reward": 0.21581590175628662, + "reward_std": 0.10097475349903107, + "rewards/bleu_reward_func/mean": 0.21581590175628662, + "rewards/bleu_reward_func/std": 0.2611050307750702, + "step": 1005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 358.1875, + "completions/mean_terminated_length": 238.55555725097656, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.8048, + "grad_norm": 7.441494464874268, + "kl": 0.191802978515625, + "learning_rate": 1e-06, + "loss": 0.0313, + "num_tokens": 13142175.0, + "reward": 0.20881031453609467, + "reward_std": 0.03906787186861038, + "rewards/bleu_reward_func/mean": 0.20881031453609467, + "rewards/bleu_reward_func/std": 0.17651565372943878, + "step": 1006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 307.65625, + "completions/mean_terminated_length": 260.5, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.8056, + "grad_norm": 3.559936761856079, + "kl": 0.0572509765625, + "learning_rate": 1e-06, + "loss": 0.1028, + "num_tokens": 13153852.0, + "reward": 0.04283101111650467, + "reward_std": 0.04057364910840988, + "rewards/bleu_reward_func/mean": 0.04283101111650467, + "rewards/bleu_reward_func/std": 0.059128936380147934, + "step": 1007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 336.875, + "completions/mean_terminated_length": 245.1428680419922, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.8064, + "grad_norm": 4.735692024230957, + "kl": 0.069091796875, + "learning_rate": 1e-06, + "loss": 0.0972, + "num_tokens": 13171048.0, + "reward": 0.09638670086860657, + "reward_std": 0.019614677876234055, + "rewards/bleu_reward_func/mean": 0.09638670086860657, + "rewards/bleu_reward_func/std": 0.09023009240627289, + "step": 1008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 329.59375, + "completions/mean_terminated_length": 204.7894744873047, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.8072, + "grad_norm": 7.47179651260376, + "kl": 0.1318359375, + "learning_rate": 1e-06, + "loss": -0.0877, + "num_tokens": 13187195.0, + "reward": 0.036128196865320206, + "reward_std": 0.020984536036849022, + "rewards/bleu_reward_func/mean": 0.036128196865320206, + "rewards/bleu_reward_func/std": 0.038413140922784805, + "step": 1009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 53.5625, + "completions/mean_terminated_length": 53.5625, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.808, + "grad_norm": 10.647790908813477, + "kl": 0.482666015625, + "learning_rate": 1e-06, + "loss": -0.052, + "num_tokens": 13197629.0, + "reward": 0.34467193484306335, + "reward_std": 0.09173881262540817, + "rewards/bleu_reward_func/mean": 0.34467193484306335, + "rewards/bleu_reward_func/std": 0.23519103229045868, + "step": 1010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 191.6875, + "completions/mean_terminated_length": 181.35482788085938, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.8088, + "grad_norm": 6.100401878356934, + "kl": 0.141632080078125, + "learning_rate": 1e-06, + "loss": 0.0525, + "num_tokens": 13210699.0, + "reward": 0.23702389001846313, + "reward_std": 0.03205852955579758, + "rewards/bleu_reward_func/mean": 0.23702389001846313, + "rewards/bleu_reward_func/std": 0.1315021812915802, + "step": 1011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 374.4375, + "completions/mean_terminated_length": 218.53334045410156, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.8096, + "grad_norm": 8.772051811218262, + "kl": 0.103546142578125, + "learning_rate": 1e-06, + "loss": 0.0866, + "num_tokens": 13228945.0, + "reward": 0.13881856203079224, + "reward_std": 0.044034168124198914, + "rewards/bleu_reward_func/mean": 0.13881856203079224, + "rewards/bleu_reward_func/std": 0.0626484826207161, + "step": 1012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 147.96875, + "completions/mean_terminated_length": 95.96428680419922, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.8104, + "grad_norm": 14.525429725646973, + "kl": 0.4837646484375, + "learning_rate": 1e-06, + "loss": 0.0162, + "num_tokens": 13242400.0, + "reward": 0.20639193058013916, + "reward_std": 0.05956702679395676, + "rewards/bleu_reward_func/mean": 0.20639193058013916, + "rewards/bleu_reward_func/std": 0.12604379653930664, + "step": 1013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 149.90625, + "completions/mean_terminated_length": 149.90625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8112, + "grad_norm": 22.3626766204834, + "kl": 0.2891845703125, + "learning_rate": 1e-06, + "loss": 0.0612, + "num_tokens": 13252157.0, + "reward": 0.24893034994602203, + "reward_std": 0.03380701690912247, + "rewards/bleu_reward_func/mean": 0.24893034994602203, + "rewards/bleu_reward_func/std": 0.20013003051280975, + "step": 1014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 266.6875, + "completions/mean_terminated_length": 21.375, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.812, + "grad_norm": 7.276104927062988, + "kl": 0.26300048828125, + "learning_rate": 1e-06, + "loss": -0.0225, + "num_tokens": 13263955.0, + "reward": 0.14417850971221924, + "reward_std": 0.037887826561927795, + "rewards/bleu_reward_func/mean": 0.14417850971221924, + "rewards/bleu_reward_func/std": 0.1605955958366394, + "step": 1015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 226.28125, + "completions/mean_terminated_length": 146.27999877929688, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.8128, + "grad_norm": 5.462210655212402, + "kl": 0.13653564453125, + "learning_rate": 1e-06, + "loss": 0.1195, + "num_tokens": 13273692.0, + "reward": 0.08337672054767609, + "reward_std": 0.02062853053212166, + "rewards/bleu_reward_func/mean": 0.08337672054767609, + "rewards/bleu_reward_func/std": 0.048102062195539474, + "step": 1016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 354.0625, + "completions/mean_terminated_length": 271.3333435058594, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.8136, + "grad_norm": 5.125792980194092, + "kl": 0.22271728515625, + "learning_rate": 1e-06, + "loss": 0.0213, + "num_tokens": 13289478.0, + "reward": 0.19775965809822083, + "reward_std": 0.04682963341474533, + "rewards/bleu_reward_func/mean": 0.19775965809822083, + "rewards/bleu_reward_func/std": 0.22582097351551056, + "step": 1017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 183.65625, + "completions/mean_terminated_length": 107.8846206665039, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.8144, + "grad_norm": 7.784894943237305, + "kl": 0.38018798828125, + "learning_rate": 1e-06, + "loss": -0.2707, + "num_tokens": 13299747.0, + "reward": 0.06794524192810059, + "reward_std": 0.039994340389966965, + "rewards/bleu_reward_func/mean": 0.06794524192810059, + "rewards/bleu_reward_func/std": 0.0657891035079956, + "step": 1018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 232.6875, + "completions/mean_terminated_length": 203.79310607910156, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.8152, + "grad_norm": 6.757852077484131, + "kl": 0.23583984375, + "learning_rate": 1e-06, + "loss": -0.0289, + "num_tokens": 13308865.0, + "reward": 0.1327855885028839, + "reward_std": 0.043607283383607864, + "rewards/bleu_reward_func/mean": 0.1327855885028839, + "rewards/bleu_reward_func/std": 0.1713796854019165, + "step": 1019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 116.03125, + "completions/mean_terminated_length": 75.06896209716797, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.816, + "grad_norm": 6.1786370277404785, + "kl": 0.21380615234375, + "learning_rate": 1e-06, + "loss": 0.1686, + "num_tokens": 13318914.0, + "reward": 0.23704446852207184, + "reward_std": 0.057613980025053024, + "rewards/bleu_reward_func/mean": 0.23704446852207184, + "rewards/bleu_reward_func/std": 0.21550458669662476, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 188.34375, + "completions/mean_terminated_length": 166.7666778564453, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.8168, + "grad_norm": 7.113982200622559, + "kl": 0.258514404296875, + "learning_rate": 1e-06, + "loss": 0.0406, + "num_tokens": 13329205.0, + "reward": 0.08146457374095917, + "reward_std": 0.02083425223827362, + "rewards/bleu_reward_func/mean": 0.08146457374095917, + "rewards/bleu_reward_func/std": 0.0736912190914154, + "step": 1021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 248.1875, + "completions/mean_terminated_length": 144.95652770996094, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.8176, + "grad_norm": 10.716026306152344, + "kl": 0.3465576171875, + "learning_rate": 1e-06, + "loss": -0.0356, + "num_tokens": 13345867.0, + "reward": 0.1497185379266739, + "reward_std": 0.016201931983232498, + "rewards/bleu_reward_func/mean": 0.1497185379266739, + "rewards/bleu_reward_func/std": 0.17363472282886505, + "step": 1022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 306.59375, + "completions/mean_terminated_length": 238.125, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.8184, + "grad_norm": 5.428062915802002, + "kl": 0.152679443359375, + "learning_rate": 1e-06, + "loss": -0.0033, + "num_tokens": 13360854.0, + "reward": 0.1825982928276062, + "reward_std": 0.057225678116083145, + "rewards/bleu_reward_func/mean": 0.1825982928276062, + "rewards/bleu_reward_func/std": 0.1867101639509201, + "step": 1023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 103.6875, + "completions/mean_terminated_length": 103.6875, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.8192, + "grad_norm": 9.348381042480469, + "kl": 0.17828369140625, + "learning_rate": 1e-06, + "loss": 0.0549, + "num_tokens": 13369980.0, + "reward": 0.11240973323583603, + "reward_std": 0.026126563549041748, + "rewards/bleu_reward_func/mean": 0.11240973323583603, + "rewards/bleu_reward_func/std": 0.11270570009946823, + "step": 1024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 225.875, + "completions/mean_terminated_length": 159.84616088867188, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.82, + "grad_norm": 7.571503162384033, + "kl": 0.237548828125, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 13384584.0, + "reward": 0.2324497401714325, + "reward_std": 0.0470973402261734, + "rewards/bleu_reward_func/mean": 0.2324497401714325, + "rewards/bleu_reward_func/std": 0.1243894025683403, + "step": 1025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 139.34375, + "completions/mean_terminated_length": 114.50000762939453, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.8208, + "grad_norm": 8.227386474609375, + "kl": 0.25714111328125, + "learning_rate": 1e-06, + "loss": 0.1704, + "num_tokens": 13391771.0, + "reward": 0.12480157613754272, + "reward_std": 0.04330623894929886, + "rewards/bleu_reward_func/mean": 0.12480157613754272, + "rewards/bleu_reward_func/std": 0.103439562022686, + "step": 1026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 214.3125, + "completions/mean_terminated_length": 130.95999145507812, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.8216, + "grad_norm": 5.542022228240967, + "kl": 0.24603271484375, + "learning_rate": 1e-06, + "loss": -0.1391, + "num_tokens": 13403757.0, + "reward": 0.25766974687576294, + "reward_std": 0.03755660355091095, + "rewards/bleu_reward_func/mean": 0.25766974687576294, + "rewards/bleu_reward_func/std": 0.22421182692050934, + "step": 1027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 459.53125, + "completions/mean_terminated_length": 413.23529052734375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.8224, + "grad_norm": 2.1804089546203613, + "kl": 0.04327392578125, + "learning_rate": 1e-06, + "loss": -0.1423, + "num_tokens": 13421078.0, + "reward": 0.07269357144832611, + "reward_std": 0.02826325222849846, + "rewards/bleu_reward_func/mean": 0.07269357144832611, + "rewards/bleu_reward_func/std": 0.034365471452474594, + "step": 1028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 233.90625, + "completions/mean_terminated_length": 194.17857360839844, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.8232, + "grad_norm": 5.306719779968262, + "kl": 0.1806640625, + "learning_rate": 1e-06, + "loss": -0.4029, + "num_tokens": 13432323.0, + "reward": 0.18320006132125854, + "reward_std": 0.08323986828327179, + "rewards/bleu_reward_func/mean": 0.18320006132125854, + "rewards/bleu_reward_func/std": 0.2284490317106247, + "step": 1029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 258.4375, + "completions/mean_terminated_length": 187.44000244140625, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.824, + "grad_norm": 3.841357707977295, + "kl": 0.070404052734375, + "learning_rate": 1e-06, + "loss": 0.1459, + "num_tokens": 13444169.0, + "reward": 0.09387044608592987, + "reward_std": 0.07637906074523926, + "rewards/bleu_reward_func/mean": 0.09387044608592987, + "rewards/bleu_reward_func/std": 0.10294011980295181, + "step": 1030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 121.59375, + "completions/mean_terminated_length": 109.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.8248, + "grad_norm": 10.732538223266602, + "kl": 0.598388671875, + "learning_rate": 1e-06, + "loss": -0.0236, + "num_tokens": 13450364.0, + "reward": 0.18597961962223053, + "reward_std": 0.03610639274120331, + "rewards/bleu_reward_func/mean": 0.18597961962223053, + "rewards/bleu_reward_func/std": 0.14241203665733337, + "step": 1031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 174.15625, + "completions/mean_terminated_length": 151.6333465576172, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.8256, + "grad_norm": 5.849789619445801, + "kl": 0.2237548828125, + "learning_rate": 1e-06, + "loss": -0.1461, + "num_tokens": 13458489.0, + "reward": 0.10662397742271423, + "reward_std": 0.044935449957847595, + "rewards/bleu_reward_func/mean": 0.10662397742271423, + "rewards/bleu_reward_func/std": 0.08882930874824524, + "step": 1032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 365.75, + "completions/mean_terminated_length": 278.0, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.8264, + "grad_norm": 2.3378233909606934, + "kl": 0.040679931640625, + "learning_rate": 1e-06, + "loss": 0.018, + "num_tokens": 13477497.0, + "reward": 0.12139745056629181, + "reward_std": 0.030839571729302406, + "rewards/bleu_reward_func/mean": 0.12139745056629181, + "rewards/bleu_reward_func/std": 0.087521493434906, + "step": 1033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 307.8125, + "completions/mean_terminated_length": 260.69232177734375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.8272, + "grad_norm": 5.579449653625488, + "kl": 0.14544677734375, + "learning_rate": 1e-06, + "loss": -0.0832, + "num_tokens": 13491595.0, + "reward": 0.06439976394176483, + "reward_std": 0.01632755994796753, + "rewards/bleu_reward_func/mean": 0.06439976394176483, + "rewards/bleu_reward_func/std": 0.025089839473366737, + "step": 1034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 227.90625, + "completions/mean_terminated_length": 162.34616088867188, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.828, + "grad_norm": 6.95845890045166, + "kl": 0.08489990234375, + "learning_rate": 1e-06, + "loss": 0.1636, + "num_tokens": 13502304.0, + "reward": 0.15672987699508667, + "reward_std": 0.07095484435558319, + "rewards/bleu_reward_func/mean": 0.15672987699508667, + "rewards/bleu_reward_func/std": 0.1326054334640503, + "step": 1035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 162.59375, + "completions/mean_terminated_length": 139.3000030517578, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.8288, + "grad_norm": 7.602659702301025, + "kl": 0.2142333984375, + "learning_rate": 1e-06, + "loss": -0.0952, + "num_tokens": 13512379.0, + "reward": 0.10166750848293304, + "reward_std": 0.022390395402908325, + "rewards/bleu_reward_func/mean": 0.10166750848293304, + "rewards/bleu_reward_func/std": 0.09791414439678192, + "step": 1036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 352.84375, + "completions/mean_terminated_length": 257.3500061035156, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.8296, + "grad_norm": 3.4908225536346436, + "kl": 0.0816650390625, + "learning_rate": 1e-06, + "loss": 0.1513, + "num_tokens": 13526246.0, + "reward": 0.10761404037475586, + "reward_std": 0.02660614624619484, + "rewards/bleu_reward_func/mean": 0.10761404037475586, + "rewards/bleu_reward_func/std": 0.08269859850406647, + "step": 1037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 154.03125, + "completions/mean_terminated_length": 87.74073791503906, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.8304, + "grad_norm": 8.316597938537598, + "kl": 0.3048095703125, + "learning_rate": 1e-06, + "loss": 0.108, + "num_tokens": 13538935.0, + "reward": 0.14819365739822388, + "reward_std": 0.07058853656053543, + "rewards/bleu_reward_func/mean": 0.14819365739822388, + "rewards/bleu_reward_func/std": 0.1550559103488922, + "step": 1038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 237.75, + "completions/mean_terminated_length": 209.37930297851562, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.8312, + "grad_norm": 4.724348545074463, + "kl": 0.1529541015625, + "learning_rate": 1e-06, + "loss": 0.0635, + "num_tokens": 13551415.0, + "reward": 0.16776956617832184, + "reward_std": 0.026334762573242188, + "rewards/bleu_reward_func/mean": 0.16776956617832184, + "rewards/bleu_reward_func/std": 0.18577900528907776, + "step": 1039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 93.375, + "completions/mean_terminated_length": 93.375, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.832, + "grad_norm": 8.702837944030762, + "kl": 0.33355712890625, + "learning_rate": 1e-06, + "loss": 0.0333, + "num_tokens": 13560747.0, + "reward": 0.1746351718902588, + "reward_std": 0.039413660764694214, + "rewards/bleu_reward_func/mean": 0.1746351718902588, + "rewards/bleu_reward_func/std": 0.13439369201660156, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 177.90625, + "completions/mean_terminated_length": 66.54167175292969, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.8328, + "grad_norm": 14.271418571472168, + "kl": 0.298431396484375, + "learning_rate": 1e-06, + "loss": -0.356, + "num_tokens": 13572328.0, + "reward": 0.0881040021777153, + "reward_std": 0.0392255075275898, + "rewards/bleu_reward_func/mean": 0.0881040021777153, + "rewards/bleu_reward_func/std": 0.086721271276474, + "step": 1041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 243.34375, + "completions/mean_terminated_length": 138.21739196777344, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.8336, + "grad_norm": 3.9084980487823486, + "kl": 0.07525634765625, + "learning_rate": 1e-06, + "loss": 0.0813, + "num_tokens": 13586859.0, + "reward": 0.06463417410850525, + "reward_std": 0.022750139236450195, + "rewards/bleu_reward_func/mean": 0.06463417410850525, + "rewards/bleu_reward_func/std": 0.05624645948410034, + "step": 1042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 328.96875, + "completions/mean_terminated_length": 267.9583435058594, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.8344, + "grad_norm": 9.608210563659668, + "kl": 0.2337646484375, + "learning_rate": 1e-06, + "loss": 0.0092, + "num_tokens": 13600978.0, + "reward": 0.06980200856924057, + "reward_std": 0.015845034271478653, + "rewards/bleu_reward_func/mean": 0.06980200856924057, + "rewards/bleu_reward_func/std": 0.03303433954715729, + "step": 1043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 305.8125, + "completions/mean_terminated_length": 145.44444274902344, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.8352, + "grad_norm": 4.128615379333496, + "kl": 0.06011962890625, + "learning_rate": 1e-06, + "loss": 0.1566, + "num_tokens": 13613068.0, + "reward": 0.04747869074344635, + "reward_std": 0.013655820861458778, + "rewards/bleu_reward_func/mean": 0.04747869074344635, + "rewards/bleu_reward_func/std": 0.028707411140203476, + "step": 1044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 426.53125, + "completions/mean_terminated_length": 368.0526428222656, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.836, + "grad_norm": 2.515371799468994, + "kl": 0.043975830078125, + "learning_rate": 1e-06, + "loss": -0.065, + "num_tokens": 13629173.0, + "reward": 0.028738608583807945, + "reward_std": 0.012511001899838448, + "rewards/bleu_reward_func/mean": 0.028738608583807945, + "rewards/bleu_reward_func/std": 0.014564147219061852, + "step": 1045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 206.03125, + "completions/mean_terminated_length": 135.42308044433594, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.8368, + "grad_norm": 8.021496772766113, + "kl": 0.273529052734375, + "learning_rate": 1e-06, + "loss": 0.1291, + "num_tokens": 13637374.0, + "reward": 0.11877701431512833, + "reward_std": 0.04857534170150757, + "rewards/bleu_reward_func/mean": 0.11877701431512833, + "rewards/bleu_reward_func/std": 0.08409105986356735, + "step": 1046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 251.65625, + "completions/mean_terminated_length": 214.46429443359375, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.8376, + "grad_norm": 3.2490315437316895, + "kl": 0.06829833984375, + "learning_rate": 1e-06, + "loss": -0.1358, + "num_tokens": 13648067.0, + "reward": 0.08158313482999802, + "reward_std": 0.02561478689312935, + "rewards/bleu_reward_func/mean": 0.08158313482999802, + "rewards/bleu_reward_func/std": 0.05671805888414383, + "step": 1047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 269.125, + "completions/mean_terminated_length": 201.1199951171875, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.8384, + "grad_norm": 5.032691955566406, + "kl": 0.1785888671875, + "learning_rate": 1e-06, + "loss": 0.0164, + "num_tokens": 13660719.0, + "reward": 0.18114086985588074, + "reward_std": 0.03815930336713791, + "rewards/bleu_reward_func/mean": 0.18114086985588074, + "rewards/bleu_reward_func/std": 0.14998804032802582, + "step": 1048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 87.25, + "completions/mean_terminated_length": 73.54838562011719, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.8392, + "grad_norm": 8.731785774230957, + "kl": 0.384765625, + "learning_rate": 1e-06, + "loss": 0.0594, + "num_tokens": 13670767.0, + "reward": 0.23021195828914642, + "reward_std": 0.09217022359371185, + "rewards/bleu_reward_func/mean": 0.23021195828914642, + "rewards/bleu_reward_func/std": 0.18223723769187927, + "step": 1049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 269.25, + "completions/mean_terminated_length": 80.44444274902344, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.84, + "grad_norm": 8.531782150268555, + "kl": 0.29345703125, + "learning_rate": 1e-06, + "loss": 0.2277, + "num_tokens": 13685927.0, + "reward": 0.13349372148513794, + "reward_std": 0.053998030722141266, + "rewards/bleu_reward_func/mean": 0.13349372148513794, + "rewards/bleu_reward_func/std": 0.15102460980415344, + "step": 1050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 375.4375, + "completions/mean_terminated_length": 238.875, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.8408, + "grad_norm": 7.057468414306641, + "kl": 0.099639892578125, + "learning_rate": 1e-06, + "loss": 0.1437, + "num_tokens": 13702781.0, + "reward": 0.02950356900691986, + "reward_std": 0.010849589481949806, + "rewards/bleu_reward_func/mean": 0.02950356900691986, + "rewards/bleu_reward_func/std": 0.02092377282679081, + "step": 1051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 237.84375, + "completions/mean_terminated_length": 187.07408142089844, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.8416, + "grad_norm": 8.259405136108398, + "kl": 0.57330322265625, + "learning_rate": 1e-06, + "loss": 0.1046, + "num_tokens": 13713560.0, + "reward": 0.1828644871711731, + "reward_std": 0.04976918175816536, + "rewards/bleu_reward_func/mean": 0.1828644871711731, + "rewards/bleu_reward_func/std": 0.13261918723583221, + "step": 1052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 154.09375, + "completions/mean_terminated_length": 87.81481170654297, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.8424, + "grad_norm": 9.288348197937012, + "kl": 0.424560546875, + "learning_rate": 1e-06, + "loss": -0.0227, + "num_tokens": 13720387.0, + "reward": 0.1263371855020523, + "reward_std": 0.031269170343875885, + "rewards/bleu_reward_func/mean": 0.1263371855020523, + "rewards/bleu_reward_func/std": 0.1025131419301033, + "step": 1053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 94.0, + "completions/mean_length": 160.15625, + "completions/mean_terminated_length": 42.875, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.8432, + "grad_norm": 7.418272972106934, + "kl": 0.32086181640625, + "learning_rate": 1e-06, + "loss": -0.0571, + "num_tokens": 13731528.0, + "reward": 0.2602638304233551, + "reward_std": 0.07646072655916214, + "rewards/bleu_reward_func/mean": 0.2602638304233551, + "rewards/bleu_reward_func/std": 0.2308470755815506, + "step": 1054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 183.46875, + "completions/mean_terminated_length": 122.62963104248047, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.844, + "grad_norm": 8.014601707458496, + "kl": 0.3927001953125, + "learning_rate": 1e-06, + "loss": 0.1042, + "num_tokens": 13742447.0, + "reward": 0.07197493314743042, + "reward_std": 0.012622429989278316, + "rewards/bleu_reward_func/mean": 0.07197493314743042, + "rewards/bleu_reward_func/std": 0.04327724501490593, + "step": 1055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 240.59375, + "completions/mean_terminated_length": 222.50001525878906, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.8448, + "grad_norm": 5.102974891662598, + "kl": 0.1640625, + "learning_rate": 1e-06, + "loss": -0.0185, + "num_tokens": 13755202.0, + "reward": 0.08292236924171448, + "reward_std": 0.023967744782567024, + "rewards/bleu_reward_func/mean": 0.08292236924171448, + "rewards/bleu_reward_func/std": 0.046691060066223145, + "step": 1056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 133.0625, + "completions/mean_terminated_length": 45.615386962890625, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.8456, + "grad_norm": 12.413713455200195, + "kl": 0.4334716796875, + "learning_rate": 1e-06, + "loss": 0.1467, + "num_tokens": 13763268.0, + "reward": 0.09796961396932602, + "reward_std": 0.02291642501950264, + "rewards/bleu_reward_func/mean": 0.09796961396932602, + "rewards/bleu_reward_func/std": 0.04426925256848335, + "step": 1057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 155.71875, + "completions/mean_terminated_length": 89.74073791503906, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.8464, + "grad_norm": 8.978612899780273, + "kl": 0.228515625, + "learning_rate": 1e-06, + "loss": -0.0356, + "num_tokens": 13771987.0, + "reward": 0.10166356712579727, + "reward_std": 0.055922288447618484, + "rewards/bleu_reward_func/mean": 0.10166356712579727, + "rewards/bleu_reward_func/std": 0.07498722523450851, + "step": 1058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 408.84375, + "completions/mean_terminated_length": 276.21429443359375, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.8472, + "grad_norm": 1.850431203842163, + "kl": 0.046905517578125, + "learning_rate": 1e-06, + "loss": 0.1476, + "num_tokens": 13791446.0, + "reward": 0.07564342021942139, + "reward_std": 0.015303988009691238, + "rewards/bleu_reward_func/mean": 0.07564342021942139, + "rewards/bleu_reward_func/std": 0.09736621379852295, + "step": 1059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 285.71875, + "completions/mean_terminated_length": 197.17391967773438, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.848, + "grad_norm": 4.632063388824463, + "kl": 0.21221923828125, + "learning_rate": 1e-06, + "loss": 0.0453, + "num_tokens": 13803221.0, + "reward": 0.12073882669210434, + "reward_std": 0.03981417790055275, + "rewards/bleu_reward_func/mean": 0.12073882669210434, + "rewards/bleu_reward_func/std": 0.07346338778734207, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 329.1875, + "completions/mean_terminated_length": 246.09091186523438, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.8488, + "grad_norm": 4.053706645965576, + "kl": 0.078857421875, + "learning_rate": 1e-06, + "loss": 0.2911, + "num_tokens": 13818331.0, + "reward": 0.06476722657680511, + "reward_std": 0.030476348474621773, + "rewards/bleu_reward_func/mean": 0.06476722657680511, + "rewards/bleu_reward_func/std": 0.06862985342741013, + "step": 1061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 156.5625, + "completions/mean_terminated_length": 57.03999710083008, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.8496, + "grad_norm": 8.067891120910645, + "kl": 0.1793212890625, + "learning_rate": 1e-06, + "loss": 0.195, + "num_tokens": 13826957.0, + "reward": 0.20933812856674194, + "reward_std": 0.051397278904914856, + "rewards/bleu_reward_func/mean": 0.20933812856674194, + "rewards/bleu_reward_func/std": 0.2901296019554138, + "step": 1062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 227.09375, + "completions/mean_terminated_length": 115.60869598388672, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.8504, + "grad_norm": 7.58860445022583, + "kl": 0.165283203125, + "learning_rate": 1e-06, + "loss": -0.0274, + "num_tokens": 13838440.0, + "reward": 0.11480045318603516, + "reward_std": 0.027013186365365982, + "rewards/bleu_reward_func/mean": 0.11480045318603516, + "rewards/bleu_reward_func/std": 0.11019645631313324, + "step": 1063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 189.1875, + "completions/mean_terminated_length": 114.69231414794922, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.8512, + "grad_norm": 5.411109447479248, + "kl": 0.1951904296875, + "learning_rate": 1e-06, + "loss": 0.0807, + "num_tokens": 13849406.0, + "reward": 0.1512412428855896, + "reward_std": 0.062053047120571136, + "rewards/bleu_reward_func/mean": 0.1512412428855896, + "rewards/bleu_reward_func/std": 0.12469635158777237, + "step": 1064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 259.5625, + "completions/mean_terminated_length": 127.33333587646484, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.852, + "grad_norm": 6.397778034210205, + "kl": 0.24896240234375, + "learning_rate": 1e-06, + "loss": -0.0087, + "num_tokens": 13864056.0, + "reward": 0.1552116870880127, + "reward_std": 0.027821514755487442, + "rewards/bleu_reward_func/mean": 0.1552116870880127, + "rewards/bleu_reward_func/std": 0.10892557352781296, + "step": 1065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 139.8125, + "completions/mean_terminated_length": 139.8125, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.8528, + "grad_norm": 8.31270694732666, + "kl": 0.20196533203125, + "learning_rate": 1e-06, + "loss": 0.0519, + "num_tokens": 13871490.0, + "reward": 0.13230201601982117, + "reward_std": 0.04370046779513359, + "rewards/bleu_reward_func/mean": 0.13230201601982117, + "rewards/bleu_reward_func/std": 0.08659063279628754, + "step": 1066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 169.0625, + "completions/mean_terminated_length": 73.04000091552734, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.8536, + "grad_norm": 6.236849784851074, + "kl": 0.39727783203125, + "learning_rate": 1e-06, + "loss": 0.1366, + "num_tokens": 13885556.0, + "reward": 0.17620697617530823, + "reward_std": 0.024748487398028374, + "rewards/bleu_reward_func/mean": 0.17620697617530823, + "rewards/bleu_reward_func/std": 0.10503184050321579, + "step": 1067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 228.75, + "completions/mean_terminated_length": 176.29629516601562, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.8544, + "grad_norm": 6.749774932861328, + "kl": 0.181121826171875, + "learning_rate": 1e-06, + "loss": -0.1016, + "num_tokens": 13894804.0, + "reward": 0.17489100992679596, + "reward_std": 0.042406514286994934, + "rewards/bleu_reward_func/mean": 0.17489100992679596, + "rewards/bleu_reward_func/std": 0.14329132437705994, + "step": 1068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 246.6875, + "completions/mean_terminated_length": 126.09091186523438, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.8552, + "grad_norm": 6.8404459953308105, + "kl": 0.31402587890625, + "learning_rate": 1e-06, + "loss": 0.0134, + "num_tokens": 13907514.0, + "reward": 0.16271845996379852, + "reward_std": 0.04602063074707985, + "rewards/bleu_reward_func/mean": 0.16271845996379852, + "rewards/bleu_reward_func/std": 0.12579885125160217, + "step": 1069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 269.78125, + "completions/mean_terminated_length": 124.45000457763672, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.856, + "grad_norm": 4.438868522644043, + "kl": 0.177520751953125, + "learning_rate": 1e-06, + "loss": 0.0265, + "num_tokens": 13919603.0, + "reward": 0.13932910561561584, + "reward_std": 0.01856398582458496, + "rewards/bleu_reward_func/mean": 0.13932910561561584, + "rewards/bleu_reward_func/std": 0.14700213074684143, + "step": 1070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 149.9375, + "completions/mean_terminated_length": 82.8888931274414, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.8568, + "grad_norm": 7.284147262573242, + "kl": 0.3516845703125, + "learning_rate": 1e-06, + "loss": 0.0398, + "num_tokens": 13927953.0, + "reward": 0.2614789605140686, + "reward_std": 0.07057315111160278, + "rewards/bleu_reward_func/mean": 0.2614789605140686, + "rewards/bleu_reward_func/std": 0.1882997453212738, + "step": 1071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 243.78125, + "completions/mean_terminated_length": 168.67999267578125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.8576, + "grad_norm": 5.096418380737305, + "kl": 0.16357421875, + "learning_rate": 1e-06, + "loss": -0.0371, + "num_tokens": 13942570.0, + "reward": 0.07684318721294403, + "reward_std": 0.019258558750152588, + "rewards/bleu_reward_func/mean": 0.07684318721294403, + "rewards/bleu_reward_func/std": 0.03753623366355896, + "step": 1072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 107.15789794921875, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.8584, + "grad_norm": 3.9537734985351562, + "kl": 0.081634521484375, + "learning_rate": 1e-06, + "loss": -0.0181, + "num_tokens": 13956350.0, + "reward": 0.1140797883272171, + "reward_std": 0.023730140179395676, + "rewards/bleu_reward_func/mean": 0.1140797883272171, + "rewards/bleu_reward_func/std": 0.1426122635602951, + "step": 1073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 323.4375, + "completions/mean_terminated_length": 237.72727966308594, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.8592, + "grad_norm": 4.239878177642822, + "kl": 0.10797119140625, + "learning_rate": 1e-06, + "loss": 0.0029, + "num_tokens": 13971620.0, + "reward": 0.10953356325626373, + "reward_std": 0.07727043330669403, + "rewards/bleu_reward_func/mean": 0.10953356325626373, + "rewards/bleu_reward_func/std": 0.1143500879406929, + "step": 1074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 447.5625, + "completions/mean_terminated_length": 383.125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.86, + "grad_norm": 2.2002503871917725, + "kl": 0.049072265625, + "learning_rate": 1e-06, + "loss": -0.0472, + "num_tokens": 13990150.0, + "reward": 0.08343654125928879, + "reward_std": 0.02118324115872383, + "rewards/bleu_reward_func/mean": 0.08343654125928879, + "rewards/bleu_reward_func/std": 0.08093992620706558, + "step": 1075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 472.125, + "completions/mean_terminated_length": 420.8571472167969, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.8608, + "grad_norm": 2.0650570392608643, + "kl": 0.032440185546875, + "learning_rate": 1e-06, + "loss": -0.0088, + "num_tokens": 14011170.0, + "reward": 0.04162130132317543, + "reward_std": 0.010478474199771881, + "rewards/bleu_reward_func/mean": 0.04162130132317543, + "rewards/bleu_reward_func/std": 0.017952080816030502, + "step": 1076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 242.78125, + "completions/mean_terminated_length": 58.578948974609375, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.8616, + "grad_norm": 4.682583808898926, + "kl": 0.316986083984375, + "learning_rate": 1e-06, + "loss": -0.0201, + "num_tokens": 14025299.0, + "reward": 0.2864699065685272, + "reward_std": 0.03820549696683884, + "rewards/bleu_reward_func/mean": 0.2864699065685272, + "rewards/bleu_reward_func/std": 0.26346415281295776, + "step": 1077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 122.0, + "completions/mean_length": 165.625, + "completions/mean_terminated_length": 50.16666793823242, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.8624, + "grad_norm": 7.724006652832031, + "kl": 0.23388671875, + "learning_rate": 1e-06, + "loss": -0.0182, + "num_tokens": 14037399.0, + "reward": 0.1795015037059784, + "reward_std": 0.07820923626422882, + "rewards/bleu_reward_func/mean": 0.1795015037059784, + "rewards/bleu_reward_func/std": 0.15337687730789185, + "step": 1078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 276.09375, + "completions/mean_terminated_length": 40.1875, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.8632, + "grad_norm": 8.850776672363281, + "kl": 0.24285888671875, + "learning_rate": 1e-06, + "loss": -0.1453, + "num_tokens": 14052562.0, + "reward": 0.09584256261587143, + "reward_std": 0.01827467978000641, + "rewards/bleu_reward_func/mean": 0.09584256261587143, + "rewards/bleu_reward_func/std": 0.10066576302051544, + "step": 1079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 203.1875, + "completions/mean_terminated_length": 82.34782409667969, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.864, + "grad_norm": 4.735725402832031, + "kl": 0.0994873046875, + "learning_rate": 1e-06, + "loss": 0.2218, + "num_tokens": 14061568.0, + "reward": 0.27030178904533386, + "reward_std": 0.057654060423374176, + "rewards/bleu_reward_func/mean": 0.27030178904533386, + "rewards/bleu_reward_func/std": 0.16439270973205566, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 237.875, + "completions/mean_terminated_length": 174.61538696289062, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.8648, + "grad_norm": 4.542383193969727, + "kl": 0.0601806640625, + "learning_rate": 1e-06, + "loss": -0.1576, + "num_tokens": 14071236.0, + "reward": 0.05943232774734497, + "reward_std": 0.03797609731554985, + "rewards/bleu_reward_func/mean": 0.05943232774734497, + "rewards/bleu_reward_func/std": 0.07494883239269257, + "step": 1081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 214.625, + "completions/mean_terminated_length": 146.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.8656, + "grad_norm": 9.019255638122559, + "kl": 0.340179443359375, + "learning_rate": 1e-06, + "loss": 0.0456, + "num_tokens": 14080680.0, + "reward": 0.09385368227958679, + "reward_std": 0.041810497641563416, + "rewards/bleu_reward_func/mean": 0.09385368227958679, + "rewards/bleu_reward_func/std": 0.04523298144340515, + "step": 1082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 111.3125, + "completions/mean_terminated_length": 54.07143020629883, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8664, + "grad_norm": 5.487109184265137, + "kl": 0.3203125, + "learning_rate": 1e-06, + "loss": 0.195, + "num_tokens": 14093178.0, + "reward": 0.2254057228565216, + "reward_std": 0.0354473814368248, + "rewards/bleu_reward_func/mean": 0.2254057228565216, + "rewards/bleu_reward_func/std": 0.15529486536979675, + "step": 1083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 473.03125, + "completions/mean_terminated_length": 408.0833435058594, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.8672, + "grad_norm": 2.1760456562042236, + "kl": 0.05029296875, + "learning_rate": 1e-06, + "loss": -0.1127, + "num_tokens": 14112195.0, + "reward": 0.11573594808578491, + "reward_std": 0.034562353044748306, + "rewards/bleu_reward_func/mean": 0.11573594808578491, + "rewards/bleu_reward_func/std": 0.0883888527750969, + "step": 1084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 248.8125, + "completions/mean_terminated_length": 110.95238494873047, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.868, + "grad_norm": 20.237722396850586, + "kl": 0.27996826171875, + "learning_rate": 1e-06, + "loss": -0.0279, + "num_tokens": 14125485.0, + "reward": 0.06478704512119293, + "reward_std": 0.01746372878551483, + "rewards/bleu_reward_func/mean": 0.06478704512119293, + "rewards/bleu_reward_func/std": 0.04760226234793663, + "step": 1085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 254.21875, + "completions/mean_terminated_length": 194.73077392578125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.8688, + "grad_norm": 4.712401390075684, + "kl": 0.119384765625, + "learning_rate": 1e-06, + "loss": 0.2418, + "num_tokens": 14136628.0, + "reward": 0.07501716911792755, + "reward_std": 0.022581705823540688, + "rewards/bleu_reward_func/mean": 0.07501716911792755, + "rewards/bleu_reward_func/std": 0.045875921845436096, + "step": 1086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 214.4375, + "completions/mean_terminated_length": 145.7692413330078, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.8696, + "grad_norm": 7.036587715148926, + "kl": 0.40423583984375, + "learning_rate": 1e-06, + "loss": -0.0659, + "num_tokens": 14152290.0, + "reward": 0.24361515045166016, + "reward_std": 0.05023983493447304, + "rewards/bleu_reward_func/mean": 0.24361515045166016, + "rewards/bleu_reward_func/std": 0.2318515181541443, + "step": 1087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 310.15625, + "completions/mean_terminated_length": 281.3214416503906, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.8704, + "grad_norm": 2.8048856258392334, + "kl": 0.06768798828125, + "learning_rate": 1e-06, + "loss": -0.1012, + "num_tokens": 14164015.0, + "reward": 0.06632909178733826, + "reward_std": 0.022660713642835617, + "rewards/bleu_reward_func/mean": 0.06632909178733826, + "rewards/bleu_reward_func/std": 0.05323861911892891, + "step": 1088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 72.03125, + "completions/mean_terminated_length": 57.838706970214844, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.8712, + "grad_norm": 9.539746284484863, + "kl": 0.28515625, + "learning_rate": 1e-06, + "loss": 0.2369, + "num_tokens": 14175360.0, + "reward": 0.2565135359764099, + "reward_std": 0.06622748076915741, + "rewards/bleu_reward_func/mean": 0.2565135359764099, + "rewards/bleu_reward_func/std": 0.2024916261434555, + "step": 1089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 253.8125, + "completions/mean_terminated_length": 136.4545440673828, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.872, + "grad_norm": 8.92574691772461, + "kl": 0.2467041015625, + "learning_rate": 1e-06, + "loss": 0.2116, + "num_tokens": 14187954.0, + "reward": 0.10993756353855133, + "reward_std": 0.041833557188510895, + "rewards/bleu_reward_func/mean": 0.10993756353855133, + "rewards/bleu_reward_func/std": 0.13020949065685272, + "step": 1090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 193.625, + "completions/mean_terminated_length": 104.47999572753906, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.8728, + "grad_norm": 5.538515090942383, + "kl": 0.0902099609375, + "learning_rate": 1e-06, + "loss": -0.1573, + "num_tokens": 14196270.0, + "reward": 0.05786508321762085, + "reward_std": 0.02371850796043873, + "rewards/bleu_reward_func/mean": 0.05786508321762085, + "rewards/bleu_reward_func/std": 0.03462414816021919, + "step": 1091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 223.09375, + "completions/mean_terminated_length": 156.42308044433594, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.8736, + "grad_norm": 5.6724443435668945, + "kl": 0.179168701171875, + "learning_rate": 1e-06, + "loss": 0.0128, + "num_tokens": 14208633.0, + "reward": 0.107764333486557, + "reward_std": 0.021315133199095726, + "rewards/bleu_reward_func/mean": 0.107764333486557, + "rewards/bleu_reward_func/std": 0.038448914885520935, + "step": 1092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 166.78125, + "completions/mean_terminated_length": 143.7666778564453, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.8744, + "grad_norm": 5.278230667114258, + "kl": 0.2412109375, + "learning_rate": 1e-06, + "loss": -0.0013, + "num_tokens": 14218242.0, + "reward": 0.4157559275627136, + "reward_std": 0.037054967135190964, + "rewards/bleu_reward_func/mean": 0.4157559275627136, + "rewards/bleu_reward_func/std": 0.2559570372104645, + "step": 1093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 313.09375, + "completions/mean_terminated_length": 114.1875, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8752, + "grad_norm": 3.892812490463257, + "kl": 0.13092041015625, + "learning_rate": 1e-06, + "loss": 0.1517, + "num_tokens": 14232805.0, + "reward": 0.12693487107753754, + "reward_std": 0.04035983234643936, + "rewards/bleu_reward_func/mean": 0.12693487107753754, + "rewards/bleu_reward_func/std": 0.12727496027946472, + "step": 1094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 205.1875, + "completions/mean_terminated_length": 161.35714721679688, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.876, + "grad_norm": 8.024927139282227, + "kl": 0.350433349609375, + "learning_rate": 1e-06, + "loss": 0.2352, + "num_tokens": 14244755.0, + "reward": 0.20502734184265137, + "reward_std": 0.09303287416696548, + "rewards/bleu_reward_func/mean": 0.20502734184265137, + "rewards/bleu_reward_func/std": 0.241799458861351, + "step": 1095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 361.1875, + "completions/mean_terminated_length": 282.19049072265625, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.8768, + "grad_norm": 2.944213390350342, + "kl": 0.068603515625, + "learning_rate": 1e-06, + "loss": -0.106, + "num_tokens": 14257801.0, + "reward": 0.04917216673493385, + "reward_std": 0.011258791200816631, + "rewards/bleu_reward_func/mean": 0.04917216673493385, + "rewards/bleu_reward_func/std": 0.038949303328990936, + "step": 1096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 234.21875, + "completions/mean_terminated_length": 67.55000305175781, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.8776, + "grad_norm": 6.089639663696289, + "kl": 0.22784423828125, + "learning_rate": 1e-06, + "loss": 0.1116, + "num_tokens": 14267648.0, + "reward": 0.10858422517776489, + "reward_std": 0.04780227690935135, + "rewards/bleu_reward_func/mean": 0.10858422517776489, + "rewards/bleu_reward_func/std": 0.07767506688833237, + "step": 1097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 383.21875, + "completions/mean_terminated_length": 217.6428680419922, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.8784, + "grad_norm": 2.4417130947113037, + "kl": 0.039581298828125, + "learning_rate": 1e-06, + "loss": 0.101, + "num_tokens": 14284503.0, + "reward": 0.1223280131816864, + "reward_std": 0.058498185127973557, + "rewards/bleu_reward_func/mean": 0.1223280131816864, + "rewards/bleu_reward_func/std": 0.08976288139820099, + "step": 1098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 200.5, + "completions/mean_terminated_length": 78.60869598388672, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.8792, + "grad_norm": 6.311417102813721, + "kl": 0.34259033203125, + "learning_rate": 1e-06, + "loss": 0.0608, + "num_tokens": 14294455.0, + "reward": 0.12745052576065063, + "reward_std": 0.048099152743816376, + "rewards/bleu_reward_func/mean": 0.12745052576065063, + "rewards/bleu_reward_func/std": 0.12118762731552124, + "step": 1099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 294.375, + "completions/mean_terminated_length": 244.1538543701172, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.88, + "grad_norm": 5.328085422515869, + "kl": 0.184539794921875, + "learning_rate": 1e-06, + "loss": -0.0179, + "num_tokens": 14311835.0, + "reward": 0.2192595899105072, + "reward_std": 0.042960211634635925, + "rewards/bleu_reward_func/mean": 0.2192595899105072, + "rewards/bleu_reward_func/std": 0.13524703681468964, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 324.625, + "completions/mean_terminated_length": 262.16668701171875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.8808, + "grad_norm": 2.6405608654022217, + "kl": 0.0440673828125, + "learning_rate": 1e-06, + "loss": 0.1205, + "num_tokens": 14327975.0, + "reward": 0.14465495944023132, + "reward_std": 0.04526882618665695, + "rewards/bleu_reward_func/mean": 0.14465495944023132, + "rewards/bleu_reward_func/std": 0.1039966493844986, + "step": 1101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 336.125, + "completions/mean_terminated_length": 180.94117736816406, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.8816, + "grad_norm": 3.4212841987609863, + "kl": 0.0875244140625, + "learning_rate": 1e-06, + "loss": 0.1697, + "num_tokens": 14343595.0, + "reward": 0.07303881645202637, + "reward_std": 0.023782189935445786, + "rewards/bleu_reward_func/mean": 0.07303881645202637, + "rewards/bleu_reward_func/std": 0.10003393888473511, + "step": 1102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 226.0, + "completions/mean_terminated_length": 145.9199981689453, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.8824, + "grad_norm": 7.256896018981934, + "kl": 0.2523193359375, + "learning_rate": 1e-06, + "loss": 0.0389, + "num_tokens": 14357595.0, + "reward": 0.14129553735256195, + "reward_std": 0.05766978859901428, + "rewards/bleu_reward_func/mean": 0.14129553735256195, + "rewards/bleu_reward_func/std": 0.13982893526554108, + "step": 1103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 375.65625, + "completions/mean_terminated_length": 313.68182373046875, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.8832, + "grad_norm": 2.429725170135498, + "kl": 0.0618896484375, + "learning_rate": 1e-06, + "loss": 0.0492, + "num_tokens": 14372368.0, + "reward": 0.0918026864528656, + "reward_std": 0.019557196646928787, + "rewards/bleu_reward_func/mean": 0.0918026864528656, + "rewards/bleu_reward_func/std": 0.0768144428730011, + "step": 1104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 121.0, + "completions/mean_length": 81.0625, + "completions/mean_terminated_length": 36.482757568359375, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.884, + "grad_norm": 10.26905345916748, + "kl": 0.4246826171875, + "learning_rate": 1e-06, + "loss": 0.4653, + "num_tokens": 14379298.0, + "reward": 0.1903451383113861, + "reward_std": 0.0727916806936264, + "rewards/bleu_reward_func/mean": 0.1903451383113861, + "rewards/bleu_reward_func/std": 0.18681129813194275, + "step": 1105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 218.09375, + "completions/mean_terminated_length": 103.08695983886719, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.8848, + "grad_norm": 6.238147735595703, + "kl": 0.2109375, + "learning_rate": 1e-06, + "loss": 0.1147, + "num_tokens": 14390325.0, + "reward": 0.10796605050563812, + "reward_std": 0.028253143653273582, + "rewards/bleu_reward_func/mean": 0.10796605050563812, + "rewards/bleu_reward_func/std": 0.08980042487382889, + "step": 1106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 334.3125, + "completions/mean_terminated_length": 177.5294189453125, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.8856, + "grad_norm": 5.977288246154785, + "kl": 0.12933349609375, + "learning_rate": 1e-06, + "loss": 0.0113, + "num_tokens": 14405719.0, + "reward": 0.08215081691741943, + "reward_std": 0.012335095554590225, + "rewards/bleu_reward_func/mean": 0.08215081691741943, + "rewards/bleu_reward_func/std": 0.0935206413269043, + "step": 1107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 368.25, + "completions/mean_terminated_length": 224.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.8864, + "grad_norm": 3.1431217193603516, + "kl": 0.046478271484375, + "learning_rate": 1e-06, + "loss": 0.0529, + "num_tokens": 14420959.0, + "reward": 0.07733479142189026, + "reward_std": 0.04769892990589142, + "rewards/bleu_reward_func/mean": 0.07733479142189026, + "rewards/bleu_reward_func/std": 0.07268877327442169, + "step": 1108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 205.0625, + "completions/mean_terminated_length": 148.22222900390625, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.8872, + "grad_norm": 8.718172073364258, + "kl": 0.27215576171875, + "learning_rate": 1e-06, + "loss": -0.0245, + "num_tokens": 14431297.0, + "reward": 0.08119820058345795, + "reward_std": 0.02793770469725132, + "rewards/bleu_reward_func/mean": 0.08119820058345795, + "rewards/bleu_reward_func/std": 0.046033825725317, + "step": 1109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 218.59375, + "completions/mean_terminated_length": 188.2413787841797, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.888, + "grad_norm": 7.532926082611084, + "kl": 0.197265625, + "learning_rate": 1e-06, + "loss": 0.2, + "num_tokens": 14440556.0, + "reward": 0.16878977417945862, + "reward_std": 0.06408128887414932, + "rewards/bleu_reward_func/mean": 0.16878977417945862, + "rewards/bleu_reward_func/std": 0.17819638550281525, + "step": 1110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 254.875, + "completions/mean_terminated_length": 169.1666717529297, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.8888, + "grad_norm": 5.752809047698975, + "kl": 0.151947021484375, + "learning_rate": 1e-06, + "loss": -0.0323, + "num_tokens": 14453832.0, + "reward": 0.08754751831293106, + "reward_std": 0.041982948780059814, + "rewards/bleu_reward_func/mean": 0.08754751831293106, + "rewards/bleu_reward_func/std": 0.08986286073923111, + "step": 1111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 431.9375, + "completions/mean_terminated_length": 298.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.8896, + "grad_norm": 2.506591320037842, + "kl": 0.04913330078125, + "learning_rate": 1e-06, + "loss": 0.0059, + "num_tokens": 14470966.0, + "reward": 0.05948667228221893, + "reward_std": 0.031033214181661606, + "rewards/bleu_reward_func/mean": 0.05948667228221893, + "rewards/bleu_reward_func/std": 0.04187482222914696, + "step": 1112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 458.125, + "completions/mean_terminated_length": 404.25, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.8904, + "grad_norm": 2.1371476650238037, + "kl": 0.05169677734375, + "learning_rate": 1e-06, + "loss": 0.0275, + "num_tokens": 14488386.0, + "reward": 0.07635128498077393, + "reward_std": 0.02666424587368965, + "rewards/bleu_reward_func/mean": 0.07635128498077393, + "rewards/bleu_reward_func/std": 0.06230180338025093, + "step": 1113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 381.3125, + "completions/mean_terminated_length": 321.9090881347656, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.8912, + "grad_norm": 2.5617504119873047, + "kl": 0.05352783203125, + "learning_rate": 1e-06, + "loss": -0.0071, + "num_tokens": 14502500.0, + "reward": 0.1177460253238678, + "reward_std": 0.02880302257835865, + "rewards/bleu_reward_func/mean": 0.1177460253238678, + "rewards/bleu_reward_func/std": 0.06036384403705597, + "step": 1114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 354.875, + "completions/mean_terminated_length": 293.39129638671875, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.892, + "grad_norm": 6.915249824523926, + "kl": 0.162353515625, + "learning_rate": 1e-06, + "loss": -0.1012, + "num_tokens": 14515272.0, + "reward": 0.08914826065301895, + "reward_std": 0.03028780408203602, + "rewards/bleu_reward_func/mean": 0.08914826065301895, + "rewards/bleu_reward_func/std": 0.042083028703927994, + "step": 1115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 145.96875, + "completions/mean_terminated_length": 61.500003814697266, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.8928, + "grad_norm": 8.211282730102539, + "kl": 0.33642578125, + "learning_rate": 1e-06, + "loss": -0.0331, + "num_tokens": 14523199.0, + "reward": 0.13895554840564728, + "reward_std": 0.06001996994018555, + "rewards/bleu_reward_func/mean": 0.13895554840564728, + "rewards/bleu_reward_func/std": 0.10717976838350296, + "step": 1116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 269.8125, + "completions/mean_terminated_length": 175.04348754882812, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.8936, + "grad_norm": 9.093502044677734, + "kl": 0.18402099609375, + "learning_rate": 1e-06, + "loss": -0.0448, + "num_tokens": 14535921.0, + "reward": 0.08600494265556335, + "reward_std": 0.01430382952094078, + "rewards/bleu_reward_func/mean": 0.08600494265556335, + "rewards/bleu_reward_func/std": 0.03352402523159981, + "step": 1117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 220.65625, + "completions/mean_terminated_length": 166.70370483398438, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.8944, + "grad_norm": 5.133798122406006, + "kl": 0.23126220703125, + "learning_rate": 1e-06, + "loss": -0.0611, + "num_tokens": 14546502.0, + "reward": 0.1281801015138626, + "reward_std": 0.033460669219493866, + "rewards/bleu_reward_func/mean": 0.1281801015138626, + "rewards/bleu_reward_func/std": 0.09999439865350723, + "step": 1118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 414.90625, + "completions/mean_terminated_length": 382.54168701171875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.8952, + "grad_norm": 2.3336856365203857, + "kl": 0.05023193359375, + "learning_rate": 1e-06, + "loss": -0.0381, + "num_tokens": 14563043.0, + "reward": 0.09009081870317459, + "reward_std": 0.024957649409770966, + "rewards/bleu_reward_func/mean": 0.09009081870317459, + "rewards/bleu_reward_func/std": 0.07389495521783829, + "step": 1119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 249.375, + "completions/mean_terminated_length": 188.7692413330078, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.896, + "grad_norm": 4.001770496368408, + "kl": 0.06622314453125, + "learning_rate": 1e-06, + "loss": 0.4376, + "num_tokens": 14574703.0, + "reward": 0.12255299836397171, + "reward_std": 0.06612245738506317, + "rewards/bleu_reward_func/mean": 0.12255299836397171, + "rewards/bleu_reward_func/std": 0.1745522916316986, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 341.96875, + "completions/mean_terminated_length": 264.68182373046875, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.8968, + "grad_norm": 5.190872669219971, + "kl": 0.0662841796875, + "learning_rate": 1e-06, + "loss": -0.2989, + "num_tokens": 14590934.0, + "reward": 0.015649927780032158, + "reward_std": 0.008883368223905563, + "rewards/bleu_reward_func/mean": 0.015649927780032158, + "rewards/bleu_reward_func/std": 0.014249512925744057, + "step": 1121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 204.71875, + "completions/mean_terminated_length": 118.68000030517578, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.8976, + "grad_norm": 7.992037296295166, + "kl": 0.22357177734375, + "learning_rate": 1e-06, + "loss": 0.0293, + "num_tokens": 14601957.0, + "reward": 0.14590373635292053, + "reward_std": 0.032411251217126846, + "rewards/bleu_reward_func/mean": 0.14590373635292053, + "rewards/bleu_reward_func/std": 0.1304609477519989, + "step": 1122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 298.4375, + "completions/mean_terminated_length": 132.3333282470703, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.8984, + "grad_norm": 6.297399044036865, + "kl": 0.05999755859375, + "learning_rate": 1e-06, + "loss": 0.3021, + "num_tokens": 14616019.0, + "reward": 0.071531280875206, + "reward_std": 0.02597668580710888, + "rewards/bleu_reward_func/mean": 0.071531280875206, + "rewards/bleu_reward_func/std": 0.05073075741529465, + "step": 1123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 353.90625, + "completions/mean_terminated_length": 174.73333740234375, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.8992, + "grad_norm": 2.9494781494140625, + "kl": 0.0675048828125, + "learning_rate": 1e-06, + "loss": -0.0383, + "num_tokens": 14634248.0, + "reward": 0.20859137177467346, + "reward_std": 0.026030534878373146, + "rewards/bleu_reward_func/mean": 0.20859137177467346, + "rewards/bleu_reward_func/std": 0.25668489933013916, + "step": 1124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 337.03125, + "completions/mean_terminated_length": 200.94444274902344, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.9, + "grad_norm": 3.6260087490081787, + "kl": 0.06976318359375, + "learning_rate": 1e-06, + "loss": -0.0424, + "num_tokens": 14646505.0, + "reward": 0.03524015098810196, + "reward_std": 0.021195726469159126, + "rewards/bleu_reward_func/mean": 0.03524015098810196, + "rewards/bleu_reward_func/std": 0.04081031307578087, + "step": 1125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 228.375, + "completions/mean_terminated_length": 117.39130401611328, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.9008, + "grad_norm": 6.014510154724121, + "kl": 0.232421875, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 14657525.0, + "reward": 0.06341119110584259, + "reward_std": 0.03255104646086693, + "rewards/bleu_reward_func/mean": 0.06341119110584259, + "rewards/bleu_reward_func/std": 0.04315832257270813, + "step": 1126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 238.21875, + "completions/mean_terminated_length": 175.03846740722656, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.9016, + "grad_norm": 4.443993091583252, + "kl": 0.07666015625, + "learning_rate": 1e-06, + "loss": 0.1492, + "num_tokens": 14669820.0, + "reward": 0.050140924751758575, + "reward_std": 0.02134326659142971, + "rewards/bleu_reward_func/mean": 0.050140924751758575, + "rewards/bleu_reward_func/std": 0.054666925221681595, + "step": 1127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 206.8125, + "completions/mean_terminated_length": 121.36000061035156, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.9024, + "grad_norm": 6.487152099609375, + "kl": 0.30072021484375, + "learning_rate": 1e-06, + "loss": 0.1095, + "num_tokens": 14678878.0, + "reward": 0.20913344621658325, + "reward_std": 0.06204414367675781, + "rewards/bleu_reward_func/mean": 0.20913344621658325, + "rewards/bleu_reward_func/std": 0.15058699250221252, + "step": 1128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 168.5625, + "completions/mean_terminated_length": 104.96296691894531, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.9032, + "grad_norm": 5.987677097320557, + "kl": 0.161376953125, + "learning_rate": 1e-06, + "loss": 0.1159, + "num_tokens": 14690400.0, + "reward": 0.22108127176761627, + "reward_std": 0.03181886300444603, + "rewards/bleu_reward_func/mean": 0.22108127176761627, + "rewards/bleu_reward_func/std": 0.21734413504600525, + "step": 1129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 224.40625, + "completions/mean_terminated_length": 194.65516662597656, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.904, + "grad_norm": 4.591039180755615, + "kl": 0.1512451171875, + "learning_rate": 1e-06, + "loss": 0.0318, + "num_tokens": 14703621.0, + "reward": 0.13979627192020416, + "reward_std": 0.024196792393922806, + "rewards/bleu_reward_func/mean": 0.13979627192020416, + "rewards/bleu_reward_func/std": 0.11370246112346649, + "step": 1130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 191.96875, + "completions/mean_terminated_length": 170.6333465576172, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.9048, + "grad_norm": 9.648924827575684, + "kl": 0.3162841796875, + "learning_rate": 1e-06, + "loss": 0.0993, + "num_tokens": 14715324.0, + "reward": 0.24474243819713593, + "reward_std": 0.03903892636299133, + "rewards/bleu_reward_func/mean": 0.24474243819713593, + "rewards/bleu_reward_func/std": 0.111121766269207, + "step": 1131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 335.125, + "completions/mean_terminated_length": 158.25, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.9056, + "grad_norm": 3.5864415168762207, + "kl": 0.05462646484375, + "learning_rate": 1e-06, + "loss": 0.0438, + "num_tokens": 14728656.0, + "reward": 0.08658318221569061, + "reward_std": 0.03171471878886223, + "rewards/bleu_reward_func/mean": 0.08658318221569061, + "rewards/bleu_reward_func/std": 0.05243143439292908, + "step": 1132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 196.90625, + "completions/mean_terminated_length": 186.74192810058594, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.9064, + "grad_norm": 5.391827583312988, + "kl": 0.149871826171875, + "learning_rate": 1e-06, + "loss": 0.209, + "num_tokens": 14743445.0, + "reward": 0.14488989114761353, + "reward_std": 0.05979035794734955, + "rewards/bleu_reward_func/mean": 0.14488989114761353, + "rewards/bleu_reward_func/std": 0.11484233289957047, + "step": 1133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 147.9375, + "completions/mean_terminated_length": 26.58333396911621, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.9072, + "grad_norm": 9.304245948791504, + "kl": 0.4073486328125, + "learning_rate": 1e-06, + "loss": 0.0082, + "num_tokens": 14755539.0, + "reward": 0.2897959053516388, + "reward_std": 0.11407680809497833, + "rewards/bleu_reward_func/mean": 0.2897959053516388, + "rewards/bleu_reward_func/std": 0.3296494483947754, + "step": 1134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 302.0625, + "completions/mean_terminated_length": 219.9130401611328, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.908, + "grad_norm": 4.041188716888428, + "kl": 0.08953857421875, + "learning_rate": 1e-06, + "loss": 0.0784, + "num_tokens": 14771221.0, + "reward": 0.0867033302783966, + "reward_std": 0.03230535611510277, + "rewards/bleu_reward_func/mean": 0.0867033302783966, + "rewards/bleu_reward_func/std": 0.05712318420410156, + "step": 1135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 353.21875, + "completions/mean_terminated_length": 244.57894897460938, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.9088, + "grad_norm": 3.4682111740112305, + "kl": 0.063140869140625, + "learning_rate": 1e-06, + "loss": -0.013, + "num_tokens": 14786884.0, + "reward": 0.2040112018585205, + "reward_std": 0.03282826021313667, + "rewards/bleu_reward_func/mean": 0.2040112018585205, + "rewards/bleu_reward_func/std": 0.2342006117105484, + "step": 1136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 323.46875, + "completions/mean_terminated_length": 176.8333282470703, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.9096, + "grad_norm": 9.80130672454834, + "kl": 0.296661376953125, + "learning_rate": 1e-06, + "loss": 0.0879, + "num_tokens": 14807267.0, + "reward": 0.15428093075752258, + "reward_std": 0.047303371131420135, + "rewards/bleu_reward_func/mean": 0.15428093075752258, + "rewards/bleu_reward_func/std": 0.12975330650806427, + "step": 1137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 213.3125, + "completions/mean_terminated_length": 113.75, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.9104, + "grad_norm": 6.25737190246582, + "kl": 0.125823974609375, + "learning_rate": 1e-06, + "loss": 0.0125, + "num_tokens": 14815933.0, + "reward": 0.0721752792596817, + "reward_std": 0.021741271018981934, + "rewards/bleu_reward_func/mean": 0.0721752792596817, + "rewards/bleu_reward_func/std": 0.05829243361949921, + "step": 1138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 107.125, + "completions/mean_terminated_length": 107.125, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.9112, + "grad_norm": 25.477249145507812, + "kl": 0.55877685546875, + "learning_rate": 1e-06, + "loss": -0.2259, + "num_tokens": 14825321.0, + "reward": 0.1802026480436325, + "reward_std": 0.11938925087451935, + "rewards/bleu_reward_func/mean": 0.1802026480436325, + "rewards/bleu_reward_func/std": 0.1613418012857437, + "step": 1139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 105.3125, + "completions/mean_terminated_length": 105.3125, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.912, + "grad_norm": 11.604246139526367, + "kl": 0.2550048828125, + "learning_rate": 1e-06, + "loss": -0.0128, + "num_tokens": 14831115.0, + "reward": 0.19547826051712036, + "reward_std": 0.07176055759191513, + "rewards/bleu_reward_func/mean": 0.19547826051712036, + "rewards/bleu_reward_func/std": 0.11230416595935822, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 255.59375, + "completions/mean_terminated_length": 208.11111450195312, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.9128, + "grad_norm": 6.936506271362305, + "kl": 0.14453125, + "learning_rate": 1e-06, + "loss": -0.1892, + "num_tokens": 14847286.0, + "reward": 0.07467533648014069, + "reward_std": 0.0442538745701313, + "rewards/bleu_reward_func/mean": 0.07467533648014069, + "rewards/bleu_reward_func/std": 0.0976758524775505, + "step": 1141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 397.8125, + "completions/mean_terminated_length": 309.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.9136, + "grad_norm": 3.533027410507202, + "kl": 0.1064453125, + "learning_rate": 1e-06, + "loss": 0.2572, + "num_tokens": 14862600.0, + "reward": 0.07055975496768951, + "reward_std": 0.024620652198791504, + "rewards/bleu_reward_func/mean": 0.07055975496768951, + "rewards/bleu_reward_func/std": 0.04198145866394043, + "step": 1142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 374.34375, + "completions/mean_terminated_length": 144.9166717529297, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.9144, + "grad_norm": 3.106947422027588, + "kl": 0.05267333984375, + "learning_rate": 1e-06, + "loss": 0.3129, + "num_tokens": 14879211.0, + "reward": 0.047079749405384064, + "reward_std": 0.01572955772280693, + "rewards/bleu_reward_func/mean": 0.047079749405384064, + "rewards/bleu_reward_func/std": 0.03182151913642883, + "step": 1143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 296.21875, + "completions/mean_terminated_length": 289.258056640625, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.9152, + "grad_norm": 9.539961814880371, + "kl": 0.26116943359375, + "learning_rate": 1e-06, + "loss": 0.3821, + "num_tokens": 14892074.0, + "reward": 0.04650323465466499, + "reward_std": 0.016895011067390442, + "rewards/bleu_reward_func/mean": 0.04650323465466499, + "rewards/bleu_reward_func/std": 0.020110802724957466, + "step": 1144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 237.75, + "completions/mean_terminated_length": 50.105262756347656, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.916, + "grad_norm": 10.258657455444336, + "kl": 0.3250732421875, + "learning_rate": 1e-06, + "loss": -0.0619, + "num_tokens": 14904698.0, + "reward": 0.17951351404190063, + "reward_std": 0.07376629114151001, + "rewards/bleu_reward_func/mean": 0.17951351404190063, + "rewards/bleu_reward_func/std": 0.17956046760082245, + "step": 1145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 286.78125, + "completions/mean_terminated_length": 151.65000915527344, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.9168, + "grad_norm": 6.667459964752197, + "kl": 0.08001708984375, + "learning_rate": 1e-06, + "loss": 0.5727, + "num_tokens": 14917451.0, + "reward": 0.03467312082648277, + "reward_std": 0.01676066778600216, + "rewards/bleu_reward_func/mean": 0.03467312082648277, + "rewards/bleu_reward_func/std": 0.02004723809659481, + "step": 1146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 266.90625, + "completions/mean_terminated_length": 21.8125, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.9176, + "grad_norm": 7.602176189422607, + "kl": 0.4158935546875, + "learning_rate": 1e-06, + "loss": 0.038, + "num_tokens": 14932352.0, + "reward": 0.19471007585525513, + "reward_std": 0.04646201431751251, + "rewards/bleu_reward_func/mean": 0.19471007585525513, + "rewards/bleu_reward_func/std": 0.1760382354259491, + "step": 1147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 315.15625, + "completions/mean_terminated_length": 162.05555725097656, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.9184, + "grad_norm": 6.479866027832031, + "kl": 0.109619140625, + "learning_rate": 1e-06, + "loss": 0.0595, + "num_tokens": 14947741.0, + "reward": 0.10786274820566177, + "reward_std": 0.036432720720767975, + "rewards/bleu_reward_func/mean": 0.10786274820566177, + "rewards/bleu_reward_func/std": 0.0789819210767746, + "step": 1148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 472.5, + "completions/mean_terminated_length": 427.7333679199219, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.9192, + "grad_norm": 2.4095027446746826, + "kl": 0.0462646484375, + "learning_rate": 1e-06, + "loss": 0.0305, + "num_tokens": 14966173.0, + "reward": 0.1280764639377594, + "reward_std": 0.03749135136604309, + "rewards/bleu_reward_func/mean": 0.1280764639377594, + "rewards/bleu_reward_func/std": 0.05864708498120308, + "step": 1149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 255.90625, + "completions/mean_terminated_length": 29.941177368164062, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.92, + "grad_norm": 10.67159652709961, + "kl": 0.1556396484375, + "learning_rate": 1e-06, + "loss": 0.1007, + "num_tokens": 14979570.0, + "reward": 0.055816084146499634, + "reward_std": 0.020302332937717438, + "rewards/bleu_reward_func/mean": 0.055816084146499634, + "rewards/bleu_reward_func/std": 0.03035581111907959, + "step": 1150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 210.78125, + "completions/mean_terminated_length": 167.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.9208, + "grad_norm": 6.78237771987915, + "kl": 0.17840576171875, + "learning_rate": 1e-06, + "loss": 0.09, + "num_tokens": 14993923.0, + "reward": 0.07397650182247162, + "reward_std": 0.01999451220035553, + "rewards/bleu_reward_func/mean": 0.07397650182247162, + "rewards/bleu_reward_func/std": 0.0341508574783802, + "step": 1151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 227.34375, + "completions/mean_terminated_length": 115.95652770996094, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.9216, + "grad_norm": 9.095620155334473, + "kl": 0.483001708984375, + "learning_rate": 1e-06, + "loss": 0.2875, + "num_tokens": 15005734.0, + "reward": 0.1028270274400711, + "reward_std": 0.05264887586236, + "rewards/bleu_reward_func/mean": 0.1028270274400711, + "rewards/bleu_reward_func/std": 0.08930659294128418, + "step": 1152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 436.0625, + "completions/mean_terminated_length": 269.0, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.9224, + "grad_norm": 2.607079029083252, + "kl": 0.04913330078125, + "learning_rate": 1e-06, + "loss": 0.2182, + "num_tokens": 15021632.0, + "reward": 0.08679656684398651, + "reward_std": 0.05561990663409233, + "rewards/bleu_reward_func/mean": 0.08679656684398651, + "rewards/bleu_reward_func/std": 0.09985605627298355, + "step": 1153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 248.28125, + "completions/mean_terminated_length": 128.40908813476562, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.9232, + "grad_norm": 6.871771812438965, + "kl": 0.18536376953125, + "learning_rate": 1e-06, + "loss": 0.02, + "num_tokens": 15035081.0, + "reward": 0.19525909423828125, + "reward_std": 0.04538525268435478, + "rewards/bleu_reward_func/mean": 0.19525909423828125, + "rewards/bleu_reward_func/std": 0.18253828585147858, + "step": 1154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 285.65625, + "completions/mean_terminated_length": 59.3125, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.924, + "grad_norm": 6.028687953948975, + "kl": 0.0938720703125, + "learning_rate": 1e-06, + "loss": 0.2695, + "num_tokens": 15049558.0, + "reward": 0.09561645239591599, + "reward_std": 0.0469173789024353, + "rewards/bleu_reward_func/mean": 0.09561645239591599, + "rewards/bleu_reward_func/std": 0.07949265837669373, + "step": 1155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 237.875, + "completions/mean_terminated_length": 161.1199951171875, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.9248, + "grad_norm": 18.29808807373047, + "kl": 0.55096435546875, + "learning_rate": 1e-06, + "loss": 0.0911, + "num_tokens": 15060954.0, + "reward": 0.1312306672334671, + "reward_std": 0.05109435319900513, + "rewards/bleu_reward_func/mean": 0.1312306672334671, + "rewards/bleu_reward_func/std": 0.0968712568283081, + "step": 1156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 183.40625, + "completions/mean_terminated_length": 73.875, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.9256, + "grad_norm": 8.553780555725098, + "kl": 0.2044677734375, + "learning_rate": 1e-06, + "loss": 0.3837, + "num_tokens": 15071151.0, + "reward": 0.08946996927261353, + "reward_std": 0.032011546194553375, + "rewards/bleu_reward_func/mean": 0.08946996927261353, + "rewards/bleu_reward_func/std": 0.08429201692342758, + "step": 1157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 187.40625, + "completions/mean_terminated_length": 153.8275909423828, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.9264, + "grad_norm": 7.326318740844727, + "kl": 0.14752197265625, + "learning_rate": 1e-06, + "loss": 0.5061, + "num_tokens": 15080252.0, + "reward": 0.1023559644818306, + "reward_std": 0.045405931770801544, + "rewards/bleu_reward_func/mean": 0.1023559644818306, + "rewards/bleu_reward_func/std": 0.07145705074071884, + "step": 1158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 304.4375, + "completions/mean_terminated_length": 246.3199920654297, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.9272, + "grad_norm": 5.513801574707031, + "kl": 0.1817626953125, + "learning_rate": 1e-06, + "loss": 0.1149, + "num_tokens": 15095610.0, + "reward": 0.11080615222454071, + "reward_std": 0.043468981981277466, + "rewards/bleu_reward_func/mean": 0.11080615222454071, + "rewards/bleu_reward_func/std": 0.04713428020477295, + "step": 1159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 205.40625, + "completions/mean_terminated_length": 161.60714721679688, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.928, + "grad_norm": 7.627071857452393, + "kl": 0.318359375, + "learning_rate": 1e-06, + "loss": 0.1805, + "num_tokens": 15104167.0, + "reward": 0.0735570564866066, + "reward_std": 0.02132660523056984, + "rewards/bleu_reward_func/mean": 0.0735570564866066, + "rewards/bleu_reward_func/std": 0.05421363562345505, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 256.25, + "completions/mean_terminated_length": 156.17391967773438, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.9288, + "grad_norm": 6.11322021484375, + "kl": 0.27099609375, + "learning_rate": 1e-06, + "loss": 0.1028, + "num_tokens": 15118823.0, + "reward": 0.1414988487958908, + "reward_std": 0.03222941979765892, + "rewards/bleu_reward_func/mean": 0.1414988487958908, + "rewards/bleu_reward_func/std": 0.15351709723472595, + "step": 1161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 417.8125, + "completions/mean_terminated_length": 238.0, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.9296, + "grad_norm": 3.5942113399505615, + "kl": 0.08154296875, + "learning_rate": 1e-06, + "loss": 0.1689, + "num_tokens": 15139105.0, + "reward": 0.12588296830654144, + "reward_std": 0.04371759667992592, + "rewards/bleu_reward_func/mean": 0.12588296830654144, + "rewards/bleu_reward_func/std": 0.14081913232803345, + "step": 1162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 318.125, + "completions/mean_terminated_length": 273.3846130371094, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.9304, + "grad_norm": 3.5201575756073, + "kl": 0.145751953125, + "learning_rate": 1e-06, + "loss": -0.0396, + "num_tokens": 15151437.0, + "reward": 0.06649903953075409, + "reward_std": 0.024907082319259644, + "rewards/bleu_reward_func/mean": 0.06649903953075409, + "rewards/bleu_reward_func/std": 0.04641694948077202, + "step": 1163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 263.8125, + "completions/mean_terminated_length": 166.69564819335938, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.9312, + "grad_norm": 5.745899200439453, + "kl": 0.124755859375, + "learning_rate": 1e-06, + "loss": 0.1729, + "num_tokens": 15164791.0, + "reward": 0.33342817425727844, + "reward_std": 0.06225915253162384, + "rewards/bleu_reward_func/mean": 0.33342817425727844, + "rewards/bleu_reward_func/std": 0.3276880383491516, + "step": 1164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 280.1875, + "completions/mean_terminated_length": 272.70965576171875, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.932, + "grad_norm": 7.059730052947998, + "kl": 0.154541015625, + "learning_rate": 1e-06, + "loss": 0.2383, + "num_tokens": 15176629.0, + "reward": 0.03160897642374039, + "reward_std": 0.010618302971124649, + "rewards/bleu_reward_func/mean": 0.03160897642374039, + "rewards/bleu_reward_func/std": 0.016612010076642036, + "step": 1165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 295.59375, + "completions/mean_terminated_length": 281.16668701171875, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.9328, + "grad_norm": 5.280132293701172, + "kl": 0.13616943359375, + "learning_rate": 1e-06, + "loss": 0.3116, + "num_tokens": 15188168.0, + "reward": 0.03221059590578079, + "reward_std": 0.02213170751929283, + "rewards/bleu_reward_func/mean": 0.03221059590578079, + "rewards/bleu_reward_func/std": 0.03985392674803734, + "step": 1166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 227.375, + "completions/mean_terminated_length": 132.5, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9336, + "grad_norm": 6.5651021003723145, + "kl": 0.142578125, + "learning_rate": 1e-06, + "loss": 0.0658, + "num_tokens": 15201892.0, + "reward": 0.1807694286108017, + "reward_std": 0.1409318894147873, + "rewards/bleu_reward_func/mean": 0.1807694286108017, + "rewards/bleu_reward_func/std": 0.2797906994819641, + "step": 1167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 220.5, + "completions/mean_terminated_length": 190.34483337402344, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.9344, + "grad_norm": 7.170085430145264, + "kl": 0.400634765625, + "learning_rate": 1e-06, + "loss": 0.0572, + "num_tokens": 15212724.0, + "reward": 0.1451285183429718, + "reward_std": 0.0673985704779625, + "rewards/bleu_reward_func/mean": 0.1451285183429718, + "rewards/bleu_reward_func/std": 0.12026475369930267, + "step": 1168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 313.75, + "completions/mean_terminated_length": 247.6666717529297, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.9352, + "grad_norm": 5.603696823120117, + "kl": 0.152587890625, + "learning_rate": 1e-06, + "loss": 0.0616, + "num_tokens": 15228428.0, + "reward": 0.253650963306427, + "reward_std": 0.03022560104727745, + "rewards/bleu_reward_func/mean": 0.253650963306427, + "rewards/bleu_reward_func/std": 0.3357136845588684, + "step": 1169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 255.125, + "completions/mean_terminated_length": 154.60870361328125, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.936, + "grad_norm": 6.110992431640625, + "kl": 0.1976318359375, + "learning_rate": 1e-06, + "loss": -0.0162, + "num_tokens": 15242768.0, + "reward": 0.0904015377163887, + "reward_std": 0.025095967575907707, + "rewards/bleu_reward_func/mean": 0.0904015377163887, + "rewards/bleu_reward_func/std": 0.09678779542446136, + "step": 1170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 241.625, + "completions/mean_terminated_length": 213.65516662597656, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.9368, + "grad_norm": 7.32260799407959, + "kl": 0.28955078125, + "learning_rate": 1e-06, + "loss": 0.2098, + "num_tokens": 15256260.0, + "reward": 0.09696318954229355, + "reward_std": 0.04769141972064972, + "rewards/bleu_reward_func/mean": 0.09696318954229355, + "rewards/bleu_reward_func/std": 0.07404191046953201, + "step": 1171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 189.96875, + "completions/mean_terminated_length": 156.65516662597656, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.9376, + "grad_norm": 8.43685531616211, + "kl": 0.293212890625, + "learning_rate": 1e-06, + "loss": -0.1046, + "num_tokens": 15269899.0, + "reward": 0.09695740044116974, + "reward_std": 0.037594642490148544, + "rewards/bleu_reward_func/mean": 0.09695740044116974, + "rewards/bleu_reward_func/std": 0.05750608071684837, + "step": 1172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 343.03125, + "completions/mean_terminated_length": 125.78572082519531, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.9384, + "grad_norm": 5.582010269165039, + "kl": 0.410400390625, + "learning_rate": 1e-06, + "loss": -0.1385, + "num_tokens": 15287108.0, + "reward": 0.1451946198940277, + "reward_std": 0.03915205970406532, + "rewards/bleu_reward_func/mean": 0.1451946198940277, + "rewards/bleu_reward_func/std": 0.17974776029586792, + "step": 1173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 331.84375, + "completions/mean_terminated_length": 261.34783935546875, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.9392, + "grad_norm": 11.770851135253906, + "kl": 0.3424072265625, + "learning_rate": 1e-06, + "loss": 0.1869, + "num_tokens": 15304535.0, + "reward": 0.1413375586271286, + "reward_std": 0.06474150717258453, + "rewards/bleu_reward_func/mean": 0.1413375586271286, + "rewards/bleu_reward_func/std": 0.0782981589436531, + "step": 1174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 299.46875, + "completions/mean_terminated_length": 188.1428680419922, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.94, + "grad_norm": 5.467903137207031, + "kl": 0.198486328125, + "learning_rate": 1e-06, + "loss": -0.1137, + "num_tokens": 15317078.0, + "reward": 0.14337725937366486, + "reward_std": 0.03686724230647087, + "rewards/bleu_reward_func/mean": 0.14337725937366486, + "rewards/bleu_reward_func/std": 0.16096609830856323, + "step": 1175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 253.8125, + "completions/mean_terminated_length": 181.51998901367188, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.9408, + "grad_norm": 8.620965957641602, + "kl": 0.379150390625, + "learning_rate": 1e-06, + "loss": -0.0426, + "num_tokens": 15328768.0, + "reward": 0.1426527500152588, + "reward_std": 0.0550708994269371, + "rewards/bleu_reward_func/mean": 0.1426527500152588, + "rewards/bleu_reward_func/std": 0.12621666491031647, + "step": 1176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 193.21875, + "completions/mean_terminated_length": 86.95833587646484, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.9416, + "grad_norm": 10.362127304077148, + "kl": 0.565673828125, + "learning_rate": 1e-06, + "loss": 0.0348, + "num_tokens": 15340919.0, + "reward": 0.04140050709247589, + "reward_std": 0.019718483090400696, + "rewards/bleu_reward_func/mean": 0.04140050709247589, + "rewards/bleu_reward_func/std": 0.03685431182384491, + "step": 1177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 349.5625, + "completions/mean_terminated_length": 223.22222900390625, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.9424, + "grad_norm": 5.097819805145264, + "kl": 0.2220458984375, + "learning_rate": 1e-06, + "loss": 0.0039, + "num_tokens": 15356545.0, + "reward": 0.14949634671211243, + "reward_std": 0.05013212561607361, + "rewards/bleu_reward_func/mean": 0.14949634671211243, + "rewards/bleu_reward_func/std": 0.19787877798080444, + "step": 1178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 160.03125, + "completions/mean_terminated_length": 123.62068939208984, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.9432, + "grad_norm": 10.772759437561035, + "kl": 0.300537109375, + "learning_rate": 1e-06, + "loss": 0.0346, + "num_tokens": 15364554.0, + "reward": 0.1290975958108902, + "reward_std": 0.07744569331407547, + "rewards/bleu_reward_func/mean": 0.1290975958108902, + "rewards/bleu_reward_func/std": 0.1356077641248703, + "step": 1179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 378.1875, + "completions/mean_terminated_length": 122.72727966308594, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.944, + "grad_norm": 6.254019260406494, + "kl": 0.2237548828125, + "learning_rate": 1e-06, + "loss": 0.0559, + "num_tokens": 15382792.0, + "reward": 0.11257205903530121, + "reward_std": 0.036544833332300186, + "rewards/bleu_reward_func/mean": 0.11257205903530121, + "rewards/bleu_reward_func/std": 0.10338166356086731, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 185.75, + "completions/mean_terminated_length": 37.45454788208008, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.9448, + "grad_norm": 30.484220504760742, + "kl": 1.228515625, + "learning_rate": 1e-06, + "loss": 0.7897, + "num_tokens": 15391000.0, + "reward": 0.2840408384799957, + "reward_std": 0.15822984278202057, + "rewards/bleu_reward_func/mean": 0.2840408384799957, + "rewards/bleu_reward_func/std": 0.1896887719631195, + "step": 1181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 237.8125, + "completions/mean_terminated_length": 130.52174377441406, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.9456, + "grad_norm": 18.71758270263672, + "kl": 1.376953125, + "learning_rate": 1e-06, + "loss": 0.403, + "num_tokens": 15401594.0, + "reward": 0.16319331526756287, + "reward_std": 0.07705336064100266, + "rewards/bleu_reward_func/mean": 0.16319331526756287, + "rewards/bleu_reward_func/std": 0.21551194787025452, + "step": 1182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 301.6875, + "completions/mean_terminated_length": 175.5, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.9464, + "grad_norm": 10.93260383605957, + "kl": 0.89013671875, + "learning_rate": 1e-06, + "loss": 0.073, + "num_tokens": 15415696.0, + "reward": 0.1893438994884491, + "reward_std": 0.050289541482925415, + "rewards/bleu_reward_func/mean": 0.1893438994884491, + "rewards/bleu_reward_func/std": 0.2888805866241455, + "step": 1183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 409.75, + "completions/mean_terminated_length": 184.8000030517578, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.9472, + "grad_norm": 18.76006317138672, + "kl": 0.81097412109375, + "learning_rate": 1e-06, + "loss": 0.1983, + "num_tokens": 15436760.0, + "reward": 0.06023106724023819, + "reward_std": 0.03375660628080368, + "rewards/bleu_reward_func/mean": 0.06023106724023819, + "rewards/bleu_reward_func/std": 0.07748028635978699, + "step": 1184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 319.875, + "completions/mean_terminated_length": 188.42105102539062, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.948, + "grad_norm": 8.845364570617676, + "kl": 1.5517578125, + "learning_rate": 1e-06, + "loss": 0.1722, + "num_tokens": 15449196.0, + "reward": 0.08687852323055267, + "reward_std": 0.02910173125565052, + "rewards/bleu_reward_func/mean": 0.08687852323055267, + "rewards/bleu_reward_func/std": 0.06549690663814545, + "step": 1185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 192.5625, + "completions/mean_terminated_length": 86.08333587646484, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.9488, + "grad_norm": 17.081815719604492, + "kl": 0.970703125, + "learning_rate": 1e-06, + "loss": 0.1558, + "num_tokens": 15458006.0, + "reward": 0.10367533564567566, + "reward_std": 0.04477589949965477, + "rewards/bleu_reward_func/mean": 0.10367533564567566, + "rewards/bleu_reward_func/std": 0.1153038814663887, + "step": 1186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 165.3125, + "completions/mean_terminated_length": 115.78572082519531, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.9496, + "grad_norm": 18.12542152404785, + "kl": 1.48876953125, + "learning_rate": 1e-06, + "loss": 0.1714, + "num_tokens": 15469048.0, + "reward": 0.14421464502811432, + "reward_std": 0.05387473851442337, + "rewards/bleu_reward_func/mean": 0.14421464502811432, + "rewards/bleu_reward_func/std": 0.11958708614110947, + "step": 1187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 170.8125, + "completions/mean_terminated_length": 92.0769271850586, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.9504, + "grad_norm": 19.755910873413086, + "kl": 1.64697265625, + "learning_rate": 1e-06, + "loss": 0.083, + "num_tokens": 15478610.0, + "reward": 0.11767937242984772, + "reward_std": 0.038736552000045776, + "rewards/bleu_reward_func/mean": 0.11767937242984772, + "rewards/bleu_reward_func/std": 0.14759457111358643, + "step": 1188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 303.34375, + "completions/mean_terminated_length": 233.7916717529297, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.9512, + "grad_norm": 7.415678024291992, + "kl": 0.464111328125, + "learning_rate": 1e-06, + "loss": -0.0998, + "num_tokens": 15493469.0, + "reward": 0.13566911220550537, + "reward_std": 0.03367416933178902, + "rewards/bleu_reward_func/mean": 0.13566911220550537, + "rewards/bleu_reward_func/std": 0.16353026032447815, + "step": 1189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 371.59375, + "completions/mean_terminated_length": 307.7727355957031, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.952, + "grad_norm": 4.499618053436279, + "kl": 0.21990966796875, + "learning_rate": 1e-06, + "loss": -0.0327, + "num_tokens": 15510992.0, + "reward": 0.05285275727510452, + "reward_std": 0.026856746524572372, + "rewards/bleu_reward_func/mean": 0.05285275727510452, + "rewards/bleu_reward_func/std": 0.028620464727282524, + "step": 1190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 211.5, + "completions/mean_terminated_length": 155.8518524169922, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.9528, + "grad_norm": 11.282941818237305, + "kl": 0.353515625, + "learning_rate": 1e-06, + "loss": -0.0409, + "num_tokens": 15519336.0, + "reward": 0.09386638551950455, + "reward_std": 0.03402595967054367, + "rewards/bleu_reward_func/mean": 0.09386638551950455, + "rewards/bleu_reward_func/std": 0.08018817007541656, + "step": 1191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 53.5625, + "completions/mean_terminated_length": 53.5625, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.9536, + "grad_norm": 30.713850021362305, + "kl": 0.921142578125, + "learning_rate": 1e-06, + "loss": 0.5322, + "num_tokens": 15524298.0, + "reward": 0.1515873372554779, + "reward_std": 0.03387141600251198, + "rewards/bleu_reward_func/mean": 0.1515873372554779, + "rewards/bleu_reward_func/std": 0.14795146882534027, + "step": 1192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 307.78125, + "completions/mean_terminated_length": 148.94444274902344, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.9544, + "grad_norm": 8.172345161437988, + "kl": 0.3499755859375, + "learning_rate": 1e-06, + "loss": -0.1691, + "num_tokens": 15539683.0, + "reward": 0.06142358481884003, + "reward_std": 0.017768073827028275, + "rewards/bleu_reward_func/mean": 0.06142358481884003, + "rewards/bleu_reward_func/std": 0.02769811637699604, + "step": 1193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 59.1875, + "completions/mean_terminated_length": 44.58064270019531, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.9552, + "grad_norm": 10.542518615722656, + "kl": 0.6220703125, + "learning_rate": 1e-06, + "loss": -0.2357, + "num_tokens": 15546073.0, + "reward": 0.13228365778923035, + "reward_std": 0.05075054615736008, + "rewards/bleu_reward_func/mean": 0.13228365778923035, + "rewards/bleu_reward_func/std": 0.14825788140296936, + "step": 1194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 351.75, + "completions/mean_terminated_length": 255.60000610351562, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.956, + "grad_norm": 14.762670516967773, + "kl": 0.54180908203125, + "learning_rate": 1e-06, + "loss": 0.1487, + "num_tokens": 15561137.0, + "reward": 0.10621648281812668, + "reward_std": 0.04206620901823044, + "rewards/bleu_reward_func/mean": 0.10621648281812668, + "rewards/bleu_reward_func/std": 0.05661296099424362, + "step": 1195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 363.1875, + "completions/mean_terminated_length": 285.23809814453125, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.9568, + "grad_norm": 2.930482864379883, + "kl": 0.08624267578125, + "learning_rate": 1e-06, + "loss": -0.2447, + "num_tokens": 15575311.0, + "reward": 0.026056351140141487, + "reward_std": 0.01072642207145691, + "rewards/bleu_reward_func/mean": 0.026056351140141487, + "rewards/bleu_reward_func/std": 0.014662106521427631, + "step": 1196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 334.6875, + "completions/mean_terminated_length": 228.3000030517578, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.9576, + "grad_norm": 4.765532493591309, + "kl": 0.20489501953125, + "learning_rate": 1e-06, + "loss": -0.0713, + "num_tokens": 15591997.0, + "reward": 0.18965111672878265, + "reward_std": 0.03347271308302879, + "rewards/bleu_reward_func/mean": 0.18965111672878265, + "rewards/bleu_reward_func/std": 0.24074162542819977, + "step": 1197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 206.71875, + "completions/mean_terminated_length": 186.36668395996094, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.9584, + "grad_norm": 9.595136642456055, + "kl": 0.4879150390625, + "learning_rate": 1e-06, + "loss": -0.0063, + "num_tokens": 15601852.0, + "reward": 0.07087633013725281, + "reward_std": 0.024902882054448128, + "rewards/bleu_reward_func/mean": 0.07087633013725281, + "rewards/bleu_reward_func/std": 0.052057161927223206, + "step": 1198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 286.59375, + "completions/mean_terminated_length": 223.47999572753906, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.9592, + "grad_norm": 5.445902347564697, + "kl": 0.289306640625, + "learning_rate": 1e-06, + "loss": -0.0083, + "num_tokens": 15613743.0, + "reward": 0.10761390626430511, + "reward_std": 0.03891483694314957, + "rewards/bleu_reward_func/mean": 0.10761390626430511, + "rewards/bleu_reward_func/std": 0.10792107880115509, + "step": 1199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 112.0, + "completions/max_terminated_length": 112.0, + "completions/mean_length": 29.75, + "completions/mean_terminated_length": 29.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.96, + "grad_norm": 16.023887634277344, + "kl": 1.11669921875, + "learning_rate": 1e-06, + "loss": 0.1468, + "num_tokens": 15619671.0, + "reward": 0.3471377491950989, + "reward_std": 0.05158979445695877, + "rewards/bleu_reward_func/mean": 0.3471377491950989, + "rewards/bleu_reward_func/std": 0.13238590955734253, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 325.5, + "completions/mean_terminated_length": 213.60000610351562, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.9608, + "grad_norm": 5.491018295288086, + "kl": 0.32122802734375, + "learning_rate": 1e-06, + "loss": -0.2311, + "num_tokens": 15634047.0, + "reward": 0.07880916446447372, + "reward_std": 0.030750975012779236, + "rewards/bleu_reward_func/mean": 0.07880916446447372, + "rewards/bleu_reward_func/std": 0.06850366294384003, + "step": 1201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 264.5625, + "completions/mean_terminated_length": 248.06668090820312, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.9616, + "grad_norm": 4.983512878417969, + "kl": 0.1092529296875, + "learning_rate": 1e-06, + "loss": -0.1154, + "num_tokens": 15647369.0, + "reward": 0.09226585179567337, + "reward_std": 0.04809027165174484, + "rewards/bleu_reward_func/mean": 0.09226585179567337, + "rewards/bleu_reward_func/std": 0.14570128917694092, + "step": 1202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 160.71875, + "completions/mean_terminated_length": 160.71875, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.9624, + "grad_norm": 4.987871170043945, + "kl": 0.1322021484375, + "learning_rate": 1e-06, + "loss": 0.0973, + "num_tokens": 15656384.0, + "reward": 0.036719270050525665, + "reward_std": 0.007080578710883856, + "rewards/bleu_reward_func/mean": 0.036719270050525665, + "rewards/bleu_reward_func/std": 0.018336299806833267, + "step": 1203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 238.03125, + "completions/mean_terminated_length": 146.70834350585938, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.9632, + "grad_norm": 4.710721492767334, + "kl": 0.24249267578125, + "learning_rate": 1e-06, + "loss": -0.0163, + "num_tokens": 15666769.0, + "reward": 0.11304133385419846, + "reward_std": 0.03101547807455063, + "rewards/bleu_reward_func/mean": 0.11304133385419846, + "rewards/bleu_reward_func/std": 0.09199430793523788, + "step": 1204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 129.78125, + "completions/mean_terminated_length": 129.78125, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.964, + "grad_norm": 9.284921646118164, + "kl": 0.5504150390625, + "learning_rate": 1e-06, + "loss": -0.0268, + "num_tokens": 15674122.0, + "reward": 0.07591907680034637, + "reward_std": 0.03484845906496048, + "rewards/bleu_reward_func/mean": 0.07591907680034637, + "rewards/bleu_reward_func/std": 0.05640895664691925, + "step": 1205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 280.15625, + "completions/mean_terminated_length": 158.71429443359375, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.9648, + "grad_norm": 5.940964698791504, + "kl": 0.234375, + "learning_rate": 1e-06, + "loss": 0.2716, + "num_tokens": 15690119.0, + "reward": 0.19026660919189453, + "reward_std": 0.06492812931537628, + "rewards/bleu_reward_func/mean": 0.19026660919189453, + "rewards/bleu_reward_func/std": 0.1680937260389328, + "step": 1206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 190.96875, + "completions/mean_terminated_length": 131.51852416992188, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.9656, + "grad_norm": 4.697986602783203, + "kl": 0.17156982421875, + "learning_rate": 1e-06, + "loss": 0.1839, + "num_tokens": 15700302.0, + "reward": 0.29912805557250977, + "reward_std": 0.05129002407193184, + "rewards/bleu_reward_func/mean": 0.29912805557250977, + "rewards/bleu_reward_func/std": 0.33928823471069336, + "step": 1207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 403.34375, + "completions/mean_terminated_length": 346.4285888671875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.9664, + "grad_norm": 3.426858901977539, + "kl": 0.05426025390625, + "learning_rate": 1e-06, + "loss": -0.041, + "num_tokens": 15718081.0, + "reward": 0.10875709354877472, + "reward_std": 0.017351722344756126, + "rewards/bleu_reward_func/mean": 0.10875709354877472, + "rewards/bleu_reward_func/std": 0.1039399579167366, + "step": 1208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 236.59375, + "completions/mean_terminated_length": 111.40909576416016, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.9672, + "grad_norm": 5.4925079345703125, + "kl": 0.2030029296875, + "learning_rate": 1e-06, + "loss": -0.1031, + "num_tokens": 15728028.0, + "reward": 0.04499085620045662, + "reward_std": 0.01842951774597168, + "rewards/bleu_reward_func/mean": 0.04499085620045662, + "rewards/bleu_reward_func/std": 0.030968643724918365, + "step": 1209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 140.625, + "completions/mean_terminated_length": 128.64515686035156, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.968, + "grad_norm": 9.575289726257324, + "kl": 0.20196533203125, + "learning_rate": 1e-06, + "loss": 0.1409, + "num_tokens": 15735296.0, + "reward": 0.07829822599887848, + "reward_std": 0.03164747357368469, + "rewards/bleu_reward_func/mean": 0.07829822599887848, + "rewards/bleu_reward_func/std": 0.07768744975328445, + "step": 1210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 338.1875, + "completions/mean_terminated_length": 298.0769348144531, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.9688, + "grad_norm": 2.519958019256592, + "kl": 0.042205810546875, + "learning_rate": 1e-06, + "loss": 0.2122, + "num_tokens": 15750286.0, + "reward": 0.11583074182271957, + "reward_std": 0.0300702303647995, + "rewards/bleu_reward_func/mean": 0.11583074182271957, + "rewards/bleu_reward_func/std": 0.0626337081193924, + "step": 1211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 202.1875, + "completions/mean_terminated_length": 98.91667175292969, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.9696, + "grad_norm": 7.297961235046387, + "kl": 0.3997802734375, + "learning_rate": 1e-06, + "loss": 0.1691, + "num_tokens": 15759468.0, + "reward": 0.14121456444263458, + "reward_std": 0.051856689155101776, + "rewards/bleu_reward_func/mean": 0.14121456444263458, + "rewards/bleu_reward_func/std": 0.1731884926557541, + "step": 1212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 95.0, + "completions/mean_length": 183.1875, + "completions/mean_terminated_length": 33.727272033691406, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.9704, + "grad_norm": 6.750447750091553, + "kl": 0.354522705078125, + "learning_rate": 1e-06, + "loss": 0.3063, + "num_tokens": 15771298.0, + "reward": 0.20002232491970062, + "reward_std": 0.05593840777873993, + "rewards/bleu_reward_func/mean": 0.20002232491970062, + "rewards/bleu_reward_func/std": 0.1818784922361374, + "step": 1213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 155.59375, + "completions/mean_terminated_length": 155.59375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.9712, + "grad_norm": 8.60672378540039, + "kl": 0.29888916015625, + "learning_rate": 1e-06, + "loss": -0.1436, + "num_tokens": 15781949.0, + "reward": 0.07949218153953552, + "reward_std": 0.030030012130737305, + "rewards/bleu_reward_func/mean": 0.07949218153953552, + "rewards/bleu_reward_func/std": 0.05354390665888786, + "step": 1214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 293.0, + "completions/mean_terminated_length": 193.4545440673828, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.972, + "grad_norm": 4.160828590393066, + "kl": 0.09783935546875, + "learning_rate": 1e-06, + "loss": -0.0156, + "num_tokens": 15795061.0, + "reward": 0.1600840538740158, + "reward_std": 0.05235850065946579, + "rewards/bleu_reward_func/mean": 0.1600840538740158, + "rewards/bleu_reward_func/std": 0.0939527377486229, + "step": 1215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 148.8125, + "completions/mean_terminated_length": 65.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.9728, + "grad_norm": 5.613617420196533, + "kl": 0.1966552734375, + "learning_rate": 1e-06, + "loss": 0.011, + "num_tokens": 15804343.0, + "reward": 0.1325000822544098, + "reward_std": 0.054465532302856445, + "rewards/bleu_reward_func/mean": 0.1325000822544098, + "rewards/bleu_reward_func/std": 0.15841807425022125, + "step": 1216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 118.34375, + "completions/mean_terminated_length": 118.34375, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.9736, + "grad_norm": 7.5834503173828125, + "kl": 0.2967529296875, + "learning_rate": 1e-06, + "loss": 0.0808, + "num_tokens": 15815658.0, + "reward": 0.243885338306427, + "reward_std": 0.05274055525660515, + "rewards/bleu_reward_func/mean": 0.243885338306427, + "rewards/bleu_reward_func/std": 0.14211414754390717, + "step": 1217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 367.8125, + "completions/mean_terminated_length": 223.625, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.9744, + "grad_norm": 3.8433821201324463, + "kl": 0.0970458984375, + "learning_rate": 1e-06, + "loss": -0.0201, + "num_tokens": 15831204.0, + "reward": 0.10006400942802429, + "reward_std": 0.02605537325143814, + "rewards/bleu_reward_func/mean": 0.10006400942802429, + "rewards/bleu_reward_func/std": 0.1093517392873764, + "step": 1218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 191.90625, + "completions/mean_terminated_length": 132.62962341308594, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.9752, + "grad_norm": 25.382192611694336, + "kl": 0.375518798828125, + "learning_rate": 1e-06, + "loss": 0.2354, + "num_tokens": 15842769.0, + "reward": 0.21496494114398956, + "reward_std": 0.08334603905677795, + "rewards/bleu_reward_func/mean": 0.21496494114398956, + "rewards/bleu_reward_func/std": 0.3287891745567322, + "step": 1219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 193.0, + "completions/mean_terminated_length": 68.17391204833984, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.976, + "grad_norm": 8.422406196594238, + "kl": 0.158935546875, + "learning_rate": 1e-06, + "loss": 0.7192, + "num_tokens": 15852937.0, + "reward": 0.21234184503555298, + "reward_std": 0.10839352756738663, + "rewards/bleu_reward_func/mean": 0.21234184503555298, + "rewards/bleu_reward_func/std": 0.17191235721111298, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 291.15625, + "completions/mean_terminated_length": 250.25926208496094, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.9768, + "grad_norm": 3.8467891216278076, + "kl": 0.101806640625, + "learning_rate": 1e-06, + "loss": -0.0472, + "num_tokens": 15865934.0, + "reward": 0.13597853481769562, + "reward_std": 0.034184906631708145, + "rewards/bleu_reward_func/mean": 0.13597853481769562, + "rewards/bleu_reward_func/std": 0.0799630656838417, + "step": 1221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 268.15625, + "completions/mean_terminated_length": 223.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.9776, + "grad_norm": 5.5957794189453125, + "kl": 0.1470947265625, + "learning_rate": 1e-06, + "loss": 0.137, + "num_tokens": 15881691.0, + "reward": 0.11758720874786377, + "reward_std": 0.05352931469678879, + "rewards/bleu_reward_func/mean": 0.11758720874786377, + "rewards/bleu_reward_func/std": 0.08839549124240875, + "step": 1222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 364.28125, + "completions/mean_terminated_length": 174.35714721679688, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.9784, + "grad_norm": 3.5430212020874023, + "kl": 0.0980224609375, + "learning_rate": 1e-06, + "loss": -0.0829, + "num_tokens": 15901716.0, + "reward": 0.10303943604230881, + "reward_std": 0.02435348369181156, + "rewards/bleu_reward_func/mean": 0.10303943604230881, + "rewards/bleu_reward_func/std": 0.10681937634944916, + "step": 1223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 350.09375, + "completions/mean_terminated_length": 207.23529052734375, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.9792, + "grad_norm": 3.817512035369873, + "kl": 0.06549072265625, + "learning_rate": 1e-06, + "loss": -0.2731, + "num_tokens": 15916239.0, + "reward": 0.025896022096276283, + "reward_std": 0.011423053219914436, + "rewards/bleu_reward_func/mean": 0.025896022096276283, + "rewards/bleu_reward_func/std": 0.01915143057703972, + "step": 1224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 148.59375, + "completions/mean_terminated_length": 27.45833396911621, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.98, + "grad_norm": 8.734639167785645, + "kl": 0.29150390625, + "learning_rate": 1e-06, + "loss": 0.1203, + "num_tokens": 15926002.0, + "reward": 0.2994440793991089, + "reward_std": 0.08188341557979584, + "rewards/bleu_reward_func/mean": 0.2994440793991089, + "rewards/bleu_reward_func/std": 0.17080959677696228, + "step": 1225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 146.875, + "completions/mean_terminated_length": 109.10344696044922, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.9808, + "grad_norm": 6.3398637771606445, + "kl": 0.203369140625, + "learning_rate": 1e-06, + "loss": 0.0948, + "num_tokens": 15934822.0, + "reward": 0.08409433811903, + "reward_std": 0.022457323968410492, + "rewards/bleu_reward_func/mean": 0.08409433811903, + "rewards/bleu_reward_func/std": 0.12822076678276062, + "step": 1226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 127.15625, + "completions/mean_terminated_length": 114.74193572998047, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.9816, + "grad_norm": 5.545539379119873, + "kl": 0.157470703125, + "learning_rate": 1e-06, + "loss": -0.038, + "num_tokens": 15941411.0, + "reward": 0.291486918926239, + "reward_std": 0.01802459917962551, + "rewards/bleu_reward_func/mean": 0.291486918926239, + "rewards/bleu_reward_func/std": 0.30036208033561707, + "step": 1227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 188.1875, + "completions/mean_terminated_length": 128.22222900390625, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.9824, + "grad_norm": 4.385817050933838, + "kl": 0.12310791015625, + "learning_rate": 1e-06, + "loss": -0.2018, + "num_tokens": 15948961.0, + "reward": 0.050382573157548904, + "reward_std": 0.019635431468486786, + "rewards/bleu_reward_func/mean": 0.050382573157548904, + "rewards/bleu_reward_func/std": 0.0410877950489521, + "step": 1228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 167.40625, + "completions/mean_terminated_length": 131.7586212158203, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.9832, + "grad_norm": 6.975030422210693, + "kl": 0.20166015625, + "learning_rate": 1e-06, + "loss": 0.0432, + "num_tokens": 15959414.0, + "reward": 0.10200367867946625, + "reward_std": 0.012780029326677322, + "rewards/bleu_reward_func/mean": 0.10200367867946625, + "rewards/bleu_reward_func/std": 0.07971282303333282, + "step": 1229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 172.25, + "completions/mean_terminated_length": 149.60000610351562, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.984, + "grad_norm": 58.40047836303711, + "kl": 0.6114501953125, + "learning_rate": 1e-06, + "loss": 0.1056, + "num_tokens": 15970374.0, + "reward": 0.21096709370613098, + "reward_std": 0.05436326563358307, + "rewards/bleu_reward_func/mean": 0.21096709370613098, + "rewards/bleu_reward_func/std": 0.21129880845546722, + "step": 1230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 99.0, + "completions/mean_length": 282.875, + "completions/mean_terminated_length": 53.75, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.9848, + "grad_norm": 7.031259536743164, + "kl": 0.173828125, + "learning_rate": 1e-06, + "loss": -0.0097, + "num_tokens": 15985058.0, + "reward": 0.13087573647499084, + "reward_std": 0.03848683089017868, + "rewards/bleu_reward_func/mean": 0.13087573647499084, + "rewards/bleu_reward_func/std": 0.07179337739944458, + "step": 1231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 347.21875, + "completions/mean_terminated_length": 282.7391357421875, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.9856, + "grad_norm": 13.313501358032227, + "kl": 0.432952880859375, + "learning_rate": 1e-06, + "loss": 0.1537, + "num_tokens": 16003825.0, + "reward": 0.06137411668896675, + "reward_std": 0.036481164395809174, + "rewards/bleu_reward_func/mean": 0.06137411668896675, + "rewards/bleu_reward_func/std": 0.05317319929599762, + "step": 1232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 114.03125, + "completions/mean_terminated_length": 87.50000762939453, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.9864, + "grad_norm": 12.291428565979004, + "kl": 0.4300537109375, + "learning_rate": 1e-06, + "loss": 0.2836, + "num_tokens": 16012714.0, + "reward": 0.05898230895400047, + "reward_std": 0.024662408977746964, + "rewards/bleu_reward_func/mean": 0.05898230895400047, + "rewards/bleu_reward_func/std": 0.05822930857539177, + "step": 1233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 255.46875, + "completions/mean_terminated_length": 155.0869598388672, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.9872, + "grad_norm": 6.375350475311279, + "kl": 0.20550537109375, + "learning_rate": 1e-06, + "loss": -0.0603, + "num_tokens": 16023033.0, + "reward": 0.12795159220695496, + "reward_std": 0.034560851752758026, + "rewards/bleu_reward_func/mean": 0.12795159220695496, + "rewards/bleu_reward_func/std": 0.05310589075088501, + "step": 1234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 335.375, + "completions/mean_terminated_length": 198.0, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.988, + "grad_norm": 2.701765775680542, + "kl": 0.1783447265625, + "learning_rate": 1e-06, + "loss": 0.016, + "num_tokens": 16037525.0, + "reward": 0.08859970420598984, + "reward_std": 0.012333719059824944, + "rewards/bleu_reward_func/mean": 0.08859970420598984, + "rewards/bleu_reward_func/std": 0.05836745351552963, + "step": 1235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 368.5625, + "completions/mean_terminated_length": 282.5, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.9888, + "grad_norm": 3.917327642440796, + "kl": 0.1302490234375, + "learning_rate": 1e-06, + "loss": -0.3326, + "num_tokens": 16051695.0, + "reward": 0.055808089673519135, + "reward_std": 0.020165979862213135, + "rewards/bleu_reward_func/mean": 0.055808089673519135, + "rewards/bleu_reward_func/std": 0.05799167603254318, + "step": 1236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 258.4375, + "completions/mean_terminated_length": 187.44000244140625, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.9896, + "grad_norm": 11.650849342346191, + "kl": 0.3966064453125, + "learning_rate": 1e-06, + "loss": -0.1616, + "num_tokens": 16063021.0, + "reward": 0.09570951759815216, + "reward_std": 0.041778795421123505, + "rewards/bleu_reward_func/mean": 0.09570951759815216, + "rewards/bleu_reward_func/std": 0.09836214780807495, + "step": 1237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 130.71875, + "completions/mean_terminated_length": 130.71875, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.9904, + "grad_norm": 6.946449279785156, + "kl": 0.3614501953125, + "learning_rate": 1e-06, + "loss": 0.0432, + "num_tokens": 16073188.0, + "reward": 0.19768103957176208, + "reward_std": 0.05326389521360397, + "rewards/bleu_reward_func/mean": 0.19768103957176208, + "rewards/bleu_reward_func/std": 0.14096693694591522, + "step": 1238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 153.59375, + "completions/mean_terminated_length": 53.23999786376953, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.9912, + "grad_norm": 15.68226432800293, + "kl": 0.60394287109375, + "learning_rate": 1e-06, + "loss": 0.1991, + "num_tokens": 16089367.0, + "reward": 0.19772392511367798, + "reward_std": 0.04295985400676727, + "rewards/bleu_reward_func/mean": 0.19772392511367798, + "rewards/bleu_reward_func/std": 0.15457068383693695, + "step": 1239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 202.1875, + "completions/mean_terminated_length": 181.53334045410156, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.992, + "grad_norm": 5.087695121765137, + "kl": 0.21142578125, + "learning_rate": 1e-06, + "loss": -0.1565, + "num_tokens": 16097533.0, + "reward": 0.04568080976605415, + "reward_std": 0.0273725725710392, + "rewards/bleu_reward_func/mean": 0.04568080976605415, + "rewards/bleu_reward_func/std": 0.05127081274986267, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 98.1875, + "completions/mean_terminated_length": 55.379310607910156, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.9928, + "grad_norm": 9.800594329833984, + "kl": 0.535400390625, + "learning_rate": 1e-06, + "loss": -0.2267, + "num_tokens": 16104579.0, + "reward": 0.1168278306722641, + "reward_std": 0.04581147059798241, + "rewards/bleu_reward_func/mean": 0.1168278306722641, + "rewards/bleu_reward_func/std": 0.08855386078357697, + "step": 1241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 251.21875, + "completions/mean_terminated_length": 48.38888931274414, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.9936, + "grad_norm": 6.915892124176025, + "kl": 0.300262451171875, + "learning_rate": 1e-06, + "loss": 0.0052, + "num_tokens": 16117106.0, + "reward": 0.21942071616649628, + "reward_std": 0.06735092401504517, + "rewards/bleu_reward_func/mean": 0.21942071616649628, + "rewards/bleu_reward_func/std": 0.1295205056667328, + "step": 1242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 386.9375, + "completions/mean_terminated_length": 311.8999938964844, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.9944, + "grad_norm": 11.362247467041016, + "kl": 0.95458984375, + "learning_rate": 1e-06, + "loss": -0.2592, + "num_tokens": 16131824.0, + "reward": 0.03232087939977646, + "reward_std": 0.018025288358330727, + "rewards/bleu_reward_func/mean": 0.03232087939977646, + "rewards/bleu_reward_func/std": 0.026756620034575462, + "step": 1243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 229.40625, + "completions/mean_terminated_length": 220.29031372070312, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.9952, + "grad_norm": 15.675792694091797, + "kl": 0.33203125, + "learning_rate": 1e-06, + "loss": 0.0615, + "num_tokens": 16145325.0, + "reward": 0.08530285954475403, + "reward_std": 0.03364046663045883, + "rewards/bleu_reward_func/mean": 0.08530285954475403, + "rewards/bleu_reward_func/std": 0.06811228394508362, + "step": 1244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 426.5625, + "completions/mean_terminated_length": 381.8095397949219, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.996, + "grad_norm": 2.0989584922790527, + "kl": 0.0555419921875, + "learning_rate": 1e-06, + "loss": -0.0865, + "num_tokens": 16164487.0, + "reward": 0.10918224602937698, + "reward_std": 0.043439704924821854, + "rewards/bleu_reward_func/mean": 0.10918224602937698, + "rewards/bleu_reward_func/std": 0.09625791013240814, + "step": 1245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 127.90625, + "completions/mean_terminated_length": 115.51612854003906, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.9968, + "grad_norm": 8.359559059143066, + "kl": 0.6103515625, + "learning_rate": 1e-06, + "loss": 0.2382, + "num_tokens": 16172076.0, + "reward": 0.0722852572798729, + "reward_std": 0.0363241545855999, + "rewards/bleu_reward_func/mean": 0.0722852572798729, + "rewards/bleu_reward_func/std": 0.05653948336839676, + "step": 1246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 129.0, + "completions/max_terminated_length": 129.0, + "completions/mean_length": 63.21875, + "completions/mean_terminated_length": 63.21875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.9976, + "grad_norm": 14.668207168579102, + "kl": 0.6405029296875, + "learning_rate": 1e-06, + "loss": 0.1996, + "num_tokens": 16178059.0, + "reward": 0.11710416525602341, + "reward_std": 0.044295113533735275, + "rewards/bleu_reward_func/mean": 0.11710416525602341, + "rewards/bleu_reward_func/std": 0.06186880171298981, + "step": 1247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 184.875, + "completions/mean_terminated_length": 174.32257080078125, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.9984, + "grad_norm": 9.499897956848145, + "kl": 0.4058837890625, + "learning_rate": 1e-06, + "loss": -0.1047, + "num_tokens": 16190527.0, + "reward": 0.14165818691253662, + "reward_std": 0.042991265654563904, + "rewards/bleu_reward_func/mean": 0.14165818691253662, + "rewards/bleu_reward_func/std": 0.1511020064353943, + "step": 1248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 233.25, + "completions/mean_terminated_length": 168.92308044433594, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.9992, + "grad_norm": 6.786505222320557, + "kl": 0.34844970703125, + "learning_rate": 1e-06, + "loss": -0.0929, + "num_tokens": 16203199.0, + "reward": 0.14652788639068604, + "reward_std": 0.05133647471666336, + "rewards/bleu_reward_func/mean": 0.14652788639068604, + "rewards/bleu_reward_func/std": 0.18619418144226074, + "step": 1249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 39.75, + "completions/mean_terminated_length": 39.75, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 1.0, + "grad_norm": 9.613625526428223, + "kl": 0.37677001953125, + "learning_rate": 1e-06, + "loss": 0.061, + "num_tokens": 16214113.0, + "reward": 0.10052811354398727, + "reward_std": 0.05825551599264145, + "rewards/bleu_reward_func/mean": 0.10052811354398727, + "rewards/bleu_reward_func/std": 0.10802065581083298, + "step": 1250 + } + ], + "logging_steps": 1, + "max_steps": 1250, + "num_input_tokens_seen": 16214113, + "num_train_epochs": 1, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}