{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 276.6875, "completions/mean_terminated_length": 184.60870361328125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.0008, "grad_norm": 6.258511543273926, "kl": 0.0003216266632080078, "learning_rate": 0.0, "loss": -0.1947, "num_tokens": 15006.0, "reward": 0.03416593745350838, "reward_std": 0.019731489941477776, "rewards/bleu_reward_func/mean": 0.03416593745350838, "rewards/bleu_reward_func/std": 0.030305052176117897, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 313.625, "completions/mean_terminated_length": 159.3333282470703, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0016, "grad_norm": 4.826038360595703, "kl": 0.0002695322036743164, "learning_rate": 1.5873015873015872e-08, "loss": -0.1342, "num_tokens": 28018.0, "reward": 0.0247221440076828, "reward_std": 0.01764773763716221, "rewards/bleu_reward_func/mean": 0.0247221440076828, "rewards/bleu_reward_func/std": 0.03452335670590401, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 382.78125, "completions/mean_terminated_length": 268.76470947265625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.0024, "grad_norm": 2.2084195613861084, "kl": 0.0002834796905517578, "learning_rate": 3.1746031746031744e-08, "loss": 0.0891, "num_tokens": 43059.0, "reward": 0.022482896223664284, "reward_std": 0.009540551342070103, "rewards/bleu_reward_func/mean": 0.022482896223664284, "rewards/bleu_reward_func/std": 0.014530075713992119, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 182.59375, "completions/mean_terminated_length": 148.51724243164062, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.0032, "grad_norm": 3.871168613433838, "kl": 0.00033736228942871094, "learning_rate": 4.7619047619047613e-08, "loss": -0.1353, "num_tokens": 50982.0, "reward": 0.04917050898075104, "reward_std": 0.023263752460479736, "rewards/bleu_reward_func/mean": 0.04917050898075104, "rewards/bleu_reward_func/std": 0.036324601620435715, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 235.53125, "completions/mean_terminated_length": 206.9310302734375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.004, "grad_norm": 3.321488380432129, "kl": 0.00027179718017578125, "learning_rate": 6.349206349206349e-08, "loss": -0.1569, "num_tokens": 62191.0, "reward": 0.03882071375846863, "reward_std": 0.02520540915429592, "rewards/bleu_reward_func/mean": 0.03882071375846863, "rewards/bleu_reward_func/std": 0.034420739859342575, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 274.5, "completions/mean_terminated_length": 219.69232177734375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.0048, "grad_norm": 2.86037015914917, "kl": 0.0002570152282714844, "learning_rate": 7.936507936507936e-08, "loss": 0.0153, "num_tokens": 73551.0, "reward": 0.03558861464262009, "reward_std": 0.019850196316838264, "rewards/bleu_reward_func/mean": 0.03558861464262009, "rewards/bleu_reward_func/std": 0.02189255878329277, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 276.9375, "completions/mean_terminated_length": 184.95652770996094, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.0056, "grad_norm": 2.263695478439331, "kl": 0.0002677440643310547, "learning_rate": 9.523809523809523e-08, "loss": 0.11, "num_tokens": 85237.0, "reward": 0.017801083624362946, "reward_std": 0.008379511535167694, "rewards/bleu_reward_func/mean": 0.017801083624362946, "rewards/bleu_reward_func/std": 0.015220904722809792, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 399.6875, "completions/mean_terminated_length": 300.5882263183594, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.0064, "grad_norm": 2.1360983848571777, "kl": 0.00027871131896972656, "learning_rate": 1.111111111111111e-07, "loss": -0.0919, "num_tokens": 100099.0, "reward": 0.04303022474050522, "reward_std": 0.020876675844192505, "rewards/bleu_reward_func/mean": 0.04303022474050522, "rewards/bleu_reward_func/std": 0.04522383213043213, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 359.96875, "completions/mean_terminated_length": 317.3999938964844, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.0072, "grad_norm": 2.249929904937744, "kl": 0.00033664703369140625, "learning_rate": 1.2698412698412698e-07, "loss": 0.001, "num_tokens": 116226.0, "reward": 0.03631145507097244, "reward_std": 0.019055547192692757, "rewards/bleu_reward_func/mean": 0.03631145507097244, "rewards/bleu_reward_func/std": 0.024671798571944237, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 402.65625, "completions/mean_terminated_length": 293.3125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.008, "grad_norm": 2.0875115394592285, "kl": 0.00031948089599609375, "learning_rate": 1.4285714285714285e-07, "loss": -0.1079, "num_tokens": 132767.0, "reward": 0.03503231331706047, "reward_std": 0.015032317489385605, "rewards/bleu_reward_func/mean": 0.03503231331706047, "rewards/bleu_reward_func/std": 0.053626786917448044, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 512.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 427.8125, "completions/mean_terminated_length": 212.6666717529297, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0088, "grad_norm": 2.3419556617736816, "kl": 0.00034618377685546875, "learning_rate": 1.5873015873015872e-07, "loss": -0.1937, "num_tokens": 150273.0, "reward": 0.023869339376688004, "reward_std": 0.010186510160565376, "rewards/bleu_reward_func/mean": 0.023869339376688004, "rewards/bleu_reward_func/std": 0.024761516600847244, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 180.3125, "completions/mean_terminated_length": 146.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.0096, "grad_norm": 4.572720527648926, "kl": 0.00017344951629638672, "learning_rate": 1.7460317460317458e-07, "loss": -0.1778, "num_tokens": 158243.0, "reward": 0.06084510311484337, "reward_std": 0.031551554799079895, "rewards/bleu_reward_func/mean": 0.06084510311484337, "rewards/bleu_reward_func/std": 0.04978843033313751, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 244.0625, "completions/mean_terminated_length": 244.0625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.0104, "grad_norm": 3.3186678886413574, "kl": 0.00041675567626953125, "learning_rate": 1.9047619047619045e-07, "loss": 0.2379, "num_tokens": 168149.0, "reward": 0.021276462823152542, "reward_std": 0.00621379679068923, "rewards/bleu_reward_func/mean": 0.021276462823152542, "rewards/bleu_reward_func/std": 0.008626200258731842, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 332.5, "completions/mean_terminated_length": 272.66668701171875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.0112, "grad_norm": 2.2437744140625, "kl": 0.00031495094299316406, "learning_rate": 2.0634920634920632e-07, "loss": 0.09, "num_tokens": 180917.0, "reward": 0.022824838757514954, "reward_std": 0.014015388675034046, "rewards/bleu_reward_func/mean": 0.022824838757514954, "rewards/bleu_reward_func/std": 0.018382087349891663, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 274.90625, "completions/mean_terminated_length": 208.51998901367188, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.012, "grad_norm": 2.874119997024536, "kl": 0.00016677379608154297, "learning_rate": 2.222222222222222e-07, "loss": -0.0669, "num_tokens": 194290.0, "reward": 0.13837847113609314, "reward_std": 0.10791029036045074, "rewards/bleu_reward_func/mean": 0.13837847113609314, "rewards/bleu_reward_func/std": 0.16033971309661865, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 179.8125, "completions/mean_terminated_length": 179.8125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.0128, "grad_norm": 15.92119312286377, "kl": 0.00028777122497558594, "learning_rate": 2.3809523809523806e-07, "loss": 0.0326, "num_tokens": 205636.0, "reward": 0.050816405564546585, "reward_std": 0.02174009010195732, "rewards/bleu_reward_func/mean": 0.050816405564546585, "rewards/bleu_reward_func/std": 0.03758488595485687, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 257.25, "completions/mean_terminated_length": 210.07408142089844, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.0136, "grad_norm": 3.077911615371704, "kl": 0.00032806396484375, "learning_rate": 2.5396825396825396e-07, "loss": -0.079, "num_tokens": 216124.0, "reward": 0.03544948250055313, "reward_std": 0.023141874000430107, "rewards/bleu_reward_func/mean": 0.03544948250055313, "rewards/bleu_reward_func/std": 0.02584005706012249, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 304.25, "completions/mean_terminated_length": 179.60000610351562, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.0144, "grad_norm": 9.573840141296387, "kl": 0.00026106834411621094, "learning_rate": 2.698412698412698e-07, "loss": 0.1542, "num_tokens": 229060.0, "reward": 0.09650908410549164, "reward_std": 0.06372307240962982, "rewards/bleu_reward_func/mean": 0.09650908410549164, "rewards/bleu_reward_func/std": 0.13649234175682068, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 212.3125, "completions/mean_terminated_length": 202.64515686035156, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.0152, "grad_norm": 3.6799280643463135, "kl": 0.00029969215393066406, "learning_rate": 2.857142857142857e-07, "loss": -0.258, "num_tokens": 238062.0, "reward": 0.03407663479447365, "reward_std": 0.018440743908286095, "rewards/bleu_reward_func/mean": 0.03407663479447365, "rewards/bleu_reward_func/std": 0.020070552825927734, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 278.59375, "completions/mean_terminated_length": 278.59375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.016, "grad_norm": 2.5249006748199463, "kl": 0.0003578662872314453, "learning_rate": 3.0158730158730156e-07, "loss": -0.2122, "num_tokens": 248801.0, "reward": 0.04736609756946564, "reward_std": 0.03644244372844696, "rewards/bleu_reward_func/mean": 0.04736609756946564, "rewards/bleu_reward_func/std": 0.058418869972229004, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 254.4375, "completions/mean_terminated_length": 227.79310607910156, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0168, "grad_norm": 3.26560640335083, "kl": 0.000385284423828125, "learning_rate": 3.1746031746031743e-07, "loss": -0.0564, "num_tokens": 259415.0, "reward": 0.03675752133131027, "reward_std": 0.012391982600092888, "rewards/bleu_reward_func/mean": 0.03675752133131027, "rewards/bleu_reward_func/std": 0.01883433386683464, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 263.5625, "completions/mean_terminated_length": 150.63636779785156, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.0176, "grad_norm": 3.0202367305755615, "kl": 0.00021600723266601562, "learning_rate": 3.333333333333333e-07, "loss": 0.0991, "num_tokens": 270649.0, "reward": 0.046073149889707565, "reward_std": 0.035358842462301254, "rewards/bleu_reward_func/mean": 0.046073149889707565, "rewards/bleu_reward_func/std": 0.0658104419708252, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 376.90625, "completions/mean_terminated_length": 331.875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.0184, "grad_norm": 2.1112754344940186, "kl": 0.00023984909057617188, "learning_rate": 3.4920634920634917e-07, "loss": 0.0604, "num_tokens": 285446.0, "reward": 0.06469863653182983, "reward_std": 0.052234675735235214, "rewards/bleu_reward_func/mean": 0.06469863653182983, "rewards/bleu_reward_func/std": 0.1052434891462326, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 262.875, "completions/mean_terminated_length": 227.2857208251953, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.0192, "grad_norm": 5.028998374938965, "kl": 0.00035119056701660156, "learning_rate": 3.6507936507936504e-07, "loss": -0.0787, "num_tokens": 297450.0, "reward": 0.06576113402843475, "reward_std": 0.02428753674030304, "rewards/bleu_reward_func/mean": 0.06576113402843475, "rewards/bleu_reward_func/std": 0.06561829149723053, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 251.375, "completions/mean_terminated_length": 214.1428680419922, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.02, "grad_norm": 4.391946315765381, "kl": 0.0003197193145751953, "learning_rate": 3.809523809523809e-07, "loss": -0.1142, "num_tokens": 309646.0, "reward": 0.03192237764596939, "reward_std": 0.018320664763450623, "rewards/bleu_reward_func/mean": 0.03192237764596939, "rewards/bleu_reward_func/std": 0.020625513046979904, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 199.3125, "completions/mean_terminated_length": 76.9565200805664, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.0208, "grad_norm": 4.005726337432861, "kl": 0.0002747774124145508, "learning_rate": 3.968253968253968e-07, "loss": -0.0165, "num_tokens": 319608.0, "reward": 0.04598322883248329, "reward_std": 0.03894542530179024, "rewards/bleu_reward_func/mean": 0.04598322883248329, "rewards/bleu_reward_func/std": 0.0656987875699997, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 445.1875, "completions/mean_terminated_length": 378.375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.0216, "grad_norm": 1.7314307689666748, "kl": 0.00025200843811035156, "learning_rate": 4.1269841269841265e-07, "loss": 0.0145, "num_tokens": 339982.0, "reward": 0.03940076753497124, "reward_std": 0.01753806695342064, "rewards/bleu_reward_func/mean": 0.03940076753497124, "rewards/bleu_reward_func/std": 0.024869710206985474, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 176.78125, "completions/mean_terminated_length": 165.96774291992188, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.0224, "grad_norm": 3.5480945110321045, "kl": 0.0003466606140136719, "learning_rate": 4.285714285714285e-07, "loss": 0.3461, "num_tokens": 350399.0, "reward": 0.032109640538692474, "reward_std": 0.01587669923901558, "rewards/bleu_reward_func/mean": 0.032109640538692474, "rewards/bleu_reward_func/std": 0.021649450063705444, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 286.28125, "completions/mean_terminated_length": 131.84210205078125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.0232, "grad_norm": 3.5640323162078857, "kl": 0.00038433074951171875, "learning_rate": 4.444444444444444e-07, "loss": -0.3725, "num_tokens": 366616.0, "reward": 0.032555509358644485, "reward_std": 0.02120809443295002, "rewards/bleu_reward_func/mean": 0.032555509358644485, "rewards/bleu_reward_func/std": 0.0240026768296957, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 316.5625, "completions/mean_terminated_length": 251.4166717529297, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.024, "grad_norm": 2.3427391052246094, "kl": 0.0003643035888671875, "learning_rate": 4.6031746031746025e-07, "loss": 0.1076, "num_tokens": 379194.0, "reward": 0.05133620649576187, "reward_std": 0.03176493942737579, "rewards/bleu_reward_func/mean": 0.05133620649576187, "rewards/bleu_reward_func/std": 0.05137130245566368, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 317.5625, "completions/mean_terminated_length": 166.3333282470703, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.0248, "grad_norm": 2.8795974254608154, "kl": 0.00021916627883911133, "learning_rate": 4.761904761904761e-07, "loss": -0.0397, "num_tokens": 393332.0, "reward": 0.02336902543902397, "reward_std": 0.016461046412587166, "rewards/bleu_reward_func/mean": 0.02336902543902397, "rewards/bleu_reward_func/std": 0.01688769832253456, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 273.5625, "completions/mean_terminated_length": 180.26087951660156, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.0256, "grad_norm": 4.866830348968506, "kl": 0.0003495216369628906, "learning_rate": 4.92063492063492e-07, "loss": -0.1723, "num_tokens": 406286.0, "reward": 0.0500735342502594, "reward_std": 0.03149079158902168, "rewards/bleu_reward_func/mean": 0.0500735342502594, "rewards/bleu_reward_func/std": 0.042752303183078766, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 199.90625, "completions/mean_terminated_length": 155.32144165039062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.0264, "grad_norm": 3.734315872192383, "kl": 0.00031948089599609375, "learning_rate": 5.079365079365079e-07, "loss": 0.1723, "num_tokens": 417019.0, "reward": 0.04127680882811546, "reward_std": 0.026887936517596245, "rewards/bleu_reward_func/mean": 0.04127680882811546, "rewards/bleu_reward_func/std": 0.028591442853212357, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 238.8125, "completions/mean_terminated_length": 162.3199920654297, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0272, "grad_norm": 3.5805702209472656, "kl": 0.0003371238708496094, "learning_rate": 5.238095238095238e-07, "loss": -0.1221, "num_tokens": 430517.0, "reward": 0.038475487381219864, "reward_std": 0.023987405002117157, "rewards/bleu_reward_func/mean": 0.038475487381219864, "rewards/bleu_reward_func/std": 0.03382611274719238, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 320.15625, "completions/mean_terminated_length": 245.0869598388672, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.028, "grad_norm": 2.8223023414611816, "kl": 0.0003685951232910156, "learning_rate": 5.396825396825396e-07, "loss": -0.1863, "num_tokens": 444386.0, "reward": 0.044519804418087006, "reward_std": 0.020455416291952133, "rewards/bleu_reward_func/mean": 0.044519804418087006, "rewards/bleu_reward_func/std": 0.02401871234178543, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 235.21875, "completions/mean_terminated_length": 109.40909576416016, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.0288, "grad_norm": 4.932171821594238, "kl": 0.0004143714904785156, "learning_rate": 5.555555555555555e-07, "loss": -0.0821, "num_tokens": 454729.0, "reward": 0.051868241280317307, "reward_std": 0.03919130563735962, "rewards/bleu_reward_func/mean": 0.051868241280317307, "rewards/bleu_reward_func/std": 0.04572274535894394, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 240.1875, "completions/mean_terminated_length": 231.41934204101562, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.0296, "grad_norm": 2.3933703899383545, "kl": 0.0003342628479003906, "learning_rate": 5.714285714285714e-07, "loss": 0.0184, "num_tokens": 464367.0, "reward": 0.033591024577617645, "reward_std": 0.012664815410971642, "rewards/bleu_reward_func/mean": 0.033591024577617645, "rewards/bleu_reward_func/std": 0.01647285185754299, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 186.21875, "completions/mean_terminated_length": 77.625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0304, "grad_norm": 4.726412296295166, "kl": 0.00037977099418640137, "learning_rate": 5.873015873015873e-07, "loss": -0.1188, "num_tokens": 475974.0, "reward": 0.11781854927539825, "reward_std": 0.07036956399679184, "rewards/bleu_reward_func/mean": 0.11781854927539825, "rewards/bleu_reward_func/std": 0.17497578263282776, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 325.34375, "completions/mean_terminated_length": 263.125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.0312, "grad_norm": 2.215810537338257, "kl": 0.0004220008850097656, "learning_rate": 6.031746031746031e-07, "loss": -0.0677, "num_tokens": 488921.0, "reward": 0.02974233217537403, "reward_std": 0.0150698097422719, "rewards/bleu_reward_func/mean": 0.02974233217537403, "rewards/bleu_reward_func/std": 0.016928359866142273, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 209.84375, "completions/mean_terminated_length": 140.11538696289062, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.032, "grad_norm": 7.524960041046143, "kl": 0.0004963874816894531, "learning_rate": 6.19047619047619e-07, "loss": -0.3064, "num_tokens": 499980.0, "reward": 0.03384634852409363, "reward_std": 0.025826433673501015, "rewards/bleu_reward_func/mean": 0.03384634852409363, "rewards/bleu_reward_func/std": 0.026973972097039223, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 367.0625, "completions/mean_terminated_length": 318.75, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.0328, "grad_norm": 2.123147487640381, "kl": 0.0004987716674804688, "learning_rate": 6.349206349206349e-07, "loss": -0.032, "num_tokens": 513846.0, "reward": 0.02649177610874176, "reward_std": 0.01194241177290678, "rewards/bleu_reward_func/mean": 0.02649177610874176, "rewards/bleu_reward_func/std": 0.01315221842378378, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 177.59375, "completions/mean_terminated_length": 177.59375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.0336, "grad_norm": 2.9975976943969727, "kl": 0.0005216598510742188, "learning_rate": 6.507936507936507e-07, "loss": -0.0346, "num_tokens": 524625.0, "reward": 0.07642016559839249, "reward_std": 0.062030211091041565, "rewards/bleu_reward_func/mean": 0.07642016559839249, "rewards/bleu_reward_func/std": 0.08820059895515442, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 226.03125, "completions/mean_terminated_length": 130.70834350585938, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.0344, "grad_norm": 4.617770671844482, "kl": 0.0005788803100585938, "learning_rate": 6.666666666666666e-07, "loss": -0.1183, "num_tokens": 534186.0, "reward": 0.022412922233343124, "reward_std": 0.028917275369167328, "rewards/bleu_reward_func/mean": 0.022412922233343124, "rewards/bleu_reward_func/std": 0.030789699405431747, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 333.625, "completions/mean_terminated_length": 283.67999267578125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.0352, "grad_norm": 2.284592390060425, "kl": 0.0004913806915283203, "learning_rate": 6.825396825396826e-07, "loss": -0.0931, "num_tokens": 547598.0, "reward": 0.04416097328066826, "reward_std": 0.02134130708873272, "rewards/bleu_reward_func/mean": 0.04416097328066826, "rewards/bleu_reward_func/std": 0.02967960573732853, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 352.3125, "completions/mean_terminated_length": 228.11111450195312, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.036, "grad_norm": 1.9085440635681152, "kl": 0.0005497932434082031, "learning_rate": 6.984126984126983e-07, "loss": -0.1154, "num_tokens": 562848.0, "reward": 0.07549206912517548, "reward_std": 0.030871842056512833, "rewards/bleu_reward_func/mean": 0.07549206912517548, "rewards/bleu_reward_func/std": 0.07412120699882507, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 380.875, "completions/mean_terminated_length": 302.20001220703125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.0368, "grad_norm": 2.130894660949707, "kl": 0.0005145072937011719, "learning_rate": 7.142857142857143e-07, "loss": -0.0617, "num_tokens": 578060.0, "reward": 0.12002657353878021, "reward_std": 0.08936386555433273, "rewards/bleu_reward_func/mean": 0.12002657353878021, "rewards/bleu_reward_func/std": 0.16516655683517456, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 119.21875, "completions/mean_terminated_length": 78.5862045288086, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.0376, "grad_norm": 5.716607093811035, "kl": 0.000782012939453125, "learning_rate": 7.301587301587301e-07, "loss": 0.0529, "num_tokens": 588707.0, "reward": 0.0403611958026886, "reward_std": 0.03214065358042717, "rewards/bleu_reward_func/mean": 0.0403611958026886, "rewards/bleu_reward_func/std": 0.053722139447927475, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 254.125, "completions/mean_terminated_length": 206.37037658691406, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0384, "grad_norm": 3.3116095066070557, "kl": 0.0006084442138671875, "learning_rate": 7.46031746031746e-07, "loss": -0.177, "num_tokens": 598639.0, "reward": 0.04110237956047058, "reward_std": 0.030773304402828217, "rewards/bleu_reward_func/mean": 0.04110237956047058, "rewards/bleu_reward_func/std": 0.0406816266477108, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 338.34375, "completions/mean_terminated_length": 234.15000915527344, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.0392, "grad_norm": 2.5459675788879395, "kl": 0.0008707046508789062, "learning_rate": 7.619047619047618e-07, "loss": 0.1096, "num_tokens": 611754.0, "reward": 0.039899833500385284, "reward_std": 0.016289234161376953, "rewards/bleu_reward_func/mean": 0.039899833500385284, "rewards/bleu_reward_func/std": 0.03525659814476967, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 265.8125, "completions/mean_terminated_length": 230.6428680419922, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.04, "grad_norm": 2.73764967918396, "kl": 0.0009059906005859375, "learning_rate": 7.777777777777778e-07, "loss": 0.151, "num_tokens": 622884.0, "reward": 0.04385654628276825, "reward_std": 0.02457226999104023, "rewards/bleu_reward_func/mean": 0.04385654628276825, "rewards/bleu_reward_func/std": 0.03391377627849579, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 324.09375, "completions/mean_terminated_length": 211.35000610351562, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.0408, "grad_norm": 3.26839017868042, "kl": 0.0012226104736328125, "learning_rate": 7.936507936507936e-07, "loss": 0.0831, "num_tokens": 635439.0, "reward": 0.052643824368715286, "reward_std": 0.03537372499704361, "rewards/bleu_reward_func/mean": 0.052643824368715286, "rewards/bleu_reward_func/std": 0.06656704843044281, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 286.875, "completions/mean_terminated_length": 223.83999633789062, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.0416, "grad_norm": 4.038628578186035, "kl": 0.00116729736328125, "learning_rate": 8.095238095238095e-07, "loss": -0.0229, "num_tokens": 648083.0, "reward": 0.05057225376367569, "reward_std": 0.05185646191239357, "rewards/bleu_reward_func/mean": 0.05057225376367569, "rewards/bleu_reward_func/std": 0.07916928082704544, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 121.09375, "completions/mean_terminated_length": 108.48387145996094, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.0424, "grad_norm": 11.901754379272461, "kl": 0.0016069412231445312, "learning_rate": 8.253968253968253e-07, "loss": -0.193, "num_tokens": 654558.0, "reward": 0.02811940386891365, "reward_std": 0.017252802848815918, "rewards/bleu_reward_func/mean": 0.02811940386891365, "rewards/bleu_reward_func/std": 0.020083896815776825, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 229.625, "completions/mean_terminated_length": 177.3333282470703, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.0432, "grad_norm": 4.099213600158691, "kl": 0.0010061264038085938, "learning_rate": 8.412698412698413e-07, "loss": 0.1683, "num_tokens": 666394.0, "reward": 0.020335812121629715, "reward_std": 0.008468281477689743, "rewards/bleu_reward_func/mean": 0.020335812121629715, "rewards/bleu_reward_func/std": 0.017663516104221344, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 182.59375, "completions/mean_terminated_length": 148.51724243164062, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.044, "grad_norm": 5.222848892211914, "kl": 0.00174713134765625, "learning_rate": 8.57142857142857e-07, "loss": -0.0074, "num_tokens": 675413.0, "reward": 0.08503767848014832, "reward_std": 0.06149422377347946, "rewards/bleu_reward_func/mean": 0.08503767848014832, "rewards/bleu_reward_func/std": 0.06967518478631973, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 311.8125, "completions/mean_terminated_length": 135.1764678955078, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.0448, "grad_norm": 3.82554030418396, "kl": 0.001544952392578125, "learning_rate": 8.73015873015873e-07, "loss": 0.1187, "num_tokens": 688055.0, "reward": 0.021008048206567764, "reward_std": 0.006345948204398155, "rewards/bleu_reward_func/mean": 0.021008048206567764, "rewards/bleu_reward_func/std": 0.015550477430224419, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 384.90625, "completions/mean_terminated_length": 272.76470947265625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.0456, "grad_norm": 2.4369232654571533, "kl": 0.001430511474609375, "learning_rate": 8.888888888888888e-07, "loss": -0.1333, "num_tokens": 704084.0, "reward": 0.03840646147727966, "reward_std": 0.01636688783764839, "rewards/bleu_reward_func/mean": 0.03840646147727966, "rewards/bleu_reward_func/std": 0.020463639870285988, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 299.5625, "completions/mean_terminated_length": 203.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0464, "grad_norm": 2.234159231185913, "kl": 0.0017108917236328125, "learning_rate": 9.047619047619047e-07, "loss": -0.0062, "num_tokens": 716110.0, "reward": 0.03707782179117203, "reward_std": 0.03189729154109955, "rewards/bleu_reward_func/mean": 0.03707782179117203, "rewards/bleu_reward_func/std": 0.03396334871649742, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 310.125, "completions/mean_terminated_length": 242.83334350585938, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.0472, "grad_norm": 2.620929002761841, "kl": 0.0015125274658203125, "learning_rate": 9.206349206349205e-07, "loss": -0.1756, "num_tokens": 728050.0, "reward": 0.04129425063729286, "reward_std": 0.02301635593175888, "rewards/bleu_reward_func/mean": 0.04129425063729286, "rewards/bleu_reward_func/std": 0.043909139931201935, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 345.90625, "completions/mean_terminated_length": 290.54168701171875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.048, "grad_norm": 2.1176917552948, "kl": 0.0021114349365234375, "learning_rate": 9.365079365079365e-07, "loss": 0.1678, "num_tokens": 741791.0, "reward": 0.023899374529719353, "reward_std": 0.011497007682919502, "rewards/bleu_reward_func/mean": 0.023899374529719353, "rewards/bleu_reward_func/std": 0.015334444120526314, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 316.90625, "completions/mean_terminated_length": 280.77777099609375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.0488, "grad_norm": 2.236198902130127, "kl": 0.0020008087158203125, "learning_rate": 9.523809523809522e-07, "loss": 0.1585, "num_tokens": 754820.0, "reward": 0.03658726438879967, "reward_std": 0.014603394083678722, "rewards/bleu_reward_func/mean": 0.03658726438879967, "rewards/bleu_reward_func/std": 0.0265581663697958, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 260.8125, "completions/mean_terminated_length": 214.29629516601562, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0496, "grad_norm": 3.063876152038574, "kl": 0.0018711090087890625, "learning_rate": 9.682539682539682e-07, "loss": 0.3903, "num_tokens": 765190.0, "reward": 0.03036242537200451, "reward_std": 0.02141759917140007, "rewards/bleu_reward_func/mean": 0.03036242537200451, "rewards/bleu_reward_func/std": 0.027455288916826248, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 368.375, "completions/mean_terminated_length": 293.1428527832031, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.0504, "grad_norm": 2.7348315715789795, "kl": 0.0015745162963867188, "learning_rate": 9.84126984126984e-07, "loss": -0.0601, "num_tokens": 780034.0, "reward": 0.04812411963939667, "reward_std": 0.022147245705127716, "rewards/bleu_reward_func/mean": 0.04812411963939667, "rewards/bleu_reward_func/std": 0.04721507802605629, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 339.25, "completions/mean_terminated_length": 204.88888549804688, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.0512, "grad_norm": 2.7344889640808105, "kl": 0.0023365020751953125, "learning_rate": 1e-06, "loss": 0.1724, "num_tokens": 793210.0, "reward": 0.02877359464764595, "reward_std": 0.010511023923754692, "rewards/bleu_reward_func/mean": 0.02877359464764595, "rewards/bleu_reward_func/std": 0.011428051628172398, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 318.15625, "completions/mean_terminated_length": 290.46429443359375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.052, "grad_norm": 2.4814369678497314, "kl": 0.001934051513671875, "learning_rate": 1e-06, "loss": -0.0405, "num_tokens": 806207.0, "reward": 0.05107945576310158, "reward_std": 0.03433047980070114, "rewards/bleu_reward_func/mean": 0.05107945576310158, "rewards/bleu_reward_func/std": 0.05961597338318825, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 321.09375, "completions/mean_terminated_length": 190.4736785888672, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.0528, "grad_norm": 2.776719808578491, "kl": 0.0016841888427734375, "learning_rate": 1e-06, "loss": 0.0672, "num_tokens": 818946.0, "reward": 0.026194388046860695, "reward_std": 0.029626624658703804, "rewards/bleu_reward_func/mean": 0.026194388046860695, "rewards/bleu_reward_func/std": 0.047487590461969376, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 416.84375, "completions/mean_terminated_length": 309.0000305175781, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.0536, "grad_norm": 2.1129209995269775, "kl": 0.0015621185302734375, "learning_rate": 1e-06, "loss": -0.0238, "num_tokens": 836741.0, "reward": 0.02843387797474861, "reward_std": 0.010509947314858437, "rewards/bleu_reward_func/mean": 0.02843387797474861, "rewards/bleu_reward_func/std": 0.018165679648518562, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 442.0625, "completions/mean_terminated_length": 325.5, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.0544, "grad_norm": 1.8852394819259644, "kl": 0.0014791488647460938, "learning_rate": 1e-06, "loss": -0.086, "num_tokens": 853783.0, "reward": 0.028096213936805725, "reward_std": 0.008382029831409454, "rewards/bleu_reward_func/mean": 0.028096213936805725, "rewards/bleu_reward_func/std": 0.01885552704334259, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 318.125, "completions/mean_terminated_length": 290.4285888671875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.0552, "grad_norm": 2.9050652980804443, "kl": 0.0017156600952148438, "learning_rate": 1e-06, "loss": -0.0435, "num_tokens": 865659.0, "reward": 0.03590589016675949, "reward_std": 0.017201866954565048, "rewards/bleu_reward_func/mean": 0.03590589016675949, "rewards/bleu_reward_func/std": 0.033825989812612534, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 300.0, "completions/mean_terminated_length": 269.71429443359375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.056, "grad_norm": 2.2988243103027344, "kl": 0.001689910888671875, "learning_rate": 1e-06, "loss": -0.118, "num_tokens": 877371.0, "reward": 0.04953785985708237, "reward_std": 0.04012230038642883, "rewards/bleu_reward_func/mean": 0.04953785985708237, "rewards/bleu_reward_func/std": 0.05754861235618591, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 334.625, "completions/mean_terminated_length": 254.0, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.0568, "grad_norm": 2.465021848678589, "kl": 0.00246429443359375, "learning_rate": 1e-06, "loss": -0.0057, "num_tokens": 891055.0, "reward": 0.030064472928643227, "reward_std": 0.019575169309973717, "rewards/bleu_reward_func/mean": 0.030064472928643227, "rewards/bleu_reward_func/std": 0.023523783311247826, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 430.6875, "completions/mean_terminated_length": 381.8999938964844, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.0576, "grad_norm": 2.359802722930908, "kl": 0.002536773681640625, "learning_rate": 1e-06, "loss": -0.0236, "num_tokens": 907301.0, "reward": 0.043774448335170746, "reward_std": 0.01555405743420124, "rewards/bleu_reward_func/mean": 0.043774448335170746, "rewards/bleu_reward_func/std": 0.034126028418540955, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 270.09375, "completions/mean_terminated_length": 189.45834350585938, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.0584, "grad_norm": 3.2106242179870605, "kl": 0.0029931068420410156, "learning_rate": 1e-06, "loss": -0.1018, "num_tokens": 919472.0, "reward": 0.019871417433023453, "reward_std": 0.015124676749110222, "rewards/bleu_reward_func/mean": 0.019871417433023453, "rewards/bleu_reward_func/std": 0.020977023988962173, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 234.5625, "completions/mean_terminated_length": 170.53846740722656, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.0592, "grad_norm": 5.234200954437256, "kl": 0.0031566619873046875, "learning_rate": 1e-06, "loss": -0.3525, "num_tokens": 929602.0, "reward": 0.03137686848640442, "reward_std": 0.02154741995036602, "rewards/bleu_reward_func/mean": 0.03137686848640442, "rewards/bleu_reward_func/std": 0.029113655909895897, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 301.25, "completions/mean_terminated_length": 262.22222900390625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.06, "grad_norm": 2.501121759414673, "kl": 0.002643585205078125, "learning_rate": 1e-06, "loss": -0.1282, "num_tokens": 942298.0, "reward": 0.026925798505544662, "reward_std": 0.012626022100448608, "rewards/bleu_reward_func/mean": 0.026925798505544662, "rewards/bleu_reward_func/std": 0.014885513111948967, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 472.625, "completions/mean_terminated_length": 354.5, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.0608, "grad_norm": 1.793560266494751, "kl": 0.0019183158874511719, "learning_rate": 1e-06, "loss": 0.019, "num_tokens": 962526.0, "reward": 0.05564770847558975, "reward_std": 0.048538610339164734, "rewards/bleu_reward_func/mean": 0.05564770847558975, "rewards/bleu_reward_func/std": 0.06820879131555557, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 179.0625, "completions/mean_terminated_length": 179.0625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0616, "grad_norm": 5.047986030578613, "kl": 0.003528594970703125, "learning_rate": 1e-06, "loss": -0.2638, "num_tokens": 970776.0, "reward": 0.06285654753446579, "reward_std": 0.03820263221859932, "rewards/bleu_reward_func/mean": 0.06285654753446579, "rewards/bleu_reward_func/std": 0.05687018483877182, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 202.71875, "completions/mean_terminated_length": 170.72413635253906, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.0624, "grad_norm": 3.0843141078948975, "kl": 0.0020785927772521973, "learning_rate": 1e-06, "loss": 0.2207, "num_tokens": 981399.0, "reward": 0.05065721645951271, "reward_std": 0.03324894234538078, "rewards/bleu_reward_func/mean": 0.05065721645951271, "rewards/bleu_reward_func/std": 0.05357068404555321, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 355.34375, "completions/mean_terminated_length": 273.28570556640625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.0632, "grad_norm": 2.630469799041748, "kl": 0.00506591796875, "learning_rate": 1e-06, "loss": -0.0981, "num_tokens": 995754.0, "reward": 0.059061747044324875, "reward_std": 0.030806895345449448, "rewards/bleu_reward_func/mean": 0.059061747044324875, "rewards/bleu_reward_func/std": 0.05187317356467247, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 300.3125, "completions/mean_terminated_length": 189.42857360839844, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.064, "grad_norm": 3.2961418628692627, "kl": 0.0030832290649414062, "learning_rate": 1e-06, "loss": -0.0105, "num_tokens": 1013100.0, "reward": 0.05653802305459976, "reward_std": 0.017924563959240913, "rewards/bleu_reward_func/mean": 0.05653802305459976, "rewards/bleu_reward_func/std": 0.05833124369382858, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 341.40625, "completions/mean_terminated_length": 239.0500030517578, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.0648, "grad_norm": 2.498276710510254, "kl": 0.0046234130859375, "learning_rate": 1e-06, "loss": -0.2798, "num_tokens": 1026945.0, "reward": 0.040751807391643524, "reward_std": 0.018808823078870773, "rewards/bleu_reward_func/mean": 0.040751807391643524, "rewards/bleu_reward_func/std": 0.02410094253718853, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 324.6875, "completions/mean_terminated_length": 262.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.0656, "grad_norm": 2.130357265472412, "kl": 0.002826690673828125, "learning_rate": 1e-06, "loss": 0.0411, "num_tokens": 1042063.0, "reward": 0.036513280123472214, "reward_std": 0.01892837882041931, "rewards/bleu_reward_func/mean": 0.036513280123472214, "rewards/bleu_reward_func/std": 0.03590543195605278, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 301.78125, "completions/mean_terminated_length": 271.75, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.0664, "grad_norm": 2.751675605773926, "kl": 0.003963470458984375, "learning_rate": 1e-06, "loss": 0.1396, "num_tokens": 1053920.0, "reward": 0.0392126627266407, "reward_std": 0.011845908127725124, "rewards/bleu_reward_func/mean": 0.0392126627266407, "rewards/bleu_reward_func/std": 0.02583330124616623, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 261.875, "completions/mean_terminated_length": 236.0, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.0672, "grad_norm": 2.6925158500671387, "kl": 0.00390625, "learning_rate": 1e-06, "loss": 0.0117, "num_tokens": 1065300.0, "reward": 0.029011068865656853, "reward_std": 0.016968993470072746, "rewards/bleu_reward_func/mean": 0.029011068865656853, "rewards/bleu_reward_func/std": 0.02045534923672676, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 226.59375, "completions/mean_terminated_length": 226.59375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.068, "grad_norm": 3.6709952354431152, "kl": 0.00417327880859375, "learning_rate": 1e-06, "loss": 0.1037, "num_tokens": 1075647.0, "reward": 0.06442218273878098, "reward_std": 0.028383802622556686, "rewards/bleu_reward_func/mean": 0.06442218273878098, "rewards/bleu_reward_func/std": 0.04861043021082878, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 324.125, "completions/mean_terminated_length": 261.5, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.0688, "grad_norm": 2.570126533508301, "kl": 0.0053558349609375, "learning_rate": 1e-06, "loss": 0.041, "num_tokens": 1089091.0, "reward": 0.05085538700222969, "reward_std": 0.01834620162844658, "rewards/bleu_reward_func/mean": 0.05085538700222969, "rewards/bleu_reward_func/std": 0.031752023845911026, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 180.6875, "completions/mean_terminated_length": 180.6875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0696, "grad_norm": 6.8046674728393555, "kl": 0.004756927490234375, "learning_rate": 1e-06, "loss": -0.2613, "num_tokens": 1097313.0, "reward": 0.04579862207174301, "reward_std": 0.027461305260658264, "rewards/bleu_reward_func/mean": 0.04579862207174301, "rewards/bleu_reward_func/std": 0.04285133630037308, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 352.3125, "completions/mean_terminated_length": 228.11111450195312, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0704, "grad_norm": 3.2189218997955322, "kl": 0.0033960342407226562, "learning_rate": 1e-06, "loss": 0.02, "num_tokens": 1115235.0, "reward": 0.053727827966213226, "reward_std": 0.04648362472653389, "rewards/bleu_reward_func/mean": 0.053727827966213226, "rewards/bleu_reward_func/std": 0.07603882998228073, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 296.03125, "completions/mean_terminated_length": 148.26315307617188, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0712, "grad_norm": 3.394224166870117, "kl": 0.00843048095703125, "learning_rate": 1e-06, "loss": -0.0242, "num_tokens": 1127868.0, "reward": 0.04170762002468109, "reward_std": 0.015014639124274254, "rewards/bleu_reward_func/mean": 0.04170762002468109, "rewards/bleu_reward_func/std": 0.028102483600378036, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 275.71875, "completions/mean_terminated_length": 221.19232177734375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.072, "grad_norm": 3.0822086334228516, "kl": 0.00803375244140625, "learning_rate": 1e-06, "loss": -0.142, "num_tokens": 1139531.0, "reward": 0.07560917735099792, "reward_std": 0.045496731996536255, "rewards/bleu_reward_func/mean": 0.07560917735099792, "rewards/bleu_reward_func/std": 0.0789395347237587, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 403.53125, "completions/mean_terminated_length": 295.0625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.0728, "grad_norm": 2.45961594581604, "kl": 0.00630950927734375, "learning_rate": 1e-06, "loss": -0.0524, "num_tokens": 1155996.0, "reward": 0.03898419439792633, "reward_std": 0.01788502372801304, "rewards/bleu_reward_func/mean": 0.03898419439792633, "rewards/bleu_reward_func/std": 0.022303381934762, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 377.3125, "completions/mean_terminated_length": 242.625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.0736, "grad_norm": 1.5934685468673706, "kl": 0.00415802001953125, "learning_rate": 1e-06, "loss": 0.1459, "num_tokens": 1172846.0, "reward": 0.08643854409456253, "reward_std": 0.0729157105088234, "rewards/bleu_reward_func/mean": 0.08643854409456253, "rewards/bleu_reward_func/std": 0.12770512700080872, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 332.90625, "completions/mean_terminated_length": 282.7599792480469, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0744, "grad_norm": 2.671640157699585, "kl": 0.00707244873046875, "learning_rate": 1e-06, "loss": -0.0502, "num_tokens": 1185467.0, "reward": 0.05207536742091179, "reward_std": 0.02466990053653717, "rewards/bleu_reward_func/mean": 0.05207536742091179, "rewards/bleu_reward_func/std": 0.03447216376662254, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 295.6875, "completions/mean_terminated_length": 273.3103332519531, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.0752, "grad_norm": 2.9223239421844482, "kl": 0.006927490234375, "learning_rate": 1e-06, "loss": -0.1365, "num_tokens": 1198065.0, "reward": 0.052932240068912506, "reward_std": 0.01840699091553688, "rewards/bleu_reward_func/mean": 0.052932240068912506, "rewards/bleu_reward_func/std": 0.039161618798971176, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 445.90625, "completions/mean_terminated_length": 349.3077087402344, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.076, "grad_norm": 1.8899106979370117, "kl": 0.0072784423828125, "learning_rate": 1e-06, "loss": -0.0753, "num_tokens": 1215878.0, "reward": 0.03316285461187363, "reward_std": 0.01574653573334217, "rewards/bleu_reward_func/mean": 0.03316285461187363, "rewards/bleu_reward_func/std": 0.01957116089761257, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 204.375, "completions/mean_terminated_length": 183.86666870117188, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.0768, "grad_norm": 4.009552478790283, "kl": 0.005107879638671875, "learning_rate": 1e-06, "loss": 0.077, "num_tokens": 1227954.0, "reward": 0.1502164751291275, "reward_std": 0.09705069661140442, "rewards/bleu_reward_func/mean": 0.1502164751291275, "rewards/bleu_reward_func/std": 0.23583538830280304, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 258.75, "completions/mean_terminated_length": 200.3076934814453, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.0776, "grad_norm": 2.948690891265869, "kl": 0.00766754150390625, "learning_rate": 1e-06, "loss": -0.1655, "num_tokens": 1240202.0, "reward": 0.0456559993326664, "reward_std": 0.033094413578510284, "rewards/bleu_reward_func/mean": 0.0456559993326664, "rewards/bleu_reward_func/std": 0.03354474529623985, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 333.5625, "completions/mean_terminated_length": 240.09524536132812, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.0784, "grad_norm": 2.532744884490967, "kl": 0.008470535278320312, "learning_rate": 1e-06, "loss": 0.0968, "num_tokens": 1255796.0, "reward": 0.0555446520447731, "reward_std": 0.03220447525382042, "rewards/bleu_reward_func/mean": 0.0555446520447731, "rewards/bleu_reward_func/std": 0.05794409662485123, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 293.8125, "completions/mean_terminated_length": 262.64288330078125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.0792, "grad_norm": 10.322442054748535, "kl": 0.00716400146484375, "learning_rate": 1e-06, "loss": -0.0058, "num_tokens": 1270534.0, "reward": 0.06366641819477081, "reward_std": 0.04360166937112808, "rewards/bleu_reward_func/mean": 0.06366641819477081, "rewards/bleu_reward_func/std": 0.09251260757446289, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 305.21875, "completions/mean_terminated_length": 291.433349609375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.08, "grad_norm": 2.6224372386932373, "kl": 0.0085296630859375, "learning_rate": 1e-06, "loss": -0.0406, "num_tokens": 1282741.0, "reward": 0.04561196267604828, "reward_std": 0.032932039350271225, "rewards/bleu_reward_func/mean": 0.04561196267604828, "rewards/bleu_reward_func/std": 0.06322391331195831, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 282.0625, "completions/mean_terminated_length": 192.0869598388672, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.0808, "grad_norm": 3.268648147583008, "kl": 0.01361083984375, "learning_rate": 1e-06, "loss": -0.1683, "num_tokens": 1294135.0, "reward": 0.05667008087038994, "reward_std": 0.030532341450452805, "rewards/bleu_reward_func/mean": 0.05667008087038994, "rewards/bleu_reward_func/std": 0.05759035050868988, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 333.8125, "completions/mean_terminated_length": 300.8148193359375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.0816, "grad_norm": 2.907650947570801, "kl": 0.01136016845703125, "learning_rate": 1e-06, "loss": 0.0913, "num_tokens": 1306993.0, "reward": 0.03263912349939346, "reward_std": 0.009969690814614296, "rewards/bleu_reward_func/mean": 0.03263912349939346, "rewards/bleu_reward_func/std": 0.014208728447556496, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 220.0625, "completions/mean_terminated_length": 166.0, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.0824, "grad_norm": 4.579634666442871, "kl": 0.0120391845703125, "learning_rate": 1e-06, "loss": -0.0133, "num_tokens": 1317267.0, "reward": 0.033596813678741455, "reward_std": 0.015902765095233917, "rewards/bleu_reward_func/mean": 0.033596813678741455, "rewards/bleu_reward_func/std": 0.01867716945707798, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 355.375, "completions/mean_terminated_length": 198.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.0832, "grad_norm": 2.706808090209961, "kl": 0.013458251953125, "learning_rate": 1e-06, "loss": -0.1151, "num_tokens": 1334423.0, "reward": 0.05682121962308884, "reward_std": 0.0347580686211586, "rewards/bleu_reward_func/mean": 0.05682121962308884, "rewards/bleu_reward_func/std": 0.055416397750377655, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 490.84375, "completions/mean_terminated_length": 427.375, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.084, "grad_norm": 1.9632360935211182, "kl": 0.00994873046875, "learning_rate": 1e-06, "loss": -0.0389, "num_tokens": 1352930.0, "reward": 0.03201688453555107, "reward_std": 0.00869814120233059, "rewards/bleu_reward_func/mean": 0.03201688453555107, "rewards/bleu_reward_func/std": 0.013300970196723938, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 512.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 426.5, "completions/mean_terminated_length": 301.5384826660156, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.0848, "grad_norm": 1.8980785608291626, "kl": 0.00910186767578125, "learning_rate": 1e-06, "loss": 0.1299, "num_tokens": 1370418.0, "reward": 0.024408889934420586, "reward_std": 0.016656802967190742, "rewards/bleu_reward_func/mean": 0.024408889934420586, "rewards/bleu_reward_func/std": 0.026626311242580414, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 448.09375, "completions/mean_terminated_length": 384.1875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.0856, "grad_norm": 2.022083044052124, "kl": 0.01107025146484375, "learning_rate": 1e-06, "loss": -0.0435, "num_tokens": 1387805.0, "reward": 0.03671019896864891, "reward_std": 0.015504223294556141, "rewards/bleu_reward_func/mean": 0.03671019896864891, "rewards/bleu_reward_func/std": 0.028309425339102745, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 356.46875, "completions/mean_terminated_length": 200.9375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0864, "grad_norm": 2.7162857055664062, "kl": 0.01024627685546875, "learning_rate": 1e-06, "loss": -0.1255, "num_tokens": 1402212.0, "reward": 0.03878065198659897, "reward_std": 0.02206358313560486, "rewards/bleu_reward_func/mean": 0.03878065198659897, "rewards/bleu_reward_func/std": 0.029185639694333076, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 355.3125, "completions/mean_terminated_length": 303.0833435058594, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.0872, "grad_norm": 2.625575304031372, "kl": 0.01190185546875, "learning_rate": 1e-06, "loss": -0.0505, "num_tokens": 1416102.0, "reward": 0.04079345613718033, "reward_std": 0.021366603672504425, "rewards/bleu_reward_func/mean": 0.04079345613718033, "rewards/bleu_reward_func/std": 0.03239119052886963, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 333.46875, "completions/mean_terminated_length": 263.60870361328125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.088, "grad_norm": 2.462629556655884, "kl": 0.0115509033203125, "learning_rate": 1e-06, "loss": 0.0333, "num_tokens": 1429517.0, "reward": 0.04992126300930977, "reward_std": 0.026332605630159378, "rewards/bleu_reward_func/mean": 0.04992126300930977, "rewards/bleu_reward_func/std": 0.0346212200820446, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 310.46875, "completions/mean_terminated_length": 189.5500030517578, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.0888, "grad_norm": 4.681856155395508, "kl": 0.0270538330078125, "learning_rate": 1e-06, "loss": 0.2249, "num_tokens": 1443588.0, "reward": 0.061340004205703735, "reward_std": 0.020015515387058258, "rewards/bleu_reward_func/mean": 0.061340004205703735, "rewards/bleu_reward_func/std": 0.031175505369901657, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 376.4375, "completions/mean_terminated_length": 240.875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0896, "grad_norm": 2.6015677452087402, "kl": 0.00934600830078125, "learning_rate": 1e-06, "loss": -0.0971, "num_tokens": 1459066.0, "reward": 0.03436078503727913, "reward_std": 0.01115034706890583, "rewards/bleu_reward_func/mean": 0.03436078503727913, "rewards/bleu_reward_func/std": 0.022014673799276352, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 414.25, "completions/mean_terminated_length": 347.3684387207031, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.0904, "grad_norm": 2.417910099029541, "kl": 0.0102996826171875, "learning_rate": 1e-06, "loss": 0.0133, "num_tokens": 1476074.0, "reward": 0.0862937867641449, "reward_std": 0.033545784652233124, "rewards/bleu_reward_func/mean": 0.0862937867641449, "rewards/bleu_reward_func/std": 0.04349099099636078, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 417.0625, "completions/mean_terminated_length": 295.0, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.0912, "grad_norm": 2.263538360595703, "kl": 0.0116119384765625, "learning_rate": 1e-06, "loss": 0.0398, "num_tokens": 1491932.0, "reward": 0.04719041287899017, "reward_std": 0.024131447076797485, "rewards/bleu_reward_func/mean": 0.04719041287899017, "rewards/bleu_reward_func/std": 0.02794639579951763, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 314.59375, "completions/mean_terminated_length": 140.41175842285156, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.092, "grad_norm": 2.4896774291992188, "kl": 0.0186920166015625, "learning_rate": 1e-06, "loss": -0.1274, "num_tokens": 1508711.0, "reward": 0.06531796604394913, "reward_std": 0.05683267116546631, "rewards/bleu_reward_func/mean": 0.06531796604394913, "rewards/bleu_reward_func/std": 0.08694739639759064, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 389.15625, "completions/mean_terminated_length": 249.933349609375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.0928, "grad_norm": 2.473355531692505, "kl": 0.0122833251953125, "learning_rate": 1e-06, "loss": -0.1519, "num_tokens": 1524220.0, "reward": 0.03568326681852341, "reward_std": 0.013284995220601559, "rewards/bleu_reward_func/mean": 0.03568326681852341, "rewards/bleu_reward_func/std": 0.018093997612595558, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 235.03125, "completions/mean_terminated_length": 171.11538696289062, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.0936, "grad_norm": 3.366828680038452, "kl": 0.009637832641601562, "learning_rate": 1e-06, "loss": 0.0635, "num_tokens": 1538085.0, "reward": 0.057241007685661316, "reward_std": 0.02658858895301819, "rewards/bleu_reward_func/mean": 0.057241007685661316, "rewards/bleu_reward_func/std": 0.05006576329469681, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 320.90625, "completions/mean_terminated_length": 190.15789794921875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.0944, "grad_norm": 2.6779184341430664, "kl": 0.012842178344726562, "learning_rate": 1e-06, "loss": -0.1215, "num_tokens": 1552642.0, "reward": 0.11695751547813416, "reward_std": 0.03736204653978348, "rewards/bleu_reward_func/mean": 0.11695751547813416, "rewards/bleu_reward_func/std": 0.09987916797399521, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 309.46875, "completions/mean_terminated_length": 252.75999450683594, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.0952, "grad_norm": 3.17213773727417, "kl": 0.0166168212890625, "learning_rate": 1e-06, "loss": 0.081, "num_tokens": 1567041.0, "reward": 0.06574233621358871, "reward_std": 0.04479731619358063, "rewards/bleu_reward_func/mean": 0.06574233621358871, "rewards/bleu_reward_func/std": 0.06972567737102509, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 348.53125, "completions/mean_terminated_length": 185.0625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.096, "grad_norm": 3.1326615810394287, "kl": 0.00875091552734375, "learning_rate": 1e-06, "loss": 0.0663, "num_tokens": 1585914.0, "reward": 0.02676137164235115, "reward_std": 0.007912165485322475, "rewards/bleu_reward_func/mean": 0.02676137164235115, "rewards/bleu_reward_func/std": 0.019907211884856224, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 312.53125, "completions/mean_terminated_length": 246.0416717529297, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.0968, "grad_norm": 2.3142240047454834, "kl": 0.0168609619140625, "learning_rate": 1e-06, "loss": 0.0616, "num_tokens": 1599083.0, "reward": 0.0773664191365242, "reward_std": 0.03548593446612358, "rewards/bleu_reward_func/mean": 0.0773664191365242, "rewards/bleu_reward_func/std": 0.09226932376623154, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 352.625, "completions/mean_terminated_length": 290.2608642578125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.0976, "grad_norm": 3.595508098602295, "kl": 0.01422119140625, "learning_rate": 1e-06, "loss": 0.1431, "num_tokens": 1612983.0, "reward": 0.04203544184565544, "reward_std": 0.013445420190691948, "rewards/bleu_reward_func/mean": 0.04203544184565544, "rewards/bleu_reward_func/std": 0.01539506297558546, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 318.96875, "completions/mean_terminated_length": 243.43478393554688, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.0984, "grad_norm": 2.913456678390503, "kl": 0.0119171142578125, "learning_rate": 1e-06, "loss": -0.0963, "num_tokens": 1626678.0, "reward": 0.03706140071153641, "reward_std": 0.0105556296184659, "rewards/bleu_reward_func/mean": 0.03706140071153641, "rewards/bleu_reward_func/std": 0.015232140198349953, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 313.0625, "completions/mean_terminated_length": 246.75, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.0992, "grad_norm": 2.5508322715759277, "kl": 0.0186309814453125, "learning_rate": 1e-06, "loss": 0.1845, "num_tokens": 1643632.0, "reward": 0.02773209474980831, "reward_std": 0.006956611294299364, "rewards/bleu_reward_func/mean": 0.02773209474980831, "rewards/bleu_reward_func/std": 0.012230291962623596, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 314.25, "completions/mean_terminated_length": 248.33334350585938, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.1, "grad_norm": 2.6516916751861572, "kl": 0.013763427734375, "learning_rate": 1e-06, "loss": -0.1173, "num_tokens": 1655768.0, "reward": 0.052763912826776505, "reward_std": 0.02353248931467533, "rewards/bleu_reward_func/mean": 0.052763912826776505, "rewards/bleu_reward_func/std": 0.03502753749489784, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 372.5625, "completions/mean_terminated_length": 249.5294189453125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.1008, "grad_norm": 2.2216272354125977, "kl": 0.0171661376953125, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 1672106.0, "reward": 0.035765521228313446, "reward_std": 0.009950447827577591, "rewards/bleu_reward_func/mean": 0.035765521228313446, "rewards/bleu_reward_func/std": 0.022508379071950912, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 410.09375, "completions/mean_terminated_length": 279.0714416503906, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.1016, "grad_norm": 2.5408473014831543, "kl": 0.0154571533203125, "learning_rate": 1e-06, "loss": -0.2255, "num_tokens": 1689573.0, "reward": 0.035822078585624695, "reward_std": 0.01573784276843071, "rewards/bleu_reward_func/mean": 0.035822078585624695, "rewards/bleu_reward_func/std": 0.02253863401710987, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 298.8125, "completions/mean_terminated_length": 284.6000061035156, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.1024, "grad_norm": 3.248319387435913, "kl": 0.0136260986328125, "learning_rate": 1e-06, "loss": -0.0412, "num_tokens": 1701151.0, "reward": 0.058197036385536194, "reward_std": 0.017663825303316116, "rewards/bleu_reward_func/mean": 0.058197036385536194, "rewards/bleu_reward_func/std": 0.04830459877848625, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 409.8125, "completions/mean_terminated_length": 348.5, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.1032, "grad_norm": 2.0520575046539307, "kl": 0.015594482421875, "learning_rate": 1e-06, "loss": -0.0989, "num_tokens": 1717193.0, "reward": 0.07774099707603455, "reward_std": 0.024711469188332558, "rewards/bleu_reward_func/mean": 0.07774099707603455, "rewards/bleu_reward_func/std": 0.0407242514193058, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 329.5625, "completions/mean_terminated_length": 303.5, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.104, "grad_norm": 2.5769379138946533, "kl": 0.0155029296875, "learning_rate": 1e-06, "loss": 0.1015, "num_tokens": 1730843.0, "reward": 0.04991535469889641, "reward_std": 0.017646994441747665, "rewards/bleu_reward_func/mean": 0.04991535469889641, "rewards/bleu_reward_func/std": 0.048128433525562286, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 260.875, "completions/mean_terminated_length": 177.1666717529297, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.1048, "grad_norm": 3.685297727584839, "kl": 0.01507568359375, "learning_rate": 1e-06, "loss": 0.0444, "num_tokens": 1743639.0, "reward": 0.0769617035984993, "reward_std": 0.030974943190813065, "rewards/bleu_reward_func/mean": 0.0769617035984993, "rewards/bleu_reward_func/std": 0.09881884604692459, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 359.375, "completions/mean_terminated_length": 299.6521911621094, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.1056, "grad_norm": 2.244091272354126, "kl": 0.01470947265625, "learning_rate": 1e-06, "loss": -0.0148, "num_tokens": 1758339.0, "reward": 0.02497226372361183, "reward_std": 0.006721612066030502, "rewards/bleu_reward_func/mean": 0.02497226372361183, "rewards/bleu_reward_func/std": 0.011903750710189342, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 345.9375, "completions/mean_terminated_length": 322.21429443359375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.1064, "grad_norm": 2.510582208633423, "kl": 0.0145721435546875, "learning_rate": 1e-06, "loss": -0.0027, "num_tokens": 1771673.0, "reward": 0.032875481992959976, "reward_std": 0.010129079222679138, "rewards/bleu_reward_func/mean": 0.032875481992959976, "rewards/bleu_reward_func/std": 0.01497406605631113, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 307.0, "completions/mean_terminated_length": 213.8181915283203, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.1072, "grad_norm": 2.790792942047119, "kl": 0.0141448974609375, "learning_rate": 1e-06, "loss": -0.0355, "num_tokens": 1784081.0, "reward": 0.019524620845913887, "reward_std": 0.0064018769189715385, "rewards/bleu_reward_func/mean": 0.019524620845913887, "rewards/bleu_reward_func/std": 0.011706347577273846, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 204.34375, "completions/mean_terminated_length": 204.34375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.108, "grad_norm": 2.8634583950042725, "kl": 0.0163726806640625, "learning_rate": 1e-06, "loss": 0.1028, "num_tokens": 1793868.0, "reward": 0.08782406896352768, "reward_std": 0.05941709131002426, "rewards/bleu_reward_func/mean": 0.08782406896352768, "rewards/bleu_reward_func/std": 0.1270333081483841, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 229.625, "completions/mean_terminated_length": 229.625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.1088, "grad_norm": 3.192974090576172, "kl": 0.0136566162109375, "learning_rate": 1e-06, "loss": -0.0969, "num_tokens": 1803704.0, "reward": 0.04424320533871651, "reward_std": 0.017299409955739975, "rewards/bleu_reward_func/mean": 0.04424320533871651, "rewards/bleu_reward_func/std": 0.03912116214632988, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 316.9375, "completions/mean_terminated_length": 183.4736785888672, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.1096, "grad_norm": 2.307054042816162, "kl": 0.018157958984375, "learning_rate": 1e-06, "loss": -0.0291, "num_tokens": 1817166.0, "reward": 0.03945375978946686, "reward_std": 0.025181055068969727, "rewards/bleu_reward_func/mean": 0.03945375978946686, "rewards/bleu_reward_func/std": 0.03327018395066261, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 210.1875, "completions/mean_terminated_length": 178.96551513671875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1104, "grad_norm": 5.996477127075195, "kl": 0.0174713134765625, "learning_rate": 1e-06, "loss": 0.0224, "num_tokens": 1828812.0, "reward": 0.03139394521713257, "reward_std": 0.011995144188404083, "rewards/bleu_reward_func/mean": 0.03139394521713257, "rewards/bleu_reward_func/std": 0.01749689131975174, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 398.6875, "completions/mean_terminated_length": 285.375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.1112, "grad_norm": 2.496495246887207, "kl": 0.0198974609375, "learning_rate": 1e-06, "loss": 0.0135, "num_tokens": 1845378.0, "reward": 0.04385095834732056, "reward_std": 0.012373005039989948, "rewards/bleu_reward_func/mean": 0.04385095834732056, "rewards/bleu_reward_func/std": 0.023492127656936646, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 407.1875, "completions/mean_terminated_length": 325.6666564941406, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.112, "grad_norm": 2.1035640239715576, "kl": 0.013671875, "learning_rate": 1e-06, "loss": 0.0472, "num_tokens": 1861792.0, "reward": 0.06062568724155426, "reward_std": 0.02712031453847885, "rewards/bleu_reward_func/mean": 0.06062568724155426, "rewards/bleu_reward_func/std": 0.038938842713832855, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 337.40625, "completions/mean_terminated_length": 279.2083435058594, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.1128, "grad_norm": 2.4870097637176514, "kl": 0.0177764892578125, "learning_rate": 1e-06, "loss": -0.042, "num_tokens": 1875197.0, "reward": 0.07142765074968338, "reward_std": 0.025316152721643448, "rewards/bleu_reward_func/mean": 0.07142765074968338, "rewards/bleu_reward_func/std": 0.054619304835796356, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 296.34375, "completions/mean_terminated_length": 256.40740966796875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.1136, "grad_norm": 3.088263750076294, "kl": 0.025543212890625, "learning_rate": 1e-06, "loss": 0.0453, "num_tokens": 1888160.0, "reward": 0.06487879157066345, "reward_std": 0.019724037498235703, "rewards/bleu_reward_func/mean": 0.06487879157066345, "rewards/bleu_reward_func/std": 0.045981332659721375, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 512.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 387.8125, "completions/mean_terminated_length": 70.44444274902344, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.1144, "grad_norm": 3.4739062786102295, "kl": 0.023529052734375, "learning_rate": 1e-06, "loss": 0.0193, "num_tokens": 1904546.0, "reward": 0.016506584361195564, "reward_std": 0.010010890662670135, "rewards/bleu_reward_func/mean": 0.016506584361195564, "rewards/bleu_reward_func/std": 0.014175361953675747, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 512.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 446.65625, "completions/mean_terminated_length": 337.75, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.1152, "grad_norm": 2.3656280040740967, "kl": 0.01031494140625, "learning_rate": 1e-06, "loss": -0.0996, "num_tokens": 1923767.0, "reward": 0.08849923312664032, "reward_std": 0.05801050364971161, "rewards/bleu_reward_func/mean": 0.08849923312664032, "rewards/bleu_reward_func/std": 0.10124781727790833, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 327.125, "completions/mean_terminated_length": 275.3599853515625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.116, "grad_norm": 2.725642681121826, "kl": 0.022552490234375, "learning_rate": 1e-06, "loss": 0.0032, "num_tokens": 1940155.0, "reward": 0.04029117524623871, "reward_std": 0.023508241400122643, "rewards/bleu_reward_func/mean": 0.04029117524623871, "rewards/bleu_reward_func/std": 0.037789031863212585, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 302.0625, "completions/mean_terminated_length": 219.9130401611328, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.1168, "grad_norm": 3.752535104751587, "kl": 0.0152587890625, "learning_rate": 1e-06, "loss": -0.0033, "num_tokens": 1951805.0, "reward": 0.03404291719198227, "reward_std": 0.018146470189094543, "rewards/bleu_reward_func/mean": 0.03404291719198227, "rewards/bleu_reward_func/std": 0.023315640166401863, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 411.6875, "completions/mean_terminated_length": 282.71429443359375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.1176, "grad_norm": 2.323500156402588, "kl": 0.0173492431640625, "learning_rate": 1e-06, "loss": 0.0301, "num_tokens": 1967539.0, "reward": 0.06629061698913574, "reward_std": 0.01657968759536743, "rewards/bleu_reward_func/mean": 0.06629061698913574, "rewards/bleu_reward_func/std": 0.07008767873048782, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 300.6875, "completions/mean_terminated_length": 230.25, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.1184, "grad_norm": 2.4731006622314453, "kl": 0.022491455078125, "learning_rate": 1e-06, "loss": -0.0209, "num_tokens": 1980961.0, "reward": 0.05584581196308136, "reward_std": 0.016303110867738724, "rewards/bleu_reward_func/mean": 0.05584581196308136, "rewards/bleu_reward_func/std": 0.05100340396165848, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 299.84375, "completions/mean_terminated_length": 240.4399871826172, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.1192, "grad_norm": 2.91982364654541, "kl": 0.020263671875, "learning_rate": 1e-06, "loss": -0.0718, "num_tokens": 1993940.0, "reward": 0.048127830028533936, "reward_std": 0.01851847395300865, "rewards/bleu_reward_func/mean": 0.048127830028533936, "rewards/bleu_reward_func/std": 0.03433293104171753, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 335.46875, "completions/mean_terminated_length": 276.625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.12, "grad_norm": 3.051020383834839, "kl": 0.01885986328125, "learning_rate": 1e-06, "loss": 0.0288, "num_tokens": 2007011.0, "reward": 0.04020792990922928, "reward_std": 0.008897930383682251, "rewards/bleu_reward_func/mean": 0.04020792990922928, "rewards/bleu_reward_func/std": 0.018972909078001976, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 366.625, "completions/mean_terminated_length": 300.54547119140625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.1208, "grad_norm": 3.5532686710357666, "kl": 0.0150604248046875, "learning_rate": 1e-06, "loss": 0.1151, "num_tokens": 2025423.0, "reward": 0.05799319967627525, "reward_std": 0.03025471605360508, "rewards/bleu_reward_func/mean": 0.05799319967627525, "rewards/bleu_reward_func/std": 0.04186660796403885, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 457.0625, "completions/mean_terminated_length": 402.125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.1216, "grad_norm": 2.035194158554077, "kl": 0.0230865478515625, "learning_rate": 1e-06, "loss": 0.0452, "num_tokens": 2043697.0, "reward": 0.06312853842973709, "reward_std": 0.014973493292927742, "rewards/bleu_reward_func/mean": 0.06312853842973709, "rewards/bleu_reward_func/std": 0.038368310779333115, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 418.21875, "completions/mean_terminated_length": 381.5217590332031, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.1224, "grad_norm": 2.344296455383301, "kl": 0.0169830322265625, "learning_rate": 1e-06, "loss": 0.0044, "num_tokens": 2060936.0, "reward": 0.05173652246594429, "reward_std": 0.024875259026885033, "rewards/bleu_reward_func/mean": 0.05173652246594429, "rewards/bleu_reward_func/std": 0.027617480605840683, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 387.96875, "completions/mean_terminated_length": 313.5500183105469, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.1232, "grad_norm": 2.4385931491851807, "kl": 0.020843505859375, "learning_rate": 1e-06, "loss": 0.0303, "num_tokens": 2076247.0, "reward": 0.09360536932945251, "reward_std": 0.023839600384235382, "rewards/bleu_reward_func/mean": 0.09360536932945251, "rewards/bleu_reward_func/std": 0.04197891801595688, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 347.5625, "completions/mean_terminated_length": 292.75, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.124, "grad_norm": 3.0927393436431885, "kl": 0.020721435546875, "learning_rate": 1e-06, "loss": -0.1683, "num_tokens": 2090385.0, "reward": 0.06330172717571259, "reward_std": 0.0384925901889801, "rewards/bleu_reward_func/mean": 0.06330172717571259, "rewards/bleu_reward_func/std": 0.05383189022541046, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 318.4375, "completions/mean_terminated_length": 253.9166717529297, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.1248, "grad_norm": 3.673290252685547, "kl": 0.02178955078125, "learning_rate": 1e-06, "loss": -0.0271, "num_tokens": 2104047.0, "reward": 0.05628419667482376, "reward_std": 0.024199776351451874, "rewards/bleu_reward_func/mean": 0.05628419667482376, "rewards/bleu_reward_func/std": 0.03944230079650879, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 317.28125, "completions/mean_terminated_length": 200.4499969482422, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.1256, "grad_norm": 2.7515523433685303, "kl": 0.01515960693359375, "learning_rate": 1e-06, "loss": 0.0605, "num_tokens": 2116984.0, "reward": 0.03348308056592941, "reward_std": 0.02000669576227665, "rewards/bleu_reward_func/mean": 0.03348308056592941, "rewards/bleu_reward_func/std": 0.04217757657170296, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 352.90625, "completions/mean_terminated_length": 336.4482727050781, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.1264, "grad_norm": 2.3758599758148193, "kl": 0.01812744140625, "learning_rate": 1e-06, "loss": 0.0262, "num_tokens": 2130381.0, "reward": 0.0528571754693985, "reward_std": 0.015917008742690086, "rewards/bleu_reward_func/mean": 0.0528571754693985, "rewards/bleu_reward_func/std": 0.03905298560857773, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 428.53125, "completions/mean_terminated_length": 345.0625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.1272, "grad_norm": 2.2279083728790283, "kl": 0.0218353271484375, "learning_rate": 1e-06, "loss": -0.0695, "num_tokens": 2146966.0, "reward": 0.07023762166500092, "reward_std": 0.022503603249788284, "rewards/bleu_reward_func/mean": 0.07023762166500092, "rewards/bleu_reward_func/std": 0.04653822258114815, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 335.40625, "completions/mean_terminated_length": 276.54168701171875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.128, "grad_norm": 2.385481357574463, "kl": 0.01656341552734375, "learning_rate": 1e-06, "loss": 0.0463, "num_tokens": 2159947.0, "reward": 0.030797000974416733, "reward_std": 0.010636158287525177, "rewards/bleu_reward_func/mean": 0.030797000974416733, "rewards/bleu_reward_func/std": 0.012442766688764095, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 319.1875, "completions/mean_terminated_length": 283.4814758300781, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.1288, "grad_norm": 2.474431037902832, "kl": 0.01513671875, "learning_rate": 1e-06, "loss": 0.099, "num_tokens": 2173321.0, "reward": 0.05447715148329735, "reward_std": 0.016968414187431335, "rewards/bleu_reward_func/mean": 0.05447715148329735, "rewards/bleu_reward_func/std": 0.04606984928250313, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 283.40625, "completions/mean_terminated_length": 276.0322570800781, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.1296, "grad_norm": 2.662762403488159, "kl": 0.0158233642578125, "learning_rate": 1e-06, "loss": -0.0236, "num_tokens": 2185702.0, "reward": 0.029800117015838623, "reward_std": 0.011480635963380337, "rewards/bleu_reward_func/mean": 0.029800117015838623, "rewards/bleu_reward_func/std": 0.013534092344343662, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 201.625, "completions/mean_terminated_length": 169.51724243164062, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.1304, "grad_norm": 3.084282398223877, "kl": 0.02069091796875, "learning_rate": 1e-06, "loss": -0.0555, "num_tokens": 2194330.0, "reward": 0.07677525281906128, "reward_std": 0.03891972452402115, "rewards/bleu_reward_func/mean": 0.07677525281906128, "rewards/bleu_reward_func/std": 0.08827344328165054, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 281.3125, "completions/mean_terminated_length": 204.4166717529297, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.1312, "grad_norm": 3.4201643466949463, "kl": 0.019195556640625, "learning_rate": 1e-06, "loss": 0.077, "num_tokens": 2205572.0, "reward": 0.04335915669798851, "reward_std": 0.011742215603590012, "rewards/bleu_reward_func/mean": 0.04335915669798851, "rewards/bleu_reward_func/std": 0.02501273900270462, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 200.0625, "completions/mean_terminated_length": 200.0625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.132, "grad_norm": 4.607831954956055, "kl": 0.0267181396484375, "learning_rate": 1e-06, "loss": -0.2349, "num_tokens": 2214158.0, "reward": 0.023556701838970184, "reward_std": 0.017645370215177536, "rewards/bleu_reward_func/mean": 0.023556701838970184, "rewards/bleu_reward_func/std": 0.026666147634387016, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 345.84375, "completions/mean_terminated_length": 334.7666931152344, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.1328, "grad_norm": 2.454258918762207, "kl": 0.0163421630859375, "learning_rate": 1e-06, "loss": -0.0247, "num_tokens": 2227585.0, "reward": 0.04457944631576538, "reward_std": 0.015946604311466217, "rewards/bleu_reward_func/mean": 0.04457944631576538, "rewards/bleu_reward_func/std": 0.027206508442759514, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 366.84375, "completions/mean_terminated_length": 339.96295166015625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.1336, "grad_norm": 2.361379384994507, "kl": 0.015533447265625, "learning_rate": 1e-06, "loss": -0.0269, "num_tokens": 2242860.0, "reward": 0.03885602205991745, "reward_std": 0.016611171886324883, "rewards/bleu_reward_func/mean": 0.03885602205991745, "rewards/bleu_reward_func/std": 0.02544359117746353, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 179.0625, "completions/mean_terminated_length": 179.0625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.1344, "grad_norm": 3.451462507247925, "kl": 0.0236968994140625, "learning_rate": 1e-06, "loss": 0.0097, "num_tokens": 2251926.0, "reward": 0.08403593301773071, "reward_std": 0.0387713760137558, "rewards/bleu_reward_func/mean": 0.08403593301773071, "rewards/bleu_reward_func/std": 0.058938704431056976, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 250.9375, "completions/mean_terminated_length": 250.9375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.1352, "grad_norm": 3.0590903759002686, "kl": 0.018585205078125, "learning_rate": 1e-06, "loss": 0.0594, "num_tokens": 2262204.0, "reward": 0.1359768509864807, "reward_std": 0.030772076919674873, "rewards/bleu_reward_func/mean": 0.1359768509864807, "rewards/bleu_reward_func/std": 0.11267537623643875, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 282.75, "completions/mean_terminated_length": 275.3548278808594, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.136, "grad_norm": 2.741846799850464, "kl": 0.0178985595703125, "learning_rate": 1e-06, "loss": -0.0933, "num_tokens": 2273812.0, "reward": 0.06083029881119728, "reward_std": 0.046626534312963486, "rewards/bleu_reward_func/mean": 0.06083029881119728, "rewards/bleu_reward_func/std": 0.09518548846244812, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 481.84375, "completions/mean_terminated_length": 437.7692565917969, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.1368, "grad_norm": 2.0689496994018555, "kl": 0.02508544921875, "learning_rate": 1e-06, "loss": -0.0084, "num_tokens": 2293239.0, "reward": 0.02994382753968239, "reward_std": 0.006383362226188183, "rewards/bleu_reward_func/mean": 0.02994382753968239, "rewards/bleu_reward_func/std": 0.013090057298541069, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 439.8125, "completions/mean_terminated_length": 358.0000305175781, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.1376, "grad_norm": 2.1585867404937744, "kl": 0.020751953125, "learning_rate": 1e-06, "loss": -0.0719, "num_tokens": 2311881.0, "reward": 0.06273654103279114, "reward_std": 0.016566328704357147, "rewards/bleu_reward_func/mean": 0.06273654103279114, "rewards/bleu_reward_func/std": 0.062433164566755295, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 353.25, "completions/mean_terminated_length": 281.0909118652344, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.1384, "grad_norm": 3.0912060737609863, "kl": 0.026611328125, "learning_rate": 1e-06, "loss": 0.0097, "num_tokens": 2329057.0, "reward": 0.04469408839941025, "reward_std": 0.013722876086831093, "rewards/bleu_reward_func/mean": 0.04469408839941025, "rewards/bleu_reward_func/std": 0.039968349039554596, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 417.84375, "completions/mean_terminated_length": 361.3500061035156, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.1392, "grad_norm": 2.3754167556762695, "kl": 0.0189361572265625, "learning_rate": 1e-06, "loss": -0.0304, "num_tokens": 2345180.0, "reward": 0.07604481279850006, "reward_std": 0.01629452034831047, "rewards/bleu_reward_func/mean": 0.07604481279850006, "rewards/bleu_reward_func/std": 0.07586659491062164, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 165.6875, "completions/mean_terminated_length": 165.6875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.14, "grad_norm": 5.492390155792236, "kl": 0.02996826171875, "learning_rate": 1e-06, "loss": 0.1718, "num_tokens": 2354442.0, "reward": 0.03355713561177254, "reward_std": 0.017250124365091324, "rewards/bleu_reward_func/mean": 0.03355713561177254, "rewards/bleu_reward_func/std": 0.020392760634422302, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 331.40625, "completions/mean_terminated_length": 289.73077392578125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.1408, "grad_norm": 2.441545009613037, "kl": 0.02288818359375, "learning_rate": 1e-06, "loss": -0.0219, "num_tokens": 2368495.0, "reward": 0.03965570032596588, "reward_std": 0.01631091721355915, "rewards/bleu_reward_func/mean": 0.03965570032596588, "rewards/bleu_reward_func/std": 0.02324427105486393, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 383.6875, "completions/mean_terminated_length": 354.0769348144531, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.1416, "grad_norm": 2.1311628818511963, "kl": 0.0158843994140625, "learning_rate": 1e-06, "loss": -0.0019, "num_tokens": 2382621.0, "reward": 0.06360460817813873, "reward_std": 0.035029761493206024, "rewards/bleu_reward_func/mean": 0.06360460817813873, "rewards/bleu_reward_func/std": 0.052434373646974564, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 298.0625, "completions/mean_terminated_length": 238.1599884033203, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.1424, "grad_norm": 2.817237138748169, "kl": 0.01739501953125, "learning_rate": 1e-06, "loss": -0.0277, "num_tokens": 2396607.0, "reward": 0.08697853982448578, "reward_std": 0.02260083705186844, "rewards/bleu_reward_func/mean": 0.08697853982448578, "rewards/bleu_reward_func/std": 0.08185648173093796, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 279.375, "completions/mean_terminated_length": 188.3478240966797, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.1432, "grad_norm": 2.8254637718200684, "kl": 0.018768310546875, "learning_rate": 1e-06, "loss": 0.1222, "num_tokens": 2408243.0, "reward": 0.029145658016204834, "reward_std": 0.011095807887613773, "rewards/bleu_reward_func/mean": 0.029145658016204834, "rewards/bleu_reward_func/std": 0.021273698657751083, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 337.46875, "completions/mean_terminated_length": 246.04762268066406, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.144, "grad_norm": 3.114673376083374, "kl": 0.02655029296875, "learning_rate": 1e-06, "loss": 0.2521, "num_tokens": 2422274.0, "reward": 0.03623339533805847, "reward_std": 0.023222438991069794, "rewards/bleu_reward_func/mean": 0.03623339533805847, "rewards/bleu_reward_func/std": 0.03441086784005165, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 164.53125, "completions/mean_terminated_length": 164.53125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.1448, "grad_norm": 4.906919479370117, "kl": 0.0315399169921875, "learning_rate": 1e-06, "loss": 0.0263, "num_tokens": 2430475.0, "reward": 0.06976894289255142, "reward_std": 0.027743544429540634, "rewards/bleu_reward_func/mean": 0.06976894289255142, "rewards/bleu_reward_func/std": 0.06131015717983246, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 244.28125, "completions/mean_terminated_length": 182.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.1456, "grad_norm": 4.8848371505737305, "kl": 0.01995849609375, "learning_rate": 1e-06, "loss": 0.0627, "num_tokens": 2442788.0, "reward": 0.1191171407699585, "reward_std": 0.042263854295015335, "rewards/bleu_reward_func/mean": 0.1191171407699585, "rewards/bleu_reward_func/std": 0.10318046808242798, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 321.875, "completions/mean_terminated_length": 247.478271484375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.1464, "grad_norm": 2.6523852348327637, "kl": 0.01776123046875, "learning_rate": 1e-06, "loss": 0.0565, "num_tokens": 2455648.0, "reward": 0.04977214336395264, "reward_std": 0.016601046547293663, "rewards/bleu_reward_func/mean": 0.04977214336395264, "rewards/bleu_reward_func/std": 0.0421786792576313, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 370.09375, "completions/mean_terminated_length": 349.8214416503906, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.1472, "grad_norm": 2.1785075664520264, "kl": 0.018157958984375, "learning_rate": 1e-06, "loss": -0.0337, "num_tokens": 2470147.0, "reward": 0.10260511934757233, "reward_std": 0.01860986091196537, "rewards/bleu_reward_func/mean": 0.10260511934757233, "rewards/bleu_reward_func/std": 0.10457844287157059, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 346.71875, "completions/mean_terminated_length": 323.1071472167969, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.148, "grad_norm": 2.3310489654541016, "kl": 0.0196685791015625, "learning_rate": 1e-06, "loss": 0.0781, "num_tokens": 2485410.0, "reward": 0.049770474433898926, "reward_std": 0.022042104974389076, "rewards/bleu_reward_func/mean": 0.049770474433898926, "rewards/bleu_reward_func/std": 0.06963635981082916, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 330.71875, "completions/mean_terminated_length": 270.29168701171875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.1488, "grad_norm": 2.7905642986297607, "kl": 0.0135955810546875, "learning_rate": 1e-06, "loss": -0.1012, "num_tokens": 2498737.0, "reward": 0.060851939022541046, "reward_std": 0.04031149670481682, "rewards/bleu_reward_func/mean": 0.060851939022541046, "rewards/bleu_reward_func/std": 0.06394880264997482, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 356.8125, "completions/mean_terminated_length": 305.0833435058594, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.1496, "grad_norm": 2.6979751586914062, "kl": 0.020355224609375, "learning_rate": 1e-06, "loss": 0.0853, "num_tokens": 2512435.0, "reward": 0.025785673409700394, "reward_std": 0.008767616003751755, "rewards/bleu_reward_func/mean": 0.025785673409700394, "rewards/bleu_reward_func/std": 0.016393397003412247, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 315.8125, "completions/mean_terminated_length": 309.4838562011719, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.1504, "grad_norm": 2.6376216411590576, "kl": 0.0171051025390625, "learning_rate": 1e-06, "loss": 0.1299, "num_tokens": 2524845.0, "reward": 0.061950668692588806, "reward_std": 0.029896268621087074, "rewards/bleu_reward_func/mean": 0.061950668692588806, "rewards/bleu_reward_func/std": 0.0432080440223217, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 236.9375, "completions/mean_terminated_length": 129.30435180664062, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.1512, "grad_norm": 5.452763557434082, "kl": 0.0180816650390625, "learning_rate": 1e-06, "loss": 0.0144, "num_tokens": 2535291.0, "reward": 0.06512497365474701, "reward_std": 0.021872583776712418, "rewards/bleu_reward_func/mean": 0.06512497365474701, "rewards/bleu_reward_func/std": 0.05072392150759697, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 361.5625, "completions/mean_terminated_length": 319.44000244140625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.152, "grad_norm": 2.5568687915802, "kl": 0.021331787109375, "learning_rate": 1e-06, "loss": -0.078, "num_tokens": 2549005.0, "reward": 0.03104579634964466, "reward_std": 0.014428281225264072, "rewards/bleu_reward_func/mean": 0.03104579634964466, "rewards/bleu_reward_func/std": 0.023532235994935036, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 288.53125, "completions/mean_terminated_length": 247.1481475830078, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.1528, "grad_norm": 3.435012102127075, "kl": 0.019561767578125, "learning_rate": 1e-06, "loss": 0.1562, "num_tokens": 2560494.0, "reward": 0.028866298496723175, "reward_std": 0.013667687773704529, "rewards/bleu_reward_func/mean": 0.028866298496723175, "rewards/bleu_reward_func/std": 0.014043555594980717, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 364.53125, "completions/mean_terminated_length": 297.5, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.1536, "grad_norm": 2.173910617828369, "kl": 0.0178375244140625, "learning_rate": 1e-06, "loss": 0.0114, "num_tokens": 2575791.0, "reward": 0.03910418599843979, "reward_std": 0.013818096369504929, "rewards/bleu_reward_func/mean": 0.03910418599843979, "rewards/bleu_reward_func/std": 0.014301484450697899, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 236.0625, "completions/mean_terminated_length": 207.51724243164062, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.1544, "grad_norm": 4.14415979385376, "kl": 0.0251617431640625, "learning_rate": 1e-06, "loss": 0.0366, "num_tokens": 2586057.0, "reward": 0.07592535018920898, "reward_std": 0.04757307469844818, "rewards/bleu_reward_func/mean": 0.07592535018920898, "rewards/bleu_reward_func/std": 0.10841362178325653, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 512.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 453.375, "completions/mean_terminated_length": 303.5555725097656, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.1552, "grad_norm": 2.0680158138275146, "kl": 0.0290679931640625, "learning_rate": 1e-06, "loss": 0.0241, "num_tokens": 2606325.0, "reward": 0.031244732439517975, "reward_std": 0.01845286600291729, "rewards/bleu_reward_func/mean": 0.031244732439517975, "rewards/bleu_reward_func/std": 0.03221140056848526, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 307.53125, "completions/mean_terminated_length": 214.59091186523438, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.156, "grad_norm": 4.693017482757568, "kl": 0.03253173828125, "learning_rate": 1e-06, "loss": -0.081, "num_tokens": 2618206.0, "reward": 0.043432123959064484, "reward_std": 0.020170938223600388, "rewards/bleu_reward_func/mean": 0.043432123959064484, "rewards/bleu_reward_func/std": 0.024602122604846954, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 255.125, "completions/mean_terminated_length": 255.125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.1568, "grad_norm": 3.0103824138641357, "kl": 0.01934814453125, "learning_rate": 1e-06, "loss": -0.0563, "num_tokens": 2628394.0, "reward": 0.05487871170043945, "reward_std": 0.022487737238407135, "rewards/bleu_reward_func/mean": 0.05487871170043945, "rewards/bleu_reward_func/std": 0.04914577677845955, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 300.5, "completions/mean_terminated_length": 217.7391357421875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.1576, "grad_norm": 3.266918659210205, "kl": 0.0201568603515625, "learning_rate": 1e-06, "loss": 0.0096, "num_tokens": 2642122.0, "reward": 0.10000570863485336, "reward_std": 0.027525175362825394, "rewards/bleu_reward_func/mean": 0.10000570863485336, "rewards/bleu_reward_func/std": 0.06606002897024155, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 322.4375, "completions/mean_terminated_length": 269.3599853515625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.1584, "grad_norm": 2.755803108215332, "kl": 0.0229949951171875, "learning_rate": 1e-06, "loss": -0.1242, "num_tokens": 2654848.0, "reward": 0.04044795408844948, "reward_std": 0.017633788287639618, "rewards/bleu_reward_func/mean": 0.04044795408844948, "rewards/bleu_reward_func/std": 0.02563118189573288, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 289.90625, "completions/mean_terminated_length": 248.7777862548828, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.1592, "grad_norm": 3.5435428619384766, "kl": 0.014862060546875, "learning_rate": 1e-06, "loss": -0.0525, "num_tokens": 2666789.0, "reward": 0.18700216710567474, "reward_std": 0.06094446778297424, "rewards/bleu_reward_func/mean": 0.18700216710567474, "rewards/bleu_reward_func/std": 0.12359358370304108, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 512.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 435.21875, "completions/mean_terminated_length": 239.0, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.16, "grad_norm": 2.0920848846435547, "kl": 0.0169677734375, "learning_rate": 1e-06, "loss": -0.0518, "num_tokens": 2683212.0, "reward": 0.04426024854183197, "reward_std": 0.022208159789443016, "rewards/bleu_reward_func/mean": 0.04426024854183197, "rewards/bleu_reward_func/std": 0.03876553475856781, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 234.34375, "completions/mean_terminated_length": 205.6206817626953, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.1608, "grad_norm": 3.487138032913208, "kl": 0.01708984375, "learning_rate": 1e-06, "loss": -0.0069, "num_tokens": 2693463.0, "reward": 0.05370340123772621, "reward_std": 0.03217038884758949, "rewards/bleu_reward_func/mean": 0.05370340123772621, "rewards/bleu_reward_func/std": 0.0736880749464035, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 243.65625, "completions/mean_terminated_length": 243.65625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.1616, "grad_norm": 3.3060476779937744, "kl": 0.022186279296875, "learning_rate": 1e-06, "loss": -0.0214, "num_tokens": 2707276.0, "reward": 0.052934836596250534, "reward_std": 0.0255296491086483, "rewards/bleu_reward_func/mean": 0.052934836596250534, "rewards/bleu_reward_func/std": 0.04204695671796799, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 365.71875, "completions/mean_terminated_length": 236.64706420898438, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.1624, "grad_norm": 2.4367294311523438, "kl": 0.0296630859375, "learning_rate": 1e-06, "loss": 0.0339, "num_tokens": 2724531.0, "reward": 0.024516377598047256, "reward_std": 0.00745509285479784, "rewards/bleu_reward_func/mean": 0.024516377598047256, "rewards/bleu_reward_func/std": 0.017017923295497894, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 472.9375, "completions/mean_terminated_length": 398.3636474609375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.1632, "grad_norm": 1.9994412660598755, "kl": 0.021209716796875, "learning_rate": 1e-06, "loss": -0.0193, "num_tokens": 2744433.0, "reward": 0.015427513048052788, "reward_std": 0.0040624747052788734, "rewards/bleu_reward_func/mean": 0.015427513048052788, "rewards/bleu_reward_func/std": 0.012431508861482143, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 330.21875, "completions/mean_terminated_length": 188.8333282470703, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.164, "grad_norm": 3.1568257808685303, "kl": 0.01837158203125, "learning_rate": 1e-06, "loss": -0.0931, "num_tokens": 2757264.0, "reward": 0.07403382658958435, "reward_std": 0.02855892851948738, "rewards/bleu_reward_func/mean": 0.07403382658958435, "rewards/bleu_reward_func/std": 0.056942496448755264, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 385.90625, "completions/mean_terminated_length": 259.8125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.1648, "grad_norm": 2.4639594554901123, "kl": 0.022552490234375, "learning_rate": 1e-06, "loss": 0.096, "num_tokens": 2773037.0, "reward": 0.05201449990272522, "reward_std": 0.012545755133032799, "rewards/bleu_reward_func/mean": 0.05201449990272522, "rewards/bleu_reward_func/std": 0.03619503602385521, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 214.53125, "completions/mean_terminated_length": 204.9354705810547, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.1656, "grad_norm": 7.2300615310668945, "kl": 0.0164642333984375, "learning_rate": 1e-06, "loss": -0.0894, "num_tokens": 2782534.0, "reward": 0.05154382437467575, "reward_std": 0.02355325222015381, "rewards/bleu_reward_func/mean": 0.05154382437467575, "rewards/bleu_reward_func/std": 0.03372048959136009, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 247.4375, "completions/mean_terminated_length": 173.36000061035156, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.1664, "grad_norm": 4.72393274307251, "kl": 0.02191162109375, "learning_rate": 1e-06, "loss": 0.0464, "num_tokens": 2795668.0, "reward": 0.08009414374828339, "reward_std": 0.04780849814414978, "rewards/bleu_reward_func/mean": 0.08009414374828339, "rewards/bleu_reward_func/std": 0.11779887974262238, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 351.59375, "completions/mean_terminated_length": 306.67999267578125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.1672, "grad_norm": 3.04226016998291, "kl": 0.0213470458984375, "learning_rate": 1e-06, "loss": 0.1197, "num_tokens": 2809247.0, "reward": 0.07256356626749039, "reward_std": 0.018727965652942657, "rewards/bleu_reward_func/mean": 0.07256356626749039, "rewards/bleu_reward_func/std": 0.03669372946023941, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 225.25, "completions/mean_terminated_length": 184.2857208251953, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.168, "grad_norm": 4.737229824066162, "kl": 0.02362060546875, "learning_rate": 1e-06, "loss": 0.1422, "num_tokens": 2818399.0, "reward": 0.035233426839113235, "reward_std": 0.01539241336286068, "rewards/bleu_reward_func/mean": 0.035233426839113235, "rewards/bleu_reward_func/std": 0.018226031213998795, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 352.84375, "completions/mean_terminated_length": 243.94737243652344, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.1688, "grad_norm": 3.0926690101623535, "kl": 0.017730712890625, "learning_rate": 1e-06, "loss": -0.0648, "num_tokens": 2833410.0, "reward": 0.05755448341369629, "reward_std": 0.025512943044304848, "rewards/bleu_reward_func/mean": 0.05755448341369629, "rewards/bleu_reward_func/std": 0.09508957713842392, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 330.6875, "completions/mean_terminated_length": 288.8461608886719, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.1696, "grad_norm": 2.5703189373016357, "kl": 0.01244354248046875, "learning_rate": 1e-06, "loss": 0.0727, "num_tokens": 2848008.0, "reward": 0.1348918080329895, "reward_std": 0.10722550749778748, "rewards/bleu_reward_func/mean": 0.1348918080329895, "rewards/bleu_reward_func/std": 0.20862343907356262, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 187.375, "completions/mean_terminated_length": 187.375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.1704, "grad_norm": 3.83489990234375, "kl": 0.0211029052734375, "learning_rate": 1e-06, "loss": -0.1678, "num_tokens": 2856124.0, "reward": 0.08521190285682678, "reward_std": 0.04054812341928482, "rewards/bleu_reward_func/mean": 0.08521190285682678, "rewards/bleu_reward_func/std": 0.050891559571027756, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 325.4375, "completions/mean_terminated_length": 273.1999816894531, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.1712, "grad_norm": 2.281848192214966, "kl": 0.0146026611328125, "learning_rate": 1e-06, "loss": 0.0584, "num_tokens": 2868610.0, "reward": 0.03156504034996033, "reward_std": 0.00848651397973299, "rewards/bleu_reward_func/mean": 0.03156504034996033, "rewards/bleu_reward_func/std": 0.024025922641158104, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 391.125, "completions/mean_terminated_length": 254.1333465576172, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.172, "grad_norm": 2.294666290283203, "kl": 0.0182952880859375, "learning_rate": 1e-06, "loss": -0.023, "num_tokens": 2884318.0, "reward": 0.06922988593578339, "reward_std": 0.021780148148536682, "rewards/bleu_reward_func/mean": 0.06922988593578339, "rewards/bleu_reward_func/std": 0.060424305498600006, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 319.6875, "completions/mean_terminated_length": 292.21429443359375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.1728, "grad_norm": 2.571230411529541, "kl": 0.016632080078125, "learning_rate": 1e-06, "loss": 0.1612, "num_tokens": 2896860.0, "reward": 0.09209141135215759, "reward_std": 0.04961652681231499, "rewards/bleu_reward_func/mean": 0.09209141135215759, "rewards/bleu_reward_func/std": 0.0983605682849884, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 201.0625, "completions/mean_terminated_length": 156.6428680419922, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.1736, "grad_norm": 4.5812458992004395, "kl": 0.0264892578125, "learning_rate": 1e-06, "loss": -0.0075, "num_tokens": 2910246.0, "reward": 0.03037147969007492, "reward_std": 0.01431269571185112, "rewards/bleu_reward_func/mean": 0.03037147969007492, "rewards/bleu_reward_func/std": 0.018613692373037338, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 312.5, "completions/mean_terminated_length": 176.0, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.1744, "grad_norm": 2.713181734085083, "kl": 0.0229949951171875, "learning_rate": 1e-06, "loss": -0.0948, "num_tokens": 2923078.0, "reward": 0.026773083955049515, "reward_std": 0.013973203487694263, "rewards/bleu_reward_func/mean": 0.026773083955049515, "rewards/bleu_reward_func/std": 0.018786389380693436, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 379.125, "completions/mean_terminated_length": 246.25, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.1752, "grad_norm": 2.995640516281128, "kl": 0.01776123046875, "learning_rate": 1e-06, "loss": -0.0257, "num_tokens": 2938594.0, "reward": 0.046795397996902466, "reward_std": 0.032065290957689285, "rewards/bleu_reward_func/mean": 0.046795397996902466, "rewards/bleu_reward_func/std": 0.036055758595466614, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 256.875, "completions/mean_terminated_length": 239.86668395996094, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.176, "grad_norm": 3.257596492767334, "kl": 0.020233154296875, "learning_rate": 1e-06, "loss": -0.1987, "num_tokens": 2950486.0, "reward": 0.04207265004515648, "reward_std": 0.015948571264743805, "rewards/bleu_reward_func/mean": 0.04207265004515648, "rewards/bleu_reward_func/std": 0.02378344163298607, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 303.3125, "completions/mean_terminated_length": 221.6521759033203, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.1768, "grad_norm": 2.1766207218170166, "kl": 0.01788330078125, "learning_rate": 1e-06, "loss": -0.0255, "num_tokens": 2963320.0, "reward": 0.02319950982928276, "reward_std": 0.02059568464756012, "rewards/bleu_reward_func/mean": 0.02319950982928276, "rewards/bleu_reward_func/std": 0.024123726412653923, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 249.3125, "completions/mean_terminated_length": 231.80001831054688, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.1776, "grad_norm": 3.291715621948242, "kl": 0.018707275390625, "learning_rate": 1e-06, "loss": -0.0471, "num_tokens": 2973706.0, "reward": 0.03604161739349365, "reward_std": 0.0192702729254961, "rewards/bleu_reward_func/mean": 0.03604161739349365, "rewards/bleu_reward_func/std": 0.03049510158598423, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 253.03125, "completions/mean_terminated_length": 253.03125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.1784, "grad_norm": 4.5163798332214355, "kl": 0.0269317626953125, "learning_rate": 1e-06, "loss": 0.1078, "num_tokens": 2984139.0, "reward": 0.08039151877164841, "reward_std": 0.03706767037510872, "rewards/bleu_reward_func/mean": 0.08039151877164841, "rewards/bleu_reward_func/std": 0.08994851261377335, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 314.8125, "completions/mean_terminated_length": 249.08334350585938, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.1792, "grad_norm": 2.842648506164551, "kl": 0.0249176025390625, "learning_rate": 1e-06, "loss": 0.0148, "num_tokens": 2997189.0, "reward": 0.05913674458861351, "reward_std": 0.020169682800769806, "rewards/bleu_reward_func/mean": 0.05913674458861351, "rewards/bleu_reward_func/std": 0.03278661519289017, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 325.4375, "completions/mean_terminated_length": 114.00000762939453, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.18, "grad_norm": 3.618607521057129, "kl": 0.026611328125, "learning_rate": 1e-06, "loss": -0.0644, "num_tokens": 3012395.0, "reward": 0.06164587289094925, "reward_std": 0.038472697138786316, "rewards/bleu_reward_func/mean": 0.06164587289094925, "rewards/bleu_reward_func/std": 0.07648678123950958, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 192.96875, "completions/mean_terminated_length": 192.96875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.1808, "grad_norm": 4.200687408447266, "kl": 0.02935791015625, "learning_rate": 1e-06, "loss": 0.2838, "num_tokens": 3020850.0, "reward": 0.06510348618030548, "reward_std": 0.03152220696210861, "rewards/bleu_reward_func/mean": 0.06510348618030548, "rewards/bleu_reward_func/std": 0.05109791085124016, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 339.53125, "completions/mean_terminated_length": 299.73077392578125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.1816, "grad_norm": 2.5614731311798096, "kl": 0.020538330078125, "learning_rate": 1e-06, "loss": -0.0955, "num_tokens": 3034523.0, "reward": 0.02740243449807167, "reward_std": 0.013225449249148369, "rewards/bleu_reward_func/mean": 0.02740243449807167, "rewards/bleu_reward_func/std": 0.022882074117660522, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 195.5625, "completions/mean_terminated_length": 195.5625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1824, "grad_norm": 3.117429494857788, "kl": 0.027130126953125, "learning_rate": 1e-06, "loss": 0.0859, "num_tokens": 3046133.0, "reward": 0.04862482473254204, "reward_std": 0.032265372574329376, "rewards/bleu_reward_func/mean": 0.04862482473254204, "rewards/bleu_reward_func/std": 0.050319138914346695, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 256.40625, "completions/mean_terminated_length": 239.36668395996094, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.1832, "grad_norm": 2.8993618488311768, "kl": 0.0225982666015625, "learning_rate": 1e-06, "loss": -0.021, "num_tokens": 3058066.0, "reward": 0.07180735468864441, "reward_std": 0.03843909874558449, "rewards/bleu_reward_func/mean": 0.07180735468864441, "rewards/bleu_reward_func/std": 0.09140986949205399, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 271.0625, "completions/mean_terminated_length": 126.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.184, "grad_norm": 7.003058910369873, "kl": 0.0565185546875, "learning_rate": 1e-06, "loss": -0.0465, "num_tokens": 3070092.0, "reward": 0.05519847571849823, "reward_std": 0.015686171129345894, "rewards/bleu_reward_func/mean": 0.05519847571849823, "rewards/bleu_reward_func/std": 0.02879628911614418, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 334.15625, "completions/mean_terminated_length": 315.75860595703125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.1848, "grad_norm": 2.597579002380371, "kl": 0.021514892578125, "learning_rate": 1e-06, "loss": -0.011, "num_tokens": 3083105.0, "reward": 0.057620543986558914, "reward_std": 0.02059962786734104, "rewards/bleu_reward_func/mean": 0.057620543986558914, "rewards/bleu_reward_func/std": 0.03736231103539467, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 198.03125, "completions/mean_terminated_length": 177.10000610351562, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.1856, "grad_norm": 3.682512044906616, "kl": 0.02569580078125, "learning_rate": 1e-06, "loss": 0.2759, "num_tokens": 3091698.0, "reward": 0.052232254296541214, "reward_std": 0.035380616784095764, "rewards/bleu_reward_func/mean": 0.052232254296541214, "rewards/bleu_reward_func/std": 0.06341397017240524, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 316.3125, "completions/mean_terminated_length": 310.0, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.1864, "grad_norm": 2.698293685913086, "kl": 0.0152130126953125, "learning_rate": 1e-06, "loss": 0.1178, "num_tokens": 3104156.0, "reward": 0.0916142389178276, "reward_std": 0.04101229086518288, "rewards/bleu_reward_func/mean": 0.0916142389178276, "rewards/bleu_reward_func/std": 0.10220352560281754, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 232.4375, "completions/mean_terminated_length": 203.51724243164062, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.1872, "grad_norm": 3.543837785720825, "kl": 0.023590087890625, "learning_rate": 1e-06, "loss": 0.0425, "num_tokens": 3113882.0, "reward": 0.035266127437353134, "reward_std": 0.011640656739473343, "rewards/bleu_reward_func/mean": 0.035266127437353134, "rewards/bleu_reward_func/std": 0.022689029574394226, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 340.8125, "completions/mean_terminated_length": 335.2903137207031, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.188, "grad_norm": 2.4419240951538086, "kl": 0.0162506103515625, "learning_rate": 1e-06, "loss": -0.0465, "num_tokens": 3129372.0, "reward": 0.08140328526496887, "reward_std": 0.03153820335865021, "rewards/bleu_reward_func/mean": 0.08140328526496887, "rewards/bleu_reward_func/std": 0.09040741622447968, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 149.8125, "completions/mean_terminated_length": 125.66667175292969, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.1888, "grad_norm": 5.230119705200195, "kl": 0.0335540771484375, "learning_rate": 1e-06, "loss": -0.1208, "num_tokens": 3137742.0, "reward": 0.08085089921951294, "reward_std": 0.06161949411034584, "rewards/bleu_reward_func/mean": 0.08085089921951294, "rewards/bleu_reward_func/std": 0.08157042413949966, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 218.96875, "completions/mean_terminated_length": 218.96875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.1896, "grad_norm": 3.527757406234741, "kl": 0.0172576904296875, "learning_rate": 1e-06, "loss": -0.0976, "num_tokens": 3147781.0, "reward": 0.10917741060256958, "reward_std": 0.04233718663454056, "rewards/bleu_reward_func/mean": 0.10917741060256958, "rewards/bleu_reward_func/std": 0.09499745815992355, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 418.53125, "completions/mean_terminated_length": 376.04547119140625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.1904, "grad_norm": 2.3092899322509766, "kl": 0.0175323486328125, "learning_rate": 1e-06, "loss": 0.0596, "num_tokens": 3163734.0, "reward": 0.0354929119348526, "reward_std": 0.016294876113533974, "rewards/bleu_reward_func/mean": 0.0354929119348526, "rewards/bleu_reward_func/std": 0.01924656331539154, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 296.5, "completions/mean_terminated_length": 282.13336181640625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.1912, "grad_norm": 2.64257550239563, "kl": 0.020233154296875, "learning_rate": 1e-06, "loss": -0.0343, "num_tokens": 3176118.0, "reward": 0.05447524040937424, "reward_std": 0.01689964160323143, "rewards/bleu_reward_func/mean": 0.05447524040937424, "rewards/bleu_reward_func/std": 0.03051072731614113, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 311.71875, "completions/mean_terminated_length": 291.0, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.192, "grad_norm": 2.7584307193756104, "kl": 0.020416259765625, "learning_rate": 1e-06, "loss": -0.1264, "num_tokens": 3190469.0, "reward": 0.13415664434432983, "reward_std": 0.0733218789100647, "rewards/bleu_reward_func/mean": 0.13415664434432983, "rewards/bleu_reward_func/std": 0.11462453752756119, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 289.5, "completions/mean_terminated_length": 238.1538543701172, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.1928, "grad_norm": 3.149303913116455, "kl": 0.0215301513671875, "learning_rate": 1e-06, "loss": 0.0126, "num_tokens": 3202589.0, "reward": 0.10199414938688278, "reward_std": 0.044677168130874634, "rewards/bleu_reward_func/mean": 0.10199414938688278, "rewards/bleu_reward_func/std": 0.14810438454151154, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 391.21875, "completions/mean_terminated_length": 318.75, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.1936, "grad_norm": 2.327310800552368, "kl": 0.0197906494140625, "learning_rate": 1e-06, "loss": -0.0446, "num_tokens": 3220500.0, "reward": 0.04337170720100403, "reward_std": 0.020304495468735695, "rewards/bleu_reward_func/mean": 0.04337170720100403, "rewards/bleu_reward_func/std": 0.0399162657558918, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 404.375, "completions/mean_terminated_length": 309.4117736816406, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.1944, "grad_norm": 2.4237332344055176, "kl": 0.023651123046875, "learning_rate": 1e-06, "loss": -0.0035, "num_tokens": 3236000.0, "reward": 0.059977754950523376, "reward_std": 0.022927038371562958, "rewards/bleu_reward_func/mean": 0.059977754950523376, "rewards/bleu_reward_func/std": 0.032850753515958786, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 352.84375, "completions/mean_terminated_length": 308.2799987792969, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.1952, "grad_norm": 2.4127137660980225, "kl": 0.01862335205078125, "learning_rate": 1e-06, "loss": -0.0088, "num_tokens": 3252163.0, "reward": 0.13270705938339233, "reward_std": 0.03011954203248024, "rewards/bleu_reward_func/mean": 0.13270705938339233, "rewards/bleu_reward_func/std": 0.10800201445817947, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 257.0625, "completions/mean_terminated_length": 248.8386993408203, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.196, "grad_norm": 4.602512359619141, "kl": 0.02532958984375, "learning_rate": 1e-06, "loss": 0.0806, "num_tokens": 3266605.0, "reward": 0.03341788053512573, "reward_std": 0.013418522663414478, "rewards/bleu_reward_func/mean": 0.03341788053512573, "rewards/bleu_reward_func/std": 0.021642079576849937, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 353.09375, "completions/mean_terminated_length": 257.75, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.1968, "grad_norm": 2.3705356121063232, "kl": 0.02313232421875, "learning_rate": 1e-06, "loss": 0.03, "num_tokens": 3280688.0, "reward": 0.06873345375061035, "reward_std": 0.040343694388866425, "rewards/bleu_reward_func/mean": 0.06873345375061035, "rewards/bleu_reward_func/std": 0.056226469576358795, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 370.4375, "completions/mean_terminated_length": 315.0434875488281, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.1976, "grad_norm": 2.6424918174743652, "kl": 0.025970458984375, "learning_rate": 1e-06, "loss": -0.044, "num_tokens": 3295534.0, "reward": 0.04273587465286255, "reward_std": 0.014303158968687057, "rewards/bleu_reward_func/mean": 0.04273587465286255, "rewards/bleu_reward_func/std": 0.021357977762818336, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 382.0625, "completions/mean_terminated_length": 252.125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.1984, "grad_norm": 2.337956190109253, "kl": 0.0235137939453125, "learning_rate": 1e-06, "loss": -0.0092, "num_tokens": 3310872.0, "reward": 0.037085238844156265, "reward_std": 0.0252089761197567, "rewards/bleu_reward_func/mean": 0.037085238844156265, "rewards/bleu_reward_func/std": 0.034677691757678986, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 253.9375, "completions/mean_terminated_length": 206.1481475830078, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.1992, "grad_norm": 3.5819602012634277, "kl": 0.0323333740234375, "learning_rate": 1e-06, "loss": 0.0387, "num_tokens": 3322966.0, "reward": 0.06450790166854858, "reward_std": 0.022195765748620033, "rewards/bleu_reward_func/mean": 0.06450790166854858, "rewards/bleu_reward_func/std": 0.054868634790182114, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 360.875, "completions/mean_terminated_length": 281.71429443359375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.2, "grad_norm": 2.814183473587036, "kl": 0.0179443359375, "learning_rate": 1e-06, "loss": -0.1752, "num_tokens": 3337362.0, "reward": 0.039325565099716187, "reward_std": 0.025641005486249924, "rewards/bleu_reward_func/mean": 0.039325565099716187, "rewards/bleu_reward_func/std": 0.04151046276092529, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 344.65625, "completions/mean_terminated_length": 279.1739196777344, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.2008, "grad_norm": 2.49495005607605, "kl": 0.01953125, "learning_rate": 1e-06, "loss": 0.0217, "num_tokens": 3352271.0, "reward": 0.04269051179289818, "reward_std": 0.020738966763019562, "rewards/bleu_reward_func/mean": 0.04269051179289818, "rewards/bleu_reward_func/std": 0.02881108783185482, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 333.28125, "completions/mean_terminated_length": 252.0454559326172, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2016, "grad_norm": 4.844610214233398, "kl": 0.0460662841796875, "learning_rate": 1e-06, "loss": 0.0077, "num_tokens": 3366952.0, "reward": 0.02587553858757019, "reward_std": 0.01308115478605032, "rewards/bleu_reward_func/mean": 0.02587553858757019, "rewards/bleu_reward_func/std": 0.02540062554180622, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 136.1875, "completions/mean_terminated_length": 124.06451416015625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.2024, "grad_norm": 8.286824226379395, "kl": 0.0217132568359375, "learning_rate": 1e-06, "loss": -0.2455, "num_tokens": 3373646.0, "reward": 0.03720610588788986, "reward_std": 0.02497956156730652, "rewards/bleu_reward_func/mean": 0.03720610588788986, "rewards/bleu_reward_func/std": 0.04735667258501053, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 273.28125, "completions/mean_terminated_length": 273.28125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.2032, "grad_norm": 3.4898288249969482, "kl": 0.0250244140625, "learning_rate": 1e-06, "loss": -0.0109, "num_tokens": 3386335.0, "reward": 0.07081638276576996, "reward_std": 0.028427409008145332, "rewards/bleu_reward_func/mean": 0.07081638276576996, "rewards/bleu_reward_func/std": 0.05091365799307823, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 295.15625, "completions/mean_terminated_length": 295.15625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.204, "grad_norm": 2.3208115100860596, "kl": 0.0214996337890625, "learning_rate": 1e-06, "loss": -0.1527, "num_tokens": 3397868.0, "reward": 0.09917673468589783, "reward_std": 0.0416448600590229, "rewards/bleu_reward_func/mean": 0.09917673468589783, "rewards/bleu_reward_func/std": 0.08467306196689606, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 210.375, "completions/mean_terminated_length": 200.64515686035156, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.2048, "grad_norm": 3.7243247032165527, "kl": 0.02008056640625, "learning_rate": 1e-06, "loss": 0.0121, "num_tokens": 3406896.0, "reward": 0.09600116312503815, "reward_std": 0.04207791015505791, "rewards/bleu_reward_func/mean": 0.09600116312503815, "rewards/bleu_reward_func/std": 0.11218695342540741, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 416.78125, "completions/mean_terminated_length": 359.6499938964844, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.2056, "grad_norm": 2.2771170139312744, "kl": 0.0246124267578125, "learning_rate": 1e-06, "loss": -0.0175, "num_tokens": 3423553.0, "reward": 0.09614823013544083, "reward_std": 0.024631768465042114, "rewards/bleu_reward_func/mean": 0.09614823013544083, "rewards/bleu_reward_func/std": 0.05439407005906105, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 347.0, "completions/mean_terminated_length": 218.6666717529297, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.2064, "grad_norm": 3.226184844970703, "kl": 0.030609130859375, "learning_rate": 1e-06, "loss": -0.0879, "num_tokens": 3439921.0, "reward": 0.04686765745282173, "reward_std": 0.020355040207505226, "rewards/bleu_reward_func/mean": 0.04686765745282173, "rewards/bleu_reward_func/std": 0.0473078228533268, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 297.75, "completions/mean_terminated_length": 258.0740661621094, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.2072, "grad_norm": 2.7204718589782715, "kl": 0.02374267578125, "learning_rate": 1e-06, "loss": 0.2345, "num_tokens": 3451953.0, "reward": 0.07326146960258484, "reward_std": 0.055320855230093, "rewards/bleu_reward_func/mean": 0.07326146960258484, "rewards/bleu_reward_func/std": 0.10083870589733124, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 168.03125, "completions/mean_terminated_length": 168.03125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.208, "grad_norm": 4.883479595184326, "kl": 0.07550048828125, "learning_rate": 1e-06, "loss": -0.1051, "num_tokens": 3462906.0, "reward": 0.06956590712070465, "reward_std": 0.03378972038626671, "rewards/bleu_reward_func/mean": 0.06956590712070465, "rewards/bleu_reward_func/std": 0.05879009887576103, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 383.0, "completions/mean_terminated_length": 315.4285888671875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.2088, "grad_norm": 2.163914918899536, "kl": 0.022429943084716797, "learning_rate": 1e-06, "loss": 0.0093, "num_tokens": 3480722.0, "reward": 0.12491203844547272, "reward_std": 0.10957963019609451, "rewards/bleu_reward_func/mean": 0.12491203844547272, "rewards/bleu_reward_func/std": 0.22631219029426575, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 217.03125, "completions/mean_terminated_length": 186.51724243164062, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.2096, "grad_norm": 4.656850814819336, "kl": 0.046905517578125, "learning_rate": 1e-06, "loss": 0.1128, "num_tokens": 3490139.0, "reward": 0.03650316223502159, "reward_std": 0.01834101229906082, "rewards/bleu_reward_func/mean": 0.03650316223502159, "rewards/bleu_reward_func/std": 0.019939929246902466, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 170.46875, "completions/mean_terminated_length": 121.67857360839844, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.2104, "grad_norm": 4.333711624145508, "kl": 0.0684814453125, "learning_rate": 1e-06, "loss": -0.0531, "num_tokens": 3500210.0, "reward": 0.12142158299684525, "reward_std": 0.051336318254470825, "rewards/bleu_reward_func/mean": 0.12142158299684525, "rewards/bleu_reward_func/std": 0.11394964903593063, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 313.09375, "completions/mean_terminated_length": 222.68182373046875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.2112, "grad_norm": 4.1456618309021, "kl": 0.040740966796875, "learning_rate": 1e-06, "loss": -0.0505, "num_tokens": 3512237.0, "reward": 0.04106439650058746, "reward_std": 0.010877052322030067, "rewards/bleu_reward_func/mean": 0.04106439650058746, "rewards/bleu_reward_func/std": 0.029797548428177834, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 325.78125, "completions/mean_terminated_length": 273.6399841308594, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.212, "grad_norm": 2.98254656791687, "kl": 0.0294189453125, "learning_rate": 1e-06, "loss": -0.0504, "num_tokens": 3525790.0, "reward": 0.0902123674750328, "reward_std": 0.02512788400053978, "rewards/bleu_reward_func/mean": 0.0902123674750328, "rewards/bleu_reward_func/std": 0.09073200821876526, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 459.65625, "completions/mean_terminated_length": 325.8888854980469, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.2128, "grad_norm": 2.1296770572662354, "kl": 0.0257110595703125, "learning_rate": 1e-06, "loss": 0.0355, "num_tokens": 3543467.0, "reward": 0.0487542450428009, "reward_std": 0.01123578380793333, "rewards/bleu_reward_func/mean": 0.0487542450428009, "rewards/bleu_reward_func/std": 0.0231720469892025, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 248.03125, "completions/mean_terminated_length": 187.11538696289062, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.2136, "grad_norm": 8.958330154418945, "kl": 0.0360107421875, "learning_rate": 1e-06, "loss": 0.1156, "num_tokens": 3553948.0, "reward": 0.07910416275262833, "reward_std": 0.015652041882276535, "rewards/bleu_reward_func/mean": 0.07910416275262833, "rewards/bleu_reward_func/std": 0.07359592616558075, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 257.96875, "completions/mean_terminated_length": 231.6896514892578, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.2144, "grad_norm": 2.6220574378967285, "kl": 0.030303955078125, "learning_rate": 1e-06, "loss": 0.1163, "num_tokens": 3566659.0, "reward": 0.04102238267660141, "reward_std": 0.01749418117105961, "rewards/bleu_reward_func/mean": 0.04102238267660141, "rewards/bleu_reward_func/std": 0.02520221658051014, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 300.5625, "completions/mean_terminated_length": 230.08334350585938, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.2152, "grad_norm": 3.4121954441070557, "kl": 0.01523590087890625, "learning_rate": 1e-06, "loss": -0.0559, "num_tokens": 3579605.0, "reward": 0.05740036815404892, "reward_std": 0.023826539516448975, "rewards/bleu_reward_func/mean": 0.05740036815404892, "rewards/bleu_reward_func/std": 0.0372060090303421, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 360.9375, "completions/mean_terminated_length": 227.64706420898438, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.216, "grad_norm": 2.156470775604248, "kl": 0.031585693359375, "learning_rate": 1e-06, "loss": -0.2085, "num_tokens": 3595707.0, "reward": 0.03448399901390076, "reward_std": 0.01828095316886902, "rewards/bleu_reward_func/mean": 0.03448399901390076, "rewards/bleu_reward_func/std": 0.02368060126900673, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 352.5625, "completions/mean_terminated_length": 211.88235473632812, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.2168, "grad_norm": 2.4653913974761963, "kl": 0.027069091796875, "learning_rate": 1e-06, "loss": -0.0934, "num_tokens": 3611005.0, "reward": 0.032711900770664215, "reward_std": 0.010354666039347649, "rewards/bleu_reward_func/mean": 0.032711900770664215, "rewards/bleu_reward_func/std": 0.013180587440729141, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 233.1875, "completions/mean_terminated_length": 155.1199951171875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.2176, "grad_norm": 3.7216508388519287, "kl": 0.039306640625, "learning_rate": 1e-06, "loss": 0.0428, "num_tokens": 3623131.0, "reward": 0.025905201211571693, "reward_std": 0.017245225608348846, "rewards/bleu_reward_func/mean": 0.025905201211571693, "rewards/bleu_reward_func/std": 0.02249467745423317, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 362.0625, "completions/mean_terminated_length": 229.76470947265625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.2184, "grad_norm": 2.705111265182495, "kl": 0.039459228515625, "learning_rate": 1e-06, "loss": -0.0603, "num_tokens": 3641261.0, "reward": 0.04823341965675354, "reward_std": 0.017576558515429497, "rewards/bleu_reward_func/mean": 0.04823341965675354, "rewards/bleu_reward_func/std": 0.03309940919280052, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 273.0625, "completions/mean_terminated_length": 179.56521606445312, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.2192, "grad_norm": 7.897398471832275, "kl": 0.030792236328125, "learning_rate": 1e-06, "loss": -0.1464, "num_tokens": 3653487.0, "reward": 0.02417801320552826, "reward_std": 0.012017752975225449, "rewards/bleu_reward_func/mean": 0.02417801320552826, "rewards/bleu_reward_func/std": 0.018230969086289406, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 321.28125, "completions/mean_terminated_length": 246.6521759033203, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.22, "grad_norm": 3.6866021156311035, "kl": 0.028167724609375, "learning_rate": 1e-06, "loss": 0.0214, "num_tokens": 3667096.0, "reward": 0.10455590486526489, "reward_std": 0.04352106153964996, "rewards/bleu_reward_func/mean": 0.10455590486526489, "rewards/bleu_reward_func/std": 0.07865530997514725, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 236.375, "completions/mean_terminated_length": 218.00001525878906, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.2208, "grad_norm": 3.3426926136016846, "kl": 0.02593994140625, "learning_rate": 1e-06, "loss": 0.0467, "num_tokens": 3677572.0, "reward": 0.07310892641544342, "reward_std": 0.049130503088235855, "rewards/bleu_reward_func/mean": 0.07310892641544342, "rewards/bleu_reward_func/std": 0.0947326272726059, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 443.9375, "completions/mean_terminated_length": 408.2857360839844, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.2216, "grad_norm": 2.185582399368286, "kl": 0.0277252197265625, "learning_rate": 1e-06, "loss": 0.0697, "num_tokens": 3693906.0, "reward": 0.0204878319054842, "reward_std": 0.0076804393902421, "rewards/bleu_reward_func/mean": 0.0204878319054842, "rewards/bleu_reward_func/std": 0.010448083281517029, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 179.8125, "completions/mean_terminated_length": 179.8125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.2224, "grad_norm": 3.5973546504974365, "kl": 0.03302001953125, "learning_rate": 1e-06, "loss": 0.1902, "num_tokens": 3701988.0, "reward": 0.05479752644896507, "reward_std": 0.02715984173119068, "rewards/bleu_reward_func/mean": 0.05479752644896507, "rewards/bleu_reward_func/std": 0.046242304146289825, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 512.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 390.65625, "completions/mean_terminated_length": 213.3076934814453, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.2232, "grad_norm": 2.381040096282959, "kl": 0.029388427734375, "learning_rate": 1e-06, "loss": 0.0447, "num_tokens": 3717433.0, "reward": 0.0374862439930439, "reward_std": 0.013264529407024384, "rewards/bleu_reward_func/mean": 0.0374862439930439, "rewards/bleu_reward_func/std": 0.027201348915696144, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 332.8125, "completions/mean_terminated_length": 262.6956481933594, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.224, "grad_norm": 2.9090490341186523, "kl": 0.032562255859375, "learning_rate": 1e-06, "loss": 0.0125, "num_tokens": 3730611.0, "reward": 0.04209320247173309, "reward_std": 0.012306570075452328, "rewards/bleu_reward_func/mean": 0.04209320247173309, "rewards/bleu_reward_func/std": 0.03222496807575226, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 223.0625, "completions/mean_terminated_length": 181.7857208251953, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.2248, "grad_norm": 3.2262182235717773, "kl": 0.03692626953125, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 3739557.0, "reward": 0.028793197125196457, "reward_std": 0.009944088757038116, "rewards/bleu_reward_func/mean": 0.028793197125196457, "rewards/bleu_reward_func/std": 0.011341102421283722, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 512.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 435.8125, "completions/mean_terminated_length": 268.20001220703125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.2256, "grad_norm": 2.236760139465332, "kl": 0.032806396484375, "learning_rate": 1e-06, "loss": -0.086, "num_tokens": 3760695.0, "reward": 0.044973913580179214, "reward_std": 0.010784904472529888, "rewards/bleu_reward_func/mean": 0.044973913580179214, "rewards/bleu_reward_func/std": 0.014913774095475674, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 345.84375, "completions/mean_terminated_length": 299.32000732421875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.2264, "grad_norm": 2.4806909561157227, "kl": 0.031829833984375, "learning_rate": 1e-06, "loss": 0.0659, "num_tokens": 3773914.0, "reward": 0.04734322056174278, "reward_std": 0.01934235356748104, "rewards/bleu_reward_func/mean": 0.04734322056174278, "rewards/bleu_reward_func/std": 0.030831577256321907, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 315.5625, "completions/mean_terminated_length": 162.7777862548828, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.2272, "grad_norm": 3.51826810836792, "kl": 0.039093017578125, "learning_rate": 1e-06, "loss": 0.0564, "num_tokens": 3787564.0, "reward": 0.05522763729095459, "reward_std": 0.02003159187734127, "rewards/bleu_reward_func/mean": 0.05522763729095459, "rewards/bleu_reward_func/std": 0.046319931745529175, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 411.59375, "completions/mean_terminated_length": 282.5, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.228, "grad_norm": 2.7587406635284424, "kl": 0.032806396484375, "learning_rate": 1e-06, "loss": 0.0267, "num_tokens": 3803919.0, "reward": 0.033719129860401154, "reward_std": 0.00807010754942894, "rewards/bleu_reward_func/mean": 0.033719129860401154, "rewards/bleu_reward_func/std": 0.013236233964562416, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 170.625, "completions/mean_terminated_length": 170.625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.2288, "grad_norm": 3.637995481491089, "kl": 0.0235595703125, "learning_rate": 1e-06, "loss": 0.0381, "num_tokens": 3812531.0, "reward": 0.06265231966972351, "reward_std": 0.023745257407426834, "rewards/bleu_reward_func/mean": 0.06265231966972351, "rewards/bleu_reward_func/std": 0.04604180529713631, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 351.6875, "completions/mean_terminated_length": 267.71429443359375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.2296, "grad_norm": 2.835132122039795, "kl": 0.03070068359375, "learning_rate": 1e-06, "loss": -0.0183, "num_tokens": 3826105.0, "reward": 0.042010486125946045, "reward_std": 0.02037208527326584, "rewards/bleu_reward_func/mean": 0.042010486125946045, "rewards/bleu_reward_func/std": 0.025461316108703613, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 286.28125, "completions/mean_terminated_length": 234.19232177734375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.2304, "grad_norm": 3.922866106033325, "kl": 0.030029296875, "learning_rate": 1e-06, "loss": 0.1685, "num_tokens": 3839346.0, "reward": 0.016506824642419815, "reward_std": 0.003976969514042139, "rewards/bleu_reward_func/mean": 0.016506824642419815, "rewards/bleu_reward_func/std": 0.008394270204007626, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 425.875, "completions/mean_terminated_length": 339.75, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.2312, "grad_norm": 2.1530535221099854, "kl": 0.031036376953125, "learning_rate": 1e-06, "loss": -0.0435, "num_tokens": 3856542.0, "reward": 0.04274771362543106, "reward_std": 0.015802588313817978, "rewards/bleu_reward_func/mean": 0.04274771362543106, "rewards/bleu_reward_func/std": 0.018804650753736496, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 264.90625, "completions/mean_terminated_length": 182.5416717529297, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.232, "grad_norm": 4.459621429443359, "kl": 0.03009033203125, "learning_rate": 1e-06, "loss": 0.0053, "num_tokens": 3869307.0, "reward": 0.09130969643592834, "reward_std": 0.04283912479877472, "rewards/bleu_reward_func/mean": 0.09130969643592834, "rewards/bleu_reward_func/std": 0.08340450376272202, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 303.34375, "completions/mean_terminated_length": 233.7916717529297, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.2328, "grad_norm": 2.758756160736084, "kl": 0.036041259765625, "learning_rate": 1e-06, "loss": -0.0283, "num_tokens": 3881046.0, "reward": 0.03966425359249115, "reward_std": 0.01337943784892559, "rewards/bleu_reward_func/mean": 0.03966425359249115, "rewards/bleu_reward_func/std": 0.020967189222574234, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 289.5, "completions/mean_terminated_length": 215.33334350585938, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.2336, "grad_norm": 4.92682409286499, "kl": 0.0416259765625, "learning_rate": 1e-06, "loss": 0.0587, "num_tokens": 3893494.0, "reward": 0.07078565657138824, "reward_std": 0.025623325258493423, "rewards/bleu_reward_func/mean": 0.07078565657138824, "rewards/bleu_reward_func/std": 0.04824245721101761, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 373.34375, "completions/mean_terminated_length": 278.47369384765625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.2344, "grad_norm": 2.9965121746063232, "kl": 0.0308837890625, "learning_rate": 1e-06, "loss": 0.0633, "num_tokens": 3908305.0, "reward": 0.04714567959308624, "reward_std": 0.011470139026641846, "rewards/bleu_reward_func/mean": 0.04714567959308624, "rewards/bleu_reward_func/std": 0.028922023251652718, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 363.625, "completions/mean_terminated_length": 285.9047546386719, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.2352, "grad_norm": 2.949458122253418, "kl": 0.034149169921875, "learning_rate": 1e-06, "loss": 0.1081, "num_tokens": 3924405.0, "reward": 0.04587670788168907, "reward_std": 0.02025657892227173, "rewards/bleu_reward_func/mean": 0.04587670788168907, "rewards/bleu_reward_func/std": 0.029254309833049774, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 403.65625, "completions/mean_terminated_length": 319.3888854980469, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.236, "grad_norm": 2.505204200744629, "kl": 0.0325927734375, "learning_rate": 1e-06, "loss": -0.0542, "num_tokens": 3939506.0, "reward": 0.03485488519072533, "reward_std": 0.014378003776073456, "rewards/bleu_reward_func/mean": 0.03485488519072533, "rewards/bleu_reward_func/std": 0.01973474584519863, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 309.125, "completions/mean_terminated_length": 216.9091033935547, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.2368, "grad_norm": 6.216697692871094, "kl": 0.031768798828125, "learning_rate": 1e-06, "loss": 0.059, "num_tokens": 3952214.0, "reward": 0.07422341406345367, "reward_std": 0.01819428987801075, "rewards/bleu_reward_func/mean": 0.07422341406345367, "rewards/bleu_reward_func/std": 0.07971075177192688, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 512.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 449.65625, "completions/mean_terminated_length": 262.625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.2376, "grad_norm": 2.2672359943389893, "kl": 0.028839111328125, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 3971571.0, "reward": 0.05678567662835121, "reward_std": 0.020236749202013016, "rewards/bleu_reward_func/mean": 0.05678567662835121, "rewards/bleu_reward_func/std": 0.0485292449593544, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 305.53125, "completions/mean_terminated_length": 257.8846130371094, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.2384, "grad_norm": 3.060343027114868, "kl": 0.036529541015625, "learning_rate": 1e-06, "loss": 0.0017, "num_tokens": 3984268.0, "reward": 0.09549540281295776, "reward_std": 0.02425481379032135, "rewards/bleu_reward_func/mean": 0.09549540281295776, "rewards/bleu_reward_func/std": 0.10033179074525833, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 359.90625, "completions/mean_terminated_length": 255.84210205078125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.2392, "grad_norm": 2.7860310077667236, "kl": 0.024871826171875, "learning_rate": 1e-06, "loss": 0.0637, "num_tokens": 3998681.0, "reward": 0.02910490334033966, "reward_std": 0.008882608264684677, "rewards/bleu_reward_func/mean": 0.02910490334033966, "rewards/bleu_reward_func/std": 0.012598116882145405, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 253.84375, "completions/mean_terminated_length": 236.6333465576172, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.24, "grad_norm": 3.2227437496185303, "kl": 0.035919189453125, "learning_rate": 1e-06, "loss": 0.0523, "num_tokens": 4008996.0, "reward": 0.03897559642791748, "reward_std": 0.011807247996330261, "rewards/bleu_reward_func/mean": 0.03897559642791748, "rewards/bleu_reward_func/std": 0.014171565882861614, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 264.65625, "completions/mean_terminated_length": 182.20834350585938, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.2408, "grad_norm": 3.2822182178497314, "kl": 0.02972412109375, "learning_rate": 1e-06, "loss": -0.0655, "num_tokens": 4021769.0, "reward": 0.12274128198623657, "reward_std": 0.06403186917304993, "rewards/bleu_reward_func/mean": 0.12274128198623657, "rewards/bleu_reward_func/std": 0.09856269508600235, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 467.09375, "completions/mean_terminated_length": 416.20001220703125, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.2416, "grad_norm": 2.018660306930542, "kl": 0.03399658203125, "learning_rate": 1e-06, "loss": -0.0454, "num_tokens": 4040132.0, "reward": 0.034933868795633316, "reward_std": 0.007452279329299927, "rewards/bleu_reward_func/mean": 0.034933868795633316, "rewards/bleu_reward_func/std": 0.02128129079937935, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 392.46875, "completions/mean_terminated_length": 257.0, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.2424, "grad_norm": 2.8577029705047607, "kl": 0.0274200439453125, "learning_rate": 1e-06, "loss": -0.0111, "num_tokens": 4056587.0, "reward": 0.02288379706442356, "reward_std": 0.006262771785259247, "rewards/bleu_reward_func/mean": 0.02288379706442356, "rewards/bleu_reward_func/std": 0.014597552828490734, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 191.6666717529297, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.2432, "grad_norm": 5.046367168426514, "kl": 0.02838134765625, "learning_rate": 1e-06, "loss": 0.0364, "num_tokens": 4067563.0, "reward": 0.02564316801726818, "reward_std": 0.011585518717765808, "rewards/bleu_reward_func/mean": 0.02564316801726818, "rewards/bleu_reward_func/std": 0.023773526772856712, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 191.95834350585938, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.244, "grad_norm": 3.2017996311187744, "kl": 0.0457763671875, "learning_rate": 1e-06, "loss": 0.0443, "num_tokens": 4080074.0, "reward": 0.048807431012392044, "reward_std": 0.01481578778475523, "rewards/bleu_reward_func/mean": 0.048807431012392044, "rewards/bleu_reward_func/std": 0.025570906698703766, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 265.21875, "completions/mean_terminated_length": 265.21875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.2448, "grad_norm": 2.6399717330932617, "kl": 0.025665283203125, "learning_rate": 1e-06, "loss": -0.0796, "num_tokens": 4091201.0, "reward": 0.04743821173906326, "reward_std": 0.01849541999399662, "rewards/bleu_reward_func/mean": 0.04743821173906326, "rewards/bleu_reward_func/std": 0.02645285800099373, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 277.875, "completions/mean_terminated_length": 186.26087951660156, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.2456, "grad_norm": 2.4667458534240723, "kl": 0.029388427734375, "learning_rate": 1e-06, "loss": 0.1769, "num_tokens": 4102253.0, "reward": 0.07918344438076019, "reward_std": 0.03603646531701088, "rewards/bleu_reward_func/mean": 0.07918344438076019, "rewards/bleu_reward_func/std": 0.09297043830156326, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 198.4375, "completions/mean_terminated_length": 177.53334045410156, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.2464, "grad_norm": 2.9031858444213867, "kl": 0.0194854736328125, "learning_rate": 1e-06, "loss": -0.0419, "num_tokens": 4111451.0, "reward": 0.05468355864286423, "reward_std": 0.02622107043862343, "rewards/bleu_reward_func/mean": 0.05468355864286423, "rewards/bleu_reward_func/std": 0.03912202641367912, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 294.65625, "completions/mean_terminated_length": 287.6451416015625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.2472, "grad_norm": 2.6340668201446533, "kl": 0.0333251953125, "learning_rate": 1e-06, "loss": 0.0643, "num_tokens": 4122936.0, "reward": 0.07767876982688904, "reward_std": 0.017796212807297707, "rewards/bleu_reward_func/mean": 0.07767876982688904, "rewards/bleu_reward_func/std": 0.08556719124317169, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 252.40625, "completions/mean_terminated_length": 252.40625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.248, "grad_norm": 3.0190749168395996, "kl": 0.0158843994140625, "learning_rate": 1e-06, "loss": -0.0605, "num_tokens": 4136381.0, "reward": 0.10290344059467316, "reward_std": 0.03325870633125305, "rewards/bleu_reward_func/mean": 0.10290344059467316, "rewards/bleu_reward_func/std": 0.06130888685584068, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 229.46875, "completions/mean_terminated_length": 210.6333465576172, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.2488, "grad_norm": 3.156555652618408, "kl": 0.024261474609375, "learning_rate": 1e-06, "loss": 0.0582, "num_tokens": 4146276.0, "reward": 0.074183389544487, "reward_std": 0.024614207446575165, "rewards/bleu_reward_func/mean": 0.074183389544487, "rewards/bleu_reward_func/std": 0.06735430657863617, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 342.3125, "completions/mean_terminated_length": 294.79998779296875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.2496, "grad_norm": 3.10429310798645, "kl": 0.02587890625, "learning_rate": 1e-06, "loss": 0.144, "num_tokens": 4160590.0, "reward": 0.05624938756227493, "reward_std": 0.01750083453953266, "rewards/bleu_reward_func/mean": 0.05624938756227493, "rewards/bleu_reward_func/std": 0.03703475371003151, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 327.625, "completions/mean_terminated_length": 217.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.2504, "grad_norm": 3.401366949081421, "kl": 0.04010009765625, "learning_rate": 1e-06, "loss": -0.1329, "num_tokens": 4175810.0, "reward": 0.06421037018299103, "reward_std": 0.023138659074902534, "rewards/bleu_reward_func/mean": 0.06421037018299103, "rewards/bleu_reward_func/std": 0.02672930806875229, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 287.46875, "completions/mean_terminated_length": 255.3928680419922, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.2512, "grad_norm": 2.5841426849365234, "kl": 0.020111083984375, "learning_rate": 1e-06, "loss": 0.1266, "num_tokens": 4187337.0, "reward": 0.06008291244506836, "reward_std": 0.03618035838007927, "rewards/bleu_reward_func/mean": 0.06008291244506836, "rewards/bleu_reward_func/std": 0.07106407731771469, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 265.46875, "completions/mean_terminated_length": 183.2916717529297, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.252, "grad_norm": 3.229022741317749, "kl": 0.03179931640625, "learning_rate": 1e-06, "loss": -0.0984, "num_tokens": 4201208.0, "reward": 0.07605750858783722, "reward_std": 0.02687455154955387, "rewards/bleu_reward_func/mean": 0.07605750858783722, "rewards/bleu_reward_func/std": 0.046578504145145416, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 360.25, "completions/mean_terminated_length": 300.86956787109375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.2528, "grad_norm": 2.501115560531616, "kl": 0.030731201171875, "learning_rate": 1e-06, "loss": -0.0733, "num_tokens": 4218152.0, "reward": 0.15790392458438873, "reward_std": 0.05716419219970703, "rewards/bleu_reward_func/mean": 0.15790392458438873, "rewards/bleu_reward_func/std": 0.18658459186553955, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 318.125, "completions/mean_terminated_length": 253.5, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.2536, "grad_norm": 3.5820846557617188, "kl": 0.0333251953125, "learning_rate": 1e-06, "loss": 0.0424, "num_tokens": 4231732.0, "reward": 0.08669077605009079, "reward_std": 0.023732244968414307, "rewards/bleu_reward_func/mean": 0.08669077605009079, "rewards/bleu_reward_func/std": 0.10351579636335373, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 252.09375, "completions/mean_terminated_length": 179.3199920654297, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.2544, "grad_norm": 4.425368785858154, "kl": 0.032501220703125, "learning_rate": 1e-06, "loss": -0.0886, "num_tokens": 4242311.0, "reward": 0.03214521333575249, "reward_std": 0.02095024473965168, "rewards/bleu_reward_func/mean": 0.03214521333575249, "rewards/bleu_reward_func/std": 0.03977763652801514, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 260.34375, "completions/mean_terminated_length": 202.2692413330078, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.2552, "grad_norm": 3.3167836666107178, "kl": 0.029937744140625, "learning_rate": 1e-06, "loss": 0.0631, "num_tokens": 4254082.0, "reward": 0.03737305477261543, "reward_std": 0.012050272896885872, "rewards/bleu_reward_func/mean": 0.03737305477261543, "rewards/bleu_reward_func/std": 0.028553711250424385, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 317.40625, "completions/mean_terminated_length": 262.91998291015625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.256, "grad_norm": 2.909987688064575, "kl": 0.031707763671875, "learning_rate": 1e-06, "loss": -0.0386, "num_tokens": 4266695.0, "reward": 0.039175860583782196, "reward_std": 0.009842153638601303, "rewards/bleu_reward_func/mean": 0.039175860583782196, "rewards/bleu_reward_func/std": 0.017322974279522896, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 287.28125, "completions/mean_terminated_length": 212.375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.2568, "grad_norm": 3.3348236083984375, "kl": 0.025726318359375, "learning_rate": 1e-06, "loss": 0.0681, "num_tokens": 4278376.0, "reward": 0.05241217091679573, "reward_std": 0.015721352770924568, "rewards/bleu_reward_func/mean": 0.05241217091679573, "rewards/bleu_reward_func/std": 0.03155159205198288, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 217.65625, "completions/mean_terminated_length": 119.54167175292969, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.2576, "grad_norm": 5.488730430603027, "kl": 0.05377197265625, "learning_rate": 1e-06, "loss": 0.0444, "num_tokens": 4287813.0, "reward": 0.09449569880962372, "reward_std": 0.05701503902673721, "rewards/bleu_reward_func/mean": 0.09449569880962372, "rewards/bleu_reward_func/std": 0.09492365270853043, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 441.4375, "completions/mean_terminated_length": 323.8333435058594, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.2584, "grad_norm": 2.3454339504241943, "kl": 0.023223876953125, "learning_rate": 1e-06, "loss": 0.0284, "num_tokens": 4306675.0, "reward": 0.0388575978577137, "reward_std": 0.012517341412603855, "rewards/bleu_reward_func/mean": 0.0388575978577137, "rewards/bleu_reward_func/std": 0.05242987349629402, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 278.90625, "completions/mean_terminated_length": 263.3666687011719, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.2592, "grad_norm": 2.8751518726348877, "kl": 0.025543212890625, "learning_rate": 1e-06, "loss": 0.0622, "num_tokens": 4317360.0, "reward": 0.032819002866744995, "reward_std": 0.010129611939191818, "rewards/bleu_reward_func/mean": 0.032819002866744995, "rewards/bleu_reward_func/std": 0.025955382734537125, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 332.84375, "completions/mean_terminated_length": 262.7391357421875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.26, "grad_norm": 7.335513591766357, "kl": 0.0263671875, "learning_rate": 1e-06, "loss": -0.0788, "num_tokens": 4333659.0, "reward": 0.08996531367301941, "reward_std": 0.03278956562280655, "rewards/bleu_reward_func/mean": 0.08996531367301941, "rewards/bleu_reward_func/std": 0.11964567005634308, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 293.90625, "completions/mean_terminated_length": 279.3666687011719, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.2608, "grad_norm": 2.7281105518341064, "kl": 0.02679443359375, "learning_rate": 1e-06, "loss": 0.0164, "num_tokens": 4346760.0, "reward": 0.035310667008161545, "reward_std": 0.01689826510846615, "rewards/bleu_reward_func/mean": 0.035310667008161545, "rewards/bleu_reward_func/std": 0.024357853457331657, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 395.4375, "completions/mean_terminated_length": 263.3333435058594, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.2616, "grad_norm": 2.143420457839966, "kl": 0.0240478515625, "learning_rate": 1e-06, "loss": 0.0552, "num_tokens": 4362278.0, "reward": 0.026128560304641724, "reward_std": 0.00915272906422615, "rewards/bleu_reward_func/mean": 0.026128560304641724, "rewards/bleu_reward_func/std": 0.016462024301290512, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 276.34375, "completions/mean_terminated_length": 268.7419128417969, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.2624, "grad_norm": 2.9250733852386475, "kl": 0.02899169921875, "learning_rate": 1e-06, "loss": -0.0632, "num_tokens": 4376457.0, "reward": 0.06787262856960297, "reward_std": 0.02760476991534233, "rewards/bleu_reward_func/mean": 0.06787262856960297, "rewards/bleu_reward_func/std": 0.04441879689693451, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 316.46875, "completions/mean_terminated_length": 310.1612854003906, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.2632, "grad_norm": 2.7614059448242188, "kl": 0.0227508544921875, "learning_rate": 1e-06, "loss": 0.052, "num_tokens": 4388400.0, "reward": 0.10147911310195923, "reward_std": 0.01994110643863678, "rewards/bleu_reward_func/mean": 0.10147911310195923, "rewards/bleu_reward_func/std": 0.09049764275550842, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 405.5625, "completions/mean_terminated_length": 322.77777099609375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.264, "grad_norm": 2.2795681953430176, "kl": 0.02447509765625, "learning_rate": 1e-06, "loss": 0.0386, "num_tokens": 4403378.0, "reward": 0.035395894199609756, "reward_std": 0.01003876980394125, "rewards/bleu_reward_func/mean": 0.035395894199609756, "rewards/bleu_reward_func/std": 0.026000048965215683, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 278.125, "completions/mean_terminated_length": 253.9310302734375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.2648, "grad_norm": 2.9069783687591553, "kl": 0.03643798828125, "learning_rate": 1e-06, "loss": 0.035, "num_tokens": 4414550.0, "reward": 0.06586553156375885, "reward_std": 0.02944871410727501, "rewards/bleu_reward_func/mean": 0.06586553156375885, "rewards/bleu_reward_func/std": 0.051484063267707825, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 245.78125, "completions/mean_terminated_length": 245.78125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.2656, "grad_norm": 2.919708490371704, "kl": 0.022796630859375, "learning_rate": 1e-06, "loss": 0.114, "num_tokens": 4424263.0, "reward": 0.05288301408290863, "reward_std": 0.03779301792383194, "rewards/bleu_reward_func/mean": 0.05288301408290863, "rewards/bleu_reward_func/std": 0.048032332211732864, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 332.15625, "completions/mean_terminated_length": 326.3548278808594, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.2664, "grad_norm": 2.5168418884277344, "kl": 0.0219573974609375, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 4437548.0, "reward": 0.07673460245132446, "reward_std": 0.024972733110189438, "rewards/bleu_reward_func/mean": 0.07673460245132446, "rewards/bleu_reward_func/std": 0.06732524931430817, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 194.34375, "completions/mean_terminated_length": 184.09677124023438, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.2672, "grad_norm": 3.8905692100524902, "kl": 0.03387451171875, "learning_rate": 1e-06, "loss": -0.0208, "num_tokens": 4449471.0, "reward": 0.03636704757809639, "reward_std": 0.015638206154108047, "rewards/bleu_reward_func/mean": 0.03636704757809639, "rewards/bleu_reward_func/std": 0.01746034063398838, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 334.34375, "completions/mean_terminated_length": 275.125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.268, "grad_norm": 2.877241373062134, "kl": 0.031646728515625, "learning_rate": 1e-06, "loss": -0.0796, "num_tokens": 4465066.0, "reward": 0.04503689333796501, "reward_std": 0.019507717341184616, "rewards/bleu_reward_func/mean": 0.04503689333796501, "rewards/bleu_reward_func/std": 0.03148737922310829, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 361.46875, "completions/mean_terminated_length": 244.38888549804688, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.2688, "grad_norm": 2.3143179416656494, "kl": 0.023193359375, "learning_rate": 1e-06, "loss": -0.051, "num_tokens": 4479641.0, "reward": 0.04236375913023949, "reward_std": 0.01834411546587944, "rewards/bleu_reward_func/mean": 0.04236375913023949, "rewards/bleu_reward_func/std": 0.023316362872719765, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 280.6875, "completions/mean_terminated_length": 237.8518524169922, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.2696, "grad_norm": 2.4942591190338135, "kl": 0.0158843994140625, "learning_rate": 1e-06, "loss": -0.0403, "num_tokens": 4490735.0, "reward": 0.0699649378657341, "reward_std": 0.025273269042372704, "rewards/bleu_reward_func/mean": 0.0699649378657341, "rewards/bleu_reward_func/std": 0.07529071718454361, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 368.1875, "completions/mean_terminated_length": 311.9130554199219, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.2704, "grad_norm": 2.073747158050537, "kl": 0.0158843994140625, "learning_rate": 1e-06, "loss": 0.0201, "num_tokens": 4505757.0, "reward": 0.06272565573453903, "reward_std": 0.04652194678783417, "rewards/bleu_reward_func/mean": 0.06272565573453903, "rewards/bleu_reward_func/std": 0.09588578343391418, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 301.90625, "completions/mean_terminated_length": 231.875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.2712, "grad_norm": 3.1780126094818115, "kl": 0.0195159912109375, "learning_rate": 1e-06, "loss": -0.1707, "num_tokens": 4523066.0, "reward": 0.029990248382091522, "reward_std": 0.01466078869998455, "rewards/bleu_reward_func/mean": 0.029990248382091522, "rewards/bleu_reward_func/std": 0.028150422498583794, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 358.5, "completions/mean_terminated_length": 239.11111450195312, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.272, "grad_norm": 2.1125426292419434, "kl": 0.0198822021484375, "learning_rate": 1e-06, "loss": -0.0078, "num_tokens": 4540362.0, "reward": 0.14181923866271973, "reward_std": 0.10672705620527267, "rewards/bleu_reward_func/mean": 0.14181923866271973, "rewards/bleu_reward_func/std": 0.26110920310020447, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 247.5, "completions/mean_terminated_length": 220.13792419433594, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.2728, "grad_norm": 4.835620403289795, "kl": 0.0215606689453125, "learning_rate": 1e-06, "loss": 0.3826, "num_tokens": 4551714.0, "reward": 0.16201910376548767, "reward_std": 0.08808144181966782, "rewards/bleu_reward_func/mean": 0.16201910376548767, "rewards/bleu_reward_func/std": 0.2542886435985565, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 242.84375, "completions/mean_terminated_length": 215.0, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.2736, "grad_norm": 2.835961103439331, "kl": 0.0259246826171875, "learning_rate": 1e-06, "loss": -0.0701, "num_tokens": 4561693.0, "reward": 0.04708701744675636, "reward_std": 0.015518728643655777, "rewards/bleu_reward_func/mean": 0.04708701744675636, "rewards/bleu_reward_func/std": 0.048917315900325775, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 282.78125, "completions/mean_terminated_length": 53.5625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.2744, "grad_norm": 5.41905403137207, "kl": 0.092864990234375, "learning_rate": 1e-06, "loss": -0.0831, "num_tokens": 4574398.0, "reward": 0.054219573736190796, "reward_std": 0.02302156388759613, "rewards/bleu_reward_func/mean": 0.054219573736190796, "rewards/bleu_reward_func/std": 0.04756642505526543, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 401.09375, "completions/mean_terminated_length": 290.1875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.2752, "grad_norm": 2.166447401046753, "kl": 0.0235137939453125, "learning_rate": 1e-06, "loss": 0.0272, "num_tokens": 4590585.0, "reward": 0.036001622676849365, "reward_std": 0.02016858570277691, "rewards/bleu_reward_func/mean": 0.036001622676849365, "rewards/bleu_reward_func/std": 0.03957979381084442, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 157.0625, "completions/mean_terminated_length": 157.0625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.276, "grad_norm": 4.106038570404053, "kl": 0.029022216796875, "learning_rate": 1e-06, "loss": -0.0406, "num_tokens": 4600707.0, "reward": 0.05328774452209473, "reward_std": 0.018984105437994003, "rewards/bleu_reward_func/mean": 0.05328774452209473, "rewards/bleu_reward_func/std": 0.035300616174936295, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 328.84375, "completions/mean_terminated_length": 267.79168701171875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.2768, "grad_norm": 2.932767391204834, "kl": 0.028656005859375, "learning_rate": 1e-06, "loss": 0.0407, "num_tokens": 4613366.0, "reward": 0.032469406723976135, "reward_std": 0.008480279706418514, "rewards/bleu_reward_func/mean": 0.032469406723976135, "rewards/bleu_reward_func/std": 0.018695853650569916, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 336.125, "completions/mean_terminated_length": 267.3043518066406, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.2776, "grad_norm": 2.633439064025879, "kl": 0.023101806640625, "learning_rate": 1e-06, "loss": -0.0152, "num_tokens": 4627018.0, "reward": 0.04361742362380028, "reward_std": 0.02035902440547943, "rewards/bleu_reward_func/mean": 0.04361742362380028, "rewards/bleu_reward_func/std": 0.021847298368811607, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 293.46875, "completions/mean_terminated_length": 207.95652770996094, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.2784, "grad_norm": 2.943420886993408, "kl": 0.027130126953125, "learning_rate": 1e-06, "loss": 0.007, "num_tokens": 4640505.0, "reward": 0.21496786177158356, "reward_std": 0.06119208037853241, "rewards/bleu_reward_func/mean": 0.21496786177158356, "rewards/bleu_reward_func/std": 0.28509706258773804, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 216.59375, "completions/mean_terminated_length": 196.90000915527344, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.2792, "grad_norm": 2.763566255569458, "kl": 0.01910400390625, "learning_rate": 1e-06, "loss": 0.0424, "num_tokens": 4649468.0, "reward": 0.07204422354698181, "reward_std": 0.04057842493057251, "rewards/bleu_reward_func/mean": 0.07204422354698181, "rewards/bleu_reward_func/std": 0.06618453562259674, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 314.34375, "completions/mean_terminated_length": 259.0, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.28, "grad_norm": 2.9624099731445312, "kl": 0.021331787109375, "learning_rate": 1e-06, "loss": 0.0837, "num_tokens": 4665599.0, "reward": 0.06650006771087646, "reward_std": 0.04054763540625572, "rewards/bleu_reward_func/mean": 0.06650006771087646, "rewards/bleu_reward_func/std": 0.05784449353814125, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 302.375, "completions/mean_terminated_length": 220.3478240966797, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.2808, "grad_norm": 3.327723741531372, "kl": 0.02691650390625, "learning_rate": 1e-06, "loss": -0.0126, "num_tokens": 4679795.0, "reward": 0.04633244499564171, "reward_std": 0.0104750357568264, "rewards/bleu_reward_func/mean": 0.04633244499564171, "rewards/bleu_reward_func/std": 0.0204442348331213, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 357.96875, "completions/mean_terminated_length": 238.1666717529297, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.2816, "grad_norm": 3.8369596004486084, "kl": 0.0295562744140625, "learning_rate": 1e-06, "loss": -0.0252, "num_tokens": 4693930.0, "reward": 0.0545232817530632, "reward_std": 0.017153870314359665, "rewards/bleu_reward_func/mean": 0.0545232817530632, "rewards/bleu_reward_func/std": 0.05576448515057564, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 326.03125, "completions/mean_terminated_length": 264.04168701171875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.2824, "grad_norm": 2.7297415733337402, "kl": 0.025543212890625, "learning_rate": 1e-06, "loss": 0.1881, "num_tokens": 4707739.0, "reward": 0.04129674285650253, "reward_std": 0.041807621717453, "rewards/bleu_reward_func/mean": 0.04129674285650253, "rewards/bleu_reward_func/std": 0.05771995335817337, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 315.40625, "completions/mean_terminated_length": 279.0, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.2832, "grad_norm": 3.3393967151641846, "kl": 0.026458740234375, "learning_rate": 1e-06, "loss": -0.0207, "num_tokens": 4720072.0, "reward": 0.0364898145198822, "reward_std": 0.009451567195355892, "rewards/bleu_reward_func/mean": 0.0364898145198822, "rewards/bleu_reward_func/std": 0.030448051169514656, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 282.25, "completions/mean_terminated_length": 205.6666717529297, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.284, "grad_norm": 3.096372365951538, "kl": 0.029876708984375, "learning_rate": 1e-06, "loss": -0.0442, "num_tokens": 4731840.0, "reward": 0.03462470322847366, "reward_std": 0.024994423612952232, "rewards/bleu_reward_func/mean": 0.03462470322847366, "rewards/bleu_reward_func/std": 0.03290289640426636, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 298.90625, "completions/mean_terminated_length": 284.70001220703125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.2848, "grad_norm": 3.161959409713745, "kl": 0.022979736328125, "learning_rate": 1e-06, "loss": -0.1124, "num_tokens": 4743517.0, "reward": 0.04477345570921898, "reward_std": 0.014152650721371174, "rewards/bleu_reward_func/mean": 0.04477345570921898, "rewards/bleu_reward_func/std": 0.023930195719003677, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 276.34375, "completions/mean_terminated_length": 260.63336181640625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.2856, "grad_norm": 2.8648571968078613, "kl": 0.0195159912109375, "learning_rate": 1e-06, "loss": -0.1692, "num_tokens": 4755056.0, "reward": 0.0786188468337059, "reward_std": 0.036090441048145294, "rewards/bleu_reward_func/mean": 0.0786188468337059, "rewards/bleu_reward_func/std": 0.04780329018831253, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 298.28125, "completions/mean_terminated_length": 227.0416717529297, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.2864, "grad_norm": 3.4011240005493164, "kl": 0.047210693359375, "learning_rate": 1e-06, "loss": 0.0091, "num_tokens": 4766817.0, "reward": 0.05945078283548355, "reward_std": 0.01760122738778591, "rewards/bleu_reward_func/mean": 0.05945078283548355, "rewards/bleu_reward_func/std": 0.030062546953558922, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 290.75, "completions/mean_terminated_length": 228.79998779296875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.2872, "grad_norm": 2.69232439994812, "kl": 0.03240966796875, "learning_rate": 1e-06, "loss": 0.1015, "num_tokens": 4778625.0, "reward": 0.04637129232287407, "reward_std": 0.017768511548638344, "rewards/bleu_reward_func/mean": 0.04637129232287407, "rewards/bleu_reward_func/std": 0.024475086480379105, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 196.8125, "completions/mean_terminated_length": 196.8125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.288, "grad_norm": 5.5619120597839355, "kl": 0.035400390625, "learning_rate": 1e-06, "loss": -0.1212, "num_tokens": 4787059.0, "reward": 0.057328179478645325, "reward_std": 0.024165252223610878, "rewards/bleu_reward_func/mean": 0.057328179478645325, "rewards/bleu_reward_func/std": 0.04007139429450035, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 247.90625, "completions/mean_terminated_length": 247.90625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.2888, "grad_norm": 3.4733636379241943, "kl": 0.0301513671875, "learning_rate": 1e-06, "loss": 0.105, "num_tokens": 4797280.0, "reward": 0.0336172953248024, "reward_std": 0.02327040769159794, "rewards/bleu_reward_func/mean": 0.0336172953248024, "rewards/bleu_reward_func/std": 0.03187631443142891, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 255.28125, "completions/mean_terminated_length": 238.1666717529297, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.2896, "grad_norm": 3.501699686050415, "kl": 0.0192413330078125, "learning_rate": 1e-06, "loss": -0.0395, "num_tokens": 4807833.0, "reward": 0.10404136776924133, "reward_std": 0.03069019690155983, "rewards/bleu_reward_func/mean": 0.10404136776924133, "rewards/bleu_reward_func/std": 0.11612053960561752, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 311.28125, "completions/mean_terminated_length": 190.85000610351562, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.2904, "grad_norm": 3.420729160308838, "kl": 0.029693603515625, "learning_rate": 1e-06, "loss": -0.1242, "num_tokens": 4821250.0, "reward": 0.03283509612083435, "reward_std": 0.00878961943089962, "rewards/bleu_reward_func/mean": 0.03283509612083435, "rewards/bleu_reward_func/std": 0.03208829462528229, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 410.65625, "completions/mean_terminated_length": 357.5714416503906, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.2912, "grad_norm": 2.394613742828369, "kl": 0.020355224609375, "learning_rate": 1e-06, "loss": 0.0723, "num_tokens": 4838031.0, "reward": 0.03294292837381363, "reward_std": 0.0140132587403059, "rewards/bleu_reward_func/mean": 0.03294292837381363, "rewards/bleu_reward_func/std": 0.036045484244823456, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 212.4375, "completions/mean_terminated_length": 202.77418518066406, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.292, "grad_norm": 4.775505542755127, "kl": 0.021697998046875, "learning_rate": 1e-06, "loss": -0.0461, "num_tokens": 4850365.0, "reward": 0.09430208802223206, "reward_std": 0.06408664584159851, "rewards/bleu_reward_func/mean": 0.09430208802223206, "rewards/bleu_reward_func/std": 0.11308187246322632, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 203.0625, "completions/mean_terminated_length": 203.0625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.2928, "grad_norm": 3.6602747440338135, "kl": 0.029144287109375, "learning_rate": 1e-06, "loss": 0.1312, "num_tokens": 4859159.0, "reward": 0.09555967152118683, "reward_std": 0.028682151809334755, "rewards/bleu_reward_func/mean": 0.09555967152118683, "rewards/bleu_reward_func/std": 0.07242386043071747, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 392.46875, "completions/mean_terminated_length": 287.0, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.2936, "grad_norm": 2.596052408218384, "kl": 0.0323486328125, "learning_rate": 1e-06, "loss": -0.0453, "num_tokens": 4875438.0, "reward": 0.06572610139846802, "reward_std": 0.019221346825361252, "rewards/bleu_reward_func/mean": 0.06572610139846802, "rewards/bleu_reward_func/std": 0.05693361535668373, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 330.90625, "completions/mean_terminated_length": 260.0434875488281, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.2944, "grad_norm": 2.464620590209961, "kl": 0.027130126953125, "learning_rate": 1e-06, "loss": -0.0032, "num_tokens": 4889683.0, "reward": 0.051844000816345215, "reward_std": 0.016486987471580505, "rewards/bleu_reward_func/mean": 0.051844000816345215, "rewards/bleu_reward_func/std": 0.017129171639680862, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 303.25, "completions/mean_terminated_length": 233.6666717529297, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.2952, "grad_norm": 2.683809280395508, "kl": 0.0198974609375, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 4901515.0, "reward": 0.04742293432354927, "reward_std": 0.022541342303156853, "rewards/bleu_reward_func/mean": 0.04742293432354927, "rewards/bleu_reward_func/std": 0.03928080573678017, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 276.46875, "completions/mean_terminated_length": 210.51998901367188, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.296, "grad_norm": 2.637234926223755, "kl": 0.0212860107421875, "learning_rate": 1e-06, "loss": -0.187, "num_tokens": 4913330.0, "reward": 0.0685403048992157, "reward_std": 0.03239838033914566, "rewards/bleu_reward_func/mean": 0.0685403048992157, "rewards/bleu_reward_func/std": 0.09188274294137955, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 315.78125, "completions/mean_terminated_length": 213.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.2968, "grad_norm": 4.868023872375488, "kl": 0.026947021484375, "learning_rate": 1e-06, "loss": 0.0314, "num_tokens": 4926643.0, "reward": 0.11580727994441986, "reward_std": 0.05246927589178085, "rewards/bleu_reward_func/mean": 0.11580727994441986, "rewards/bleu_reward_func/std": 0.12015223503112793, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 281.25, "completions/mean_terminated_length": 77.64705657958984, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.2976, "grad_norm": 5.435993194580078, "kl": 0.03948974609375, "learning_rate": 1e-06, "loss": 0.0457, "num_tokens": 4940059.0, "reward": 0.021783608943223953, "reward_std": 0.0037742627318948507, "rewards/bleu_reward_func/mean": 0.021783608943223953, "rewards/bleu_reward_func/std": 0.016994595527648926, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 361.21875, "completions/mean_terminated_length": 310.9583435058594, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.2984, "grad_norm": 3.6274783611297607, "kl": 0.02838134765625, "learning_rate": 1e-06, "loss": 0.1372, "num_tokens": 4954882.0, "reward": 0.033363357186317444, "reward_std": 0.010118735022842884, "rewards/bleu_reward_func/mean": 0.033363357186317444, "rewards/bleu_reward_func/std": 0.01759323477745056, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 268.46875, "completions/mean_terminated_length": 212.2692413330078, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.2992, "grad_norm": 3.4594333171844482, "kl": 0.040863037109375, "learning_rate": 1e-06, "loss": -0.03, "num_tokens": 4965705.0, "reward": 0.06493013352155685, "reward_std": 0.024484504014253616, "rewards/bleu_reward_func/mean": 0.06493013352155685, "rewards/bleu_reward_func/std": 0.05946136638522148, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 356.1875, "completions/mean_terminated_length": 312.55999755859375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.3, "grad_norm": 2.657532215118408, "kl": 0.0255126953125, "learning_rate": 1e-06, "loss": -0.1039, "num_tokens": 4982839.0, "reward": 0.09929439425468445, "reward_std": 0.02464054897427559, "rewards/bleu_reward_func/mean": 0.09929439425468445, "rewards/bleu_reward_func/std": 0.12181542813777924, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 298.53125, "completions/mean_terminated_length": 238.75999450683594, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.3008, "grad_norm": 2.7878925800323486, "kl": 0.030426025390625, "learning_rate": 1e-06, "loss": -0.1121, "num_tokens": 4994976.0, "reward": 0.09792232513427734, "reward_std": 0.03112916275858879, "rewards/bleu_reward_func/mean": 0.09792232513427734, "rewards/bleu_reward_func/std": 0.10117895156145096, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 246.875, "completions/mean_terminated_length": 209.00001525878906, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.3016, "grad_norm": 3.0961310863494873, "kl": 0.039215087890625, "learning_rate": 1e-06, "loss": -0.0131, "num_tokens": 5005532.0, "reward": 0.04401427507400513, "reward_std": 0.011130438186228275, "rewards/bleu_reward_func/mean": 0.04401427507400513, "rewards/bleu_reward_func/std": 0.026963340118527412, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 352.96875, "completions/mean_terminated_length": 269.6666564941406, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.3024, "grad_norm": 2.8589773178100586, "kl": 0.029449462890625, "learning_rate": 1e-06, "loss": 0.0124, "num_tokens": 5019211.0, "reward": 0.05802150070667267, "reward_std": 0.021435074508190155, "rewards/bleu_reward_func/mean": 0.05802150070667267, "rewards/bleu_reward_func/std": 0.03692251443862915, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 236.9375, "completions/mean_terminated_length": 236.9375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.3032, "grad_norm": 3.043027877807617, "kl": 0.02935791015625, "learning_rate": 1e-06, "loss": -0.0605, "num_tokens": 5029297.0, "reward": 0.027532659471035004, "reward_std": 0.012727165594696999, "rewards/bleu_reward_func/mean": 0.027532659471035004, "rewards/bleu_reward_func/std": 0.016757391393184662, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 190.5625, "completions/mean_terminated_length": 180.19354248046875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.304, "grad_norm": 3.4946486949920654, "kl": 0.0279083251953125, "learning_rate": 1e-06, "loss": 0.1935, "num_tokens": 5039483.0, "reward": 0.060268811881542206, "reward_std": 0.03754986822605133, "rewards/bleu_reward_func/mean": 0.060268811881542206, "rewards/bleu_reward_func/std": 0.038724955171346664, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 221.03125, "completions/mean_terminated_length": 190.9310302734375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.3048, "grad_norm": 3.8986661434173584, "kl": 0.034942626953125, "learning_rate": 1e-06, "loss": 0.0139, "num_tokens": 5049020.0, "reward": 0.034335315227508545, "reward_std": 0.00831439159810543, "rewards/bleu_reward_func/mean": 0.034335315227508545, "rewards/bleu_reward_func/std": 0.01297684945166111, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 281.0, "completions/mean_terminated_length": 265.6000061035156, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.3056, "grad_norm": 3.340994119644165, "kl": 0.034881591796875, "learning_rate": 1e-06, "loss": 0.1244, "num_tokens": 5059932.0, "reward": 0.031757794320583344, "reward_std": 0.00996050052344799, "rewards/bleu_reward_func/mean": 0.031757794320583344, "rewards/bleu_reward_func/std": 0.014244799502193928, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 312.96875, "completions/mean_terminated_length": 235.0869598388672, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3064, "grad_norm": 3.1813881397247314, "kl": 0.02911376953125, "learning_rate": 1e-06, "loss": 0.0916, "num_tokens": 5072299.0, "reward": 0.02868136763572693, "reward_std": 0.011929353699088097, "rewards/bleu_reward_func/mean": 0.02868136763572693, "rewards/bleu_reward_func/std": 0.018595216795802116, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 290.03125, "completions/mean_terminated_length": 203.17391967773438, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.3072, "grad_norm": 2.8743865489959717, "kl": 0.027252197265625, "learning_rate": 1e-06, "loss": 0.0136, "num_tokens": 5085700.0, "reward": 0.06972040981054306, "reward_std": 0.030980605632066727, "rewards/bleu_reward_func/mean": 0.06972040981054306, "rewards/bleu_reward_func/std": 0.060256555676460266, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 296.78125, "completions/mean_terminated_length": 225.0416717529297, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.308, "grad_norm": 2.9356751441955566, "kl": 0.02740478515625, "learning_rate": 1e-06, "loss": -0.0112, "num_tokens": 5097589.0, "reward": 0.04450830817222595, "reward_std": 0.02866341546177864, "rewards/bleu_reward_func/mean": 0.04450830817222595, "rewards/bleu_reward_func/std": 0.04176363721489906, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 382.125, "completions/mean_terminated_length": 304.20001220703125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.3088, "grad_norm": 2.262878656387329, "kl": 0.0264129638671875, "learning_rate": 1e-06, "loss": -0.0724, "num_tokens": 5113993.0, "reward": 0.12424831092357635, "reward_std": 0.04111553356051445, "rewards/bleu_reward_func/mean": 0.12424831092357635, "rewards/bleu_reward_func/std": 0.15211449563503265, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 362.09375, "completions/mean_terminated_length": 293.9545593261719, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.3096, "grad_norm": 2.2440779209136963, "kl": 0.0277099609375, "learning_rate": 1e-06, "loss": -0.015, "num_tokens": 5128876.0, "reward": 0.04977039247751236, "reward_std": 0.01728152297437191, "rewards/bleu_reward_func/mean": 0.04977039247751236, "rewards/bleu_reward_func/std": 0.029223492369055748, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 276.4375, "completions/mean_terminated_length": 197.9166717529297, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.3104, "grad_norm": 2.516524076461792, "kl": 0.027374267578125, "learning_rate": 1e-06, "loss": -0.0645, "num_tokens": 5140674.0, "reward": 0.146553173661232, "reward_std": 0.04176880046725273, "rewards/bleu_reward_func/mean": 0.146553173661232, "rewards/bleu_reward_func/std": 0.09359844774007797, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 324.96875, "completions/mean_terminated_length": 305.6206970214844, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.3112, "grad_norm": 2.948312759399414, "kl": 0.02374267578125, "learning_rate": 1e-06, "loss": 0.0621, "num_tokens": 5155745.0, "reward": 0.106197290122509, "reward_std": 0.06629303842782974, "rewards/bleu_reward_func/mean": 0.106197290122509, "rewards/bleu_reward_func/std": 0.11851444095373154, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 298.5625, "completions/mean_terminated_length": 152.5263214111328, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.312, "grad_norm": 14.583303451538086, "kl": 0.05859375, "learning_rate": 1e-06, "loss": -0.0482, "num_tokens": 5168939.0, "reward": 0.07266208529472351, "reward_std": 0.019275350496172905, "rewards/bleu_reward_func/mean": 0.07266208529472351, "rewards/bleu_reward_func/std": 0.11420844495296478, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 265.125, "completions/mean_terminated_length": 257.1612854003906, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.3128, "grad_norm": 2.5951454639434814, "kl": 0.025299072265625, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 5179439.0, "reward": 0.0554918497800827, "reward_std": 0.020207270979881287, "rewards/bleu_reward_func/mean": 0.0554918497800827, "rewards/bleu_reward_func/std": 0.052480507642030716, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 452.09375, "completions/mean_terminated_length": 375.0714416503906, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.3136, "grad_norm": 1.9732576608657837, "kl": 0.02559661865234375, "learning_rate": 1e-06, "loss": -0.0084, "num_tokens": 5196810.0, "reward": 0.07723353058099747, "reward_std": 0.021651268005371094, "rewards/bleu_reward_func/mean": 0.07723353058099747, "rewards/bleu_reward_func/std": 0.06710720807313919, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 392.0, "completions/mean_terminated_length": 237.71429443359375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.3144, "grad_norm": 2.6798901557922363, "kl": 0.03570556640625, "learning_rate": 1e-06, "loss": -0.0554, "num_tokens": 5212434.0, "reward": 0.06295132637023926, "reward_std": 0.022763650864362717, "rewards/bleu_reward_func/mean": 0.06295132637023926, "rewards/bleu_reward_func/std": 0.03463296964764595, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 456.6875, "completions/mean_terminated_length": 385.5714416503906, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.3152, "grad_norm": 2.0554585456848145, "kl": 0.029510498046875, "learning_rate": 1e-06, "loss": -0.0209, "num_tokens": 5230848.0, "reward": 0.04525914788246155, "reward_std": 0.007387248799204826, "rewards/bleu_reward_func/mean": 0.04525914788246155, "rewards/bleu_reward_func/std": 0.02194630168378353, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 155.40625, "completions/mean_terminated_length": 143.90322875976562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.316, "grad_norm": 3.7899491786956787, "kl": 0.037841796875, "learning_rate": 1e-06, "loss": -0.177, "num_tokens": 5238805.0, "reward": 0.039721157401800156, "reward_std": 0.03929724916815758, "rewards/bleu_reward_func/mean": 0.039721157401800156, "rewards/bleu_reward_func/std": 0.05673614889383316, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 358.65625, "completions/mean_terminated_length": 298.6521911621094, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.3168, "grad_norm": 2.6228115558624268, "kl": 0.01983642578125, "learning_rate": 1e-06, "loss": -0.0173, "num_tokens": 5256122.0, "reward": 0.0510735958814621, "reward_std": 0.016432739794254303, "rewards/bleu_reward_func/mean": 0.0510735958814621, "rewards/bleu_reward_func/std": 0.0460241362452507, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 125.5625, "completions/mean_terminated_length": 125.5625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.3176, "grad_norm": 14.036137580871582, "kl": 0.1956787109375, "learning_rate": 1e-06, "loss": -0.0332, "num_tokens": 5265292.0, "reward": 0.09528109431266785, "reward_std": 0.0211674515157938, "rewards/bleu_reward_func/mean": 0.09528109431266785, "rewards/bleu_reward_func/std": 0.07186417281627655, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 261.75, "completions/mean_terminated_length": 261.75, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.3184, "grad_norm": 3.5530920028686523, "kl": 0.02069091796875, "learning_rate": 1e-06, "loss": -0.2157, "num_tokens": 5275676.0, "reward": 0.05174801126122475, "reward_std": 0.020869575440883636, "rewards/bleu_reward_func/mean": 0.05174801126122475, "rewards/bleu_reward_func/std": 0.04189353436231613, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 234.75, "completions/mean_terminated_length": 157.1199951171875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.3192, "grad_norm": 3.5558056831359863, "kl": 0.0399169921875, "learning_rate": 1e-06, "loss": 0.1006, "num_tokens": 5286300.0, "reward": 0.04154960438609123, "reward_std": 0.019385188817977905, "rewards/bleu_reward_func/mean": 0.04154960438609123, "rewards/bleu_reward_func/std": 0.028244102373719215, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 255.625, "completions/mean_terminated_length": 255.625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.32, "grad_norm": 3.1600592136383057, "kl": 0.031707763671875, "learning_rate": 1e-06, "loss": -0.0721, "num_tokens": 5296840.0, "reward": 0.06132878363132477, "reward_std": 0.017585109919309616, "rewards/bleu_reward_func/mean": 0.06132878363132477, "rewards/bleu_reward_func/std": 0.03360173851251602, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 512.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 378.65625, "completions/mean_terminated_length": 37.88888931274414, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.3208, "grad_norm": 4.9900360107421875, "kl": 0.033050537109375, "learning_rate": 1e-06, "loss": -0.0658, "num_tokens": 5313149.0, "reward": 0.025677043944597244, "reward_std": 0.0111201461404562, "rewards/bleu_reward_func/mean": 0.025677043944597244, "rewards/bleu_reward_func/std": 0.02006489410996437, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 297.375, "completions/mean_terminated_length": 108.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.3216, "grad_norm": 3.406717538833618, "kl": 0.040863037109375, "learning_rate": 1e-06, "loss": -0.0749, "num_tokens": 5325945.0, "reward": 0.045327264815568924, "reward_std": 0.02512126788496971, "rewards/bleu_reward_func/mean": 0.045327264815568924, "rewards/bleu_reward_func/std": 0.03467832878232002, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 296.5625, "completions/mean_terminated_length": 236.239990234375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.3224, "grad_norm": 3.1772117614746094, "kl": 0.032318115234375, "learning_rate": 1e-06, "loss": 0.0194, "num_tokens": 5338475.0, "reward": 0.044717058539390564, "reward_std": 0.014735497534275055, "rewards/bleu_reward_func/mean": 0.044717058539390564, "rewards/bleu_reward_func/std": 0.025379199534654617, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 248.46875, "completions/mean_terminated_length": 239.9677276611328, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3232, "grad_norm": 4.004060745239258, "kl": 0.04180908203125, "learning_rate": 1e-06, "loss": -0.093, "num_tokens": 5349122.0, "reward": 0.06194135546684265, "reward_std": 0.02376762218773365, "rewards/bleu_reward_func/mean": 0.06194135546684265, "rewards/bleu_reward_func/std": 0.02893226593732834, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 164.25, "completions/mean_terminated_length": 153.03225708007812, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.324, "grad_norm": 4.68143892288208, "kl": 0.0418701171875, "learning_rate": 1e-06, "loss": 0.0933, "num_tokens": 5356842.0, "reward": 0.17849373817443848, "reward_std": 0.10065864771604538, "rewards/bleu_reward_func/mean": 0.17849373817443848, "rewards/bleu_reward_func/std": 0.25785142183303833, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 394.03125, "completions/mean_terminated_length": 354.7083435058594, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.3248, "grad_norm": 2.360607862472534, "kl": 0.026702880859375, "learning_rate": 1e-06, "loss": -0.0347, "num_tokens": 5371827.0, "reward": 0.08791603147983551, "reward_std": 0.02064087614417076, "rewards/bleu_reward_func/mean": 0.08791603147983551, "rewards/bleu_reward_func/std": 0.08002207428216934, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 512.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 407.75, "completions/mean_terminated_length": 208.72727966308594, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.3256, "grad_norm": 2.7047009468078613, "kl": 0.034332275390625, "learning_rate": 1e-06, "loss": 0.0369, "num_tokens": 5388667.0, "reward": 0.035750459879636765, "reward_std": 0.00714261457324028, "rewards/bleu_reward_func/mean": 0.035750459879636765, "rewards/bleu_reward_func/std": 0.02296554110944271, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 337.78125, "completions/mean_terminated_length": 279.7083435058594, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.3264, "grad_norm": 2.637406349182129, "kl": 0.036590576171875, "learning_rate": 1e-06, "loss": 0.0154, "num_tokens": 5401732.0, "reward": 0.029655063524842262, "reward_std": 0.010500041767954826, "rewards/bleu_reward_func/mean": 0.029655063524842262, "rewards/bleu_reward_func/std": 0.012400495819747448, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 406.71875, "completions/mean_terminated_length": 351.5714416503906, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.3272, "grad_norm": 2.2207653522491455, "kl": 0.026458740234375, "learning_rate": 1e-06, "loss": 0.0013, "num_tokens": 5419075.0, "reward": 0.06239059194922447, "reward_std": 0.018355626612901688, "rewards/bleu_reward_func/mean": 0.06239059194922447, "rewards/bleu_reward_func/std": 0.03472558781504631, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 330.625, "completions/mean_terminated_length": 304.71429443359375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.328, "grad_norm": 2.539473295211792, "kl": 0.03277587890625, "learning_rate": 1e-06, "loss": -0.0873, "num_tokens": 5431623.0, "reward": 0.06792166829109192, "reward_std": 0.02972714975476265, "rewards/bleu_reward_func/mean": 0.06792166829109192, "rewards/bleu_reward_func/std": 0.04709053412079811, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 327.4375, "completions/mean_terminated_length": 142.875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.3288, "grad_norm": 3.064293384552002, "kl": 0.0247650146484375, "learning_rate": 1e-06, "loss": -0.0849, "num_tokens": 5445821.0, "reward": 0.027155395597219467, "reward_std": 0.012970471754670143, "rewards/bleu_reward_func/mean": 0.027155395597219467, "rewards/bleu_reward_func/std": 0.016487330198287964, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 323.96875, "completions/mean_terminated_length": 250.3913116455078, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.3296, "grad_norm": 3.3556032180786133, "kl": 0.026641845703125, "learning_rate": 1e-06, "loss": 0.0394, "num_tokens": 5459084.0, "reward": 0.05504102632403374, "reward_std": 0.01868896186351776, "rewards/bleu_reward_func/mean": 0.05504102632403374, "rewards/bleu_reward_func/std": 0.032063912600278854, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 293.34375, "completions/mean_terminated_length": 262.1071472167969, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.3304, "grad_norm": 2.61944842338562, "kl": 0.027618408203125, "learning_rate": 1e-06, "loss": -0.0423, "num_tokens": 5471703.0, "reward": 0.04594315215945244, "reward_std": 0.016052130609750748, "rewards/bleu_reward_func/mean": 0.04594315215945244, "rewards/bleu_reward_func/std": 0.030643180012702942, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 381.40625, "completions/mean_terminated_length": 292.0526428222656, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.3312, "grad_norm": 2.4337165355682373, "kl": 0.0261993408203125, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 5486460.0, "reward": 0.03960081934928894, "reward_std": 0.010423287749290466, "rewards/bleu_reward_func/mean": 0.03960081934928894, "rewards/bleu_reward_func/std": 0.018299689516425133, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 415.25, "completions/mean_terminated_length": 329.8823547363281, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.332, "grad_norm": 2.1501145362854004, "kl": 0.034088134765625, "learning_rate": 1e-06, "loss": -0.0909, "num_tokens": 5503620.0, "reward": 0.051469504833221436, "reward_std": 0.01700912043452263, "rewards/bleu_reward_func/mean": 0.051469504833221436, "rewards/bleu_reward_func/std": 0.03233012557029724, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 339.90625, "completions/mean_terminated_length": 300.19232177734375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.3328, "grad_norm": 2.446009397506714, "kl": 0.027069091796875, "learning_rate": 1e-06, "loss": 0.0161, "num_tokens": 5518281.0, "reward": 0.04302642494440079, "reward_std": 0.018161989748477936, "rewards/bleu_reward_func/mean": 0.04302642494440079, "rewards/bleu_reward_func/std": 0.037101197987794876, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 329.46875, "completions/mean_terminated_length": 268.625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.3336, "grad_norm": 2.420654058456421, "kl": 0.030914306640625, "learning_rate": 1e-06, "loss": -0.1055, "num_tokens": 5532568.0, "reward": 0.04830830916762352, "reward_std": 0.015328258275985718, "rewards/bleu_reward_func/mean": 0.04830830916762352, "rewards/bleu_reward_func/std": 0.02932472713291645, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 312.78125, "completions/mean_terminated_length": 246.375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.3344, "grad_norm": 2.4366395473480225, "kl": 0.03277587890625, "learning_rate": 1e-06, "loss": -0.063, "num_tokens": 5548425.0, "reward": 0.028444793075323105, "reward_std": 0.009239492937922478, "rewards/bleu_reward_func/mean": 0.028444793075323105, "rewards/bleu_reward_func/std": 0.024574536830186844, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 336.5625, "completions/mean_terminated_length": 324.8666687011719, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.3352, "grad_norm": 2.1961772441864014, "kl": 0.028564453125, "learning_rate": 1e-06, "loss": -0.0136, "num_tokens": 5561371.0, "reward": 0.03844983130693436, "reward_std": 0.01347007229924202, "rewards/bleu_reward_func/mean": 0.03844983130693436, "rewards/bleu_reward_func/std": 0.02470613457262516, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 283.21875, "completions/mean_terminated_length": 240.8518524169922, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.336, "grad_norm": 2.4404196739196777, "kl": 0.02447509765625, "learning_rate": 1e-06, "loss": 0.1708, "num_tokens": 5576154.0, "reward": 0.12900003790855408, "reward_std": 0.05478304252028465, "rewards/bleu_reward_func/mean": 0.12900003790855408, "rewards/bleu_reward_func/std": 0.08241026103496552, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 288.65625, "completions/mean_terminated_length": 247.29629516601562, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.3368, "grad_norm": 2.4558486938476562, "kl": 0.02593994140625, "learning_rate": 1e-06, "loss": 0.133, "num_tokens": 5588247.0, "reward": 0.04614394158124924, "reward_std": 0.022616572678089142, "rewards/bleu_reward_func/mean": 0.04614394158124924, "rewards/bleu_reward_func/std": 0.042861953377723694, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 405.34375, "completions/mean_terminated_length": 298.6875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.3376, "grad_norm": 2.181236505508423, "kl": 0.035614013671875, "learning_rate": 1e-06, "loss": 0.1833, "num_tokens": 5604570.0, "reward": 0.029749825596809387, "reward_std": 0.019359689205884933, "rewards/bleu_reward_func/mean": 0.029749825596809387, "rewards/bleu_reward_func/std": 0.0279875285923481, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 362.21875, "completions/mean_terminated_length": 312.29168701171875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.3384, "grad_norm": 2.532691240310669, "kl": 0.029205322265625, "learning_rate": 1e-06, "loss": 0.0322, "num_tokens": 5618353.0, "reward": 0.07113789021968842, "reward_std": 0.01926705427467823, "rewards/bleu_reward_func/mean": 0.07113789021968842, "rewards/bleu_reward_func/std": 0.07943008095026016, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 272.71875, "completions/mean_terminated_length": 256.7666931152344, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.3392, "grad_norm": 3.0738492012023926, "kl": 0.032440185546875, "learning_rate": 1e-06, "loss": 0.0292, "num_tokens": 5629712.0, "reward": 0.0503704771399498, "reward_std": 0.021814901381731033, "rewards/bleu_reward_func/mean": 0.0503704771399498, "rewards/bleu_reward_func/std": 0.05399306118488312, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 316.59375, "completions/mean_terminated_length": 240.13043212890625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.34, "grad_norm": 2.2919762134552, "kl": 0.03045654296875, "learning_rate": 1e-06, "loss": -0.0064, "num_tokens": 5642171.0, "reward": 0.04583510756492615, "reward_std": 0.016485266387462616, "rewards/bleu_reward_func/mean": 0.04583510756492615, "rewards/bleu_reward_func/std": 0.033508144319057465, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 293.15625, "completions/mean_terminated_length": 293.15625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.3408, "grad_norm": 2.3206934928894043, "kl": 0.027252197265625, "learning_rate": 1e-06, "loss": 0.0463, "num_tokens": 5653728.0, "reward": 0.020433904603123665, "reward_std": 0.011948324739933014, "rewards/bleu_reward_func/mean": 0.020433904603123665, "rewards/bleu_reward_func/std": 0.019608385860919952, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 266.75, "completions/mean_terminated_length": 210.1538543701172, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.3416, "grad_norm": 2.8593051433563232, "kl": 0.040802001953125, "learning_rate": 1e-06, "loss": -0.0247, "num_tokens": 5664072.0, "reward": 0.03389505296945572, "reward_std": 0.013023952953517437, "rewards/bleu_reward_func/mean": 0.03389505296945572, "rewards/bleu_reward_func/std": 0.02550988271832466, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 302.5625, "completions/mean_terminated_length": 232.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.3424, "grad_norm": 2.5821361541748047, "kl": 0.0328369140625, "learning_rate": 1e-06, "loss": -0.0366, "num_tokens": 5678370.0, "reward": 0.05251599848270416, "reward_std": 0.01706322655081749, "rewards/bleu_reward_func/mean": 0.05251599848270416, "rewards/bleu_reward_func/std": 0.0254330113530159, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 458.53125, "completions/mean_terminated_length": 426.45001220703125, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.3432, "grad_norm": 1.9138927459716797, "kl": 0.02752685546875, "learning_rate": 1e-06, "loss": -0.0187, "num_tokens": 5696027.0, "reward": 0.0476941354572773, "reward_std": 0.01937401294708252, "rewards/bleu_reward_func/mean": 0.0476941354572773, "rewards/bleu_reward_func/std": 0.030997809022665024, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 252.4375, "completions/mean_terminated_length": 165.9166717529297, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.344, "grad_norm": 4.442004203796387, "kl": 0.035736083984375, "learning_rate": 1e-06, "loss": -0.0416, "num_tokens": 5706201.0, "reward": 0.09095358103513718, "reward_std": 0.04099667817354202, "rewards/bleu_reward_func/mean": 0.09095358103513718, "rewards/bleu_reward_func/std": 0.08905672281980515, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 489.5625, "completions/mean_terminated_length": 422.25, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.3448, "grad_norm": 1.909276008605957, "kl": 0.035797119140625, "learning_rate": 1e-06, "loss": 0.0172, "num_tokens": 5726163.0, "reward": 0.041696012020111084, "reward_std": 0.019856570288538933, "rewards/bleu_reward_func/mean": 0.041696012020111084, "rewards/bleu_reward_func/std": 0.03666767105460167, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 397.15625, "completions/mean_terminated_length": 337.0, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.3456, "grad_norm": 2.4237301349639893, "kl": 0.036224365234375, "learning_rate": 1e-06, "loss": 0.0294, "num_tokens": 5745016.0, "reward": 0.038987092673778534, "reward_std": 0.013197116553783417, "rewards/bleu_reward_func/mean": 0.038987092673778534, "rewards/bleu_reward_func/std": 0.03411531820893288, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 225.28125, "completions/mean_terminated_length": 195.6206817626953, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.3464, "grad_norm": 4.905090808868408, "kl": 0.045806884765625, "learning_rate": 1e-06, "loss": 0.1137, "num_tokens": 5754873.0, "reward": 0.04227697476744652, "reward_std": 0.012464666739106178, "rewards/bleu_reward_func/mean": 0.04227697476744652, "rewards/bleu_reward_func/std": 0.02255011536180973, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 390.65625, "completions/mean_terminated_length": 269.3125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.3472, "grad_norm": 2.905046224594116, "kl": 0.0345458984375, "learning_rate": 1e-06, "loss": 0.0644, "num_tokens": 5772358.0, "reward": 0.05010952055454254, "reward_std": 0.026643291115760803, "rewards/bleu_reward_func/mean": 0.05010952055454254, "rewards/bleu_reward_func/std": 0.04131516441702843, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 389.78125, "completions/mean_terminated_length": 316.45001220703125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.348, "grad_norm": 2.3226656913757324, "kl": 0.032135009765625, "learning_rate": 1e-06, "loss": 0.0247, "num_tokens": 5787815.0, "reward": 0.04076055437326431, "reward_std": 0.009578779339790344, "rewards/bleu_reward_func/mean": 0.04076055437326431, "rewards/bleu_reward_func/std": 0.018154015764594078, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 389.65625, "completions/mean_terminated_length": 355.3999938964844, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.3488, "grad_norm": 2.4872429370880127, "kl": 0.027496337890625, "learning_rate": 1e-06, "loss": -0.0609, "num_tokens": 5802556.0, "reward": 0.09538507461547852, "reward_std": 0.02605431340634823, "rewards/bleu_reward_func/mean": 0.09538507461547852, "rewards/bleu_reward_func/std": 0.060399290174245834, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 281.0625, "completions/mean_terminated_length": 204.08334350585938, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.3496, "grad_norm": 3.9208173751831055, "kl": 0.0377197265625, "learning_rate": 1e-06, "loss": 0.0755, "num_tokens": 5814790.0, "reward": 0.07319147884845734, "reward_std": 0.021372804418206215, "rewards/bleu_reward_func/mean": 0.07319147884845734, "rewards/bleu_reward_func/std": 0.06475379317998886, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 273.375, "completions/mean_terminated_length": 265.6773986816406, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.3504, "grad_norm": 2.6346755027770996, "kl": 0.025146484375, "learning_rate": 1e-06, "loss": -0.0409, "num_tokens": 5827610.0, "reward": 0.04968114197254181, "reward_std": 0.01877327263355255, "rewards/bleu_reward_func/mean": 0.04968114197254181, "rewards/bleu_reward_func/std": 0.0327991247177124, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 343.5625, "completions/mean_terminated_length": 326.137939453125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.3512, "grad_norm": 2.0653529167175293, "kl": 0.02227783203125, "learning_rate": 1e-06, "loss": 0.0021, "num_tokens": 5844580.0, "reward": 0.04412662982940674, "reward_std": 0.03156070411205292, "rewards/bleu_reward_func/mean": 0.04412662982940674, "rewards/bleu_reward_func/std": 0.039357323199510574, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 387.90625, "completions/mean_terminated_length": 339.34783935546875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.352, "grad_norm": 2.318570852279663, "kl": 0.037872314453125, "learning_rate": 1e-06, "loss": -0.0107, "num_tokens": 5860249.0, "reward": 0.04714466631412506, "reward_std": 0.008974202908575535, "rewards/bleu_reward_func/mean": 0.04714466631412506, "rewards/bleu_reward_func/std": 0.06077088788151741, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 221.6875, "completions/mean_terminated_length": 124.91667175292969, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.3528, "grad_norm": 4.25236177444458, "kl": 0.05303955078125, "learning_rate": 1e-06, "loss": -0.211, "num_tokens": 5870535.0, "reward": 0.04983676224946976, "reward_std": 0.0235724039375782, "rewards/bleu_reward_func/mean": 0.04983676224946976, "rewards/bleu_reward_func/std": 0.05665838345885277, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 178.6875, "completions/mean_terminated_length": 167.93548583984375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.3536, "grad_norm": 4.9184794425964355, "kl": 0.047027587890625, "learning_rate": 1e-06, "loss": 0.1442, "num_tokens": 5878733.0, "reward": 0.07976078987121582, "reward_std": 0.03809776157140732, "rewards/bleu_reward_func/mean": 0.07976078987121582, "rewards/bleu_reward_func/std": 0.08007866889238358, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 318.40625, "completions/mean_terminated_length": 253.875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.3544, "grad_norm": 3.3139231204986572, "kl": 0.028900146484375, "learning_rate": 1e-06, "loss": -0.0201, "num_tokens": 5891178.0, "reward": 0.16382896900177002, "reward_std": 0.028719859197735786, "rewards/bleu_reward_func/mean": 0.16382896900177002, "rewards/bleu_reward_func/std": 0.2198071926832199, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 358.8125, "completions/mean_terminated_length": 330.4444580078125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.3552, "grad_norm": 2.494014024734497, "kl": 0.0315093994140625, "learning_rate": 1e-06, "loss": 0.0693, "num_tokens": 5905684.0, "reward": 0.15048904716968536, "reward_std": 0.041027601808309555, "rewards/bleu_reward_func/mean": 0.15048904716968536, "rewards/bleu_reward_func/std": 0.21759282052516937, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 305.0, "completions/mean_terminated_length": 266.6666564941406, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.356, "grad_norm": 2.9205825328826904, "kl": 0.031890869140625, "learning_rate": 1e-06, "loss": -0.0476, "num_tokens": 5917428.0, "reward": 0.03617691248655319, "reward_std": 0.013296255841851234, "rewards/bleu_reward_func/mean": 0.03617691248655319, "rewards/bleu_reward_func/std": 0.01859820820391178, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 263.28125, "completions/mean_terminated_length": 227.75001525878906, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.3568, "grad_norm": 5.170389652252197, "kl": 0.03790283203125, "learning_rate": 1e-06, "loss": -0.0332, "num_tokens": 5929189.0, "reward": 0.043621551245450974, "reward_std": 0.01647448167204857, "rewards/bleu_reward_func/mean": 0.043621551245450974, "rewards/bleu_reward_func/std": 0.026312116533517838, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 391.03125, "completions/mean_terminated_length": 284.29412841796875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.3576, "grad_norm": 2.2997524738311768, "kl": 0.0362548828125, "learning_rate": 1e-06, "loss": -0.0569, "num_tokens": 5945750.0, "reward": 0.03494875133037567, "reward_std": 0.00850139930844307, "rewards/bleu_reward_func/mean": 0.03494875133037567, "rewards/bleu_reward_func/std": 0.02439236082136631, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 219.78125, "completions/mean_terminated_length": 189.55172729492188, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.3584, "grad_norm": 3.48850154876709, "kl": 0.028778076171875, "learning_rate": 1e-06, "loss": -0.0647, "num_tokens": 5954879.0, "reward": 0.08465160429477692, "reward_std": 0.0695774257183075, "rewards/bleu_reward_func/mean": 0.08465160429477692, "rewards/bleu_reward_func/std": 0.11551004648208618, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 338.8125, "completions/mean_terminated_length": 220.3157958984375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.3592, "grad_norm": 3.661801338195801, "kl": 0.0262603759765625, "learning_rate": 1e-06, "loss": -0.0604, "num_tokens": 5969169.0, "reward": 0.09182567894458771, "reward_std": 0.03519277274608612, "rewards/bleu_reward_func/mean": 0.09182567894458771, "rewards/bleu_reward_func/std": 0.08222125470638275, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 351.8125, "completions/mean_terminated_length": 314.8461608886719, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.36, "grad_norm": 2.7594478130340576, "kl": 0.02459716796875, "learning_rate": 1e-06, "loss": -0.1806, "num_tokens": 5982683.0, "reward": 0.07073526084423065, "reward_std": 0.04453439265489578, "rewards/bleu_reward_func/mean": 0.07073526084423065, "rewards/bleu_reward_func/std": 0.049988992512226105, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 234.6875, "completions/mean_terminated_length": 142.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.3608, "grad_norm": 6.336760520935059, "kl": 0.08734130859375, "learning_rate": 1e-06, "loss": -0.0034, "num_tokens": 5995337.0, "reward": 0.14889724552631378, "reward_std": 0.030763918533921242, "rewards/bleu_reward_func/mean": 0.14889724552631378, "rewards/bleu_reward_func/std": 0.18464770913124084, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 270.75, "completions/mean_terminated_length": 270.75, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.3616, "grad_norm": 2.630225419998169, "kl": 0.03399658203125, "learning_rate": 1e-06, "loss": 0.014, "num_tokens": 6006921.0, "reward": 0.02614082582294941, "reward_std": 0.010051444172859192, "rewards/bleu_reward_func/mean": 0.02614082582294941, "rewards/bleu_reward_func/std": 0.011256206780672073, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 248.375, "completions/mean_terminated_length": 248.375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.3624, "grad_norm": 3.011857032775879, "kl": 0.0301513671875, "learning_rate": 1e-06, "loss": 0.1245, "num_tokens": 6019309.0, "reward": 0.06672752648591995, "reward_std": 0.02101920172572136, "rewards/bleu_reward_func/mean": 0.06672752648591995, "rewards/bleu_reward_func/std": 0.053218722343444824, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 357.375, "completions/mean_terminated_length": 328.7407531738281, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.3632, "grad_norm": 2.4943957328796387, "kl": 0.0428466796875, "learning_rate": 1e-06, "loss": -0.1298, "num_tokens": 6032993.0, "reward": 0.04359011352062225, "reward_std": 0.016948901116847992, "rewards/bleu_reward_func/mean": 0.04359011352062225, "rewards/bleu_reward_func/std": 0.025716470554471016, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 197.15625, "completions/mean_terminated_length": 176.1666717529297, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.364, "grad_norm": 3.7868881225585938, "kl": 0.051300048828125, "learning_rate": 1e-06, "loss": 0.0982, "num_tokens": 6041718.0, "reward": 0.03436025232076645, "reward_std": 0.00970546342432499, "rewards/bleu_reward_func/mean": 0.03436025232076645, "rewards/bleu_reward_func/std": 0.01754005253314972, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 279.1875, "completions/mean_terminated_length": 214.0, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.3648, "grad_norm": 3.222365617752075, "kl": 0.051025390625, "learning_rate": 1e-06, "loss": 0.1112, "num_tokens": 6054284.0, "reward": 0.0742957666516304, "reward_std": 0.015302993357181549, "rewards/bleu_reward_func/mean": 0.0742957666516304, "rewards/bleu_reward_func/std": 0.061175521463155746, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 260.78125, "completions/mean_terminated_length": 234.79310607910156, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.3656, "grad_norm": 3.3111205101013184, "kl": 0.04364013671875, "learning_rate": 1e-06, "loss": 0.0182, "num_tokens": 6064853.0, "reward": 0.04622993618249893, "reward_std": 0.0203024260699749, "rewards/bleu_reward_func/mean": 0.04622993618249893, "rewards/bleu_reward_func/std": 0.04259706288576126, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 366.5, "completions/mean_terminated_length": 266.9473571777344, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.3664, "grad_norm": 2.6107733249664307, "kl": 0.0300750732421875, "learning_rate": 1e-06, "loss": 0.0746, "num_tokens": 6081629.0, "reward": 0.05567412078380585, "reward_std": 0.017643755301833153, "rewards/bleu_reward_func/mean": 0.05567412078380585, "rewards/bleu_reward_func/std": 0.029336081817746162, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 423.6875, "completions/mean_terminated_length": 370.70001220703125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.3672, "grad_norm": 1.9285842180252075, "kl": 0.0279541015625, "learning_rate": 1e-06, "loss": -0.0551, "num_tokens": 6099291.0, "reward": 0.05514095351099968, "reward_std": 0.02625124529004097, "rewards/bleu_reward_func/mean": 0.05514095351099968, "rewards/bleu_reward_func/std": 0.034680720418691635, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 338.125, "completions/mean_terminated_length": 247.04762268066406, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.368, "grad_norm": 3.4013428688049316, "kl": 0.038787841796875, "learning_rate": 1e-06, "loss": 0.0224, "num_tokens": 6113607.0, "reward": 0.04680419713258743, "reward_std": 0.01677008531987667, "rewards/bleu_reward_func/mean": 0.04680419713258743, "rewards/bleu_reward_func/std": 0.02986827678978443, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 419.0, "completions/mean_terminated_length": 382.60870361328125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.3688, "grad_norm": 2.1877553462982178, "kl": 0.0406494140625, "learning_rate": 1e-06, "loss": 0.0481, "num_tokens": 6129815.0, "reward": 0.061673715710639954, "reward_std": 0.01531613152474165, "rewards/bleu_reward_func/mean": 0.061673715710639954, "rewards/bleu_reward_func/std": 0.04928870499134064, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 408.25, "completions/mean_terminated_length": 373.66668701171875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.3696, "grad_norm": 2.0608067512512207, "kl": 0.028839111328125, "learning_rate": 1e-06, "loss": -0.0268, "num_tokens": 6145599.0, "reward": 0.09434099495410919, "reward_std": 0.017724918201565742, "rewards/bleu_reward_func/mean": 0.09434099495410919, "rewards/bleu_reward_func/std": 0.09700965881347656, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 267.96875, "completions/mean_terminated_length": 211.6538543701172, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3704, "grad_norm": 4.569107532501221, "kl": 0.036895751953125, "learning_rate": 1e-06, "loss": -0.1095, "num_tokens": 6158022.0, "reward": 0.03472236171364784, "reward_std": 0.011135936714708805, "rewards/bleu_reward_func/mean": 0.03472236171364784, "rewards/bleu_reward_func/std": 0.019644495099782944, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 171.03125, "completions/mean_terminated_length": 171.03125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.3712, "grad_norm": 6.893502235412598, "kl": 0.048492431640625, "learning_rate": 1e-06, "loss": -0.075, "num_tokens": 6165759.0, "reward": 0.06419667601585388, "reward_std": 0.027115123346447945, "rewards/bleu_reward_func/mean": 0.06419667601585388, "rewards/bleu_reward_func/std": 0.0568375438451767, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 213.875, "completions/mean_terminated_length": 213.875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.372, "grad_norm": 3.312126874923706, "kl": 0.0325927734375, "learning_rate": 1e-06, "loss": 0.0366, "num_tokens": 6174859.0, "reward": 0.07219819724559784, "reward_std": 0.021119076758623123, "rewards/bleu_reward_func/mean": 0.07219819724559784, "rewards/bleu_reward_func/std": 0.05505922809243202, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 298.75, "completions/mean_terminated_length": 268.2857360839844, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.3728, "grad_norm": 2.6578316688537598, "kl": 0.03704833984375, "learning_rate": 1e-06, "loss": -0.0174, "num_tokens": 6187539.0, "reward": 0.054259613156318665, "reward_std": 0.028212059289216995, "rewards/bleu_reward_func/mean": 0.054259613156318665, "rewards/bleu_reward_func/std": 0.04007524251937866, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 190.625, "completions/mean_terminated_length": 190.625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.3736, "grad_norm": 4.327213764190674, "kl": 0.059539794921875, "learning_rate": 1e-06, "loss": -0.0598, "num_tokens": 6196935.0, "reward": 0.04006721451878548, "reward_std": 0.015936415642499924, "rewards/bleu_reward_func/mean": 0.04006721451878548, "rewards/bleu_reward_func/std": 0.0206410214304924, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 358.875, "completions/mean_terminated_length": 316.0, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.3744, "grad_norm": 2.417494535446167, "kl": 0.03228759765625, "learning_rate": 1e-06, "loss": 0.0124, "num_tokens": 6210563.0, "reward": 0.04092847555875778, "reward_std": 0.019996026530861855, "rewards/bleu_reward_func/mean": 0.04092847555875778, "rewards/bleu_reward_func/std": 0.03578585386276245, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 401.75, "completions/mean_terminated_length": 370.8800048828125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.3752, "grad_norm": 2.1943817138671875, "kl": 0.03948974609375, "learning_rate": 1e-06, "loss": -0.0128, "num_tokens": 6225899.0, "reward": 0.05942702293395996, "reward_std": 0.018910693004727364, "rewards/bleu_reward_func/mean": 0.05942702293395996, "rewards/bleu_reward_func/std": 0.03152807429432869, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 339.84375, "completions/mean_terminated_length": 272.478271484375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.376, "grad_norm": 2.7955689430236816, "kl": 0.03955078125, "learning_rate": 1e-06, "loss": -0.0509, "num_tokens": 6239326.0, "reward": 0.025974374264478683, "reward_std": 0.011392309330403805, "rewards/bleu_reward_func/mean": 0.025974374264478683, "rewards/bleu_reward_func/std": 0.018641771748661995, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 217.90625, "completions/mean_terminated_length": 217.90625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.3768, "grad_norm": 4.0323405265808105, "kl": 0.0279541015625, "learning_rate": 1e-06, "loss": 0.0632, "num_tokens": 6250795.0, "reward": 0.09103134274482727, "reward_std": 0.027077559381723404, "rewards/bleu_reward_func/mean": 0.09103134274482727, "rewards/bleu_reward_func/std": 0.08849667012691498, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 393.28125, "completions/mean_terminated_length": 381.0, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.3776, "grad_norm": 2.2373785972595215, "kl": 0.03131103515625, "learning_rate": 1e-06, "loss": -0.0229, "num_tokens": 6265660.0, "reward": 0.05581410974264145, "reward_std": 0.02045728638768196, "rewards/bleu_reward_func/mean": 0.05581410974264145, "rewards/bleu_reward_func/std": 0.03612606227397919, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 347.0, "completions/mean_terminated_length": 282.4347839355469, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.3784, "grad_norm": 2.50510573387146, "kl": 0.0333251953125, "learning_rate": 1e-06, "loss": -0.1003, "num_tokens": 6281332.0, "reward": 0.09204696118831635, "reward_std": 0.03634490817785263, "rewards/bleu_reward_func/mean": 0.09204696118831635, "rewards/bleu_reward_func/std": 0.10213056951761246, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 398.46875, "completions/mean_terminated_length": 346.8636474609375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.3792, "grad_norm": 2.1126277446746826, "kl": 0.0368194580078125, "learning_rate": 1e-06, "loss": -0.0118, "num_tokens": 6297131.0, "reward": 0.07596694678068161, "reward_std": 0.023722348734736443, "rewards/bleu_reward_func/mean": 0.07596694678068161, "rewards/bleu_reward_func/std": 0.06731286644935608, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 296.59375, "completions/mean_terminated_length": 274.3103332519531, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.38, "grad_norm": 2.5251946449279785, "kl": 0.03680419921875, "learning_rate": 1e-06, "loss": 0.0783, "num_tokens": 6310086.0, "reward": 0.036592863500118256, "reward_std": 0.023251082748174667, "rewards/bleu_reward_func/mean": 0.036592863500118256, "rewards/bleu_reward_func/std": 0.03332400694489479, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 257.59375, "completions/mean_terminated_length": 257.59375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.3808, "grad_norm": 3.1550424098968506, "kl": 0.0521240234375, "learning_rate": 1e-06, "loss": -0.0125, "num_tokens": 6320209.0, "reward": 0.03460177034139633, "reward_std": 0.013276169076561928, "rewards/bleu_reward_func/mean": 0.03460177034139633, "rewards/bleu_reward_func/std": 0.021203402429819107, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 355.25, "completions/mean_terminated_length": 326.22222900390625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.3816, "grad_norm": 2.3512442111968994, "kl": 0.033203125, "learning_rate": 1e-06, "loss": -0.0415, "num_tokens": 6334825.0, "reward": 0.03142130374908447, "reward_std": 0.008906159549951553, "rewards/bleu_reward_func/mean": 0.03142130374908447, "rewards/bleu_reward_func/std": 0.013355448842048645, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 278.0, "completions/mean_terminated_length": 234.6666717529297, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.3824, "grad_norm": 3.489647150039673, "kl": 0.04071044921875, "learning_rate": 1e-06, "loss": -0.1996, "num_tokens": 6346761.0, "reward": 0.05901729688048363, "reward_std": 0.03193315863609314, "rewards/bleu_reward_func/mean": 0.05901729688048363, "rewards/bleu_reward_func/std": 0.06475787609815598, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 263.34375, "completions/mean_terminated_length": 263.34375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.3832, "grad_norm": 2.6905431747436523, "kl": 0.04022216796875, "learning_rate": 1e-06, "loss": -0.0777, "num_tokens": 6357628.0, "reward": 0.029398879036307335, "reward_std": 0.014063382521271706, "rewards/bleu_reward_func/mean": 0.029398879036307335, "rewards/bleu_reward_func/std": 0.01639050990343094, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 265.375, "completions/mean_terminated_length": 257.4193420410156, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.384, "grad_norm": 3.13386607170105, "kl": 0.046142578125, "learning_rate": 1e-06, "loss": 0.0544, "num_tokens": 6367960.0, "reward": 0.03670423477888107, "reward_std": 0.009904170408844948, "rewards/bleu_reward_func/mean": 0.03670423477888107, "rewards/bleu_reward_func/std": 0.026974406093358994, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 392.53125, "completions/mean_terminated_length": 193.4166717529297, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.3848, "grad_norm": 3.0782554149627686, "kl": 0.059326171875, "learning_rate": 1e-06, "loss": -0.1164, "num_tokens": 6384225.0, "reward": 0.06590355932712555, "reward_std": 0.018399305641651154, "rewards/bleu_reward_func/mean": 0.06590355932712555, "rewards/bleu_reward_func/std": 0.03893038630485535, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 328.4375, "completions/mean_terminated_length": 144.875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.3856, "grad_norm": 3.397596836090088, "kl": 0.03509521484375, "learning_rate": 1e-06, "loss": -0.0348, "num_tokens": 6398527.0, "reward": 0.1248546689748764, "reward_std": 0.06577208638191223, "rewards/bleu_reward_func/mean": 0.1248546689748764, "rewards/bleu_reward_func/std": 0.17896804213523865, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 248.8125, "completions/mean_terminated_length": 200.07408142089844, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.3864, "grad_norm": 5.109568119049072, "kl": 0.071258544921875, "learning_rate": 1e-06, "loss": 0.048, "num_tokens": 6408897.0, "reward": 0.04945829138159752, "reward_std": 0.028692957013845444, "rewards/bleu_reward_func/mean": 0.04945829138159752, "rewards/bleu_reward_func/std": 0.049232520163059235, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 198.5, "completions/mean_terminated_length": 188.3870849609375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.3872, "grad_norm": 3.791825532913208, "kl": 0.04644775390625, "learning_rate": 1e-06, "loss": 0.0976, "num_tokens": 6417281.0, "reward": 0.042633987963199615, "reward_std": 0.012612289749085903, "rewards/bleu_reward_func/mean": 0.042633987963199615, "rewards/bleu_reward_func/std": 0.01618749275803566, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 481.375, "completions/mean_terminated_length": 422.90911865234375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.388, "grad_norm": 2.041027069091797, "kl": 0.0343017578125, "learning_rate": 1e-06, "loss": 0.0043, "num_tokens": 6438757.0, "reward": 0.07603560388088226, "reward_std": 0.02007678709924221, "rewards/bleu_reward_func/mean": 0.07603560388088226, "rewards/bleu_reward_func/std": 0.03578682988882065, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 288.53125, "completions/mean_terminated_length": 247.1481475830078, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.3888, "grad_norm": 2.764733076095581, "kl": 0.0421142578125, "learning_rate": 1e-06, "loss": -0.0224, "num_tokens": 6450254.0, "reward": 0.04120934009552002, "reward_std": 0.020221907645463943, "rewards/bleu_reward_func/mean": 0.04120934009552002, "rewards/bleu_reward_func/std": 0.039144545793533325, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 282.8125, "completions/mean_terminated_length": 218.63999938964844, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.3896, "grad_norm": 3.3871254920959473, "kl": 0.041168212890625, "learning_rate": 1e-06, "loss": 0.0849, "num_tokens": 6461760.0, "reward": 0.03640275448560715, "reward_std": 0.011773956939578056, "rewards/bleu_reward_func/mean": 0.03640275448560715, "rewards/bleu_reward_func/std": 0.016311539337038994, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 434.0625, "completions/mean_terminated_length": 356.125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.3904, "grad_norm": 2.419614553451538, "kl": 0.037078857421875, "learning_rate": 1e-06, "loss": 0.0212, "num_tokens": 6479426.0, "reward": 0.07108810544013977, "reward_std": 0.01458063255995512, "rewards/bleu_reward_func/mean": 0.07108810544013977, "rewards/bleu_reward_func/std": 0.04704602435231209, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 158.0625, "completions/mean_terminated_length": 158.0625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.3912, "grad_norm": 4.360542297363281, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.1004, "num_tokens": 6486812.0, "reward": 0.08444488793611526, "reward_std": 0.028125371783971786, "rewards/bleu_reward_func/mean": 0.08444488793611526, "rewards/bleu_reward_func/std": 0.06024865806102753, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 228.125, "completions/mean_terminated_length": 228.125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.392, "grad_norm": 3.349405288696289, "kl": 0.049224853515625, "learning_rate": 1e-06, "loss": 0.0236, "num_tokens": 6496928.0, "reward": 0.0794411301612854, "reward_std": 0.015294745564460754, "rewards/bleu_reward_func/mean": 0.0794411301612854, "rewards/bleu_reward_func/std": 0.071323461830616, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 391.34375, "completions/mean_terminated_length": 236.21429443359375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.3928, "grad_norm": 2.2550203800201416, "kl": 0.03802490234375, "learning_rate": 1e-06, "loss": -0.0734, "num_tokens": 6517467.0, "reward": 0.10378064960241318, "reward_std": 0.05355631560087204, "rewards/bleu_reward_func/mean": 0.10378064960241318, "rewards/bleu_reward_func/std": 0.08197584748268127, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 316.25, "completions/mean_terminated_length": 261.44000244140625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.3936, "grad_norm": 2.498426914215088, "kl": 0.03643798828125, "learning_rate": 1e-06, "loss": -0.0782, "num_tokens": 6531899.0, "reward": 0.038530509918928146, "reward_std": 0.029034338891506195, "rewards/bleu_reward_func/mean": 0.038530509918928146, "rewards/bleu_reward_func/std": 0.04517725482583046, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 346.0, "completions/mean_terminated_length": 216.88888549804688, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.3944, "grad_norm": 3.5450828075408936, "kl": 0.05267333984375, "learning_rate": 1e-06, "loss": 0.0445, "num_tokens": 6546123.0, "reward": 0.03807983547449112, "reward_std": 0.012768654152750969, "rewards/bleu_reward_func/mean": 0.03807983547449112, "rewards/bleu_reward_func/std": 0.019427087157964706, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 164.375, "completions/mean_terminated_length": 128.41378784179688, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.3952, "grad_norm": 7.575675010681152, "kl": 0.067230224609375, "learning_rate": 1e-06, "loss": 0.0558, "num_tokens": 6553823.0, "reward": 0.036415085196495056, "reward_std": 0.015921277925372124, "rewards/bleu_reward_func/mean": 0.036415085196495056, "rewards/bleu_reward_func/std": 0.03126350790262222, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 306.53125, "completions/mean_terminated_length": 259.1153869628906, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.396, "grad_norm": 2.6021785736083984, "kl": 0.034881591796875, "learning_rate": 1e-06, "loss": 0.0336, "num_tokens": 6570000.0, "reward": 0.038223300129175186, "reward_std": 0.014713780023157597, "rewards/bleu_reward_func/mean": 0.038223300129175186, "rewards/bleu_reward_func/std": 0.01743900217115879, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 362.25, "completions/mean_terminated_length": 283.8095397949219, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.3968, "grad_norm": 2.632026433944702, "kl": 0.0277099609375, "learning_rate": 1e-06, "loss": 0.0039, "num_tokens": 6586440.0, "reward": 0.05412636697292328, "reward_std": 0.02242736518383026, "rewards/bleu_reward_func/mean": 0.05412636697292328, "rewards/bleu_reward_func/std": 0.03469071537256241, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 381.9375, "completions/mean_terminated_length": 338.5833435058594, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.3976, "grad_norm": 2.3472609519958496, "kl": 0.031829833984375, "learning_rate": 1e-06, "loss": -0.0066, "num_tokens": 6601902.0, "reward": 0.05178507789969444, "reward_std": 0.015618492849171162, "rewards/bleu_reward_func/mean": 0.05178507789969444, "rewards/bleu_reward_func/std": 0.020302964374423027, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 328.28125, "completions/mean_terminated_length": 256.39129638671875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.3984, "grad_norm": 3.0667872428894043, "kl": 0.04681396484375, "learning_rate": 1e-06, "loss": 0.0113, "num_tokens": 6614759.0, "reward": 0.02746613696217537, "reward_std": 0.006646636873483658, "rewards/bleu_reward_func/mean": 0.02746613696217537, "rewards/bleu_reward_func/std": 0.016086198389530182, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 188.5, "completions/mean_terminated_length": 155.03448486328125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.3992, "grad_norm": 5.149783134460449, "kl": 0.07086181640625, "learning_rate": 1e-06, "loss": 0.0702, "num_tokens": 6622999.0, "reward": 0.06750474870204926, "reward_std": 0.014433549717068672, "rewards/bleu_reward_func/mean": 0.06750474870204926, "rewards/bleu_reward_func/std": 0.05492382496595383, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 385.0625, "completions/mean_terminated_length": 273.058837890625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.4, "grad_norm": 2.2382819652557373, "kl": 0.035736083984375, "learning_rate": 1e-06, "loss": -0.0591, "num_tokens": 6637913.0, "reward": 0.07065648585557938, "reward_std": 0.03134492412209511, "rewards/bleu_reward_func/mean": 0.07065648585557938, "rewards/bleu_reward_func/std": 0.05255349352955818, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 386.0625, "completions/mean_terminated_length": 320.0952453613281, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.4008, "grad_norm": 2.6071560382843018, "kl": 0.02655029296875, "learning_rate": 1e-06, "loss": 0.0223, "num_tokens": 6653755.0, "reward": 0.03733908385038376, "reward_std": 0.021098248660564423, "rewards/bleu_reward_func/mean": 0.03733908385038376, "rewards/bleu_reward_func/std": 0.03734488785266876, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 267.09375, "completions/mean_terminated_length": 198.51998901367188, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.4016, "grad_norm": 4.854734420776367, "kl": 0.04248046875, "learning_rate": 1e-06, "loss": -0.0771, "num_tokens": 6665286.0, "reward": 0.21961082518100739, "reward_std": 0.09568939357995987, "rewards/bleu_reward_func/mean": 0.21961082518100739, "rewards/bleu_reward_func/std": 0.3429633677005768, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 275.46875, "completions/mean_terminated_length": 267.8387145996094, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.4024, "grad_norm": 3.6082074642181396, "kl": 0.033843994140625, "learning_rate": 1e-06, "loss": -0.04, "num_tokens": 6676325.0, "reward": 0.0696449875831604, "reward_std": 0.045844756066799164, "rewards/bleu_reward_func/mean": 0.0696449875831604, "rewards/bleu_reward_func/std": 0.051705408841371536, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 325.0, "completions/mean_terminated_length": 240.0, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.4032, "grad_norm": 4.330098628997803, "kl": 0.04473876953125, "learning_rate": 1e-06, "loss": -0.1325, "num_tokens": 6689805.0, "reward": 0.03944069519639015, "reward_std": 0.01668594963848591, "rewards/bleu_reward_func/mean": 0.03944069519639015, "rewards/bleu_reward_func/std": 0.017823999747633934, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 250.4375, "completions/mean_terminated_length": 163.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.404, "grad_norm": 3.294508218765259, "kl": 0.05078125, "learning_rate": 1e-06, "loss": 0.1719, "num_tokens": 6700619.0, "reward": 0.0427066832780838, "reward_std": 0.018565086647868156, "rewards/bleu_reward_func/mean": 0.0427066832780838, "rewards/bleu_reward_func/std": 0.02504456229507923, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 367.6875, "completions/mean_terminated_length": 292.0952453613281, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.4048, "grad_norm": 2.24147629737854, "kl": 0.034454345703125, "learning_rate": 1e-06, "loss": 0.1088, "num_tokens": 6717257.0, "reward": 0.034352123737335205, "reward_std": 0.007905229926109314, "rewards/bleu_reward_func/mean": 0.034352123737335205, "rewards/bleu_reward_func/std": 0.028700148686766624, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 417.5, "completions/mean_terminated_length": 360.8000183105469, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.4056, "grad_norm": 2.1727077960968018, "kl": 0.032928466796875, "learning_rate": 1e-06, "loss": -0.0452, "num_tokens": 6733721.0, "reward": 0.04472000151872635, "reward_std": 0.012819021940231323, "rewards/bleu_reward_func/mean": 0.04472000151872635, "rewards/bleu_reward_func/std": 0.020674971863627434, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 348.46875, "completions/mean_terminated_length": 262.8095397949219, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.4064, "grad_norm": 7.316678524017334, "kl": 0.035491943359375, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 6749720.0, "reward": 0.06846344470977783, "reward_std": 0.022212965413928032, "rewards/bleu_reward_func/mean": 0.06846344470977783, "rewards/bleu_reward_func/std": 0.07043828815221786, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 237.84375, "completions/mean_terminated_length": 237.84375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.4072, "grad_norm": 3.6807727813720703, "kl": 0.03271484375, "learning_rate": 1e-06, "loss": -0.0019, "num_tokens": 6759187.0, "reward": 0.044801339507102966, "reward_std": 0.01820746809244156, "rewards/bleu_reward_func/mean": 0.044801339507102966, "rewards/bleu_reward_func/std": 0.02651328034698963, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 235.0625, "completions/mean_terminated_length": 142.75, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.408, "grad_norm": 3.6571717262268066, "kl": 0.0440673828125, "learning_rate": 1e-06, "loss": -0.0074, "num_tokens": 6770805.0, "reward": 0.043380383402109146, "reward_std": 0.025585712864995003, "rewards/bleu_reward_func/mean": 0.043380383402109146, "rewards/bleu_reward_func/std": 0.04197373613715172, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 164.65625, "completions/mean_terminated_length": 164.65625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.4088, "grad_norm": 4.0850830078125, "kl": 0.0361785888671875, "learning_rate": 1e-06, "loss": -0.2266, "num_tokens": 6778194.0, "reward": 0.1240055114030838, "reward_std": 0.038006868213415146, "rewards/bleu_reward_func/mean": 0.1240055114030838, "rewards/bleu_reward_func/std": 0.1415473371744156, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 360.71875, "completions/mean_terminated_length": 318.3599853515625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.4096, "grad_norm": 2.39262056350708, "kl": 0.03729248046875, "learning_rate": 1e-06, "loss": 0.1221, "num_tokens": 6792945.0, "reward": 0.026915479451417923, "reward_std": 0.008309369906783104, "rewards/bleu_reward_func/mean": 0.026915479451417923, "rewards/bleu_reward_func/std": 0.01223670318722725, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 424.9375, "completions/mean_terminated_length": 326.2666931152344, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.4104, "grad_norm": 2.1014010906219482, "kl": 0.03375244140625, "learning_rate": 1e-06, "loss": -0.0362, "num_tokens": 6809839.0, "reward": 0.038692403584718704, "reward_std": 0.012931729666888714, "rewards/bleu_reward_func/mean": 0.038692403584718704, "rewards/bleu_reward_func/std": 0.03173365071415901, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 423.09375, "completions/mean_terminated_length": 388.3043518066406, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.4112, "grad_norm": 2.0441877841949463, "kl": 0.03924560546875, "learning_rate": 1e-06, "loss": 0.0653, "num_tokens": 6826258.0, "reward": 0.033866725862026215, "reward_std": 0.004912001546472311, "rewards/bleu_reward_func/mean": 0.033866725862026215, "rewards/bleu_reward_func/std": 0.03187695890665054, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 265.875, "completions/mean_terminated_length": 196.95999145507812, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.412, "grad_norm": 4.132297039031982, "kl": 0.05572509765625, "learning_rate": 1e-06, "loss": 0.0553, "num_tokens": 6838686.0, "reward": 0.038207922130823135, "reward_std": 0.007710086181759834, "rewards/bleu_reward_func/mean": 0.038207922130823135, "rewards/bleu_reward_func/std": 0.01941581815481186, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 369.375, "completions/mean_terminated_length": 321.8333435058594, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.4128, "grad_norm": 2.4867780208587646, "kl": 0.033538818359375, "learning_rate": 1e-06, "loss": -0.0815, "num_tokens": 6853018.0, "reward": 0.041043445467948914, "reward_std": 0.015324940904974937, "rewards/bleu_reward_func/mean": 0.041043445467948914, "rewards/bleu_reward_func/std": 0.028135672211647034, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 158.375, "completions/mean_terminated_length": 158.375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.4136, "grad_norm": 5.145825386047363, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": -0.0586, "num_tokens": 6862070.0, "reward": 0.049210622906684875, "reward_std": 0.017335664480924606, "rewards/bleu_reward_func/mean": 0.049210622906684875, "rewards/bleu_reward_func/std": 0.02346484549343586, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 397.34375, "completions/mean_terminated_length": 296.1764831542969, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.4144, "grad_norm": 3.008666515350342, "kl": 0.0364837646484375, "learning_rate": 1e-06, "loss": -0.1012, "num_tokens": 6880193.0, "reward": 0.038832880556583405, "reward_std": 0.012767771258950233, "rewards/bleu_reward_func/mean": 0.038832880556583405, "rewards/bleu_reward_func/std": 0.03091544844210148, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 260.6875, "completions/mean_terminated_length": 190.3199920654297, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.4152, "grad_norm": 2.8610570430755615, "kl": 0.051849365234375, "learning_rate": 1e-06, "loss": -0.021, "num_tokens": 6891911.0, "reward": 0.03679898753762245, "reward_std": 0.017421672120690346, "rewards/bleu_reward_func/mean": 0.03679898753762245, "rewards/bleu_reward_func/std": 0.030264802277088165, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 448.09375, "completions/mean_terminated_length": 256.375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.416, "grad_norm": 2.3569674491882324, "kl": 0.040191650390625, "learning_rate": 1e-06, "loss": 0.16, "num_tokens": 6909442.0, "reward": 0.04537253454327583, "reward_std": 0.026473576202988625, "rewards/bleu_reward_func/mean": 0.04537253454327583, "rewards/bleu_reward_func/std": 0.03027988225221634, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 399.28125, "completions/mean_terminated_length": 299.8235168457031, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.4168, "grad_norm": 2.596334934234619, "kl": 0.047119140625, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 6925683.0, "reward": 0.04248907417058945, "reward_std": 0.008885795250535011, "rewards/bleu_reward_func/mean": 0.04248907417058945, "rewards/bleu_reward_func/std": 0.027150554582476616, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 224.5, "completions/mean_terminated_length": 215.22579956054688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4176, "grad_norm": 4.284977912902832, "kl": 0.056060791015625, "learning_rate": 1e-06, "loss": 0.0544, "num_tokens": 6934955.0, "reward": 0.031697362661361694, "reward_std": 0.01221911795437336, "rewards/bleu_reward_func/mean": 0.031697362661361694, "rewards/bleu_reward_func/std": 0.014145019464194775, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 260.1875, "completions/mean_terminated_length": 243.40000915527344, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.4184, "grad_norm": 2.888882637023926, "kl": 0.05804443359375, "learning_rate": 1e-06, "loss": -0.0438, "num_tokens": 6945593.0, "reward": 0.041664689779281616, "reward_std": 0.013896044343709946, "rewards/bleu_reward_func/mean": 0.041664689779281616, "rewards/bleu_reward_func/std": 0.024034207686781883, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 349.59375, "completions/mean_terminated_length": 286.0434875488281, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.4192, "grad_norm": 3.398181676864624, "kl": 0.03240966796875, "learning_rate": 1e-06, "loss": -0.1082, "num_tokens": 6959404.0, "reward": 0.07310491800308228, "reward_std": 0.0231708325445652, "rewards/bleu_reward_func/mean": 0.07310491800308228, "rewards/bleu_reward_func/std": 0.04921337589621544, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 317.5, "completions/mean_terminated_length": 297.3793029785156, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.42, "grad_norm": 3.1394002437591553, "kl": 0.049713134765625, "learning_rate": 1e-06, "loss": -0.0133, "num_tokens": 6971580.0, "reward": 0.07620484381914139, "reward_std": 0.029079508036375046, "rewards/bleu_reward_func/mean": 0.07620484381914139, "rewards/bleu_reward_func/std": 0.055587053298950195, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 297.6875, "completions/mean_terminated_length": 226.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.4208, "grad_norm": 15.365631103515625, "kl": 0.191741943359375, "learning_rate": 1e-06, "loss": -0.006, "num_tokens": 6985578.0, "reward": 0.0780831128358841, "reward_std": 0.018097946420311928, "rewards/bleu_reward_func/mean": 0.0780831128358841, "rewards/bleu_reward_func/std": 0.09850489348173141, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 369.125, "completions/mean_terminated_length": 348.71429443359375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.4216, "grad_norm": 2.0799665451049805, "kl": 0.03955078125, "learning_rate": 1e-06, "loss": 0.0467, "num_tokens": 7000926.0, "reward": 0.05712277069687843, "reward_std": 0.02389085479080677, "rewards/bleu_reward_func/mean": 0.05712277069687843, "rewards/bleu_reward_func/std": 0.04602767527103424, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 234.40625, "completions/mean_terminated_length": 141.875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.4224, "grad_norm": 3.6912519931793213, "kl": 0.0577392578125, "learning_rate": 1e-06, "loss": -0.0821, "num_tokens": 7011011.0, "reward": 0.02966947853565216, "reward_std": 0.009855142794549465, "rewards/bleu_reward_func/mean": 0.02966947853565216, "rewards/bleu_reward_func/std": 0.012489824555814266, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 345.40625, "completions/mean_terminated_length": 334.3000183105469, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.4232, "grad_norm": 2.465789794921875, "kl": 0.032958984375, "learning_rate": 1e-06, "loss": 0.1264, "num_tokens": 7026168.0, "reward": 0.10139614343643188, "reward_std": 0.04301796853542328, "rewards/bleu_reward_func/mean": 0.10139614343643188, "rewards/bleu_reward_func/std": 0.12598717212677002, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 243.78125, "completions/mean_terminated_length": 205.46429443359375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.424, "grad_norm": 4.074453830718994, "kl": 0.038360595703125, "learning_rate": 1e-06, "loss": 0.004, "num_tokens": 7036849.0, "reward": 0.07757672667503357, "reward_std": 0.02031567506492138, "rewards/bleu_reward_func/mean": 0.07757672667503357, "rewards/bleu_reward_func/std": 0.06997023522853851, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 218.25, "completions/mean_terminated_length": 187.86207580566406, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.4248, "grad_norm": 3.842729091644287, "kl": 0.041717529296875, "learning_rate": 1e-06, "loss": -0.0248, "num_tokens": 7046225.0, "reward": 0.03865049406886101, "reward_std": 0.01612503081560135, "rewards/bleu_reward_func/mean": 0.03865049406886101, "rewards/bleu_reward_func/std": 0.02242407016456127, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 461.375, "completions/mean_terminated_length": 387.3846435546875, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.4256, "grad_norm": 2.2066099643707275, "kl": 0.036346435546875, "learning_rate": 1e-06, "loss": 0.0395, "num_tokens": 7065165.0, "reward": 0.036675065755844116, "reward_std": 0.009393100626766682, "rewards/bleu_reward_func/mean": 0.036675065755844116, "rewards/bleu_reward_func/std": 0.021476779133081436, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 325.5625, "completions/mean_terminated_length": 263.41668701171875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.4264, "grad_norm": 3.2754671573638916, "kl": 0.045806884765625, "learning_rate": 1e-06, "loss": -0.0558, "num_tokens": 7078359.0, "reward": 0.06816762685775757, "reward_std": 0.016703680157661438, "rewards/bleu_reward_func/mean": 0.06816762685775757, "rewards/bleu_reward_func/std": 0.04132222384214401, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 383.0625, "completions/mean_terminated_length": 294.84210205078125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.4272, "grad_norm": 2.0327775478363037, "kl": 0.0302734375, "learning_rate": 1e-06, "loss": -0.0255, "num_tokens": 7092673.0, "reward": 0.10351169109344482, "reward_std": 0.02997823804616928, "rewards/bleu_reward_func/mean": 0.10351169109344482, "rewards/bleu_reward_func/std": 0.1509554237127304, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 302.125, "completions/mean_terminated_length": 232.1666717529297, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.428, "grad_norm": 3.195488929748535, "kl": 0.041168212890625, "learning_rate": 1e-06, "loss": -0.1486, "num_tokens": 7104781.0, "reward": 0.04248940944671631, "reward_std": 0.01792888715863228, "rewards/bleu_reward_func/mean": 0.04248940944671631, "rewards/bleu_reward_func/std": 0.02808193489909172, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 341.84375, "completions/mean_terminated_length": 275.2608642578125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.4288, "grad_norm": 2.6887435913085938, "kl": 0.03656005859375, "learning_rate": 1e-06, "loss": -0.1183, "num_tokens": 7118416.0, "reward": 0.07263948023319244, "reward_std": 0.02492811344563961, "rewards/bleu_reward_func/mean": 0.07263948023319244, "rewards/bleu_reward_func/std": 0.089384526014328, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 388.6875, "completions/mean_terminated_length": 265.375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.4296, "grad_norm": 2.2471821308135986, "kl": 0.033721923828125, "learning_rate": 1e-06, "loss": 0.0593, "num_tokens": 7133742.0, "reward": 0.01961388997733593, "reward_std": 0.005338278133422136, "rewards/bleu_reward_func/mean": 0.01961388997733593, "rewards/bleu_reward_func/std": 0.009376008063554764, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 268.25, "completions/mean_terminated_length": 223.11111450195312, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.4304, "grad_norm": 4.410032749176025, "kl": 0.03692626953125, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 7149422.0, "reward": 0.05439123511314392, "reward_std": 0.02193494513630867, "rewards/bleu_reward_func/mean": 0.05439123511314392, "rewards/bleu_reward_func/std": 0.05751095712184906, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 314.5, "completions/mean_terminated_length": 117.0, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.4312, "grad_norm": 3.9528393745422363, "kl": 0.04327392578125, "learning_rate": 1e-06, "loss": 0.0183, "num_tokens": 7165182.0, "reward": 0.11979202926158905, "reward_std": 0.029252737760543823, "rewards/bleu_reward_func/mean": 0.11979202926158905, "rewards/bleu_reward_func/std": 0.05838814005255699, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 348.46875, "completions/mean_terminated_length": 310.73077392578125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.432, "grad_norm": 2.450528383255005, "kl": 0.037353515625, "learning_rate": 1e-06, "loss": -0.0731, "num_tokens": 7178621.0, "reward": 0.07015785574913025, "reward_std": 0.013684559613466263, "rewards/bleu_reward_func/mean": 0.07015785574913025, "rewards/bleu_reward_func/std": 0.08590352535247803, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 248.15625, "completions/mean_terminated_length": 199.29629516601562, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.4328, "grad_norm": 4.424651145935059, "kl": 0.05096435546875, "learning_rate": 1e-06, "loss": -0.1833, "num_tokens": 7189602.0, "reward": 0.05476554483175278, "reward_std": 0.02339433878660202, "rewards/bleu_reward_func/mean": 0.05476554483175278, "rewards/bleu_reward_func/std": 0.03689594194293022, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 225.46875, "completions/mean_terminated_length": 225.46875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.4336, "grad_norm": 4.021416664123535, "kl": 0.05731201171875, "learning_rate": 1e-06, "loss": -0.0843, "num_tokens": 7199641.0, "reward": 0.057144373655319214, "reward_std": 0.01742716133594513, "rewards/bleu_reward_func/mean": 0.057144373655319214, "rewards/bleu_reward_func/std": 0.041288405656814575, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 400.3125, "completions/mean_terminated_length": 301.76470947265625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.4344, "grad_norm": 2.6051321029663086, "kl": 0.033721923828125, "learning_rate": 1e-06, "loss": -0.054, "num_tokens": 7214963.0, "reward": 0.045022256672382355, "reward_std": 0.013825424946844578, "rewards/bleu_reward_func/mean": 0.045022256672382355, "rewards/bleu_reward_func/std": 0.021449485793709755, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 245.75, "completions/mean_terminated_length": 171.1999969482422, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.4352, "grad_norm": 5.376564979553223, "kl": 0.0401611328125, "learning_rate": 1e-06, "loss": 0.0519, "num_tokens": 7225603.0, "reward": 0.25259390473365784, "reward_std": 0.10001413524150848, "rewards/bleu_reward_func/mean": 0.25259390473365784, "rewards/bleu_reward_func/std": 0.2557314336299896, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 380.8125, "completions/mean_terminated_length": 278.77777099609375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.436, "grad_norm": 3.3095123767852783, "kl": 0.03887939453125, "learning_rate": 1e-06, "loss": -0.0544, "num_tokens": 7241525.0, "reward": 0.05672682821750641, "reward_std": 0.021520383656024933, "rewards/bleu_reward_func/mean": 0.05672682821750641, "rewards/bleu_reward_func/std": 0.04318132996559143, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 145.125, "completions/mean_terminated_length": 145.125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.4368, "grad_norm": 4.073734760284424, "kl": 0.0557861328125, "learning_rate": 1e-06, "loss": -0.1033, "num_tokens": 7255145.0, "reward": 0.09350405633449554, "reward_std": 0.03147149458527565, "rewards/bleu_reward_func/mean": 0.09350405633449554, "rewards/bleu_reward_func/std": 0.10825508832931519, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 333.125, "completions/mean_terminated_length": 291.8461608886719, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.4376, "grad_norm": 3.2068068981170654, "kl": 0.03753662109375, "learning_rate": 1e-06, "loss": -0.1119, "num_tokens": 7269005.0, "reward": 0.03155931830406189, "reward_std": 0.018099911510944366, "rewards/bleu_reward_func/mean": 0.03155931830406189, "rewards/bleu_reward_func/std": 0.02396266907453537, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 327.8125, "completions/mean_terminated_length": 301.5, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.4384, "grad_norm": 3.2204573154449463, "kl": 0.041412353515625, "learning_rate": 1e-06, "loss": -0.0929, "num_tokens": 7285751.0, "reward": 0.031415294855833054, "reward_std": 0.011055306531488895, "rewards/bleu_reward_func/mean": 0.031415294855833054, "rewards/bleu_reward_func/std": 0.025729909539222717, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 274.90625, "completions/mean_terminated_length": 182.13043212890625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.4392, "grad_norm": 4.724465847015381, "kl": 0.041778564453125, "learning_rate": 1e-06, "loss": 0.132, "num_tokens": 7297668.0, "reward": 0.11313501000404358, "reward_std": 0.036550864577293396, "rewards/bleu_reward_func/mean": 0.11313501000404358, "rewards/bleu_reward_func/std": 0.048100944608449936, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 273.28125, "completions/mean_terminated_length": 193.70834350585938, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.44, "grad_norm": 3.449267625808716, "kl": 0.0501708984375, "learning_rate": 1e-06, "loss": -0.0538, "num_tokens": 7308973.0, "reward": 0.0663086399435997, "reward_std": 0.025801170617341995, "rewards/bleu_reward_func/mean": 0.0663086399435997, "rewards/bleu_reward_func/std": 0.05170860141515732, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 315.0625, "completions/mean_terminated_length": 249.4166717529297, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.4408, "grad_norm": 2.530473232269287, "kl": 0.04833984375, "learning_rate": 1e-06, "loss": 0.1201, "num_tokens": 7322271.0, "reward": 0.07075276970863342, "reward_std": 0.02442948892712593, "rewards/bleu_reward_func/mean": 0.07075276970863342, "rewards/bleu_reward_func/std": 0.027710873633623123, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 342.28125, "completions/mean_terminated_length": 172.5625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.4416, "grad_norm": 3.0482726097106934, "kl": 0.04998779296875, "learning_rate": 1e-06, "loss": -0.0208, "num_tokens": 7336592.0, "reward": 0.03471437096595764, "reward_std": 0.006943271495401859, "rewards/bleu_reward_func/mean": 0.03471437096595764, "rewards/bleu_reward_func/std": 0.018991071730852127, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 266.75, "completions/mean_terminated_length": 198.0800018310547, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.4424, "grad_norm": 4.837971210479736, "kl": 0.05224609375, "learning_rate": 1e-06, "loss": -0.027, "num_tokens": 7353232.0, "reward": 0.09570033848285675, "reward_std": 0.023528877645730972, "rewards/bleu_reward_func/mean": 0.09570033848285675, "rewards/bleu_reward_func/std": 0.0557950884103775, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 229.15625, "completions/mean_terminated_length": 188.75001525878906, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.4432, "grad_norm": 6.145666122436523, "kl": 0.04083251953125, "learning_rate": 1e-06, "loss": 0.1587, "num_tokens": 7363293.0, "reward": 0.15402229130268097, "reward_std": 0.060593266040086746, "rewards/bleu_reward_func/mean": 0.15402229130268097, "rewards/bleu_reward_func/std": 0.21718958020210266, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 431.96875, "completions/mean_terminated_length": 329.0714416503906, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.444, "grad_norm": 2.2111520767211914, "kl": 0.04241943359375, "learning_rate": 1e-06, "loss": -0.0774, "num_tokens": 7381116.0, "reward": 0.09687276929616928, "reward_std": 0.020684881135821342, "rewards/bleu_reward_func/mean": 0.09687276929616928, "rewards/bleu_reward_func/std": 0.0794241800904274, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 290.5, "completions/mean_terminated_length": 283.3548278808594, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.4448, "grad_norm": 3.065490484237671, "kl": 0.03900146484375, "learning_rate": 1e-06, "loss": -0.1863, "num_tokens": 7392580.0, "reward": 0.07492414861917496, "reward_std": 0.026103414595127106, "rewards/bleu_reward_func/mean": 0.07492414861917496, "rewards/bleu_reward_func/std": 0.0615268349647522, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 288.875, "completions/mean_terminated_length": 281.6773986816406, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.4456, "grad_norm": 3.6267192363739014, "kl": 0.03515625, "learning_rate": 1e-06, "loss": -0.1178, "num_tokens": 7407360.0, "reward": 0.051946789026260376, "reward_std": 0.01748417690396309, "rewards/bleu_reward_func/mean": 0.051946789026260376, "rewards/bleu_reward_func/std": 0.05551343038678169, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 358.4375, "completions/mean_terminated_length": 204.875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.4464, "grad_norm": 2.4198813438415527, "kl": 0.0382080078125, "learning_rate": 1e-06, "loss": 0.0175, "num_tokens": 7423862.0, "reward": 0.04432545229792595, "reward_std": 0.016445258632302284, "rewards/bleu_reward_func/mean": 0.04432545229792595, "rewards/bleu_reward_func/std": 0.027678146958351135, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 431.375, "completions/mean_terminated_length": 360.23529052734375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.4472, "grad_norm": 2.404078722000122, "kl": 0.039306640625, "learning_rate": 1e-06, "loss": -0.0133, "num_tokens": 7440074.0, "reward": 0.03491047024726868, "reward_std": 0.017125248908996582, "rewards/bleu_reward_func/mean": 0.03491047024726868, "rewards/bleu_reward_func/std": 0.03506564348936081, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 376.21875, "completions/mean_terminated_length": 240.4375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.448, "grad_norm": 3.0429000854492188, "kl": 0.054931640625, "learning_rate": 1e-06, "loss": -0.1107, "num_tokens": 7454921.0, "reward": 0.07136739790439606, "reward_std": 0.015603035688400269, "rewards/bleu_reward_func/mean": 0.07136739790439606, "rewards/bleu_reward_func/std": 0.06946107745170593, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 266.6875, "completions/mean_terminated_length": 266.6875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.4488, "grad_norm": 2.670884609222412, "kl": 0.0489501953125, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 7465999.0, "reward": 0.06271904706954956, "reward_std": 0.02406102605164051, "rewards/bleu_reward_func/mean": 0.06271904706954956, "rewards/bleu_reward_func/std": 0.04184677079319954, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 512.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 403.25, "completions/mean_terminated_length": 244.30770874023438, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.4496, "grad_norm": 2.7331299781799316, "kl": 0.03823089599609375, "learning_rate": 1e-06, "loss": 0.1477, "num_tokens": 7481983.0, "reward": 0.059584274888038635, "reward_std": 0.0577334426343441, "rewards/bleu_reward_func/mean": 0.059584274888038635, "rewards/bleu_reward_func/std": 0.1189492866396904, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 412.15625, "completions/mean_terminated_length": 393.6666564941406, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.4504, "grad_norm": 2.2429537773132324, "kl": 0.037750244140625, "learning_rate": 1e-06, "loss": 0.013, "num_tokens": 7497380.0, "reward": 0.061556171625852585, "reward_std": 0.018194040283560753, "rewards/bleu_reward_func/mean": 0.061556171625852585, "rewards/bleu_reward_func/std": 0.044673360884189606, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 418.75, "completions/mean_terminated_length": 346.22222900390625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.4512, "grad_norm": 2.5125789642333984, "kl": 0.0450439453125, "learning_rate": 1e-06, "loss": 0.0744, "num_tokens": 7514652.0, "reward": 0.09208458662033081, "reward_std": 0.031232329085469246, "rewards/bleu_reward_func/mean": 0.09208458662033081, "rewards/bleu_reward_func/std": 0.10837023705244064, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 323.96875, "completions/mean_terminated_length": 261.29168701171875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.452, "grad_norm": 2.9214463233947754, "kl": 0.048583984375, "learning_rate": 1e-06, "loss": -0.0177, "num_tokens": 7527531.0, "reward": 0.0327458456158638, "reward_std": 0.007936608046293259, "rewards/bleu_reward_func/mean": 0.0327458456158638, "rewards/bleu_reward_func/std": 0.023914847522974014, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 380.90625, "completions/mean_terminated_length": 312.23809814453125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.4528, "grad_norm": 2.2859840393066406, "kl": 0.045196533203125, "learning_rate": 1e-06, "loss": -0.0492, "num_tokens": 7542848.0, "reward": 0.07620274275541306, "reward_std": 0.027277415618300438, "rewards/bleu_reward_func/mean": 0.07620274275541306, "rewards/bleu_reward_func/std": 0.043575797230005264, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 295.84375, "completions/mean_terminated_length": 288.8709716796875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.4536, "grad_norm": 2.6817574501037598, "kl": 0.035888671875, "learning_rate": 1e-06, "loss": 0.0675, "num_tokens": 7554347.0, "reward": 0.04382229968905449, "reward_std": 0.016639089211821556, "rewards/bleu_reward_func/mean": 0.04382229968905449, "rewards/bleu_reward_func/std": 0.048808448016643524, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 396.875, "completions/mean_terminated_length": 143.60000610351562, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.4544, "grad_norm": 3.5614659786224365, "kl": 0.03765869140625, "learning_rate": 1e-06, "loss": -0.1146, "num_tokens": 7571287.0, "reward": 0.047724343836307526, "reward_std": 0.025788918137550354, "rewards/bleu_reward_func/mean": 0.047724343836307526, "rewards/bleu_reward_func/std": 0.0531436912715435, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 324.8125, "completions/mean_terminated_length": 281.6153869628906, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.4552, "grad_norm": 2.7155921459198, "kl": 0.031982421875, "learning_rate": 1e-06, "loss": 0.0584, "num_tokens": 7583833.0, "reward": 0.10536953061819077, "reward_std": 0.01935265213251114, "rewards/bleu_reward_func/mean": 0.10536953061819077, "rewards/bleu_reward_func/std": 0.1391269862651825, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 389.125, "completions/mean_terminated_length": 280.70587158203125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.456, "grad_norm": 3.139817953109741, "kl": 0.045562744140625, "learning_rate": 1e-06, "loss": 0.1345, "num_tokens": 7600669.0, "reward": 0.03381893038749695, "reward_std": 0.016742901876568794, "rewards/bleu_reward_func/mean": 0.03381893038749695, "rewards/bleu_reward_func/std": 0.02772444114089012, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 336.28125, "completions/mean_terminated_length": 244.23809814453125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.4568, "grad_norm": 3.0243892669677734, "kl": 0.043701171875, "learning_rate": 1e-06, "loss": 0.066, "num_tokens": 7613814.0, "reward": 0.04240579158067703, "reward_std": 0.01026132982224226, "rewards/bleu_reward_func/mean": 0.04240579158067703, "rewards/bleu_reward_func/std": 0.03058365173637867, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 208.8125, "completions/mean_terminated_length": 199.03225708007812, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.4576, "grad_norm": 5.602510929107666, "kl": 0.04998779296875, "learning_rate": 1e-06, "loss": 0.0459, "num_tokens": 7623040.0, "reward": 0.08647982776165009, "reward_std": 0.03401945158839226, "rewards/bleu_reward_func/mean": 0.08647982776165009, "rewards/bleu_reward_func/std": 0.06481946259737015, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 394.9375, "completions/mean_terminated_length": 314.84210205078125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.4584, "grad_norm": 2.113441228866577, "kl": 0.0284423828125, "learning_rate": 1e-06, "loss": -0.1758, "num_tokens": 7641758.0, "reward": 0.05996987968683243, "reward_std": 0.03625640273094177, "rewards/bleu_reward_func/mean": 0.05996987968683243, "rewards/bleu_reward_func/std": 0.07776294648647308, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 338.125, "completions/mean_terminated_length": 289.44000244140625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.4592, "grad_norm": 3.0386271476745605, "kl": 0.03668212890625, "learning_rate": 1e-06, "loss": 0.0939, "num_tokens": 7655066.0, "reward": 0.04174087196588516, "reward_std": 0.01017804816365242, "rewards/bleu_reward_func/mean": 0.04174087196588516, "rewards/bleu_reward_func/std": 0.011822175234556198, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 361.4375, "completions/mean_terminated_length": 258.4210510253906, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.46, "grad_norm": 3.434180498123169, "kl": 0.035400390625, "learning_rate": 1e-06, "loss": 0.0801, "num_tokens": 7670080.0, "reward": 0.061414189636707306, "reward_std": 0.017353273928165436, "rewards/bleu_reward_func/mean": 0.061414189636707306, "rewards/bleu_reward_func/std": 0.06352873891592026, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 359.25, "completions/mean_terminated_length": 289.81817626953125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4608, "grad_norm": 3.759580612182617, "kl": 0.046783447265625, "learning_rate": 1e-06, "loss": 0.0683, "num_tokens": 7684952.0, "reward": 0.03925281763076782, "reward_std": 0.01383259054273367, "rewards/bleu_reward_func/mean": 0.03925281763076782, "rewards/bleu_reward_func/std": 0.026625417172908783, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 512.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 463.9375, "completions/mean_terminated_length": 341.1111145019531, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.4616, "grad_norm": 2.7189910411834717, "kl": 0.037567138671875, "learning_rate": 1e-06, "loss": 0.0284, "num_tokens": 7704214.0, "reward": 0.05996118485927582, "reward_std": 0.011609978042542934, "rewards/bleu_reward_func/mean": 0.05996118485927582, "rewards/bleu_reward_func/std": 0.016379138454794884, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 316.84375, "completions/mean_terminated_length": 296.6551818847656, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.4624, "grad_norm": 3.2235851287841797, "kl": 0.0435791015625, "learning_rate": 1e-06, "loss": 0.0329, "num_tokens": 7718913.0, "reward": 0.046588048338890076, "reward_std": 0.01663251593708992, "rewards/bleu_reward_func/mean": 0.046588048338890076, "rewards/bleu_reward_func/std": 0.027757668867707253, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 347.71875, "completions/mean_terminated_length": 219.94444274902344, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.4632, "grad_norm": 7.463287353515625, "kl": 0.03564453125, "learning_rate": 1e-06, "loss": -0.2157, "num_tokens": 7732912.0, "reward": 0.046244870871305466, "reward_std": 0.029473457485437393, "rewards/bleu_reward_func/mean": 0.046244870871305466, "rewards/bleu_reward_func/std": 0.047696422785520554, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 329.21875, "completions/mean_terminated_length": 295.370361328125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.464, "grad_norm": 3.48012375831604, "kl": 0.058197021484375, "learning_rate": 1e-06, "loss": -0.0137, "num_tokens": 7746103.0, "reward": 0.08398178219795227, "reward_std": 0.02495785802602768, "rewards/bleu_reward_func/mean": 0.08398178219795227, "rewards/bleu_reward_func/std": 0.042412761598825455, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 266.21875, "completions/mean_terminated_length": 118.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.4648, "grad_norm": 4.121673107147217, "kl": 0.0460205078125, "learning_rate": 1e-06, "loss": 0.0792, "num_tokens": 7756350.0, "reward": 0.05314105004072189, "reward_std": 0.025027906522154808, "rewards/bleu_reward_func/mean": 0.05314105004072189, "rewards/bleu_reward_func/std": 0.0441046804189682, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 312.0, "completions/mean_terminated_length": 233.7391357421875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.4656, "grad_norm": 4.499547004699707, "kl": 0.037322998046875, "learning_rate": 1e-06, "loss": -0.037, "num_tokens": 7769326.0, "reward": 0.03417757526040077, "reward_std": 0.014021034352481365, "rewards/bleu_reward_func/mean": 0.03417757526040077, "rewards/bleu_reward_func/std": 0.020486222580075264, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 211.9375, "completions/mean_terminated_length": 191.933349609375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.4664, "grad_norm": 3.6834604740142822, "kl": 0.05157470703125, "learning_rate": 1e-06, "loss": -0.0825, "num_tokens": 7781164.0, "reward": 0.03835342079401016, "reward_std": 0.012080431915819645, "rewards/bleu_reward_func/mean": 0.03835342079401016, "rewards/bleu_reward_func/std": 0.03473048284649849, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 377.78125, "completions/mean_terminated_length": 340.1999816894531, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.4672, "grad_norm": 2.79353666305542, "kl": 0.0355224609375, "learning_rate": 1e-06, "loss": 0.0312, "num_tokens": 7796253.0, "reward": 0.053808994591236115, "reward_std": 0.011477467603981495, "rewards/bleu_reward_func/mean": 0.053808994591236115, "rewards/bleu_reward_func/std": 0.026024699211120605, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 308.5625, "completions/mean_terminated_length": 287.5172424316406, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.468, "grad_norm": 2.750420570373535, "kl": 0.04071044921875, "learning_rate": 1e-06, "loss": -0.1319, "num_tokens": 7809199.0, "reward": 0.033281125128269196, "reward_std": 0.0192633755505085, "rewards/bleu_reward_func/mean": 0.033281125128269196, "rewards/bleu_reward_func/std": 0.02813081629574299, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 512.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 424.75, "completions/mean_terminated_length": 163.0, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.4688, "grad_norm": 3.282240152359009, "kl": 0.05096435546875, "learning_rate": 1e-06, "loss": -0.057, "num_tokens": 7825903.0, "reward": 0.03255011513829231, "reward_std": 0.009569083340466022, "rewards/bleu_reward_func/mean": 0.03255011513829231, "rewards/bleu_reward_func/std": 0.025185568258166313, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 283.75, "completions/mean_terminated_length": 268.5333557128906, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.4696, "grad_norm": 3.6260647773742676, "kl": 0.03955078125, "learning_rate": 1e-06, "loss": -0.18, "num_tokens": 7837039.0, "reward": 0.0632857158780098, "reward_std": 0.03276119753718376, "rewards/bleu_reward_func/mean": 0.0632857158780098, "rewards/bleu_reward_func/std": 0.049753375351428986, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 310.625, "completions/mean_terminated_length": 310.625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.4704, "grad_norm": 2.673442840576172, "kl": 0.037353515625, "learning_rate": 1e-06, "loss": 0.0266, "num_tokens": 7848931.0, "reward": 0.04075375944375992, "reward_std": 0.01151657197624445, "rewards/bleu_reward_func/mean": 0.04075375944375992, "rewards/bleu_reward_func/std": 0.02330603078007698, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 306.9375, "completions/mean_terminated_length": 277.64288330078125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.4712, "grad_norm": 4.481053829193115, "kl": 0.046234130859375, "learning_rate": 1e-06, "loss": 0.0059, "num_tokens": 7862305.0, "reward": 0.05103903263807297, "reward_std": 0.01719430461525917, "rewards/bleu_reward_func/mean": 0.05103903263807297, "rewards/bleu_reward_func/std": 0.05009397119283676, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 282.0625, "completions/mean_terminated_length": 239.48147583007812, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.472, "grad_norm": 2.702883243560791, "kl": 0.038818359375, "learning_rate": 1e-06, "loss": 0.0339, "num_tokens": 7873635.0, "reward": 0.043863605707883835, "reward_std": 0.01219463162124157, "rewards/bleu_reward_func/mean": 0.043863605707883835, "rewards/bleu_reward_func/std": 0.02272559143602848, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 380.5625, "completions/mean_terminated_length": 311.71429443359375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.4728, "grad_norm": 3.459629535675049, "kl": 0.030670166015625, "learning_rate": 1e-06, "loss": 0.0118, "num_tokens": 7888205.0, "reward": 0.05964861810207367, "reward_std": 0.014393117278814316, "rewards/bleu_reward_func/mean": 0.05964861810207367, "rewards/bleu_reward_func/std": 0.030775554478168488, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 208.875, "completions/mean_terminated_length": 152.74073791503906, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.4736, "grad_norm": 3.997481346130371, "kl": 0.053131103515625, "learning_rate": 1e-06, "loss": -0.0981, "num_tokens": 7896961.0, "reward": 0.09768746048212051, "reward_std": 0.06965920329093933, "rewards/bleu_reward_func/mean": 0.09768746048212051, "rewards/bleu_reward_func/std": 0.09702237695455551, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 365.0, "completions/mean_terminated_length": 264.4210510253906, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.4744, "grad_norm": 2.9996702671051025, "kl": 0.04754638671875, "learning_rate": 1e-06, "loss": -0.0128, "num_tokens": 7911913.0, "reward": 0.03393930196762085, "reward_std": 0.011044314131140709, "rewards/bleu_reward_func/mean": 0.03393930196762085, "rewards/bleu_reward_func/std": 0.02244512550532818, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 398.4375, "completions/mean_terminated_length": 360.5833435058594, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.4752, "grad_norm": 2.3302993774414062, "kl": 0.027374267578125, "learning_rate": 1e-06, "loss": 0.003, "num_tokens": 7926615.0, "reward": 0.055141009390354156, "reward_std": 0.03131604939699173, "rewards/bleu_reward_func/mean": 0.055141009390354156, "rewards/bleu_reward_func/std": 0.03603653982281685, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 345.96875, "completions/mean_terminated_length": 290.625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.476, "grad_norm": 2.3898162841796875, "kl": 0.03863525390625, "learning_rate": 1e-06, "loss": -0.0262, "num_tokens": 7942582.0, "reward": 0.09485931694507599, "reward_std": 0.0281388983130455, "rewards/bleu_reward_func/mean": 0.09485931694507599, "rewards/bleu_reward_func/std": 0.09958592057228088, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 447.4375, "completions/mean_terminated_length": 390.4705810546875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.4768, "grad_norm": 2.3441829681396484, "kl": 0.0261077880859375, "learning_rate": 1e-06, "loss": -0.0278, "num_tokens": 7960692.0, "reward": 0.0386398509144783, "reward_std": 0.015945829451084137, "rewards/bleu_reward_func/mean": 0.0386398509144783, "rewards/bleu_reward_func/std": 0.030894169583916664, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 231.21875, "completions/mean_terminated_length": 231.21875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.4776, "grad_norm": 3.6054701805114746, "kl": 0.04876708984375, "learning_rate": 1e-06, "loss": 0.1195, "num_tokens": 7971027.0, "reward": 0.060304559767246246, "reward_std": 0.019354872405529022, "rewards/bleu_reward_func/mean": 0.060304559767246246, "rewards/bleu_reward_func/std": 0.02801518701016903, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 247.40625, "completions/mean_terminated_length": 247.40625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.4784, "grad_norm": 3.131502628326416, "kl": 0.036651611328125, "learning_rate": 1e-06, "loss": 0.0377, "num_tokens": 7981976.0, "reward": 0.04672680422663689, "reward_std": 0.014762789011001587, "rewards/bleu_reward_func/mean": 0.04672680422663689, "rewards/bleu_reward_func/std": 0.02195819653570652, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 417.25, "completions/mean_terminated_length": 352.4210510253906, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.4792, "grad_norm": 2.603997230529785, "kl": 0.0496826171875, "learning_rate": 1e-06, "loss": -0.1255, "num_tokens": 7998472.0, "reward": 0.027579082176089287, "reward_std": 0.01969665102660656, "rewards/bleu_reward_func/mean": 0.027579082176089287, "rewards/bleu_reward_func/std": 0.03603406250476837, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 328.78125, "completions/mean_terminated_length": 257.08697509765625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.48, "grad_norm": 4.4895172119140625, "kl": 0.05499267578125, "learning_rate": 1e-06, "loss": 0.0563, "num_tokens": 8015513.0, "reward": 0.12187319993972778, "reward_std": 0.024902716279029846, "rewards/bleu_reward_func/mean": 0.12187319993972778, "rewards/bleu_reward_func/std": 0.04940319433808327, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 402.21875, "completions/mean_terminated_length": 305.3529357910156, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.4808, "grad_norm": 2.3858461380004883, "kl": 0.04559326171875, "learning_rate": 1e-06, "loss": -0.146, "num_tokens": 8031040.0, "reward": 0.05429249256849289, "reward_std": 0.025205722078680992, "rewards/bleu_reward_func/mean": 0.05429249256849289, "rewards/bleu_reward_func/std": 0.040760673582553864, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 402.625, "completions/mean_terminated_length": 317.5555725097656, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.4816, "grad_norm": 2.570845127105713, "kl": 0.04290771484375, "learning_rate": 1e-06, "loss": -0.1161, "num_tokens": 8047092.0, "reward": 0.03673902899026871, "reward_std": 0.02603769302368164, "rewards/bleu_reward_func/mean": 0.03673902899026871, "rewards/bleu_reward_func/std": 0.03560277447104454, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 347.21875, "completions/mean_terminated_length": 272.31817626953125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.4824, "grad_norm": 2.776517391204834, "kl": 0.038055419921875, "learning_rate": 1e-06, "loss": -0.0265, "num_tokens": 8061299.0, "reward": 0.058416951447725296, "reward_std": 0.016790183261036873, "rewards/bleu_reward_func/mean": 0.058416951447725296, "rewards/bleu_reward_func/std": 0.025730496272444725, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 223.1875, "completions/mean_terminated_length": 223.1875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4832, "grad_norm": 3.1524572372436523, "kl": 0.0537109375, "learning_rate": 1e-06, "loss": 0.0024, "num_tokens": 8070425.0, "reward": 0.05830112844705582, "reward_std": 0.019749192520976067, "rewards/bleu_reward_func/mean": 0.05830112844705582, "rewards/bleu_reward_func/std": 0.06983724236488342, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 315.5625, "completions/mean_terminated_length": 181.15789794921875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.484, "grad_norm": 2.831850051879883, "kl": 0.048583984375, "learning_rate": 1e-06, "loss": 0.1295, "num_tokens": 8084955.0, "reward": 0.08599106967449188, "reward_std": 0.029317699372768402, "rewards/bleu_reward_func/mean": 0.08599106967449188, "rewards/bleu_reward_func/std": 0.08069133013486862, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 328.96875, "completions/mean_terminated_length": 277.7200012207031, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.4848, "grad_norm": 2.704080820083618, "kl": 0.034210205078125, "learning_rate": 1e-06, "loss": -0.0324, "num_tokens": 8101450.0, "reward": 0.12661004066467285, "reward_std": 0.03845934569835663, "rewards/bleu_reward_func/mean": 0.12661004066467285, "rewards/bleu_reward_func/std": 0.16385024785995483, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 420.875, "completions/mean_terminated_length": 395.3599853515625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.4856, "grad_norm": 2.152388095855713, "kl": 0.030975341796875, "learning_rate": 1e-06, "loss": -0.0631, "num_tokens": 8119342.0, "reward": 0.05611906573176384, "reward_std": 0.01729283295571804, "rewards/bleu_reward_func/mean": 0.05611906573176384, "rewards/bleu_reward_func/std": 0.03917882218956947, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 371.25, "completions/mean_terminated_length": 338.7692565917969, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.4864, "grad_norm": 2.3123111724853516, "kl": 0.037353515625, "learning_rate": 1e-06, "loss": -0.0253, "num_tokens": 8133462.0, "reward": 0.046634070575237274, "reward_std": 0.016718275845050812, "rewards/bleu_reward_func/mean": 0.046634070575237274, "rewards/bleu_reward_func/std": 0.04330248758196831, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 313.4375, "completions/mean_terminated_length": 247.25, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.4872, "grad_norm": 3.9397592544555664, "kl": 0.04949951171875, "learning_rate": 1e-06, "loss": 0.0353, "num_tokens": 8149140.0, "reward": 0.11079287528991699, "reward_std": 0.02652319148182869, "rewards/bleu_reward_func/mean": 0.11079287528991699, "rewards/bleu_reward_func/std": 0.04020438715815544, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 275.75, "completions/mean_terminated_length": 197.0, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.488, "grad_norm": 3.4938271045684814, "kl": 0.0595703125, "learning_rate": 1e-06, "loss": -0.0852, "num_tokens": 8162356.0, "reward": 0.0434708371758461, "reward_std": 0.008759420365095139, "rewards/bleu_reward_func/mean": 0.0434708371758461, "rewards/bleu_reward_func/std": 0.01517016626894474, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 327.65625, "completions/mean_terminated_length": 327.65625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.4888, "grad_norm": 2.629668951034546, "kl": 0.05108642578125, "learning_rate": 1e-06, "loss": 0.1184, "num_tokens": 8175073.0, "reward": 0.06240731105208397, "reward_std": 0.019540153443813324, "rewards/bleu_reward_func/mean": 0.06240731105208397, "rewards/bleu_reward_func/std": 0.04322398081421852, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 369.78125, "completions/mean_terminated_length": 360.3000183105469, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.4896, "grad_norm": 2.4656405448913574, "kl": 0.04461669921875, "learning_rate": 1e-06, "loss": 0.1145, "num_tokens": 8189442.0, "reward": 0.017378607764840126, "reward_std": 0.005311779212206602, "rewards/bleu_reward_func/mean": 0.017378607764840126, "rewards/bleu_reward_func/std": 0.012686874717473984, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 286.375, "completions/mean_terminated_length": 254.1428680419922, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.4904, "grad_norm": 4.021340370178223, "kl": 0.05572509765625, "learning_rate": 1e-06, "loss": -0.0739, "num_tokens": 8200734.0, "reward": 0.0681406781077385, "reward_std": 0.02279684692621231, "rewards/bleu_reward_func/mean": 0.0681406781077385, "rewards/bleu_reward_func/std": 0.0697670578956604, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 249.03125, "completions/mean_terminated_length": 221.8275909423828, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.4912, "grad_norm": 6.891125679016113, "kl": 0.041168212890625, "learning_rate": 1e-06, "loss": 0.0114, "num_tokens": 8213271.0, "reward": 0.10955880582332611, "reward_std": 0.05131708085536957, "rewards/bleu_reward_func/mean": 0.10955880582332611, "rewards/bleu_reward_func/std": 0.11350703984498978, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 193.09375, "completions/mean_terminated_length": 193.09375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.492, "grad_norm": 5.938106536865234, "kl": 0.0616455078125, "learning_rate": 1e-06, "loss": 0.0362, "num_tokens": 8222730.0, "reward": 0.06254906952381134, "reward_std": 0.013879001140594482, "rewards/bleu_reward_func/mean": 0.06254906952381134, "rewards/bleu_reward_func/std": 0.03822917118668556, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 403.0625, "completions/mean_terminated_length": 279.6000061035156, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.4928, "grad_norm": 2.2214114665985107, "kl": 0.044952392578125, "learning_rate": 1e-06, "loss": 0.1107, "num_tokens": 8238876.0, "reward": 0.0595497228205204, "reward_std": 0.02531789056956768, "rewards/bleu_reward_func/mean": 0.0595497228205204, "rewards/bleu_reward_func/std": 0.04042090103030205, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 370.3125, "completions/mean_terminated_length": 314.86956787109375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.4936, "grad_norm": 2.4716479778289795, "kl": 0.04425048828125, "learning_rate": 1e-06, "loss": -0.1187, "num_tokens": 8252774.0, "reward": 0.040819909423589706, "reward_std": 0.01310974545776844, "rewards/bleu_reward_func/mean": 0.040819909423589706, "rewards/bleu_reward_func/std": 0.014063726179301739, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 303.21875, "completions/mean_terminated_length": 193.85714721679688, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.4944, "grad_norm": 3.001490592956543, "kl": 0.0645751953125, "learning_rate": 1e-06, "loss": -0.084, "num_tokens": 8268237.0, "reward": 0.06573346257209778, "reward_std": 0.016953492537140846, "rewards/bleu_reward_func/mean": 0.06573346257209778, "rewards/bleu_reward_func/std": 0.029022369533777237, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 290.21875, "completions/mean_terminated_length": 189.4091033935547, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.4952, "grad_norm": 3.4469478130340576, "kl": 0.056976318359375, "learning_rate": 1e-06, "loss": 0.0337, "num_tokens": 8279260.0, "reward": 0.11688727140426636, "reward_std": 0.05538788437843323, "rewards/bleu_reward_func/mean": 0.11688727140426636, "rewards/bleu_reward_func/std": 0.15909381210803986, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 253.125, "completions/mean_terminated_length": 253.125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.496, "grad_norm": 3.6850404739379883, "kl": 0.0391845703125, "learning_rate": 1e-06, "loss": -0.0111, "num_tokens": 8289320.0, "reward": 0.08325017243623734, "reward_std": 0.038202736526727676, "rewards/bleu_reward_func/mean": 0.08325017243623734, "rewards/bleu_reward_func/std": 0.09062850475311279, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 210.125, "completions/mean_terminated_length": 178.89654541015625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.4968, "grad_norm": 3.800476551055908, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": -0.0907, "num_tokens": 8297828.0, "reward": 0.03796212375164032, "reward_std": 0.013833219185471535, "rewards/bleu_reward_func/mean": 0.03796212375164032, "rewards/bleu_reward_func/std": 0.017871350049972534, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 381.5625, "completions/mean_terminated_length": 233.7333526611328, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.4976, "grad_norm": 2.6759557723999023, "kl": 0.0574951171875, "learning_rate": 1e-06, "loss": -0.0515, "num_tokens": 8315566.0, "reward": 0.03240504860877991, "reward_std": 0.015285526402294636, "rewards/bleu_reward_func/mean": 0.03240504860877991, "rewards/bleu_reward_func/std": 0.038935378193855286, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 389.0, "completions/mean_terminated_length": 266.0, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.4984, "grad_norm": 2.6809816360473633, "kl": 0.0592498779296875, "learning_rate": 1e-06, "loss": -0.1443, "num_tokens": 8332166.0, "reward": 0.09420234709978104, "reward_std": 0.040369659662246704, "rewards/bleu_reward_func/mean": 0.09420234709978104, "rewards/bleu_reward_func/std": 0.13240835070610046, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 251.125, "completions/mean_terminated_length": 202.8148193359375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.4992, "grad_norm": 3.325798273086548, "kl": 0.039554595947265625, "learning_rate": 1e-06, "loss": 0.1752, "num_tokens": 8343354.0, "reward": 0.03703116998076439, "reward_std": 0.019214333966374397, "rewards/bleu_reward_func/mean": 0.03703116998076439, "rewards/bleu_reward_func/std": 0.023005735129117966, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 323.1875, "completions/mean_terminated_length": 270.32000732421875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.5, "grad_norm": 3.1225502490997314, "kl": 0.0413818359375, "learning_rate": 1e-06, "loss": -0.04, "num_tokens": 8358024.0, "reward": 0.02892148494720459, "reward_std": 0.01050527486950159, "rewards/bleu_reward_func/mean": 0.02892148494720459, "rewards/bleu_reward_func/std": 0.016583766788244247, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 312.8125, "completions/mean_terminated_length": 246.4166717529297, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.5008, "grad_norm": 4.3559088706970215, "kl": 0.0572509765625, "learning_rate": 1e-06, "loss": -0.1897, "num_tokens": 8370506.0, "reward": 0.06611833721399307, "reward_std": 0.023439275100827217, "rewards/bleu_reward_func/mean": 0.06611833721399307, "rewards/bleu_reward_func/std": 0.04732180014252663, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 230.84375, "completions/mean_terminated_length": 230.84375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.5016, "grad_norm": 6.250845909118652, "kl": 0.0501708984375, "learning_rate": 1e-06, "loss": -0.0384, "num_tokens": 8380765.0, "reward": 0.20678502321243286, "reward_std": 0.03768392652273178, "rewards/bleu_reward_func/mean": 0.20678502321243286, "rewards/bleu_reward_func/std": 0.2695082426071167, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 416.0625, "completions/mean_terminated_length": 358.5, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.5024, "grad_norm": 2.0310866832733154, "kl": 0.037506103515625, "learning_rate": 1e-06, "loss": 0.0731, "num_tokens": 8396487.0, "reward": 0.06099681928753853, "reward_std": 0.019012173637747765, "rewards/bleu_reward_func/mean": 0.06099681928753853, "rewards/bleu_reward_func/std": 0.08027222752571106, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 283.9375, "completions/mean_terminated_length": 180.27273559570312, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.5032, "grad_norm": 3.856144666671753, "kl": 0.045654296875, "learning_rate": 1e-06, "loss": 0.0224, "num_tokens": 8407765.0, "reward": 0.05136800557374954, "reward_std": 0.014663058333098888, "rewards/bleu_reward_func/mean": 0.05136800557374954, "rewards/bleu_reward_func/std": 0.04050833731889725, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 352.34375, "completions/mean_terminated_length": 279.7727355957031, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.504, "grad_norm": 2.120601177215576, "kl": 0.030643463134765625, "learning_rate": 1e-06, "loss": -0.0581, "num_tokens": 8424784.0, "reward": 0.18435360491275787, "reward_std": 0.07129880785942078, "rewards/bleu_reward_func/mean": 0.18435360491275787, "rewards/bleu_reward_func/std": 0.28386473655700684, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 447.75, "completions/mean_terminated_length": 409.20001220703125, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.5048, "grad_norm": 2.139927625656128, "kl": 0.04376220703125, "learning_rate": 1e-06, "loss": -0.0121, "num_tokens": 8443384.0, "reward": 0.03744620829820633, "reward_std": 0.011771570891141891, "rewards/bleu_reward_func/mean": 0.03744620829820633, "rewards/bleu_reward_func/std": 0.024914991110563278, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 152.46875, "completions/mean_terminated_length": 152.46875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.5056, "grad_norm": 4.558698654174805, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": -0.0267, "num_tokens": 8451391.0, "reward": 0.06995508074760437, "reward_std": 0.03034752979874611, "rewards/bleu_reward_func/mean": 0.06995508074760437, "rewards/bleu_reward_func/std": 0.0674305334687233, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 341.34375, "completions/mean_terminated_length": 293.55999755859375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.5064, "grad_norm": 2.3992624282836914, "kl": 0.045166015625, "learning_rate": 1e-06, "loss": -0.0131, "num_tokens": 8465602.0, "reward": 0.032830677926540375, "reward_std": 0.007615429349243641, "rewards/bleu_reward_func/mean": 0.032830677926540375, "rewards/bleu_reward_func/std": 0.017384473234415054, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 265.1875, "completions/mean_terminated_length": 219.48147583007812, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.5072, "grad_norm": 3.78257155418396, "kl": 0.044921875, "learning_rate": 1e-06, "loss": -0.0649, "num_tokens": 8476480.0, "reward": 0.08779050409793854, "reward_std": 0.03017434850335121, "rewards/bleu_reward_func/mean": 0.08779050409793854, "rewards/bleu_reward_func/std": 0.09867992997169495, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 340.28125, "completions/mean_terminated_length": 283.04168701171875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.508, "grad_norm": 2.275812864303589, "kl": 0.0479736328125, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 8490385.0, "reward": 0.027935819700360298, "reward_std": 0.01714843139052391, "rewards/bleu_reward_func/mean": 0.027935819700360298, "rewards/bleu_reward_func/std": 0.02698652818799019, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 339.8125, "completions/mean_terminated_length": 236.5, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.5088, "grad_norm": 3.2245848178863525, "kl": 0.0477447509765625, "learning_rate": 1e-06, "loss": 0.1014, "num_tokens": 8505507.0, "reward": 0.12299371510744095, "reward_std": 0.08378162980079651, "rewards/bleu_reward_func/mean": 0.12299371510744095, "rewards/bleu_reward_func/std": 0.10894149541854858, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 348.375, "completions/mean_terminated_length": 318.0740661621094, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.5096, "grad_norm": 2.773343086242676, "kl": 0.036895751953125, "learning_rate": 1e-06, "loss": -0.1357, "num_tokens": 8521959.0, "reward": 0.05392155051231384, "reward_std": 0.021032003685832024, "rewards/bleu_reward_func/mean": 0.05392155051231384, "rewards/bleu_reward_func/std": 0.03237828612327576, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 373.21875, "completions/mean_terminated_length": 300.5238037109375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.5104, "grad_norm": 2.5911452770233154, "kl": 0.0361328125, "learning_rate": 1e-06, "loss": 0.0505, "num_tokens": 8535774.0, "reward": 0.07769744098186493, "reward_std": 0.01766272261738777, "rewards/bleu_reward_func/mean": 0.07769744098186493, "rewards/bleu_reward_func/std": 0.04564467817544937, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 336.5625, "completions/mean_terminated_length": 278.0833435058594, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.5112, "grad_norm": 2.6450626850128174, "kl": 0.04058837890625, "learning_rate": 1e-06, "loss": -0.112, "num_tokens": 8550680.0, "reward": 0.039511341601610184, "reward_std": 0.020058486610651016, "rewards/bleu_reward_func/mean": 0.039511341601610184, "rewards/bleu_reward_func/std": 0.03323696553707123, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 338.59375, "completions/mean_terminated_length": 219.94737243652344, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.512, "grad_norm": 3.6534550189971924, "kl": 0.0657958984375, "learning_rate": 1e-06, "loss": -0.0021, "num_tokens": 8564075.0, "reward": 0.04427193105220795, "reward_std": 0.009654231369495392, "rewards/bleu_reward_func/mean": 0.04427193105220795, "rewards/bleu_reward_func/std": 0.016233008354902267, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 387.90625, "completions/mean_terminated_length": 278.4117736816406, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.5128, "grad_norm": 2.5884809494018555, "kl": 0.0670166015625, "learning_rate": 1e-06, "loss": -0.022, "num_tokens": 8579848.0, "reward": 0.04640874266624451, "reward_std": 0.009371737949550152, "rewards/bleu_reward_func/mean": 0.04640874266624451, "rewards/bleu_reward_func/std": 0.014582473784685135, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 420.34375, "completions/mean_terminated_length": 328.6875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.5136, "grad_norm": 2.2449564933776855, "kl": 0.05255126953125, "learning_rate": 1e-06, "loss": -0.0035, "num_tokens": 8597587.0, "reward": 0.027716750279068947, "reward_std": 0.00882766768336296, "rewards/bleu_reward_func/mean": 0.027716750279068947, "rewards/bleu_reward_func/std": 0.018463805317878723, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 274.0, "completions/mean_terminated_length": 274.0, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.5144, "grad_norm": 2.397284507751465, "kl": 0.0361328125, "learning_rate": 1e-06, "loss": 0.1852, "num_tokens": 8608219.0, "reward": 0.06662235409021378, "reward_std": 0.021093640476465225, "rewards/bleu_reward_func/mean": 0.06662235409021378, "rewards/bleu_reward_func/std": 0.04868040978908539, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 421.625, "completions/mean_terminated_length": 249.09091186523438, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.5152, "grad_norm": 2.3074114322662354, "kl": 0.05316162109375, "learning_rate": 1e-06, "loss": 0.0663, "num_tokens": 8626695.0, "reward": 0.022337350994348526, "reward_std": 0.005283673293888569, "rewards/bleu_reward_func/mean": 0.022337350994348526, "rewards/bleu_reward_func/std": 0.014813877642154694, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 320.34375, "completions/mean_terminated_length": 266.67999267578125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.516, "grad_norm": 3.265336275100708, "kl": 0.041290283203125, "learning_rate": 1e-06, "loss": -0.0401, "num_tokens": 8640778.0, "reward": 0.08885028958320618, "reward_std": 0.04409442096948624, "rewards/bleu_reward_func/mean": 0.08885028958320618, "rewards/bleu_reward_func/std": 0.05580241233110428, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 402.625, "completions/mean_terminated_length": 123.11111450195312, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.5168, "grad_norm": 4.289402008056641, "kl": 0.09149169921875, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 8656158.0, "reward": 0.04943261295557022, "reward_std": 0.01372533105313778, "rewards/bleu_reward_func/mean": 0.04943261295557022, "rewards/bleu_reward_func/std": 0.022848060354590416, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 233.4375, "completions/mean_terminated_length": 233.4375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.5176, "grad_norm": 2.6133716106414795, "kl": 0.04571533203125, "learning_rate": 1e-06, "loss": -0.0194, "num_tokens": 8666388.0, "reward": 0.06988528370857239, "reward_std": 0.0369546078145504, "rewards/bleu_reward_func/mean": 0.06988528370857239, "rewards/bleu_reward_func/std": 0.07029401510953903, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 386.09375, "completions/mean_terminated_length": 288.1666564941406, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.5184, "grad_norm": 2.8481228351593018, "kl": 0.03912353515625, "learning_rate": 1e-06, "loss": -0.0154, "num_tokens": 8680743.0, "reward": 0.04563836753368378, "reward_std": 0.013341530226171017, "rewards/bleu_reward_func/mean": 0.04563836753368378, "rewards/bleu_reward_func/std": 0.028745615854859352, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 240.90625, "completions/mean_terminated_length": 150.5416717529297, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5192, "grad_norm": 8.067500114440918, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.048, "num_tokens": 8695428.0, "reward": 0.18488597869873047, "reward_std": 0.028848692774772644, "rewards/bleu_reward_func/mean": 0.18488597869873047, "rewards/bleu_reward_func/std": 0.1518058031797409, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 306.09375, "completions/mean_terminated_length": 237.45834350585938, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.52, "grad_norm": 2.86417818069458, "kl": 0.0567626953125, "learning_rate": 1e-06, "loss": -0.0099, "num_tokens": 8708903.0, "reward": 0.07543742656707764, "reward_std": 0.024170244112610817, "rewards/bleu_reward_func/mean": 0.07543742656707764, "rewards/bleu_reward_func/std": 0.058858614414930344, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 512.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 454.09375, "completions/mean_terminated_length": 326.70001220703125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.5208, "grad_norm": 2.209012985229492, "kl": 0.04949951171875, "learning_rate": 1e-06, "loss": 0.0536, "num_tokens": 8726226.0, "reward": 0.03396552428603172, "reward_std": 0.01698872074484825, "rewards/bleu_reward_func/mean": 0.03396552428603172, "rewards/bleu_reward_func/std": 0.024311579763889313, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 353.0, "completions/mean_terminated_length": 257.6000061035156, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.5216, "grad_norm": 5.167045593261719, "kl": 0.037353515625, "learning_rate": 1e-06, "loss": 0.0297, "num_tokens": 8740538.0, "reward": 0.07770118117332458, "reward_std": 0.031651660799980164, "rewards/bleu_reward_func/mean": 0.07770118117332458, "rewards/bleu_reward_func/std": 0.062497008591890335, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 418.25, "completions/mean_terminated_length": 354.1052551269531, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.5224, "grad_norm": 2.1188549995422363, "kl": 0.04437255859375, "learning_rate": 1e-06, "loss": -0.0311, "num_tokens": 8756394.0, "reward": 0.07865491509437561, "reward_std": 0.03826368600130081, "rewards/bleu_reward_func/mean": 0.07865491509437561, "rewards/bleu_reward_func/std": 0.06751979142427444, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 267.25, "completions/mean_terminated_length": 259.3548278808594, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.5232, "grad_norm": 2.7834532260894775, "kl": 0.045166015625, "learning_rate": 1e-06, "loss": -0.1165, "num_tokens": 8767866.0, "reward": 0.12012386322021484, "reward_std": 0.05065811797976494, "rewards/bleu_reward_func/mean": 0.12012386322021484, "rewards/bleu_reward_func/std": 0.08707272261381149, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 347.0625, "completions/mean_terminated_length": 282.5217590332031, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.524, "grad_norm": 3.0647759437561035, "kl": 0.04437255859375, "learning_rate": 1e-06, "loss": -0.0115, "num_tokens": 8783660.0, "reward": 0.09437389671802521, "reward_std": 0.03334784880280495, "rewards/bleu_reward_func/mean": 0.09437389671802521, "rewards/bleu_reward_func/std": 0.07559803873300552, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 415.96875, "completions/mean_terminated_length": 319.9375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.5248, "grad_norm": 2.300849199295044, "kl": 0.0504150390625, "learning_rate": 1e-06, "loss": 0.0585, "num_tokens": 8800083.0, "reward": 0.05515716224908829, "reward_std": 0.01902184821665287, "rewards/bleu_reward_func/mean": 0.05515716224908829, "rewards/bleu_reward_func/std": 0.030146554112434387, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 291.28125, "completions/mean_terminated_length": 291.28125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.5256, "grad_norm": 2.5877864360809326, "kl": 0.041748046875, "learning_rate": 1e-06, "loss": 0.0736, "num_tokens": 8811428.0, "reward": 0.11595845222473145, "reward_std": 0.06343421339988708, "rewards/bleu_reward_func/mean": 0.11595845222473145, "rewards/bleu_reward_func/std": 0.1822866052389145, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 308.84375, "completions/mean_terminated_length": 216.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.5264, "grad_norm": 4.225070953369141, "kl": 0.042755126953125, "learning_rate": 1e-06, "loss": 0.0062, "num_tokens": 8823455.0, "reward": 0.08148862421512604, "reward_std": 0.021023821085691452, "rewards/bleu_reward_func/mean": 0.08148862421512604, "rewards/bleu_reward_func/std": 0.07011328637599945, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 318.59375, "completions/mean_terminated_length": 254.125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.5272, "grad_norm": 2.809264659881592, "kl": 0.03753662109375, "learning_rate": 1e-06, "loss": 0.076, "num_tokens": 8835834.0, "reward": 0.04945838451385498, "reward_std": 0.02451205439865589, "rewards/bleu_reward_func/mean": 0.04945838451385498, "rewards/bleu_reward_func/std": 0.03474752977490425, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 323.53125, "completions/mean_terminated_length": 304.03448486328125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.528, "grad_norm": 2.119414806365967, "kl": 0.034088134765625, "learning_rate": 1e-06, "loss": 0.0428, "num_tokens": 8848443.0, "reward": 0.07028196007013321, "reward_std": 0.017643148079514503, "rewards/bleu_reward_func/mean": 0.07028196007013321, "rewards/bleu_reward_func/std": 0.05406918004155159, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 448.9375, "completions/mean_terminated_length": 328.54547119140625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.5288, "grad_norm": 2.035613536834717, "kl": 0.05206298828125, "learning_rate": 1e-06, "loss": -0.0222, "num_tokens": 8866841.0, "reward": 0.05816391110420227, "reward_std": 0.016069550067186356, "rewards/bleu_reward_func/mean": 0.05816391110420227, "rewards/bleu_reward_func/std": 0.020034752786159515, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 261.40625, "completions/mean_terminated_length": 244.70001220703125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.5296, "grad_norm": 3.209188461303711, "kl": 0.043365478515625, "learning_rate": 1e-06, "loss": -0.0163, "num_tokens": 8877422.0, "reward": 0.04772093892097473, "reward_std": 0.016260413452982903, "rewards/bleu_reward_func/mean": 0.04772093892097473, "rewards/bleu_reward_func/std": 0.024178825318813324, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 310.65625, "completions/mean_terminated_length": 289.82757568359375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.5304, "grad_norm": 2.511366367340088, "kl": 0.0345458984375, "learning_rate": 1e-06, "loss": 0.0756, "num_tokens": 8890523.0, "reward": 0.09435027837753296, "reward_std": 0.029953738674521446, "rewards/bleu_reward_func/mean": 0.09435027837753296, "rewards/bleu_reward_func/std": 0.07095350325107574, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 265.03125, "completions/mean_terminated_length": 182.70834350585938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5312, "grad_norm": 4.535026550292969, "kl": 0.050811767578125, "learning_rate": 1e-06, "loss": -0.0718, "num_tokens": 8902308.0, "reward": 0.07631168514490128, "reward_std": 0.018188592046499252, "rewards/bleu_reward_func/mean": 0.07631168514490128, "rewards/bleu_reward_func/std": 0.04229210317134857, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 223.5, "completions/mean_terminated_length": 214.19354248046875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.532, "grad_norm": 3.217503070831299, "kl": 0.04931640625, "learning_rate": 1e-06, "loss": 0.1727, "num_tokens": 8911500.0, "reward": 0.023189637809991837, "reward_std": 0.007045770063996315, "rewards/bleu_reward_func/mean": 0.023189637809991837, "rewards/bleu_reward_func/std": 0.011228468269109726, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 292.59375, "completions/mean_terminated_length": 261.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.5328, "grad_norm": 9.476320266723633, "kl": 0.213714599609375, "learning_rate": 1e-06, "loss": -0.016, "num_tokens": 8925455.0, "reward": 0.08436296880245209, "reward_std": 0.021188655868172646, "rewards/bleu_reward_func/mean": 0.08436296880245209, "rewards/bleu_reward_func/std": 0.07301143556833267, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 245.25, "completions/mean_terminated_length": 156.33334350585938, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.5336, "grad_norm": 5.552678108215332, "kl": 0.04876708984375, "learning_rate": 1e-06, "loss": 0.0796, "num_tokens": 8939039.0, "reward": 0.09512823820114136, "reward_std": 0.03876760974526405, "rewards/bleu_reward_func/mean": 0.09512823820114136, "rewards/bleu_reward_func/std": 0.13977199792861938, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 239.75, "completions/mean_terminated_length": 163.51998901367188, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.5344, "grad_norm": 2.919515371322632, "kl": 0.02215576171875, "learning_rate": 1e-06, "loss": 0.0769, "num_tokens": 8949271.0, "reward": 0.1288418173789978, "reward_std": 0.06042589992284775, "rewards/bleu_reward_func/mean": 0.1288418173789978, "rewards/bleu_reward_func/std": 0.10018094629049301, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 292.5625, "completions/mean_terminated_length": 261.21429443359375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.5352, "grad_norm": 2.719533920288086, "kl": 0.03509521484375, "learning_rate": 1e-06, "loss": -0.029, "num_tokens": 8960585.0, "reward": 0.11158549785614014, "reward_std": 0.025866547599434853, "rewards/bleu_reward_func/mean": 0.11158549785614014, "rewards/bleu_reward_func/std": 0.06140602380037308, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 331.34375, "completions/mean_terminated_length": 271.125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.536, "grad_norm": 2.3229901790618896, "kl": 0.03875732421875, "learning_rate": 1e-06, "loss": 0.1602, "num_tokens": 8975308.0, "reward": 0.07134771347045898, "reward_std": 0.013178054243326187, "rewards/bleu_reward_func/mean": 0.07134771347045898, "rewards/bleu_reward_func/std": 0.049940213561058044, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 418.25, "completions/mean_terminated_length": 354.1052551269531, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.5368, "grad_norm": 2.3134210109710693, "kl": 0.04217529296875, "learning_rate": 1e-06, "loss": -0.0344, "num_tokens": 8992764.0, "reward": 0.050522781908512115, "reward_std": 0.01361394114792347, "rewards/bleu_reward_func/mean": 0.050522781908512115, "rewards/bleu_reward_func/std": 0.020486921072006226, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 202.96875, "completions/mean_terminated_length": 202.96875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.5376, "grad_norm": 3.8227100372314453, "kl": 0.0574951171875, "learning_rate": 1e-06, "loss": -0.0995, "num_tokens": 9001603.0, "reward": 0.06377962231636047, "reward_std": 0.026187829673290253, "rewards/bleu_reward_func/mean": 0.06377962231636047, "rewards/bleu_reward_func/std": 0.03809577226638794, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 350.8125, "completions/mean_terminated_length": 189.625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.5384, "grad_norm": 4.19647216796875, "kl": 0.037567138671875, "learning_rate": 1e-06, "loss": 0.021, "num_tokens": 9017317.0, "reward": 0.06122228503227234, "reward_std": 0.029229629784822464, "rewards/bleu_reward_func/mean": 0.06122228503227234, "rewards/bleu_reward_func/std": 0.0862286314368248, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 498.0, "completions/mean_terminated_length": 400.0, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.5392, "grad_norm": 1.9404997825622559, "kl": 0.040435791015625, "learning_rate": 1e-06, "loss": -0.017, "num_tokens": 9037933.0, "reward": 0.04427298903465271, "reward_std": 0.006757338996976614, "rewards/bleu_reward_func/mean": 0.04427298903465271, "rewards/bleu_reward_func/std": 0.030673207715153694, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 406.875, "completions/mean_terminated_length": 359.0909118652344, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.54, "grad_norm": 2.2146005630493164, "kl": 0.033935546875, "learning_rate": 1e-06, "loss": -0.0065, "num_tokens": 9053473.0, "reward": 0.06295045465230942, "reward_std": 0.017118435353040695, "rewards/bleu_reward_func/mean": 0.06295045465230942, "rewards/bleu_reward_func/std": 0.053336694836616516, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 315.53125, "completions/mean_terminated_length": 226.22727966308594, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.5408, "grad_norm": 2.9350626468658447, "kl": 0.052520751953125, "learning_rate": 1e-06, "loss": 0.1251, "num_tokens": 9066026.0, "reward": 0.028412725776433945, "reward_std": 0.00589718297123909, "rewards/bleu_reward_func/mean": 0.028412725776433945, "rewards/bleu_reward_func/std": 0.00858322810381651, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 214.40625, "completions/mean_terminated_length": 214.40625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.5416, "grad_norm": 2.6705992221832275, "kl": 0.0361328125, "learning_rate": 1e-06, "loss": -0.0716, "num_tokens": 9075087.0, "reward": 0.04127844423055649, "reward_std": 0.01752633973956108, "rewards/bleu_reward_func/mean": 0.04127844423055649, "rewards/bleu_reward_func/std": 0.024528201669454575, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 424.90625, "completions/mean_terminated_length": 348.058837890625, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.5424, "grad_norm": 2.137740135192871, "kl": 0.033782958984375, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 9094100.0, "reward": 0.10619800537824631, "reward_std": 0.02650071680545807, "rewards/bleu_reward_func/mean": 0.10619800537824631, "rewards/bleu_reward_func/std": 0.11194527894258499, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 362.625, "completions/mean_terminated_length": 170.57144165039062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5432, "grad_norm": 4.145079612731934, "kl": 0.09063720703125, "learning_rate": 1e-06, "loss": -0.0814, "num_tokens": 9108312.0, "reward": 0.03859299048781395, "reward_std": 0.013083922676742077, "rewards/bleu_reward_func/mean": 0.03859299048781395, "rewards/bleu_reward_func/std": 0.027552325278520584, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 329.15625, "completions/mean_terminated_length": 323.258056640625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.544, "grad_norm": 2.8421475887298584, "kl": 0.03363037109375, "learning_rate": 1e-06, "loss": -0.0938, "num_tokens": 9122117.0, "reward": 0.07394321262836456, "reward_std": 0.027200117707252502, "rewards/bleu_reward_func/mean": 0.07394321262836456, "rewards/bleu_reward_func/std": 0.05345241352915764, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 356.375, "completions/mean_terminated_length": 295.478271484375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.5448, "grad_norm": 2.2800376415252686, "kl": 0.03594970703125, "learning_rate": 1e-06, "loss": -0.0034, "num_tokens": 9135601.0, "reward": 0.034722380340099335, "reward_std": 0.011629019863903522, "rewards/bleu_reward_func/mean": 0.034722380340099335, "rewards/bleu_reward_func/std": 0.02644946798682213, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 178.71875, "completions/mean_terminated_length": 178.71875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.5456, "grad_norm": 3.663381338119507, "kl": 0.030303955078125, "learning_rate": 1e-06, "loss": 0.0411, "num_tokens": 9142976.0, "reward": 0.08912401646375656, "reward_std": 0.03632171079516411, "rewards/bleu_reward_func/mean": 0.08912401646375656, "rewards/bleu_reward_func/std": 0.05631924048066139, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 314.21875, "completions/mean_terminated_length": 248.2916717529297, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.5464, "grad_norm": 4.255805492401123, "kl": 0.0494384765625, "learning_rate": 1e-06, "loss": -0.0512, "num_tokens": 9157071.0, "reward": 0.05893013998866081, "reward_std": 0.019025936722755432, "rewards/bleu_reward_func/mean": 0.05893013998866081, "rewards/bleu_reward_func/std": 0.05139394477009773, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 243.625, "completions/mean_terminated_length": 225.7333526611328, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.5472, "grad_norm": 3.2021656036376953, "kl": 0.0478515625, "learning_rate": 1e-06, "loss": 0.105, "num_tokens": 9166795.0, "reward": 0.026314986869692802, "reward_std": 0.007651232648640871, "rewards/bleu_reward_func/mean": 0.026314986869692802, "rewards/bleu_reward_func/std": 0.010862961411476135, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 165.25, "completions/mean_terminated_length": 165.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.548, "grad_norm": 3.6673038005828857, "kl": 0.075836181640625, "learning_rate": 1e-06, "loss": -0.1709, "num_tokens": 9174915.0, "reward": 0.0813303291797638, "reward_std": 0.025265149772167206, "rewards/bleu_reward_func/mean": 0.0813303291797638, "rewards/bleu_reward_func/std": 0.07939371466636658, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 238.96875, "completions/mean_terminated_length": 188.40740966796875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.5488, "grad_norm": 5.579142093658447, "kl": 0.0555419921875, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 9185234.0, "reward": 0.05107392370700836, "reward_std": 0.02069205418229103, "rewards/bleu_reward_func/mean": 0.05107392370700836, "rewards/bleu_reward_func/std": 0.047578368335962296, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 512.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 406.1875, "completions/mean_terminated_length": 251.53846740722656, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.5496, "grad_norm": 2.2841057777404785, "kl": 0.0462646484375, "learning_rate": 1e-06, "loss": -0.0739, "num_tokens": 9201024.0, "reward": 0.03643956780433655, "reward_std": 0.008713691495358944, "rewards/bleu_reward_func/mean": 0.03643956780433655, "rewards/bleu_reward_func/std": 0.024110866710543633, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 270.4375, "completions/mean_terminated_length": 235.9285888671875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.5504, "grad_norm": 3.174736738204956, "kl": 0.043487548828125, "learning_rate": 1e-06, "loss": -0.1688, "num_tokens": 9212678.0, "reward": 0.08469453454017639, "reward_std": 0.03412746265530586, "rewards/bleu_reward_func/mean": 0.08469453454017639, "rewards/bleu_reward_func/std": 0.057441964745521545, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 311.0, "completions/mean_terminated_length": 244.0, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.5512, "grad_norm": 2.86037540435791, "kl": 0.03607177734375, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 9226670.0, "reward": 0.10889802128076553, "reward_std": 0.0333615243434906, "rewards/bleu_reward_func/mean": 0.10889802128076553, "rewards/bleu_reward_func/std": 0.11031165719032288, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 426.3125, "completions/mean_terminated_length": 340.625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.552, "grad_norm": 2.6019811630249023, "kl": 0.0339202880859375, "learning_rate": 1e-06, "loss": -0.0429, "num_tokens": 9244608.0, "reward": 0.05243753641843796, "reward_std": 0.016431640833616257, "rewards/bleu_reward_func/mean": 0.05243753641843796, "rewards/bleu_reward_func/std": 0.033674199134111404, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 274.71875, "completions/mean_terminated_length": 258.9000244140625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.5528, "grad_norm": 2.8404016494750977, "kl": 0.0394287109375, "learning_rate": 1e-06, "loss": -0.0131, "num_tokens": 9256231.0, "reward": 0.03480883315205574, "reward_std": 0.022311776876449585, "rewards/bleu_reward_func/mean": 0.03480883315205574, "rewards/bleu_reward_func/std": 0.04648389294743538, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 464.40625, "completions/mean_terminated_length": 410.4666748046875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.5536, "grad_norm": 1.8518298864364624, "kl": 0.03631591796875, "learning_rate": 1e-06, "loss": 0.0071, "num_tokens": 9274868.0, "reward": 0.02014119178056717, "reward_std": 0.006333409808576107, "rewards/bleu_reward_func/mean": 0.02014119178056717, "rewards/bleu_reward_func/std": 0.016668220981955528, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 214.875, "completions/mean_terminated_length": 214.875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.5544, "grad_norm": 8.611943244934082, "kl": 0.066192626953125, "learning_rate": 1e-06, "loss": 0.0955, "num_tokens": 9285320.0, "reward": 0.051611416041851044, "reward_std": 0.010077946819365025, "rewards/bleu_reward_func/mean": 0.051611416041851044, "rewards/bleu_reward_func/std": 0.023184411227703094, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 324.46875, "completions/mean_terminated_length": 261.9583435058594, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.5552, "grad_norm": 14.116987228393555, "kl": 0.0401611328125, "learning_rate": 1e-06, "loss": 0.0856, "num_tokens": 9300647.0, "reward": 0.11092260479927063, "reward_std": 0.034637127071619034, "rewards/bleu_reward_func/mean": 0.11092260479927063, "rewards/bleu_reward_func/std": 0.056095682084560394, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 304.4375, "completions/mean_terminated_length": 282.96551513671875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.556, "grad_norm": 2.5600430965423584, "kl": 0.04901123046875, "learning_rate": 1e-06, "loss": -0.0664, "num_tokens": 9312669.0, "reward": 0.027583984658122063, "reward_std": 0.00833278801292181, "rewards/bleu_reward_func/mean": 0.027583984658122063, "rewards/bleu_reward_func/std": 0.011630790308117867, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 339.03125, "completions/mean_terminated_length": 235.25, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.5568, "grad_norm": 3.3769948482513428, "kl": 0.05389404296875, "learning_rate": 1e-06, "loss": -0.0098, "num_tokens": 9327814.0, "reward": 0.028040939942002296, "reward_std": 0.010371413081884384, "rewards/bleu_reward_func/mean": 0.028040939942002296, "rewards/bleu_reward_func/std": 0.01842903159558773, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 324.40625, "completions/mean_terminated_length": 261.875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.5576, "grad_norm": 2.9584405422210693, "kl": 0.051727294921875, "learning_rate": 1e-06, "loss": -0.027, "num_tokens": 9341091.0, "reward": 0.0529029443860054, "reward_std": 0.015040645375847816, "rewards/bleu_reward_func/mean": 0.0529029443860054, "rewards/bleu_reward_func/std": 0.03654506802558899, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 378.75, "completions/mean_terminated_length": 334.3333435058594, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.5584, "grad_norm": 2.1786551475524902, "kl": 0.037689208984375, "learning_rate": 1e-06, "loss": -0.032, "num_tokens": 9356283.0, "reward": 0.03368465229868889, "reward_std": 0.009372582659125328, "rewards/bleu_reward_func/mean": 0.03368465229868889, "rewards/bleu_reward_func/std": 0.026379503309726715, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 238.71875, "completions/mean_terminated_length": 131.78260803222656, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.5592, "grad_norm": 4.376157760620117, "kl": 0.07623291015625, "learning_rate": 1e-06, "loss": 0.028, "num_tokens": 9367506.0, "reward": 0.05952916666865349, "reward_std": 0.023435667157173157, "rewards/bleu_reward_func/mean": 0.05952916666865349, "rewards/bleu_reward_func/std": 0.04434419795870781, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 274.0, "completions/mean_terminated_length": 194.6666717529297, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.56, "grad_norm": 2.882861852645874, "kl": 0.053955078125, "learning_rate": 1e-06, "loss": 0.0267, "num_tokens": 9379354.0, "reward": 0.04700922966003418, "reward_std": 0.017730262130498886, "rewards/bleu_reward_func/mean": 0.04700922966003418, "rewards/bleu_reward_func/std": 0.04125557094812393, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 155.53125, "completions/mean_terminated_length": 155.53125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.5608, "grad_norm": 3.9251868724823, "kl": 0.0543212890625, "learning_rate": 1e-06, "loss": -0.0544, "num_tokens": 9386403.0, "reward": 0.04936821013689041, "reward_std": 0.03505264222621918, "rewards/bleu_reward_func/mean": 0.04936821013689041, "rewards/bleu_reward_func/std": 0.03844968229532242, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 399.3125, "completions/mean_terminated_length": 348.0909118652344, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.5616, "grad_norm": 2.3044822216033936, "kl": 0.046051025390625, "learning_rate": 1e-06, "loss": 0.0413, "num_tokens": 9404461.0, "reward": 0.05913422256708145, "reward_std": 0.014526767656207085, "rewards/bleu_reward_func/mean": 0.05913422256708145, "rewards/bleu_reward_func/std": 0.02571999840438366, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 227.96875, "completions/mean_terminated_length": 133.2916717529297, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.5624, "grad_norm": 3.9835569858551025, "kl": 0.08758544921875, "learning_rate": 1e-06, "loss": -0.0113, "num_tokens": 9414196.0, "reward": 0.04630535468459129, "reward_std": 0.02081681415438652, "rewards/bleu_reward_func/mean": 0.04630535468459129, "rewards/bleu_reward_func/std": 0.029882358387112617, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 246.25, "completions/mean_terminated_length": 237.6774139404297, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.5632, "grad_norm": 3.185335874557495, "kl": 0.029571533203125, "learning_rate": 1e-06, "loss": 0.0604, "num_tokens": 9424820.0, "reward": 0.08910296112298965, "reward_std": 0.030036643147468567, "rewards/bleu_reward_func/mean": 0.08910296112298965, "rewards/bleu_reward_func/std": 0.04086713492870331, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 161.34375, "completions/mean_terminated_length": 161.34375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.564, "grad_norm": 3.8254852294921875, "kl": 0.072540283203125, "learning_rate": 1e-06, "loss": -0.1746, "num_tokens": 9432663.0, "reward": 0.027168624103069305, "reward_std": 0.008981114253401756, "rewards/bleu_reward_func/mean": 0.027168624103069305, "rewards/bleu_reward_func/std": 0.02017930895090103, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 292.75, "completions/mean_terminated_length": 242.1538543701172, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.5648, "grad_norm": 2.8090875148773193, "kl": 0.036773681640625, "learning_rate": 1e-06, "loss": 0.0482, "num_tokens": 9444167.0, "reward": 0.06258943676948547, "reward_std": 0.02587122470140457, "rewards/bleu_reward_func/mean": 0.06258943676948547, "rewards/bleu_reward_func/std": 0.07109640538692474, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 341.5, "completions/mean_terminated_length": 293.7599792480469, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.5656, "grad_norm": 2.612130880355835, "kl": 0.06427001953125, "learning_rate": 1e-06, "loss": -0.0579, "num_tokens": 9457983.0, "reward": 0.06597445905208588, "reward_std": 0.028306953608989716, "rewards/bleu_reward_func/mean": 0.06597445905208588, "rewards/bleu_reward_func/std": 0.05735038220882416, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 325.1875, "completions/mean_terminated_length": 252.0869598388672, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.5664, "grad_norm": 2.9376416206359863, "kl": 0.040283203125, "learning_rate": 1e-06, "loss": 0.0036, "num_tokens": 9470925.0, "reward": 0.08510507643222809, "reward_std": 0.024027425795793533, "rewards/bleu_reward_func/mean": 0.08510507643222809, "rewards/bleu_reward_func/std": 0.06637418270111084, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 328.4375, "completions/mean_terminated_length": 302.21429443359375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.5672, "grad_norm": 2.756030321121216, "kl": 0.034027099609375, "learning_rate": 1e-06, "loss": -0.0527, "num_tokens": 9483395.0, "reward": 0.05016753822565079, "reward_std": 0.020536717027425766, "rewards/bleu_reward_func/mean": 0.05016753822565079, "rewards/bleu_reward_func/std": 0.04033496230840683, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 307.1875, "completions/mean_terminated_length": 184.3000030517578, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.568, "grad_norm": 2.9602859020233154, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.0387, "num_tokens": 9495689.0, "reward": 0.03180449828505516, "reward_std": 0.02256343513727188, "rewards/bleu_reward_func/mean": 0.03180449828505516, "rewards/bleu_reward_func/std": 0.02739291824400425, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 282.15625, "completions/mean_terminated_length": 229.11538696289062, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.5688, "grad_norm": 2.9386565685272217, "kl": 0.0604248046875, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 9509142.0, "reward": 0.026789426803588867, "reward_std": 0.007120601832866669, "rewards/bleu_reward_func/mean": 0.026789426803588867, "rewards/bleu_reward_func/std": 0.01533250231295824, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 445.1875, "completions/mean_terminated_length": 410.19049072265625, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.5696, "grad_norm": 2.129762887954712, "kl": 0.0325927734375, "learning_rate": 1e-06, "loss": -0.0566, "num_tokens": 9525972.0, "reward": 0.0978296622633934, "reward_std": 0.022149382159113884, "rewards/bleu_reward_func/mean": 0.0978296622633934, "rewards/bleu_reward_func/std": 0.08168322592973709, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 366.1875, "completions/mean_terminated_length": 317.5833435058594, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.5704, "grad_norm": 2.2154335975646973, "kl": 0.04345703125, "learning_rate": 1e-06, "loss": 0.0888, "num_tokens": 9541162.0, "reward": 0.03301853686571121, "reward_std": 0.018747247755527496, "rewards/bleu_reward_func/mean": 0.03301853686571121, "rewards/bleu_reward_func/std": 0.03410644084215164, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 332.90625, "completions/mean_terminated_length": 225.4499969482422, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.5712, "grad_norm": 3.655517101287842, "kl": 0.034454345703125, "learning_rate": 1e-06, "loss": -0.0415, "num_tokens": 9554111.0, "reward": 0.06881336867809296, "reward_std": 0.02799813821911812, "rewards/bleu_reward_func/mean": 0.06881336867809296, "rewards/bleu_reward_func/std": 0.046132639050483704, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 323.53125, "completions/mean_terminated_length": 194.57894897460938, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.572, "grad_norm": 2.6926376819610596, "kl": 0.0515899658203125, "learning_rate": 1e-06, "loss": 0.0721, "num_tokens": 9569472.0, "reward": 0.03615172579884529, "reward_std": 0.010936561971902847, "rewards/bleu_reward_func/mean": 0.03615172579884529, "rewards/bleu_reward_func/std": 0.02190208248794079, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 369.78125, "completions/mean_terminated_length": 259.1666564941406, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.5728, "grad_norm": 2.659458875656128, "kl": 0.05145263671875, "learning_rate": 1e-06, "loss": 0.0228, "num_tokens": 9586745.0, "reward": 0.04501129686832428, "reward_std": 0.022404177114367485, "rewards/bleu_reward_func/mean": 0.04501129686832428, "rewards/bleu_reward_func/std": 0.03303966298699379, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 301.5, "completions/mean_terminated_length": 262.5185241699219, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.5736, "grad_norm": 2.87153697013855, "kl": 0.045135498046875, "learning_rate": 1e-06, "loss": -0.0035, "num_tokens": 9598977.0, "reward": 0.08759818971157074, "reward_std": 0.016007091850042343, "rewards/bleu_reward_func/mean": 0.08759818971157074, "rewards/bleu_reward_func/std": 0.058541323989629745, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 376.84375, "completions/mean_terminated_length": 331.79168701171875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.5744, "grad_norm": 2.2236804962158203, "kl": 0.04248046875, "learning_rate": 1e-06, "loss": -0.0317, "num_tokens": 9613732.0, "reward": 0.027184750884771347, "reward_std": 0.008963186293840408, "rewards/bleu_reward_func/mean": 0.027184750884771347, "rewards/bleu_reward_func/std": 0.012044839560985565, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 512.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 404.875, "completions/mean_terminated_length": 131.11111450195312, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.5752, "grad_norm": 3.107280731201172, "kl": 0.05364990234375, "learning_rate": 1e-06, "loss": -0.1847, "num_tokens": 9632200.0, "reward": 0.022721879184246063, "reward_std": 0.015824276953935623, "rewards/bleu_reward_func/mean": 0.022721879184246063, "rewards/bleu_reward_func/std": 0.026122624054551125, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 302.71875, "completions/mean_terminated_length": 288.7666931152344, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.576, "grad_norm": 2.6198558807373047, "kl": 0.03948974609375, "learning_rate": 1e-06, "loss": 0.1811, "num_tokens": 9644919.0, "reward": 0.028920229524374008, "reward_std": 0.009729383513331413, "rewards/bleu_reward_func/mean": 0.028920229524374008, "rewards/bleu_reward_func/std": 0.012061752378940582, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 334.40625, "completions/mean_terminated_length": 275.2083435058594, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.5768, "grad_norm": 2.3346035480499268, "kl": 0.03521728515625, "learning_rate": 1e-06, "loss": -0.0052, "num_tokens": 9657428.0, "reward": 0.033784326165914536, "reward_std": 0.012217823415994644, "rewards/bleu_reward_func/mean": 0.033784326165914536, "rewards/bleu_reward_func/std": 0.017847878858447075, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 392.6875, "completions/mean_terminated_length": 352.91668701171875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.5776, "grad_norm": 2.2907447814941406, "kl": 0.0361328125, "learning_rate": 1e-06, "loss": -0.0447, "num_tokens": 9673434.0, "reward": 0.05538846179842949, "reward_std": 0.027156081050634384, "rewards/bleu_reward_func/mean": 0.05538846179842949, "rewards/bleu_reward_func/std": 0.0433816984295845, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 323.65625, "completions/mean_terminated_length": 304.17242431640625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.5784, "grad_norm": 2.55709171295166, "kl": 0.039825439453125, "learning_rate": 1e-06, "loss": -0.0556, "num_tokens": 9685775.0, "reward": 0.08060881495475769, "reward_std": 0.02005528286099434, "rewards/bleu_reward_func/mean": 0.08060881495475769, "rewards/bleu_reward_func/std": 0.07852939516305923, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 381.1875, "completions/mean_terminated_length": 351.0, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.5792, "grad_norm": 2.155306816101074, "kl": 0.03192138671875, "learning_rate": 1e-06, "loss": 0.0164, "num_tokens": 9700613.0, "reward": 0.1121407151222229, "reward_std": 0.025965237990021706, "rewards/bleu_reward_func/mean": 0.1121407151222229, "rewards/bleu_reward_func/std": 0.14134716987609863, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 324.78125, "completions/mean_terminated_length": 262.375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.58, "grad_norm": 2.3479228019714355, "kl": 0.04144287109375, "learning_rate": 1e-06, "loss": 0.0594, "num_tokens": 9713542.0, "reward": 0.025968845933675766, "reward_std": 0.008134625852108002, "rewards/bleu_reward_func/mean": 0.025968845933675766, "rewards/bleu_reward_func/std": 0.01379316858947277, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 347.03125, "completions/mean_terminated_length": 234.15789794921875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.5808, "grad_norm": 2.790693759918213, "kl": 0.0469970703125, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 9727183.0, "reward": 0.07316627353429794, "reward_std": 0.02672586776316166, "rewards/bleu_reward_func/mean": 0.07316627353429794, "rewards/bleu_reward_func/std": 0.04870554059743881, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 512.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 413.65625, "completions/mean_terminated_length": 269.923095703125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.5816, "grad_norm": 2.371492624282837, "kl": 0.038482666015625, "learning_rate": 1e-06, "loss": 0.0874, "num_tokens": 9743332.0, "reward": 0.028866440057754517, "reward_std": 0.009991827420890331, "rewards/bleu_reward_func/mean": 0.028866440057754517, "rewards/bleu_reward_func/std": 0.017279163002967834, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 512.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 389.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.5824, "grad_norm": 1.6963756084442139, "kl": 0.03743934631347656, "learning_rate": 1e-06, "loss": 0.0015, "num_tokens": 9760908.0, "reward": 0.26194876432418823, "reward_std": 0.0028098882175982, "rewards/bleu_reward_func/mean": 0.26194876432418823, "rewards/bleu_reward_func/std": 0.43297266960144043, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 372.53125, "completions/mean_terminated_length": 317.9565124511719, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.5832, "grad_norm": 2.238243579864502, "kl": 0.041015625, "learning_rate": 1e-06, "loss": -0.0603, "num_tokens": 9775317.0, "reward": 0.05312762036919594, "reward_std": 0.021938100457191467, "rewards/bleu_reward_func/mean": 0.05312762036919594, "rewards/bleu_reward_func/std": 0.026401638984680176, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 273.78125, "completions/mean_terminated_length": 249.13792419433594, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.584, "grad_norm": 2.6824791431427, "kl": 0.040802001953125, "learning_rate": 1e-06, "loss": 0.1689, "num_tokens": 9785942.0, "reward": 0.043269768357276917, "reward_std": 0.01920248009264469, "rewards/bleu_reward_func/mean": 0.043269768357276917, "rewards/bleu_reward_func/std": 0.036198996007442474, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 391.96875, "completions/mean_terminated_length": 369.7407531738281, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.5848, "grad_norm": 2.225390672683716, "kl": 0.030548095703125, "learning_rate": 1e-06, "loss": -0.0489, "num_tokens": 9801301.0, "reward": 0.043691135942935944, "reward_std": 0.016106728464365005, "rewards/bleu_reward_func/mean": 0.043691135942935944, "rewards/bleu_reward_func/std": 0.03660671412944794, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 316.09375, "completions/mean_terminated_length": 239.43478393554688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5856, "grad_norm": 3.061321258544922, "kl": 0.066162109375, "learning_rate": 1e-06, "loss": 0.0924, "num_tokens": 9815992.0, "reward": 0.04043514281511307, "reward_std": 0.02053326927125454, "rewards/bleu_reward_func/mean": 0.04043514281511307, "rewards/bleu_reward_func/std": 0.04115479812026024, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 383.8125, "completions/mean_terminated_length": 375.2666931152344, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.5864, "grad_norm": 1.9717614650726318, "kl": 0.0321044921875, "learning_rate": 1e-06, "loss": 0.0407, "num_tokens": 9830522.0, "reward": 0.026746738702058792, "reward_std": 0.006414106115698814, "rewards/bleu_reward_func/mean": 0.026746738702058792, "rewards/bleu_reward_func/std": 0.0077305627055466175, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 331.0, "completions/mean_terminated_length": 248.72727966308594, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.5872, "grad_norm": 2.053102970123291, "kl": 0.0360870361328125, "learning_rate": 1e-06, "loss": 0.0197, "num_tokens": 9843674.0, "reward": 0.05009941756725311, "reward_std": 0.027085162699222565, "rewards/bleu_reward_func/mean": 0.05009941756725311, "rewards/bleu_reward_func/std": 0.03603508323431015, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 378.34375, "completions/mean_terminated_length": 333.79168701171875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.588, "grad_norm": 2.1135923862457275, "kl": 0.0374755859375, "learning_rate": 1e-06, "loss": 0.042, "num_tokens": 9858789.0, "reward": 0.025262653827667236, "reward_std": 0.009363568387925625, "rewards/bleu_reward_func/mean": 0.025262653827667236, "rewards/bleu_reward_func/std": 0.015334555879235268, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 290.09375, "completions/mean_terminated_length": 249.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.5888, "grad_norm": 3.513441562652588, "kl": 0.0484619140625, "learning_rate": 1e-06, "loss": 0.2233, "num_tokens": 9870368.0, "reward": 0.031309109181165695, "reward_std": 0.012565305456519127, "rewards/bleu_reward_func/mean": 0.031309109181165695, "rewards/bleu_reward_func/std": 0.019024599343538284, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 203.96875, "completions/mean_terminated_length": 203.96875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.5896, "grad_norm": 3.6366395950317383, "kl": 0.043121337890625, "learning_rate": 1e-06, "loss": -0.1022, "num_tokens": 9878959.0, "reward": 0.10009264945983887, "reward_std": 0.04424141347408295, "rewards/bleu_reward_func/mean": 0.10009264945983887, "rewards/bleu_reward_func/std": 0.07444142550230026, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 298.0625, "completions/mean_terminated_length": 200.8181915283203, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.5904, "grad_norm": 3.4232091903686523, "kl": 0.05059814453125, "learning_rate": 1e-06, "loss": -0.007, "num_tokens": 9893561.0, "reward": 0.039665337651968, "reward_std": 0.0123225636780262, "rewards/bleu_reward_func/mean": 0.039665337651968, "rewards/bleu_reward_func/std": 0.024130748584866524, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 336.125, "completions/mean_terminated_length": 244.0, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.5912, "grad_norm": 2.8178460597991943, "kl": 0.047271728515625, "learning_rate": 1e-06, "loss": -0.1451, "num_tokens": 9909901.0, "reward": 0.04546702653169632, "reward_std": 0.021167172119021416, "rewards/bleu_reward_func/mean": 0.04546702653169632, "rewards/bleu_reward_func/std": 0.04271375387907028, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 292.9375, "completions/mean_terminated_length": 285.8709716796875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.592, "grad_norm": 2.625126600265503, "kl": 0.0390625, "learning_rate": 1e-06, "loss": -0.2049, "num_tokens": 9922867.0, "reward": 0.09222639352083206, "reward_std": 0.03261449933052063, "rewards/bleu_reward_func/mean": 0.09222639352083206, "rewards/bleu_reward_func/std": 0.09164327383041382, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 249.96875, "completions/mean_terminated_length": 222.862060546875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.5928, "grad_norm": 3.74029803276062, "kl": 0.059112548828125, "learning_rate": 1e-06, "loss": -0.1066, "num_tokens": 9935442.0, "reward": 0.252871572971344, "reward_std": 0.0454796738922596, "rewards/bleu_reward_func/mean": 0.252871572971344, "rewards/bleu_reward_func/std": 0.29931873083114624, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 442.03125, "completions/mean_terminated_length": 352.0714416503906, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.5936, "grad_norm": 1.9116039276123047, "kl": 0.04791259765625, "learning_rate": 1e-06, "loss": -0.012, "num_tokens": 9953635.0, "reward": 0.03428558260202408, "reward_std": 0.0095596294850111, "rewards/bleu_reward_func/mean": 0.03428558260202408, "rewards/bleu_reward_func/std": 0.03276081383228302, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 285.71875, "completions/mean_terminated_length": 243.8148193359375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.5944, "grad_norm": 3.1582913398742676, "kl": 0.052490234375, "learning_rate": 1e-06, "loss": 0.0456, "num_tokens": 9965906.0, "reward": 0.048588261008262634, "reward_std": 0.017030086368322372, "rewards/bleu_reward_func/mean": 0.048588261008262634, "rewards/bleu_reward_func/std": 0.027793744578957558, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 349.625, "completions/mean_terminated_length": 264.5714416503906, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.5952, "grad_norm": 2.2140707969665527, "kl": 0.041290283203125, "learning_rate": 1e-06, "loss": 0.0686, "num_tokens": 9982022.0, "reward": 0.045479342341423035, "reward_std": 0.021971603855490685, "rewards/bleu_reward_func/mean": 0.045479342341423035, "rewards/bleu_reward_func/std": 0.037209052592515945, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 265.53125, "completions/mean_terminated_length": 257.58062744140625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.596, "grad_norm": 5.462935447692871, "kl": 0.0552978515625, "learning_rate": 1e-06, "loss": -0.0597, "num_tokens": 9994487.0, "reward": 0.03378114849328995, "reward_std": 0.012287897989153862, "rewards/bleu_reward_func/mean": 0.03378114849328995, "rewards/bleu_reward_func/std": 0.01468950230628252, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 396.78125, "completions/mean_terminated_length": 307.1666564941406, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.5968, "grad_norm": 2.1518917083740234, "kl": 0.041534423828125, "learning_rate": 1e-06, "loss": 0.0918, "num_tokens": 10009392.0, "reward": 0.025878749787807465, "reward_std": 0.015285806730389595, "rewards/bleu_reward_func/mean": 0.025878749787807465, "rewards/bleu_reward_func/std": 0.031384509056806564, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 257.59375, "completions/mean_terminated_length": 240.6333465576172, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.5976, "grad_norm": 3.0447263717651367, "kl": 0.0303955078125, "learning_rate": 1e-06, "loss": -0.1925, "num_tokens": 10021539.0, "reward": 0.10353910177946091, "reward_std": 0.06533479690551758, "rewards/bleu_reward_func/mean": 0.10353910177946091, "rewards/bleu_reward_func/std": 0.12369221448898315, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 254.5, "completions/mean_terminated_length": 237.33334350585938, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.5984, "grad_norm": 2.9500083923339844, "kl": 0.03375244140625, "learning_rate": 1e-06, "loss": 0.0413, "num_tokens": 10032043.0, "reward": 0.09129080176353455, "reward_std": 0.04000641033053398, "rewards/bleu_reward_func/mean": 0.09129080176353455, "rewards/bleu_reward_func/std": 0.06342270225286484, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 180.875, "completions/mean_terminated_length": 180.875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.5992, "grad_norm": 3.592421293258667, "kl": 0.0577392578125, "learning_rate": 1e-06, "loss": -0.0851, "num_tokens": 10039823.0, "reward": 0.038300834596157074, "reward_std": 0.012440194375813007, "rewards/bleu_reward_func/mean": 0.038300834596157074, "rewards/bleu_reward_func/std": 0.021254317834973335, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 280.125, "completions/mean_terminated_length": 226.61538696289062, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.6, "grad_norm": 2.9240732192993164, "kl": 0.046600341796875, "learning_rate": 1e-06, "loss": 0.0647, "num_tokens": 10051187.0, "reward": 0.05779760330915451, "reward_std": 0.014689221978187561, "rewards/bleu_reward_func/mean": 0.05779760330915451, "rewards/bleu_reward_func/std": 0.04038412868976593, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 299.40625, "completions/mean_terminated_length": 250.34616088867188, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.6008, "grad_norm": 2.695838689804077, "kl": 0.03765869140625, "learning_rate": 1e-06, "loss": 0.0816, "num_tokens": 10064256.0, "reward": 0.056154537945985794, "reward_std": 0.04283731430768967, "rewards/bleu_reward_func/mean": 0.056154537945985794, "rewards/bleu_reward_func/std": 0.06427828222513199, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 235.4375, "completions/mean_terminated_length": 195.92857360839844, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.6016, "grad_norm": 3.5030364990234375, "kl": 0.054962158203125, "learning_rate": 1e-06, "loss": 0.1026, "num_tokens": 10074310.0, "reward": 0.0592612624168396, "reward_std": 0.038583509624004364, "rewards/bleu_reward_func/mean": 0.0592612624168396, "rewards/bleu_reward_func/std": 0.06044392287731171, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 177.59375, "completions/mean_terminated_length": 177.59375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.6024, "grad_norm": 4.249257564544678, "kl": 0.03570556640625, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 10082169.0, "reward": 0.06106291711330414, "reward_std": 0.03034021332859993, "rewards/bleu_reward_func/mean": 0.06106291711330414, "rewards/bleu_reward_func/std": 0.055352307856082916, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 305.40625, "completions/mean_terminated_length": 144.72222900390625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.6032, "grad_norm": 3.6615188121795654, "kl": 0.04107666015625, "learning_rate": 1e-06, "loss": -0.0413, "num_tokens": 10096142.0, "reward": 0.07839120924472809, "reward_std": 0.027994198724627495, "rewards/bleu_reward_func/mean": 0.07839120924472809, "rewards/bleu_reward_func/std": 0.08647292107343674, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 247.5625, "completions/mean_terminated_length": 173.51998901367188, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.604, "grad_norm": 3.6631929874420166, "kl": 0.05218505859375, "learning_rate": 1e-06, "loss": -0.1032, "num_tokens": 10106736.0, "reward": 0.03967122733592987, "reward_std": 0.01704089716076851, "rewards/bleu_reward_func/mean": 0.03967122733592987, "rewards/bleu_reward_func/std": 0.024564094841480255, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 382.75, "completions/mean_terminated_length": 332.1739196777344, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.6048, "grad_norm": 2.315969944000244, "kl": 0.0330810546875, "learning_rate": 1e-06, "loss": 0.0998, "num_tokens": 10123032.0, "reward": 0.039958953857421875, "reward_std": 0.010448317043483257, "rewards/bleu_reward_func/mean": 0.039958953857421875, "rewards/bleu_reward_func/std": 0.03418637439608574, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 299.09375, "completions/mean_terminated_length": 215.78260803222656, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.6056, "grad_norm": 2.656048059463501, "kl": 0.02557373046875, "learning_rate": 1e-06, "loss": -0.0165, "num_tokens": 10136523.0, "reward": 0.16351552307605743, "reward_std": 0.1012059897184372, "rewards/bleu_reward_func/mean": 0.16351552307605743, "rewards/bleu_reward_func/std": 0.3160014748573303, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 184.5, "completions/mean_terminated_length": 162.6666717529297, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.6064, "grad_norm": 6.093653202056885, "kl": 0.0682373046875, "learning_rate": 1e-06, "loss": 0.0402, "num_tokens": 10144755.0, "reward": 0.04594259709119797, "reward_std": 0.014256558381021023, "rewards/bleu_reward_func/mean": 0.04594259709119797, "rewards/bleu_reward_func/std": 0.021436382085084915, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 388.03125, "completions/mean_terminated_length": 303.2105407714844, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.6072, "grad_norm": 2.354794979095459, "kl": 0.0377197265625, "learning_rate": 1e-06, "loss": -0.0584, "num_tokens": 10162964.0, "reward": 0.08822646737098694, "reward_std": 0.03655345365405083, "rewards/bleu_reward_func/mean": 0.08822646737098694, "rewards/bleu_reward_func/std": 0.04937893897294998, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 265.1875, "completions/mean_terminated_length": 219.48147583007812, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.608, "grad_norm": 3.1614513397216797, "kl": 0.0482177734375, "learning_rate": 1e-06, "loss": -0.1552, "num_tokens": 10174554.0, "reward": 0.09878893941640854, "reward_std": 0.045050833374261856, "rewards/bleu_reward_func/mean": 0.09878893941640854, "rewards/bleu_reward_func/std": 0.09232120960950851, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 347.84375, "completions/mean_terminated_length": 301.8800048828125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.6088, "grad_norm": 2.764409065246582, "kl": 0.0374755859375, "learning_rate": 1e-06, "loss": 0.0588, "num_tokens": 10188061.0, "reward": 0.0298735611140728, "reward_std": 0.012367211282253265, "rewards/bleu_reward_func/mean": 0.0298735611140728, "rewards/bleu_reward_func/std": 0.019031813368201256, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 300.15625, "completions/mean_terminated_length": 203.8636474609375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.6096, "grad_norm": 4.376463413238525, "kl": 0.0523681640625, "learning_rate": 1e-06, "loss": -0.0498, "num_tokens": 10200210.0, "reward": 0.04680792987346649, "reward_std": 0.012458568438887596, "rewards/bleu_reward_func/mean": 0.04680792987346649, "rewards/bleu_reward_func/std": 0.015480151399970055, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 133.9375, "completions/mean_terminated_length": 133.9375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.6104, "grad_norm": 5.940901756286621, "kl": 0.05828857421875, "learning_rate": 1e-06, "loss": 0.0172, "num_tokens": 10207136.0, "reward": 0.04009559005498886, "reward_std": 0.016614696010947227, "rewards/bleu_reward_func/mean": 0.04009559005498886, "rewards/bleu_reward_func/std": 0.03101743757724762, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 326.84375, "completions/mean_terminated_length": 275.0, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.6112, "grad_norm": 2.4283318519592285, "kl": 0.04022216796875, "learning_rate": 1e-06, "loss": -0.0041, "num_tokens": 10219763.0, "reward": 0.034885473549366, "reward_std": 0.011106548830866814, "rewards/bleu_reward_func/mean": 0.034885473549366, "rewards/bleu_reward_func/std": 0.01220767293125391, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 242.0, "completions/mean_terminated_length": 152.0, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.612, "grad_norm": 4.512617111206055, "kl": 0.057891845703125, "learning_rate": 1e-06, "loss": -0.0146, "num_tokens": 10233067.0, "reward": 0.04029272869229317, "reward_std": 0.010187342762947083, "rewards/bleu_reward_func/mean": 0.04029272869229317, "rewards/bleu_reward_func/std": 0.01465767901390791, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 380.4375, "completions/mean_terminated_length": 328.95654296875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.6128, "grad_norm": 2.1606087684631348, "kl": 0.029327392578125, "learning_rate": 1e-06, "loss": 0.1129, "num_tokens": 10250633.0, "reward": 0.04106716811656952, "reward_std": 0.020106343552470207, "rewards/bleu_reward_func/mean": 0.04106716811656952, "rewards/bleu_reward_func/std": 0.023741254583001137, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 432.6875, "completions/mean_terminated_length": 353.375, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.6136, "grad_norm": 2.2307021617889404, "kl": 0.044036865234375, "learning_rate": 1e-06, "loss": 0.0143, "num_tokens": 10269047.0, "reward": 0.030770011246204376, "reward_std": 0.007664786651730537, "rewards/bleu_reward_func/mean": 0.030770011246204376, "rewards/bleu_reward_func/std": 0.013669944368302822, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 456.28125, "completions/mean_terminated_length": 374.8461608886719, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.6144, "grad_norm": 2.2227320671081543, "kl": 0.050994873046875, "learning_rate": 1e-06, "loss": -0.0417, "num_tokens": 10288664.0, "reward": 0.05277414619922638, "reward_std": 0.01249920204281807, "rewards/bleu_reward_func/mean": 0.05277414619922638, "rewards/bleu_reward_func/std": 0.031958963721990585, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 222.21875, "completions/mean_terminated_length": 212.87095642089844, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.6152, "grad_norm": 4.83256196975708, "kl": 0.0826416015625, "learning_rate": 1e-06, "loss": 0.0396, "num_tokens": 10299135.0, "reward": 0.044097334146499634, "reward_std": 0.019661743193864822, "rewards/bleu_reward_func/mean": 0.044097334146499634, "rewards/bleu_reward_func/std": 0.025644388049840927, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 349.5, "completions/mean_terminated_length": 332.6896667480469, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.616, "grad_norm": 2.3625354766845703, "kl": 0.036376953125, "learning_rate": 1e-06, "loss": -0.0849, "num_tokens": 10312383.0, "reward": 0.03365849331021309, "reward_std": 0.009999667294323444, "rewards/bleu_reward_func/mean": 0.03365849331021309, "rewards/bleu_reward_func/std": 0.0307177621871233, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 341.3125, "completions/mean_terminated_length": 293.5199890136719, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.6168, "grad_norm": 2.5119853019714355, "kl": 0.0298614501953125, "learning_rate": 1e-06, "loss": -0.1266, "num_tokens": 10326233.0, "reward": 0.06748858094215393, "reward_std": 0.056034184992313385, "rewards/bleu_reward_func/mean": 0.06748858094215393, "rewards/bleu_reward_func/std": 0.06614639610052109, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 199.03125, "completions/mean_terminated_length": 188.93548583984375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.6176, "grad_norm": 3.840604066848755, "kl": 0.0477294921875, "learning_rate": 1e-06, "loss": -0.0144, "num_tokens": 10338010.0, "reward": 0.05809897184371948, "reward_std": 0.03548566997051239, "rewards/bleu_reward_func/mean": 0.05809897184371948, "rewards/bleu_reward_func/std": 0.03829097002744675, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 268.5625, "completions/mean_terminated_length": 268.5625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.6184, "grad_norm": 3.002624273300171, "kl": 0.033477783203125, "learning_rate": 1e-06, "loss": -0.0311, "num_tokens": 10348676.0, "reward": 0.04606045410037041, "reward_std": 0.013474350795149803, "rewards/bleu_reward_func/mean": 0.04606045410037041, "rewards/bleu_reward_func/std": 0.03948834538459778, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 259.625, "completions/mean_terminated_length": 175.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.6192, "grad_norm": 4.124053955078125, "kl": 0.055450439453125, "learning_rate": 1e-06, "loss": -0.0785, "num_tokens": 10361736.0, "reward": 0.05262724682688713, "reward_std": 0.00995348859578371, "rewards/bleu_reward_func/mean": 0.05262724682688713, "rewards/bleu_reward_func/std": 0.04580436274409294, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 268.09375, "completions/mean_terminated_length": 268.09375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.62, "grad_norm": 3.3182435035705566, "kl": 0.05712890625, "learning_rate": 1e-06, "loss": 0.0194, "num_tokens": 10372547.0, "reward": 0.03659249097108841, "reward_std": 0.01361929066479206, "rewards/bleu_reward_func/mean": 0.03659249097108841, "rewards/bleu_reward_func/std": 0.017074862495064735, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 399.0625, "completions/mean_terminated_length": 339.9047546386719, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.6208, "grad_norm": 1.9342231750488281, "kl": 0.035369873046875, "learning_rate": 1e-06, "loss": 0.0046, "num_tokens": 10388957.0, "reward": 0.09792732447385788, "reward_std": 0.02487635612487793, "rewards/bleu_reward_func/mean": 0.09792732447385788, "rewards/bleu_reward_func/std": 0.0962534099817276, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 167.1875, "completions/mean_terminated_length": 167.1875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.6216, "grad_norm": 6.700768947601318, "kl": 0.04681396484375, "learning_rate": 1e-06, "loss": -0.0026, "num_tokens": 10397779.0, "reward": 0.032624099403619766, "reward_std": 0.011605742387473583, "rewards/bleu_reward_func/mean": 0.032624099403619766, "rewards/bleu_reward_func/std": 0.020308885723352432, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 385.75, "completions/mean_terminated_length": 319.6190490722656, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.6224, "grad_norm": 2.1677191257476807, "kl": 0.051605224609375, "learning_rate": 1e-06, "loss": 0.0461, "num_tokens": 10414483.0, "reward": 0.08826855570077896, "reward_std": 0.02615802362561226, "rewards/bleu_reward_func/mean": 0.08826855570077896, "rewards/bleu_reward_func/std": 0.1053905114531517, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 252.5625, "completions/mean_terminated_length": 179.9199981689453, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.6232, "grad_norm": 6.194201469421387, "kl": 0.04010009765625, "learning_rate": 1e-06, "loss": -0.0265, "num_tokens": 10425557.0, "reward": 0.02823360078036785, "reward_std": 0.03581786900758743, "rewards/bleu_reward_func/mean": 0.02823360078036785, "rewards/bleu_reward_func/std": 0.04058787599205971, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 226.40625, "completions/mean_terminated_length": 131.20834350585938, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.624, "grad_norm": 5.036434173583984, "kl": 0.050079345703125, "learning_rate": 1e-06, "loss": 0.1302, "num_tokens": 10436914.0, "reward": 0.044641509652137756, "reward_std": 0.016347650438547134, "rewards/bleu_reward_func/mean": 0.044641509652137756, "rewards/bleu_reward_func/std": 0.03360149264335632, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 204.0, "completions/mean_terminated_length": 194.06451416015625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.6248, "grad_norm": 2.847588062286377, "kl": 0.033935546875, "learning_rate": 1e-06, "loss": 0.0755, "num_tokens": 10448866.0, "reward": 0.07604652643203735, "reward_std": 0.022845547646284103, "rewards/bleu_reward_func/mean": 0.07604652643203735, "rewards/bleu_reward_func/std": 0.052399422973394394, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 202.4375, "completions/mean_terminated_length": 202.4375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.6256, "grad_norm": 3.90671968460083, "kl": 0.033660888671875, "learning_rate": 1e-06, "loss": 0.0688, "num_tokens": 10462408.0, "reward": 0.05024778097867966, "reward_std": 0.02265213616192341, "rewards/bleu_reward_func/mean": 0.05024778097867966, "rewards/bleu_reward_func/std": 0.03187122941017151, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 276.78125, "completions/mean_terminated_length": 198.375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.6264, "grad_norm": 3.0001416206359863, "kl": 0.04388427734375, "learning_rate": 1e-06, "loss": -0.0108, "num_tokens": 10473897.0, "reward": 0.03825919330120087, "reward_std": 0.013500811532139778, "rewards/bleu_reward_func/mean": 0.03825919330120087, "rewards/bleu_reward_func/std": 0.02070869877934456, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 254.8125, "completions/mean_terminated_length": 195.4615478515625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.6272, "grad_norm": 2.65230131149292, "kl": 0.03281402587890625, "learning_rate": 1e-06, "loss": -0.1691, "num_tokens": 10485235.0, "reward": 0.08533032983541489, "reward_std": 0.05742814019322395, "rewards/bleu_reward_func/mean": 0.08533032983541489, "rewards/bleu_reward_func/std": 0.106187604367733, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 319.90625, "completions/mean_terminated_length": 292.46429443359375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.628, "grad_norm": 2.5516910552978516, "kl": 0.042205810546875, "learning_rate": 1e-06, "loss": -0.1961, "num_tokens": 10502464.0, "reward": 0.047782108187675476, "reward_std": 0.032827217131853104, "rewards/bleu_reward_func/mean": 0.047782108187675476, "rewards/bleu_reward_func/std": 0.04888693243265152, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 302.4375, "completions/mean_terminated_length": 207.18182373046875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.6288, "grad_norm": 3.678699493408203, "kl": 0.05059814453125, "learning_rate": 1e-06, "loss": -0.2486, "num_tokens": 10514454.0, "reward": 0.0350123755633831, "reward_std": 0.019910816103219986, "rewards/bleu_reward_func/mean": 0.0350123755633831, "rewards/bleu_reward_func/std": 0.03280069679021835, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 230.03125, "completions/mean_terminated_length": 220.9354705810547, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.6296, "grad_norm": 3.5091724395751953, "kl": 0.060211181640625, "learning_rate": 1e-06, "loss": -0.035, "num_tokens": 10526559.0, "reward": 0.0770467221736908, "reward_std": 0.025734489783644676, "rewards/bleu_reward_func/mean": 0.0770467221736908, "rewards/bleu_reward_func/std": 0.06628144532442093, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 315.65625, "completions/mean_terminated_length": 238.8260955810547, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.6304, "grad_norm": 2.624885082244873, "kl": 0.030364990234375, "learning_rate": 1e-06, "loss": 0.0859, "num_tokens": 10538644.0, "reward": 0.031655922532081604, "reward_std": 0.012592853978276253, "rewards/bleu_reward_func/mean": 0.031655922532081604, "rewards/bleu_reward_func/std": 0.015495400875806808, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 320.40625, "completions/mean_terminated_length": 256.54168701171875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.6312, "grad_norm": 7.265035152435303, "kl": 0.045196533203125, "learning_rate": 1e-06, "loss": -0.0586, "num_tokens": 10551681.0, "reward": 0.11770100891590118, "reward_std": 0.048164598643779755, "rewards/bleu_reward_func/mean": 0.11770100891590118, "rewards/bleu_reward_func/std": 0.10264891386032104, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 350.5, "completions/mean_terminated_length": 189.0, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.632, "grad_norm": 2.7487597465515137, "kl": 0.0435791015625, "learning_rate": 1e-06, "loss": -0.0856, "num_tokens": 10567657.0, "reward": 0.030746258795261383, "reward_std": 0.012422507628798485, "rewards/bleu_reward_func/mean": 0.030746258795261383, "rewards/bleu_reward_func/std": 0.0163208469748497, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 312.5, "completions/mean_terminated_length": 246.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.6328, "grad_norm": 5.9650187492370605, "kl": 0.049163818359375, "learning_rate": 1e-06, "loss": -0.1018, "num_tokens": 10580481.0, "reward": 0.1085551381111145, "reward_std": 0.03489597514271736, "rewards/bleu_reward_func/mean": 0.1085551381111145, "rewards/bleu_reward_func/std": 0.07419593632221222, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 274.34375, "completions/mean_terminated_length": 258.5, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.6336, "grad_norm": 2.4724159240722656, "kl": 0.034820556640625, "learning_rate": 1e-06, "loss": -0.0042, "num_tokens": 10591612.0, "reward": 0.048242103308439255, "reward_std": 0.018077358603477478, "rewards/bleu_reward_func/mean": 0.048242103308439255, "rewards/bleu_reward_func/std": 0.038909122347831726, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 399.28125, "completions/mean_terminated_length": 355.1739196777344, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.6344, "grad_norm": 2.54805588722229, "kl": 0.025634765625, "learning_rate": 1e-06, "loss": -0.085, "num_tokens": 10606909.0, "reward": 0.05102061852812767, "reward_std": 0.021770458668470383, "rewards/bleu_reward_func/mean": 0.05102061852812767, "rewards/bleu_reward_func/std": 0.025904852896928787, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 353.25, "completions/mean_terminated_length": 336.82757568359375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.6352, "grad_norm": 2.3026504516601562, "kl": 0.040191650390625, "learning_rate": 1e-06, "loss": -0.0445, "num_tokens": 10622421.0, "reward": 0.044227294623851776, "reward_std": 0.01541022676974535, "rewards/bleu_reward_func/mean": 0.044227294623851776, "rewards/bleu_reward_func/std": 0.027547884732484818, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 400.59375, "completions/mean_terminated_length": 289.1875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.636, "grad_norm": 2.2095108032226562, "kl": 0.049224853515625, "learning_rate": 1e-06, "loss": -0.0347, "num_tokens": 10639072.0, "reward": 0.04881560057401657, "reward_std": 0.02202250249683857, "rewards/bleu_reward_func/mean": 0.04881560057401657, "rewards/bleu_reward_func/std": 0.053479425609111786, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 348.40625, "completions/mean_terminated_length": 204.05882263183594, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.6368, "grad_norm": 2.7905285358428955, "kl": 0.04681396484375, "learning_rate": 1e-06, "loss": 0.1886, "num_tokens": 10652597.0, "reward": 0.08770878612995148, "reward_std": 0.03572450950741768, "rewards/bleu_reward_func/mean": 0.08770878612995148, "rewards/bleu_reward_func/std": 0.07477344572544098, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 477.6875, "completions/mean_terminated_length": 420.5, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.6376, "grad_norm": 2.1412551403045654, "kl": 0.04559326171875, "learning_rate": 1e-06, "loss": 0.0094, "num_tokens": 10670643.0, "reward": 0.07936374843120575, "reward_std": 0.015750272199511528, "rewards/bleu_reward_func/mean": 0.07936374843120575, "rewards/bleu_reward_func/std": 0.044607013463974, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 369.5625, "completions/mean_terminated_length": 258.77777099609375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.6384, "grad_norm": 7.5998334884643555, "kl": 0.0509033203125, "learning_rate": 1e-06, "loss": 0.0505, "num_tokens": 10688821.0, "reward": 0.053146444261074066, "reward_std": 0.020619917660951614, "rewards/bleu_reward_func/mean": 0.053146444261074066, "rewards/bleu_reward_func/std": 0.03774289786815643, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 388.0, "completions/mean_terminated_length": 264.0, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.6392, "grad_norm": 2.1621346473693848, "kl": 0.03448486328125, "learning_rate": 1e-06, "loss": -0.0252, "num_tokens": 10707373.0, "reward": 0.01923811435699463, "reward_std": 0.0031193974427878857, "rewards/bleu_reward_func/mean": 0.01923811435699463, "rewards/bleu_reward_func/std": 0.02292207069694996, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 430.6875, "completions/mean_terminated_length": 375.0526428222656, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.64, "grad_norm": 2.0542428493499756, "kl": 0.038909912109375, "learning_rate": 1e-06, "loss": -0.0029, "num_tokens": 10724091.0, "reward": 0.04182392358779907, "reward_std": 0.018276244401931763, "rewards/bleu_reward_func/mean": 0.04182392358779907, "rewards/bleu_reward_func/std": 0.023004500195384026, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 292.4375, "completions/mean_terminated_length": 219.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.6408, "grad_norm": 3.799773931503296, "kl": 0.049468994140625, "learning_rate": 1e-06, "loss": -0.1183, "num_tokens": 10736089.0, "reward": 0.05615938454866409, "reward_std": 0.019352678209543228, "rewards/bleu_reward_func/mean": 0.05615938454866409, "rewards/bleu_reward_func/std": 0.04468993842601776, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 253.09375, "completions/mean_terminated_length": 235.83334350585938, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.6416, "grad_norm": 2.5247280597686768, "kl": 0.024139404296875, "learning_rate": 1e-06, "loss": -0.1418, "num_tokens": 10748996.0, "reward": 0.1489913910627365, "reward_std": 0.06708651781082153, "rewards/bleu_reward_func/mean": 0.1489913910627365, "rewards/bleu_reward_func/std": 0.24455946683883667, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 262.65625, "completions/mean_terminated_length": 192.83999633789062, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.6424, "grad_norm": 3.614666223526001, "kl": 0.07672119140625, "learning_rate": 1e-06, "loss": -0.063, "num_tokens": 10759945.0, "reward": 0.08040255308151245, "reward_std": 0.023083828389644623, "rewards/bleu_reward_func/mean": 0.08040255308151245, "rewards/bleu_reward_func/std": 0.05763205140829086, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 379.0, "completions/mean_terminated_length": 309.3333435058594, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.6432, "grad_norm": 2.4461886882781982, "kl": 0.05419921875, "learning_rate": 1e-06, "loss": -0.0328, "num_tokens": 10777153.0, "reward": 0.08907453715801239, "reward_std": 0.022760968655347824, "rewards/bleu_reward_func/mean": 0.08907453715801239, "rewards/bleu_reward_func/std": 0.06350069493055344, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 374.28125, "completions/mean_terminated_length": 302.1428527832031, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.644, "grad_norm": 2.2589452266693115, "kl": 0.0455322265625, "learning_rate": 1e-06, "loss": 0.1583, "num_tokens": 10791714.0, "reward": 0.04732927680015564, "reward_std": 0.01938834972679615, "rewards/bleu_reward_func/mean": 0.04732927680015564, "rewards/bleu_reward_func/std": 0.04413124546408653, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 410.3125, "completions/mean_terminated_length": 240.83334350585938, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.6448, "grad_norm": 2.670647382736206, "kl": 0.07098388671875, "learning_rate": 1e-06, "loss": -0.0396, "num_tokens": 10809420.0, "reward": 0.04357248917222023, "reward_std": 0.008856004104018211, "rewards/bleu_reward_func/mean": 0.04357248917222023, "rewards/bleu_reward_func/std": 0.0222486425191164, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 200.65625, "completions/mean_terminated_length": 190.61289978027344, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.6456, "grad_norm": 5.75560998916626, "kl": 0.06439208984375, "learning_rate": 1e-06, "loss": 0.1668, "num_tokens": 10819033.0, "reward": 0.054602716118097305, "reward_std": 0.0195465050637722, "rewards/bleu_reward_func/mean": 0.054602716118097305, "rewards/bleu_reward_func/std": 0.03817151114344597, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 512.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 408.75, "completions/mean_terminated_length": 257.8461608886719, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.6464, "grad_norm": 2.3174612522125244, "kl": 0.0399169921875, "learning_rate": 1e-06, "loss": -0.0048, "num_tokens": 10834729.0, "reward": 0.07070265710353851, "reward_std": 0.02076653018593788, "rewards/bleu_reward_func/mean": 0.07070265710353851, "rewards/bleu_reward_func/std": 0.04173728823661804, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 389.53125, "completions/mean_terminated_length": 385.58062744140625, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.6472, "grad_norm": 2.186995267868042, "kl": 0.0399169921875, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 10852794.0, "reward": 0.07276012748479843, "reward_std": 0.019968077540397644, "rewards/bleu_reward_func/mean": 0.07276012748479843, "rewards/bleu_reward_func/std": 0.02828538604080677, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 334.09375, "completions/mean_terminated_length": 253.22727966308594, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.648, "grad_norm": 2.8079397678375244, "kl": 0.055694580078125, "learning_rate": 1e-06, "loss": -0.1421, "num_tokens": 10866021.0, "reward": 0.05294843763113022, "reward_std": 0.02259877324104309, "rewards/bleu_reward_func/mean": 0.05294843763113022, "rewards/bleu_reward_func/std": 0.03158554434776306, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 364.0625, "completions/mean_terminated_length": 306.1739196777344, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.6488, "grad_norm": 2.486483335494995, "kl": 0.038848876953125, "learning_rate": 1e-06, "loss": 0.0101, "num_tokens": 10879623.0, "reward": 0.04237189143896103, "reward_std": 0.022582165896892548, "rewards/bleu_reward_func/mean": 0.04237189143896103, "rewards/bleu_reward_func/std": 0.0278725977987051, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 326.96875, "completions/mean_terminated_length": 254.56521606445312, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.6496, "grad_norm": 2.8792357444763184, "kl": 0.0413818359375, "learning_rate": 1e-06, "loss": -0.0378, "num_tokens": 10892558.0, "reward": 0.04466039687395096, "reward_std": 0.01942962221801281, "rewards/bleu_reward_func/mean": 0.04466039687395096, "rewards/bleu_reward_func/std": 0.036387860774993896, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 334.875, "completions/mean_terminated_length": 302.0740661621094, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.6504, "grad_norm": 2.8896868228912354, "kl": 0.05438232421875, "learning_rate": 1e-06, "loss": 0.0963, "num_tokens": 10905386.0, "reward": 0.08064363896846771, "reward_std": 0.026876429095864296, "rewards/bleu_reward_func/mean": 0.08064363896846771, "rewards/bleu_reward_func/std": 0.07220647484064102, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 418.3125, "completions/mean_terminated_length": 335.6470642089844, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.6512, "grad_norm": 2.1156792640686035, "kl": 0.041168212890625, "learning_rate": 1e-06, "loss": -0.0214, "num_tokens": 10922348.0, "reward": 0.05280526727437973, "reward_std": 0.020577870309352875, "rewards/bleu_reward_func/mean": 0.05280526727437973, "rewards/bleu_reward_func/std": 0.04993457347154617, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 377.28125, "completions/mean_terminated_length": 324.5652160644531, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.652, "grad_norm": 2.4333136081695557, "kl": 0.05401611328125, "learning_rate": 1e-06, "loss": -0.0442, "num_tokens": 10937101.0, "reward": 0.042016930878162384, "reward_std": 0.013686037622392178, "rewards/bleu_reward_func/mean": 0.042016930878162384, "rewards/bleu_reward_func/std": 0.023568252101540565, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 327.4375, "completions/mean_terminated_length": 275.7599792480469, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.6528, "grad_norm": 2.788355827331543, "kl": 0.050811767578125, "learning_rate": 1e-06, "loss": -0.1467, "num_tokens": 10950227.0, "reward": 0.047510623931884766, "reward_std": 0.02448885142803192, "rewards/bleu_reward_func/mean": 0.047510623931884766, "rewards/bleu_reward_func/std": 0.054734427481889725, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 343.71875, "completions/mean_terminated_length": 212.8333282470703, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.6536, "grad_norm": 3.3152077198028564, "kl": 0.0623779296875, "learning_rate": 1e-06, "loss": 0.1212, "num_tokens": 10964714.0, "reward": 0.03251378983259201, "reward_std": 0.008201804012060165, "rewards/bleu_reward_func/mean": 0.03251378983259201, "rewards/bleu_reward_func/std": 0.02223658747971058, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 211.125, "completions/mean_terminated_length": 191.06668090820312, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.6544, "grad_norm": 3.372784376144409, "kl": 0.07244873046875, "learning_rate": 1e-06, "loss": -0.0801, "num_tokens": 10975582.0, "reward": 0.07603277266025543, "reward_std": 0.024979114532470703, "rewards/bleu_reward_func/mean": 0.07603277266025543, "rewards/bleu_reward_func/std": 0.07992418855428696, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 253.15625, "completions/mean_terminated_length": 166.875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.6552, "grad_norm": 4.9590744972229, "kl": 0.06866455078125, "learning_rate": 1e-06, "loss": 0.1993, "num_tokens": 10985955.0, "reward": 0.07587097585201263, "reward_std": 0.032769832760095596, "rewards/bleu_reward_func/mean": 0.07587097585201263, "rewards/bleu_reward_func/std": 0.09669305384159088, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 361.96875, "completions/mean_terminated_length": 303.2608642578125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.656, "grad_norm": 2.3579721450805664, "kl": 0.036773681640625, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 11001882.0, "reward": 0.1000506579875946, "reward_std": 0.01921307109296322, "rewards/bleu_reward_func/mean": 0.1000506579875946, "rewards/bleu_reward_func/std": 0.12420003116130829, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 364.375, "completions/mean_terminated_length": 323.03997802734375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.6568, "grad_norm": 3.5548665523529053, "kl": 0.0386962890625, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 11017006.0, "reward": 0.1209985613822937, "reward_std": 0.04797535389661789, "rewards/bleu_reward_func/mean": 0.1209985613822937, "rewards/bleu_reward_func/std": 0.10385487228631973, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 357.34375, "completions/mean_terminated_length": 287.04547119140625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.6576, "grad_norm": 2.376903772354126, "kl": 0.0343017578125, "learning_rate": 1e-06, "loss": 0.1205, "num_tokens": 11031105.0, "reward": 0.043021492660045624, "reward_std": 0.020202111452817917, "rewards/bleu_reward_func/mean": 0.043021492660045624, "rewards/bleu_reward_func/std": 0.03094971366226673, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 295.6875, "completions/mean_terminated_length": 197.3636474609375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.6584, "grad_norm": 3.0443296432495117, "kl": 0.049591064453125, "learning_rate": 1e-06, "loss": -0.0138, "num_tokens": 11047383.0, "reward": 0.09186838567256927, "reward_std": 0.024521898478269577, "rewards/bleu_reward_func/mean": 0.09186838567256927, "rewards/bleu_reward_func/std": 0.1259845644235611, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 226.6875, "completions/mean_terminated_length": 226.6875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6592, "grad_norm": 6.048206329345703, "kl": 0.04132080078125, "learning_rate": 1e-06, "loss": 0.1452, "num_tokens": 11057229.0, "reward": 0.12500673532485962, "reward_std": 0.05105290934443474, "rewards/bleu_reward_func/mean": 0.12500673532485962, "rewards/bleu_reward_func/std": 0.12394329905509949, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 395.625, "completions/mean_terminated_length": 263.73333740234375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.66, "grad_norm": 2.4559149742126465, "kl": 0.06475830078125, "learning_rate": 1e-06, "loss": 0.0041, "num_tokens": 11074705.0, "reward": 0.1022477000951767, "reward_std": 0.03607521206140518, "rewards/bleu_reward_func/mean": 0.1022477000951767, "rewards/bleu_reward_func/std": 0.06924700736999512, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 304.78125, "completions/mean_terminated_length": 266.40740966796875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.6608, "grad_norm": 3.020120620727539, "kl": 0.05322265625, "learning_rate": 1e-06, "loss": 0.0953, "num_tokens": 11087218.0, "reward": 0.02280343510210514, "reward_std": 0.004782763309776783, "rewards/bleu_reward_func/mean": 0.02280343510210514, "rewards/bleu_reward_func/std": 0.005676077678799629, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 288.90625, "completions/mean_terminated_length": 281.70965576171875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.6616, "grad_norm": 2.7771506309509277, "kl": 0.03759765625, "learning_rate": 1e-06, "loss": -0.1367, "num_tokens": 11099039.0, "reward": 0.0848507508635521, "reward_std": 0.03843146190047264, "rewards/bleu_reward_func/mean": 0.0848507508635521, "rewards/bleu_reward_func/std": 0.05654771253466606, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 283.96875, "completions/mean_terminated_length": 241.74073791503906, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.6624, "grad_norm": 2.895529270172119, "kl": 0.060211181640625, "learning_rate": 1e-06, "loss": -0.0433, "num_tokens": 11110862.0, "reward": 0.029147926717996597, "reward_std": 0.008030948229134083, "rewards/bleu_reward_func/mean": 0.029147926717996597, "rewards/bleu_reward_func/std": 0.012568029575049877, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 339.15625, "completions/mean_terminated_length": 290.7599792480469, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.6632, "grad_norm": 2.2857818603515625, "kl": 0.0377197265625, "learning_rate": 1e-06, "loss": 0.1356, "num_tokens": 11124107.0, "reward": 0.024584250524640083, "reward_std": 0.011461092159152031, "rewards/bleu_reward_func/mean": 0.024584250524640083, "rewards/bleu_reward_func/std": 0.014021635986864567, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 339.46875, "completions/mean_terminated_length": 261.04547119140625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.664, "grad_norm": 3.047325372695923, "kl": 0.07183837890625, "learning_rate": 1e-06, "loss": -0.0554, "num_tokens": 11138034.0, "reward": 0.03661057725548744, "reward_std": 0.009148099459707737, "rewards/bleu_reward_func/mean": 0.03661057725548744, "rewards/bleu_reward_func/std": 0.01764591969549656, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 340.0625, "completions/mean_terminated_length": 272.7826232910156, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.6648, "grad_norm": 2.7614407539367676, "kl": 0.03033447265625, "learning_rate": 1e-06, "loss": -0.0483, "num_tokens": 11151476.0, "reward": 0.05648787319660187, "reward_std": 0.035596661269664764, "rewards/bleu_reward_func/mean": 0.05648787319660187, "rewards/bleu_reward_func/std": 0.055284641683101654, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 288.59375, "completions/mean_terminated_length": 273.70001220703125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.6656, "grad_norm": 2.4882915019989014, "kl": 0.05047607421875, "learning_rate": 1e-06, "loss": 0.0126, "num_tokens": 11165719.0, "reward": 0.07529251277446747, "reward_std": 0.032291531562805176, "rewards/bleu_reward_func/mean": 0.07529251277446747, "rewards/bleu_reward_func/std": 0.05323096737265587, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 407.15625, "completions/mean_terminated_length": 366.13043212890625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.6664, "grad_norm": 1.9872773885726929, "kl": 0.0447998046875, "learning_rate": 1e-06, "loss": 0.0506, "num_tokens": 11181668.0, "reward": 0.06329767405986786, "reward_std": 0.016374491155147552, "rewards/bleu_reward_func/mean": 0.06329767405986786, "rewards/bleu_reward_func/std": 0.04609806835651398, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 286.03125, "completions/mean_terminated_length": 270.9666748046875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.6672, "grad_norm": 2.7774431705474854, "kl": 0.045074462890625, "learning_rate": 1e-06, "loss": 0.0337, "num_tokens": 11193869.0, "reward": 0.044695161283016205, "reward_std": 0.01957538165152073, "rewards/bleu_reward_func/mean": 0.044695161283016205, "rewards/bleu_reward_func/std": 0.02945699170231819, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 258.09375, "completions/mean_terminated_length": 249.90321350097656, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.668, "grad_norm": 2.96142315864563, "kl": 0.040771484375, "learning_rate": 1e-06, "loss": 0.3593, "num_tokens": 11207264.0, "reward": 0.055455561727285385, "reward_std": 0.029085490852594376, "rewards/bleu_reward_func/mean": 0.055455561727285385, "rewards/bleu_reward_func/std": 0.06734622269868851, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 368.5625, "completions/mean_terminated_length": 225.125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.6688, "grad_norm": 2.931868076324463, "kl": 0.0482177734375, "learning_rate": 1e-06, "loss": -0.0531, "num_tokens": 11224506.0, "reward": 0.04490472376346588, "reward_std": 0.011960483156144619, "rewards/bleu_reward_func/mean": 0.04490472376346588, "rewards/bleu_reward_func/std": 0.015119385905563831, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 351.0, "completions/mean_terminated_length": 266.6666564941406, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.6696, "grad_norm": 2.4483470916748047, "kl": 0.0392913818359375, "learning_rate": 1e-06, "loss": -0.0262, "num_tokens": 11241138.0, "reward": 0.055649228394031525, "reward_std": 0.028573956340551376, "rewards/bleu_reward_func/mean": 0.055649228394031525, "rewards/bleu_reward_func/std": 0.05205146595835686, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 406.9375, "completions/mean_terminated_length": 359.18182373046875, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.6704, "grad_norm": 2.2513904571533203, "kl": 0.04888916015625, "learning_rate": 1e-06, "loss": 0.0047, "num_tokens": 11256696.0, "reward": 0.06957369297742844, "reward_std": 0.016341013833880424, "rewards/bleu_reward_func/mean": 0.06957369297742844, "rewards/bleu_reward_func/std": 0.047397319227457047, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 359.65625, "completions/mean_terminated_length": 187.00001525878906, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.6712, "grad_norm": 4.712099075317383, "kl": 0.1260986328125, "learning_rate": 1e-06, "loss": 0.0522, "num_tokens": 11271373.0, "reward": 0.0421738438308239, "reward_std": 0.012205126695334911, "rewards/bleu_reward_func/mean": 0.0421738438308239, "rewards/bleu_reward_func/std": 0.017471130937337875, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 199.875, "completions/mean_terminated_length": 179.06668090820312, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.672, "grad_norm": 5.982988357543945, "kl": 0.0614013671875, "learning_rate": 1e-06, "loss": 0.015, "num_tokens": 11280041.0, "reward": 0.03582533821463585, "reward_std": 0.008306249044835567, "rewards/bleu_reward_func/mean": 0.03582533821463585, "rewards/bleu_reward_func/std": 0.010333981364965439, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 400.71875, "completions/mean_terminated_length": 363.625, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.6728, "grad_norm": 2.2374308109283447, "kl": 0.04034423828125, "learning_rate": 1e-06, "loss": -0.0348, "num_tokens": 11297040.0, "reward": 0.020916707813739777, "reward_std": 0.005250955931842327, "rewards/bleu_reward_func/mean": 0.020916707813739777, "rewards/bleu_reward_func/std": 0.012848958373069763, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 389.71875, "completions/mean_terminated_length": 316.3500061035156, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.6736, "grad_norm": 2.3556578159332275, "kl": 0.041046142578125, "learning_rate": 1e-06, "loss": 0.0674, "num_tokens": 11313111.0, "reward": 0.05335354059934616, "reward_std": 0.018081864342093468, "rewards/bleu_reward_func/mean": 0.05335354059934616, "rewards/bleu_reward_func/std": 0.039343688637018204, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 418.0, "completions/mean_terminated_length": 311.4666748046875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.6744, "grad_norm": 2.2276229858398438, "kl": 0.05731201171875, "learning_rate": 1e-06, "loss": -0.06, "num_tokens": 11333615.0, "reward": 0.04341096431016922, "reward_std": 0.010301424190402031, "rewards/bleu_reward_func/mean": 0.04341096431016922, "rewards/bleu_reward_func/std": 0.028370829299092293, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 398.25, "completions/mean_terminated_length": 252.00001525878906, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.6752, "grad_norm": 4.02369499206543, "kl": 0.051849365234375, "learning_rate": 1e-06, "loss": -0.1083, "num_tokens": 11351647.0, "reward": 0.04887228459119797, "reward_std": 0.01692984066903591, "rewards/bleu_reward_func/mean": 0.04887228459119797, "rewards/bleu_reward_func/std": 0.04280164837837219, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 241.84375, "completions/mean_terminated_length": 233.1290283203125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.676, "grad_norm": 2.695026397705078, "kl": 0.045989990234375, "learning_rate": 1e-06, "loss": 0.0799, "num_tokens": 11362234.0, "reward": 0.045725200325250626, "reward_std": 0.020237425342202187, "rewards/bleu_reward_func/mean": 0.045725200325250626, "rewards/bleu_reward_func/std": 0.02247374691069126, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 337.78125, "completions/mean_terminated_length": 163.5625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.6768, "grad_norm": 2.6178877353668213, "kl": 0.0535888671875, "learning_rate": 1e-06, "loss": -0.0213, "num_tokens": 11378195.0, "reward": 0.07340110093355179, "reward_std": 0.027613390237092972, "rewards/bleu_reward_func/mean": 0.07340110093355179, "rewards/bleu_reward_func/std": 0.06718737632036209, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 315.125, "completions/mean_terminated_length": 315.125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.6776, "grad_norm": 2.427767753601074, "kl": 0.04351806640625, "learning_rate": 1e-06, "loss": 0.0628, "num_tokens": 11390599.0, "reward": 0.11289885640144348, "reward_std": 0.016688670963048935, "rewards/bleu_reward_func/mean": 0.11289885640144348, "rewards/bleu_reward_func/std": 0.10308769345283508, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 357.65625, "completions/mean_terminated_length": 276.8095397949219, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.6784, "grad_norm": 2.0324172973632812, "kl": 0.04730224609375, "learning_rate": 1e-06, "loss": 0.0362, "num_tokens": 11404364.0, "reward": 0.09633086621761322, "reward_std": 0.05087493732571602, "rewards/bleu_reward_func/mean": 0.09633086621761322, "rewards/bleu_reward_func/std": 0.15251684188842773, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 173.59375, "completions/mean_terminated_length": 173.59375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.6792, "grad_norm": 3.7329139709472656, "kl": 0.0618896484375, "learning_rate": 1e-06, "loss": 0.0479, "num_tokens": 11412271.0, "reward": 0.07129530608654022, "reward_std": 0.03777293860912323, "rewards/bleu_reward_func/mean": 0.07129530608654022, "rewards/bleu_reward_func/std": 0.07965229451656342, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 351.46875, "completions/mean_terminated_length": 190.9375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.68, "grad_norm": 2.7004735469818115, "kl": 0.0643310546875, "learning_rate": 1e-06, "loss": 0.0879, "num_tokens": 11427262.0, "reward": 0.10036734491586685, "reward_std": 0.018841760233044624, "rewards/bleu_reward_func/mean": 0.10036734491586685, "rewards/bleu_reward_func/std": 0.1509210765361786, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 301.78125, "completions/mean_terminated_length": 231.70834350585938, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.6808, "grad_norm": 2.8924641609191895, "kl": 0.0609130859375, "learning_rate": 1e-06, "loss": -0.0404, "num_tokens": 11439663.0, "reward": 0.02212350070476532, "reward_std": 0.007656387519091368, "rewards/bleu_reward_func/mean": 0.02212350070476532, "rewards/bleu_reward_func/std": 0.017611265182495117, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 168.875, "completions/mean_terminated_length": 168.875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6816, "grad_norm": 4.338345050811768, "kl": 0.031097412109375, "learning_rate": 1e-06, "loss": -0.1581, "num_tokens": 11448411.0, "reward": 0.055831264704465866, "reward_std": 0.034439653158187866, "rewards/bleu_reward_func/mean": 0.055831264704465866, "rewards/bleu_reward_func/std": 0.03716801106929779, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 241.03125, "completions/mean_terminated_length": 213.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.6824, "grad_norm": 2.851404905319214, "kl": 0.0327606201171875, "learning_rate": 1e-06, "loss": 0.0359, "num_tokens": 11458036.0, "reward": 0.06256793439388275, "reward_std": 0.02849128656089306, "rewards/bleu_reward_func/mean": 0.06256793439388275, "rewards/bleu_reward_func/std": 0.0491851307451725, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 343.21875, "completions/mean_terminated_length": 266.5, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.6832, "grad_norm": 2.9721109867095947, "kl": 0.03997802734375, "learning_rate": 1e-06, "loss": -0.0055, "num_tokens": 11471459.0, "reward": 0.03509419411420822, "reward_std": 0.007715485990047455, "rewards/bleu_reward_func/mean": 0.03509419411420822, "rewards/bleu_reward_func/std": 0.02039431221783161, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 383.65625, "completions/mean_terminated_length": 316.4285888671875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.684, "grad_norm": 2.379918336868286, "kl": 0.039520263671875, "learning_rate": 1e-06, "loss": 0.1156, "num_tokens": 11486432.0, "reward": 0.017331784591078758, "reward_std": 0.008001319132745266, "rewards/bleu_reward_func/mean": 0.017331784591078758, "rewards/bleu_reward_func/std": 0.009670063853263855, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.84375, "completions/max_length": 512.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 480.375, "completions/mean_terminated_length": 309.6000061035156, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.6848, "grad_norm": 2.0767323970794678, "kl": 0.05438232421875, "learning_rate": 1e-06, "loss": -0.0073, "num_tokens": 11507884.0, "reward": 0.030316852033138275, "reward_std": 0.01650041714310646, "rewards/bleu_reward_func/mean": 0.030316852033138275, "rewards/bleu_reward_func/std": 0.028973711654543877, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 260.28125, "completions/mean_terminated_length": 189.8000030517578, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.6856, "grad_norm": 3.4007835388183594, "kl": 0.040740966796875, "learning_rate": 1e-06, "loss": -0.0284, "num_tokens": 11520933.0, "reward": 0.051799893379211426, "reward_std": 0.018662042915821075, "rewards/bleu_reward_func/mean": 0.051799893379211426, "rewards/bleu_reward_func/std": 0.03477845713496208, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 205.84375, "completions/mean_terminated_length": 205.84375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.6864, "grad_norm": 3.2630794048309326, "kl": 0.057403564453125, "learning_rate": 1e-06, "loss": 0.0015, "num_tokens": 11529584.0, "reward": 0.05795145779848099, "reward_std": 0.02359882742166519, "rewards/bleu_reward_func/mean": 0.05795145779848099, "rewards/bleu_reward_func/std": 0.046813130378723145, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 479.65625, "completions/mean_terminated_length": 397.0, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.6872, "grad_norm": 2.053889274597168, "kl": 0.036376953125, "learning_rate": 1e-06, "loss": 0.011, "num_tokens": 11547757.0, "reward": 0.03168204054236412, "reward_std": 0.007757972460240126, "rewards/bleu_reward_func/mean": 0.03168204054236412, "rewards/bleu_reward_func/std": 0.019959628582000732, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 429.65625, "completions/mean_terminated_length": 373.3157958984375, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.688, "grad_norm": 2.033658504486084, "kl": 0.05596923828125, "learning_rate": 1e-06, "loss": -0.0154, "num_tokens": 11565514.0, "reward": 0.04394299536943436, "reward_std": 0.01149829663336277, "rewards/bleu_reward_func/mean": 0.04394299536943436, "rewards/bleu_reward_func/std": 0.034712888300418854, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 304.6875, "completions/mean_terminated_length": 180.3000030517578, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.6888, "grad_norm": 3.552960157394409, "kl": 0.054290771484375, "learning_rate": 1e-06, "loss": -0.0313, "num_tokens": 11579096.0, "reward": 0.06726164370775223, "reward_std": 0.022827234119176865, "rewards/bleu_reward_func/mean": 0.06726164370775223, "rewards/bleu_reward_func/std": 0.07642409950494766, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 438.09375, "completions/mean_terminated_length": 364.1875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.6896, "grad_norm": 2.134580135345459, "kl": 0.04888916015625, "learning_rate": 1e-06, "loss": -0.0561, "num_tokens": 11596747.0, "reward": 0.04212401062250137, "reward_std": 0.010989276692271233, "rewards/bleu_reward_func/mean": 0.04212401062250137, "rewards/bleu_reward_func/std": 0.011919494718313217, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 346.4375, "completions/mean_terminated_length": 291.25, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.6904, "grad_norm": 2.5982649326324463, "kl": 0.0455322265625, "learning_rate": 1e-06, "loss": -0.0463, "num_tokens": 11614025.0, "reward": 0.09341640025377274, "reward_std": 0.045992907136678696, "rewards/bleu_reward_func/mean": 0.09341640025377274, "rewards/bleu_reward_func/std": 0.08428861200809479, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 326.3125, "completions/mean_terminated_length": 291.9259338378906, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.6912, "grad_norm": 2.6331946849823, "kl": 0.034912109375, "learning_rate": 1e-06, "loss": -0.0157, "num_tokens": 11626787.0, "reward": 0.0679611936211586, "reward_std": 0.025558585301041603, "rewards/bleu_reward_func/mean": 0.0679611936211586, "rewards/bleu_reward_func/std": 0.041840873658657074, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 280.96875, "completions/mean_terminated_length": 227.6538543701172, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.692, "grad_norm": 6.058828830718994, "kl": 0.0814361572265625, "learning_rate": 1e-06, "loss": -0.0091, "num_tokens": 11640106.0, "reward": 0.1249103844165802, "reward_std": 0.021891392767429352, "rewards/bleu_reward_func/mean": 0.1249103844165802, "rewards/bleu_reward_func/std": 0.08676422387361526, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 382.15625, "completions/mean_terminated_length": 281.1666564941406, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.6928, "grad_norm": 2.0802950859069824, "kl": 0.0343017578125, "learning_rate": 1e-06, "loss": -0.072, "num_tokens": 11658143.0, "reward": 0.1035676896572113, "reward_std": 0.04491373896598816, "rewards/bleu_reward_func/mean": 0.1035676896572113, "rewards/bleu_reward_func/std": 0.08027082681655884, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 247.9375, "completions/mean_terminated_length": 247.9375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.6936, "grad_norm": 2.882784605026245, "kl": 0.061248779296875, "learning_rate": 1e-06, "loss": 0.1637, "num_tokens": 11668869.0, "reward": 0.03542046248912811, "reward_std": 0.010823436081409454, "rewards/bleu_reward_func/mean": 0.03542046248912811, "rewards/bleu_reward_func/std": 0.02019406110048294, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 431.28125, "completions/mean_terminated_length": 253.6999969482422, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.6944, "grad_norm": 2.128084182739258, "kl": 0.040283203125, "learning_rate": 1e-06, "loss": -0.0763, "num_tokens": 11685502.0, "reward": 0.02058091014623642, "reward_std": 0.005482015199959278, "rewards/bleu_reward_func/mean": 0.02058091014623642, "rewards/bleu_reward_func/std": 0.01129836868494749, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 265.78125, "completions/mean_terminated_length": 249.36668395996094, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6952, "grad_norm": 8.264237403869629, "kl": 0.040863037109375, "learning_rate": 1e-06, "loss": 0.0395, "num_tokens": 11696007.0, "reward": 0.04947113245725632, "reward_std": 0.010559817776083946, "rewards/bleu_reward_func/mean": 0.04947113245725632, "rewards/bleu_reward_func/std": 0.020097067579627037, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 282.5625, "completions/mean_terminated_length": 192.78260803222656, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.696, "grad_norm": 4.883482456207275, "kl": 0.053009033203125, "learning_rate": 1e-06, "loss": -0.0779, "num_tokens": 11709769.0, "reward": 0.05990312993526459, "reward_std": 0.015603918582201004, "rewards/bleu_reward_func/mean": 0.05990312993526459, "rewards/bleu_reward_func/std": 0.0239902064204216, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 335.125, "completions/mean_terminated_length": 179.05882263183594, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.6968, "grad_norm": 2.609515905380249, "kl": 0.04803466796875, "learning_rate": 1e-06, "loss": 0.0612, "num_tokens": 11723349.0, "reward": 0.03629864379763603, "reward_std": 0.01367709320038557, "rewards/bleu_reward_func/mean": 0.03629864379763603, "rewards/bleu_reward_func/std": 0.018606344237923622, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 268.46875, "completions/mean_terminated_length": 173.17391967773438, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.6976, "grad_norm": 6.626194000244141, "kl": 0.10137939453125, "learning_rate": 1e-06, "loss": -0.1281, "num_tokens": 11734708.0, "reward": 0.1171385869383812, "reward_std": 0.06305581331253052, "rewards/bleu_reward_func/mean": 0.1171385869383812, "rewards/bleu_reward_func/std": 0.18313851952552795, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 294.71875, "completions/mean_terminated_length": 222.2916717529297, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.6984, "grad_norm": 2.969879388809204, "kl": 0.05072021484375, "learning_rate": 1e-06, "loss": 0.0654, "num_tokens": 11746683.0, "reward": 0.09640492498874664, "reward_std": 0.04729197546839714, "rewards/bleu_reward_func/mean": 0.09640492498874664, "rewards/bleu_reward_func/std": 0.10373617708683014, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 241.09375, "completions/mean_terminated_length": 241.09375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.6992, "grad_norm": 4.217690944671631, "kl": 0.05010986328125, "learning_rate": 1e-06, "loss": -0.1491, "num_tokens": 11758790.0, "reward": 0.08641447871923447, "reward_std": 0.028802432119846344, "rewards/bleu_reward_func/mean": 0.08641447871923447, "rewards/bleu_reward_func/std": 0.04867434501647949, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 340.46875, "completions/mean_terminated_length": 273.34783935546875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.7, "grad_norm": 2.67362904548645, "kl": 0.039642333984375, "learning_rate": 1e-06, "loss": 0.0822, "num_tokens": 11772093.0, "reward": 0.05074232816696167, "reward_std": 0.023221492767333984, "rewards/bleu_reward_func/mean": 0.05074232816696167, "rewards/bleu_reward_func/std": 0.03728149086236954, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 199.28125, "completions/mean_terminated_length": 178.433349609375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.7008, "grad_norm": 14.30516529083252, "kl": 0.17584228515625, "learning_rate": 1e-06, "loss": 0.1759, "num_tokens": 11782310.0, "reward": 0.11937517672777176, "reward_std": 0.021323315799236298, "rewards/bleu_reward_func/mean": 0.11937517672777176, "rewards/bleu_reward_func/std": 0.15308451652526855, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 350.1875, "completions/mean_terminated_length": 304.8800048828125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.7016, "grad_norm": 2.588109254837036, "kl": 0.044189453125, "learning_rate": 1e-06, "loss": 0.0225, "num_tokens": 11796252.0, "reward": 0.05437985807657242, "reward_std": 0.017482522875070572, "rewards/bleu_reward_func/mean": 0.05437985807657242, "rewards/bleu_reward_func/std": 0.028886273503303528, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 344.3125, "completions/mean_terminated_length": 344.3125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.7024, "grad_norm": 3.1232547760009766, "kl": 0.033905029296875, "learning_rate": 1e-06, "loss": 0.0186, "num_tokens": 11810094.0, "reward": 0.07601115107536316, "reward_std": 0.022839991375803947, "rewards/bleu_reward_func/mean": 0.07601115107536316, "rewards/bleu_reward_func/std": 0.0427204929292202, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 382.4375, "completions/mean_terminated_length": 339.25, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.7032, "grad_norm": 2.555143117904663, "kl": 0.05218505859375, "learning_rate": 1e-06, "loss": 0.0152, "num_tokens": 11825772.0, "reward": 0.02484015002846718, "reward_std": 0.009351451881229877, "rewards/bleu_reward_func/mean": 0.02484015002846718, "rewards/bleu_reward_func/std": 0.015805674716830254, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 474.625, "completions/mean_terminated_length": 412.3333435058594, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.704, "grad_norm": 2.2161693572998047, "kl": 0.0496826171875, "learning_rate": 1e-06, "loss": -0.0232, "num_tokens": 11843736.0, "reward": 0.03902929276227951, "reward_std": 0.009834162890911102, "rewards/bleu_reward_func/mean": 0.03902929276227951, "rewards/bleu_reward_func/std": 0.03208939731121063, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 255.03125, "completions/mean_terminated_length": 255.03125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.7048, "grad_norm": 3.189509391784668, "kl": 0.038238525390625, "learning_rate": 1e-06, "loss": -0.2021, "num_tokens": 11853825.0, "reward": 0.06721623241901398, "reward_std": 0.04335642606019974, "rewards/bleu_reward_func/mean": 0.06721623241901398, "rewards/bleu_reward_func/std": 0.06198061630129814, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 335.375, "completions/mean_terminated_length": 229.40000915527344, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.7056, "grad_norm": 2.6181299686431885, "kl": 0.030914306640625, "learning_rate": 1e-06, "loss": 0.0489, "num_tokens": 11869869.0, "reward": 0.036412715911865234, "reward_std": 0.017522014677524567, "rewards/bleu_reward_func/mean": 0.036412715911865234, "rewards/bleu_reward_func/std": 0.02936912514269352, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 512.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 389.875, "completions/mean_terminated_length": 23.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.7064, "grad_norm": 6.321810245513916, "kl": 0.11798095703125, "learning_rate": 1e-06, "loss": -0.0217, "num_tokens": 11885249.0, "reward": 0.039120785892009735, "reward_std": 0.005084656178951263, "rewards/bleu_reward_func/mean": 0.039120785892009735, "rewards/bleu_reward_func/std": 0.021427100524306297, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 335.0, "completions/mean_terminated_length": 265.7391357421875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.7072, "grad_norm": 3.0909547805786133, "kl": 0.0372314453125, "learning_rate": 1e-06, "loss": -0.1258, "num_tokens": 11899921.0, "reward": 0.06846681982278824, "reward_std": 0.024172725155949593, "rewards/bleu_reward_func/mean": 0.06846681982278824, "rewards/bleu_reward_func/std": 0.04641611874103546, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 345.0, "completions/mean_terminated_length": 298.239990234375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.708, "grad_norm": 2.417402744293213, "kl": 0.027801513671875, "learning_rate": 1e-06, "loss": 0.1077, "num_tokens": 11914009.0, "reward": 0.0673043429851532, "reward_std": 0.031290117651224136, "rewards/bleu_reward_func/mean": 0.0673043429851532, "rewards/bleu_reward_func/std": 0.06805533170700073, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 244.4375, "completions/mean_terminated_length": 155.25, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.7088, "grad_norm": 3.7891881465911865, "kl": 0.030517578125, "learning_rate": 1e-06, "loss": 0.0371, "num_tokens": 11924071.0, "reward": 0.09851931035518646, "reward_std": 0.02925381436944008, "rewards/bleu_reward_func/mean": 0.09851931035518646, "rewards/bleu_reward_func/std": 0.061319418251514435, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 512.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 431.71875, "completions/mean_terminated_length": 278.4545593261719, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.7096, "grad_norm": 2.5203967094421387, "kl": 0.043731689453125, "learning_rate": 1e-06, "loss": -0.0583, "num_tokens": 11945294.0, "reward": 0.07713477313518524, "reward_std": 0.015437297523021698, "rewards/bleu_reward_func/mean": 0.07713477313518524, "rewards/bleu_reward_func/std": 0.035572707653045654, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 280.3125, "completions/mean_terminated_length": 203.08334350585938, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.7104, "grad_norm": 3.726912021636963, "kl": 0.0765380859375, "learning_rate": 1e-06, "loss": 0.1117, "num_tokens": 11957336.0, "reward": 0.06727063655853271, "reward_std": 0.029535435140132904, "rewards/bleu_reward_func/mean": 0.06727063655853271, "rewards/bleu_reward_func/std": 0.03691576421260834, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 244.625, "completions/mean_terminated_length": 236.0, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.7112, "grad_norm": 3.1454975605010986, "kl": 0.0346832275390625, "learning_rate": 1e-06, "loss": -0.1414, "num_tokens": 11967484.0, "reward": 0.05579820275306702, "reward_std": 0.0413711853325367, "rewards/bleu_reward_func/mean": 0.05579820275306702, "rewards/bleu_reward_func/std": 0.058547936379909515, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 401.40625, "completions/mean_terminated_length": 358.13043212890625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.712, "grad_norm": 2.106522798538208, "kl": 0.04827880859375, "learning_rate": 1e-06, "loss": 0.1188, "num_tokens": 11984513.0, "reward": 0.0451212078332901, "reward_std": 0.021773334592580795, "rewards/bleu_reward_func/mean": 0.0451212078332901, "rewards/bleu_reward_func/std": 0.028573498129844666, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 408.875, "completions/mean_terminated_length": 328.6666564941406, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.7128, "grad_norm": 2.2036218643188477, "kl": 0.05535888671875, "learning_rate": 1e-06, "loss": 0.0686, "num_tokens": 12000717.0, "reward": 0.0747871994972229, "reward_std": 0.020796824246644974, "rewards/bleu_reward_func/mean": 0.0747871994972229, "rewards/bleu_reward_func/std": 0.039392877370119095, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 346.125, "completions/mean_terminated_length": 281.2174072265625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.7136, "grad_norm": 2.303058624267578, "kl": 0.04833984375, "learning_rate": 1e-06, "loss": -0.0624, "num_tokens": 12014489.0, "reward": 0.08780650794506073, "reward_std": 0.030875790864229202, "rewards/bleu_reward_func/mean": 0.08780650794506073, "rewards/bleu_reward_func/std": 0.06451728194952011, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 241.125, "completions/mean_terminated_length": 165.27999877929688, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.7144, "grad_norm": 3.9183602333068848, "kl": 0.0474853515625, "learning_rate": 1e-06, "loss": 0.083, "num_tokens": 12025885.0, "reward": 0.0884522795677185, "reward_std": 0.03579477593302727, "rewards/bleu_reward_func/mean": 0.0884522795677185, "rewards/bleu_reward_func/std": 0.09096165746450424, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 344.59375, "completions/mean_terminated_length": 279.08697509765625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.7152, "grad_norm": 2.937997579574585, "kl": 0.0556640625, "learning_rate": 1e-06, "loss": 0.2263, "num_tokens": 12041160.0, "reward": 0.035102441906929016, "reward_std": 0.016423923894762993, "rewards/bleu_reward_func/mean": 0.035102441906929016, "rewards/bleu_reward_func/std": 0.03288643807172775, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 403.0, "completions/mean_terminated_length": 387.4285888671875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.716, "grad_norm": 2.2048861980438232, "kl": 0.03814697265625, "learning_rate": 1e-06, "loss": -0.0469, "num_tokens": 12056088.0, "reward": 0.02945767343044281, "reward_std": 0.014292486011981964, "rewards/bleu_reward_func/mean": 0.02945767343044281, "rewards/bleu_reward_func/std": 0.015402048826217651, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 236.46875, "completions/mean_terminated_length": 207.96551513671875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.7168, "grad_norm": 3.6095573902130127, "kl": 0.0482177734375, "learning_rate": 1e-06, "loss": 0.1104, "num_tokens": 12065823.0, "reward": 0.04199256747961044, "reward_std": 0.015113498084247112, "rewards/bleu_reward_func/mean": 0.04199256747961044, "rewards/bleu_reward_func/std": 0.031457021832466125, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 301.8125, "completions/mean_terminated_length": 242.95999145507812, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.7176, "grad_norm": 4.166423320770264, "kl": 0.056304931640625, "learning_rate": 1e-06, "loss": -0.1865, "num_tokens": 12078721.0, "reward": 0.06818559765815735, "reward_std": 0.049522291868925095, "rewards/bleu_reward_func/mean": 0.06818559765815735, "rewards/bleu_reward_func/std": 0.11108224838972092, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 318.9375, "completions/mean_terminated_length": 254.58334350585938, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.7184, "grad_norm": 3.5479068756103516, "kl": 0.06591796875, "learning_rate": 1e-06, "loss": 0.0512, "num_tokens": 12094239.0, "reward": 0.16995030641555786, "reward_std": 0.02772948332130909, "rewards/bleu_reward_func/mean": 0.16995030641555786, "rewards/bleu_reward_func/std": 0.19171921908855438, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 189.34375, "completions/mean_terminated_length": 81.79167175292969, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.7192, "grad_norm": 7.562760829925537, "kl": 0.04559326171875, "learning_rate": 1e-06, "loss": -0.0977, "num_tokens": 12103242.0, "reward": 0.06262214481830597, "reward_std": 0.01612667180597782, "rewards/bleu_reward_func/mean": 0.06262214481830597, "rewards/bleu_reward_func/std": 0.051322340965270996, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 288.5, "completions/mean_terminated_length": 265.3793029785156, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.72, "grad_norm": 3.002432107925415, "kl": 0.036834716796875, "learning_rate": 1e-06, "loss": -0.081, "num_tokens": 12114562.0, "reward": 0.08370120078325272, "reward_std": 0.027285337448120117, "rewards/bleu_reward_func/mean": 0.08370120078325272, "rewards/bleu_reward_func/std": 0.06422236561775208, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 413.4375, "completions/mean_terminated_length": 346.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.7208, "grad_norm": 1.9330230951309204, "kl": 0.041168212890625, "learning_rate": 1e-06, "loss": 0.076, "num_tokens": 12131424.0, "reward": 0.07572037726640701, "reward_std": 0.026293717324733734, "rewards/bleu_reward_func/mean": 0.07572037726640701, "rewards/bleu_reward_func/std": 0.055959172546863556, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 323.6875, "completions/mean_terminated_length": 250.0, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.7216, "grad_norm": 2.5331993103027344, "kl": 0.06146240234375, "learning_rate": 1e-06, "loss": 0.0994, "num_tokens": 12145606.0, "reward": 0.02315894514322281, "reward_std": 0.015285233967006207, "rewards/bleu_reward_func/mean": 0.02315894514322281, "rewards/bleu_reward_func/std": 0.024109287187457085, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 440.03125, "completions/mean_terminated_length": 368.0625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.7224, "grad_norm": 2.0079493522644043, "kl": 0.05462646484375, "learning_rate": 1e-06, "loss": 0.0609, "num_tokens": 12162431.0, "reward": 0.028865192085504532, "reward_std": 0.00568732712417841, "rewards/bleu_reward_func/mean": 0.028865192085504532, "rewards/bleu_reward_func/std": 0.019452739506959915, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 274.5, "completions/mean_terminated_length": 258.66668701171875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.7232, "grad_norm": 3.427934169769287, "kl": 0.059814453125, "learning_rate": 1e-06, "loss": 0.0583, "num_tokens": 12173223.0, "reward": 0.08930553495883942, "reward_std": 0.0346166156232357, "rewards/bleu_reward_func/mean": 0.08930553495883942, "rewards/bleu_reward_func/std": 0.06352285295724869, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 412.8125, "completions/mean_terminated_length": 325.29412841796875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.724, "grad_norm": 2.114323854446411, "kl": 0.04071044921875, "learning_rate": 1e-06, "loss": 0.0525, "num_tokens": 12189569.0, "reward": 0.0793527215719223, "reward_std": 0.0372716560959816, "rewards/bleu_reward_func/mean": 0.0793527215719223, "rewards/bleu_reward_func/std": 0.08270096778869629, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 269.40625, "completions/mean_terminated_length": 269.40625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.7248, "grad_norm": 3.237521171569824, "kl": 0.05322265625, "learning_rate": 1e-06, "loss": 0.1096, "num_tokens": 12200398.0, "reward": 0.03743357956409454, "reward_std": 0.011607276275753975, "rewards/bleu_reward_func/mean": 0.03743357956409454, "rewards/bleu_reward_func/std": 0.013176437467336655, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 323.46875, "completions/mean_terminated_length": 260.625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.7256, "grad_norm": 3.23765230178833, "kl": 0.05694580078125, "learning_rate": 1e-06, "loss": -0.0564, "num_tokens": 12214461.0, "reward": 0.023959729820489883, "reward_std": 0.009282315149903297, "rewards/bleu_reward_func/mean": 0.023959729820489883, "rewards/bleu_reward_func/std": 0.013883906416594982, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 512.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 469.96875, "completions/mean_terminated_length": 343.875, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.7264, "grad_norm": 2.196124315261841, "kl": 0.041290283203125, "learning_rate": 1e-06, "loss": 0.0084, "num_tokens": 12233852.0, "reward": 0.0776321142911911, "reward_std": 0.021009789779782295, "rewards/bleu_reward_func/mean": 0.0776321142911911, "rewards/bleu_reward_func/std": 0.0949036255478859, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 306.34375, "completions/mean_terminated_length": 237.7916717529297, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.7272, "grad_norm": 2.8371667861938477, "kl": 0.06854248046875, "learning_rate": 1e-06, "loss": -0.1164, "num_tokens": 12246503.0, "reward": 0.03938760608434677, "reward_std": 0.012361581437289715, "rewards/bleu_reward_func/mean": 0.03938760608434677, "rewards/bleu_reward_func/std": 0.028173107653856277, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 270.03125, "completions/mean_terminated_length": 245.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.728, "grad_norm": 6.242187023162842, "kl": 0.067474365234375, "learning_rate": 1e-06, "loss": 0.1375, "num_tokens": 12258464.0, "reward": 0.03787752240896225, "reward_std": 0.0169361662119627, "rewards/bleu_reward_func/mean": 0.03787752240896225, "rewards/bleu_reward_func/std": 0.03063538856804371, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 177.0, "completions/mean_terminated_length": 177.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.7288, "grad_norm": 3.455866575241089, "kl": 0.04833984375, "learning_rate": 1e-06, "loss": -0.3367, "num_tokens": 12267968.0, "reward": 0.14032083749771118, "reward_std": 0.15820267796516418, "rewards/bleu_reward_func/mean": 0.14032083749771118, "rewards/bleu_reward_func/std": 0.23967435956001282, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 472.53125, "completions/mean_terminated_length": 385.70001220703125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.7296, "grad_norm": 2.2268266677856445, "kl": 0.04461669921875, "learning_rate": 1e-06, "loss": 0.0363, "num_tokens": 12286913.0, "reward": 0.03583249822258949, "reward_std": 0.014493023976683617, "rewards/bleu_reward_func/mean": 0.03583249822258949, "rewards/bleu_reward_func/std": 0.02188328467309475, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 374.4375, "completions/mean_terminated_length": 320.60870361328125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.7304, "grad_norm": 2.3796823024749756, "kl": 0.042236328125, "learning_rate": 1e-06, "loss": -0.0413, "num_tokens": 12301247.0, "reward": 0.06036565452814102, "reward_std": 0.017455367371439934, "rewards/bleu_reward_func/mean": 0.06036565452814102, "rewards/bleu_reward_func/std": 0.06808813661336899, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 325.90625, "completions/mean_terminated_length": 319.9032287597656, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.7312, "grad_norm": 2.5203535556793213, "kl": 0.03826904296875, "learning_rate": 1e-06, "loss": -0.0426, "num_tokens": 12314348.0, "reward": 0.037568479776382446, "reward_std": 0.017504602670669556, "rewards/bleu_reward_func/mean": 0.037568479776382446, "rewards/bleu_reward_func/std": 0.025791103020310402, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 222.15625, "completions/mean_terminated_length": 212.8064422607422, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.732, "grad_norm": 4.8649139404296875, "kl": 0.05670166015625, "learning_rate": 1e-06, "loss": -0.0406, "num_tokens": 12324497.0, "reward": 0.047999829053878784, "reward_std": 0.024475431069731712, "rewards/bleu_reward_func/mean": 0.047999829053878784, "rewards/bleu_reward_func/std": 0.031057659536600113, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 320.84375, "completions/mean_terminated_length": 257.125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.7328, "grad_norm": 2.876401662826538, "kl": 0.03778076171875, "learning_rate": 1e-06, "loss": -0.0378, "num_tokens": 12337788.0, "reward": 0.04696403443813324, "reward_std": 0.02812850847840309, "rewards/bleu_reward_func/mean": 0.04696403443813324, "rewards/bleu_reward_func/std": 0.06415504217147827, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 407.03125, "completions/mean_terminated_length": 272.0714416503906, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.7336, "grad_norm": 2.623408079147339, "kl": 0.0589599609375, "learning_rate": 1e-06, "loss": -0.0659, "num_tokens": 12354373.0, "reward": 0.056746140122413635, "reward_std": 0.01886637695133686, "rewards/bleu_reward_func/mean": 0.056746140122413635, "rewards/bleu_reward_func/std": 0.03354233503341675, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 338.71875, "completions/mean_terminated_length": 165.4375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.7344, "grad_norm": 2.7854504585266113, "kl": 0.0494384765625, "learning_rate": 1e-06, "loss": 0.0484, "num_tokens": 12368292.0, "reward": 0.04390311986207962, "reward_std": 0.016904659569263458, "rewards/bleu_reward_func/mean": 0.04390311986207962, "rewards/bleu_reward_func/std": 0.039624132215976715, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 186.65625, "completions/mean_terminated_length": 164.9666748046875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7352, "grad_norm": 6.439005374908447, "kl": 0.073272705078125, "learning_rate": 1e-06, "loss": -0.0798, "num_tokens": 12377241.0, "reward": 0.05543770641088486, "reward_std": 0.02084982395172119, "rewards/bleu_reward_func/mean": 0.05543770641088486, "rewards/bleu_reward_func/std": 0.03484691306948662, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 357.15625, "completions/mean_terminated_length": 220.5294189453125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.736, "grad_norm": 2.5606613159179688, "kl": 0.04241943359375, "learning_rate": 1e-06, "loss": -0.0456, "num_tokens": 12391358.0, "reward": 0.03371516987681389, "reward_std": 0.011735515668988228, "rewards/bleu_reward_func/mean": 0.03371516987681389, "rewards/bleu_reward_func/std": 0.018063481897115707, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 294.625, "completions/mean_terminated_length": 287.6128845214844, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.7368, "grad_norm": 2.889098644256592, "kl": 0.041656494140625, "learning_rate": 1e-06, "loss": 0.1173, "num_tokens": 12402634.0, "reward": 0.05923088267445564, "reward_std": 0.029831603169441223, "rewards/bleu_reward_func/mean": 0.05923088267445564, "rewards/bleu_reward_func/std": 0.04481290653347969, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 241.3125, "completions/mean_terminated_length": 191.1851806640625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.7376, "grad_norm": 4.053657531738281, "kl": 0.046722412109375, "learning_rate": 1e-06, "loss": -0.046, "num_tokens": 12412556.0, "reward": 0.06542409956455231, "reward_std": 0.025028303265571594, "rewards/bleu_reward_func/mean": 0.06542409956455231, "rewards/bleu_reward_func/std": 0.05025548115372658, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 329.5, "completions/mean_terminated_length": 268.66668701171875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.7384, "grad_norm": 2.5536530017852783, "kl": 0.0421142578125, "learning_rate": 1e-06, "loss": -0.0712, "num_tokens": 12425148.0, "reward": 0.04631096124649048, "reward_std": 0.024599246680736542, "rewards/bleu_reward_func/mean": 0.04631096124649048, "rewards/bleu_reward_func/std": 0.04204652085900307, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 350.40625, "completions/mean_terminated_length": 313.1153869628906, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.7392, "grad_norm": 3.109853982925415, "kl": 0.04412841796875, "learning_rate": 1e-06, "loss": -0.0658, "num_tokens": 12438361.0, "reward": 0.032508477568626404, "reward_std": 0.016699161380529404, "rewards/bleu_reward_func/mean": 0.032508477568626404, "rewards/bleu_reward_func/std": 0.028412526473402977, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 275.59375, "completions/mean_terminated_length": 267.9677429199219, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.74, "grad_norm": 2.6314737796783447, "kl": 0.06317138671875, "learning_rate": 1e-06, "loss": 0.1181, "num_tokens": 12449460.0, "reward": 0.03400625288486481, "reward_std": 0.011289702728390694, "rewards/bleu_reward_func/mean": 0.03400625288486481, "rewards/bleu_reward_func/std": 0.01916448399424553, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 310.5, "completions/mean_terminated_length": 231.6521759033203, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.7408, "grad_norm": 2.9635672569274902, "kl": 0.030517578125, "learning_rate": 1e-06, "loss": 0.1214, "num_tokens": 12461476.0, "reward": 0.07918738573789597, "reward_std": 0.06687770783901215, "rewards/bleu_reward_func/mean": 0.07918738573789597, "rewards/bleu_reward_func/std": 0.1105826124548912, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 364.875, "completions/mean_terminated_length": 198.1333465576172, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.7416, "grad_norm": 5.0117106437683105, "kl": 0.0650634765625, "learning_rate": 1e-06, "loss": -0.0201, "num_tokens": 12478872.0, "reward": 0.10731503367424011, "reward_std": 0.03286542743444443, "rewards/bleu_reward_func/mean": 0.10731503367424011, "rewards/bleu_reward_func/std": 0.059237416833639145, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 285.75, "completions/mean_terminated_length": 262.3448181152344, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.7424, "grad_norm": 3.3980448246002197, "kl": 0.042205810546875, "learning_rate": 1e-06, "loss": -0.025, "num_tokens": 12490048.0, "reward": 0.0708259865641594, "reward_std": 0.02675773948431015, "rewards/bleu_reward_func/mean": 0.0708259865641594, "rewards/bleu_reward_func/std": 0.04802871122956276, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 284.53125, "completions/mean_terminated_length": 252.0357208251953, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.7432, "grad_norm": 2.9506585597991943, "kl": 0.0633544921875, "learning_rate": 1e-06, "loss": -0.0753, "num_tokens": 12501129.0, "reward": 0.10486802458763123, "reward_std": 0.022330686450004578, "rewards/bleu_reward_func/mean": 0.10486802458763123, "rewards/bleu_reward_func/std": 0.090861976146698, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 341.9375, "completions/mean_terminated_length": 252.85714721679688, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.744, "grad_norm": 2.826270580291748, "kl": 0.04290771484375, "learning_rate": 1e-06, "loss": -0.0349, "num_tokens": 12516175.0, "reward": 0.06202811375260353, "reward_std": 0.030784565955400467, "rewards/bleu_reward_func/mean": 0.06202811375260353, "rewards/bleu_reward_func/std": 0.0687461644411087, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 242.90625, "completions/mean_terminated_length": 215.0689697265625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.7448, "grad_norm": 4.026130676269531, "kl": 0.0509033203125, "learning_rate": 1e-06, "loss": -0.1079, "num_tokens": 12528500.0, "reward": 0.0759795531630516, "reward_std": 0.07113184779882431, "rewards/bleu_reward_func/mean": 0.0759795531630516, "rewards/bleu_reward_func/std": 0.14475159347057343, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 188.21875, "completions/mean_terminated_length": 188.21875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7456, "grad_norm": 6.228883266448975, "kl": 0.0828857421875, "learning_rate": 1e-06, "loss": -0.0074, "num_tokens": 12538371.0, "reward": 0.048906613141298294, "reward_std": 0.01954766921699047, "rewards/bleu_reward_func/mean": 0.048906613141298294, "rewards/bleu_reward_func/std": 0.059886980801820755, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 320.5, "completions/mean_terminated_length": 266.8800048828125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.7464, "grad_norm": 2.9069249629974365, "kl": 0.0599365234375, "learning_rate": 1e-06, "loss": 0.0547, "num_tokens": 12550459.0, "reward": 0.027797240763902664, "reward_std": 0.00949520617723465, "rewards/bleu_reward_func/mean": 0.027797240763902664, "rewards/bleu_reward_func/std": 0.023359699174761772, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 280.5625, "completions/mean_terminated_length": 273.0967712402344, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.7472, "grad_norm": 3.0222465991973877, "kl": 0.09490966796875, "learning_rate": 1e-06, "loss": 0.0435, "num_tokens": 12561077.0, "reward": 0.05467883497476578, "reward_std": 0.01736966334283352, "rewards/bleu_reward_func/mean": 0.05467883497476578, "rewards/bleu_reward_func/std": 0.05791114643216133, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 512.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 473.59375, "completions/mean_terminated_length": 389.1000061035156, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.748, "grad_norm": 2.246959924697876, "kl": 0.0468292236328125, "learning_rate": 1e-06, "loss": 0.0044, "num_tokens": 12579592.0, "reward": 0.059155356138944626, "reward_std": 0.012122802436351776, "rewards/bleu_reward_func/mean": 0.059155356138944626, "rewards/bleu_reward_func/std": 0.03615579754114151, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 175.96875, "completions/mean_terminated_length": 175.96875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.7488, "grad_norm": 3.9639289379119873, "kl": 0.05889892578125, "learning_rate": 1e-06, "loss": 0.1749, "num_tokens": 12587911.0, "reward": 0.04553116112947464, "reward_std": 0.027312763035297394, "rewards/bleu_reward_func/mean": 0.04553116112947464, "rewards/bleu_reward_func/std": 0.045869771391153336, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 298.21875, "completions/mean_terminated_length": 291.32257080078125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.7496, "grad_norm": 3.3128387928009033, "kl": 0.05743408203125, "learning_rate": 1e-06, "loss": 0.048, "num_tokens": 12601302.0, "reward": 0.02700314298272133, "reward_std": 0.011166905984282494, "rewards/bleu_reward_func/mean": 0.02700314298272133, "rewards/bleu_reward_func/std": 0.015710683539509773, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 286.0625, "completions/mean_terminated_length": 210.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.7504, "grad_norm": 5.0102081298828125, "kl": 0.067474365234375, "learning_rate": 1e-06, "loss": 0.0025, "num_tokens": 12615776.0, "reward": 0.052227430045604706, "reward_std": 0.014771172776818275, "rewards/bleu_reward_func/mean": 0.052227430045604706, "rewards/bleu_reward_func/std": 0.019430244341492653, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 404.21875, "completions/mean_terminated_length": 347.76190185546875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.7512, "grad_norm": 2.042522430419922, "kl": 0.04541015625, "learning_rate": 1e-06, "loss": 0.0251, "num_tokens": 12631239.0, "reward": 0.05530402064323425, "reward_std": 0.019676920026540756, "rewards/bleu_reward_func/mean": 0.05530402064323425, "rewards/bleu_reward_func/std": 0.027803683653473854, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 295.5, "completions/mean_terminated_length": 104.47058868408203, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.752, "grad_norm": 4.662267208099365, "kl": 0.09735107421875, "learning_rate": 1e-06, "loss": -0.0136, "num_tokens": 12643503.0, "reward": 0.05965786427259445, "reward_std": 0.027585718780755997, "rewards/bleu_reward_func/mean": 0.05965786427259445, "rewards/bleu_reward_func/std": 0.0370444729924202, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 303.625, "completions/mean_terminated_length": 273.8571472167969, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.7528, "grad_norm": 2.7425622940063477, "kl": 0.05535888671875, "learning_rate": 1e-06, "loss": 0.0114, "num_tokens": 12655531.0, "reward": 0.04901716113090515, "reward_std": 0.00994904711842537, "rewards/bleu_reward_func/mean": 0.04901716113090515, "rewards/bleu_reward_func/std": 0.040510956197977066, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 348.3125, "completions/mean_terminated_length": 262.5714416503906, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.7536, "grad_norm": 2.3559534549713135, "kl": 0.04656982421875, "learning_rate": 1e-06, "loss": -0.0641, "num_tokens": 12670781.0, "reward": 0.03677675127983093, "reward_std": 0.009964315220713615, "rewards/bleu_reward_func/mean": 0.03677675127983093, "rewards/bleu_reward_func/std": 0.019602550193667412, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 229.03125, "completions/mean_terminated_length": 163.73077392578125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.7544, "grad_norm": 3.2343432903289795, "kl": 0.071380615234375, "learning_rate": 1e-06, "loss": -0.0938, "num_tokens": 12683318.0, "reward": 0.03962566331028938, "reward_std": 0.011244509369134903, "rewards/bleu_reward_func/mean": 0.03962566331028938, "rewards/bleu_reward_func/std": 0.01717083901166916, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 315.84375, "completions/mean_terminated_length": 142.76470947265625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.7552, "grad_norm": 3.105497360229492, "kl": 0.10089111328125, "learning_rate": 1e-06, "loss": 0.0565, "num_tokens": 12696873.0, "reward": 0.046081312000751495, "reward_std": 0.007257817313075066, "rewards/bleu_reward_func/mean": 0.046081312000751495, "rewards/bleu_reward_func/std": 0.023574350401759148, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 460.375, "completions/mean_terminated_length": 394.0000305175781, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.756, "grad_norm": 2.2013721466064453, "kl": 0.062255859375, "learning_rate": 1e-06, "loss": -0.0391, "num_tokens": 12715285.0, "reward": 0.07103259861469269, "reward_std": 0.017729321494698524, "rewards/bleu_reward_func/mean": 0.07103259861469269, "rewards/bleu_reward_func/std": 0.03985920175909996, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 280.65625, "completions/mean_terminated_length": 203.5416717529297, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.7568, "grad_norm": 5.288199424743652, "kl": 0.08099365234375, "learning_rate": 1e-06, "loss": 0.041, "num_tokens": 12730138.0, "reward": 0.02993028610944748, "reward_std": 0.00784086249768734, "rewards/bleu_reward_func/mean": 0.02993028610944748, "rewards/bleu_reward_func/std": 0.01985708624124527, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 512.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 383.3125, "completions/mean_terminated_length": 168.83334350585938, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.7576, "grad_norm": 3.2791404724121094, "kl": 0.07684326171875, "learning_rate": 1e-06, "loss": -0.0277, "num_tokens": 12744788.0, "reward": 0.041376739740371704, "reward_std": 0.009136617183685303, "rewards/bleu_reward_func/mean": 0.041376739740371704, "rewards/bleu_reward_func/std": 0.034718483686447144, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 306.40625, "completions/mean_terminated_length": 268.3333435058594, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.7584, "grad_norm": 2.8197238445281982, "kl": 0.037139892578125, "learning_rate": 1e-06, "loss": -0.0318, "num_tokens": 12757025.0, "reward": 0.06905034184455872, "reward_std": 0.024921495467424393, "rewards/bleu_reward_func/mean": 0.06905034184455872, "rewards/bleu_reward_func/std": 0.059942930936813354, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 278.21875, "completions/mean_terminated_length": 212.75999450683594, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.7592, "grad_norm": 4.428009510040283, "kl": 0.064208984375, "learning_rate": 1e-06, "loss": 0.1261, "num_tokens": 12771512.0, "reward": 0.08314824104309082, "reward_std": 0.028866248205304146, "rewards/bleu_reward_func/mean": 0.08314824104309082, "rewards/bleu_reward_func/std": 0.07680681347846985, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 303.3125, "completions/mean_terminated_length": 289.4000244140625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.76, "grad_norm": 3.076261043548584, "kl": 0.0711669921875, "learning_rate": 1e-06, "loss": 0.0991, "num_tokens": 12783722.0, "reward": 0.0795808807015419, "reward_std": 0.02329542487859726, "rewards/bleu_reward_func/mean": 0.0795808807015419, "rewards/bleu_reward_func/std": 0.055487681180238724, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 441.65625, "completions/mean_terminated_length": 399.45001220703125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.7608, "grad_norm": 2.099597454071045, "kl": 0.042877197265625, "learning_rate": 1e-06, "loss": 0.0511, "num_tokens": 12800943.0, "reward": 0.04137660562992096, "reward_std": 0.01036759465932846, "rewards/bleu_reward_func/mean": 0.04137660562992096, "rewards/bleu_reward_func/std": 0.016977576538920403, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 216.3125, "completions/mean_terminated_length": 196.60000610351562, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.7616, "grad_norm": 3.2768120765686035, "kl": 0.04909515380859375, "learning_rate": 1e-06, "loss": 0.0627, "num_tokens": 12812129.0, "reward": 0.0957229807972908, "reward_std": 0.04901205375790596, "rewards/bleu_reward_func/mean": 0.0957229807972908, "rewards/bleu_reward_func/std": 0.13014653325080872, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 316.53125, "completions/mean_terminated_length": 316.53125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.7624, "grad_norm": 2.4262685775756836, "kl": 0.03955078125, "learning_rate": 1e-06, "loss": -0.044, "num_tokens": 12824258.0, "reward": 0.06452548503875732, "reward_std": 0.02723013609647751, "rewards/bleu_reward_func/mean": 0.06452548503875732, "rewards/bleu_reward_func/std": 0.05586642026901245, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 381.0, "completions/mean_terminated_length": 344.32000732421875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.7632, "grad_norm": 1.949605107307434, "kl": 0.03692626953125, "learning_rate": 1e-06, "loss": -0.1027, "num_tokens": 12838722.0, "reward": 0.05391934886574745, "reward_std": 0.022434931248426437, "rewards/bleu_reward_func/mean": 0.05391934886574745, "rewards/bleu_reward_func/std": 0.04070979356765747, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 252.34375, "completions/mean_terminated_length": 243.9677276611328, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.764, "grad_norm": 2.8324923515319824, "kl": 0.046630859375, "learning_rate": 1e-06, "loss": -0.0475, "num_tokens": 12849053.0, "reward": 0.07158366590738297, "reward_std": 0.04349514842033386, "rewards/bleu_reward_func/mean": 0.07158366590738297, "rewards/bleu_reward_func/std": 0.07585739344358444, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 298.46875, "completions/mean_terminated_length": 132.38888549804688, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.7648, "grad_norm": 5.58326530456543, "kl": 0.057861328125, "learning_rate": 1e-06, "loss": 0.1246, "num_tokens": 12863756.0, "reward": 0.03694935888051987, "reward_std": 0.015439806506037712, "rewards/bleu_reward_func/mean": 0.03694935888051987, "rewards/bleu_reward_func/std": 0.02632940374314785, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 367.34375, "completions/mean_terminated_length": 254.8333282470703, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.7656, "grad_norm": 3.1758320331573486, "kl": 0.0777130126953125, "learning_rate": 1e-06, "loss": 0.0586, "num_tokens": 12877663.0, "reward": 0.03734767809510231, "reward_std": 0.018983110785484314, "rewards/bleu_reward_func/mean": 0.03734767809510231, "rewards/bleu_reward_func/std": 0.026389576494693756, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 156.90625, "completions/mean_terminated_length": 145.4516143798828, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.7664, "grad_norm": 4.952297210693359, "kl": 0.08062744140625, "learning_rate": 1e-06, "loss": -0.0385, "num_tokens": 12885020.0, "reward": 0.029940243810415268, "reward_std": 0.010614018887281418, "rewards/bleu_reward_func/mean": 0.029940243810415268, "rewards/bleu_reward_func/std": 0.018806666135787964, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 378.75, "completions/mean_terminated_length": 308.952392578125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.7672, "grad_norm": 2.2859702110290527, "kl": 0.04290771484375, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 12900348.0, "reward": 0.028122084215283394, "reward_std": 0.008207820355892181, "rewards/bleu_reward_func/mean": 0.028122084215283394, "rewards/bleu_reward_func/std": 0.016654757782816887, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 254.71875, "completions/mean_terminated_length": 237.56668090820312, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.768, "grad_norm": 3.055973768234253, "kl": 0.04168701171875, "learning_rate": 1e-06, "loss": -0.056, "num_tokens": 12912875.0, "reward": 0.04096674174070358, "reward_std": 0.01361355185508728, "rewards/bleu_reward_func/mean": 0.04096674174070358, "rewards/bleu_reward_func/std": 0.023293767124414444, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 243.5, "completions/mean_terminated_length": 243.5, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.7688, "grad_norm": 3.707246780395508, "kl": 0.040069580078125, "learning_rate": 1e-06, "loss": -0.0225, "num_tokens": 12923339.0, "reward": 0.09101220965385437, "reward_std": 0.03245137259364128, "rewards/bleu_reward_func/mean": 0.09101220965385437, "rewards/bleu_reward_func/std": 0.052579786628484726, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 235.5625, "completions/mean_terminated_length": 143.4166717529297, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.7696, "grad_norm": 4.457637310028076, "kl": 0.109130859375, "learning_rate": 1e-06, "loss": 0.0719, "num_tokens": 12936781.0, "reward": 0.08339278399944305, "reward_std": 0.03469950705766678, "rewards/bleu_reward_func/mean": 0.08339278399944305, "rewards/bleu_reward_func/std": 0.08822762966156006, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 173.65625, "completions/mean_terminated_length": 173.65625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.7704, "grad_norm": 4.618581295013428, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": -0.0867, "num_tokens": 12944306.0, "reward": 0.03851824253797531, "reward_std": 0.029345914721488953, "rewards/bleu_reward_func/mean": 0.03851824253797531, "rewards/bleu_reward_func/std": 0.04824645444750786, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 354.375, "completions/mean_terminated_length": 331.8571472167969, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.7712, "grad_norm": 2.1267480850219727, "kl": 0.046844482421875, "learning_rate": 1e-06, "loss": 0.1329, "num_tokens": 12957422.0, "reward": 0.03777143731713295, "reward_std": 0.019737938418984413, "rewards/bleu_reward_func/mean": 0.03777143731713295, "rewards/bleu_reward_func/std": 0.039779361337423325, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 259.96875, "completions/mean_terminated_length": 189.39999389648438, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.772, "grad_norm": 5.85866641998291, "kl": 0.081817626953125, "learning_rate": 1e-06, "loss": 0.0736, "num_tokens": 12972021.0, "reward": 0.07170456647872925, "reward_std": 0.020086858421564102, "rewards/bleu_reward_func/mean": 0.07170456647872925, "rewards/bleu_reward_func/std": 0.04547082632780075, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 342.09375, "completions/mean_terminated_length": 294.5199890136719, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.7728, "grad_norm": 2.059349298477173, "kl": 0.028564453125, "learning_rate": 1e-06, "loss": 0.0158, "num_tokens": 12985888.0, "reward": 0.06088118255138397, "reward_std": 0.01464940793812275, "rewards/bleu_reward_func/mean": 0.06088118255138397, "rewards/bleu_reward_func/std": 0.031058233231306076, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 233.78125, "completions/mean_terminated_length": 215.23333740234375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.7736, "grad_norm": 2.824509620666504, "kl": 0.04962158203125, "learning_rate": 1e-06, "loss": 0.0278, "num_tokens": 12995185.0, "reward": 0.058438584208488464, "reward_std": 0.017195967957377434, "rewards/bleu_reward_func/mean": 0.058438584208488464, "rewards/bleu_reward_func/std": 0.0505751296877861, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 342.1875, "completions/mean_terminated_length": 275.7391357421875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.7744, "grad_norm": 4.1130452156066895, "kl": 0.0518798828125, "learning_rate": 1e-06, "loss": -0.2853, "num_tokens": 13009615.0, "reward": 0.050040245056152344, "reward_std": 0.01541995070874691, "rewards/bleu_reward_func/mean": 0.050040245056152344, "rewards/bleu_reward_func/std": 0.0558185949921608, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 203.75, "completions/mean_terminated_length": 183.20001220703125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.7752, "grad_norm": 4.958765506744385, "kl": 0.0552978515625, "learning_rate": 1e-06, "loss": -0.0387, "num_tokens": 13020095.0, "reward": 0.06027444452047348, "reward_std": 0.027696281671524048, "rewards/bleu_reward_func/mean": 0.06027444452047348, "rewards/bleu_reward_func/std": 0.03125971183180809, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 331.8125, "completions/mean_terminated_length": 223.6999969482422, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.776, "grad_norm": 3.1290132999420166, "kl": 0.0631103515625, "learning_rate": 1e-06, "loss": 0.0774, "num_tokens": 13032841.0, "reward": 0.041490666568279266, "reward_std": 0.023433692753314972, "rewards/bleu_reward_func/mean": 0.041490666568279266, "rewards/bleu_reward_func/std": 0.030176030471920967, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 231.125, "completions/mean_terminated_length": 212.40000915527344, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.7768, "grad_norm": 5.468991279602051, "kl": 0.04461669921875, "learning_rate": 1e-06, "loss": 0.0587, "num_tokens": 13042621.0, "reward": 0.038487743586301804, "reward_std": 0.011531597934663296, "rewards/bleu_reward_func/mean": 0.038487743586301804, "rewards/bleu_reward_func/std": 0.01654433086514473, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 274.09375, "completions/mean_terminated_length": 240.10714721679688, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.7776, "grad_norm": 3.751850128173828, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0013, "num_tokens": 13053616.0, "reward": 0.03094501793384552, "reward_std": 0.012867014855146408, "rewards/bleu_reward_func/mean": 0.03094501793384552, "rewards/bleu_reward_func/std": 0.01749509572982788, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 313.8125, "completions/mean_terminated_length": 285.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7784, "grad_norm": 4.073342800140381, "kl": 0.043365478515625, "learning_rate": 1e-06, "loss": 0.0175, "num_tokens": 13069146.0, "reward": 0.07553990185260773, "reward_std": 0.018323319032788277, "rewards/bleu_reward_func/mean": 0.07553990185260773, "rewards/bleu_reward_func/std": 0.05046038329601288, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 287.15625, "completions/mean_terminated_length": 245.51852416992188, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.7792, "grad_norm": 4.537550449371338, "kl": 0.14056396484375, "learning_rate": 1e-06, "loss": -0.0867, "num_tokens": 13081991.0, "reward": 0.03124011494219303, "reward_std": 0.010205641388893127, "rewards/bleu_reward_func/mean": 0.03124011494219303, "rewards/bleu_reward_func/std": 0.019145779311656952, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 242.28125, "completions/mean_terminated_length": 214.37930297851562, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.78, "grad_norm": 4.508155822753906, "kl": 0.08660888671875, "learning_rate": 1e-06, "loss": 0.0179, "num_tokens": 13092056.0, "reward": 0.054113149642944336, "reward_std": 0.026252152398228645, "rewards/bleu_reward_func/mean": 0.054113149642944336, "rewards/bleu_reward_func/std": 0.0647139772772789, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 284.25, "completions/mean_terminated_length": 251.71429443359375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.7808, "grad_norm": 3.1219375133514404, "kl": 0.06097412109375, "learning_rate": 1e-06, "loss": 0.1548, "num_tokens": 13104032.0, "reward": 0.050552576780319214, "reward_std": 0.014210234396159649, "rewards/bleu_reward_func/mean": 0.050552576780319214, "rewards/bleu_reward_func/std": 0.04330144450068474, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 194.8125, "completions/mean_terminated_length": 173.6666717529297, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.7816, "grad_norm": 6.059234619140625, "kl": 0.1226806640625, "learning_rate": 1e-06, "loss": -0.1676, "num_tokens": 13114154.0, "reward": 0.07722032815217972, "reward_std": 0.03570058196783066, "rewards/bleu_reward_func/mean": 0.07722032815217972, "rewards/bleu_reward_func/std": 0.06760042905807495, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 207.125, "completions/mean_terminated_length": 175.58621215820312, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.7824, "grad_norm": 4.949698448181152, "kl": 0.11578369140625, "learning_rate": 1e-06, "loss": -0.1415, "num_tokens": 13123174.0, "reward": 0.09893815964460373, "reward_std": 0.022481422871351242, "rewards/bleu_reward_func/mean": 0.09893815964460373, "rewards/bleu_reward_func/std": 0.05471767112612724, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 265.34375, "completions/mean_terminated_length": 265.34375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.7832, "grad_norm": 5.226635932922363, "kl": 0.05767822265625, "learning_rate": 1e-06, "loss": -0.1474, "num_tokens": 13134937.0, "reward": 0.05351312458515167, "reward_std": 0.021641388535499573, "rewards/bleu_reward_func/mean": 0.05351312458515167, "rewards/bleu_reward_func/std": 0.03878382593393326, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 381.8125, "completions/mean_terminated_length": 357.7037048339844, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.784, "grad_norm": 2.6123292446136475, "kl": 0.04351806640625, "learning_rate": 1e-06, "loss": -0.0909, "num_tokens": 13149627.0, "reward": 0.04280403256416321, "reward_std": 0.014645563438534737, "rewards/bleu_reward_func/mean": 0.04280403256416321, "rewards/bleu_reward_func/std": 0.024342091754078865, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 384.5625, "completions/mean_terminated_length": 355.15386962890625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.7848, "grad_norm": 2.536220073699951, "kl": 0.04449462890625, "learning_rate": 1e-06, "loss": -0.0598, "num_tokens": 13167829.0, "reward": 0.04370192438364029, "reward_std": 0.022699596360325813, "rewards/bleu_reward_func/mean": 0.04370192438364029, "rewards/bleu_reward_func/std": 0.029828311875462532, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 339.0625, "completions/mean_terminated_length": 271.39129638671875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.7856, "grad_norm": 2.4252421855926514, "kl": 0.05010986328125, "learning_rate": 1e-06, "loss": -0.081, "num_tokens": 13182295.0, "reward": 0.07624062150716782, "reward_std": 0.023894930258393288, "rewards/bleu_reward_func/mean": 0.07624062150716782, "rewards/bleu_reward_func/std": 0.059954702854156494, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 154.65625, "completions/mean_terminated_length": 143.1290283203125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.7864, "grad_norm": 5.228031158447266, "kl": 0.062255859375, "learning_rate": 1e-06, "loss": 0.039, "num_tokens": 13190772.0, "reward": 0.06501435488462448, "reward_std": 0.021548718214035034, "rewards/bleu_reward_func/mean": 0.06501435488462448, "rewards/bleu_reward_func/std": 0.06341297179460526, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 390.125, "completions/mean_terminated_length": 295.3333435058594, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.7872, "grad_norm": 2.599104642868042, "kl": 0.04437255859375, "learning_rate": 1e-06, "loss": -0.0018, "num_tokens": 13209592.0, "reward": 0.08236557990312576, "reward_std": 0.036718301475048065, "rewards/bleu_reward_func/mean": 0.08236557990312576, "rewards/bleu_reward_func/std": 0.07425292581319809, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 209.71875, "completions/mean_terminated_length": 108.95833587646484, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.788, "grad_norm": 5.494588375091553, "kl": 0.076995849609375, "learning_rate": 1e-06, "loss": -0.3202, "num_tokens": 13221839.0, "reward": 0.12329405546188354, "reward_std": 0.04771920293569565, "rewards/bleu_reward_func/mean": 0.12329405546188354, "rewards/bleu_reward_func/std": 0.12109994888305664, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 368.0625, "completions/mean_terminated_length": 269.5789489746094, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.7888, "grad_norm": 2.7720680236816406, "kl": 0.08441162109375, "learning_rate": 1e-06, "loss": -0.0384, "num_tokens": 13236369.0, "reward": 0.05234910920262337, "reward_std": 0.015475506894290447, "rewards/bleu_reward_func/mean": 0.05234910920262337, "rewards/bleu_reward_func/std": 0.03180435299873352, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 356.59375, "completions/mean_terminated_length": 263.3500061035156, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.7896, "grad_norm": 2.421665668487549, "kl": 0.05206298828125, "learning_rate": 1e-06, "loss": 0.1374, "num_tokens": 13250052.0, "reward": 0.032731182873249054, "reward_std": 0.012113362550735474, "rewards/bleu_reward_func/mean": 0.032731182873249054, "rewards/bleu_reward_func/std": 0.020152967423200607, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 131.03125, "completions/mean_terminated_length": 131.03125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.7904, "grad_norm": 3.4968388080596924, "kl": 0.0459136962890625, "learning_rate": 1e-06, "loss": -0.1296, "num_tokens": 13258461.0, "reward": 0.03249422460794449, "reward_std": 0.014297829940915108, "rewards/bleu_reward_func/mean": 0.03249422460794449, "rewards/bleu_reward_func/std": 0.041689660400152206, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 377.21875, "completions/mean_terminated_length": 352.2592468261719, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.7912, "grad_norm": 2.379380702972412, "kl": 0.04193115234375, "learning_rate": 1e-06, "loss": 0.0994, "num_tokens": 13273484.0, "reward": 0.026992671191692352, "reward_std": 0.006863096728920937, "rewards/bleu_reward_func/mean": 0.026992671191692352, "rewards/bleu_reward_func/std": 0.01573537290096283, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 274.40625, "completions/mean_terminated_length": 207.87998962402344, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.792, "grad_norm": 2.755162477493286, "kl": 0.0350341796875, "learning_rate": 1e-06, "loss": 0.0889, "num_tokens": 13284617.0, "reward": 0.0448073148727417, "reward_std": 0.02365909144282341, "rewards/bleu_reward_func/mean": 0.0448073148727417, "rewards/bleu_reward_func/std": 0.03250681236386299, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 234.75, "completions/mean_terminated_length": 183.40740966796875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.7928, "grad_norm": 3.7924673557281494, "kl": 0.0814208984375, "learning_rate": 1e-06, "loss": 0.051, "num_tokens": 13294065.0, "reward": 0.025965671986341476, "reward_std": 0.009640274569392204, "rewards/bleu_reward_func/mean": 0.025965671986341476, "rewards/bleu_reward_func/std": 0.013380059972405434, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 254.09375, "completions/mean_terminated_length": 181.87998962402344, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.7936, "grad_norm": 4.138406753540039, "kl": 0.06060791015625, "learning_rate": 1e-06, "loss": 0.1006, "num_tokens": 13307548.0, "reward": 0.06567200273275375, "reward_std": 0.028147000819444656, "rewards/bleu_reward_func/mean": 0.06567200273275375, "rewards/bleu_reward_func/std": 0.05875537171959877, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 366.71875, "completions/mean_terminated_length": 221.4375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.7944, "grad_norm": 2.8539927005767822, "kl": 0.0716552734375, "learning_rate": 1e-06, "loss": 0.0213, "num_tokens": 13324051.0, "reward": 0.05763605982065201, "reward_std": 0.019663766026496887, "rewards/bleu_reward_func/mean": 0.05763605982065201, "rewards/bleu_reward_func/std": 0.023355742916464806, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 347.25, "completions/mean_terminated_length": 201.88235473632812, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.7952, "grad_norm": 2.3459670543670654, "kl": 0.05255126953125, "learning_rate": 1e-06, "loss": 0.0511, "num_tokens": 13341395.0, "reward": 0.034985754638910294, "reward_std": 0.017113730311393738, "rewards/bleu_reward_func/mean": 0.034985754638910294, "rewards/bleu_reward_func/std": 0.029299011453986168, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 342.8125, "completions/mean_terminated_length": 265.9090881347656, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.796, "grad_norm": 2.3377294540405273, "kl": 0.037078857421875, "learning_rate": 1e-06, "loss": 0.1455, "num_tokens": 13355133.0, "reward": 0.0256805419921875, "reward_std": 0.011619502678513527, "rewards/bleu_reward_func/mean": 0.0256805419921875, "rewards/bleu_reward_func/std": 0.021871058270335197, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 239.15625, "completions/mean_terminated_length": 148.20834350585938, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.7968, "grad_norm": 3.7179551124572754, "kl": 0.059417724609375, "learning_rate": 1e-06, "loss": 0.1004, "num_tokens": 13369122.0, "reward": 0.14195622503757477, "reward_std": 0.09248249232769012, "rewards/bleu_reward_func/mean": 0.14195622503757477, "rewards/bleu_reward_func/std": 0.22318032383918762, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 352.0, "completions/mean_terminated_length": 329.14288330078125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.7976, "grad_norm": 2.396754026412964, "kl": 0.0418701171875, "learning_rate": 1e-06, "loss": 0.0533, "num_tokens": 13383586.0, "reward": 0.056401364505290985, "reward_std": 0.02100227400660515, "rewards/bleu_reward_func/mean": 0.056401364505290985, "rewards/bleu_reward_func/std": 0.06413192301988602, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 275.53125, "completions/mean_terminated_length": 275.53125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.7984, "grad_norm": 3.08534836769104, "kl": 0.057586669921875, "learning_rate": 1e-06, "loss": 0.0777, "num_tokens": 13394491.0, "reward": 0.07327760756015778, "reward_std": 0.024326477199792862, "rewards/bleu_reward_func/mean": 0.07327760756015778, "rewards/bleu_reward_func/std": 0.053993549197912216, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 512.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 409.625, "completions/mean_terminated_length": 260.0, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.7992, "grad_norm": 2.469346046447754, "kl": 0.06561279296875, "learning_rate": 1e-06, "loss": 0.0564, "num_tokens": 13410879.0, "reward": 0.026840589940547943, "reward_std": 0.007271309848874807, "rewards/bleu_reward_func/mean": 0.026840589940547943, "rewards/bleu_reward_func/std": 0.018895737826824188, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 336.96875, "completions/mean_terminated_length": 296.5769348144531, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.8, "grad_norm": 2.4286251068115234, "kl": 0.0396728515625, "learning_rate": 1e-06, "loss": -0.0733, "num_tokens": 13426686.0, "reward": 0.03406568616628647, "reward_std": 0.017840351909399033, "rewards/bleu_reward_func/mean": 0.03406568616628647, "rewards/bleu_reward_func/std": 0.03109470196068287, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 347.28125, "completions/mean_terminated_length": 330.2413635253906, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.8008, "grad_norm": 3.0312039852142334, "kl": 0.04400634765625, "learning_rate": 1e-06, "loss": -0.0108, "num_tokens": 13439727.0, "reward": 0.07976769655942917, "reward_std": 0.024406295269727707, "rewards/bleu_reward_func/mean": 0.07976769655942917, "rewards/bleu_reward_func/std": 0.07874192297458649, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 371.375, "completions/mean_terminated_length": 338.923095703125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.8016, "grad_norm": 2.266893148422241, "kl": 0.04248046875, "learning_rate": 1e-06, "loss": 0.0417, "num_tokens": 13453755.0, "reward": 0.030901743099093437, "reward_std": 0.010838410817086697, "rewards/bleu_reward_func/mean": 0.030901743099093437, "rewards/bleu_reward_func/std": 0.015698978677392006, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 321.8125, "completions/mean_terminated_length": 222.1904754638672, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.8024, "grad_norm": 2.922273874282837, "kl": 0.0587158203125, "learning_rate": 1e-06, "loss": 0.0059, "num_tokens": 13466341.0, "reward": 0.029490074142813683, "reward_std": 0.009350080043077469, "rewards/bleu_reward_func/mean": 0.029490074142813683, "rewards/bleu_reward_func/std": 0.011806486174464226, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 454.53125, "completions/mean_terminated_length": 358.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.8032, "grad_norm": 1.9764541387557983, "kl": 0.07159423828125, "learning_rate": 1e-06, "loss": -0.0171, "num_tokens": 13485782.0, "reward": 0.04661983996629715, "reward_std": 0.012564010918140411, "rewards/bleu_reward_func/mean": 0.04661983996629715, "rewards/bleu_reward_func/std": 0.0264245867729187, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 325.6875, "completions/mean_terminated_length": 180.7777862548828, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.804, "grad_norm": 3.197694778442383, "kl": 0.05792236328125, "learning_rate": 1e-06, "loss": -0.0419, "num_tokens": 13499588.0, "reward": 0.09644008427858353, "reward_std": 0.03086179867386818, "rewards/bleu_reward_func/mean": 0.09644008427858353, "rewards/bleu_reward_func/std": 0.08714324980974197, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 185.96875, "completions/mean_terminated_length": 175.4516143798828, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.8048, "grad_norm": 4.771821975708008, "kl": 0.08587646484375, "learning_rate": 1e-06, "loss": 0.0805, "num_tokens": 13508259.0, "reward": 0.07439538836479187, "reward_std": 0.041163403540849686, "rewards/bleu_reward_func/mean": 0.07439538836479187, "rewards/bleu_reward_func/std": 0.08994947373867035, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 197.96875, "completions/mean_terminated_length": 139.8148193359375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.8056, "grad_norm": 3.3728678226470947, "kl": 0.0677490234375, "learning_rate": 1e-06, "loss": -0.0892, "num_tokens": 13516818.0, "reward": 0.0446418896317482, "reward_std": 0.016185201704502106, "rewards/bleu_reward_func/mean": 0.0446418896317482, "rewards/bleu_reward_func/std": 0.028570961207151413, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 216.4375, "completions/mean_terminated_length": 216.4375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.8064, "grad_norm": 3.718839645385742, "kl": 0.03741455078125, "learning_rate": 1e-06, "loss": -0.0024, "num_tokens": 13526016.0, "reward": 0.10925551503896713, "reward_std": 0.03529820218682289, "rewards/bleu_reward_func/mean": 0.10925551503896713, "rewards/bleu_reward_func/std": 0.1309969276189804, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 250.71875, "completions/mean_terminated_length": 163.625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.8072, "grad_norm": 5.933348655700684, "kl": 0.0469512939453125, "learning_rate": 1e-06, "loss": -0.0945, "num_tokens": 13541023.0, "reward": 0.1345641016960144, "reward_std": 0.04520343244075775, "rewards/bleu_reward_func/mean": 0.1345641016960144, "rewards/bleu_reward_func/std": 0.17300467193126678, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 323.9375, "completions/mean_terminated_length": 238.45455932617188, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.808, "grad_norm": 2.280043601989746, "kl": 0.0390625, "learning_rate": 1e-06, "loss": 0.0644, "num_tokens": 13556133.0, "reward": 0.05336067080497742, "reward_std": 0.015810808166861534, "rewards/bleu_reward_func/mean": 0.05336067080497742, "rewards/bleu_reward_func/std": 0.055556174367666245, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 376.0, "completions/mean_terminated_length": 256.0, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.8088, "grad_norm": 2.4357681274414062, "kl": 0.06976318359375, "learning_rate": 1e-06, "loss": -0.0255, "num_tokens": 13571037.0, "reward": 0.054675132036209106, "reward_std": 0.013293720781803131, "rewards/bleu_reward_func/mean": 0.054675132036209106, "rewards/bleu_reward_func/std": 0.07392200827598572, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 512.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 420.3125, "completions/mean_terminated_length": 245.27273559570312, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.8096, "grad_norm": 2.6430299282073975, "kl": 0.05645751953125, "learning_rate": 1e-06, "loss": -0.0093, "num_tokens": 13588983.0, "reward": 0.051457397639751434, "reward_std": 0.013515422120690346, "rewards/bleu_reward_func/mean": 0.051457397639751434, "rewards/bleu_reward_func/std": 0.03232685476541519, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 237.03125, "completions/mean_terminated_length": 208.58621215820312, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.8104, "grad_norm": 3.8462376594543457, "kl": 0.0496826171875, "learning_rate": 1e-06, "loss": -0.0937, "num_tokens": 13599568.0, "reward": 0.039942845702171326, "reward_std": 0.011215153150260448, "rewards/bleu_reward_func/mean": 0.039942845702171326, "rewards/bleu_reward_func/std": 0.044292986392974854, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 246.8125, "completions/mean_terminated_length": 246.8125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.8112, "grad_norm": 2.7778167724609375, "kl": 0.0450439453125, "learning_rate": 1e-06, "loss": 0.0687, "num_tokens": 13609586.0, "reward": 0.03794855996966362, "reward_std": 0.013286858797073364, "rewards/bleu_reward_func/mean": 0.03794855996966362, "rewards/bleu_reward_func/std": 0.018705854192376137, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 405.15625, "completions/mean_terminated_length": 310.8823547363281, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.812, "grad_norm": 2.940082311630249, "kl": 0.052734375, "learning_rate": 1e-06, "loss": -0.1224, "num_tokens": 13628087.0, "reward": 0.02557062916457653, "reward_std": 0.013271758332848549, "rewards/bleu_reward_func/mean": 0.02557062916457653, "rewards/bleu_reward_func/std": 0.022851891815662384, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 378.09375, "completions/mean_terminated_length": 340.6000061035156, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.8128, "grad_norm": 1.974778175354004, "kl": 0.0416259765625, "learning_rate": 1e-06, "loss": 0.0703, "num_tokens": 13642746.0, "reward": 0.0754852443933487, "reward_std": 0.04545840620994568, "rewards/bleu_reward_func/mean": 0.0754852443933487, "rewards/bleu_reward_func/std": 0.08230523765087128, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 295.78125, "completions/mean_terminated_length": 255.74073791503906, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.8136, "grad_norm": 6.134253978729248, "kl": 0.17449951171875, "learning_rate": 1e-06, "loss": 0.0521, "num_tokens": 13656819.0, "reward": 0.10997641086578369, "reward_std": 0.030572956427931786, "rewards/bleu_reward_func/mean": 0.10997641086578369, "rewards/bleu_reward_func/std": 0.10363367944955826, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 326.40625, "completions/mean_terminated_length": 162.64706420898438, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.8144, "grad_norm": 3.2220001220703125, "kl": 0.09747314453125, "learning_rate": 1e-06, "loss": 0.1033, "num_tokens": 13669280.0, "reward": 0.03305242210626602, "reward_std": 0.009479910135269165, "rewards/bleu_reward_func/mean": 0.03305242210626602, "rewards/bleu_reward_func/std": 0.018387850373983383, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 217.21875, "completions/mean_terminated_length": 118.95833587646484, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.8152, "grad_norm": 4.264994144439697, "kl": 0.08660888671875, "learning_rate": 1e-06, "loss": 0.1815, "num_tokens": 13679071.0, "reward": 0.07369013130664825, "reward_std": 0.030868127942085266, "rewards/bleu_reward_func/mean": 0.07369013130664825, "rewards/bleu_reward_func/std": 0.08622196316719055, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 347.78125, "completions/mean_terminated_length": 330.7930908203125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.816, "grad_norm": 2.2914328575134277, "kl": 0.04351806640625, "learning_rate": 1e-06, "loss": -0.0436, "num_tokens": 13692432.0, "reward": 0.05755352973937988, "reward_std": 0.015490137040615082, "rewards/bleu_reward_func/mean": 0.05755352973937988, "rewards/bleu_reward_func/std": 0.05329318344593048, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 350.375, "completions/mean_terminated_length": 276.9090881347656, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.8168, "grad_norm": 3.4762771129608154, "kl": 0.06256103515625, "learning_rate": 1e-06, "loss": -0.0528, "num_tokens": 13706084.0, "reward": 0.07274037599563599, "reward_std": 0.02579139545559883, "rewards/bleu_reward_func/mean": 0.07274037599563599, "rewards/bleu_reward_func/std": 0.05641184002161026, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 378.125, "completions/mean_terminated_length": 359.0000305175781, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.8176, "grad_norm": 2.312500476837158, "kl": 0.04180908203125, "learning_rate": 1e-06, "loss": -0.0063, "num_tokens": 13720832.0, "reward": 0.04683421924710274, "reward_std": 0.016781536862254143, "rewards/bleu_reward_func/mean": 0.04683421924710274, "rewards/bleu_reward_func/std": 0.033053260296583176, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 385.15625, "completions/mean_terminated_length": 273.23529052734375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.8184, "grad_norm": 2.3284387588500977, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": 0.0868, "num_tokens": 13736573.0, "reward": 0.01643741875886917, "reward_std": 0.00393636105582118, "rewards/bleu_reward_func/mean": 0.01643741875886917, "rewards/bleu_reward_func/std": 0.011089936830103397, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 294.75, "completions/mean_terminated_length": 244.61538696289062, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.8192, "grad_norm": 4.2773542404174805, "kl": 0.080963134765625, "learning_rate": 1e-06, "loss": -0.0395, "num_tokens": 13748445.0, "reward": 0.09083592146635056, "reward_std": 0.024321725592017174, "rewards/bleu_reward_func/mean": 0.09083592146635056, "rewards/bleu_reward_func/std": 0.07333005964756012, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 423.75, "completions/mean_terminated_length": 276.66668701171875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.82, "grad_norm": 2.270141363143921, "kl": 0.05609130859375, "learning_rate": 1e-06, "loss": 0.048, "num_tokens": 13766077.0, "reward": 0.02606740966439247, "reward_std": 0.010008657351136208, "rewards/bleu_reward_func/mean": 0.02606740966439247, "rewards/bleu_reward_func/std": 0.024353953078389168, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 172.625, "completions/mean_terminated_length": 172.625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.8208, "grad_norm": 3.84765887260437, "kl": 0.060302734375, "learning_rate": 1e-06, "loss": 0.0361, "num_tokens": 13777121.0, "reward": 0.06873498857021332, "reward_std": 0.022971976548433304, "rewards/bleu_reward_func/mean": 0.06873498857021332, "rewards/bleu_reward_func/std": 0.0848066657781601, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 192.71875, "completions/mean_terminated_length": 192.71875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.8216, "grad_norm": 5.911830902099609, "kl": 0.13067626953125, "learning_rate": 1e-06, "loss": 0.2607, "num_tokens": 13785552.0, "reward": 0.12106100469827652, "reward_std": 0.032082945108413696, "rewards/bleu_reward_func/mean": 0.12106100469827652, "rewards/bleu_reward_func/std": 0.17073681950569153, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 266.34375, "completions/mean_terminated_length": 249.9666748046875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.8224, "grad_norm": 3.98777437210083, "kl": 0.05194091796875, "learning_rate": 1e-06, "loss": -0.0154, "num_tokens": 13798923.0, "reward": 0.03594356030225754, "reward_std": 0.02046639285981655, "rewards/bleu_reward_func/mean": 0.03594356030225754, "rewards/bleu_reward_func/std": 0.030679911375045776, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 263.5, "completions/mean_terminated_length": 237.79310607910156, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.8232, "grad_norm": 3.6039505004882812, "kl": 0.06573486328125, "learning_rate": 1e-06, "loss": -0.0975, "num_tokens": 13809459.0, "reward": 0.034439343959093094, "reward_std": 0.014079989865422249, "rewards/bleu_reward_func/mean": 0.034439343959093094, "rewards/bleu_reward_func/std": 0.025304077193140984, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 476.0625, "completions/mean_terminated_length": 368.25, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.824, "grad_norm": 2.245523691177368, "kl": 0.057952880859375, "learning_rate": 1e-06, "loss": -0.0427, "num_tokens": 13829117.0, "reward": 0.05782981216907501, "reward_std": 0.01562406774610281, "rewards/bleu_reward_func/mean": 0.05782981216907501, "rewards/bleu_reward_func/std": 0.036193206906318665, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 297.75, "completions/mean_terminated_length": 213.9130401611328, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.8248, "grad_norm": 2.7590172290802, "kl": 0.05572509765625, "learning_rate": 1e-06, "loss": 0.1533, "num_tokens": 13841837.0, "reward": 0.15948085486888885, "reward_std": 0.051790352910757065, "rewards/bleu_reward_func/mean": 0.15948085486888885, "rewards/bleu_reward_func/std": 0.1838230937719345, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 241.0625, "completions/mean_terminated_length": 241.0625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.8256, "grad_norm": 2.9878034591674805, "kl": 0.0655517578125, "learning_rate": 1e-06, "loss": 0.0115, "num_tokens": 13851447.0, "reward": 0.05960501357913017, "reward_std": 0.018924448639154434, "rewards/bleu_reward_func/mean": 0.05960501357913017, "rewards/bleu_reward_func/std": 0.03717625513672829, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 267.90625, "completions/mean_terminated_length": 186.5416717529297, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.8264, "grad_norm": 3.7042665481567383, "kl": 0.095947265625, "learning_rate": 1e-06, "loss": -0.1878, "num_tokens": 13862444.0, "reward": 0.027990631759166718, "reward_std": 0.010054003447294235, "rewards/bleu_reward_func/mean": 0.027990631759166718, "rewards/bleu_reward_func/std": 0.024867286905646324, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 303.875, "completions/mean_terminated_length": 274.14288330078125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.8272, "grad_norm": 3.176471710205078, "kl": 0.04852294921875, "learning_rate": 1e-06, "loss": 0.0688, "num_tokens": 13874104.0, "reward": 0.028669901192188263, "reward_std": 0.009087910875678062, "rewards/bleu_reward_func/mean": 0.028669901192188263, "rewards/bleu_reward_func/std": 0.012309697456657887, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 280.34375, "completions/mean_terminated_length": 264.9000244140625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.828, "grad_norm": 2.438011407852173, "kl": 0.048095703125, "learning_rate": 1e-06, "loss": -0.0052, "num_tokens": 13885019.0, "reward": 0.07248373329639435, "reward_std": 0.022065263241529465, "rewards/bleu_reward_func/mean": 0.07248373329639435, "rewards/bleu_reward_func/std": 0.060632698237895966, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 453.8125, "completions/mean_terminated_length": 325.8000183105469, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.8288, "grad_norm": 2.2002475261688232, "kl": 0.05474853515625, "learning_rate": 1e-06, "loss": -0.0041, "num_tokens": 13903325.0, "reward": 0.04414498060941696, "reward_std": 0.01269291341304779, "rewards/bleu_reward_func/mean": 0.04414498060941696, "rewards/bleu_reward_func/std": 0.020714420825242996, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 281.1875, "completions/mean_terminated_length": 238.44444274902344, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.8296, "grad_norm": 2.660327434539795, "kl": 0.04754638671875, "learning_rate": 1e-06, "loss": -0.0327, "num_tokens": 13917459.0, "reward": 0.04724155738949776, "reward_std": 0.019107088446617126, "rewards/bleu_reward_func/mean": 0.04724155738949776, "rewards/bleu_reward_func/std": 0.03912588581442833, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 199.21875, "completions/mean_terminated_length": 189.1290283203125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.8304, "grad_norm": 3.1837692260742188, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0188, "num_tokens": 13926098.0, "reward": 0.09293580800294876, "reward_std": 0.031207388266921043, "rewards/bleu_reward_func/mean": 0.09293580800294876, "rewards/bleu_reward_func/std": 0.11434992402791977, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 222.53125, "completions/mean_terminated_length": 213.19354248046875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.8312, "grad_norm": 2.744227409362793, "kl": 0.044677734375, "learning_rate": 1e-06, "loss": 0.0984, "num_tokens": 13937107.0, "reward": 0.14024724066257477, "reward_std": 0.020312879234552383, "rewards/bleu_reward_func/mean": 0.14024724066257477, "rewards/bleu_reward_func/std": 0.18536008894443512, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 413.0625, "completions/mean_terminated_length": 368.0909118652344, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.832, "grad_norm": 2.2251086235046387, "kl": 0.04925537109375, "learning_rate": 1e-06, "loss": -0.0266, "num_tokens": 13952421.0, "reward": 0.019466448575258255, "reward_std": 0.004886062350124121, "rewards/bleu_reward_func/mean": 0.019466448575258255, "rewards/bleu_reward_func/std": 0.010416182689368725, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 355.4375, "completions/mean_terminated_length": 217.2941131591797, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.8328, "grad_norm": 2.832329750061035, "kl": 0.039093017578125, "learning_rate": 1e-06, "loss": 0.0415, "num_tokens": 13967235.0, "reward": 0.07604211568832397, "reward_std": 0.018734116107225418, "rewards/bleu_reward_func/mean": 0.07604211568832397, "rewards/bleu_reward_func/std": 0.0775146409869194, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 202.84375, "completions/mean_terminated_length": 202.84375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.8336, "grad_norm": 3.4680063724517822, "kl": 0.1029052734375, "learning_rate": 1e-06, "loss": -0.0144, "num_tokens": 13975950.0, "reward": 0.03598593920469284, "reward_std": 0.01417174655944109, "rewards/bleu_reward_func/mean": 0.03598593920469284, "rewards/bleu_reward_func/std": 0.021121855825185776, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 341.125, "completions/mean_terminated_length": 309.4814758300781, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.8344, "grad_norm": 2.456043243408203, "kl": 0.0452880859375, "learning_rate": 1e-06, "loss": -0.0375, "num_tokens": 13992426.0, "reward": 0.1423608958721161, "reward_std": 0.04012633115053177, "rewards/bleu_reward_func/mean": 0.1423608958721161, "rewards/bleu_reward_func/std": 0.11575107276439667, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 346.53125, "completions/mean_terminated_length": 329.4137878417969, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.8352, "grad_norm": 2.6406376361846924, "kl": 0.0521240234375, "learning_rate": 1e-06, "loss": -0.0763, "num_tokens": 14005787.0, "reward": 0.09989124536514282, "reward_std": 0.024893736466765404, "rewards/bleu_reward_func/mean": 0.09989124536514282, "rewards/bleu_reward_func/std": 0.06233161687850952, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 381.90625, "completions/mean_terminated_length": 313.76190185546875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.836, "grad_norm": 2.2674143314361572, "kl": 0.05908203125, "learning_rate": 1e-06, "loss": -0.0206, "num_tokens": 14023792.0, "reward": 0.08537600189447403, "reward_std": 0.04241234064102173, "rewards/bleu_reward_func/mean": 0.08537600189447403, "rewards/bleu_reward_func/std": 0.07984770834445953, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 313.4375, "completions/mean_terminated_length": 235.7391357421875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.8368, "grad_norm": 3.6653220653533936, "kl": 0.0543212890625, "learning_rate": 1e-06, "loss": 0.0199, "num_tokens": 14036142.0, "reward": 0.03577762842178345, "reward_std": 0.014606889337301254, "rewards/bleu_reward_func/mean": 0.03577762842178345, "rewards/bleu_reward_func/std": 0.021284347400069237, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 263.9375, "completions/mean_terminated_length": 218.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.8376, "grad_norm": 4.6712870597839355, "kl": 0.05914306640625, "learning_rate": 1e-06, "loss": 0.1302, "num_tokens": 14046380.0, "reward": 0.04743397980928421, "reward_std": 0.01841292530298233, "rewards/bleu_reward_func/mean": 0.04743397980928421, "rewards/bleu_reward_func/std": 0.0290218573063612, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 204.9375, "completions/mean_terminated_length": 184.4666748046875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.8384, "grad_norm": 8.011630058288574, "kl": 0.0655517578125, "learning_rate": 1e-06, "loss": -0.0986, "num_tokens": 14057290.0, "reward": 0.2640398442745209, "reward_std": 0.029374700039625168, "rewards/bleu_reward_func/mean": 0.2640398442745209, "rewards/bleu_reward_func/std": 0.3879520893096924, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 252.875, "completions/mean_terminated_length": 193.07693481445312, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.8392, "grad_norm": 3.1190173625946045, "kl": 0.0576171875, "learning_rate": 1e-06, "loss": 0.01, "num_tokens": 14069918.0, "reward": 0.05034583806991577, "reward_std": 0.020114287734031677, "rewards/bleu_reward_func/mean": 0.05034583806991577, "rewards/bleu_reward_func/std": 0.02764110080897808, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 270.78125, "completions/mean_terminated_length": 203.239990234375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.84, "grad_norm": 3.3168582916259766, "kl": 0.039947509765625, "learning_rate": 1e-06, "loss": -0.05, "num_tokens": 14080375.0, "reward": 0.0682622641324997, "reward_std": 0.026746664196252823, "rewards/bleu_reward_func/mean": 0.0682622641324997, "rewards/bleu_reward_func/std": 0.04914075881242752, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 356.15625, "completions/mean_terminated_length": 274.5238037109375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.8408, "grad_norm": 2.6723437309265137, "kl": 0.0855712890625, "learning_rate": 1e-06, "loss": -0.0573, "num_tokens": 14094404.0, "reward": 0.032941900193691254, "reward_std": 0.004675520583987236, "rewards/bleu_reward_func/mean": 0.032941900193691254, "rewards/bleu_reward_func/std": 0.03677041456103325, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 398.8125, "completions/mean_terminated_length": 298.9411926269531, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.8416, "grad_norm": 2.244532823562622, "kl": 0.043792724609375, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 14110374.0, "reward": 0.03393974155187607, "reward_std": 0.009483535774052143, "rewards/bleu_reward_func/mean": 0.03393974155187607, "rewards/bleu_reward_func/std": 0.0170142725110054, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 215.53125, "completions/mean_terminated_length": 184.86207580566406, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.8424, "grad_norm": 5.13472843170166, "kl": 0.13140869140625, "learning_rate": 1e-06, "loss": 0.0382, "num_tokens": 14121543.0, "reward": 0.03505343943834305, "reward_std": 0.007273062132298946, "rewards/bleu_reward_func/mean": 0.03505343943834305, "rewards/bleu_reward_func/std": 0.0165236946195364, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 295.65625, "completions/mean_terminated_length": 223.5416717529297, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.8432, "grad_norm": 3.089383125305176, "kl": 0.0748291015625, "learning_rate": 1e-06, "loss": 0.0492, "num_tokens": 14133372.0, "reward": 0.03953177481889725, "reward_std": 0.013197172433137894, "rewards/bleu_reward_func/mean": 0.03953177481889725, "rewards/bleu_reward_func/std": 0.02268371731042862, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 360.28125, "completions/mean_terminated_length": 226.41175842285156, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.844, "grad_norm": 4.491260528564453, "kl": 0.08428955078125, "learning_rate": 1e-06, "loss": -0.0276, "num_tokens": 14148525.0, "reward": 0.030032211914658546, "reward_std": 0.010867346078157425, "rewards/bleu_reward_func/mean": 0.030032211914658546, "rewards/bleu_reward_func/std": 0.018144365400075912, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 262.84375, "completions/mean_terminated_length": 246.2333526611328, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.8448, "grad_norm": 2.9773359298706055, "kl": 0.046600341796875, "learning_rate": 1e-06, "loss": -0.0172, "num_tokens": 14159544.0, "reward": 0.045382194221019745, "reward_std": 0.021608000621199608, "rewards/bleu_reward_func/mean": 0.045382194221019745, "rewards/bleu_reward_func/std": 0.04553521052002907, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 341.5625, "completions/mean_terminated_length": 293.8399963378906, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.8456, "grad_norm": 3.592421531677246, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0606, "num_tokens": 14173122.0, "reward": 0.08871526271104813, "reward_std": 0.02755703032016754, "rewards/bleu_reward_func/mean": 0.08871526271104813, "rewards/bleu_reward_func/std": 0.10348460078239441, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 255.0625, "completions/mean_terminated_length": 237.933349609375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.8464, "grad_norm": 3.676213502883911, "kl": 0.0556640625, "learning_rate": 1e-06, "loss": -0.0521, "num_tokens": 14182972.0, "reward": 0.03509638085961342, "reward_std": 0.011363822966814041, "rewards/bleu_reward_func/mean": 0.03509638085961342, "rewards/bleu_reward_func/std": 0.019865239039063454, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 274.65625, "completions/mean_terminated_length": 230.70370483398438, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.8472, "grad_norm": 3.6321067810058594, "kl": 0.053924560546875, "learning_rate": 1e-06, "loss": 0.0394, "num_tokens": 14197185.0, "reward": 0.09843635559082031, "reward_std": 0.03348027914762497, "rewards/bleu_reward_func/mean": 0.09843635559082031, "rewards/bleu_reward_func/std": 0.07304807007312775, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 387.34375, "completions/mean_terminated_length": 330.68182373046875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.848, "grad_norm": 2.2477974891662598, "kl": 0.0562744140625, "learning_rate": 1e-06, "loss": 0.0342, "num_tokens": 14213164.0, "reward": 0.050817858427762985, "reward_std": 0.015856031328439713, "rewards/bleu_reward_func/mean": 0.050817858427762985, "rewards/bleu_reward_func/std": 0.06451952457427979, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 324.28125, "completions/mean_terminated_length": 250.8260955810547, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.8488, "grad_norm": 2.6654868125915527, "kl": 0.0555419921875, "learning_rate": 1e-06, "loss": -0.072, "num_tokens": 14225901.0, "reward": 0.040826499462127686, "reward_std": 0.01575326919555664, "rewards/bleu_reward_func/mean": 0.040826499462127686, "rewards/bleu_reward_func/std": 0.015293111093342304, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 207.75, "completions/mean_terminated_length": 207.75, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.8496, "grad_norm": 3.5825765132904053, "kl": 0.079986572265625, "learning_rate": 1e-06, "loss": -0.0862, "num_tokens": 14234253.0, "reward": 0.03848409280180931, "reward_std": 0.013242291286587715, "rewards/bleu_reward_func/mean": 0.03848409280180931, "rewards/bleu_reward_func/std": 0.01712285354733467, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 398.34375, "completions/mean_terminated_length": 346.68182373046875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.8504, "grad_norm": 2.3341686725616455, "kl": 0.06793212890625, "learning_rate": 1e-06, "loss": -0.1081, "num_tokens": 14249488.0, "reward": 0.06572327017784119, "reward_std": 0.025146422907710075, "rewards/bleu_reward_func/mean": 0.06572327017784119, "rewards/bleu_reward_func/std": 0.033365800976753235, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 376.3125, "completions/mean_terminated_length": 338.32000732421875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.8512, "grad_norm": 2.4101240634918213, "kl": 0.04913330078125, "learning_rate": 1e-06, "loss": -0.0386, "num_tokens": 14264370.0, "reward": 0.046704962849617004, "reward_std": 0.02123313769698143, "rewards/bleu_reward_func/mean": 0.046704962849617004, "rewards/bleu_reward_func/std": 0.03704674169421196, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 354.03125, "completions/mean_terminated_length": 292.2174072265625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.852, "grad_norm": 2.654803514480591, "kl": 0.0799560546875, "learning_rate": 1e-06, "loss": 0.0249, "num_tokens": 14278251.0, "reward": 0.036083631217479706, "reward_std": 0.009487833827733994, "rewards/bleu_reward_func/mean": 0.036083631217479706, "rewards/bleu_reward_func/std": 0.03182501345872879, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 343.53125, "completions/mean_terminated_length": 287.375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.8528, "grad_norm": 2.5892786979675293, "kl": 0.04949951171875, "learning_rate": 1e-06, "loss": 0.0723, "num_tokens": 14292188.0, "reward": 0.054016873240470886, "reward_std": 0.027925897389650345, "rewards/bleu_reward_func/mean": 0.054016873240470886, "rewards/bleu_reward_func/std": 0.031196916475892067, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 261.6875, "completions/mean_terminated_length": 245.00001525878906, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.8536, "grad_norm": 5.397347450256348, "kl": 0.106109619140625, "learning_rate": 1e-06, "loss": -0.0081, "num_tokens": 14302522.0, "reward": 0.06117306649684906, "reward_std": 0.023319777101278305, "rewards/bleu_reward_func/mean": 0.06117306649684906, "rewards/bleu_reward_func/std": 0.046599678695201874, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 302.90625, "completions/mean_terminated_length": 244.36000061035156, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.8544, "grad_norm": 3.079831600189209, "kl": 0.051025390625, "learning_rate": 1e-06, "loss": -0.0999, "num_tokens": 14314487.0, "reward": 0.06549815088510513, "reward_std": 0.017141040414571762, "rewards/bleu_reward_func/mean": 0.06549815088510513, "rewards/bleu_reward_func/std": 0.07451631128787994, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 247.96875, "completions/mean_terminated_length": 199.07408142089844, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.8552, "grad_norm": 3.2531063556671143, "kl": 0.06011962890625, "learning_rate": 1e-06, "loss": -0.0422, "num_tokens": 14324478.0, "reward": 0.03668832778930664, "reward_std": 0.018588969483971596, "rewards/bleu_reward_func/mean": 0.03668832778930664, "rewards/bleu_reward_func/std": 0.024964287877082825, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 288.375, "completions/mean_terminated_length": 213.83334350585938, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.856, "grad_norm": 2.7994515895843506, "kl": 0.037628173828125, "learning_rate": 1e-06, "loss": -0.0105, "num_tokens": 14337218.0, "reward": 0.04852975159883499, "reward_std": 0.011727490462362766, "rewards/bleu_reward_func/mean": 0.04852975159883499, "rewards/bleu_reward_func/std": 0.021890046074986458, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 229.09375, "completions/mean_terminated_length": 163.8076934814453, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.8568, "grad_norm": 3.4524424076080322, "kl": 0.0914306640625, "learning_rate": 1e-06, "loss": 0.0403, "num_tokens": 14347365.0, "reward": 0.04209073632955551, "reward_std": 0.010336240753531456, "rewards/bleu_reward_func/mean": 0.04209073632955551, "rewards/bleu_reward_func/std": 0.017330747097730637, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 325.15625, "completions/mean_terminated_length": 319.1290283203125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.8576, "grad_norm": 2.200551748275757, "kl": 0.04730224609375, "learning_rate": 1e-06, "loss": -0.0014, "num_tokens": 14361250.0, "reward": 0.06647847592830658, "reward_std": 0.02658942900598049, "rewards/bleu_reward_func/mean": 0.06647847592830658, "rewards/bleu_reward_func/std": 0.08473870158195496, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 434.78125, "completions/mean_terminated_length": 306.0833435058594, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.8584, "grad_norm": 2.121492385864258, "kl": 0.04705810546875, "learning_rate": 1e-06, "loss": 0.0316, "num_tokens": 14377891.0, "reward": 0.027470655739307404, "reward_std": 0.009700166061520576, "rewards/bleu_reward_func/mean": 0.027470655739307404, "rewards/bleu_reward_func/std": 0.018717704340815544, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 456.46875, "completions/mean_terminated_length": 400.9375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.8592, "grad_norm": 1.9872575998306274, "kl": 0.052825927734375, "learning_rate": 1e-06, "loss": 0.0179, "num_tokens": 14394938.0, "reward": 0.031011758372187614, "reward_std": 0.006797453388571739, "rewards/bleu_reward_func/mean": 0.031011758372187614, "rewards/bleu_reward_func/std": 0.013120264746248722, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 327.1875, "completions/mean_terminated_length": 230.38095092773438, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.86, "grad_norm": 3.2750420570373535, "kl": 0.05621337890625, "learning_rate": 1e-06, "loss": 0.0067, "num_tokens": 14407512.0, "reward": 0.04015614092350006, "reward_std": 0.01684856228530407, "rewards/bleu_reward_func/mean": 0.04015614092350006, "rewards/bleu_reward_func/std": 0.024104053154587746, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 282.78125, "completions/mean_terminated_length": 275.3870849609375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.8608, "grad_norm": 2.747910499572754, "kl": 0.06060791015625, "learning_rate": 1e-06, "loss": -0.0831, "num_tokens": 14418377.0, "reward": 0.059567973017692566, "reward_std": 0.02570568397641182, "rewards/bleu_reward_func/mean": 0.059567973017692566, "rewards/bleu_reward_func/std": 0.06817185878753662, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 284.125, "completions/mean_terminated_length": 268.933349609375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.8616, "grad_norm": 3.2307136058807373, "kl": 0.0513916015625, "learning_rate": 1e-06, "loss": 0.0257, "num_tokens": 14429501.0, "reward": 0.03993712365627289, "reward_std": 0.016049114987254143, "rewards/bleu_reward_func/mean": 0.03993712365627289, "rewards/bleu_reward_func/std": 0.032372910529375076, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 396.59375, "completions/mean_terminated_length": 327.3500061035156, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.8624, "grad_norm": 2.057788133621216, "kl": 0.04718017578125, "learning_rate": 1e-06, "loss": 0.06, "num_tokens": 14444720.0, "reward": 0.036224670708179474, "reward_std": 0.009702024050056934, "rewards/bleu_reward_func/mean": 0.036224670708179474, "rewards/bleu_reward_func/std": 0.014296884648501873, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 267.375, "completions/mean_terminated_length": 222.07408142089844, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.8632, "grad_norm": 3.0467591285705566, "kl": 0.06610107421875, "learning_rate": 1e-06, "loss": -0.018, "num_tokens": 14455836.0, "reward": 0.024699728935956955, "reward_std": 0.0076002031564712524, "rewards/bleu_reward_func/mean": 0.024699728935956955, "rewards/bleu_reward_func/std": 0.010115176439285278, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 244.9375, "completions/mean_terminated_length": 155.9166717529297, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.864, "grad_norm": 4.053822994232178, "kl": 0.1051025390625, "learning_rate": 1e-06, "loss": -0.179, "num_tokens": 14466490.0, "reward": 0.021421607583761215, "reward_std": 0.008295105770230293, "rewards/bleu_reward_func/mean": 0.021421607583761215, "rewards/bleu_reward_func/std": 0.015474777668714523, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 512.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 414.25, "completions/mean_terminated_length": 271.3846130371094, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.8648, "grad_norm": 2.2090256214141846, "kl": 0.0640869140625, "learning_rate": 1e-06, "loss": -0.0172, "num_tokens": 14483202.0, "reward": 0.03543311357498169, "reward_std": 0.012544544413685799, "rewards/bleu_reward_func/mean": 0.03543311357498169, "rewards/bleu_reward_func/std": 0.030554452911019325, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 139.4375, "completions/mean_terminated_length": 139.4375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.8656, "grad_norm": 8.19969654083252, "kl": 0.13037109375, "learning_rate": 1e-06, "loss": 0.2538, "num_tokens": 14491720.0, "reward": 0.042763207107782364, "reward_std": 0.01436428539454937, "rewards/bleu_reward_func/mean": 0.042763207107782364, "rewards/bleu_reward_func/std": 0.02836323343217373, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 221.1875, "completions/mean_terminated_length": 221.1875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.8664, "grad_norm": 4.217365741729736, "kl": 0.08453369140625, "learning_rate": 1e-06, "loss": -0.0391, "num_tokens": 14504486.0, "reward": 0.05494450032711029, "reward_std": 0.0162642952054739, "rewards/bleu_reward_func/mean": 0.05494450032711029, "rewards/bleu_reward_func/std": 0.025794783607125282, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 328.90625, "completions/mean_terminated_length": 267.875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.8672, "grad_norm": 2.3826496601104736, "kl": 0.05877685546875, "learning_rate": 1e-06, "loss": -0.0386, "num_tokens": 14521755.0, "reward": 0.11050405353307724, "reward_std": 0.034873202443122864, "rewards/bleu_reward_func/mean": 0.11050405353307724, "rewards/bleu_reward_func/std": 0.11130403727293015, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 252.25, "completions/mean_terminated_length": 165.6666717529297, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.868, "grad_norm": 3.0554423332214355, "kl": 0.079742431640625, "learning_rate": 1e-06, "loss": -0.0614, "num_tokens": 14532419.0, "reward": 0.030541863292455673, "reward_std": 0.016133692115545273, "rewards/bleu_reward_func/mean": 0.030541863292455673, "rewards/bleu_reward_func/std": 0.020966939628124237, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 208.59375, "completions/mean_terminated_length": 208.59375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.8688, "grad_norm": 3.5242645740509033, "kl": 0.05633544921875, "learning_rate": 1e-06, "loss": 0.0629, "num_tokens": 14544030.0, "reward": 0.04443598911166191, "reward_std": 0.021839329972863197, "rewards/bleu_reward_func/mean": 0.04443598911166191, "rewards/bleu_reward_func/std": 0.03833690285682678, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 370.5625, "completions/mean_terminated_length": 315.2174072265625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.8696, "grad_norm": 2.691652774810791, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": -0.061, "num_tokens": 14558856.0, "reward": 0.04129372909665108, "reward_std": 0.013989726081490517, "rewards/bleu_reward_func/mean": 0.04129372909665108, "rewards/bleu_reward_func/std": 0.02188796177506447, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 229.78125, "completions/mean_terminated_length": 220.6774139404297, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.8704, "grad_norm": 4.128385543823242, "kl": 0.0552978515625, "learning_rate": 1e-06, "loss": -0.1428, "num_tokens": 14569505.0, "reward": 0.26207196712493896, "reward_std": 0.07250937074422836, "rewards/bleu_reward_func/mean": 0.26207196712493896, "rewards/bleu_reward_func/std": 0.34467241168022156, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 381.5625, "completions/mean_terminated_length": 251.125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.8712, "grad_norm": 2.2956695556640625, "kl": 0.05340576171875, "learning_rate": 1e-06, "loss": 0.0395, "num_tokens": 14584859.0, "reward": 0.08875560760498047, "reward_std": 0.03270617872476578, "rewards/bleu_reward_func/mean": 0.08875560760498047, "rewards/bleu_reward_func/std": 0.10525688529014587, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 213.8125, "completions/mean_terminated_length": 213.8125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.872, "grad_norm": 4.106683731079102, "kl": 0.087982177734375, "learning_rate": 1e-06, "loss": -0.0803, "num_tokens": 14596485.0, "reward": 0.07446331530809402, "reward_std": 0.02199474349617958, "rewards/bleu_reward_func/mean": 0.07446331530809402, "rewards/bleu_reward_func/std": 0.056231312453746796, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 311.46875, "completions/mean_terminated_length": 265.19232177734375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.8728, "grad_norm": 2.7658278942108154, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.009, "num_tokens": 14608980.0, "reward": 0.03502470254898071, "reward_std": 0.010442346334457397, "rewards/bleu_reward_func/mean": 0.03502470254898071, "rewards/bleu_reward_func/std": 0.012194231152534485, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 308.875, "completions/mean_terminated_length": 105.75, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.8736, "grad_norm": 3.983816385269165, "kl": 0.10113525390625, "learning_rate": 1e-06, "loss": -0.0715, "num_tokens": 14623104.0, "reward": 0.03933839499950409, "reward_std": 0.013284911401569843, "rewards/bleu_reward_func/mean": 0.03933839499950409, "rewards/bleu_reward_func/std": 0.01618482731282711, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 410.84375, "completions/mean_terminated_length": 242.25, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.8744, "grad_norm": 2.227384090423584, "kl": 0.05706787109375, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 14641235.0, "reward": 0.025841325521469116, "reward_std": 0.008813188411295414, "rewards/bleu_reward_func/mean": 0.025841325521469116, "rewards/bleu_reward_func/std": 0.01780826225876808, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 192.6875, "completions/mean_terminated_length": 192.6875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.8752, "grad_norm": 5.167781352996826, "kl": 0.081085205078125, "learning_rate": 1e-06, "loss": -0.0051, "num_tokens": 14649313.0, "reward": 0.05482110381126404, "reward_std": 0.018363406881690025, "rewards/bleu_reward_func/mean": 0.05482110381126404, "rewards/bleu_reward_func/std": 0.03553561493754387, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 439.03125, "completions/mean_terminated_length": 299.727294921875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.876, "grad_norm": 2.0647776126861572, "kl": 0.0677490234375, "learning_rate": 1e-06, "loss": 0.0269, "num_tokens": 14666842.0, "reward": 0.02372824400663376, "reward_std": 0.00758307846263051, "rewards/bleu_reward_func/mean": 0.02372824400663376, "rewards/bleu_reward_func/std": 0.015121630392968655, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 374.40625, "completions/mean_terminated_length": 280.2631530761719, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.8768, "grad_norm": 2.0318610668182373, "kl": 0.057373046875, "learning_rate": 1e-06, "loss": 0.0453, "num_tokens": 14681551.0, "reward": 0.01864839717745781, "reward_std": 0.005486675538122654, "rewards/bleu_reward_func/mean": 0.01864839717745781, "rewards/bleu_reward_func/std": 0.020449459552764893, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 426.9375, "completions/mean_terminated_length": 360.77777099609375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.8776, "grad_norm": 2.280367374420166, "kl": 0.0638427734375, "learning_rate": 1e-06, "loss": -0.045, "num_tokens": 14700125.0, "reward": 0.041227325797080994, "reward_std": 0.029045000672340393, "rewards/bleu_reward_func/mean": 0.041227325797080994, "rewards/bleu_reward_func/std": 0.05818454921245575, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 288.71875, "completions/mean_terminated_length": 237.19232177734375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.8784, "grad_norm": 4.109658241271973, "kl": 0.07611083984375, "learning_rate": 1e-06, "loss": -0.0113, "num_tokens": 14711996.0, "reward": 0.08606592565774918, "reward_std": 0.030430836603045464, "rewards/bleu_reward_func/mean": 0.08606592565774918, "rewards/bleu_reward_func/std": 0.06479021161794662, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 238.15625, "completions/mean_terminated_length": 238.15625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.8792, "grad_norm": 2.743206262588501, "kl": 0.07562255859375, "learning_rate": 1e-06, "loss": 0.0047, "num_tokens": 14722057.0, "reward": 0.04888010770082474, "reward_std": 0.016005024313926697, "rewards/bleu_reward_func/mean": 0.04888010770082474, "rewards/bleu_reward_func/std": 0.026392478495836258, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 385.84375, "completions/mean_terminated_length": 319.76190185546875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.88, "grad_norm": 2.339637279510498, "kl": 0.063323974609375, "learning_rate": 1e-06, "loss": -0.0166, "num_tokens": 14737660.0, "reward": 0.05646644905209541, "reward_std": 0.022537769749760628, "rewards/bleu_reward_func/mean": 0.05646644905209541, "rewards/bleu_reward_func/std": 0.03494390472769737, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 512.0, "completions/max_terminated_length": 99.0, "completions/mean_length": 399.9375, "completions/mean_terminated_length": 63.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.8808, "grad_norm": 4.858819484710693, "kl": 0.0853271484375, "learning_rate": 1e-06, "loss": 0.0164, "num_tokens": 14754002.0, "reward": 0.08757828921079636, "reward_std": 0.027417277917265892, "rewards/bleu_reward_func/mean": 0.08757828921079636, "rewards/bleu_reward_func/std": 0.07575616985559464, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 381.75, "completions/mean_terminated_length": 266.8235168457031, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.8816, "grad_norm": 2.965717315673828, "kl": 0.0623779296875, "learning_rate": 1e-06, "loss": -0.0092, "num_tokens": 14769226.0, "reward": 0.07998257875442505, "reward_std": 0.047210924327373505, "rewards/bleu_reward_func/mean": 0.07998257875442505, "rewards/bleu_reward_func/std": 0.06257197260856628, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 368.625, "completions/mean_terminated_length": 282.6000061035156, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.8824, "grad_norm": 2.529975652694702, "kl": 0.05145263671875, "learning_rate": 1e-06, "loss": 0.1021, "num_tokens": 14783990.0, "reward": 0.05064859241247177, "reward_std": 0.030285051092505455, "rewards/bleu_reward_func/mean": 0.05064859241247177, "rewards/bleu_reward_func/std": 0.050740811973810196, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 512.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 458.46875, "completions/mean_terminated_length": 297.875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.8832, "grad_norm": 2.1087520122528076, "kl": 0.05126953125, "learning_rate": 1e-06, "loss": 0.0163, "num_tokens": 14803693.0, "reward": 0.08669351041316986, "reward_std": 0.02402358688414097, "rewards/bleu_reward_func/mean": 0.08669351041316986, "rewards/bleu_reward_func/std": 0.08511854708194733, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 188.1875, "completions/mean_terminated_length": 188.1875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.884, "grad_norm": 4.063257217407227, "kl": 0.0767822265625, "learning_rate": 1e-06, "loss": 0.18, "num_tokens": 14812451.0, "reward": 0.05730229616165161, "reward_std": 0.03279360383749008, "rewards/bleu_reward_func/mean": 0.05730229616165161, "rewards/bleu_reward_func/std": 0.0810193419456482, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 231.09375, "completions/mean_terminated_length": 231.09375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.8848, "grad_norm": 3.3575527667999268, "kl": 0.0682373046875, "learning_rate": 1e-06, "loss": -0.0423, "num_tokens": 14822950.0, "reward": 0.08444621413946152, "reward_std": 0.0277608260512352, "rewards/bleu_reward_func/mean": 0.08444621413946152, "rewards/bleu_reward_func/std": 0.03642307594418526, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 301.96875, "completions/mean_terminated_length": 295.19354248046875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.8856, "grad_norm": 2.9735851287841797, "kl": 0.06231689453125, "learning_rate": 1e-06, "loss": 0.0507, "num_tokens": 14835285.0, "reward": 0.045117855072021484, "reward_std": 0.0092654749751091, "rewards/bleu_reward_func/mean": 0.045117855072021484, "rewards/bleu_reward_func/std": 0.017960846424102783, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 284.6875, "completions/mean_terminated_length": 208.9166717529297, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.8864, "grad_norm": 2.991635322570801, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": -0.0632, "num_tokens": 14847299.0, "reward": 0.029840070754289627, "reward_std": 0.011749091558158398, "rewards/bleu_reward_func/mean": 0.029840070754289627, "rewards/bleu_reward_func/std": 0.02153618633747101, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 351.3125, "completions/mean_terminated_length": 297.75, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.8872, "grad_norm": 2.6633989810943604, "kl": 0.07733154296875, "learning_rate": 1e-06, "loss": -0.0163, "num_tokens": 14860997.0, "reward": 0.10886503756046295, "reward_std": 0.021869435906410217, "rewards/bleu_reward_func/mean": 0.10886503756046295, "rewards/bleu_reward_func/std": 0.0551656112074852, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 349.09375, "completions/mean_terminated_length": 318.9259338378906, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.888, "grad_norm": 2.456130027770996, "kl": 0.05157470703125, "learning_rate": 1e-06, "loss": 0.0138, "num_tokens": 14874728.0, "reward": 0.03805284574627876, "reward_std": 0.012986007146537304, "rewards/bleu_reward_func/mean": 0.03805284574627876, "rewards/bleu_reward_func/std": 0.01945861615240574, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 415.4375, "completions/mean_terminated_length": 364.8571472167969, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.8888, "grad_norm": 2.130664110183716, "kl": 0.050140380859375, "learning_rate": 1e-06, "loss": -0.0034, "num_tokens": 14890278.0, "reward": 0.0457424521446228, "reward_std": 0.016246361657977104, "rewards/bleu_reward_func/mean": 0.0457424521446228, "rewards/bleu_reward_func/std": 0.033452119678258896, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 511.9375, "completions/mean_terminated_length": 510.0, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "epoch": 0.8896, "grad_norm": 1.8803907632827759, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 14912388.0, "reward": 0.05271358788013458, "reward_std": 0.012337183579802513, "rewards/bleu_reward_func/mean": 0.05271358788013458, "rewards/bleu_reward_func/std": 0.03781459480524063, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 512.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 472.5625, "completions/mean_terminated_length": 354.25, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.8904, "grad_norm": 2.125682830810547, "kl": 0.0582275390625, "learning_rate": 1e-06, "loss": 0.02, "num_tokens": 14931926.0, "reward": 0.037002939730882645, "reward_std": 0.006545543670654297, "rewards/bleu_reward_func/mean": 0.037002939730882645, "rewards/bleu_reward_func/std": 0.02415914461016655, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 285.5625, "completions/mean_terminated_length": 233.3076934814453, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.8912, "grad_norm": 2.972381353378296, "kl": 0.0584716796875, "learning_rate": 1e-06, "loss": 0.1917, "num_tokens": 14944224.0, "reward": 0.07146148383617401, "reward_std": 0.04880303889513016, "rewards/bleu_reward_func/mean": 0.07146148383617401, "rewards/bleu_reward_func/std": 0.07057799398899078, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 256.875, "completions/mean_terminated_length": 209.629638671875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.892, "grad_norm": 4.273919105529785, "kl": 0.043212890625, "learning_rate": 1e-06, "loss": -0.1796, "num_tokens": 14958652.0, "reward": 0.05981897562742233, "reward_std": 0.048418186604976654, "rewards/bleu_reward_func/mean": 0.05981897562742233, "rewards/bleu_reward_func/std": 0.05630670115351677, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 301.65625, "completions/mean_terminated_length": 287.63336181640625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.8928, "grad_norm": 3.231736421585083, "kl": 0.06976318359375, "learning_rate": 1e-06, "loss": 0.1316, "num_tokens": 14971193.0, "reward": 0.09151319414377213, "reward_std": 0.03449167683720589, "rewards/bleu_reward_func/mean": 0.09151319414377213, "rewards/bleu_reward_func/std": 0.08941276371479034, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 310.0, "completions/mean_terminated_length": 218.18182373046875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.8936, "grad_norm": 3.243455648422241, "kl": 0.09100341796875, "learning_rate": 1e-06, "loss": 0.0944, "num_tokens": 14983321.0, "reward": 0.07781277596950531, "reward_std": 0.015596391633152962, "rewards/bleu_reward_func/mean": 0.07781277596950531, "rewards/bleu_reward_func/std": 0.08945278078317642, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 305.40625, "completions/mean_terminated_length": 298.7419128417969, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.8944, "grad_norm": 2.6733896732330322, "kl": 0.049560546875, "learning_rate": 1e-06, "loss": 0.1494, "num_tokens": 14995086.0, "reward": 0.03880076855421066, "reward_std": 0.01223827339708805, "rewards/bleu_reward_func/mean": 0.03880076855421066, "rewards/bleu_reward_func/std": 0.017184842377901077, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 321.0, "completions/mean_terminated_length": 257.3333435058594, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.8952, "grad_norm": 2.7894961833953857, "kl": 0.05950927734375, "learning_rate": 1e-06, "loss": -0.0209, "num_tokens": 15010286.0, "reward": 0.08945924043655396, "reward_std": 0.03418608009815216, "rewards/bleu_reward_func/mean": 0.08945924043655396, "rewards/bleu_reward_func/std": 0.06008521839976311, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 261.1875, "completions/mean_terminated_length": 177.58334350585938, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.896, "grad_norm": 2.97663950920105, "kl": 0.086181640625, "learning_rate": 1e-06, "loss": 0.166, "num_tokens": 15023892.0, "reward": 0.1153651624917984, "reward_std": 0.0873895213007927, "rewards/bleu_reward_func/mean": 0.1153651624917984, "rewards/bleu_reward_func/std": 0.11178025603294373, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 449.46875, "completions/mean_terminated_length": 345.25, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.8968, "grad_norm": 2.1350767612457275, "kl": 0.066314697265625, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 15043787.0, "reward": 0.02139696478843689, "reward_std": 0.004052299074828625, "rewards/bleu_reward_func/mean": 0.02139696478843689, "rewards/bleu_reward_func/std": 0.01895984448492527, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 356.59375, "completions/mean_terminated_length": 285.9545593261719, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.8976, "grad_norm": 2.074108839035034, "kl": 0.0611572265625, "learning_rate": 1e-06, "loss": -0.0119, "num_tokens": 15057830.0, "reward": 0.04125259816646576, "reward_std": 0.017305299639701843, "rewards/bleu_reward_func/mean": 0.04125259816646576, "rewards/bleu_reward_func/std": 0.02421402372419834, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 442.46875, "completions/mean_terminated_length": 309.727294921875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.8984, "grad_norm": 2.22540283203125, "kl": 0.05780029296875, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 15075733.0, "reward": 0.06470570713281631, "reward_std": 0.0157428327947855, "rewards/bleu_reward_func/mean": 0.06470570713281631, "rewards/bleu_reward_func/std": 0.021143831312656403, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 510.09375, "completions/mean_terminated_length": 481.5, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 0.8992, "grad_norm": 2.060910701751709, "kl": 0.0660400390625, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 15095424.0, "reward": 0.04325367882847786, "reward_std": 0.017079303041100502, "rewards/bleu_reward_func/mean": 0.04325367882847786, "rewards/bleu_reward_func/std": 0.062008537352085114, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 348.8125, "completions/mean_terminated_length": 325.5, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.9, "grad_norm": 2.4369888305664062, "kl": 0.05029296875, "learning_rate": 1e-06, "loss": 0.0233, "num_tokens": 15111818.0, "reward": 0.044592827558517456, "reward_std": 0.022017713636159897, "rewards/bleu_reward_func/mean": 0.044592827558517456, "rewards/bleu_reward_func/std": 0.042288888245821, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 239.875, "completions/mean_terminated_length": 163.67999267578125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.9008, "grad_norm": 2.867711067199707, "kl": 0.0728759765625, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 15121334.0, "reward": 0.01965026557445526, "reward_std": 0.00810272991657257, "rewards/bleu_reward_func/mean": 0.01965026557445526, "rewards/bleu_reward_func/std": 0.010319051332771778, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 420.40625, "completions/mean_terminated_length": 378.7727355957031, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.9016, "grad_norm": 2.170319080352783, "kl": 0.039642333984375, "learning_rate": 1e-06, "loss": -0.0223, "num_tokens": 15138355.0, "reward": 0.08797721564769745, "reward_std": 0.021382782608270645, "rewards/bleu_reward_func/mean": 0.08797721564769745, "rewards/bleu_reward_func/std": 0.0978621318936348, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 432.3125, "completions/mean_terminated_length": 315.8461608886719, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.9024, "grad_norm": 2.282930850982666, "kl": 0.0628662109375, "learning_rate": 1e-06, "loss": -0.0182, "num_tokens": 15154437.0, "reward": 0.022853992879390717, "reward_std": 0.007267317268997431, "rewards/bleu_reward_func/mean": 0.022853992879390717, "rewards/bleu_reward_func/std": 0.010599992237985134, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 389.34375, "completions/mean_terminated_length": 333.5909118652344, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.9032, "grad_norm": 2.7985355854034424, "kl": 0.04034423828125, "learning_rate": 1e-06, "loss": 0.0193, "num_tokens": 15169968.0, "reward": 0.05068669840693474, "reward_std": 0.011722628958523273, "rewards/bleu_reward_func/mean": 0.05068669840693474, "rewards/bleu_reward_func/std": 0.028417525812983513, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 263.3125, "completions/mean_terminated_length": 237.58621215820312, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.904, "grad_norm": 3.5234053134918213, "kl": 0.0648193359375, "learning_rate": 1e-06, "loss": 0.0324, "num_tokens": 15180522.0, "reward": 0.0427127331495285, "reward_std": 0.018165353685617447, "rewards/bleu_reward_func/mean": 0.0427127331495285, "rewards/bleu_reward_func/std": 0.02901976928114891, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 362.46875, "completions/mean_terminated_length": 312.625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.9048, "grad_norm": 2.3100168704986572, "kl": 0.0533447265625, "learning_rate": 1e-06, "loss": -0.0478, "num_tokens": 15195017.0, "reward": 0.06346133351325989, "reward_std": 0.023072250187397003, "rewards/bleu_reward_func/mean": 0.06346133351325989, "rewards/bleu_reward_func/std": 0.043767333030700684, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 262.3125, "completions/mean_terminated_length": 204.69232177734375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.9056, "grad_norm": 3.143620014190674, "kl": 0.052947998046875, "learning_rate": 1e-06, "loss": -0.0513, "num_tokens": 15206091.0, "reward": 0.0305976253002882, "reward_std": 0.014570488594472408, "rewards/bleu_reward_func/mean": 0.0305976253002882, "rewards/bleu_reward_func/std": 0.018198352307081223, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 328.46875, "completions/mean_terminated_length": 267.29168701171875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.9064, "grad_norm": 2.5279524326324463, "kl": 0.06573486328125, "learning_rate": 1e-06, "loss": 0.0373, "num_tokens": 15219594.0, "reward": 0.03247949108481407, "reward_std": 0.0067100487649440765, "rewards/bleu_reward_func/mean": 0.03247949108481407, "rewards/bleu_reward_func/std": 0.023386213928461075, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 256.25, "completions/mean_terminated_length": 239.20001220703125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.9072, "grad_norm": 2.703608274459839, "kl": 0.06512451171875, "learning_rate": 1e-06, "loss": -0.0771, "num_tokens": 15230138.0, "reward": 0.055908337235450745, "reward_std": 0.02350510098040104, "rewards/bleu_reward_func/mean": 0.055908337235450745, "rewards/bleu_reward_func/std": 0.04675652086734772, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 287.21875, "completions/mean_terminated_length": 263.96551513671875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.908, "grad_norm": 3.3396432399749756, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": -0.1557, "num_tokens": 15241889.0, "reward": 0.06957037001848221, "reward_std": 0.03351438045501709, "rewards/bleu_reward_func/mean": 0.06957037001848221, "rewards/bleu_reward_func/std": 0.06069939583539963, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 342.625, "completions/mean_terminated_length": 286.16668701171875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.9088, "grad_norm": 2.8781590461730957, "kl": 0.0584716796875, "learning_rate": 1e-06, "loss": 0.0245, "num_tokens": 15259365.0, "reward": 0.04491988569498062, "reward_std": 0.012549539096653461, "rewards/bleu_reward_func/mean": 0.04491988569498062, "rewards/bleu_reward_func/std": 0.019155489280819893, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 297.71875, "completions/mean_terminated_length": 226.2916717529297, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.9096, "grad_norm": 2.8332650661468506, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.0845, "num_tokens": 15273084.0, "reward": 0.040149278938770294, "reward_std": 0.017163297161459923, "rewards/bleu_reward_func/mean": 0.040149278938770294, "rewards/bleu_reward_func/std": 0.028974436223506927, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 376.4375, "completions/mean_terminated_length": 323.39129638671875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.9104, "grad_norm": 2.322246551513672, "kl": 0.0478515625, "learning_rate": 1e-06, "loss": 0.1149, "num_tokens": 15287562.0, "reward": 0.04656628519296646, "reward_std": 0.021638479083776474, "rewards/bleu_reward_func/mean": 0.04656628519296646, "rewards/bleu_reward_func/std": 0.033972807228565216, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 244.09375, "completions/mean_terminated_length": 205.82144165039062, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.9112, "grad_norm": 2.6419429779052734, "kl": 0.05712890625, "learning_rate": 1e-06, "loss": 0.0536, "num_tokens": 15297725.0, "reward": 0.07112360745668411, "reward_std": 0.036042600870132446, "rewards/bleu_reward_func/mean": 0.07112360745668411, "rewards/bleu_reward_func/std": 0.08656957000494003, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 405.78125, "completions/mean_terminated_length": 357.5, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.912, "grad_norm": 2.181318998336792, "kl": 0.0428466796875, "learning_rate": 1e-06, "loss": 0.0321, "num_tokens": 15314014.0, "reward": 0.051423329859972, "reward_std": 0.016586489975452423, "rewards/bleu_reward_func/mean": 0.051423329859972, "rewards/bleu_reward_func/std": 0.028527207672595978, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 324.78125, "completions/mean_terminated_length": 272.3599853515625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.9128, "grad_norm": 2.6057114601135254, "kl": 0.05413818359375, "learning_rate": 1e-06, "loss": 0.0203, "num_tokens": 15326527.0, "reward": 0.0373198464512825, "reward_std": 0.007190403528511524, "rewards/bleu_reward_func/mean": 0.0373198464512825, "rewards/bleu_reward_func/std": 0.011146528646349907, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 325.96875, "completions/mean_terminated_length": 263.9583435058594, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.9136, "grad_norm": 2.6041626930236816, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": -0.0076, "num_tokens": 15339198.0, "reward": 0.049278415739536285, "reward_std": 0.01710457354784012, "rewards/bleu_reward_func/mean": 0.049278415739536285, "rewards/bleu_reward_func/std": 0.037151776254177094, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 376.78125, "completions/mean_terminated_length": 202.92857360839844, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.9144, "grad_norm": 4.027402877807617, "kl": 0.0850830078125, "learning_rate": 1e-06, "loss": -0.0234, "num_tokens": 15357719.0, "reward": 0.028639614582061768, "reward_std": 0.007897703908383846, "rewards/bleu_reward_func/mean": 0.028639614582061768, "rewards/bleu_reward_func/std": 0.018189268186688423, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 255.9375, "completions/mean_terminated_length": 247.6774139404297, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.9152, "grad_norm": 5.938324451446533, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0391, "num_tokens": 15372557.0, "reward": 0.06757717579603195, "reward_std": 0.023928619921207428, "rewards/bleu_reward_func/mean": 0.06757717579603195, "rewards/bleu_reward_func/std": 0.04241359233856201, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 393.8125, "completions/mean_terminated_length": 366.5384826660156, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.916, "grad_norm": 2.242014169692993, "kl": 0.0362548828125, "learning_rate": 1e-06, "loss": -0.0631, "num_tokens": 15387383.0, "reward": 0.05110463500022888, "reward_std": 0.0156728345900774, "rewards/bleu_reward_func/mean": 0.05110463500022888, "rewards/bleu_reward_func/std": 0.04327816516160965, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 333.6875, "completions/mean_terminated_length": 292.5384826660156, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.9168, "grad_norm": 2.574808120727539, "kl": 0.04595947265625, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 15399901.0, "reward": 0.04336816817522049, "reward_std": 0.009710687212646008, "rewards/bleu_reward_func/mean": 0.04336816817522049, "rewards/bleu_reward_func/std": 0.04393818601965904, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 284.375, "completions/mean_terminated_length": 220.63999938964844, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.9176, "grad_norm": 3.7397375106811523, "kl": 0.055633544921875, "learning_rate": 1e-06, "loss": -0.2068, "num_tokens": 15415873.0, "reward": 0.08271771669387817, "reward_std": 0.03239203989505768, "rewards/bleu_reward_func/mean": 0.08271771669387817, "rewards/bleu_reward_func/std": 0.05870966985821724, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 281.375, "completions/mean_terminated_length": 216.79998779296875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.9184, "grad_norm": 3.4246437549591064, "kl": 0.03533935546875, "learning_rate": 1e-06, "loss": 0.0401, "num_tokens": 15427453.0, "reward": 0.02845573052763939, "reward_std": 0.009434389881789684, "rewards/bleu_reward_func/mean": 0.02845573052763939, "rewards/bleu_reward_func/std": 0.010721604339778423, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 330.375, "completions/mean_terminated_length": 269.8333435058594, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.9192, "grad_norm": 2.9520516395568848, "kl": 0.05230712890625, "learning_rate": 1e-06, "loss": -0.0683, "num_tokens": 15441313.0, "reward": 0.09095876663923264, "reward_std": 0.031137729063630104, "rewards/bleu_reward_func/mean": 0.09095876663923264, "rewards/bleu_reward_func/std": 0.0966407060623169, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 196.59375, "completions/mean_terminated_length": 196.59375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.92, "grad_norm": 3.3121564388275146, "kl": 0.11737060546875, "learning_rate": 1e-06, "loss": 0.0138, "num_tokens": 15449564.0, "reward": 0.04255712777376175, "reward_std": 0.015855029225349426, "rewards/bleu_reward_func/mean": 0.04255712777376175, "rewards/bleu_reward_func/std": 0.018233712762594223, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 231.875, "completions/mean_terminated_length": 138.5, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.9208, "grad_norm": 3.608539342880249, "kl": 0.07647705078125, "learning_rate": 1e-06, "loss": 0.0542, "num_tokens": 15459792.0, "reward": 0.04274564981460571, "reward_std": 0.016751842573285103, "rewards/bleu_reward_func/mean": 0.04274564981460571, "rewards/bleu_reward_func/std": 0.025919852778315544, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 411.90625, "completions/mean_terminated_length": 311.8125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.9216, "grad_norm": 2.1542413234710693, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0758, "num_tokens": 15475893.0, "reward": 0.046681758016347885, "reward_std": 0.014086486771702766, "rewards/bleu_reward_func/mean": 0.046681758016347885, "rewards/bleu_reward_func/std": 0.02536054514348507, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 148.8125, "completions/mean_terminated_length": 148.8125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.9224, "grad_norm": 4.492018222808838, "kl": 0.088714599609375, "learning_rate": 1e-06, "loss": 0.0612, "num_tokens": 15483447.0, "reward": 0.042024992406368256, "reward_std": 0.019391583278775215, "rewards/bleu_reward_func/mean": 0.042024992406368256, "rewards/bleu_reward_func/std": 0.029765766113996506, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 365.21875, "completions/mean_terminated_length": 288.3333435058594, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.9232, "grad_norm": 2.9250950813293457, "kl": 0.0654296875, "learning_rate": 1e-06, "loss": -0.0311, "num_tokens": 15497710.0, "reward": 0.0304352305829525, "reward_std": 0.01103723980486393, "rewards/bleu_reward_func/mean": 0.0304352305829525, "rewards/bleu_reward_func/std": 0.03708275035023689, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 364.3125, "completions/mean_terminated_length": 234.0, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.924, "grad_norm": 2.453594207763672, "kl": 0.06170654296875, "learning_rate": 1e-06, "loss": -0.0598, "num_tokens": 15512032.0, "reward": 0.07078155130147934, "reward_std": 0.021277839317917824, "rewards/bleu_reward_func/mean": 0.07078155130147934, "rewards/bleu_reward_func/std": 0.05780218914151192, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 267.78125, "completions/mean_terminated_length": 232.8928680419922, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.9248, "grad_norm": 2.6847267150878906, "kl": 0.0552978515625, "learning_rate": 1e-06, "loss": -0.0401, "num_tokens": 15522577.0, "reward": 0.051142215728759766, "reward_std": 0.019845107570290565, "rewards/bleu_reward_func/mean": 0.051142215728759766, "rewards/bleu_reward_func/std": 0.026456067338585854, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 308.84375, "completions/mean_terminated_length": 279.8214416503906, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.9256, "grad_norm": 4.366429805755615, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": -0.0373, "num_tokens": 15534556.0, "reward": 0.035104669630527496, "reward_std": 0.011860767379403114, "rewards/bleu_reward_func/mean": 0.035104669630527496, "rewards/bleu_reward_func/std": 0.01294864621013403, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 285.65625, "completions/mean_terminated_length": 253.32144165039062, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.9264, "grad_norm": 3.2398438453674316, "kl": 0.0435791015625, "learning_rate": 1e-06, "loss": 0.05, "num_tokens": 15545873.0, "reward": 0.06170883774757385, "reward_std": 0.024604424834251404, "rewards/bleu_reward_func/mean": 0.06170883774757385, "rewards/bleu_reward_func/std": 0.025780370458960533, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 294.125, "completions/mean_terminated_length": 208.86956787109375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.9272, "grad_norm": 4.35112190246582, "kl": 0.0579833984375, "learning_rate": 1e-06, "loss": -0.0676, "num_tokens": 15559021.0, "reward": 0.122794009745121, "reward_std": 0.04096021503210068, "rewards/bleu_reward_func/mean": 0.122794009745121, "rewards/bleu_reward_func/std": 0.1473662406206131, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 413.875, "completions/mean_terminated_length": 327.29412841796875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.928, "grad_norm": 2.200023651123047, "kl": 0.0390625, "learning_rate": 1e-06, "loss": 0.0099, "num_tokens": 15575625.0, "reward": 0.043420542031526566, "reward_std": 0.015745732933282852, "rewards/bleu_reward_func/mean": 0.043420542031526566, "rewards/bleu_reward_func/std": 0.01907273940742016, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 251.3125, "completions/mean_terminated_length": 164.4166717529297, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.9288, "grad_norm": 3.7651917934417725, "kl": 0.1016845703125, "learning_rate": 1e-06, "loss": 0.0955, "num_tokens": 15585923.0, "reward": 0.02488887310028076, "reward_std": 0.008125792257487774, "rewards/bleu_reward_func/mean": 0.02488887310028076, "rewards/bleu_reward_func/std": 0.02298036403954029, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 460.21875, "completions/mean_terminated_length": 384.5384826660156, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.9296, "grad_norm": 2.216195583343506, "kl": 0.05767822265625, "learning_rate": 1e-06, "loss": -0.074, "num_tokens": 15603114.0, "reward": 0.04072732850909233, "reward_std": 0.020597945898771286, "rewards/bleu_reward_func/mean": 0.04072732850909233, "rewards/bleu_reward_func/std": 0.03298228979110718, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 346.78125, "completions/mean_terminated_length": 233.73684692382812, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.9304, "grad_norm": 3.4893603324890137, "kl": 0.0799560546875, "learning_rate": 1e-06, "loss": -0.0215, "num_tokens": 15616915.0, "reward": 0.06695497781038284, "reward_std": 0.024308744817972183, "rewards/bleu_reward_func/mean": 0.06695497781038284, "rewards/bleu_reward_func/std": 0.035929832607507706, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 331.375, "completions/mean_terminated_length": 271.16668701171875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.9312, "grad_norm": 2.481180429458618, "kl": 0.044219970703125, "learning_rate": 1e-06, "loss": -0.073, "num_tokens": 15630071.0, "reward": 0.030527595430612564, "reward_std": 0.01595548912882805, "rewards/bleu_reward_func/mean": 0.030527595430612564, "rewards/bleu_reward_func/std": 0.025926506146788597, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 379.875, "completions/mean_terminated_length": 277.1111145019531, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.932, "grad_norm": 2.4039766788482666, "kl": 0.0494384765625, "learning_rate": 1e-06, "loss": -0.0824, "num_tokens": 15645003.0, "reward": 0.06304138153791428, "reward_std": 0.02535812184214592, "rewards/bleu_reward_func/mean": 0.06304138153791428, "rewards/bleu_reward_func/std": 0.06451297551393509, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 321.5, "completions/mean_terminated_length": 258.0, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.9328, "grad_norm": 3.711174964904785, "kl": 0.07073974609375, "learning_rate": 1e-06, "loss": 0.0119, "num_tokens": 15657851.0, "reward": 0.02661607414484024, "reward_std": 0.006588813848793507, "rewards/bleu_reward_func/mean": 0.02661607414484024, "rewards/bleu_reward_func/std": 0.014483918435871601, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 449.46875, "completions/mean_terminated_length": 421.04547119140625, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.9336, "grad_norm": 2.1988365650177, "kl": 0.05462646484375, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 15676234.0, "reward": 0.04999478906393051, "reward_std": 0.0120457224547863, "rewards/bleu_reward_func/mean": 0.04999478906393051, "rewards/bleu_reward_func/std": 0.04736227169632912, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 249.34375, "completions/mean_terminated_length": 249.34375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.9344, "grad_norm": 2.655139207839966, "kl": 0.05450439453125, "learning_rate": 1e-06, "loss": 0.1106, "num_tokens": 15686653.0, "reward": 0.03939780965447426, "reward_std": 0.02221381478011608, "rewards/bleu_reward_func/mean": 0.03939780965447426, "rewards/bleu_reward_func/std": 0.029370024800300598, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 224.15625, "completions/mean_terminated_length": 214.87095642089844, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.9352, "grad_norm": 2.832167863845825, "kl": 0.05328369140625, "learning_rate": 1e-06, "loss": 0.1287, "num_tokens": 15695698.0, "reward": 0.07494577765464783, "reward_std": 0.019553756341338158, "rewards/bleu_reward_func/mean": 0.07494577765464783, "rewards/bleu_reward_func/std": 0.09201066941022873, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 266.59375, "completions/mean_terminated_length": 221.1481475830078, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.936, "grad_norm": 3.4584455490112305, "kl": 0.0975341796875, "learning_rate": 1e-06, "loss": -0.0885, "num_tokens": 15706157.0, "reward": 0.05156881362199783, "reward_std": 0.02069806307554245, "rewards/bleu_reward_func/mean": 0.05156881362199783, "rewards/bleu_reward_func/std": 0.0440862663090229, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 371.65625, "completions/mean_terminated_length": 287.45001220703125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.9368, "grad_norm": 2.4048993587493896, "kl": 0.04290771484375, "learning_rate": 1e-06, "loss": -0.0467, "num_tokens": 15720266.0, "reward": 0.10653826594352722, "reward_std": 0.0501834899187088, "rewards/bleu_reward_func/mean": 0.10653826594352722, "rewards/bleu_reward_func/std": 0.08799951523542404, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 319.5625, "completions/mean_terminated_length": 306.73333740234375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.9376, "grad_norm": 2.697983503341675, "kl": 0.07415771484375, "learning_rate": 1e-06, "loss": -0.015, "num_tokens": 15733156.0, "reward": 0.038380008190870285, "reward_std": 0.011081306263804436, "rewards/bleu_reward_func/mean": 0.038380008190870285, "rewards/bleu_reward_func/std": 0.018261730670928955, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 308.0625, "completions/mean_terminated_length": 149.44444274902344, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.9384, "grad_norm": 3.311505079269409, "kl": 0.06927490234375, "learning_rate": 1e-06, "loss": -0.0721, "num_tokens": 15750158.0, "reward": 0.0754072293639183, "reward_std": 0.028568794950842857, "rewards/bleu_reward_func/mean": 0.0754072293639183, "rewards/bleu_reward_func/std": 0.0542432963848114, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 422.625, "completions/mean_terminated_length": 307.71429443359375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.9392, "grad_norm": 3.5287482738494873, "kl": 0.075225830078125, "learning_rate": 1e-06, "loss": 0.2262, "num_tokens": 15766810.0, "reward": 0.058596234768629074, "reward_std": 0.014998164027929306, "rewards/bleu_reward_func/mean": 0.058596234768629074, "rewards/bleu_reward_func/std": 0.03898334875702858, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 211.75, "completions/mean_terminated_length": 211.75, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.94, "grad_norm": 3.2140583992004395, "kl": 0.07330322265625, "learning_rate": 1e-06, "loss": 0.1292, "num_tokens": 15775706.0, "reward": 0.047663480043411255, "reward_std": 0.014913933351635933, "rewards/bleu_reward_func/mean": 0.047663480043411255, "rewards/bleu_reward_func/std": 0.03790759667754173, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 231.75, "completions/mean_terminated_length": 231.75, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.9408, "grad_norm": 3.363187551498413, "kl": 0.07049560546875, "learning_rate": 1e-06, "loss": -0.0657, "num_tokens": 15785258.0, "reward": 0.07324777543544769, "reward_std": 0.022441495209932327, "rewards/bleu_reward_func/mean": 0.07324777543544769, "rewards/bleu_reward_func/std": 0.05588282272219658, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 391.4375, "completions/mean_terminated_length": 328.28570556640625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.9416, "grad_norm": 2.2609217166900635, "kl": 0.066162109375, "learning_rate": 1e-06, "loss": 0.0827, "num_tokens": 15800352.0, "reward": 0.04194977134466171, "reward_std": 0.015291344374418259, "rewards/bleu_reward_func/mean": 0.04194977134466171, "rewards/bleu_reward_func/std": 0.035137806087732315, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 366.53125, "completions/mean_terminated_length": 253.38888549804688, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.9424, "grad_norm": 2.5422301292419434, "kl": 0.0784912109375, "learning_rate": 1e-06, "loss": 0.0902, "num_tokens": 15814801.0, "reward": 0.024109739810228348, "reward_std": 0.008774411864578724, "rewards/bleu_reward_func/mean": 0.024109739810228348, "rewards/bleu_reward_func/std": 0.012804670259356499, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 130.0625, "completions/mean_terminated_length": 130.0625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.9432, "grad_norm": 4.302926540374756, "kl": 0.14208984375, "learning_rate": 1e-06, "loss": 0.0642, "num_tokens": 15825731.0, "reward": 0.23027461767196655, "reward_std": 0.039070405066013336, "rewards/bleu_reward_func/mean": 0.23027461767196655, "rewards/bleu_reward_func/std": 0.2543703317642212, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 431.96875, "completions/mean_terminated_length": 341.2666931152344, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.944, "grad_norm": 2.821815252304077, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.084, "num_tokens": 15842882.0, "reward": 0.045671649277210236, "reward_std": 0.012227097526192665, "rewards/bleu_reward_func/mean": 0.045671649277210236, "rewards/bleu_reward_func/std": 0.025908511132001877, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 153.5, "completions/mean_terminated_length": 153.5, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.9448, "grad_norm": 3.858163595199585, "kl": 0.07586669921875, "learning_rate": 1e-06, "loss": 0.1228, "num_tokens": 15849690.0, "reward": 0.0736662819981575, "reward_std": 0.03443998470902443, "rewards/bleu_reward_func/mean": 0.0736662819981575, "rewards/bleu_reward_func/std": 0.05406653508543968, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 228.375, "completions/mean_terminated_length": 228.375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.9456, "grad_norm": 3.6348228454589844, "kl": 0.06280517578125, "learning_rate": 1e-06, "loss": 0.0013, "num_tokens": 15862254.0, "reward": 0.03549562767148018, "reward_std": 0.013098573312163353, "rewards/bleu_reward_func/mean": 0.03549562767148018, "rewards/bleu_reward_func/std": 0.016965733841061592, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 440.5, "completions/mean_terminated_length": 420.47998046875, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.9464, "grad_norm": 2.2529780864715576, "kl": 0.05780029296875, "learning_rate": 1e-06, "loss": -0.0692, "num_tokens": 15878750.0, "reward": 0.07513043284416199, "reward_std": 0.025522038340568542, "rewards/bleu_reward_func/mean": 0.07513043284416199, "rewards/bleu_reward_func/std": 0.06642089039087296, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 470.34375, "completions/mean_terminated_length": 378.70001220703125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.9472, "grad_norm": 1.893031358718872, "kl": 0.0584716796875, "learning_rate": 1e-06, "loss": -0.0729, "num_tokens": 15897969.0, "reward": 0.0447755828499794, "reward_std": 0.01626760885119438, "rewards/bleu_reward_func/mean": 0.0447755828499794, "rewards/bleu_reward_func/std": 0.02777096815407276, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 259.90625, "completions/mean_terminated_length": 233.8275909423828, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.948, "grad_norm": 2.977755546569824, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": -0.1044, "num_tokens": 15909150.0, "reward": 0.05368737503886223, "reward_std": 0.015224416740238667, "rewards/bleu_reward_func/mean": 0.05368737503886223, "rewards/bleu_reward_func/std": 0.03522547334432602, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 376.46875, "completions/mean_terminated_length": 256.8823547363281, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.9488, "grad_norm": 2.1552228927612305, "kl": 0.06488037109375, "learning_rate": 1e-06, "loss": 0.067, "num_tokens": 15924293.0, "reward": 0.023215215653181076, "reward_std": 0.007974323816597462, "rewards/bleu_reward_func/mean": 0.023215215653181076, "rewards/bleu_reward_func/std": 0.01594514399766922, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 432.8125, "completions/mean_terminated_length": 331.0, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.9496, "grad_norm": 1.988263487815857, "kl": 0.033172607421875, "learning_rate": 1e-06, "loss": 0.0445, "num_tokens": 15942727.0, "reward": 0.03754414618015289, "reward_std": 0.0140055101364851, "rewards/bleu_reward_func/mean": 0.03754414618015289, "rewards/bleu_reward_func/std": 0.019993774592876434, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 240.84375, "completions/mean_terminated_length": 240.84375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.9504, "grad_norm": 3.113128900527954, "kl": 0.06243896484375, "learning_rate": 1e-06, "loss": -0.1811, "num_tokens": 15952514.0, "reward": 0.030695520341396332, "reward_std": 0.009731138125061989, "rewards/bleu_reward_func/mean": 0.030695520341396332, "rewards/bleu_reward_func/std": 0.013646015897393227, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 329.3125, "completions/mean_terminated_length": 317.13336181640625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.9512, "grad_norm": 2.414848804473877, "kl": 0.045074462890625, "learning_rate": 1e-06, "loss": -0.0355, "num_tokens": 15965252.0, "reward": 0.04139825701713562, "reward_std": 0.021453116089105606, "rewards/bleu_reward_func/mean": 0.04139825701713562, "rewards/bleu_reward_func/std": 0.03966396301984787, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 280.625, "completions/mean_terminated_length": 203.5, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.952, "grad_norm": 3.5793888568878174, "kl": 0.06951904296875, "learning_rate": 1e-06, "loss": -0.0784, "num_tokens": 15978392.0, "reward": 0.038947951048612595, "reward_std": 0.008256456814706326, "rewards/bleu_reward_func/mean": 0.038947951048612595, "rewards/bleu_reward_func/std": 0.03032156452536583, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 178.5625, "completions/mean_terminated_length": 178.5625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.9528, "grad_norm": 4.517023086547852, "kl": 0.08197021484375, "learning_rate": 1e-06, "loss": -0.0047, "num_tokens": 15987898.0, "reward": 0.06466805189847946, "reward_std": 0.027795474976301193, "rewards/bleu_reward_func/mean": 0.06466805189847946, "rewards/bleu_reward_func/std": 0.05030693858861923, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 240.625, "completions/mean_terminated_length": 164.63999938964844, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.9536, "grad_norm": 3.730811357498169, "kl": 0.07061767578125, "learning_rate": 1e-06, "loss": 0.0561, "num_tokens": 15998142.0, "reward": 0.047298163175582886, "reward_std": 0.01436680555343628, "rewards/bleu_reward_func/mean": 0.047298163175582886, "rewards/bleu_reward_func/std": 0.0181716401129961, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 391.46875, "completions/mean_terminated_length": 344.3043518066406, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.9544, "grad_norm": 2.0034618377685547, "kl": 0.03424072265625, "learning_rate": 1e-06, "loss": -0.0098, "num_tokens": 16015477.0, "reward": 0.10216254740953445, "reward_std": 0.0467507429420948, "rewards/bleu_reward_func/mean": 0.10216254740953445, "rewards/bleu_reward_func/std": 0.10135076195001602, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 423.46875, "completions/mean_terminated_length": 383.227294921875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.9552, "grad_norm": 2.356437921524048, "kl": 0.06634521484375, "learning_rate": 1e-06, "loss": -0.0102, "num_tokens": 16033492.0, "reward": 0.05241236090660095, "reward_std": 0.018471311777830124, "rewards/bleu_reward_func/mean": 0.05241236090660095, "rewards/bleu_reward_func/std": 0.030719749629497528, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 283.09375, "completions/mean_terminated_length": 283.09375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.956, "grad_norm": 2.7636096477508545, "kl": 0.0780029296875, "learning_rate": 1e-06, "loss": -0.0413, "num_tokens": 16044871.0, "reward": 0.08429600298404694, "reward_std": 0.04460438713431358, "rewards/bleu_reward_func/mean": 0.08429600298404694, "rewards/bleu_reward_func/std": 0.09819035232067108, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 276.78125, "completions/mean_terminated_length": 210.9199981689453, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.9568, "grad_norm": 3.776975393295288, "kl": 0.07073974609375, "learning_rate": 1e-06, "loss": 0.1762, "num_tokens": 16056568.0, "reward": 0.08197371661663055, "reward_std": 0.05241422355175018, "rewards/bleu_reward_func/mean": 0.08197371661663055, "rewards/bleu_reward_func/std": 0.07864004373550415, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 454.40625, "completions/mean_terminated_length": 428.227294921875, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.9576, "grad_norm": 1.9507607221603394, "kl": 0.0592041015625, "learning_rate": 1e-06, "loss": 0.0529, "num_tokens": 16074165.0, "reward": 0.03464844077825546, "reward_std": 0.008609910495579243, "rewards/bleu_reward_func/mean": 0.03464844077825546, "rewards/bleu_reward_func/std": 0.02310766465961933, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 168.84375, "completions/mean_terminated_length": 119.8214340209961, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.9584, "grad_norm": 4.090397357940674, "kl": 0.0902099609375, "learning_rate": 1e-06, "loss": 0.0189, "num_tokens": 16081696.0, "reward": 0.06762534379959106, "reward_std": 0.053265273571014404, "rewards/bleu_reward_func/mean": 0.06762534379959106, "rewards/bleu_reward_func/std": 0.08817991614341736, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 403.28125, "completions/mean_terminated_length": 307.3529357910156, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.9592, "grad_norm": 2.376145839691162, "kl": 0.06719970703125, "learning_rate": 1e-06, "loss": 0.0666, "num_tokens": 16099561.0, "reward": 0.05755573883652687, "reward_std": 0.012290094047784805, "rewards/bleu_reward_func/mean": 0.05755573883652687, "rewards/bleu_reward_func/std": 0.034789226949214935, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 304.28125, "completions/mean_terminated_length": 282.7930908203125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.96, "grad_norm": 2.4257760047912598, "kl": 0.046630859375, "learning_rate": 1e-06, "loss": -0.0061, "num_tokens": 16113370.0, "reward": 0.1820225715637207, "reward_std": 0.07978139072656631, "rewards/bleu_reward_func/mean": 0.1820225715637207, "rewards/bleu_reward_func/std": 0.09627845138311386, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 512.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 485.15625, "completions/mean_terminated_length": 225.6666717529297, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.9608, "grad_norm": 2.164067268371582, "kl": 0.06500244140625, "learning_rate": 1e-06, "loss": -0.0841, "num_tokens": 16133479.0, "reward": 0.030571069568395615, "reward_std": 0.009849293157458305, "rewards/bleu_reward_func/mean": 0.030571069568395615, "rewards/bleu_reward_func/std": 0.015346908010542393, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 444.25, "completions/mean_terminated_length": 376.5, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.9616, "grad_norm": 2.0394792556762695, "kl": 0.057525634765625, "learning_rate": 1e-06, "loss": 0.0648, "num_tokens": 16152527.0, "reward": 0.013919162563979626, "reward_std": 0.006773381493985653, "rewards/bleu_reward_func/mean": 0.013919162563979626, "rewards/bleu_reward_func/std": 0.010089041665196419, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 362.5, "completions/mean_terminated_length": 260.2105407714844, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.9624, "grad_norm": 3.046417474746704, "kl": 0.0723876953125, "learning_rate": 1e-06, "loss": 0.069, "num_tokens": 16166959.0, "reward": 0.02852245420217514, "reward_std": 0.007626072503626347, "rewards/bleu_reward_func/mean": 0.02852245420217514, "rewards/bleu_reward_func/std": 0.008153699338436127, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 334.09375, "completions/mean_terminated_length": 274.79168701171875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.9632, "grad_norm": 2.7134299278259277, "kl": 0.06134033203125, "learning_rate": 1e-06, "loss": 0.0099, "num_tokens": 16181378.0, "reward": 0.09094207733869553, "reward_std": 0.016675401479005814, "rewards/bleu_reward_func/mean": 0.09094207733869553, "rewards/bleu_reward_func/std": 0.10062676668167114, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 335.25, "completions/mean_terminated_length": 316.96551513671875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.964, "grad_norm": 2.4693562984466553, "kl": 0.05059814453125, "learning_rate": 1e-06, "loss": 0.0418, "num_tokens": 16194082.0, "reward": 0.08112166076898575, "reward_std": 0.030807986855506897, "rewards/bleu_reward_func/mean": 0.08112166076898575, "rewards/bleu_reward_func/std": 0.0743534192442894, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 339.3125, "completions/mean_terminated_length": 307.3333435058594, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.9648, "grad_norm": 2.691605806350708, "kl": 0.0670166015625, "learning_rate": 1e-06, "loss": -0.133, "num_tokens": 16206900.0, "reward": 0.028721408918499947, "reward_std": 0.012371614575386047, "rewards/bleu_reward_func/mean": 0.028721408918499947, "rewards/bleu_reward_func/std": 0.013438318856060505, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 300.875, "completions/mean_terminated_length": 300.875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.9656, "grad_norm": 2.7378876209259033, "kl": 0.0537109375, "learning_rate": 1e-06, "loss": -0.0358, "num_tokens": 16220984.0, "reward": 0.07377751916646957, "reward_std": 0.025469692423939705, "rewards/bleu_reward_func/mean": 0.07377751916646957, "rewards/bleu_reward_func/std": 0.059645578265190125, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 246.9375, "completions/mean_terminated_length": 158.58334350585938, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.9664, "grad_norm": 3.1071829795837402, "kl": 0.09368896484375, "learning_rate": 1e-06, "loss": 0.1223, "num_tokens": 16231742.0, "reward": 0.07445356994867325, "reward_std": 0.0362502858042717, "rewards/bleu_reward_func/mean": 0.07445356994867325, "rewards/bleu_reward_func/std": 0.06206907704472542, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 358.875, "completions/mean_terminated_length": 316.0, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.9672, "grad_norm": 2.5442681312561035, "kl": 0.05780029296875, "learning_rate": 1e-06, "loss": -0.0651, "num_tokens": 16248674.0, "reward": 0.0365150049328804, "reward_std": 0.0132023636251688, "rewards/bleu_reward_func/mean": 0.0365150049328804, "rewards/bleu_reward_func/std": 0.02038295939564705, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 367.65625, "completions/mean_terminated_length": 302.04547119140625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.968, "grad_norm": 2.3779213428497314, "kl": 0.0618896484375, "learning_rate": 1e-06, "loss": 0.035, "num_tokens": 16262847.0, "reward": 0.07498673349618912, "reward_std": 0.023969056084752083, "rewards/bleu_reward_func/mean": 0.07498673349618912, "rewards/bleu_reward_func/std": 0.0473894327878952, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 366.28125, "completions/mean_terminated_length": 309.2608642578125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.9688, "grad_norm": 2.7652668952941895, "kl": 0.08758544921875, "learning_rate": 1e-06, "loss": 0.0972, "num_tokens": 16277592.0, "reward": 0.05950748920440674, "reward_std": 0.02408502995967865, "rewards/bleu_reward_func/mean": 0.05950748920440674, "rewards/bleu_reward_func/std": 0.06841208040714264, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 292.65625, "completions/mean_terminated_length": 219.5416717529297, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.9696, "grad_norm": 3.904935359954834, "kl": 0.09576416015625, "learning_rate": 1e-06, "loss": -0.0152, "num_tokens": 16290237.0, "reward": 0.0675918310880661, "reward_std": 0.013060121797025204, "rewards/bleu_reward_func/mean": 0.0675918310880661, "rewards/bleu_reward_func/std": 0.05720841512084007, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 420.28125, "completions/mean_terminated_length": 378.5909118652344, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.9704, "grad_norm": 2.223933219909668, "kl": 0.05810546875, "learning_rate": 1e-06, "loss": -0.0617, "num_tokens": 16306462.0, "reward": 0.06224057823419571, "reward_std": 0.022418636828660965, "rewards/bleu_reward_func/mean": 0.06224057823419571, "rewards/bleu_reward_func/std": 0.03051481395959854, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 234.4375, "completions/mean_terminated_length": 183.0370330810547, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.9712, "grad_norm": 5.2260236740112305, "kl": 0.071075439453125, "learning_rate": 1e-06, "loss": -0.1912, "num_tokens": 16316172.0, "reward": 0.053409986197948456, "reward_std": 0.025385111570358276, "rewards/bleu_reward_func/mean": 0.053409986197948456, "rewards/bleu_reward_func/std": 0.03583426773548126, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 292.53125, "completions/mean_terminated_length": 219.375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.972, "grad_norm": 3.1006455421447754, "kl": 0.05877685546875, "learning_rate": 1e-06, "loss": -0.0024, "num_tokens": 16331325.0, "reward": 0.03592420741915703, "reward_std": 0.00873337872326374, "rewards/bleu_reward_func/mean": 0.03592420741915703, "rewards/bleu_reward_func/std": 0.02614421211183071, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 263.75, "completions/mean_terminated_length": 181.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.9728, "grad_norm": 11.719066619873047, "kl": 0.18597412109375, "learning_rate": 1e-06, "loss": -0.1213, "num_tokens": 16345373.0, "reward": 0.04153061658143997, "reward_std": 0.015695935115218163, "rewards/bleu_reward_func/mean": 0.04153061658143997, "rewards/bleu_reward_func/std": 0.031689949333667755, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 378.28125, "completions/mean_terminated_length": 347.423095703125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.9736, "grad_norm": 1.9226611852645874, "kl": 0.072265625, "learning_rate": 1e-06, "loss": -0.0867, "num_tokens": 16362718.0, "reward": 0.03419474512338638, "reward_std": 0.02102738618850708, "rewards/bleu_reward_func/mean": 0.03419474512338638, "rewards/bleu_reward_func/std": 0.033396027982234955, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 249.0, "completions/mean_terminated_length": 240.51612854003906, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.9744, "grad_norm": 3.4589998722076416, "kl": 0.07830810546875, "learning_rate": 1e-06, "loss": -0.1008, "num_tokens": 16372462.0, "reward": 0.032402701675891876, "reward_std": 0.013161510229110718, "rewards/bleu_reward_func/mean": 0.032402701675891876, "rewards/bleu_reward_func/std": 0.018716424703598022, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 353.625, "completions/mean_terminated_length": 291.6521911621094, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.9752, "grad_norm": 2.2739009857177734, "kl": 0.04034423828125, "learning_rate": 1e-06, "loss": 0.0097, "num_tokens": 16386298.0, "reward": 0.11592083424329758, "reward_std": 0.02954115904867649, "rewards/bleu_reward_func/mean": 0.11592083424329758, "rewards/bleu_reward_func/std": 0.10508622229099274, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 187.0, "completions/mean_terminated_length": 176.51612854003906, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.976, "grad_norm": 3.4201154708862305, "kl": 0.0491943359375, "learning_rate": 1e-06, "loss": 0.0096, "num_tokens": 16396570.0, "reward": 0.09430958330631256, "reward_std": 0.04518333077430725, "rewards/bleu_reward_func/mean": 0.09430958330631256, "rewards/bleu_reward_func/std": 0.06046329066157341, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 223.9375, "completions/mean_terminated_length": 223.9375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.9768, "grad_norm": 4.284764766693115, "kl": 0.10467529296875, "learning_rate": 1e-06, "loss": -0.0042, "num_tokens": 16406816.0, "reward": 0.0637713298201561, "reward_std": 0.019082939252257347, "rewards/bleu_reward_func/mean": 0.0637713298201561, "rewards/bleu_reward_func/std": 0.027624597772955894, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 288.78125, "completions/mean_terminated_length": 201.43478393554688, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.9776, "grad_norm": 3.965935468673706, "kl": 0.0511474609375, "learning_rate": 1e-06, "loss": -0.0813, "num_tokens": 16419049.0, "reward": 0.03097653202712536, "reward_std": 0.009922297671437263, "rewards/bleu_reward_func/mean": 0.03097653202712536, "rewards/bleu_reward_func/std": 0.013633164577186108, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 202.5625, "completions/mean_terminated_length": 115.91999816894531, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.9784, "grad_norm": 4.116911888122559, "kl": 0.095611572265625, "learning_rate": 1e-06, "loss": 0.0455, "num_tokens": 16429107.0, "reward": 0.05831120163202286, "reward_std": 0.027516763657331467, "rewards/bleu_reward_func/mean": 0.05831120163202286, "rewards/bleu_reward_func/std": 0.04774592071771622, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 342.59375, "completions/mean_terminated_length": 286.125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.9792, "grad_norm": 2.510331630706787, "kl": 0.06231689453125, "learning_rate": 1e-06, "loss": -0.0519, "num_tokens": 16442350.0, "reward": 0.0445207916200161, "reward_std": 0.014958234503865242, "rewards/bleu_reward_func/mean": 0.0445207916200161, "rewards/bleu_reward_func/std": 0.03363141417503357, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 234.84375, "completions/mean_terminated_length": 234.84375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.98, "grad_norm": 3.040597915649414, "kl": 0.0728759765625, "learning_rate": 1e-06, "loss": 0.1173, "num_tokens": 16451969.0, "reward": 0.06547506153583527, "reward_std": 0.020516231656074524, "rewards/bleu_reward_func/mean": 0.06547506153583527, "rewards/bleu_reward_func/std": 0.055397652089595795, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 205.09375, "completions/mean_terminated_length": 195.19354248046875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.9808, "grad_norm": 2.874229907989502, "kl": 0.039337158203125, "learning_rate": 1e-06, "loss": 0.1034, "num_tokens": 16461804.0, "reward": 0.03653764724731445, "reward_std": 0.022169658914208412, "rewards/bleu_reward_func/mean": 0.03653764724731445, "rewards/bleu_reward_func/std": 0.03301393613219261, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 399.1875, "completions/mean_terminated_length": 271.3333435058594, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.9816, "grad_norm": 3.17681622505188, "kl": 0.07647705078125, "learning_rate": 1e-06, "loss": 0.0085, "num_tokens": 16479034.0, "reward": 0.052174534648656845, "reward_std": 0.025132428854703903, "rewards/bleu_reward_func/mean": 0.052174534648656845, "rewards/bleu_reward_func/std": 0.037799958139657974, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 270.65625, "completions/mean_terminated_length": 245.6896514892578, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.9824, "grad_norm": 2.6783218383789062, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": -0.0257, "num_tokens": 16493439.0, "reward": 0.0533532090485096, "reward_std": 0.025761138647794724, "rewards/bleu_reward_func/mean": 0.0533532090485096, "rewards/bleu_reward_func/std": 0.04583704099059105, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 184.3125, "completions/mean_terminated_length": 184.3125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.9832, "grad_norm": 3.430950403213501, "kl": 0.0555419921875, "learning_rate": 1e-06, "loss": 0.1023, "num_tokens": 16502305.0, "reward": 0.08854292333126068, "reward_std": 0.03958655521273613, "rewards/bleu_reward_func/mean": 0.08854292333126068, "rewards/bleu_reward_func/std": 0.1360902488231659, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 439.28125, "completions/mean_terminated_length": 356.86669921875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.984, "grad_norm": 2.2083981037139893, "kl": 0.062896728515625, "learning_rate": 1e-06, "loss": 0.102, "num_tokens": 16520890.0, "reward": 0.1506825089454651, "reward_std": 0.0348081961274147, "rewards/bleu_reward_func/mean": 0.1506825089454651, "rewards/bleu_reward_func/std": 0.1484927535057068, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 326.21875, "completions/mean_terminated_length": 253.52174377441406, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.9848, "grad_norm": 3.313183307647705, "kl": 0.1004638671875, "learning_rate": 1e-06, "loss": -0.056, "num_tokens": 16533841.0, "reward": 0.034567564725875854, "reward_std": 0.010901572182774544, "rewards/bleu_reward_func/mean": 0.034567564725875854, "rewards/bleu_reward_func/std": 0.017620721831917763, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 225.0, "completions/mean_terminated_length": 225.0, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.9856, "grad_norm": 3.367202043533325, "kl": 0.08966064453125, "learning_rate": 1e-06, "loss": 0.0463, "num_tokens": 16543385.0, "reward": 0.07799240946769714, "reward_std": 0.019178325310349464, "rewards/bleu_reward_func/mean": 0.07799240946769714, "rewards/bleu_reward_func/std": 0.06506384909152985, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 377.875, "completions/mean_terminated_length": 273.5555725097656, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.9864, "grad_norm": 2.6554360389709473, "kl": 0.04986572265625, "learning_rate": 1e-06, "loss": 0.1209, "num_tokens": 16558325.0, "reward": 0.07152421027421951, "reward_std": 0.01307828351855278, "rewards/bleu_reward_func/mean": 0.07152421027421951, "rewards/bleu_reward_func/std": 0.0781111940741539, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 415.96875, "completions/mean_terminated_length": 319.9375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.9872, "grad_norm": 2.2727460861206055, "kl": 0.04705810546875, "learning_rate": 1e-06, "loss": 0.025, "num_tokens": 16575612.0, "reward": 0.04410577565431595, "reward_std": 0.009795146994292736, "rewards/bleu_reward_func/mean": 0.04410577565431595, "rewards/bleu_reward_func/std": 0.03114950843155384, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 370.21875, "completions/mean_terminated_length": 209.53334045410156, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.988, "grad_norm": 3.1277663707733154, "kl": 0.0838623046875, "learning_rate": 1e-06, "loss": -0.0597, "num_tokens": 16590387.0, "reward": 0.0662313848733902, "reward_std": 0.01718086190521717, "rewards/bleu_reward_func/mean": 0.0662313848733902, "rewards/bleu_reward_func/std": 0.04029948636889458, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 378.78125, "completions/mean_terminated_length": 275.1666564941406, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.9888, "grad_norm": 2.5095629692077637, "kl": 0.08868408203125, "learning_rate": 1e-06, "loss": -0.0581, "num_tokens": 16605916.0, "reward": 0.05035623162984848, "reward_std": 0.01956663653254509, "rewards/bleu_reward_func/mean": 0.05035623162984848, "rewards/bleu_reward_func/std": 0.022561483085155487, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 328.6875, "completions/mean_terminated_length": 277.3599853515625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.9896, "grad_norm": 2.212676525115967, "kl": 0.060302734375, "learning_rate": 1e-06, "loss": 0.1095, "num_tokens": 16619338.0, "reward": 0.09644582122564316, "reward_std": 0.0474303662776947, "rewards/bleu_reward_func/mean": 0.09644582122564316, "rewards/bleu_reward_func/std": 0.13067010045051575, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 399.34375, "completions/mean_terminated_length": 348.1363830566406, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.9904, "grad_norm": 2.4537155628204346, "kl": 0.045166015625, "learning_rate": 1e-06, "loss": -0.0291, "num_tokens": 16635413.0, "reward": 0.07322587072849274, "reward_std": 0.02353905513882637, "rewards/bleu_reward_func/mean": 0.07322587072849274, "rewards/bleu_reward_func/std": 0.058683864772319794, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 357.90625, "completions/mean_terminated_length": 297.60870361328125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.9912, "grad_norm": 2.812755823135376, "kl": 0.0718994140625, "learning_rate": 1e-06, "loss": -0.007, "num_tokens": 16650594.0, "reward": 0.04551263526082039, "reward_std": 0.008796430192887783, "rewards/bleu_reward_func/mean": 0.04551263526082039, "rewards/bleu_reward_func/std": 0.021256960928440094, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 378.21875, "completions/mean_terminated_length": 244.4375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.992, "grad_norm": 2.3463385105133057, "kl": 0.06915283203125, "learning_rate": 1e-06, "loss": -0.0618, "num_tokens": 16666545.0, "reward": 0.03398827463388443, "reward_std": 0.007483157329261303, "rewards/bleu_reward_func/mean": 0.03398827463388443, "rewards/bleu_reward_func/std": 0.015963921323418617, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 288.71875, "completions/mean_terminated_length": 288.71875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.9928, "grad_norm": 2.5251529216766357, "kl": 0.0472412109375, "learning_rate": 1e-06, "loss": -0.02, "num_tokens": 16677928.0, "reward": 0.0579800084233284, "reward_std": 0.016186034306883812, "rewards/bleu_reward_func/mean": 0.0579800084233284, "rewards/bleu_reward_func/std": 0.046285130083560944, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 329.21875, "completions/mean_terminated_length": 287.0384826660156, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.9936, "grad_norm": 2.9839494228363037, "kl": 0.05584716796875, "learning_rate": 1e-06, "loss": -0.1253, "num_tokens": 16690791.0, "reward": 0.08833082020282745, "reward_std": 0.037769023329019547, "rewards/bleu_reward_func/mean": 0.08833082020282745, "rewards/bleu_reward_func/std": 0.06798525899648666, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 381.875, "completions/mean_terminated_length": 292.84210205078125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.9944, "grad_norm": 2.434730291366577, "kl": 0.05291748046875, "learning_rate": 1e-06, "loss": -0.0027, "num_tokens": 16707947.0, "reward": 0.08765649050474167, "reward_std": 0.032737139612436295, "rewards/bleu_reward_func/mean": 0.08765649050474167, "rewards/bleu_reward_func/std": 0.06498604267835617, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 223.25, "completions/mean_terminated_length": 213.9354705810547, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.9952, "grad_norm": 2.878969430923462, "kl": 0.063812255859375, "learning_rate": 1e-06, "loss": 0.0242, "num_tokens": 16718507.0, "reward": 0.062188923358917236, "reward_std": 0.015940211713314056, "rewards/bleu_reward_func/mean": 0.062188923358917236, "rewards/bleu_reward_func/std": 0.06735063344240189, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 417.53125, "completions/mean_terminated_length": 237.18182373046875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.996, "grad_norm": 2.7636101245880127, "kl": 0.08599853515625, "learning_rate": 1e-06, "loss": 0.0432, "num_tokens": 16735052.0, "reward": 0.05753006786108017, "reward_std": 0.015103975310921669, "rewards/bleu_reward_func/mean": 0.05753006786108017, "rewards/bleu_reward_func/std": 0.058132320642471313, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 426.90625, "completions/mean_terminated_length": 368.6842041015625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.9968, "grad_norm": 2.3084988594055176, "kl": 0.0635986328125, "learning_rate": 1e-06, "loss": -0.0934, "num_tokens": 16752049.0, "reward": 0.046952854841947556, "reward_std": 0.012242695316672325, "rewards/bleu_reward_func/mean": 0.046952854841947556, "rewards/bleu_reward_func/std": 0.018551921471953392, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 236.28125, "completions/mean_terminated_length": 236.28125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.9976, "grad_norm": 2.7959964275360107, "kl": 0.0523681640625, "learning_rate": 1e-06, "loss": -0.0355, "num_tokens": 16761786.0, "reward": 0.04307990521192551, "reward_std": 0.014406087808310986, "rewards/bleu_reward_func/mean": 0.04307990521192551, "rewards/bleu_reward_func/std": 0.02000701241195202, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 320.6875, "completions/mean_terminated_length": 267.1199951171875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.9984, "grad_norm": 2.47160005569458, "kl": 0.0494384765625, "learning_rate": 1e-06, "loss": 0.0789, "num_tokens": 16774280.0, "reward": 0.04456353932619095, "reward_std": 0.015042895451188087, "rewards/bleu_reward_func/mean": 0.04456353932619095, "rewards/bleu_reward_func/std": 0.06020372360944748, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 261.75, "completions/mean_terminated_length": 245.06668090820312, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.9992, "grad_norm": 2.724760055541992, "kl": 0.05419921875, "learning_rate": 1e-06, "loss": 0.1319, "num_tokens": 16785400.0, "reward": 0.03123091161251068, "reward_std": 0.020780162885785103, "rewards/bleu_reward_func/mean": 0.03123091161251068, "rewards/bleu_reward_func/std": 0.029042916372418404, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 297.375, "completions/mean_terminated_length": 297.375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 1.0, "grad_norm": 2.339179277420044, "kl": 0.0479736328125, "learning_rate": 1e-06, "loss": -0.0543, "num_tokens": 16804158.0, "reward": 0.0883922278881073, "reward_std": 0.03340703248977661, "rewards/bleu_reward_func/mean": 0.0883922278881073, "rewards/bleu_reward_func/std": 0.07894789427518845, "step": 1250 } ], "logging_steps": 1, "max_steps": 1250, "num_input_tokens_seen": 16804158, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }